src/hdalab/management/commands/query_dbpedia.py
changeset 271 8f77cf71ab02
parent 240 c8627191f2d7
child 272 1c774f7a0341
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hdalab/management/commands/query_dbpedia.py	Tue Jun 17 10:25:33 2014 +0200
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+'''
+Created on Jan 30, 2012
+
+@author: ymh
+'''
+
+from django.core.management.base import NoArgsCommand
+from django.core.management.color import no_style
+
+from optparse import make_option
+from django.conf import settings
+from django.db.models import Count
+from django.db import transaction
+from hdabo.models import Tag
+from hdalab.models import DbpediaFields, TagLinks
+from hdabo.utils import show_progress
+from rdflib.graph import Graph
+from rdflib import URIRef
+import traceback
+import sys
+from hdalab.models.dataviz import DbpediaFieldsTranslation
+from django import db
+from SPARQLWrapper import SPARQLWrapper, RDF
+
+class Command(NoArgsCommand):
+    '''
+    query and update wikipedia for tag title.
+    '''
+    options = ''
+    help = """query and update wikipedia for tag title."""
+    
+    option_list = NoArgsCommand.option_list + (
+        make_option('--all',
+            action='store_true',
+            dest='all',
+            default=False,
+            help='force all tags to be updated, not only those not yet processed'),
+        make_option('--force',
+            action='store_true',
+            dest='force',
+            default=False,
+            help='ask no questions'),
+        make_option('--random',
+            action='store_true',
+            dest='random',
+            default=False,
+            help='randomize query on tags'),
+        make_option('--limit',
+            action='store',
+            type='int',
+            dest='limit',
+            default= -1,
+            help='number of tag to process'),
+        make_option('--start',
+            action='store',
+            type='int',
+            dest='start',
+            default=0,
+            help='number of tag to ignore'),
+        make_option('--tag',
+            action='append',
+            dest='tags',
+            type='string',
+            default=[],
+            help='the tag to query'),
+    )
+    
+    def handle_noargs(self, **options):
+                
+        self.style = no_style()
+        
+        self.interactive = options.get('interactive', True)
+        
+        self.verbosity = int(options.get('verbosity', '1'))
+        
+        self.force = options.get('force', False)
+        
+        self.limit = options.get("limit", -1)
+        self.start = options.get("start", 0)
+        
+        self.random = options.get('random', False)
+                        
+        if self.verbosity > 2:
+            print "option passed : " + repr(options)
+
+        self.tag_list = options.get("tags", []);
+
+        queryset = Tag.objects.exclude(dbpedia_uri= None)
+        
+                
+        if self.tag_list:
+            queryset = queryset.filter(label__in=self.tag_list)
+        elif not options.get('all',False):            
+            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc = 0)
+
+        if self.random:
+            queryset = queryset.order_by("?")
+        else:
+            queryset = queryset.order_by("label")
+        
+        if self.limit >= 0:
+            queryset = queryset[self.start:self.limit]
+        elif self.start > 0:
+            queryset = queryset[self.start:]
+        
+        if self.verbosity > 2 :
+            print "Tag Query is %s" % (queryset.query)
+
+        count = queryset.count()
+        
+        if count == 0:
+            print "No tag to query : exit."
+            return
+            
+            
+        if not self.force and self.interactive:
+            confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
+        else:
+            confirm = 'yes'
+            
+        if confirm != "yes":
+            print "dbpedia query cancelled"
+            return
+
+        endpoint = SPARQLWrapper("http://dbpedia.org/sparql", returnFormat=RDF)
+
+        writer = None
+        for i,tag in enumerate(queryset):
+            writer = show_progress(i+1, count, tag.label, 50, writer)
+            db.reset_queries()
+            
+            #abstract query
+            #"select ?y
+            # where {<%s>  <http://dbpedia.org/ontology/abstract> ?y}" % (tag.dbpedia_uri)
+            
+            #rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"            
+            #g = Graph()
+            try :
+                abstracts = {}
+                labels = {}
+                thumbnail = None
+                with transaction.commit_on_success():
+                    endpoint.setQuery("select distinct ?y where {<%s>  <http://dbpedia.org/ontology/abstract> ?y}" % (tag.dbpedia_uri))
+                    res_abstracts = endpoint.queryAndConvert()
+                    for _,_,o in res_abstracts.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
+                        abstracts[o.language] = (unicode(o), True)
+
+                    endpoint.setQuery("select distinct ?y where {<%s>  <http://www.w3.org/2000/01/rdf-schema#label> ?y}" % (tag.dbpedia_uri))
+                    res_labels = endpoint.queryAndConvert()
+                    for _,_,o in res_labels.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
+                        labels[o.language] = (unicode(o), True)
+                            
+                    endpoint.setQuery("select distinct ?y where {<%s>  <http://dbpedia.org/ontology/thumbnail> ?y} limit 1" % (tag.dbpedia_uri))
+                    res_thumbnails = endpoint.queryAndConvert()
+                    for _,_,o in res_thumbnails.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
+                        thumbnail = unicode(o)
+
+                    endpoint.setQuery('select distinct ?y where { <%s> ?p ?y . FILTER regex(?y, "^http://dbpedia.org/resource")}' % (tag.dbpedia_uri))
+                    res_links = endpoint.queryAndConvert()
+                    for _,_,o in res_links.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
+                        tagqs = Tag.objects.filter(dbpedia_uri=unicode(o))
+                        if tagqs:
+                            TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])                        
+                    
+                    ref_label_lang, (ref_label, _) = ('fr',labels['fr']) if 'fr' in labels else ('en',labels['en']) if 'en' in labels else labels.items()[0] if len(labels) > 0 else ('fr',(tag.label, True))
+                    ref_abstract_lang, (ref_abstract, _) = ('fr',abstracts['fr']) if 'fr' in abstracts else ('en',abstracts['en']) if 'en' in abstracts else abstracts.items()[0] if len(abstracts) > 0 else ('fr',(None, 'True'))
+
+                    for lang in settings.LANGUAGES:
+                        if lang[0] not in labels:
+                            labels[lang[0]]= (ref_label, False)
+                        if lang[0] not in abstracts:
+                            abstracts[lang[0]] = (ref_abstract, False)
+
+                    dbfield , created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri,tag=tag, defaults={'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable                    
+                    if not created:
+                        dbfield.abstract = ref_abstract
+                        dbfield.thumbnail = thumbnail
+                        dbfield.label = ref_label
+                        dbfield.save()
+                        DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()
+
+                    consolidated_trans = {}
+                    for lang,label in labels.iteritems():
+                        consolidated_trans[lang] = [label,(ref_abstract, lang==ref_abstract_lang)]
+                    for lang,abstract in abstracts.iteritems():
+                        if lang in consolidated_trans:
+                            consolidated_trans[lang][1] = abstract
+                        else:
+                            consolidated_trans[lang] = [(ref_label, lang==ref_label_lang), abstract] 
+                         
+                    for lang, trans in consolidated_trans.iteritems():
+                        label, abstract = tuple(trans)
+                        DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label[0], is_label_translated=label[1], abstract=abstract[0], is_abstract_translated=abstract[1])
+                    
+                    
+            except Exception as e:
+                print "\nError processing resource %s : %s" %(rdf_uri,unicode(e))
+                traceback.print_exception(type(e), e, sys.exc_info()[2])
+                
+
+
+            
+            
+