web/hdalab/management/commands/query_dbpedia.py
changeset 240 c8627191f2d7
parent 135 dd6578e36a57
--- a/web/hdalab/management/commands/query_dbpedia.py	Thu Sep 06 12:16:27 2012 +0200
+++ b/web/hdalab/management/commands/query_dbpedia.py	Sat Sep 08 03:49:10 2012 +0200
@@ -17,11 +17,11 @@
 from hdabo.utils import show_progress
 from rdflib.graph import Graph
 from rdflib import URIRef
-import re
 import traceback
 import sys
 from hdalab.models.dataviz import DbpediaFieldsTranslation
 from django import db
+from SPARQLWrapper import SPARQLWrapper, RDF
 
 class Command(NoArgsCommand):
     '''
@@ -123,41 +123,54 @@
             print "dbpedia query cancelled"
             return
 
+        endpoint = SPARQLWrapper("http://dbpedia.org/sparql", returnFormat=RDF)
+
         writer = None
         for i,tag in enumerate(queryset):
             writer = show_progress(i+1, count, tag.label, 50, writer)
             db.reset_queries()
-            rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"            
-            g = Graph()
+            
+            # Abstract query (the label and thumbnail queries below follow the same pattern):
+            # "select distinct ?y
+            #  where {<%s>  <http://dbpedia.org/ontology/abstract> ?y}" % (tag.dbpedia_uri)
+            
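+            # Previous approach (download and parse the resource's .n3 document), superseded by the SPARQL queries below:
+            #   rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"; g = Graph()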
             try :
-                g.parse(rdf_uri, format="n3")
-            
+                abstracts = {}
+                labels = {}
+                thumbnail = None
                 with transaction.commit_on_success():
+                    endpoint.setQuery("select distinct ?y where {<%s>  <http://dbpedia.org/ontology/abstract> ?y}" % (tag.dbpedia_uri))
+                    res_abstracts = endpoint.queryAndConvert()
+                    for _,_,o in res_abstracts.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
+                        abstracts[o.language] = (unicode(o), True)
+
+                    endpoint.setQuery("select distinct ?y where {<%s>  <http://www.w3.org/2000/01/rdf-schema#label> ?y}" % (tag.dbpedia_uri))
+                    res_labels = endpoint.queryAndConvert()
+                    for _,_,o in res_labels.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
+                        labels[o.language] = (unicode(o), True)
+                            
+                    endpoint.setQuery("select distinct ?y where {<%s>  <http://dbpedia.org/ontology/thumbnail> ?y} limit 1" % (tag.dbpedia_uri))
+                    res_thumbnails = endpoint.queryAndConvert()
+                    for _,_,o in res_thumbnails.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
+                        thumbnail = unicode(o)
+
+                    endpoint.setQuery('select distinct ?y where { <%s> ?p ?y . FILTER regex(?y, "^http://dbpedia.org/resource")}' % (tag.dbpedia_uri))
+                    res_links = endpoint.queryAndConvert()
+                    for _,_,o in res_links.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
+                        tagqs = Tag.objects.filter(dbpedia_uri=unicode(o))
+                        if tagqs:
+                            TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])                        
                     
-                    abstracts = {}
-                    labels = {}
-                    thumbnail = None
-                    for t in g:
-                        if t[1] == URIRef(u'http://dbpedia.org/ontology/abstract') and t[2] is not None \
-                            and hasattr(t[2], 'language'):
-                            abstracts[t[2].language] = unicode(t[2])
-                        if t[1] ==  URIRef(u'http://www.w3.org/2000/01/rdf-schema#label') and t[2] is not None \
-                            and hasattr(t[2], 'language'):
-                            labels[t[2].language] = unicode(t[2]) 
-                        if t[1] == URIRef(u'http://dbpedia.org/ontology/thumbnail') and t[2] is not None:
-                            thumbnail = unicode(t[2])
-                        if u'http://dbpedia.org/resource' in t[2]:
-                            tagqs = Tag.objects.filter(dbpedia_uri=unicode(t[2]))
-                            if tagqs:
-                                TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])                        
-                    
-                    ref_label = labels['fr'] if 'fr' in labels else labels['en'] if 'en' in labels else labels.values()[0] if len(labels) > 0 else tag.label
-                    ref_abstract = abstracts['fr'] if 'fr' in abstracts else abstracts['en'] if 'en' in abstracts else abstracts.values()[0] if len(abstracts) > 0 else None
+                    ref_label_lang, (ref_label, _) = ('fr',labels['fr']) if 'fr' in labels else ('en',labels['en']) if 'en' in labels else labels.items()[0] if len(labels) > 0 else ('fr',(tag.label, True))
+                    ref_abstract_lang, (ref_abstract, _) = ('fr',abstracts['fr']) if 'fr' in abstracts else ('en',abstracts['en']) if 'en' in abstracts else abstracts.items()[0] if len(abstracts) > 0 else ('fr',(None, 'True'))
+
                     for lang in settings.LANGUAGES:
                         if lang[0] not in labels:
-                            labels[lang[0]]= ref_label
+                            labels[lang[0]]= (ref_label, False)
                         if lang[0] not in abstracts:
-                            abstracts[lang[0]] = ref_abstract
+                            abstracts[lang[0]] = (ref_abstract, False)
 
                     dbfield , created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri,tag=tag, defaults={'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable                    
                     if not created:
@@ -169,16 +182,16 @@
 
                     consolidated_trans = {}
                     for lang,label in labels.iteritems():
-                        consolidated_trans[lang] = [label,ref_abstract]
+                        consolidated_trans[lang] = [label,(ref_abstract, lang==ref_abstract_lang)]
                     for lang,abstract in abstracts.iteritems():
                         if lang in consolidated_trans:
                             consolidated_trans[lang][1] = abstract
                         else:
-                            consolidated_trans[lang] = [ref_label, abstract] 
+                            consolidated_trans[lang] = [(ref_label, lang==ref_label_lang), abstract] 
                          
                     for lang, trans in consolidated_trans.iteritems():
                         label, abstract = tuple(trans)
-                        DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label, abstract=abstract)
+                        DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label[0], is_label_translated=label[1], abstract=abstract[0], is_abstract_translated=abstract[1])
                     
                     
             except Exception as e: