Corrections in dbpedia label import
author veltr
Fri, 16 Aug 2013 16:32:06 +0200
changeset 89 fa40437d5991
parent 88 87443e64bece
child 90 90a5258b3cc4
Corrections in dbpedia label import
src/jocondelab/management/commands/import_dbpedia_translations.py
--- a/src/jocondelab/management/commands/import_dbpedia_translations.py	Fri Aug 16 15:28:40 2013 +0200
+++ b/src/jocondelab/management/commands/import_dbpedia_translations.py	Fri Aug 16 16:32:06 2013 +0200
@@ -14,6 +14,9 @@
 from SPARQLWrapper import SPARQLWrapper2
 from optparse import make_option
 import traceback
+import sys
+import urllib
+import json
 
 class Command(NoArgsCommand):
     
@@ -30,7 +33,6 @@
     
     def handle_noargs(self, **options):
         
-        langstr = ','.join(['"%s"'%l[0] for l in settings.LANGUAGES])
         endpointre = re.compile('^http:\/\/\w+\.?dbpedia.org')
         
         qs = DbpediaResource.objects
@@ -46,29 +48,36 @@
         
         for i,obj in enumerate(qs):
             writer = show_progress(i+1, count, obj.uri, 50, writer)
-            
-            try:
-                with transaction.commit_on_success():
-                    DbpediaTranslation.objects.filter(dbpediaresource=obj).delete()
-                    endpointuri = settings.WIKIPEDIA_URLS[obj.lang]['dbpedia_sparql_url']
-                    endpoint = SPARQLWrapper2(endpointuri)
-                    sparql = """
-                    select lang(?label) as ?lang, ?label where {
-                        <%s> rdfs:label ?label .
-                        FILTER (lang(?label) IN (%s))
-                    }
-                    """%(obj.uri, langstr)
-                    endpoint.setQuery(sparql)
-                    results = endpoint.query()
-                    
-                    for binding in results.bindings:
-                        DbpediaTranslation.objects.create(
-                            dbpediaresource = obj,
-                            lang = binding[u"lang"].value,
-                            label = binding[u"label"].value
-                                                           )
-            except Exception as e:
-                print "\nError processing resource %s : %s" %(obj.uri,unicode(e))
-                traceback.print_exception(type(e), e, sys.exc_info()[2])
+            for langtuple in settings.LANGUAGES:
+                lang = langtuple[0]
+                try:
+                    with transaction.commit_on_success():
+                        DbpediaTranslation.objects.filter(dbpediaresource=obj).delete()
+                        endpointuri = settings.WIKIPEDIA_URLS[obj.lang]['dbpedia_sparql_url']
+                        endpoint = SPARQLWrapper2(endpointuri)
+                        uri = urllib.unquote(str(obj.uri)).decode('utf8')
+                        sparql = u"""
+                        select ?label, ?rlabel where {
+                            OPTIONAL { <%s> rdfs:label ?label FILTER (lang(?label) = "%s") } .
+                            OPTIONAL { <%s> dbpedia-owl:wikiPageRedirects ?r }.
+                            OPTIONAL { ?r rdfs:label ?rlabel FILTER (lang(?rlabel) = "%s") }.
+                        }
+                        """%(uri, lang, uri, lang)
+                        endpoint.setQuery(sparql)
+                        results = endpoint.query()
+                        
+                        if len(results.bindings):
+                            binding = results.bindings[0]
+                            label = binding[u"label"].value if "label" in binding else binding[u"rlabel"].value if "rlabel" in binding else None
+                            if label is not None:
+                                DbpediaTranslation.objects.create(
+                                    dbpediaresource = obj,
+                                    lang = lang,
+                                    label = label
+                                )
+                        
+                except Exception as e:
+                    print "\nError processing resource %s : %s" %(obj.uri,unicode(e))
+                    traceback.print_exception(type(e), e, sys.exc_info()[2])
                 
         
\ No newline at end of file