src/hdalab/management/commands/query_dbpedia.py
branch documentation
changeset 693 09e00f38d177
parent 571 d9642be7c937
--- a/src/hdalab/management/commands/query_dbpedia.py	Thu Apr 12 01:27:16 2018 +0200
+++ b/src/hdalab/management/commands/query_dbpedia.py	Wed Apr 11 12:19:47 2018 +0200
@@ -1,8 +1,26 @@
 # -*- coding: utf-8 -*-
 '''
-Created on Jan 30, 2012
+Requête DBPedia pour renseigner les objets :class:`hdabo.models.Tag`.
+Seuls les tags sémantisés sont traités.
+
+Les données suivantes sont moissonnées:
+
+  - label dans toutes les langues disponibles
+  - résumé dans toutes les langues disponibles
+  - thumbnail
+  - lien entre les tags
 
-@author: ymh
+**Usage**: ``django-admin query_dbpedia [options]``
+
+**Options spécifiques:**
+
+    - *\-\-all* :               force le traitement de tous les tags
+    - *\-\-random* :            traite les tags dans un ordre aléatoire
+    - *\-\-force* :             ne pose aucune question
+    - *\-\-limit=LIMIT* :       nombre de tags à traiter
+    - *\-\-start=START* :       nombre de tags à ignorer
+    - *\-\-tag=TAG* :           limite le traitement à ce tag
+
 '''
 
 from hdabo.models import Tag
@@ -32,7 +50,7 @@
     '''
     options = ''
     help = """query and update wikipedia for tag title."""
-    
+
     option_list = NoArgsCommand.option_list + (
         make_option('--all',
             action='store_true',
@@ -68,7 +86,7 @@
             default=[],
             help='the tag to query'),
     )
-    
+
     def query_dbpedia(self, query, fmt='n3'):
         url = settings.DBPEDIA_URI_TEMPLATE % ( 'sparql', '' )
         params = {
@@ -81,58 +99,58 @@
 
 
     def handle_noargs(self, **options):
-                
+
         self.style = no_style()
-        
+
         self.interactive = options.get('interactive', True)
-        
+
         self.verbosity = int(options.get('verbosity', '1'))
-        
+
         self.force = options.get('force', False)
-        
+
         self.limit = options.get("limit", -1)
         self.start = options.get("start", 0)
-        
+
         self.random = options.get('random', False)
-                        
+
         if self.verbosity > 2:
             print "option passed : " + repr(options)
 
         self.tag_list = options.get("tags", []);
 
         queryset = Tag.objects.exclude(dbpedia_uri= None)
-        
-                
+
+
         if self.tag_list:
             queryset = queryset.filter(label__in=self.tag_list)
-        elif not options.get('all',False):            
+        elif not options.get('all',False):
             queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc = 0)
 
         if self.random:
             queryset = queryset.order_by("?")
         else:
             queryset = queryset.order_by("label")
-        
+
         if self.limit >= 0:
             queryset = queryset[self.start:self.limit]
         elif self.start > 0:
             queryset = queryset[self.start:]
-        
+
         if self.verbosity > 2 :
             print "Tag Query is %s" % (queryset.query)
 
         count = queryset.count()
-        
+
         if count == 0:
             print "No tag to query : exit."
             return
-            
-            
+
+
         if not self.force and self.interactive:
             confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
         else:
             confirm = 'yes'
-            
+
         if confirm != "yes":
             print "dbpedia query cancelled"
             return
@@ -141,12 +159,12 @@
         for i,tag in enumerate(queryset):
             writer = show_progress(i+1, count, tag.label, 50, writer)
             db.reset_queries()
-            
+
             #abstract query
             #"select ?y
             # where {<%s>  <http://dbpedia.org/ontology/abstract> ?y}" % (tag.dbpedia_uri)
-            
-            #rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"            
+
+            #rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"
             #g = Graph()
             try :
                 abstracts = {}
@@ -157,12 +175,12 @@
                     for _,_,o in res_abstracts.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
                         abstracts[o.language] = (unicode(o), True)
                     logger.debug("Abstracts: %r" % abstracts)
-                    
+
                     res_labels = self.query_dbpedia("select distinct ?y where {<%s>  <http://www.w3.org/2000/01/rdf-schema#label> ?y}" % (tag.dbpedia_uri), 'n3')
                     for _,_,o in res_labels.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
                         labels[o.language] = (unicode(o), True)
                     logger.debug("Labels: %r" % labels)
-                    
+
                     res_thumbnails = self.query_dbpedia("select distinct ?y where {<%s>  <http://dbpedia.org/ontology/thumbnail> ?y} limit 1" % (tag.dbpedia_uri), 'n3')
                     for _,_,o in res_thumbnails.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
                         thumbnail = unicode(o)
@@ -172,7 +190,7 @@
                         tagqs = Tag.objects.filter(dbpedia_uri=unicode(o))
                         if tagqs:
                             TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])
-                    
+
                     ref_label_lang, (ref_label, _) = ('fr',labels['fr']) if 'fr' in labels else ('en',labels['en']) if 'en' in labels else labels.items()[0] if len(labels) > 0 else ('fr',(tag.label, True))
                     ref_abstract_lang, (ref_abstract, _) = ('fr',abstracts['fr']) if 'fr' in abstracts else ('en',abstracts['en']) if 'en' in abstracts else abstracts.items()[0] if len(abstracts) > 0 else ('fr',(None, 'True'))
 
@@ -182,7 +200,7 @@
                         if lang[0] not in abstracts:
                             abstracts[lang[0]] = (ref_abstract, False)
 
-                    dbfield , created = DbpediaFields.objects.get_or_create(tag=tag, defaults={'dbpedia_uri':tag.dbpedia_uri, 'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable                    
+                    dbfield , created = DbpediaFields.objects.get_or_create(tag=tag, defaults={'dbpedia_uri':tag.dbpedia_uri, 'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable
                     if not created:
                         dbfield.dbpedia_uri = tag.dbpedia_uri
                         dbfield.abstract = ref_abstract
@@ -198,22 +216,22 @@
                         if lang in consolidated_trans:
                             consolidated_trans[lang][1] = abstract
                         else:
-                            consolidated_trans[lang] = [(ref_label, lang==ref_label_lang), abstract] 
-                         
+                            consolidated_trans[lang] = [(ref_label, lang==ref_label_lang), abstract]
+
                     for lang, trans in consolidated_trans.iteritems():
                         label, abstract = tuple(trans)
                         DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label[0], is_label_translated=label[1], abstract=abstract[0], is_abstract_translated=abstract[1])
-                    
-                    
+
+
             except Exception as e:
                 if tag.dbpedia_uri:
                     print "\nError processing resource %s : %s" %(tag.dbpedia_uri,unicode(e))
                 else:
                     print "\nError processing resource %s" % unicode(e)
                 traceback.print_exception(type(e), e, sys.exc_info()[2])
-                
+
 
 
-            
-            
+
 
+