diff -r b7d19cd87fcf -r 09e00f38d177 src/hdalab/management/commands/query_dbpedia.py --- a/src/hdalab/management/commands/query_dbpedia.py Thu Apr 12 01:27:16 2018 +0200 +++ b/src/hdalab/management/commands/query_dbpedia.py Wed Apr 11 12:19:47 2018 +0200 @@ -1,8 +1,26 @@ # -*- coding: utf-8 -*- ''' -Created on Jan 30, 2012 +Requête DBPedia pour renseigner les objets :class:`hdabo.models.Tag`. +Seuls les tags sémantisés sont traités. + +Les données suivantes sont moissonnées: + + - label dans toutes les langues disponibles + - résumé dans toutes les langues disponibles + - thumbnail + - lien entre les tags -@author: ymh +**Usage**: ``django-admin query_dbpedia [options]`` + +**Options spécifiques:** + + - *\-\-all* : force à traiter tous les tags + - *\-\-random* : faire le traitement des tags dans un ordre aléatoire + - *\-\-force* : ne pose aucune question + - *\-\-limit=LIMIT* : Nombre de tags à traiter + - *\-\-start=START* : Nombre de tags à ignorer + - *\-\-tag=TAG* : Limite le traitement à ce tag + ''' from hdabo.models import Tag @@ -32,7 +50,7 @@ ''' options = '' help = """query and update wikipedia for tag title.""" - + option_list = NoArgsCommand.option_list + ( make_option('--all', action='store_true', @@ -68,7 +86,7 @@ default=[], help='the tag to query'), ) - + def query_dbpedia(self, query, fmt='n3'): url = settings.DBPEDIA_URI_TEMPLATE % ( 'sparql', '' ) params = { @@ -81,58 +99,58 @@ def handle_noargs(self, **options): - + self.style = no_style() - + self.interactive = options.get('interactive', True) - + self.verbosity = int(options.get('verbosity', '1')) - + self.force = options.get('force', False) - + self.limit = options.get("limit", -1) self.start = options.get("start", 0) - + self.random = options.get('random', False) - + if self.verbosity > 2: print "option passed : " + repr(options) self.tag_list = options.get("tags", []); queryset = Tag.objects.exclude(dbpedia_uri= None) - - + + if self.tag_list: queryset = queryset.filter(label__in=self.tag_list) - elif not options.get('all',False): + elif not options.get('all',False): queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc = 0) if self.random: queryset = queryset.order_by("?") else: queryset = queryset.order_by("label") - + if self.limit >= 0: queryset = queryset[self.start:self.limit] elif self.start > 0: queryset = queryset[self.start:] - + if self.verbosity > 2 : print "Tag Query is %s" % (queryset.query) count = queryset.count() - + if count == 0: print "No tag to query : exit." return - - + + if not self.force and self.interactive: confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count)) else: confirm = 'yes' - + if confirm != "yes": print "dbpedia query cancelled" return @@ -141,12 +159,12 @@ for i,tag in enumerate(queryset): writer = show_progress(i+1, count, tag.label, 50, writer) db.reset_queries() - + #abstract query #"select ?y # where {<%s> ?y}" % (tag.dbpedia_uri) - - #rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3" + + #rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3" #g = Graph() try : abstracts = {} @@ -157,12 +175,12 @@ for _,_,o in res_abstracts.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)): abstracts[o.language] = (unicode(o), True) logger.debug("Abstracts: %r" % abstracts) - + res_labels = self.query_dbpedia("select distinct ?y where {<%s> ?y}" % (tag.dbpedia_uri), 'n3') for _,_,o in res_labels.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)): labels[o.language] = (unicode(o), True) logger.debug("Labels: %r" % labels) - + res_thumbnails = self.query_dbpedia("select distinct ?y where {<%s> ?y} limit 1" % (tag.dbpedia_uri), 'n3') for _,_,o in res_thumbnails.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)): thumbnail = unicode(o) @@ -172,7 +190,7 @@ tagqs = Tag.objects.filter(dbpedia_uri=unicode(o)) if tagqs: TagLinks.objects.get_or_create(subject=tag, object=tagqs[0]) - + ref_label_lang, (ref_label, _) = ('fr',labels['fr']) if 'fr' in labels else ('en',labels['en']) if 'en' in labels else labels.items()[0] if len(labels) > 0 else ('fr',(tag.label, True)) ref_abstract_lang, (ref_abstract, _) = ('fr',abstracts['fr']) if 'fr' in abstracts else ('en',abstracts['en']) if 'en' in abstracts else abstracts.items()[0] if len(abstracts) > 0 else ('fr',(None, 'True')) @@ -182,7 +200,7 @@ if lang[0] not in abstracts: abstracts[lang[0]] = (ref_abstract, False) - dbfield , created = DbpediaFields.objects.get_or_create(tag=tag, defaults={'dbpedia_uri':tag.dbpedia_uri, 'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable + dbfield , created = DbpediaFields.objects.get_or_create(tag=tag, defaults={'dbpedia_uri':tag.dbpedia_uri, 'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable if not created: dbfield.dbpedia_uri = tag.dbpedia_uri dbfield.abstract = ref_abstract @@ -198,22 +216,22 @@ if lang in consolidated_trans: consolidated_trans[lang][1] = abstract else: - consolidated_trans[lang] = [(ref_label, lang==ref_label_lang), abstract] - + consolidated_trans[lang] = [(ref_label, lang==ref_label_lang), abstract] + for lang, trans in consolidated_trans.iteritems(): label, abstract = tuple(trans) DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label[0], is_label_translated=label[1], abstract=abstract[0], is_abstract_translated=abstract[1]) - - + + except Exception as e: if tag.dbpedia_uri: print "\nError processing resource %s : %s" %(tag.dbpedia_uri,unicode(e)) else: print "\nError processing resource %s" % unicode(e) traceback.print_exception(type(e), e, sys.exc_info()[2]) - + - - + +