# -*- coding: utf-8 -*-
# Source: src/hdalab/management/commands/query_dbpedia.py (changeset 8f77cf71ab02)
'''
Management command that queries the DBpedia SPARQL endpoint for every tag
carrying a ``dbpedia_uri`` and stores the returned labels, abstracts,
thumbnail and links to other tags.

Created on Jan 30, 2012

@author: ymh
'''

from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style

from optparse import make_option
from django.conf import settings
from django.db.models import Count
from django.db import transaction
from hdabo.models import Tag
from hdalab.models import DbpediaFields, TagLinks
from hdabo.utils import show_progress
from rdflib.graph import Graph
from rdflib import URIRef
import traceback
import sys
from hdalab.models.dataviz import DbpediaFieldsTranslation
from django import db
from SPARQLWrapper import SPARQLWrapper, RDF

SPARQL_ENDPOINT_URL = "http://dbpedia.org/sparql"

# Predicate under which the bound value of a result variable is exposed when
# the SPARQL result set is converted to an RDF graph.
SPARQL_RESULT_VALUE = URIRef('http://www.w3.org/2005/sparql-results#value')

# NOTE(review): in the reviewed text the three single-value queries read
# "{<%s> ?y}", which is not a valid triple pattern (the predicate IRI is
# missing, most likely stripped as if it were markup). The predicates below
# are the standard DBpedia / RDFS ones matching the variable names used by
# the original code -- confirm against the repository copy.
ABSTRACT_PREDICATE = 'http://dbpedia.org/ontology/abstract'
LABEL_PREDICATE = 'http://www.w3.org/2000/01/rdf-schema#label'
THUMBNAIL_PREDICATE = 'http://dbpedia.org/ontology/thumbnail'

VALUE_QUERY = "select distinct ?y where {<%s> <%s> ?y}"
THUMBNAIL_QUERY = "select distinct ?y where {<%s> <%s> ?y} limit 1"
LINKS_QUERY = 'select distinct ?y where { <%s> ?p ?y . FILTER regex(?y, "^http://dbpedia.org/resource")}'


def _pick_reference(values, fallback):
    '''
    Choose the reference text among per-language values.

    ``values`` maps a language code to a ``(text, is_translated)`` tuple.
    French is preferred, then English, then any available language. When
    ``values`` is empty, ``('fr', fallback)`` is returned.

    Returns a ``(language_code, text)`` tuple.
    '''
    for lang in ('fr', 'en'):
        if lang in values:
            return lang, values[lang][0]
    if values:
        lang, (text, _) = next(iter(values.items()))
        return lang, text
    return 'fr', fallback


class Command(NoArgsCommand):
    '''
    query and update wikipedia for tag title.

    For each selected tag the command runs four SPARQL queries (abstracts,
    labels, thumbnail, linked resources), then creates or updates the
    matching DbpediaFields row, rebuilds its DbpediaFieldsTranslation rows
    and records TagLinks towards tags whose dbpedia_uri is referenced.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    def handle_noargs(self, **options):
        '''
        Entry point: select the tags, ask for confirmation, then query
        DBpedia for each tag inside its own transaction.

        A failure on one tag is printed with its traceback and does not stop
        the processing of the remaining tags.
        '''
        self.style = no_style()

        self.interactive = options.get('interactive', True)
        self.verbosity = int(options.get('verbosity', '1'))
        self.force = options.get('force', False)
        self.limit = options.get("limit", -1)
        self.start = options.get("start", 0)
        self.random = options.get('random', False)

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        self.tag_list = options.get("tags", [])

        queryset = self._build_queryset(options.get('all', False))

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query))

        count = queryset.count()
        if count == 0:
            print("No tag to query : exit.")
            return

        if not self._confirm(count):
            print("dbpedia query cancelled")
            return

        endpoint = SPARQLWrapper(SPARQL_ENDPOINT_URL, returnFormat=RDF)

        writer = None
        for i, tag in enumerate(queryset):
            writer = show_progress(i + 1, count, tag.label, 50, writer)
            # Reset Django's per-connection query log on every iteration.
            db.reset_queries()
            try:
                with transaction.commit_on_success():
                    self._update_tag(endpoint, tag)
            except Exception as e:
                # The original referenced an undefined name (rdf_uri) here,
                # which turned every handled error into a NameError.
                print("\nError processing resource %s : %s" % (tag.dbpedia_uri, unicode(e)))
                traceback.print_exception(type(e), e, sys.exc_info()[2])

    def _build_queryset(self, process_all):
        '''
        Build the Tag queryset from the parsed options.

        Only tags with a dbpedia_uri are considered. When --tag values were
        given they select the tags by label; otherwise, unless
        ``process_all`` is true, only tags without any dbpedia_fields are
        kept. Ordering is random or by label, then --start / --limit apply.
        '''
        queryset = Tag.objects.exclude(dbpedia_uri=None)

        if self.tag_list:
            queryset = queryset.filter(label__in=self.tag_list)
        elif not process_all:
            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc=0)

        if self.random:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")

        if self.limit >= 0:
            # --limit is a number of tags, not an end index: offset it by
            # --start so that "--start 10 --limit 5" yields five tags.
            queryset = queryset[self.start:self.start + self.limit]
        elif self.start > 0:
            queryset = queryset[self.start:]

        return queryset

    def _confirm(self, count):
        '''
        Return True when processing may proceed.

        The user is prompted only when the command is interactive and
        --force was not given; any answer other than "yes" cancels.
        '''
        if self.force or not self.interactive:
            return True
        answer = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        return answer == "yes"

    def _query_values(self, endpoint, query):
        '''
        Run ``query`` on ``endpoint`` and return the list of bound values
        found in the RDF-converted result.
        '''
        endpoint.setQuery(query)
        result = endpoint.queryAndConvert()
        return [o for _, _, o in result.triples((None, SPARQL_RESULT_VALUE, None))]

    def _update_tag(self, endpoint, tag):
        '''
        Query DBpedia for ``tag`` and store the result.

        Creates or updates the DbpediaFields row of the tag, replaces its
        translations and adds TagLinks to every known tag whose dbpedia_uri
        is referenced by the resource.
        '''
        abstracts = {}
        for o in self._query_values(endpoint, VALUE_QUERY % (tag.dbpedia_uri, ABSTRACT_PREDICATE)):
            abstracts[o.language] = (unicode(o), True)

        labels = {}
        for o in self._query_values(endpoint, VALUE_QUERY % (tag.dbpedia_uri, LABEL_PREDICATE)):
            labels[o.language] = (unicode(o), True)

        thumbnail = None
        for o in self._query_values(endpoint, THUMBNAIL_QUERY % (tag.dbpedia_uri, THUMBNAIL_PREDICATE)):
            thumbnail = unicode(o)

        for o in self._query_values(endpoint, LINKS_QUERY % (tag.dbpedia_uri)):
            tagqs = Tag.objects.filter(dbpedia_uri=unicode(o))
            if tagqs:
                TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])

        ref_label_lang, ref_label = _pick_reference(labels, tag.label)
        ref_abstract_lang, ref_abstract = _pick_reference(abstracts, None)

        # Languages configured in settings but absent from DBpedia receive
        # the reference text, flagged as not translated.
        for lang in settings.LANGUAGES:
            if lang[0] not in labels:
                labels[lang[0]] = (ref_label, False)
            if lang[0] not in abstracts:
                abstracts[lang[0]] = (ref_abstract, False)

        dbfield, created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri, tag=tag, defaults={'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable
        if not created:
            dbfield.abstract = ref_abstract
            dbfield.thumbnail = thumbnail
            dbfield.label = ref_label
            dbfield.save()
            # Translations are rebuilt from scratch below.
            DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()

        # language code -> [(label, is_translated), (abstract, is_translated)]
        consolidated_trans = {}
        for lang, label in labels.items():
            consolidated_trans[lang] = [label, (ref_abstract, lang == ref_abstract_lang)]
        for lang, abstract in abstracts.items():
            if lang in consolidated_trans:
                consolidated_trans[lang][1] = abstract
            else:
                consolidated_trans[lang] = [(ref_label, lang == ref_label_lang), abstract]

        for lang, trans in consolidated_trans.items():
            label, abstract = tuple(trans)
            DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label[0], is_label_translated=label[1], abstract=abstract[0], is_abstract_translated=abstract[1])