# -*- coding: utf-8 -*-
'''
Created on Jan 30, 2012

@author: ymh
'''
# NOTE: this text was recovered from the removed side of a Mercurial diff
# (73f19fa4f997 -> 8f77cf71ab02) of
# web/hdalab/management/commands/query_dbpedia.py.

from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style

from optparse import make_option
from django.conf import settings
from django.db.models import Count
from django.db import transaction
from hdabo.models import Tag
from hdalab.models import DbpediaFields, TagLinks
from hdabo.utils import show_progress
from rdflib.graph import Graph
from rdflib import URIRef
import traceback
import sys
from hdalab.models.dataviz import DbpediaFieldsTranslation
from django import db
from SPARQLWrapper import SPARQLWrapper, RDF

# Predicate used to pull the bound values out of the RDF graph returned by
# the endpoint for a SELECT query.
SPARQL_RESULT_VALUE = URIRef('http://www.w3.org/2005/sparql-results#value')

# NOTE(review): in the text this module was recovered from, the first three
# queries had no predicate between the subject and ?y (which is not valid
# SPARQL, and made the three queries identical). The predicate IRIs below are
# the conventional DBpedia / RDFS ones for these fields -- confirm them
# against the intended schema.
ABSTRACT_QUERY = "select distinct ?y where {<%s> <http://dbpedia.org/ontology/abstract> ?y}"
LABEL_QUERY = "select distinct ?y where {<%s> <http://www.w3.org/2000/01/rdf-schema#label> ?y}"
THUMBNAIL_QUERY = "select distinct ?y where {<%s> <http://dbpedia.org/ontology/thumbnail> ?y} limit 1"
LINKS_QUERY = 'select distinct ?y where { <%s> ?p ?y . FILTER regex(?y, "^http://dbpedia.org/resource")}'


class Command(NoArgsCommand):
    '''
    query and update wikipedia for tag title.

    For every selected Tag that has a dbpedia_uri, the command queries the
    DBpedia SPARQL endpoint for abstracts, labels, a thumbnail and linked
    resources, then stores the result in DbpediaFields,
    DbpediaFieldsTranslation and TagLinks.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    def handle_noargs(self, **options):
        '''
        Entry point of the command.

        Reads the command line options, builds the list of tags to process,
        asks for confirmation (unless --force or non-interactive) and then
        updates the tags one by one. A failure on one tag is reported on
        stdout with its traceback and does not stop the processing of the
        following tags.
        '''
        self.style = no_style()

        self.interactive = options.get('interactive', True)
        self.verbosity = int(options.get('verbosity', '1'))
        self.force = options.get('force', False)
        self.limit = options.get("limit", -1)
        self.start = options.get("start", 0)
        self.random = options.get('random', False)

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        self.tag_list = options.get("tags", [])

        queryset = self._build_queryset(options.get('all', False))

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query))

        count = queryset.count()

        if count == 0:
            print("No tag to query : exit.")
            return

        if not self.force and self.interactive:
            confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("dbpedia query cancelled")
            return

        endpoint = SPARQLWrapper("http://dbpedia.org/sparql", returnFormat=RDF)

        writer = None
        for i, tag in enumerate(queryset):
            writer = show_progress(i + 1, count, tag.label, 50, writer)
            db.reset_queries()

            try:
                # One transaction per tag: a failure only discards the
                # writes made for that tag.
                with transaction.commit_on_success():
                    self._update_tag(endpoint, tag)
            except Exception as e:
                # Best effort: report and go on with the next tag. The message
                # uses tag.dbpedia_uri; the previous code referenced an
                # undefined name here, which raised NameError inside the
                # handler and aborted the whole run.
                print("\nError processing resource %s : %s" % (tag.dbpedia_uri, unicode(e)))
                traceback.print_exception(type(e), e, sys.exc_info()[2])

    def _build_queryset(self, update_all):
        '''
        Build the queryset of the tags to process from the parsed options.

        Only tags with a dbpedia_uri are considered. If tag labels were given
        with --tag, only those are kept; otherwise, unless update_all is
        true, only the tags without any dbpedia_fields row are kept.
        '''
        queryset = Tag.objects.exclude(dbpedia_uri=None)

        if self.tag_list:
            queryset = queryset.filter(label__in=self.tag_list)
        elif not update_all:
            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc=0)

        if self.random:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")

        if self.limit >= 0:
            # --limit is a number of tags to process, not an end index, so
            # the slice must end at start + limit.
            queryset = queryset[self.start:self.start + self.limit]
        elif self.start > 0:
            queryset = queryset[self.start:]

        return queryset

    def _query_values(self, endpoint, query):
        '''Run query on endpoint and return the list of the values bound in the result.'''
        endpoint.setQuery(query)
        result_graph = endpoint.queryAndConvert()
        return [o for _, _, o in result_graph.triples((None, SPARQL_RESULT_VALUE, None))]

    @staticmethod
    def _pick_reference(translations, fallback):
        '''
        Choose the reference text among translations.

        translations maps a language code to a (text, is_translated) pair.
        Returns (language, text), preferring 'fr', then 'en', then any
        available language; ('fr', fallback) when translations is empty.
        '''
        for lang in ('fr', 'en'):
            if lang in translations:
                return lang, translations[lang][0]
        if translations:
            lang, (text, _) = next(iter(translations.items()))
            return lang, text
        return 'fr', fallback

    def _update_tag(self, endpoint, tag):
        '''
        Query the endpoint for one tag and store the result in the database.

        Creates or updates the DbpediaFields row of the tag, replaces its
        DbpediaFieldsTranslation rows and adds TagLinks to the other known
        tags the resource points to.
        '''
        uri = tag.dbpedia_uri

        # language -> (text, True); True marks a text really found for that
        # language, as opposed to the fallbacks added further down.
        abstracts = {}
        for o in self._query_values(endpoint, ABSTRACT_QUERY % (uri)):
            abstracts[o.language] = (unicode(o), True)

        labels = {}
        for o in self._query_values(endpoint, LABEL_QUERY % (uri)):
            labels[o.language] = (unicode(o), True)

        thumbnail = None
        for o in self._query_values(endpoint, THUMBNAIL_QUERY % (uri)):
            thumbnail = unicode(o)

        # Link the tag to every other tag whose dbpedia_uri is referenced by
        # this resource.
        for o in self._query_values(endpoint, LINKS_QUERY % (uri)):
            tagqs = Tag.objects.filter(dbpedia_uri=unicode(o))
            if tagqs:
                TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])

        # The references must be chosen before the fallbacks are added below.
        ref_label_lang, ref_label = self._pick_reference(labels, tag.label)
        ref_abstract_lang, ref_abstract = self._pick_reference(abstracts, None)

        # Every configured language gets a value; the ones filled here are
        # flagged as not translated.
        for lang in settings.LANGUAGES:
            if lang[0] not in labels:
                labels[lang[0]] = (ref_label, False)
            if lang[0] not in abstracts:
                abstracts[lang[0]] = (ref_abstract, False)

        dbfield, created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri, tag=tag, defaults={'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable
        if not created:
            dbfield.abstract = ref_abstract
            dbfield.thumbnail = thumbnail
            dbfield.label = ref_label
            dbfield.save()
            # The translations are rebuilt from scratch below.
            DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()

        # language -> [(label, is_translated), (abstract, is_translated)]
        consolidated_trans = {}
        for lang, label in labels.iteritems():
            consolidated_trans[lang] = [label, (ref_abstract, lang == ref_abstract_lang)]
        for lang, abstract in abstracts.iteritems():
            if lang in consolidated_trans:
                consolidated_trans[lang][1] = abstract
            else:
                consolidated_trans[lang] = [(ref_label, lang == ref_label_lang), abstract]

        for lang, trans in consolidated_trans.iteritems():
            label, abstract = tuple(trans)
            DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label[0], is_label_translated=label[1], abstract=abstract[0], is_abstract_translated=abstract[1])