# -*- coding: utf-8 -*-
# Recovered from the removed ("-") side of the Mercurial diff
# 00fc169cc6a9 -> 825ff4d6a8ac of
# web/hdalab/management/commands/query_dbpedia.py.
'''
Created on Jan 30, 2012

@author: ymh
'''

from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style

from optparse import make_option
from django.conf import settings
from django.db.models import Count
from django.db import transaction
from hdabo.models import Tag
from hdalab.models import DbpediaFields, TagLinks
from hdabo.utils import show_progress
from rdflib.graph import Graph
from rdflib import URIRef
import re
import traceback
import sys
from hdalab.models.dataviz import DbpediaFieldsTranslation
from django import db

# Predicates looked up on every triple; built once instead of once per triple.
_ABSTRACT_PREDICATE = URIRef(u'http://dbpedia.org/ontology/abstract')
_LABEL_PREDICATE = URIRef(u'http://www.w3.org/2000/01/rdf-schema#label')
_THUMBNAIL_PREDICATE = URIRef(u'http://dbpedia.org/ontology/thumbnail')

# Substring identifying an object that points at another dbpedia resource.
_RESOURCE_MARKER = u'http://dbpedia.org/resource'


def _pick_reference(by_lang, fallback):
    '''
    Return the preferred value of a {language: text} dict.

    Preference order is French, then English, then whichever entry the dict
    yields first; ``fallback`` is returned when the dict is empty.
    '''
    for lang in ('fr', 'en'):
        if lang in by_lang:
            return by_lang[lang]
    if by_lang:
        return next(iter(by_lang.values()))
    return fallback


class Command(NoArgsCommand):
    '''
    Fetch the dbpedia N3 description of tags and store it locally.

    For every selected tag that has a ``dbpedia_uri``, the command downloads
    the matching ``/data/<name>.n3`` document, then creates or refreshes the
    tag's DbpediaFields row, its per-language DbpediaFieldsTranslation rows
    and TagLinks rows towards other known tags referenced by the document.

    The module is written for Python 2 (``raw_input``, ``unicode``) and for
    the Django API visible in its imports (``NoArgsCommand``,
    ``transaction.commit_on_success``).
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    def handle_noargs(self, **options):
        '''
        Entry point: select the tags, ask for confirmation, process each tag.

        Recognised options: ``interactive``, ``verbosity``, ``force``,
        ``limit``, ``start``, ``random``, ``tags`` and ``all``.
        '''
        self.style = no_style()

        self.interactive = options.get('interactive', True)
        self.verbosity = int(options.get('verbosity', '1'))
        self.force = options.get('force', False)
        self.limit = options.get("limit", -1)
        self.start = options.get("start", 0)
        self.random = options.get('random', False)

        # print(...) with a single argument prints the same text under the
        # Python 2 print statement.
        if self.verbosity > 2:
            print("option passed : " + repr(options))

        self.tag_list = options.get("tags", [])

        queryset = self._build_queryset(options.get('all', False))

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query))

        count = queryset.count()
        if count == 0:
            print("No tag to query : exit.")
            return

        if not self._confirm(count):
            print("dbpedia query cancelled")
            return

        writer = None
        for i, tag in enumerate(queryset):
            writer = show_progress(i + 1, count, tag.label, 50, writer)
            # Clear Django's per-connection query log on every iteration.
            db.reset_queries()
            self._process_tag(tag)

    def _build_queryset(self, update_all):
        '''
        Build the queryset of tags to process from the parsed options.

        Explicit ``--tag`` labels take precedence; otherwise, unless
        ``update_all`` is true, only tags without any dbpedia_fields are kept.
        '''
        queryset = Tag.objects.exclude(dbpedia_uri=None)

        if self.tag_list:
            queryset = queryset.filter(label__in=self.tag_list)
        elif not update_all:
            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc=0)

        if self.random:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")

        if self.limit >= 0:
            # --limit is a number of tags, not an end index: the slice must
            # end at start + limit, otherwise combining --start and --limit
            # returns too few (or zero) tags.
            queryset = queryset[self.start:self.start + self.limit]
        elif self.start > 0:
            queryset = queryset[self.start:]

        return queryset

    def _confirm(self, count):
        '''
        Return True when processing may proceed.

        The user is only prompted when the command is interactive and
        ``--force`` was not given; any answer other than ``yes`` cancels.
        '''
        if self.force or not self.interactive:
            return True
        answer = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        return answer == "yes"

    def _process_tag(self, tag):
        '''
        Download and store the dbpedia data of one tag.

        Any failure is reported on stdout/stderr and swallowed so that one
        bad resource does not abort the whole run.
        '''
        rdf_uri = tag.dbpedia_uri.replace("/resource/", "/data/") + ".n3"
        graph = Graph()
        try:
            graph.parse(rdf_uri, format="n3")
            with transaction.commit_on_success():
                self._store_graph(tag, graph)
        except Exception as e:
            # Deliberate best-effort boundary: report and move to the next tag.
            print("\nError processing resource %s : %s" % (rdf_uri, unicode(e)))
            traceback.print_exception(type(e), e, sys.exc_info()[2])

    def _store_graph(self, tag, graph):
        '''
        Persist the abstracts, labels, thumbnail and links found in ``graph``.

        Expected to be called inside a transaction (see _process_tag).
        '''
        abstracts = {}
        labels = {}
        thumbnail = None

        for _subject, predicate, obj in graph:
            has_language = obj is not None and hasattr(obj, 'language')
            if predicate == _ABSTRACT_PREDICATE and has_language:
                abstracts[obj.language] = unicode(obj)
            if predicate == _LABEL_PREDICATE and has_language:
                labels[obj.language] = unicode(obj)
            if predicate == _THUMBNAIL_PREDICATE and obj is not None:
                thumbnail = unicode(obj)
            if _RESOURCE_MARKER in obj:
                # Link to the referenced resource only if it is a known tag.
                tagqs = Tag.objects.filter(dbpedia_uri=unicode(obj))
                if tagqs:
                    TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])

        ref_label = _pick_reference(labels, tag.label)
        ref_abstract = _pick_reference(abstracts, None)

        # Every configured site language gets a value, defaulting to the
        # reference label/abstract when dbpedia provides none.
        for lang in settings.LANGUAGES:
            code = lang[0]
            if code not in labels:
                labels[code] = ref_label
            if code not in abstracts:
                abstracts[code] = ref_abstract

        dbfield, created = DbpediaFields.objects.get_or_create(
            dbpedia_uri=tag.dbpedia_uri,
            tag=tag,
            defaults={'abstract': ref_abstract, 'thumbnail': thumbnail, 'label': ref_label})  #@UndefinedVariable
        if not created:
            dbfield.abstract = ref_abstract
            dbfield.thumbnail = thumbnail
            dbfield.label = ref_label
            dbfield.save()

        # Translations are rebuilt from scratch on every run.
        DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()

        # Merge both dicts into {language: [label, abstract]}; a language
        # present in only one dict borrows the reference value for the other.
        consolidated_trans = {}
        for lang, label in labels.items():
            consolidated_trans[lang] = [label, ref_abstract]
        for lang, abstract in abstracts.items():
            if lang in consolidated_trans:
                consolidated_trans[lang][1] = abstract
            else:
                consolidated_trans[lang] = [ref_label, abstract]

        for lang, (label, abstract) in consolidated_trans.items():
            DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label, abstract=abstract)