# -*- coding: utf-8 -*-
'''
Created on Jan 30, 2012

@author: ymh
'''

from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style

from optparse import make_option
from django.db.models import Count
from django.db import transaction
from hdabo.models import Tag
from hdalab.models import DbpediaFields, TagLinks
from hdabo.utils import show_progress
from rdflib.graph import Graph
from rdflib import URIRef
import re


class Command(NoArgsCommand):
    '''
    Query dbpedia for the selected tags and store the result locally.

    For every selected tag that has a dbpedia URI, the N3 description of the
    resource is downloaded and parsed. The abstract, label and thumbnail found
    in it are saved in DbpediaFields, and a TagLinks row is created for each
    triple object that contains 'http://dbpedia.org/resource' and matches the
    dbpedia_uri of an existing Tag.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    # Predicates looked up in every parsed graph; built once instead of once
    # per triple.
    _ABSTRACT = URIRef(u'http://dbpedia.org/ontology/abstract')
    _LABEL = URIRef(u'http://www.w3.org/2000/01/rdf-schema#label')
    _THUMBNAIL = URIRef(u'http://dbpedia.org/ontology/thumbnail')

    def handle_noargs(self, **options):
        '''
        Entry point of the command.

        Copies the command line options onto the instance, builds the tag
        selection, asks for confirmation (unless --force was given or the
        command is not interactive) and then processes the tags one by one,
        reporting progress with show_progress.
        '''
        self.style = no_style()

        self.interactive = options.get('interactive', True)
        self.verbosity = int(options.get('verbosity', '1'))
        self.force = options.get('force', False)
        self.limit = options.get("limit", -1)
        self.start = options.get("start", 0)
        self.random = options.get('random', False)

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        self.tag_list = options.get("tags", [])

        queryset = self._build_queryset(options.get('all', False))

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query))

        count = queryset.count()

        if not self.force and self.interactive:
            confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("dbpedia query cancelled")
            return

        writer = None
        for i, tag in enumerate(queryset):
            writer = show_progress(i + 1, count, tag.label, 50, writer)
            self._process_tag(tag)

    def _build_queryset(self, process_all):
        '''
        Return the queryset of tags to process.

        Only tags with a dbpedia_uri are considered. When --tag values were
        given they restrict the selection by label; otherwise, unless
        process_all is true, only tags without any dbpedia_fields are kept.
        The result is ordered (randomly or by label) and then windowed with
        the start/limit options.
        '''
        queryset = Tag.objects.exclude(dbpedia_uri=None)

        if self.tag_list:
            queryset = queryset.filter(label__in=self.tag_list)
        elif not process_all:
            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc=0)

        if self.random:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")

        if self.limit >= 0:
            # --limit is a number of tags to process, not an end index, so
            # the window ends at start + limit.
            queryset = queryset[self.start:self.start + self.limit]
        elif self.start > 0:
            queryset = queryset[self.start:]

        return queryset

    @staticmethod
    def _localized(current, value):
        '''
        Return the text to keep for a language-tagged value.

        A French value always wins; an English value is used only while
        nothing has been kept yet. Values without a language attribute
        (including None) leave current unchanged.
        '''
        language = getattr(value, 'language', None)
        if language == u"fr" or (current is None and language == u"en"):
            return unicode(value)
        return current

    def _process_tag(self, tag):
        '''
        Download the dbpedia description of one tag and store it.

        Any exception raised while fetching, parsing or saving is reported on
        standard output and swallowed, so that one failing resource does not
        stop the whole run.
        '''
        rdf_uri = re.sub(r'/resource/', "/data/", tag.dbpedia_uri) + ".n3"
        graph = Graph()
        try:
            graph.parse(rdf_uri, format="n3")

            # All database writes for one tag happen in a single transaction.
            with transaction.commit_on_success():

                abstract = None
                label = None
                thumbnail = None
                for _subject, predicate, obj in graph:
                    if predicate == self._ABSTRACT:
                        abstract = self._localized(abstract, obj)
                    if predicate == self._LABEL:
                        label = self._localized(label, obj)
                    if predicate == self._THUMBNAIL and obj is not None:
                        thumbnail = unicode(obj)
                    # NOTE(review): the indentation of the original was lost;
                    # this check is assumed to apply to every triple rather
                    # than only to thumbnail triples - confirm.
                    if u'http://dbpedia.org/resource' in obj:
                        tagqs = Tag.objects.filter(dbpedia_uri=unicode(obj))
                        if tagqs:
                            TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])

                dbfield, created = DbpediaFields.objects.get_or_create(
                    dbpedia_uri=tag.dbpedia_uri,
                    tag=tag,
                    defaults={'abstract': abstract, 'label': label, 'thumbnail': thumbnail})
                if not created:
                    # The row already existed: overwrite it with fresh values.
                    dbfield.abstract = abstract
                    dbfield.label = label
                    dbfield.thumbnail = thumbnail
                    dbfield.save()

        except Exception as e:
            print("\nError processing resource %s : %s" % (rdf_uri, unicode(e)))