web/hdalab/management/commands/query_dbpedia.py
author ymh <ymh.work@gmail.com>
Thu, 16 Feb 2012 21:48:40 +0100
changeset 119 e3ebe3545f72
child 135 dd6578e36a57
permissions -rw-r--r--
First implementation of the Django version. It mostly works but needs optimisation; this will be done after the update from Raphael.

# -*- coding: utf-8 -*-
'''
Created on Jan 30, 2012

@author: ymh
'''

from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style

from optparse import make_option
from django.db.models import Count
from django.db import transaction
from hdabo.models import Tag
from hdalab.models import DbpediaFields, TagLinks
from hdabo.utils import show_progress
from rdflib.graph import Graph
from rdflib import URIRef
import re

class Command(NoArgsCommand):
    '''
    Query dbpedia for the selected tags and store the result locally.

    Only tags with a non-null ``dbpedia_uri`` are considered. For each of
    them the ``/data/<name>.n3`` description of the resource is parsed with
    rdflib; the abstract, label and thumbnail found in it are saved in a
    ``DbpediaFields`` row, and a ``TagLinks`` row is created for every triple
    object that matches the ``dbpedia_uri`` of a known tag.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""
    
    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )
    
    def handle_noargs(self, **options):
        '''
        Entry point of the command.

        Reads the command line options, builds the tag queryset, asks for
        confirmation (unless ``--force`` or a non interactive run) and then
        processes every selected tag. An error on one tag is printed and
        does not stop the processing of the following ones.
        '''
        self.style = no_style()
        self.interactive = options.get('interactive', True)
        self.verbosity = int(options.get('verbosity', '1'))
        self.force = options.get('force', False)
        self.limit = options.get("limit", -1)
        self.start = options.get("start", 0)
        self.random = options.get('random', False)
                        
        if self.verbosity > 2:
            print("option passed : " + repr(options))

        self.tag_list = options.get("tags", [])

        queryset = self._build_queryset(options.get('all', False))
        
        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query,))

        count = queryset.count()
        
        if not self.force and self.interactive:
            confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'
            
        if confirm != "yes":
            print("dbpedia query cancelled")
            return

        # show_progress hands back an object that is passed to the next call
        # (presumably the progress writer - see hdabo.utils).
        writer = None
        for i, tag in enumerate(queryset):
            writer = show_progress(i + 1, count, tag.label, 50, writer)
            
            # http://.../resource/X  ->  http://.../data/X.n3
            rdf_uri = re.sub(r'\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"
            graph = Graph()
            try:
                graph.parse(rdf_uri, format="n3")
                # One transaction per tag: the links and the fields of a tag
                # are committed together or not at all.
                with transaction.commit_on_success():
                    self._store_graph(tag, graph)
            except Exception as e:
                # Deliberate best effort: report the failure and go on with
                # the next tag.
                print("\nError processing resource %s : %s" % (rdf_uri, unicode(e)))

    def _build_queryset(self, process_all):
        '''
        Build the queryset of the tags to process from the parsed options.

        ``process_all`` is the value of the ``--all`` option. When no explicit
        tag label is given and ``process_all`` is false, only the tags with no
        related ``dbpedia_fields`` are kept.
        '''
        queryset = Tag.objects.exclude(dbpedia_uri=None)
        
        if self.tag_list:
            queryset = queryset.filter(label__in=self.tag_list)
        elif not process_all:
            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc=0)

        if self.random:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")
        
        if self.limit >= 0:
            # --limit is a number of tags, not an end index: the slice must
            # end at start + limit, otherwise "--start 10 --limit 5" selects
            # nothing.
            queryset = queryset[self.start:self.start + self.limit]
        elif self.start > 0:
            queryset = queryset[self.start:]

        return queryset

    @staticmethod
    def _is_preferred_text(node, current):
        '''
        Tell if ``node`` must replace the ``current`` text value.

        A French literal always wins; an English one is only used as long as
        no value has been found yet. Nodes without a ``language`` attribute
        are never selected.
        '''
        if node is None or not hasattr(node, 'language'):
            return False
        return node.language == u"fr" or (current is None and node.language == u"en")

    def _store_graph(self, tag, graph):
        '''
        Save the information found in ``graph`` for ``tag``.

        Creates the ``TagLinks`` towards the known tags referenced in the
        graph, then creates or updates the ``DbpediaFields`` row of the tag.
        Must be called inside a transaction.
        '''
        # Built once here instead of once per triple.
        abstract_predicate = URIRef(u'http://dbpedia.org/ontology/abstract')
        label_predicate = URIRef(u'http://www.w3.org/2000/01/rdf-schema#label')
        thumbnail_predicate = URIRef(u'http://dbpedia.org/ontology/thumbnail')

        abstract = None
        label = None
        thumbnail = None
        for _subject, predicate, obj in graph:
            if predicate == abstract_predicate and self._is_preferred_text(obj, abstract):
                abstract = unicode(obj)
            if predicate == label_predicate and self._is_preferred_text(obj, label):
                label = unicode(obj)
            if predicate == thumbnail_predicate and obj is not None:
                thumbnail = unicode(obj)
            if u'http://dbpedia.org/resource' in obj:
                # The object looks like a dbpedia resource: link it to the
                # current tag when it is the uri of a tag we know.
                tagqs = Tag.objects.filter(dbpedia_uri=unicode(obj))
                if tagqs:
                    TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])
        
        dbfield, created = DbpediaFields.objects.get_or_create(
            dbpedia_uri=tag.dbpedia_uri,
            tag=tag,
            defaults={'abstract': abstract, 'label': label, 'thumbnail': thumbnail})
        if not created:
            # The row already existed: overwrite it with the fresh values.
            dbfield.abstract = abstract
            dbfield.label = label
            dbfield.thumbnail = thumbnail
            dbfield.save()