web/hdalab/management/commands/query_dbpedia.py
changeset 119 e3ebe3545f72
child 135 dd6578e36a57
equal deleted inserted replaced
118:fdf808d7d374 119:e3ebe3545f72
       
     1 # -*- coding: utf-8 -*-
       
     2 '''
       
     3 Created on Jan 30, 2012
       
     4 
       
     5 @author: ymh
       
     6 '''
       
     7 
       
     8 from django.core.management.base import NoArgsCommand
       
     9 from django.core.management.color import no_style
       
    10 
       
    11 from optparse import make_option
       
    12 from django.db.models import Count
       
    13 from django.db import transaction
       
    14 from hdabo.models import Tag
       
    15 from hdalab.models import DbpediaFields, TagLinks
       
    16 from hdabo.utils import show_progress
       
    17 from rdflib.graph import Graph
       
    18 from rdflib import URIRef
       
    19 import re
       
    20 
       
    21 class Command(NoArgsCommand):
       
    22     '''
       
    23     query and update wikipedia for tag title.
       
    24     '''
       
    25     options = ''
       
    26     help = """query and update wikipedia for tag title."""
       
    27     
       
    28     option_list = NoArgsCommand.option_list + (
       
    29         make_option('--all',
       
    30             action='store_true',
       
    31             dest='all',
       
    32             default=False,
       
    33             help='force all tags to be updated, not only those not yet processed'),
       
    34         make_option('--force',
       
    35             action='store_true',
       
    36             dest='force',
       
    37             default=False,
       
    38             help='ask no questions'),
       
    39         make_option('--random',
       
    40             action='store_true',
       
    41             dest='random',
       
    42             default=False,
       
    43             help='randomize query on tags'),
       
    44         make_option('--limit',
       
    45             action='store',
       
    46             type='int',
       
    47             dest='limit',
       
    48             default= -1,
       
    49             help='number of tag to process'),
       
    50         make_option('--start',
       
    51             action='store',
       
    52             type='int',
       
    53             dest='start',
       
    54             default=0,
       
    55             help='number of tag to ignore'),
       
    56         make_option('--tag',
       
    57             action='append',
       
    58             dest='tags',
       
    59             type='string',
       
    60             default=[],
       
    61             help='the tag to query'),
       
    62     )
       
    63     
       
    64     def handle_noargs(self, **options):
       
    65         
       
    66         self.style = no_style()
       
    67         
       
    68         self.interactive = options.get('interactive', True)
       
    69         
       
    70         self.verbosity = int(options.get('verbosity', '1'))
       
    71         
       
    72         self.force = options.get('force', False)
       
    73         
       
    74         self.limit = options.get("limit", -1)
       
    75         self.start = options.get("start", 0)
       
    76         
       
    77         self.random = options.get('random', False)
       
    78                         
       
    79         if self.verbosity > 2:
       
    80             print "option passed : " + repr(options)
       
    81 
       
    82         self.tag_list = options.get("tags", []);
       
    83 
       
    84         queryset = Tag.objects.exclude(dbpedia_uri= None)
       
    85         
       
    86                 
       
    87         if self.tag_list:
       
    88             queryset = queryset.filter(label__in=self.tag_list)
       
    89         elif not options.get('all',False):            
       
    90             queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc = 0)
       
    91 
       
    92         if self.random:
       
    93             queryset = queryset.order_by("?")
       
    94         else:
       
    95             queryset = queryset.order_by("label")
       
    96         
       
    97         if self.limit >= 0:
       
    98             queryset = queryset[self.start:self.limit]
       
    99         elif self.start > 0:
       
   100             queryset = queryset[self.start:]
       
   101         
       
   102         if self.verbosity > 2 :
       
   103             print "Tag Query is %s" % (queryset.query)
       
   104 
       
   105         count = queryset.count()
       
   106         
       
   107         if not self.force and self.interactive:
       
   108             confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
       
   109         else:
       
   110             confirm = 'yes'
       
   111             
       
   112         if confirm != "yes":
       
   113             print "dbpedia query cancelled"
       
   114             return
       
   115 
       
   116         writer = None
       
   117         for i,tag in enumerate(queryset):
       
   118             writer = show_progress(i+1, count, tag.label, 50, writer)
       
   119             
       
   120             rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"            
       
   121             g = Graph()
       
   122             try :
       
   123                 g.parse(rdf_uri, format="n3")
       
   124             
       
   125                 with transaction.commit_on_success():
       
   126                     
       
   127                     abstract = None
       
   128                     label = None
       
   129                     thumbnail = None
       
   130                     for t in g:
       
   131                         if t[1] == URIRef(u'http://dbpedia.org/ontology/abstract') and t[2] is not None \
       
   132                             and hasattr(t[2], 'language') and (t[2].language == u"fr" or (abstract is None and t[2].language == u"en")):
       
   133                             abstract = unicode(t[2])
       
   134                         if t[1] ==  URIRef(u'http://www.w3.org/2000/01/rdf-schema#label') and t[2] is not None \
       
   135                             and hasattr(t[2], 'language') and (t[2].language == u"fr" or (label is None and t[2].language == u"en")):
       
   136                             label = unicode(t[2]) 
       
   137                         if t[1] == URIRef(u'http://dbpedia.org/ontology/thumbnail') and t[2] is not None:
       
   138                             thumbnail = unicode(t[2])
       
   139                         if u'http://dbpedia.org/resource' in t[2]:
       
   140                             tagqs = Tag.objects.filter(dbpedia_uri=unicode(t[2]))
       
   141                             if tagqs:
       
   142                                 TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])                        
       
   143                     
       
   144                     dbfield , created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri,tag=tag, defaults={'abstract':abstract, 'label':label, 'thumbnail':thumbnail})
       
   145                     if not created:
       
   146                         dbfield.abstract = abstract
       
   147                         dbfield.label = label
       
   148                         dbfield.thumbnail = thumbnail
       
   149                         dbfield.save()
       
   150                     
       
   151             except Exception as e:
       
   152                 print "\nError processing resource %s : %s" %(rdf_uri,unicode(e))
       
   153 
       
   154 
       
   155             
       
   156             
       
   157