src/hdalab/management/commands/query_dbpedia.py
changeset 266 825ff4d6a8ac
parent 135 dd6578e36a57
child 272 1c774f7a0341
equal deleted inserted replaced
203:00fc169cc6a9 266:825ff4d6a8ac
       
     1 # -*- coding: utf-8 -*-
       
     2 '''
       
     3 Created on Jan 30, 2012
       
     4 
       
     5 @author: ymh
       
     6 '''
       
     7 
       
     8 from django.core.management.base import NoArgsCommand
       
     9 from django.core.management.color import no_style
       
    10 
       
    11 from optparse import make_option
       
    12 from django.conf import settings
       
    13 from django.db.models import Count
       
    14 from django.db import transaction
       
    15 from hdabo.models import Tag
       
    16 from hdalab.models import DbpediaFields, TagLinks
       
    17 from hdabo.utils import show_progress
       
    18 from rdflib.graph import Graph
       
    19 from rdflib import URIRef
       
    20 import re
       
    21 import traceback
       
    22 import sys
       
    23 from hdalab.models.dataviz import DbpediaFieldsTranslation
       
    24 from django import db
       
    25 
       
    26 class Command(NoArgsCommand):
       
    27     '''
       
    28     query and update wikipedia for tag title.
       
    29     '''
       
    30     options = ''
       
    31     help = """query and update wikipedia for tag title."""
       
    32     
       
    33     option_list = NoArgsCommand.option_list + (
       
    34         make_option('--all',
       
    35             action='store_true',
       
    36             dest='all',
       
    37             default=False,
       
    38             help='force all tags to be updated, not only those not yet processed'),
       
    39         make_option('--force',
       
    40             action='store_true',
       
    41             dest='force',
       
    42             default=False,
       
    43             help='ask no questions'),
       
    44         make_option('--random',
       
    45             action='store_true',
       
    46             dest='random',
       
    47             default=False,
       
    48             help='randomize query on tags'),
       
    49         make_option('--limit',
       
    50             action='store',
       
    51             type='int',
       
    52             dest='limit',
       
    53             default= -1,
       
    54             help='number of tag to process'),
       
    55         make_option('--start',
       
    56             action='store',
       
    57             type='int',
       
    58             dest='start',
       
    59             default=0,
       
    60             help='number of tag to ignore'),
       
    61         make_option('--tag',
       
    62             action='append',
       
    63             dest='tags',
       
    64             type='string',
       
    65             default=[],
       
    66             help='the tag to query'),
       
    67     )
       
    68     
       
    69     def handle_noargs(self, **options):
       
    70                 
       
    71         self.style = no_style()
       
    72         
       
    73         self.interactive = options.get('interactive', True)
       
    74         
       
    75         self.verbosity = int(options.get('verbosity', '1'))
       
    76         
       
    77         self.force = options.get('force', False)
       
    78         
       
    79         self.limit = options.get("limit", -1)
       
    80         self.start = options.get("start", 0)
       
    81         
       
    82         self.random = options.get('random', False)
       
    83                         
       
    84         if self.verbosity > 2:
       
    85             print "option passed : " + repr(options)
       
    86 
       
    87         self.tag_list = options.get("tags", []);
       
    88 
       
    89         queryset = Tag.objects.exclude(dbpedia_uri= None)
       
    90         
       
    91                 
       
    92         if self.tag_list:
       
    93             queryset = queryset.filter(label__in=self.tag_list)
       
    94         elif not options.get('all',False):            
       
    95             queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc = 0)
       
    96 
       
    97         if self.random:
       
    98             queryset = queryset.order_by("?")
       
    99         else:
       
   100             queryset = queryset.order_by("label")
       
   101         
       
   102         if self.limit >= 0:
       
   103             queryset = queryset[self.start:self.limit]
       
   104         elif self.start > 0:
       
   105             queryset = queryset[self.start:]
       
   106         
       
   107         if self.verbosity > 2 :
       
   108             print "Tag Query is %s" % (queryset.query)
       
   109 
       
   110         count = queryset.count()
       
   111         
       
   112         if count == 0:
       
   113             print "No tag to query : exit."
       
   114             return
       
   115             
       
   116             
       
   117         if not self.force and self.interactive:
       
   118             confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
       
   119         else:
       
   120             confirm = 'yes'
       
   121             
       
   122         if confirm != "yes":
       
   123             print "dbpedia query cancelled"
       
   124             return
       
   125 
       
   126         writer = None
       
   127         for i,tag in enumerate(queryset):
       
   128             writer = show_progress(i+1, count, tag.label, 50, writer)
       
   129             db.reset_queries()
       
   130             rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"            
       
   131             g = Graph()
       
   132             try :
       
   133                 g.parse(rdf_uri, format="n3")
       
   134             
       
   135                 with transaction.commit_on_success():
       
   136                     
       
   137                     abstracts = {}
       
   138                     labels = {}
       
   139                     thumbnail = None
       
   140                     for t in g:
       
   141                         if t[1] == URIRef(u'http://dbpedia.org/ontology/abstract') and t[2] is not None \
       
   142                             and hasattr(t[2], 'language'):
       
   143                             abstracts[t[2].language] = unicode(t[2])
       
   144                         if t[1] ==  URIRef(u'http://www.w3.org/2000/01/rdf-schema#label') and t[2] is not None \
       
   145                             and hasattr(t[2], 'language'):
       
   146                             labels[t[2].language] = unicode(t[2]) 
       
   147                         if t[1] == URIRef(u'http://dbpedia.org/ontology/thumbnail') and t[2] is not None:
       
   148                             thumbnail = unicode(t[2])
       
   149                         if u'http://dbpedia.org/resource' in t[2]:
       
   150                             tagqs = Tag.objects.filter(dbpedia_uri=unicode(t[2]))
       
   151                             if tagqs:
       
   152                                 TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])                        
       
   153                     
       
   154                     ref_label = labels['fr'] if 'fr' in labels else labels['en'] if 'en' in labels else labels.values()[0] if len(labels) > 0 else tag.label
       
   155                     ref_abstract = abstracts['fr'] if 'fr' in abstracts else abstracts['en'] if 'en' in abstracts else abstracts.values()[0] if len(abstracts) > 0 else None
       
   156                     for lang in settings.LANGUAGES:
       
   157                         if lang[0] not in labels:
       
   158                             labels[lang[0]]= ref_label
       
   159                         if lang[0] not in abstracts:
       
   160                             abstracts[lang[0]] = ref_abstract
       
   161 
       
   162                     dbfield , created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri,tag=tag, defaults={'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable                    
       
   163                     if not created:
       
   164                         dbfield.abstract = ref_abstract
       
   165                         dbfield.thumbnail = thumbnail
       
   166                         dbfield.label = ref_label
       
   167                         dbfield.save()
       
   168                         DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()
       
   169 
       
   170                     consolidated_trans = {}
       
   171                     for lang,label in labels.iteritems():
       
   172                         consolidated_trans[lang] = [label,ref_abstract]
       
   173                     for lang,abstract in abstracts.iteritems():
       
   174                         if lang in consolidated_trans:
       
   175                             consolidated_trans[lang][1] = abstract
       
   176                         else:
       
   177                             consolidated_trans[lang] = [ref_label, abstract] 
       
   178                          
       
   179                     for lang, trans in consolidated_trans.iteritems():
       
   180                         label, abstract = tuple(trans)
       
   181                         DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label, abstract=abstract)
       
   182                     
       
   183                     
       
   184             except Exception as e:
       
   185                 print "\nError processing resource %s : %s" %(rdf_uri,unicode(e))
       
   186                 traceback.print_exception(type(e), e, sys.exc_info()[2])
       
   187                 
       
   188 
       
   189 
       
   190             
       
   191             
       
   192