web/hdalab/management/commands/query_dbpedia.py
changeset 271 8f77cf71ab02
parent 265 73f19fa4f997
child 272 1c774f7a0341
equal deleted inserted replaced
265:73f19fa4f997 271:8f77cf71ab02
     1 # -*- coding: utf-8 -*-
       
     2 '''
       
     3 Created on Jan 30, 2012
       
     4 
       
     5 @author: ymh
       
     6 '''
       
     7 
       
     8 from django.core.management.base import NoArgsCommand
       
     9 from django.core.management.color import no_style
       
    10 
       
    11 from optparse import make_option
       
    12 from django.conf import settings
       
    13 from django.db.models import Count
       
    14 from django.db import transaction
       
    15 from hdabo.models import Tag
       
    16 from hdalab.models import DbpediaFields, TagLinks
       
    17 from hdabo.utils import show_progress
       
    18 from rdflib.graph import Graph
       
    19 from rdflib import URIRef
       
    20 import traceback
       
    21 import sys
       
    22 from hdalab.models.dataviz import DbpediaFieldsTranslation
       
    23 from django import db
       
    24 from SPARQLWrapper import SPARQLWrapper, RDF
       
    25 
       
    26 class Command(NoArgsCommand):
       
    27     '''
       
    28     query and update wikipedia for tag title.
       
    29     '''
       
    30     options = ''
       
    31     help = """query and update wikipedia for tag title."""
       
    32     
       
    33     option_list = NoArgsCommand.option_list + (
       
    34         make_option('--all',
       
    35             action='store_true',
       
    36             dest='all',
       
    37             default=False,
       
    38             help='force all tags to be updated, not only those not yet processed'),
       
    39         make_option('--force',
       
    40             action='store_true',
       
    41             dest='force',
       
    42             default=False,
       
    43             help='ask no questions'),
       
    44         make_option('--random',
       
    45             action='store_true',
       
    46             dest='random',
       
    47             default=False,
       
    48             help='randomize query on tags'),
       
    49         make_option('--limit',
       
    50             action='store',
       
    51             type='int',
       
    52             dest='limit',
       
    53             default= -1,
       
    54             help='number of tag to process'),
       
    55         make_option('--start',
       
    56             action='store',
       
    57             type='int',
       
    58             dest='start',
       
    59             default=0,
       
    60             help='number of tag to ignore'),
       
    61         make_option('--tag',
       
    62             action='append',
       
    63             dest='tags',
       
    64             type='string',
       
    65             default=[],
       
    66             help='the tag to query'),
       
    67     )
       
    68     
       
    69     def handle_noargs(self, **options):
       
    70                 
       
    71         self.style = no_style()
       
    72         
       
    73         self.interactive = options.get('interactive', True)
       
    74         
       
    75         self.verbosity = int(options.get('verbosity', '1'))
       
    76         
       
    77         self.force = options.get('force', False)
       
    78         
       
    79         self.limit = options.get("limit", -1)
       
    80         self.start = options.get("start", 0)
       
    81         
       
    82         self.random = options.get('random', False)
       
    83                         
       
    84         if self.verbosity > 2:
       
    85             print "option passed : " + repr(options)
       
    86 
       
    87         self.tag_list = options.get("tags", []);
       
    88 
       
    89         queryset = Tag.objects.exclude(dbpedia_uri= None)
       
    90         
       
    91                 
       
    92         if self.tag_list:
       
    93             queryset = queryset.filter(label__in=self.tag_list)
       
    94         elif not options.get('all',False):            
       
    95             queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc = 0)
       
    96 
       
    97         if self.random:
       
    98             queryset = queryset.order_by("?")
       
    99         else:
       
   100             queryset = queryset.order_by("label")
       
   101         
       
   102         if self.limit >= 0:
       
   103             queryset = queryset[self.start:self.limit]
       
   104         elif self.start > 0:
       
   105             queryset = queryset[self.start:]
       
   106         
       
   107         if self.verbosity > 2 :
       
   108             print "Tag Query is %s" % (queryset.query)
       
   109 
       
   110         count = queryset.count()
       
   111         
       
   112         if count == 0:
       
   113             print "No tag to query : exit."
       
   114             return
       
   115             
       
   116             
       
   117         if not self.force and self.interactive:
       
   118             confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
       
   119         else:
       
   120             confirm = 'yes'
       
   121             
       
   122         if confirm != "yes":
       
   123             print "dbpedia query cancelled"
       
   124             return
       
   125 
       
   126         endpoint = SPARQLWrapper("http://dbpedia.org/sparql", returnFormat=RDF)
       
   127 
       
   128         writer = None
       
   129         for i,tag in enumerate(queryset):
       
   130             writer = show_progress(i+1, count, tag.label, 50, writer)
       
   131             db.reset_queries()
       
   132             
       
   133             #abstract query
       
   134             #"select ?y
       
   135             # where {<%s>  <http://dbpedia.org/ontology/abstract> ?y}" % (tag.dbpedia_uri)
       
   136             
       
   137             #rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"            
       
   138             #g = Graph()
       
   139             try :
       
   140                 abstracts = {}
       
   141                 labels = {}
       
   142                 thumbnail = None
       
   143                 with transaction.commit_on_success():
       
   144                     endpoint.setQuery("select distinct ?y where {<%s>  <http://dbpedia.org/ontology/abstract> ?y}" % (tag.dbpedia_uri))
       
   145                     res_abstracts = endpoint.queryAndConvert()
       
   146                     for _,_,o in res_abstracts.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
       
   147                         abstracts[o.language] = (unicode(o), True)
       
   148 
       
   149                     endpoint.setQuery("select distinct ?y where {<%s>  <http://www.w3.org/2000/01/rdf-schema#label> ?y}" % (tag.dbpedia_uri))
       
   150                     res_labels = endpoint.queryAndConvert()
       
   151                     for _,_,o in res_labels.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
       
   152                         labels[o.language] = (unicode(o), True)
       
   153                             
       
   154                     endpoint.setQuery("select distinct ?y where {<%s>  <http://dbpedia.org/ontology/thumbnail> ?y} limit 1" % (tag.dbpedia_uri))
       
   155                     res_thumbnails = endpoint.queryAndConvert()
       
   156                     for _,_,o in res_thumbnails.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
       
   157                         thumbnail = unicode(o)
       
   158 
       
   159                     endpoint.setQuery('select distinct ?y where { <%s> ?p ?y . FILTER regex(?y, "^http://dbpedia.org/resource")}' % (tag.dbpedia_uri))
       
   160                     res_links = endpoint.queryAndConvert()
       
   161                     for _,_,o in res_links.triples((None, URIRef('http://www.w3.org/2005/sparql-results#value'), None)):
       
   162                         tagqs = Tag.objects.filter(dbpedia_uri=unicode(o))
       
   163                         if tagqs:
       
   164                             TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])                        
       
   165                     
       
   166                     ref_label_lang, (ref_label, _) = ('fr',labels['fr']) if 'fr' in labels else ('en',labels['en']) if 'en' in labels else labels.items()[0] if len(labels) > 0 else ('fr',(tag.label, True))
       
   167                     ref_abstract_lang, (ref_abstract, _) = ('fr',abstracts['fr']) if 'fr' in abstracts else ('en',abstracts['en']) if 'en' in abstracts else abstracts.items()[0] if len(abstracts) > 0 else ('fr',(None, 'True'))
       
   168 
       
   169                     for lang in settings.LANGUAGES:
       
   170                         if lang[0] not in labels:
       
   171                             labels[lang[0]]= (ref_label, False)
       
   172                         if lang[0] not in abstracts:
       
   173                             abstracts[lang[0]] = (ref_abstract, False)
       
   174 
       
   175                     dbfield , created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri,tag=tag, defaults={'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable                    
       
   176                     if not created:
       
   177                         dbfield.abstract = ref_abstract
       
   178                         dbfield.thumbnail = thumbnail
       
   179                         dbfield.label = ref_label
       
   180                         dbfield.save()
       
   181                         DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()
       
   182 
       
   183                     consolidated_trans = {}
       
   184                     for lang,label in labels.iteritems():
       
   185                         consolidated_trans[lang] = [label,(ref_abstract, lang==ref_abstract_lang)]
       
   186                     for lang,abstract in abstracts.iteritems():
       
   187                         if lang in consolidated_trans:
       
   188                             consolidated_trans[lang][1] = abstract
       
   189                         else:
       
   190                             consolidated_trans[lang] = [(ref_label, lang==ref_label_lang), abstract] 
       
   191                          
       
   192                     for lang, trans in consolidated_trans.iteritems():
       
   193                         label, abstract = tuple(trans)
       
   194                         DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label[0], is_label_translated=label[1], abstract=abstract[0], is_abstract_translated=abstract[1])
       
   195                     
       
   196                     
       
   197             except Exception as e:
       
   198                 print "\nError processing resource %s : %s" %(rdf_uri,unicode(e))
       
   199                 traceback.print_exception(type(e), e, sys.exc_info()[2])
       
   200                 
       
   201 
       
   202 
       
   203             
       
   204             
       
   205