web/hdalab/management/commands/query_dbpedia.py
changeset 135 dd6578e36a57
parent 119 e3ebe3545f72
child 240 c8627191f2d7
equal deleted inserted replaced
134:75f8f05f9a60 135:dd6578e36a57
     7 
     7 
     8 from django.core.management.base import NoArgsCommand
     8 from django.core.management.base import NoArgsCommand
     9 from django.core.management.color import no_style
     9 from django.core.management.color import no_style
    10 
    10 
    11 from optparse import make_option
    11 from optparse import make_option
       
    12 from django.conf import settings
    12 from django.db.models import Count
    13 from django.db.models import Count
    13 from django.db import transaction
    14 from django.db import transaction
    14 from hdabo.models import Tag
    15 from hdabo.models import Tag
    15 from hdalab.models import DbpediaFields, TagLinks
    16 from hdalab.models import DbpediaFields, TagLinks
    16 from hdabo.utils import show_progress
    17 from hdabo.utils import show_progress
    17 from rdflib.graph import Graph
    18 from rdflib.graph import Graph
    18 from rdflib import URIRef
    19 from rdflib import URIRef
    19 import re
    20 import re
       
    21 import traceback
       
    22 import sys
       
    23 from hdalab.models.dataviz import DbpediaFieldsTranslation
       
    24 from django import db
    20 
    25 
    21 class Command(NoArgsCommand):
    26 class Command(NoArgsCommand):
    22     '''
    27     '''
    23     query and update wikipedia for tag title.
    28     query and update wikipedia for tag title.
    24     '''
    29     '''
    60             default=[],
    65             default=[],
    61             help='the tag to query'),
    66             help='the tag to query'),
    62     )
    67     )
    63     
    68     
    64     def handle_noargs(self, **options):
    69     def handle_noargs(self, **options):
    65         
    70                 
    66         self.style = no_style()
    71         self.style = no_style()
    67         
    72         
    68         self.interactive = options.get('interactive', True)
    73         self.interactive = options.get('interactive', True)
    69         
    74         
    70         self.verbosity = int(options.get('verbosity', '1'))
    75         self.verbosity = int(options.get('verbosity', '1'))
   102         if self.verbosity > 2 :
   107         if self.verbosity > 2 :
   103             print "Tag Query is %s" % (queryset.query)
   108             print "Tag Query is %s" % (queryset.query)
   104 
   109 
   105         count = queryset.count()
   110         count = queryset.count()
   106         
   111         
       
   112         if count == 0:
       
   113             print "No tag to query : exit."
       
   114             return
       
   115             
       
   116             
   107         if not self.force and self.interactive:
   117         if not self.force and self.interactive:
   108             confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
   118             confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
   109         else:
   119         else:
   110             confirm = 'yes'
   120             confirm = 'yes'
   111             
   121             
   114             return
   124             return
   115 
   125 
   116         writer = None
   126         writer = None
   117         for i,tag in enumerate(queryset):
   127         for i,tag in enumerate(queryset):
   118             writer = show_progress(i+1, count, tag.label, 50, writer)
   128             writer = show_progress(i+1, count, tag.label, 50, writer)
   119             
   129             db.reset_queries()
   120             rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"            
   130             rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"            
   121             g = Graph()
   131             g = Graph()
   122             try :
   132             try :
   123                 g.parse(rdf_uri, format="n3")
   133                 g.parse(rdf_uri, format="n3")
   124             
   134             
   125                 with transaction.commit_on_success():
   135                 with transaction.commit_on_success():
   126                     
   136                     
   127                     abstract = None
   137                     abstracts = {}
   128                     label = None
   138                     labels = {}
   129                     thumbnail = None
   139                     thumbnail = None
   130                     for t in g:
   140                     for t in g:
   131                         if t[1] == URIRef(u'http://dbpedia.org/ontology/abstract') and t[2] is not None \
   141                         if t[1] == URIRef(u'http://dbpedia.org/ontology/abstract') and t[2] is not None \
   132                             and hasattr(t[2], 'language') and (t[2].language == u"fr" or (abstract is None and t[2].language == u"en")):
   142                             and hasattr(t[2], 'language'):
   133                             abstract = unicode(t[2])
   143                             abstracts[t[2].language] = unicode(t[2])
   134                         if t[1] ==  URIRef(u'http://www.w3.org/2000/01/rdf-schema#label') and t[2] is not None \
   144                         if t[1] ==  URIRef(u'http://www.w3.org/2000/01/rdf-schema#label') and t[2] is not None \
   135                             and hasattr(t[2], 'language') and (t[2].language == u"fr" or (label is None and t[2].language == u"en")):
   145                             and hasattr(t[2], 'language'):
   136                             label = unicode(t[2]) 
   146                             labels[t[2].language] = unicode(t[2]) 
   137                         if t[1] == URIRef(u'http://dbpedia.org/ontology/thumbnail') and t[2] is not None:
   147                         if t[1] == URIRef(u'http://dbpedia.org/ontology/thumbnail') and t[2] is not None:
   138                             thumbnail = unicode(t[2])
   148                             thumbnail = unicode(t[2])
   139                         if u'http://dbpedia.org/resource' in t[2]:
   149                         if u'http://dbpedia.org/resource' in t[2]:
   140                             tagqs = Tag.objects.filter(dbpedia_uri=unicode(t[2]))
   150                             tagqs = Tag.objects.filter(dbpedia_uri=unicode(t[2]))
   141                             if tagqs:
   151                             if tagqs:
   142                                 TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])                        
   152                                 TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])                        
   143                     
   153                     
   144                     dbfield , created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri,tag=tag, defaults={'abstract':abstract, 'label':label, 'thumbnail':thumbnail})
   154                     ref_label = labels['fr'] if 'fr' in labels else labels['en'] if 'en' in labels else labels.values()[0] if len(labels) > 0 else tag.label
       
   155                     ref_abstract = abstracts['fr'] if 'fr' in abstracts else abstracts['en'] if 'en' in abstracts else abstracts.values()[0] if len(abstracts) > 0 else None
       
   156                     for lang in settings.LANGUAGES:
       
   157                         if lang[0] not in labels:
       
   158                             labels[lang[0]]= ref_label
       
   159                         if lang[0] not in abstracts:
       
   160                             abstracts[lang[0]] = ref_abstract
       
   161 
       
   162                     dbfield , created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri,tag=tag, defaults={'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable                    
   145                     if not created:
   163                     if not created:
   146                         dbfield.abstract = abstract
   164                         dbfield.abstract = ref_abstract
   147                         dbfield.label = label
       
   148                         dbfield.thumbnail = thumbnail
   165                         dbfield.thumbnail = thumbnail
       
   166                         dbfield.label = ref_label
   149                         dbfield.save()
   167                         dbfield.save()
       
   168                         DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()
       
   169 
       
   170                     consolidated_trans = {}
       
   171                     for lang,label in labels.iteritems():
       
   172                         consolidated_trans[lang] = [label,ref_abstract]
       
   173                     for lang,abstract in abstracts.iteritems():
       
   174                         if lang in consolidated_trans:
       
   175                             consolidated_trans[lang][1] = abstract
       
   176                         else:
       
   177                             consolidated_trans[lang] = [ref_label, abstract] 
       
   178                          
       
   179                     for lang, trans in consolidated_trans.iteritems():
       
   180                         label, abstract = tuple(trans)
       
   181                         DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label, abstract=abstract)
       
   182                     
   150                     
   183                     
   151             except Exception as e:
   184             except Exception as e:
   152                 print "\nError processing resource %s : %s" %(rdf_uri,unicode(e))
   185                 print "\nError processing resource %s : %s" %(rdf_uri,unicode(e))
       
   186                 traceback.print_exception(type(e), e, sys.exc_info()[2])
       
   187                 
   153 
   188 
   154 
   189 
   155             
   190             
   156             
   191             
   157 
   192