src/hdalab/management/commands/query_dbpedia.py
author ymh <ymh.work@gmail.com>
Fri, 19 Jul 2024 09:38:03 +0200
changeset 704 b5835dca2624
parent 571 d9642be7c937
child 693 09e00f38d177
permissions -rw-r--r--
Adapt renkan preview to use chrome headless/puppeteer

# -*- coding: utf-8 -*-
'''
Created on Jan 30, 2012

@author: ymh
'''

from hdabo.models import Tag
from hdabo.utils import show_progress
from hdalab.models import DbpediaFields, TagLinks
from hdalab.models.dataviz import DbpediaFieldsTranslation
import logging
from optparse import make_option
import sys
import traceback

from django import db
from django.conf import settings
from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style
from django.db import transaction
from django.db.models import Count
from rdflib import URIRef, Graph
import requests


logger = logging.getLogger(__name__)

class Command(NoArgsCommand):
    '''
    Query DBpedia for tags that have a dbpedia_uri and store what comes back.

    For every selected Tag the command fetches the abstracts, labels,
    thumbnail and outgoing resource links of the tag's DBpedia resource,
    then rewrites the matching DbpediaFields row, its
    DbpediaFieldsTranslation rows and the TagLinks towards other known tags.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    # Predicate carrying each bound value when the SPARQL SELECT result set
    # is read back as an RDF graph.
    SPARQL_RESULT_VALUE = URIRef('http://www.w3.org/2005/sparql-results#value')

    def query_dbpedia(self, query, fmt='n3'):
        '''
        Run a SPARQL query against the configured endpoint.

        :param query: the SPARQL query text.
        :param fmt: rdflib parser name; also selects the MIME type requested
            from the endpoint ('n3', 'rdf/xml' or 'nt'; anything else asks
            for 'text/turtle').
        :returns: an rdflib Graph parsed from the response body.
        :raises requests.exceptions.HTTPError: when the endpoint answers with
            a 4xx/5xx status.
        '''
        url = settings.DBPEDIA_URI_TEMPLATE % ('sparql', '')
        mime_types = {
            'n3': 'text/turtle',
            'rdf/xml': "application/rdf+xml",
            'nt': 'text/plain',
        }
        params = {
            'query': query,
            'format': mime_types.get(fmt, 'text/turtle'),
        }
        resp = requests.get(url, params=params)
        # Log first so the body of a failed request is still available.
        logger.debug("Query dbpedia : %s", resp.text)
        # Do not try to parse an error page as RDF: fail the tag instead.
        resp.raise_for_status()
        return Graph().parse(data=resp.text, format=fmt)

    def _select_values(self, query):
        '''Run a SELECT query and return the list of values it bound.'''
        graph = self.query_dbpedia(query, 'n3')
        pattern = (None, self.SPARQL_RESULT_VALUE, None)
        return [value for _, _, value in graph.triples(pattern)]

    @staticmethod
    def _pick_reference(values, fallback):
        '''
        Choose the reference text among per-language values.

        :param values: dict mapping a language code to a (text, flag) pair.
        :param fallback: text to use when ``values`` is empty.
        :returns: a (language, text) pair; 'fr' is preferred, then 'en',
            then any available language, then ('fr', fallback).
        '''
        for lang in ('fr', 'en'):
            if lang in values:
                return lang, values[lang][0]
        if values:
            lang, (text, _) = next(iter(values.items()))
            return lang, text
        return 'fr', fallback

    def _build_queryset(self, process_all):
        '''
        Build the queryset of tags to process from the parsed options.

        :param process_all: when false (and no explicit tag was given) only
            tags without any dbpedia_fields row are kept.
        '''
        queryset = Tag.objects.exclude(dbpedia_uri=None)

        if self.tag_list:
            queryset = queryset.filter(label__in=self.tag_list)
        elif not process_all:
            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc=0)

        queryset = queryset.order_by("?" if self.random else "label")

        # Django querysets reject negative indices, so clamp the offset.
        start = max(self.start, 0)
        if self.limit >= 0:
            # --limit is a number of tags, not an end index: offset it by
            # --start so that both options can be combined.
            queryset = queryset[start:start + self.limit]
        elif start > 0:
            queryset = queryset[start:]
        return queryset

    def _update_tag(self, tag):
        '''
        Fetch the DBpedia data of one tag and store it.

        Expected to run inside a transaction so that a failure leaves the
        previously stored data of the tag untouched.
        '''
        uri = tag.dbpedia_uri

        # Both dicts map a language code to (text, True); True marks a value
        # that really came from DBpedia in that language.
        abstracts = {}
        for value in self._select_values("select distinct ?y where {<%s>  <http://dbpedia.org/ontology/abstract> ?y}" % (uri)):
            abstracts[value.language] = (unicode(value), True)
        logger.debug("Abstracts: %r", abstracts)

        labels = {}
        for value in self._select_values("select distinct ?y where {<%s>  <http://www.w3.org/2000/01/rdf-schema#label> ?y}" % (uri)):
            labels[value.language] = (unicode(value), True)
        logger.debug("Labels: %r", labels)

        thumbnail = None
        for value in self._select_values("select distinct ?y where {<%s>  <http://dbpedia.org/ontology/thumbnail> ?y} limit 1" % (uri)):
            thumbnail = unicode(value)

        # Link this tag to every other known tag its resource points to.
        links_query = 'select distinct ?y where { <%s> ?p ?y . FILTER regex(?y, "^%s")}' % (uri, settings.DBPEDIA_URI_TEMPLATE % ('resource', ''))
        for value in self._select_values(links_query):
            tagqs = Tag.objects.filter(dbpedia_uri=unicode(value))
            if tagqs:
                TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])

        # The references must be chosen before the gaps are filled below.
        ref_label_lang, ref_label = self._pick_reference(labels, tag.label)
        ref_abstract_lang, ref_abstract = self._pick_reference(abstracts, None)

        # Every configured language gets a value; False marks a fallback.
        for lang in settings.LANGUAGES:
            labels.setdefault(lang[0], (ref_label, False))
            abstracts.setdefault(lang[0], (ref_abstract, False))

        dbfield, created = DbpediaFields.objects.get_or_create(tag=tag, defaults={'dbpedia_uri': uri, 'abstract': ref_abstract, 'thumbnail': thumbnail, 'label': ref_label}) #@UndefinedVariable
        if not created:
            dbfield.dbpedia_uri = uri
            dbfield.abstract = ref_abstract
            dbfield.thumbnail = thumbnail
            dbfield.label = ref_label
            dbfield.save()
            # Translations are rebuilt from scratch below.
            DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()

        # One translation row per language known for the label or the
        # abstract; the missing half falls back on the reference text.
        for lang in set(labels) | set(abstracts):
            label = labels.get(lang, (ref_label, lang == ref_label_lang))
            abstract = abstracts.get(lang, (ref_abstract, lang == ref_abstract_lang))
            DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label[0], is_label_translated=label[1], abstract=abstract[0], is_abstract_translated=abstract[1])

    def handle_noargs(self, **options):
        '''
        Entry point of the command.

        Reads the options, selects the tags, asks for confirmation unless
        --force or a non-interactive run was requested, then processes each
        tag in its own transaction. An error on one tag is printed and does
        not stop the run.
        '''
        self.style = no_style()

        self.interactive = options.get('interactive', True)

        self.verbosity = int(options.get('verbosity', '1'))

        self.force = options.get('force', False)

        self.limit = options.get("limit", -1)
        self.start = options.get("start", 0)

        self.random = options.get('random', False)

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        self.tag_list = options.get("tags", [])

        queryset = self._build_queryset(options.get('all', False))

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query,))

        count = queryset.count()

        if count == 0:
            print("No tag to query : exit.")
            return

        if not self.force and self.interactive:
            confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("dbpedia query cancelled")
            return

        # The value returned by show_progress is handed back on the next call.
        writer = None
        for i, tag in enumerate(queryset):
            writer = show_progress(i + 1, count, tag.label, 50, writer)
            # Empty Django's query log so it does not grow during a long run.
            db.reset_queries()

            try:
                with transaction.atomic():
                    self._update_tag(tag)
            except Exception as e:
                # Best effort: report the failure and move on to the next tag.
                if tag.dbpedia_uri:
                    print("\nError processing resource %s : %s" % (tag.dbpedia_uri, unicode(e)))
                else:
                    print("\nError processing resource %s" % unicode(e))
                traceback.print_exc()