web/hdalab/management/commands/query_dbpedia.py
changeset 266 825ff4d6a8ac
parent 203 00fc169cc6a9
child 267 24ff98f2a122
--- a/web/hdalab/management/commands/query_dbpedia.py	Fri Jun 22 19:16:46 2012 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,192 +0,0 @@
-# -*- coding: utf-8 -*-
-'''
-Created on Jan 30, 2012
-
-@author: ymh
-'''
-
-from django.core.management.base import NoArgsCommand
-from django.core.management.color import no_style
-
-from optparse import make_option
-from django.conf import settings
-from django.db.models import Count
-from django.db import transaction
-from hdabo.models import Tag
-from hdalab.models import DbpediaFields, TagLinks
-from hdabo.utils import show_progress
-from rdflib.graph import Graph
-from rdflib import URIRef
-import re
-import traceback
-import sys
-from hdalab.models.dataviz import DbpediaFieldsTranslation
-from django import db
-
-class Command(NoArgsCommand):
-    '''
-    query and update wikipedia for tag title.
-    '''
-    options = ''
-    help = """query and update wikipedia for tag title."""
-    
-    option_list = NoArgsCommand.option_list + (
-        make_option('--all',
-            action='store_true',
-            dest='all',
-            default=False,
-            help='force all tags to be updated, not only those not yet processed'),
-        make_option('--force',
-            action='store_true',
-            dest='force',
-            default=False,
-            help='ask no questions'),
-        make_option('--random',
-            action='store_true',
-            dest='random',
-            default=False,
-            help='randomize query on tags'),
-        make_option('--limit',
-            action='store',
-            type='int',
-            dest='limit',
-            default= -1,
-            help='number of tag to process'),
-        make_option('--start',
-            action='store',
-            type='int',
-            dest='start',
-            default=0,
-            help='number of tag to ignore'),
-        make_option('--tag',
-            action='append',
-            dest='tags',
-            type='string',
-            default=[],
-            help='the tag to query'),
-    )
-    
-    def handle_noargs(self, **options):
-                
-        self.style = no_style()
-        
-        self.interactive = options.get('interactive', True)
-        
-        self.verbosity = int(options.get('verbosity', '1'))
-        
-        self.force = options.get('force', False)
-        
-        self.limit = options.get("limit", -1)
-        self.start = options.get("start", 0)
-        
-        self.random = options.get('random', False)
-                        
-        if self.verbosity > 2:
-            print "option passed : " + repr(options)
-
-        self.tag_list = options.get("tags", []);
-
-        queryset = Tag.objects.exclude(dbpedia_uri= None)
-        
-                
-        if self.tag_list:
-            queryset = queryset.filter(label__in=self.tag_list)
-        elif not options.get('all',False):            
-            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc = 0)
-
-        if self.random:
-            queryset = queryset.order_by("?")
-        else:
-            queryset = queryset.order_by("label")
-        
-        if self.limit >= 0:
-            queryset = queryset[self.start:self.limit]
-        elif self.start > 0:
-            queryset = queryset[self.start:]
-        
-        if self.verbosity > 2 :
-            print "Tag Query is %s" % (queryset.query)
-
-        count = queryset.count()
-        
-        if count == 0:
-            print "No tag to query : exit."
-            return
-            
-            
-        if not self.force and self.interactive:
-            confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
-        else:
-            confirm = 'yes'
-            
-        if confirm != "yes":
-            print "dbpedia query cancelled"
-            return
-
-        writer = None
-        for i,tag in enumerate(queryset):
-            writer = show_progress(i+1, count, tag.label, 50, writer)
-            db.reset_queries()
-            rdf_uri = re.sub('\/resource\/', "/data/", tag.dbpedia_uri) + ".n3"            
-            g = Graph()
-            try :
-                g.parse(rdf_uri, format="n3")
-            
-                with transaction.commit_on_success():
-                    
-                    abstracts = {}
-                    labels = {}
-                    thumbnail = None
-                    for t in g:
-                        if t[1] == URIRef(u'http://dbpedia.org/ontology/abstract') and t[2] is not None \
-                            and hasattr(t[2], 'language'):
-                            abstracts[t[2].language] = unicode(t[2])
-                        if t[1] ==  URIRef(u'http://www.w3.org/2000/01/rdf-schema#label') and t[2] is not None \
-                            and hasattr(t[2], 'language'):
-                            labels[t[2].language] = unicode(t[2]) 
-                        if t[1] == URIRef(u'http://dbpedia.org/ontology/thumbnail') and t[2] is not None:
-                            thumbnail = unicode(t[2])
-                        if u'http://dbpedia.org/resource' in t[2]:
-                            tagqs = Tag.objects.filter(dbpedia_uri=unicode(t[2]))
-                            if tagqs:
-                                TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])                        
-                    
-                    ref_label = labels['fr'] if 'fr' in labels else labels['en'] if 'en' in labels else labels.values()[0] if len(labels) > 0 else tag.label
-                    ref_abstract = abstracts['fr'] if 'fr' in abstracts else abstracts['en'] if 'en' in abstracts else abstracts.values()[0] if len(abstracts) > 0 else None
-                    for lang in settings.LANGUAGES:
-                        if lang[0] not in labels:
-                            labels[lang[0]]= ref_label
-                        if lang[0] not in abstracts:
-                            abstracts[lang[0]] = ref_abstract
-
-                    dbfield , created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri,tag=tag, defaults={'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable                    
-                    if not created:
-                        dbfield.abstract = ref_abstract
-                        dbfield.thumbnail = thumbnail
-                        dbfield.label = ref_label
-                        dbfield.save()
-                        DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()
-
-                    consolidated_trans = {}
-                    for lang,label in labels.iteritems():
-                        consolidated_trans[lang] = [label,ref_abstract]
-                    for lang,abstract in abstracts.iteritems():
-                        if lang in consolidated_trans:
-                            consolidated_trans[lang][1] = abstract
-                        else:
-                            consolidated_trans[lang] = [ref_label, abstract] 
-                         
-                    for lang, trans in consolidated_trans.iteritems():
-                        label, abstract = tuple(trans)
-                        DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label, abstract=abstract)
-                    
-                    
-            except Exception as e:
-                print "\nError processing resource %s : %s" %(rdf_uri,unicode(e))
-                traceback.print_exception(type(e), e, sys.exc_info()[2])
-                
-
-
-            
-            
-