web/hdalab/management/commands/query_dbpedia.py
author ymh <ymh.work@gmail.com>
Thu, 16 Feb 2012 21:48:40 +0100
changeset 119 e3ebe3545f72
child 135 dd6578e36a57
permissions -rw-r--r--
first implementation of django version. Kind of work but need optimisation. Will do them after update from raphael
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
119
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
# -*- coding: utf-8 -*-
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
'''
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
Created on Jan 30, 2012
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
@author: ymh
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
     6
'''
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
from django.core.management.base import NoArgsCommand
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
from django.core.management.color import no_style
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    11
from optparse import make_option
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    12
from django.db.models import Count
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    13
from django.db import transaction
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    14
from hdabo.models import Tag
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    15
from hdalab.models import DbpediaFields, TagLinks
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    16
from hdabo.utils import show_progress
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    17
from rdflib.graph import Graph
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    18
from rdflib import URIRef
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    19
import re
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    20
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
    21
class Command(NoArgsCommand):
    '''
    Query DBpedia for tags and store the result locally.

    For every selected ``Tag`` that has a ``dbpedia_uri``, the N3 description
    of the resource is downloaded and parsed. The abstract, label and
    thumbnail found in it are saved in a ``DbpediaFields`` row, and a
    ``TagLinks`` row is created for each triple whose object is the
    ``dbpedia_uri`` of an existing ``Tag``.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    # Predicates looked up in every triple; built once instead of per triple.
    ABSTRACT_PREDICATE = URIRef(u'http://dbpedia.org/ontology/abstract')
    LABEL_PREDICATE = URIRef(u'http://www.w3.org/2000/01/rdf-schema#label')
    THUMBNAIL_PREDICATE = URIRef(u'http://dbpedia.org/ontology/thumbnail')

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    def handle_noargs(self, **options):
        '''
        Entry point of the command.

        Reads the command line options, selects the tags to process, asks
        for confirmation (unless ``--force`` is given or the command is not
        interactive) and then updates the tags one by one, displaying a
        progress indicator.
        '''
        self.style = no_style()
        self.interactive = options.get('interactive', True)
        self.verbosity = int(options.get('verbosity', '1'))
        self.force = options.get('force', False)
        self.limit = options.get("limit", -1)
        self.start = options.get("start", 0)
        self.random = options.get('random', False)

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        self.tag_list = options.get("tags", [])

        queryset = self._build_queryset(options.get('all', False))

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query))

        count = queryset.count()

        if not self.force and self.interactive:
            confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("dbpedia query cancelled")
            return

        writer = None
        for i, tag in enumerate(queryset):
            writer = show_progress(i + 1, count, tag.label, 50, writer)
            self._update_tag(tag)

    def _build_queryset(self, process_all):
        '''
        Return the queryset of tags to process.

        Only tags with a ``dbpedia_uri`` are considered. When explicit tag
        labels were given they take precedence; otherwise, unless
        ``process_all`` is true, only tags without any ``dbpedia_fields``
        are kept. Ordering and the ``--start``/``--limit`` window are applied
        last.
        '''
        queryset = Tag.objects.exclude(dbpedia_uri=None)

        if self.tag_list:
            queryset = queryset.filter(label__in=self.tag_list)
        elif not process_all:
            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc=0)

        if self.random:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")

        if self.limit >= 0:
            # --limit is a number of tags to process, not an end index: the
            # window starts after the --start ignored tags and is limit wide.
            queryset = queryset[self.start:self.start + self.limit]
        elif self.start > 0:
            queryset = queryset[self.start:]

        return queryset

    def _update_tag(self, tag):
        '''
        Download the DBpedia description of ``tag`` and store its content.

        Any error raised while fetching, parsing or saving is reported on
        standard output and swallowed, so that one failing resource does not
        stop the processing of the remaining tags.
        '''
        # The N3 serialisation is served under /data/ rather than /resource/.
        rdf_uri = re.sub(r'/resource/', "/data/", tag.dbpedia_uri) + ".n3"
        graph = Graph()
        try:
            graph.parse(rdf_uri, format="n3")

            with transaction.commit_on_success():
                abstract, label, thumbnail = self._scan_graph(tag, graph)

                dbfield, created = DbpediaFields.objects.get_or_create(
                    dbpedia_uri=tag.dbpedia_uri,
                    tag=tag,
                    defaults={'abstract': abstract, 'label': label, 'thumbnail': thumbnail})
                if not created:
                    # The row already existed: overwrite it with fresh values.
                    dbfield.abstract = abstract
                    dbfield.label = label
                    dbfield.thumbnail = thumbnail
                    dbfield.save()

        except Exception as e:
            # Deliberately broad: this is a best-effort, per-tag operation.
            print("\nError processing resource %s : %s" % (rdf_uri, unicode(e)))

    def _scan_graph(self, tag, graph):
        '''
        Walk the triples of ``graph`` and return ``(abstract, label, thumbnail)``.

        Each value is ``None`` when the graph holds no matching triple. As a
        side effect, a ``TagLinks`` row from ``tag`` is created (if missing)
        for every triple object that is the ``dbpedia_uri`` of an existing
        ``Tag``.
        '''
        abstract = None
        label = None
        thumbnail = None

        for triple in graph:
            predicate = triple[1]
            obj = triple[2]

            if predicate == self.ABSTRACT_PREDICATE:
                abstract = self._pick_localized(abstract, obj)
            if predicate == self.LABEL_PREDICATE:
                label = self._pick_localized(label, obj)
            if predicate == self.THUMBNAIL_PREDICATE and obj is not None:
                thumbnail = unicode(obj)
            if u'http://dbpedia.org/resource' in obj:
                tagqs = Tag.objects.filter(dbpedia_uri=unicode(obj))
                if tagqs:
                    TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])

        return abstract, label, thumbnail

    @staticmethod
    def _pick_localized(current, literal):
        '''
        Return the text to keep between ``current`` and ``literal``.

        A French literal always replaces the current value; an English one
        is only used when nothing has been kept yet. Anything else leaves
        ``current`` unchanged.
        '''
        if literal is None or not hasattr(literal, 'language'):
            return current
        if literal.language == u"fr" or (current is None and literal.language == u"en"):
            return unicode(literal)
        return current
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
   153
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
   154
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
   155
            
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
   156
            
e3ebe3545f72 first implementation of django version.
ymh <ymh.work@gmail.com>
parents:
diff changeset
   157