web/hdabo/management/commands/query_wikipedia.py
author ymh <ymh.work@gmail.com>
Wed, 22 Jun 2011 01:00:47 +0200
changeset 47 08b008c5a07d
parent 43 web/hdabo/management/commands/querywikipedia.py@e0812bc3ef44
child 108 4b73a767a6c0
permissions -rw-r--r--
- add popularity - calculate dbpedia_uri - display dbpedia uri - add manual_order - various corrections

# -*- coding: utf-8 -*-
'''
Created on Jun 7, 2011

@author: ymh
'''

from django.conf import settings
from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style
from hdabo.models import Tag
from hdabo.wp_utils import process_tag
from optparse import make_option
from wikitools import wiki
import math
import sys



class Command(NoArgsCommand):
    '''
    Query and update wikipedia for tag title.

    Selects Tag objects (all of them with --force, otherwise only those whose
    url_status is None), optionally shuffles and slices the selection, then
    hands each tag to hdabo.wp_utils.process_tag together with a wikitools
    Wiki object built from the configured API url.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""
    
    option_list = NoArgsCommand.option_list + (
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--site',
            action='store',
            type='string',
            dest='site_url',
            default="http://fr.wikipedia.org/w/api.php",
            help='the url for the wikipedia site'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        )
    
    def __is_homonymie(self, page_dict):
        '''
        Tell whether a page dictionary describes a disambiguation page.

        Returns True when the title of any entry of page_dict["categories"]
        contains the French or the English disambiguation category name,
        False otherwise (including when there is no "categories" key).
        '''
        markers = (u'Catégorie:Homonymie', u'Category:Disambiguation pages')
        for cat in page_dict.get(u"categories", []):
            title = cat.get(u"title", u"")
            if any(marker in title for marker in markers):
                return True
        return False

    def process_wp_response(self, label, response):
        '''
        Extract the tag information from a wikipedia API query response.

        label is accepted for interface compatibility but is not used.
        response must be a dictionary with a 'query' key.

        Returns a (new_label, status, url, pageid) tuple. When the response
        does not contain exactly one page, or when that page is flagged
        "invalid" or "missing", the tuple is
        (None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None).
        Otherwise status is the "homonyme", "redirection" or "match" entry
        of Tag.TAG_URL_STATUS_DICT, checked in that order.
        '''
        null_result = (None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None)

        query_dict = response['query']
        pages = query_dict.get("pages", {})
        # anything but exactly one page cannot be mapped to the tag
        if len(pages) != 1:
            return null_result

        # list() keeps the indexing valid when values() is a view and not a list
        page = list(pages.values())[0]

        if u"invalid" in page or u"missing" in page:
            return null_result

        url = page.get(u'fullurl', None)
        pageid = page.get(u'pageid', None)
        new_label = page[u'title']

        # a disambiguation page takes precedence over a redirection
        if self.__is_homonymie(page):
            status = Tag.TAG_URL_STATUS_DICT["homonyme"]
        elif u"redirect" in page:
            status = Tag.TAG_URL_STATUS_DICT["redirection"]
        else:
            status = Tag.TAG_URL_STATUS_DICT["match"]

        return new_label, status, url, pageid

    def show_progress(self, current_line, total_line, label, width):
        '''
        Write a one line text progress bar on standard output.

        current_line is the 1-based rank of the item being processed,
        total_line the total number of items, label the text displayed (as
        its repr) after the counters and width the number of characters
        between the brackets of the bar. The line ends with a carriage
        return so that the next call overwrites it; a newline is written
        once 100% is reached.
        '''
        if total_line:
            percent = (float(current_line) / float(total_line)) * 100.0
        else:
            # nothing to process: show a complete bar instead of dividing by zero
            percent = 100.0

        marks = int(math.floor(width * (percent / 100.0)))
        spaces = int(math.floor(width - marks))

        loader = u'[' + (u'=' * marks) + (u' ' * spaces) + u']'

        # the counters are displayed as received: there is no header line to discount here
        sys.stdout.write(u"%s %d%% %d/%d - %s\r" % (loader, percent, current_line, total_line, repr(label)))
        if percent >= 100:
            sys.stdout.write("\n")
        sys.stdout.flush()

    def _select_tags(self, force, random, start, limit):
        '''
        Build the queryset of the tags to process.

        All the tags are selected when force is true, otherwise only those
        whose url_status is None. The result is ordered randomly when random
        is true, by label otherwise. The first start tags are skipped and at
        most limit tags are kept (no upper bound when limit is negative).
        '''
        if force:
            queryset = Tag.objects.all()
        else:
            queryset = Tag.objects.filter(url_status=None)

        if random:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")

        if limit >= 0:
            # limit is a number of tags, not an end index: offset it by start
            return queryset[start:start + limit]
        return queryset[start:]

    def handle_noargs(self, **options):
        '''
        Run the command.

        Reads the 'interactive', 'verbosity', 'force', 'limit', 'start',
        'site_url' and 'random' entries of options. When both force and
        interactive are set, a confirmation is asked on the console and the
        command stops unless the answer is exactly "yes".
        '''
        self.style = no_style()

        interactive = options.get('interactive', True)
        verbosity = int(options.get('verbosity', '1'))
        force = options.get('force', False)
        limit = options.get("limit", -1)
        start = options.get("start", 0)
        # the setting is only the fallback used when 'site_url' is absent from options
        site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
        random = options.get('random', False)

        if verbosity > 2:
            print("option passed : " + repr(options))

        if force and interactive:
            confirm = raw_input("""You have requested to query and replace the wikipedia information for all datasheets.
Are you sure you want to do this ?
    Type 'yes' to continue, or 'no' to cancel: """)
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("wikipedia query cancelled")
            return

        queryset = self._select_tags(force, random, start, limit)

        if verbosity > 2:
            print("Tag Query is %s" % (queryset.query,))

        site = wiki.Wiki(site_url) #@UndefinedVariable

        count = queryset.count()
        if verbosity > 1:
            print("Processing %d tags" % (count,))

        for i, tag in enumerate(queryset):

            if verbosity > 1:
                print("processing tag %s (%d/%d)" % (tag.label, i + 1, count))
            else:
                self.show_progress(i + 1, count, tag.label, 60)

            # the lookup and the update of the tag are delegated to hdabo.wp_utils
            process_tag(site, tag, verbosity)