web/hdalab/management/commands/query_wikipedia_category.py
changeset 271 8f77cf71ab02
parent 265 73f19fa4f997
child 272 1c774f7a0341
--- a/web/hdalab/management/commands/query_wikipedia_category.py	Fri Nov 16 18:12:05 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,396 +0,0 @@
-# -*- coding: utf-8 -*-
-'''
-Created on Jun 7, 2011
-
-@author: ymh
-'''
-
-from django.conf import settings
-from django.core.management.base import NoArgsCommand
-from django.core.management.color import no_style
-from hdabo.models import Tag
-from hdalab.models import WpCategory, TagWpCategory, TagInfobox, InfoboxParameter
-from optparse import make_option
-from wikitools import api,wiki
-import sys
-import re
-import itertools
-from hdabo import utils
-from django.db.models import Count
-from django.db import transaction
-
-
-# Bit flags selecting which kinds of wikipedia data are queried. The keys are
-# the values accepted by the --type command line option; 'all' is the union
-# of the three other flags.
-TYPES_MASK_DICT = {
-        u'visible': 0b001,
-        u'hidden': 0b010,
-        u'infobox': 0b100,
-        u'all': 0b111,
-    }
-
-# Opening of an infobox template; group 1 captures everything up to the next "|".
-START_PATTERN = re.compile(u"\{\{\s?Infobox\s+([^|]+)", re.M|re.U|re.I)
-# Any template opener or closer; used to follow the nesting depth.
-END_PATTERN = re.compile(u"\{\{|\}\}", re.M|re.U)
-# Header of a template parameter ("| name ="); group 1 is the parameter name.
-SPLIT_PATTERN = re.compile("\s*?\|\s*([\w]+[^=|]*)\s*=", re.U|re.M)
-# Opening and closing delimiters made of braces (2 or 3) or square brackets (1 or 2).
-DELIMITER_PATTERN = re.compile("\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
-# HTML comment. re.S lets "." match newlines, so that a comment spanning
-# several lines is matched (and removed) as a whole.
-COMMENT_PATTERN = re.compile("<!--.*?-->",re.U|re.M|re.S)
-
-
-
-class Command(NoArgsCommand):
-    '''
-    Management command querying a wikipedia site for the categories
-    (visible and/or hidden) and the infoboxes of the pages linked to tags,
-    and storing the result in the database.
-    '''
-    options = ''
-    help = """query and update wikipedia for tag title."""
-
-    # Command line options, in addition to the ones of NoArgsCommand.
-    option_list = NoArgsCommand.option_list + (
-        make_option('--all',
-            action='store_true',
-            dest='all',
-            default=False,
-            help='force all tags to be updated, not only those not yet processed'),
-        make_option('--force',
-            action='store_true',
-            dest='force',
-            default=False,
-            help='ask no questions'),
-        make_option('--random',
-            action='store_true',
-            dest='random',
-            default=False,
-            help='randomize query on tags'),
-        make_option('--site',
-            action='store',
-            type='string',
-            dest='site_url',
-            default="http://fr.wikipedia.org/w/api.php",
-            help='the url for the wikipedia site'),
-        make_option('--limit',
-            action='store',
-            type='int',
-            dest='limit',
-            default= -1,
-            help='number of tag to process'),
-        make_option('--start',
-            action='store',
-            type='int',
-            dest='start',
-            default=0,
-            help='number of tag to ignore'),
-        make_option('--type',
-            action='append',
-            dest='types',
-            type='choice',
-            choices=['visible','hidden', 'infobox', 'all'],
-            default=[],
-            help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be passed multiple times'),
-        make_option('--use-label',
-            action='store_true',
-            dest='use_label',
-            default=False,
-            help='use label instead of pageid to query wikipedia'),
-        make_option('--tag',
-            action='append',
-            dest='tags',
-            type='string',
-            default=[],
-            help='the tag to query'),
-
-    )
-    
-    
-#    def process_wp_response(self, label, response):        
-#
-#        query_dict = response['query']
-#        # get page if multiple pages or none -> return Tag.null_result
-#        pages = query_dict.get("pages", {})
-#        if len(pages) > 1 or len(pages) == 0:
-#            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
-#        
-#        page = pages.values()[0]
-#        
-#        if u"invalid" in page or u"missing" in page:
-#            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
-#
-#        url = page.get(u'fullurl', None)
-#        pageid = page.get(u'pageid', None)
-#        new_label = page[u'title']
-#        
-#        if self.__is_homonymie(page):
-#            status = Tag.TAG_URL_STATUS_DICT["homonyme"]
-#        elif u"redirect" in page:
-#            status = Tag.TAG_URL_STATUS_DICT["redirection"]
-#        else:
-#            status = Tag.TAG_URL_STATUS_DICT["match"]
-#        
-#        return new_label, status, url, pageid 
-
-    def query_all_categories(self, hidden, site, pageid, use_label):
-        '''
-        Return the titles of the wikipedia categories of one page.
-
-        hidden: when true only hidden categories are requested
-            (clshow=hidden), otherwise only the non hidden ones (clshow=!hidden).
-        site: the site object given to api.APIRequest.
-        pageid: the page id, or the page title when use_label is true.
-        use_label: when true, pageid is sent as the "titles" parameter
-            instead of "pageids".
-
-        The result is the list of category titles, each one stripped of
-        everything up to and including its first ":". The query is repeated
-        as long as the response carries a "clcontinue" value. The titles
-        collected so far are returned as soon as a response has no "query"
-        entry or does not describe exactly one page.
-        '''
-        clshow = 'hidden' if hidden else '!hidden'
-        page_param = 'titles' if use_label else 'pageids'
-        params = {'action':'query', page_param: pageid, 'prop':'categories', 'clshow': clshow}
-
-        # "" means first request, None means that there is nothing left to fetch
-        clcontinue = ""
-        res = []
-
-        while clcontinue is not None:
-            if clcontinue:
-                params['clcontinue'] = clcontinue
-
-            wpquery = api.APIRequest(site, params) #@UndefinedVariable
-            response = wpquery.query()
-
-            if self.verbosity > 1:
-                print "Query categories : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
-                print repr(response)
-
-            query_dict = response.get('query', None)
-
-            if query_dict is None:
-                return res
-
-            pages = query_dict.get("pages", {})
-            if len(pages) != 1:
-                return res
-
-            page = pages.values()[0]
-
-            for cat in page.get('categories',[]):
-                title = cat.get('title',"")
-                # drop everything up to the first ":" (find returns -1 when
-                # there is none, in which case the title is kept whole)
-                title = title[title.find(":")+1:]
-                # skip the entry equal to the continuation marker that was just sent
-                if title and clcontinue != ("%s|%s" % (pageid,title)):
-                    res.append(title)
-
-            clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None)
-
-        if self.verbosity > 1:
-            print "Query categories RES: "
-            print repr(res)
-
-        return res
-    
-    def process_categories(self, cat_list, hidden, tag):
-        '''
-        Link a tag to each category label of cat_list.
-
-        The WpCategory row of each label and the TagWpCategory row joining it
-        to the tag (with the given hidden flag) are fetched, or created when
-        they do not exist yet.
-        '''
-        for label in cat_list:
-            category = WpCategory.objects.get_or_create(label=label)[0]
-            TagWpCategory.objects.get_or_create(tag=tag, wp_category=category, hidden=hidden)
-            
-                
-    def query_infoboxes(self, site, pageid, use_label):
-        '''
-        Fetch the content of one wikipedia page and extract its infoboxes.
-
-        site: the site object given to api.APIRequest.
-        pageid: the page id, or the page title when use_label is true.
-        use_label: when true, pageid is sent as the "titles" parameter
-            instead of "pageids".
-
-        Returns a tuple (revision id, list of (infobox source, infobox name)).
-        An empty list is returned instead when the response has no "query"
-        entry, does not describe exactly one page, or has no revision.
-        '''
-        res = []
-        page_param = 'titles' if use_label else 'pageids'
-        params = {'action':'query', page_param: pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
-        wpquery = api.APIRequest(site, params) #@UndefinedVariable
-        response = wpquery.query()
-
-        query_dict = response.get('query', None)
-
-        if query_dict is None:
-            return res
-
-        pages = query_dict.get("pages", {})
-        if len(pages) != 1:
-            return res
-
-        page = pages.values()[0]
-
-        if 'revisions' not in page or not page['revisions']:
-            return res
-
-        rev = page['revisions'][0]
-
-        content = rev['*']
-
-        start = 0                    # offset in content where the next search begins
-        depth = 0                    # template nesting depth, 0 means outside any infobox
-        current_infobox_name = None
-        current_start = 0            # offset of the "{{" opening the current infobox
-
-        while start <= len(content):
-            if depth==0:
-                resm = START_PATTERN.search(content[start:])
-                if resm is None:
-                    break
-                depth = 1
-                current_start = resm.start()+start
-                current_infobox_name = resm.group(1)
-            else:
-                resm = END_PATTERN.search(content[start:])
-                if resm is None:
-                    break
-                if resm.group(0) == "{{":
-                    depth += 1
-                elif resm.group(0) == "}}":
-                    depth -= 1
-                if depth == 0:
-                    res.append((content[current_start:resm.end()+start], current_infobox_name))
-            # Resume right after the match. Skipping one more character would
-            # lose a brace when delimiters are adjacent (e.g. "}}}}"), and
-            # the enclosing infobox would then never be closed.
-            start += resm.end()
-
-        return_val = (rev['revid'],res)
-
-        if self.verbosity > 1:
-            print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
-            print repr(return_val)
-
-        return return_val
-    
-    def split_infoboxes(self, src):
-        '''
-        Split the inside of an infobox into a flat list of strings
-        alternating parameter name and parameter value:
-        [name1, value1, name2, value2, ...].
-
-        A parameter header (SPLIT_PATTERN) is only looked for while no
-        brace or bracket delimiter is open, so that a "|" found inside a
-        nested delimiter does not start a new parameter.
-        An empty list is returned when no parameter header is found.
-        '''
-        cursor = 0          # offset in src where the next search begins
-        value_start = 0     # offset where the value of the last name begins
-        spans = []          # (begin, end) offsets of the names and values
-        open_delimiters = []
-        src_length = len(src)
-
-        while cursor <= src_length:
-            remainder = src[cursor:]
-            delimiter = DELIMITER_PATTERN.search(remainder)
-            header = None if open_delimiters else SPLIT_PATTERN.search(remainder)
-            delimiter_pos = sys.maxint if delimiter is None else delimiter.start()
-            header_pos = sys.maxint if header is None else header.start()
-
-            if header_pos < delimiter_pos:
-                # a parameter header comes first: it closes the previous value
-                # (if any) and gives the next name
-                if spans:
-                    spans.append((value_start, cursor + header.start(0)))
-                spans.append((cursor + header.start(1), cursor + header.end(1)))
-                cursor += header.end(0)
-                value_start = cursor
-                continue
-
-            if delimiter_pos == sys.maxint:
-                # neither a header nor a delimiter is left
-                break
-
-            token = delimiter.group()
-            if token.startswith("{") or token.startswith("["):
-                open_delimiters.append(token)
-            elif open_delimiters:
-                last_open = open_delimiters[-1]
-                same_kind = (last_open.startswith('{') and token[0] == '}') or (last_open.startswith('[') and token[0] == ']')
-                # a closer only pops an opener of the same kind and same length
-                if same_kind and len(last_open) == len(token):
-                    open_delimiters.pop()
-            cursor += delimiter.end()
-
-        if value_start > 0:
-            # value of the last parameter: everything up to the end of src
-            spans.append((value_start, src_length))
-
-        return [src[begin:end] for begin, end in spans]
-
-
-
-    def process_infoboxes(self, infobox_defs, tag):
-        '''
-        Store the infoboxes of a tag and their parameters.
-
-        infobox_defs: either a false value (nothing is done), or a sequence
-            whose first item is a revision id and whose second item is an
-            iterable of (infobox source, infobox name).
-        tag: the tag the infoboxes are attached to.
-
-        The TagInfobox row (one per tag, name and revision id) and the
-        InfoboxParameter rows (one per infobox and parameter name) are
-        created, or updated when they already exist.
-        '''
-        if not infobox_defs:
-            return
-
-        revision_id = infobox_defs[0]
-        infoboxes = infobox_defs[1]
-
-        for infobox in infoboxes:
-            source = infobox[0].strip(' \t\n\r')
-            infobox_name = infobox[1]
-
-            tag_infobox, is_new = TagInfobox.objects.get_or_create(tag=tag, name=infobox_name, revision_id = revision_id, defaults={'source': source})
-            if not is_new:
-                tag_infobox.source = source
-                tag_infobox.save()
-
-            # keep only the parameters: remove the comments, the last two
-            # characters (presumably the closing "}}") and the infobox opening
-            stripped = COMMENT_PATTERN.sub('', source)
-            stripped = START_PATTERN.sub('', stripped[:-2]).strip()
-
-            parts = self.split_infoboxes(stripped)
-
-            # parts alternates names (even indexes) and values (odd indexes)
-            for raw_name, raw_value in itertools.izip(parts[0::2], parts[1::2]):
-                param, is_new = InfoboxParameter.objects.get_or_create(tag_infobox=tag_infobox, param_name=raw_name.strip(), defaults={'param_value':raw_value.strip()})
-                if not is_new:
-                    param.param_value = raw_value.strip()
-                    param.save()
-        
-    def handle_noargs(self, **options):
-        '''
-        Entry point of the command.
-
-        Selects the tags having a wikipedia page id (restricted by the
-        --tag, --all, --random, --start and --limit options), asks for a
-        confirmation unless --force is given or the command is not
-        interactive, then, for each tag, queries and stores the visible
-        categories, the hidden categories and the infoboxes, depending on
-        the --type options. Each tag is processed in its own transaction.
-        '''
-        self.style = no_style()
-
-        interactive = options.get('interactive', True)
-
-        self.verbosity = int(options.get('verbosity', '1'))
-        use_label = options.get('use_label', False)
-
-        force = options.get('force', False)
-
-        limit = options.get("limit", -1)
-        start = options.get("start", 0)
-
-        # NOTE: the --site option declares its own default, so the value from
-        # the settings only applies when 'site_url' is missing from options.
-        site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
-
-        random = options.get('random', False)
-
-        # no --type option means every type
-        types_mask = 0
-        types_list = options.get('types', [])
-
-        if not types_list:
-            types_mask = TYPES_MASK_DICT['all']
-        else:
-            for t in types_list:
-                types_mask |= TYPES_MASK_DICT[t]
-
-        if self.verbosity > 1 :
-            print "types mask %s " % (bin(types_mask))
-
-        if self.verbosity > 2:
-            print "option passed : " + repr(options)
-
-
-        queryset = Tag.objects.exclude(wikipedia_pageid= None)
-
-        tag_list = options.get("tags", [])
-
-        if tag_list:
-            queryset = queryset.filter(label__in=tag_list)
-        elif not options.get('all',False):
-            # by default only the tags without any category are processed
-            queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
-        #else:
-        #    queryset = Tag.objects.filter(url_status=None)
-
-        if random:
-            queryset = queryset.order_by("?")
-        else:
-            queryset = queryset.order_by("label")
-
-        if limit >= 0:
-            # limit is a number of tags, not an end index: the slice must
-            # end at start + limit, otherwise using --start together with
-            # --limit truncates or empties the selection
-            queryset = queryset[start:start + limit]
-        elif start > 0:
-            queryset = queryset[start:]
-
-        if self.verbosity > 2 :
-            print "Tag Query is %s" % (queryset.query)
-
-        site = wiki.Wiki(site_url) #@UndefinedVariable
-
-
-        count = queryset.count()
-        if self.verbosity > 1:
-            print "Processing %d tags" % (count)
-
-        if not force and interactive:
-            confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
-        else:
-            confirm = 'yes'
-
-        if confirm != "yes":
-            print "wikipedia query cancelled"
-            return
-
-
-
-        for i, tag in enumerate(queryset):
-
-            if self.verbosity > 1:
-                print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
-            else:
-                utils.show_progress(i + 1, count, tag.label, 60)
-
-            # page to query: the tag's own page, or the alternative page
-            # when the tag is a redirection and an alternative page id is known
-            wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
-            if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None :
-                wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid
-
-            with transaction.commit_on_success():
-                if types_mask & TYPES_MASK_DICT['visible']:
-                    res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
-                    self.process_categories(res, False, tag)
-
-                if types_mask & TYPES_MASK_DICT['hidden']:
-                    res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
-                    self.process_categories(res, True, tag)
-
-                if types_mask & TYPES_MASK_DICT['infobox']:
-                    res = self.query_infoboxes(site, wikipedia_pageid, use_label)
-                    self.process_infoboxes(res, tag)
-