# -*- coding: utf-8 -*-
'''
Created on Jun 7, 2011

Management command that queries a MediaWiki API for the categories
(visible and hidden) and the infoboxes of the wikipedia page attached to
each Tag, and stores the result in the hdalab models.

@author: ymh
'''
# Source: src/hdalab/management/commands/query_wikipedia_category.py
# (reconstructed from changeset diff 73f19fa4f997 -> 8f77cf71ab02).

import itertools
import re
import sys

from django.conf import settings
from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style
from django.db import transaction
from django.db.models import Count
from optparse import make_option
from wikitools import api, wiki

from hdabo import utils
from hdabo.models import Tag
from hdalab.models import WpCategory, TagWpCategory, TagInfobox, InfoboxParameter


# Bit mask associated with each value accepted by the --type option.
TYPES_MASK_DICT = {
    u'visible': 0b001,
    u'hidden': 0b010,
    u'infobox': 0b100,
    u'all': 0b111,
}

# Opening of an infobox template; group 1 is everything up to the first "|".
START_PATTERN = re.compile(r"\{\{\s?Infobox\s+([^|]+)", re.M | re.U | re.I)
# Any template opening or closing, used to track the nesting depth.
END_PATTERN = re.compile(r"\{\{|\}\}", re.M | re.U)
# "| name =" parameter header; group 1 is the parameter name.
SPLIT_PATTERN = re.compile(r"\s*?\|\s*([\w]+[^=|]*)\s*=", re.U | re.M)
# Opening/closing delimiters inside which a "|" must not split parameters.
DELIMITER_PATTERN = re.compile(r"\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
# Wiki/HTML comment. The pattern was empty in the source as received, which
# made the comment stripping a no-op; re.S lets a comment span several lines.
COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.U | re.M | re.S)


class Command(NoArgsCommand):
    '''
    Query wikipedia for the categories and infoboxes of the tags and
    update the database accordingly.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--site',
            action='store',
            type='string',
            dest='site_url',
            default="http://fr.wikipedia.org/w/api.php",
            help='the url for the wikipedia site'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default=-1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--type',
            action='append',
            dest='types',
            type='choice',
            choices=['visible', 'hidden', 'infobox', 'all'],
            default=[],
            help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be passed multiple times'),
        make_option('--use-label',
            action='store_true',
            dest='use_label',
            default=False,
            help='use label instead of pageid to query wikipedia'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    def query_all_categories(self, hidden, site, pageid, use_label):
        '''
        Return the list of category titles (namespace prefix removed) of a
        wikipedia page.

        hidden    -- True to list the hidden categories, False for the
                     visible ones (sent as the "clshow" parameter).
        site      -- wikitools Wiki object to query.
        pageid    -- page id, or page title when use_label is true.
        use_label -- query by "titles" instead of "pageids".

        An empty or partial list is returned as soon as a response has no
        "query" element or does not contain exactly one page.
        '''
        clshow = 'hidden' if hidden else '!hidden'
        params = {
            'action': 'query',
            'titles' if use_label else 'pageids': pageid,
            'prop': 'categories',
            'clshow': clshow,
        }

        # "" means "first request"; None means "nothing left to fetch".
        clcontinue = ""
        res = []

        while clcontinue is not None:
            if clcontinue:
                params['clcontinue'] = clcontinue

            wpquery = api.APIRequest(site, params) #@UndefinedVariable
            response = wpquery.query()

            if self.verbosity > 1:
                print("Query categories : " + repr(wpquery.request.get_full_url() + "?" + wpquery.request.get_data()))
                print(repr(response))

            query_dict = response.get('query', None)
            if query_dict is None:
                return res

            pages = query_dict.get("pages", {})
            if len(pages) != 1:
                return res

            page = list(pages.values())[0]

            for cat in page.get('categories', []):
                title = cat.get('title', "")
                # drop the namespace prefix ("Category:", "Catégorie:", ...)
                title = title[title.find(":") + 1:]
                # skip the entry the continuation token points at
                if title and clcontinue != ("%s|%s" % (pageid, title)):
                    res.append(title)

            clcontinue = response.get('query-continue', {}).get('categories', {}).get('clcontinue', None)

        if self.verbosity > 1:
            print("Query categories RES: ")
            print(repr(res))

        return res

    def process_categories(self, cat_list, hidden, tag):
        '''
        Make sure a WpCategory exists for each label of cat_list and that it
        is linked to tag through a TagWpCategory carrying the hidden flag.
        '''
        for cat in cat_list:
            wp_cat, created = WpCategory.objects.get_or_create(label=cat) #@UnusedVariable
            TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)

    def _extract_infoboxes(self, content):
        '''
        Return the list of (infobox source, infobox name) found in the wiki
        text content, following nested {{ }} templates to find where each
        infobox ends.

        NOTE(review): the name is whatever START_PATTERN captures up to the
        first "|", so it may carry trailing whitespace; left unchanged
        because it is used as a lookup key when storing infoboxes.
        '''
        found = []
        pos = 0
        depth = 0
        current_name = None
        current_start = 0

        while pos <= len(content):
            if depth == 0:
                match = START_PATTERN.search(content, pos)
                if match is None:
                    break
                depth = 1
                current_start = match.start()
                current_name = match.group(1)
            else:
                match = END_PATTERN.search(content, pos)
                if match is None:
                    break
                if match.group(0) == "{{":
                    depth += 1
                else:
                    depth -= 1
                    if depth == 0:
                        found.append((content[current_start:match.end()], current_name))
            # Resume exactly after the match: skipping one more character
            # would miss the second "}}" of a "}}}}" sequence and the
            # enclosing infobox would never be closed.
            pos = match.end()

        return found

    def query_infoboxes(self, site, pageid, use_label):
        '''
        Fetch the content of a wikipedia page and extract its infoboxes.

        site      -- wikitools Wiki object to query.
        pageid    -- page id, or page title when use_label is true.
        use_label -- query by "titles" instead of "pageids".

        Return a tuple (revision id, [(infobox source, infobox name), ...]),
        or an empty list when the response has no "query" element, does not
        contain exactly one page, or the page has no revision.
        '''
        res = []
        params = {
            'action': 'query',
            'titles' if use_label else 'pageids': pageid,
            'prop': 'revisions',
            'rvprop': 'ids|content',
        }
        wpquery = api.APIRequest(site, params) #@UndefinedVariable
        response = wpquery.query()

        query_dict = response.get('query', None)
        if query_dict is None:
            return res

        pages = query_dict.get("pages", {})
        if len(pages) != 1:
            return res

        page = list(pages.values())[0]

        revisions = page.get('revisions')
        if not revisions:
            return res

        rev = revisions[0]
        # the revision text is stored under the '*' key
        content = rev['*']

        return_val = (rev['revid'], self._extract_infoboxes(content))

        if self.verbosity > 1:
            print("Query infoboxes url: " + repr(wpquery.request.get_full_url() + "?" + wpquery.request.get_data()))
            print(repr(return_val))

        return return_val

    def split_infoboxes(self, src):
        '''
        Split the body of an infobox into a flat list alternating parameter
        names and parameter values: [name1, value1, name2, value2, ...].

        A "| name =" header is only recognised outside of any delimiter
        ({{ }}, {{{ }}}, [[ ]], [ ]), so that pipes inside links or nested
        templates do not split the value. An empty list is returned when no
        parameter header is found.
        '''
        pos = 0
        previous_end = 0
        split_indexes = []
        delimiter_stack = []

        while pos <= len(src):
            delim = DELIMITER_PATTERN.search(src, pos)
            # parameter headers are only looked for at nesting level 0
            split = SPLIT_PATTERN.search(src, pos) if not delimiter_stack else None

            if split is not None and (delim is None or split.start() < delim.start()):
                if split_indexes:
                    # value of the previous parameter
                    split_indexes.append((previous_end, split.start(0)))
                # name of the new parameter
                split_indexes.append((split.start(1), split.end(1)))
                pos = split.end(0)
                previous_end = pos
            elif delim is not None:
                token = delim.group()
                if token.startswith("{") or token.startswith("["):
                    delimiter_stack.append(token)
                elif delimiter_stack:
                    opener = delimiter_stack[-1]
                    same_kind = ((opener.startswith('{') and token[0] == '}')
                                 or (opener.startswith('[') and token[0] == ']'))
                    # only pop when the closer mirrors the last opener
                    if same_kind and len(opener) == len(token):
                        delimiter_stack.pop()
                pos = delim.end()
            else:
                break

        if previous_end > 0:
            # value of the last parameter runs to the end of the source
            split_indexes.append((previous_end, len(src)))

        return [src[begin:end] for begin, end in split_indexes]

    def process_infoboxes(self, infobox_defs, tag):
        '''
        Store the infoboxes returned by query_infoboxes for tag.

        infobox_defs -- (revision id, [(source, name), ...]); a falsy value
                        is ignored.
        tag          -- the Tag the infoboxes belong to.

        A TagInfobox is created or updated for each infobox, and an
        InfoboxParameter for each of its parameters.
        '''
        if not infobox_defs:
            return

        revision_id = infobox_defs[0]
        for infobox in infobox_defs[1]:
            src = infobox[0].strip(' \t\n\r')
            name = infobox[1]
            tag_infobox, created = TagInfobox.objects.get_or_create(tag=tag, name=name, revision_id=revision_id, defaults={'source': src})
            if not created:
                tag_infobox.source = src
                tag_infobox.save()

            src = COMMENT_PATTERN.sub('', src)
            # src[:-2] removes the closing "}}"; the sub removes the header
            src = START_PATTERN.sub('', src[:-2]).strip()

            keyvalues = self.split_infoboxes(src)

            # keyvalues alternates names and values: pair them up
            pairs = zip(itertools.islice(keyvalues, 0, None, 2),
                        itertools.islice(keyvalues, 1, None, 2))
            for key, value in pairs:
                param, created = InfoboxParameter.objects.get_or_create(tag_infobox=tag_infobox, param_name=key.strip(), defaults={'param_value': value.strip()})
                if not created:
                    param.param_value = value.strip()
                    param.save()

    def handle_noargs(self, **options):
        '''
        Entry point of the command: select the tags according to the
        options, ask for confirmation unless --force is given, then query
        and store the requested information for each tag, one transaction
        per tag.
        '''
        self.style = no_style()

        interactive = options.get('interactive', True)

        self.verbosity = int(options.get('verbosity', '1'))
        use_label = options.get('use_label', False)

        force = options.get('force', False)

        limit = options.get("limit", -1)
        start = options.get("start", 0)

        # the setting is only read when no url is given in the options
        site_url = options.get('site_url') or settings.WIKIPEDIA_API_URL

        random = options.get('random', False)

        types_list = options.get('types', [])
        if not types_list:
            types_mask = TYPES_MASK_DICT['all']
        else:
            types_mask = 0
            for t in types_list:
                types_mask |= TYPES_MASK_DICT[t]

        if self.verbosity > 1:
            print("types mask %s " % (bin(types_mask)))

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        queryset = Tag.objects.exclude(wikipedia_pageid=None)

        tag_list = options.get("tags", [])

        if tag_list:
            queryset = queryset.filter(label__in=tag_list)
        elif not options.get('all', False):
            # only the tags without any category yet
            queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc=0)

        if random:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")

        if limit >= 0:
            # limit is a number of tags to process, not an end index
            queryset = queryset[start:start + limit]
        elif start > 0:
            queryset = queryset[start:]

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query))

        site = wiki.Wiki(site_url) #@UndefinedVariable

        count = queryset.count()
        if self.verbosity > 1:
            print("Processing %d tags" % (count))

        if not force and interactive:
            confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("wikipedia query cancelled")
            return

        for i, tag in enumerate(queryset):

            if self.verbosity > 1:
                print("processing tag %s (%d/%d)" % (tag.label, i + 1, count))
            else:
                utils.show_progress(i + 1, count, tag.label, 60)

            # for a redirection, query the alternative page when it is known
            wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
            if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None:
                wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid

            with transaction.commit_on_success():
                if types_mask & TYPES_MASK_DICT['visible']:
                    res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
                    self.process_categories(res, False, tag)

                if types_mask & TYPES_MASK_DICT['hidden']:
                    res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
                    self.process_categories(res, True, tag)

                if types_mask & TYPES_MASK_DICT['infobox']:
                    res = self.query_infoboxes(site, wikipedia_pageid, use_label)
                    self.process_infoboxes(res, tag)