# -*- coding: utf-8 -*-
# Recovered from a Mercurial changeset diff (00fc169cc6a9 -> 825ff4d6a8ac) that
# removed web/hdalab/management/commands/query_wikipedia_category.py.
'''
Management command that queries a Wikipedia API for the categories and
infoboxes of the pages attached to tags, and stores them in the database.

Created on Jun 7, 2011

@author: ymh
'''

import itertools
import re
import sys
from optparse import make_option

from django.conf import settings
from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style
from django.db import transaction
from django.db.models import Count
from wikitools import api, wiki

from hdabo import utils
from hdabo.models import Tag
from hdalab.models import WpCategory, TagWpCategory, TagInfobox, InfoboxParameter


# Bit flags selecting which kinds of data a run fetches (see --type).
TYPES_MASK_DICT = {
    u'visible': 0b001,
    u'hidden': 0b010,
    u'infobox': 0b100,
    u'all': 0b111,
}

# Opening of an infobox template; group 1 is everything up to the first "|".
START_PATTERN = re.compile(r"\{\{\s?Infobox\s+([^|]+)", re.M | re.U | re.I)
# Any template opening or closing, used to track nesting depth.
END_PATTERN = re.compile(r"\{\{|\}\}", re.M | re.U)
# A "| key =" parameter header; group 1 is the key.
SPLIT_PATTERN = re.compile(r"\s*?\|\s*([\w]+[^=|]*)\s*=", re.U | re.M)
# Opening/closing delimiters inside which "|" must not split parameters.
DELIMITER_PATTERN = re.compile(r"\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
# NOTE(review): this pattern was an empty string in the copy this file was
# recovered from, which made comment stripping a no-op. It is restored here to
# the wiki/HTML comment syntax (re.S so multi-line comments are covered) --
# confirm against the upstream repository.
COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.U | re.M | re.S)


def _extract_infoboxes(content):
    '''
    Scan wiki markup and return a list of (infobox_source, infobox_name).

    infobox_source runs from the opening "{{Infobox" to its matching "}}",
    nested templates included. An infobox left unclosed is not returned.
    '''
    found = []
    pos = 0
    depth = 0
    box_start = 0
    box_name = None

    while pos <= len(content):
        if depth == 0:
            match = START_PATTERN.search(content, pos)
            if match is None:
                break
            depth = 1
            box_start = match.start()
            box_name = match.group(1)
        else:
            match = END_PATTERN.search(content, pos)
            if match is None:
                break
            if match.group(0) == "{{":
                depth += 1
            else:
                depth -= 1
                if depth == 0:
                    found.append((content[box_start:match.end()], box_name))
        # Resume exactly after the match: skipping one more character would
        # miss a "}}" that immediately follows another "}}".
        pos = match.end()

    return found


def _closes(opener, token):
    '''Tell whether token is the closing delimiter matching opener.'''
    same_family = ((opener.startswith('{') and token[0] == '}')
                   or (opener.startswith('[') and token[0] == ']'))
    return same_family and len(opener) == len(token)


class Command(NoArgsCommand):
    '''
    query and update wikipedia for tag title.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--site',
            action='store',
            type='string',
            dest='site_url',
            default="http://fr.wikipedia.org/w/api.php",
            help='the url for the wikipedia site'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default=-1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--type',
            action='append',
            dest='types',
            type='choice',
            choices=['visible', 'hidden', 'infobox', 'all'],
            default=[],
            help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be passed multiple times'),
        make_option('--use-label',
            action='store_true',
            dest='use_label',
            default=False,
            help='use label instead of pageid to query wikipedia'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    def query_all_categories(self, hidden, site, pageid, use_label):
        '''
        Return the category titles of one page, namespace prefix removed.

        hidden selects hidden (True) or visible (False) categories.
        pageid is sent as a page title when use_label is true, as a page id
        otherwise. The titles collected so far are returned as soon as a
        response has no "query" part or does not contain exactly one page.
        '''
        clshow = 'hidden' if hidden else '!hidden'
        id_field = 'titles' if use_label else 'pageids'
        params = {'action': 'query', id_field: pageid, 'prop': 'categories', 'clshow': clshow}

        clcontinue = ""
        res = []

        # "" means first request; None means the API returned no continuation.
        while clcontinue is not None:
            if clcontinue:
                params['clcontinue'] = clcontinue

            wpquery = api.APIRequest(site, params)
            response = wpquery.query()

            if self.verbosity > 1:
                print("Query categories : " + repr(wpquery.request.get_full_url() + "?" + wpquery.request.get_data()))
                print(repr(response))

            query_dict = response.get('query', None)
            if query_dict is None:
                return res

            pages = query_dict.get("pages", {})
            if len(pages) != 1:
                return res

            page = next(iter(pages.values()))

            for cat in page.get('categories', []):
                title = cat.get('title', "")
                # Keep what follows the first ":" (the namespace prefix).
                title = title[title.find(":") + 1:]
                # A title equal to the continuation marker is skipped --
                # presumably it was already returned by the previous request.
                if title and clcontinue != ("%s|%s" % (pageid, title)):
                    res.append(title)

            clcontinue = response.get('query-continue', {}).get('categories', {}).get('clcontinue', None)

        if self.verbosity > 1:
            print("Query categories RES: ")
            print(repr(res))

        return res

    def process_categories(self, cat_list, hidden, tag):
        '''Create, when missing, each category and its link to tag.'''
        for cat in cat_list:
            wp_cat, _created = WpCategory.objects.get_or_create(label=cat)
            TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)

    def query_infoboxes(self, site, pageid, use_label):
        '''
        Fetch a page's current revision and extract its infoboxes.

        Returns (revision_id, [(infobox_source, infobox_name), ...]), or an
        empty list when the response has no "query" part, does not contain
        exactly one page, or the page has no revision.
        '''
        id_field = 'titles' if use_label else 'pageids'
        params = {'action': 'query', id_field: pageid, 'prop': 'revisions', 'rvprop': 'ids|content'}
        wpquery = api.APIRequest(site, params)
        response = wpquery.query()

        query_dict = response.get('query', None)
        if query_dict is None:
            return []

        pages = query_dict.get("pages", {})
        if len(pages) != 1:
            return []

        page = next(iter(pages.values()))
        if not page.get('revisions'):
            return []

        rev = page['revisions'][0]
        return_val = (rev['revid'], _extract_infoboxes(rev['*']))

        if self.verbosity > 1:
            print("Query infoboxes url: " + repr(wpquery.request.get_full_url() + "?" + wpquery.request.get_data()))
            print(repr(return_val))

        return return_val

    def split_infoboxes(self, src):
        '''
        Split an infobox body into a flat [key, value, key, value, ...] list.

        A "| key =" header only splits when it is found outside any open
        {{ }}, {{{ }}}, [[ ]] or [ ] group. Text before the first header is
        dropped; an input without any header yields an empty list.
        '''
        pos = 0
        previous_end = 0
        bounds = []
        delimiter_stack = []

        while pos <= len(src):
            delim = DELIMITER_PATTERN.search(src, pos)
            # Parameter headers are ignored while a delimiter is open.
            split = SPLIT_PATTERN.search(src, pos) if not delimiter_stack else None

            if split is not None and (delim is None or split.start() < delim.start()):
                if bounds:
                    # Close the value of the previous key.
                    bounds.append((previous_end, split.start(0)))
                bounds.append((split.start(1), split.end(1)))
                pos = split.end(0)
                previous_end = pos
            elif delim is not None:
                token = delim.group()
                if token.startswith("{") or token.startswith("["):
                    delimiter_stack.append(token)
                elif delimiter_stack and _closes(delimiter_stack[-1], token):
                    delimiter_stack.pop()
                # A closing token that does not match the top is just skipped.
                pos = delim.end()
            else:
                break

        if previous_end > 0:
            bounds.append((previous_end, len(src)))
        return [src[begin:end] for begin, end in bounds]

    def process_infoboxes(self, infobox_defs, tag):
        '''
        Store the infoboxes of tag and their parameters.

        infobox_defs is the value returned by query_infoboxes; a falsy value
        is ignored. Existing rows are updated with the new source / value.
        '''
        if not infobox_defs:
            return

        revision_id, infoboxes = infobox_defs[0], infobox_defs[1]
        for raw_src, name in infoboxes:
            src = raw_src.strip(' \t\n\r')
            tag_infobox, created = TagInfobox.objects.get_or_create(
                tag=tag, name=name, revision_id=revision_id, defaults={'source': src})
            if not created:
                tag_infobox.source = src
                tag_infobox.save()

            src = COMMENT_PATTERN.sub('', src)
            # Drop the closing "}}" and the "{{Infobox name" header.
            src = START_PATTERN.sub('', src[:-2]).strip()

            keyvalues = self.split_infoboxes(src)

            # keyvalues alternates key, value; a trailing unpaired item is ignored.
            for key, value in zip(keyvalues[0::2], keyvalues[1::2]):
                param, created = InfoboxParameter.objects.get_or_create(
                    tag_infobox=tag_infobox, param_name=key.strip(),
                    defaults={'param_value': value.strip()})
                if not created:
                    param.param_value = value.strip()
                    param.save()

    def handle_noargs(self, **options):
        '''
        Select the tags to process, ask for confirmation unless --force or a
        non-interactive run, then fetch and store the requested data per tag.
        '''
        self.style = no_style()

        interactive = options.get('interactive', True)
        self.verbosity = int(options.get('verbosity', '1'))
        use_label = options.get('use_label', False)
        force = options.get('force', False)
        limit = options.get("limit", -1)
        start = options.get("start", 0)
        randomize = options.get('random', False)

        # Only read the setting when no url was supplied, so a missing
        # setting cannot break a run that passes --site.
        site_url = options.get('site_url')
        if site_url is None:
            site_url = settings.WIKIPEDIA_API_URL

        types_list = options.get('types', [])
        if not types_list:
            types_mask = TYPES_MASK_DICT['all']
        else:
            types_mask = 0
            for type_name in types_list:
                types_mask |= TYPES_MASK_DICT[type_name]

        if self.verbosity > 1:
            print("types mask %s " % (bin(types_mask)))

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        queryset = Tag.objects.exclude(wikipedia_pageid=None)

        tag_list = options.get("tags", [])
        if tag_list:
            queryset = queryset.filter(label__in=tag_list)
        elif not options.get('all', False):
            # Default: only tags that have no category yet.
            queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc=0)

        queryset = queryset.order_by("?" if randomize else "label")

        if limit >= 0:
            # --limit is a number of tags, not an end index.
            queryset = queryset[start:start + limit]
        elif start > 0:
            queryset = queryset[start:]

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query))

        site = wiki.Wiki(site_url)

        count = queryset.count()
        if self.verbosity > 1:
            print("Processing %d tags" % (count))

        if not force and interactive:
            confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("wikipedia query cancelled")
            return

        for i, tag in enumerate(queryset):

            if self.verbosity > 1:
                print("processing tag %s (%d/%d)" % (tag.label, i + 1, count))
            else:
                utils.show_progress(i + 1, count, tag.label, 60)

            # Query the redirect target instead when the tag is a redirection.
            wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
            if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None:
                wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid

            # One transaction per tag.
            with transaction.commit_on_success():
                if types_mask & TYPES_MASK_DICT['visible']:
                    res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
                    self.process_categories(res, False, tag)

                if types_mask & TYPES_MASK_DICT['hidden']:
                    res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
                    self.process_categories(res, True, tag)

                if types_mask & TYPES_MASK_DICT['infobox']:
                    res = self.query_infoboxes(site, wikipedia_pageid, use_label)
                    self.process_infoboxes(res, tag)