--- a/web/hdalab/management/commands/query_wikipedia_category.py Fri Nov 16 18:12:05 2012 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,396 +0,0 @@
-# -*- coding: utf-8 -*-
-'''
-Created on Jun 7, 2011
-
-@author: ymh
-'''
-
-from django.conf import settings
-from django.core.management.base import NoArgsCommand
-from django.core.management.color import no_style
-from hdabo.models import Tag
-from hdalab.models import WpCategory, TagWpCategory, TagInfobox, InfoboxParameter
-from optparse import make_option
-from wikitools import api,wiki
-import sys
-import re
-import itertools
-from hdabo import utils
-from django.db.models import Count
-from django.db import transaction
-
-
-TYPES_MASK_DICT = {
- u'visible': 0b001,
- u'hidden': 0b010,
- u'infobox': 0b100,
- u'all': 0b111,
- }
-
-START_PATTERN = re.compile(u"\{\{\s?Infobox\s+([^|]+)", re.M|re.U|re.I)
-END_PATTERN = re.compile(u"\{\{|\}\}", re.M|re.U)
-SPLIT_PATTERN = re.compile("\s*?\|\s*([\w]+[^=|]*)\s*=", re.U|re.M)
-DELIMITER_PATTERN = re.compile("\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
-COMMENT_PATTERN = re.compile("<!--.*?-->",re.U|re.M)
-
-
-
-class Command(NoArgsCommand):
- '''
- query and update wikipedia for tag title.
- '''
- options = ''
- help = """query and update wikipedia for tag title."""
-
- option_list = NoArgsCommand.option_list + (
- make_option('--all',
- action='store_true',
- dest='all',
- default=False,
- help='force all tags to be updated, not only those not yet processed'),
- make_option('--force',
- action='store_true',
- dest='force',
- default=False,
- help='ask no questions'),
- make_option('--random',
- action='store_true',
- dest='random',
- default=False,
- help='randomize query on tags'),
- make_option('--site',
- action='store',
- type='string',
- dest='site_url',
- default="http://fr.wikipedia.org/w/api.php",
- help='the url for the wikipedia site'),
- make_option('--limit',
- action='store',
- type='int',
- dest='limit',
- default= -1,
- help='number of tag to process'),
- make_option('--start',
- action='store',
- type='int',
- dest='start',
- default=0,
- help='number of tag to ignore'),
- make_option('--type',
- action='append',
- dest='types',
- type='choice',
- choices=['visible','hidden', 'infobox', 'all'],
- default=[],
- help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be assed multiple times'),
- make_option('--use-label',
- action='store_true',
- dest='use_label',
- default=False,
- help='use label instead of pageid to query wikipedia'),
- make_option('--tag',
- action='append',
- dest='tags',
- type='string',
- default=[],
- help='the tag to query'),
-
- )
-
-
-# def process_wp_response(self, label, response):
-#
-# query_dict = response['query']
-# # get page if multiple pages or none -> return Tag.null_result
-# pages = query_dict.get("pages", {})
-# if len(pages) > 1 or len(pages) == 0:
-# return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
-#
-# page = pages.values()[0]
-#
-# if u"invalid" in page or u"missing" in page:
-# return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
-#
-# url = page.get(u'fullurl', None)
-# pageid = page.get(u'pageid', None)
-# new_label = page[u'title']
-#
-# if self.__is_homonymie(page):
-# status = Tag.TAG_URL_STATUS_DICT["homonyme"]
-# elif u"redirect" in page:
-# status = Tag.TAG_URL_STATUS_DICT["redirection"]
-# else:
-# status = Tag.TAG_URL_STATUS_DICT["match"]
-#
-# return new_label, status, url, pageid
-
- def query_all_categories(self, hidden, site, pageid, use_label):
-
- clshow = 'hidden' if hidden else '!hidden'
- params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow}
-
- clcontinue = ""
- res = []
-
- while clcontinue is not None:
- if clcontinue:
- params['clcontinue'] = clcontinue
-
- wpquery = api.APIRequest(site, params) #@UndefinedVariable
- response = wpquery.query()
-
- if self.verbosity > 1:
- print "Query infoboxes : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
- print repr(response)
-
-
- query_dict = response.get('query', None)
-
- if query_dict is None:
- return res
-
- pages = query_dict.get("pages", {})
- if len(pages) > 1 or len(pages) == 0:
- return res
-
- page = pages.values()[0]
-
- for cat in page.get('categories',[]):
- title = cat.get('title',"")
- title = title[title.find(":")+1:]
- if title and clcontinue != ("%s|%s" % (pageid,title)):
- res.append(title)
-
- clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None)
-
- if self.verbosity > 1:
- print "Query infoboxes RES: "
- print repr(res)
-
- return res
-
- def process_categories(self, cat_list, hidden, tag):
-
- for cat in cat_list:
- wp_cat,created = WpCategory.objects.get_or_create(label=cat) #@UnusedVariable
- TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)
-
-
- def query_infoboxes(self, site, pageid, use_label):
-
- res = []
- params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
- wpquery = api.APIRequest(site, params) #@UndefinedVariable
- response = wpquery.query()
-
- query_dict = response.get('query', None)
-
- if query_dict is None:
- return res
-
- pages = query_dict.get("pages", {})
- if len(pages) > 1 or len(pages) == 0:
- return res
-
- page = pages.values()[0]
-
- if 'revisions' not in page or not page['revisions']:
- return res
-
- rev = page['revisions'][0]
-
- content = rev['*']
-
- start = 0
- depth = 0
- current_infobox_name = None
- current_start = 0
-
- while start <= len(content):
- if depth==0:
- resm = START_PATTERN.search(content[start:])
- if resm is None:
- break
- depth = 1
- current_start = resm.start()+start
- start += resm.end()+1
- current_infobox_name = resm.group(1)
- else:
- resm = END_PATTERN.search(content[start:])
- if resm is None:
- break
- if resm.group(0) == "{{":
- depth += 1
- elif resm.group(0) == "}}":
- depth -= 1
- if depth == 0:
- res.append((content[current_start:resm.end()+start], current_infobox_name))
- start += resm.end()+1
-
- return_val = (rev['revid'],res)
-
- if self.verbosity > 1:
- print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
- print repr(return_val)
-
- return return_val
-
- def split_infoboxes(self, src):
-
- start = 0
- previous_end = 0
- split_indexes = []
- delimiter_stack = []
- while start<=len(src):
- resd = DELIMITER_PATTERN.search(src[start:])
- ress = SPLIT_PATTERN.search(src[start:]) if len(delimiter_stack) == 0 else None
- startd = resd.start() if resd is not None else sys.maxint
- starts = ress.start() if ress is not None else sys.maxint
- if starts < startd:
- if len(split_indexes)>0:
- split_indexes.append((previous_end, ress.start(0)+start))
- split_indexes.append((ress.start(1)+start, ress.end(1)+start))
- start += ress.end(0)
- previous_end = start
- elif startd < sys.maxint:
- if resd.group().startswith("{") or resd.group().startswith("[") :
- delimiter_stack.append(resd.group())
- elif len(delimiter_stack)>0 and ( (delimiter_stack[-1].startswith('{') and resd.group()[0] == '}') or (delimiter_stack[-1].startswith('[') and resd.group()[0] == ']') ) and len(delimiter_stack[-1]) == len(resd.group()):
- delimiter_stack.pop()
- start += resd.end()
- else:
- break
-
- if previous_end > 0:
- split_indexes.append((previous_end,len(src)))
- res = [src[start:end] for start,end in split_indexes]
- return res
-
-
-
- def process_infoboxes(self, infobox_defs, tag):
-
- if not infobox_defs:
- return
-
- revision_id = infobox_defs[0]
- for infobox in infobox_defs[1]:
- src = infobox[0].strip(' \t\n\r')
- name = infobox[1]
- tag_infobox, created = TagInfobox.objects.get_or_create(tag=tag, name=name, revision_id = revision_id, defaults={'source': src})
- if not created:
- tag_infobox.source = src
- tag_infobox.save()
-
- src = COMMENT_PATTERN.sub('',src)
- src = START_PATTERN.sub('',src[:-2]).strip()
-
- keyvalues = self.split_infoboxes(src)
-
- for key,value in itertools.izip(*[itertools.islice(keyvalues, i, None, 2) for i in range(2)]):
- param, created = InfoboxParameter.objects.get_or_create(tag_infobox=tag_infobox, param_name=key.strip(), defaults={'param_value':value.strip()})
- if not created:
- param.param_value = value.strip()
- param.save()
-
- def handle_noargs(self, **options):
-
- self.style = no_style()
-
- interactive = options.get('interactive', True)
-
- self.verbosity = int(options.get('verbosity', '1'))
- use_label = options.get('use_label', False)
-
- force = options.get('force', False)
-
- limit = options.get("limit", -1)
- start = options.get("start", 0)
-
- site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
-
- random = options.get('random', False)
-
- types_mask = 0
- types_list = options.get('types', [])
-
- if len(types_list) == 0:
- types_mask = TYPES_MASK_DICT['all']
- else:
- for t in types_list:
- types_mask |= TYPES_MASK_DICT[t]
-
- if self.verbosity > 1 :
- print "types mask %s " % (bin(types_mask))
-
- if self.verbosity > 2:
- print "option passed : " + repr(options)
-
-
- queryset = Tag.objects.exclude(wikipedia_pageid= None)
-
- tag_list = options.get("tags", []);
-
- if tag_list:
- queryset = queryset.filter(label__in=tag_list)
- elif not options.get('all',False):
- queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
- #else:
- # queryset = Tag.objects.filter(url_status=None)
-
- if random:
- queryset = queryset.order_by("?")
- else:
- queryset = queryset.order_by("label")
-
- if limit >= 0:
- queryset = queryset[start:limit]
- elif start > 0:
- queryset = queryset[start:]
-
- if self.verbosity > 2 :
- print "Tag Query is %s" % (queryset.query)
-
- site = wiki.Wiki(site_url) #@UndefinedVariable
-
-
- count = queryset.count()
- if self.verbosity > 1:
- print "Processing %d tags" % (count)
-
- if not force and interactive:
- confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
- else:
- confirm = 'yes'
-
- if confirm != "yes":
- print "wikipedia query cancelled"
- return
-
-
-
- for i, tag in enumerate(queryset):
-
- if self.verbosity > 1:
- print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
- else:
- utils.show_progress(i + 1, count, tag.label, 60)
-
- # query categories
- wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
- if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None :
- wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid
-
- with transaction.commit_on_success():
- if types_mask & TYPES_MASK_DICT['visible']:
- res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
- self.process_categories(res, False, tag)
-
- if types_mask & TYPES_MASK_DICT['hidden']:
- res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
- self.process_categories(res, True, tag)
-
- if types_mask & TYPES_MASK_DICT['infobox']:
- res = self.query_infoboxes(site, wikipedia_pageid, use_label)
- self.process_infoboxes(res, tag)
-