--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hdalab/management/commands/query_wikipedia_category.py Tue Jun 17 10:25:33 2014 +0200
@@ -0,0 +1,396 @@
+# -*- coding: utf-8 -*-
+'''
+Created on Jun 7, 2011
+
+@author: ymh
+'''
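+# Example invocation (a sketch, assuming the usual Django manage.py entry point;
+# the command name is derived from this module's name):
+#   python manage.py query_wikipedia_category --type visible --type infobox --limit 100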
+
+from django.conf import settings
+from django.core.management.base import NoArgsCommand
+from django.core.management.color import no_style
+from hdabo.models import Tag
+from hdalab.models import WpCategory, TagWpCategory, TagInfobox, InfoboxParameter
+from optparse import make_option
+from wikitools import api,wiki
+import sys
+import re
+import itertools
+from hdabo import utils
+from django.db.models import Count
+from django.db import transaction
+
+
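+# Bit mask values for the --type option; each query type sets its own bit so
+# several types can be combined in a single run.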
+TYPES_MASK_DICT = {
+ u'visible': 0b001,
+ u'hidden': 0b010,
+ u'infobox': 0b100,
+ u'all': 0b111,
+ }
+
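+# Wikitext parsing patterns:
+#   START_PATTERN     matches the opening "{{Infobox <name>" of an infobox
+#   END_PATTERN       matches "{{" / "}}" pairs used to track template nesting depth
+#   SPLIT_PATTERN     matches "| key =" parameter separators inside an infobox
+#   DELIMITER_PATTERN matches template/link delimiters so separators nested in
+#                     templates or links are not treated as parameter boundaries
+#   COMMENT_PATTERN   matches HTML comments to be stripped from the source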
+START_PATTERN = re.compile(u"\{\{\s?Infobox\s+([^|]+)", re.M|re.U|re.I)
+END_PATTERN = re.compile(u"\{\{|\}\}", re.M|re.U)
+SPLIT_PATTERN = re.compile("\s*?\|\s*([\w]+[^=|]*)\s*=", re.U|re.M)
+DELIMITER_PATTERN = re.compile("\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
+COMMENT_PATTERN = re.compile("<!--.*?-->",re.U|re.M)
+
+
+
+class Command(NoArgsCommand):
+ '''
+ Query Wikipedia for each tag's categories (visible and hidden) and infoboxes and store the results in the local models.
+ '''
+ options = ''
+ help = """query and update wikipedia for tag title."""
+
+ option_list = NoArgsCommand.option_list + (
+ make_option('--all',
+ action='store_true',
+ dest='all',
+ default=False,
+ help='force all tags to be updated, not only those not yet processed'),
+ make_option('--force',
+ action='store_true',
+ dest='force',
+ default=False,
+ help='ask no questions'),
+ make_option('--random',
+ action='store_true',
+ dest='random',
+ default=False,
+ help='randomize query on tags'),
+ make_option('--site',
+ action='store',
+ type='string',
+ dest='site_url',
+ default="http://fr.wikipedia.org/w/api.php",
+ help='the url for the wikipedia site'),
+ make_option('--limit',
+ action='store',
+ type='int',
+ dest='limit',
+ default= -1,
+ help='number of tags to process'),
+ make_option('--start',
+ action='store',
+ type='int',
+ dest='start',
+ default=0,
+ help='number of tags to skip before processing'),
+ make_option('--type',
+ action='append',
+ dest='types',
+ type='choice',
+ choices=['visible','hidden', 'infobox', 'all'],
+ default=[],
+ help='which type of query to perform: visible = visible categories, hidden = hidden categories, infobox = infoboxes, all = all of them. This option can be passed multiple times'),
+ make_option('--use-label',
+ action='store_true',
+ dest='use_label',
+ default=False,
+ help='use label instead of pageid to query wikipedia'),
+ make_option('--tag',
+ action='append',
+ dest='tags',
+ type='string',
+ default=[],
+ help='label of a tag to query; can be passed multiple times'),
+
+ )
+
+
+# def process_wp_response(self, label, response):
+#
+# query_dict = response['query']
+# # get page if multiple pages or none -> return Tag.null_result
+# pages = query_dict.get("pages", {})
+# if len(pages) > 1 or len(pages) == 0:
+# return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
+#
+# page = pages.values()[0]
+#
+# if u"invalid" in page or u"missing" in page:
+# return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
+#
+# url = page.get(u'fullurl', None)
+# pageid = page.get(u'pageid', None)
+# new_label = page[u'title']
+#
+# if self.__is_homonymie(page):
+# status = Tag.TAG_URL_STATUS_DICT["homonyme"]
+# elif u"redirect" in page:
+# status = Tag.TAG_URL_STATUS_DICT["redirection"]
+# else:
+# status = Tag.TAG_URL_STATUS_DICT["match"]
+#
+# return new_label, status, url, pageid
+
+ def query_all_categories(self, hidden, site, pageid, use_label):
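+ # Return the list of category titles (hidden or visible, depending on `hidden`)
+ # for one page, following the old-style 'query-continue' pagination of the
+ # MediaWiki API.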
+
+ clshow = 'hidden' if hidden else '!hidden'
+ params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow}
+
+ clcontinue = ""
+ res = []
+
+ while clcontinue is not None:
+ if clcontinue:
+ params['clcontinue'] = clcontinue
+
+ wpquery = api.APIRequest(site, params) #@UndefinedVariable
+ response = wpquery.query()
+
+ if self.verbosity > 1:
+ print "Query infoboxes : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
+ print repr(response)
+
+
+ query_dict = response.get('query', None)
+
+ if query_dict is None:
+ return res
+
+ pages = query_dict.get("pages", {})
+ if len(pages) > 1 or len(pages) == 0:
+ return res
+
+ page = pages.values()[0]
+
+ for cat in page.get('categories',[]):
+ title = cat.get('title',"")
+ title = title[title.find(":")+1:]
+ if title and clcontinue != ("%s|%s" % (pageid,title)):
+ res.append(title)
+
+ clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None)
+
+ if self.verbosity > 1:
+ print "Query infoboxes RES: "
+ print repr(res)
+
+ return res
+
+ def process_categories(self, cat_list, hidden, tag):
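+ # Store each category and its link to the tag (get_or_create keeps reruns idempotent).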
+
+ for cat in cat_list:
+ wp_cat,created = WpCategory.objects.get_or_create(label=cat) #@UnusedVariable
+ TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)
+
+
+ def query_infoboxes(self, site, pageid, use_label):
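+ # Fetch the wikitext of the page's latest revision and extract every
+ # "{{Infobox ...}}" block by tracking the nesting depth of template braces.
+ # Returns (revision_id, [(infobox_source, infobox_name), ...]), or [] on failure.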
+
+ res = []
+ params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
+ wpquery = api.APIRequest(site, params) #@UndefinedVariable
+ response = wpquery.query()
+
+ query_dict = response.get('query', None)
+
+ if query_dict is None:
+ return res
+
+ pages = query_dict.get("pages", {})
+ if len(pages) > 1 or len(pages) == 0:
+ return res
+
+ page = pages.values()[0]
+
+ if 'revisions' not in page or not page['revisions']:
+ return res
+
+ rev = page['revisions'][0]
+
+ content = rev['*']
+
+ start = 0
+ depth = 0
+ current_infobox_name = None
+ current_start = 0
+
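+ # Scan the wikitext: locate an infobox start, then follow "{{" / "}}" matches
+ # until the depth returns to zero, which marks the end of that infobox.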
+ while start <= len(content):
+ if depth==0:
+ resm = START_PATTERN.search(content[start:])
+ if resm is None:
+ break
+ depth = 1
+ current_start = resm.start()+start
+ start += resm.end()+1
+ current_infobox_name = resm.group(1)
+ else:
+ resm = END_PATTERN.search(content[start:])
+ if resm is None:
+ break
+ if resm.group(0) == "{{":
+ depth += 1
+ elif resm.group(0) == "}}":
+ depth -= 1
+ if depth == 0:
+ res.append((content[current_start:resm.end()+start], current_infobox_name))
+ start += resm.end()+1
+
+ return_val = (rev['revid'],res)
+
+ if self.verbosity > 1:
+ print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
+ print repr(return_val)
+
+ return return_val
+
+ def split_infoboxes(self, src):
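+ # Split an infobox body into a flat [key, value, key, value, ...] list.
+ # SPLIT_PATTERN is only applied outside nested templates/links, which are
+ # skipped by tracking DELIMITER_PATTERN matches on a small stack.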
+
+ start = 0
+ previous_end = 0
+ split_indexes = []
+ delimiter_stack = []
+ while start<=len(src):
+ resd = DELIMITER_PATTERN.search(src[start:])
+ ress = SPLIT_PATTERN.search(src[start:]) if len(delimiter_stack) == 0 else None
+ startd = resd.start() if resd is not None else sys.maxint
+ starts = ress.start() if ress is not None else sys.maxint
+ if starts < startd:
+ if len(split_indexes)>0:
+ split_indexes.append((previous_end, ress.start(0)+start))
+ split_indexes.append((ress.start(1)+start, ress.end(1)+start))
+ start += ress.end(0)
+ previous_end = start
+ elif startd < sys.maxint:
+ if resd.group().startswith("{") or resd.group().startswith("[") :
+ delimiter_stack.append(resd.group())
+ elif len(delimiter_stack)>0 and ( (delimiter_stack[-1].startswith('{') and resd.group()[0] == '}') or (delimiter_stack[-1].startswith('[') and resd.group()[0] == ']') ) and len(delimiter_stack[-1]) == len(resd.group()):
+ delimiter_stack.pop()
+ start += resd.end()
+ else:
+ break
+
+ if previous_end > 0:
+ split_indexes.append((previous_end,len(src)))
+ res = [src[s:e] for s, e in split_indexes]
+ return res
+
+
+
+ def process_infoboxes(self, infobox_defs, tag):
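+ # Persist each extracted infobox and its individual parameters for the tag.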
+
+ if not infobox_defs:
+ return
+
+ revision_id = infobox_defs[0]
+ for infobox in infobox_defs[1]:
+ src = infobox[0].strip(' \t\n\r')
+ name = infobox[1]
+ tag_infobox, created = TagInfobox.objects.get_or_create(tag=tag, name=name, revision_id = revision_id, defaults={'source': src})
+ if not created:
+ tag_infobox.source = src
+ tag_infobox.save()
+
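+ # Strip HTML comments, the "{{Infobox <name>" header and the closing braces,
+ # leaving only the "| key = value" parameter block.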
+ src = COMMENT_PATTERN.sub('',src)
+ src = START_PATTERN.sub('',src[:-2]).strip()
+
+ keyvalues = self.split_infoboxes(src)
+
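+ # keyvalues alternates keys and values; walk it two items at a time.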
+ for key,value in itertools.izip(*[itertools.islice(keyvalues, i, None, 2) for i in range(2)]):
+ param, created = InfoboxParameter.objects.get_or_create(tag_infobox=tag_infobox, param_name=key.strip(), defaults={'param_value':value.strip()})
+ if not created:
+ param.param_value = value.strip()
+ param.save()
+
+ def handle_noargs(self, **options):
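+ # Entry point: build the tag queryset from the command options, then query
+ # Wikipedia categories and/or infoboxes for each tag, one transaction per tag.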
+
+ self.style = no_style()
+
+ interactive = options.get('interactive', True)
+
+ self.verbosity = int(options.get('verbosity', '1'))
+ use_label = options.get('use_label', False)
+
+ force = options.get('force', False)
+
+ limit = options.get("limit", -1)
+ start = options.get("start", 0)
+
+ site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
+
+ random = options.get('random', False)
+
+ types_mask = 0
+ types_list = options.get('types', [])
+
+ if len(types_list) == 0:
+ types_mask = TYPES_MASK_DICT['all']
+ else:
+ for t in types_list:
+ types_mask |= TYPES_MASK_DICT[t]
+
+ if self.verbosity > 1 :
+ print "types mask %s " % (bin(types_mask))
+
+ if self.verbosity > 2:
+ print "option passed : " + repr(options)
+
+
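+ # Only tags already matched to a Wikipedia page (non-null pageid) can be queried.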
+ queryset = Tag.objects.exclude(wikipedia_pageid= None)
+
+ tag_list = options.get("tags", [])
+
+ if tag_list:
+ queryset = queryset.filter(label__in=tag_list)
+ elif not options.get('all',False):
+ queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
+ #else:
+ # queryset = Tag.objects.filter(url_status=None)
+
+ if random:
+ queryset = queryset.order_by("?")
+ else:
+ queryset = queryset.order_by("label")
+
+ if limit >= 0:
+ queryset = queryset[start:start + limit]  # --limit is a count of tags, applied after --start
+ elif start > 0:
+ queryset = queryset[start:]
+
+ if self.verbosity > 2 :
+ print "Tag Query is %s" % (queryset.query)
+
+ site = wiki.Wiki(site_url) #@UndefinedVariable
+
+
+ count = queryset.count()
+ if self.verbosity > 1:
+ print "Processing %d tags" % (count)
+
+ if not force and interactive:
+ confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
+ else:
+ confirm = 'yes'
+
+ if confirm != "yes":
+ print "wikipedia query cancelled"
+ return
+
+
+
+ for i, tag in enumerate(queryset):
+
+ if self.verbosity > 1:
+ print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
+ else:
+ utils.show_progress(i + 1, count, tag.label, 60)
+
+ # pick the Wikipedia page id or label to query, following redirections when possible
+ wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
+ if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None :
+ wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid
+
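+ # One transaction per tag: a failure rolls back only that tag's updates.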
+ with transaction.commit_on_success():
+ if types_mask & TYPES_MASK_DICT['visible']:
+ res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
+ self.process_categories(res, False, tag)
+
+ if types_mask & TYPES_MASK_DICT['hidden']:
+ res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
+ self.process_categories(res, True, tag)
+
+ if types_mask & TYPES_MASK_DICT['infobox']:
+ res = self.query_infoboxes(site, wikipedia_pageid, use_label)
+ self.process_infoboxes(res, tag)
+