src/hdalab/management/commands/query_wikipedia_category.py
changeset 271 8f77cf71ab02
parent 114 c59383cc9940
child 571 d9642be7c937
equal deleted inserted replaced
265:73f19fa4f997 271:8f77cf71ab02
       
     1 # -*- coding: utf-8 -*-
       
     2 '''
       
     3 Created on Jun 7, 2011
       
     4 
       
     5 @author: ymh
       
     6 '''
       
     7 
       
     8 from django.conf import settings
       
     9 from django.core.management.base import NoArgsCommand
       
    10 from django.core.management.color import no_style
       
    11 from hdabo.models import Tag
       
    12 from hdalab.models import WpCategory, TagWpCategory, TagInfobox, InfoboxParameter
       
    13 from optparse import make_option
       
    14 from wikitools import api,wiki
       
    15 import sys
       
    16 import re
       
    17 import itertools
       
    18 from hdabo import utils
       
    19 from django.db.models import Count
       
    20 from django.db import transaction
       
    21 
       
    22 
       
    23 TYPES_MASK_DICT = {
       
    24         u'visible': 0b001,
       
    25         u'hidden': 0b010,
       
    26         u'infobox': 0b100,
       
    27         u'all': 0b111,
       
    28     }
       
    29 
       
    30 START_PATTERN = re.compile(u"\{\{\s?Infobox\s+([^|]+)", re.M|re.U|re.I)
       
    31 END_PATTERN = re.compile(u"\{\{|\}\}", re.M|re.U)
       
    32 SPLIT_PATTERN = re.compile("\s*?\|\s*([\w]+[^=|]*)\s*=", re.U|re.M)
       
    33 DELIMITER_PATTERN = re.compile("\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
       
    34 COMMENT_PATTERN = re.compile("<!--.*?-->",re.U|re.M)
       
    35 
       
    36 
       
    37 
       
    38 class Command(NoArgsCommand):
       
    39     '''
       
    40     query and update wikipedia for tag title.
       
    41     '''
       
    42     options = ''
       
    43     help = """query and update wikipedia for tag title."""
       
    44     
       
    45     option_list = NoArgsCommand.option_list + (
       
    46         make_option('--all',
       
    47             action='store_true',
       
    48             dest='all',
       
    49             default=False,
       
    50             help='force all tags to be updated, not only those not yet processed'),
       
    51         make_option('--force',
       
    52             action='store_true',
       
    53             dest='force',
       
    54             default=False,
       
    55             help='ask no questions'),
       
    56         make_option('--random',
       
    57             action='store_true',
       
    58             dest='random',
       
    59             default=False,
       
    60             help='randomize query on tags'),
       
    61         make_option('--site',
       
    62             action='store',
       
    63             type='string',
       
    64             dest='site_url',
       
    65             default="http://fr.wikipedia.org/w/api.php",
       
    66             help='the url for the wikipedia site'),
       
    67         make_option('--limit',
       
    68             action='store',
       
    69             type='int',
       
    70             dest='limit',
       
    71             default= -1,
       
    72             help='number of tag to process'),
       
    73         make_option('--start',
       
    74             action='store',
       
    75             type='int',
       
    76             dest='start',
       
    77             default=0,
       
    78             help='number of tag to ignore'),
       
    79         make_option('--type',
       
    80             action='append',
       
    81             dest='types',
       
    82             type='choice',
       
    83             choices=['visible','hidden', 'infobox', 'all'],
       
    84             default=[],
       
    85             help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be assed multiple times'),
       
    86         make_option('--use-label',
       
    87             action='store_true',
       
    88             dest='use_label',
       
    89             default=False,
       
    90             help='use label instead of pageid to query wikipedia'),
       
    91         make_option('--tag',
       
    92             action='append',
       
    93             dest='tags',
       
    94             type='string',
       
    95             default=[],
       
    96             help='the tag to query'),
       
    97 
       
    98     )
       
    99     
       
   100     
       
   101 #    def process_wp_response(self, label, response):        
       
   102 #
       
   103 #        query_dict = response['query']
       
   104 #        # get page if multiple pages or none -> return Tag.null_result
       
   105 #        pages = query_dict.get("pages", {})
       
   106 #        if len(pages) > 1 or len(pages) == 0:
       
   107 #            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
       
   108 #        
       
   109 #        page = pages.values()[0]
       
   110 #        
       
   111 #        if u"invalid" in page or u"missing" in page:
       
   112 #            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
       
   113 #
       
   114 #        url = page.get(u'fullurl', None)
       
   115 #        pageid = page.get(u'pageid', None)
       
   116 #        new_label = page[u'title']
       
   117 #        
       
   118 #        if self.__is_homonymie(page):
       
   119 #            status = Tag.TAG_URL_STATUS_DICT["homonyme"]
       
   120 #        elif u"redirect" in page:
       
   121 #            status = Tag.TAG_URL_STATUS_DICT["redirection"]
       
   122 #        else:
       
   123 #            status = Tag.TAG_URL_STATUS_DICT["match"]
       
   124 #        
       
   125 #        return new_label, status, url, pageid 
       
   126 
       
   127     def query_all_categories(self, hidden, site, pageid, use_label):
       
   128         
       
   129         clshow = 'hidden' if hidden else '!hidden'
       
   130         params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow}
       
   131         
       
   132         clcontinue = ""        
       
   133         res = []
       
   134         
       
   135         while clcontinue is not None:
       
   136             if clcontinue:
       
   137                 params['clcontinue'] = clcontinue
       
   138                 
       
   139             wpquery = api.APIRequest(site, params) #@UndefinedVariable
       
   140             response = wpquery.query()
       
   141             
       
   142             if self.verbosity > 1:
       
   143                 print "Query infoboxes : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
       
   144                 print repr(response)
       
   145 
       
   146             
       
   147             query_dict = response.get('query', None)
       
   148             
       
   149             if query_dict is None:
       
   150                 return res
       
   151             
       
   152             pages = query_dict.get("pages", {})
       
   153             if len(pages) > 1 or len(pages) == 0:
       
   154                 return res
       
   155             
       
   156             page = pages.values()[0]
       
   157                         
       
   158             for cat in page.get('categories',[]):
       
   159                 title = cat.get('title',"")
       
   160                 title = title[title.find(":")+1:]
       
   161                 if title and clcontinue != ("%s|%s" % (pageid,title)):
       
   162                     res.append(title)
       
   163             
       
   164             clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None)
       
   165 
       
   166         if self.verbosity > 1:
       
   167             print "Query infoboxes RES: "
       
   168             print repr(res)
       
   169             
       
   170         return res
       
   171     
       
   172     def process_categories(self, cat_list, hidden, tag):
       
   173         
       
   174         for cat in cat_list:
       
   175             wp_cat,created = WpCategory.objects.get_or_create(label=cat) #@UnusedVariable
       
   176             TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)
       
   177             
       
   178                 
       
   179     def query_infoboxes(self, site, pageid, use_label):
       
   180         
       
   181         res = []
       
   182         params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
       
   183         wpquery = api.APIRequest(site, params) #@UndefinedVariable
       
   184         response = wpquery.query()
       
   185         
       
   186         query_dict = response.get('query', None)
       
   187             
       
   188         if query_dict is None:
       
   189             return res
       
   190             
       
   191         pages = query_dict.get("pages", {})
       
   192         if len(pages) > 1 or len(pages) == 0:
       
   193             return res
       
   194 
       
   195         page = pages.values()[0]
       
   196         
       
   197         if 'revisions' not in page or not page['revisions']:
       
   198             return res
       
   199         
       
   200         rev = page['revisions'][0]
       
   201         
       
   202         content = rev['*']
       
   203                 
       
   204         start = 0
       
   205         depth = 0
       
   206         current_infobox_name = None
       
   207         current_start = 0
       
   208         
       
   209         while start <= len(content):
       
   210             if depth==0:
       
   211                 resm = START_PATTERN.search(content[start:])
       
   212                 if resm is None:
       
   213                     break
       
   214                 depth = 1
       
   215                 current_start = resm.start()+start
       
   216                 start += resm.end()+1
       
   217                 current_infobox_name = resm.group(1)                    
       
   218             else:
       
   219                 resm = END_PATTERN.search(content[start:])
       
   220                 if resm is None:
       
   221                     break
       
   222                 if resm.group(0) == "{{":
       
   223                     depth += 1
       
   224                 elif resm.group(0) == "}}":
       
   225                     depth -= 1
       
   226                 if depth == 0:
       
   227                     res.append((content[current_start:resm.end()+start], current_infobox_name))
       
   228                 start += resm.end()+1
       
   229 
       
   230         return_val = (rev['revid'],res)
       
   231         
       
   232         if self.verbosity > 1:
       
   233             print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
       
   234             print repr(return_val)
       
   235         
       
   236         return return_val
       
   237     
       
   238     def split_infoboxes(self, src):
       
   239         
       
   240         start = 0
       
   241         previous_end = 0
       
   242         split_indexes = []
       
   243         delimiter_stack = []
       
   244         while start<=len(src):            
       
   245             resd = DELIMITER_PATTERN.search(src[start:])
       
   246             ress = SPLIT_PATTERN.search(src[start:]) if len(delimiter_stack) == 0 else None
       
   247             startd = resd.start() if resd is not None else sys.maxint
       
   248             starts = ress.start() if ress is not None else sys.maxint
       
   249             if starts < startd:
       
   250                 if len(split_indexes)>0:
       
   251                     split_indexes.append((previous_end, ress.start(0)+start))
       
   252                 split_indexes.append((ress.start(1)+start, ress.end(1)+start))
       
   253                 start += ress.end(0)
       
   254                 previous_end = start
       
   255             elif startd < sys.maxint:
       
   256                 if resd.group().startswith("{") or resd.group().startswith("[") :
       
   257                     delimiter_stack.append(resd.group())
       
   258                 elif len(delimiter_stack)>0 and ( (delimiter_stack[-1].startswith('{') and resd.group()[0] == '}') or (delimiter_stack[-1].startswith('[') and resd.group()[0] == ']') ) and len(delimiter_stack[-1]) == len(resd.group()):
       
   259                     delimiter_stack.pop()
       
   260                 start += resd.end()
       
   261             else:
       
   262                 break
       
   263             
       
   264         if previous_end > 0:
       
   265             split_indexes.append((previous_end,len(src)))
       
   266         res = [src[start:end] for start,end in split_indexes]
       
   267         return res
       
   268 
       
   269 
       
   270 
       
   271     def process_infoboxes(self, infobox_defs, tag):
       
   272         
       
   273         if not infobox_defs:
       
   274             return
       
   275         
       
   276         revision_id = infobox_defs[0]
       
   277         for infobox in infobox_defs[1]:
       
   278             src = infobox[0].strip(' \t\n\r')            
       
   279             name = infobox[1]
       
   280             tag_infobox, created = TagInfobox.objects.get_or_create(tag=tag, name=name, revision_id = revision_id, defaults={'source': src})
       
   281             if not created:
       
   282                 tag_infobox.source = src
       
   283                 tag_infobox.save()
       
   284 
       
   285             src = COMMENT_PATTERN.sub('',src)
       
   286             src = START_PATTERN.sub('',src[:-2]).strip()
       
   287             
       
   288             keyvalues = self.split_infoboxes(src)
       
   289 
       
   290             for key,value in itertools.izip(*[itertools.islice(keyvalues, i, None, 2) for i in range(2)]):
       
   291                 param, created = InfoboxParameter.objects.get_or_create(tag_infobox=tag_infobox, param_name=key.strip(), defaults={'param_value':value.strip()})
       
   292                 if not created:
       
   293                     param.param_value = value.strip()
       
   294                     param.save()
       
   295         
       
   296     def handle_noargs(self, **options):
       
   297         
       
   298         self.style = no_style()
       
   299         
       
   300         interactive = options.get('interactive', True)
       
   301         
       
   302         self.verbosity = int(options.get('verbosity', '1'))
       
   303         use_label = options.get('use_label', False)
       
   304         
       
   305         force = options.get('force', False)
       
   306         
       
   307         limit = options.get("limit", -1)
       
   308         start = options.get("start", 0)
       
   309         
       
   310         site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
       
   311         
       
   312         random = options.get('random', False)
       
   313         
       
   314         types_mask = 0
       
   315         types_list = options.get('types', [])
       
   316         
       
   317         if len(types_list) == 0:
       
   318             types_mask = TYPES_MASK_DICT['all']
       
   319         else:
       
   320             for t in types_list:
       
   321                 types_mask |=  TYPES_MASK_DICT[t]
       
   322                 
       
   323         if self.verbosity > 1 :
       
   324             print "types mask %s " % (bin(types_mask))  
       
   325         
       
   326         if self.verbosity > 2:
       
   327             print "option passed : " + repr(options)
       
   328 
       
   329 
       
   330         queryset = Tag.objects.exclude(wikipedia_pageid= None)
       
   331         
       
   332         tag_list = options.get("tags", []);
       
   333         
       
   334         if tag_list:
       
   335             queryset = queryset.filter(label__in=tag_list)
       
   336         elif not options.get('all',False):            
       
   337             queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
       
   338         #else:
       
   339         #    queryset = Tag.objects.filter(url_status=None)                    
       
   340         
       
   341         if random:
       
   342             queryset = queryset.order_by("?")
       
   343         else:
       
   344             queryset = queryset.order_by("label")
       
   345         
       
   346         if limit >= 0:
       
   347             queryset = queryset[start:limit]
       
   348         elif start > 0:
       
   349             queryset = queryset[start:]            
       
   350         
       
   351         if self.verbosity > 2 :
       
   352             print "Tag Query is %s" % (queryset.query)
       
   353         
       
   354         site = wiki.Wiki(site_url) #@UndefinedVariable
       
   355         
       
   356         
       
   357         count = queryset.count()
       
   358         if self.verbosity > 1:
       
   359             print "Processing %d tags" % (count)
       
   360         
       
   361         if not force and interactive:
       
   362             confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
       
   363         else:
       
   364             confirm = 'yes'
       
   365             
       
   366         if confirm != "yes":
       
   367             print "wikipedia query cancelled"
       
   368             return
       
   369 
       
   370         
       
   371         
       
   372         for i, tag in enumerate(queryset):
       
   373             
       
   374             if self.verbosity > 1:
       
   375                 print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
       
   376             else:
       
   377                 utils.show_progress(i + 1, count, tag.label, 60)                            
       
   378 
       
   379             # query categories
       
   380             wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
       
   381             if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None :
       
   382                 wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid
       
   383 
       
   384             with transaction.commit_on_success():
       
   385                 if types_mask & TYPES_MASK_DICT['visible']:
       
   386                     res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
       
   387                     self.process_categories(res, False, tag)
       
   388     
       
   389                 if types_mask & TYPES_MASK_DICT['hidden']:
       
   390                     res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
       
   391                     self.process_categories(res, True, tag)
       
   392                 
       
   393                 if types_mask & TYPES_MASK_DICT['infobox']:
       
   394                     res = self.query_infoboxes(site, wikipedia_pageid, use_label)
       
   395                     self.process_infoboxes(res, tag)
       
   396