src/hdalab/management/commands/query_wikipedia_category.py
author cavaliet
Tue, 17 Jun 2014 10:25:33 +0200
changeset 271 8f77cf71ab02
parent 114 web/hdalab/management/commands/query_wikipedia_category.py@c59383cc9940
child 571 d9642be7c937
permissions -rw-r--r--
commit the venv update (django and dependancies) in the good head
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
# -*- coding: utf-8 -*-
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
'''
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
Created on Jun 7, 2011
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
@author: ymh
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
     6
'''
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
from django.conf import settings
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
from django.core.management.base import NoArgsCommand
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
from django.core.management.color import no_style
114
c59383cc9940 migrate categories extraction to hdalab
ymh <ymh.work@gmail.com>
parents: 113
diff changeset
    11
from hdabo.models import Tag
c59383cc9940 migrate categories extraction to hdalab
ymh <ymh.work@gmail.com>
parents: 113
diff changeset
    12
from hdalab.models import WpCategory, TagWpCategory, TagInfobox, InfoboxParameter
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    13
from optparse import make_option
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    14
from wikitools import api,wiki
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    15
import sys
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    16
import re
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    17
import itertools
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    18
from hdabo import utils
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    19
from django.db.models import Count
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    20
from django.db import transaction
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    21
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    22
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    23
# Bit flags keyed by query type name (the keys mirror the choices of the
# ``--type`` command line option). Each single type owns one bit and
# ``all`` has the three bits set.
TYPES_MASK_DICT = {
    u'visible': 1 << 0,
    u'hidden': 1 << 1,
    u'infobox': 1 << 2,
    u'all': (1 << 0) | (1 << 1) | (1 << 2),
}
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    29
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    30
# Opening of an infobox template call ("{{Infobox <name>", case-insensitive).
# Group 1 captures everything after "Infobox" up to the first "|".
START_PATTERN = re.compile(r"\{\{\s?Infobox\s+([^|]+)", re.M | re.U | re.I)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    31
# Matches either an opening "{{" or a closing "}}" template delimiter.
END_PATTERN = re.compile(r"\{\{|\}\}", re.M | re.U)
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
    32
# Matches a "| name =" parameter header. Group 1 captures the parameter name
# (any spaces between the name and "=" are part of the capture).
SPLIT_PATTERN = re.compile(r"\s*?\|\s*([\w]+[^=|]*)\s*=", re.U | re.M)
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    33
# Matches wiki markup delimiters: runs of 2 or 3 braces, double brackets,
# or single brackets.
DELIMITER_PATTERN = re.compile(r"\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
    34
# Matches an HTML comment, non-greedily. re.S lets "." cross line breaks so
# that comments spanning several lines are matched as well.
COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.U | re.M | re.S)
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    35
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    36
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    37
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    38
class Command(NoArgsCommand):
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    39
    '''
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    40
    query and update wikipedia for tag title.
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    41
    '''
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    42
    options = ''
    help = """query and update wikipedia for tag title."""

    # Command line options, appended to the ones inherited from NoArgsCommand.
    # NOTE(review): the 'append' options use a list literal as default;
    # optparse appends to that same list object, which presumably only
    # matters if the command is run several times in one process -- confirm.
    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--site',
            action='store',
            type='string',
            dest='site_url',
            default="http://fr.wikipedia.org/w/api.php",
            help='the url for the wikipedia site'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--type',
            action='append',
            dest='types',
            type='choice',
            choices=['visible','hidden', 'infobox', 'all'],
            default=[],
            help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be passed multiple times'),
        make_option('--use-label',
            action='store_true',
            dest='use_label',
            default=False,
            help='use label instead of pageid to query wikipedia'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),

    )
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
    99
    
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   100
    
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   101
#    def process_wp_response(self, label, response):        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   102
#
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   103
#        query_dict = response['query']
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   104
#        # get page if multiple pages or none -> return Tag.null_result
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   105
#        pages = query_dict.get("pages", {})
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   106
#        if len(pages) > 1 or len(pages) == 0:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   107
#            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   108
#        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   109
#        page = pages.values()[0]
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   110
#        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   111
#        if u"invalid" in page or u"missing" in page:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   112
#            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   113
#
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   114
#        url = page.get(u'fullurl', None)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   115
#        pageid = page.get(u'pageid', None)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   116
#        new_label = page[u'title']
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   117
#        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   118
#        if self.__is_homonymie(page):
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   119
#            status = Tag.TAG_URL_STATUS_DICT["homonyme"]
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   120
#        elif u"redirect" in page:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   121
#            status = Tag.TAG_URL_STATUS_DICT["redirection"]
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   122
#        else:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   123
#            status = Tag.TAG_URL_STATUS_DICT["match"]
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   124
#        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   125
#        return new_label, status, url, pageid 
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   126
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   127
    def query_all_categories(self, hidden, site, pageid, use_label):
        '''
        Return the category titles of one wikipedia page.

        The API is queried in a loop: after each response the
        ``clcontinue`` marker found under ``query-continue`` is sent back
        with the next request, until a response carries no marker.

        :param hidden: if true only hidden categories are requested
            (``clshow=hidden``), otherwise only the non hidden ones
            (``clshow=!hidden``).
        :param site: site object handed over to ``api.APIRequest``.
        :param pageid: page identifier sent as ``pageids``, or as
            ``titles`` when ``use_label`` is true.
        :param use_label: if true, query by ``titles`` instead of ``pageids``.
        :return: list of category titles, each one stripped of everything
            up to and including its first ":". The list collected so far
            (possibly empty) is returned as soon as a response has no
            ``query`` entry or does not describe exactly one page.
        '''
        clshow = 'hidden' if hidden else '!hidden'
        id_key = 'titles' if use_label else 'pageids'
        params = {'action': 'query', id_key: pageid, 'prop': 'categories', 'clshow': clshow}

        clcontinue = ""
        res = []

        # "" means "first request"; None means "no more results to fetch".
        while clcontinue is not None:
            if clcontinue:
                params['clcontinue'] = clcontinue

            wpquery = api.APIRequest(site, params) #@UndefinedVariable
            response = wpquery.query()

            if self.verbosity > 1:
                print("Query categories : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data()))
                print(repr(response))

            query_dict = response.get('query', None)
            if query_dict is None:
                return res

            pages = query_dict.get("pages", {})
            if len(pages) != 1:
                return res

            # Exactly one page at this point; works on both dict views and lists.
            page = next(iter(pages.values()))

            for cat in page.get('categories', []):
                title = cat.get('title', "")
                # Drop the prefix up to the first ":"; a title without ":"
                # is kept whole because find() returns -1.
                title = title[title.find(":")+1:]
                # Skip the entry that equals the continuation marker used
                # for this request.
                if title and clcontinue != ("%s|%s" % (pageid, title)):
                    res.append(title)

            clcontinue = response.get('query-continue', {}).get('categories', {}).get('clcontinue', None)

        if self.verbosity > 1:
            print("Query categories RES: ")
            print(repr(res))

        return res
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   171
    
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   172
    def process_categories(self, cat_list, hidden, tag):
        '''
        Attach every category label of ``cat_list`` to ``tag``.

        For each label a ``WpCategory`` is fetched or created, then a
        ``TagWpCategory`` linking it to ``tag`` with the given ``hidden``
        value is fetched or created.
        '''
        for label in cat_list:
            category = WpCategory.objects.get_or_create(label=label)[0]
            TagWpCategory.objects.get_or_create(tag=tag, wp_category=category, hidden=hidden)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   177
            
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   178
                
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   179
    def query_infoboxes(self, site, pageid, use_label):
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   180
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   181
        res = []
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   182
        params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   183
        wpquery = api.APIRequest(site, params) #@UndefinedVariable
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   184
        response = wpquery.query()
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   185
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   186
        query_dict = response.get('query', None)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   187
            
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   188
        if query_dict is None:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   189
            return res
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   190
            
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   191
        pages = query_dict.get("pages", {})
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   192
        if len(pages) > 1 or len(pages) == 0:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   193
            return res
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   194
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   195
        page = pages.values()[0]
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   196
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   197
        if 'revisions' not in page or not page['revisions']:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   198
            return res
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   199
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   200
        rev = page['revisions'][0]
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   201
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   202
        content = rev['*']
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   203
                
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   204
        start = 0
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   205
        depth = 0
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   206
        current_infobox_name = None
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   207
        current_start = 0
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   208
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   209
        while start <= len(content):
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   210
            if depth==0:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   211
                resm = START_PATTERN.search(content[start:])
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   212
                if resm is None:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   213
                    break
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   214
                depth = 1
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   215
                current_start = resm.start()+start
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   216
                start += resm.end()+1
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   217
                current_infobox_name = resm.group(1)                    
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   218
            else:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   219
                resm = END_PATTERN.search(content[start:])
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   220
                if resm is None:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   221
                    break
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   222
                if resm.group(0) == "{{":
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   223
                    depth += 1
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   224
                elif resm.group(0) == "}}":
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   225
                    depth -= 1
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   226
                if depth == 0:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   227
                    res.append((content[current_start:resm.end()+start], current_infobox_name))
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   228
                start += resm.end()+1
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   229
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   230
        return_val = (rev['revid'],res)
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   231
        
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   232
        if self.verbosity > 1:
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   233
            print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   234
            print repr(return_val)
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   235
        
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   236
        return return_val
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   237
    
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   238
    def split_infoboxes(self, src):
        '''
        Cut ``src`` into a flat list of substrings at every SPLIT_PATTERN
        match found outside of any open delimiter.

        The text is scanned left to right. DELIMITER_PATTERN matches that
        start with "{" or "[" are pushed on a stack; a match is popped only
        when it is the closing counterpart of the delimiter on top of the
        stack and has the same length. SPLIT_PATTERN is searched only while
        the stack is empty.

        For each split match the result receives group 1 of the match,
        followed (once the next split match or the end of ``src`` is
        reached) by the text lying after that match. The text located before
        the first split match is discarded. ``process_infoboxes`` consumes
        the returned list two items at a time as (key, value).

        Returns an empty list when no split match is found.
        '''
        cursor = 0
        last_split_end = 0
        spans = []
        open_delimiters = []

        while cursor <= len(src):
            remainder = src[cursor:]
            delim_match = DELIMITER_PATTERN.search(remainder)
            # Splits are ignored while a "{..." / "[..." delimiter is open.
            split_match = None
            if not open_delimiters:
                split_match = SPLIT_PATTERN.search(remainder)

            # A split is handled only when it starts strictly before the
            # next delimiter (or when there is no delimiter left at all).
            split_comes_first = split_match is not None and (
                delim_match is None
                or split_match.start() < delim_match.start()
            )

            if split_comes_first:
                if spans:
                    # Close the piece that was opened by the previous split.
                    spans.append((last_split_end, cursor + split_match.start(0)))
                spans.append((cursor + split_match.start(1), cursor + split_match.end(1)))
                cursor += split_match.end(0)
                last_split_end = cursor
                continue

            if delim_match is None:
                # Neither a split nor a delimiter remains: scanning is over.
                break

            token = delim_match.group()
            if token.startswith(("{", "[")):
                open_delimiters.append(token)
            elif open_delimiters:
                top = open_delimiters[-1]
                closes_top = (
                    (top.startswith('{') and token[0] == '}')
                    or (top.startswith('[') and token[0] == ']')
                )
                if closes_top and len(top) == len(token):
                    open_delimiters.pop()
            cursor += delim_match.end()

        if last_split_end > 0:
            # Everything after the last split match forms the final piece.
            spans.append((last_split_end, len(src)))

        return [src[begin:end] for begin, end in spans]
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   268
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   269
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   270
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   271
    def process_infoboxes(self, infobox_defs, tag):
        '''
        Store the infoboxes described by ``infobox_defs`` for ``tag``.

        ``infobox_defs`` is indexed as ``(revision_id, infoboxes)`` where
        each item of ``infoboxes`` is indexed as ``(source, name)``. Nothing
        is done when ``infobox_defs`` is empty or None.

        For every infobox a TagInfobox is fetched or created for
        (tag, name, revision_id) and its ``source`` is set to the stripped
        source text. The source is then cleaned (COMMENT_PATTERN matches
        removed, last two characters dropped, START_PATTERN matches removed),
        split with ``split_infoboxes`` and each consecutive (key, value) pair
        is saved as an InfoboxParameter, overwriting the value of an
        already existing parameter.
        '''
        if not infobox_defs:
            return

        revision_id = infobox_defs[0]
        infoboxes = infobox_defs[1]

        for infobox in infoboxes:
            raw_source = infobox[0].strip(' \t\n\r')
            name = infobox[1]

            tag_infobox, infobox_created = TagInfobox.objects.get_or_create(
                tag=tag,
                name=name,
                revision_id=revision_id,
                defaults={'source': raw_source}
            )
            if not infobox_created:
                # Existing record: refresh the stored source.
                tag_infobox.source = raw_source
                tag_infobox.save()

            without_comments = COMMENT_PATTERN.sub('', raw_source)
            # The last two characters are dropped before removing the
            # START_PATTERN match (presumably the closing "}}" - TODO confirm).
            body = START_PATTERN.sub('', without_comments[:-2]).strip()

            pieces = self.split_infoboxes(body)

            # Even positions hold keys, odd positions hold values; a trailing
            # key without a value is ignored.
            for key, value in zip(pieces[0::2], pieces[1::2]):
                param_name = key.strip()
                param_value = value.strip()
                param, param_created = InfoboxParameter.objects.get_or_create(
                    tag_infobox=tag_infobox,
                    param_name=param_name,
                    defaults={'param_value': param_value}
                )
                if not param_created:
                    param.param_value = param_value
                    param.save()
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   295
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   296
    def handle_noargs(self, **options):
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   297
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   298
        self.style = no_style()
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   299
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   300
        interactive = options.get('interactive', True)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   301
        
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   302
        self.verbosity = int(options.get('verbosity', '1'))
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   303
        use_label = options.get('use_label', False)
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   304
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   305
        force = options.get('force', False)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   306
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   307
        limit = options.get("limit", -1)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   308
        start = options.get("start", 0)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   309
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   310
        site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   311
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   312
        random = options.get('random', False)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   313
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   314
        types_mask = 0
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   315
        types_list = options.get('types', [])
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   316
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   317
        if len(types_list) == 0:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   318
            types_mask = TYPES_MASK_DICT['all']
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   319
        else:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   320
            for t in types_list:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   321
                types_mask |=  TYPES_MASK_DICT[t]
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   322
                
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   323
        if self.verbosity > 1 :
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   324
            print "types mask %s " % (bin(types_mask))  
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   325
        
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   326
        if self.verbosity > 2:
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   327
            print "option passed : " + repr(options)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   328
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   329
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   330
        queryset = Tag.objects.exclude(wikipedia_pageid= None)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   331
        
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   332
        tag_list = options.get("tags", []);
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   333
        
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   334
        if tag_list:
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   335
            queryset = queryset.filter(label__in=tag_list)
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   336
        elif not options.get('all',False):            
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   337
            queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   338
        #else:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   339
        #    queryset = Tag.objects.filter(url_status=None)                    
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   340
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   341
        if random:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   342
            queryset = queryset.order_by("?")
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   343
        else:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   344
            queryset = queryset.order_by("label")
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   345
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   346
        if limit >= 0:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   347
            queryset = queryset[start:limit]
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   348
        elif start > 0:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   349
            queryset = queryset[start:]            
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   350
        
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   351
        if self.verbosity > 2 :
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   352
            print "Tag Query is %s" % (queryset.query)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   353
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   354
        site = wiki.Wiki(site_url) #@UndefinedVariable
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   355
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   356
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   357
        count = queryset.count()
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   358
        if self.verbosity > 1:
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   359
            print "Processing %d tags" % (count)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   360
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   361
        if not force and interactive:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   362
            confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   363
        else:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   364
            confirm = 'yes'
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   365
            
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   366
        if confirm != "yes":
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   367
            print "wikipedia query cancelled"
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   368
            return
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   369
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   370
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   371
        
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   372
        for i, tag in enumerate(queryset):
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   373
            
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   374
            if self.verbosity > 1:
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   375
                print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   376
            else:
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   377
                utils.show_progress(i + 1, count, tag.label, 60)                            
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   378
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   379
            # query categories
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   380
            wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   381
            if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None :
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   382
                wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   383
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   384
            with transaction.commit_on_success():
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   385
                if types_mask & TYPES_MASK_DICT['visible']:
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   386
                    res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   387
                    self.process_categories(res, False, tag)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   388
    
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   389
                if types_mask & TYPES_MASK_DICT['hidden']:
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   390
                    res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   391
                    self.process_categories(res, True, tag)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   392
                
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   393
                if types_mask & TYPES_MASK_DICT['infobox']:
113
0d2bfd84b989 improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents: 111
diff changeset
   394
                    res = self.query_infoboxes(site, wikipedia_pageid, use_label)
111
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   395
                    self.process_infoboxes(res, tag)
ceb381f5b0c7 query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
diff changeset
   396