|
1 # -*- coding: utf-8 -*- |
|
2 ''' |
|
3 Created on Jun 7, 2011 |
|
4 |
|
5 @author: ymh |
|
6 ''' |
|
7 |
|
8 from django.conf import settings |
|
9 from django.core.management.base import NoArgsCommand |
|
10 from django.core.management.color import no_style |
|
11 from hdabo.models import Tag |
|
12 from hdalab.models import WpCategory, TagWpCategory, TagInfobox, InfoboxParameter |
|
13 from optparse import make_option |
|
14 from wikitools import api,wiki |
|
15 import sys |
|
16 import re |
|
17 import itertools |
|
18 from hdabo import utils |
|
19 from django.db.models import Count |
|
20 from django.db import transaction |
|
21 |
|
22 |
|
# Bit flags selecting which kinds of Wikipedia data are fetched. The values
# given to the ``--type`` option are OR-ed together into a single mask.
TYPES_MASK_DICT = {
    u'visible': 0b001,
    u'hidden': 0b010,
    u'infobox': 0b100,
    u'all': 0b111,
}

# Opening of an infobox template; group 1 is everything after "Infobox" up to
# (not including) the first "|", so it may carry trailing whitespace.
START_PATTERN = re.compile(r"\{\{\s?Infobox\s+([^|]+)", re.M | re.U | re.I)
# Any template opening or closing marker, used to track nesting depth.
END_PATTERN = re.compile(r"\{\{|\}\}", re.M | re.U)
# A "| name =" parameter header; group 1 is the parameter name (unstripped).
SPLIT_PATTERN = re.compile(r"\s*?\|\s*([\w]+[^=|]*)\s*=", re.U | re.M)
# Template / link delimiters inside which "|" must not split parameters.
DELIMITER_PATTERN = re.compile(r"\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
# HTML comment. re.S lets "." cross newlines so that comments spanning several
# lines are matched too (re.M alone has no effect on a pattern without ^/$).
COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.U | re.M | re.S)
|
35 |
|
36 |
|
37 |
|
class Command(NoArgsCommand):
    '''
    Query Wikipedia for the categories and infoboxes of tags.

    For every selected Tag that has a wikipedia page id, the command asks the
    MediaWiki API for the visible categories, the hidden categories and/or the
    infobox templates of the page and stores them through the WpCategory,
    TagWpCategory, TagInfobox and InfoboxParameter models.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--site',
            action='store',
            type='string',
            dest='site_url',
            default="http://fr.wikipedia.org/w/api.php",
            help='the url for the wikipedia site'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--type',
            action='append',
            dest='types',
            type='choice',
            choices=['visible','hidden', 'infobox', 'all'],
            default=[],
            help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be passed multiple times'),
        make_option('--use-label',
            action='store_true',
            dest='use_label',
            default=False,
            help='use label instead of pageid to query wikipedia'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    # Disabled legacy implementation, kept commented out as in the original.
#    def process_wp_response(self, label, response):
#
#        query_dict = response['query']
#        # get page if multiple pages or none -> return Tag.null_result
#        pages = query_dict.get("pages", {})
#        if len(pages) > 1 or len(pages) == 0:
#            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
#
#        page = pages.values()[0]
#
#        if u"invalid" in page or u"missing" in page:
#            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
#
#        url = page.get(u'fullurl', None)
#        pageid = page.get(u'pageid', None)
#        new_label = page[u'title']
#
#        if self.__is_homonymie(page):
#            status = Tag.TAG_URL_STATUS_DICT["homonyme"]
#        elif u"redirect" in page:
#            status = Tag.TAG_URL_STATUS_DICT["redirection"]
#        else:
#            status = Tag.TAG_URL_STATUS_DICT["match"]
#
#        return new_label, status, url, pageid

    def _single_page(self, response):
        '''
        Return the page dict of an API response holding exactly one page.

        Returns None when the response has no 'query' entry or when its
        'pages' mapping does not contain exactly one page.
        '''
        query_dict = response.get('query', None)
        if query_dict is None:
            return None

        pages = query_dict.get("pages", {})
        if len(pages) != 1:
            return None

        return list(pages.values())[0]

    def query_all_categories(self, hidden, site, pageid, use_label):
        '''
        Fetch the category names of one page, following API continuation.

        hidden    -- when true ask for hidden categories ('clshow=hidden'),
                     otherwise for visible ones ('clshow=!hidden').
        site      -- the wiki object handed to api.APIRequest.
        pageid    -- page id, or page title when use_label is true.
        use_label -- send pageid as 'titles' instead of 'pageids'.

        Returns the list of category titles with everything up to the first
        ":" removed. If a response has no 'query' entry or does not hold
        exactly one page, whatever was collected so far is returned.
        '''
        clshow = 'hidden' if hidden else '!hidden'
        params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow}

        # "" means "first request"; None means "no further continuation".
        clcontinue = ""
        res = []

        while clcontinue is not None:
            if clcontinue:
                params['clcontinue'] = clcontinue

            wpquery = api.APIRequest(site, params) #@UndefinedVariable
            response = wpquery.query()

            if self.verbosity > 1:
                print("Query categories : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data()))
                print(repr(response))

            page = self._single_page(response)
            if page is None:
                return res

            for cat in page.get('categories', []):
                title = cat.get('title', "")
                title = title[title.find(":")+1:]
                # Skip the entry equal to the continuation marker of this
                # request (presumably already returned by the previous batch).
                if title and clcontinue != ("%s|%s" % (pageid, title)):
                    res.append(title)

            clcontinue = response.get('query-continue', {}).get('categories', {}).get('clcontinue', None)

        if self.verbosity > 1:
            print("Query categories RES: ")
            print(repr(res))

        return res

    def process_categories(self, cat_list, hidden, tag):
        '''
        Store the categories of a tag.

        For every label in cat_list a WpCategory is fetched or created, then
        linked to tag through a TagWpCategory carrying the hidden flag.
        '''
        for cat in cat_list:
            wp_cat, created = WpCategory.objects.get_or_create(label=cat) #@UnusedVariable
            TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)

    def query_infoboxes(self, site, pageid, use_label):
        '''
        Fetch the page content and extract its infobox templates.

        site      -- the wiki object handed to api.APIRequest.
        pageid    -- page id, or page title when use_label is true.
        use_label -- send pageid as 'titles' instead of 'pageids'.

        Returns a tuple (revision id, [(infobox source, infobox name), ...])
        or an empty list when the response has no 'query' entry, does not hold
        exactly one page, or the page has no revision. The infobox name is
        group 1 of START_PATTERN and is not stripped.
        '''
        res = []
        params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
        wpquery = api.APIRequest(site, params) #@UndefinedVariable
        response = wpquery.query()

        page = self._single_page(response)
        if page is None:
            return res

        if 'revisions' not in page or not page['revisions']:
            return res

        rev = page['revisions'][0]
        content = rev['*']

        pos = 0
        depth = 0
        current_infobox_name = None
        current_start = 0

        while True:
            if depth == 0:
                # Outside any infobox: look for the next "{{Infobox name".
                match = START_PATTERN.search(content, pos)
                if match is None:
                    break
                depth = 1
                current_start = match.start()
                current_infobox_name = match.group(1)
            else:
                # Inside an infobox: follow "{{" / "}}" to find its end.
                match = END_PATTERN.search(content, pos)
                if match is None:
                    break
                if match.group(0) == "{{":
                    depth += 1
                elif match.group(0) == "}}":
                    depth -= 1
                if depth == 0:
                    res.append((content[current_start:match.end()], current_infobox_name))
            # Resume right after the match. Skipping one extra character here
            # would miss adjacent delimiters such as "}}}}" or "}}{{".
            pos = match.end()

        return_val = (rev['revid'], res)

        if self.verbosity > 1:
            print("Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data()))
            print(repr(return_val))

        return return_val

    def split_infoboxes(self, src):
        '''
        Split the body of an infobox into parameter names and values.

        src -- the infobox source without its "{{Infobox name" header and
               without its closing "}}".

        Returns a flat list alternating name, value, name, value, ... (neither
        is stripped). A "|" found inside a template or link delimiter does not
        start a new parameter. Text located before the first "| name =" is
        dropped; the list is empty when no parameter is found.
        '''
        pos = 0
        previous_end = 0
        spans = []
        delimiter_stack = []

        while True:
            delim = DELIMITER_PATTERN.search(src, pos)
            # Parameters are only recognised outside any open delimiter.
            key = SPLIT_PATTERN.search(src, pos) if not delimiter_stack else None

            if key is not None and (delim is None or key.start() < delim.start()):
                if spans:
                    # Close the value of the previous parameter.
                    spans.append((previous_end, key.start(0)))
                spans.append((key.start(1), key.end(1)))
                pos = key.end(0)
                previous_end = pos
            elif delim is not None:
                token = delim.group()
                if token.startswith(("{", "[")):
                    delimiter_stack.append(token)
                elif (delimiter_stack
                      and len(delimiter_stack[-1]) == len(token)
                      and ((delimiter_stack[-1].startswith('{') and token[0] == '}')
                           or (delimiter_stack[-1].startswith('[') and token[0] == ']'))):
                    # Only a closer matching the innermost opener (same kind,
                    # same length) pops it; any other closer is ignored.
                    delimiter_stack.pop()
                pos = delim.end()
            else:
                break

        if previous_end > 0:
            spans.append((previous_end, len(src)))

        return [src[begin:end] for begin, end in spans]

    def process_infoboxes(self, infobox_defs, tag):
        '''
        Store the infoboxes of a tag and their parameters.

        infobox_defs -- the value returned by query_infoboxes: a tuple
                        (revision id, [(source, name), ...]) or an empty
                        value, in which case nothing is done.
        tag          -- the Tag the infoboxes belong to.

        A TagInfobox is fetched or created per (tag, name, revision id) and
        its source refreshed; one InfoboxParameter is then fetched or created
        per parameter and its value refreshed.
        '''
        if not infobox_defs:
            return

        revision_id = infobox_defs[0]
        for infobox in infobox_defs[1]:
            src = infobox[0].strip(' \t\n\r')
            name = infobox[1]
            tag_infobox, created = TagInfobox.objects.get_or_create(tag=tag, name=name, revision_id = revision_id, defaults={'source': src})
            if not created:
                tag_infobox.source = src
                tag_infobox.save()

            # Remove HTML comments, the closing "}}" and the infobox header
            # before splitting the remaining text into parameters.
            src = COMMENT_PATTERN.sub('', src)
            src = START_PATTERN.sub('', src[:-2]).strip()

            keyvalues = self.split_infoboxes(src)

            # keyvalues alternates name, value; zip stops at the shorter slice.
            for key, value in zip(keyvalues[0::2], keyvalues[1::2]):
                param, created = InfoboxParameter.objects.get_or_create(tag_infobox=tag_infobox, param_name=key.strip(), defaults={'param_value':value.strip()})
                if not created:
                    param.param_value = value.strip()
                    param.save()

    def handle_noargs(self, **options):
        '''
        Command entry point: select the tags, ask for confirmation, then
        query and store the requested data for each tag, one transaction
        per tag.
        '''
        self.style = no_style()

        interactive = options.get('interactive', True)

        self.verbosity = int(options.get('verbosity', '1'))
        use_label = options.get('use_label', False)

        force = options.get('force', False)

        limit = options.get("limit", -1)
        start = options.get("start", 0)

        site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)

        randomize = options.get('random', False)

        types_list = options.get('types', [])

        # No --type given means "everything".
        if not types_list:
            types_mask = TYPES_MASK_DICT['all']
        else:
            types_mask = 0
            for t in types_list:
                types_mask |= TYPES_MASK_DICT[t]

        if self.verbosity > 1:
            print("types mask %s " % (bin(types_mask)))

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        queryset = Tag.objects.exclude(wikipedia_pageid=None)

        tag_list = options.get("tags", [])

        if tag_list:
            queryset = queryset.filter(label__in=tag_list)
        elif not options.get('all', False):
            # Default: only the tags that have no category yet.
            queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
        #else:
        #    queryset = Tag.objects.filter(url_status=None)

        if randomize:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")

        if limit >= 0:
            # --limit is a number of tags, not an end index: offset it by
            # --start so that both options can be combined.
            queryset = queryset[start:start + limit]
        elif start > 0:
            queryset = queryset[start:]

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query))

        site = wiki.Wiki(site_url) #@UndefinedVariable

        count = queryset.count()
        if self.verbosity > 1:
            print("Processing %d tags" % (count))

        if not force and interactive:
            confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("wikipedia query cancelled")
            return

        for i, tag in enumerate(queryset):

            if self.verbosity > 1:
                print("processing tag %s (%d/%d)" % (tag.label, i + 1, count))
            else:
                utils.show_progress(i + 1, count, tag.label, 60)

            # For a redirection with a known target, query the target page.
            wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
            if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None:
                wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid

            with transaction.commit_on_success():
                if types_mask & TYPES_MASK_DICT['visible']:
                    res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
                    self.process_categories(res, False, tag)

                if types_mask & TYPES_MASK_DICT['hidden']:
                    res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
                    self.process_categories(res, True, tag)

                if types_mask & TYPES_MASK_DICT['infobox']:
                    res = self.query_infoboxes(site, wikipedia_pageid, use_label)
                    self.process_infoboxes(res, tag)
|
396 |