diff -r 206859005b33 -r 434737bd64e5 src/hdalab/management/commands/query_wikipedia_category.py
--- a/src/hdalab/management/commands/query_wikipedia_category.py	Thu Sep 24 13:40:54 2015 +0200
+++ b/src/hdalab/management/commands/query_wikipedia_category.py	Sat Sep 26 11:55:11 2015 +0200
@@ -41,7 +41,7 @@
     '''
     options = ''
     help = """query and update wikipedia for tag title."""
-    
+
     option_list = NoArgsCommand.option_list + (
         make_option('--all',
             action='store_true',
@@ -96,116 +96,116 @@
             help='the tag to query'),
         )
-    
-    
-#    def process_wp_response(self, label, response):    
+
+
+#    def process_wp_response(self, label, response):
 #
 #        query_dict = response['query']
 #        # get page if multiple pages or none -> return Tag.null_result
 #        pages = query_dict.get("pages", {})
 #        if len(pages) > 1 or len(pages) == 0:
 #            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
-#        
+#
 #        page = pages.values()[0]
-#        
+#
 #        if u"invalid" in page or u"missing" in page:
 #            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
 #
 #        url = page.get(u'fullurl', None)
 #        pageid = page.get(u'pageid', None)
 #        new_label = page[u'title']
-#        
+#
 #        if self.__is_homonymie(page):
 #            status = Tag.TAG_URL_STATUS_DICT["homonyme"]
 #        elif u"redirect" in page:
 #            status = Tag.TAG_URL_STATUS_DICT["redirection"]
 #        else:
 #            status = Tag.TAG_URL_STATUS_DICT["match"]
-#        
-#        return new_label, status, url, pageid    
+#
+#        return new_label, status, url, pageid
 
 
     def query_all_categories(self, hidden, site, pageid, use_label):
-        
+
         clshow = 'hidden' if hidden else '!hidden'
         params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow}
-        
-        clcontinue = ""    
+
+        clcontinue = ""
         res = []
-        
+
         while clcontinue is not None:
             if clcontinue:
                 params['clcontinue'] = clcontinue
-            
+
             wpquery = api.APIRequest(site, params) #@UndefinedVariable
             response = wpquery.query()
-            
+
             if self.verbosity > 1:
                 print "Query infoboxes : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
                 print repr(response)
-            
+
             query_dict = response.get('query', None)
-            
+
            if query_dict is None:
                return res
-            
+
            pages = query_dict.get("pages", {})
            if len(pages) > 1 or len(pages) == 0:
                return res
-            
+
            page = pages.values()[0]
-            
+
            for cat in page.get('categories',[]):
                title = cat.get('title',"")
                title = title[title.find(":")+1:]
                if title and clcontinue != ("%s|%s" % (pageid,title)):
                    res.append(title)
-            
+
            clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None)
 
         if self.verbosity > 1:
             print "Query infoboxes RES: "
             print repr(res)
-        
+
         return res
 
-    
+
     def process_categories(self, cat_list, hidden, tag):
-        
+
         for cat in cat_list:
             wp_cat,created = WpCategory.objects.get_or_create(label=cat) #@UnusedVariable
             TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)
 
-        
-    
+
+
     def query_infoboxes(self, site, pageid, use_label):
-        
+
         res = []
         params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
         wpquery = api.APIRequest(site, params) #@UndefinedVariable
         response = wpquery.query()
-        
+
         query_dict = response.get('query', None)
-        
+
         if query_dict is None:
             return res
-        
+
         pages = query_dict.get("pages", {})
         if len(pages) > 1 or len(pages) == 0:
             return res
         page = pages.values()[0]
-        
+
         if 'revisions' not in page or not page['revisions']:
             return res
-        
+
         rev = page['revisions'][0]
-        
+
         content = rev['*']
-        
+
         start = 0
         depth = 0
         current_infobox_name = None
         current_start = 0
-        
+
         while start <= len(content):
             if depth==0:
                 resm = START_PATTERN.search(content[start:])
@@ -214,7 +214,7 @@
                 depth = 1
                 current_start = resm.start()+start
                 start += resm.end()+1
-                current_infobox_name = resm.group(1)    
+                current_infobox_name = resm.group(1)
             else:
                 resm = END_PATTERN.search(content[start:])
                 if resm is None:
@@ -228,20 +228,20 @@
                 start += resm.end()+1
 
         return_val = (rev['revid'],res)
-        
+
         if self.verbosity > 1:
             print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
             print repr(return_val)
-        
+
         return return_val
-    
+
     def split_infoboxes(self, src):
-        
+
         start = 0
         previous_end = 0
         split_indexes = []
         delimiter_stack = []
-        while start<=len(src):    
+        while start<=len(src):
             resd = DELIMITER_PATTERN.search(src[start:])
             ress = SPLIT_PATTERN.search(src[start:]) if len(delimiter_stack) == 0 else None
             startd = resd.start() if resd is not None else sys.maxint
@@ -260,7 +260,7 @@
                 start += resd.end()
             else:
                 break
-        
+
         if previous_end > 0:
             split_indexes.append((previous_end,len(src)))
         res = [src[start:end] for start,end in split_indexes]
@@ -269,14 +269,16 @@
 
 
     def process_infoboxes(self, infobox_defs, tag):
-        
+
         if not infobox_defs:
             return
-        
+
         revision_id = infobox_defs[0]
         for infobox in infobox_defs[1]:
-            src = infobox[0].strip(' \t\n\r')    
+            src = infobox[0].strip(' \t\n\r')
             name = infobox[1]
+            if name and len(name) > 2048:
+                name = name[0:2048]
             tag_infobox, created = TagInfobox.objects.get_or_create(tag=tag, name=name, revision_id = revision_id, defaults={'source': src})
             if not created:
                 tag_infobox.source = src
@@ -284,7 +286,7 @@
 
             src = COMMENT_PATTERN.sub('',src)
             src = START_PATTERN.sub('',src[:-2]).strip()
-            
+
             keyvalues = self.split_infoboxes(src)
 
             for key,value in itertools.izip(*[itertools.islice(keyvalues, i, None, 2) for i in range(2)]):
@@ -292,89 +294,89 @@
                 if not created:
                     param.param_value = value.strip()
                     param.save()
-    
+
     def handle_noargs(self, **options):
-        
+
         self.style = no_style()
-        
+
         interactive = options.get('interactive', True)
-        
+
         self.verbosity = int(options.get('verbosity', '1'))
         use_label = options.get('use_label', False)
-        
+
         force = options.get('force', False)
-        
+
         limit = options.get("limit", -1)
         start = options.get("start", 0)
-        
+
         site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
-        
+
         random = options.get('random', False)
-        
+
         types_mask = 0
         types_list = options.get('types', [])
-        
+
         if len(types_list) == 0:
             types_mask = TYPES_MASK_DICT['all']
         else:
             for t in types_list:
                 types_mask |= TYPES_MASK_DICT[t]
-        
+
         if self.verbosity > 1 :
-            print "types mask %s " % (bin(types_mask))    
-        
+            print "types mask %s " % (bin(types_mask))
+
         if self.verbosity > 2:
             print "option passed : " + repr(options)
 
         queryset = Tag.objects.exclude(wikipedia_pageid= None)
-        
+
         tag_list = options.get("tags", []);
-        
+
         if tag_list:
             queryset = queryset.filter(label__in=tag_list)
-        elif not options.get('all',False):    
+        elif not options.get('all',False):
             queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
         #else:
-        #    queryset = Tag.objects.filter(url_status=None)    
-        
+        #    queryset = Tag.objects.filter(url_status=None)
+
         if random:
             queryset = queryset.order_by("?")
         else:
             queryset = queryset.order_by("label")
-        
+
         if limit >= 0:
             queryset = queryset[start:limit]
         elif start > 0:
-            queryset = queryset[start:]    
-        
+            queryset = queryset[start:]
+
         if self.verbosity > 2 :
             print "Tag Query is %s" % (queryset.query)
-        
+
         site = wiki.Wiki(site_url) #@UndefinedVariable
-        
-        
+
+
         count = queryset.count()
         if self.verbosity > 1:
             print "Processing %d tags" % (count)
-        
+
         if not force and interactive:
             confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
         else:
             confirm = 'yes'
-        
+
         if confirm != "yes":
             print "wikipedia query cancelled"
             return
-        
-        
+
+
         for i, tag in enumerate(queryset):
-            
+
             if self.verbosity > 1:
                 print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
             else:
-                utils.show_progress(i + 1, count, tag.label, 60)    
+                utils.show_progress(i + 1, count, tag.label, 60)
 
             # query categories
             wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
@@ -385,12 +387,11 @@
             if types_mask & TYPES_MASK_DICT['visible']:
                 res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
                 self.process_categories(res, False, tag)
-            
+
             if types_mask & TYPES_MASK_DICT['hidden']:
                 res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
                 self.process_categories(res, True, tag)
-            
+
             if types_mask & TYPES_MASK_DICT['infobox']:
                 res = self.query_infoboxes(site, wikipedia_pageid, use_label)
                 self.process_infoboxes(res, tag)
-
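Aside from trailing-whitespace cleanup, the one functional change in this revision is the guard added in `process_infoboxes`: the infobox `name` is truncated to 2048 characters before it is passed to `TagInfobox.objects.get_or_create`. Below is a minimal standalone sketch of that guard; the 2048 limit is assumed to mirror the maximum length of the `TagInfobox.name` column, whose model definition is not part of this diff, and `MAX_NAME_LENGTH`/`clamp_infobox_name` are hypothetical names used only for illustration.

```python
# Sketch of the name-truncation guard added to process_infoboxes().
# MAX_NAME_LENGTH is a hypothetical constant: the 2048 value is assumed to
# match the max_length of TagInfobox.name, which is defined outside this diff.
MAX_NAME_LENGTH = 2048


def clamp_infobox_name(name):
    """Return name cut down to MAX_NAME_LENGTH; falsy values pass through unchanged."""
    if name and len(name) > MAX_NAME_LENGTH:
        return name[:MAX_NAME_LENGTH]
    return name


if __name__ == '__main__':
    # An over-long template name is clamped, a normal one is left alone.
    print(len(clamp_infobox_name('x' * 5000)))           # 2048
    print(clamp_infobox_name('Infobox musical artist'))  # unchanged
```

Clamping before the call matters because `name` is part of the `get_or_create` lookup rather than of `defaults`, so an over-long value pulled out of the wikitext would presumably either fail at the database level or never match the row stored on a previous run.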