--- a/src/hdalab/management/commands/query_wikipedia_category.py Thu Sep 24 13:40:54 2015 +0200
+++ b/src/hdalab/management/commands/query_wikipedia_category.py Sat Sep 26 11:55:11 2015 +0200
@@ -41,7 +41,7 @@
'''
options = ''
help = """query and update wikipedia for tag title."""
-
+
option_list = NoArgsCommand.option_list + (
make_option('--all',
action='store_true',
@@ -96,116 +96,116 @@
help='the tag to query'),
)
-
-
-# def process_wp_response(self, label, response):
+
+
+# def process_wp_response(self, label, response):
#
# query_dict = response['query']
# # get page if multiple pages or none -> return Tag.null_result
# pages = query_dict.get("pages", {})
# if len(pages) > 1 or len(pages) == 0:
# return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
-#
+#
# page = pages.values()[0]
-#
+#
# if u"invalid" in page or u"missing" in page:
# return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
#
# url = page.get(u'fullurl', None)
# pageid = page.get(u'pageid', None)
# new_label = page[u'title']
-#
+#
# if self.__is_homonymie(page):
# status = Tag.TAG_URL_STATUS_DICT["homonyme"]
# elif u"redirect" in page:
# status = Tag.TAG_URL_STATUS_DICT["redirection"]
# else:
# status = Tag.TAG_URL_STATUS_DICT["match"]
-#
-# return new_label, status, url, pageid
+#
+# return new_label, status, url, pageid
def query_all_categories(self, hidden, site, pageid, use_label):
-
+
clshow = 'hidden' if hidden else '!hidden'
params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow}
-
- clcontinue = ""
+
+ clcontinue = ""
res = []
-
+
while clcontinue is not None:
if clcontinue:
params['clcontinue'] = clcontinue
-
+
wpquery = api.APIRequest(site, params) #@UndefinedVariable
response = wpquery.query()
-
+
if self.verbosity > 1:
print "Query infoboxes : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
print repr(response)
-
+
query_dict = response.get('query', None)
-
+
if query_dict is None:
return res
-
+
pages = query_dict.get("pages", {})
if len(pages) > 1 or len(pages) == 0:
return res
-
+
page = pages.values()[0]
-
+
for cat in page.get('categories',[]):
title = cat.get('title',"")
title = title[title.find(":")+1:]
if title and clcontinue != ("%s|%s" % (pageid,title)):
res.append(title)
-
+
clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None)
if self.verbosity > 1:
print "Query infoboxes RES: "
print repr(res)
-
+
return res
-
+
def process_categories(self, cat_list, hidden, tag):
-
+
for cat in cat_list:
wp_cat,created = WpCategory.objects.get_or_create(label=cat) #@UnusedVariable
TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)
-
-
+
+
def query_infoboxes(self, site, pageid, use_label):
-
+
res = []
params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
wpquery = api.APIRequest(site, params) #@UndefinedVariable
response = wpquery.query()
-
+
query_dict = response.get('query', None)
-
+
if query_dict is None:
return res
-
+
pages = query_dict.get("pages", {})
if len(pages) > 1 or len(pages) == 0:
return res
page = pages.values()[0]
-
+
if 'revisions' not in page or not page['revisions']:
return res
-
+
rev = page['revisions'][0]
-
+
content = rev['*']
-
+
start = 0
depth = 0
current_infobox_name = None
current_start = 0
-
+
while start <= len(content):
if depth==0:
resm = START_PATTERN.search(content[start:])
@@ -214,7 +214,7 @@
depth = 1
current_start = resm.start()+start
start += resm.end()+1
- current_infobox_name = resm.group(1)
+ current_infobox_name = resm.group(1)
else:
resm = END_PATTERN.search(content[start:])
if resm is None:
@@ -228,20 +228,20 @@
start += resm.end()+1
return_val = (rev['revid'],res)
-
+
if self.verbosity > 1:
print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
print repr(return_val)
-
+
return return_val
-
+
def split_infoboxes(self, src):
-
+
start = 0
previous_end = 0
split_indexes = []
delimiter_stack = []
- while start<=len(src):
+ while start<=len(src):
resd = DELIMITER_PATTERN.search(src[start:])
ress = SPLIT_PATTERN.search(src[start:]) if len(delimiter_stack) == 0 else None
startd = resd.start() if resd is not None else sys.maxint
@@ -260,7 +260,7 @@
start += resd.end()
else:
break
-
+
if previous_end > 0:
split_indexes.append((previous_end,len(src)))
res = [src[start:end] for start,end in split_indexes]
@@ -269,14 +269,16 @@
def process_infoboxes(self, infobox_defs, tag):
-
+
if not infobox_defs:
return
-
+
revision_id = infobox_defs[0]
for infobox in infobox_defs[1]:
- src = infobox[0].strip(' \t\n\r')
+ src = infobox[0].strip(' \t\n\r')
name = infobox[1]
+ if name and len(name) > 2048:
+ name = name[0:2048]
tag_infobox, created = TagInfobox.objects.get_or_create(tag=tag, name=name, revision_id = revision_id, defaults={'source': src})
if not created:
tag_infobox.source = src
@@ -284,7 +286,7 @@
src = COMMENT_PATTERN.sub('',src)
src = START_PATTERN.sub('',src[:-2]).strip()
-
+
keyvalues = self.split_infoboxes(src)
for key,value in itertools.izip(*[itertools.islice(keyvalues, i, None, 2) for i in range(2)]):
@@ -292,89 +294,89 @@
if not created:
param.param_value = value.strip()
param.save()
-
+
def handle_noargs(self, **options):
-
+
self.style = no_style()
-
+
interactive = options.get('interactive', True)
-
+
self.verbosity = int(options.get('verbosity', '1'))
use_label = options.get('use_label', False)
-
+
force = options.get('force', False)
-
+
limit = options.get("limit", -1)
start = options.get("start", 0)
-
+
site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
-
+
random = options.get('random', False)
-
+
types_mask = 0
types_list = options.get('types', [])
-
+
if len(types_list) == 0:
types_mask = TYPES_MASK_DICT['all']
else:
for t in types_list:
types_mask |= TYPES_MASK_DICT[t]
-
+
if self.verbosity > 1 :
- print "types mask %s " % (bin(types_mask))
-
+ print "types mask %s " % (bin(types_mask))
+
if self.verbosity > 2:
print "option passed : " + repr(options)
queryset = Tag.objects.exclude(wikipedia_pageid= None)
-
+
tag_list = options.get("tags", []);
-
+
if tag_list:
queryset = queryset.filter(label__in=tag_list)
- elif not options.get('all',False):
+ elif not options.get('all',False):
queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
#else:
- # queryset = Tag.objects.filter(url_status=None)
-
+ # queryset = Tag.objects.filter(url_status=None)
+
if random:
queryset = queryset.order_by("?")
else:
queryset = queryset.order_by("label")
-
+
if limit >= 0:
queryset = queryset[start:limit]
elif start > 0:
- queryset = queryset[start:]
-
+ queryset = queryset[start:]
+
if self.verbosity > 2 :
print "Tag Query is %s" % (queryset.query)
-
+
site = wiki.Wiki(site_url) #@UndefinedVariable
-
-
+
+
count = queryset.count()
if self.verbosity > 1:
print "Processing %d tags" % (count)
-
+
if not force and interactive:
confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
else:
confirm = 'yes'
-
+
if confirm != "yes":
print "wikipedia query cancelled"
return
-
-
+
+
for i, tag in enumerate(queryset):
-
+
if self.verbosity > 1:
print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
else:
- utils.show_progress(i + 1, count, tag.label, 60)
+ utils.show_progress(i + 1, count, tag.label, 60)
# query categories
wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
@@ -385,12 +387,11 @@
if types_mask & TYPES_MASK_DICT['visible']:
res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
self.process_categories(res, False, tag)
-
+
if types_mask & TYPES_MASK_DICT['hidden']:
res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
self.process_categories(res, True, tag)
-
+
if types_mask & TYPES_MASK_DICT['infobox']:
res = self.query_infoboxes(site, wikipedia_pageid, use_label)
self.process_infoboxes(res, tag)
-