# HG changeset patch # User ymh # Date 1327598377 -3600 # Node ID 0d2bfd84b989f2c8b95a5436aa5de7c9c5db9dc5 # Parent e7086d345a7c902a507f35a676d386540c884377 improve cat and infobox extraction + export csv diff -r e7086d345a7c -r 0d2bfd84b989 .settings/org.eclipse.core.resources.prefs --- a/.settings/org.eclipse.core.resources.prefs Tue Jan 17 00:19:27 2012 +0100 +++ b/.settings/org.eclipse.core.resources.prefs Thu Jan 26 18:19:37 2012 +0100 @@ -1,10 +1,11 @@ -#Mon Jan 16 02:39:01 CET 2012 +#Tue Jan 24 17:50:37 CET 2012 eclipse.preferences.version=1 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/haystack/backends/__init__.py=utf-8 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/sortedm2m/fields.py=utf-8 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/sortedm2m/forms.py=utf-8 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/sortedm2m/tests.py=utf-8 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/whoosh/analysis.py=utf8 +encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/wikitools/api.py=utf-8 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/wikitools/wiki.py=utf-8 encoding//web/hdabo/forms.py=utf-8 encoding//web/hdabo/management/commands/import_csv.py=utf-8 diff -r e7086d345a7c -r 0d2bfd84b989 web/hdabo/management/commands/export_tags_csv.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/web/hdabo/management/commands/export_tags_csv.py Thu Jan 26 18:19:37 2012 +0100 @@ -0,0 +1,155 @@ +''' +Created on Jan 25, 2012 + +@author: ymh +''' + +from django.core.management.base import BaseCommand, CommandError +from django.db.models import Q +from hdabo import utils +from hdabo.models import Tag +from optparse import make_option +import csv +import cStringIO +import codecs + +class UnicodeWriter: + """ + A CSV writer which will write rows to CSV file "f", + which is encoded in the given encoding. + """ + + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): + # Redirect output to a queue + self.queue = cStringIO.StringIO() + self.writer = csv.writer(self.queue, dialect=dialect, **kwds) + self.stream = f + self.encoder = codecs.getincrementalencoder(encoding)() + + def writerow(self, row): + self.writer.writerow([s.encode("utf-8") for s in row]) + # Fetch UTF-8 output from the queue ... + data = self.queue.getvalue() + data = data.decode("utf-8") + # ... and reencode it into the target encoding + data = self.encoder.encode(data) + # write to the target stream + self.stream.write(data) + # empty queue + self.queue.truncate(0) + + def writerows(self, rows): + for row in rows: + self.writerow(row) + +class Command(BaseCommand): + ''' + Command to export tags + ''' + args = '' + options = '[-c|--category filter by category name] [-e|--encoding csv file encoding]' + help = """export csv files for hdabo +Options: + -c, --category : filter by category + --lines : max number of lines to load (for each file). 0 means all. + --encoding : files encoding. default to latin-1""" + + option_list = BaseCommand.option_list + ( + make_option("-c","--category", + action='append', + type='string', + dest='categories', + default=[], + help='filter tag by category (or)'), + make_option("-e","--encoding", + action='store', + type='string', + dest='encoding', + default="utf-8", + help='file encoding, default utf-8'), + make_option("-f","--force", + action='store_true', + dest='force', + default=False, + help='force file overwrite'), + ) + + def handle(self, *args, **options): + + if len(args) == 0 or not args[0]: + raise CommandError("Gives at last one csv file to export") + + self.encoding = options.get("encoding", "utf-8") + self.categories = options.get("categories", []) + self.force = options.get("force", False) + self.base_path = args[0].strip() + self.interactive = options.get("interactive",True) + + files_path = { + "visible" : { 'path':self.base_path + "_visible.txt",}, + "hidden" : { 'path':self.base_path + "_hidden.txt",}, + "infobox" : { 'path':self.base_path + "_infobox.txt",}, + } + + try: + for filedef in files_path.itervalues(): + try: + filedef['file'] = open(filedef['path'],'a') + if (not self.force) and self.interactive: + print filedef['path'] + resp = raw_input("export file already exists. override ? type yes to continue : ") + if resp is not None and (resp.lower() == "yes" or resp.lower() == "y"): + self.force = True + # clear file + filedef['file'].truncate() + else: + return "error" + elif not self.interactive and not self.force: + print "Export file %s already exists. Exit." % (filedef['path']) + return "error" + except: + filedef['file'] = open(filedef['path'],'w') + + filedef['csv'] = UnicodeWriter(filedef['file'], doublequote=False, escapechar="\\", encoding=self.encoding) + + queryset = Tag.objects.exclude(wikipedia_pageid= None) + cat_filter = None + for cat in self.categories: + if cat_filter is None: + cat_filter = Q(category__label = cat) + else: + cat_filter = cat_filter | Q(category__label = cat) + if cat_filter is not None: + queryset = queryset.filter(cat_filter) + + tcount = queryset.count() + + print "Exporting %d tags" % (tcount) + writer = None + + for i,t in enumerate(queryset.order_by("label")): + + writer = utils.show_progress(i+1, tcount, t.label, 50, writer) + #normal category + row = [t.label,] + [cat.wp_category.label for cat in t.tagwpcategory_set.filter(hidden=False)] + files_path['visible']['csv'].writerow(row) + + #hidden category + row = [t.label,] + [cat.wp_category.label for cat in t.tagwpcategory_set.filter(hidden=True)] + files_path['hidden']['csv'].writerow(row) + + #infobox + for i in t.infoboxes.all(): + vec = [[p.param_name,p.param_value.replace('\n',"\\n")] for p in i.infoboxparameter_set.all()] + ib_params = [num for elem in vec for num in elem] + row = [t.label, i.name.strip()] + ib_params + files_path['infobox']['csv'].writerow(row) + + finally: + for filedef in files_path.itervalues(): + if filedef.get('file',None): + filedef['file'].close() + + + + \ No newline at end of file diff -r e7086d345a7c -r 0d2bfd84b989 web/hdabo/management/commands/query_wikipedia_category.py --- a/web/hdabo/management/commands/query_wikipedia_category.py Tue Jan 17 00:19:27 2012 +0100 +++ b/web/hdabo/management/commands/query_wikipedia_category.py Thu Jan 26 18:19:37 2012 +0100 @@ -28,8 +28,9 @@ START_PATTERN = re.compile(u"\{\{\s?Infobox\s+([^|]+)", re.M|re.U|re.I) END_PATTERN = re.compile(u"\{\{|\}\}", re.M|re.U) -SPLIT_PATTERN = re.compile("\s*?\|\s*([\w]+[\w \t-]*)\s*=", re.U|re.M) +SPLIT_PATTERN = re.compile("\s*?\|\s*([\w]+[^=|]*)\s*=", re.U|re.M) DELIMITER_PATTERN = re.compile("\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]") +COMMENT_PATTERN = re.compile("",re.U|re.M) @@ -80,9 +81,20 @@ type='choice', choices=['visible','hidden', 'infobox', 'all'], default=[], - help='what type of query to oerform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be assed multiple times' - ), - ) + help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be assed multiple times'), + make_option('--use-label', + action='store_true', + dest='use_label', + default=False, + help='use label instead of pageid to query wikipedia'), + make_option('--tag', + action='append', + dest='tags', + type='string', + default=[], + help='the tag to query'), + + ) # def process_wp_response(self, label, response): @@ -111,10 +123,10 @@ # # return new_label, status, url, pageid - def query_all_categories(self, hidden, site, pageid): + def query_all_categories(self, hidden, site, pageid, use_label): clshow = 'hidden' if hidden else '!hidden' - params = {'action':'query', 'pageids': pageid, 'prop':'categories', 'clshow': clshow} + params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow} clcontinue = "" res = [] @@ -126,6 +138,11 @@ wpquery = api.APIRequest(site, params) #@UndefinedVariable response = wpquery.query() + if self.verbosity > 1: + print "Query infoboxes : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data()) + print repr(response) + + query_dict = response.get('query', None) if query_dict is None: @@ -144,6 +161,10 @@ res.append(title) clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None) + + if self.verbosity > 1: + print "Query infoboxes RES: " + print repr(res) return res @@ -154,10 +175,10 @@ TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden) - def query_infoboxes(self, site, pageid): + def query_infoboxes(self, site, pageid, use_label): res = [] - params = {'action':'query', 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'} + params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'} wpquery = api.APIRequest(site, params) #@UndefinedVariable response = wpquery.query() @@ -205,8 +226,13 @@ res.append((content[current_start:resm.end()+start], current_infobox_name)) start += resm.end()+1 + return_val = (rev['revid'],res) - return rev['revid'],res + if self.verbosity > 1: + print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data()) + print repr(return_val) + + return return_val def split_infoboxes(self, src): @@ -255,7 +281,9 @@ tag_infobox.source = src tag_infobox.save() + src = COMMENT_PATTERN.sub('',src) src = START_PATTERN.sub('',src[:-2]).strip() + keyvalues = self.split_infoboxes(src) for key,value in itertools.izip(*[itertools.islice(keyvalues, i, None, 2) for i in range(2)]): @@ -270,7 +298,8 @@ interactive = options.get('interactive', True) - verbosity = int(options.get('verbosity', '1')) + self.verbosity = int(options.get('verbosity', '1')) + use_label = options.get('use_label', False) force = options.get('force', False) @@ -290,16 +319,20 @@ for t in types_list: types_mask |= TYPES_MASK_DICT[t] - if verbosity > 1 : + if self.verbosity > 1 : print "types mask %s " % (bin(types_mask)) - if verbosity > 2: + if self.verbosity > 2: print "option passed : " + repr(options) queryset = Tag.objects.exclude(wikipedia_pageid= None) - if not options.get('all',False): + tag_list = options.get("tags", []); + + if tag_list: + queryset = queryset.filter(label__in=tag_list) + elif not options.get('all',False): queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0) #else: # queryset = Tag.objects.filter(url_status=None) @@ -314,14 +347,14 @@ elif start > 0: queryset = queryset[start:] - if verbosity > 2 : + if self.verbosity > 2 : print "Tag Query is %s" % (queryset.query) site = wiki.Wiki(site_url) #@UndefinedVariable count = queryset.count() - if verbosity > 1: + if self.verbosity > 1: print "Processing %d tags" % (count) if not force and interactive: @@ -337,26 +370,26 @@ for i, tag in enumerate(queryset): - if verbosity > 1: + if self.verbosity > 1: print "processing tag %s (%d/%d)" % (tag.label, i + 1, count) else: utils.show_progress(i + 1, count, tag.label, 60) # query categories - wikipedia_pageid = tag.wikipedia_pageid + wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None : - wikipedia_pageid = tag.alternative_wikipedia_pageid + wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid with transaction.commit_on_success(): if types_mask & TYPES_MASK_DICT['visible']: - res = self.query_all_categories(False, site, wikipedia_pageid) + res = self.query_all_categories(False, site, wikipedia_pageid, use_label) self.process_categories(res, False, tag) if types_mask & TYPES_MASK_DICT['hidden']: - res = self.query_all_categories(True, site, wikipedia_pageid) + res = self.query_all_categories(True, site, wikipedia_pageid, use_label) self.process_categories(res, True, tag) if types_mask & TYPES_MASK_DICT['infobox']: - res = self.query_infoboxes(site, wikipedia_pageid) + res = self.query_infoboxes(site, wikipedia_pageid, use_label) self.process_infoboxes(res, tag) diff -r e7086d345a7c -r 0d2bfd84b989 web/hdabo/utils.py --- a/web/hdabo/utils.py Tue Jan 17 00:19:27 2012 +0100 +++ b/web/hdabo/utils.py Thu Jan 26 18:19:37 2012 +0100 @@ -3,6 +3,7 @@ import unicodedata import sys import math +import codecs ### # allow to declare a property as a decorator @@ -348,7 +349,12 @@ def normalize(str): return remove_accents(str).lower().replace(u"œ",u"oe") -def show_progress(current_line, total_line, label, width): +def show_progress(current_line, total_line, label, width, writer=None): + + if writer is None: + writer = sys.stdout + if sys.stdout.encoding is not None: + writer = codecs.getwriter(sys.stdout.encoding)(sys.stdout) percent = (float(current_line) / float(total_line)) * 100.0 @@ -357,8 +363,12 @@ loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']' - sys.stdout.write(u"%s %d%% %d/%d - %r\r" % (loader, percent, current_line, total_line, label[:50].rjust(50))) #takes the header into account + str = u"%s %d%% %d/%d - %s\r" % (loader, percent, current_line, total_line, label[:50].rjust(50)) + + writer.write(str) #takes the header into account if percent >= 100: - sys.stdout.write("\n") - sys.stdout.flush() + writer.write("\n") + writer.flush() + + return writer