--- a/.settings/org.eclipse.core.resources.prefs Tue Jan 17 00:19:27 2012 +0100
+++ b/.settings/org.eclipse.core.resources.prefs Thu Jan 26 18:19:37 2012 +0100
@@ -1,10 +1,11 @@
-#Mon Jan 16 02:39:01 CET 2012
+#Tue Jan 24 17:50:37 CET 2012
eclipse.preferences.version=1
encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/haystack/backends/__init__.py=utf-8
encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/sortedm2m/fields.py=utf-8
encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/sortedm2m/forms.py=utf-8
encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/sortedm2m/tests.py=utf-8
encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/whoosh/analysis.py=utf8
+encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/wikitools/api.py=utf-8
encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/wikitools/wiki.py=utf-8
encoding//web/hdabo/forms.py=utf-8
encoding//web/hdabo/management/commands/import_csv.py=utf-8
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/web/hdabo/management/commands/export_tags_csv.py Thu Jan 26 18:19:37 2012 +0100
@@ -0,0 +1,155 @@
+'''
+Created on Jan 25, 2012
+
+@author: ymh
+'''
+
+from django.core.management.base import BaseCommand, CommandError
+from django.db.models import Q
+from hdabo import utils
+from hdabo.models import Tag
+from optparse import make_option
+import csv
+import cStringIO
+import codecs
+
+class UnicodeWriter:
+    """
+    CSV writer that sends rows to the stream "f" in the given encoding.
+    Cells go through the csv module as UTF-8, then are re-encoded for "f".
+    """
+
+    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+        self.stream = f
+        # the csv writer only ever fills this in-memory buffer
+        self.queue = cStringIO.StringIO()
+        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+        self.encoder = codecs.getincrementalencoder(encoding)()
+
+    def writerow(self, row):
+        """Format one row (each cell must provide encode()) and write it."""
+        utf8_cells = []
+        for cell in row:
+            utf8_cells.append(cell.encode("utf-8"))
+        self.writer.writerow(utf8_cells)
+        # take the UTF-8 line back from the buffer, re-encode it, reset the buffer
+        line = self.queue.getvalue().decode("utf-8")
+        self.stream.write(self.encoder.encode(line))
+        self.queue.truncate(0)
+
+    def writerows(self, rows):
+        """Write every row of "rows", in order."""
+        for row in rows:
+            self.writerow(row)
+
+class Command(BaseCommand):
+    '''
+    Command to export the tags that have a wikipedia page id to csv files.
+
+    The positional argument is a base path to which "_visible.txt",
+    "_hidden.txt" and "_infobox.txt" are appended : one file for the visible
+    wikipedia categories, one for the hidden ones, one for the infoboxes.
+    '''
+    args = '<path_to_name_base_csv_file>'
+    options = '[-c|--category filter by category name] [-e|--encoding csv file encoding]'
+    help = """export csv files for hdabo
+Options:
+    -c, --category : filter by category
+    -e, --encoding : files encoding. default to utf-8
+    -f, --force : overwrite existing export files without asking"""
+
+    option_list = BaseCommand.option_list + (
+        make_option("-c","--category",
+            action='append',
+            type='string',
+            dest='categories',
+            default=[],
+            help='filter tag by category (or)'),
+        make_option("-e","--encoding",
+            action='store',
+            type='string',
+            dest='encoding',
+            default="utf-8",
+            help='file encoding, default utf-8'),
+        make_option("-f","--force",
+            action='store_true',
+            dest='force',
+            default=False,
+            help='force file overwrite'),
+        )
+
+    def handle(self, *args, **options):
+        '''
+        Write the three export files whose names derive from args[0].
+
+        An existing file is overwritten only with --force or after a "yes"
+        at the prompt; otherwise the command stops and returns "error".
+        Raises CommandError when no base path is given.
+        '''
+        import os  # local import : only the overwrite check needs it
+
+        if len(args) == 0 or not args[0]:
+            raise CommandError("Give at least one csv file to export")
+
+        self.encoding = options.get("encoding", "utf-8")
+        self.categories = options.get("categories", [])
+        self.force = options.get("force", False)
+        self.base_path = args[0].strip()
+        self.interactive = options.get("interactive",True)
+
+        files_path = {
+            "visible" : { 'path':self.base_path + "_visible.txt",},
+            "hidden" : { 'path':self.base_path + "_hidden.txt",},
+            "infobox" : { 'path':self.base_path + "_infobox.txt",},
+        }
+
+        try:
+            for filedef in files_path.itervalues():
+                # test existence before opening : opening for writing creates the file
+                if os.path.exists(filedef['path']) and not self.force:
+                    if not self.interactive:
+                        print "Export file %s already exists. Exit." % (filedef['path'])
+                        return "error"
+                    print filedef['path']
+                    resp = raw_input("export file already exists. override ? type yes to continue : ")
+                    if resp.strip().lower() not in ("yes", "y"):
+                        return "error"
+                    self.force = True  # a positive answer also covers the remaining files
+                # 'w' truncates : an overwrite never appends to the old content
+                filedef['file'] = open(filedef['path'],'w')
+                filedef['csv'] = UnicodeWriter(filedef['file'], doublequote=False, escapechar="\\", encoding=self.encoding)
+
+            queryset = Tag.objects.exclude(wikipedia_pageid= None)
+            cat_filter = None
+            for cat in self.categories:
+                if cat_filter is None:
+                    cat_filter = Q(category__label = cat)
+                else:
+                    cat_filter = cat_filter | Q(category__label = cat)
+            if cat_filter is not None:
+                queryset = queryset.filter(cat_filter)
+
+            tcount = queryset.count()
+            print "Exporting %d tags" % (tcount)
+            writer = None
+
+            for i,t in enumerate(queryset.order_by("label")):
+                writer = utils.show_progress(i+1, tcount, t.label, 50, writer)
+                #normal category
+                row = [t.label,] + [cat.wp_category.label for cat in t.tagwpcategory_set.filter(hidden=False)]
+                files_path['visible']['csv'].writerow(row)
+                #hidden category
+                row = [t.label,] + [cat.wp_category.label for cat in t.tagwpcategory_set.filter(hidden=True)]
+                files_path['hidden']['csv'].writerow(row)
+                #infobox : one row per infobox, parameters flattened as name, value, ...
+                for infobox in t.infoboxes.all():
+                    row = [t.label, infobox.name.strip()]
+                    for p in infobox.infoboxparameter_set.all():
+                        # escape newlines so that a value stays on one line
+                        row.extend([p.param_name, p.param_value.replace('\n',"\\n")])
+                    files_path['infobox']['csv'].writerow(row)
+
+        finally:
+            for filedef in files_path.itervalues():
+                if filedef.get('file',None):
+                    filedef['file'].close()
\ No newline at end of file
--- a/web/hdabo/management/commands/query_wikipedia_category.py Tue Jan 17 00:19:27 2012 +0100
+++ b/web/hdabo/management/commands/query_wikipedia_category.py Thu Jan 26 18:19:37 2012 +0100
@@ -28,8 +28,9 @@
START_PATTERN = re.compile(u"\{\{\s?Infobox\s+([^|]+)", re.M|re.U|re.I)
END_PATTERN = re.compile(u"\{\{|\}\}", re.M|re.U)
-SPLIT_PATTERN = re.compile("\s*?\|\s*([\w]+[\w \t-]*)\s*=", re.U|re.M)
+SPLIT_PATTERN = re.compile("\s*?\|\s*([\w]+[^=|]*)\s*=", re.U|re.M)
DELIMITER_PATTERN = re.compile("\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
+COMMENT_PATTERN = re.compile("<!--.*?-->",re.U|re.M)
@@ -80,9 +81,20 @@
type='choice',
choices=['visible','hidden', 'infobox', 'all'],
default=[],
- help='what type of query to oerform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be assed multiple times'
- ),
- )
+ help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be assed multiple times'),
+ make_option('--use-label',
+ action='store_true',
+ dest='use_label',
+ default=False,
+ help='use label instead of pageid to query wikipedia'),
+ make_option('--tag',
+ action='append',
+ dest='tags',
+ type='string',
+ default=[],
+ help='the tag to query'),
+
+ )
# def process_wp_response(self, label, response):
@@ -111,10 +123,10 @@
#
# return new_label, status, url, pageid
- def query_all_categories(self, hidden, site, pageid):
+ def query_all_categories(self, hidden, site, pageid, use_label):
clshow = 'hidden' if hidden else '!hidden'
- params = {'action':'query', 'pageids': pageid, 'prop':'categories', 'clshow': clshow}
+ params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow}
clcontinue = ""
res = []
@@ -126,6 +138,11 @@
wpquery = api.APIRequest(site, params) #@UndefinedVariable
response = wpquery.query()
+ if self.verbosity > 1:
+ print "Query infoboxes : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
+ print repr(response)
+
+
query_dict = response.get('query', None)
if query_dict is None:
@@ -144,6 +161,10 @@
res.append(title)
clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None)
+
+ if self.verbosity > 1:
+ print "Query infoboxes RES: "
+ print repr(res)
return res
@@ -154,10 +175,10 @@
TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)
- def query_infoboxes(self, site, pageid):
+ def query_infoboxes(self, site, pageid, use_label):
res = []
- params = {'action':'query', 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
+ params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
wpquery = api.APIRequest(site, params) #@UndefinedVariable
response = wpquery.query()
@@ -205,8 +226,13 @@
res.append((content[current_start:resm.end()+start], current_infobox_name))
start += resm.end()+1
+ return_val = (rev['revid'],res)
- return rev['revid'],res
+ if self.verbosity > 1:
+ print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
+ print repr(return_val)
+
+ return return_val
def split_infoboxes(self, src):
@@ -255,7 +281,9 @@
tag_infobox.source = src
tag_infobox.save()
+ src = COMMENT_PATTERN.sub('',src)
src = START_PATTERN.sub('',src[:-2]).strip()
+
keyvalues = self.split_infoboxes(src)
for key,value in itertools.izip(*[itertools.islice(keyvalues, i, None, 2) for i in range(2)]):
@@ -270,7 +298,8 @@
interactive = options.get('interactive', True)
- verbosity = int(options.get('verbosity', '1'))
+ self.verbosity = int(options.get('verbosity', '1'))
+ use_label = options.get('use_label', False)
force = options.get('force', False)
@@ -290,16 +319,20 @@
for t in types_list:
types_mask |= TYPES_MASK_DICT[t]
- if verbosity > 1 :
+ if self.verbosity > 1 :
print "types mask %s " % (bin(types_mask))
- if verbosity > 2:
+ if self.verbosity > 2:
print "option passed : " + repr(options)
queryset = Tag.objects.exclude(wikipedia_pageid= None)
- if not options.get('all',False):
+ tag_list = options.get("tags", []);
+
+ if tag_list:
+ queryset = queryset.filter(label__in=tag_list)
+ elif not options.get('all',False):
queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
#else:
# queryset = Tag.objects.filter(url_status=None)
@@ -314,14 +347,14 @@
elif start > 0:
queryset = queryset[start:]
- if verbosity > 2 :
+ if self.verbosity > 2 :
print "Tag Query is %s" % (queryset.query)
site = wiki.Wiki(site_url) #@UndefinedVariable
count = queryset.count()
- if verbosity > 1:
+ if self.verbosity > 1:
print "Processing %d tags" % (count)
if not force and interactive:
@@ -337,26 +370,26 @@
for i, tag in enumerate(queryset):
- if verbosity > 1:
+ if self.verbosity > 1:
print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
else:
utils.show_progress(i + 1, count, tag.label, 60)
# query categories
- wikipedia_pageid = tag.wikipedia_pageid
+ wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None :
- wikipedia_pageid = tag.alternative_wikipedia_pageid
+ wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid
with transaction.commit_on_success():
if types_mask & TYPES_MASK_DICT['visible']:
- res = self.query_all_categories(False, site, wikipedia_pageid)
+ res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
self.process_categories(res, False, tag)
if types_mask & TYPES_MASK_DICT['hidden']:
- res = self.query_all_categories(True, site, wikipedia_pageid)
+ res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
self.process_categories(res, True, tag)
if types_mask & TYPES_MASK_DICT['infobox']:
- res = self.query_infoboxes(site, wikipedia_pageid)
+ res = self.query_infoboxes(site, wikipedia_pageid, use_label)
self.process_infoboxes(res, tag)
--- a/web/hdabo/utils.py Tue Jan 17 00:19:27 2012 +0100
+++ b/web/hdabo/utils.py Thu Jan 26 18:19:37 2012 +0100
@@ -3,6 +3,7 @@
import unicodedata
import sys
import math
+import codecs
###
# allow to declare a property as a decorator
@@ -348,7 +349,12 @@
def normalize(str):
return remove_accents(str).lower().replace(u"œ",u"oe")
-def show_progress(current_line, total_line, label, width):
+def show_progress(current_line, total_line, label, width, writer=None):
+
+ if writer is None:
+ writer = sys.stdout
+ if sys.stdout.encoding is not None:
+ writer = codecs.getwriter(sys.stdout.encoding)(sys.stdout)
percent = (float(current_line) / float(total_line)) * 100.0
@@ -357,8 +363,12 @@
loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']'
- sys.stdout.write(u"%s %d%% %d/%d - %r\r" % (loader, percent, current_line, total_line, label[:50].rjust(50))) #takes the header into account
+ str = u"%s %d%% %d/%d - %s\r" % (loader, percent, current_line, total_line, label[:50].rjust(50))
+
+ writer.write(str) #takes the header into account
if percent >= 100:
- sys.stdout.write("\n")
- sys.stdout.flush()
+ writer.write("\n")
+ writer.flush()
+
+ return writer