improve cat and infobox extraction + export csv
authorymh <ymh.work@gmail.com>
Thu, 26 Jan 2012 18:19:37 +0100
changeset 113 0d2bfd84b989
parent 112 e7086d345a7c
child 114 c59383cc9940
improve cat and infobox extraction + export csv
.settings/org.eclipse.core.resources.prefs
web/hdabo/management/commands/export_tags_csv.py
web/hdabo/management/commands/query_wikipedia_category.py
web/hdabo/utils.py
--- a/.settings/org.eclipse.core.resources.prefs	Tue Jan 17 00:19:27 2012 +0100
+++ b/.settings/org.eclipse.core.resources.prefs	Thu Jan 26 18:19:37 2012 +0100
@@ -1,10 +1,11 @@
-#Mon Jan 16 02:39:01 CET 2012
+#Tue Jan 24 17:50:37 CET 2012
 eclipse.preferences.version=1
 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/haystack/backends/__init__.py=utf-8
 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/sortedm2m/fields.py=utf-8
 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/sortedm2m/forms.py=utf-8
 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/sortedm2m/tests.py=utf-8
 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/whoosh/analysis.py=utf8
+encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/wikitools/api.py=utf-8
 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/wikitools/wiki.py=utf-8
 encoding//web/hdabo/forms.py=utf-8
 encoding//web/hdabo/management/commands/import_csv.py=utf-8
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/web/hdabo/management/commands/export_tags_csv.py	Thu Jan 26 18:19:37 2012 +0100
@@ -0,0 +1,155 @@
+'''
+Created on Jan 25, 2012
+
+@author: ymh
+'''
+
+from django.core.management.base import BaseCommand, CommandError
+from django.db.models import Q
+from hdabo import utils
+from hdabo.models import Tag
+from optparse import make_option
+import csv
+import cStringIO
+import codecs
+
+class UnicodeWriter:
+    """
+    A CSV writer which will write rows to CSV file "f",
+    which is encoded in the given encoding.
+    """
+
+    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+        # Redirect output to a queue
+        self.queue = cStringIO.StringIO()
+        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+        self.stream = f
+        self.encoder = codecs.getincrementalencoder(encoding)()
+
+    def writerow(self, row):
+        self.writer.writerow([s.encode("utf-8") for s in row])
+        # Fetch UTF-8 output from the queue ...
+        data = self.queue.getvalue()
+        data = data.decode("utf-8")
+        # ... and reencode it into the target encoding
+        data = self.encoder.encode(data)
+        # write to the target stream
+        self.stream.write(data)
+        # empty queue
+        self.queue.truncate(0)
+
+    def writerows(self, rows):
+        for row in rows:
+            self.writerow(row)
+
+class Command(BaseCommand):
+    '''
+    Command to export tags
+    '''
+    args = '<path_to_name_base_csv_file>'
+    options = '[-c|--category filter by category name] [-e|--encoding csv file encoding]'
+    help = """export csv files for hdabo
+Options:
+    -c, --category : filter by category
+    --lines : max number of lines to load (for each file). 0 means all.
+    --encoding : files encoding. default to utf-8"""
+    
+    option_list = BaseCommand.option_list + (
+        make_option("-c","--category",
+            action='append',
+            type='string',
+            dest='categories',
+            default=[],
+            help='filter tag by category (or)'),
+        make_option("-e","--encoding",
+            action='store',
+            type='string',
+            dest='encoding',
+            default="utf-8",
+            help='file encoding, default utf-8'),
+        make_option("-f","--force",
+            action='store_true',
+            dest='force',
+            default=False,
+            help='force file overwrite'),
+        )
+        
+    def handle(self, *args, **options):
+        
+        if len(args) == 0 or not args[0]:
+            raise CommandError("Give at least one csv file to export")
+
+        self.encoding = options.get("encoding", "utf-8")
+        self.categories = options.get("categories", [])
+        self.force = options.get("force", False)
+        self.base_path = args[0].strip()
+        self.interactive = options.get("interactive",True)
+        
+        files_path = {
+            "visible" : { 'path':self.base_path + "_visible.txt",},
+            "hidden" : { 'path':self.base_path + "_hidden.txt",},
+            "infobox" : { 'path':self.base_path + "_infobox.txt",},
+        }
+        
+        try:
+            for filedef in files_path.itervalues():
+                try:
+                    filedef['file'] = open(filedef['path'],'a')
+                    if (not self.force) and self.interactive:
+                        print filedef['path']
+                        resp = raw_input("export file already exists. override ? type yes to continue : ")
+                        if resp is not None and (resp.lower() == "yes" or resp.lower() == "y"):
+                            self.force = True
+                            # clear file
+                            filedef['file'].truncate()
+                        else:
+                            return "error"
+                    elif not self.interactive and not self.force:
+                        print "Export file %s already exists. Exit." % (filedef['path'])
+                        return "error"
+                except:
+                    filedef['file'] = open(filedef['path'],'w')
+                
+                filedef['csv'] = UnicodeWriter(filedef['file'], doublequote=False, escapechar="\\", encoding=self.encoding)
+                
+            queryset = Tag.objects.exclude(wikipedia_pageid= None)
+            cat_filter = None
+            for cat in self.categories:
+                if cat_filter is None:
+                    cat_filter = Q(category__label = cat)
+                else:
+                    cat_filter = cat_filter | Q(category__label = cat)
+            if cat_filter is not None:
+                queryset = queryset.filter(cat_filter)
+            
+            tcount = queryset.count()
+            
+            print "Exporting %d tags" % (tcount)
+            writer = None
+                
+            for i,t in enumerate(queryset.order_by("label")):
+                
+                writer = utils.show_progress(i+1, tcount, t.label, 50, writer)
+                #normal category
+                row = [t.label,] + [cat.wp_category.label for cat in t.tagwpcategory_set.filter(hidden=False)]
+                files_path['visible']['csv'].writerow(row)
+                
+                #hidden category
+                row = [t.label,] + [cat.wp_category.label for cat in t.tagwpcategory_set.filter(hidden=True)]
+                files_path['hidden']['csv'].writerow(row)
+                
+                #infobox
+                for i in t.infoboxes.all():
+                    vec = [[p.param_name,p.param_value.replace('\n',"\\n")] for p in i.infoboxparameter_set.all()]
+                    ib_params = [num for elem in vec for num in elem]
+                    row = [t.label, i.name.strip()] + ib_params
+                    files_path['infobox']['csv'].writerow(row)
+            
+        finally:
+            for filedef in files_path.itervalues():
+                if filedef.get('file',None):
+                    filedef['file'].close()
+
+            
+        
+        
\ No newline at end of file
--- a/web/hdabo/management/commands/query_wikipedia_category.py	Tue Jan 17 00:19:27 2012 +0100
+++ b/web/hdabo/management/commands/query_wikipedia_category.py	Thu Jan 26 18:19:37 2012 +0100
@@ -28,8 +28,9 @@
 
 START_PATTERN = re.compile(u"\{\{\s?Infobox\s+([^|]+)", re.M|re.U|re.I)
 END_PATTERN = re.compile(u"\{\{|\}\}", re.M|re.U)
-SPLIT_PATTERN = re.compile("\s*?\|\s*([\w]+[\w \t-]*)\s*=", re.U|re.M)
+SPLIT_PATTERN = re.compile("\s*?\|\s*([\w]+[^=|]*)\s*=", re.U|re.M)
 DELIMITER_PATTERN = re.compile("\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
+COMMENT_PATTERN = re.compile("<!--.*?-->",re.U|re.M)
 
 
 
@@ -80,9 +81,20 @@
             type='choice',
             choices=['visible','hidden', 'infobox', 'all'],
             default=[],
-            help='what type of query to oerform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be assed multiple times'
-        ),
-        )
+            help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be passed multiple times'),
+        make_option('--use-label',
+            action='store_true',
+            dest='use_label',
+            default=False,
+            help='use label instead of pageid to query wikipedia'),
+        make_option('--tag',
+            action='append',
+            dest='tags',
+            type='string',
+            default=[],
+            help='the tag to query'),
+
+    )
     
     
 #    def process_wp_response(self, label, response):        
@@ -111,10 +123,10 @@
 #        
 #        return new_label, status, url, pageid 
 
-    def query_all_categories(self, hidden, site, pageid):
+    def query_all_categories(self, hidden, site, pageid, use_label):
         
         clshow = 'hidden' if hidden else '!hidden'
-        params = {'action':'query', 'pageids': pageid, 'prop':'categories', 'clshow': clshow}
+        params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow}
         
         clcontinue = ""        
         res = []
@@ -126,6 +138,11 @@
             wpquery = api.APIRequest(site, params) #@UndefinedVariable
             response = wpquery.query()
             
+            if self.verbosity > 1:
+                print "Query categories : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
+                print repr(response)
+
+            
             query_dict = response.get('query', None)
             
             if query_dict is None:
@@ -144,6 +161,10 @@
                     res.append(title)
             
             clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None)
+
+        if self.verbosity > 1:
+            print "Query categories RES: "
+            print repr(res)
             
         return res
     
@@ -154,10 +175,10 @@
             TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)
             
                 
-    def query_infoboxes(self, site, pageid):
+    def query_infoboxes(self, site, pageid, use_label):
         
         res = []
-        params = {'action':'query', 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
+        params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
         wpquery = api.APIRequest(site, params) #@UndefinedVariable
         response = wpquery.query()
         
@@ -205,8 +226,13 @@
                     res.append((content[current_start:resm.end()+start], current_infobox_name))
                 start += resm.end()+1
 
+        return_val = (rev['revid'],res)
         
-        return rev['revid'],res
+        if self.verbosity > 1:
+            print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
+            print repr(return_val)
+        
+        return return_val
     
     def split_infoboxes(self, src):
         
@@ -255,7 +281,9 @@
                 tag_infobox.source = src
                 tag_infobox.save()
 
+            src = COMMENT_PATTERN.sub('',src)
             src = START_PATTERN.sub('',src[:-2]).strip()
+            
             keyvalues = self.split_infoboxes(src)
 
             for key,value in itertools.izip(*[itertools.islice(keyvalues, i, None, 2) for i in range(2)]):
@@ -270,7 +298,8 @@
         
         interactive = options.get('interactive', True)
         
-        verbosity = int(options.get('verbosity', '1'))
+        self.verbosity = int(options.get('verbosity', '1'))
+        use_label = options.get('use_label', False)
         
         force = options.get('force', False)
         
@@ -290,16 +319,20 @@
             for t in types_list:
                 types_mask |=  TYPES_MASK_DICT[t]
                 
-        if verbosity > 1 :
+        if self.verbosity > 1 :
             print "types mask %s " % (bin(types_mask))  
         
-        if verbosity > 2:
+        if self.verbosity > 2:
             print "option passed : " + repr(options)
 
 
         queryset = Tag.objects.exclude(wikipedia_pageid= None)
         
-        if not options.get('all',False):
+        tag_list = options.get("tags", []);
+        
+        if tag_list:
+            queryset = queryset.filter(label__in=tag_list)
+        elif not options.get('all',False):            
             queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
         #else:
         #    queryset = Tag.objects.filter(url_status=None)                    
@@ -314,14 +347,14 @@
         elif start > 0:
             queryset = queryset[start:]            
         
-        if verbosity > 2 :
+        if self.verbosity > 2 :
             print "Tag Query is %s" % (queryset.query)
         
         site = wiki.Wiki(site_url) #@UndefinedVariable
         
         
         count = queryset.count()
-        if verbosity > 1:
+        if self.verbosity > 1:
             print "Processing %d tags" % (count)
         
         if not force and interactive:
@@ -337,26 +370,26 @@
         
         for i, tag in enumerate(queryset):
             
-            if verbosity > 1:
+            if self.verbosity > 1:
                 print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
             else:
                 utils.show_progress(i + 1, count, tag.label, 60)                            
 
             # query categories
-            wikipedia_pageid = tag.wikipedia_pageid
+            wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
             if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None :
-                wikipedia_pageid = tag.alternative_wikipedia_pageid
+                wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid
 
             with transaction.commit_on_success():
                 if types_mask & TYPES_MASK_DICT['visible']:
-                    res = self.query_all_categories(False, site, wikipedia_pageid)
+                    res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
                     self.process_categories(res, False, tag)
     
                 if types_mask & TYPES_MASK_DICT['hidden']:
-                    res = self.query_all_categories(True, site, wikipedia_pageid)
+                    res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
                     self.process_categories(res, True, tag)
                 
                 if types_mask & TYPES_MASK_DICT['infobox']:
-                    res = self.query_infoboxes(site, wikipedia_pageid)
+                    res = self.query_infoboxes(site, wikipedia_pageid, use_label)
                     self.process_infoboxes(res, tag)
             
--- a/web/hdabo/utils.py	Tue Jan 17 00:19:27 2012 +0100
+++ b/web/hdabo/utils.py	Thu Jan 26 18:19:37 2012 +0100
@@ -3,6 +3,7 @@
 import unicodedata
 import sys
 import math
+import codecs
 
 ###
 # allow to declare a property as a decorator
@@ -348,7 +349,12 @@
 def normalize(str):
     return remove_accents(str).lower().replace(u"œ",u"oe")
 
-def show_progress(current_line, total_line, label, width):
+def show_progress(current_line, total_line, label, width, writer=None):
+
+    if writer is None:
+        writer = sys.stdout
+        if sys.stdout.encoding is not None:
+            writer = codecs.getwriter(sys.stdout.encoding)(sys.stdout)
 
     percent = (float(current_line) / float(total_line)) * 100.0
 
@@ -357,8 +363,12 @@
 
     loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']'
     
-    sys.stdout.write(u"%s %d%% %d/%d - %r\r" % (loader, percent, current_line, total_line, label[:50].rjust(50))) #takes the header into account
+    str = u"%s %d%% %d/%d - %s\r" % (loader, percent, current_line, total_line, label[:50].rjust(50))
+    
+    writer.write(str) #takes the header into account
     if percent >= 100:
-        sys.stdout.write("\n")
-    sys.stdout.flush()
+        writer.write("\n")
+    writer.flush()
+    
+    return writer