src/hdalab/management/commands/query_category_inclusion.py
changeset 271 8f77cf71ab02
parent 216 c4953332bc52
child 571 d9642be7c937
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hdalab/management/commands/query_category_inclusion.py	Tue Jun 17 10:25:33 2014 +0200
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+'''
+Created on July 2, 2012
+
+@author: raphv
+'''
+
+from django.conf import settings
+from django.core.management.base import NoArgsCommand
+from django.core.management.color import no_style
+from hdalab.models import WpCategory, WpCategoryInclusion
+from optparse import make_option
+from wikitools import api,wiki
+import sys
+import re
+import itertools
+from hdabo import utils
+from django.db.models import Count
+from django.db import transaction
+
+# Title prefix of category pages on the (default) French wikipedia site.
+# It is prepended to a category label to build the page title to query and
+# stripped from the member titles returned by the API.
+# NOTE(review): hard-coded, so passing --site for another language edition
+# would presumably need a different prefix -- confirm before relying on it.
+CATEGORY_PREFIX = u'Catégorie:'
+
+class Command(NoArgsCommand):
+    '''
+    query and update wikipedia for tag title.
+    '''
+    options = ''
+    help = """query and update wikipedia for tag title."""
+    
+    option_list = NoArgsCommand.option_list + (
+        make_option('--all',
+            action='store_true',
+            dest='all',
+            default=False,
+            help='force all categories to be updated, not only those not yet processed'),
+        make_option('--force',
+            action='store_true',
+            dest='force',
+            default=False,
+            help='ask no questions'),
+        make_option('--site',
+            action='store',
+            type='string',
+            dest='site_url',
+            default="http://fr.wikipedia.org/w/api.php",
+            help='the url for the wikipedia site'),
+        make_option('--limit',
+            action='store',
+            type='int',
+            dest='limit',
+            default= -1,
+            help='number of categories to process'),
+        make_option('--start',
+            action='store',
+            type='int',
+            dest='start',
+            default=0,
+            help='number of categories to ignore'),
+        make_option('--category',
+            action='append',
+            dest='category',
+            type='string',
+            default=[],
+            help='the categories to query'),
+
+    )
+
+
+    def query_all_categories(self, category_title, site):
+        
+        params = {'action':'query', 'cmtitle':category_title, 'list':'categorymembers', 'cmlimit': 'max'}
+        
+        res = []
+        
+        wpquery = api.APIRequest(site, params) #@UndefinedVariable
+        response = wpquery.query()
+        
+        if self.verbosity > 1:
+            print "Query category : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
+            print repr(response)
+        
+        members = response.get('query', {}).get('categorymembers', [])
+                    
+        for member in members:
+            title = member.get('title',"")
+            if re.match(CATEGORY_PREFIX, title):
+                res.append(re.sub(CATEGORY_PREFIX, "", title))
+            
+        if self.verbosity > 1:
+            print "Query categories result: "
+            print repr(res)
+            
+        return res
+    
+    def process_categories(self, cat_list, parent_cat):
+        for cat in cat_list:
+            child_cat,created = WpCategory.objects.get_or_create(label=cat) #@UnusedVariable
+            WpCategoryInclusion.objects.get_or_create(parent_category=parent_cat, child_category=child_cat)
+        
+    def handle_noargs(self, **options):
+        
+        self.style = no_style()
+        
+        interactive = options.get('interactive', True)
+        
+        self.verbosity = int(options.get('verbosity', '1'))
+        
+        force = options.get('force', False)
+        
+        limit = options.get("limit", -1)
+        start = options.get("start", 0)
+        
+        site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
+        
+        types_mask = 0
+        
+        if self.verbosity > 2:
+            print "option passed : " + repr(options)
+
+        queryset = WpCategory.objects.filter(tags__hidden = False).distinct()
+        
+        cat_list = options.get("category", []);
+        
+        if cat_list:
+            queryset = queryset.filter(label__in=cat_list)
+        elif options.get('all',False):            
+            queryset = queryset.annotate(wpc=Count('child_categories')).filter(wpc = 0)                    
+        
+        queryset = queryset.order_by("label")
+        
+        if limit >= 0:
+            queryset = queryset[start:limit]
+        elif start > 0:
+            queryset = queryset[start:]            
+        
+        if self.verbosity > 2 :
+            print "Category Query is %s" % (queryset.query)
+        
+        site = wiki.Wiki(site_url) #@UndefinedVariable
+        
+        
+        count = queryset.count()
+        if self.verbosity > 1:
+            print "Processing %d categories" % (count)
+        
+        if not force and interactive:
+            confirm = raw_input("You have requested to query and replace the wikipedia information for %d categories.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
+        else:
+            confirm = 'yes'
+            
+        if confirm != "yes":
+            print "wikipedia query cancelled"
+            return
+        
+        for i, category in enumerate(queryset):
+            
+            if self.verbosity > 1:
+                print "processing category %s (%d/%d)" % (category.label, i + 1, count)
+            else:
+                utils.show_progress(i + 1, count, category.label, 60)                            
+                
+            title = CATEGORY_PREFIX + category.label
+            # query categories
+
+            with transaction.commit_on_success():
+                res = self.query_all_categories(title, site)
+                self.process_categories(res, category)
\ No newline at end of file