web/hdalab/management/commands/query_category_inclusion.py
author veltr
Mon, 02 Jul 2012 19:19:02 +0200
changeset 204 0a1744477bc1
child 206 7070d3acc3d4
permissions -rw-r--r--
Added category tree Ajax API + Tables
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
204
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
     1
# -*- coding: utf-8 -*-
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
     2
'''
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
     3
Created on July 2, 2012
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
     4
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
     5
@author: raphv
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
     6
'''
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
     7
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
     8
from django.conf import settings
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
     9
from django.core.management.base import NoArgsCommand
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    10
from django.core.management.color import no_style
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    11
from hdalab.models import WpCategory, WpCategoryInclusion
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    12
from optparse import make_option
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    13
from wikitools import api,wiki
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    14
import sys
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    15
import re
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    16
import itertools
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    17
from hdabo import utils
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    18
from django.db.models import Count
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    19
from django.db import transaction
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    20
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    21
# Title prefix of category pages. It is prepended to a stored category label
# to build the page title that is queried, and it is used to recognise (and
# strip) sub-category titles among the members returned by the API.
CATEGORY_PREFIX = u'Catégorie:'
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    22
0a1744477bc1 Added category tree Ajax API + Tables
veltr
parents:
diff changeset
    23
class Command(NoArgsCommand):
    '''
    Query Wikipedia for the sub-categories of stored categories.

    For every selected WpCategory, the members of the matching category page
    are fetched through the MediaWiki API; each member that is itself a
    category is stored as a WpCategory (created when missing) and linked to
    its parent through a WpCategoryInclusion row.
    '''
    options = ''
    help = """query wikipedia for the sub-categories of stored categories and record the inclusions."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all categories to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on categories'),
        make_option('--site',
            action='store',
            type='string',
            dest='site_url',
            default="http://fr.wikipedia.org/w/api.php",
            help='the url for the wikipedia site'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default=-1,
            help='number of categories to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of categories to ignore'),
        # No list default here: optparse appends in place to the default
        # object, so a shared [] would accumulate values across parses.
        make_option('--category',
            action='append',
            dest='category',
            type='string',
            default=None,
            help='the categories to query'),

    )

    def query_all_categories(self, category_title, site):
        '''
        Return the labels of the sub-categories of a category page.

        category_title -- full page title, including the category prefix.
        site -- wikitools Wiki object the request is sent to.

        Returns a list of member titles that start with CATEGORY_PREFIX,
        with that leading prefix removed. Members whose title does not start
        with the prefix are ignored.
        '''
        params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': category_title,
            'cmlimit': 'max',
        }

        wpquery = api.APIRequest(site, params) #@UndefinedVariable
        response = wpquery.query()

        if self.verbosity > 1:
            print("Query category : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data()))
            print(repr(response))

        members = response.get('query', {}).get('categorymembers', [])

        # Only the leading prefix is stripped: a plain prefix test and slice
        # leaves any later occurrence of the prefix text in the label intact.
        prefix_length = len(CATEGORY_PREFIX)
        res = []
        for member in members:
            title = member.get('title', "")
            if title.startswith(CATEGORY_PREFIX):
                res.append(title[prefix_length:])

        if self.verbosity > 1:
            print("Query categories result: ")
            print(repr(res))

        return res

    def process_categories(self, cat_list, parent_cat):
        '''
        Record every label of cat_list as a child category of parent_cat.

        cat_list -- iterable of category labels (without the prefix).
        parent_cat -- the parent WpCategory instance.

        Both the child WpCategory and the WpCategoryInclusion link are
        fetched or created, so existing rows are reused.
        '''
        for cat in cat_list:
            child_cat, _ = WpCategory.objects.get_or_create(label=cat)
            WpCategoryInclusion.objects.get_or_create(parent_category=parent_cat, child_category=child_cat)

    def handle_noargs(self, **options):
        '''
        Select the categories to process, ask for confirmation, then query
        and store the sub-categories of each of them.

        Options read: interactive, verbosity, force, limit, start, site_url,
        random, category, all.
        '''
        self.style = no_style()

        interactive = options.get('interactive', True)
        self.verbosity = int(options.get('verbosity', '1'))
        force = options.get('force', False)
        random = options.get('random', False)

        # Normalise missing/None values so the slicing below is well defined.
        limit = options.get("limit", -1)
        if limit is None:
            limit = -1
        start = options.get("start", 0) or 0

        # The setting is only read when no url was passed, so a missing
        # WIKIPEDIA_API_URL does not break a run that supplies --site.
        site_url = options.get('site_url')
        if not site_url:
            site_url = settings.WIKIPEDIA_API_URL

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        queryset = WpCategory.objects

        cat_list = options.get("category") or []

        if cat_list:
            # Explicit labels take precedence over --all.
            queryset = queryset.filter(label__in=cat_list)
        elif not options.get('all', False):
            # Default: only categories that have no child category yet.
            queryset = queryset.annotate(wpc=Count('child_categories')).filter(wpc=0)

        if random:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")

        # --start is a number of rows to skip and --limit a number of rows to
        # process, so the upper bound of the slice is start + limit.
        if limit >= 0:
            queryset = queryset[start:start + limit]
        elif start > 0:
            queryset = queryset[start:]

        if self.verbosity > 2:
            print("Category Query is %s" % (queryset.query,))

        site = wiki.Wiki(site_url) #@UndefinedVariable

        count = queryset.count()
        if self.verbosity > 1:
            print("Processing %d categories" % (count,))

        if not force and interactive:
            confirm = raw_input("You have requested to query and replace the wikipedia information for %d categories.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("wikipedia query cancelled")
            return

        for i, category in enumerate(queryset):

            if self.verbosity > 1:
                print("processing category %s (%d/%d)" % (category.label, i + 1, count))
            else:
                utils.show_progress(i + 1, count, category.label, 60)

            title = CATEGORY_PREFIX + category.label

            # The remote query does no database work, so it runs before the
            # transaction is opened; only the writes are wrapped.
            res = self.query_all_categories(title, site)
            with transaction.commit_on_success():
                self.process_categories(res, category)