src/hdalab/management/commands/query_category_inclusion.py
author ymh <ymh.work@gmail.com>
Wed, 11 Apr 2018 12:19:47 +0200
branch documentation
changeset 693 09e00f38d177
parent 683 59d49ab04ded
permissions -rw-r--r--
Add hdabo/hdalab documentations

# -*- coding: utf-8 -*-
'''
Requête wikipedia pour reconstituer l'arbre des catégories.

Cette commande utilise directement `l'api wikipedia <https://www.mediawiki.org/wiki/API:Main_page>`_ pour faire ses requêtes.

**Usage**: ``django-admin query_category_inclusion [options]``

**Options spécifiques:**

    - *\-\-all* :               force à traiter toutes les catégories
    - *\-\-force* :             ne pose aucune question
    - *\-\-site=SITE_URL* :     url du site wikipedia (défaut: https://fr.wikipedia.org/w/api.php)
    - *\-\-limit=LIMIT* :       Nombre de catégories à traiter
    - *\-\-start=START* :       Nombre de catégories à ignorer
    - *\-\-category=CATEGORY* : Limite le traitement à cette catégorie

'''

from django.conf import settings
from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style
from hdalab.models import WpCategory, WpCategoryInclusion
from optparse import make_option
from wikitools import api,wiki
import sys
import re
import itertools
from hdabo import utils
from django.db.models import Count
from django.db import transaction

# Namespace prefix of category page titles (French, matching the default
# --site value). It is prepended to a category label to build the title sent
# to the API, and member titles starting with it are treated as sub-categories.
CATEGORY_PREFIX = u'Catégorie:'

class Command(NoArgsCommand):
    '''
    Rebuild the category inclusion tree from wikipedia.

    For every selected ``WpCategory`` the wikipedia API is asked for the
    members of that category. Each member whose title carries the category
    prefix is stored as a ``WpCategory`` and linked to the queried category
    through a ``WpCategoryInclusion`` row.
    '''
    # NOTE(review): not read anywhere in this class; kept untouched.
    options = ''
    # NOTE(review): this text looks copied from another command; it is left
    # unchanged here because it is user-facing output.
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all categories to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--site',
            action='store',
            type='string',
            dest='site_url',
            default="https://fr.wikipedia.org/w/api.php",
            help='the url for the wikipedia site'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of categories to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of categories to ignore'),
        make_option('--category',
            action='append',
            dest='category',
            type='string',
            default=[],
            help='the categories to query'),

    )


    def query_all_categories(self, category_title, site):
        '''
        Ask wikipedia for the sub-categories of a category.

        :param category_title: full page title of the category, prefix included.
        :param site: the ``wikitools`` site object the request is sent to.
        :returns: list of the labels (title without the category prefix) of
            the members that are themselves categories.
        '''
        params = {'action':'query', 'cmtitle':category_title, 'list':'categorymembers', 'cmlimit': 'max'}

        # NOTE(review): only the response of this single request object is
        # read; confirm that wikitools follows API continuations when a
        # category has more members than 'cmlimit' allows.
        wpquery = api.APIRequest(site, params) #@UndefinedVariable
        response = wpquery.query()

        if self.verbosity > 1:
            print("Query category : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data()))
            print(repr(response))

        members = response.get('query', {}).get('categorymembers', [])

        res = []
        prefix_length = len(CATEGORY_PREFIX)
        for member in members:
            title = member.get('title', "")
            # Only the leading prefix is removed: a regex substitution would
            # also eat any later occurrence of the prefix inside the label.
            if title.startswith(CATEGORY_PREFIX):
                res.append(title[prefix_length:])

        if self.verbosity > 1:
            print("Query categories result: ")
            print(repr(res))

        return res

    def process_categories(self, cat_list, parent_cat):
        '''
        Record every label of ``cat_list`` as a child category of ``parent_cat``.

        Missing ``WpCategory`` and ``WpCategoryInclusion`` rows are created,
        existing ones are reused.

        :param cat_list: iterable of category labels (without prefix).
        :param parent_cat: the ``WpCategory`` the labels are children of.
        '''
        for cat in cat_list:
            child_cat, _created = WpCategory.objects.get_or_create(label=cat)
            WpCategoryInclusion.objects.get_or_create(parent_category=parent_cat, child_category=child_cat)

    def handle_noargs(self, **options):
        '''
        Command entry point: select the categories, ask for confirmation and
        query wikipedia for each of them.

        :param options: the parsed command line options (see ``option_list``).
        '''
        self.style = no_style()

        interactive = options.get('interactive', True)

        self.verbosity = int(options.get('verbosity', '1'))

        force = options.get('force', False)

        limit = options.get("limit", -1)
        start = options.get("start", 0)

        site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        queryset = WpCategory.objects.filter(tags__hidden = False).distinct()

        cat_list = options.get("category", [])

        if cat_list:
            queryset = queryset.filter(label__in=cat_list)
        elif not options.get('all', False):
            # Without --all only the categories that have no child category
            # recorded yet are processed; --all lifts that restriction.
            queryset = queryset.annotate(wpc=Count('child_categories')).filter(wpc = 0)

        queryset = queryset.order_by("label")

        if limit >= 0:
            # --limit is a number of rows, not an end index: the slice must
            # end 'limit' rows after the rows skipped by --start.
            queryset = queryset[start:start + limit]
        elif start > 0:
            queryset = queryset[start:]

        if self.verbosity > 2 :
            print("Category Query is %s" % (queryset.query,))

        site = wiki.Wiki(site_url) #@UndefinedVariable


        count = queryset.count()
        if self.verbosity > 1:
            print("Processing %d categories" % (count))

        if not force and interactive:
            confirm = raw_input("You have requested to query and replace the wikipedia information for %d categories.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("wikipedia query cancelled")
            return

        for i, category in enumerate(queryset):

            if self.verbosity > 1:
                print("processing category %s (%d/%d)" % (category.label, i + 1, count))
            else:
                utils.show_progress(i + 1, count, category.label, 60)

            title = CATEGORY_PREFIX + category.label

            # The remote query is made before the transaction is opened so
            # that no database transaction is held during the HTTP request.
            res = self.query_all_categories(title, site)

            with transaction.atomic():
                self.process_categories(res, category)