src/hdalab/management/commands/query_category_inclusion.py
branch documentation
changeset 693 09e00f38d177
parent 683 59d49ab04ded
equal deleted inserted replaced
692:b7d19cd87fcf 693:09e00f38d177
     1 # -*- coding: utf-8 -*-
     1 # -*- coding: utf-8 -*-
     2 '''
     2 '''
     3 Created on July 2, 2012
     3 Requête wikipedia pour reconstituer l'arbre des catégories.
     4 
     4 
     5 @author: raphv
     5 Cette commande utilise directement `l'api wikipedia <https://www.mediawiki.org/wiki/API:Main_page>`_ pour faire ses requêtes.
       
     6 
       
     7 **Usage**: ``django-admin query_category_inclusion [options]``
       
     8 
       
     9 **Options spécifiques:**
       
    10 
       
    11     - *\-\-all* :               force à traiter toutes les catégories
       
    12     - *\-\-force* :             ne pose aucune question
       
    13     - *\-\-site=SITE_URL* :     url du site wikipedia (défaut: https://fr.wikipedia.org/w/api.php)
       
    14     - *\-\-limit=LIMIT* :       Nombre de catégories à traiter
       
    15     - *\-\-start=START* :       Nombre de catégories à ignorer
       
    16     - *\-\-category=CATEGORY* : Limite le traitement à cette catégorie
       
    17 
     6 '''
    18 '''
     7 
    19 
     8 from django.conf import settings
    20 from django.conf import settings
     9 from django.core.management.base import NoArgsCommand
    21 from django.core.management.base import NoArgsCommand
    10 from django.core.management.color import no_style
    22 from django.core.management.color import no_style
    24     '''
    36     '''
    25     query and update wikipedia for tag title.
    37     query and update wikipedia for tag title.
    26     '''
    38     '''
    27     options = ''
    39     options = ''
    28     help = """query and update wikipedia for tag title."""
    40     help = """query and update wikipedia for tag title."""
    29     
    41 
    30     option_list = NoArgsCommand.option_list + (
    42     option_list = NoArgsCommand.option_list + (
    31         make_option('--all',
    43         make_option('--all',
    32             action='store_true',
    44             action='store_true',
    33             dest='all',
    45             dest='all',
    34             default=False,
    46             default=False,
    65 
    77 
    66     )
    78     )
    67 
    79 
    68 
    80 
    69     def query_all_categories(self, category_title, site):
    81     def query_all_categories(self, category_title, site):
    70         
    82 
    71         params = {'action':'query', 'cmtitle':category_title, 'list':'categorymembers', 'cmlimit': 'max'}
    83         params = {'action':'query', 'cmtitle':category_title, 'list':'categorymembers', 'cmlimit': 'max'}
    72         
    84 
    73         res = []
    85         res = []
    74         
    86 
    75         wpquery = api.APIRequest(site, params) #@UndefinedVariable
    87         wpquery = api.APIRequest(site, params) #@UndefinedVariable
    76         response = wpquery.query()
    88         response = wpquery.query()
    77         
    89 
    78         if self.verbosity > 1:
    90         if self.verbosity > 1:
    79             print "Query category : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
    91             print "Query category : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
    80             print repr(response)
    92             print repr(response)
    81         
    93 
    82         members = response.get('query', {}).get('categorymembers', [])
    94         members = response.get('query', {}).get('categorymembers', [])
    83                     
    95 
    84         for member in members:
    96         for member in members:
    85             title = member.get('title',"")
    97             title = member.get('title',"")
    86             if re.match(CATEGORY_PREFIX, title):
    98             if re.match(CATEGORY_PREFIX, title):
    87                 res.append(re.sub(CATEGORY_PREFIX, "", title))
    99                 res.append(re.sub(CATEGORY_PREFIX, "", title))
    88             
   100 
    89         if self.verbosity > 1:
   101         if self.verbosity > 1:
    90             print "Query categories result: "
   102             print "Query categories result: "
    91             print repr(res)
   103             print repr(res)
    92             
   104 
    93         return res
   105         return res
    94     
   106 
    def process_categories(self, cat_list, parent_cat):
        '''
        Record every label of ``cat_list`` as a child of ``parent_cat``.

        For each label, a ``WpCategory`` with that label is fetched or
        created, then a ``WpCategoryInclusion`` linking ``parent_cat`` to it
        is fetched or created.

        :param cat_list: iterable of category labels.
        :param parent_cat: object stored as ``parent_category`` of each
            inclusion.
        '''
        for child_label in cat_list:
            # get_or_create returns (object, created); only the object is used.
            child_cat = WpCategory.objects.get_or_create(label=child_label)[0]
            WpCategoryInclusion.objects.get_or_create(
                parent_category=parent_cat,
                child_category=child_cat
            )
    99         
   111 
   100     def handle_noargs(self, **options):
   112     def handle_noargs(self, **options):
   101         
   113 
   102         self.style = no_style()
   114         self.style = no_style()
   103         
   115 
   104         interactive = options.get('interactive', True)
   116         interactive = options.get('interactive', True)
   105         
   117 
   106         self.verbosity = int(options.get('verbosity', '1'))
   118         self.verbosity = int(options.get('verbosity', '1'))
   107         
   119 
   108         force = options.get('force', False)
   120         force = options.get('force', False)
   109         
   121 
   110         limit = options.get("limit", -1)
   122         limit = options.get("limit", -1)
   111         start = options.get("start", 0)
   123         start = options.get("start", 0)
   112         
   124 
   113         site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
   125         site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
   114         
   126 
   115         types_mask = 0
   127         types_mask = 0
   116         
   128 
   117         if self.verbosity > 2:
   129         if self.verbosity > 2:
   118             print "option passed : " + repr(options)
   130             print "option passed : " + repr(options)
   119 
   131 
   120         queryset = WpCategory.objects.filter(tags__hidden = False).distinct()
   132         queryset = WpCategory.objects.filter(tags__hidden = False).distinct()
   121         
   133 
   122         cat_list = options.get("category", []);
   134         cat_list = options.get("category", []);
   123         
   135 
   124         if cat_list:
   136         if cat_list:
   125             queryset = queryset.filter(label__in=cat_list)
   137             queryset = queryset.filter(label__in=cat_list)
   126         elif options.get('all',False):            
   138         elif options.get('all',False):
   127             queryset = queryset.annotate(wpc=Count('child_categories')).filter(wpc = 0)                    
   139             queryset = queryset.annotate(wpc=Count('child_categories')).filter(wpc = 0)
   128         
   140 
   129         queryset = queryset.order_by("label")
   141         queryset = queryset.order_by("label")
   130         
   142 
   131         if limit >= 0:
   143         if limit >= 0:
   132             queryset = queryset[start:limit]
   144             queryset = queryset[start:limit]
   133         elif start > 0:
   145         elif start > 0:
   134             queryset = queryset[start:]            
   146             queryset = queryset[start:]
   135         
   147 
   136         if self.verbosity > 2 :
   148         if self.verbosity > 2 :
   137             print "Category Query is %s" % (queryset.query)
   149             print "Category Query is %s" % (queryset.query)
   138         
   150 
   139         site = wiki.Wiki(site_url) #@UndefinedVariable
   151         site = wiki.Wiki(site_url) #@UndefinedVariable
   140         
   152 
   141         
   153 
   142         count = queryset.count()
   154         count = queryset.count()
   143         if self.verbosity > 1:
   155         if self.verbosity > 1:
   144             print "Processing %d categories" % (count)
   156             print "Processing %d categories" % (count)
   145         
   157 
   146         if not force and interactive:
   158         if not force and interactive:
   147             confirm = raw_input("You have requested to query and replace the wikipedia information for %d categories.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
   159             confirm = raw_input("You have requested to query and replace the wikipedia information for %d categories.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
   148         else:
   160         else:
   149             confirm = 'yes'
   161             confirm = 'yes'
   150             
   162 
   151         if confirm != "yes":
   163         if confirm != "yes":
   152             print "wikipedia query cancelled"
   164             print "wikipedia query cancelled"
   153             return
   165             return
   154         
   166 
   155         for i, category in enumerate(queryset):
   167         for i, category in enumerate(queryset):
   156             
   168 
   157             if self.verbosity > 1:
   169             if self.verbosity > 1:
   158                 print "processing category %s (%d/%d)" % (category.label, i + 1, count)
   170                 print "processing category %s (%d/%d)" % (category.label, i + 1, count)
   159             else:
   171             else:
   160                 utils.show_progress(i + 1, count, category.label, 60)                            
   172                 utils.show_progress(i + 1, count, category.label, 60)
   161                 
   173 
   162             title = CATEGORY_PREFIX + category.label
   174             title = CATEGORY_PREFIX + category.label
   163             # query categories
   175             # query categories
   164 
   176 
   165             with transaction.atomic():
   177             with transaction.atomic():
   166                 res = self.query_all_categories(title, site)
   178                 res = self.query_all_categories(title, site)