src/hdalab/management/commands/query_wikipedia_category.py
changeset 649 434737bd64e5
parent 571 d9642be7c937
child 683 59d49ab04ded
--- a/src/hdalab/management/commands/query_wikipedia_category.py	Thu Sep 24 13:40:54 2015 +0200
+++ b/src/hdalab/management/commands/query_wikipedia_category.py	Sat Sep 26 11:55:11 2015 +0200
@@ -41,7 +41,7 @@
     '''
     options = ''
     help = """query and update wikipedia for tag title."""
-    
+
     option_list = NoArgsCommand.option_list + (
         make_option('--all',
             action='store_true',
@@ -96,116 +96,116 @@
             help='the tag to query'),
 
     )
-    
-    
-#    def process_wp_response(self, label, response):        
+
+
+#    def process_wp_response(self, label, response):
 #
 #        query_dict = response['query']
 #        # get page if multiple pages or none -> return Tag.null_result
 #        pages = query_dict.get("pages", {})
 #        if len(pages) > 1 or len(pages) == 0:
 #            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
-#        
+#
 #        page = pages.values()[0]
-#        
+#
 #        if u"invalid" in page or u"missing" in page:
 #            return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
 #
 #        url = page.get(u'fullurl', None)
 #        pageid = page.get(u'pageid', None)
 #        new_label = page[u'title']
-#        
+#
 #        if self.__is_homonymie(page):
 #            status = Tag.TAG_URL_STATUS_DICT["homonyme"]
 #        elif u"redirect" in page:
 #            status = Tag.TAG_URL_STATUS_DICT["redirection"]
 #        else:
 #            status = Tag.TAG_URL_STATUS_DICT["match"]
-#        
-#        return new_label, status, url, pageid 
+#
+#        return new_label, status, url, pageid
 
     def query_all_categories(self, hidden, site, pageid, use_label):
-        
+
         clshow = 'hidden' if hidden else '!hidden'
         params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow}
-        
-        clcontinue = ""        
+
+        clcontinue = ""
         res = []
-        
+
         while clcontinue is not None:
             if clcontinue:
                 params['clcontinue'] = clcontinue
-                
+
             wpquery = api.APIRequest(site, params) #@UndefinedVariable
             response = wpquery.query()
-            
+
             if self.verbosity > 1:
                 print "Query infoboxes : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
                 print repr(response)
 
-            
+
             query_dict = response.get('query', None)
-            
+
             if query_dict is None:
                 return res
-            
+
             pages = query_dict.get("pages", {})
             if len(pages) > 1 or len(pages) == 0:
                 return res
-            
+
             page = pages.values()[0]
-                        
+
             for cat in page.get('categories',[]):
                 title = cat.get('title',"")
                 title = title[title.find(":")+1:]
                 if title and clcontinue != ("%s|%s" % (pageid,title)):
                     res.append(title)
-            
+
             clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None)
 
         if self.verbosity > 1:
             print "Query infoboxes RES: "
             print repr(res)
-            
+
         return res
-    
+
     def process_categories(self, cat_list, hidden, tag):
-        
+
         for cat in cat_list:
             wp_cat,created = WpCategory.objects.get_or_create(label=cat) #@UnusedVariable
             TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden)
-            
-                
+
+
     def query_infoboxes(self, site, pageid, use_label):
-        
+
         res = []
         params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'}
         wpquery = api.APIRequest(site, params) #@UndefinedVariable
         response = wpquery.query()
-        
+
         query_dict = response.get('query', None)
-            
+
         if query_dict is None:
             return res
-            
+
         pages = query_dict.get("pages", {})
         if len(pages) > 1 or len(pages) == 0:
             return res
 
         page = pages.values()[0]
-        
+
         if 'revisions' not in page or not page['revisions']:
             return res
-        
+
         rev = page['revisions'][0]
-        
+
         content = rev['*']
-                
+
         start = 0
         depth = 0
         current_infobox_name = None
         current_start = 0
-        
+
         while start <= len(content):
             if depth==0:
                 resm = START_PATTERN.search(content[start:])
@@ -214,7 +214,7 @@
                 depth = 1
                 current_start = resm.start()+start
                 start += resm.end()+1
-                current_infobox_name = resm.group(1)                    
+                current_infobox_name = resm.group(1)
             else:
                 resm = END_PATTERN.search(content[start:])
                 if resm is None:
@@ -228,20 +228,20 @@
                 start += resm.end()+1
 
         return_val = (rev['revid'],res)
-        
+
         if self.verbosity > 1:
             print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data())
             print repr(return_val)
-        
+
         return return_val
-    
+
     def split_infoboxes(self, src):
-        
+
         start = 0
         previous_end = 0
         split_indexes = []
         delimiter_stack = []
-        while start<=len(src):            
+        while start<=len(src):
             resd = DELIMITER_PATTERN.search(src[start:])
             ress = SPLIT_PATTERN.search(src[start:]) if len(delimiter_stack) == 0 else None
             startd = resd.start() if resd is not None else sys.maxint
@@ -260,7 +260,7 @@
                 start += resd.end()
             else:
                 break
-            
+
         if previous_end > 0:
             split_indexes.append((previous_end,len(src)))
         res = [src[start:end] for start,end in split_indexes]
@@ -269,14 +269,16 @@
 
 
     def process_infoboxes(self, infobox_defs, tag):
-        
+
         if not infobox_defs:
             return
-        
+
         revision_id = infobox_defs[0]
         for infobox in infobox_defs[1]:
-            src = infobox[0].strip(' \t\n\r')            
+            src = infobox[0].strip(' \t\n\r')
             name = infobox[1]
+            if name and len(name) > 2048:
+                name = name[0:2048]
             tag_infobox, created = TagInfobox.objects.get_or_create(tag=tag, name=name, revision_id = revision_id, defaults={'source': src})
             if not created:
                 tag_infobox.source = src
@@ -284,7 +286,7 @@
 
             src = COMMENT_PATTERN.sub('',src)
             src = START_PATTERN.sub('',src[:-2]).strip()
-            
+
             keyvalues = self.split_infoboxes(src)
 
             for key,value in itertools.izip(*[itertools.islice(keyvalues, i, None, 2) for i in range(2)]):
@@ -292,89 +294,89 @@
                 if not created:
                     param.param_value = value.strip()
                     param.save()
-        
+
     def handle_noargs(self, **options):
-        
+
         self.style = no_style()
-        
+
         interactive = options.get('interactive', True)
-        
+
         self.verbosity = int(options.get('verbosity', '1'))
         use_label = options.get('use_label', False)
-        
+
         force = options.get('force', False)
-        
+
         limit = options.get("limit", -1)
         start = options.get("start", 0)
-        
+
         site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
-        
+
         random = options.get('random', False)
-        
+
         types_mask = 0
         types_list = options.get('types', [])
-        
+
         if len(types_list) == 0:
             types_mask = TYPES_MASK_DICT['all']
         else:
             for t in types_list:
                 types_mask |=  TYPES_MASK_DICT[t]
-                
+
         if self.verbosity > 1 :
-            print "types mask %s " % (bin(types_mask))  
-        
+            print "types mask %s " % (bin(types_mask))
+
         if self.verbosity > 2:
             print "option passed : " + repr(options)
 
 
         queryset = Tag.objects.exclude(wikipedia_pageid= None)
-        
+
         tag_list = options.get("tags", []);
-        
+
         if tag_list:
             queryset = queryset.filter(label__in=tag_list)
-        elif not options.get('all',False):            
+        elif not options.get('all',False):
             queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0)
         #else:
-        #    queryset = Tag.objects.filter(url_status=None)                    
-        
+        #    queryset = Tag.objects.filter(url_status=None)
+
         if random:
             queryset = queryset.order_by("?")
         else:
             queryset = queryset.order_by("label")
-        
+
         if limit >= 0:
             queryset = queryset[start:limit]
         elif start > 0:
-            queryset = queryset[start:]            
-        
+            queryset = queryset[start:]
+
         if self.verbosity > 2 :
             print "Tag Query is %s" % (queryset.query)
-        
+
         site = wiki.Wiki(site_url) #@UndefinedVariable
-        
-        
+
+
         count = queryset.count()
         if self.verbosity > 1:
             print "Processing %d tags" % (count)
-        
+
         if not force and interactive:
             confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
         else:
             confirm = 'yes'
-            
+
         if confirm != "yes":
             print "wikipedia query cancelled"
             return
 
-        
-        
+
+
         for i, tag in enumerate(queryset):
-            
+
             if self.verbosity > 1:
                 print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
             else:
-                utils.show_progress(i + 1, count, tag.label, 60)                            
+                utils.show_progress(i + 1, count, tag.label, 60)
 
             # query categories
             wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid
@@ -385,12 +387,11 @@
                 if types_mask & TYPES_MASK_DICT['visible']:
                     res = self.query_all_categories(False, site, wikipedia_pageid, use_label)
                     self.process_categories(res, False, tag)
-    
+
                 if types_mask & TYPES_MASK_DICT['hidden']:
                     res = self.query_all_categories(True, site, wikipedia_pageid, use_label)
                     self.process_categories(res, True, tag)
-                
+
                 if types_mask & TYPES_MASK_DICT['infobox']:
                     res = self.query_infoboxes(site, wikipedia_pageid, use_label)
                     self.process_infoboxes(res, tag)
-