src/core/wp_utils.py
changeset 67 5d9223bb3aab
parent 63 a7f4a418d5af
child 71 3fde7d26ad08
--- a/src/core/wp_utils.py	Thu Jul 11 14:26:00 2013 +0200
+++ b/src/core/wp_utils.py	Thu Jul 18 10:39:26 2013 +0200
@@ -9,12 +9,13 @@
 import urllib2
 
 logger = logging.getLogger(__name__)
-site = None
+sites = {}
 
-def __get_site():
-    global site
+def __get_site(lang):
+    site = sites.get(lang, None)
     if site is None:
-        site = wiki.Wiki(settings.WIKIPEDIA_API_URL) #@UndefinedVariable
+        site = wiki.Wiki(settings.WIKIPEDIA_URLS[lang]['api_url'])  # @UndefinedVariable
+        sites[lang] = site        
     return site
 
 
@@ -52,14 +53,14 @@
     return urlquote(label.replace(" ", "_"))
 
 
-def __is_homonymie(page_dict):
+def __is_homonymie(page_dict, lang):
     for cat in page_dict.get(u"categories", []):
-        if u'Catégorie:Homonymie' in cat.get(u"title", u"") or u'Category:Disambiguation pages' in cat.get(u"title", u""):
+        if settings.WIKIPEDIA_URLS[lang]['disambiguation_cat'] in cat.get(u"title", u""):
             return True
     return False
 
 
-def query_wikipedia_title(site, label=None, pageid=None):
+def query_wikipedia_title(site, lang, label=None, pageid=None):
     
     params = {'action':'query', 'prop':'info|categories|langlinks', 'inprop':'url', 'lllimit':'500', 'cllimit':'500'}
         
@@ -101,7 +102,7 @@
     alternative_url = None
     alternative_pageid = None
     
-    if __is_homonymie(page):
+    if __is_homonymie(page, lang):
         status = TERM_URL_STATUS_DICT["homonyme"]
     elif u"redirect" in page:
         status = TERM_URL_STATUS_DICT["redirection"]
@@ -131,7 +132,7 @@
     
     
     if status == TERM_URL_STATUS_DICT['match'] or status == TERM_URL_STATUS_DICT['redirection']:
-        dbpedia_uri = settings.DBPEDIA_URI_TEMPLATE % (urlize_for_wikipedia(new_label))
+        dbpedia_uri = settings.WIKIPEDIA_URLS[lang]['dbpedia_uri'] % (urlize_for_wikipedia(new_label))
     else:
         dbpedia_uri = None
             
@@ -140,23 +141,22 @@
 
 
 
-def get_or_create_term(term_label, term_uri, term_lang, thesaurus, wp_label_transform=(lambda l:l), skip_wp_query=False):
+def get_or_create_term(term_label, term_uri, term_lang, thesaurus, lang, wp_label_transform=(lambda l:l), skip_wp_query=False):
     
     term_label_normalized = normalize_term(term_label)
     # We get the wikipedia references for the tag_label
     # We get or create the tag object
-    global site
     
     
     term, created = Term.objects.get_or_create(uri=term_uri, defaults = {'label':term_label, 'thesaurus':thesaurus, 'normalized_label':term_label_normalized, 'lang' : term_lang})  # @UndefinedVariable
  
     if created:
-        wikipedia_revision_id = process_term(__get_site(), term, label=wp_label_transform(term_label_normalized))
+        wikipedia_revision_id = process_term(__get_site(lang), term, lang, label=wp_label_transform(term_label_normalized))
         term_label_obj = TermLabel(label=term_label, term=term, lang=term_lang)
         term_label_obj.save()
         
     elif term.wikipedia_pageid and not skip_wp_query:
-        wp_res = query_wikipedia_title(__get_site(), pageid=term.wikipedia_pageid)
+        wp_res = query_wikipedia_title(__get_site(lang), lang, pageid=term.wikipedia_pageid)
         wikipedia_revision_id = wp_res['revision_id']
         term.wikipedia_revision_id = wikipedia_revision_id
         term.save()
@@ -167,18 +167,23 @@
     return term, wikipedia_revision_id, created
 
 
-def process_term(site, term, label=None, verbosity=0):
-    
-    if site == None:
-        site = wiki.Wiki(settings.WIKIPEDIA_API_URL)  # @UndefinedVariable
-        
+def process_term(site, term, lang, label=None, verbosity=0):
+            
     if not label:
         label = term.label
-    elif label.startswith(settings.WIKIPEDIA_PAGE_URL):
-        url_parts = urlparse(label)
-        label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8")
+    else:
+        for lang_code, urls in settings.WIKIPEDIA_URLS.iteritems():
+            if label.startswith(urls['page_url']):
+                # lang is overridden when a URL is passed as a label.
+                lang = lang_code
+                url_parts = urlparse(label)
+                label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8")
+                break
 
-    wp_res = query_wikipedia_title(site, label=label)
+    if site == None:
+        site = __get_site(lang)
+
+    wp_res = query_wikipedia_title(site, lang, label=label)
     new_label = wp_res['new_label']
     alternative_label= wp_res['alternative_label']
     status =  wp_res['status']