--- a/src/core/wp_utils.py Thu Jul 11 14:26:00 2013 +0200
+++ b/src/core/wp_utils.py Thu Jul 18 10:39:26 2013 +0200
@@ -9,12 +9,13 @@
import urllib2
logger = logging.getLogger(__name__)
-site = None
+sites = {}
-def __get_site():
- global site
+def __get_site(lang):
+ site = sites.get(lang, None)
if site is None:
- site = wiki.Wiki(settings.WIKIPEDIA_API_URL) #@UndefinedVariable
+ site = wiki.Wiki(settings.WIKIPEDIA_URLS[lang]['api_url']) # @UndefinedVariable
+ sites[lang] = site
return site
@@ -52,14 +53,14 @@
return urlquote(label.replace(" ", "_"))
-def __is_homonymie(page_dict):
+def __is_homonymie(page_dict, lang):
for cat in page_dict.get(u"categories", []):
- if u'Catégorie:Homonymie' in cat.get(u"title", u"") or u'Category:Disambiguation pages' in cat.get(u"title", u""):
+ if settings.WIKIPEDIA_URLS[lang]['disambiguation_cat'] in cat.get(u"title", u""):
return True
return False
-def query_wikipedia_title(site, label=None, pageid=None):
+def query_wikipedia_title(site, lang, label=None, pageid=None):
params = {'action':'query', 'prop':'info|categories|langlinks', 'inprop':'url', 'lllimit':'500', 'cllimit':'500'}
@@ -101,7 +102,7 @@
alternative_url = None
alternative_pageid = None
- if __is_homonymie(page):
+ if __is_homonymie(page, lang):
status = TERM_URL_STATUS_DICT["homonyme"]
elif u"redirect" in page:
status = TERM_URL_STATUS_DICT["redirection"]
@@ -131,7 +132,7 @@
if status == TERM_URL_STATUS_DICT['match'] or status == TERM_URL_STATUS_DICT['redirection']:
- dbpedia_uri = settings.DBPEDIA_URI_TEMPLATE % (urlize_for_wikipedia(new_label))
+ dbpedia_uri = settings.WIKIPEDIA_URLS[lang]['dbpedia_uri'] % (urlize_for_wikipedia(new_label))
else:
dbpedia_uri = None
@@ -140,23 +141,22 @@
-def get_or_create_term(term_label, term_uri, term_lang, thesaurus, wp_label_transform=(lambda l:l), skip_wp_query=False):
+def get_or_create_term(term_label, term_uri, term_lang, thesaurus, lang, wp_label_transform=(lambda l:l), skip_wp_query=False):
term_label_normalized = normalize_term(term_label)
# We get the wikipedia references for the tag_label
# We get or create the tag object
- global site
term, created = Term.objects.get_or_create(uri=term_uri, defaults = {'label':term_label, 'thesaurus':thesaurus, 'normalized_label':term_label_normalized, 'lang' : term_lang}) # @UndefinedVariable
if created:
- wikipedia_revision_id = process_term(__get_site(), term, label=wp_label_transform(term_label_normalized))
+ wikipedia_revision_id = process_term(__get_site(lang), term, lang, label=wp_label_transform(term_label_normalized))
term_label_obj = TermLabel(label=term_label, term=term, lang=term_lang)
term_label_obj.save()
elif term.wikipedia_pageid and not skip_wp_query:
- wp_res = query_wikipedia_title(__get_site(), pageid=term.wikipedia_pageid)
+ wp_res = query_wikipedia_title(__get_site(lang), lang, pageid=term.wikipedia_pageid)
wikipedia_revision_id = wp_res['revision_id']
term.wikipedia_revision_id = wikipedia_revision_id
term.save()
@@ -167,18 +167,23 @@
return term, wikipedia_revision_id, created
-def process_term(site, term, label=None, verbosity=0):
-
- if site == None:
- site = wiki.Wiki(settings.WIKIPEDIA_API_URL) # @UndefinedVariable
-
+def process_term(site, term, lang, label=None, verbosity=0):
+
if not label:
label = term.label
- elif label.startswith(settings.WIKIPEDIA_PAGE_URL):
- url_parts = urlparse(label)
- label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8")
+ else:
+ for lang_code, urls in settings.WIKIPEDIA_URLS.iteritems():
+ if label.startswith(urls['page_url']):
+ # lang is overridden when a URL is passed as a label.
+ lang = lang_code
+ url_parts = urlparse(label)
+ label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8")
+ break
- wp_res = query_wikipedia_title(site, label=label)
+ if site == None:
+ site = __get_site(lang)
+
+ wp_res = query_wikipedia_title(site, lang, label=label)
new_label = wp_res['new_label']
alternative_label= wp_res['alternative_label']
status = wp_res['status']