diff -r 9a7c391fb123 -r 5d9223bb3aab src/core/wp_utils.py --- a/src/core/wp_utils.py Thu Jul 11 14:26:00 2013 +0200 +++ b/src/core/wp_utils.py Thu Jul 18 10:39:26 2013 +0200 @@ -9,12 +9,13 @@ import urllib2 logger = logging.getLogger(__name__) -site = None +sites = {} -def __get_site(): - global site +def __get_site(lang): + site = sites.get(lang, None) if site is None: - site = wiki.Wiki(settings.WIKIPEDIA_API_URL) #@UndefinedVariable + site = wiki.Wiki(settings.WIKIPEDIA_URLS[lang]['api_url']) # @UndefinedVariable + sites[lang] = site return site @@ -52,14 +53,14 @@ return urlquote(label.replace(" ", "_")) -def __is_homonymie(page_dict): +def __is_homonymie(page_dict, lang): for cat in page_dict.get(u"categories", []): - if u'Catégorie:Homonymie' in cat.get(u"title", u"") or u'Category:Disambiguation pages' in cat.get(u"title", u""): + if settings.WIKIPEDIA_URLS[lang]['disambiguation_cat'] in cat.get(u"title", u""): return True return False -def query_wikipedia_title(site, label=None, pageid=None): +def query_wikipedia_title(site, lang, label=None, pageid=None): params = {'action':'query', 'prop':'info|categories|langlinks', 'inprop':'url', 'lllimit':'500', 'cllimit':'500'} @@ -101,7 +102,7 @@ alternative_url = None alternative_pageid = None - if __is_homonymie(page): + if __is_homonymie(page, lang): status = TERM_URL_STATUS_DICT["homonyme"] elif u"redirect" in page: status = TERM_URL_STATUS_DICT["redirection"] @@ -131,7 +132,7 @@ if status == TERM_URL_STATUS_DICT['match'] or status == TERM_URL_STATUS_DICT['redirection']: - dbpedia_uri = settings.DBPEDIA_URI_TEMPLATE % (urlize_for_wikipedia(new_label)) + dbpedia_uri = settings.WIKIPEDIA_URLS[lang]['dbpedia_uri'] % (urlize_for_wikipedia(new_label)) else: dbpedia_uri = None @@ -140,23 +141,22 @@ -def get_or_create_term(term_label, term_uri, term_lang, thesaurus, wp_label_transform=(lambda l:l), skip_wp_query=False): +def get_or_create_term(term_label, term_uri, term_lang, thesaurus, lang, wp_label_transform=(lambda 
l:l), skip_wp_query=False): term_label_normalized = normalize_term(term_label) # We get the wikipedia references for the tag_label # We get or create the tag object - global site term, created = Term.objects.get_or_create(uri=term_uri, defaults = {'label':term_label, 'thesaurus':thesaurus, 'normalized_label':term_label_normalized, 'lang' : term_lang}) # @UndefinedVariable if created: - wikipedia_revision_id = process_term(__get_site(), term, label=wp_label_transform(term_label_normalized)) + wikipedia_revision_id = process_term(__get_site(lang), term, lang, label=wp_label_transform(term_label_normalized)) term_label_obj = TermLabel(label=term_label, term=term, lang=term_lang) term_label_obj.save() elif term.wikipedia_pageid and not skip_wp_query: - wp_res = query_wikipedia_title(__get_site(), pageid=term.wikipedia_pageid) + wp_res = query_wikipedia_title(__get_site(lang), lang, pageid=term.wikipedia_pageid) wikipedia_revision_id = wp_res['revision_id'] term.wikipedia_revision_id = wikipedia_revision_id term.save() @@ -167,18 +167,23 @@ return term, wikipedia_revision_id, created -def process_term(site, term, label=None, verbosity=0): - - if site == None: - site = wiki.Wiki(settings.WIKIPEDIA_API_URL) # @UndefinedVariable - +def process_term(site, term, lang, label=None, verbosity=0): + if not label: label = term.label - elif label.startswith(settings.WIKIPEDIA_PAGE_URL): - url_parts = urlparse(label) - label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8") + else: + for lang_code, urls in settings.WIKIPEDIA_URLS.iteritems(): + if label.startswith(urls['page_url']): + # lang is overridden when a URL is passed as a label. 
+ lang = lang_code + url_parts = urlparse(label) + label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8") + break - wp_res = query_wikipedia_title(site, label=label) + if site == None: + site = __get_site(lang) + + wp_res = query_wikipedia_title(site, lang, label=label) new_label = wp_res['new_label'] alternative_label= wp_res['alternative_label'] status = wp_res['status']