src/core/wp_utils.py
changeset 67 5d9223bb3aab
parent 63 a7f4a418d5af
child 71 3fde7d26ad08
equal deleted inserted replaced
66:9a7c391fb123 67:5d9223bb3aab
     7 from wikitools import api, wiki
     7 from wikitools import api, wiki
     8 import logging
     8 import logging
     9 import urllib2
     9 import urllib2
    10 
    10 
    11 logger = logging.getLogger(__name__)
    11 logger = logging.getLogger(__name__)
    12 site = None
    12 sites = {}
    13 
    13 
    14 def __get_site():
    14 def __get_site(lang):
    15     global site
    15     site = sites.get(lang, None)
    16     if site is None:
    16     if site is None:
    17         site = wiki.Wiki(settings.WIKIPEDIA_API_URL) #@UndefinedVariable
    17         site = wiki.Wiki(settings.WIKIPEDIA_URLS[lang]['api_url'])  # @UndefinedVariable
       
    18         sites[lang] = site        
    18     return site
    19     return site
    19 
    20 
    20 
    21 
    21 def normalize_term(term):
    22 def normalize_term(term):
    22     if len(term) == 0:
    23     if len(term) == 0:
    50 
    51 
    51 def urlize_for_wikipedia(label):
    52 def urlize_for_wikipedia(label):
    52     return urlquote(label.replace(" ", "_"))
    53     return urlquote(label.replace(" ", "_"))
    53 
    54 
    54 
    55 
    55 def __is_homonymie(page_dict):
    56 def __is_homonymie(page_dict, lang):
    56     for cat in page_dict.get(u"categories", []):
    57     for cat in page_dict.get(u"categories", []):
    57         if u'Catégorie:Homonymie' in cat.get(u"title", u"") or u'Category:Disambiguation pages' in cat.get(u"title", u""):
    58         if settings.WIKIPEDIA_URLS[lang]['disambiguation_cat'] in cat.get(u"title", u""):
    58             return True
    59             return True
    59     return False
    60     return False
    60 
    61 
    61 
    62 
    62 def query_wikipedia_title(site, label=None, pageid=None):
    63 def query_wikipedia_title(site, lang, label=None, pageid=None):
    63     
    64     
    64     params = {'action':'query', 'prop':'info|categories|langlinks', 'inprop':'url', 'lllimit':'500', 'cllimit':'500'}
    65     params = {'action':'query', 'prop':'info|categories|langlinks', 'inprop':'url', 'lllimit':'500', 'cllimit':'500'}
    65         
    66         
    66     if label:
    67     if label:
    67         params['titles'] = label
    68         params['titles'] = label
    99     new_label = page[u'title']
   100     new_label = page[u'title']
   100     alternative_label = None
   101     alternative_label = None
   101     alternative_url = None
   102     alternative_url = None
   102     alternative_pageid = None
   103     alternative_pageid = None
   103     
   104     
   104     if __is_homonymie(page):
   105     if __is_homonymie(page, lang):
   105         status = TERM_URL_STATUS_DICT["homonyme"]
   106         status = TERM_URL_STATUS_DICT["homonyme"]
   106     elif u"redirect" in page:
   107     elif u"redirect" in page:
   107         status = TERM_URL_STATUS_DICT["redirection"]
   108         status = TERM_URL_STATUS_DICT["redirection"]
   108     else:
   109     else:
   109         status = TERM_URL_STATUS_DICT["match"]
   110         status = TERM_URL_STATUS_DICT["match"]
   129 
   130 
   130     revision_id = page.get('lastrevid', None)
   131     revision_id = page.get('lastrevid', None)
   131     
   132     
   132     
   133     
   133     if status == TERM_URL_STATUS_DICT['match'] or status == TERM_URL_STATUS_DICT['redirection']:
   134     if status == TERM_URL_STATUS_DICT['match'] or status == TERM_URL_STATUS_DICT['redirection']:
   134         dbpedia_uri = settings.DBPEDIA_URI_TEMPLATE % (urlize_for_wikipedia(new_label))
   135         dbpedia_uri = settings.WIKIPEDIA_URLS[lang]['dbpedia_uri'] % (urlize_for_wikipedia(new_label))
   135     else:
   136     else:
   136         dbpedia_uri = None
   137         dbpedia_uri = None
   137             
   138             
   138 
   139 
   139     return { 'new_label': new_label, 'alternative_label': alternative_label, 'status': status, 'wikipedia_url': url, 'pageid': pageid, 'alternative_wikipedia_url': alternative_url, 'alternative_pageid': alternative_pageid, 'dbpedia_uri': dbpedia_uri, 'revision_id': revision_id, 'response': original_response }
   140     return { 'new_label': new_label, 'alternative_label': alternative_label, 'status': status, 'wikipedia_url': url, 'pageid': pageid, 'alternative_wikipedia_url': alternative_url, 'alternative_pageid': alternative_pageid, 'dbpedia_uri': dbpedia_uri, 'revision_id': revision_id, 'response': original_response }
   140 
   141 
   141 
   142 
   142 
   143 
   143 def get_or_create_term(term_label, term_uri, term_lang, thesaurus, wp_label_transform=(lambda l:l), skip_wp_query=False):
   144 def get_or_create_term(term_label, term_uri, term_lang, thesaurus, lang, wp_label_transform=(lambda l:l), skip_wp_query=False):
   144     
   145     
   145     term_label_normalized = normalize_term(term_label)
   146     term_label_normalized = normalize_term(term_label)
   146     # We get the wikipedia references for the tag_label
   147     # We get the wikipedia references for the tag_label
   147     # We get or create the tag object
   148     # We get or create the tag object
   148     global site
       
   149     
   149     
   150     
   150     
   151     term, created = Term.objects.get_or_create(uri=term_uri, defaults = {'label':term_label, 'thesaurus':thesaurus, 'normalized_label':term_label_normalized, 'lang' : term_lang})  # @UndefinedVariable
   151     term, created = Term.objects.get_or_create(uri=term_uri, defaults = {'label':term_label, 'thesaurus':thesaurus, 'normalized_label':term_label_normalized, 'lang' : term_lang})  # @UndefinedVariable
   152  
   152  
   153     if created:
   153     if created:
   154         wikipedia_revision_id = process_term(__get_site(), term, label=wp_label_transform(term_label_normalized))
   154         wikipedia_revision_id = process_term(__get_site(lang), term, lang, label=wp_label_transform(term_label_normalized))
   155         term_label_obj = TermLabel(label=term_label, term=term, lang=term_lang)
   155         term_label_obj = TermLabel(label=term_label, term=term, lang=term_lang)
   156         term_label_obj.save()
   156         term_label_obj.save()
   157         
   157         
   158     elif term.wikipedia_pageid and not skip_wp_query:
   158     elif term.wikipedia_pageid and not skip_wp_query:
   159         wp_res = query_wikipedia_title(__get_site(), pageid=term.wikipedia_pageid)
   159         wp_res = query_wikipedia_title(__get_site(lang), lang, pageid=term.wikipedia_pageid)
   160         wikipedia_revision_id = wp_res['revision_id']
   160         wikipedia_revision_id = wp_res['revision_id']
   161         term.wikipedia_revision_id = wikipedia_revision_id
   161         term.wikipedia_revision_id = wikipedia_revision_id
   162         term.save()
   162         term.save()
   163     else:
   163     else:
   164         wikipedia_revision_id = None
   164         wikipedia_revision_id = None
   165         
   165         
   166 
   166 
   167     return term, wikipedia_revision_id, created
   167     return term, wikipedia_revision_id, created
   168 
   168 
   169 
   169 
   170 def process_term(site, term, label=None, verbosity=0):
   170 def process_term(site, term, lang, label=None, verbosity=0):
   171     
   171             
   172     if site == None:
       
   173         site = wiki.Wiki(settings.WIKIPEDIA_API_URL)  # @UndefinedVariable
       
   174         
       
   175     if not label:
   172     if not label:
   176         label = term.label
   173         label = term.label
   177     elif label.startswith(settings.WIKIPEDIA_PAGE_URL):
   174     else:
   178         url_parts = urlparse(label)
   175         for lang_code, urls in settings.WIKIPEDIA_URLS.iteritems():
   179         label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8")
   176             if label.startswith(urls['page_url']):
   180 
    177                 # lang is overridden when a URL is passed as a label.
   181     wp_res = query_wikipedia_title(site, label=label)
   178                 lang = lang_code
       
   179                 url_parts = urlparse(label)
       
   180                 label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8")
       
   181                 break
       
   182 
       
   183     if site == None:
       
   184         site = __get_site(lang)
       
   185 
       
   186     wp_res = query_wikipedia_title(site, lang, label=label)
   182     new_label = wp_res['new_label']
   187     new_label = wp_res['new_label']
   183     alternative_label= wp_res['alternative_label']
   188     alternative_label= wp_res['alternative_label']
   184     status =  wp_res['status']
   189     status =  wp_res['status']
   185     url = wp_res['wikipedia_url']
   190     url = wp_res['wikipedia_url']
   186     alternative_url = wp_res['alternative_wikipedia_url']
   191     alternative_url = wp_res['alternative_wikipedia_url']