src/core/wp_utils.py
author ymh <ymh.work@gmail.com>
Sat, 25 Jan 2020 02:09:46 +0100
changeset 381 f53e667ab25f
parent 334 169b7cfd1f58
permissions -rw-r--r--
Added tag V00.74 for changeset 63dfc6dc3128
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
# -*- coding: utf-8 -*-
334
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
     2
#
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
     3
# Copyright Institut de Recherche et d'Innovation © 2014
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
     4
#
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
     5
# contact@iri.centrepompidou.fr
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
     6
#
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
     7
# Ce code a été développé pour un premier usage dans JocondeLab, projet du 
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
     8
# ministère de la culture et de la communication visant à expérimenter la
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
     9
# recherche sémantique dans la base Joconde
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    10
# (http://jocondelab.iri-research.org/).
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    11
#
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    12
# Ce logiciel est régi par la licence CeCILL-C soumise au droit français et
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    13
# respectant les principes de diffusion des logiciels libres. Vous pouvez
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    14
# utiliser, modifier et/ou redistribuer ce programme sous les conditions
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    15
# de la licence CeCILL-C telle que diffusée par le CEA, le CNRS et l'INRIA 
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    16
# sur le site "http://www.cecill.info".
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    17
#
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    18
# En contrepartie de l'accessibilité au code source et des droits de copie,
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    19
# de modification et de redistribution accordés par cette licence, il n'est
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    20
# offert aux utilisateurs qu'une garantie limitée.  Pour les mêmes raisons,
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    21
# seule une responsabilité restreinte pèse sur l'auteur du programme,  le
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    22
# titulaire des droits patrimoniaux et les concédants successifs.
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    23
#
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    24
# A cet égard  l'attention de l'utilisateur est attirée sur les risques
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    25
# associés au chargement,  à l'utilisation,  à la modification et/ou au
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    26
# développement et à la reproduction du logiciel par l'utilisateur étant 
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    27
# donné sa spécificité de logiciel libre, qui peut le rendre complexe à 
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    28
# manipuler et qui le réserve donc à des développeurs et des professionnels
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    29
# avertis possédant  des  connaissances  informatiques approfondies.  Les
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    30
# utilisateurs sont donc invités à charger  et  tester  l'adéquation  du
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    31
# logiciel à leurs besoins dans des conditions permettant d'assurer la
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    32
# sécurité de leurs systèmes et ou de leurs données et, plus généralement, 
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    33
# à l'utiliser et l'exploiter dans les mêmes conditions de sécurité. 
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    34
#
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    35
# Le fait que vous puissiez accéder à cet en-tête signifie que vous avez 
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    36
# pris connaissance de la licence CeCILL-C, et que vous en avez accepté les
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    37
# termes.
169b7cfd1f58 Add headers to py files
ymh <ymh.work@gmail.com>
parents: 91
diff changeset
    38
#
0
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    39
from .models import Term, TermLabel, TERM_URL_STATUS_DICT
63
a7f4a418d5af can directly paste wikipedia urls
ymh <ymh.work@gmail.com>
parents: 61
diff changeset
    40
from core.models.term import TERM_WK_LINK_SEMANTIC_LEVEL_DICT
0
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    41
from django.conf import settings
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    42
from django.utils.http import urlquote
63
a7f4a418d5af can directly paste wikipedia urls
ymh <ymh.work@gmail.com>
parents: 61
diff changeset
    43
from urlparse import urlparse
0
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    44
from wikitools import api, wiki
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    45
import logging
63
a7f4a418d5af can directly paste wikipedia urls
ymh <ymh.work@gmail.com>
parents: 61
diff changeset
    46
import urllib2
0
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    47
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    48
logger = logging.getLogger(__name__)
67
5d9223bb3aab Add other wikipedia.
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
    49
sites = {}
33
61c3ffd94f11 - correct imports.
ymh <ymh.work@gmail.com>
parents: 4
diff changeset
    50
67
5d9223bb3aab Add other wikipedia.
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
    51
def __get_site(lang):
    """Return the ``wiki.Wiki`` object for ``lang``, building it on first use.

    Instances are memoised in the module-level ``sites`` dict, keyed by the
    language code. The API endpoint is read from
    ``settings.WIKIPEDIA_URLS[lang]['api_url']``, so an unknown language
    code raises ``KeyError``.
    """
    cached = sites.get(lang)
    if cached is not None:
        return cached

    api_url = settings.WIKIPEDIA_URLS[lang]['api_url']  # @UndefinedVariable
    cached = wiki.Wiki(api_url)
    sites[lang] = cached
    return cached
0
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    57
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    58
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    59
def normalize_term(term):
    """Return ``term`` in its normalized label form.

    Underscores are turned into spaces, every run of whitespace is collapsed
    to a single space, surrounding whitespace is dropped and the first
    character is upper-cased. The rest of the string is left as is.

    An empty (or ``None``) ``term`` is returned unchanged. A term made only of
    whitespace and/or underscores normalizes to the empty string; indexing the
    first character of that empty result used to raise ``IndexError``.
    """
    if not term:
        return term

    # split() without arguments already ignores leading/trailing whitespace,
    # so no separate strip() is needed.
    collapsed = " ".join(term.replace("_", " ").split())
    if not collapsed:
        # Nothing but whitespace/underscores: there is no first character
        # to capitalize.
        return collapsed
    return collapsed[0].upper() + collapsed[1:]
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    67
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    68
def switch_case_group(term):
    """Move a leading run of upper-case words to the end of ``term``.

    ``term`` is split on whitespace. A word counts as upper-case when it
    contains no alphabetic character that is not upper-case (so a word with
    no letters at all also counts). The term is only rewritten when it is a
    run of upper-case words optionally followed by a run of other words; the
    upper-case words are then lower-cased, passed through ``normalize_term``
    and appended after the other words, e.g. ``"DUPONT Jean"`` becomes
    ``"Jean Dupont"``.

    Any other layout (empty term, first word not upper-case, an upper-case
    word appearing after a non upper-case one) returns ``term`` unchanged.
    """
    upper_words = []
    other_words = []

    for word in term.split():
        has_non_upper_letter = any(ch.isalpha() and not ch.isupper() for ch in word)
        if has_non_upper_letter:
            # A mixed/lower-case word is only allowed after the upper-case run.
            if not upper_words:
                return term
            other_words.append(word)
        else:
            # An upper-case word is only allowed before any other word.
            if other_words:
                return term
            upper_words.append(word)

    # Reaching this point means every word landed in one of the two lists;
    # without any upper-case word there is nothing to switch.
    if not upper_words:
        return term

    switched = [normalize_term(word.lower()) for word in upper_words]
    return " ".join(other_words + switched)
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    87
    
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    88
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    89
def urlize_for_wikipedia(label):
    """Return ``label`` with spaces replaced by underscores, then URL-quoted."""
    underscored = "_".join(label.split(" "))
    return urlquote(underscored)
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    91
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    92
67
5d9223bb3aab Add other wikipedia.
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
    93
def __is_homonymie(page_dict, lang):
    """Tell whether ``page_dict`` describes a disambiguation page.

    Returns ``True`` as soon as the title of one entry of
    ``page_dict["categories"]`` contains the marker configured in
    ``settings.WIKIPEDIA_URLS[lang]['disambiguation_cat']``, ``False``
    otherwise (including when the page has no categories).
    """
    return any(
        settings.WIKIPEDIA_URLS[lang]['disambiguation_cat'] in category.get(u"title", u"")
        for category in page_dict.get(u"categories", [])
    )
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    98
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
    99
67
5d9223bb3aab Add other wikipedia.
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
   100
def query_wikipedia_title(site, lang, label=None, pageid=None):
    """Query wikipedia for one page and summarise what was found.

    The page is looked up by title when ``label`` is truthy, otherwise by
    ``pageid``. ``site`` is the object handed to ``api.APIRequest`` and
    ``lang`` selects the entry of ``settings.WIKIPEDIA_URLS`` used for
    disambiguation detection and for building the dbpedia URI.

    Returns a dict with the keys ``new_label``, ``alternative_label``,
    ``status``, ``wikipedia_url``, ``pageid``, ``alternative_wikipedia_url``,
    ``alternative_pageid``, ``dbpedia_uri``, ``revision_id`` and ``response``.

    ``status`` is one of the ``TERM_URL_STATUS_DICT`` values for:

    * ``"null_result"``: the request failed, the answer did not contain
      exactly one page, or the page is flagged invalid/missing. Every other
      field is ``None`` except ``response``.
    * ``"homonyme"``: the page is a disambiguation page.
    * ``"redirection"``: the page is a redirect. A second request with
      ``redirects`` enabled fills the ``alternative_*`` fields from the
      target page, and ``revision_id`` is then taken from that target page.
    * ``"match"``: any other existing page.

    ``dbpedia_uri`` is only set for ``match`` and ``redirection`` and is built
    from ``new_label`` (the title of the page returned by the first request).

    Request failures are logged and reported as a null result. Only
    ``Exception`` subclasses are handled, so ``KeyboardInterrupt`` and
    ``SystemExit`` are no longer swallowed.
    """
    params = {
        'action': 'query',
        'prop': 'info|categories|langlinks',
        'inprop': 'url',
        'lllimit': '500',
        'cllimit': '500',
    }
    if label:
        params['titles'] = label
    else:
        params['pageids'] = pageid

    response = None

    def return_null_result():
        # ``response`` is read when this is called, so the dict carries the
        # most recent successfully received API response (or None).
        return {
            'new_label': None,
            'alternative_label': None,
            'status': TERM_URL_STATUS_DICT["null_result"],
            'wikipedia_url': None,
            'pageid': None,
            'alternative_wikipedia_url': None,
            'alternative_pageid': None,
            'dbpedia_uri': None,
            'revision_id': None,
            'response': response,
        }

    def single_page(api_response):
        # Return the only page of the answer, or None when there are zero or
        # several pages.
        pages = api_response['query'].get("pages", {})
        if len(pages) != 1:
            return None
        # next(iter(...)) works on both a list and a dict view.
        return next(iter(pages.values()))

    try:
        response = api.APIRequest(site, params).query()  # @UndefinedVariable
    except Exception:
        logger.exception("Exception when querying wikipedia")
        return return_null_result()

    original_response = response

    # get page if multiple pages or none -> return null result
    page = single_page(response)
    if page is None:
        return return_null_result()

    if u"invalid" in page or u"missing" in page:
        return return_null_result()

    url = page.get(u'fullurl', None)
    pageid = page.get(u'pageid', None)
    new_label = page[u'title']
    alternative_label = None
    alternative_url = None
    alternative_pageid = None

    if __is_homonymie(page, lang):
        status = TERM_URL_STATUS_DICT["homonyme"]
    elif u"redirect" in page:
        status = TERM_URL_STATUS_DICT["redirection"]
    else:
        status = TERM_URL_STATUS_DICT["match"]

    if status == TERM_URL_STATUS_DICT["redirection"]:
        # Ask again, this time following the redirect, to learn the target.
        params['redirects'] = True
        try:
            response = api.APIRequest(site, params).query()  # @UndefinedVariable
        except Exception:
            logger.exception("Exception when querying wikipedia for redirects")
            return return_null_result()

        target = single_page(response)
        if target is None:
            return return_null_result()

        # From here on ``page`` is the redirect target.
        page = target
        alternative_label = page.get('title', None)
        alternative_url = page.get('fullurl', None)
        alternative_pageid = page.get('pageid', None)

    revision_id = page.get('lastrevid', None)

    if status == TERM_URL_STATUS_DICT['match'] or status == TERM_URL_STATUS_DICT['redirection']:
        dbpedia_uri = settings.WIKIPEDIA_URLS[lang]['dbpedia_uri'] % (urlize_for_wikipedia(new_label))
    else:
        dbpedia_uri = None

    return {
        'new_label': new_label,
        'alternative_label': alternative_label,
        'status': status,
        'wikipedia_url': url,
        'pageid': pageid,
        'alternative_wikipedia_url': alternative_url,
        'alternative_pageid': alternative_pageid,
        'dbpedia_uri': dbpedia_uri,
        'revision_id': revision_id,
        'response': original_response,
    }
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
   178
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
   179
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
   180
67
5d9223bb3aab Add other wikipedia.
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
   181
def get_or_create_term(term_label, term_uri, term_lang, thesaurus, lang, wp_label_transform=(lambda l:l), skip_wp_query=False):
    """Fetch the ``Term`` identified by ``term_uri``, creating it if needed.

    Returns the tuple ``(term, wikipedia_revision_id, created)``.

    * When the term is created, it is run through ``process_term`` with the
      normalized label transformed by ``wp_label_transform``, and a
      ``TermLabel`` for ``term_label``/``term_lang`` is saved.
      ``wikipedia_revision_id`` is whatever ``process_term`` returns.
    * When the term already exists, has a ``wikipedia_pageid`` and
      ``skip_wp_query`` is false, wikipedia is queried by page id and the
      returned revision id is stored on the term and saved.
    * Otherwise ``wikipedia_revision_id`` is ``None``.
    """
    normalized_label = normalize_term(term_label)

    creation_defaults = {
        'label': term_label,
        'thesaurus': thesaurus,
        'normalized_label': normalized_label,
        'lang': term_lang,
    }
    term, created = Term.objects.get_or_create(uri=term_uri, defaults=creation_defaults)  # @UndefinedVariable

    if created:
        # Brand new term: resolve its wikipedia data and record its label.
        site = __get_site(lang)
        wp_label = wp_label_transform(normalized_label)
        revision = process_term(site, term, lang, label=wp_label)
        TermLabel(label=term_label, term=term, lang=term_lang).save()
        return term, revision, created

    if not term.wikipedia_pageid or skip_wp_query:
        # Nothing to refresh, or the caller asked not to hit wikipedia.
        return term, None, created

    # Known term already linked to a page: refresh its revision id.
    lookup = query_wikipedia_title(__get_site(lang), lang, pageid=term.wikipedia_pageid)
    revision = lookup['revision_id']
    term.wikipedia_revision_id = revision
    term.save()
    return term, revision, created
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
   205
4095911a7830 Jocondelab first commit before design
ymh <ymh.work@gmail.com>
parents:
diff changeset
   206
67
5d9223bb3aab Add other wikipedia.
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
   207
def process_term(site, term, lang, label=None, verbosity=0):
    """Query Wikipedia for a term and store the answer on ``term``.

    The label looked up is ``label`` when given, ``term.label`` otherwise.
    ``label`` may also be the URL of a Wikipedia page: when it starts with
    one of the ``page_url`` values of ``settings.WIKIPEDIA_URLS``, the page
    name is extracted from the URL path, the URL fragment (if any) is kept
    and ``lang`` is replaced by the language of the matching entry.

    The fields returned by ``query_wikipedia_title`` are copied onto
    ``term`` and ``term.save()`` is called.

    :param site: site object handed to ``query_wikipedia_title``. When it is
        None, or when a URL label switched the language, it is obtained from
        ``__get_site(lang)``.
    :param term: object exposing ``label`` and ``save()`` whose Wikipedia
        related attributes are updated (presumably a Django model instance;
        verify against callers).
    :param lang: language code, a key of ``settings.WIKIPEDIA_URLS``.
    :param label: optional label or Wikipedia page URL to look up.
    :param verbosity: when >= 2, the raw query response is printed.
    :returns: the ``revision_id`` reported by ``query_wikipedia_title``.
    """
    fragment = ""
    requested_lang = lang

    if not label:
        label = term.label
    else:
        for lang_code, urls in settings.WIKIPEDIA_URLS.iteritems():
            if label.startswith(urls['page_url']):
                # lang is overridden when a url is passed as a label.
                lang = lang_code
                url_parts = urlparse(label)
                page_name = url_parts.path.split('/')[-1]
                # Percent-decoding works on bytes. Encode explicitly: str()
                # on a unicode value holding non-ASCII characters raises
                # UnicodeEncodeError.
                if isinstance(page_name, unicode):
                    page_name = page_name.encode("utf-8")
                label = urllib2.unquote(page_name).decode("utf-8")
                fragment = url_parts.fragment or ""
                break

    # A site supplied for the requested language must not be used once a URL
    # label has switched to another language: site and lang have to agree.
    if site is None or lang != requested_lang:
        site = __get_site(lang)

    wp_res = query_wikipedia_title(site, lang, label=label)
    new_label = wp_res['new_label']
    alternative_label = wp_res['alternative_label']
    status = wp_res['status']
    wikipedia_url = wp_res['wikipedia_url']
    if wikipedia_url:
        # Re-attach the fragment of the pasted URL, if there was one.
        url = wikipedia_url + ("#" + fragment if fragment else "")
    else:
        url = None
    alternative_url = wp_res['alternative_wikipedia_url']
    pageid = wp_res['pageid']
    alternative_pageid = wp_res['alternative_pageid']
    response = wp_res['response']
    dbpedia_uri = wp_res["dbpedia_uri"]
    revision_id = wp_res["revision_id"]

    if verbosity >= 2:
        print("response from query to %s with parameters %s :" % (site.apibase, repr(new_label)))
        print(repr(response))

    # new_label and status are only written when the query returned them;
    # every other field is overwritten unconditionally.
    if new_label is not None:
        term.wp_label = new_label
    if status is not None:
        term.url_status = status
        # A new status resets the semantic level of the link to '--'.
        term.link_semantic_level = TERM_WK_LINK_SEMANTIC_LEVEL_DICT['--']
    term.wikipedia_url = url
    term.wikipedia_pageid = pageid
    term.dbpedia_uri = dbpedia_uri
    term.alternative_label = alternative_label
    term.alternative_wikipedia_url = alternative_url
    term.alternative_wikipedia_pageid = alternative_pageid
    term.wikipedia_revision_id = revision_id

    term.save()

    return revision_id
221af1052ec4 Correct process wikipedia
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   260
    
91
3bbf7371378a Model reorganization for user + migration.
ymh <ymh.work@gmail.com>
parents: 80
diff changeset
   261
def get_dbpedia_lang(dbp_uri):
    """Return the language code whose DBpedia base URL prefixes ``dbp_uri``.

    Every entry of ``settings.WIKIPEDIA_URLS`` is checked: the first one
    whose ``dbpedia_base_url`` is a prefix of ``dbp_uri`` wins.

    :param dbp_uri: DBpedia resource URI, or None.
    :returns: the matching key of ``settings.WIKIPEDIA_URLS``, or None when
        ``dbp_uri`` is None or no configured base URL matches.
    """
    # Guard against a missing URI instead of failing with AttributeError on
    # None.startswith.
    if dbp_uri is None:
        return None

    for lang, props in settings.WIKIPEDIA_URLS.iteritems():
        if dbp_uri.startswith(props['dbpedia_base_url']):
            return lang
    return None
3bbf7371378a Model reorganization for user + migration.
ymh <ymh.work@gmail.com>
parents: 80
diff changeset
   267