# -*- coding: utf-8 -*-
#
# Copyright Institut de Recherche et d'Innovation © 2014
#
# contact@iri.centrepompidou.fr
#
# This code was first developed for JocondeLab, a project of the French
# Ministry of Culture and Communication experimenting with semantic
# search in the Joconde database
# (http://jocondelab.iri-research.org/).
#
# This software is governed by the CeCILL-C license under French law,
# abiding by the rules of distribution of free software. You can use,
# modify and/or redistribute this program under the terms of the
# CeCILL-C license as circulated by CEA, CNRS and INRIA at
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and the rights to
# copy, modify and redistribute granted by the license, users are
# provided only with a limited warranty. For the same reasons, only a
# limited liability rests with the author of the program, the holder of
# the economic rights, and the successive licensors.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software, given its free-software nature, which may make it complex to
# manipulate and therefore reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards
# their requirements in conditions enabling the security of their
# systems and/or data and, more generally, to use and operate it in the
# same security conditions.
#
# The fact that you are able to access this header means that you have
# read and accepted the terms of the CeCILL-C license.
#

from .models import Term, TermLabel, TERM_URL_STATUS_DICT
from core.models.term import TERM_WK_LINK_SEMANTIC_LEVEL_DICT
from django.conf import settings
from django.utils.http import urlquote
from urlparse import urlparse
from wikitools import api, wiki

import logging
import urllib2

logger = logging.getLogger(__name__)

# Cache of wikitools Wiki objects, one per language.
sites = {}


def __get_site(lang):
    site = sites.get(lang, None)
    if site is None:
        site = wiki.Wiki(settings.WIKIPEDIA_URLS[lang]['api_url'])  # @UndefinedVariable
        sites[lang] = site
    return site
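
# Everything below relies on settings.WIKIPEDIA_URLS mapping a language
# code to the URLs/strings used for that Wikipedia. An illustrative
# shape (the values here are hypothetical; the keys are the ones this
# module actually reads):
#
#     WIKIPEDIA_URLS = {
#         'fr': {
#             'api_url': 'http://fr.wikipedia.org/w/api.php',
#             'page_url': 'http://fr.wikipedia.org/wiki/',
#             'disambiguation_cat': u'Homonymie',
#             'dbpedia_uri': 'http://fr.dbpedia.org/resource/%s',
#             'dbpedia_base_url': 'http://fr.dbpedia.org/',
#         },
#     }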


def normalize_term(term):
    if len(term) == 0:
        return term
    term = term.strip()
    term = term.replace("_", " ")
    term = " ".join(term.split())
    term = term[0].upper() + term[1:]
    return term
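
# For example (illustrative input):
#     normalize_term(u"  musée_du   Louvre ")  ->  u"Musée du Louvre"
# i.e. whitespace is trimmed and collapsed, underscores become spaces,
# and the first character is upper-cased.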
|

def switch_case_group(term):
    seg_group = term.split()
    uc_group = []
    lc_group = []
    for seg in seg_group:
        is_all_upper = all(c.isupper() or not c.isalpha() for c in seg)
        if is_all_upper and not lc_group:
            uc_group.append(seg)
        elif not is_all_upper and uc_group:
            lc_group.append(seg)
        else:
            return term

    if uc_group and lc_group and len(uc_group) + len(lc_group) == len(seg_group):
        return " ".join(lc_group + [normalize_term(t.lower()) for t in uc_group])
    elif uc_group and not lc_group and len(uc_group) == len(seg_group):
        return " ".join([normalize_term(t.lower()) for t in uc_group])
    else:
        return term
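
# switch_case_group() moves a leading run of all-uppercase segments (as
# found in inverted thesaurus headings) to the end, re-capitalized. For
# example (illustrative input):
#     switch_case_group(u"PARIS musée du Louvre")  ->  u"musée du Louvre Paris"
# Terms that do not match the "uppercase group then lowercase group"
# pattern are returned unchanged.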
|

def urlize_for_wikipedia(label):
    return urlquote(label.replace(" ", "_"))
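
# e.g. urlize_for_wikipedia(u"Musée du Louvre") -> u"Mus%C3%A9e_du_Louvre",
# the form expected in a Wikipedia page URL or a DBpedia resource URI.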
|

def __is_homonymie(page_dict, lang):
    for cat in page_dict.get(u"categories", []):
        if settings.WIKIPEDIA_URLS[lang]['disambiguation_cat'] in cat.get(u"title", u""):
            return True
    return False
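
# The "categories" prop returned by the API is a list of dicts such as
# [{u'title': u'Catégorie:Homonymie'}, ...] (illustrative); a page is
# treated as a disambiguation page when the configured
# 'disambiguation_cat' string appears in one of those titles.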
|

def query_wikipedia_title(site, lang, label=None, pageid=None):

    params = {
        'action': 'query',
        'prop': 'info|categories|langlinks',
        'inprop': 'url',
        'lllimit': '500',
        'cllimit': '500',
    }

    if label:
        params['titles'] = label
    else:
        params['pageids'] = pageid

    response = None

    def return_null_result():
        return {
            'new_label': None,
            'alternative_label': None,
            'status': TERM_URL_STATUS_DICT["null_result"],
            'wikipedia_url': None,
            'pageid': None,
            'alternative_wikipedia_url': None,
            'alternative_pageid': None,
            'dbpedia_uri': None,
            'revision_id': None,
            'response': response,
        }

    try:
        wpquery = api.APIRequest(site, params)  # @UndefinedVariable
        response = wpquery.query()
    except:
        logger.exception("Exception when querying wikipedia")
        return return_null_result()

    original_response = response

    query_dict = response['query']
    # If the query returned no page or several pages, give up and return
    # the "null_result" status.
    pages = query_dict.get("pages", {})
    if len(pages) != 1:
        return return_null_result()

    page = pages.values()[0]

    if u"invalid" in page or u"missing" in page:
        return return_null_result()

    url = page.get(u'fullurl', None)
    pageid = page.get(u'pageid', None)
    new_label = page[u'title']
    alternative_label = None
    alternative_url = None
    alternative_pageid = None

    if __is_homonymie(page, lang):
        status = TERM_URL_STATUS_DICT["homonyme"]
    elif u"redirect" in page:
        status = TERM_URL_STATUS_DICT["redirection"]
    else:
        status = TERM_URL_STATUS_DICT["match"]

    if status == TERM_URL_STATUS_DICT["redirection"]:
        # Query again, following the redirect, to record the target page
        # as the alternative label/url/pageid.
        params['redirects'] = True
        try:
            wpquery = api.APIRequest(site, params)  # @UndefinedVariable
            response = wpquery.query()
        except:
            logger.exception("Exception when querying wikipedia for redirects")
            return return_null_result()
        query_dict = response['query']
        pages = query_dict.get("pages", {})
        # We know that we have at least one answer.
        if len(pages) != 1:
            return return_null_result()
        page = pages.values()[0]
        alternative_label = page.get('title', None)
        alternative_url = page.get('fullurl', None)
        alternative_pageid = page.get('pageid', None)

    revision_id = page.get('lastrevid', None)

    if status == TERM_URL_STATUS_DICT['match'] or status == TERM_URL_STATUS_DICT['redirection']:
        dbpedia_uri = settings.WIKIPEDIA_URLS[lang]['dbpedia_uri'] % (urlize_for_wikipedia(new_label))
    else:
        dbpedia_uri = None

    return {
        'new_label': new_label,
        'alternative_label': alternative_label,
        'status': status,
        'wikipedia_url': url,
        'pageid': pageid,
        'alternative_wikipedia_url': alternative_url,
        'alternative_pageid': alternative_pageid,
        'dbpedia_uri': dbpedia_uri,
        'revision_id': revision_id,
        'response': original_response,
    }
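
# A minimal usage sketch (hypothetical values):
#     res = query_wikipedia_title(__get_site('fr'), 'fr', label=u"Joconde")
#     if res['status'] == TERM_URL_STATUS_DICT["match"]:
#         print res['wikipedia_url'], res['dbpedia_uri']
# Exactly one of label/pageid is expected; 'status' is one of the
# "null_result", "homonyme", "redirection" or "match" entries of
# TERM_URL_STATUS_DICT.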
|

def get_or_create_term(term_label, term_uri, term_lang, thesaurus, lang, wp_label_transform=(lambda l: l), skip_wp_query=False):

    term_label_normalized = normalize_term(term_label)
    # Get or create the term object, then fetch the wikipedia references
    # for its label.
    term, created = Term.objects.get_or_create(
        uri=term_uri,
        defaults={
            'label': term_label,
            'thesaurus': thesaurus,
            'normalized_label': term_label_normalized,
            'lang': term_lang,
        })  # @UndefinedVariable

    if created:
        wikipedia_revision_id = process_term(__get_site(lang), term, lang, label=wp_label_transform(term_label_normalized))
        term_label_obj = TermLabel(label=term_label, term=term, lang=term_lang)
        term_label_obj.save()
    elif term.wikipedia_pageid and not skip_wp_query:
        wp_res = query_wikipedia_title(__get_site(lang), lang, pageid=term.wikipedia_pageid)
        wikipedia_revision_id = wp_res['revision_id']
        term.wikipedia_revision_id = wikipedia_revision_id
        term.save()
    else:
        wikipedia_revision_id = None

    return term, wikipedia_revision_id, created
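
# A minimal usage sketch (hypothetical arguments; 'thesaurus' is assumed
# to be a thesaurus model instance from this application):
#     term, revid, created = get_or_create_term(
#         u"PARIS musée du Louvre", u"http://example.org/term/42",
#         'fr', thesaurus, 'fr',
#         wp_label_transform=switch_case_group)
# On first creation the term is looked up on Wikipedia via process_term();
# on later calls only its revision id is refreshed (unless skip_wp_query).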
|

def process_term(site, term, lang, label=None, verbosity=0):

    label_is_url = False
    fragment = ""
    if not label:
        label = term.label
    else:
        for lang_code, urls in settings.WIKIPEDIA_URLS.iteritems():
            if label.startswith(urls['page_url']):
                # lang is overridden when a url is passed as a label.
                lang = lang_code
                url_parts = urlparse(label)
                label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8")
                if url_parts.fragment:
                    label_is_url = True
                    fragment = url_parts.fragment
                break

    if site is None:
        site = __get_site(lang)

    wp_res = query_wikipedia_title(site, lang, label=label)
    new_label = wp_res['new_label']
    alternative_label = wp_res['alternative_label']
    status = wp_res['status']
    url = wp_res['wikipedia_url'] + ("#" + fragment if label_is_url else "") if wp_res['wikipedia_url'] else None
    alternative_url = wp_res['alternative_wikipedia_url']
    pageid = wp_res['pageid']
    alternative_pageid = wp_res['alternative_pageid']
    response = wp_res['response']
    dbpedia_uri = wp_res["dbpedia_uri"]
    revision_id = wp_res["revision_id"]

    if verbosity >= 2:
        print "response from query to %s with parameters %s :" % (site.apibase, repr(new_label))
        print repr(response)

    if new_label is not None:
        term.wp_label = new_label
    if status is not None:
        term.url_status = status
    term.link_semantic_level = TERM_WK_LINK_SEMANTIC_LEVEL_DICT['--']
    term.wikipedia_url = url
    term.wikipedia_pageid = pageid
    term.dbpedia_uri = dbpedia_uri
    term.alternative_label = alternative_label
    term.alternative_wikipedia_url = alternative_url
    term.alternative_wikipedia_pageid = alternative_pageid
    term.wikipedia_revision_id = revision_id

    term.save()

    return revision_id
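
# process_term() also accepts a full Wikipedia page URL as 'label', e.g.
# (hypothetical, with the illustrative settings above):
#     process_term(None, term, 'fr',
#                  label=u"http://fr.wikipedia.org/wiki/Joconde#Histoire")
# in which case the language is derived from the URL and the fragment is
# carried over into term.wikipedia_url.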


def get_dbpedia_lang(dbp_uri):

    for lang, props in settings.WIKIPEDIA_URLS.iteritems():
        if dbp_uri.startswith(props['dbpedia_base_url']):
            return lang
    return None
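
# e.g. (with the illustrative settings above):
#     get_dbpedia_lang(u"http://fr.dbpedia.org/resource/Joconde") -> 'fr'
#     get_dbpedia_lang(u"http://example.org/other")               -> None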