50 |
51 |
def urlize_for_wikipedia(label):
    """Return *label* with spaces turned into underscores, then URL-quoted.

    The space-to-underscore replacement happens first; the result is then
    passed through ``urlquote`` so the remaining unsafe characters are
    percent-encoded.
    """
    return urlquote(label.replace(" ", "_"))
53 |
54 |
54 |
55 |
def __is_homonymie(page_dict, lang):
    """Tell whether a page belongs to the disambiguation category for *lang*.

    :param page_dict: page entry of a query response; its optional
        ``"categories"`` list holds dicts carrying a ``"title"`` key.
    :param lang: key into ``settings.WIKIPEDIA_URLS`` selecting the
        language-specific ``'disambiguation_cat'`` marker.
    :return: ``True`` if the marker occurs in any category title,
        ``False`` otherwise (including when there are no categories).
    """
    # The settings lookup stays inside the generator on purpose: it is only
    # evaluated when there is at least one category to inspect.
    return any(
        settings.WIKIPEDIA_URLS[lang]['disambiguation_cat'] in cat.get(u"title", u"")
        for cat in page_dict.get(u"categories", [])
    )
60 |
61 |
61 |
62 |
62 def query_wikipedia_title(site, label=None, pageid=None): |
63 def query_wikipedia_title(site, lang, label=None, pageid=None): |
63 |
64 |
64 params = {'action':'query', 'prop':'info|categories|langlinks', 'inprop':'url', 'lllimit':'500', 'cllimit':'500'} |
65 params = {'action':'query', 'prop':'info|categories|langlinks', 'inprop':'url', 'lllimit':'500', 'cllimit':'500'} |
65 |
66 |
66 if label: |
67 if label: |
67 params['titles'] = label |
68 params['titles'] = label |
129 |
130 |
130 revision_id = page.get('lastrevid', None) |
131 revision_id = page.get('lastrevid', None) |
131 |
132 |
132 |
133 |
133 if status == TERM_URL_STATUS_DICT['match'] or status == TERM_URL_STATUS_DICT['redirection']: |
134 if status == TERM_URL_STATUS_DICT['match'] or status == TERM_URL_STATUS_DICT['redirection']: |
134 dbpedia_uri = settings.DBPEDIA_URI_TEMPLATE % (urlize_for_wikipedia(new_label)) |
135 dbpedia_uri = settings.WIKIPEDIA_URLS[lang]['dbpedia_uri'] % (urlize_for_wikipedia(new_label)) |
135 else: |
136 else: |
136 dbpedia_uri = None |
137 dbpedia_uri = None |
137 |
138 |
138 |
139 |
139 return { 'new_label': new_label, 'alternative_label': alternative_label, 'status': status, 'wikipedia_url': url, 'pageid': pageid, 'alternative_wikipedia_url': alternative_url, 'alternative_pageid': alternative_pageid, 'dbpedia_uri': dbpedia_uri, 'revision_id': revision_id, 'response': original_response } |
140 return { 'new_label': new_label, 'alternative_label': alternative_label, 'status': status, 'wikipedia_url': url, 'pageid': pageid, 'alternative_wikipedia_url': alternative_url, 'alternative_pageid': alternative_pageid, 'dbpedia_uri': dbpedia_uri, 'revision_id': revision_id, 'response': original_response } |
140 |
141 |
141 |
142 |
142 |
143 |
def get_or_create_term(term_label, term_uri, term_lang, thesaurus, lang, wp_label_transform=(lambda l: l), skip_wp_query=False):
    """Fetch the ``Term`` identified by *term_uri*, creating it if needed.

    :param term_label: label stored on a newly created term and on its
        ``TermLabel``.
    :param term_uri: lookup key for ``Term.objects.get_or_create``.
    :param term_lang: language stored on the new ``Term`` / ``TermLabel``.
    :param thesaurus: thesaurus stored on a newly created term.
    :param lang: language used to pick the Wikipedia site
        (``__get_site(lang)``) and forwarded to the Wikipedia helpers.
    :param wp_label_transform: callable applied to the normalized label
        before it is handed to ``process_term``; identity by default.
    :param skip_wp_query: when true, an already existing term is not
        re-queried on Wikipedia.
    :return: ``(term, wikipedia_revision_id, created)``.
        ``wikipedia_revision_id`` is ``None`` when the term already existed
        and either has no ``wikipedia_pageid`` or *skip_wp_query* is set.
    """
    term_label_normalized = normalize_term(term_label)

    # Get or create the term object; the defaults only apply on creation.
    term, created = Term.objects.get_or_create(
        uri=term_uri,
        defaults={
            'label': term_label,
            'thesaurus': thesaurus,
            'normalized_label': term_label_normalized,
            'lang': term_lang,
        },
    )  # @UndefinedVariable

    if created:
        # New term: resolve its Wikipedia references, then record its label.
        wikipedia_revision_id = process_term(
            __get_site(lang),
            term,
            lang,
            label=wp_label_transform(term_label_normalized),
        )
        term_label_obj = TermLabel(label=term_label, term=term, lang=term_lang)
        term_label_obj.save()

    elif term.wikipedia_pageid and not skip_wp_query:
        # Existing term already linked to a page: refresh its revision id.
        wp_res = query_wikipedia_title(__get_site(lang), lang, pageid=term.wikipedia_pageid)
        wikipedia_revision_id = wp_res['revision_id']
        term.wikipedia_revision_id = wikipedia_revision_id
        term.save()
    else:
        wikipedia_revision_id = None

    return term, wikipedia_revision_id, created
168 |
168 |
169 |
169 |
170 def process_term(site, term, label=None, verbosity=0): |
170 def process_term(site, term, lang, label=None, verbosity=0): |
171 |
171 |
172 if site == None: |
|
173 site = wiki.Wiki(settings.WIKIPEDIA_API_URL) # @UndefinedVariable |
|
174 |
|
175 if not label: |
172 if not label: |
176 label = term.label |
173 label = term.label |
177 elif label.startswith(settings.WIKIPEDIA_PAGE_URL): |
174 else: |
178 url_parts = urlparse(label) |
175 for lang_code, urls in settings.WIKIPEDIA_URLS.iteritems(): |
179 label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8") |
176 if label.startswith(urls['page_url']): |
180 |
177 # lang is overrided when an url is passed as a label. |
181 wp_res = query_wikipedia_title(site, label=label) |
178 lang = lang_code |
|
179 url_parts = urlparse(label) |
|
180 label = urllib2.unquote(str(url_parts.path.split('/')[-1])).decode("utf-8") |
|
181 break |
|
182 |
|
183 if site == None: |
|
184 site = __get_site(lang) |
|
185 |
|
186 wp_res = query_wikipedia_title(site, lang, label=label) |
182 new_label = wp_res['new_label'] |
187 new_label = wp_res['new_label'] |
183 alternative_label= wp_res['alternative_label'] |
188 alternative_label= wp_res['alternative_label'] |
184 status = wp_res['status'] |
189 status = wp_res['status'] |
185 url = wp_res['wikipedia_url'] |
190 url = wp_res['wikipedia_url'] |
186 alternative_url = wp_res['alternative_wikipedia_url'] |
191 alternative_url = wp_res['alternative_wikipedia_url'] |