| author | ymh <ymh.work@gmail.com> |
| Wed, 11 Apr 2018 12:19:47 +0200 | |
| branch | documentation |
| changeset 693 | 09e00f38d177 |
| parent 281 | bc0f26b1acc2 |
| permissions | -rw-r--r-- |
| 24 | 1 |
# -*- coding: utf-8 -*- |
|
42
861a78f74a37
modify behavior for tag modification on the datasheet
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
2 |
from django.conf import settings |
| 72 | 3 |
from django.utils.http import urlquote |
4 |
from haystack.constants import DJANGO_ID |
|
5 |
from haystack.query import SearchQuerySet |
|
| 66 | 6 |
from hdabo.models import Tag, TaggedSheet |
|
42
861a78f74a37
modify behavior for tag modification on the datasheet
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
7 |
from wikitools import api, wiki |
| 72 | 8 |
|
| 47 | 9 |
|
10 |
def normalize_tag(tag):
    """
    Normalize a tag label.

    Surrounding whitespace is removed, underscores are turned into spaces,
    runs of whitespace are collapsed into a single space and the first
    character is upper-cased (the rest of the label is left untouched).

    :param tag: the raw label (a string).
    :return: the normalized label. An empty label is returned as is, and a
        label made only of whitespace and/or underscores yields an empty
        string.
    """
    if len(tag) == 0:
        return tag
    tag = " ".join(tag.strip().replace("_", " ").split())
    if not tag:
        # Nothing is left after the clean-up (e.g. "   " or "_"): there is
        # no first character to capitalize, so return the empty label
        # instead of failing on tag[0].
        return tag
    return tag[0].upper() + tag[1:]
|
18 |
||
| 66 | 19 |
def urlize_for_wikipedia(label):
    """
    Return the URL-quoted form of a label, with spaces turned into underscores.

    :param label: the page label (a string).
    :return: the result of Django's ``urlquote`` applied to the underscored label.
    """
    underscored = "_".join(label.split(" "))
    return urlquote(underscored)
| 47 | 21 |
|
| 24 | 22 |
|
23 |
def __is_homonymie(page_dict):
    """
    Tell whether a page description carries a disambiguation category.

    :param page_dict: a page dictionary; its optional ``categories`` entry is
        a list of dictionaries with an optional ``title`` entry.
    :return: True when one category title contains the French or the English
        disambiguation marker, False otherwise.
    """
    markers = (u'Catégorie:Homonymie', u'Category:Disambiguation pages')
    for category in page_dict.get(u"categories", []):
        title = category.get(u"title", u"")
        if any(marker in title for marker in markers):
            return True
    return False
|
28 |
||
|
111
ceb381f5b0c7
query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
108
diff
changeset
|
29 |
|
| 66 | 30 |
def query_wikipedia_title(site, label=None, pageid=None):
    """
    Query the Wikipedia API for a single page and describe the match.

    The page is looked up by title when ``label`` is given (and not empty),
    by page id otherwise. When the page found is a redirection, a second
    request following the redirection is made to fill the ``alternative_*``
    entries.

    :param site: the wikitools site object the requests are sent to.
    :param label: the page title to look for.
    :param pageid: the page id to look for, used only when ``label`` is empty.
    :return: a dictionary with the keys ``new_label``, ``alternative_label``,
        ``status`` (a value of ``Tag.TAG_URL_STATUS_DICT``), ``wikipedia_url``,
        ``pageid``, ``alternative_wikipedia_url``, ``alternative_pageid``,
        ``dbpedia_uri``, ``revision_id`` and ``response``. When no single
        valid page is found, the status is the "null_result" one and every
        other entry but ``response`` is None.
    """
    params = {
        'action': 'query',
        'prop': 'info|categories|langlinks',
        'inprop': 'url',
        'lllimit': '500',
        'cllimit': '500',
        'rvprop': 'ids',
    }
    if label:
        params['titles'] = label
    else:
        params['pageids'] = pageid

    wpquery = api.APIRequest(site, params) #@UndefinedVariable
    response = wpquery.query()
    original_response = response

    def return_null_result():
        # 'response' is read when the helper is called, so the null result
        # carries the most recent API response.
        return {
            'new_label': None,
            'alternative_label': None,
            'status': Tag.TAG_URL_STATUS_DICT["null_result"],
            'wikipedia_url': None,
            'pageid': None,
            'alternative_wikipedia_url': None,
            'alternative_pageid': None,
            'dbpedia_uri': None,
            'revision_id': None,
            'response': response,
        }

    # A reply without a "query" section is handled like a reply without any
    # page (null result) instead of raising KeyError.
    query_dict = response.get('query', {})
    # get page if multiple pages or none -> return Tag.null_result
    pages = query_dict.get("pages", {})
    if len(pages) != 1:
        return return_null_result()

    # list() makes the indexing work whether values() is a list or a view.
    page = list(pages.values())[0]

    if u"invalid" in page or u"missing" in page:
        return return_null_result()

    url = page.get(u'fullurl', None)
    pageid = page.get(u'pageid', None)
    new_label = page[u'title']
    alternative_label = None
    alternative_url = None
    alternative_pageid = None

    if __is_homonymie(page):
        status = Tag.TAG_URL_STATUS_DICT["homonyme"]
    elif u"redirect" in page:
        status = Tag.TAG_URL_STATUS_DICT["redirection"]
    else:
        status = Tag.TAG_URL_STATUS_DICT["match"]

    if status == Tag.TAG_URL_STATUS_DICT["redirection"]:
        # Ask again, this time following the redirection, to get the target page.
        params['redirects'] = True
        wpquery = api.APIRequest(site, params) #@UndefinedVariable
        response = wpquery.query()
        query_dict = response.get('query', {})
        pages = query_dict.get("pages", {})
        # we expect exactly one answer here; anything else is a null result
        if len(pages) != 1:
            return return_null_result()
        page = list(pages.values())[0]
        alternative_label = page.get('title', None)
        alternative_url = page.get('fullurl', None)
        alternative_pageid = page.get('pageid', None)

    # For a redirection this is read from the target page.
    revision_id = page.get('lastrevid', None)

    # to be perfect we should sparql request DBPEDIA_URI_TEMPLATE, but we simply build the url
    dbpedia_uri = settings.DBPEDIA_URI_TEMPLATE % ("resource", urlize_for_wikipedia(new_label))

    return {
        'new_label': new_label,
        'alternative_label': alternative_label,
        'status': status,
        'wikipedia_url': url,
        'pageid': pageid,
        'alternative_wikipedia_url': alternative_url,
        'alternative_pageid': alternative_pageid,
        'dbpedia_uri': dbpedia_uri,
        'revision_id': revision_id,
        'response': original_response,
    }
| 47 | 92 |
|
|
42
861a78f74a37
modify behavior for tag modification on the datasheet
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
93 |
|
|
861a78f74a37
modify behavior for tag modification on the datasheet
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
94 |
|
|
861a78f74a37
modify behavior for tag modification on the datasheet
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
95 |
def get_or_create_tag(tag_label):
    """
    Fetch the Tag matching a label, creating and resolving it when it does not exist.

    Existing tags are searched case-insensitively on the normalized label;
    the first one whose url status is not "null_result" is preferred,
    otherwise the first match is kept. A newly created tag is filled from a
    Wikipedia query on the normalized label and saved.

    :param tag_label: the raw label of the tag.
    :return: a ``(tag, wikipedia_revision_id, created)`` tuple. For an
        existing tag the revision id comes from a query on its
        ``wikipedia_pageid``, or is None when the tag has no page id.
    """
    normalized_label = normalize_tag(tag_label)

    # Pick the existing tag, if any.
    tag = None
    for candidate in Tag.objects.filter(label__iexact=normalized_label):
        if candidate.url_status != Tag.TAG_URL_STATUS_DICT['null_result']:
            # a resolved tag always wins: stop searching
            tag = candidate
            break
        if tag is None:
            # unresolved tag: kept only as a fallback
            tag = candidate

    created = tag is None
    if created:
        tag = Tag(label=normalized_label, original_label=tag_label)

    site = wiki.Wiki(settings.WIKIPEDIA_API_URL) #@UndefinedVariable

    if not created:
        if not tag.wikipedia_pageid:
            return tag, None, created
        wp_res = query_wikipedia_title(site, pageid=tag.wikipedia_pageid)
        return tag, wp_res['revision_id'], created

    # New tag: resolve it on Wikipedia and store what was found.
    wp_res = query_wikipedia_title(site, label=normalized_label)

    if wp_res['new_label'] is not None:
        tag.label = wp_res['new_label']
    if wp_res['status'] is not None:
        tag.url_status = wp_res['status']
    tag.alternative_label = wp_res['alternative_label']
    tag.alternative_wikipedia_url = wp_res['alternative_wikipedia_url']
    tag.alternative_wikipedia_pageid = wp_res['alternative_pageid']
    tag.wikipedia_url = wp_res['wikipedia_url']
    tag.wikipedia_pageid = wp_res['pageid']
    tag.dbpedia_uri = wp_res["dbpedia_uri"]

    tag.save()

    return tag, wp_res['revision_id'], created
|
42
861a78f74a37
modify behavior for tag modification on the datasheet
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
151 |
|
| 66 | 152 |
def process_tag(site, tag, verbosity=0):
    """
    Refresh a Tag from Wikipedia and propagate a page change to its tagged sheets.

    The tag label is queried on ``site``; the tag is updated with the result
    and saved. When the Wikipedia page id of the tag changed, the
    ``wikipedia_revision_id`` of every TaggedSheet using the tag is updated.

    :param site: the wikitools site object to query.
    :param tag: the Tag object to refresh.
    :param verbosity: when 2 or more, the query response is printed.
    """
    wp_res = query_wikipedia_title(site, label=tag.label)
    new_label = wp_res['new_label']
    status = wp_res['status']
    pageid = wp_res['pageid']

    if verbosity >= 2:
        # A single parenthesised expression prints the same text as the print statement.
        print("response from query to %s with parameters %s :" % (site.apibase, repr(new_label)))
        print(repr(wp_res['response']))

    # Remember the page id before it is overwritten, to detect a change below.
    previous_pageid = tag.wikipedia_pageid

    if new_label is not None:
        tag.label = new_label
    if status is not None:
        tag.url_status = status
    tag.wikipedia_url = wp_res['wikipedia_url']
    tag.wikipedia_pageid = pageid
    tag.dbpedia_uri = wp_res["dbpedia_uri"]
    tag.alternative_label = wp_res['alternative_label']
    tag.alternative_wikipedia_url = wp_res['alternative_wikipedia_url']
    tag.alternative_wikipedia_pageid = wp_res['alternative_pageid']

    tag.save()

    if previous_pageid != pageid:
        TaggedSheet.objects.filter(tag=tag).update(wikipedia_revision_id=wp_res["revision_id"])
|
| 72 | 187 |
|
188 |
||
189 |
def reorder_datasheet_tags(ds):
    """
    Reorder the tags of a Datasheet according to the score they obtain on a
    search on the title and description of the Datasheet.

    Each TaggedSheet of the datasheet gets its ``index_note`` recomputed from
    the score of the first search result, then the entries are renumbered
    from 1 by decreasing ``index_note`` (ties keep their previous order) and
    saved. The ``manual_order`` flag of the datasheet is cleared if it was set.

    :param ds: the Datasheet whose tags are reordered.
    """
    scored = []
    for tagged in ds.taggedsheet_set.all():
        tagged.index_note = 0
        # Restrict the search to the indexed document of this datasheet.
        pk_filter = {DJANGO_ID + "__exact": unicode(ds.pk)}
        label = tagged.tag.label

        hits = SearchQuerySet().filter(title=label).filter_or(description=label).filter(**pk_filter)
        if len(hits) > 0:
            tagged.index_note += hits[0].score
            tagged.save()

        scored.append(tagged)

    scored.sort(key=lambda item: (-item.index_note, item.order))
    for position, tagged in enumerate(scored, 1):
        tagged.order = position
        tagged.save()

    if ds.manual_order:
        ds.manual_order = False
        ds.save()
|
| 47 | 211 |
|
| 693 | 212 |