| author | ymh <ymh.work@gmail.com> |
| Wed, 11 Apr 2018 12:19:47 +0200 | |
| branch | documentation |
| changeset 693 | 09e00f38d177 |
| parent 683 | 59d49ab04ded |
| permissions | -rw-r--r-- |
| 111 | 1 |
# -*- coding: utf-8 -*- |
2 |
''' |
|
| 693 | 3 |
Requête Wikipedia qui renseigne les différentes catégories wikipedia pour les tag sémantisés. |
4 |
||
5 |
Les données suivantes sont moissonnées pour chaque tag sémantisé (i.e. article Wikipedia) |
|
6 |
- catégories visibles (`visible`) |
|
7 |
- catégories cachées (`hidden`) |
|
8 |
- paramètres d'infobox (`infobox`) |
|
9 |
||
10 |
Les objets créés sont les suivants: |
|
11 |
||
12 |
- catégories : :class:`hdalab.models.WpCategory` et :class:`hdalab.models.TagWpCategory` |
|
13 |
- paramètres d'infobox : :class:`hdalab.models.InfoboxParameter` et :class:`hdalab.models.TagInfobox` |
|
14 |
||
15 |
Cette commande utilise directement `l'api wikipedia <https://www.mediawiki.org/wiki/API:Main_page>`_ pour faire ses requêtes. |
|
| 111 | 16 |
|
| 693 | 17 |
**Usage**: ``django-admin query_wikipedia_category [options]`` |
18 |
||
19 |
**Options spécifiques:** |
|
20 |
||
21 |
- *\-\-all* : force à traiter tous les tags |
|
22 |
- *\-\-random* : faire le traitement des tags dans un ordre aléatoire |
|
23 |
- *\-\-force* : ne pose aucune question |
|
24 |
- *\-\-limit=LIMIT* : Nombre de tags à traiter |
|
25 |
- *\-\-start=START* : Nombre de tags à ignorer |
|
26 |
- *\-\-type=TYPES* : Quel type de requête faire : `visible` : catégories visibles, `hidden` : catégories cachées, `infobox`: infoboxes, `all`: toutes (défaut). cette option peut être passée plusieurs fois. |
|
27 |
- *\-\-use\-label* : Utilise le label du tag au lieu du pageid pour faire la requête wikipedia |
|
28 |
- *\-\-tag=TAG* : Limite le traitement à ce tag |
|
29 |
||
| 111 | 30 |
''' |
31 |
||
32 |
from django.conf import settings |
|
33 |
from django.core.management.base import NoArgsCommand |
|
34 |
from django.core.management.color import no_style |
|
|
114
c59383cc9940
migrate categories extraction to hdalab
ymh <ymh.work@gmail.com>
parents:
113
diff
changeset
|
35 |
from hdabo.models import Tag |
|
c59383cc9940
migrate categories extraction to hdalab
ymh <ymh.work@gmail.com>
parents:
113
diff
changeset
|
36 |
from hdalab.models import WpCategory, TagWpCategory, TagInfobox, InfoboxParameter |
| 111 | 37 |
from optparse import make_option |
38 |
from wikitools import api,wiki |
|
39 |
import sys |
|
40 |
import re |
|
41 |
import itertools |
|
42 |
from hdabo import utils |
|
43 |
from django.db.models import Count |
|
44 |
from django.db import transaction |
|
45 |
||
46 |
||
47 |
# Bitmask values for the --type option; several types can be requested in one
# run, so the masks are combined with ``|`` (see Command.handle_noargs).
TYPES_MASK_DICT = {
    u'visible': 0b001,
    u'hidden': 0b010,
    u'infobox': 0b100,
    u'all': 0b111,
}

# Regexes for locating and dissecting infobox templates in wiki markup.
# All patterns are raw strings: the original non-raw literals relied on Python
# passing unknown escapes (\{, \s, \w) through unchanged, which raises
# DeprecationWarning/SyntaxWarning on modern interpreters. The compiled
# patterns are byte-for-byte identical.

# Start of an infobox: "{{Infobox <name>", <name> captured up to the first "|".
START_PATTERN = re.compile(r"\{\{\s?Infobox\s+([^|]+)", re.M | re.U | re.I)
# Template open/close markers, used for depth tracking.
END_PATTERN = re.compile(r"\{\{|\}\}", re.M | re.U)
# A "| name =" parameter separator; group 1 is the parameter name.
SPLIT_PATTERN = re.compile(r"\s*?\|\s*([\w]+[^=|]*)\s*=", re.U | re.M)
# Any nesting delimiter: {{, {{{, }}, }}}, [[, ]], [ or ].
DELIMITER_PATTERN = re.compile(r"\{{2,3}|\}{2,3}|\[\[|\]\]|\[|\]")
# HTML comments embedded in the wiki source.
COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.U | re.M)
| 111 | 59 |
|
60 |
||
61 |
||
62 |
class Command(NoArgsCommand): |
|
63 |
''' |
|
64 |
query and update wikipedia for tag title. |
|
65 |
''' |
|
66 |
options = '' |
|
67 |
help = """query and update wikipedia for tag title.""" |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
68 |
|
| 111 | 69 |
option_list = NoArgsCommand.option_list + ( |
70 |
make_option('--all', |
|
71 |
action='store_true', |
|
72 |
dest='all', |
|
73 |
default=False, |
|
74 |
help='force all tags to be updated, not only those not yet processed'), |
|
75 |
make_option('--force', |
|
76 |
action='store_true', |
|
77 |
dest='force', |
|
78 |
default=False, |
|
79 |
help='ask no questions'), |
|
80 |
make_option('--random', |
|
81 |
action='store_true', |
|
82 |
dest='random', |
|
83 |
default=False, |
|
84 |
help='randomize query on tags'), |
|
85 |
make_option('--site', |
|
86 |
action='store', |
|
87 |
type='string', |
|
88 |
dest='site_url', |
|
|
683
59d49ab04ded
use https for wikipedia api endpoint
ymh <ymh.work@gmail.com>
parents:
649
diff
changeset
|
89 |
default="https://fr.wikipedia.org/w/api.php", |
| 111 | 90 |
help='the url for the wikipedia site'), |
91 |
make_option('--limit', |
|
92 |
action='store', |
|
93 |
type='int', |
|
94 |
dest='limit', |
|
95 |
default= -1, |
|
96 |
help='number of tag to process'), |
|
97 |
make_option('--start', |
|
98 |
action='store', |
|
99 |
type='int', |
|
100 |
dest='start', |
|
101 |
default=0, |
|
102 |
help='number of tag to ignore'), |
|
103 |
make_option('--type', |
|
104 |
action='append', |
|
105 |
dest='types', |
|
106 |
type='choice', |
|
107 |
choices=['visible','hidden', 'infobox', 'all'], |
|
108 |
default=[], |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
109 |
help='what type of query to perform : visible : visible categories, hidden : hidden categories, infobox: infoboxes, all: all of them. This option can be assed multiple times'), |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
110 |
make_option('--use-label', |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
111 |
action='store_true', |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
112 |
dest='use_label', |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
113 |
default=False, |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
114 |
help='use label instead of pageid to query wikipedia'), |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
115 |
make_option('--tag', |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
116 |
action='append', |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
117 |
dest='tags', |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
118 |
type='string', |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
119 |
default=[], |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
120 |
help='the tag to query'), |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
121 |
|
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
122 |
) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
123 |
|
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
124 |
|
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
125 |
# def process_wp_response(self, label, response): |
| 111 | 126 |
# |
127 |
# query_dict = response['query'] |
|
128 |
# # get page if multiple pages or none -> return Tag.null_result |
|
129 |
# pages = query_dict.get("pages", {}) |
|
130 |
# if len(pages) > 1 or len(pages) == 0: |
|
131 |
# return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
132 |
# |
| 111 | 133 |
# page = pages.values()[0] |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
134 |
# |
| 111 | 135 |
# if u"invalid" in page or u"missing" in page: |
136 |
# return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None |
|
137 |
# |
|
138 |
# url = page.get(u'fullurl', None) |
|
139 |
# pageid = page.get(u'pageid', None) |
|
140 |
# new_label = page[u'title'] |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
141 |
# |
| 111 | 142 |
# if self.__is_homonymie(page): |
143 |
# status = Tag.TAG_URL_STATUS_DICT["homonyme"] |
|
144 |
# elif u"redirect" in page: |
|
145 |
# status = Tag.TAG_URL_STATUS_DICT["redirection"] |
|
146 |
# else: |
|
147 |
# status = Tag.TAG_URL_STATUS_DICT["match"] |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
148 |
# |
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
149 |
# return new_label, status, url, pageid |
| 111 | 150 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
151 |
def query_all_categories(self, hidden, site, pageid, use_label): |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
152 |
|
| 111 | 153 |
clshow = 'hidden' if hidden else '!hidden' |
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
154 |
params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'categories', 'clshow': clshow} |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
155 |
|
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
156 |
clcontinue = "" |
| 111 | 157 |
res = [] |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
158 |
|
| 111 | 159 |
while clcontinue is not None: |
160 |
if clcontinue: |
|
161 |
params['clcontinue'] = clcontinue |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
162 |
|
| 111 | 163 |
wpquery = api.APIRequest(site, params) #@UndefinedVariable |
164 |
response = wpquery.query() |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
165 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
166 |
if self.verbosity > 1: |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
167 |
print "Query infoboxes : " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data()) |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
168 |
print repr(response) |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
169 |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
170 |
|
| 111 | 171 |
query_dict = response.get('query', None) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
172 |
|
| 111 | 173 |
if query_dict is None: |
174 |
return res |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
175 |
|
| 111 | 176 |
pages = query_dict.get("pages", {}) |
177 |
if len(pages) > 1 or len(pages) == 0: |
|
178 |
return res |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
179 |
|
| 111 | 180 |
page = pages.values()[0] |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
181 |
|
| 111 | 182 |
for cat in page.get('categories',[]): |
183 |
title = cat.get('title',"") |
|
184 |
title = title[title.find(":")+1:] |
|
185 |
if title and clcontinue != ("%s|%s" % (pageid,title)): |
|
186 |
res.append(title) |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
187 |
|
| 111 | 188 |
clcontinue = response.get('query-continue', {}).get('categories',{}).get('clcontinue', None) |
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
189 |
|
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
190 |
if self.verbosity > 1: |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
191 |
print "Query infoboxes RES: " |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
192 |
print repr(res) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
193 |
|
| 111 | 194 |
return res |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
195 |
|
| 111 | 196 |
def process_categories(self, cat_list, hidden, tag): |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
197 |
|
| 111 | 198 |
for cat in cat_list: |
199 |
wp_cat,created = WpCategory.objects.get_or_create(label=cat) #@UnusedVariable |
|
200 |
TagWpCategory.objects.get_or_create(tag=tag, wp_category=wp_cat, hidden=hidden) |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
201 |
|
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
202 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
203 |
def query_infoboxes(self, site, pageid, use_label): |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
204 |
|
| 111 | 205 |
res = [] |
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
206 |
params = {'action':'query', 'titles' if use_label else 'pageids': pageid, 'prop':'revisions', 'rvprop': 'ids|content'} |
| 111 | 207 |
wpquery = api.APIRequest(site, params) #@UndefinedVariable |
208 |
response = wpquery.query() |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
209 |
|
| 111 | 210 |
query_dict = response.get('query', None) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
211 |
|
| 111 | 212 |
if query_dict is None: |
213 |
return res |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
214 |
|
| 111 | 215 |
pages = query_dict.get("pages", {}) |
216 |
if len(pages) > 1 or len(pages) == 0: |
|
217 |
return res |
|
218 |
||
219 |
page = pages.values()[0] |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
220 |
|
| 111 | 221 |
if 'revisions' not in page or not page['revisions']: |
222 |
return res |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
223 |
|
| 111 | 224 |
rev = page['revisions'][0] |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
225 |
|
| 111 | 226 |
content = rev['*'] |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
227 |
|
| 111 | 228 |
start = 0 |
229 |
depth = 0 |
|
230 |
current_infobox_name = None |
|
231 |
current_start = 0 |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
232 |
|
| 111 | 233 |
while start <= len(content): |
234 |
if depth==0: |
|
235 |
resm = START_PATTERN.search(content[start:]) |
|
236 |
if resm is None: |
|
237 |
break |
|
238 |
depth = 1 |
|
239 |
current_start = resm.start()+start |
|
240 |
start += resm.end()+1 |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
241 |
current_infobox_name = resm.group(1) |
| 111 | 242 |
else: |
243 |
resm = END_PATTERN.search(content[start:]) |
|
244 |
if resm is None: |
|
245 |
break |
|
246 |
if resm.group(0) == "{{": |
|
247 |
depth += 1 |
|
248 |
elif resm.group(0) == "}}": |
|
249 |
depth -= 1 |
|
250 |
if depth == 0: |
|
251 |
res.append((content[current_start:resm.end()+start], current_infobox_name)) |
|
252 |
start += resm.end()+1 |
|
253 |
||
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
254 |
return_val = (rev['revid'],res) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
255 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
256 |
if self.verbosity > 1: |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
257 |
print "Query infoboxes url: " + repr(wpquery.request.get_full_url()+"?"+wpquery.request.get_data()) |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
258 |
print repr(return_val) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
259 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
260 |
return return_val |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
261 |
|
| 111 | 262 |
def split_infoboxes(self, src): |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
263 |
|
| 111 | 264 |
start = 0 |
265 |
previous_end = 0 |
|
266 |
split_indexes = [] |
|
267 |
delimiter_stack = [] |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
268 |
while start<=len(src): |
| 111 | 269 |
resd = DELIMITER_PATTERN.search(src[start:]) |
270 |
ress = SPLIT_PATTERN.search(src[start:]) if len(delimiter_stack) == 0 else None |
|
271 |
startd = resd.start() if resd is not None else sys.maxint |
|
272 |
starts = ress.start() if ress is not None else sys.maxint |
|
273 |
if starts < startd: |
|
274 |
if len(split_indexes)>0: |
|
275 |
split_indexes.append((previous_end, ress.start(0)+start)) |
|
276 |
split_indexes.append((ress.start(1)+start, ress.end(1)+start)) |
|
277 |
start += ress.end(0) |
|
278 |
previous_end = start |
|
279 |
elif startd < sys.maxint: |
|
280 |
if resd.group().startswith("{") or resd.group().startswith("[") : |
|
281 |
delimiter_stack.append(resd.group()) |
|
282 |
elif len(delimiter_stack)>0 and ( (delimiter_stack[-1].startswith('{') and resd.group()[0] == '}') or (delimiter_stack[-1].startswith('[') and resd.group()[0] == ']') ) and len(delimiter_stack[-1]) == len(resd.group()): |
|
283 |
delimiter_stack.pop() |
|
284 |
start += resd.end() |
|
285 |
else: |
|
286 |
break |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
287 |
|
| 111 | 288 |
if previous_end > 0: |
289 |
split_indexes.append((previous_end,len(src))) |
|
290 |
res = [src[start:end] for start,end in split_indexes] |
|
291 |
return res |
|
292 |
||
293 |
||
294 |
||
295 |
def process_infoboxes(self, infobox_defs, tag): |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
296 |
|
| 111 | 297 |
if not infobox_defs: |
298 |
return |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
299 |
|
| 111 | 300 |
revision_id = infobox_defs[0] |
301 |
for infobox in infobox_defs[1]: |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
302 |
src = infobox[0].strip(' \t\n\r') |
| 111 | 303 |
name = infobox[1] |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
304 |
if name and len(name) > 2048: |
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
305 |
name = name[0:2048] |
| 111 | 306 |
tag_infobox, created = TagInfobox.objects.get_or_create(tag=tag, name=name, revision_id = revision_id, defaults={'source': src}) |
307 |
if not created: |
|
308 |
tag_infobox.source = src |
|
309 |
tag_infobox.save() |
|
310 |
||
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
311 |
src = COMMENT_PATTERN.sub('',src) |
| 111 | 312 |
src = START_PATTERN.sub('',src[:-2]).strip() |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
313 |
|
| 111 | 314 |
keyvalues = self.split_infoboxes(src) |
315 |
||
316 |
for key,value in itertools.izip(*[itertools.islice(keyvalues, i, None, 2) for i in range(2)]): |
|
317 |
param, created = InfoboxParameter.objects.get_or_create(tag_infobox=tag_infobox, param_name=key.strip(), defaults={'param_value':value.strip()}) |
|
318 |
if not created: |
|
319 |
param.param_value = value.strip() |
|
320 |
param.save() |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
321 |
|
| 111 | 322 |
def handle_noargs(self, **options): |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
323 |
|
| 111 | 324 |
self.style = no_style() |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
325 |
|
| 111 | 326 |
interactive = options.get('interactive', True) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
327 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
328 |
self.verbosity = int(options.get('verbosity', '1')) |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
329 |
use_label = options.get('use_label', False) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
330 |
|
| 111 | 331 |
force = options.get('force', False) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
332 |
|
| 111 | 333 |
limit = options.get("limit", -1) |
334 |
start = options.get("start", 0) |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
335 |
|
| 111 | 336 |
site_url = options.get('site_url', settings.WIKIPEDIA_API_URL) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
337 |
|
| 111 | 338 |
random = options.get('random', False) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
339 |
|
| 111 | 340 |
types_mask = 0 |
341 |
types_list = options.get('types', []) |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
342 |
|
| 111 | 343 |
if len(types_list) == 0: |
344 |
types_mask = TYPES_MASK_DICT['all'] |
|
345 |
else: |
|
346 |
for t in types_list: |
|
347 |
types_mask |= TYPES_MASK_DICT[t] |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
348 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
349 |
if self.verbosity > 1 : |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
350 |
print "types mask %s " % (bin(types_mask)) |
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
351 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
352 |
if self.verbosity > 2: |
| 111 | 353 |
print "option passed : " + repr(options) |
354 |
||
355 |
||
356 |
queryset = Tag.objects.exclude(wikipedia_pageid= None) |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
357 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
358 |
tag_list = options.get("tags", []); |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
359 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
360 |
if tag_list: |
|
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
361 |
queryset = queryset.filter(label__in=tag_list) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
362 |
elif not options.get('all',False): |
| 111 | 363 |
queryset = queryset.annotate(wpc=Count('wp_categories')).filter(wpc = 0) |
364 |
#else: |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
365 |
# queryset = Tag.objects.filter(url_status=None) |
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
366 |
|
| 111 | 367 |
if random: |
368 |
queryset = queryset.order_by("?") |
|
369 |
else: |
|
370 |
queryset = queryset.order_by("label") |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
371 |
|
| 111 | 372 |
if limit >= 0: |
373 |
queryset = queryset[start:limit] |
|
374 |
elif start > 0: |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
375 |
queryset = queryset[start:] |
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
376 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
377 |
if self.verbosity > 2 : |
| 111 | 378 |
print "Tag Query is %s" % (queryset.query) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
379 |
|
| 111 | 380 |
site = wiki.Wiki(site_url) #@UndefinedVariable |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
381 |
|
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
382 |
|
| 111 | 383 |
count = queryset.count() |
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
384 |
if self.verbosity > 1: |
| 111 | 385 |
print "Processing %d tags" % (count) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
386 |
|
| 111 | 387 |
if not force and interactive: |
388 |
confirm = raw_input("You have requested to query and replace the wikipedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count)) |
|
389 |
else: |
|
390 |
confirm = 'yes' |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
391 |
|
| 111 | 392 |
if confirm != "yes": |
393 |
print "wikipedia query cancelled" |
|
394 |
return |
|
395 |
||
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
396 |
|
|
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
397 |
|
| 111 | 398 |
for i, tag in enumerate(queryset): |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
399 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
400 |
if self.verbosity > 1: |
| 111 | 401 |
print "processing tag %s (%d/%d)" % (tag.label, i + 1, count) |
402 |
else: |
|
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
403 |
utils.show_progress(i + 1, count, tag.label, 60) |
| 111 | 404 |
|
405 |
# query categories |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
406 |
wikipedia_pageid = tag.label if use_label else tag.wikipedia_pageid |
| 111 | 407 |
if tag.url_status == Tag.TAG_URL_STATUS_DICT['redirection'] and tag.alternative_wikipedia_pageid is not None : |
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
408 |
wikipedia_pageid = tag.alternative_label if use_label else tag.alternative_wikipedia_pageid |
| 111 | 409 |
|
|
571
d9642be7c937
replace commit_on_success with atomic
ymh <ymh.work@gmail.com>
parents:
266
diff
changeset
|
410 |
with transaction.atomic(): |
| 111 | 411 |
if types_mask & TYPES_MASK_DICT['visible']: |
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
412 |
res = self.query_all_categories(False, site, wikipedia_pageid, use_label) |
| 111 | 413 |
self.process_categories(res, False, tag) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
414 |
|
| 111 | 415 |
if types_mask & TYPES_MASK_DICT['hidden']: |
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
416 |
res = self.query_all_categories(True, site, wikipedia_pageid, use_label) |
| 111 | 417 |
self.process_categories(res, True, tag) |
|
649
434737bd64e5
rdf import correction + new version
ymh <ymh.work@gmail.com>
parents:
571
diff
changeset
|
418 |
|
| 111 | 419 |
if types_mask & TYPES_MASK_DICT['infobox']: |
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
420 |
res = self.query_infoboxes(site, wikipedia_pageid, use_label) |
| 111 | 421 |
self.process_infoboxes(res, tag) |