| author | rougeronj |
| Fri, 13 Mar 2015 15:11:09 +0100 | |
| changeset 540 | dcea08e78780 |
| parent 443 | 27f71b0a772d |
| permissions | -rw-r--r-- |
| 24 | 1 |
# -*- coding: utf-8 -*- |
| 443 | 2 |
import codecs |
|
11
143ab88d17f8
add ordered manytomany fields and indexing
ymh <ymh.work@gmail.com>
parents:
0
diff
changeset
|
3 |
import collections |
| 443 | 4 |
import math |
5 |
import re |
|
6 |
import sys |
|
| 56 | 7 |
import unicodedata |
| 443 | 8 |
|
9 |
import unidecode |
|
10 |
||
| 0 | 11 |
|
12 |
### |
|
13 |
# Allows a property to be declared with a decorator
|
14 |
### |
|
15 |
def Property(func):
    """Build a ``property`` from a function that returns its accessors.

    ``func`` is called once, with no arguments, and must return a mapping
    that may contain the keys ``fget``, ``fset``, ``fdel`` and ``doc``
    (typically produced with ``return locals()``).

    Only those four keys are forwarded to ``property``; any other entry
    (for instance a helper defined next to the accessors and therefore
    present in ``locals()``) is ignored instead of making ``property``
    raise ``TypeError``.
    """
    accepted = ('fget', 'fset', 'fdel', 'doc')
    accessors = func()
    return property(**dict(
        (name, value) for name, value in accessors.items()
        if name in accepted))
|
143ab88d17f8
add ordered manytomany fields and indexing
ymh <ymh.work@gmail.com>
parents:
0
diff
changeset
|
17 |
|
|
143ab88d17f8
add ordered manytomany fields and indexing
ymh <ymh.work@gmail.com>
parents:
0
diff
changeset
|
18 |
|
|
143ab88d17f8
add ordered manytomany fields and indexing
ymh <ymh.work@gmail.com>
parents:
0
diff
changeset
|
19 |
## {{{ http://code.activestate.com/recipes/576694/ (r7) |
|
143ab88d17f8
add ordered manytomany fields and indexing
ymh <ymh.work@gmail.com>
parents:
0
diff
changeset
|
20 |
|
|
143ab88d17f8
add ordered manytomany fields and indexing
ymh <ymh.work@gmail.com>
parents:
0
diff
changeset
|
21 |
try:
    # Python 3.3+: the ABCs live in collections.abc (and are no longer
    # reachable as collections.MutableSet since Python 3.10).
    from collections.abc import MutableSet as _MutableSet
except ImportError:
    # Python 2
    from collections import MutableSet as _MutableSet

# Indexes of the fields of a linked-list node: [key, prev, next]
KEY, PREV, NEXT = range(3)


class OrderedSet(_MutableSet):
    """Set that remembers the insertion order of its elements.

    Elements are stored in ``self.map`` (key --> node) and chained in a
    circular doubly linked list whose sentinel node is ``self.end``.
    Each node is a three-item list ``[key, prev, next]``.
    """

    def __init__(self, iterable=None):
        """Create the set, optionally filled from ``iterable`` in order."""
        self.end = end = []
        end += [None, end, end]         # sentinel node for doubly linked list
        self.map = {}                   # key --> [key, prev, next]
        if iterable is not None:
            self |= iterable

    def __len__(self):
        return len(self.map)

    def __contains__(self, key):
        return key in self.map

    def add(self, key):
        """Append ``key`` at the end of the set if it is not already present."""
        if key not in self.map:
            end = self.end
            curr = end[PREV]
            curr[NEXT] = end[PREV] = self.map[key] = [key, curr, end]

    def discard(self, key):
        """Remove ``key`` if present; do nothing otherwise."""
        if key in self.map:
            # Unlink the node by connecting its neighbours to each other.
            _, before, after = self.map.pop(key)
            before[NEXT] = after
            after[PREV] = before

    def __iter__(self):
        """Yield the elements in insertion order."""
        end = self.end
        curr = end[NEXT]
        while curr is not end:
            yield curr[KEY]
            curr = curr[NEXT]

    def __reversed__(self):
        """Yield the elements in reverse insertion order."""
        end = self.end
        curr = end[PREV]
        while curr is not end:
            yield curr[KEY]
            curr = curr[PREV]

    def pop(self, last=True):
        """Remove and return the last element (the first if ``last`` is false).

        Raises ``KeyError`` if the set is empty.
        """
        if not self:
            raise KeyError('set is empty')
        key = next(reversed(self)) if last else next(iter(self))
        self.discard(key)
        return key

    def __repr__(self):
        if not self:
            return '%s()' % (self.__class__.__name__,)
        return '%s(%r)' % (self.__class__.__name__, list(self))

    def __eq__(self, other):
        """Compare order-sensitively with an OrderedSet, as a plain set otherwise.

        Operands that cannot be turned into a set (non-iterables, or
        iterables of unhashable items) compare unequal instead of raising.
        """
        if isinstance(other, OrderedSet):
            return len(self) == len(other) and list(self) == list(other)
        try:
            return set(self) == set(other)
        except TypeError:
            return NotImplemented

    def __del__(self):
        self.clear()                    # remove circular references
|
143ab88d17f8
add ordered manytomany fields and indexing
ymh <ymh.work@gmail.com>
parents:
0
diff
changeset
|
83 |
|
|
143ab88d17f8
add ordered manytomany fields and indexing
ymh <ymh.work@gmail.com>
parents:
0
diff
changeset
|
84 |
|
|
143ab88d17f8
add ordered manytomany fields and indexing
ymh <ymh.work@gmail.com>
parents:
0
diff
changeset
|
85 |
## end of http://code.activestate.com/recipes/576694/ }}} |
| 24 | 86 |
|
| 46 | 87 |
|
88 |
## {{{ http://code.activestate.com/recipes/576693/ (r9) |
|
89 |
# Backport of OrderedDict() class that runs on Python 2.4, 2.5, 2.6, 2.7 and pypy. |
|
90 |
# Passes Python2.7's test suite and incorporates all the latest updates. |
|
91 |
||
92 |
try:
    from thread import get_ident as _get_ident                # Python 2
except ImportError:
    try:
        from dummy_thread import get_ident as _get_ident      # Python 2 built without threads
    except ImportError:
        from threading import get_ident as _get_ident         # Python 3

try:
    from _abcoll import KeysView, ValuesView, ItemsView       # Python 2.6 / 2.7
except ImportError:
    try:
        from collections.abc import KeysView, ValuesView, ItemsView   # Python 3
    except ImportError:
        pass    # no view classes available: the view*() methods cannot be used


class OrderedDict(dict):
    'Dictionary that remembers insertion order'
    # An inherited dict maps keys to values.
    # The inherited dict provides __getitem__, __len__, __contains__, and get.
    # The remaining methods are order-aware.
    # Big-O running times for all methods are the same as for regular dictionaries.

    # The internal self.__map dictionary maps keys to links in a doubly linked list.
    # The circular doubly linked list starts and ends with a sentinel element.
    # The sentinel element never gets deleted (this simplifies the algorithm).
    # Each link is stored as a list of length three:  [PREV, NEXT, KEY].

    def __init__(self, *args, **kwds):
        '''Initialize an ordered dictionary.  Signature is the same as for
        regular dictionaries, but keyword arguments are not recommended
        because their insertion order is arbitrary.

        Raises TypeError when more than one positional argument is given.

        '''
        if len(args) > 1:
            raise TypeError('expected at most 1 arguments, got %d' % len(args))
        try:
            # __init__ may be called again on an existing instance: keep its links.
            self.__root
        except AttributeError:
            self.__root = root = []                     # sentinel node
            root[:] = [root, root, None]
            self.__map = {}
        self.__update(*args, **kwds)

    def __setitem__(self, key, value, dict_setitem=dict.__setitem__):
        'od.__setitem__(i, y) <==> od[i]=y'
        # Setting a new item creates a new link which goes at the end of the linked
        # list, and the inherited dictionary is updated with the new key/value pair.
        if key not in self:
            root = self.__root
            last = root[0]
            last[1] = root[0] = self.__map[key] = [last, root, key]
        dict_setitem(self, key, value)

    def __delitem__(self, key, dict_delitem=dict.__delitem__):
        'od.__delitem__(y) <==> del od[y]'
        # Deleting an existing item uses self.__map to find the link which is
        # then removed by updating the links in the predecessor and successor nodes.
        dict_delitem(self, key)
        link_prev, link_next, key = self.__map.pop(key)
        link_prev[1] = link_next
        link_next[0] = link_prev

    def __iter__(self):
        'od.__iter__() <==> iter(od)'
        root = self.__root
        curr = root[1]
        while curr is not root:
            yield curr[2]
            curr = curr[1]

    def __reversed__(self):
        'od.__reversed__() <==> reversed(od)'
        root = self.__root
        curr = root[0]
        while curr is not root:
            yield curr[2]
            curr = curr[0]

    def clear(self):
        'od.clear() -> None.  Remove all items from od.'
        try:
            # values() exists on both Python 2 and 3; itervalues() does not exist
            # on Python 3 and its AttributeError would be swallowed below, leaving
            # the linked list out of sync with the (emptied) dictionary.
            for node in self.__map.values():
                del node[:]
            root = self.__root
            root[:] = [root, root, None]
            self.__map.clear()
        except AttributeError:
            # clear() called before __init__ created the links: nothing to reset.
            pass
        dict.clear(self)

    def popitem(self, last=True):
        '''od.popitem() -> (k, v), return and remove a (key, value) pair.
        Pairs are returned in LIFO order if last is true or FIFO order if false.

        Raises KeyError if the dictionary is empty.

        '''
        if not self:
            raise KeyError('dictionary is empty')
        root = self.__root
        if last:
            link = root[0]
            link_prev = link[0]
            link_prev[1] = root
            root[0] = link_prev
        else:
            link = root[1]
            link_next = link[1]
            root[1] = link_next
            link_next[0] = root
        key = link[2]
        del self.__map[key]
        value = dict.pop(self, key)
        return key, value

    # -- the following methods do not depend on the internal structure --

    def keys(self):
        'od.keys() -> list of keys in od'
        return list(self)

    def values(self):
        'od.values() -> list of values in od'
        return [self[key] for key in self]

    def items(self):
        'od.items() -> list of (key, value) pairs in od'
        return [(key, self[key]) for key in self]

    def iterkeys(self):
        'od.iterkeys() -> an iterator over the keys in od'
        return iter(self)

    def itervalues(self):
        'od.itervalues -> an iterator over the values in od'
        for k in self:
            yield self[k]

    def iteritems(self):
        'od.iteritems -> an iterator over the (key, value) items in od'
        for k in self:
            yield (k, self[k])

    def update(*args, **kwds):  #@NoSelf
        '''od.update(E, **F) -> None.  Update od from dict/iterable E and F.

        If E is a dict instance, does:           for k in E: od[k] = E[k]
        If E has a .keys() method, does:         for k in E.keys(): od[k] = E[k]
        Or if E is an iterable of items, does:   for k, v in E: od[k] = v
        In either case, this is followed by:     for k, v in F.items(): od[k] = v

        '''
        # "self" is taken from *args so that a keyword argument named "self"
        # ends up in kwds instead of clashing with the instance parameter.
        if len(args) > 2:
            raise TypeError('update() takes at most 2 positional '
                            'arguments (%d given)' % (len(args),))
        elif not args:
            raise TypeError('update() takes at least 1 argument (0 given)')
        self = args[0]
        # Make progressively weaker assumptions about "other"
        other = ()
        if len(args) == 2:
            other = args[1]
        if isinstance(other, dict):
            for key in other:
                self[key] = other[key]
        elif hasattr(other, 'keys'):
            for key in other.keys():
                self[key] = other[key]
        else:
            for key, value in other:
                self[key] = value
        for key, value in kwds.items():
            self[key] = value

    __update = update  # let subclasses override update without breaking __init__

    # Sentinel telling "no default given" apart from an explicit None in pop().
    __marker = object()

    def pop(self, key, default=__marker):
        '''od.pop(k[,d]) -> v, remove specified key and return the corresponding value.
        If key is not found, d is returned if given, otherwise KeyError is raised.

        '''
        if key in self:
            result = self[key]
            del self[key]
            return result
        if default is self.__marker:
            raise KeyError(key)
        return default

    def setdefault(self, key, default=None):
        'od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od'
        if key in self:
            return self[key]
        self[key] = default
        return default

    def __repr__(self, _repr_running={}):
        'od.__repr__() <==> repr(od)'
        # _repr_running is deliberately a shared mutable default: it records the
        # (object id, thread id) pairs being rendered so that a dictionary which
        # contains itself is printed as '...' instead of recursing forever.
        call_key = id(self), _get_ident()
        if call_key in _repr_running:
            return '...'
        _repr_running[call_key] = 1
        try:
            if not self:
                return '%s()' % (self.__class__.__name__,)
            return '%s(%r)' % (self.__class__.__name__, self.items())
        finally:
            del _repr_running[call_key]

    def __reduce__(self):
        'Return state information for pickling'
        items = [[k, self[k]] for k in self]
        inst_dict = vars(self).copy()
        # Drop the private linked-list attributes: they are rebuilt from items.
        for k in vars(OrderedDict()):
            inst_dict.pop(k, None)
        if inst_dict:
            return (self.__class__, (items,), inst_dict)
        return self.__class__, (items,)

    def copy(self):
        'od.copy() -> a shallow copy of od'
        return self.__class__(self)

    @classmethod
    def fromkeys(cls, iterable, value=None):
        '''OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S
        and values equal to v (which defaults to None).

        '''
        d = cls()
        for key in iterable:
            d[key] = value
        return d

    def __eq__(self, other):
        '''od.__eq__(y) <==> od==y.  Comparison to another OD is order-sensitive
        while comparison to a regular mapping is order-insensitive.

        '''
        if isinstance(other, OrderedDict):
            return len(self) == len(other) and self.items() == other.items()
        return dict.__eq__(self, other)

    def __ne__(self, other):
        return not self == other

    # -- the following methods rely on the KeysView/ValuesView/ItemsView imports --

    def viewkeys(self):
        "od.viewkeys() -> a set-like object providing a view on od's keys"
        return KeysView(self)

    def viewvalues(self):
        "od.viewvalues() -> an object providing a view on od's values"
        return ValuesView(self)

    def viewitems(self):
        "od.viewitems() -> a set-like object providing a view on od's items"
        return ItemsView(self)
|
347 |
## end of http://code.activestate.com/recipes/576693/ }}} |
|
348 |
||
| 443 | 349 |
def remove_accents(lne):
    """Return ``lne`` as unicode text with its combining marks removed.

    The value is converted to text, decomposed with the NFKD normalization
    form, and every character reported as combining by ``unicodedata`` is
    dropped (so an accented letter keeps only its base letter).
    """
    try:
        text_type = unicode     # Python 2
    except NameError:
        text_type = str         # Python 3: str is already unicode text
    decomposed = unicodedata.normalize('NFKD', text_type(lne))
    return u"".join(c for c in decomposed if not unicodedata.combining(c))
| 72 | 352 |
|
| 443 | 353 |
def normalize(lne):
    """Return ``lne`` without accents, lower-cased, with the oe ligature expanded."""
    unaccented = remove_accents(lne)
    lowered = unaccented.lower()
    return lowered.replace(u"œ", u"oe")
|
355 |
||
356 |
def sanitize(line, separator='-', ascii_only=True):
    """Turn ``line`` into a lower-case, separator-delimited slug.

    The text is transliterated with ``unidecode``, lower-cased, stripped of
    every character that is not ``a-z``, ``0-9``, whitespace or a separator
    character, and every run of whitespace/separator characters is then
    collapsed into a single ``separator``. Leading and trailing separator
    characters are removed.

    ``line``       text to sanitize; a falsy value yields ``''``.
    ``separator``  string inserted between words (``'-'`` by default).
    ``ascii_only`` accepted for compatibility but not used by the
                   implementation: the text is always transliterated.
    """
    if not line:
        return ''

    # Escape the separator so that it is always taken literally inside the
    # character classes (a bare backslash prefix would turn 'd', 's', 'w'...
    # into regex classes and mishandle multi-character separators).
    escaped = re.escape(separator)

    # Transliterate non-ASCII characters
    line = unidecode.unidecode(line)
    # Remove all characters that are not the separator, a-z, 0-9, or whitespace
    line = re.sub(r'[^%sa-z0-9\s]+' % escaped, '', line.lower())
    # Replace all separator characters and whitespace by a single separator
    # (a callable replacement keeps backslashes in the separator literal)
    line = re.sub(r'[%s\s]+' % escaped, lambda _match: separator, line)

    return line.strip(separator)
|
369 |
||
|
111
ceb381f5b0c7
query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
74
diff
changeset
|
370 |
|
|
113
0d2bfd84b989
improve cat and infobox extraction + export csv
ymh <ymh.work@gmail.com>
parents:
111
diff
changeset
|
371 |
def show_progress(current_line, total_line, label, width, writer=None):
    """Write a one-line progress bar and return the writer that was used.

    The line looks like ``[=====     ]  50%  5/10 - label`` and ends with a
    carriage return so that the next call overwrites it; a newline is
    written once 100% is reached.

    ``current_line``  number of items processed so far.
    ``total_line``    total number of items; 0 is reported as 100%.
    ``label``         text displayed after the counters, cut to ``width``.
    ``width``         number of characters inside the bar (and label width).
    ``writer``        object with ``write`` and ``flush``; when ``None`` the
                      standard output is used. Pass the returned writer back
                      in on later calls to reuse it.
    """
    if writer is None:
        writer = sys.stdout
        # Python 2's stdout is a byte stream: wrap it so that unicode text is
        # encoded with the terminal encoding. Python 3's stdout already takes
        # text, and a codecs writer would hand it bytes and fail.
        if sys.version_info[0] < 3 and sys.stdout.encoding is not None:
            writer = codecs.getwriter(sys.stdout.encoding)(sys.stdout)

    if total_line:
        percent = (float(current_line) / float(total_line)) * 100.0
    else:
        # Nothing to process: report completion instead of dividing by zero.
        percent = 100.0

    # Clamp the number of marks so the bar keeps a constant width even when
    # current_line overshoots total_line.
    marks = min(max(math.floor(width * (percent / 100.0)), 0), width)
    spaces = math.floor(width - marks)

    loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']'

    s = u"%s %3d%% %*d/%d - %*s\r" % (loader, percent, len(str(total_line)), current_line, total_line, width, label[:width])

    writer.write(s)
    if percent >= 100:
        writer.write(u"\n")
    writer.flush()

    return writer
|
111
ceb381f5b0c7
query wp for categories and infoboxes
ymh <ymh.work@gmail.com>
parents:
74
diff
changeset
|
393 |