|
1 #!/usr/bin/env python |
|
2 # coding=utf-8 |
|
3 |
|
4 from lxml import etree |
|
5 from iri_tweet.models import setup_database, Tweet, User |
|
6 from sqlalchemy.sql import select, func |
|
7 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, |
|
8 get_logger) |
|
9 import argparse |
|
10 import json |
|
11 import datetime |
|
12 import requests |
|
13 import os.path |
|
14 import re |
|
15 import sys |
|
16 import time |
|
17 import uuid #@UnresolvedImport |
|
18 from dateutil.parser import parse as parse_date |
|
19 import bisect |
|
20 |
|
21 #class TweetExclude(object): |
|
22 # def __init__(self, id): |
|
23 # self.id = id |
|
24 # |
|
25 # def __repr__(self): |
|
26 # return "<TweetExclude(id=%d)>" % (self.id) |
|
27 |
|
# REST endpoints of the LDT platform, relative to the --base-url option.
LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
# Channel used to filter annotations when no --channel option is given.
DEFAULT_ANNOTATION_CHANNEL = 'ANNOT'
|
31 |
|
32 |
|
def get_filter(start_date, end_date, events, channels, user_whitelist):
    """Build the filter list for the annotation-server query.

    Each filter is a dict of the form {'name': ..., 'op': ..., 'val': ...}
    (restless-style). Falsy arguments are simply skipped.

    :param start_date: datetime lower bound on 'ts' (inclusive) or None
    :param end_date: datetime upper bound on 'ts' (inclusive) or None
    :param events: list of event names, filtered with 'in'
    :param channels: list of channel names, filtered with 'in'
    :param user_whitelist: list of user screen names, filtered with 'in'
    :return: list of filter dicts (possibly empty)
    """
    res = []
    #TODO: check timezone...
    if start_date:
        res.append({'name': 'ts', 'op': ">=", 'val': start_date.isoformat()})
    if end_date:
        res.append({'name': 'ts', 'op': "<=", 'val': end_date.isoformat()})
    if events:
        res.append({'name': 'event', 'op': "in", 'val': events})
    if channels:
        res.append({'name': 'channel', 'op': "in", 'val': channels})
    if user_whitelist:
        res.append({'name': 'user', 'op': "in", 'val': user_whitelist})
    # BUGFIX: the filter list was built but never returned (function
    # implicitly returned None, so callers got no filters at all).
    return res
|
46 |
|
47 # def parse_polemics(tw, extended_mode): |
|
48 # """ |
|
49 # parse polemics in text and return a list of polemic code. None if not polemic found |
|
50 # """ |
|
51 # polemics = {} |
|
52 # for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text): |
|
53 # pol_link = { |
|
54 # '++' : u'OK', |
|
55 # '--' : u'KO', |
|
56 # '??' : u'Q', |
|
57 # '==' : u'REF'}[m.group(1)] |
|
58 # polemics[pol_link] = pol_link |
|
59 # |
|
60 # if extended_mode: |
|
61 # if "?" in tw.text: |
|
62 # polemics["Q"] = "Q" |
|
63 # |
|
64 # for entity in tw.entity_list: |
|
65 # if entity.type == "entity_url": |
|
66 # polemics["REF"] = "REF" |
|
67 # |
|
68 # if len(polemics) > 0: |
|
69 # return polemics.keys() |
|
70 # else: |
|
71 # return None |
|
72 |
|
def get_options():
    """Parse the command line.

    :return: tuple (options namespace, parser) -- the parser is returned so
        the caller can print help on invalid invocations.
    """
    usage = "usage: %(prog)s [options]"

    # BUGFIX: the usage string was passed positionally, which set the
    # parser's 'prog' name instead of its usage text.
    parser = argparse.ArgumentParser(usage=usage)

    parser.add_argument("-f", "--file", dest="filename",
                        help="write export to file", metavar="FILE", default="project.ldt")
    parser.add_argument("-a", "--annot-url", dest="annot_url",
                        help="annotation server url", metavar="ANNOT-URL", required=True)
    parser.add_argument("-s", "--start-date", dest="start_date",
                        help="start date", metavar="START_DATE", default=None)
    parser.add_argument("-e", "--end-date", dest="end_date",
                        help="end date", metavar="END_DATE", default=None)
    parser.add_argument("-I", "--content-file", dest="content_file",
                        help="Content file", metavar="CONTENT_FILE")
    parser.add_argument("-c", "--content", dest="content",
                        help="Content url", metavar="CONTENT")
    parser.add_argument("-V", "--video-url", dest="video",
                        help="video url", metavar="VIDEO")
    parser.add_argument("-i", "--content-id", dest="content_id",
                        help="Content id", metavar="CONTENT_ID")
    parser.add_argument("-x", "--exclude", dest="exclude",
                        help="file containing the id to exclude", metavar="EXCLUDE")
    parser.add_argument("-C", "--color", dest="color",
                        help="Color code", metavar="COLOR", default="16763904")
    parser.add_argument("-H", "--channel", dest="channels",
                        help="Channel", metavar="CHANNEL", default=[DEFAULT_ANNOTATION_CHANNEL], action="append")
    parser.add_argument("-E", "--event", dest="events",
                        help="Event", metavar="EVENT", default=[], action="append")
    parser.add_argument("-D", "--duration", dest="duration", type=int,
                        help="Duration", metavar="DURATION", default=None)
    parser.add_argument("-n", "--name", dest="name",
                        help="Cutting name", metavar="NAME", default=u"annotations")
    parser.add_argument("-R", "--replace", dest="replace", action="store_true",
                        help="Replace annotation ensemble", default=False)
    parser.add_argument("-m", "--merge", dest="merge", action="store_true",
                        help="merge annotation ensemble, choose the first ensemble", default=False)
    parser.add_argument("-L", "--list-conf", dest="listconf",
                        help="list of file to process", metavar="LIST_CONF", default=None)
    parser.add_argument("-b", "--base-url", dest="base_url",
                        help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
    parser.add_argument("-p", "--project", dest="project_id",
                        help="Project id", metavar="PROJECT_ID", default=None)
    parser.add_argument("-P", "--post-param", dest="post_param",
                        help="Post param", metavar="POST_PARAM", default=None)
    parser.add_argument("--user-whitelist", dest="user_whitelist", action="store",
                        help="A list of user screen name", metavar="USER_WHITELIST", default=None)
    parser.add_argument("--cut", dest="cuts", action="append",
                        help="A cut with the forma <ts in ms>::<duration>", metavar="CUT", default=[])
    # BUGFIX: options.batch_size is read when querying the annotation server
    # (results_per_page) but was never declared, causing an AttributeError.
    parser.add_argument("--batch-size", dest="batch_size", type=int,
                        help="Number of annotations fetched per API page", metavar="BATCH_SIZE", default=100)

    set_logging_options(parser)

    return (parser.parse_args(), parser)
|
127 |
|
128 |
|
def find_delta(deltas, ts):
    """Return the (timestamp_ms, delta_ms) entry applicable at time *ts*.

    *deltas* must be sorted ascending; the entry with the greatest timestamp
    less than or equal to *ts* wins. Falls back to (0, 0) when *ts* precedes
    every entry (or the list is empty).
    """
    idx = bisect.bisect_right(deltas, (ts + 1, 0))
    return deltas[idx - 1] if idx else (0, 0)
|
134 |
|
135 |
|
def parse_duration(s):
    """Parse a duration string.

    A plain integer is returned unchanged; a clock-style string
    ("HH:MM" or "HH:MM:SS") is converted to milliseconds.

    :raises ValueError: when *s* is neither an integer nor a clock string.
    """
    try:
        return int(s)
    except ValueError:
        pass
    fields = s.split(":")
    if len(fields) < 2:
        raise ValueError("Bad duration format")
    hours = int(fields[0])
    minutes = int(fields[1])
    seconds = int(fields[2]) if len(fields) > 2 else 0
    delta = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
    return int(delta.total_seconds() * 1000)
|
149 |
|
def build_annotation_iterator(url, params, headers):
    """Yield annotation items from the paginated REST endpoint at *url*.

    Fetches page after page (the server reports 'total_pages' in each
    response) and yields every entry of each page's 'results' list.
    Stops silently on the first non-200 response.

    :param url: annotation server endpoint
    :param params: query parameters dict (mutated: 'page' is set per request)
    :param headers: HTTP headers dict
    """
    page = 0
    page_nb = 1
    while page < page_nb:
        page += 1
        params['page'] = page
        resp = requests.get(url, params=params, headers=headers)
        # BUGFIX: requests.Response has no 'code' attribute; the status
        # check always raised AttributeError. Use status_code.
        if resp.status_code != 200:
            return
        resp_json = resp.json()
        page_nb = resp_json.get('total_pages', 1)
        for item in resp_json.get('results', []):
            #TODO: add progress log
            yield item
|
164 |
|
165 |
|
if __name__ == "__main__":

    (options, parser) = get_options()

    set_logging(options)

    get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable

    # Build the cuts table: entries are (timestamp_ms, delta_ms), sorted.
    # A delta of -1 marks the inside of a cut (annotations there are dropped);
    # a positive delta is the cumulated time to subtract after the cut.
    deltas = [(0, 0)]
    total_delta = 0
    if options.cuts:
        cuts_raw = sorted([tuple([parse_duration(s) for s in c.split("::")]) for c in options.cuts])
        for c, d in cuts_raw:
            deltas.append((c + total_delta, -1))
            total_delta += d
            deltas.append((c + total_delta, total_delta))

    # NOTE(review): options.database is not declared in get_options();
    # presumably set_logging_options() adds it -- confirm.
    if len(sys.argv) == 1 or options.database is None:
        parser.print_help()
        sys.exit(1)

    user_whitelist_file = options.user_whitelist
    user_whitelist = None

    annotation_url = options.annot_url

    if options.listconf:
        # One parameter set per <file> node of the list configuration.
        parameters = []
        confdoc = etree.parse(options.listconf)
        for node in confdoc.xpath("/twitter_export/file"):
            params = {}
            for snode in node:
                if snode.tag == "path":
                    params['content_file'] = snode.text
                    params['content_file_write'] = snode.text
                elif snode.tag == "project_id":
                    params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
                    params['content_file_write'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
                    params['project_id'] = snode.text
                elif snode.tag == "start_date":
                    params['start_date'] = snode.text
                elif snode.tag == "end_date":
                    params['end_date'] = snode.text
                elif snode.tag == "duration":
                    params['duration'] = int(snode.text)
                elif snode.tag == "events":
                    params['events'] = [snode.text]
                elif snode.tag == "channels":
                    params['channels'] = [snode.text]
            # Command-line events/channels take precedence over the conf file.
            if options.events or 'events' not in params:
                params['events'] = options.events
            if options.channels or 'channels' not in params:
                params['channels'] = options.channels

            parameters.append(params)
    else:
        if options.project_id:
            content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
        else:
            content_file = options.content_file
        parameters = [{
            'start_date': options.start_date,
            'end_date': options.end_date,
            'duration': options.duration,
            'events': options.events,
            'channels': options.channels,
            'content_file': content_file,
            'content_file_write': content_file,
            'project_id': options.project_id
        }]
    post_param = {}
    if options.post_param:
        post_param = json.loads(options.post_param)

    for params in parameters:

        get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable

        start_date_str = params.get("start_date", None)
        end_date_str = params.get("end_date", None)
        duration = params.get("duration", None)
        content_file = params.get("content_file", None)
        content_file_write = params.get("content_file_write", None)
        channels = params.get('channels', [DEFAULT_ANNOTATION_CHANNEL])
        events = params.get('events', [])

        if user_whitelist_file:
            with open(user_whitelist_file, 'r+') as f:
                user_whitelist = list(set([s.strip() for s in f]))

        start_date = None
        ts = None
        if start_date_str:
            start_date = parse_date(start_date_str)
            ts = time.mktime(start_date.timetuple())

        root = None
        ensemble_parent = None
        # BUGFIX: display_content_node was unbound (NameError) whenever a
        # fresh <iri> document was built below; initialize it up front.
        display_content_node = None

        #to do : analyse situation ldt or iri ? filename set or not ?

        if content_file and content_file.find("http") == 0:

            get_logger().debug("url : " + content_file) #@UndefinedVariable

            r = requests.get(content_file, params=post_param)
            get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
            project = r.json()
            # Strip an eventual XML declaration before parsing the 'ldt' payload.
            text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I | re.S)
            root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])

        elif content_file and os.path.exists(content_file):

            doc = etree.parse(content_file)
            root = doc.getroot()

        content_id = None

        if root is None:
            # No source document at all: build a minimal <iri> project.
            root = etree.Element(u"iri")

            project = etree.SubElement(root, u"project", {u"abstract": u"Polemics Tweets", u"title": u"Polemic Tweets", u"user": u"IRI Web", u"id": unicode(uuid.uuid4())})

            medias = etree.SubElement(root, u"medias")
            media = etree.SubElement(medias, u"media", {u"pict": u"", u"src": unicode(options.content), u"video": unicode(options.video), u"id": unicode(options.content_id), u"extra": u""})

            annotations = etree.SubElement(root, u"annotations")
            content = etree.SubElement(annotations, u"content", {u"id": unicode(options.content_id)})
            ensemble_parent = content

            content_id = options.content_id

        if ensemble_parent is None:
            # Detect the document flavour: "ldt" (project export) or "iri".
            file_type = None
            for node in root:
                if node.tag == "project":
                    file_type = "ldt"
                    break
                elif node.tag == "head":
                    file_type = "iri"
                    break

            if file_type == "ldt":
                media_nodes = root.xpath("//media")
                if len(media_nodes) > 0:
                    media = media_nodes[0]
                annotations_node = root.find(u"annotations")
                if annotations_node is None:
                    annotations_node = etree.SubElement(root, u"annotations")
                content_node = annotations_node.find(u"content")
                if content_node is None:
                    content_node = etree.SubElement(annotations_node, u"content", id=media.get(u"id"))
                ensemble_parent = content_node
                content_id = content_node.get(u"id")
                display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id)
                if len(display_nodes) == 0:
                    get_logger().info("No display node found. Will not update display")
                    display_content_node = None
                else:
                    display_content_node = display_nodes[0]

            elif file_type == "iri":
                body_node = root.find(u"body")
                if body_node is None:
                    body_node = etree.SubElement(root, u"body")
                ensembles_node = body_node.find(u"ensembles")
                if ensembles_node is None:
                    ensembles_node = etree.SubElement(body_node, u"ensembles")
                ensemble_parent = ensembles_node
                content_id = root.xpath("head/meta[@name='id']/@content")[0]
                display_content_node = None

        if ensemble_parent is None:
            get_logger().error("Can not process file") #@UndefinedVariable
            sys.exit()

        if options.replace:
            # BUGFIX: iterate over a copy -- removing nodes while iterating
            # an lxml live iterator can skip siblings.
            for ens in list(ensemble_parent.iterchildren(tag=u"ensemble")):
                ens_id = ens.get("id", "")
                if ens_id.startswith("annot_"):
                    ensemble_parent.remove(ens)
                    # remove in display nodes
                    if display_content_node is not None:
                        for cut_display in list(display_content_node.iterchildren()):
                            if cut_display.get('idens', '') == ens_id:
                                display_content_node.remove(cut_display)

        ensemble = None
        elements = None

        if options.merge:
            # Reuse the first twitter-annotation ensemble when merging.
            for ens in ensemble_parent.findall(u"ensemble"):
                if ens.get('id', "").startswith("annot_"):
                    ensemble = ens
                    break
            if ensemble is not None:
                elements = ensemble.find(u".//elements")
                decoupage = ensemble.find(u"decoupage")

        if ensemble is None or elements is None:
            ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id": u"annot_" + unicode(uuid.uuid4()), u"title": u"Ensemble Twitter", u"author": u"IRI Web", u"abstract": u"Ensemble Twitter"})
            decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})

            etree.SubElement(decoupage, u"title").text = unicode(options.name)
            etree.SubElement(decoupage, u"abstract").text = unicode(options.name)

            elements = etree.SubElement(decoupage, u"elements")

        ensemble_id = ensemble.get('id', '')
        decoupage_id = decoupage.get('id', '') if decoupage is not None else None

        end_date = None
        if end_date_str:
            end_date = parse_date(end_date_str)
        elif start_date and duration:
            end_date = start_date + datetime.timedelta(seconds=duration)
        elif start_date and options.base_url:
            # get duration from api
            content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
            r = requests.get(content_url)
            duration = int(r.json()['duration'])
            get_logger().debug("get duration " + content_url) #@UndefinedVariable
            get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable

            end_date = start_date + datetime.timedelta(seconds=int(duration / 1000))

        if end_date and deltas:
            # Extend the end date by the total duration removed by the cuts.
            end_date = end_date + datetime.timedelta(milliseconds=deltas[-1][1])

        # BUGFIX: events and channels were missing from this call, so those
        # filters were silently dropped (and user_whitelist was bound to the
        # 'events' parameter).
        filters = get_filter(start_date, end_date, events, channels, user_whitelist)

        headers = {'Content-Type': 'application/json'}

        # Renamed from 'params' to avoid clobbering the loop variable.
        query_params = {'q': json.dumps({'filters': filters}), 'results_per_page': options.batch_size}

        # BUGFIX: the original line ended with a double colon (syntax error)
        # and referenced an undefined 'url' instead of annotation_url.
        for annot in build_annotation_iterator(annotation_url, query_params, headers):
            #TODO : check timezone !!!
            # NOTE(review): 'ts' coming from JSON is presumably an ISO string;
            # parse it to a datetime when needed -- confirm against the API.
            annot_ts_dt = annot['ts']
            if isinstance(annot_ts_dt, basestring):
                annot_ts_dt = parse_date(annot_ts_dt)
            annot_ts = int(time.mktime(annot_ts_dt.timetuple()))
            if ts is None:
                ts = annot_ts
            annot_ts_rel = (annot_ts - ts) * 1000
            if deltas:
                d = find_delta(deltas, annot_ts_rel)
                if d[1] < 0:
                    # Annotation falls inside a cut: skip it.
                    continue
                else:
                    annot_ts_rel -= d[1]
            annot_content = annot.get('content', {'category': '', 'user': None})

            username = annot_content.get('user', 'anon.') or 'anon.'

            category = annot_content.get('category', None)
            if category is None:
                continue

            # BUGFIX: lxml attribute values must be strings; the default for a
            # missing 'uuid' was a uuid.UUID object, which raises TypeError.
            element = etree.SubElement(elements, u"element", {u"id": unicode(annot.get('uuid', uuid.uuid4())), u"color": unicode(options.color), u"author": unicode(username), u"date": unicode(annot_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(annot_ts_rel), u"dur": u"0"})
            etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(category.get('label', category.get('code', '')))
            etree.SubElement(element, u"abstract").text = unicode(category.get('label', category.get('code', '')))

            tags_node = etree.SubElement(element, u"tags")
            etree.SubElement(tags_node, u"tag").text = category.get('code', '')

            meta_element = etree.SubElement(element, u'meta')

            polemics_element = etree.Element(u'polemics')
            etree.SubElement(polemics_element, u'polemic').text = category.get('code', '')
            meta_element.append(polemics_element)

            etree.SubElement(meta_element, u"source", attrib={"url": annotation_url + "/" + annot['uuid'], "mimetype": u"application/json"}).text = etree.CDATA(json.dumps(annot))

        # sort by tc in
        if options.merge:
            # remove all elements and put them in a array
            # sort them with tc
            # put them back
            elements[:] = sorted(elements, key=lambda n: int(n.get('begin')))

        #add to display node
        if display_content_node is not None:
            display_dec = None
            for dec in display_content_node.iterchildren(tag=u"decoupage"):
                if dec.get('idens', '') == ensemble_id and dec.get('id', '') == decoupage_id:
                    display_dec = dec
                    break
            if display_dec is None and ensemble_id and decoupage_id:
                etree.SubElement(display_content_node, u"decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect': ''})

        output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)

        if content_file_write and content_file_write.find("http") == 0:
            # Push the updated project back to the platform.
            project["ldt"] = output_data
            project['owner'] = project['owner'].replace('%7E', '~')
            project['contents'] = [c_url.replace('%7E', '~') for c_url in project['contents']]

            post_param = {}
            if options.post_param:
                post_param = json.loads(options.post_param)

            get_logger().debug("write http " + content_file_write) #@UndefinedVariable
            get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
            get_logger().debug("write http " + repr(project)) #@UndefinedVariable
            r = requests.put(content_file_write, data=json.dumps(project), headers={'content-type': 'application/json'}, params=post_param)
            get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
            if r.status_code != requests.codes.ok: # @UndefinedVariable
                r.raise_for_status()
        else:
            # Write to the local source file when it exists, else to --file.
            if content_file_write and os.path.exists(content_file_write):
                dest_file_name = content_file_write
            else:
                dest_file_name = options.filename

            get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
            with open(dest_file_name, "w") as output:
                output.write(output_data)
                output.flush()