|
1 #!/usr/bin/env python |
|
2 # coding=utf-8 |
|
3 |
|
4 import argparse |
|
5 import bisect |
|
6 import datetime |
|
7 import json |
|
8 import os.path |
|
9 import re |
|
10 import sys |
|
11 import uuid # @UnresolvedImport |
|
12 |
|
13 import requests |
|
14 |
|
15 from dateutil.parser import parse as parse_date |
|
16 from iri_tweet.utils import get_logger, set_logging, set_logging_options |
|
17 from lxml import etree |
|
18 |
|
19 |
|
20 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/" |
|
21 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/" |
|
22 |
|
23 |
|
def re_fn(expr, item):
    """
    Tell whether the regular expression `expr` matches anywhere in `item`.

    The search is case-insensitive. A successful match is traced on the
    debug logger. Returns True on a match, False otherwise.
    """
    found = re.search(expr, item, re.I) is not None
    if found:
        get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
    return found
|
30 |
|
def parse_polemics_1(tw_text, extended_mode):
    """
    Parse the polemic markers of annotation protocol 1 found in a text.

    Recognised markers: ``++`` (OK), ``--`` (KO), ``??`` (Q) and ``==`` (REF).
    When `extended_mode` is true, any ``?`` in the text also counts as a
    question (Q).

    Returns the list of distinct polemic codes in order of first appearance,
    or None if no polemic is found.
    """
    codes = {
        '++' : 'OK',
        '--' : 'KO',
        '??' : 'Q',
        '==' : 'REF'}

    # a dict serves as an insertion-ordered set of the codes met so far
    polemics = {}
    for m in re.finditer(r"(\+\+|\-\-|\?\?|\=\=)", tw_text):
        polemics[codes[m.group(1)]] = True

    if extended_mode and "?" in tw_text:
        polemics["Q"] = True

    # return a real list, as documented, instead of a dict view
    return list(polemics) if polemics else None
|
52 |
|
def parse_polemics_2(tw_text, extended_mode):
    """
    Parse the polemic markers of annotation protocol 2 found in a text.

    Recognised markers: ``++`` (OK), ``!!`` (KO), ``??`` (Q) and ``==`` (REF).
    When `extended_mode` is true, any ``?`` in the text also counts as a
    question (Q).

    Returns the list of distinct polemic codes in order of first appearance,
    or None if no polemic is found.
    """
    codes = {
        '++' : 'OK',
        '!!' : 'KO',
        '??' : 'Q',
        '==' : 'REF'}

    # a dict serves as an insertion-ordered set of the codes met so far
    polemics = {}
    for m in re.finditer(r"(\+\+|\!\!|\?\?|\=\=)", tw_text):
        polemics[codes[m.group(1)]] = True

    if extended_mode and "?" in tw_text:
        polemics["Q"] = True

    # return a real list, as documented, instead of a dict view
    return list(polemics) if polemics else None
|
75 |
|
def parse_polemics_3(tw_text, extended_mode):
    """
    Parse the polemic markers of annotation protocol 3 found in a text.

    Recognised markers: ``++`` (OK), ``??`` (KO), ``**`` (REF) and ``==`` (Q).
    `extended_mode` is accepted for signature compatibility with the other
    protocol parsers but has no effect here.

    Returns the list of distinct polemic codes in order of first appearance,
    or None if no polemic is found.
    """
    codes = {
        '++' : 'OK',
        '??' : 'KO',
        '**' : 'REF',
        '==' : 'Q'}

    # a dict serves as an insertion-ordered set of the codes met so far
    polemics = {}
    for m in re.finditer(r"(\+\+|\?\?|\*\*|\=\=)", tw_text):
        polemics[codes[m.group(1)]] = True

    # return a real list, as documented, instead of a dict view
    return list(polemics) if polemics else None
|
93 |
|
94 |
|
# Maps an annotation protocol version (as a string) to the function that
# parses the polemic markers of that protocol. All parsers share the
# signature (tw_text, extended_mode).
protocol_version_map = {
    "1" : parse_polemics_1,
    "2" : parse_polemics_2,
    "3" : parse_polemics_3
}
|
100 |
|
def get_options():
    """
    Build the command line parser and parse the process arguments.

    The logging options are added to the parser by `set_logging_options`.
    Returns a tuple (parsed options namespace, parser).
    """
    parser = argparse.ArgumentParser(description="All date should be given using iso8601 format. If no timezone is used, the date is considered as UTC")

    # one (flags, keyword arguments) entry per option, in declaration order
    option_specs = [
        (("-f", "--file"), dict(dest="filename", help="write export to file", metavar="FILE", default="project.ldt")),
        (("-d", "--chat-database"), dict(dest="database", help="Input chat file", metavar="CHAT_DATABASE")),
        (("-s", "--start-date"), dict(dest="start_date", help="start date", metavar="START_DATE", default=None)),
        (("-a", "--annotation-protocol"), dict(dest="protocol_version", help="annotation protocol version", metavar="PROTOCOL_VERSION", default="2")),
        (("-I", "--content-file"), dict(dest="content_file", help="Content file", metavar="CONTENT_FILE")),
        (("-c", "--content"), dict(dest="content", help="Content url", metavar="CONTENT")),
        (("-V", "--video-url"), dict(dest="video", help="video url", metavar="VIDEO")),
        (("-i", "--content-id"), dict(dest="content_id", help="Content id", metavar="CONTENT_ID")),
        (("-C", "--color"), dict(dest="color", help="Color code", metavar="COLOR", default="16763904")),
        (("-D", "--duration"), dict(dest="duration", type=int, help="Duration", metavar="DURATION", default=None)),
        (("-n", "--name"), dict(dest="name", help="Cutting name", metavar="NAME", default="Chats")),
        (("-R", "--replace"), dict(dest="replace", action="store_true", help="Replace tweet ensemble", default=False)),
        (("-m", "--merge"), dict(dest="merge", action="store_true", help="merge tweet ensemble, choose the first ensemble", default=False)),
        (("-E", "--extended"), dict(dest="extended_mode", action="store_true", help="Trigger polemic extended mode", default=False)),
        (("-b", "--base-url"), dict(dest="base_url", help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")),
        (("-p", "--project"), dict(dest="project_id", help="Project id", metavar="PROJECT_ID", default=None)),
        (("-P", "--post-param"), dict(dest="post_param", help="Post param", metavar="POST_PARAM", default=None)),
        (("--user-whitelist",), dict(dest="user_whitelist", action="store", help="A list of user screen name", metavar="USER_WHITELIST", default=None)),
        (("--cut",), dict(dest="cuts", action="append", help="A cut with the forma <ts in ms>::<duration>", metavar="CUT", default=[])),
    ]
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    set_logging_options(parser)

    parsed = parser.parse_args()
    return (parsed, parser)
|
148 |
|
149 |
|
def find_delta(deltas, ts):
    """
    Return the last entry of `deltas` whose timestamp is not after `ts`.

    `deltas` is a list of (timestamp, delta) tuples sorted in ascending
    order. Returns (0,0) when no entry starts at or before `ts`.
    """
    # (ts, +inf) sorts after every entry stamped exactly `ts` and before every
    # entry stamped later. The previous key (ts+1, 0) also sorted after
    # entries stamped ts+1 with a negative delta, selecting them 1 ms early.
    i = bisect.bisect_right(deltas, (ts, float("inf")))
    if i:
        return deltas[i-1]
    return (0,0)
|
155 |
|
156 |
|
def parse_duration(s):
    """
    Convert a duration string to an integer.

    A plain integer string is returned as that integer, unchanged.
    Otherwise the string must read "HH:MM" or "HH:MM:SS" and is converted
    to a number of milliseconds.

    Raises ValueError when the string is neither an integer nor made of at
    least two colon-separated integer fields.
    """
    try:
        return int(s)
    except ValueError:
        fields = s.split(":")
        if len(fields) < 2:
            raise ValueError("Bad duration format")
        hours = int(fields[0])
        minutes = int(fields[1])
        # the seconds field is optional
        seconds = int(fields[2]) if len(fields)>2 else 0
        span = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
        return int(round(span.total_seconds()*1000))
|
170 |
|
# Full chat entry: "HH:MM:SS<TAB>user: text". The groups are created_at, user
# and text; DOTALL lets the text group run across newlines.
CHAT_REGEXP = re.compile(r"^(?P<created_at>\d{2}:\d{2}:\d{2})\t(?P<user>.+?)\s?:\s(?P<text>.*)$", re.DOTALL)
# Start of a chat entry (timestamp, tab, user, colon, whitespace): used to
# tell the first line of an entry from a continuation line.
CHAT_LINE_REGEXP = re.compile(r"^\d{2}:\d{2}:\d{2}\t.+?:\s")
|
173 |
|
def parse_chat_line(chat_id, chat_line):
    """
    Parse one chat entry into a dict.

    The returned dict holds the keys 'created_at', 'user' and 'text'
    captured by CHAT_REGEXP (carriage returns in the text are replaced by
    newlines), plus 'id' (the given `chat_id`) and 'tags' (the list of
    hashtag names found in the text, without the leading '#').

    Returns an empty dict when `chat_line` does not match CHAT_REGEXP.
    """
    m = CHAT_REGEXP.match(chat_line)
    if m is None:
        return {}
    res = m.groupdict()
    res['text'] = res['text'].replace('\r','\n')
    res['id'] = chat_id
    # raw string: '\w' in a plain literal is an invalid escape sequence
    res['tags'] = re.findall(r'#(\w+)', res['text'])
    return res
|
182 |
|
def read_chat_file(chat_file_path):
    """
    Read a chat file and return its entries as a list of strings.

    A line matching CHAT_LINE_REGEXP starts a new entry. Any other line is
    appended to the entry in progress, after an extra "\n" separator.
    """
    entries = []
    pending = ""
    with open(chat_file_path, "r") as chat_file:
        for raw_line in chat_file:
            if CHAT_LINE_REGEXP.match(raw_line) is None:
                # continuation of the entry being accumulated
                pending = pending + "\n" + raw_line
                continue
            # a new entry begins: flush the previous one, if any
            if pending:
                entries.append(pending)
            pending = raw_line
    if pending:
        entries.append(pending)
    return entries
|
197 |
|
198 |
|
# Script entry point: reads a chat export, turns each chat entry into an
# <element> of a "chat_" ensemble inside an LDT project / IRI content XML
# document, then writes the document back (HTTP PUT or local file).
if __name__ == "__main__" :

    (options, parser) = get_options()

    set_logging(options)

    get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable


    # deltas is a sorted list of (timestamp, delta): delta -1 marks the start of a
    # cut (entries inside are dropped), a positive delta is the time to subtract
    # from the entries located after the cut
    deltas = [(0,0)]
    total_delta = 0
    if options.cuts:
        cuts_raw = sorted([tuple([parse_duration(s) for s in c.split("::")]) for c in options.cuts])
        for c, d in cuts_raw:
            deltas.append((c+total_delta, -1))
            total_delta += d
            deltas.append((c+total_delta, total_delta))

    if len(sys.argv) == 1 or options.database is None:
        parser.print_help()
        sys.exit(1)

    user_whitelist_file = options.user_whitelist
    user_whitelist = None

    if options.project_id:
        content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
    else:
        content_file = options.content_file

    params = {
        'start_date': options.start_date,
        'duration' : options.duration,
        'content_file' : content_file,
        'content_file_write' : content_file,
        'project_id' : options.project_id
    }
    post_param = {}

    if options.post_param:
        post_param = json.loads(options.post_param)

    display_content_node = None

    get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable

    start_date_str = params.get("start_date",None)
    duration = params.get("duration", None)
    content_file = params.get("content_file", None)
    content_file_write = params.get("content_file_write", None)
    if user_whitelist_file:
        # the whitelist is only read: do not require write access to it
        # NOTE(review): user_whitelist is not referenced again in this block
        with open(user_whitelist_file, 'r') as f:
            user_whitelist = list(set([s.strip() for s in f]))

    start_date = datetime.datetime.now()
    if start_date_str:
        start_date = parse_date(start_date_str)

    root = None
    ensemble_parent = None
    project = None

    #to do : analyse situation ldt or iri ? filename set or not ?

    if content_file and content_file.find("http") == 0:

        get_logger().debug("url : " + content_file) #@UndefinedVariable

        r = requests.get(content_file, params=post_param)
        get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
        project = r.json()
        # strip the XML declaration, if any, before parsing the embedded document
        text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
        root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])

    elif content_file and os.path.exists(content_file):

        doc = etree.parse(content_file)
        root = doc.getroot()
        for child in root:
            if child.tag == "project":
                project = child
                break
        if project is None:
            root = None

    content_id = None

    if root is None:

        # no usable input document: build a new one from scratch
        root = etree.Element("iri")

        project = etree.SubElement(root, "project", {"abstract":"Polemics Chat","title":"Polemic Chat", "user":"IRI Web", "id":str(uuid.uuid4())})

        medias = etree.SubElement(root, "medias")
        media = etree.SubElement(medias, "media", {"pict":"", "src":options.content, "video":options.video, "id":options.content_id, "extra":""})

        annotations = etree.SubElement(root, "annotations")
        content = etree.SubElement(annotations, "content", {"id":options.content_id})
        ensemble_parent = content

        content_id = options.content_id


    if ensemble_parent is None:
        file_type = None
        for node in root:
            if node.tag == "project":
                file_type = "ldt"
                break
            elif node.tag == "head":
                file_type = "iri"
                break

        if file_type == "ldt":
            media_nodes = root.xpath("//media")
            media = None
            if len(media_nodes) > 0:
                media = media_nodes[0]
            annotations_node = root.find("annotations")
            if annotations_node is None:
                annotations_node = etree.SubElement(root, "annotations")
            content_node = annotations_node.find("content")
            if content_node is None and media is not None:
                content_node = etree.SubElement(annotations_node,"content", id=media.get("id"))
            # without any content node (and no media to create one from) the file
            # can not be processed: ensemble_parent stays None and is reported below
            if content_node is not None:
                ensemble_parent = content_node
                content_id = content_node.get("id")
                display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id)
                if len(display_nodes) == 0:
                    get_logger().info("No display node found. Will not update display")
                    display_content_node = None
                else:
                    display_content_node = display_nodes[0]

        elif file_type == "iri":
            body_node = root.find("body")
            if body_node is None:
                body_node = etree.SubElement(root, "body")
            ensembles_node = body_node.find("ensembles")
            if ensembles_node is None:
                ensembles_node = etree.SubElement(body_node, "ensembles")
            ensemble_parent = ensembles_node
            content_id = root.xpath("head/meta[@name='id']/@content")[0]
            display_content_node = None


    if ensemble_parent is None:
        get_logger().error("Can not process file") #@UndefinedVariable
        # this is a failure: exit with a non-zero status
        sys.exit(1)

    if options.replace:
        for ens in ensemble_parent.iterchildren(tag="ensemble"):
            ens_id = ens.get("id","")
            if ens_id.startswith("chat_"):
                ensemble_parent.remove(ens)
                # remove in display nodes
                if display_content_node is not None:
                    for cut_display in display_content_node.iterchildren():
                        if cut_display.get('idens','') == ens_id:
                            display_content_node.remove(cut_display)

    ensemble = None
    elements = None
    decoupage = None

    if options.merge:
        # reuse the first existing chat ensemble
        for ens in ensemble_parent.findall("ensemble"):
            if ens.get('id',"").startswith("chat_"):
                ensemble = ens
                break
        if ensemble is not None:
            elements = ensemble.find(".//elements")
            decoupage = ensemble.find("decoupage")

    if ensemble is None or elements is None:
        ensemble = etree.SubElement(ensemble_parent, "ensemble", {"id":"chat_" + str(uuid.uuid4()), "title":"Ensemble Chat", "author":"IRI Web", "abstract":"Ensemble Chat"})
        decoupage = etree.SubElement(ensemble, "decoupage", {"id": str(uuid.uuid4()), "author": "IRI Web"})

        etree.SubElement(decoupage, "title").text = options.name
        etree.SubElement(decoupage, "abstract").text = options.name

        elements = etree.SubElement(decoupage, "elements")

    ensemble_id = ensemble.get('id', '')
    decoupage_id = decoupage.get('id', '') if decoupage is not None else None

    # the duration can only be fetched when the content id is known
    if not duration and options.base_url and content_id:
        content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
        r = requests.get(content_url)
        duration = int(r.json()['duration'])
        get_logger().debug("get duration " + content_url) #@UndefinedVariable
        get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable

    chat_content_lines = read_chat_file(options.database.strip())
    for i,chat_line in enumerate(chat_content_lines):

        cht = parse_chat_line("%04d" % (i+1) ,chat_line.strip())
        if not cht:
            # parse_chat_line returns {} for an entry that does not match the chat
            # format (e.g. text located before the first timestamped line)
            get_logger().info("Skipping unparsable chat line : " + repr(chat_line))
            continue

        cht_ts_dt = cht['created_at']
        cht_ts_rel_milli = parse_duration(cht_ts_dt)
        element_date = start_date + datetime.timedelta(milliseconds=cht_ts_rel_milli)
        if deltas:
            d = find_delta(deltas, cht_ts_rel_milli)
            if d[1] < 0:
                # the entry falls inside a cut: drop it
                continue
            else :
                cht_ts_rel_milli -= d[1]

        username = cht['user'] or "anon."

        element = etree.SubElement(elements, "element" , {"id": "%s-%s" % (uuid.uuid4(),cht['id']), "color":options.color, "author":username, "date":element_date.strftime("%Y/%m/%d"), "begin": str(cht_ts_rel_milli), "dur":"0", "src":"zoom"})
        etree.SubElement(element, "title").text = username + ": " + cht['text'][:255]
        etree.SubElement(element, "abstract").text = cht['text']

        tags_node = etree.SubElement(element, "tags")

        for tag in cht['tags']:
            etree.SubElement(tags_node,"tag").text = tag

        meta_element = etree.SubElement(element, 'meta')

        etree.SubElement(meta_element, "polemic_version").text = options.protocol_version
        # unknown protocol versions fall back to protocol 2
        parse_polemics = protocol_version_map.get(options.protocol_version, parse_polemics_2)
        polemics_list = parse_polemics(cht['text'], options.extended_mode)
        if polemics_list:
            polemics_element = etree.Element('polemics')
            for pol in polemics_list:
                etree.SubElement(polemics_element, 'polemic').text = pol
            meta_element.append(polemics_element)

        etree.SubElement(meta_element, "source", attrib={"url":"http://zoom.io", "mimetype":"text/plain"}).text = etree.CDATA(json.dumps({'chat': chat_line}))

    # sort by tc in
    if options.merge :
        # remove all elements and put them in a array
        # sort them with tc
        #put them back
        elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))

    #add to display node
    if display_content_node is not None:
        display_dec = None
        for dec in display_content_node.iterchildren(tag="decoupage"):
            if dec.get('idens','') == ensemble_id and dec.get('id', '') == decoupage_id:
                display_dec = dec
                break
        if display_dec is None and ensemble_id and decoupage_id:
            etree.SubElement(display_content_node, "decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})

    output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True).decode('utf-8')

    if content_file_write and content_file_write.find("http") == 0:

        project["ldt"] = output_data
        project['owner'] = project['owner'].replace('%7E','~')
        project['contents'] = [c_url.replace('%7E','~') for c_url in project['contents']]

        post_param = {}
        if options.post_param:
            post_param = json.loads(options.post_param)

        get_logger().debug("write http " + content_file_write) #@UndefinedVariable
        get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
        get_logger().debug("write http " + repr(project)) #@UndefinedVariable
        r = requests.put(content_file_write, data=json.dumps(project), headers={'content-type':'application/json'}, params=post_param)
        get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
        if r.status_code != requests.codes.ok: # pylint: disable=E1101
            r.raise_for_status()
    else:
        if content_file_write and os.path.exists(content_file_write):
            dest_file_name = content_file_write
        else:
            dest_file_name = options.filename

        get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
        # output_data starts with an XML declaration announcing utf-8, so the file
        # must be written with that encoding whatever the locale; the context
        # manager closes the file even if the write fails
        with open(dest_file_name, "w", encoding="utf-8") as output:
            output.write(output_data)