1 #!/usr/bin/env python |
1 #!/usr/bin/env python |
2 # coding=utf-8 |
2 # coding=utf-8 |
3 |
3 |
4 from lxml import etree |
|
5 from iri_tweet.models import setup_database, Tweet, User |
|
6 from sqlalchemy import Table, Column, BigInteger, event, bindparam |
|
7 from sqlalchemy.sql import select, func |
|
8 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, |
|
9 get_logger) |
|
10 import argparse |
4 import argparse |
11 import anyjson |
5 import bisect |
12 import datetime |
6 import datetime |
13 import requests |
7 import json |
14 import os.path |
8 import os.path |
15 import re |
9 import re |
16 import sys |
10 import sys |
17 import time |
11 import time |
18 import uuid #@UnresolvedImport |
12 import uuid # @UnresolvedImport |
|
13 |
|
14 import requests |
|
15 from lxml import etree |
|
16 from sqlalchemy import BigInteger, Column, Table, bindparam, event |
|
17 from sqlalchemy.sql import func, select |
|
18 |
19 from dateutil.parser import parse as parse_date_raw |
19 from dateutil.parser import parse as parse_date_raw |
20 from dateutil.tz import tzutc |
20 from dateutil.tz import tzutc |
21 import bisect |
21 from iri_tweet.models import Tweet, User, setup_database |
|
22 from iri_tweet.utils import (get_filter_query, get_logger, set_logging, |
|
23 set_logging_options) |
22 |
24 |
23 #class TweetExclude(object): |
25 #class TweetExclude(object): |
24 # def __init__(self, id): |
26 # def __init__(self, id): |
25 # self.id = id |
27 # self.id = id |
26 # |
28 # |
99 def parse_polemics_3(tw, extended_mode): |
101 def parse_polemics_3(tw, extended_mode): |
100 """ |
102 """ |
101 parse polemics in text and return a list of polemic code. None if not polemic found |
103 parse polemics in text and return a list of polemic code. None if not polemic found |
102 """ |
104 """ |
103 polemics = {} |
105 polemics = {} |
104 for m in re.finditer("(\+\+|\?\?|\*\*|\=\=)",tw.text): |
106 for m in re.finditer(r"(\+\+|\?\?|\*\*|\=\=)",tw.text): |
105 pol_link = { |
107 pol_link = { |
106 '++' : u'OK', |
108 '++' : 'OK', |
107 '??' : u'KO', |
109 '??' : 'KO', |
108 '**' : u'REF', |
110 '**' : 'REF', |
109 '==' : u'Q'}[m.group(1)] |
111 '==' : 'Q'}[m.group(1)] |
110 polemics[pol_link] = pol_link |
112 polemics[pol_link] = pol_link |
111 |
113 |
112 if extended_mode: |
114 if extended_mode: |
113 for entity in tw.entity_list: |
115 for entity in tw.entity_list: |
114 if entity.type == "entity_url": |
116 if entity.type == "entity_url": |
156 parser.add_argument("-H", "--hashtag", dest="hashtag", |
158 parser.add_argument("-H", "--hashtag", dest="hashtag", |
157 help="Hashtag", metavar="HASHTAG", default=[], action="append") |
159 help="Hashtag", metavar="HASHTAG", default=[], action="append") |
158 parser.add_argument("-D", "--duration", dest="duration", type=int, |
160 parser.add_argument("-D", "--duration", dest="duration", type=int, |
159 help="Duration", metavar="DURATION", default=None) |
161 help="Duration", metavar="DURATION", default=None) |
160 parser.add_argument("-n", "--name", dest="name", |
162 parser.add_argument("-n", "--name", dest="name", |
161 help="Cutting name", metavar="NAME", default=u"Tweets") |
163 help="Cutting name", metavar="NAME", default="Tweets") |
162 parser.add_argument("-R", "--replace", dest="replace", action="store_true", |
164 parser.add_argument("-R", "--replace", dest="replace", action="store_true", |
163 help="Replace tweet ensemble", default=False) |
165 help="Replace tweet ensemble", default=False) |
164 parser.add_argument("-m", "--merge", dest="merge", action="store_true", |
166 parser.add_argument("-m", "--merge", dest="merge", action="store_true", |
165 help="merge tweet ensemble, choose the first ensemble", default=False) |
167 help="merge tweet ensemble, choose the first ensemble", default=False) |
166 parser.add_argument("-L", "--list-conf", dest="listconf", |
168 parser.add_argument("-L", "--list-conf", dest="listconf", |
247 #mapper(TweetExclude, tweet_exclude_table) |
249 #mapper(TweetExclude, tweet_exclude_table) |
248 metadata.create_all(bind=conn, tables=[tweet_exclude_table]) |
250 metadata.create_all(bind=conn, tables=[tweet_exclude_table]) |
249 |
251 |
250 if options.exclude and os.path.exists(options.exclude): |
252 if options.exclude and os.path.exists(options.exclude): |
251 with open(options.exclude, 'r+') as f: |
253 with open(options.exclude, 'r+') as f: |
252 tei = tweet_exclude_table.insert() |
254 tei = tweet_exclude_table.insert() # pylint: disable=E1120 |
253 ex_regexp = re.compile("(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I) |
255 ex_regexp = re.compile(r"(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I) |
254 for line in f: |
256 for line in f: |
255 res = ex_regexp.match(line.strip()) |
257 res = ex_regexp.match(line.strip()) |
256 if res: |
258 if res: |
257 if res.group('field') == "id": |
259 if res.group('field') == "id": |
258 conn.execute(tei.values(id=res.group('value'))) |
260 conn.execute(tei.values(id=res.group('value'))) |
363 |
365 |
364 content_id = None |
366 content_id = None |
365 |
367 |
366 if root is None: |
368 if root is None: |
367 |
369 |
368 root = etree.Element(u"iri") |
370 root = etree.Element("iri") |
369 |
371 |
370 project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())}) |
372 project = etree.SubElement(root, "project", {"abstract":"Polemics Tweets","title":"Polemic Tweets", "user":"IRI Web", "id":str(uuid.uuid4())}) |
371 |
373 |
372 medias = etree.SubElement(root, u"medias") |
374 medias = etree.SubElement(root, "medias") |
373 media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""}) |
375 media = etree.SubElement(medias, "media", {"pict":"", "src":options.content, "video":options.video, "id":options.content_id, "extra":""}) |
374 |
376 |
375 annotations = etree.SubElement(root, u"annotations") |
377 annotations = etree.SubElement(root, "annotations") |
376 content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)}) |
378 content = etree.SubElement(annotations, "content", {"id":options.content_id}) |
377 ensemble_parent = content |
379 ensemble_parent = content |
378 |
380 |
379 content_id = options.content_id |
381 content_id = options.content_id |
380 |
382 |
381 |
383 |
391 |
393 |
392 if file_type == "ldt": |
394 if file_type == "ldt": |
393 media_nodes = root.xpath("//media") |
395 media_nodes = root.xpath("//media") |
394 if len(media_nodes) > 0: |
396 if len(media_nodes) > 0: |
395 media = media_nodes[0] |
397 media = media_nodes[0] |
396 annotations_node = root.find(u"annotations") |
398 annotations_node = root.find("annotations") |
397 if annotations_node is None: |
399 if annotations_node is None: |
398 annotations_node = etree.SubElement(root, u"annotations") |
400 annotations_node = etree.SubElement(root, "annotations") |
399 content_node = annotations_node.find(u"content") |
401 content_node = annotations_node.find("content") |
400 if content_node is None: |
402 if content_node is None: |
401 content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id")) |
403 content_node = etree.SubElement(annotations_node,"content", id=media.get("id")) |
402 ensemble_parent = content_node |
404 ensemble_parent = content_node |
403 content_id = content_node.get(u"id") |
405 content_id = content_node.get("id") |
404 display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id) |
406 display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id) |
405 if len(display_nodes) == 0: |
407 if len(display_nodes) == 0: |
406 get_logger().info("No display node found. Will not update display") |
408 get_logger().info("No display node found. Will not update display") |
407 display_content_node = None |
409 display_content_node = None |
408 else: |
410 else: |
409 display_content_node = display_nodes[0] |
411 display_content_node = display_nodes[0] |
410 |
412 |
411 elif file_type == "iri": |
413 elif file_type == "iri": |
412 body_node = root.find(u"body") |
414 body_node = root.find("body") |
413 if body_node is None: |
415 if body_node is None: |
414 body_node = etree.SubElement(root, u"body") |
416 body_node = etree.SubElement(root, "body") |
415 ensembles_node = body_node.find(u"ensembles") |
417 ensembles_node = body_node.find("ensembles") |
416 if ensembles_node is None: |
418 if ensembles_node is None: |
417 ensembles_node = etree.SubElement(body_node, u"ensembles") |
419 ensembles_node = etree.SubElement(body_node, "ensembles") |
418 ensemble_parent = ensembles_node |
420 ensemble_parent = ensembles_node |
419 content_id = root.xpath("head/meta[@name='id']/@content")[0] |
421 content_id = root.xpath("head/meta[@name='id']/@content")[0] |
420 display_content_node = None |
422 display_content_node = None |
421 |
423 |
422 |
424 |
423 if ensemble_parent is None: |
425 if ensemble_parent is None: |
424 get_logger().error("Can not process file") #@UndefinedVariable |
426 get_logger().error("Can not process file") #@UndefinedVariable |
425 sys.exit() |
427 sys.exit() |
426 |
428 |
427 if options.replace: |
429 if options.replace: |
428 for ens in ensemble_parent.iterchildren(tag=u"ensemble"): |
430 for ens in ensemble_parent.iterchildren(tag="ensemble"): |
429 ens_id = ens.get("id","") |
431 ens_id = ens.get("id","") |
430 if ens_id.startswith("tweet_"): |
432 if ens_id.startswith("tweet_"): |
431 ensemble_parent.remove(ens) |
433 ensemble_parent.remove(ens) |
432 # remove in display nodes |
434 # remove in display nodes |
433 if display_content_node is not None: |
435 if display_content_node is not None: |
437 |
439 |
438 ensemble = None |
440 ensemble = None |
439 elements = None |
441 elements = None |
440 |
442 |
441 if options.merge: |
443 if options.merge: |
442 for ens in ensemble_parent.findall(u"ensemble"): |
444 for ens in ensemble_parent.findall("ensemble"): |
443 if ens.get('id',"").startswith("tweet_"): |
445 if ens.get('id',"").startswith("tweet_"): |
444 ensemble = ens |
446 ensemble = ens |
445 break |
447 break |
446 if ensemble is not None: |
448 if ensemble is not None: |
447 elements = ensemble.find(u".//elements") |
449 elements = ensemble.find(".//elements") |
448 decoupage = ensemble.find(u"decoupage") |
450 decoupage = ensemble.find("decoupage") |
449 |
451 |
450 if ensemble is None or elements is None: |
452 if ensemble is None or elements is None: |
451 ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"}) |
453 ensemble = etree.SubElement(ensemble_parent, "ensemble", {"id":"tweet_" + str(uuid.uuid4()), "title":"Ensemble Twitter", "author":"IRI Web", "abstract":"Ensemble Twitter"}) |
452 decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"}) |
454 decoupage = etree.SubElement(ensemble, "decoupage", {"id": str(uuid.uuid4()), "author": "IRI Web"}) |
453 |
455 |
454 etree.SubElement(decoupage, u"title").text = unicode(options.name) |
456 etree.SubElement(decoupage, "title").text = options.name |
455 etree.SubElement(decoupage, u"abstract").text = unicode(options.name) |
457 etree.SubElement(decoupage, "abstract").text = options.name |
456 |
458 |
457 elements = etree.SubElement(decoupage, u"elements") |
459 elements = etree.SubElement(decoupage, "elements") |
458 |
460 |
459 ensemble_id = ensemble.get('id', '') |
461 ensemble_id = ensemble.get('id', '') |
460 decoupage_id = decoupage.get('id', '') if decoupage is not None else None |
462 decoupage_id = decoupage.get('id', '') if decoupage is not None else None |
461 |
463 |
462 end_date = None |
464 end_date = None |
502 username = tw.user.screen_name |
504 username = tw.user.screen_name |
503 profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else "" |
505 profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else "" |
504 if not username: |
506 if not username: |
505 username = "anon." |
507 username = "anon." |
506 |
508 |
507 element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel_milli), u"dur":u"0", u"src":unicode(profile_url)}) |
509 element = etree.SubElement(elements, "element" , {"id": "%s-%s" % (uuid.uuid4(),tw.id), "color":options.color, "author":username, "date":tweet_ts_dt.strftime("%Y/%m/%d"), "begin": str(tweet_ts_rel_milli), "dur":"0", "src":profile_url}) |
508 etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text) |
510 etree.SubElement(element, "title").text = username + ": " + tw.text |
509 etree.SubElement(element, u"abstract").text = unicode(tw.text) |
511 etree.SubElement(element, "abstract").text = tw.text |
510 |
512 |
511 tags_node = etree.SubElement(element, u"tags") |
513 tags_node = etree.SubElement(element, "tags") |
512 |
514 |
513 for entity in tw.entity_list: |
515 for entity in tw.entity_list: |
514 if entity.type == u'entity_hashtag': |
516 if entity.type == 'entity_hashtag': |
515 etree.SubElement(tags_node,u"tag").text = entity.hashtag.text |
517 etree.SubElement(tags_node,"tag").text = entity.hashtag.text |
516 |
518 |
517 meta_element = etree.SubElement(element, u'meta') |
519 meta_element = etree.SubElement(element, 'meta') |
518 |
520 |
519 etree.SubElement(meta_element, u"polemic_version").text = options.protocol_version |
521 etree.SubElement(meta_element, "polemic_version").text = options.protocol_version |
520 parse_polemics = protocol_version_map.get(options.protocol_version, parse_polemics_2) |
522 parse_polemics = protocol_version_map.get(options.protocol_version, parse_polemics_2) |
521 polemics_list = parse_polemics(tw, options.extended_mode) |
523 polemics_list = parse_polemics(tw, options.extended_mode) |
522 if polemics_list: |
524 if polemics_list: |
523 polemics_element = etree.Element(u'polemics') |
525 polemics_element = etree.Element('polemics') |
524 for pol in polemics_list: |
526 for pol in polemics_list: |
525 etree.SubElement(polemics_element, u'polemic').text = pol |
527 etree.SubElement(polemics_element, 'polemic').text = pol |
526 meta_element.append(polemics_element) |
528 meta_element.append(polemics_element) |
527 |
529 |
528 etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json)) |
530 etree.SubElement(meta_element, "source", attrib={"url":"http://dev.twitter.com", "mimetype":"application/json"}).text = etree.CDATA(tw.tweet_source.original_json) |
529 |
531 |
530 # sort by tc in |
532 # sort by tc in |
531 if options.merge : |
533 if options.merge : |
532 # remove all elements and put them in a array |
534 # remove all elements and put them in a array |
533 # sort them with tc |
535 # sort them with tc |
535 elements[:] = sorted(elements,key=lambda n: int(n.get('begin'))) |
537 elements[:] = sorted(elements,key=lambda n: int(n.get('begin'))) |
536 |
538 |
537 #add to display node |
539 #add to display node |
538 if display_content_node is not None: |
540 if display_content_node is not None: |
539 display_dec = None |
541 display_dec = None |
540 for dec in display_content_node.iterchildren(tag=u"decoupage"): |
542 for dec in display_content_node.iterchildren(tag="decoupage"): |
541 if dec.get('idens','') == ensemble_id and dec.get('id', '') == decoupage_id: |
543 if dec.get('idens','') == ensemble_id and dec.get('id', '') == decoupage_id: |
542 display_dec = dec |
544 display_dec = dec |
543 break |
545 break |
544 if display_dec is None and ensemble_id and decoupage_id: |
546 if display_dec is None and ensemble_id and decoupage_id: |
545 etree.SubElement(display_content_node, u"decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''}) |
547 etree.SubElement(display_content_node, "decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''}) |
546 |
548 |
547 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True) |
549 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True).decode('utf-8') |
548 |
550 |
549 if content_file_write and content_file_write.find("http") == 0: |
551 if content_file_write and content_file_write.find("http") == 0: |
550 |
552 |
551 project["ldt"] = output_data |
553 project["ldt"] = output_data |
552 project['owner'] = project['owner'].replace('%7E','~') |
554 project['owner'] = project['owner'].replace('%7E','~') |
553 project['contents'] = [c_url.replace('%7E','~') for c_url in project['contents']] |
555 project['contents'] = [c_url.replace('%7E','~') for c_url in project['contents']] |
554 |
556 |
555 post_param = {} |
557 post_param = {} |
556 if options.post_param: |
558 if options.post_param: |
557 post_param = anyjson.loads(options.post_param) |
559 post_param = json.loads(options.post_param) |
558 |
560 |
559 get_logger().debug("write http " + content_file_write) #@UndefinedVariable |
561 get_logger().debug("write http " + content_file_write) #@UndefinedVariable |
560 get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable |
562 get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable |
561 get_logger().debug("write http " + repr(project)) #@UndefinedVariable |
563 get_logger().debug("write http " + repr(project)) #@UndefinedVariable |
562 r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type':'application/json'}, params=post_param); |
564 r = requests.put(content_file_write, data=json.dumps(project), headers={'content-type':'application/json'}, params=post_param) |
563 get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable |
565 get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable |
564 if r.status_code != requests.codes.ok: # @UndefinedVariable |
566 if r.status_code != requests.codes.ok: # pylint: disable=E1101 |
565 r.raise_for_status() |
567 r.raise_for_status() |
566 else: |
568 else: |
567 if content_file_write and os.path.exists(content_file_write): |
569 if content_file_write and os.path.exists(content_file_write): |
568 dest_file_name = content_file_write |
570 dest_file_name = content_file_write |
569 else: |
571 else: |