diff -r efbda157eb57 -r 184372ec27e2 script/utils/export_twitter_alchemy.py --- a/script/utils/export_twitter_alchemy.py Fri Dec 21 12:33:01 2018 +0100 +++ b/script/utils/export_twitter_alchemy.py Wed Jan 02 17:49:19 2019 +0100 @@ -1,24 +1,26 @@ #!/usr/bin/env python # coding=utf-8 -from lxml import etree -from iri_tweet.models import setup_database, Tweet, User -from sqlalchemy import Table, Column, BigInteger, event, bindparam -from sqlalchemy.sql import select, func -from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, - get_logger) import argparse -import anyjson +import bisect import datetime -import requests +import json import os.path import re import sys import time -import uuid #@UnresolvedImport +import uuid # @UnresolvedImport + +import requests +from lxml import etree +from sqlalchemy import BigInteger, Column, Table, bindparam, event +from sqlalchemy.sql import func, select + from dateutil.parser import parse as parse_date_raw from dateutil.tz import tzutc -import bisect +from iri_tweet.models import Tweet, User, setup_database +from iri_tweet.utils import (get_filter_query, get_logger, set_logging, + set_logging_options) #class TweetExclude(object): # def __init__(self, id): @@ -49,12 +51,12 @@ parse polemics in text and return a list of polemic code. None if not polemic found """ polemics = {} - for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text): + for m in re.finditer(r"(\+\+|\-\-|\?\?|\=\=)",tw.text): pol_link = { - '++' : u'OK', - '--' : u'KO', - '??' : u'Q', - '==' : u'REF'}[m.group(1)] + '++' : 'OK', + '--' : 'KO', + '??' : 'Q', + '==' : 'REF'}[m.group(1)] polemics[pol_link] = pol_link if extended_mode: @@ -75,12 +77,12 @@ parse polemics in text and return a list of polemic code. None if not polemic found """ polemics = {} - for m in re.finditer("(\+\+|\!\!|\?\?|\=\=)",tw.text): + for m in re.finditer(r"(\+\+|\!\!|\?\?|\=\=)",tw.text): pol_link = { - '++' : u'OK', - '!!' : u'KO', - '??' : u'Q', - '==' : u'REF'}[m.group(1)] + '++' : 'OK', + '!!' : 'KO', + '??' : 'Q', + '==' : 'REF'}[m.group(1)] polemics[pol_link] = pol_link if extended_mode: @@ -101,12 +103,12 @@ parse polemics in text and return a list of polemic code. None if not polemic found """ polemics = {} - for m in re.finditer("(\+\+|\?\?|\*\*|\=\=)",tw.text): + for m in re.finditer(r"(\+\+|\?\?|\*\*|\=\=)",tw.text): pol_link = { - '++' : u'OK', - '??' : u'KO', - '**' : u'REF', - '==' : u'Q'}[m.group(1)] + '++' : 'OK', + '??' : 'KO', + '**' : 'REF', + '==' : 'Q'}[m.group(1)] polemics[pol_link] = pol_link if extended_mode: @@ -158,7 +160,7 @@ parser.add_argument("-D", "--duration", dest="duration", type=int, help="Duration", metavar="DURATION", default=None) parser.add_argument("-n", "--name", dest="name", - help="Cutting name", metavar="NAME", default=u"Tweets") + help="Cutting name", metavar="NAME", default="Tweets") parser.add_argument("-R", "--replace", dest="replace", action="store_true", help="Replace tweet ensemble", default=False) parser.add_argument("-m", "--merge", dest="merge", action="store_true", @@ -228,7 +230,7 @@ sys.exit(1) conn_str = options.database.strip() - if not re.match("^\w+://.+", conn_str): + if not re.match(r"^\w+://.+", conn_str): conn_str = 'sqlite:///' + conn_str engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False) @@ -249,8 +251,8 @@ if options.exclude and os.path.exists(options.exclude): with open(options.exclude, 'r+') as f: - tei = tweet_exclude_table.insert() - ex_regexp = re.compile("(?P\w+)(?P[~=])(?P.+)", re.I) + tei = tweet_exclude_table.insert() # pylint: disable=E1120 + ex_regexp = re.compile(r"(?P\w+)(?P[~=])(?P.+)", re.I) for line in f: res = ex_regexp.match(line.strip()) if res: @@ -320,7 +322,7 @@ }] post_param = {} if options.post_param: - post_param = anyjson.loads(options.post_param) + post_param = json.loads(options.post_param) for params in parameters: @@ -365,15 +367,15 @@ if root is None: - root = etree.Element(u"iri") + root = etree.Element("iri") - project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())}) + project = etree.SubElement(root, "project", {"abstract":"Polemics Tweets","title":"Polemic Tweets", "user":"IRI Web", "id":str(uuid.uuid4())}) - medias = etree.SubElement(root, u"medias") - media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""}) + medias = etree.SubElement(root, "medias") + media = etree.SubElement(medias, "media", {"pict":"", "src":options.content, "video":options.video, "id":options.content_id, "extra":""}) - annotations = etree.SubElement(root, u"annotations") - content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)}) + annotations = etree.SubElement(root, "annotations") + content = etree.SubElement(annotations, "content", {"id":options.content_id}) ensemble_parent = content content_id = options.content_id @@ -393,14 +395,14 @@ media_nodes = root.xpath("//media") if len(media_nodes) > 0: media = media_nodes[0] - annotations_node = root.find(u"annotations") + annotations_node = root.find("annotations") if annotations_node is None: - annotations_node = etree.SubElement(root, u"annotations") - content_node = annotations_node.find(u"content") + annotations_node = etree.SubElement(root, "annotations") + content_node = annotations_node.find("content") if content_node is None: - content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id")) + content_node = etree.SubElement(annotations_node,"content", id=media.get("id")) ensemble_parent = content_node - content_id = content_node.get(u"id") + content_id = content_node.get("id") display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id) if len(display_nodes) == 0: get_logger().info("No display node found. Will not update display") @@ -409,12 +411,12 @@ display_content_node = display_nodes[0] elif file_type == "iri": - body_node = root.find(u"body") + body_node = root.find("body") if body_node is None: - body_node = etree.SubElement(root, u"body") - ensembles_node = body_node.find(u"ensembles") + body_node = etree.SubElement(root, "body") + ensembles_node = body_node.find("ensembles") if ensembles_node is None: - ensembles_node = etree.SubElement(body_node, u"ensembles") + ensembles_node = etree.SubElement(body_node, "ensembles") ensemble_parent = ensembles_node content_id = root.xpath("head/meta[@name='id']/@content")[0] display_content_node = None @@ -425,7 +427,7 @@ sys.exit() if options.replace: - for ens in ensemble_parent.iterchildren(tag=u"ensemble"): + for ens in ensemble_parent.iterchildren(tag="ensemble"): ens_id = ens.get("id","") if ens_id.startswith("tweet_"): ensemble_parent.remove(ens) @@ -439,22 +441,22 @@ elements = None if options.merge: - for ens in ensemble_parent.findall(u"ensemble"): + for ens in ensemble_parent.findall("ensemble"): if ens.get('id',"").startswith("tweet_"): ensemble = ens break if ensemble is not None: - elements = ensemble.find(u".//elements") - decoupage = ensemble.find(u"decoupage") + elements = ensemble.find(".//elements") + decoupage = ensemble.find("decoupage") if ensemble is None or elements is None: - ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"}) - decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"}) + ensemble = etree.SubElement(ensemble_parent, "ensemble", {"id":"tweet_" + str(uuid.uuid4()), "title":"Ensemble Twitter", "author":"IRI Web", "abstract":"Ensemble Twitter"}) + decoupage = etree.SubElement(ensemble, "decoupage", {"id": str(uuid.uuid4()), "author": "IRI Web"}) - etree.SubElement(decoupage, u"title").text = unicode(options.name) - etree.SubElement(decoupage, u"abstract").text = unicode(options.name) + etree.SubElement(decoupage, "title").text = options.name + etree.SubElement(decoupage, "abstract").text = options.name - elements = etree.SubElement(decoupage, u"elements") + elements = etree.SubElement(decoupage, "elements") ensemble_id = ensemble.get('id', '') decoupage_id = decoupage.get('id', '') if decoupage is not None else None @@ -504,28 +506,28 @@ if not username: username = "anon." - element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel_milli), u"dur":u"0", u"src":unicode(profile_url)}) - etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text) - etree.SubElement(element, u"abstract").text = unicode(tw.text) + element = etree.SubElement(elements, "element" , {"id": "%s-%s" % (uuid.uuid4(),tw.id), "color":options.color, "author":username, "date":tweet_ts_dt.strftime("%Y/%m/%d"), "begin": str(tweet_ts_rel_milli), "dur":"0", "src":profile_url}) + etree.SubElement(element, "title").text = username + ": " + tw.text + etree.SubElement(element, "abstract").text = tw.text - tags_node = etree.SubElement(element, u"tags") + tags_node = etree.SubElement(element, "tags") for entity in tw.entity_list: - if entity.type == u'entity_hashtag': - etree.SubElement(tags_node,u"tag").text = entity.hashtag.text + if entity.type == 'entity_hashtag': + etree.SubElement(tags_node,"tag").text = entity.hashtag.text - meta_element = etree.SubElement(element, u'meta') + meta_element = etree.SubElement(element, 'meta') - etree.SubElement(meta_element, u"polemic_version").text = options.protocol_version + etree.SubElement(meta_element, "polemic_version").text = options.protocol_version parse_polemics = protocol_version_map.get(options.protocol_version, parse_polemics_2) polemics_list = parse_polemics(tw, options.extended_mode) if polemics_list: - polemics_element = etree.Element(u'polemics') + polemics_element = etree.Element('polemics') for pol in polemics_list: - etree.SubElement(polemics_element, u'polemic').text = pol + etree.SubElement(polemics_element, 'polemic').text = pol meta_element.append(polemics_element) - etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json)) + etree.SubElement(meta_element, "source", attrib={"url":"http://dev.twitter.com", "mimetype":"application/json"}).text = etree.CDATA(tw.tweet_source.original_json) # sort by tc in if options.merge : @@ -537,14 +539,14 @@ #add to display node if display_content_node is not None: display_dec = None - for dec in display_content_node.iterchildren(tag=u"decoupage"): + for dec in display_content_node.iterchildren(tag="decoupage"): if dec.get('idens','') == ensemble_id and dec.get('id', '') == decoupage_id: display_dec = dec break if display_dec is None and ensemble_id and decoupage_id: - etree.SubElement(display_content_node, u"decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''}) + etree.SubElement(display_content_node, "decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''}) - output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True) + output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True).decode('utf-8') if content_file_write and content_file_write.find("http") == 0: @@ -554,14 +556,14 @@ post_param = {} if options.post_param: - post_param = anyjson.loads(options.post_param) + post_param = json.loads(options.post_param) get_logger().debug("write http " + content_file_write) #@UndefinedVariable get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable get_logger().debug("write http " + repr(project)) #@UndefinedVariable - r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type':'application/json'}, params=post_param); + r = requests.put(content_file_write, data=json.dumps(project), headers={'content-type':'application/json'}, params=post_param) get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable - if r.status_code != requests.codes.ok: # @UndefinedVariable + if r.status_code != requests.codes.ok: # pylint: disable=E1101 r.raise_for_status() else: if content_file_write and os.path.exists(content_file_write):