# HG changeset patch
# User ymh
# Date 1546447759 -3600
# Node ID 184372ec27e2be9730217317398a9f7063d45005
# Parent  efbda157eb57d2bfc05283d0709fc4ff3bfd2951
upgrade to python 3 and twitter api

diff -r efbda157eb57 -r 184372ec27e2 .hgignore
--- a/.hgignore Fri Dec 21 12:33:01 2018 +0100
+++ b/.hgignore Wed Jan 02 17:49:19 2019 +0100
@@ -30,6 +30,8 @@
 ^script/lib/tweetstream/tweetstream\.egg-info$
 ^script/virtualenv/script/env$
 ^script/virtualenv/script/project-boot\.py$
+^script/.direnv
+^script/lib/iri_tweet/build
 ^web/event_props$
 ^script/utils/ghostdriver.log$
 ^sbin/sync/sync_live
diff -r efbda157eb57 -r 184372ec27e2 script/.envrc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/.envrc Wed Jan 02 17:49:19 2019 +0100
@@ -0,0 +1,1 @@
+use pythonvenv 3.7.1+brew
diff -r efbda157eb57 -r 184372ec27e2 script/.vscode/settings.json
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/.vscode/settings.json Wed Jan 02 17:49:19 2019 +0100
@@ -0,0 +1,4 @@
+{
+    "python.pythonPath": "/Users/ymh/dev/projects/tweet_live/script/.direnv/python-3.7.1/bin/python",
+    "python.analysis.diagnosticPublishDelay": 996
+}
\ No newline at end of file
diff -r efbda157eb57 -r 184372ec27e2 script/lib/iri_tweet/iri_tweet/models.py
--- a/script/lib/iri_tweet/iri_tweet/models.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/lib/iri_tweet/iri_tweet/models.py Wed Jan 02 17:49:19 2019 +0100
@@ -1,14 +1,15 @@
-from sqlalchemy import (Boolean, Column, Enum, BigInteger, Integer, String,
-                        ForeignKey, DateTime, create_engine, event)
+import datetime
+import email.utils
+import json
+
+from sqlalchemy import (BigInteger, Boolean, Column, DateTime, Enum,
+                        ForeignKey, Integer, String, create_engine, event)
 from sqlalchemy.engine import Engine
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import relationship, sessionmaker
-import anyjson
-import datetime
-import email.utils
+
 import iri_tweet
-
 Base = declarative_base()

 APPLICATION_NAME = "IRI_TWITTER"
@@ -23,13 +24,13 @@
     if obj is None:
         return None
     else:
-        return anyjson.serialize(obj)
+        return json.dumps(obj)


 class TweetMeta(type(Base)):

     def __init__(cls, name, bases, ns): #@NoSelf
         def init(self, **kwargs):
-            for key, value in kwargs.iteritems():
+            for key, value in kwargs.items():
                 if hasattr(self, key):
                     setattr(self, key, value)
             super(cls, self).__init__()
@@ -272,13 +273,13 @@
     session_argname = [ 'autoflush','binds', "class_", "_enable_transaction_accounting","expire_on_commit", "extension", "query_cls", "twophase", "weak_identity_map", "autocommit"]

-    kwargs_ce = dict((k, v) for k,v in kwargs.iteritems() if (k not in session_argname and k != "create_all"))
+    kwargs_ce = dict((k, v) for k,v in kwargs.items() if (k not in session_argname and k != "create_all"))

     engine = create_engine(*args, **kwargs_ce)

     if engine.name == "sqlite":
         @event.listens_for(Engine, "connect")
-        def set_sqlite_pragma(dbapi_connection, connection_record):
+        def set_sqlite_pragma(dbapi_connection, connection_record): #pylint: W0612
             cursor = dbapi_connection.cursor()
             cursor.execute("PRAGMA foreign_keys=ON")
             cursor.close()
@@ -301,4 +302,3 @@
         session.close()

     return (engine, metadata, Session)
-
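The change that recurs throughout this patch is the replacement of the third-party anyjson package with the standard-library json module. A minimal sketch of the mapping, using a made-up tweet payload; the serialize/dumps and deserialize/loads correspondence is exactly what the hunks above apply:

import json

# anyjson.serialize(obj)   ->  json.dumps(obj): Python object to JSON text
payload = json.dumps({"id": 1082657097045323776, "text": "hello"})

# anyjson.deserialize(txt) ->  json.loads(txt): JSON text back to a dict
tweet = json.loads(payload)
assert tweet["id"] == 1082657097045323776

Dropping the dependency also removes one package that would otherwise have to be vetted for Python 3 compatibility.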
diff -r efbda157eb57 -r 184372ec27e2 script/lib/iri_tweet/iri_tweet/processor.py
--- a/script/lib/iri_tweet/iri_tweet/processor.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/lib/iri_tweet/iri_tweet/processor.py Wed Jan 02 17:49:19 2019 +0100
@@ -10,10 +10,10 @@
 from iri_tweet.utils import (ObjectsBuffer, adapt_fields, fields_adapter,
                              ObjectBufferProxy, get_oauth_token, clean_keys)
 from sqlalchemy.orm import joinedload
-import anyjson
 import logging
 import twitter
 import twitter_text
+import json


 class TwitterProcessorException(Exception):
@@ -26,12 +26,12 @@
         raise TwitterProcessorException("No json")

     if json_dict is None:
-        self.json_dict = anyjson.deserialize(json_txt)
+        self.json_dict = json.loads(json_txt)
     else:
         self.json_dict = json_dict

     if not json_txt:
-        self.json_txt = anyjson.serialize(json_dict)
+        self.json_txt = json.dumps(json_dict)
     else:
         self.json_txt = json_txt
@@ -258,7 +258,7 @@

     def __process_entities(self):
         if "entities" in self.json_dict:
-            for ind_type, entity_list in self.json_dict["entities"].iteritems():
+            for ind_type, entity_list in self.json_dict["entities"].items():
                 for ind in entity_list:
                     self.__process_entity(ind, ind_type)
         else:
@@ -281,7 +281,7 @@
         status_id = self.json_dict["id"]
         log = self.session.query(TweetLog).filter(TweetLog.status_id==status_id).first()
         if(log):
-            self.obj_buffer.add_object(TweetLog, log, {'status': TweetLog.TWEET_STATUS['DELETE'], 'status_id': None})
+            self.obj_buffer.add_object(TweetLog, log, {'status': TweetLog.TWEET_STATUS['DELETE'], 'status_id': None}, False)
             self.session.query(TweetSource).filter(TweetSource.id==self.source_id).delete()
         else:
             self.__process_twitter()
@@ -350,12 +350,12 @@
             return
         tweets = self.session.query(Tweet).options(joinedload(Tweet.tweet_source)).filter(Tweet.id <= up_to_status_id)
         for t in tweets:
-            self.obj_buffer.add_object(Tweet, t, {'geo': None})
+            self.obj_buffer.add_object(Tweet, t, {'geo': None}, False)
             tsource = t.tweet_source
-            tsource_dict = anyjson.serialize(tsource.original_json)
+            tsource_dict = json.loads(tsource.original_json)
             if tsource_dict.get("geo", None):
                 tsource_dict["geo"] = None
-                self.obj_buffer.add_object(TweetSource, tsource, {'original_json': anyjson.serialize(tsource_dict)})
+                self.obj_buffer.add_object(TweetSource, tsource, {'original_json': json.dumps(tsource_dict)}, False)
         self.obj_buffer.add_object(TweetLog, None, {'tweet_source_id':self.source_id, 'status':TweetLog.TWEET_STATUS['SCRUB_GEO']}, True)

     def log_info(self):
@@ -486,7 +486,7 @@
 }

 def get_processor(tweet_dict):
-    for processor_key,processor_klass in TWEET_PROCESSOR_MAP.iteritems():
+    for processor_key,processor_klass in TWEET_PROCESSOR_MAP.items():
         if processor_key in tweet_dict:
             return processor_klass
     return None
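processor.py also converts every dict.iteritems() call to dict.items(): Python 3 removed iteritems(), and items() now returns a lazy view rather than a list, so these loops keep their old memory profile. A small illustration with a hypothetical entities dict shaped like the payloads this processor walks:

entities = {"hashtags": [{"text": "polemic"}], "urls": []}

# items() yields (key, value) pairs without building an intermediate list,
# matching what iteritems() did under Python 2.
for ind_type, entity_list in entities.items():
    print(ind_type, len(entity_list))

# The view tracks later mutations, so code that deletes keys while
# iterating must copy first: list(entities.items()).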
diff -r efbda157eb57 -r 184372ec27e2 script/lib/iri_tweet/iri_tweet/utils.py
--- a/script/lib/iri_tweet/iri_tweet/utils.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/lib/iri_tweet/iri_tweet/utils.py Wed Jan 02 17:49:19 2019 +0100
@@ -1,18 +1,21 @@
-from models import (Tweet, User, Hashtag, EntityHashtag, APPLICATION_NAME, ACCESS_TOKEN_SECRET, adapt_date, adapt_json,
-                    ACCESS_TOKEN_KEY)
-from sqlalchemy.sql import select, or_
-import Queue
 import codecs
 import datetime
 import email.utils
+import functools
 import logging
 import math
 import os.path
+import Queue
 import socket
 import sys
+
 import twitter.oauth
 import twitter.oauth_dance
+from sqlalchemy.sql import or_, select
+
+from .models import (ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET, APPLICATION_NAME,
+                     EntityHashtag, Hashtag, Tweet, User, adapt_date,
+                     adapt_json)

 CACHE_ACCESS_TOKEN = {}
@@ -160,12 +163,12 @@
             if proxy.kwargs is None or len(proxy.kwargs) == 0 or proxy.klass != klass:
                 continue
             found = True
-            for k,v in kwargs.iteritems():
+            for k,v in kwargs.items():
                 if (k not in proxy.kwargs) or v != proxy.kwargs[k]:
                     found = False
                     break
             if found:
-                return proxy
+                return proxy

         return None
@@ -239,7 +242,7 @@
     def merge_hash(l,h):
         l.extend(h.split(","))
         return l
-    htags = reduce(merge_hash, hashtags, [])
+    htags = functools.reduce(merge_hash, hashtags, [])

     query = query.filter(or_(*map(lambda h: Hashtag.text.contains(h), htags))) #@UndefinedVariable
@@ -311,17 +314,15 @@

     if writer is None:
         writer = sys.stdout
-        if sys.stdout.encoding is not None:
-            writer = codecs.getwriter(sys.stdout.encoding)(sys.stdout)

     percent = (float(current_line) / float(total_line)) * 100.0

     marks = math.floor(width * (percent / 100.0))
     spaces = math.floor(width - marks)

-    loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']'
+    loader = '[' + ('=' * int(marks)) + (' ' * int(spaces)) + ']'

-    s = u"%s %3d%% %*d/%d - %*s\r" % (loader, percent, len(str(total_line)), current_line, total_line, width, label[:width])
+    s = "%s %3d%% %*d/%d - %*s\r" % (loader, percent, len(str(total_line)), current_line, total_line, width, label[:width])

     writer.write(s)
     #takes the header into account
     if percent >= 100:
@@ -336,4 +337,3 @@
     _, port = s.getsockname()
     s.close()
     return port
-
diff -r efbda157eb57 -r 184372ec27e2 script/lib/iri_tweet/setup.py
--- a/script/lib/iri_tweet/setup.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/lib/iri_tweet/setup.py Wed Jan 02 17:49:19 2019 +0100
@@ -1,4 +1,3 @@
-#@PydevCodeAnalysisIgnore
 import sys
 import os

@@ -45,7 +44,7 @@
             if line.strip() == '# -eof meta-':
                 break
             acc.append(line)
-    for pattern, handler in pats.iteritems():
+    for pattern, handler in pats.items():
         m = pattern.match(line.strip())
         if m:
             meta.update(handler(m))
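In utils.py the hashtag filter now goes through functools.reduce, since Python 3 moved the reduce builtin into the functools module. A sketch of that fold with sample hashtag strings (the inputs are invented for illustration):

import functools

def merge_hash(l, h):
    l.extend(h.split(","))
    return l

# Fold a list of comma-separated hashtag strings into one flat list.
htags = functools.reduce(merge_hash, ["ok,ko", "ref"], [])
assert htags == ["ok", "ko", "ref"]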
diff -r efbda157eb57 -r 184372ec27e2 script/utils/export_twitter_alchemy.py
--- a/script/utils/export_twitter_alchemy.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/utils/export_twitter_alchemy.py Wed Jan 02 17:49:19 2019 +0100
@@ -1,24 +1,26 @@
 #!/usr/bin/env python
 # coding=utf-8
-from lxml import etree
-from iri_tweet.models import setup_database, Tweet, User
-from sqlalchemy import Table, Column, BigInteger, event, bindparam
-from sqlalchemy.sql import select, func
-from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
-                             get_logger)
 import argparse
-import anyjson
+import bisect
 import datetime
-import requests
+import json
 import os.path
 import re
 import sys
 import time
-import uuid #@UnresolvedImport
+import uuid  # @UnresolvedImport
+
+import requests
+from lxml import etree
+from sqlalchemy import BigInteger, Column, Table, bindparam, event
+from sqlalchemy.sql import func, select

 from dateutil.parser import parse as parse_date_raw
 from dateutil.tz import tzutc
-import bisect
+from iri_tweet.models import Tweet, User, setup_database
+from iri_tweet.utils import (get_filter_query, get_logger, set_logging,
+                             set_logging_options)

 #class TweetExclude(object):
 #    def __init__(self, id):
@@ -49,12 +51,12 @@
     parse polemics in text and return a list of polemic code. None if not polemic found
     """
     polemics = {}
-    for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
+    for m in re.finditer(r"(\+\+|\-\-|\?\?|\=\=)",tw.text):
         pol_link = {
-            '++' : u'OK',
-            '--' : u'KO',
-            '??' : u'Q',
-            '==' : u'REF'}[m.group(1)]
+            '++' : 'OK',
+            '--' : 'KO',
+            '??' : 'Q',
+            '==' : 'REF'}[m.group(1)]
         polemics[pol_link] = pol_link

     if extended_mode:
@@ -75,12 +77,12 @@
     parse polemics in text and return a list of polemic code. None if not polemic found
     """
     polemics = {}
-    for m in re.finditer("(\+\+|\!\!|\?\?|\=\=)",tw.text):
+    for m in re.finditer(r"(\+\+|\!\!|\?\?|\=\=)",tw.text):
         pol_link = {
-            '++' : u'OK',
-            '!!' : u'KO',
-            '??' : u'Q',
-            '==' : u'REF'}[m.group(1)]
+            '++' : 'OK',
+            '!!' : 'KO',
+            '??' : 'Q',
+            '==' : 'REF'}[m.group(1)]
         polemics[pol_link] = pol_link

     if extended_mode:
@@ -101,12 +103,12 @@
     parse polemics in text and return a list of polemic code. None if not polemic found
     """
     polemics = {}
-    for m in re.finditer("(\+\+|\?\?|\*\*|\=\=)",tw.text):
+    for m in re.finditer(r"(\+\+|\?\?|\*\*|\=\=)",tw.text):
        pol_link = {
-            '++' : u'OK',
-            '??' : u'KO',
-            '**' : u'REF',
-            '==' : u'Q'}[m.group(1)]
+            '++' : 'OK',
+            '??' : 'KO',
+            '**' : 'REF',
+            '==' : 'Q'}[m.group(1)]
         polemics[pol_link] = pol_link

     if extended_mode:
@@ -158,7 +160,7 @@
     parser.add_argument("-D", "--duration", dest="duration", type=int,
                         help="Duration", metavar="DURATION", default=None)
     parser.add_argument("-n", "--name", dest="name",
-                        help="Cutting name", metavar="NAME", default=u"Tweets")
+                        help="Cutting name", metavar="NAME", default="Tweets")
     parser.add_argument("-R", "--replace", dest="replace", action="store_true",
                         help="Replace tweet ensemble", default=False)
     parser.add_argument("-m", "--merge", dest="merge", action="store_true",
@@ -228,7 +230,7 @@
         sys.exit(1)

     conn_str = options.database.strip()
-    if not re.match("^\w+://.+", conn_str):
+    if not re.match(r"^\w+://.+", conn_str):
         conn_str = 'sqlite:///' + conn_str

     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
@@ -249,8 +251,8 @@

     if options.exclude and os.path.exists(options.exclude):
         with open(options.exclude, 'r+') as f:
-            tei = tweet_exclude_table.insert()
-            ex_regexp = re.compile("(?P\w+)(?P[~=])(?P.+)", re.I)
+            tei = tweet_exclude_table.insert()  # pylint: disable=E1120
+            ex_regexp = re.compile(r"(?P\w+)(?P[~=])(?P.+)", re.I)
             for line in f:
                 res = ex_regexp.match(line.strip())
                 if res:
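The regex literals above all gain an r prefix. Sequences such as \+ and \w are not valid string escapes, and recent Python 3 releases warn about them (a DeprecationWarning, later a SyntaxWarning) before the re module ever sees the pattern; raw strings pass the backslashes through untouched. A quick check of the polemic pattern:

import re

# Raw string: the backslashes reach re.compile exactly as written.
pattern = re.compile(r"(\+\+|\-\-|\?\?|\=\=)")
print(pattern.findall("++ great point ?? source =="))  # ['++', '??', '==']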
@@ -320,7 +322,7 @@
             }]
         post_param = {}
         if options.post_param:
-            post_param = anyjson.loads(options.post_param)
+            post_param = json.loads(options.post_param)

     for params in parameters:
@@ -365,15 +367,15 @@

         if root is None:

-            root = etree.Element(u"iri")
+            root = etree.Element("iri")

-            project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
+            project = etree.SubElement(root, "project", {"abstract":"Polemics Tweets","title":"Polemic Tweets", "user":"IRI Web", "id":str(uuid.uuid4())})

-            medias = etree.SubElement(root, u"medias")
-            media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
+            medias = etree.SubElement(root, "medias")
+            media = etree.SubElement(medias, "media", {"pict":"", "src":options.content, "video":options.video, "id":options.content_id, "extra":""})

-            annotations = etree.SubElement(root, u"annotations")
-            content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
+            annotations = etree.SubElement(root, "annotations")
+            content = etree.SubElement(annotations, "content", {"id":options.content_id})

             ensemble_parent = content

             content_id = options.content_id
@@ -393,14 +395,14 @@
             media_nodes = root.xpath("//media")
             if len(media_nodes) > 0:
                 media = media_nodes[0]
-            annotations_node = root.find(u"annotations")
+            annotations_node = root.find("annotations")
             if annotations_node is None:
-                annotations_node = etree.SubElement(root, u"annotations")
-            content_node = annotations_node.find(u"content")
+                annotations_node = etree.SubElement(root, "annotations")
+            content_node = annotations_node.find("content")
             if content_node is None:
-                content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
+                content_node = etree.SubElement(annotations_node,"content", id=media.get("id"))
             ensemble_parent = content_node
-            content_id = content_node.get(u"id")
+            content_id = content_node.get("id")
             display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id)
             if len(display_nodes) == 0:
                 get_logger().info("No display node found. Will not update display")
@@ -409,12 +411,12 @@
                 display_content_node = display_nodes[0]

         elif file_type == "iri":
-            body_node = root.find(u"body")
+            body_node = root.find("body")
             if body_node is None:
-                body_node = etree.SubElement(root, u"body")
-            ensembles_node = body_node.find(u"ensembles")
+                body_node = etree.SubElement(root, "body")
+            ensembles_node = body_node.find("ensembles")
             if ensembles_node is None:
-                ensembles_node = etree.SubElement(body_node, u"ensembles")
+                ensembles_node = etree.SubElement(body_node, "ensembles")
             ensemble_parent = ensembles_node
             content_id = root.xpath("head/meta[@name='id']/@content")[0]
             display_content_node = None
@@ -425,7 +427,7 @@
             sys.exit()

         if options.replace:
-            for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
+            for ens in ensemble_parent.iterchildren(tag="ensemble"):
                 ens_id = ens.get("id","")
                 if ens_id.startswith("tweet_"):
                     ensemble_parent.remove(ens)
@@ -439,22 +441,22 @@
         elements = None

         if options.merge:
-            for ens in ensemble_parent.findall(u"ensemble"):
+            for ens in ensemble_parent.findall("ensemble"):
                 if ens.get('id',"").startswith("tweet_"):
                     ensemble = ens
                     break
             if ensemble is not None:
-                elements = ensemble.find(u".//elements")
-                decoupage = ensemble.find(u"decoupage")
+                elements = ensemble.find(".//elements")
+                decoupage = ensemble.find("decoupage")

         if ensemble is None or elements is None:
-            ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
-            decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
+            ensemble = etree.SubElement(ensemble_parent, "ensemble", {"id":"tweet_" + str(uuid.uuid4()), "title":"Ensemble Twitter", "author":"IRI Web", "abstract":"Ensemble Twitter"})
+            decoupage = etree.SubElement(ensemble, "decoupage", {"id": str(uuid.uuid4()), "author": "IRI Web"})

-            etree.SubElement(decoupage, u"title").text = unicode(options.name)
-            etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
+            etree.SubElement(decoupage, "title").text = options.name
+            etree.SubElement(decoupage, "abstract").text = options.name

-            elements = etree.SubElement(decoupage, u"elements")
+            elements = etree.SubElement(decoupage, "elements")

         ensemble_id = ensemble.get('id', '')
         decoupage_id = decoupage.get('id', '') if decoupage is not None else None
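Beyond dropping the u prefixes (str is already unicode in Python 3), the node handling above is a repeated find-or-create pattern: look the child element up, and build it only when absent. A condensed sketch of the same pattern with lxml, reusing the element names from the hunks above:

from lxml import etree

root = etree.Element("iri")

# Reuse an existing <annotations> child, or create it on first use;
# find() returns None when no matching child exists.
annotations = root.find("annotations")
if annotations is None:
    annotations = etree.SubElement(root, "annotations")

print(etree.tostring(root))  # b'<iri><annotations/></iri>'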
@@ -504,28 +506,28 @@
             if not username:
                 username = "anon."

-            element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel_milli), u"dur":u"0", u"src":unicode(profile_url)})
-            etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
-            etree.SubElement(element, u"abstract").text = unicode(tw.text)
+            element = etree.SubElement(elements, "element" , {"id": "%s-%s" % (uuid.uuid4(),tw.id), "color":options.color, "author":username, "date":tweet_ts_dt.strftime("%Y/%m/%d"), "begin": str(tweet_ts_rel_milli), "dur":"0", "src":profile_url})
+            etree.SubElement(element, "title").text = username + ": " + tw.text
+            etree.SubElement(element, "abstract").text = tw.text

-            tags_node = etree.SubElement(element, u"tags")
+            tags_node = etree.SubElement(element, "tags")

             for entity in tw.entity_list:
-                if entity.type == u'entity_hashtag':
-                    etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
+                if entity.type == 'entity_hashtag':
+                    etree.SubElement(tags_node,"tag").text = entity.hashtag.text

-            meta_element = etree.SubElement(element, u'meta')
+            meta_element = etree.SubElement(element, 'meta')

-            etree.SubElement(meta_element, u"polemic_version").text = options.protocol_version
+            etree.SubElement(meta_element, "polemic_version").text = options.protocol_version

             parse_polemics = protocol_version_map.get(options.protocol_version, parse_polemics_2)
             polemics_list = parse_polemics(tw, options.extended_mode)
             if polemics_list:
-                polemics_element = etree.Element(u'polemics')
+                polemics_element = etree.Element('polemics')
                 for pol in polemics_list:
-                    etree.SubElement(polemics_element, u'polemic').text = pol
+                    etree.SubElement(polemics_element, 'polemic').text = pol
                 meta_element.append(polemics_element)

-            etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
+            etree.SubElement(meta_element, "source", attrib={"url":"http://dev.twitter.com", "mimetype":"application/json"}).text = etree.CDATA(tw.tweet_source.original_json)

         # sort by tc in
         if options.merge :
@@ -537,14 +539,14 @@
         #add to display node
         if display_content_node is not None:
             display_dec = None
-            for dec in display_content_node.iterchildren(tag=u"decoupage"):
+            for dec in display_content_node.iterchildren(tag="decoupage"):
                 if dec.get('idens','') == ensemble_id and dec.get('id', '') == decoupage_id:
                     display_dec = dec
                     break
             if display_dec is None and ensemble_id and decoupage_id:
-                etree.SubElement(display_content_node, u"decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})
+                etree.SubElement(display_content_node, "decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})

-        output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)
+        output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True).decode('utf-8')

         if content_file_write and content_file_write.find("http") == 0:
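The .decode('utf-8') appended to etree.tostring is a genuine Python 3 fix, not a cosmetic one: with a byte encoding such as "utf-8", lxml returns bytes, while the surrounding code treats output_data as str. A minimal demonstration:

from lxml import etree

root = etree.Element("iri")

raw = etree.tostring(root, encoding="utf-8", xml_declaration=True)
print(type(raw))                  # <class 'bytes'>

# Decoding restores the text type the Python 2 code used to receive.
print(type(raw.decode("utf-8")))  # <class 'str'>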
@@ -554,14 +556,14 @@

             post_param = {}
             if options.post_param:
-                post_param = anyjson.loads(options.post_param)
+                post_param = json.loads(options.post_param)

             get_logger().debug("write http " + content_file_write) #@UndefinedVariable
             get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
             get_logger().debug("write http " + repr(project)) #@UndefinedVariable
-            r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type':'application/json'}, params=post_param);
+            r = requests.put(content_file_write, data=json.dumps(project), headers={'content-type':'application/json'}, params=post_param)
             get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
-            if r.status_code != requests.codes.ok:  # @UndefinedVariable
+            if r.status_code != requests.codes.ok:  # pylint: disable=E1101
                 r.raise_for_status()
         else:
             if content_file_write and os.path.exists(content_file_write):
diff -r efbda157eb57 -r 184372ec27e2 script/utils/merge_tweets.py
--- a/script/utils/merge_tweets.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/utils/merge_tweets.py Wed Jan 02 17:49:19 2019 +0100
@@ -1,14 +1,15 @@
 #from models import setup_database
-from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
-from iri_tweet.processor import TwitterProcessorStatus
-from iri_tweet.utils import get_oauth_token, show_progress
-import anyjson
 import argparse
 import codecs
+import json
 import logging
 import re
 import sys

+from iri_tweet.models import Tweet, TweetLog, TweetSource, setup_database
+from iri_tweet.processor import TwitterProcessorStatus
+from iri_tweet.utils import get_oauth_token, show_progress
+
 logger = logging.getLogger(__name__)

 def get_option():
@@ -49,10 +50,10 @@

     #open source
     src_conn_str = options.source[0].strip()
-    if not re.match("^\w+://.+", src_conn_str):
+    if not re.match(r"^\w+://.+", src_conn_str):
         src_conn_str = 'sqlite:///' + src_conn_str
     tgt_conn_str = options.target[0].strip()
-    if not re.match("^\w+://.+", tgt_conn_str):
+    if not re.match(r"^\w+://.+", tgt_conn_str):
         tgt_conn_str = 'sqlite:///' + tgt_conn_str

@@ -66,13 +67,11 @@
         #conn_tgt = engine_tgt.connect()
         session_src = Session_src()
         session_tgt = Session_tgt()
-
-        count_tw_query = Tweet.__table__.count()  # @UndefinedVariable
-
-        count_tw = engine_src.scalar(count_tw_query)
+
+        count_tw = session_src.query(Tweet).count()

         if count_tw == 0:
-            print "No tweet to process : exit"
+            print("No tweet to process : exit")
             sys.exit()

         query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
@@ -88,7 +87,7 @@
             progress_text = u"Adding : "
             tweet_source = tweet.tweet_source.original_json

-            tweet_obj = anyjson.deserialize(tweet_source)
+            tweet_obj = json.loads(tweet_source)

             if 'text' not in tweet_obj:
                 tweet_log = TweetLog(tweet_source_id=tweet.tweet_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
                 session_tgt.add(tweet_log)
@@ -102,7 +101,7 @@
             writer = show_progress(i+1, count_tw, ptext.replace("\n",""), 70, writer)

         session_tgt.commit()
-        print u"%d new tweet added" % (added)
+        print(u"%d new tweet added" % (added,))

     finally:
         if session_tgt is not None:
@@ -113,5 +112,3 @@
             conn_tgt.close()
         if conn_src is not None:
             conn_src.close()
-
-
\ No newline at end of file
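merge_tweets.py also swaps Tweet.__table__.count(), a shortcut that newer SQLAlchemy releases deprecate and drop, for an ORM-level count on the source session. A sketch against a throwaway in-memory database, reusing this package's setup_database helper as the scripts above do:

from iri_tweet.models import Tweet, setup_database

# create_all=True builds the schema before the first query.
engine, metadata, Session = setup_database("sqlite:///:memory:", create_all=True)

session = Session()
try:
    # Emits SELECT count(*) through the ORM instead of Table.count().
    print(session.query(Tweet).count())  # 0 on a fresh database
finally:
    session.close()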
diff -r efbda157eb57 -r 184372ec27e2 script/utils/search_twitter_api.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/search_twitter_api.py Wed Jan 02 17:49:19 2019 +0100
@@ -0,0 +1,141 @@
+import argparse
+import logging
+import math
+import re
+import time
+import datetime
+import urllib
+
+from blessings import Terminal
+import requests
+import twitter
+
+from iri_tweet import models, utils
+from iri_tweet.processor import TwitterProcessorStatus
+
+import json
+
+logger = logging.getLogger(__name__)
+
+APPLICATION_NAME = "Tweet seach json"
+
+
+# TODO: implement some more parameters
+# script to "scrap twitter results"
+# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
+# pyquery cssselect
+class TweetManager:
+
+    def __init__(self, query, twitter_con):
+        self.query = query
+        self.max_id = 0
+        self.t = twitter_con
+        pass
+
+    def __iter__(self):
+        while True:
+            if self.max_id < 0:
+                break
+            json = self.get_json_response()
+
+            next_results = json['search_metadata'].get('next_results', "?")[1:]
+            self.max_id = int(urllib.parse.parse_qs(next_results).get('max_id', [-1])[0])
+
+            tweet_list = json['statuses']
+
+            if len(tweet_list) == 0:
+                break
+
+            for tweet in tweet_list:
+                yield tweet
+
+    def get_json_response(self):
+        return self.t.search.tweets(q=self.query, include_entities=True, max_id=self.max_id)
+
+
+def get_options():
+
+    usage = "usage: %(prog)s [options] "
+
+    parser = argparse.ArgumentParser(usage=usage)
+
+    parser.add_argument(dest="conn_str",
+                        help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
+    parser.add_argument("-Q", dest="query",
+                        help="query", metavar="QUERY")
+    parser.add_argument("-k", "--key", dest="consumer_key",
+                        help="Twitter consumer key", metavar="CONSUMER_KEY")
+    parser.add_argument("-s", "--secret", dest="consumer_secret",
+                        help="Twitter consumer secret", metavar="CONSUMER_SECRET")
+    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
+                        help="Token file name")
+
+    utils.set_logging_options(parser)
+
+    return parser.parse_args()
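The new TweetManager pages through the search API by pulling max_id out of search_metadata.next_results, a query string of the form ?max_id=...&q=.... Under Python 3 the parser lives in urllib.parse; note that the script's bare import urllib appears to work only because another import loads the submodule as a side effect, so importing urllib.parse explicitly is the safer spelling. A sketch with a fabricated metadata payload:

import urllib.parse

search_metadata = {"next_results": "?max_id=1082657097045323775&q=%23polemic"}

# Drop the leading '?'; parse_qs maps each key to a list of string values.
next_results = search_metadata.get("next_results", "?")[1:]
max_id = int(urllib.parse.parse_qs(next_results).get("max_id", [-1])[0])
print(max_id)  # -1 would signal that no further page exists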
+
+
+if __name__ == "__main__":
+
+    options = get_options()
+
+    utils.set_logging(options)
+
+    acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
+
+    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
+    t.secure = True
+
+    conn_str = options.conn_str.strip()
+    if not re.match(r"^\w+://.+", conn_str):
+        conn_str = 'sqlite:///' + conn_str
+
+    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
+    session = None
+
+    term = Terminal()
+
+    try:
+        session = Session()
+
+        results = None
+        print(options.query)
+
+        tm = TweetManager(options.query, t)
+
+        move_up = 0
+
+        for i,tweet in enumerate(tm):
+            # get id
+            tweet_id = tweet.get("id")
+
+            if not tweet_id:
+                continue
+
+            if move_up > 0:
+                print((move_up+1)*term.move_up())
+                move_up = 0
+
+            print ("%d: %s - %r" % (i+1, tweet_id, tweet.get("text", "") ) + term.clear_eol())
+            move_up += 1
+
+            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
+
+            if count_tweet:
+                continue
+
+            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
+            processor.process()
+            session.flush()
+            session.commit()
+
+    except twitter.api.TwitterHTTPError as e:
+        fmt = ("." + e.format) if e.format else ""
+        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))
+
+    finally:
+        if session:
+            session.close()
diff -r efbda157eb57 -r 184372ec27e2 script/utils/search_twitter_json.py
--- a/script/utils/search_twitter_json.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/utils/search_twitter_json.py Wed Jan 02 17:49:19 2019 +0100
@@ -13,7 +13,6 @@

 from iri_tweet import models, utils
 from iri_tweet.processor import TwitterProcessorStatus
-from lxml import html
 import json

 from pyquery import PyQuery
@@ -35,8 +34,6 @@

     def __iter__(self):

-        results = []
-
         while True:
             json = self.get_json_response()
             if len(json['items_html'].strip()) == 0:
@@ -51,13 +48,13 @@
         for tweetHTML in tweets:
             tweet_pq = PyQuery(tweetHTML)

-            username = tweet_pq("span.username.js-action-profile-name b").text();
-            txt = re.sub(r"\s+", " ", re.sub(r"[^\x00-\x7F]", "", tweet_pq("p.js-tweet-text").text()).replace('# ', '#').replace('@ ', '@'));
-            retweets = int(tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""));
-            favorites = int(tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""));
-            date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"));
-            id = tweet_pq.attr("data-tweet-id");
-            permalink = tweet_pq.attr("data-permalink-path");
+            username = tweet_pq("span.username.js-action-profile-name b").text()
+            txt = re.sub(r"\s+", " ", re.sub(r"[^\x00-\x7F]", "", tweet_pq("p.js-tweet-text").text()).replace('# ', '#').replace('@ ', '@'))
+            retweets = int(tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
+            favorites = int(tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
+            date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"))
+            id = tweet_pq.attr("data-tweet-id")
+            permalink = tweet_pq.attr("data-permalink-path")

             geo = ''
             geo_span = tweet_pq('span.Tweet-geo')
@@ -129,7 +126,7 @@

     options = get_options()

-    utils.set_logging(options);
+    utils.set_logging(options)

     acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)

@@ -138,7 +135,7 @@
     t.secure = True

     conn_str = options.conn_str.strip()
-    if not re.match("^\w+://.+", conn_str):
+    if not re.match(r"^\w+://.+", conn_str):
         conn_str = 'sqlite:///' + conn_str

     engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
@@ -151,7 +148,7 @@
         session = Session()

         results = None
-        print options.query
+        print(options.query)

         tm = TweetManager(options.query)

@@ -188,9 +185,9 @@
             session.flush()
             session.commit()

-            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
+            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers['X-Rate-Limit-Limit'])) + term.clear_eol())
             move_up += 1

-            rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
+            rate_limit_limit = int(tweet.headers['X-Rate-Limit-Limit'])
             rate_limit_remaining = int(tweet.rate_limit_remaining)

             if rate_limit_remaining > rate_limit_limit:
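The rate-limit lookups move from tweet.headers.getheader(...) to tweet.headers[...]: under Python 3 the response headers that urllib hands back behave as an email.message.Message-style mapping, which has no getheader() method. Assuming the twitter library exposes such a mapping (the stdlib type stands in for it here), the lookup reduces to:

from email.message import Message

headers = Message()  # stands in for the response's header mapping
headers["X-Rate-Limit-Limit"] = "180"

# Dict-style access replaces the removed getheader() call.
rate_limit_limit = int(headers["X-Rate-Limit-Limit"])
print(rate_limit_limit)  # 180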
@@ -198,7 +195,7 @@
             else:
                 time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))

-            for i in xrange(time_to_sleep):
+            for i in range(time_to_sleep):
                 if i:
                     print(2*term.move_up())
                 else:
@@ -208,7 +205,7 @@

     except twitter.api.TwitterHTTPError as e:
         fmt = ("." + e.format) if e.format else ""
-        print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))
+        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

     finally:
         if session:
diff -r efbda157eb57 -r 184372ec27e2 script/utils/tweet_twitter_user.py
--- a/script/utils/tweet_twitter_user.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/utils/tweet_twitter_user.py Wed Jan 02 17:49:19 2019 +0100
@@ -60,7 +60,7 @@
         sys.exit()

     conn_str = options.database.strip()
-    if not re.match("^\w+://.+", conn_str):
+    if not re.match(r"^\w+://.+", conn_str):
         conn_str = 'sqlite:///' + conn_str

     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)