--- a/.hgignore Fri Dec 21 12:33:01 2018 +0100
+++ b/.hgignore Wed Jan 02 17:49:19 2019 +0100
@@ -30,6 +30,8 @@
^script/lib/tweetstream/tweetstream\.egg-info$
^script/virtualenv/script/env$
^script/virtualenv/script/project-boot\.py$
+^script/.direnv
+^script/lib/iri_tweet/build
^web/event_props$
^script/utils/ghostdriver.log$
^sbin/sync/sync_live
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/.envrc Wed Jan 02 17:49:19 2019 +0100
@@ -0,0 +1,1 @@
+use pythonvenv 3.7.1+brew
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/.vscode/settings.json Wed Jan 02 17:49:19 2019 +0100
@@ -0,0 +1,4 @@
+{
+ "python.pythonPath": "/Users/ymh/dev/projects/tweet_live/script/.direnv/python-3.7.1/bin/python",
+ "python.analysis.diagnosticPublishDelay": 996
+}
\ No newline at end of file
--- a/script/lib/iri_tweet/iri_tweet/models.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/lib/iri_tweet/iri_tweet/models.py Wed Jan 02 17:49:19 2019 +0100
@@ -1,14 +1,15 @@
-from sqlalchemy import (Boolean, Column, Enum, BigInteger, Integer, String,
- ForeignKey, DateTime, create_engine, event)
+import datetime
+import email.utils
+import json
+
+from sqlalchemy import (BigInteger, Boolean, Column, DateTime, Enum,
+ ForeignKey, Integer, String, create_engine, event)
from sqlalchemy.engine import Engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker
-import anyjson
-import datetime
-import email.utils
+
import iri_tweet
-
Base = declarative_base()
APPLICATION_NAME = "IRI_TWITTER"
@@ -23,13 +24,13 @@
if obj is None:
return None
else:
- return anyjson.serialize(obj)
+ return json.dumps(obj)
class TweetMeta(type(Base)):
def __init__(cls, name, bases, ns): #@NoSelf
def init(self, **kwargs):
- for key, value in kwargs.iteritems():
+ for key, value in kwargs.items():
if hasattr(self, key):
setattr(self, key, value)
super(cls, self).__init__()
@@ -272,13 +273,13 @@
session_argname = [ 'autoflush','binds', "class_", "_enable_transaction_accounting","expire_on_commit", "extension", "query_cls", "twophase", "weak_identity_map", "autocommit"]
- kwargs_ce = dict((k, v) for k,v in kwargs.iteritems() if (k not in session_argname and k != "create_all"))
+ kwargs_ce = dict((k, v) for k,v in kwargs.items() if (k not in session_argname and k != "create_all"))
engine = create_engine(*args, **kwargs_ce)
if engine.name == "sqlite":
@event.listens_for(Engine, "connect")
- def set_sqlite_pragma(dbapi_connection, connection_record):
+        def set_sqlite_pragma(dbapi_connection, connection_record):  # pylint: disable=W0612
cursor = dbapi_connection.cursor()
cursor.execute("PRAGMA foreign_keys=ON")
cursor.close()
@@ -301,4 +302,3 @@
session.close()
return (engine, metadata, Session)
-
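
Side note on the sqlite branch in this file: SQLite ships with foreign-key enforcement off, and the PRAGMA only lasts for one connection, so a SQLAlchemy "connect" listener re-issues it on every pooled connection. A minimal self-contained sketch of that pattern (stand-alone, not this module's code):

    from sqlalchemy import create_engine, event
    from sqlalchemy.engine import Engine

    @event.listens_for(Engine, "connect")
    def set_sqlite_pragma(dbapi_connection, connection_record):
        # runs once per new DBAPI connection; SQLite scopes PRAGMAs per connection
        cursor = dbapi_connection.cursor()
        cursor.execute("PRAGMA foreign_keys=ON")
        cursor.close()

    engine = create_engine("sqlite:///:memory:")
    with engine.connect():
        pass  # foreign keys are now enforced on this connection

Note the listener is registered on the Engine class, so it fires for every engine, not just SQLite ones; the pylint pragma above merely silences the unused-name warning for the nested function.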
--- a/script/lib/iri_tweet/iri_tweet/processor.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/lib/iri_tweet/iri_tweet/processor.py Wed Jan 02 17:49:19 2019 +0100
@@ -10,10 +10,10 @@
from iri_tweet.utils import (ObjectsBuffer, adapt_fields, fields_adapter,
ObjectBufferProxy, get_oauth_token, clean_keys)
from sqlalchemy.orm import joinedload
-import anyjson
+import json
 import logging
 import twitter
 import twitter_text
class TwitterProcessorException(Exception):
@@ -26,12 +26,12 @@
raise TwitterProcessorException("No json")
if json_dict is None:
- self.json_dict = anyjson.deserialize(json_txt)
+ self.json_dict = json.loads(json_txt)
else:
self.json_dict = json_dict
if not json_txt:
- self.json_txt = anyjson.serialize(json_dict)
+ self.json_txt = json.dumps(json_dict)
else:
self.json_txt = json_txt
@@ -258,7 +258,7 @@
def __process_entities(self):
if "entities" in self.json_dict:
- for ind_type, entity_list in self.json_dict["entities"].iteritems():
+ for ind_type, entity_list in self.json_dict["entities"].items():
for ind in entity_list:
self.__process_entity(ind, ind_type)
else:
@@ -281,7 +281,7 @@
status_id = self.json_dict["id"]
log = self.session.query(TweetLog).filter(TweetLog.status_id==status_id).first()
if(log):
- self.obj_buffer.add_object(TweetLog, log, {'status': TweetLog.TWEET_STATUS['DELETE'], 'status_id': None})
+ self.obj_buffer.add_object(TweetLog, log, {'status': TweetLog.TWEET_STATUS['DELETE'], 'status_id': None}, False)
self.session.query(TweetSource).filter(TweetSource.id==self.source_id).delete()
else:
self.__process_twitter()
@@ -350,12 +350,12 @@
return
tweets = self.session.query(Tweet).options(joinedload(Tweet.tweet_source)).filter(Tweet.id <= up_to_status_id)
for t in tweets:
- self.obj_buffer.add_object(Tweet, t, {'geo': None})
+ self.obj_buffer.add_object(Tweet, t, {'geo': None}, False)
tsource = t.tweet_source
- tsource_dict = anyjson.serialize(tsource.original_json)
+ tsource_dict = json.loads(tsource.original_json)
if tsource_dict.get("geo", None):
tsource_dict["geo"] = None
- self.obj_buffer.add_object(TweetSource, tsource, {'original_json': anyjson.serialize(tsource_dict)})
+ self.obj_buffer.add_object(TweetSource, tsource, {'original_json': json.dumps(tsource_dict)}, False)
self.obj_buffer.add_object(TweetLog, None, {'tweet_source_id':self.source_id, 'status':TweetLog.TWEET_STATUS['SCRUB_GEO']}, True)
def log_info(self):
@@ -486,7 +486,7 @@
}
def get_processor(tweet_dict):
- for processor_key,processor_klass in TWEET_PROCESSOR_MAP.iteritems():
+ for processor_key,processor_klass in TWEET_PROCESSOR_MAP.items():
if processor_key in tweet_dict:
return processor_klass
return None
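
For context, get_processor() dispatches on whichever top-level key the streaming payload carries (a "delete" or "scrub_geo" envelope versus a regular status), so .iteritems() → .items() is the whole Python 3 change here. A toy reproduction with illustrative handler names (not the real TWEET_PROCESSOR_MAP values):

    PROCESSOR_MAP = {
        "delete": "TwitterProcessorDelete",
        "scrub_geo": "TwitterProcessorScrubGeo",
        "text": "TwitterProcessorStatus",
    }

    def get_processor(tweet_dict):
        for processor_key, processor_klass in PROCESSOR_MAP.items():
            if processor_key in tweet_dict:
                return processor_klass
        return None

    assert get_processor({"delete": {"status": {"id": 1}}}) == "TwitterProcessorDelete"
    assert get_processor({"friends": []}) is None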
--- a/script/lib/iri_tweet/iri_tweet/utils.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/lib/iri_tweet/iri_tweet/utils.py Wed Jan 02 17:49:19 2019 +0100
@@ -1,18 +1,21 @@
-from models import (Tweet, User, Hashtag, EntityHashtag, APPLICATION_NAME, ACCESS_TOKEN_SECRET, adapt_date, adapt_json,
- ACCESS_TOKEN_KEY)
-from sqlalchemy.sql import select, or_
-import Queue
import codecs
import datetime
import email.utils
+import functools
import logging
import math
import os.path
+import queue as Queue  # Py3 stdlib rename; alias keeps existing Queue.* call sites working
import socket
import sys
+
import twitter.oauth
import twitter.oauth_dance
+from sqlalchemy.sql import or_, select
+from .models import (ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET, APPLICATION_NAME,
+ EntityHashtag, Hashtag, Tweet, User, adapt_date,
+ adapt_json)
CACHE_ACCESS_TOKEN = {}
@@ -160,12 +163,12 @@
if proxy.kwargs is None or len(proxy.kwargs) == 0 or proxy.klass != klass:
continue
found = True
- for k,v in kwargs.iteritems():
+ for k,v in kwargs.items():
if (k not in proxy.kwargs) or v != proxy.kwargs[k]:
found = False
break
if found:
- return proxy
+ return proxy
return None
@@ -239,7 +242,7 @@
def merge_hash(l,h):
l.extend(h.split(","))
return l
- htags = reduce(merge_hash, hashtags, [])
+ htags = functools.reduce(merge_hash, hashtags, [])
query = query.filter(or_(*map(lambda h: Hashtag.text.contains(h), htags))) #@UndefinedVariable
@@ -311,17 +314,15 @@
if writer is None:
writer = sys.stdout
- if sys.stdout.encoding is not None:
- writer = codecs.getwriter(sys.stdout.encoding)(sys.stdout)
percent = (float(current_line) / float(total_line)) * 100.0
marks = math.floor(width * (percent / 100.0))
spaces = math.floor(width - marks)
- loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']'
+ loader = '[' + ('=' * int(marks)) + (' ' * int(spaces)) + ']'
- s = u"%s %3d%% %*d/%d - %*s\r" % (loader, percent, len(str(total_line)), current_line, total_line, width, label[:width])
+ s = "%s %3d%% %*d/%d - %*s\r" % (loader, percent, len(str(total_line)), current_line, total_line, width, label[:width])
writer.write(s) #takes the header into account
if percent >= 100:
@@ -336,4 +337,3 @@
_, port = s.getsockname()
s.close()
return port
-
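
Python 3 drops the reduce builtin, hence the functools import added to this file. A minimal reproduction of the hashtag flattening it powers in the filter-query helper:

    import functools

    def merge_hash(l, h):
        l.extend(h.split(","))
        return l

    # two stored hashtag strings, each possibly comma-separated
    htags = functools.reduce(merge_hash, ["occupy,ows", "iri"], [])
    assert htags == ["occupy", "ows", "iri"]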
--- a/script/lib/iri_tweet/setup.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/lib/iri_tweet/setup.py Wed Jan 02 17:49:19 2019 +0100
@@ -1,4 +1,3 @@
-#@PydevCodeAnalysisIgnore
import sys
import os
@@ -45,7 +44,7 @@
if line.strip() == '# -eof meta-':
break
acc.append(line)
- for pattern, handler in pats.iteritems():
+ for pattern, handler in pats.items():
m = pattern.match(line.strip())
if m:
meta.update(handler(m))
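
The loop above scans a file's header for metadata until the '# -eof meta-' sentinel, letting each matching pattern's handler update the meta dict. A self-contained sketch under an assumed pattern set (setup.py's real patterns are not shown in this hunk):

    import re

    pats = {
        re.compile(r"__version__\s*=\s*'(?P<version>[^']+)'"):
            lambda m: {"version": m.group("version")},
    }

    meta, acc = {}, []
    for line in ["__version__ = '0.1'", "# -eof meta-", "ignored = True"]:
        if line.strip() == '# -eof meta-':
            break
        acc.append(line)
        for pattern, handler in pats.items():
            m = pattern.match(line.strip())
            if m:
                meta.update(handler(m))

    assert meta == {"version": "0.1"}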
--- a/script/utils/export_twitter_alchemy.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/utils/export_twitter_alchemy.py Wed Jan 02 17:49:19 2019 +0100
@@ -1,24 +1,26 @@
#!/usr/bin/env python
# coding=utf-8
-from lxml import etree
-from iri_tweet.models import setup_database, Tweet, User
-from sqlalchemy import Table, Column, BigInteger, event, bindparam
-from sqlalchemy.sql import select, func
-from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
- get_logger)
import argparse
-import anyjson
+import bisect
import datetime
-import requests
+import json
import os.path
import re
import sys
import time
-import uuid #@UnresolvedImport
+import uuid # @UnresolvedImport
+
+import requests
+from lxml import etree
+from sqlalchemy import BigInteger, Column, Table, bindparam, event
+from sqlalchemy.sql import func, select
+
from dateutil.parser import parse as parse_date_raw
from dateutil.tz import tzutc
-import bisect
+from iri_tweet.models import Tweet, User, setup_database
+from iri_tweet.utils import (get_filter_query, get_logger, set_logging,
+ set_logging_options)
#class TweetExclude(object):
# def __init__(self, id):
@@ -49,12 +51,12 @@
parse polemics in text and return a list of polemic code. None if not polemic found
"""
polemics = {}
- for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
+ for m in re.finditer(r"(\+\+|\-\-|\?\?|\=\=)",tw.text):
pol_link = {
- '++' : u'OK',
- '--' : u'KO',
- '??' : u'Q',
- '==' : u'REF'}[m.group(1)]
+ '++' : 'OK',
+ '--' : 'KO',
+ '??' : 'Q',
+ '==' : 'REF'}[m.group(1)]
polemics[pol_link] = pol_link
if extended_mode:
@@ -75,12 +77,12 @@
parse polemics in text and return a list of polemic code. None if not polemic found
"""
polemics = {}
- for m in re.finditer("(\+\+|\!\!|\?\?|\=\=)",tw.text):
+ for m in re.finditer(r"(\+\+|\!\!|\?\?|\=\=)",tw.text):
pol_link = {
- '++' : u'OK',
- '!!' : u'KO',
- '??' : u'Q',
- '==' : u'REF'}[m.group(1)]
+ '++' : 'OK',
+ '!!' : 'KO',
+ '??' : 'Q',
+ '==' : 'REF'}[m.group(1)]
polemics[pol_link] = pol_link
if extended_mode:
@@ -101,12 +103,12 @@
parse polemics in text and return a list of polemic code. None if not polemic found
"""
polemics = {}
- for m in re.finditer("(\+\+|\?\?|\*\*|\=\=)",tw.text):
+ for m in re.finditer(r"(\+\+|\?\?|\*\*|\=\=)",tw.text):
pol_link = {
- '++' : u'OK',
- '??' : u'KO',
- '**' : u'REF',
- '==' : u'Q'}[m.group(1)]
+ '++' : 'OK',
+ '??' : 'KO',
+ '**' : 'REF',
+ '==' : 'Q'}[m.group(1)]
polemics[pol_link] = pol_link
if extended_mode:
@@ -158,7 +160,7 @@
parser.add_argument("-D", "--duration", dest="duration", type=int,
help="Duration", metavar="DURATION", default=None)
parser.add_argument("-n", "--name", dest="name",
- help="Cutting name", metavar="NAME", default=u"Tweets")
+ help="Cutting name", metavar="NAME", default="Tweets")
parser.add_argument("-R", "--replace", dest="replace", action="store_true",
help="Replace tweet ensemble", default=False)
parser.add_argument("-m", "--merge", dest="merge", action="store_true",
@@ -228,7 +230,7 @@
sys.exit(1)
conn_str = options.database.strip()
- if not re.match("^\w+://.+", conn_str):
+ if not re.match(r"^\w+://.+", conn_str):
conn_str = 'sqlite:///' + conn_str
engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
@@ -249,8 +251,8 @@
if options.exclude and os.path.exists(options.exclude):
with open(options.exclude, 'r+') as f:
- tei = tweet_exclude_table.insert()
- ex_regexp = re.compile("(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)
+ tei = tweet_exclude_table.insert() # pylint: disable=E1120
+ ex_regexp = re.compile(r"(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)
for line in f:
res = ex_regexp.match(line.strip())
if res:
@@ -320,7 +322,7 @@
}]
post_param = {}
if options.post_param:
- post_param = anyjson.loads(options.post_param)
+ post_param = json.loads(options.post_param)
for params in parameters:
@@ -365,15 +367,15 @@
if root is None:
- root = etree.Element(u"iri")
+ root = etree.Element("iri")
- project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
+ project = etree.SubElement(root, "project", {"abstract":"Polemics Tweets","title":"Polemic Tweets", "user":"IRI Web", "id":str(uuid.uuid4())})
- medias = etree.SubElement(root, u"medias")
- media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
+ medias = etree.SubElement(root, "medias")
+ media = etree.SubElement(medias, "media", {"pict":"", "src":options.content, "video":options.video, "id":options.content_id, "extra":""})
- annotations = etree.SubElement(root, u"annotations")
- content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
+ annotations = etree.SubElement(root, "annotations")
+ content = etree.SubElement(annotations, "content", {"id":options.content_id})
ensemble_parent = content
content_id = options.content_id
@@ -393,14 +395,14 @@
media_nodes = root.xpath("//media")
if len(media_nodes) > 0:
media = media_nodes[0]
- annotations_node = root.find(u"annotations")
+ annotations_node = root.find("annotations")
if annotations_node is None:
- annotations_node = etree.SubElement(root, u"annotations")
- content_node = annotations_node.find(u"content")
+ annotations_node = etree.SubElement(root, "annotations")
+ content_node = annotations_node.find("content")
if content_node is None:
- content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
+ content_node = etree.SubElement(annotations_node,"content", id=media.get("id"))
ensemble_parent = content_node
- content_id = content_node.get(u"id")
+ content_id = content_node.get("id")
display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id)
if len(display_nodes) == 0:
get_logger().info("No display node found. Will not update display")
@@ -409,12 +411,12 @@
display_content_node = display_nodes[0]
elif file_type == "iri":
- body_node = root.find(u"body")
+ body_node = root.find("body")
if body_node is None:
- body_node = etree.SubElement(root, u"body")
- ensembles_node = body_node.find(u"ensembles")
+ body_node = etree.SubElement(root, "body")
+ ensembles_node = body_node.find("ensembles")
if ensembles_node is None:
- ensembles_node = etree.SubElement(body_node, u"ensembles")
+ ensembles_node = etree.SubElement(body_node, "ensembles")
ensemble_parent = ensembles_node
content_id = root.xpath("head/meta[@name='id']/@content")[0]
display_content_node = None
@@ -425,7 +427,7 @@
sys.exit()
if options.replace:
- for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
+ for ens in ensemble_parent.iterchildren(tag="ensemble"):
ens_id = ens.get("id","")
if ens_id.startswith("tweet_"):
ensemble_parent.remove(ens)
@@ -439,22 +441,22 @@
elements = None
if options.merge:
- for ens in ensemble_parent.findall(u"ensemble"):
+ for ens in ensemble_parent.findall("ensemble"):
if ens.get('id',"").startswith("tweet_"):
ensemble = ens
break
if ensemble is not None:
- elements = ensemble.find(u".//elements")
- decoupage = ensemble.find(u"decoupage")
+ elements = ensemble.find(".//elements")
+ decoupage = ensemble.find("decoupage")
if ensemble is None or elements is None:
- ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
- decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
+ ensemble = etree.SubElement(ensemble_parent, "ensemble", {"id":"tweet_" + str(uuid.uuid4()), "title":"Ensemble Twitter", "author":"IRI Web", "abstract":"Ensemble Twitter"})
+ decoupage = etree.SubElement(ensemble, "decoupage", {"id": str(uuid.uuid4()), "author": "IRI Web"})
- etree.SubElement(decoupage, u"title").text = unicode(options.name)
- etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
+ etree.SubElement(decoupage, "title").text = options.name
+ etree.SubElement(decoupage, "abstract").text = options.name
- elements = etree.SubElement(decoupage, u"elements")
+ elements = etree.SubElement(decoupage, "elements")
ensemble_id = ensemble.get('id', '')
decoupage_id = decoupage.get('id', '') if decoupage is not None else None
@@ -504,28 +506,28 @@
if not username:
username = "anon."
- element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel_milli), u"dur":u"0", u"src":unicode(profile_url)})
- etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
- etree.SubElement(element, u"abstract").text = unicode(tw.text)
+ element = etree.SubElement(elements, "element" , {"id": "%s-%s" % (uuid.uuid4(),tw.id), "color":options.color, "author":username, "date":tweet_ts_dt.strftime("%Y/%m/%d"), "begin": str(tweet_ts_rel_milli), "dur":"0", "src":profile_url})
+ etree.SubElement(element, "title").text = username + ": " + tw.text
+ etree.SubElement(element, "abstract").text = tw.text
- tags_node = etree.SubElement(element, u"tags")
+ tags_node = etree.SubElement(element, "tags")
for entity in tw.entity_list:
- if entity.type == u'entity_hashtag':
- etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
+ if entity.type == 'entity_hashtag':
+ etree.SubElement(tags_node,"tag").text = entity.hashtag.text
- meta_element = etree.SubElement(element, u'meta')
+ meta_element = etree.SubElement(element, 'meta')
- etree.SubElement(meta_element, u"polemic_version").text = options.protocol_version
+ etree.SubElement(meta_element, "polemic_version").text = options.protocol_version
parse_polemics = protocol_version_map.get(options.protocol_version, parse_polemics_2)
polemics_list = parse_polemics(tw, options.extended_mode)
if polemics_list:
- polemics_element = etree.Element(u'polemics')
+ polemics_element = etree.Element('polemics')
for pol in polemics_list:
- etree.SubElement(polemics_element, u'polemic').text = pol
+ etree.SubElement(polemics_element, 'polemic').text = pol
meta_element.append(polemics_element)
- etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
+ etree.SubElement(meta_element, "source", attrib={"url":"http://dev.twitter.com", "mimetype":"application/json"}).text = etree.CDATA(tw.tweet_source.original_json)
# sort by tc in
if options.merge :
@@ -537,14 +539,14 @@
#add to display node
if display_content_node is not None:
display_dec = None
- for dec in display_content_node.iterchildren(tag=u"decoupage"):
+ for dec in display_content_node.iterchildren(tag="decoupage"):
if dec.get('idens','') == ensemble_id and dec.get('id', '') == decoupage_id:
display_dec = dec
break
if display_dec is None and ensemble_id and decoupage_id:
- etree.SubElement(display_content_node, u"decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})
+ etree.SubElement(display_content_node, "decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})
- output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)
+ output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True).decode('utf-8')
if content_file_write and content_file_write.find("http") == 0:
@@ -554,14 +556,14 @@
post_param = {}
if options.post_param:
- post_param = anyjson.loads(options.post_param)
+ post_param = json.loads(options.post_param)
get_logger().debug("write http " + content_file_write) #@UndefinedVariable
get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
get_logger().debug("write http " + repr(project)) #@UndefinedVariable
- r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type':'application/json'}, params=post_param);
+ r = requests.put(content_file_write, data=json.dumps(project), headers={'content-type':'application/json'}, params=post_param)
get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
- if r.status_code != requests.codes.ok: # @UndefinedVariable
+ if r.status_code != requests.codes.ok: # pylint: disable=E1101
r.raise_for_status()
else:
if content_file_write and os.path.exists(content_file_write):
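
The three parse_polemics variants in this file differ only in their marker-to-code tables ('--', '!!' or '??' mapping to KO, and so on); the r-prefix fix keeps the regex escapes from being misread as string escapes. A toy reproduction of the first mapping:

    import re

    POLEMIC_MAP = {'++': 'OK', '--': 'KO', '??': 'Q', '==': 'REF'}

    def parse_polemics(text):
        polemics = {}
        for m in re.finditer(r"(\+\+|\-\-|\?\?|\=\=)", text):
            code = POLEMIC_MAP[m.group(1)]
            polemics[code] = code  # dict keeps each code once
        return list(polemics.values()) or None

    assert parse_polemics("agreed ++ though ?? remains") == ['OK', 'Q']
    assert parse_polemics("no markers") is None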
--- a/script/utils/merge_tweets.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/utils/merge_tweets.py Wed Jan 02 17:49:19 2019 +0100
@@ -1,14 +1,15 @@
#from models import setup_database
-from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
-from iri_tweet.processor import TwitterProcessorStatus
-from iri_tweet.utils import get_oauth_token, show_progress
-import anyjson
import argparse
import codecs
+import json
import logging
import re
import sys
+from iri_tweet.models import Tweet, TweetLog, TweetSource, setup_database
+from iri_tweet.processor import TwitterProcessorStatus
+from iri_tweet.utils import get_oauth_token, show_progress
+
logger = logging.getLogger(__name__)
def get_option():
@@ -49,10 +50,10 @@
#open source
src_conn_str = options.source[0].strip()
- if not re.match("^\w+://.+", src_conn_str):
+ if not re.match(r"^\w+://.+", src_conn_str):
src_conn_str = 'sqlite:///' + src_conn_str
tgt_conn_str = options.target[0].strip()
- if not re.match("^\w+://.+", tgt_conn_str):
+ if not re.match(r"^\w+://.+", tgt_conn_str):
tgt_conn_str = 'sqlite:///' + tgt_conn_str
@@ -66,13 +67,11 @@
#conn_tgt = engine_tgt.connect()
session_src = Session_src()
session_tgt = Session_tgt()
-
- count_tw_query = Tweet.__table__.count() # @UndefinedVariable
-
- count_tw = engine_src.scalar(count_tw_query)
+
+ count_tw = session_src.query(Tweet).count()
if count_tw == 0:
- print "No tweet to process : exit"
+ print("No tweet to process : exit")
sys.exit()
query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
@@ -88,7 +87,7 @@
progress_text = u"Adding : "
tweet_source = tweet.tweet_source.original_json
- tweet_obj = anyjson.deserialize(tweet_source)
+ tweet_obj = json.loads(tweet_source)
if 'text' not in tweet_obj:
tweet_log = TweetLog(tweet_source_id=tweet.tweet_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
session_tgt.add(tweet_log)
@@ -102,7 +101,7 @@
writer = show_progress(i+1, count_tw, ptext.replace("\n",""), 70, writer)
session_tgt.commit()
- print u"%d new tweet added" % (added)
+ print(u"%d new tweet added" % (added,))
finally:
if session_tgt is not None:
@@ -113,5 +112,3 @@
conn_tgt.close()
if conn_src is not None:
conn_src.close()
-
-
\ No newline at end of file
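
SQLAlchemy deprecated and later removed Table.count(), which is why the hunk above swaps Tweet.__table__.count() for an ORM-level count. A self-contained sketch with a stand-in model (not the project's Tweet):

    from sqlalchemy import Column, Integer, create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import sessionmaker

    Base = declarative_base()

    class Item(Base):  # stand-in for iri_tweet.models.Tweet
        __tablename__ = "item"
        id = Column(Integer, primary_key=True)

    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    assert session.query(Item).count() == 0  # SELECT count(*) under the hood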
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/search_twitter_api.py Wed Jan 02 17:49:19 2019 +0100
@@ -0,0 +1,133 @@
+import argparse
+import logging
+import re
+import urllib.parse
+
+from blessings import Terminal
+import twitter
+
+from iri_tweet import models, utils
+from iri_tweet.processor import TwitterProcessorStatus
+
+logger = logging.getLogger(__name__)
+
+APPLICATION_NAME = "Tweet search json"
+
+
+# TODO: implement some more parameters
+# script to "scrape twitter results"
+# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
+# pyquery cssselect
+class TweetManager:
+
+ def __init__(self, query, twitter_con):
+ self.query = query
+ self.max_id = 0
+ self.t = twitter_con
+
+ def __iter__(self):
+ while True:
+ if self.max_id < 0:
+ break
+            response = self.get_json_response()
+
+            next_results = response['search_metadata'].get('next_results', "?")[1:]
+ self.max_id = int(urllib.parse.parse_qs(next_results).get('max_id', [-1])[0])
+
+            tweet_list = response['statuses']
+
+ if len(tweet_list) == 0:
+ break
+
+ for tweet in tweet_list:
+ yield tweet
+
+ def get_json_response(self):
+ return self.t.search.tweets(q=self.query, include_entities=True, max_id=self.max_id)
+
+
+def get_options():
+
+ usage = "usage: %(prog)s [options] <connection_str_or_filepath>"
+
+ parser = argparse.ArgumentParser(usage=usage)
+
+ parser.add_argument(dest="conn_str",
+ help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
+ parser.add_argument("-Q", dest="query",
+ help="query", metavar="QUERY")
+ parser.add_argument("-k", "--key", dest="consumer_key",
+ help="Twitter consumer key", metavar="CONSUMER_KEY")
+ parser.add_argument("-s", "--secret", dest="consumer_secret",
+ help="Twitter consumer secret", metavar="CONSUMER_SECRET")
+ parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
+ help="Token file name")
+
+ utils.set_logging_options(parser)
+
+ return parser.parse_args()
+
+
+
+if __name__ == "__main__":
+
+ options = get_options()
+
+ utils.set_logging(options)
+
+
+    access_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
+
+    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(access_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
+ t.secure = True
+
+ conn_str = options.conn_str.strip()
+ if not re.match(r"^\w+://.+", conn_str):
+ conn_str = 'sqlite:///' + conn_str
+
+ engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
+ session = None
+
+
+ term = Terminal()
+
+ try:
+ session = Session()
+
+ print(options.query)
+
+ tm = TweetManager(options.query, t)
+
+ move_up = 0
+
+ for i,tweet in enumerate(tm):
+ # get id
+ tweet_id = tweet.get("id")
+
+ if not tweet_id:
+ continue
+
+ if move_up > 0:
+ print((move_up+1)*term.move_up())
+ move_up = 0
+
+            print("%d: %s - %r" % (i+1, tweet_id, tweet.get("text", "")) + term.clear_eol())
+ move_up += 1
+
+ count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
+
+ if count_tweet:
+ continue
+
+ processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
+ processor.process()
+ session.flush()
+ session.commit()
+
+ except twitter.api.TwitterHTTPError as e:
+ fmt = ("." + e.format) if e.format else ""
+ print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))
+
+ finally:
+ if session:
+ session.close()
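
TweetManager pages the search API by lifting max_id out of search_metadata's next_results value, a query string such as "?max_id=123&q=...", and stops on the -1 sentinel when no further page is advertised. That extraction in isolation:

    import urllib.parse

    def extract_next_max_id(search_metadata):
        # next_results starts with '?'; strip it, then parse as a query string
        next_results = search_metadata.get('next_results', "?")[1:]
        return int(urllib.parse.parse_qs(next_results).get('max_id', [-1])[0])

    assert extract_next_max_id({"next_results": "?max_id=1066&q=%23iri"}) == 1066
    assert extract_next_max_id({}) == -1  # no more pages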
--- a/script/utils/search_twitter_json.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/utils/search_twitter_json.py Wed Jan 02 17:49:19 2019 +0100
@@ -13,7 +13,6 @@
from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus
-from lxml import html
import json
from pyquery import PyQuery
@@ -35,8 +34,6 @@
def __iter__(self):
- results = []
-
while True:
json = self.get_json_response()
if len(json['items_html'].strip()) == 0:
@@ -51,13 +48,13 @@
for tweetHTML in tweets:
tweet_pq = PyQuery(tweetHTML)
- username = tweet_pq("span.username.js-action-profile-name b").text();
- txt = re.sub(r"\s+", " ", re.sub(r"[^\x00-\x7F]", "", tweet_pq("p.js-tweet-text").text()).replace('# ', '#').replace('@ ', '@'));
- retweets = int(tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""));
- favorites = int(tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""));
- date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"));
- id = tweet_pq.attr("data-tweet-id");
- permalink = tweet_pq.attr("data-permalink-path");
+ username = tweet_pq("span.username.js-action-profile-name b").text()
+ txt = re.sub(r"\s+", " ", re.sub(r"[^\x00-\x7F]", "", tweet_pq("p.js-tweet-text").text()).replace('# ', '#').replace('@ ', '@'))
+ retweets = int(tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
+ favorites = int(tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
+ date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"))
+ id = tweet_pq.attr("data-tweet-id")
+ permalink = tweet_pq.attr("data-permalink-path")
geo = ''
geo_span = tweet_pq('span.Tweet-geo')
@@ -129,7 +126,7 @@
options = get_options()
- utils.set_logging(options);
+ utils.set_logging(options)
acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
@@ -138,7 +135,7 @@
t.secure = True
conn_str = options.conn_str.strip()
- if not re.match("^\w+://.+", conn_str):
+ if not re.match(r"^\w+://.+", conn_str):
conn_str = 'sqlite:///' + conn_str
engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
@@ -151,7 +148,7 @@
session = Session()
results = None
- print options.query
+ print(options.query)
tm = TweetManager(options.query)
@@ -188,9 +185,9 @@
session.flush()
session.commit()
- print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
+ print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers['X-Rate-Limit-Limit'])) + term.clear_eol())
move_up += 1
- rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
+ rate_limit_limit = int(tweet.headers['X-Rate-Limit-Limit'])
rate_limit_remaining = int(tweet.rate_limit_remaining)
if rate_limit_remaining > rate_limit_limit:
@@ -198,7 +195,7 @@
else:
time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))
- for i in xrange(time_to_sleep):
+ for i in range(time_to_sleep):
if i:
print(2*term.move_up())
else:
@@ -208,7 +205,7 @@
except twitter.api.TwitterHTTPError as e:
fmt = ("." + e.format) if e.format else ""
- print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))
+ print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))
finally:
if session:
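
The pacing block above spreads the time left in the rate-limit window across the calls the API still allows (xrange → range being the only Python 3 change). The arithmetic as a pure function for clarity:

    import math

    def seconds_per_call(rate_limit_reset, rate_limit_remaining, now):
        # remaining window, split evenly across the remaining allowed requests
        return int(math.ceil((rate_limit_reset - now) / rate_limit_remaining))

    # 300 s left in the window with 100 calls remaining -> pause 3 s per call
    assert seconds_per_call(1000 + 300, 100, 1000) == 3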
--- a/script/utils/tweet_twitter_user.py Fri Dec 21 12:33:01 2018 +0100
+++ b/script/utils/tweet_twitter_user.py Wed Jan 02 17:49:19 2019 +0100
@@ -60,7 +60,7 @@
sys.exit()
conn_str = options.database.strip()
- if not re.match("^\w+://.+", conn_str):
+ if not re.match(r"^\w+://.+", conn_str):
conn_str = 'sqlite:///' + conn_str
engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
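
A note on the r-prefix changes repeated throughout these hunks: in a plain string, "\w" is an invalid escape sequence (a DeprecationWarning since Python 3.6, slated to become an error), so raw strings keep the regexes intact. The connection-string normalization they all guard, in isolation:

    import re

    def normalize_conn_str(conn_str):
        # bare file paths become SQLite URLs; real URLs pass through untouched
        if not re.match(r"^\w+://.+", conn_str):
            conn_str = 'sqlite:///' + conn_str
        return conn_str

    assert normalize_conn_str("tweets.db") == "sqlite:///tweets.db"
    assert normalize_conn_str("postgresql://user@host/db") == "postgresql://user@host/db"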