Adapt recorder_stream to Python 3
Improve Twitter authentication management
Use OAuth2 where possible
Delete old script
import argparse
import logging
import math
import re
import time
import datetime
from blessings import Terminal
import requests
import twitter
from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus
from pyquery import PyQuery
logger = logging.getLogger(__name__)
APPLICATION_NAME = "Tweet search json"
# TODO: implement some more parameters (username, since, until, ...)
# Script to scrape Twitter search results.
# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
# Requires: pyquery (with cssselect)
class TweetManager:
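    """Iterator over tweets scraped from Twitter's web search timeline for a given query."""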
    def __init__(self, query):
        self.query = query
        self.refresh_cursor = ''
def __iter__(self):
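        # Page through the search results, following the min_position cursor,
        # until Twitter returns an empty page.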
while True:
            response_json = self.get_json_response()
            if len(response_json['items_html'].strip()) == 0:
                break
            self.refresh_cursor = response_json['min_position']
            tweets = PyQuery(response_json['items_html'])('div.js-stream-tweet')
if len(tweets) == 0:
break
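            # Extract the fields of interest from each tweet's rendered HTML.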
for tweetHTML in tweets:
tweet_pq = PyQuery(tweetHTML)
username = tweet_pq("span.username.js-action-profile-name b").text()
txt = re.sub(r"\s+", " ", re.sub(r"[^\x00-\x7F]", "", tweet_pq("p.js-tweet-text").text()).replace('# ', '#').replace('@ ', '@'))
retweets = int(tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
favorites = int(tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"))
id = tweet_pq.attr("data-tweet-id")
permalink = tweet_pq.attr("data-permalink-path")
geo = ''
geo_span = tweet_pq('span.Tweet-geo')
if len(geo_span) > 0:
geo = geo_span.attr('title')
yield {
"id" : id,
"permalink": 'https://twitter.com' + permalink,
"username" : username,
"text": txt,
"date" : datetime.datetime.fromtimestamp(date_sec),
"retweets" : retweets,
"favorites" : favorites,
"mentions": " ".join(re.compile('(@\\w*)').findall(txt)),
"hashtags": " ".join(re.compile('(#\\w*)').findall(txt)),
"geo": geo,
}
def get_json_response(self):
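        """Fetch one page of results from Twitter's search timeline endpoint and return the parsed JSON."""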
url = "https://twitter.com/i/search/timeline"
# if hasattr(tweetCriteria, 'username'):
# urlGetData += ' from:' + tweetCriteria.username
#
# if hasattr(tweetCriteria, 'since'):
# urlGetData += ' since:' + tweetCriteria.since
#
# if hasattr(tweetCriteria, 'until'):
# urlGetData += ' until:' + tweetCriteria.until
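        # Query parameters for the timeline endpoint; max_position carries the pagination cursor.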
params = {
'f': 'realtime',
'q': self.query,
'src': 'typd',
'max_position': self.refresh_cursor
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
return requests.get(url, params=params, headers=headers).json()
def get_options():
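    """Parse command-line arguments."""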
usage = "usage: %(prog)s [options] <connection_str_or_filepath>"
parser = argparse.ArgumentParser(usage=usage)
parser.add_argument(dest="conn_str",
help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
parser.add_argument("-Q", dest="query",
help="query", metavar="QUERY")
parser.add_argument("-k", "--key", dest="consumer_key",
help="Twitter consumer key", metavar="CONSUMER_KEY")
parser.add_argument("-s", "--secret", dest="consumer_secret",
help="Twitter consumer secret", metavar="CONSUMER_SECRET")
parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
help="Token file name")
utils.set_logging_options(parser)
return parser.parse_args()
if __name__ == "__main__":
options = get_options()
utils.set_logging(options)
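    # Retrieve the OAuth access token (stored in the token file) and build an authenticated API client.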
    access_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
    twitter_auth = twitter.OAuth(access_token_key, access_token_secret, options.consumer_key, options.consumer_secret)
    t = twitter.Twitter(domain="api.twitter.com", auth=twitter_auth, secure=True)
conn_str = options.conn_str.strip()
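    # A bare file path is treated as an SQLite database.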
if not re.match(r"^\w+://.+", conn_str):
conn_str = 'sqlite:///' + conn_str
engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
session = None
term = Terminal()
try:
session = Session()
print(options.query)
tm = TweetManager(options.query)
move_up = 0
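        # move_up counts how many status lines to overwrite on the next refresh.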
for i,item in enumerate(tm):
# get id
tweet_id = item.get("id")
if not tweet_id:
continue
if move_up > 0:
print((move_up+1)*term.move_up())
move_up = 0
            print("%d: %s - %r" % (i+1, tweet_id, item.get("text", "")) + term.clear_eol())
move_up += 1
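            # Skip tweets already stored in the database.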
count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
if count_tweet:
continue
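            # Fetch the full tweet (with entities) from the REST API before storing it.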
try:
tweet = t.statuses.show(id=tweet_id, include_entities=True)
except twitter.api.TwitterHTTPError as e:
if e.e.code == 404 or e.e.code == 403:
continue
else:
raise
processor = TwitterProcessorStatus(tweet, None, None, session, twitter_auth=twitter_auth, logger=logger)
processor.process()
session.flush()
session.commit()
print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers['X-Rate-Limit-Limit'])) + term.clear_eol())
move_up += 1
            rate_limit_limit = int(tweet.headers['X-Rate-Limit-Limit'])
            rate_limit_remaining = int(tweet.rate_limit_remaining)
            if rate_limit_remaining <= 0:
                # Quota exhausted: wait until the rate-limit window resets.
                time_to_sleep = max(0, int(math.ceil(tweet.rate_limit_reset - time.mktime(time.gmtime()))))
            elif rate_limit_remaining >= rate_limit_limit:
                # Full quota available: no need to throttle.
                time_to_sleep = 0
            else:
                # Spread the remaining calls evenly over the time left before the window resets.
                time_to_sleep = max(0, int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / rate_limit_remaining)))
            for sec in range(time_to_sleep):
                if sec:
                    print(2*term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-sec)) + term.clear_eol())
                time.sleep(1)
except twitter.api.TwitterHTTPError as e:
fmt = ("." + e.format) if e.format else ""
print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))
finally:
if session:
session.close()