# script/utils/search_twitter_json.py
# Added in changeset e1d3c1469691 (parent 884b1b7fc420),
# Wed Mar 16 23:09:57 2016 +0100.
"""Scrape Twitter's web search timeline and store the tweets it finds.

The script pages through ``https://twitter.com/i/search/timeline`` for a
query, extracts the tweet ids from the returned HTML, fetches each tweet
that is not yet in the database through the Twitter API and hands it to
``TwitterProcessorStatus`` for storage.
"""
import argparse
import logging
import math
import re
import time
import datetime
import urllib

from blessings import Terminal
import requests
import twitter

from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus

from lxml import html
import json
from pyquery import PyQuery

logger = logging.getLogger(__name__)

APPLICATION_NAME = "Tweet seach json"

# Patterns are compiled once instead of once per scraped tweet.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]")
_WHITESPACE_RE = re.compile(r"\s+")
_MENTION_RE = re.compile(r"(@\w*)")
_HASHTAG_RE = re.compile(r"(#\w*)")
_URL_SCHEME_RE = re.compile(r"^\w+://.+")


def _clean_text(raw):
    """Return *raw* reduced to ASCII with normalised whitespace.

    Non-ASCII characters are dropped, a space directly after ``#`` or ``@``
    is removed, and every whitespace run is collapsed to a single space.
    ``None`` is treated as the empty string.
    """
    ascii_only = _NON_ASCII_RE.sub("", raw or "")
    rejoined = ascii_only.replace('# ', '#').replace('@ ', '@')
    return _WHITESPACE_RE.sub(" ", rejoined)


def _int_attr(nodes, name, default=None):
    """Return attribute *name* of *nodes* as an int, or *default* if absent.

    Thousands separators (``,``) are stripped before conversion.
    """
    raw = nodes.attr(name)
    if raw is None:
        return default
    return int(raw.replace(",", ""))


def _seconds_between_calls(reset_epoch, remaining, limit, now=None):
    """Return how many whole seconds to wait before the next API call.

    The time left until *reset_epoch* is spread evenly over the *remaining*
    calls.

    :param reset_epoch: epoch seconds at which the rate-limit window resets.
    :param remaining: number of calls still allowed in the current window.
    :param limit: total number of calls allowed per window.
    :param now: current epoch seconds; defaults to ``time.time()``.
    :returns: a non-negative number of seconds.
    """
    # Condition kept from the original script: no wait when the reported
    # remaining quota exceeds the reported limit.
    if remaining > limit:
        return 0
    if now is None:
        # time.time() is already epoch seconds; mktime(gmtime()) would be
        # skewed by the local UTC offset.
        now = time.time()
    window = float(reset_epoch - now)
    if window <= 0:
        return 0
    # With no calls left, wait out the whole window instead of dividing by 0.
    return int(math.ceil(window / max(remaining, 1)))


# TODO: implement some more parameters
# script to "scrap twitter results"
# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
# pyquery cssselect
class TweetManager(object):
    """Iterable over the tweets returned by Twitter's web search timeline.

    Iterating issues one HTTP request per result page and yields one dict
    per tweet found in the returned HTML.
    """

    # Seconds to wait for the search endpoint before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, query):
        """Remember *query*; paging starts with an empty cursor."""
        self.query = query
        self.refresh_cursor = ''

    def __iter__(self):
        """Yield one dict per tweet, page after page.

        Iteration stops when a page has empty ``items_html`` or contains no
        ``div.js-stream-tweet`` element.
        """
        while True:
            payload = self.get_json_response()
            items_html = payload['items_html']
            if len(items_html.strip()) == 0:
                break

            # The cursor is advanced before parsing, so the next request
            # asks for the following page.
            self.refresh_cursor = payload['min_position']
            tweets = PyQuery(items_html)('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweet_html in tweets:
                yield self._parse_tweet(PyQuery(tweet_html))

    @staticmethod
    def _parse_tweet(tweet_pq):
        """Build the result dict for one ``div.js-stream-tweet`` element.

        Missing retweet/favorite counts default to 0; a missing timestamp
        or permalink yields ``None`` for the corresponding key.
        """
        username = tweet_pq("span.username.js-action-profile-name b").text()
        txt = _clean_text(tweet_pq("p.js-tweet-text").text())
        retweets = _int_attr(
            tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"),
            "data-tweet-stat-count", 0)
        favorites = _int_attr(
            tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"),
            "data-tweet-stat-count", 0)
        date_sec = _int_attr(
            tweet_pq("small.time span.js-short-timestamp"), "data-time")
        tweet_id = tweet_pq.attr("data-tweet-id")
        permalink = tweet_pq.attr("data-permalink-path")

        geo = ''
        geo_span = tweet_pq('span.Tweet-geo')
        if len(geo_span) > 0:
            geo = geo_span.attr('title')

        return {
            "id": tweet_id,
            "permalink": ('https://twitter.com' + permalink) if permalink else None,
            "username": username,
            "text": txt,
            "date": (datetime.datetime.fromtimestamp(date_sec)
                     if date_sec is not None else None),
            "retweets": retweets,
            "favorites": favorites,
            "mentions": " ".join(_MENTION_RE.findall(txt)),
            "hashtags": " ".join(_HASHTAG_RE.findall(txt)),
            "geo": geo,
        }

    def get_json_response(self):
        """Fetch the result page at the current cursor and return its JSON.

        Raises whatever ``requests`` raises on connection failure or
        timeout, and whatever ``Response.json()`` raises on a non-JSON body.
        """
        url = "https://twitter.com/i/search/timeline"

        # TODO: support per-user and date criteria by appending
        # ' from:<username>', ' since:<date>' and ' until:<date>' to the query.
        params = {
            'f': 'realtime',
            'q': self.query,
            'src': 'typd',
            'max_position': self.refresh_cursor,
        }

        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

        response = requests.get(url, params=params, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        return response.json()


def get_options():
    """Parse the command line and return the resulting namespace."""
    # argparse prepends "usage: " itself, so the string must not repeat it.
    usage = "%(prog)s [options] CONNECTION_STR"

    parser = argparse.ArgumentParser(usage=usage)

    parser.add_argument(dest="conn_str",
                        help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
    parser.add_argument("-Q", dest="query",
                        help="query", metavar="QUERY")
    parser.add_argument("-k", "--key", dest="consumer_key",
                        help="Twitter consumer key", metavar="CONSUMER_KEY")
    parser.add_argument("-s", "--secret", dest="consumer_secret",
                        help="Twitter consumer secret", metavar="CONSUMER_SECRET")
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
                        help="Token file name")

    utils.set_logging_options(parser)

    return parser.parse_args()


def main():
    """Run the search and store every tweet not yet in the database."""
    options = get_options()

    utils.set_logging(options)

    access_token_key, access_token_secret = utils.get_oauth_token(
        consumer_key=options.consumer_key,
        consumer_secret=options.consumer_secret,
        token_file_path=options.token_filename,
        application_name=APPLICATION_NAME)

    t = twitter.Twitter(
        domain="api.twitter.com",
        auth=twitter.OAuth(access_token_key, access_token_secret,
                           options.consumer_key, options.consumer_secret),
        secure=True)
    t.secure = True

    # A bare path (no "scheme://") is treated as an SQLite database file.
    conn_str = options.conn_str.strip()
    if not _URL_SCHEME_RE.match(conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(
        conn_str, echo=((options.verbose - options.quiet) > 0), create_all=True)
    session = None

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        tm = TweetManager(options.query)

        # Number of status lines printed since the cursor was last rewound;
        # lets each tweet overwrite the previous tweet's status lines.
        move_up = 0

        for i, item in enumerate(tm):
            tweet_id = item.get("id")

            if not tweet_id:
                continue

            if move_up > 0:
                print((move_up + 1) * term.move_up())
                move_up = 0

            print(("%d: %s - %r" % (i + 1, tweet_id, item.get("text", ""))) + term.clear_eol())
            move_up += 1

            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()

            # Already stored: no API call needed.
            if count_tweet:
                continue
            try:
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # 404 and 403 responses for a single tweet are skipped.
                if e.e.code == 404 or e.e.code == 403:
                    continue
                else:
                    raise

            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            session.flush()
            session.commit()

            limit_header = tweet.headers.getheader('X-Rate-Limit-Limit')
            print(("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(limit_header))) + term.clear_eol())
            move_up += 1

            time_to_sleep = _seconds_between_calls(
                tweet.rate_limit_reset,
                int(tweet.rate_limit_remaining),
                int(limit_header))

            # Count down one second at a time, rewriting the same line.
            for elapsed in xrange(time_to_sleep):
                if elapsed:
                    print(2 * term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep - elapsed)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        if session:
            session.close()


if __name__ == "__main__":
    main()