diff -r 184372ec27e2 -r 14a9bed2e3cd script/utils/search_topsy.py --- a/script/utils/search_topsy.py Wed Jan 02 17:49:19 2019 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,181 +0,0 @@ -import argparse -import logging -import math -import re -import time - -from blessings import Terminal -import requests -import twitter - -from iri_tweet import models, utils -from iri_tweet.processor import TwitterProcessorStatus - - -logger = logging.getLogger(__name__) - -APPLICATION_NAME = "Tweet recorder user" - - -class TopsyResource(object): - - def __init__(self, query, **kwargs): - - self.options = kwargs - self.options['q'] = query - self.url = kwargs.get("url", "http://otter.topsy.com/search.json") - self.page = 0 - self.req = None - self.res = {} - - def __initialize(self): - - params = {} - params.update(self.options) - self.req = requests.get(self.url, params=params) - self.res = self.req.json() - - def __next_page(self): - page = self.res.get("response").get("page") + 1 - params = {} - params.update(self.options) - params['page'] = page - self.req = requests.get(self.url, params=params) - self.res = self.req.json() - - def __iter__(self): - if not self.req: - self.__initialize() - while "response" in self.res and "list" in self.res.get("response") and self.res.get("response").get("list"): - for item in self.res.get("response").get("list"): - yield item - self.__next_page() - - def total(self): - if not self.res: - return 0 - else: - return self.res.get("response",{}).get("total",0) - - - -def get_options(): - - usage = "usage: %(prog)s [options] " - - parser = argparse.ArgumentParser(usage=usage) - - parser.add_argument(dest="conn_str", - help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR") - parser.add_argument("-Q", dest="query", - help="query", metavar="QUERY") - parser.add_argument("-k", "--key", dest="consumer_key", - help="Twitter consumer key", metavar="CONSUMER_KEY") - parser.add_argument("-s", "--secret", dest="consumer_secret", - help="Twitter consumer secret", metavar="CONSUMER_SECRET") - parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token", - help="Token file name") - parser.add_argument("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY", default=None, - help="Topsy apikey") - - utils.set_logging_options(parser) - - return parser.parse_args() - - - -if __name__ == "__main__": - - options = get_options() - - utils.set_logging(options); - - - acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME) - - t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True) - t.secure = True - - conn_str = options.conn_str.strip() - if not re.match("^\w+://.+", conn_str): - conn_str = 'sqlite:///' + conn_str - - engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True) - session = None - - - topsy_parameters = { - 'apikey': options.topsy_apikey, - 'perpage': 100, - 'window': 'a', - 'type': 'tweet', - 'hidden': True, - } - - term = Terminal() - - try: - session = Session() - - results = None - page = 1 - print options.query - - tr = TopsyResource(options.query, **topsy_parameters) - - move_up = 0 - - for i,item in enumerate(tr): - # get id - url = item.get("url") - tweet_id = url.split("/")[-1] - - if move_up > 0: - print((move_up+1)*term.move_up()) - move_up = 0 - - print ("%d/%d:%03d%% - %s - %r" % (i+1, tr.total(), int(float(i+1)/float(tr.total())*100.0), tweet_id, item.get("content") ) + term.clear_eol()) - move_up += 1 - - count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count() - - if count_tweet: - continue - try: - tweet = t.statuses.show(id=tweet_id, include_entities=True) - except twitter.api.TwitterHTTPError as e: - if e.e.code == 404 or e.e.code == 403: - continue - else: - raise - - processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger) - processor.process() - session.flush() - session.commit() - - print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol()) - move_up += 1 - rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit')) - rate_limit_remaining = int(tweet.rate_limit_remaining) - - if rate_limit_remaining < rate_limit_limit: - time_to_sleep = 0 - else: - time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining)) - - for i in xrange(time_to_sleep): - if i: - print(2*term.move_up()) - else: - move_up += 1 - print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol()) - time.sleep(1) - - except twitter.api.TwitterHTTPError as e: - fmt = ("." + e.format) if e.format else "" - print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)) - - finally: - if session: - session.close()