"""Search Topsy for tweets matching a query and store them in a database.

Reconstructed from the Mercurial changeset 2ef837069108, which adds this
file as ``script/utils/search_topsy.py``.

For every result returned by the Topsy search endpoint the script extracts
the tweet id from the result URL, skips ids already present in the
database, fetches the full status from the Twitter API and hands it to
``utils.TwitterProcessor``.  Between Twitter calls it sleeps so that the
remaining rate-limit budget is spread over the time left until the limit
resets.
"""
import math
import re
import sqlite3
import sys
import time
from optparse import OptionParser
# NOTE(review): nothing in this file uses ``except_clause``; this looks like
# an accidental auto-import.  Kept so the module namespace is unchanged.
from symbol import except_clause

import anyjson
import requests
import simplejson
import twitter
from blessings import Terminal
from sqlalchemy.orm import sessionmaker

from iri_tweet import models, utils

APPLICATION_NAME = "Tweet recorder user"
# NOTE(review): OAuth consumer credentials are hard-coded in the source.
# Consider loading them from the environment or a config file instead.
CONSUMER_KEY = "Vdr5ZcsjI1G3esTPI8yDg"
CONSUMER_SECRET = "LMhNrY99R6a7E0YbZZkRFpUZpX5EfB1qATbDk1sIVLs"


class TopsyResource(object):
    """Iterable over the paginated results of a Topsy search.

    ``query`` is sent as the ``q`` parameter.  Every other keyword argument
    is forwarded as a query-string parameter, except ``url`` which overrides
    the endpoint that is called.
    """

    DEFAULT_URL = "http://otter.topsy.com/search.json"

    def __init__(self, query, **kwargs):
        # Work on a copy so the caller's dict is never mutated, and keep
        # "url" out of the parameters that are sent to the server.
        self.options = dict(kwargs)
        self.url = self.options.pop("url", self.DEFAULT_URL)
        self.options['q'] = query
        self.page = 0
        self.req = None
        self.res = {}

    def __fetch(self, page=None):
        """Run one request and store the decoded JSON body in ``self.res``."""
        params = dict(self.options)
        if page is not None:
            params['page'] = page
        self.req = requests.get(self.url, params=params)
        # ``Response.json`` is a property in requests < 1.0 and a method
        # from 1.0 on; support both.
        payload = self.req.json
        if callable(payload):
            payload = payload()
        # Fall back to an empty dict so the lookups below stay valid when
        # the body could not be decoded into a mapping.
        self.res = payload or {}
        response = self.res.get("response") or {}
        self.page = response.get("page", page if page is not None else self.page)

    def __initialize(self):
        """Fetch the first page of results."""
        self.__fetch()

    def __next_page(self):
        """Fetch the page following the one currently held in ``self.res``."""
        response = self.res.get("response") or {}
        current = response.get("page")
        if current is None:
            # The server did not echo a page number: rely on our own count.
            current = self.page or 1
        self.__fetch(page=current + 1)

    def __iter__(self):
        # Compare with None explicitly: the truth value of a requests
        # Response reflects the HTTP status, not "has a request been made".
        if self.req is None:
            self.__initialize()
        while True:
            items = (self.res.get("response") or {}).get("list")
            if not items:
                break
            for item in items:
                yield item
            self.__next_page()

    def total(self):
        """Return the total reported by the last response, or 0 if none."""
        if not self.res:
            return 0
        return self.res.get("response", {}).get("total", 0)


def get_option():
    """Parse the command line and return optparse's ``(options, args)``."""
    parser = OptionParser()

    parser.add_option("-d", "--database", dest="database",
                      help="Input database", metavar="DATABASE")
    parser.add_option("-Q", dest="query",
                      help="query", metavar="QUERY")
    parser.add_option("-t", dest="token_filename", metavar="TOKEN_FILENAME",
                      default=".oauth_token", help="Token file name")
    parser.add_option("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY",
                      default=None, help="Topsy apikey")

    utils.set_logging_options(parser)

    return parser.parse_args()


def _percent(done, total):
    """Return ``done / total`` as an integer percentage (0 if total is 0)."""
    if not total:
        return 0
    return int(float(done) / float(total) * 100.0)


def _seconds_between_calls(reset_epoch, remaining, now=None):
    """Return how long to wait so ``remaining`` calls last until the reset.

    ``reset_epoch`` is compared with ``time.time()`` (seconds since the
    epoch).  When no calls remain the full time to the reset is returned.
    The result is never negative.
    """
    if now is None:
        now = time.time()
    per_call = (reset_epoch - now) / float(max(remaining, 1))
    return max(0, int(math.ceil(per_call)))


def main():
    """Run the Topsy search and record every tweet not yet in the database."""
    options, _ = get_option()

    utils.set_logging(options)

    if not options.database or not options.query:
        sys.exit("Both a database (-d) and a query (-Q) are required")

    access_token_key, access_token_secret = utils.get_oauth_token(
        options.token_filename,
        application_name=APPLICATION_NAME,
        consumer_key=CONSUMER_KEY,
        consumer_secret=CONSUMER_SECRET)

    t = twitter.Twitter(
        domain="api.twitter.com",
        auth=twitter.OAuth(access_token_key, access_token_secret,
                           CONSUMER_KEY, CONSUMER_SECRET),
        secure=True)
    t.secure = True

    # A bare path is treated as a SQLite database file.
    conn_str = options.database.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(
        conn_str, echo=((options.verbose - options.quiet) > 0),
        create_all=True)
    session = None

    topsy_parameters = {
        'apikey': options.topsy_apikey,
        'perpage': 100,
        'window': 'a',
        'type': 'tweet',
        'hidden': True,
    }

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        tr = TopsyResource(options.query, **topsy_parameters)

        # Number of status lines printed for the previous item; the cursor
        # is moved back up by that much so the display is redrawn in place.
        move_up = 0

        for i, item in enumerate(tr):
            # The tweet id is the last path segment of the result URL.
            url = item.get("url")
            if not url:
                continue
            tweet_id = url.split("/")[-1]

            if move_up > 0:
                print((move_up + 1) * term.move_up())
                move_up = 0

            total = tr.total()
            print(("%d/%d:%03d%% - %s - %r" % (
                i + 1, total, _percent(i + 1, total), tweet_id,
                item.get("content"))) + term.clear_eol())
            move_up += 1

            count_tweet = session.query(models.Tweet).filter_by(
                id_str=tweet_id).count()
            if count_tweet:
                continue

            try:
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # 403/404 are skipped; any other status aborts the run.
                if e.e.code in (403, 404):
                    continue
                raise

            processor = utils.TwitterProcessor(
                tweet, None, None, session, None, options.token_filename)
            processor.process()
            session.flush()
            session.commit()

            time_to_sleep = _seconds_between_calls(
                tweet.rate_limit_reset, tweet.rate_limit_remaining)

            print(("rate limit remaining %s of %s" % (
                str(tweet.rate_limit_remaining),
                str(tweet.headers.getheader('x-ratelimit-limit'))))
                + term.clear_eol())
            move_up += 1
            for elapsed in range(time_to_sleep):
                if elapsed:
                    # Overwrite the previous countdown line.
                    print(2 * term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (
                    time_to_sleep, time_to_sleep - elapsed))
                    + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        if session:
            session.close()


if __name__ == "__main__":
    main()