script/utils/search_topsy.py
author Nicolas Sauret <nicolas.sauret@iri.centrepompidou.fr>
Mon, 21 Mar 2016 15:09:48 +0100
changeset 1338 00e23c95e844
parent 1137 5c757e167687
permissions -rw-r--r--
Added tag V04.040 for changeset 4606877ca400

import argparse
import logging
import math
import re
import time

from blessings import Terminal
import requests
import twitter

from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus


# Module-level logger named after this module; the main script hands it to
# TwitterProcessorStatus.
logger = logging.getLogger(__name__)

# Application name passed to utils.get_oauth_token() when the OAuth token is obtained.
APPLICATION_NAME = "Tweet recorder user"


class TopsyResource(object):
    """Iterable over the items returned by a paginated Topsy search.

    The first request is sent lazily, when iteration starts.  Each page is
    fetched with ``requests.get`` and decoded as JSON; iteration stops as soon
    as a page carries no non-empty ``response.list``.

    Attributes:
        options: query parameters sent with every request (includes ``q``).
        url: endpoint that is queried.
        page: initialised to 0; kept for compatibility, not updated here.
        req: last ``requests`` response object, ``None`` before the first fetch.
        res: decoded JSON body of the last response (``{}`` before the first fetch).
    """

    def __init__(self, query, **kwargs):
        """Store the search parameters without sending any request.

        Args:
            query: search expression, sent as the ``q`` parameter.
            **kwargs: extra query parameters.  The optional ``url`` keyword
                overrides the endpoint and is not sent as a query parameter.
        """
        self.options = dict(kwargs)
        # "url" selects the endpoint; remove it so it is not also forwarded
        # to the service as a search parameter.
        self.url = self.options.pop("url", "http://otter.topsy.com/search.json")
        self.options['q'] = query
        self.page = 0
        self.req = None
        self.res = {}

    def __fetch(self, page=None):
        """Send one request (optionally for a given page) and store its result."""
        params = dict(self.options)
        if page is not None:
            params['page'] = page
        self.req = requests.get(self.url, params=params)
        self.res = self.req.json()

    def __response(self):
        """Return the ``response`` mapping of the last result, or ``{}``."""
        if not isinstance(self.res, dict):
            return {}
        response = self.res.get("response")
        return response if isinstance(response, dict) else {}

    def __initialize(self):
        """Fetch the first page."""
        self.__fetch()

    def __next_page(self):
        """Fetch the page following the one reported by the last response."""
        self.__fetch(page=self.res.get("response").get("page") + 1)

    def __iter__(self):
        """Yield every item of every page, fetching pages on demand."""
        # Test against None explicitly: a requests response object is falsy
        # for 4xx/5xx statuses, which would otherwise trigger a second fetch.
        if self.req is None:
            self.__initialize()
        items = self.__response().get("list")
        while items:
            for item in items:
                yield item
            self.__next_page()
            items = self.__response().get("list")

    def total(self):
        """Return ``response.total`` of the last result, or 0 when unavailable."""
        return self.__response().get("total") or 0
            


def get_options():
    """Build the command-line parser and return the parsed arguments.

    Declares the positional connection string plus the query, Twitter
    consumer key/secret, token file name and Topsy API key options, lets
    ``utils.set_logging_options`` register its own options on the parser,
    then parses the process arguments.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    arg_parser = argparse.ArgumentParser(
        usage="usage: %(prog)s [options] <connection_str_or_filepath>")

    # One (flags, settings) pair per argument, in declaration order.
    argument_specs = (
        ((), dict(dest="conn_str", metavar="CONNECTION_STR",
                  help="write tweet to DATABASE. This is a connection string")),
        (("-Q",), dict(dest="query", metavar="QUERY", help="query")),
        (("-k", "--key"), dict(dest="consumer_key", metavar="CONSUMER_KEY",
                               help="Twitter consumer key")),
        (("-s", "--secret"), dict(dest="consumer_secret", metavar="CONSUMER_SECRET",
                                  help="Twitter consumer secret")),
        (("-t",), dict(dest="token_filename", metavar="TOKEN_FILENAME",
                       default=".oauth_token", help="Token file name")),
        (("-T",), dict(dest="topsy_apikey", metavar="TOPSY_APIKEY",
                       default=None, help="Topsy apikey")),
    )
    for flags, settings in argument_specs:
        arg_parser.add_argument(*flags, **settings)

    utils.set_logging_options(arg_parser)

    return arg_parser.parse_args()



if __name__ == "__main__":
    # Script entry point: run a Topsy search for the given query, then fetch
    # each result that is not yet in the database from the Twitter API and
    # store it through TwitterProcessorStatus.

    options = get_options()

    utils.set_logging(options)

    access_token_key, access_token_secret = utils.get_oauth_token(
        consumer_key=options.consumer_key,
        consumer_secret=options.consumer_secret,
        token_file_path=options.token_filename,
        application_name=APPLICATION_NAME)

    t = twitter.Twitter(
        domain="api.twitter.com",
        auth=twitter.OAuth(access_token_key, access_token_secret,
                           options.consumer_key, options.consumer_secret),
        secure=True)
    t.secure = True

    conn_str = options.conn_str.strip()
    # A value without a "scheme://" prefix is treated as an SQLite file path.
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(
        conn_str, echo=((options.verbose - options.quiet) > 0), create_all=True)
    session = None

    topsy_parameters = {
        'apikey': options.topsy_apikey,
        'perpage': 100,
        'window': 'a',
        'type': 'tweet',
        'hidden': True,
    }

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        tr = TopsyResource(options.query, **topsy_parameters)

        # Number of status lines printed for the previous item; the cursor is
        # moved back up by that amount so the display is overwritten in place.
        move_up = 0

        for i, item in enumerate(tr):
            # The tweet id is the last path segment of the result URL.
            url = item.get("url")
            tweet_id = url.split("/")[-1]

            if move_up > 0:
                print((move_up + 1) * term.move_up())
                move_up = 0

            # Guard the percentage against a missing or zero total.
            total = tr.total() or 0
            percent = int(float(i + 1) / float(total) * 100.0) if total else 0
            print(("%d/%d:%03d%% - %s - %r" % (i + 1, total, percent, tweet_id, item.get("content"))) + term.clear_eol())
            move_up += 1

            # Skip tweets that are already stored.
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
            if count_tweet:
                continue

            try:
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # 403/404: the tweet cannot be retrieved; move on to the next one.
                if e.e.code in (403, 404):
                    continue
                raise

            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            session.flush()
            session.commit()

            limit_header = tweet.headers.getheader('X-Rate-Limit-Limit')
            print(("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(limit_header))) + term.clear_eol())
            move_up += 1
            rate_limit_limit = int(limit_header)
            rate_limit_remaining = int(tweet.rate_limit_remaining)

            # NOTE(review): as written, a pause is only computed when the
            # remaining quota is NOT below the advertised limit; confirm this
            # is the intended throttling rule.
            if rate_limit_remaining < rate_limit_limit:
                time_to_sleep = 0
            else:
                # time.time() is the current epoch time; mktime(gmtime()) would
                # be shifted by the local UTC offset.  rate_limit_reset is
                # presumably epoch seconds -- verify against the twitter library.
                seconds_to_reset = tweet.rate_limit_reset - time.time()
                time_to_sleep = int(math.ceil(seconds_to_reset / max(rate_limit_remaining, 1)))

            for elapsed in xrange(time_to_sleep):
                if elapsed:
                    # Rewrite the countdown line in place.
                    print(2 * term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep - elapsed)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        if session:
            session.close()