script/utils/search_topsy.py
author ymh <ymh.work@gmail.com>
Mon, 18 Jun 2018 23:15:34 +0200
changeset 1427 8b3d57a519eb
parent 1137 5c757e167687
permissions -rw-r--r--
Remove the dependency on Zend 1. Use Composer to reduce the number of dependencies.

import argparse
import logging
import math
import re
import time

from blessings import Terminal
import requests
import twitter

from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus


logger = logging.getLogger(__name__)

APPLICATION_NAME = "Tweet recorder user"


class TopsyResource(object):
    """Iterable over the items of a paginated JSON search response.

    The search endpoint is queried lazily: the first request is sent when
    iteration starts, and following pages are requested one at a time as the
    items of the current page are exhausted.

    :param query: search query, sent as the ``q`` request parameter.
    :param kwargs: additional request parameters. The special ``url`` key
        selects the endpoint and is *not* forwarded as a request parameter.
    """

    def __init__(self, query, **kwargs):
        # Work on a copy so the endpoint can be extracted without leaking
        # into the query string that is sent to the server.
        self.options = dict(kwargs)
        self.url = self.options.pop("url", "http://otter.topsy.com/search.json")
        self.options['q'] = query
        # NOTE: ``page`` is initialised here but is not read or updated
        # anywhere in this class; it is kept for backward compatibility.
        self.page = 0
        self.req = None
        self.res = {}

    def __fetch(self, page=None):
        """Send one request and store the response and its decoded JSON.

        :param page: page number to request; when ``None`` no ``page``
            parameter is added to the configured options.
        """
        params = dict(self.options)
        if page is not None:
            params['page'] = page
        self.req = requests.get(self.url, params=params)
        self.res = self.req.json()

    def __response(self):
        """Return the ``response`` member of the last payload, or ``{}``."""
        if not self.res:
            return {}
        return self.res.get("response") or {}

    def __initialize(self):
        """Fetch the first page of results."""
        self.__fetch()

    def __next_page(self):
        """Fetch the page following the one reported by the last response."""
        self.__fetch(self.__response().get("page") + 1)

    def __iter__(self):
        """Yield every item of every page until an empty page is returned."""
        # Compare with None explicitly: a requests.Response is falsy for
        # error status codes, which would otherwise trigger a second request.
        if self.req is None:
            self.__initialize()
        while True:
            items = self.__response().get("list")
            if not items:
                break
            for item in items:
                yield item
            self.__next_page()

    def total(self):
        """Return the ``total`` reported by the last response (0 if none)."""
        return self.__response().get("total", 0)
            


def get_options():
    """Parse the command line and return the resulting namespace.

    The parser takes one positional connection string plus the query, the
    Twitter consumer key/secret, the token file name and the Topsy API key.
    Logging-related options are registered by ``utils.set_logging_options``.
    """
    arg_parser = argparse.ArgumentParser(
        usage="usage: %(prog)s [options] <connection_str_or_filepath>")

    # (flags, keyword arguments) pairs, registered in this exact order.
    argument_specs = (
        ((), dict(dest="conn_str", metavar="CONNECTION_STR",
                  help="write tweet to DATABASE. This is a connection string")),
        (("-Q",), dict(dest="query", metavar="QUERY",
                       help="query")),
        (("-k", "--key"), dict(dest="consumer_key", metavar="CONSUMER_KEY",
                               help="Twitter consumer key")),
        (("-s", "--secret"), dict(dest="consumer_secret", metavar="CONSUMER_SECRET",
                                  help="Twitter consumer secret")),
        (("-t",), dict(dest="token_filename", metavar="TOKEN_FILENAME",
                       default=".oauth_token", help="Token file name")),
        (("-T",), dict(dest="topsy_apikey", metavar="TOPSY_APIKEY",
                       default=None, help="Topsy apikey")),
    )
    for flags, spec in argument_specs:
        arg_parser.add_argument(*flags, **spec)

    utils.set_logging_options(arg_parser)

    return arg_parser.parse_args()



if __name__ == "__main__":
    # Script entry point: run a Topsy search for the given query and, for
    # every result whose tweet is not already stored, fetch the tweet from
    # the Twitter API and process it with TwitterProcessorStatus using the
    # database session opened below.

    options = get_options()

    utils.set_logging(options)

    access_token_key, access_token_secret = utils.get_oauth_token(
        consumer_key=options.consumer_key,
        consumer_secret=options.consumer_secret,
        token_file_path=options.token_filename,
        application_name=APPLICATION_NAME)

    t = twitter.Twitter(
        domain="api.twitter.com",
        auth=twitter.OAuth(access_token_key, access_token_secret,
                           options.consumer_key, options.consumer_secret),
        secure=True)
    t.secure = True

    # A bare path (no "scheme://" prefix) is treated as an SQLite file.
    conn_str = options.conn_str.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    # options.verbose / options.quiet are presumably registered by
    # utils.set_logging_options -- verify against that helper.
    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
    session = None

    topsy_parameters = {
        'apikey': options.topsy_apikey,
        'perpage': 100,
        'window': 'a',
        'type': 'tweet',
        'hidden': True,
    }

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        tr = TopsyResource(options.query, **topsy_parameters)

        # Number of status lines printed for the previous item; the cursor is
        # moved back up over them so the display is refreshed in place.
        move_up = 0

        for index, item in enumerate(tr):
            # The tweet id is the last path segment of the result URL.
            url = item.get("url")
            if not url:
                # Nothing to look up without a URL; skip instead of crashing.
                continue
            tweet_id = url.split("/")[-1]

            if move_up > 0:
                print((move_up+1)*term.move_up())
                move_up = 0

            # Guard against a missing/zero total to avoid a ZeroDivisionError.
            total = tr.total()
            percent = int(float(index+1)/float(total)*100.0) if total else 0
            print("%d/%d:%03d%% - %s - %r" % (index+1, total, percent, tweet_id, item.get("content")) + term.clear_eol())
            move_up += 1

            # Skip tweets that are already stored.
            if session.query(models.Tweet).filter_by(id_str=tweet_id).count():
                continue

            try:
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # Tweets the API answers 404 or 403 for are skipped; any
                # other HTTP error is handled by the outer handler.
                if e.e.code in (404, 403):
                    continue
                raise

            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            session.flush()
            session.commit()

            rate_limit_header = tweet.headers.getheader('X-Rate-Limit-Limit')
            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(rate_limit_header)) + term.clear_eol())
            move_up += 1
            rate_limit_limit = int(rate_limit_header)
            rate_limit_remaining = int(tweet.rate_limit_remaining)

            # NOTE(review): a pause is only computed when the remaining quota
            # is not below the limit; in every other case the script does not
            # sleep at all. This looks inverted -- confirm the intended pacing.
            if rate_limit_remaining < rate_limit_limit:
                time_to_sleep = 0
            else:
                # rate_limit_reset is compared with the current epoch time.
                # time.time() is used because time.mktime(time.gmtime()) is
                # off by the local UTC offset (mktime expects local time).
                time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.time()) / max(rate_limit_remaining, 1)))

            # Count down one second at a time, rewriting the same line.
            for elapsed in range(time_to_sleep):
                if elapsed:
                    print(2*term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-elapsed)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        # Always release the database session, even on error.
        if session:
            session.close()