script/utils/search_topsy.py
author Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
Mon, 15 Oct 2012 17:01:50 +0200
changeset 693 2ef837069108
child 888 6fc6637d8403
permissions -rw-r--r--
Starting 'listener_update' branch

from iri_tweet import models, utils
from sqlalchemy.orm import sessionmaker
import anyjson
import sqlite3
import twitter
import re
import requests
from optparse import OptionParser
import simplejson
import time
from blessings import Terminal
import sys
import math
from symbol import except_clause

# OAuth application credentials for the Twitter API.  In this script they are
# passed to utils.get_oauth_token() (to obtain the user access token) and to
# twitter.OAuth() (to sign the API calls).
# NOTE(review): the consumer secret is committed in clear text; consider
# loading it from the environment or a config file, and rotating it.
APPLICATION_NAME = "Tweet recorder user"
CONSUMER_KEY = "Vdr5ZcsjI1G3esTPI8yDg"
CONSUMER_SECRET = "LMhNrY99R6a7E0YbZZkRFpUZpX5EfB1qATbDk1sIVLs"


class TopsyResource(object):
    """Iterable over the items of a paginated Topsy search.

    The first HTTP request is issued lazily, when iteration starts.  Each
    decoded reply is expected to be a dict holding a ``"response"`` dict with
    the keys ``"list"`` (items of the current page), ``"page"`` (current page
    number) and ``"total"`` (overall number of results).  Iteration stops as
    soon as a page comes back without a non-empty ``"list"``.

    Parameters:
        query: search expression, sent as the ``q`` parameter.
        **kwargs: extra query-string parameters sent with every request.
            The special key ``url`` overrides the endpoint and is *not*
            forwarded as a search parameter.
    """

    def __init__(self, query, **kwargs):
        # "url" selects the endpoint, it is not a search parameter: pop it so
        # that it does not leak into the query string of every request.
        self.url = kwargs.pop("url", "http://otter.topsy.com/search.json")
        self.options = kwargs
        self.options['q'] = query
        self.page = 0
        self.req = None
        self.res = {}

    def __fetch(self, **extra):
        """Run one GET with the base options plus ``extra``; store the reply."""
        params = dict(self.options)
        params.update(extra)
        self.req = requests.get(self.url, params=params)
        # ``Response.json`` is a property in old requests releases and a
        # method in requests >= 1.0; support both.
        payload = self.req.json
        if callable(payload):
            payload = payload()
        # A missing/undecodable body (None) is normalised to an empty dict so
        # that __iter__ and total() never have to deal with None.
        self.res = payload or {}

    def __initialize(self):
        """Fetch the first page of results."""
        self.__fetch()

    def __next_page(self):
        """Fetch the page following the one currently held in ``self.res``."""
        self.__fetch(page=self.res.get("response").get("page") + 1)

    def __iter__(self):
        # Compare with None explicitly: a requests Response is falsy for
        # 4xx/5xx replies, which would otherwise trigger a second first-page
        # request on re-iteration.
        if self.req is None:
            self.__initialize()
        while True:
            response = (self.res or {}).get("response") or {}
            items = response.get("list")
            if not items:
                # No (more) results: stop instead of requesting further pages.
                break
            for item in items:
                yield item
            self.__next_page()

    def total(self):
        """Return the total result count of the last reply (0 if unknown)."""
        if not self.res:
            return 0
        return (self.res.get("response") or {}).get("total", 0)
            


def get_option():
    """Parse the command line of the script.

    Declares the database (-d), query (-Q), OAuth token file (-t) and Topsy
    API key (-T) options, lets ``utils.set_logging_options`` register its own
    options on the same parser (presumably the verbose/quiet flags read by
    the main block -- confirm in iri_tweet.utils), then parses the arguments.

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``.
    """
    # (flags, keyword settings) for every option, in declaration order.
    option_specs = (
        (("-d", "--database"),
         dict(dest="database", metavar="DATABASE", help="Input database")),
        (("-Q",),
         dict(dest="query", metavar="QUERY", help="query")),
        (("-t",),
         dict(dest="token_filename", metavar="TOKEN_FILENAME",
              default=".oauth_token", help="Token file name")),
        (("-T",),
         dict(dest="topsy_apikey", metavar="TOPSY_APIKEY",
              default=None, help="Topsy apikey")),
    )

    cli = OptionParser()
    for flags, settings in option_specs:
        cli.add_option(*flags, **settings)

    utils.set_logging_options(cli)

    return cli.parse_args()



def normalize_connection_string(database):
    """Return a database URL for ``database``.

    Surrounding whitespace is stripped; a value that does not already look
    like ``scheme://...`` is treated as a file path and prefixed with
    ``sqlite:///``.
    """
    conn_str = database.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str
    return conn_str


def progress_percent(done, total):
    """Return ``done / total`` as an integer percentage (0 if total is 0/None)."""
    if not total:
        return 0
    return int(float(done) / float(total) * 100.0)


def seconds_between_calls(reset_time, remaining, now):
    """Return how many whole seconds to wait before the next API call.

    The time left until the rate-limit window resets (``reset_time - now``,
    both Unix timestamps) is spread evenly over the ``remaining`` calls and
    rounded up.  Returns 0 when the window has already reset; when no call is
    left, returns the full time until the reset.
    """
    wait = reset_time - now
    if wait <= 0:
        return 0
    if remaining <= 0:
        # Quota exhausted: nothing to divide by, wait for the reset itself.
        return int(math.ceil(wait))
    return int(math.ceil(wait / float(remaining)))


# Script entry point: search Topsy for the query, and for every hit whose
# tweet id is not yet in the database, fetch the tweet from the Twitter API
# and store it, pacing the calls according to the Twitter rate limit.
if __name__ == "__main__":

    (options, args) = get_option()

    utils.set_logging(options)

    # Fail early with a readable message instead of an AttributeError later.
    if not options.database:
        sys.exit("A database must be given with -d/--database")
    if not options.query:
        sys.exit("A query must be given with -Q")

    access_token_key, access_token_secret = utils.get_oauth_token(options.token_filename, application_name=APPLICATION_NAME, consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET)

    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(access_token_key, access_token_secret, CONSUMER_KEY, CONSUMER_SECRET), secure=True)
    t.secure = True

    conn_str = normalize_connection_string(options.database)

    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
    session = None

    topsy_parameters = {
        'apikey': options.topsy_apikey,
        'perpage': 100,
        'window': 'a',
        'type': 'tweet',
        'hidden': True,
    }

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        tr = TopsyResource(options.query, **topsy_parameters)

        # Number of status lines printed for the previous item; the cursor is
        # moved back up by that many lines so the display is redrawn in place.
        move_up = 0

        for i, item in enumerate(tr):
            # The tweet id is the last path segment of the item's URL.
            url = item.get("url")
            if not url:
                continue
            tweet_id = url.split("/")[-1]

            if move_up > 0:
                print((move_up+1)*term.move_up())
                move_up = 0

            total = tr.total()
            print(("%d/%d:%03d%% - %s - %r" % (i+1, total, progress_percent(i+1, total), tweet_id, item.get("content"))) + term.clear_eol())
            move_up += 1

            # Skip tweets that are already stored.
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()

            if count_tweet:
                continue
            try:
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # 404/403 replies are skipped; any other HTTP error is
                # propagated to the outer handler.
                if e.e.code == 404 or e.e.code == 403:
                    continue
                else:
                    raise

            processor = utils.TwitterProcessor(tweet, None, None, session, None, options.token_filename)
            processor.process()
            session.flush()
            session.commit()

            # time.time() is the current Unix timestamp; mktime(gmtime()) is
            # off by the local UTC offset because mktime expects local time.
            time_to_sleep = seconds_between_calls(tweet.rate_limit_reset, tweet.rate_limit_remaining, time.time())

            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('x-ratelimit-limit'))) + term.clear_eol())
            move_up += 1
            for elapsed in range(time_to_sleep):
                if elapsed:
                    # Overwrite the previous countdown line.
                    print(2*term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-elapsed)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        if session is not None:
            session.close()