# script/rest/search_twitter.py
# author Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
# Tue, 17 Dec 2013 17:49:01 +0100
# changeset 1031 5d301c2ddb89
# parent 987 18cb05f027a0
# permissions -rw-r--r--
# Correct pagination for search twitter results.

import argparse
import re
import sys

import anyjson
import twitter

from iri_tweet import models, processor, utils
import urlparse


def get_options():
    """Parse the command-line arguments of the Twitter search script.

    Declares one positional argument (the database connection string or
    file path) plus options for the search query, the number of results
    per page, the OAuth token file and the Twitter consumer key/secret.
    ``utils.set_logging_options`` is then given the parser so it can add
    its own options (presumably the verbose/quiet flags read by the main
    script -- confirm in ``iri_tweet.utils``).

    Returns:
        The ``argparse.Namespace`` produced by ``parser.parse_args()``.
        ``rpp`` is kept as a string (default ``"100"``); callers convert
        it to an integer themselves.
    """
    # argparse prepends "usage: " on its own, so the template must not
    # repeat it (otherwise the help reads "usage: usage: ...").
    usage = "%(prog)s [options] <connection_str_or_filepath>"

    parser = argparse.ArgumentParser(usage=usage)

    parser.add_argument(dest="conn_str",
                        help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
    parser.add_argument("-Q", dest="query",
                        help="query", metavar="QUERY")
    parser.add_argument("-P", dest="rpp", metavar="RPP", default="100",
                        help="Result per page")
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
                        help="Token file name")
    parser.add_argument("-k", "--key", dest="consumer_key",
                        help="Twitter consumer key", metavar="CONSUMER_KEY")
    parser.add_argument("-s", "--secret", dest="consumer_secret",
                        help="Twitter consumer secret", metavar="CONSUMER_SECRET")

    # Let the shared helper register its logging-related options.
    utils.set_logging_options(parser)

    return parser.parse_args()

def get_auth(options, access_token):
    """Build the ``twitter.OAuth`` credentials object for the API client.

    Args:
        options: parsed command-line options; ``consumer_key`` and
            ``consumer_secret`` are read from it.
        access_token: indexable pair whose item 0 is the OAuth token and
            item 1 the OAuth token secret.

    Returns:
        The ``twitter.OAuth`` instance built from those four values.
    """
    key = options.consumer_key
    secret = options.consumer_secret
    oauth_token, oauth_token_secret = access_token[0], access_token[1]
    return twitter.OAuth(
        token=oauth_token,
        token_secret=oauth_token_secret,
        consumer_key=key,
        consumer_secret=secret,
    )

def get_max_id(results):
    """Extract the ``max_id`` of the next page from a search response.

    Reads ``results['search_metadata']['next_results']`` (a query string,
    optionally starting with ``?``) and parses its ``max_id`` parameter.

    Args:
        results: mapping returned by the search call.

    Returns:
        The ``max_id`` value as an integer, or 0 when the response has no
        ``next_results`` entry or that entry carries no ``max_id``.
    """
    search_metadata = results.get('search_metadata', {})
    query_string = search_metadata.get('next_results', '')

    # Drop the leading "?" so the remainder is a plain query string.
    if query_string and query_string.startswith("?"):
        query_string = query_string[1:]

    candidates = urlparse.parse_qs(query_string).get('max_id', [])
    if not candidates:
        return 0
    return int(candidates[0])

if __name__ == "__main__":
    # Script entry point: run a Twitter search for the query given on the
    # command line and hand every returned tweet to the iri_tweet processor,
    # committing after each tweet.

    options = get_options()

    access_token = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename)
    auth = get_auth(options, access_token)

    t = twitter.Twitter(domain="api.twitter.com", api_version="1.1", secure=True, auth=auth)

    # A bare file path (no "scheme://" prefix) is treated as a sqlite file.
    conn_str = options.conn_str.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
    session = None
    try:
        session = Session()

        print(options.query)

        # At most 1500 tweets are requested in total, split into pages of
        # `rpp` results each.
        max_pages = 1500 // int(options.rpp)
        page = 1
        # 0 means "no max_id constraint": the first request starts from the
        # most recent tweets, so that first page is stored as well instead
        # of being used only to discover the next max_id.
        max_id = 0

        while page <= max_pages:
            search_args = {
                'q': options.query,
                'count': options.rpp,
                'include_entities': True,
                'result_type': 'recent',
            }
            if max_id > 0:
                search_args['max_id'] = max_id

            results = t.search.tweets(**search_args)

            # Default to an empty list so a response without 'statuses'
            # ends the loop instead of raising.
            statuses = results.get('statuses', [])
            if not statuses:
                if page == 1:
                    print("No results, exit")
                break

            for tweet in statuses:
                print(tweet)
                tweet_str = anyjson.serialize(tweet)
                #invalidate user id
                p = processor.TwitterProcessorStatus(json_dict=tweet, json_txt=tweet_str, source_id=None, session=session, consumer_token=(options.consumer_key, options.consumer_secret), access_token=access_token, token_filename=options.token_filename, user_query_twitter=False, logger=None)
                p.process()
                session.flush()
                session.commit()

            # get_max_id returns 0 when the response announces no further
            # page; stop there.
            max_id = get_max_id(results)
            if max_id == 0:
                break
            page += 1
    finally:
        if session:
            session.close()