script/utils/merge_tweets.py
author Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
Wed, 16 Jan 2013 05:04:23 +0100
changeset 763 bc29a6fbb8e8
parent 464 b9243ade95e2
child 886 1e110b03ae96
permissions -rw-r--r--
Various corrections for export tweet alchemy. Can give a project.

#from models import setup_database
from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress
import argparse
import sys
import re
import anyjson
import math
import codecs

def get_option():
    """Parse the command line of the tweet database merge tool.

    Optional flags cover logging (``-l/--log``), verbosity (``-v``/``-q``,
    both counted), querying Twitter for user information (``--query-user``)
    and the OAuth token file (``-t``).  Two positional arguments, ``source``
    and ``target``, are required; because they are declared with ``nargs=1``
    each one is returned as a one-element list.

    Returns the ``argparse.Namespace`` produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser(description='Merge tweets databases')

    # (flags, keyword settings) for every optional argument, kept in the
    # order they should appear in the generated help.
    optional_arguments = (
        (("-l", "--log"),
         dict(dest="logfile", help="log to file", metavar="LOG",
              default="stderr")),
        (("-v",),
         dict(dest="verbose", action="count", help="verbose", default=0)),
        (("-q",),
         dict(dest="quiet", action="count", help="quiet", default=0)),
        (("--query-user",),
         dict(dest="query_user", action="store_true",
              help="Query twitter for user information", default=False)),
        (("-t",),
         dict(dest="token_filename", metavar="TOKEN_FILENAME",
              default=".oauth_token", help="Token file name")),
    )
    for flags, settings in optional_arguments:
        cli.add_argument(*flags, **settings)

    # Positional database locations: source first, then target.
    for dest_name, placeholder in (("source", "SOURCE"), ("target", "TARGET")):
        cli.add_argument(dest_name, action="store", nargs=1, type=str,
                         metavar=placeholder)

    return cli.parse_args()

def normalize_conn_str(conn_str):
    """Return a database connection URL for ``conn_str``.

    Surrounding whitespace is stripped.  A value that already has the form
    ``scheme://...`` is returned as is; anything else is treated as the path
    of a SQLite file and prefixed with ``sqlite:///``.
    """
    conn_str = conn_str.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str
    return conn_str


if __name__ == "__main__":
    # Copy every tweet of the source database that is missing from the
    # target database, by re-processing the original JSON of each tweet
    # against the target session.

    # sys.stdout.encoding is None when stdout is piped or redirected
    # (Python 2); fall back to UTF-8 so that codecs.getwriter() does not fail.
    sys.stdout = codecs.getwriter(sys.stdout.encoding or 'utf-8')(sys.stdout)

    options = get_option()

    # An OAuth token is only needed when Twitter is queried for user data.
    access_token = None
    if options.query_user:
        access_token = get_oauth_token(options.token_filename)

    # Positional arguments are one-element lists (nargs=1).
    src_conn_str = normalize_conn_str(options.source[0])
    tgt_conn_str = normalize_conn_str(options.target[0])

    # SQL echo is enabled when -v was given more often than -q.
    echo_sql = (options.verbose - options.quiet) > 0
    engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=echo_sql, create_all=False)
    engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=echo_sql, create_all=False)

    session_src = session_tgt = None

    try:
        session_src = Session_src()
        session_tgt = Session_tgt()

        # Total number of tweets in the source, used for the progress display.
        count_tw_query = Tweet.__table__.count()
        count_tw = engine_src.scalar(count_tw_query)

        if count_tw == 0:
            print("No tweet to process : exit")
            # SystemExit still goes through the finally clause below.
            sys.exit()

        query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
        added = 0

        for i, tweet in enumerate(query_src):

            # A tweet is merged only when its id is absent from the target.
            tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()

            progress_text = u"Process: "
            if tweet_count == 0:
                added += 1
                progress_text = u"Adding : "
                tweet_source = tweet.tweet_source.original_json

                tweet_obj = anyjson.deserialize(tweet_source)
                if 'text' not in tweet_obj:
                    # The payload is not a tweet: keep it as a source row in
                    # the target database and log it against that row.  This
                    # branch previously used an undefined ``source_id`` and
                    # raised NameError.
                    # NOTE(review): assumes TweetSource accepts
                    # ``original_json`` as a keyword argument - confirm
                    # against iri_tweet.models.
                    target_source = TweetSource(original_json=tweet_source)
                    session_tgt.add(target_source)
                    # Flush so that the new row gets its primary key.
                    session_tgt.flush()
                    tweet_log = TweetLog(tweet_source_id=target_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
                    session_tgt.add(tweet_log)
                else:
                    tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user)
                    tp.process()

                session_tgt.flush()

            show_progress(i+1, count_tw, repr(progress_text+tweet.text), 70)

        # Everything is committed in one transaction at the end of the run.
        session_tgt.commit()
        print(u"%d new tweet added" % (added))

    finally:
        # Either session may still be None if its creation failed.
        if session_tgt is not None:
            session_tgt.close()
        if session_src is not None:
            session_src.close()