diff -r d3b86c65c980 -r b9243ade95e2 script/utils/merge_tweets.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/utils/merge_tweets.py Sat Jan 07 16:12:44 2012 +0100 @@ -0,0 +1,105 @@ +#from models import setup_database +from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog +from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress +import argparse +import sys +import re +import anyjson +import math +import codecs + +def get_option(): + + parser = argparse.ArgumentParser(description='Merge tweets databases') + + parser.add_argument("-l", "--log", dest="logfile", + help="log to file", metavar="LOG", default="stderr") + parser.add_argument("-v", dest="verbose", action="count", + help="verbose", default=0) + parser.add_argument("-q", dest="quiet", action="count", + help="quiet", default=0) + parser.add_argument("--query-user", dest="query_user", action="store_true", + help="Query twitter for user information", default=False) + parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token", + help="Token file name") + + + parser.add_argument("source", action="store", nargs=1, type=str, metavar="SOURCE") + parser.add_argument("target", action="store", nargs=1, type=str, metavar="TARGET") + + + return parser.parse_args() + +if __name__ == "__main__": + + sys.stdout = codecs.getwriter(sys.stdout.encoding)(sys.stdout) + + options = get_option() + + access_token = None + if options.query_user: + access_token = get_oauth_token(options.token_filename) + + #open source + src_conn_str = options.source[0].strip() + if not re.match("^\w+://.+", src_conn_str): + src_conn_str = 'sqlite:///' + src_conn_str + tgt_conn_str = options.target[0].strip() + if not re.match("^\w+://.+", tgt_conn_str): + tgt_conn_str = 'sqlite:///' + tgt_conn_str + + + engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False) + engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False) + + conn_src = conn_tgt = session_src = session_tgt = None + + try: + #conn_src = engine_src.connect() + #conn_tgt = engine_tgt.connect() + session_src = Session_src() + session_tgt = Session_tgt() + + count_tw_query = Tweet.__table__.count() + + count_tw = engine_src.scalar(count_tw_query) + + if count_tw == 0: + print "No tweet to process : exit" + sys.exit() + + query_src = session_src.query(Tweet).join(TweetSource).yield_per(100) + added = 0 + + for i,tweet in enumerate(query_src): + + tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count() + + progress_text = u"Process: " + if tweet_count == 0: + added += 1 + progress_text = u"Adding : " + tweet_source = tweet.tweet_source.original_json + + tweet_obj = anyjson.deserialize(tweet_source) + if 'text' not in tweet_obj: + tweet_log = TweetLog(tweet_source_id=source_id, status=TweetLog.TWEET_STATUS['NOT_TWEET']) + session_tgt.add(tweet_log) + else: + tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user) + tp.process() + + session_tgt.flush() + + show_progress(i+1, count_tw, progress_text+tweet.text, 70) + + session_tgt.commit() + print u"%d new tweet added" % (added) + + finally: + session_tgt.close() if session_tgt is not None else None + session_src.close() if session_src is not None else None + conn_tgt.close() if conn_tgt is not None else None + conn_src.close() if conn_src is not None else None + + \ No newline at end of file