--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/merge_tweets.py Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,105 @@
+#from models import setup_database
+from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
+from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress
+import argparse
+import sys
+import re
+import anyjson
+import math
+import codecs
+
+def get_option():
+
+ parser = argparse.ArgumentParser(description='Merge tweets databases')
+
+ parser.add_argument("-l", "--log", dest="logfile",
+ help="log to file", metavar="LOG", default="stderr")
+ parser.add_argument("-v", dest="verbose", action="count",
+ help="verbose", default=0)
+ parser.add_argument("-q", dest="quiet", action="count",
+ help="quiet", default=0)
+ parser.add_argument("--query-user", dest="query_user", action="store_true",
+ help="Query twitter for user information", default=False)
+ parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
+ help="Token file name")
+
+
+ parser.add_argument("source", action="store", nargs=1, type=str, metavar="SOURCE")
+ parser.add_argument("target", action="store", nargs=1, type=str, metavar="TARGET")
+
+
+ return parser.parse_args()
+
+if __name__ == "__main__":
+
+ sys.stdout = codecs.getwriter(sys.stdout.encoding)(sys.stdout)
+
+ options = get_option()
+
+ access_token = None
+ if options.query_user:
+ access_token = get_oauth_token(options.token_filename)
+
+ #open source
+ src_conn_str = options.source[0].strip()
+ if not re.match("^\w+://.+", src_conn_str):
+ src_conn_str = 'sqlite:///' + src_conn_str
+ tgt_conn_str = options.target[0].strip()
+ if not re.match("^\w+://.+", tgt_conn_str):
+ tgt_conn_str = 'sqlite:///' + tgt_conn_str
+
+
+ engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+ engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+
+ conn_src = conn_tgt = session_src = session_tgt = None
+
+ try:
+ #conn_src = engine_src.connect()
+ #conn_tgt = engine_tgt.connect()
+ session_src = Session_src()
+ session_tgt = Session_tgt()
+
+ count_tw_query = Tweet.__table__.count()
+
+ count_tw = engine_src.scalar(count_tw_query)
+
+ if count_tw == 0:
+ print "No tweet to process : exit"
+ sys.exit()
+
+ query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
+ added = 0
+
+ for i,tweet in enumerate(query_src):
+
+ tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()
+
+ progress_text = u"Process: "
+ if tweet_count == 0:
+ added += 1
+ progress_text = u"Adding : "
+ tweet_source = tweet.tweet_source.original_json
+
+ tweet_obj = anyjson.deserialize(tweet_source)
+ if 'text' not in tweet_obj:
+ tweet_log = TweetLog(tweet_source_id=source_id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
+ session_tgt.add(tweet_log)
+ else:
+ tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user)
+ tp.process()
+
+ session_tgt.flush()
+
+ show_progress(i+1, count_tw, progress_text+tweet.text, 70)
+
+ session_tgt.commit()
+ print u"%d new tweet added" % (added)
+
+ finally:
+ session_tgt.close() if session_tgt is not None else None
+ session_src.close() if session_src is not None else None
+ conn_tgt.close() if conn_tgt is not None else None
+ conn_src.close() if conn_src is not None else None
+
+
\ No newline at end of file