script/utils/merge_tweets.py
changeset 464 b9243ade95e2
child 693 2ef837069108
child 763 bc29a6fbb8e8
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/merge_tweets.py	Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,105 @@
+#from models import setup_database
+from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
+from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress
+import argparse
+import sys
+import re
+import anyjson
+import math
+import codecs
+
+def get_option():
+    
+    parser = argparse.ArgumentParser(description='Merge tweets databases')
+
+    parser.add_argument("-l", "--log", dest="logfile",
+                        help="log to file", metavar="LOG", default="stderr")
+    parser.add_argument("-v", dest="verbose", action="count",
+                        help="verbose", default=0)
+    parser.add_argument("-q", dest="quiet", action="count",
+                        help="quiet", default=0)
+    parser.add_argument("--query-user", dest="query_user", action="store_true",
+                        help="Query twitter for user information",  default=False)
+    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
+                      help="Token file name")
+
+    
+    parser.add_argument("source", action="store", nargs=1, type=str, metavar="SOURCE")
+    parser.add_argument("target", action="store", nargs=1, type=str, metavar="TARGET")
+    
+
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    
+    sys.stdout = codecs.getwriter(sys.stdout.encoding)(sys.stdout)
+    
+    options = get_option()
+    
+    access_token = None
+    if options.query_user:
+        access_token = get_oauth_token(options.token_filename)
+    
+    #open source
+    src_conn_str = options.source[0].strip()
+    if not re.match("^\w+://.+", src_conn_str):
+        src_conn_str = 'sqlite:///' + src_conn_str
+    tgt_conn_str = options.target[0].strip()
+    if not re.match("^\w+://.+", tgt_conn_str):
+        tgt_conn_str = 'sqlite:///' + tgt_conn_str
+
+
+    engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+    engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
+
+    conn_src = conn_tgt = session_src = session_tgt = None
+    
+    try:
+        #conn_src = engine_src.connect()
+        #conn_tgt = engine_tgt.connect()
+        session_src = Session_src()
+        session_tgt = Session_tgt()
+        
+        count_tw_query = Tweet.__table__.count()
+        
+        count_tw = engine_src.scalar(count_tw_query)
+        
+        if count_tw == 0:
+            print "No tweet to process : exit"
+            sys.exit()
+            
+        query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
+        added = 0
+        
+        for i,tweet in enumerate(query_src):
+            
+            tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()
+            
+            progress_text = u"Process: "
+            if tweet_count == 0:
+                added += 1
+                progress_text = u"Adding : "
+                tweet_source = tweet.tweet_source.original_json
+                                
+                tweet_obj = anyjson.deserialize(tweet_source)
+                if 'text' not in tweet_obj:
+                    tweet_log = TweetLog(tweet_source_id=source_id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
+                    session_tgt.add(tweet_log)
+                else:                
+                    tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user)
+                    tp.process()
+                
+                session_tgt.flush()
+                
+            show_progress(i+1, count_tw, progress_text+tweet.text, 70)
+                            
+        session_tgt.commit()
+        print u"%d new tweet added" % (added)
+        
+    finally:
+        session_tgt.close() if session_tgt is not None else None
+        session_src.close() if session_src is not None else None
+        conn_tgt.close() if conn_tgt is not None else None
+        conn_src.close() if conn_src is not None else None
+        
+        
\ No newline at end of file