script/utils/merge_tweets.py
changeset 464 b9243ade95e2
child 693 2ef837069108
child 763 bc29a6fbb8e8
equal deleted inserted replaced
463:d3b86c65c980 464:b9243ade95e2
       
     1 #from models import setup_database
       
     2 from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
       
     3 from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress
       
     4 import argparse
       
     5 import sys
       
     6 import re
       
     7 import anyjson
       
     8 import math
       
     9 import codecs
       
    10 
       
    11 def get_option():
       
    12     
       
    13     parser = argparse.ArgumentParser(description='Merge tweets databases')
       
    14 
       
    15     parser.add_argument("-l", "--log", dest="logfile",
       
    16                         help="log to file", metavar="LOG", default="stderr")
       
    17     parser.add_argument("-v", dest="verbose", action="count",
       
    18                         help="verbose", default=0)
       
    19     parser.add_argument("-q", dest="quiet", action="count",
       
    20                         help="quiet", default=0)
       
    21     parser.add_argument("--query-user", dest="query_user", action="store_true",
       
    22                         help="Query twitter for user information",  default=False)
       
    23     parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
       
    24                       help="Token file name")
       
    25 
       
    26     
       
    27     parser.add_argument("source", action="store", nargs=1, type=str, metavar="SOURCE")
       
    28     parser.add_argument("target", action="store", nargs=1, type=str, metavar="TARGET")
       
    29     
       
    30 
       
    31     return parser.parse_args()
       
    32 
       
    33 if __name__ == "__main__":
       
    34     
       
    35     sys.stdout = codecs.getwriter(sys.stdout.encoding)(sys.stdout)
       
    36     
       
    37     options = get_option()
       
    38     
       
    39     access_token = None
       
    40     if options.query_user:
       
    41         access_token = get_oauth_token(options.token_filename)
       
    42     
       
    43     #open source
       
    44     src_conn_str = options.source[0].strip()
       
    45     if not re.match("^\w+://.+", src_conn_str):
       
    46         src_conn_str = 'sqlite:///' + src_conn_str
       
    47     tgt_conn_str = options.target[0].strip()
       
    48     if not re.match("^\w+://.+", tgt_conn_str):
       
    49         tgt_conn_str = 'sqlite:///' + tgt_conn_str
       
    50 
       
    51 
       
    52     engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
       
    53     engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
       
    54 
       
    55     conn_src = conn_tgt = session_src = session_tgt = None
       
    56     
       
    57     try:
       
    58         #conn_src = engine_src.connect()
       
    59         #conn_tgt = engine_tgt.connect()
       
    60         session_src = Session_src()
       
    61         session_tgt = Session_tgt()
       
    62         
       
    63         count_tw_query = Tweet.__table__.count()
       
    64         
       
    65         count_tw = engine_src.scalar(count_tw_query)
       
    66         
       
    67         if count_tw == 0:
       
    68             print "No tweet to process : exit"
       
    69             sys.exit()
       
    70             
       
    71         query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
       
    72         added = 0
       
    73         
       
    74         for i,tweet in enumerate(query_src):
       
    75             
       
    76             tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()
       
    77             
       
    78             progress_text = u"Process: "
       
    79             if tweet_count == 0:
       
    80                 added += 1
       
    81                 progress_text = u"Adding : "
       
    82                 tweet_source = tweet.tweet_source.original_json
       
    83                                 
       
    84                 tweet_obj = anyjson.deserialize(tweet_source)
       
    85                 if 'text' not in tweet_obj:
       
    86                     tweet_log = TweetLog(tweet_source_id=source_id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
       
    87                     session_tgt.add(tweet_log)
       
    88                 else:                
       
    89                     tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user)
       
    90                     tp.process()
       
    91                 
       
    92                 session_tgt.flush()
       
    93                 
       
    94             show_progress(i+1, count_tw, progress_text+tweet.text, 70)
       
    95                             
       
    96         session_tgt.commit()
       
    97         print u"%d new tweet added" % (added)
       
    98         
       
    99     finally:
       
   100         session_tgt.close() if session_tgt is not None else None
       
   101         session_src.close() if session_src is not None else None
       
   102         conn_tgt.close() if conn_tgt is not None else None
       
   103         conn_src.close() if conn_src is not None else None
       
   104         
       
   105