script/utils/merge_tweets.py
changeset 888 6fc6637d8403
parent 886 1e110b03ae96
child 957 e4d0094f097b
equal deleted inserted replaced
887:503f9a7b7d6c 888:6fc6637d8403
     1 #from models import setup_database
     1 #from models import setup_database
     2 from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
     2 from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
     3 from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress
     3 from iri_tweet.processor import TwitterProcessorStatus
       
     4 from iri_tweet.utils import get_oauth_token, show_progress
       
     5 import anyjson
     4 import argparse
     6 import argparse
       
     7 import codecs
       
     8 import logging
       
     9 import re
     5 import sys
    10 import sys
     6 import re
    11 
     7 import anyjson
    12 logger = logging.getLogger(__name__)
     8 import math
       
     9 import codecs
       
    10 
    13 
    11 def get_option():
    14 def get_option():
    12     
    15     
    13     parser = argparse.ArgumentParser(description='Merge tweets databases')
    16     parser = argparse.ArgumentParser(description='Merge tweets databases')
    14 
    17 
    15     parser.add_argument("-l", "--log", dest="logfile",
    18     parser.add_argument("-l", "--log", dest="logfile",
    16                         help="log to file", metavar="LOG", default="stderr")
    19                         help="log to file", metavar="LOG", default="stderr")
    17     parser.add_argument("-v", dest="verbose", action="count",
    20     parser.add_argument("-v", dest="verbose", action="count",
    18                         help="verbose", default=0)
    21                         help="verbose", default=0)
       
    22     parser.add_option("-k", "--key", dest="consumer_key",
       
    23                       help="Twitter consumer key", metavar="CONSUMER_KEY")
       
    24     parser.add_option("-s", "--secret", dest="consumer_secret",
       
    25                       help="Twitter consumer secret", metavar="CONSUMER_SECRET")
    19     parser.add_argument("-q", dest="quiet", action="count",
    26     parser.add_argument("-q", dest="quiet", action="count",
    20                         help="quiet", default=0)
    27                         help="quiet", default=0)
    21     parser.add_argument("--query-user", dest="query_user", action="store_true",
    28     parser.add_argument("--query-user", dest="query_user", action="store_true",
    22                         help="Query twitter for user information",  default=False)
    29                         help="Query twitter for user information",  default=False)
    23     parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
    30     parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
    36     
    43     
    37     options = get_option()
    44     options = get_option()
    38     
    45     
    39     access_token = None
    46     access_token = None
    40     if options.query_user:
    47     if options.query_user:
    41         access_token = get_oauth_token(options.token_filename)
    48         access_token = get_oauth_token(options.consumer_key, options.consumer_secret, options.token_filename)
    42     
    49     
    43     #open source
    50     #open source
    44     src_conn_str = options.source[0].strip()
    51     src_conn_str = options.source[0].strip()
    45     if not re.match("^\w+://.+", src_conn_str):
    52     if not re.match("^\w+://.+", src_conn_str):
    46         src_conn_str = 'sqlite:///' + src_conn_str
    53         src_conn_str = 'sqlite:///' + src_conn_str
    58         #conn_src = engine_src.connect()
    65         #conn_src = engine_src.connect()
    59         #conn_tgt = engine_tgt.connect()
    66         #conn_tgt = engine_tgt.connect()
    60         session_src = Session_src()
    67         session_src = Session_src()
    61         session_tgt = Session_tgt()
    68         session_tgt = Session_tgt()
    62         
    69         
    63         count_tw_query = Tweet.__table__.count()
    70         count_tw_query = Tweet.__table__.count()  # @UndefinedVariable
    64         
    71         
    65         count_tw = engine_src.scalar(count_tw_query)
    72         count_tw = engine_src.scalar(count_tw_query)
    66         
    73         
    67         if count_tw == 0:
    74         if count_tw == 0:
    68             print "No tweet to process : exit"
    75             print "No tweet to process : exit"
    81                 progress_text = u"Adding : "
    88                 progress_text = u"Adding : "
    82                 tweet_source = tweet.tweet_source.original_json
    89                 tweet_source = tweet.tweet_source.original_json
    83                                 
    90                                 
    84                 tweet_obj = anyjson.deserialize(tweet_source)
    91                 tweet_obj = anyjson.deserialize(tweet_source)
    85                 if 'text' not in tweet_obj:
    92                 if 'text' not in tweet_obj:
    86                     tweet_log = TweetLog(tweet_source_id=source_id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
    93                     tweet_log = TweetLog(tweet_source_id=tweet.tweet_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
    87                     session_tgt.add(tweet_log)
    94                     session_tgt.add(tweet_log)
    88                 else:                
    95                 else:                
    89                     tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user)
    96                     tp = TwitterProcessorStatus(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user, logger=logger)
    90                     tp.process()
    97                     tp.process()
    91                 
    98                 
    92                 session_tgt.flush()
    99                 session_tgt.flush()
    93                 
   100                 
    94             ptext = progress_text + tweet.text
   101             ptext = progress_text + tweet.text
    96                             
   103                             
    97         session_tgt.commit()
   104         session_tgt.commit()
    98         print u"%d new tweet added" % (added)
   105         print u"%d new tweet added" % (added)
    99         
   106         
   100     finally:
   107     finally:
   101         session_tgt.close() if session_tgt is not None else None
   108         if session_tgt is not None:
   102         session_src.close() if session_src is not None else None
   109             session_tgt.close()
   103         conn_tgt.close() if conn_tgt is not None else None
   110         if session_src is not None:
   104         conn_src.close() if conn_src is not None else None
   111             session_src.close()
       
   112         if conn_tgt is not None:
       
   113             conn_tgt.close()
       
   114         if conn_src is not None:
       
   115             conn_src.close()
   105         
   116         
   106         
   117