script/utils/merge_tweets.py
changeset 1496:184372ec27e2
parent 957:e4d0094f097b
child 1497:14a9bed2e3cd
Legend: equal | deleted | inserted | replaced
1495:efbda157eb57 1496:184372ec27e2
     1 #from models import setup_database
     1 #from models import setup_database
     2 from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
       
     3 from iri_tweet.processor import TwitterProcessorStatus
       
     4 from iri_tweet.utils import get_oauth_token, show_progress
       
     5 import anyjson
       
     6 import argparse
     2 import argparse
     7 import codecs
     3 import codecs
       
     4 import json
     8 import logging
     5 import logging
     9 import re
     6 import re
    10 import sys
     7 import sys
       
     8 
       
     9 from iri_tweet.models import Tweet, TweetLog, TweetSource, setup_database
       
    10 from iri_tweet.processor import TwitterProcessorStatus
       
    11 from iri_tweet.utils import get_oauth_token, show_progress
    11 
    12 
    12 logger = logging.getLogger(__name__)
    13 logger = logging.getLogger(__name__)
    13 
    14 
    14 def get_option():
    15 def get_option():
    15     
    16     
    47     if options.query_user:
    48     if options.query_user:
    48         access_token = get_oauth_token(options.consumer_key, options.consumer_secret, options.token_filename)
    49         access_token = get_oauth_token(options.consumer_key, options.consumer_secret, options.token_filename)
    49     
    50     
    50     #open source
    51     #open source
    51     src_conn_str = options.source[0].strip()
    52     src_conn_str = options.source[0].strip()
    52     if not re.match("^\w+://.+", src_conn_str):
    53     if not re.match(r"^\w+://.+", src_conn_str):
    53         src_conn_str = 'sqlite:///' + src_conn_str
    54         src_conn_str = 'sqlite:///' + src_conn_str
    54     tgt_conn_str = options.target[0].strip()
    55     tgt_conn_str = options.target[0].strip()
    55     if not re.match("^\w+://.+", tgt_conn_str):
    56     if not re.match(r"^\w+://.+", tgt_conn_str):
    56         tgt_conn_str = 'sqlite:///' + tgt_conn_str
    57         tgt_conn_str = 'sqlite:///' + tgt_conn_str
    57 
    58 
    58 
    59 
    59     engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
    60     engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
    60     engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
    61     engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
    64     try:
    65     try:
    65         #conn_src = engine_src.connect()
    66         #conn_src = engine_src.connect()
    66         #conn_tgt = engine_tgt.connect()
    67         #conn_tgt = engine_tgt.connect()
    67         session_src = Session_src()
    68         session_src = Session_src()
    68         session_tgt = Session_tgt()
    69         session_tgt = Session_tgt()
    69         
    70                 
    70         count_tw_query = Tweet.__table__.count()  # @UndefinedVariable
    71         count_tw = session_src.query(Tweet).count()
    71         
       
    72         count_tw = engine_src.scalar(count_tw_query)
       
    73         
    72         
    74         if count_tw == 0:
    73         if count_tw == 0:
    75             print "No tweet to process : exit"
    74             print("No tweet to process : exit")
    76             sys.exit()
    75             sys.exit()
    77             
    76             
    78         query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
    77         query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
    79         added = 0
    78         added = 0
    80         
    79         
    86             if tweet_count == 0:
    85             if tweet_count == 0:
    87                 added += 1
    86                 added += 1
    88                 progress_text = u"Adding : "
    87                 progress_text = u"Adding : "
    89                 tweet_source = tweet.tweet_source.original_json
    88                 tweet_source = tweet.tweet_source.original_json
    90                                 
    89                                 
    91                 tweet_obj = anyjson.deserialize(tweet_source)
    90                 tweet_obj = json.loads(tweet_source)
    92                 if 'text' not in tweet_obj:
    91                 if 'text' not in tweet_obj:
    93                     tweet_log = TweetLog(tweet_source_id=tweet.tweet_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
    92                     tweet_log = TweetLog(tweet_source_id=tweet.tweet_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
    94                     session_tgt.add(tweet_log)
    93                     session_tgt.add(tweet_log)
    95                 else:                
    94                 else:                
    96                     tp = TwitterProcessorStatus(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user, logger=logger)
    95                     tp = TwitterProcessorStatus(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user, logger=logger)
   100                 
    99                 
   101             ptext = progress_text + tweet.text
   100             ptext = progress_text + tweet.text
   102             writer = show_progress(i+1, count_tw, ptext.replace("\n",""), 70, writer)
   101             writer = show_progress(i+1, count_tw, ptext.replace("\n",""), 70, writer)
   103                             
   102                             
   104         session_tgt.commit()
   103         session_tgt.commit()
   105         print u"%d new tweet added" % (added)
   104         print(u"%d new tweet added" % (added,))
   106         
   105         
   107     finally:
   106     finally:
   108         if session_tgt is not None:
   107         if session_tgt is not None:
   109             session_tgt.close()
   108             session_tgt.close()
   110         if session_src is not None:
   109         if session_src is not None:
   111             session_src.close()
   110             session_src.close()
   112         if conn_tgt is not None:
   111         if conn_tgt is not None:
   113             conn_tgt.close()
   112             conn_tgt.close()
   114         if conn_src is not None:
   113         if conn_src is not None:
   115             conn_src.close()
   114             conn_src.close()
   116         
       
   117