script/utils/merge_tweets.py
author Raphael Velt <raph.velt@gmail.com>
Tue, 27 Mar 2012 15:25:13 +0200
changeset 568 34e035553dae
parent 464 b9243ade95e2
child 693 2ef837069108
child 763 bc29a6fbb8e8
permissions -rw-r--r--
Added tag V01.59 for changeset 8ff25a0fde76
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
464
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     1
#from models import setup_database
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     2
from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     3
from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     4
import argparse
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     5
import sys
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     6
import re
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     7
import anyjson
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     8
import math
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     9
import codecs
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    10
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    11
def get_option():
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    12
    
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    13
    parser = argparse.ArgumentParser(description='Merge tweets databases')
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    14
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    15
    parser.add_argument("-l", "--log", dest="logfile",
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    16
                        help="log to file", metavar="LOG", default="stderr")
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    17
    parser.add_argument("-v", dest="verbose", action="count",
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    18
                        help="verbose", default=0)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    19
    parser.add_argument("-q", dest="quiet", action="count",
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    20
                        help="quiet", default=0)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    21
    parser.add_argument("--query-user", dest="query_user", action="store_true",
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    22
                        help="Query twitter for user information",  default=False)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    23
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    24
                      help="Token file name")
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    25
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    26
    
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    27
    parser.add_argument("source", action="store", nargs=1, type=str, metavar="SOURCE")
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    28
    parser.add_argument("target", action="store", nargs=1, type=str, metavar="TARGET")
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    29
    
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    30
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    31
    return parser.parse_args()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    32
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    33
if __name__ == "__main__":
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    34
    
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    35
    sys.stdout = codecs.getwriter(sys.stdout.encoding)(sys.stdout)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    36
    
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    37
    options = get_option()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    38
    
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    39
    access_token = None
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    40
    if options.query_user:
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    41
        access_token = get_oauth_token(options.token_filename)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    42
    
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    43
    #open source
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    44
    src_conn_str = options.source[0].strip()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    45
    if not re.match("^\w+://.+", src_conn_str):
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    46
        src_conn_str = 'sqlite:///' + src_conn_str
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    47
    tgt_conn_str = options.target[0].strip()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    48
    if not re.match("^\w+://.+", tgt_conn_str):
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    49
        tgt_conn_str = 'sqlite:///' + tgt_conn_str
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    50
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    51
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    52
    engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    53
    engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    54
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    55
    conn_src = conn_tgt = session_src = session_tgt = None
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    56
    
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    57
    try:
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    58
        #conn_src = engine_src.connect()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    59
        #conn_tgt = engine_tgt.connect()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    60
        session_src = Session_src()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    61
        session_tgt = Session_tgt()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    62
        
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    63
        count_tw_query = Tweet.__table__.count()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    64
        
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    65
        count_tw = engine_src.scalar(count_tw_query)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    66
        
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    67
        if count_tw == 0:
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    68
            print "No tweet to process : exit"
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    69
            sys.exit()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    70
            
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    71
        query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    72
        added = 0
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    73
        
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    74
        for i,tweet in enumerate(query_src):
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    75
            
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    76
            tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    77
            
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    78
            progress_text = u"Process: "
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    79
            if tweet_count == 0:
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    80
                added += 1
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    81
                progress_text = u"Adding : "
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    82
                tweet_source = tweet.tweet_source.original_json
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    83
                                
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    84
                tweet_obj = anyjson.deserialize(tweet_source)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    85
                if 'text' not in tweet_obj:
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    86
                    tweet_log = TweetLog(tweet_source_id=source_id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    87
                    session_tgt.add(tweet_log)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    88
                else:                
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    89
                    tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    90
                    tp.process()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    91
                
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    92
                session_tgt.flush()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    93
                
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    94
            show_progress(i+1, count_tw, progress_text+tweet.text, 70)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    95
                            
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    96
        session_tgt.commit()
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    97
        print u"%d new tweet added" % (added)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    98
        
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    99
    finally:
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   100
        session_tgt.close() if session_tgt is not None else None
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   101
        session_src.close() if session_src is not None else None
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   102
        conn_tgt.close() if conn_tgt is not None else None
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   103
        conn_src.close() if conn_src is not None else None
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   104
        
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   105