script/utils/merge_tweets.py
author ymh <ymh.work@gmail.com>
Wed, 18 Dec 2019 12:01:20 +0100
changeset 1525 3c96e9ef3d64
parent 1497 14a9bed2e3cd
permissions -rw-r--r--
update code settings + some dependencies version
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
464
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     1
#from models import setup_database
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     2
import argparse
888
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 886
diff changeset
     3
import codecs
1496
184372ec27e2 upgrade to python 3 and twitter api
ymh <ymh.work@gmail.com>
parents: 957
diff changeset
     4
import json
888
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 886
diff changeset
     5
import logging
464
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     6
import re
888
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 886
diff changeset
     7
import sys
1497
14a9bed2e3cd Adapt recorder_stream to python 3
ymh <ymh.work@gmail.com>
parents: 1496
diff changeset
     8
import twitter
888
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 886
diff changeset
     9
1496
184372ec27e2 upgrade to python 3 and twitter api
ymh <ymh.work@gmail.com>
parents: 957
diff changeset
    10
from iri_tweet.models import Tweet, TweetLog, TweetSource, setup_database
184372ec27e2 upgrade to python 3 and twitter api
ymh <ymh.work@gmail.com>
parents: 957
diff changeset
    11
from iri_tweet.processor import TwitterProcessorStatus
184372ec27e2 upgrade to python 3 and twitter api
ymh <ymh.work@gmail.com>
parents: 957
diff changeset
    12
from iri_tweet.utils import get_oauth_token, show_progress
184372ec27e2 upgrade to python 3 and twitter api
ymh <ymh.work@gmail.com>
parents: 957
diff changeset
    13
888
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 886
diff changeset
    14
logger = logging.getLogger(__name__)
464
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    15
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    16
def get_option(argv=None):
    """Parse the command line of the tweet database merge script.

    Args:
        argv: optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line, which
            is what the script did before this parameter existed.

    Returns:
        The ``argparse.Namespace`` with the attributes ``logfile``,
        ``verbose``, ``consumer_key``, ``consumer_secret``, ``quiet``,
        ``query_user``, ``token_filename``, ``source`` and ``target``.
        ``source`` and ``target`` are one-element lists because they are
        declared with ``nargs=1``.
    """
    parser = argparse.ArgumentParser(description='Merge tweets databases')

    parser.add_argument("-l", "--log", dest="logfile",
                        help="log to file", metavar="LOG", default="stderr")
    # -v and -q are counters: each repetition increments the value.
    parser.add_argument("-v", dest="verbose", action="count",
                        help="verbose", default=0)
    parser.add_argument("-k", "--key", dest="consumer_key",
                        help="Twitter consumer key", metavar="CONSUMER_KEY")
    parser.add_argument("-s", "--secret", dest="consumer_secret",
                        help="Twitter consumer secret", metavar="CONSUMER_SECRET")
    parser.add_argument("-q", dest="quiet", action="count",
                        help="quiet", default=0)
    parser.add_argument("--query-user", dest="query_user", action="store_true",
                        help="Query twitter for user information", default=False)
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME",
                        default=".oauth_token", help="Token file name")

    # Positional arguments; nargs=1 keeps them as single-item lists.
    parser.add_argument("source", action="store", nargs=1, type=str, metavar="SOURCE")
    parser.add_argument("target", action="store", nargs=1, type=str, metavar="TARGET")

    return parser.parse_args(argv)
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    41
b9243ade95e2 code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    42
def _to_connection_string(location):
    """Return *location* as a database URL, defaulting to SQLite.

    Surrounding whitespace is stripped. A value that already has the form
    ``scheme://...`` is returned as is; anything else is prefixed with
    ``sqlite:///``.
    """
    location = location.strip()
    if re.match(r"^\w+://.+", location):
        return location
    return 'sqlite:///' + location


def main():
    """Merge the tweets of the SOURCE database into the TARGET database.

    Every tweet read from the source is looked up by id in the target.
    When it is absent, its original JSON is either run through
    ``TwitterProcessorStatus`` against the target session or, when the
    JSON has no ``text`` key, recorded as a ``NOT_TWEET`` ``TweetLog``.
    The target session is committed once, after the whole loop; both
    sessions are closed in all cases.
    """
    writer = None
    options = get_option()

    twitter_auth = None
    if options.query_user:
        access_token_key, access_token_secret = get_oauth_token(
            consumer_key=options.consumer_key,
            consumer_secret=options.consumer_secret,
            token_file_path=options.token_filename)
        twitter_auth = twitter.OAuth(access_token_key, access_token_secret,
                                     options.consumer_key, options.consumer_secret)

    src_conn_str = _to_connection_string(options.source[0])
    tgt_conn_str = _to_connection_string(options.target[0])

    # SQL echo is enabled only when -v was given more often than -q.
    echo = (options.verbose - options.quiet) > 0
    _engine_src, _metadata_src, Session_src = setup_database(src_conn_str, echo=echo, create_all=False)
    _engine_tgt, _metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=echo, create_all=False)

    session_src = session_tgt = None

    try:
        session_src = Session_src()
        session_tgt = Session_tgt()

        count_tw = session_src.query(Tweet).count()

        if count_tw == 0:
            print("No tweet to process : exit")
            sys.exit()

        # NOTE(review): count_tw counts every Tweet while this query joins
        # TweetSource; if the join drops rows the progress total is too
        # high - confirm against the models.
        query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
        added = 0

        for i, tweet in enumerate(query_src):

            tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()

            progress_text = u"Process: "
            if tweet_count == 0:
                added += 1
                progress_text = u"Adding : "
                tweet_source = tweet.tweet_source.original_json

                tweet_obj = json.loads(tweet_source)
                if 'text' not in tweet_obj:
                    # NOTE(review): tweet.tweet_source.id is read from the
                    # source session but stored through the target session;
                    # confirm this id is meaningful in the target database.
                    tweet_log = TweetLog(tweet_source_id=tweet.tweet_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
                    session_tgt.add(tweet_log)
                else:
                    tp = TwitterProcessorStatus(None, tweet_source, None, session_tgt, twitter_auth=twitter_auth, user_query_twitter=options.query_user, logger=logger)
                    tp.process()

                session_tgt.flush()

            ptext = progress_text + tweet.text
            writer = show_progress(i+1, count_tw, ptext.replace("\n", ""), 70, writer)

        session_tgt.commit()
        print(u"%d new tweet added" % (added,))

    finally:
        if session_tgt is not None:
            session_tgt.close()
        if session_src is not None:
            session_src.close()


if __name__ == "__main__":
    main()