|
1 #from models import setup_database |
|
2 from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog |
|
3 from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress |
|
4 import argparse |
|
5 import sys |
|
6 import re |
|
7 import anyjson |
|
8 import math |
|
9 import codecs |
|
10 |
|
def get_option():
    """Parse the command line of the tweet database merge script.

    Recognised options are ``-l/--log``, ``-v``, ``-q``, ``--query-user``
    and ``-t``, followed by the two positionals ``source`` and ``target``.

    Returns the namespace produced by ``argparse``.  Both positionals are
    declared with ``nargs=1``, so each one is a single-element list.
    """
    cli = argparse.ArgumentParser(description='Merge tweets databases')

    cli.add_argument("-l", "--log", dest="logfile", metavar="LOG",
                     default="stderr", help="log to file")

    # -v and -q are repeatable counters; the help text equals the dest name.
    for flag, dest_name in (("-v", "verbose"), ("-q", "quiet")):
        cli.add_argument(flag, dest=dest_name, action="count",
                         help=dest_name, default=0)

    cli.add_argument("--query-user", dest="query_user", action="store_true",
                     default=False,
                     help="Query twitter for user information")
    cli.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME",
                     default=".oauth_token", help="Token file name")

    # Positional database locations, each displayed in upper case in usage.
    for positional in ("source", "target"):
        cli.add_argument(positional, action="store", nargs=1, type=str,
                         metavar=positional.upper())

    return cli.parse_args()
|
32 |
|
def _as_connection_url(location):
    """Return *location* as a database URL, treating bare paths as SQLite.

    Surrounding whitespace is stripped.  A value that already has the form
    ``scheme://rest`` is returned as is; anything else is taken to be a file
    path and prefixed with ``sqlite:///``.
    """
    location = location.strip()
    if re.match(r"^\w+://.+", location):
        return location
    return 'sqlite:///' + location


if __name__ == "__main__":

    # Wrap stdout so the unicode progress text can be written.  Under
    # Python 2, sys.stdout.encoding is None when output is piped or
    # redirected, which would make codecs.getwriter() fail; fall back to
    # UTF-8 in that case.
    sys.stdout = codecs.getwriter(sys.stdout.encoding or 'utf-8')(sys.stdout)

    options = get_option()

    # An OAuth token is only needed when user details are fetched from Twitter.
    access_token = None
    if options.query_user:
        access_token = get_oauth_token(options.token_filename)

    # Positionals are one-element lists (nargs=1).
    src_conn_str = _as_connection_url(options.source[0])
    tgt_conn_str = _as_connection_url(options.target[0])

    # Echo SQL only when -v was given more often than -q.
    echo_sql = (options.verbose - options.quiet) > 0
    engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=echo_sql, create_all=False)
    engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=echo_sql, create_all=False)

    session_src = session_tgt = None

    try:
        session_src = Session_src()
        session_tgt = Session_tgt()

        count_tw = engine_src.scalar(Tweet.__table__.count())

        if count_tw == 0:
            # Single-argument parenthesised print behaves identically to the
            # print statement under Python 2.
            print("No tweet to process : exit")
            sys.exit()

        query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
        added = 0

        for i, tweet in enumerate(query_src):

            # Only tweets whose id is absent from the target are imported.
            tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()

            progress_text = u"Process: "
            if tweet_count == 0:
                added += 1
                progress_text = u"Adding : "
                tweet_source = tweet.tweet_source.original_json

                tweet_obj = anyjson.deserialize(tweet_source)
                if 'text' not in tweet_obj:
                    # The payload is not a tweet: store it in the target and
                    # log it as NOT_TWEET.  This branch used to reference an
                    # undefined name (source_id) and raised NameError.
                    # NOTE(review): assumes TweetSource accepts original_json
                    # as a keyword argument, the way TweetLog accepts its
                    # columns - confirm against iri_tweet.models.
                    target_source = TweetSource(original_json=tweet_source)
                    session_tgt.add(target_source)
                    # Flush so the database assigns target_source.id
                    # (presumably a generated primary key - confirm).
                    session_tgt.flush()
                    tweet_log = TweetLog(tweet_source_id=target_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
                    session_tgt.add(tweet_log)
                else:
                    tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user)
                    tp.process()

                session_tgt.flush()

            show_progress(i + 1, count_tw, progress_text + tweet.text, 70)

        session_tgt.commit()
        print(u"%d new tweet added" % (added))

    finally:
        # Close whichever sessions were actually opened, target first.
        if session_tgt is not None:
            session_tgt.close()
        if session_src is not None:
            session_src.close()
|
104 |
|
105 |