| author | Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com> |
| Mon, 15 Oct 2012 17:01:50 +0200 | |
| changeset 693 | 2ef837069108 |
| parent 464 | b9243ade95e2 |
| child 886 | 1e110b03ae96 |
| permissions | -rw-r--r-- |
|
464
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
1 |
#from models import setup_database |
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
2 |
from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog |
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
3 |
from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress |
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
4 |
import argparse |
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
5 |
import sys |
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
6 |
import re |
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
7 |
import anyjson |
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
8 |
import math |
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
9 |
import codecs |
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
10 |
|
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
11 |
def get_option(args=None):
    """Parse the command line of the tweet database merge script.

    :param args: optional list of argument strings to parse instead of the
        process command line. ``None`` (the default) makes argparse read
        ``sys.argv[1:]``, so existing ``get_option()`` calls are unaffected.
    :returns: the ``argparse.Namespace`` holding ``logfile``, ``verbose``,
        ``quiet``, ``query_user``, ``token_filename``, ``source`` and
        ``target``. ``source`` and ``target`` are one-element lists because
        they are declared with ``nargs=1``.
    """
    parser = argparse.ArgumentParser(description='Merge tweets databases')

    parser.add_argument("-l", "--log", dest="logfile",
                        help="log to file", metavar="LOG", default="stderr")
    # -v and -q are counters: repeating the flag raises the value.
    parser.add_argument("-v", dest="verbose", action="count",
                        help="verbose", default=0)
    parser.add_argument("-q", dest="quiet", action="count",
                        help="quiet", default=0)
    parser.add_argument("--query-user", dest="query_user", action="store_true",
                        help="Query twitter for user information", default=False)
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME",
                        default=".oauth_token", help="Token file name")

    parser.add_argument("source", action="store", nargs=1, type=str, metavar="SOURCE")
    parser.add_argument("target", action="store", nargs=1, type=str, metavar="TARGET")

    return parser.parse_args(args)
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
32 |
|
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
33 |
# A connection string is treated as a database URL when it starts with
# "<scheme>://" followed by at least one character.
_CONN_URL_RE = re.compile(r"^\w+://.+")


def normalize_conn_str(conn_str):
    """Return *conn_str* as a database URL.

    Surrounding whitespace is stripped. A value that does not already look
    like ``scheme://...`` is taken to be a file path and prefixed with
    ``sqlite:///``.
    """
    conn_str = conn_str.strip()
    if not _CONN_URL_RE.match(conn_str):
        conn_str = 'sqlite:///' + conn_str
    return conn_str


if __name__ == "__main__":

    options = get_option()

    # An OAuth token is only needed when user information is queried.
    access_token = None
    if options.query_user:
        access_token = get_oauth_token(options.token_filename)

    # Open source and target; positional arguments are one-element lists.
    src_conn_str = normalize_conn_str(options.source[0])
    tgt_conn_str = normalize_conn_str(options.target[0])

    # SQL echo is on when there are more -v than -q flags.
    echo = (options.verbose - options.quiet) > 0
    engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=echo, create_all=False)
    engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=echo, create_all=False)

    session_src = session_tgt = None

    try:
        session_src = Session_src()
        session_tgt = Session_tgt()

        # Total number of source tweets, used only for the progress display.
        count_tw_query = Tweet.__table__.count()
        count_tw = engine_src.scalar(count_tw_query)

        if count_tw == 0:
            print("No tweet to process : exit")
            # SystemExit still goes through the finally clause below.
            sys.exit()

        query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
        added = 0

        for i, tweet in enumerate(query_src):

            # Only tweets whose id is absent from the target are copied.
            tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()

            progress_text = u"Process: "
            if tweet_count == 0:
                added += 1
                progress_text = u"Adding : "
                tweet_source = tweet.tweet_source.original_json

                tweet_obj = anyjson.deserialize(tweet_source)
                if 'text' not in tweet_obj:
                    # The original code referenced an undefined ``source_id``
                    # here and raised NameError. Record the raw JSON in the
                    # target first so the log entry points at a real row.
                    # NOTE(review): assumes TweetSource accepts ``original_json``
                    # as a keyword and exposes ``id`` -- confirm against
                    # iri_tweet.models.
                    target_source = TweetSource(original_json=tweet_source)
                    session_tgt.add(target_source)
                    session_tgt.flush()
                    tweet_log = TweetLog(tweet_source_id=target_source.id,
                                         status=TweetLog.TWEET_STATUS['NOT_TWEET'])
                    session_tgt.add(tweet_log)
                else:
                    tp = TwitterProcessor(None, tweet_source, None, session_tgt,
                                          access_token, options.token_filename,
                                          user_query_twitter=options.query_user)
                    tp.process()

                session_tgt.flush()

            ptext = progress_text + tweet.text
            show_progress(i + 1, count_tw, ptext.replace("\n", ""), 70)

        session_tgt.commit()
        print(u"%d new tweet added" % (added))

    finally:
        # Sessions stay None when their creation failed.
        if session_tgt is not None:
            session_tgt.close()
        if session_src is not None:
            session_src.close()
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
103 |
|
|
b9243ade95e2
code cleaning and reorganisation for scripts
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
104 |