|
1 from iri_tweet import models, utils |
|
2 from sqlalchemy.orm import sessionmaker |
|
3 import anyjson |
|
4 import sqlite3 |
|
5 import twitter |
|
6 import re |
|
7 import requests |
|
8 from optparse import OptionParser |
|
9 import simplejson |
|
10 import time |
|
11 from blessings import Terminal |
|
12 import sys |
|
13 import math |
|
14 from symbol import except_clause |
|
15 |
|
# Application name handed to utils.get_oauth_token() when the OAuth token
# is requested.
APPLICATION_NAME = "Tweet recorder user"

# OAuth consumer credentials, used both to obtain the access token and to
# sign the Twitter API calls.
# NOTE(review): the consumer secret is committed in source; consider loading
# it from configuration or the environment instead.
CONSUMER_KEY = "Vdr5ZcsjI1G3esTPI8yDg"
CONSUMER_SECRET = "LMhNrY99R6a7E0YbZZkRFpUZpX5EfB1qATbDk1sIVLs"
|
19 |
|
20 |
|
class TopsyResource(object):
    """Iterable over the items of a paginated Topsy search.

    Iterating the object performs the HTTP request lazily, yields every
    entry of ``response.list`` in the decoded JSON body, then requests the
    following page, until a page comes back with an empty or missing list.
    """

    def __init__(self, query, **kwargs):
        """Store the search parameters; no request is sent yet.

        query -- search expression, sent as the ``q`` parameter
        kwargs -- extra query-string parameters sent with every request;
                  a ``url`` entry, if given, also replaces the default
                  endpoint
        """
        self.options = kwargs
        self.options['q'] = query
        self.url = kwargs.get("url", "http://otter.topsy.com/search.json")
        self.page = 0
        self.req = None
        self.res = {}

    @staticmethod
    def _decode(response):
        """Return the decoded JSON body of ``response`` as-is, or ``{}``.

        ``json`` is a property in old ``requests`` releases and a method in
        newer ones, so both forms are accepted.  An empty or undecodable
        body reported as ``None`` becomes ``{}`` so that callers can keep
        treating the result as a dict.
        """
        payload = response.json
        if callable(payload):
            payload = payload()
        return payload or {}

    def _response(self):
        """Return the ``response`` object of the last body, or ``{}``."""
        body = self.res.get("response") if isinstance(self.res, dict) else None
        return body if isinstance(body, dict) else {}

    def __fetch(self, **extra):
        """Send the query with ``extra`` parameters and store the result."""
        params = dict(self.options)
        params.update(extra)
        self.req = requests.get(self.url, params=params)
        self.res = self._decode(self.req)

    def __initialize(self):
        """Fetch the first page of results."""
        self.__fetch()

    def __next_page(self):
        """Fetch the page following the one currently held in ``self.res``."""
        # Presumably the API echoes the current page number in
        # ``response.page``; a missing value raises TypeError as before.
        self.__fetch(page=self._response().get("page") + 1)

    def __iter__(self):
        """Yield every result item, requesting further pages as needed."""
        if not self.req:
            self.__initialize()
        while True:
            items = self._response().get("list")
            if not items:
                break
            for item in items:
                yield item
            self.__next_page()

    def total(self):
        """Return ``response.total`` of the last body, or 0 if unavailable."""
        if not self.res:
            return 0
        return self._response().get("total", 0)
|
60 |
|
61 |
|
62 |
|
def get_option():
    """Declare the command-line options and parse the command line.

    Returns the ``(options, args)`` pair produced by
    ``OptionParser.parse_args()``.
    """
    cli = OptionParser()

    # One entry per option: (flags, keyword settings), in declaration order.
    option_table = (
        (("-d", "--database"),
         dict(dest="database", help="Input database", metavar="DATABASE")),
        (("-Q",),
         dict(dest="query", help="query", metavar="QUERY")),
        (("-t",),
         dict(dest="token_filename", metavar="TOKEN_FILENAME",
              default=".oauth_token", help="Token file name")),
        (("-T",),
         dict(dest="topsy_apikey", metavar="TOPSY_APIKEY",
              default=None, help="Topsy apikey")),
    )
    for flags, settings in option_table:
        cli.add_option(*flags, **settings)

    # Presumably registers the verbose/quiet options the script reads
    # later -- confirm in iri_tweet.utils.
    utils.set_logging_options(cli)

    return cli.parse_args()
|
79 |
|
80 |
|
81 |
|
def seconds_between_calls(rate_limit_reset, rate_limit_remaining, now):
    """Return how many whole seconds to wait before the next Twitter call.

    The time left until the rate-limit window resets is spread evenly over
    the calls still allowed in that window.

    rate_limit_reset -- epoch timestamp (seconds) at which the window resets
    rate_limit_remaining -- number of calls still allowed in the window
    now -- current epoch timestamp (seconds)

    Returns 0 when the reset time has already passed, and the whole time
    left when no call remains (instead of dividing by zero).
    """
    seconds_left = float(rate_limit_reset) - float(now)
    if seconds_left <= 0:
        return 0
    if rate_limit_remaining <= 0:
        return int(math.ceil(seconds_left))
    return int(math.ceil(seconds_left / rate_limit_remaining))


def progress_percent(done, total):
    """Return ``done / total`` as a truncated integer percentage.

    Returns 0 when ``total`` is zero or missing, instead of dividing by
    zero.
    """
    if not total:
        return 0
    return int(float(done) / float(total) * 100.0)


if __name__ == "__main__":

    (options, args) = get_option()

    utils.set_logging(options)

    # Fail early with a readable message rather than crashing on
    # None.strip() further down.
    if not options.database:
        sys.exit("No database given: use -d/--database")

    access_token_key, access_token_secret = utils.get_oauth_token(options.token_filename, application_name=APPLICATION_NAME, consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET)

    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(access_token_key, access_token_secret, CONSUMER_KEY, CONSUMER_SECRET), secure=True)
    # Kept from the original although the constructor already gets secure=True.
    t.secure = True

    # A bare path is turned into an SQLite connection string.
    conn_str = options.database.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
    session = None

    topsy_parameters = {
        'apikey': options.topsy_apikey,
        'perpage': 100,
        'window': 'a',
        'type': 'tweet',
        'hidden': True,
    }

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        tr = TopsyResource(options.query, **topsy_parameters)

        # Number of status lines printed since the cursor was last moved
        # back up, so the next item overwrites them in place.
        move_up = 0

        for i, item in enumerate(tr):
            # The tweet id is the last path segment of the item URL.
            url = item.get("url")
            tweet_id = url.split("/")[-1]

            if move_up > 0:
                # +1 accounts for the newline emitted by this print itself.
                print((move_up+1)*term.move_up())
                move_up = 0

            print("%d/%d:%03d%% - %s - %r" % (i+1, tr.total(), progress_percent(i+1, tr.total()), tweet_id, item.get("content")) + term.clear_eol())
            move_up += 1

            # Tweets already stored in the database are not fetched again.
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()

            if count_tweet:
                continue
            try:
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # 404/403 are skipped (presumably deleted or protected
                # tweets); any other HTTP error aborts the run.
                if e.e.code == 404 or e.e.code == 403:
                    continue
                else:
                    raise

            processor = utils.TwitterProcessor(tweet, None, None, session, None, options.token_filename)
            processor.process()
            session.flush()
            session.commit()

            # time.time() is the current epoch time; mktime(gmtime()) would
            # read the UTC struct as local time and be off by the UTC offset.
            time_to_sleep = seconds_between_calls(tweet.rate_limit_reset, tweet.rate_limit_remaining, time.time())

            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('x-ratelimit-limit'))) + term.clear_eol())
            move_up += 1
            for elapsed in range(time_to_sleep):
                if elapsed:
                    # Rewrite the countdown line in place.
                    print(2*term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-elapsed)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        if session:
            session.close()