|
1 import argparse |
|
2 import logging |
|
3 import math |
|
4 import re |
|
5 import time |
|
6 import datetime |
|
7 import urllib |
|
8 |
|
9 from blessings import Terminal |
|
10 import requests |
|
11 import twitter |
|
12 |
|
13 from iri_tweet import models, utils |
|
14 from iri_tweet.processor import TwitterProcessorStatus |
|
15 |
|
16 from lxml import html |
|
17 import json |
|
18 from pyquery import PyQuery |
|
19 |
|
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# NOTE(review): "seach" is a typo, but this string is also passed to
# utils.get_oauth_token as the OAuth application name -- changing it may
# invalidate a cached token, so it is preserved as-is.
APPLICATION_NAME = "Tweet seach json"
|
# TODO: implement some more parameters
# Script to "scrap twitter results".
# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
# Requires pyquery (with cssselect) and requests.
class TweetManager:
    """Iterate over tweets matching a query by paging Twitter's
    (undocumented) search-timeline endpoint.

    Iterating yields one dict per tweet with the keys: id, permalink,
    username, text, date, retweets, favorites, mentions, hashtags, geo.
    """

    # Patterns compiled once instead of once per tweet in the loop.
    _WHITESPACE_RE = re.compile(r"\s+")
    _NON_ASCII_RE = re.compile(r"[^\x00-\x7F]")
    _MENTION_RE = re.compile(r'(@\w*)')
    _HASHTAG_RE = re.compile(r'(#\w*)')

    def __init__(self, query):
        # refresh_cursor is the "max_position" paging cursor returned by
        # the previous page; the empty string requests the first page.
        self.query = query
        self.refresh_cursor = ''

    def __iter__(self):
        """Yield tweet dicts until the endpoint returns an empty page."""
        while True:
            # Renamed from `json`: the old name shadowed the imported
            # json module.
            payload = self.get_json_response()
            if not payload['items_html'].strip():
                break

            self.refresh_cursor = payload['min_position']
            tweets = PyQuery(payload['items_html'])('div.js-stream-tweet')
            if len(tweets) == 0:
                break

            for tweet_html in tweets:
                tweet_pq = PyQuery(tweet_html)

                username = tweet_pq("span.username.js-action-profile-name b").text()
                # Drop non-ASCII characters, collapse whitespace, and undo
                # the space Twitter renders after '#' and '@'.
                txt = self._WHITESPACE_RE.sub(
                    " ",
                    self._NON_ASCII_RE.sub("", tweet_pq("p.js-tweet-text").text())
                ).replace('# ', '#').replace('@ ', '@')

                # .attr() returns None when the node or attribute is
                # missing; fall back to "0" so int() does not raise.
                retweets = int((tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count") or "0").replace(",", ""))
                favorites = int((tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count") or "0").replace(",", ""))
                date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"))
                # Renamed from `id`: the old name shadowed the builtin.
                tweet_id = tweet_pq.attr("data-tweet-id")
                permalink = tweet_pq.attr("data-permalink-path")

                geo = ''
                geo_span = tweet_pq('span.Tweet-geo')
                if len(geo_span) > 0:
                    geo = geo_span.attr('title')

                yield {
                    "id": tweet_id,
                    "permalink": 'https://twitter.com' + permalink,
                    "username": username,
                    "text": txt,
                    # NOTE(review): fromtimestamp uses the local timezone --
                    # presumably intended; confirm before storing.
                    "date": datetime.datetime.fromtimestamp(date_sec),
                    "retweets": retweets,
                    "favorites": favorites,
                    "mentions": " ".join(self._MENTION_RE.findall(txt)),
                    "hashtags": " ".join(self._HASHTAG_RE.findall(txt)),
                    "geo": geo,
                }

    def get_json_response(self):
        """Fetch one result page from the search endpoint as parsed JSON."""
        url = "https://twitter.com/i/search/timeline"

        params = {
            'f': 'realtime',
            'q': self.query,
            'src': 'typd',
            'max_position': self.refresh_cursor
        }

        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

        return requests.get(url, params=params, headers=headers).json()
|
103 |
|
104 |
|
def get_options():
    """Build the command-line parser and return the parsed arguments.

    Logging-related flags (verbose/quiet, ...) are appended by
    utils.set_logging_options before parsing.
    """
    parser = argparse.ArgumentParser(
        usage="usage: %(prog)s [options] <connection_str_or_filepath>")

    parser.add_argument(
        dest="conn_str", metavar="CONNECTION_STR",
        help="write tweet to DATABASE. This is a connection string")
    parser.add_argument(
        "-Q", dest="query", metavar="QUERY",
        help="query")
    parser.add_argument(
        "-k", "--key", dest="consumer_key", metavar="CONSUMER_KEY",
        help="Twitter consumer key")
    parser.add_argument(
        "-s", "--secret", dest="consumer_secret", metavar="CONSUMER_SECRET",
        help="Twitter consumer secret")
    parser.add_argument(
        "-t", dest="token_filename", metavar="TOKEN_FILENAME",
        default=".oauth_token", help="Token file name")

    utils.set_logging_options(parser)

    return parser.parse_args()
|
125 |
|
126 |
|
127 |
|
if __name__ == "__main__":

    options = get_options()

    utils.set_logging(options)

    # Typo fix: was "acess_token_key".
    access_token_key, access_token_secret = utils.get_oauth_token(
        consumer_key=options.consumer_key,
        consumer_secret=options.consumer_secret,
        token_file_path=options.token_filename,
        application_name=APPLICATION_NAME)

    t = twitter.Twitter(
        domain="api.twitter.com",
        auth=twitter.OAuth(access_token_key, access_token_secret,
                           options.consumer_key, options.consumer_secret),
        secure=True)
    t.secure = True

    # Default to an sqlite file when no "scheme://" prefix is given.
    conn_str = options.conn_str.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(
        conn_str, echo=((options.verbose - options.quiet) > 0), create_all=True)
    session = None

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        tm = TweetManager(options.query)

        # Number of console lines printed for the current item; used to
        # move the cursor back up so progress output overwrites in place.
        move_up = 0

        for i, item in enumerate(tm):
            tweet_id = item.get("id")
            if not tweet_id:
                continue

            if move_up > 0:
                print((move_up + 1) * term.move_up())
                move_up = 0

            print("%d: %s - %r" % (i + 1, tweet_id, item.get("text", "")) + term.clear_eol())
            move_up += 1

            # Skip tweets already stored in the database.
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
            if count_tweet:
                continue

            try:
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # 404 (deleted) and 403 (protected/forbidden): skip tweet.
                if e.e.code in (404, 403):
                    continue
                raise

            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            session.flush()
            session.commit()

            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
            move_up += 1

            rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
            rate_limit_remaining = int(tweet.rate_limit_remaining)

            if rate_limit_remaining > rate_limit_limit:
                time_to_sleep = 0
            else:
                # Spread the remaining calls evenly over the rate-limit
                # window; guard against division by zero when no calls
                # remain (was a potential ZeroDivisionError).
                time_to_sleep = int(math.ceil(
                    (tweet.rate_limit_reset - time.mktime(time.gmtime()))
                    / max(rate_limit_remaining, 1)))

            # Renamed from `i`: the old name shadowed the enumerate index.
            for sec in xrange(time_to_sleep):
                if sec:
                    print(2 * term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep - sec)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        if session:
            session.close()