| author | ymh <ymh.work@gmail.com> |
| Fri, 15 Nov 2024 01:29:53 +0100 | |
| changeset 1575 | ce1d5b0d1479 |
| parent 1523 | 53f1b28188f0 |
| permissions | -rw-r--r-- |
| 1496 | 1 |
import argparse |
|
1497
14a9bed2e3cd
Adapt recorder_stream to python 3
ymh <ymh.work@gmail.com>
parents:
1496
diff
changeset
|
2 |
import datetime |
|
14a9bed2e3cd
Adapt recorder_stream to python 3
ymh <ymh.work@gmail.com>
parents:
1496
diff
changeset
|
3 |
import functools |
|
14a9bed2e3cd
Adapt recorder_stream to python 3
ymh <ymh.work@gmail.com>
parents:
1496
diff
changeset
|
4 |
import json |
| 1496 | 5 |
import logging |
6 |
import math |
|
7 |
import re |
|
8 |
import time |
|
9 |
import urllib |
|
|
1497
14a9bed2e3cd
Adapt recorder_stream to python 3
ymh <ymh.work@gmail.com>
parents:
1496
diff
changeset
|
10 |
from enum import Enum |
| 1496 | 11 |
|
12 |
import requests |
|
13 |
import twitter |
|
|
1497
14a9bed2e3cd
Adapt recorder_stream to python 3
ymh <ymh.work@gmail.com>
parents:
1496
diff
changeset
|
14 |
from blessings import Terminal |
| 1496 | 15 |
|
16 |
from iri_tweet import models, utils |
|
17 |
from iri_tweet.processor import TwitterProcessorStatus |
|
18 |
||
19 |
# Module-level logger, named after this module; handed to the tweet processor below.
logger = logging.getLogger(__name__)

# Application name passed to utils.get_oauth2_token() when obtaining the bearer token.
APPLICATION_NAME = "Tweet seach json"
|
22 |
||
23 |
||
|
1497
14a9bed2e3cd
Adapt recorder_stream to python 3
ymh <ymh.work@gmail.com>
parents:
1496
diff
changeset
|
24 |
class SearchType(Enum):
    """Search flavours selectable with the ``-a`` command-line option.

    Each member's value is the exact string expected on the command line.
    """

    standard = "standard"
    # Leading underscore only because an identifier cannot start with a digit.
    _30day = "30day"
    full = "full"

    def __str__(self):
        """Render the member as its raw string value."""
        return self._value_
|
14a9bed2e3cd
Adapt recorder_stream to python 3
ymh <ymh.work@gmail.com>
parents:
1496
diff
changeset
|
31 |
|
|
14a9bed2e3cd
Adapt recorder_stream to python 3
ymh <ymh.work@gmail.com>
parents:
1496
diff
changeset
|
32 |
def pass_kwargs_as_json(f):
    """Wrap *f* so that its regular keyword arguments travel in a single ``_json`` argument.

    Keyword arguments whose name starts with an underscore are forwarded to
    *f* unchanged. All the other ones are collected into a dict that is
    passed to *f* as the ``_json`` keyword argument (any ``_json`` supplied
    by the caller is replaced by that dict). Positional arguments are
    forwarded untouched.

    :param f: the callable to wrap.
    :return: the wrapping function.
    """
    def kwargs_json_wrapper(*args, **kwargs):
        special_kwargs = {}
        normal_kwargs = {}
        for key, value in kwargs.items():
            # startswith() also copes with an empty key, for which key[0]
            # would raise IndexError.
            if key.startswith("_"):
                special_kwargs[key] = value
            else:
                normal_kwargs[key] = value
        new_kwargs = {**special_kwargs, '_json': normal_kwargs}
        return f(*args, **new_kwargs)
    return kwargs_json_wrapper
|
14a9bed2e3cd
Adapt recorder_stream to python 3
ymh <ymh.work@gmail.com>
parents:
1496
diff
changeset
|
39 |
|
| 1496 | 40 |
# TODO: implement some more parameters |
41 |
# script to "scrape twitter results"
|
42 |
# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python |
|
43 |
# pyquery cssselect |
|
44 |
class TweetManager:
    """Iterate over the tweets matching a query, fetching result pages on demand.

    Iterating over an instance issues one search request per page and yields
    the tweets of each page in turn. The rate limit attributes exposed by the
    last response are mirrored on the instance (``rate_limit_remaining``,
    ``rate_limit_limit`` and ``rate_limit_reset``).

    The pagination state lives on the instance: once the results are
    exhausted ``self.next`` is ``None`` and iterating again yields nothing.
    """

    def __init__(self, twitter_con, query, search_type, api_env):
        """Store the search parameters and resolve the API endpoint to call.

        :param twitter_con: connection object on which the endpoints are
            looked up by attribute access (``search.tweets``,
            ``tweets.search...``).
        :param query: the search query.
        :param search_type: a ``SearchType`` member selecting the endpoint.
        :param api_env: attribute name looked up on the '30day' and
            'fullarchive' endpoints (unused for the standard search).
        """
        self.query = query
        self.search_type = search_type
        # Pagination cursor: "" before the first request, None when exhausted.
        self.next = ""
        self.t = twitter_con
        self.api_env = api_env
        self.twitter_api = self.get_twitter_api()
        self.rate_limit_remaining = 0
        self.rate_limit_limit = 0
        self.rate_limit_reset = 0
        # Number of result pages requested so far.
        self.i = 0

    def get_twitter_api(self):
        """Return the callable performing the search for ``self.search_type``.

        The standard search is called directly. The '30day' and 'fullarchive'
        endpoints are called with ``_method="POST"`` and receive their other
        keyword arguments through a single ``_json`` argument.

        :raises KeyError: if ``self.search_type`` is not a ``SearchType`` member.
        """
        return {
            SearchType.standard: lambda t: t.search.tweets,
            # '30day' is not a valid identifier, hence the getattr().
            SearchType._30day: lambda t: pass_kwargs_as_json(
                functools.partial(getattr(getattr(t.tweets.search, '30day'), self.api_env), _method="POST")),
            SearchType.full: lambda t: pass_kwargs_as_json(
                functools.partial(getattr(t.tweets.search.fullarchive, self.api_env), _method="POST")),
        }[self.search_type](self.t)

    def __iter__(self):
        """Yield the tweets one by one, requesting a new page when needed.

        Every response is also written to ``json_dump_<page number>.json``
        (relative path, so in the current working directory).
        """
        # The module-level "import urllib" does not load the urllib.parse
        # submodule by itself, so import what is needed explicitly.
        from urllib.parse import parse_qs

        while self.next is not None:
            self.i += 1

            jsondata = self.get_json_response()

            self.rate_limit_remaining = jsondata.rate_limit_remaining
            self.rate_limit_limit = jsondata.rate_limit_limit
            self.rate_limit_reset = jsondata.rate_limit_reset

            # Keep a copy of every raw response page on disk.
            with open("json_dump_%s.json" % self.i, 'w') as fp:
                json.dump(jsondata, fp)

            if self.search_type == SearchType.standard:
                # 'next_results' is a query string starting with "?"; the
                # cursor is its max_id parameter. Missing key: last page.
                next_results = jsondata['search_metadata'].get('next_results', "?")[1:]
                self.next = parse_qs(next_results).get('max_id', [None])[0]
                tweet_list = jsondata['statuses']
            else:
                self.next = jsondata.get('next')
                tweet_list = jsondata['results']

            if not tweet_list:
                break

            yield from tweet_list

    def get_json_response(self):
        """Request the page designated by ``self.next`` and return the response."""
        if self.search_type == SearchType.standard:
            # No cursor yet (first page): max_id is sent as 0.
            return self.twitter_api(q=self.query, include_entities=True, max_id=int(self.next) if self.next else 0)

        kwargs = {"query": self.query, "maxResults": 100}
        if self.next:
            kwargs["next"] = self.next
        return self.twitter_api(**kwargs)
| 1496 | 104 |
|
105 |
def get_options():
    """Parse the command line and return the resulting ``argparse.Namespace``.

    Besides the options declared here, ``utils.set_logging_options`` adds its
    own options to the parser (presumably the ``verbose`` and ``quiet``
    counters read by the script body -- confirm against ``iri_tweet.utils``).
    """
    # argparse prepends "usage: " itself; repeating it in the template makes
    # the help output start with "usage: usage: ...".
    usage = "%(prog)s [options] <connection_str_or_filepath>"

    parser = argparse.ArgumentParser(usage=usage)

    parser.add_argument(dest="conn_str",
                        help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
    parser.add_argument("-Q", dest="query",
                        help="query", metavar="QUERY")
    parser.add_argument("-k", "--key", dest="consumer_key",
                        help="Twitter consumer key", metavar="CONSUMER_KEY")
    parser.add_argument("-s", "--secret", dest="consumer_secret",
                        help="Twitter consumer secret", metavar="CONSUMER_SECRET")
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
                        help="Token file name")
    # type=SearchType converts the raw string into the enum member before
    # argparse checks it against the choices.
    parser.add_argument("-a", dest="search_type", metavar="SEARCH_TYPE", default=SearchType.standard,
                        choices=list(SearchType), type=SearchType,
                        help="Twitter search type ('standard', '30day', 'full')")
    parser.add_argument("-e", dest="api_env", metavar="API_ENV", default="dev",
                        help="Twitter api dev environment")

    utils.set_logging_options(parser)

    return parser.parse_args()
|
130 |
||
131 |
||
132 |
if __name__ == "__main__":
    # Script entry point: run the search given on the command line and hand
    # every tweet that is not yet in the database to TwitterProcessorStatus.

    options = get_options()

    print("the search type is : %s" % options.search_type)

    utils.set_logging(options)

    bearer_token = utils.get_oauth2_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
    twitter_auth = twitter.OAuth2(options.consumer_key, options.consumer_secret, bearer_token)

    t = twitter.Twitter(domain="api.twitter.com", auth=twitter_auth, secure=True)
    t.secure = True

    conn_str = options.conn_str.strip()
    # Anything that does not look like "<scheme>://..." is taken as the path
    # of an SQLite database file.
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
    # Stays None if Session() fails, so that the finally clause can tell.
    session = None

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        tm = TweetManager(t, options.query, options.search_type, options.api_env)

        move_up = 0

        for i, tweet in enumerate(tm):
            # get id
            tweet_id = tweet.get("id")

            if not tweet_id:
                continue

            # Presumably moves the cursor back up so that the progress line
            # below overwrites the previous one -- confirm with blessings.
            if move_up > 0:
                print((move_up+1)*term.move_up())
                move_up = 0

            print("%d: %s - %r" % (i+1, tweet_id, tweet.get("text", "")) + term.clear_eol())
            move_up += 1

            # Skip the tweets that are already recorded.
            # NOTE(review): the id_str column is compared with the value of
            # the tweet's "id" key -- confirm this matches on the target
            # database.
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()

            if count_tweet:
                continue

            processor = TwitterProcessorStatus(tweet, None, None, session, twitter_auth=twitter_auth, logger=logger)
            processor.process()
            session.flush()
            session.commit()

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        if session:
            session.close()