import argparse
import datetime
import functools
import json
import logging
import math
import re
import time
import urllib

from enum import Enum

import requests
import twitter

from blessings import Terminal

from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus


logger = logging.getLogger(__name__)

APPLICATION_NAME = "Tweet search json"


class SearchType(Enum):
    standard = 'standard'
    _30day = '30day'
    full = 'full'

    def __str__(self):
        return self.value
|
# The premium search endpoints ("30day", "fullarchive") are called with POST and
# expect their parameters as a JSON body: this decorator repacks the regular
# keyword arguments into a single `_json` argument, while keyword arguments
# starting with "_" are passed through unchanged.
def pass_kwargs_as_json(f):
    def kwargs_json_wrapper(*args, **kwargs):
        normal_kwargs = { k:v for k,v in kwargs.items() if k[0] != "_" }
        special_kwargs = { k:v for k,v in kwargs.items() if k[0] == "_" }
        new_kwargs = { **special_kwargs, '_json': normal_kwargs }
        return f(*args, **new_kwargs)
    return kwargs_json_wrapper

# TODO: implement some more parameters
# script to "scrape twitter results"
# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
# pyquery cssselect
class TweetManager:

    def __init__(self, twitter_con, query, search_type, api_env):
        self.query = query
        self.search_type = search_type
        # pagination cursor: a max_id for standard search, a "next" token for the
        # premium endpoints; None means no further page is available
        self.next = ""
        self.t = twitter_con
        self.api_env = api_env
        self.twitter_api = self.get_twitter_api()
        self.rate_limit_remaining = 0
        self.rate_limit_limit = 0
        self.rate_limit_reset = 0
        self.i = 0

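    # get_twitter_api maps each SearchType to the matching endpoint of the
    # twitter client: the standard search endpoint, or the premium 30day /
    # fullarchive endpoints (selected by the configured dev environment label),
    # which are called with POST and a JSON payload via pass_kwargs_as_json.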
|
    def get_twitter_api(self):
        return {
            SearchType.standard: lambda t: t.search.tweets,
            SearchType._30day: lambda t: pass_kwargs_as_json(functools.partial(getattr(getattr(t.tweets.search, '30day'), self.api_env), _method="POST")),
            SearchType.full: lambda t: pass_kwargs_as_json(functools.partial(getattr(t.tweets.search.fullarchive, self.api_env), _method="POST")),
        }[self.search_type](self.t)

    def __iter__(self):
        while True:
            if self.next is None:
                break
            self.i = self.i + 1

            # with open("json_dump_%s.json" % self.i, 'r') as fp:
            #     jsondata = json.load(fp)
            jsondata = self.get_json_response()

            self.rate_limit_remaining = jsondata.rate_limit_remaining
            self.rate_limit_limit = jsondata.rate_limit_limit
            self.rate_limit_reset = jsondata.rate_limit_reset

            # keep a raw dump of each response page on disk (debug aid)
            with open("json_dump_%s.json" % self.i, 'w') as fp:
                json.dump(jsondata, fp)

            if self.search_type == SearchType.standard:
                next_results = jsondata['search_metadata'].get('next_results', "?")[1:]
                self.next = urllib.parse.parse_qs(next_results).get('max_id', [None])[0]
                tweet_list = jsondata['statuses']
            else:
                self.next = jsondata.get('next')
                tweet_list = jsondata['results']

            if len(tweet_list) == 0:
                break

            for tweet in tweet_list:
                yield tweet

    def get_json_response(self):
        if self.search_type == SearchType.standard:
            return self.twitter_api(q=self.query, include_entities=True, max_id=int(self.next) if self.next else 0)
        else:
            kwargs = { "query": self.query, "maxResults": 100 }
            if self.next:
                kwargs["next"] = self.next
            return self.twitter_api(**kwargs)

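# Minimal TweetManager usage sketch (hypothetical names; assumes an
# authenticated twitter.Twitter client `t` and, for premium search, a
# configured dev environment label):
#
#     manager = TweetManager(t, "#example", SearchType.standard, "dev")
#     for tweet in manager:
#         logger.debug("got tweet %s", tweet.get("id_str"))
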
def get_options():

    usage = "usage: %(prog)s [options] <connection_str_or_filepath>"

    parser = argparse.ArgumentParser(usage=usage)

    parser.add_argument(dest="conn_str",
                        help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
    parser.add_argument("-Q", dest="query",
                        help="query", metavar="QUERY")
    parser.add_argument("-k", "--key", dest="consumer_key",
                        help="Twitter consumer key", metavar="CONSUMER_KEY")
    parser.add_argument("-s", "--secret", dest="consumer_secret",
                        help="Twitter consumer secret", metavar="CONSUMER_SECRET")
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
                        help="Token file name")
    parser.add_argument("-a", dest="search_type", metavar="SEARCH_TYPE", default=SearchType.standard, choices=list(SearchType), type=SearchType,
                        help="Twitter search type ('standard', '30day', 'full')")
    parser.add_argument("-e", dest="api_env", metavar="API_ENV", default="dev",
                        help="Twitter API dev environment label")

    utils.set_logging_options(parser)

    return parser.parse_args()
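
# Example invocation (hypothetical script name and placeholder values):
#
#     python tweet_search_json.py -k CONSUMER_KEY -s CONSUMER_SECRET \
#         -Q "#example" -a 30day -e dev tweets.db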


if __name__ == "__main__":

    options = get_options()

    print("the search type is: %s" % options.search_type)

    utils.set_logging(options)

    bearer_token = utils.get_oauth2_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
    twitter_auth = twitter.OAuth2(options.consumer_key, options.consumer_secret, bearer_token)

    t = twitter.Twitter(domain="api.twitter.com", auth=twitter_auth, secure=True)

    conn_str = options.conn_str.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str