|
1496
|
1 |
import argparse |
|
|
2 |
import logging |
|
|
3 |
import math |
|
|
4 |
import re |
|
|
5 |
import time |
|
|
6 |
import datetime |
|
|
7 |
import urllib |
|
|
8 |
|
|
|
9 |
from blessings import Terminal |
|
|
10 |
import requests |
|
|
11 |
import twitter |
|
|
12 |
|
|
|
13 |
from iri_tweet import models, utils |
|
|
14 |
from iri_tweet.processor import TwitterProcessorStatus |
|
|
15 |
|
|
|
16 |
import json |
|
|
17 |
|
|
|
18 |
logger = logging.getLogger(__name__) |
|
|
19 |
|
|
|
20 |
APPLICATION_NAME = "Tweet seach json" |
|
|
21 |
|
|
|
22 |
|
|
|
23 |
# TODO: implement some more parameters |
|
|
24 |
# script to "scrape twitter results" |
|
|
25 |
# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python |
|
|
26 |
# pyquery cssselect |
|
|
27 |
class TweetManager:
    """Iterable over the tweets returned by the Twitter search endpoint.

    Iterating issues successive ``search.tweets`` calls on the supplied
    connection and yields each entry of the response's ``statuses`` list.
    Paging is driven by the ``max_id`` found in the response's
    ``search_metadata.next_results`` query string.
    """

    def __init__(self, query, twitter_con):
        """Store the search parameters.

        :param query: search query, passed as ``q`` to ``search.tweets``.
        :param twitter_con: connection object exposing
            ``search.tweets(**kwargs)`` and returning the decoded response.
        """
        self.query = query
        # 0 is what gets sent on the first request; a negative value means
        # the previous response announced no further page.
        self.max_id = 0
        self.t = twitter_con

    def __iter__(self):
        """Yield every tweet of every result page, one at a time.

        Iteration stops when a response has no ``next_results`` entry (after
        yielding that last page) or when a page contains no statuses.
        ``self.max_id`` is updated as pages are consumed, so a second
        iteration resumes from where the first one stopped.
        """
        # A bare ``import urllib`` does not load the ``urllib.parse``
        # submodule; import it explicitly instead of relying on another
        # module having loaded it as a side effect.
        from urllib.parse import parse_qs

        while self.max_id >= 0:
            response = self.get_json_response()

            # ``next_results`` looks like "?max_id=...&q=..."; strip the
            # leading "?" before parsing.  When the key is missing the
            # default "?" becomes an empty query string and max_id falls
            # back to -1, which ends the loop after this page.
            next_results = response['search_metadata'].get('next_results', "?")[1:]
            self.max_id = int(parse_qs(next_results).get('max_id', [-1])[0])

            tweet_list = response['statuses']
            if not tweet_list:
                break

            yield from tweet_list

    def get_json_response(self):
        """Run one search request for the current ``max_id`` and return it."""
        return self.t.search.tweets(q=self.query, include_entities=True, max_id=self.max_id)
|
|
54 |
|
|
|
55 |
|
|
|
56 |
def get_options():
    """Build the command-line parser and return the parsed arguments.

    Declares one positional argument (``conn_str``) plus the ``-Q``,
    ``-k/--key``, ``-s/--secret`` and ``-t`` options, then hands the parser
    to ``utils.set_logging_options`` (presumably to register the logging
    related options -- verify against ``iri_tweet.utils``) before parsing
    ``sys.argv``.

    :return: the namespace produced by ``parser.parse_args()``.
    """
    parser = argparse.ArgumentParser(
        usage="usage: %(prog)s [options] <connection_str_or_filepath>"
    )

    # (option flags, add_argument keyword settings), in declaration order.
    # An empty flags tuple declares the positional argument.
    argument_specs = (
        ((), {
            "dest": "conn_str",
            "metavar": "CONNECTION_STR",
            "help": "write tweet to DATABASE. This is a connection string",
        }),
        (("-Q",), {
            "dest": "query",
            "metavar": "QUERY",
            "help": "query",
        }),
        (("-k", "--key"), {
            "dest": "consumer_key",
            "metavar": "CONSUMER_KEY",
            "help": "Twitter consumer key",
        }),
        (("-s", "--secret"), {
            "dest": "consumer_secret",
            "metavar": "CONSUMER_SECRET",
            "help": "Twitter consumer secret",
        }),
        (("-t",), {
            "dest": "token_filename",
            "metavar": "TOKEN_FILENAME",
            "default": ".oauth_token",
            "help": "Token file name",
        }),
    )
    for flags, settings in argument_specs:
        parser.add_argument(*flags, **settings)

    utils.set_logging_options(parser)

    return parser.parse_args()
|
|
76 |
|
|
|
77 |
|
|
|
78 |
|
|
|
79 |
if __name__ == "__main__":
    # Script entry point: search Twitter for the query given on the command
    # line and hand every tweet not yet present in the database to
    # TwitterProcessorStatus.

    options = get_options()

    utils.set_logging(options)

    access_token_key, access_token_secret = utils.get_oauth_token(
        consumer_key=options.consumer_key,
        consumer_secret=options.consumer_secret,
        token_file_path=options.token_filename,
        application_name=APPLICATION_NAME,
    )

    t = twitter.Twitter(
        domain="api.twitter.com",
        auth=twitter.OAuth(access_token_key, access_token_secret,
                           options.consumer_key, options.consumer_secret),
        secure=True,
    )
    # NOTE(review): looks redundant with secure=True above -- confirm against
    # the twitter library before removing.
    t.secure = True

    # A value without a "scheme://" prefix is treated as a SQLite file path.
    conn_str = options.conn_str.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(
        conn_str,
        echo=((options.verbose - options.quiet) > 0),
        create_all=True,
    )
    session = None

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        tm = TweetManager(options.query, t)

        # Number of status lines printed since the cursor was last moved up.
        move_up = 0

        for i, tweet in enumerate(tm):
            tweet_id = tweet.get("id")

            if not tweet_id:
                continue

            # Intended to move the cursor back up so the next status line
            # replaces the previous one instead of scrolling the terminal.
            if move_up > 0:
                print((move_up + 1) * term.move_up())
                move_up = 0

            print("%d: %s - %r" % (i + 1, tweet_id, tweet.get("text", "")) + term.clear_eol())
            move_up += 1

            # "id" is numeric in the search response while the filter is on
            # id_str (presumably a string column -- verify against
            # models.Tweet), so compare with the string form rather than
            # depending on the database coercing the integer.
            count_tweet = session.query(models.Tweet).filter_by(id_str=str(tweet_id)).count()

            if count_tweet:
                # Already stored: nothing to do for this tweet.
                continue

            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            # Committed per tweet, so tweets already handled are kept if a
            # later request fails.
            session.flush()
            session.commit()

    except twitter.api.TwitterHTTPError as e:
        # The API error is reported on stdout and not re-raised.
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        # session stays None when Session() itself raised.
        if session is not None:
            session.close()