--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/search_twitter_json.py Wed Mar 16 23:09:57 2016 +0100
@@ -0,0 +1,215 @@
+import argparse
+import logging
+import math
+import re
+import time
+import datetime
+import urllib
+
+from blessings import Terminal
+import requests
+import twitter
+
+from iri_tweet import models, utils
+from iri_tweet.processor import TwitterProcessorStatus
+
+from lxml import html
+import json
+from pyquery import PyQuery
+
+logger = logging.getLogger(__name__)
+# Application name handed to utils.get_oauth_token() in the main section.
+APPLICATION_NAME = "Tweet seach json"
+
+
+# TODO: implement some more parameters
+# script to "scrape twitter results"
+# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
+# pyquery cssselect
+class TweetManager:
+    """Iterate over tweets scraped from Twitter's HTML search timeline.
+
+    Each yielded dict has the keys ``id``, ``permalink``, ``username``,
+    ``text``, ``date``, ``retweets``, ``favorites``, ``mentions``,
+    ``hashtags`` and ``geo``.
+    """
+
+    # Compiled once here instead of once per tweet.
+    _NON_ASCII_RE = re.compile(r"[^\x00-\x7F]")
+    _WHITESPACE_RE = re.compile(r"\s+")
+    _MENTION_RE = re.compile(r"(@\w*)")
+    _HASHTAG_RE = re.compile(r"(#\w*)")
+
+    def __init__(self, query):
+        """Store the search *query*; paging starts from an empty cursor."""
+        self.query = query
+        self.refresh_cursor = ''
+
+    def __iter__(self):
+        """Yield one dict per tweet, fetching pages until results run out.
+
+        Iteration ends when a page has no HTML, holds no tweet element, or
+        hands back the cursor it was requested with (the next request would
+        be identical, so the loop would never terminate).
+        """
+        while True:
+            page = self.get_json_response()
+            if not page['items_html'].strip():
+                break
+
+            previous_cursor = self.refresh_cursor
+            self.refresh_cursor = page['min_position']
+            tweets = PyQuery(page['items_html'])('div.js-stream-tweet')
+            if len(tweets) == 0:
+                break
+            for tweet_html in tweets:
+                yield self._parse_tweet(PyQuery(tweet_html))
+
+            if self.refresh_cursor == previous_cursor:
+                break
+
+    def _parse_tweet(self, tweet_pq):
+        """Build the result dict for one tweet element wrapped in PyQuery."""
+        # Strip non-ASCII, re-attach '#'/'@' to their word, collapse whitespace.
+        text = self._NON_ASCII_RE.sub("", tweet_pq("p.js-tweet-text").text())
+        text = self._WHITESPACE_RE.sub(" ", text.replace('# ', '#').replace('@ ', '@'))
+        date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"))
+        geo_span = tweet_pq('span.Tweet-geo')
+        stat = "span.ProfileTweet-action--%s span.ProfileTweet-actionCount"
+        return {
+            "id": tweet_pq.attr("data-tweet-id"),
+            "permalink": 'https://twitter.com' + tweet_pq.attr("data-permalink-path"),
+            "username": tweet_pq("span.username.js-action-profile-name b").text(),
+            "text": text,
+            "date": datetime.datetime.fromtimestamp(date_sec),
+            "retweets": int(tweet_pq(stat % "retweet").attr("data-tweet-stat-count").replace(",", "")),
+            "favorites": int(tweet_pq(stat % "favorite").attr("data-tweet-stat-count").replace(",", "")),
+            "mentions": " ".join(self._MENTION_RE.findall(text)),
+            "hashtags": " ".join(self._HASHTAG_RE.findall(text)),
+            "geo": geo_span.attr('title') if len(geo_span) > 0 else '',
+        }
+
+    def get_json_response(self):
+        """Request the page at the current cursor and return the decoded JSON."""
+        params = {
+            'f': 'realtime',
+            'q': self.query,
+            'src': 'typd',
+            'max_position': self.refresh_cursor,
+        }
+        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
+        url = "https://twitter.com/i/search/timeline"
+        return requests.get(url, params=params, headers=headers).json()
+
+
+def get_options():
+    """Parse the command line and return the resulting argparse namespace.
+
+    ``utils.set_logging_options`` receives the parser before parsing,
+    presumably to add the logging flags (verbose/quiet) used by the script.
+    """
+    # argparse prints the "usage: " prefix itself, so it is not repeated here.
+    parser = argparse.ArgumentParser(
+        usage="%(prog)s [options] <connection_str_or_filepath>")
+    parser.add_argument(dest="conn_str", metavar="CONNECTION_STR",
+                        help="write tweet to DATABASE. This is a connection string")
+    parser.add_argument("-Q", dest="query", metavar="QUERY", help="query")
+    parser.add_argument("-k", "--key", dest="consumer_key", metavar="CONSUMER_KEY",
+                        help="Twitter consumer key")
+    parser.add_argument("-s", "--secret", dest="consumer_secret", metavar="CONSUMER_SECRET",
+                        help="Twitter consumer secret")
+    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
+                        help="Token file name")
+    utils.set_logging_options(parser)
+    return parser.parse_args()
+
+
+
+if __name__ == "__main__":
+    # Scrape search hits; fetch and store those missing from the database.
+    options = get_options()
+    utils.set_logging(options)
+
+    access_token_key, access_token_secret = utils.get_oauth_token(
+        consumer_key=options.consumer_key, consumer_secret=options.consumer_secret,
+        token_file_path=options.token_filename, application_name=APPLICATION_NAME)
+
+    auth = twitter.OAuth(access_token_key, access_token_secret,
+                         options.consumer_key, options.consumer_secret)
+    t = twitter.Twitter(domain="api.twitter.com", auth=auth, secure=True)
+    t.secure = True
+
+    # Anything without a "scheme://" prefix is treated as an SQLite file path.
+    conn_str = options.conn_str.strip()
+    if not re.match(r"^\w+://.+", conn_str):
+        conn_str = 'sqlite:///' + conn_str
+
+    engine, metadata, Session = models.setup_database(
+        conn_str, echo=((options.verbose - options.quiet) > 0), create_all=True)
+    session = None
+    term = Terminal()
+
+    try:
+        session = Session()
+        print(options.query)
+        # Number of status lines printed since the cursor was last moved up.
+        move_up = 0
+
+        for i, item in enumerate(TweetManager(options.query)):
+            tweet_id = item.get("id")
+            if not tweet_id:
+                continue
+            # Move the cursor back up so new status lines overwrite old ones.
+            if move_up > 0:
+                print((move_up + 1) * term.move_up())
+                move_up = 0
+
+            print("%d: %s - %r" % (i + 1, tweet_id, item.get("text", "")) + term.clear_eol())
+            move_up += 1
+
+            # Tweets already present in the database are not fetched again.
+            if session.query(models.Tweet).filter_by(id_str=tweet_id).count():
+                continue
+
+            try:
+                tweet = t.statuses.show(id=tweet_id, include_entities=True)
+            except twitter.api.TwitterHTTPError as e:
+                # A 403/404 only skips this tweet; other errors end the run.
+                if e.e.code in (403, 404):
+                    continue
+                raise
+
+            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
+            processor.process()
+            session.flush()
+            session.commit()
+
+            limit_header = tweet.headers.getheader('X-Rate-Limit-Limit')
+            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(limit_header)) + term.clear_eol())
+            move_up += 1
+            rate_limit_limit = int(limit_header)
+            rate_limit_remaining = int(tweet.rate_limit_remaining)
+
+            if rate_limit_remaining > rate_limit_limit:
+                time_to_sleep = 0
+            else:
+                # Spread the remaining calls over the time left in the window.
+                # time.time() replaces mktime(gmtime()), which is skewed by the
+                # local UTC offset (assumes rate_limit_reset is UTC epoch
+                # seconds); max(..., 1) avoids dividing by zero remaining calls.
+                seconds_left = tweet.rate_limit_reset - time.time()
+                time_to_sleep = int(math.ceil(seconds_left / max(rate_limit_remaining, 1)))
+
+            for elapsed in range(time_to_sleep):
+                if elapsed:
+                    print(2 * term.move_up())
+                else:
+                    move_up += 1
+                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep - elapsed)) + term.clear_eol())
+                time.sleep(1)
+    except twitter.api.TwitterHTTPError as e:
+        fmt = ("." + e.format) if e.format else ""
+        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))
+    finally:
+        if session is not None:
+            session.close()