# script/utils/search_twitter_json.py
# author: ymh <ymh.work@gmail.com>
# Mon, 18 Jun 2018 23:15:34 +0200
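#
# Scrapes tweets matching a query from the twitter.com search timeline (the
# GetOldTweets-python approach), then fetches each full status through the
# Twitter REST API and stores it in a database via iri_tweet.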

import argparse
import logging
import math
import re
import time
import datetime

from blessings import Terminal
import requests
import twitter

from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus

from pyquery import PyQuery

logger = logging.getLogger(__name__)

APPLICATION_NAME = "Tweet search json"


# TODO: implement some more query parameters
# Script to scrape Twitter search results.
# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
# Requires pyquery (with its cssselect dependency).
class TweetManager:
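    # Iterates over tweets matching `query`, scraped from the twitter.com
    # search timeline endpoint; follows the `min_position` cursor returned
    # with each JSON page until an empty page comes back.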

    def __init__(self, query):
        self.query = query
        self.refresh_cursor = ''

    def __iter__(self):

        while True:
            data = self.get_json_response()
            if len(data['items_html'].strip()) == 0:
                break

            # remember the pagination cursor for the next page request
            self.refresh_cursor = data['min_position']
            tweets = PyQuery(data['items_html'])('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweet_html in tweets:
                tweet_pq = PyQuery(tweet_html)

                username = tweet_pq("span.username.js-action-profile-name b").text()
                # drop non-ASCII characters, collapse whitespace and re-attach
                # the "#"/"@" prefixes that text extraction detaches
                txt = re.sub(r"\s+", " ", re.sub(r"[^\x00-\x7F]", "", tweet_pq("p.js-tweet-text").text()).replace('# ', '#').replace('@ ', '@'))
                retweets = int(tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"))
                tweet_id = tweet_pq.attr("data-tweet-id")
                permalink = tweet_pq.attr("data-permalink-path")

                geo = ''
                geo_span = tweet_pq('span.Tweet-geo')
                if len(geo_span) > 0:
                    geo = geo_span.attr('title')

                yield {
                    "id": tweet_id,
                    "permalink": 'https://twitter.com' + permalink,
                    "username": username,
                    "text": txt,
                    "date": datetime.datetime.fromtimestamp(date_sec),
                    "retweets": retweets,
                    "favorites": favorites,
                    "mentions": " ".join(re.findall(r'@\w*', txt)),
                    "hashtags": " ".join(re.findall(r'#\w*', txt)),
                    "geo": geo,
                }

    def get_json_response(self):

        url = "https://twitter.com/i/search/timeline"

        # Leftover from GetOldTweets-python, which folded extra search
        # operators into the query string at this point:
        #
        #     if hasattr(tweetCriteria, 'username'):
        #         urlGetData += ' from:' + tweetCriteria.username
        #     if hasattr(tweetCriteria, 'since'):
        #         urlGetData += ' since:' + tweetCriteria.since
        #     if hasattr(tweetCriteria, 'until'):
        #         urlGetData += ' until:' + tweetCriteria.until
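        #
        # A possible adaptation for this class (untested sketch): the
        # username/since/until attributes are assumptions and are not set
        # anywhere on TweetManager yet, so this would be a no-op until
        # they are added.
        #
        #     query = self.query
        #     for operator, attr in (('from', 'username'), ('since', 'since'), ('until', 'until')):
        #         value = getattr(self, attr, None)
        #         if value:
        #             query += ' %s:%s' % (operator, value)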

        params = {
            'f': 'realtime',
            'q': self.query,
            'src': 'typd',
            # empty for the first page, then the min_position cursor
            # returned by the previous response
            'max_position': self.refresh_cursor
        }

        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

        return requests.get(url, params=params, headers=headers).json()
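
# Example usage (sketch):
#
#     for i, tweet in enumerate(TweetManager('#myhashtag')):
#         print("%d %s %s" % (i, tweet['id'], tweet['permalink']))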


def get_options():

    usage = "usage: %(prog)s [options] <connection_str_or_filepath>"

    parser = argparse.ArgumentParser(usage=usage)

    parser.add_argument(dest="conn_str",
                        help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
    parser.add_argument("-Q", dest="query",
                      help="query", metavar="QUERY")
    parser.add_argument("-k", "--key", dest="consumer_key",
                        help="Twitter consumer key", metavar="CONSUMER_KEY")
    parser.add_argument("-s", "--secret", dest="consumer_secret",
                        help="Twitter consumer secret", metavar="CONSUMER_SECRET")
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
                      help="Token file name")

    utils.set_logging_options(parser)

    return parser.parse_args()



if __name__ == "__main__":

    options = get_options()

    utils.set_logging(options)

    access_token_key, access_token_secret = utils.get_oauth_token(
        consumer_key=options.consumer_key,
        consumer_secret=options.consumer_secret,
        token_file_path=options.token_filename,
        application_name=APPLICATION_NAME)

    t = twitter.Twitter(
        domain="api.twitter.com",
        auth=twitter.OAuth(access_token_key, access_token_secret,
                           options.consumer_key, options.consumer_secret),
        secure=True)

    conn_str = options.conn_str.strip()
    if not re.match(r"^\w+://.+", conn_str):
        # bare file paths are treated as sqlite database files
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose - options.quiet) > 0), create_all=True)
    session = None

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        tm = TweetManager(options.query)

        move_up = 0

        for i, item in enumerate(tm):
            tweet_id = item.get("id")

            if not tweet_id:
                continue

            if move_up > 0:
                # rewind the cursor over the lines printed last iteration
                print((move_up + 1) * term.move_up())
                move_up = 0

            print ("%d: %s - %r" % (i+1, tweet_id, item.get("text", "") ) + term.clear_eol())
            move_up += 1

            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()

            # skip tweets already stored in the database
            if count_tweet:
                continue
            try:
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # skip tweets that are gone (404) or not accessible (403)
                if e.e.code in (403, 404):
                    continue
                else:
                    raise

            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            session.flush()
            session.commit()

            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
            move_up += 1
            rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
            rate_limit_remaining = int(tweet.rate_limit_remaining)

            if rate_limit_remaining > rate_limit_limit:
                time_to_sleep = 0
            else:
                time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))

            # countdown, redrawing the status line in place each second
            for i in xrange(time_to_sleep):
                if i:
                    print(2*term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))

    finally:
        if session:
            session.close()