# script/utils/search_twitter_json.py
# author: ymh <ymh.work@gmail.com>
# Thu, 10 Jan 2019 18:36:36 +0100
# changeset 1497 14a9bed2e3cd
# parent 1496 184372ec27e2
# permissions -rw-r--r--
# Adapt recorder_stream to Python 3. Improve Twitter authentication management.
# Use OAuth2 where possible. Delete old script.

import argparse
import logging
import math
import re
import time
import datetime
import urllib

from blessings import Terminal
import requests
import twitter

from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus

import json
from pyquery import PyQuery

logger = logging.getLogger(__name__)

# Label under which the OAuth token is stored/looked up (see
# utils.get_oauth_token).  NOTE(review): "seach" is a typo, but correcting
# it would change the lookup key and could invalidate already-stored tokens.
APPLICATION_NAME = "Tweet seach json"


# TODO: implement some more parameters
# Scraper for Twitter's undocumented HTML search timeline endpoint.
# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
# Requires pyquery (cssselect) for HTML parsing.
class TweetManager:
    """Iterate over tweets matching *query* by paging through
    https://twitter.com/i/search/timeline.

    Iterating an instance yields one dict per tweet with keys:
    id, permalink, username, text, date, retweets, favorites,
    mentions, hashtags, geo.
    """

    # Patterns hoisted out of the per-tweet loop so they are compiled once.
    _NON_ASCII_RE = re.compile(r"[^\x00-\x7F]")
    _WS_RE = re.compile(r"\s+")
    _MENTION_RE = re.compile(r'(@\w*)')
    _HASHTAG_RE = re.compile(r'(#\w*)')

    def __init__(self, query):
        self.query = query
        # Pagination cursor ('min_position' from the previous response);
        # the empty string requests the first page.
        self.refresh_cursor = ''

    def __iter__(self):
        while True:
            # Renamed from 'json': the original shadowed the json module.
            payload = self.get_json_response()
            if not payload['items_html'].strip():
                break

            self.refresh_cursor = payload['min_position']
            tweets = PyQuery(payload['items_html'])('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweet_html in tweets:
                tweet_pq = PyQuery(tweet_html)

                username = tweet_pq("span.username.js-action-profile-name b").text()
                # Drop non-ASCII chars, then collapse whitespace and undo the
                # "# tag" / "@ name" spacing the text extraction introduces.
                txt = self._WS_RE.sub(
                    " ",
                    self._NON_ASCII_RE.sub("", tweet_pq("p.js-tweet-text").text())
                        .replace('# ', '#')
                        .replace('@ ', '@'))
                retweets = int(tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"))
                # Renamed from 'id': the original shadowed the builtin id().
                tweet_id = tweet_pq.attr("data-tweet-id")
                permalink = tweet_pq.attr("data-permalink-path")

                geo = ''
                geo_span = tweet_pq('span.Tweet-geo')
                if len(geo_span) > 0:
                    geo = geo_span.attr('title')

                yield {
                    "id": tweet_id,
                    "permalink": 'https://twitter.com' + permalink,
                    "username": username,
                    "text": txt,
                    # NOTE(review): fromtimestamp() uses the local timezone —
                    # confirm whether UTC (utcfromtimestamp) was intended.
                    "date": datetime.datetime.fromtimestamp(date_sec),
                    "retweets": retweets,
                    "favorites": favorites,
                    "mentions": " ".join(self._MENTION_RE.findall(txt)),
                    "hashtags": " ".join(self._HASHTAG_RE.findall(txt)),
                    "geo": geo,
                }

    def get_json_response(self):
        """Fetch one page of search results and return the parsed JSON.

        Sends self.refresh_cursor as the 'max_position' pagination
        parameter, with a desktop User-Agent so Twitter serves the
        legacy HTML timeline payload.
        """
        url = "https://twitter.com/i/search/timeline"

        params = {
            'f': 'realtime',
            'q': self.query,
            'src': 'typd',
            'max_position': self.refresh_cursor
        }

        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

        return requests.get(url, params=params, headers=headers).json()


def get_options():
    """Build the argument parser for this script and parse sys.argv.

    Returns the parsed argparse namespace, including the logging
    options installed by utils.set_logging_options.
    """
    parser = argparse.ArgumentParser(
        usage="usage: %(prog)s [options] <connection_str_or_filepath>")

    parser.add_argument(
        dest="conn_str", metavar="CONNECTION_STR",
        help="write tweet to DATABASE. This is a connection string")
    parser.add_argument(
        "-Q", dest="query", metavar="QUERY",
        help="query")
    parser.add_argument(
        "-k", "--key", dest="consumer_key", metavar="CONSUMER_KEY",
        help="Twitter consumer key")
    parser.add_argument(
        "-s", "--secret", dest="consumer_secret", metavar="CONSUMER_SECRET",
        help="Twitter consumer secret")
    parser.add_argument(
        "-t", dest="token_filename", metavar="TOKEN_FILENAME",
        default=".oauth_token", help="Token file name")

    # Adds the shared --verbose/--quiet style logging flags.
    utils.set_logging_options(parser)

    return parser.parse_args()



if __name__ == "__main__":

    options = get_options()

    utils.set_logging(options)

    # Fixed typo: was 'acess_token_key'.
    access_token_key, access_token_secret = utils.get_oauth_token(
        consumer_key=options.consumer_key,
        consumer_secret=options.consumer_secret,
        token_file_path=options.token_filename,
        application_name=APPLICATION_NAME)
    twitter_auth = twitter.OAuth(access_token_key, access_token_secret,
                                 options.consumer_key, options.consumer_secret)

    t = twitter.Twitter(domain="api.twitter.com", auth=twitter_auth, secure=True)
    t.secure = True

    # A bare path is treated as an sqlite database file.
    conn_str = options.conn_str.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(
        conn_str, echo=((options.verbose - options.quiet) > 0), create_all=True)
    session = None

    term = Terminal()

    try:
        session = Session()

        results = None
        print(options.query)

        tm = TweetManager(options.query)

        # Number of status lines printed since the last cursor reset;
        # used to redraw in place with terminal escapes.
        move_up = 0

        for i, item in enumerate(tm):
            tweet_id = item.get("id")

            if not tweet_id:
                continue

            if move_up > 0:
                print((move_up + 1) * term.move_up())
                move_up = 0

            print("%d: %s - %r" % (i + 1, tweet_id, item.get("text", "")) + term.clear_eol())
            move_up += 1

            # Skip tweets already stored in the database.
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
            if count_tweet:
                continue

            try:
                # Re-fetch the full status through the official API to get
                # entities the scraped HTML does not provide.
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # 404: tweet deleted; 403: protected/suspended — skip both.
                if e.e.code == 404 or e.e.code == 403:
                    continue
                else:
                    raise

            processor = TwitterProcessorStatus(tweet, None, None, session,
                                               twitter_auth=twitter_auth, logger=logger)
            processor.process()
            session.flush()
            session.commit()

            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers['X-Rate-Limit-Limit'])) + term.clear_eol())
            move_up += 1
            rate_limit_limit = int(tweet.headers['X-Rate-Limit-Limit'])
            rate_limit_remaining = int(tweet.rate_limit_remaining)

            # Throttle so the remaining quota is spread over the time left
            # in the rate-limit window.
            if rate_limit_remaining <= 0:
                # Quota exhausted: wait for the window to reset (original
                # code divided by zero here).
                time_to_sleep = max(0, int(math.ceil(
                    tweet.rate_limit_reset - time.mktime(time.gmtime()))))
            elif rate_limit_remaining > rate_limit_limit:
                time_to_sleep = 0
            else:
                time_to_sleep = int(math.ceil(
                    (tweet.rate_limit_reset - time.mktime(time.gmtime())) / rate_limit_remaining))

            # Renamed loop variable from 'i': it shadowed the enumerate index.
            for sec in range(time_to_sleep):
                if sec:
                    print(2 * term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep - sec)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        if session:
            session.close()