# script/utils/search_twitter_json.py
# author Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
# Fri, 15 Jun 2018 11:08:31 +0200
# changeset 1425 99671a4d5274
# parent 1334 e1d3c1469691
# child 1496 184372ec27e2
# permissions -rw-r--r--
# remove twitter tokens

import argparse
import logging
import math
import re
import time
import datetime
import urllib

from blessings import Terminal
import requests
import twitter

from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus

from lxml import html
import json
from pyquery import PyQuery

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# Application name passed to utils.get_oauth_token to identify the cached
# OAuth token.
# NOTE(review): "seach" looks like a typo for "search", but this string is
# part of the stored token's identity — confirm no cached tokens depend on
# it before renaming.
APPLICATION_NAME = "Tweet seach json"


# TODO: implement some more parameters
# script to "scrap twitter results"
# Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
# pyquery cssselect
class TweetManager:
    """Iterate over tweets scraped from Twitter's public search timeline.

    Adapted from https://github.com/Jefferson-Henrique/GetOldTweets-python.
    Iterating an instance yields one dict per tweet (id, permalink,
    username, text, date, retweets, favorites, mentions, hashtags, geo).
    Pagination is driven by the ``min_position`` cursor returned with each
    JSON response.
    """

    # Compiled once at class creation: these run on every tweet text.
    _MENTION_RE = re.compile(r'(@\w*)')
    _HASHTAG_RE = re.compile(r'(#\w*)')
    _NON_ASCII_RE = re.compile(r"[^\x00-\x7F]")
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self, query):
        # query: raw search string, sent as the `q` request parameter.
        self.query = query
        # Pagination cursor; the empty string requests the first page.
        self.refresh_cursor = ''

    @staticmethod
    def _stat_count(tweet_pq, selector):
        """Return the integer `data-tweet-stat-count` for `selector`,
        or 0 when the element/attribute is absent (malformed markup).

        The original code called .replace() on the raw attribute, which
        raised AttributeError when pyquery returned None.
        """
        raw = tweet_pq(selector).attr("data-tweet-stat-count")
        if not raw:
            return 0
        return int(raw.replace(",", ""))

    def __iter__(self):
        while True:
            # Renamed from `json` to avoid shadowing the imported json module.
            payload = self.get_json_response()
            items_html = payload['items_html']
            if not items_html.strip():
                break

            # Advance the cursor before parsing so the next request
            # continues from this page.
            self.refresh_cursor = payload['min_position']
            tweets = PyQuery(items_html)('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweet_html in tweets:
                tweet_pq = PyQuery(tweet_html)

                username = tweet_pq("span.username.js-action-profile-name b").text()
                # Drop non-ASCII characters, collapse runs of whitespace,
                # then undo the "# tag" / "@ user" spacing that PyQuery's
                # text() extraction introduces.
                txt = self._WHITESPACE_RE.sub(
                    " ",
                    self._NON_ASCII_RE.sub("", tweet_pq("p.js-tweet-text").text())
                ).replace('# ', '#').replace('@ ', '@')
                retweets = self._stat_count(
                    tweet_pq, "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount")
                favorites = self._stat_count(
                    tweet_pq, "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount")
                date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"))
                # Renamed from `id` to avoid shadowing the builtin.
                tweet_id = tweet_pq.attr("data-tweet-id")
                permalink = tweet_pq.attr("data-permalink-path")

                geo = ''
                geo_span = tweet_pq('span.Tweet-geo')
                if len(geo_span) > 0:
                    geo = geo_span.attr('title')

                yield {
                    "id": tweet_id,
                    "permalink": 'https://twitter.com' + permalink,
                    "username": username,
                    "text": txt,
                    "date": datetime.datetime.fromtimestamp(date_sec),
                    "retweets": retweets,
                    "favorites": favorites,
                    "mentions": " ".join(self._MENTION_RE.findall(txt)),
                    "hashtags": " ".join(self._HASHTAG_RE.findall(txt)),
                    "geo": geo,
                }

    def get_json_response(self):
        """Fetch one page of search results as parsed JSON.

        Hits the (undocumented) `/i/search/timeline` endpoint with the
        current query and pagination cursor. A browser-like User-Agent is
        sent; presumably Twitter serves different markup otherwise — TODO
        confirm.
        """
        url = "https://twitter.com/i/search/timeline"

        params = {
            'f': 'realtime',
            'q': self.query,
            'src': 'typd',
            'max_position': self.refresh_cursor
        }

        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

        return requests.get(url, params=params, headers=headers).json()


def get_options():
    """Build the CLI parser and return the parsed arguments.

    Adds the connection string positional, the query/OAuth options, and
    the shared logging options from iri_tweet.utils.
    """
    parser = argparse.ArgumentParser(
        usage="usage: %(prog)s [options] <connection_str_or_filepath>")

    parser.add_argument(
        dest="conn_str", metavar="CONNECTION_STR",
        help="write tweet to DATABASE. This is a connection string")
    parser.add_argument(
        "-Q", dest="query", metavar="QUERY",
        help="query")
    parser.add_argument(
        "-k", "--key", dest="consumer_key", metavar="CONSUMER_KEY",
        help="Twitter consumer key")
    parser.add_argument(
        "-s", "--secret", dest="consumer_secret", metavar="CONSUMER_SECRET",
        help="Twitter consumer secret")
    parser.add_argument(
        "-t", dest="token_filename", metavar="TOKEN_FILENAME",
        default=".oauth_token", help="Token file name")

    # Shared -v/-q/log-file options used by every script in the project.
    utils.set_logging_options(parser)

    return parser.parse_args()



if __name__ == "__main__":

    options = get_options()

    # Configure log level/destination from the shared logging options.
    utils.set_logging(options);


    # Retrieve (or interactively create) the cached OAuth token for this
    # application; the token file path comes from -t.
    acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)

    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
    t.secure = True

    # A bare file path (no scheme) is treated as a sqlite database file.
    conn_str = options.conn_str.strip()
    if not re.match("^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    # Echo SQL only when verbosity outweighs quietness.
    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
    session = None


    # blessings Terminal: used to move the cursor up and redraw progress
    # lines in place instead of scrolling.
    term = Terminal()

    try:
        session = Session()

        results = None
        print options.query

        # Iterating TweetManager scrapes the search results page by page.
        tm = TweetManager(options.query)

        # Number of status lines printed since the last redraw; used to
        # move the cursor back up and overwrite them.
        move_up = 0

        for i,item in enumerate(tm):
            # get id
            tweet_id = item.get("id")

            if not tweet_id:
                continue

            if move_up > 0:
                print((move_up+1)*term.move_up())
                move_up = 0

            print ("%d: %s - %r" % (i+1, tweet_id, item.get("text", "") ) + term.clear_eol())
            move_up += 1

            # Skip tweets already present in the database.
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()

            if count_tweet:
                continue
            try:
                # Fetch the full status from the REST API — the scraped
                # HTML lacks entities and other metadata the processor needs.
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # 404 (deleted) and 403 (protected/forbidden) are expected
                # for old tweets: skip them; anything else is fatal.
                if e.e.code == 404 or e.e.code == 403:
                    continue
                else:
                    raise

            # Persist the status through the project's tweet processor.
            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            session.flush()
            session.commit()

            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
            move_up += 1
            rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
            rate_limit_remaining = int(tweet.rate_limit_remaining)

            # Spread the remaining allowed calls evenly over the time left
            # until the rate-limit window resets.
            # NOTE(review): `remaining > limit` should normally never be
            # true, so time_to_sleep is almost always computed — this
            # comparison looks inverted; confirm the intended condition.
            if rate_limit_remaining > rate_limit_limit:
                time_to_sleep = 0
            else:
                time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))

            # Countdown display, overwriting the same terminal line each
            # second. NOTE: this inner `i` shadows the enumerate index above.
            for i in xrange(time_to_sleep):
                if i:
                    print(2*term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        # Top-level API failure: report the full request context and stop.
        fmt = ("." + e.format) if e.format else ""
        print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))

    finally:
        if session:
            session.close()