script/utils/search_twitter_json.py
changeset 1334 e1d3c1469691
child 1496 184372ec27e2
equal deleted inserted replaced
1333:884b1b7fc420 1334:e1d3c1469691
       
     1 import argparse
       
     2 import logging
       
     3 import math
       
     4 import re
       
     5 import time
       
     6 import datetime
       
     7 import urllib
       
     8 
       
     9 from blessings import Terminal
       
    10 import requests
       
    11 import twitter
       
    12 
       
    13 from iri_tweet import models, utils
       
    14 from iri_tweet.processor import TwitterProcessorStatus
       
    15 
       
    16 from lxml import html
       
    17 import json
       
    18 from pyquery import PyQuery
       
    19 
       
    20 logger = logging.getLogger(__name__)
       
    21 
       
    22 APPLICATION_NAME = "Tweet seach json"
       
    23 
       
    24 
       
    25 # TODO: implement some more parameters
       
     26 # script to "scrape" Twitter search results
       
    27 # Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
       
    28 # pyquery cssselect
       
    29 class TweetManager:
       
    30 
       
    31     def __init__(self, query):
       
    32         self.query = query
       
    33         self.refresh_cursor = ''
       
    34         pass
       
    35 
       
    36     def __iter__(self):
       
    37 
       
    38         results = []
       
    39 
       
    40         while True:
       
    41             json = self.get_json_response()
       
    42             if len(json['items_html'].strip()) == 0:
       
    43                 break
       
    44 
       
    45             self.refresh_cursor = json['min_position']
       
    46             tweets = PyQuery(json['items_html'])('div.js-stream-tweet')
       
    47 
       
    48             if len(tweets) == 0:
       
    49                 break
       
    50 
       
    51             for tweetHTML in tweets:
       
    52                 tweet_pq = PyQuery(tweetHTML)
       
    53 
       
    54                 username = tweet_pq("span.username.js-action-profile-name b").text();
       
    55                 txt = re.sub(r"\s+", " ", re.sub(r"[^\x00-\x7F]", "", tweet_pq("p.js-tweet-text").text()).replace('# ', '#').replace('@ ', '@'));
       
    56                 retweets = int(tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""));
       
    57                 favorites = int(tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""));
       
    58                 date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"));
       
    59                 id = tweet_pq.attr("data-tweet-id");
       
    60                 permalink = tweet_pq.attr("data-permalink-path");
       
    61 
       
    62                 geo = ''
       
    63                 geo_span = tweet_pq('span.Tweet-geo')
       
    64                 if len(geo_span) > 0:
       
    65                     geo = geo_span.attr('title')
       
    66 
       
    67                 yield {
       
    68                     "id" : id,
       
    69                     "permalink": 'https://twitter.com' + permalink,
       
    70                     "username" : username,
       
    71                     "text": txt,
       
    72                     "date" : datetime.datetime.fromtimestamp(date_sec),
       
    73                     "retweets" : retweets,
       
    74                     "favorites" : favorites,
       
    75                     "mentions": " ".join(re.compile('(@\\w*)').findall(txt)),
       
    76                     "hashtags": " ".join(re.compile('(#\\w*)').findall(txt)),
       
    77                     "geo": geo,
       
    78                 }
       
    79 
       
    80     def get_json_response(self):
       
    81 
       
    82         url = "https://twitter.com/i/search/timeline"
       
    83 
       
    84         # if hasattr(tweetCriteria, 'username'):
       
    85         #     urlGetData += ' from:' + tweetCriteria.username
       
    86         #
       
    87         # if hasattr(tweetCriteria, 'since'):
       
    88         #     urlGetData += ' since:' + tweetCriteria.since
       
    89         #
       
    90         # if hasattr(tweetCriteria, 'until'):
       
    91         #     urlGetData += ' until:' + tweetCriteria.until
       
    92 
       
    93         params = {
       
    94             'f': 'realtime',
       
    95             'q': self.query,
       
    96             'src': 'typd',
       
    97             'max_position': self.refresh_cursor
       
    98         }
       
    99 
       
   100         headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
       
   101 
       
   102         return requests.get(url, params=params, headers=headers).json()
       
   103 
       
   104 
       
   105 def get_options():
       
   106 
       
   107     usage = "usage: %(prog)s [options] <connection_str_or_filepath>"
       
   108 
       
   109     parser = argparse.ArgumentParser(usage=usage)
       
   110 
       
   111     parser.add_argument(dest="conn_str",
       
   112                         help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
       
   113     parser.add_argument("-Q", dest="query",
       
   114                       help="query", metavar="QUERY")
       
   115     parser.add_argument("-k", "--key", dest="consumer_key",
       
   116                         help="Twitter consumer key", metavar="CONSUMER_KEY")
       
   117     parser.add_argument("-s", "--secret", dest="consumer_secret",
       
   118                         help="Twitter consumer secret", metavar="CONSUMER_SECRET")
       
   119     parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
       
   120                       help="Token file name")
       
   121 
       
   122     utils.set_logging_options(parser)
       
   123 
       
   124     return parser.parse_args()
       
   125 
       
   126 
       
   127 
       
   128 if __name__ == "__main__":
       
   129 
       
   130     options = get_options()
       
   131 
       
   132     utils.set_logging(options);
       
   133 
       
   134 
       
   135     acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
       
   136 
       
   137     t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
       
   138     t.secure = True
       
   139 
       
   140     conn_str = options.conn_str.strip()
       
   141     if not re.match("^\w+://.+", conn_str):
       
   142         conn_str = 'sqlite:///' + conn_str
       
   143 
       
   144     engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
       
   145     session = None
       
   146 
       
   147 
       
   148     term = Terminal()
       
   149 
       
   150     try:
       
   151         session = Session()
       
   152 
       
   153         results = None
       
   154         print options.query
       
   155 
       
   156         tm = TweetManager(options.query)
       
   157 
       
   158         move_up = 0
       
   159 
       
   160         for i,item in enumerate(tm):
       
   161             # get id
       
   162             tweet_id = item.get("id")
       
   163 
       
   164             if not tweet_id:
       
   165                 continue
       
   166 
       
   167             if move_up > 0:
       
   168                 print((move_up+1)*term.move_up())
       
   169                 move_up = 0
       
   170 
       
   171             print ("%d: %s - %r" % (i+1, tweet_id, item.get("text", "") ) + term.clear_eol())
       
   172             move_up += 1
       
   173 
       
   174             count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
       
   175 
       
   176             if count_tweet:
       
   177                 continue
       
   178             try:
       
   179                 tweet = t.statuses.show(id=tweet_id, include_entities=True)
       
   180             except twitter.api.TwitterHTTPError as e:
       
   181                 if e.e.code == 404 or e.e.code == 403:
       
   182                     continue
       
   183                 else:
       
   184                     raise
       
   185 
       
   186             processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
       
   187             processor.process()
       
   188             session.flush()
       
   189             session.commit()
       
   190 
       
   191             print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
       
   192             move_up += 1
       
   193             rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
       
   194             rate_limit_remaining = int(tweet.rate_limit_remaining)
       
   195 
       
   196             if rate_limit_remaining > rate_limit_limit:
       
   197                 time_to_sleep = 0
       
   198             else:
       
   199                 time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))
       
   200 
       
   201             for i in xrange(time_to_sleep):
       
   202                 if i:
       
   203                     print(2*term.move_up())
       
   204                 else:
       
   205                     move_up += 1
       
   206                 print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
       
   207                 time.sleep(1)
       
   208 
       
   209     except twitter.api.TwitterHTTPError as e:
       
   210         fmt = ("." + e.format) if e.format else ""
       
   211         print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))
       
   212 
       
   213     finally:
       
   214         if session:
       
   215             session.close()