script/utils/search_twitter_api.py
changeset 1496 184372ec27e2
child 1497 14a9bed2e3cd
equal deleted inserted replaced
1495:efbda157eb57 1496:184372ec27e2
       
     1 import argparse
       
     2 import logging
       
     3 import math
       
     4 import re
       
     5 import time
       
     6 import datetime
       
     7 import urllib
       
     8 
       
     9 from blessings import Terminal
       
    10 import requests
       
    11 import twitter
       
    12 
       
    13 from iri_tweet import models, utils
       
    14 from iri_tweet.processor import TwitterProcessorStatus
       
    15 
       
    16 import json
       
    17 
       
    18 logger = logging.getLogger(__name__)
       
    19 
       
    20 APPLICATION_NAME = "Tweet seach json"
       
    21 
       
    22 
       
    23 # TODO: implement some more parameters
       
    24 # script to "scrape twitter results"
       
    25 # Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
       
    26 # pyquery cssselect
       
    27 class TweetManager:
       
    28 
       
    29     def __init__(self, query, twitter_con):
       
    30         self.query = query
       
    31         self.max_id = 0
       
    32         self.t = twitter_con
       
    33         pass
       
    34 
       
    35     def __iter__(self):
       
    36         while True:
       
    37             if self.max_id < 0:
       
    38                 break
       
    39             json = self.get_json_response()
       
    40 
       
    41             next_results = json['search_metadata'].get('next_results', "?")[1:]
       
    42             self.max_id = int(urllib.parse.parse_qs(next_results).get('max_id', [-1])[0])
       
    43 
       
    44             tweet_list = json['statuses']
       
    45 
       
    46             if len(tweet_list) == 0:
       
    47                 break
       
    48 
       
    49             for tweet in tweet_list:
       
    50                 yield tweet
       
    51 
       
    52     def get_json_response(self):
       
    53         return self.t.search.tweets(q=self.query, include_entities=True, max_id=self.max_id)
       
    54 
       
    55 
       
    56 def get_options():
       
    57 
       
    58     usage = "usage: %(prog)s [options] <connection_str_or_filepath>"
       
    59 
       
    60     parser = argparse.ArgumentParser(usage=usage)
       
    61 
       
    62     parser.add_argument(dest="conn_str",
       
    63                         help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
       
    64     parser.add_argument("-Q", dest="query",
       
    65                       help="query", metavar="QUERY")
       
    66     parser.add_argument("-k", "--key", dest="consumer_key",
       
    67                         help="Twitter consumer key", metavar="CONSUMER_KEY")
       
    68     parser.add_argument("-s", "--secret", dest="consumer_secret",
       
    69                         help="Twitter consumer secret", metavar="CONSUMER_SECRET")
       
    70     parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
       
    71                       help="Token file name")
       
    72 
       
    73     utils.set_logging_options(parser)
       
    74 
       
    75     return parser.parse_args()
       
    76 
       
    77 
       
    78 
       
    79 if __name__ == "__main__":
       
    80 
       
    81     options = get_options()
       
    82 
       
    83     utils.set_logging(options)
       
    84 
       
    85 
       
    86     acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
       
    87 
       
    88     t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
       
    89     t.secure = True    
       
    90 
       
    91     conn_str = options.conn_str.strip()
       
    92     if not re.match(r"^\w+://.+", conn_str):
       
    93         conn_str = 'sqlite:///' + conn_str
       
    94 
       
    95     engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
       
    96     session = None
       
    97 
       
    98 
       
    99     term = Terminal()
       
   100 
       
   101     try:
       
   102         session = Session()
       
   103 
       
   104         results = None
       
   105         print(options.query)
       
   106 
       
   107         tm = TweetManager(options.query, t)
       
   108 
       
   109         move_up = 0
       
   110 
       
   111         for i,tweet in enumerate(tm):
       
   112             # get id
       
   113             tweet_id = tweet.get("id")
       
   114 
       
   115             if not tweet_id:
       
   116                 continue
       
   117 
       
   118             if move_up > 0:
       
   119                 print((move_up+1)*term.move_up())
       
   120                 move_up = 0
       
   121 
       
   122             print ("%d: %s - %r" % (i+1, tweet_id, tweet.get("text", "") ) + term.clear_eol())
       
   123             move_up += 1
       
   124 
       
   125             count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
       
   126 
       
   127             if count_tweet:
       
   128                 continue
       
   129 
       
   130             processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
       
   131             processor.process()
       
   132             session.flush()
       
   133             session.commit()
       
   134 
       
   135     except twitter.api.TwitterHTTPError as e:
       
   136         fmt = ("." + e.format) if e.format else ""
       
   137         print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))
       
   138 
       
   139     finally:
       
   140         if session:
       
   141             session.close()