script/utils/search_topsy.py
changeset 1497 14a9bed2e3cd
parent 1496 184372ec27e2
child 1498 e0b3ef3c07d0
equal deleted inserted replaced
1496:184372ec27e2 1497:14a9bed2e3cd
     1 import argparse
       
     2 import logging
       
     3 import math
       
     4 import re
       
     5 import time
       
     6 
       
     7 from blessings import Terminal
       
     8 import requests
       
     9 import twitter
       
    10 
       
    11 from iri_tweet import models, utils
       
    12 from iri_tweet.processor import TwitterProcessorStatus
       
    13 
       
    14 
       
# Module-level logger named after this module; it is handed to
# TwitterProcessorStatus when the script stores a tweet.
logger = logging.getLogger(__name__)

# Application name passed to utils.get_oauth_token() as ``application_name``
# when the script obtains its Twitter OAuth token.
APPLICATION_NAME = "Tweet recorder user"
       
    18 
       
    19 
       
    20 class TopsyResource(object):
       
    21     
       
    22     def __init__(self, query, **kwargs):
       
    23 
       
    24         self.options = kwargs
       
    25         self.options['q'] = query
       
    26         self.url = kwargs.get("url", "http://otter.topsy.com/search.json")
       
    27         self.page = 0
       
    28         self.req = None
       
    29         self.res = {}
       
    30         
       
    31     def __initialize(self):
       
    32         
       
    33         params = {}
       
    34         params.update(self.options)
       
    35         self.req = requests.get(self.url, params=params)
       
    36         self.res = self.req.json()
       
    37         
       
    38     def __next_page(self):
       
    39         page = self.res.get("response").get("page") + 1
       
    40         params = {}
       
    41         params.update(self.options)
       
    42         params['page'] = page
       
    43         self.req = requests.get(self.url, params=params)
       
    44         self.res = self.req.json()
       
    45 
       
    46     def __iter__(self):        
       
    47         if not self.req:
       
    48             self.__initialize()
       
    49         while "response" in self.res and "list" in self.res.get("response") and self.res.get("response").get("list"):
       
    50             for item in  self.res.get("response").get("list"):
       
    51                 yield item
       
    52             self.__next_page()
       
    53             
       
    54     def total(self):
       
    55         if not self.res:
       
    56             return 0
       
    57         else:
       
    58             return self.res.get("response",{}).get("total",0)
       
    59             
       
    60 
       
    61 
       
    62 def get_options():
       
    63     
       
    64     usage = "usage: %(prog)s [options] <connection_str_or_filepath>"
       
    65     
       
    66     parser = argparse.ArgumentParser(usage=usage)
       
    67 
       
    68     parser.add_argument(dest="conn_str",
       
    69                         help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
       
    70     parser.add_argument("-Q", dest="query",
       
    71                       help="query", metavar="QUERY")
       
    72     parser.add_argument("-k", "--key", dest="consumer_key",
       
    73                         help="Twitter consumer key", metavar="CONSUMER_KEY")
       
    74     parser.add_argument("-s", "--secret", dest="consumer_secret",
       
    75                         help="Twitter consumer secret", metavar="CONSUMER_SECRET")
       
    76     parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
       
    77                       help="Token file name")
       
    78     parser.add_argument("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY", default=None,
       
    79                       help="Topsy apikey")
       
    80 
       
    81     utils.set_logging_options(parser)
       
    82 
       
    83     return parser.parse_args()
       
    84 
       
    85 
       
    86 
       
    87 if __name__ == "__main__":
       
    88 
       
    89     options = get_options()
       
    90     
       
    91     utils.set_logging(options);
       
    92 
       
    93 
       
    94     acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
       
    95 
       
    96     t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
       
    97     t.secure = True
       
    98     
       
    99     conn_str = options.conn_str.strip()
       
   100     if not re.match("^\w+://.+", conn_str):
       
   101         conn_str = 'sqlite:///' + conn_str
       
   102     
       
   103     engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
       
   104     session = None
       
   105     
       
   106     
       
   107     topsy_parameters = {
       
   108         'apikey': options.topsy_apikey,
       
   109         'perpage': 100,
       
   110         'window': 'a',
       
   111         'type': 'tweet',
       
   112         'hidden': True,
       
   113     }
       
   114     
       
   115     term = Terminal()
       
   116     
       
   117     try:
       
   118         session = Session()
       
   119         
       
   120         results = None        
       
   121         page = 1
       
   122         print options.query
       
   123 
       
   124         tr = TopsyResource(options.query, **topsy_parameters)
       
   125         
       
   126         move_up = 0
       
   127         
       
   128         for i,item in enumerate(tr):
       
   129             # get id
       
   130             url = item.get("url")
       
   131             tweet_id = url.split("/")[-1]
       
   132             
       
   133             if move_up > 0:
       
   134                 print((move_up+1)*term.move_up())
       
   135                 move_up = 0
       
   136             
       
   137             print ("%d/%d:%03d%% - %s - %r" % (i+1, tr.total(), int(float(i+1)/float(tr.total())*100.0), tweet_id, item.get("content") ) + term.clear_eol())            
       
   138             move_up += 1
       
   139             
       
   140             count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
       
   141             
       
   142             if count_tweet:
       
   143                 continue
       
   144             try:                                    
       
   145                 tweet = t.statuses.show(id=tweet_id, include_entities=True)
       
   146             except twitter.api.TwitterHTTPError as e:
       
   147                 if e.e.code == 404 or e.e.code == 403:
       
   148                     continue
       
   149                 else:
       
   150                     raise
       
   151             
       
   152             processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
       
   153             processor.process()
       
   154             session.flush()
       
   155             session.commit()
       
   156 
       
   157             print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
       
   158             move_up += 1
       
   159             rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
       
   160             rate_limit_remaining = int(tweet.rate_limit_remaining)
       
   161 
       
   162             if rate_limit_remaining < rate_limit_limit:
       
   163                 time_to_sleep = 0
       
   164             else:
       
   165                 time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining)) 
       
   166 
       
   167             for i in xrange(time_to_sleep):
       
   168                 if i:
       
   169                     print(2*term.move_up())
       
   170                 else:
       
   171                     move_up += 1
       
   172                 print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
       
   173                 time.sleep(1)
       
   174                 
       
   175     except twitter.api.TwitterHTTPError as e:
       
   176         fmt = ("." + e.format) if e.format else ""
       
   177         print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))
       
   178         
       
   179     finally:
       
   180         if session:
       
   181             session.close()