script/utils/search_topsy.py
changeset 693 2ef837069108
child 888 6fc6637d8403
equal deleted inserted replaced
692:51072e5e6ea9 693:2ef837069108
       
     1 from iri_tweet import models, utils
       
     2 from sqlalchemy.orm import sessionmaker
       
     3 import anyjson
       
     4 import sqlite3
       
     5 import twitter
       
     6 import re
       
     7 import requests
       
     8 from optparse import OptionParser
       
     9 import simplejson
       
    10 import time
       
    11 from blessings import Terminal
       
    12 import sys
       
    13 import math
       
    14 from symbol import except_clause
       
    15 
       
# Application name handed to utils.get_oauth_token() when the OAuth token is
# obtained in the __main__ section.
APPLICATION_NAME = "Tweet recorder user"
# Twitter OAuth consumer credentials, passed both to utils.get_oauth_token()
# and to twitter.OAuth() in the __main__ section.
# NOTE(review): these credentials are hard-coded in the source file; consider
# loading them from the environment or a configuration file instead.
CONSUMER_KEY = "Vdr5ZcsjI1G3esTPI8yDg"
CONSUMER_SECRET = "LMhNrY99R6a7E0YbZZkRFpUZpX5EfB1qATbDk1sIVLs"
       
    19 
       
    20 
       
    21 class TopsyResource(object):
       
    22     
       
    23     def __init__(self, query, **kwargs):
       
    24                 
       
    25         self.options = kwargs
       
    26         self.options['q'] = query
       
    27         self.url = kwargs.get("url", "http://otter.topsy.com/search.json")
       
    28         self.page = 0
       
    29         self.req = None
       
    30         self.res = {}
       
    31         
       
    32     def __initialize(self):
       
    33         
       
    34         params = {}
       
    35         params.update(self.options)
       
    36         self.req = requests.get(self.url, params=params)
       
    37         self.res = self.req.json
       
    38         
       
    39     def __next_page(self):
       
    40         page = self.res.get("response").get("page") + 1
       
    41         params = {}
       
    42         params.update(self.options)
       
    43         params['page'] = page
       
    44         self.req = requests.get(self.url, params=params)
       
    45         self.res = self.req.json
       
    46 
       
    47     def __iter__(self):        
       
    48         if not self.req:
       
    49             self.__initialize()
       
    50         while "response" in self.res and "list" in self.res.get("response") and self.res.get("response").get("list"):
       
    51             for item in  self.res.get("response").get("list"):
       
    52                 yield item
       
    53             self.__next_page()
       
    54             
       
    55     def total(self):
       
    56         if not self.res:
       
    57             return 0
       
    58         else:
       
    59             return self.res.get("response",{}).get("total",0)
       
    60             
       
    61 
       
    62 
       
    63 def get_option():
       
    64     
       
    65     parser = OptionParser()
       
    66 
       
    67     parser.add_option("-d", "--database", dest="database",
       
    68                       help="Input database", metavar="DATABASE")
       
    69     parser.add_option("-Q", dest="query",
       
    70                       help="query", metavar="QUERY")
       
    71     parser.add_option("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
       
    72                       help="Token file name")
       
    73     parser.add_option("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY", default=None,
       
    74                       help="Topsy apikey")
       
    75     
       
    76     utils.set_logging_options(parser)
       
    77 
       
    78     return parser.parse_args()
       
    79 
       
    80 
       
    81 
       
    82 if __name__ == "__main__":
       
    83 
       
    84     (options, args) = get_option()
       
    85     
       
    86     utils.set_logging(options);
       
    87 
       
    88 
       
    89     acess_token_key, access_token_secret = utils.get_oauth_token(options.token_filename, application_name=APPLICATION_NAME, consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET)
       
    90 
       
    91     t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, CONSUMER_KEY, CONSUMER_SECRET), secure=True)
       
    92     t.secure = True
       
    93     
       
    94     conn_str = options.database.strip()
       
    95     if not re.match("^\w+://.+", conn_str):
       
    96         conn_str = 'sqlite:///' + conn_str
       
    97     
       
    98     engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
       
    99     session = None
       
   100     
       
   101     
       
   102     topsy_parameters = {
       
   103         'apikey': options.topsy_apikey,
       
   104         'perpage': 100,
       
   105         'window': 'a',
       
   106         'type': 'tweet',
       
   107         'hidden': True,
       
   108     }
       
   109     
       
   110     term = Terminal()
       
   111     
       
   112     try:
       
   113         session = Session()
       
   114         
       
   115         results = None        
       
   116         page = 1
       
   117         print options.query
       
   118 
       
   119         tr = TopsyResource(options.query, **topsy_parameters)
       
   120         
       
   121         move_up = 0
       
   122         
       
   123         for i,item in enumerate(tr):
       
   124             # get id
       
   125             url = item.get("url")
       
   126             tweet_id = url.split("/")[-1]
       
   127             
       
   128             if move_up > 0:
       
   129                 print((move_up+1)*term.move_up())
       
   130                 move_up = 0
       
   131             
       
   132             print ("%d/%d:%03d%% - %s - %r" % (i+1, tr.total(), int(float(i+1)/float(tr.total())*100.0), tweet_id, item.get("content") ) + term.clear_eol())            
       
   133             move_up += 1
       
   134             
       
   135             count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
       
   136             
       
   137             if count_tweet:
       
   138                 continue
       
   139             try:                                    
       
   140                 tweet = t.statuses.show(id=tweet_id, include_entities=True)
       
   141             except twitter.api.TwitterHTTPError as e:
       
   142                 if e.e.code == 404 or e.e.code == 403:
       
   143                     continue
       
   144                 else:
       
   145                     raise
       
   146             
       
   147             processor = utils.TwitterProcessor(tweet, None, None, session, None, options.token_filename)
       
   148             processor.process()
       
   149             session.flush()
       
   150             session.commit()
       
   151                         
       
   152             time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))
       
   153             
       
   154             print "rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('x-ratelimit-limit'))) + term.clear_eol()
       
   155             move_up += 1
       
   156             for i in xrange(time_to_sleep):
       
   157                 if i:
       
   158                     print(2*term.move_up())
       
   159                 else:
       
   160                     move_up += 1
       
   161                 print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
       
   162                 time.sleep(1)
       
   163                 
       
   164     except twitter.api.TwitterHTTPError as e:
       
   165         fmt = ("." + e.format) if e.format else ""
       
   166         print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))
       
   167         
       
   168     finally:
       
   169         if session:
       
   170             session.close()