script/utils/search_topsy.py
changeset 693 2ef837069108
child 888 6fc6637d8403
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/search_topsy.py	Mon Oct 15 17:01:50 2012 +0200
@@ -0,0 +1,170 @@
+from iri_tweet import models, utils
+from sqlalchemy.orm import sessionmaker
+import anyjson
+import sqlite3
+import twitter
+import re
+import requests
+from optparse import OptionParser
+import simplejson
+import time
+from blessings import Terminal
+import sys
+import math
+from symbol import except_clause
+
+# Twitter OAuth application identity. These values are passed to
+# utils.get_oauth_token() and twitter.OAuth() by the script entry point below.
+# NOTE(review): the consumer key/secret are hard-coded in the source; consider
+# loading them from the environment or an untracked config file instead.
+APPLICATION_NAME = "Tweet recorder user"
+CONSUMER_KEY = "Vdr5ZcsjI1G3esTPI8yDg"
+CONSUMER_SECRET = "LMhNrY99R6a7E0YbZZkRFpUZpX5EfB1qATbDk1sIVLs"
+
+
+class TopsyResource(object):
+    
+    def __init__(self, query, **kwargs):
+                
+        self.options = kwargs
+        self.options['q'] = query
+        self.url = kwargs.get("url", "http://otter.topsy.com/search.json")
+        self.page = 0
+        self.req = None
+        self.res = {}
+        
+    def __initialize(self):
+        
+        params = {}
+        params.update(self.options)
+        self.req = requests.get(self.url, params=params)
+        self.res = self.req.json
+        
+    def __next_page(self):
+        page = self.res.get("response").get("page") + 1
+        params = {}
+        params.update(self.options)
+        params['page'] = page
+        self.req = requests.get(self.url, params=params)
+        self.res = self.req.json
+
+    def __iter__(self):        
+        if not self.req:
+            self.__initialize()
+        while "response" in self.res and "list" in self.res.get("response") and self.res.get("response").get("list"):
+            for item in  self.res.get("response").get("list"):
+                yield item
+            self.__next_page()
+            
+    def total(self):
+        if not self.res:
+            return 0
+        else:
+            return self.res.get("response",{}).get("total",0)
+            
+
+
+def get_option():
+    
+    parser = OptionParser()
+
+    parser.add_option("-d", "--database", dest="database",
+                      help="Input database", metavar="DATABASE")
+    parser.add_option("-Q", dest="query",
+                      help="query", metavar="QUERY")
+    parser.add_option("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
+                      help="Token file name")
+    parser.add_option("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY", default=None,
+                      help="Topsy apikey")
+    
+    utils.set_logging_options(parser)
+
+    return parser.parse_args()
+
+
+
+if __name__ == "__main__":
+
+    (options, args) = get_option()
+    
+    utils.set_logging(options);
+
+
+    acess_token_key, access_token_secret = utils.get_oauth_token(options.token_filename, application_name=APPLICATION_NAME, consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET)
+
+    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, CONSUMER_KEY, CONSUMER_SECRET), secure=True)
+    t.secure = True
+    
+    conn_str = options.database.strip()
+    if not re.match("^\w+://.+", conn_str):
+        conn_str = 'sqlite:///' + conn_str
+    
+    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
+    session = None
+    
+    
+    topsy_parameters = {
+        'apikey': options.topsy_apikey,
+        'perpage': 100,
+        'window': 'a',
+        'type': 'tweet',
+        'hidden': True,
+    }
+    
+    term = Terminal()
+    
+    try:
+        session = Session()
+        
+        results = None        
+        page = 1
+        print options.query
+
+        tr = TopsyResource(options.query, **topsy_parameters)
+        
+        move_up = 0
+        
+        for i,item in enumerate(tr):
+            # get id
+            url = item.get("url")
+            tweet_id = url.split("/")[-1]
+            
+            if move_up > 0:
+                print((move_up+1)*term.move_up())
+                move_up = 0
+            
+            print ("%d/%d:%03d%% - %s - %r" % (i+1, tr.total(), int(float(i+1)/float(tr.total())*100.0), tweet_id, item.get("content") ) + term.clear_eol())            
+            move_up += 1
+            
+            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
+            
+            if count_tweet:
+                continue
+            try:                                    
+                tweet = t.statuses.show(id=tweet_id, include_entities=True)
+            except twitter.api.TwitterHTTPError as e:
+                if e.e.code == 404 or e.e.code == 403:
+                    continue
+                else:
+                    raise
+            
+            processor = utils.TwitterProcessor(tweet, None, None, session, None, options.token_filename)
+            processor.process()
+            session.flush()
+            session.commit()
+                        
+            time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))
+            
+            print "rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('x-ratelimit-limit'))) + term.clear_eol()
+            move_up += 1
+            for i in xrange(time_to_sleep):
+                if i:
+                    print(2*term.move_up())
+                else:
+                    move_up += 1
+                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
+                time.sleep(1)
+                
+    except twitter.api.TwitterHTTPError as e:
+        fmt = ("." + e.format) if e.format else ""
+        print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))
+        
+    finally:
+        if session:
+            session.close()