# Reconstructed from the Mercurial changeset diff
#   diff -r 078085becbf8 -r 5c757e167687 script/utils/search_topsy_scrap.py
#   +++ b/script/utils/search_topsy_scrap.py  Fri Sep 12 13:03:29 2014 +0200
#
# NOTE: this script targets Python 2 (urllib.urlencode, xrange,
# headers.getheader).  All print calls use the single-argument
# parenthesised form, which behaves identically on Python 2.
"""Scrape Topsy search result pages and record the matching tweets.

The Topsy result pages are rendered with a PhantomJS WebDriver, the tweet
ids are extracted from the result links, and every tweet that is not yet in
the database is fetched through the Twitter API and stored.
"""
import argparse
import logging
import math
import re
import time
import urllib

from blessings import Terminal
import requests
import twitter

from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from lxml import html
import json

logger = logging.getLogger(__name__)

APPLICATION_NAME = "Tweet recorder user"

# PhantomJS capabilities with a desktop Chrome user agent.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.103 Safari/537.36"


class TopsyResource(object):
    """Iterable over the results of a Topsy search, one page at a time.

    Iterating yields dicts with the keys ``user``, ``content`` and ``url``
    for every result node that has a non-empty url.  Call :meth:`close`
    when done to shut down the underlying PhantomJS driver.
    """

    def __init__(self, query, **kwargs):
        """Store the search parameters and start a PhantomJS driver.

        :param query: search query, sent as the ``q`` url parameter.
        :param kwargs: additional url parameters; ``perpage`` is also used
            to compute the page offset (defaults to 10 when absent).
        """
        self.options = {}
        self.options['q'] = query
        self.options.update(kwargs)
        self.base_url = "http://topsy.com/s"
        self.driver = webdriver.PhantomJS(desired_capabilities=dcap)
        self.driver.set_window_size(1024, 768)
        # -1 means "no page requested yet"; the first request is page 0.
        self.page = -1
        self.tree = None

    def close(self):
        """Quit the WebDriver so the PhantomJS process does not linger.

        Safe to call more than once.  The resource must not be iterated
        after it has been closed.
        """
        driver, self.driver = self.driver, None
        if driver is not None:
            driver.quit()

    def __do_request(self, params):
        """Load the result page for ``params`` and parse it into ``self.tree``.

        ``self.tree`` is set to ``None`` when no ``result-tweet`` element
        shows up within 60 seconds (or any other error occurs while waiting).
        """
        # calculate url with urllib; spaces are sent as %20 rather than '+'
        url = "%s?%s" % (self.base_url, urllib.urlencode(params).replace('+', '%20'))
        print('Requesting %s' % url)
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, "result-tweet"))
            )
        except Exception as e:
            # Deliberately best-effort: a failed page simply ends the iteration.
            print('Exception requesting %s : %s' % (url, e))
            self.tree = None
        else:
            self.tree = html.fromstring(self.driver.page_source)

    def __check_last(self):
        """Return True when the current page is the last one.

        Before the first request (``page < 0``) this is always False.
        Afterwards the page is the last one when the previous request
        failed or when the pager's "next" item is disabled.
        """
        if self.page < 0:
            return False
        if self.tree is None:
            return True
        disabled_next = self.tree.xpath("//*[@id=\"module-pager\"]/div/ul/li[@data-page=\"next\"and @class=\"disabled\"]")
        return bool(disabled_next)

    def __next_page(self):
        """Request the next result page.

        :return: True when a page was loaded and parsed, False when there
            is no further page or the request failed.
        """
        if self.__check_last():
            return False
        self.page += 1
        params = {}
        params.update(self.options)
        if self.page:
            # The first page is requested without an offset parameter.
            params['offset'] = self.page * self.options.get('perpage', 10)
        self.__do_request(params)
        return self.tree is not None

    def __iter__(self):
        """Yield one dict per result that carries a url, across all pages."""
        result_xpath = "//*[@id=\"results\"]/div"
        while self.__next_page():
            for res_node in self.tree.xpath(result_xpath):
                res_obj = {
                    'user': "".join(res_node.xpath("./div/div/h5/a/text()")),
                    'content': "".join(res_node.xpath("./div/div/div/text()")),
                    'url': "".join(res_node.xpath("./div/div/ul/li[1]/small/a/@href")),
                }
                if res_obj['url']:
                    yield res_obj


def get_options():
    """Parse and return the command line options of the script."""
    # argparse prepends "usage: " itself, so the template must not repeat it.
    usage = "%(prog)s [options] CONNECTION_STR"

    parser = argparse.ArgumentParser(usage=usage)

    parser.add_argument(dest="conn_str",
                        help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
    parser.add_argument("-Q", dest="query",
                        help="query", metavar="QUERY")
    parser.add_argument("-k", "--key", dest="consumer_key",
                        help="Twitter consumer key", metavar="CONSUMER_KEY")
    parser.add_argument("-s", "--secret", dest="consumer_secret",
                        help="Twitter consumer secret", metavar="CONSUMER_SECRET")
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
                        help="Token file name")
    parser.add_argument("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY", default=None,
                        help="Topsy apikey")

    utils.set_logging_options(parser)

    return parser.parse_args()


def main():
    """Run the scraper: search Topsy, fetch unknown tweets, store them."""
    options = get_options()

    utils.set_logging(options)

    access_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)

    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(access_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
    t.secure = True

    conn_str = options.conn_str.strip()
    # A bare file path (no "scheme://") is treated as a sqlite database file.
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose - options.quiet) > 0), create_all=True)
    session = None
    topsy = None

    topsy_parameters = {
        'perpage': 10,
        'window': 'a',
        'type': 'tweet',
        'hidden': 1,
        'sort': 'date',
    }

    term = Terminal()

    try:
        session = Session()

        print(options.query)

        topsy = TopsyResource(options.query, **topsy_parameters)

        # Number of status lines printed for the previous item; they are
        # overwritten in place when the next item is displayed.
        move_up = 0

        for index, item in enumerate(topsy):
            # The tweet id is the last path segment of the result url.
            url = item.get("url")
            tweet_id = url.split("/")[-1]

            if move_up > 0:
                print((move_up + 1) * term.move_up())
                move_up = 0

            print(("%d: %s - %r" % (index + 1, tweet_id, item.get("content"))) + term.clear_eol())
            move_up += 1

            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()

            if count_tweet:
                # Already recorded.
                continue
            try:
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # Deleted (404) or protected (403) tweets are skipped.
                if e.e.code == 404 or e.e.code == 403:
                    continue
                else:
                    raise

            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            session.flush()
            session.commit()

            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
            move_up += 1
            rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
            rate_limit_remaining = int(tweet.rate_limit_remaining)

            # NOTE(review): as written, a pause only happens when the full
            # quota is still available (remaining >= limit).  This looks
            # inverted, but the intended policy is not visible here, so the
            # condition is kept unchanged -- confirm with the author.
            if rate_limit_remaining < rate_limit_limit:
                time_to_sleep = 0
            else:
                # rate_limit_reset is compared against the current epoch
                # time.  time.time() is used because
                # time.mktime(time.gmtime()) would interpret the UTC
                # struct_time as local time and be off by the UTC offset.
                # max(..., 1) guards against a division by zero.
                seconds_to_reset = tweet.rate_limit_reset - time.time()
                time_to_sleep = int(math.ceil(seconds_to_reset / max(rate_limit_remaining, 1)))

            for elapsed in xrange(time_to_sleep):
                if elapsed:
                    print(2 * term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep - elapsed)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))

    finally:
        if topsy is not None:
            # Shut down PhantomJS even when the run is aborted.
            topsy.close()
        if session:
            session.close()


if __name__ == "__main__":
    main()