script/utils/search_topsy_scrap.py
changeset 1497 14a9bed2e3cd
parent 1496 184372ec27e2
child 1498 e0b3ef3c07d0
--- a/script/utils/search_topsy_scrap.py	Wed Jan 02 17:49:19 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,211 +0,0 @@
-import argparse
-import logging
-import math
-import re
-import time
-import urllib
-
-from blessings import Terminal
-import requests
-import twitter
-
-from iri_tweet import models, utils
-from iri_tweet.processor import TwitterProcessorStatus
-
-from selenium import webdriver
-from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-
-from lxml import html
-import json
-
# Module-level logger for this script.
logger = logging.getLogger(__name__)

# Application name used when requesting/loading the cached OAuth token.
APPLICATION_NAME = "Tweet recorder user"

# PhantomJS capabilities: spoof a desktop Chrome user agent so topsy.com
# serves the full JS-rendered results page to the headless browser.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.103 Safari/537.36"
-
class TopsyResource(object):
    """Iterates over tweet search results scraped from topsy.com.

    Drives a headless PhantomJS browser through the paginated search
    results and yields one dict per result with the keys 'user',
    'content' and 'url'. Iteration stops when the pager marks the
    "next" link as disabled, or when a page fails to load.
    """

    def __init__(self, query, **kwargs):
        """query: search string; kwargs: extra query-string parameters
        (e.g. perpage, window, type, hidden, sort)."""
        self.options = {'q': query}
        self.options.update(kwargs)
        self.base_url = "http://topsy.com/s"
        self.driver = webdriver.PhantomJS(desired_capabilities=dcap)
        self.driver.set_window_size(1024, 768)
        self.page = -1    # -1: no request issued yet
        self.tree = None  # lxml tree of the last fetched page; None on failure

    def close(self):
        """Release the underlying PhantomJS process (was leaked before)."""
        self.driver.quit()

    def __do_request(self, params):
        # Topsy expects %20 (not '+') as the space separator in queries.
        url = "%s?%s" % (self.base_url, urllib.urlencode(params).replace('+', '%20'))
        print('Requesting %s' % url)
        self.driver.get(url)
        try:
            # Wait (up to 60s) for the JS-rendered results to appear.
            WebDriverWait(self.driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, "result-tweet"))
            )
        except Exception as e:
            print('Exception requesting %s : %s' % (url, e))
            self.tree = None
        else:
            self.tree = html.fromstring(self.driver.page_source)

    def __check_last(self):
        """Return True when the last fetched page was the final one."""
        if self.page < 0:
            return False  # nothing fetched yet
        # Fixed: the original predicate read '"next"and' with no space,
        # which is at best ambiguous XPath.
        if self.tree is None or len(self.tree.xpath("//*[@id=\"module-pager\"]/div/ul/li[@data-page=\"next\" and @class=\"disabled\"]")):
            return True
        return False

    def __next_page(self):
        """Fetch the next results page; return True if a page was loaded."""
        if self.__check_last():
            return False
        self.page += 1
        params = dict(self.options)
        if self.page:
            params['offset'] = self.page * self.options.get('perpage', 10)
        self.__do_request(params)
        return self.tree is not None

    def __iter__(self):
        result_xpath = "//*[@id=\"results\"]/div"
        while self.__next_page():
            for res_node in self.tree.xpath(result_xpath):
                res_obj = {
                    'user': "".join(res_node.xpath("./div/div/h5/a/text()")),
                    'content': "".join(res_node.xpath("./div/div/div/text()")),
                    'url': "".join(res_node.xpath("./div/div/ul/li[1]/small/a/@href"))
                }
                # Results without a tweet URL cannot be resolved to an id.
                if res_obj['url']:
                    yield res_obj
-
-
def get_options():
    """Build the argument parser and return the parsed CLI options."""

    usage = "usage: %(prog)s [options] <connection_str_or_filepath>"
    parser = argparse.ArgumentParser(usage=usage)

    # Positional: target database connection string (or sqlite file path).
    parser.add_argument(dest="conn_str", metavar="CONNECTION_STR",
                        help="write tweet to DATABASE. This is a connection string")

    # Search query and Twitter/Topsy credential options.
    parser.add_argument("-Q", dest="query", metavar="QUERY",
                        help="query")
    parser.add_argument("-k", "--key", dest="consumer_key", metavar="CONSUMER_KEY",
                        help="Twitter consumer key")
    parser.add_argument("-s", "--secret", dest="consumer_secret", metavar="CONSUMER_SECRET",
                        help="Twitter consumer secret")
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME",
                        default=".oauth_token", help="Token file name")
    parser.add_argument("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY",
                        default=None, help="Topsy apikey")

    # Shared verbosity/logging flags from the iri_tweet package.
    utils.set_logging_options(parser)

    return parser.parse_args()
-
-
-
if __name__ == "__main__":

    options = get_options()

    utils.set_logging(options);

    # Obtain (or load cached) OAuth credentials for the Twitter API.
    acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)

    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
    t.secure = True

    # A bare file path (no scheme://) is treated as a sqlite database file.
    conn_str = options.conn_str.strip()
    if not re.match("^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
    session = None

    # Fixed query-string parameters sent to topsy.com for every page.
    topsy_parameters = {
        'perpage': 10,
        'window': 'a',
        'type': 'tweet',
        'hidden': 1,
        'sort': 'date'
    }

    term = Terminal()

    try:
        session = Session()

        results = None
        page = 1
        print options.query

        tr = TopsyResource(options.query, **topsy_parameters)

        # Number of status lines printed since the last cursor reset; used to
        # overwrite progress output in place with blessings cursor movement.
        move_up = 0

        for i,item in enumerate(tr):
            # The tweet id is the last path segment of the scraped URL.
            url = item.get("url")
            tweet_id = url.split("/")[-1]

            if move_up > 0:
                print((move_up+1)*term.move_up())
                move_up = 0

            print ("%d: %s - %r" % (i+1, tweet_id, item.get("content") ) + term.clear_eol())
            move_up += 1

            # Skip tweets already stored in the database.
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()

            if count_tweet:
                continue
            try:
                # Fetch the full tweet from the Twitter API by id.
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # Deleted (404) and protected (403) tweets are skipped silently.
                if e.e.code == 404 or e.e.code == 403:
                    continue
                else:
                    raise

            # Persist the fetched status through the shared processor pipeline.
            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            session.flush()
            session.commit()

            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
            move_up += 1
            rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
            rate_limit_remaining = int(tweet.rate_limit_remaining)

            # NOTE(review): this condition looks inverted — it sleeps 0 whenever
            # any quota has been consumed (remaining < limit) and only throttles
            # while the quota is still untouched. Confirm the intended logic
            # before relying on this for rate limiting.
            if rate_limit_remaining < rate_limit_limit:
                time_to_sleep = 0
            else:
                # Spread the remaining calls evenly until the quota reset time.
                time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))

            # Countdown display: one tick per second, rewritten in place.
            for i in xrange(time_to_sleep):
                if i:
                    print(2*term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        # Report any unhandled Twitter HTTP error with full request context.
        fmt = ("." + e.format) if e.format else ""
        print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))

    finally:
        if session:
            session.close()