# script/utils/search_topsy_scrap.py
# author ymh <ymh.work@gmail.com>
# Mon, 18 Jun 2018 23:15:34 +0200
# changeset 1427 8b3d57a519eb
# parent 1137 5c757e167687
# permissions -rw-r--r--
# remove dependency to Zend 1. Use composer to reduce the number of dependencies

import argparse
import logging
import math
import re
import time
import urllib

from blessings import Terminal
import requests
import twitter

from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from lxml import html
import json

# Module-level logger, passed to TwitterProcessorStatus in the main block.
logger = logging.getLogger(__name__)

# Application name forwarded to utils.get_oauth_token when obtaining the token.
APPLICATION_NAME = "Tweet recorder user"

# PhantomJS capabilities with a desktop Chrome user agent — presumably so
# topsy.com serves its regular markup to the headless browser; verify if the
# target site changes.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.103 Safari/537.36"

class TopsyResource(object):
    """Iterator over tweet search results scraped from topsy.com.

    Drives a headless PhantomJS browser through the paginated search UI and
    yields one dict per result with keys ``user``, ``content`` and ``url``.
    """

    def __init__(self, query, **kwargs):
        """Prepare the query options and the PhantomJS driver.

        :param query: search query string (sent as the ``q`` parameter).
        :param kwargs: extra query parameters (e.g. ``perpage``, ``window``).
        """
        self.options = {'q': query}
        self.options.update(kwargs)
        self.base_url = "http://topsy.com/s"
        self.driver = webdriver.PhantomJS(desired_capabilities=dcap)
        self.driver.set_window_size(1024, 768)
        self.page = -1  # -1 means no page has been requested yet
        self.tree = None  # lxml tree of the last fetched page, None on failure

    def __do_request(self, params):
        """Load one result page and parse it into ``self.tree`` (None on failure)."""
        url = "%s?%s" % (self.base_url, urllib.urlencode(params).replace('+', '%20'))
        print('Requesting %s' % url)
        self.driver.get(url)
        try:
            # Results are rendered client-side: wait until at least one
            # ".result-tweet" node appears (up to 60s).
            WebDriverWait(self.driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, "result-tweet"))
            )
        except Exception as e:
            # Best effort: a timeout or driver error just ends the iteration.
            print('Exception requesting %s : %s' % (url, e))
            self.tree = None
        else:
            self.tree = html.fromstring(self.driver.page_source)

    def __check_last(self):
        """Return True when the previously fetched page was the last (or failed)."""
        if self.page < 0:
            # Nothing fetched yet, so we cannot be past the end.
            return False
        # The pager's "next" entry carries class "disabled" on the last page.
        return self.tree is None or bool(self.tree.xpath(
            '//*[@id="module-pager"]/div/ul/li[@data-page="next" and @class="disabled"]'))

    def __next_page(self):
        """Fetch the next result page; return True if a parse tree is available."""
        if self.__check_last():
            return False
        self.page += 1
        params = dict(self.options)
        if self.page:
            # Pages after the first are addressed with an "offset" parameter.
            params['offset'] = self.page * self.options.get('perpage', 10)
        self.__do_request(params)
        return self.tree is not None

    def __iter__(self):
        """Yield one dict per scraped result; results without a URL are skipped."""
        result_xpath = '//*[@id="results"]/div'
        while self.__next_page():
            for res_node in self.tree.xpath(result_xpath):
                res_obj = {
                    'user': "".join(res_node.xpath("./div/div/h5/a/text()")),
                    'content': "".join(res_node.xpath("./div/div/div/text()")),
                    'url': "".join(res_node.xpath("./div/div/ul/li[1]/small/a/@href")),
                }
                if res_obj['url']:
                    yield res_obj

def get_options():
    """Build the argument parser for this script and return the parsed options."""
    arg_parser = argparse.ArgumentParser(
        usage="usage: %(prog)s [options] <connection_str_or_filepath>")

    # Positional: database connection string (or a bare sqlite file path).
    arg_parser.add_argument(dest="conn_str", metavar="CONNECTION_STR",
                            help="write tweet to DATABASE. This is a connection string")
    arg_parser.add_argument("-Q", dest="query", metavar="QUERY",
                            help="query")
    arg_parser.add_argument("-k", "--key", dest="consumer_key", metavar="CONSUMER_KEY",
                            help="Twitter consumer key")
    arg_parser.add_argument("-s", "--secret", dest="consumer_secret", metavar="CONSUMER_SECRET",
                            help="Twitter consumer secret")
    arg_parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME",
                            default=".oauth_token", help="Token file name")
    arg_parser.add_argument("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY",
                            default=None, help="Topsy apikey")

    # Let the shared utils module register its logging-related flags.
    utils.set_logging_options(arg_parser)

    return arg_parser.parse_args()



if __name__ == "__main__":

    options = get_options()

    utils.set_logging(options);


    # NOTE(review): "acess_token_key" is a typo for "access_token_key" (local
    # name only, usage is consistent below).
    acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)

    # Authenticated Twitter REST client used to re-fetch each scraped tweet by id.
    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
    t.secure = True

    # A bare file path (no scheme://) is treated as an sqlite database file.
    conn_str = options.conn_str.strip()
    if not re.match("^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
    session = None


    # Parameters forwarded to TopsyResource: 10 results per page, sorted by
    # date, including hidden tweets.
    topsy_parameters = {
        'perpage': 10,
        'window': 'a',
        'type': 'tweet',
        'hidden': 1,
        'sort': 'date'
    }

    term = Terminal()

    try:
        session = Session()

        results = None
        page = 1
        print options.query

        tr = TopsyResource(options.query, **topsy_parameters)

        # Number of status lines printed since the cursor was last moved up;
        # used to redraw the same terminal region on each iteration.
        move_up = 0

        for i,item in enumerate(tr):
            # The tweet id is the last path segment of the scraped status URL.
            url = item.get("url")
            tweet_id = url.split("/")[-1]

            if move_up > 0:
                print((move_up+1)*term.move_up())
                move_up = 0

            print ("%d: %s - %r" % (i+1, tweet_id, item.get("content") ) + term.clear_eol())
            move_up += 1

            # Skip tweets already recorded in the database.
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()

            if count_tweet:
                continue
            try:
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # 404/403: presumably deleted or protected tweets — skip them;
                # anything else is re-raised.
                if e.e.code == 404 or e.e.code == 403:
                    continue
                else:
                    raise

            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            session.flush()
            session.commit()

            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
            move_up += 1
            rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
            rate_limit_remaining = int(tweet.rate_limit_remaining)

            # NOTE(review): this comparison looks inverted — it sleeps only
            # when remaining >= limit (i.e. quota untouched) and never as the
            # quota runs low. Confirm the intended throttling behavior.
            if rate_limit_remaining < rate_limit_limit:
                time_to_sleep = 0
            else:
                time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))

            # Countdown loop: redraw the "Sleeping..." line once per second.
            for i in xrange(time_to_sleep):
                if i:
                    print(2*term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
                time.sleep(1)

    except twitter.api.TwitterHTTPError as e:
        # Top-level API failure: report the full request context and stop.
        fmt = ("." + e.format) if e.format else ""
        print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))

    finally:
        if session:
            session.close()