script/utils/search_topsy_scrap.py
changeset 1497 14a9bed2e3cd
parent 1496 184372ec27e2
child 1498 e0b3ef3c07d0
equal deleted inserted replaced
1496:184372ec27e2 1497:14a9bed2e3cd
     1 import argparse
       
     2 import logging
       
     3 import math
       
     4 import re
       
     5 import time
       
     6 import urllib
       
     7 
       
     8 from blessings import Terminal
       
     9 import requests
       
    10 import twitter
       
    11 
       
    12 from iri_tweet import models, utils
       
    13 from iri_tweet.processor import TwitterProcessorStatus
       
    14 
       
    15 from selenium import webdriver
       
    16 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
       
    17 from selenium.webdriver.common.by import By
       
    18 from selenium.webdriver.support.ui import WebDriverWait
       
    19 from selenium.webdriver.support import expected_conditions as EC
       
    20 
       
    21 from lxml import html
       
    22 import json
       
    23 
       
    24 logger = logging.getLogger(__name__)
       
    25 
       
    26 APPLICATION_NAME = "Tweet recorder user"
       
    27 
       
    28 dcap = dict(DesiredCapabilities.PHANTOMJS)
       
    29 dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.103 Safari/537.36"
       
    30 
       
    31 class TopsyResource(object):
       
    32 
       
    33     def __init__(self, query, **kwargs):
       
    34 
       
    35         self.options = {}
       
    36         self.options['q'] = query
       
    37         self.options.update(kwargs)
       
    38         self.base_url = "http://topsy.com/s"
       
    39         self.driver = webdriver.PhantomJS(desired_capabilities=dcap)
       
    40         self.driver.set_window_size(1024, 768)
       
    41         self.page = -1
       
    42         self.tree = None
       
    43 
       
    44 
       
    45     def __do_request(self, params):
       
    46       url = "%s?%s" % (self.base_url, urllib.urlencode(params).replace('+','%20')) #calculate url with urllib
       
    47       print('Requesting %s' % url)
       
    48       self.driver.get(url)
       
    49       try:
       
    50           element = WebDriverWait(self.driver, 60).until(
       
    51               EC.presence_of_element_located((By.CLASS_NAME, "result-tweet"))
       
    52           )
       
    53       except Exception as e:
       
    54         print('Exception requesting %s : %s' % (url, e))
       
    55         self.tree = None
       
    56       else:
       
    57         self.tree = html.fromstring(self.driver.page_source)
       
    58 
       
    59     def __check_last(self):
       
    60       if self.page < 0:
       
    61           return False
       
    62       if self.tree is None or len(self.tree.xpath("//*[@id=\"module-pager\"]/div/ul/li[@data-page=\"next\"and @class=\"disabled\"]")):
       
    63           return True
       
    64       else:
       
    65           return False
       
    66 
       
    67 
       
    68     def __next_page(self):
       
    69         if self.__check_last():
       
    70           return False
       
    71         self.page += 1
       
    72         params = {}
       
    73         params.update(self.options)
       
    74         if self.page:
       
    75           params['offset'] = self.page*self.options.get('perpage',10)
       
    76         self.__do_request(params)
       
    77         return self.tree is not None
       
    78 
       
    79     def __iter__(self):
       
    80         result_xpath = "//*[@id=\"results\"]/div"
       
    81         while self.__next_page():
       
    82             for res_node in self.tree.xpath(result_xpath):
       
    83                 res_obj = {
       
    84                   'user': "".join(res_node.xpath("./div/div/h5/a/text()")),
       
    85                   'content': "".join(res_node.xpath("./div/div/div/text()")),
       
    86                   'url': "".join(res_node.xpath("./div/div/ul/li[1]/small/a/@href"))
       
    87                 }
       
    88                 if res_obj['url']:
       
    89                   yield res_obj
       
    90 
       
    91 
       
    92 def get_options():
       
    93 
       
    94     usage = "usage: %(prog)s [options] <connection_str_or_filepath>"
       
    95 
       
    96     parser = argparse.ArgumentParser(usage=usage)
       
    97 
       
    98     parser.add_argument(dest="conn_str",
       
    99                         help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
       
   100     parser.add_argument("-Q", dest="query",
       
   101                       help="query", metavar="QUERY")
       
   102     parser.add_argument("-k", "--key", dest="consumer_key",
       
   103                         help="Twitter consumer key", metavar="CONSUMER_KEY")
       
   104     parser.add_argument("-s", "--secret", dest="consumer_secret",
       
   105                         help="Twitter consumer secret", metavar="CONSUMER_SECRET")
       
   106     parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
       
   107                       help="Token file name")
       
   108     parser.add_argument("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY", default=None,
       
   109                       help="Topsy apikey")
       
   110 
       
   111     utils.set_logging_options(parser)
       
   112 
       
   113     return parser.parse_args()
       
   114 
       
   115 
       
   116 
       
   117 if __name__ == "__main__":
       
   118 
       
   119     options = get_options()
       
   120 
       
   121     utils.set_logging(options);
       
   122 
       
   123 
       
   124     acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
       
   125 
       
   126     t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
       
   127     t.secure = True
       
   128 
       
   129     conn_str = options.conn_str.strip()
       
   130     if not re.match("^\w+://.+", conn_str):
       
   131         conn_str = 'sqlite:///' + conn_str
       
   132 
       
   133     engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
       
   134     session = None
       
   135 
       
   136 
       
   137     topsy_parameters = {
       
   138         'perpage': 10,
       
   139         'window': 'a',
       
   140         'type': 'tweet',
       
   141         'hidden': 1,
       
   142         'sort': 'date'
       
   143     }
       
   144 
       
   145     term = Terminal()
       
   146 
       
   147     try:
       
   148         session = Session()
       
   149 
       
   150         results = None
       
   151         page = 1
       
   152         print options.query
       
   153 
       
   154         tr = TopsyResource(options.query, **topsy_parameters)
       
   155 
       
   156         move_up = 0
       
   157 
       
   158         for i,item in enumerate(tr):
       
   159             # get id
       
   160             url = item.get("url")
       
   161             tweet_id = url.split("/")[-1]
       
   162 
       
   163             if move_up > 0:
       
   164                 print((move_up+1)*term.move_up())
       
   165                 move_up = 0
       
   166 
       
   167             print ("%d: %s - %r" % (i+1, tweet_id, item.get("content") ) + term.clear_eol())
       
   168             move_up += 1
       
   169 
       
   170             count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
       
   171 
       
   172             if count_tweet:
       
   173                 continue
       
   174             try:
       
   175                 tweet = t.statuses.show(id=tweet_id, include_entities=True)
       
   176             except twitter.api.TwitterHTTPError as e:
       
   177                 if e.e.code == 404 or e.e.code == 403:
       
   178                     continue
       
   179                 else:
       
   180                     raise
       
   181 
       
   182             processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
       
   183             processor.process()
       
   184             session.flush()
       
   185             session.commit()
       
   186 
       
   187             print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
       
   188             move_up += 1
       
   189             rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
       
   190             rate_limit_remaining = int(tweet.rate_limit_remaining)
       
   191 
       
   192             if rate_limit_remaining < rate_limit_limit:
       
   193                 time_to_sleep = 0
       
   194             else:
       
   195                 time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))
       
   196 
       
   197             for i in xrange(time_to_sleep):
       
   198                 if i:
       
   199                     print(2*term.move_up())
       
   200                 else:
       
   201                     move_up += 1
       
   202                 print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
       
   203                 time.sleep(1)
       
   204 
       
   205     except twitter.api.TwitterHTTPError as e:
       
   206         fmt = ("." + e.format) if e.format else ""
       
   207         print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))
       
   208 
       
   209     finally:
       
   210         if session:
       
   211             session.close()