# NOTE(review): the next line is stray commit-message text (it refers to Zend 1
# and composer, i.e. PHP tooling, and is unrelated to this Python script).
# Commented out so the module parses.
# remove dependency to Zend 1. Use composer to reduce the number of dependencies
import argparse
import logging
import math
import re
import time
import urllib
from blessings import Terminal
import requests
import twitter
from iri_tweet import models, utils
from iri_tweet.processor import TwitterProcessorStatus
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
import json
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Application name presented during the Twitter OAuth authorization step
# (passed to utils.get_oauth_token in __main__).
APPLICATION_NAME = "Tweet recorder user"

# PhantomJS capabilities: override the default user agent with a desktop
# Chrome one -- presumably so topsy.com serves the full desktop page rather
# than a degraded/mobile variant (TODO confirm).
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.103 Safari/537.36"
class TopsyResource(object):
    """Iterable over tweet search results scraped from topsy.com.

    Drives a headless PhantomJS browser to render the JavaScript-built
    result pages of http://topsy.com/s, then parses the rendered HTML with
    lxml.  Iterating an instance yields one dict per result with the keys
    'user', 'content' and 'url'; further result pages are fetched lazily.

    Fixes over the original version:
      * the PhantomJS process was never released -- ``close()`` and
        context-manager support (``with TopsyResource(...) as tr:``) added;
      * the pager XPath was missing whitespace between the ``"next"``
        literal and the ``and`` keyword.
    """

    def __init__(self, query, **kwargs):
        """Prepare a search for *query*.

        query  -- search string, sent as the 'q' query-string parameter.
        kwargs -- extra query-string parameters (e.g. perpage, window, sort).
        """
        self.options = {'q': query}
        self.options.update(kwargs)
        self.base_url = "http://topsy.com/s"
        # dcap is the module-level capability dict (desktop user agent).
        self.driver = webdriver.PhantomJS(desired_capabilities=dcap)
        self.driver.set_window_size(1024, 768)
        self.page = -1    # index of the last fetched page; -1 = nothing fetched yet
        self.tree = None  # lxml tree of the last rendered page; None on failure

    def close(self):
        """Terminate the PhantomJS process.  Safe to call more than once."""
        if self.driver is not None:
            try:
                self.driver.quit()
            finally:
                self.driver = None

    def __enter__(self):
        # Context-manager support so callers can guarantee browser cleanup.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __do_request(self, params):
        """Render one result page; sets self.tree (None on timeout/failure)."""
        url = "%s?%s" % (self.base_url, urllib.urlencode(params).replace('+','%20')) #calculate url with urllib
        print('Requesting %s' % url)
        self.driver.get(url)
        try:
            # Results are injected by JavaScript: wait (up to 60s) until at
            # least one rendered tweet node exists before scraping.
            WebDriverWait(self.driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, "result-tweet"))
            )
        except Exception as e:
            # Best-effort scraping: a timeout or driver error simply ends
            # pagination (self.tree = None makes __check_last report "last").
            print('Exception requesting %s : %s' % (url, e))
            self.tree = None
        else:
            self.tree = html.fromstring(self.driver.page_source)

    def __check_last(self):
        """Return True when pagination is exhausted or the last fetch failed."""
        if self.page < 0:
            # Nothing fetched yet: never "last".
            return False
        # The pager's "next" item carries class "disabled" on the final page.
        # (Whitespace before 'and' was missing in the original XPath.)
        return self.tree is None or bool(self.tree.xpath(
            "//*[@id=\"module-pager\"]/div/ul/li[@data-page=\"next\" and @class=\"disabled\"]"))

    def __next_page(self):
        """Fetch the next result page.  Return True when a page was loaded."""
        if self.__check_last():
            return False
        self.page += 1
        params = {}
        params.update(self.options)
        if self.page:
            # Topsy paginates with an 'offset' parameter (absent on page 0).
            params['offset'] = self.page*self.options.get('perpage',10)
        self.__do_request(params)
        return self.tree is not None

    def __iter__(self):
        """Yield {'user', 'content', 'url'} dicts across all result pages."""
        result_xpath = "//*[@id=\"results\"]/div"
        while self.__next_page():
            for res_node in self.tree.xpath(result_xpath):
                res_obj = {
                    'user': "".join(res_node.xpath("./div/div/h5/a/text()")),
                    'content': "".join(res_node.xpath("./div/div/div/text()")),
                    'url': "".join(res_node.xpath("./div/div/ul/li[1]/small/a/@href"))
                }
                # Entries without a tweet URL cannot be resolved to an id: skip.
                if res_obj['url']:
                    yield res_obj
def get_options():
    """Build the command-line parser and return the parsed arguments.

    Besides the options declared here, utils.set_logging_options adds the
    shared logging flags (verbose/quiet) used by the other iri_tweet scripts.
    """
    parser = argparse.ArgumentParser(
        usage="usage: %(prog)s [options] <connection_str_or_filepath>")
    # Positional: target database (connection string, or a bare sqlite path).
    parser.add_argument(dest="conn_str", metavar="CONNECTION_STR",
                        help="write tweet to DATABASE. This is a connection string")
    parser.add_argument("-Q", dest="query", metavar="QUERY", help="query")
    parser.add_argument("-k", "--key", dest="consumer_key",
                        metavar="CONSUMER_KEY", help="Twitter consumer key")
    parser.add_argument("-s", "--secret", dest="consumer_secret",
                        metavar="CONSUMER_SECRET", help="Twitter consumer secret")
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME",
                        default=".oauth_token", help="Token file name")
    parser.add_argument("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY",
                        default=None, help="Topsy apikey")
    utils.set_logging_options(parser)
    return parser.parse_args()
if __name__ == "__main__":
    options = get_options()
    utils.set_logging(options);
    # Obtain (or load cached) OAuth credentials, then build the Twitter API client.
    acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
    t.secure = True
    conn_str = options.conn_str.strip()
    # A bare file path (no "scheme://") is treated as a sqlite database file.
    if not re.match("^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str
    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
    session = None
    # Fixed query-string parameters forwarded to topsy.com via TopsyResource.
    topsy_parameters = {
        'perpage': 10,
        'window': 'a',
        'type': 'tweet',
        'hidden': 1,
        'sort': 'date'
    }
    term = Terminal()
    try:
        session = Session()
        results = None
        page = 1
        print options.query
        tr = TopsyResource(options.query, **topsy_parameters)
        # Number of terminal lines to rewind so progress is printed in place.
        move_up = 0
        for i,item in enumerate(tr):
            # get id
            url = item.get("url")
            # Tweet status URLs end with .../<status id>.
            tweet_id = url.split("/")[-1]
            if move_up > 0:
                print((move_up+1)*term.move_up())
                move_up = 0
            print ("%d: %s - %r" % (i+1, tweet_id, item.get("content") ) + term.clear_eol())
            move_up += 1
            # Skip tweets already recorded in the database.
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
            if count_tweet:
                continue
            try:
                # Fetch the full tweet from the Twitter REST API.
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
            except twitter.api.TwitterHTTPError as e:
                # 404 (deleted) and 403 (protected/forbidden) tweets are skipped;
                # anything else is fatal.
                if e.e.code == 404 or e.e.code == 403:
                    continue
                else:
                    raise
            # Persist the tweet (and its entities) through the shared processor.
            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
            processor.process()
            session.flush()
            session.commit()
            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
            move_up += 1
            rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
            rate_limit_remaining = int(tweet.rate_limit_remaining)
            # NOTE(review): this condition looks inverted -- it only throttles while
            # the remaining quota is still at its maximum (remaining >= limit) and
            # never as the quota drops, which is when pacing would normally kick in.
            # Confirm the intended behavior before changing it.
            if rate_limit_remaining < rate_limit_limit:
                time_to_sleep = 0
            else:
                # Spread the remaining calls evenly until the rate-limit window resets.
                time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))
            # Count down one second at a time, updating the same terminal line.
            for i in xrange(time_to_sleep):
                if i:
                    print(2*term.move_up())
                else:
                    move_up += 1
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
                time.sleep(1)
    except twitter.api.TwitterHTTPError as e:
        fmt = ("." + e.format) if e.format else ""
        print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))
    finally:
        # Always release the database session, even on error.
        if session:
            session.close()