script/utils/search_topsy_scrap.py
author ymh <ymh.work@gmail.com>
Sat, 22 Sep 2018 14:48:48 +0200
changeset 1465 201a829de81b
parent 1137 5c757e167687
permissions -rw-r--r--
Added tag V06.005 for changeset 239a2fbc9e2c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
     1
import argparse
888
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 693
diff changeset
     2
import logging
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 693
diff changeset
     3
import math
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     4
import re
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
     5
import time
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
     6
import urllib
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
     7
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
     8
from blessings import Terminal
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
     9
import requests
888
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 693
diff changeset
    10
import twitter
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 693
diff changeset
    11
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
    12
from iri_tweet import models, utils
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
    13
from iri_tweet.processor import TwitterProcessorStatus
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
    14
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
    15
from selenium import webdriver
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
    16
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
    17
from selenium.webdriver.common.by import By
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
    18
from selenium.webdriver.support.ui import WebDriverWait
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
    19
from selenium.webdriver.support import expected_conditions as EC
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
    20
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
    21
from lxml import html
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
    22
import json
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
    23
888
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 693
diff changeset
    24
# Module-level logger; the script section hands it to TwitterProcessorStatus.
logger = logging.getLogger(__name__)

# Passed as ``application_name`` to utils.get_oauth_token by the script section.
APPLICATION_NAME = "Tweet recorder user"

# Capabilities given to webdriver.PhantomJS by TopsyResource: the stock
# PhantomJS capabilities with the user agent replaced by a desktop Chrome one.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/36.0.1985.103 Safari/537.36"
)
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    30
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    31
class TopsyResource(object):
    """Iterable over Topsy search results scraped through a PhantomJS browser.

    Iterating loads the result pages of ``base_url`` one after the other with
    the stored query options, parses the rendered HTML with lxml and yields
    one dict per result with the keys ``user``, ``content`` and ``url``.
    The paging state (``page`` and ``tree``) is kept on the instance.
    Call :meth:`close` when done to quit the browser.
    """

    def __init__(self, query, **kwargs):
        """Store the search options and start the headless browser.

        :param query: search expression, sent as the ``q`` URL parameter.
        :param kwargs: additional URL parameters; ``perpage`` is also used
            to compute the ``offset`` of each page (10 when absent).
        """
        self.options = {'q': query}
        self.options.update(kwargs)
        self.base_url = "http://topsy.com/s"
        self.driver = webdriver.PhantomJS(desired_capabilities=dcap)
        self.driver.set_window_size(1024, 768)
        # -1 means that no page has been requested yet.
        self.page = -1
        # Parsed tree of the last page loaded; None after a failed request.
        self.tree = None

    def close(self):
        """Quit the browser held in ``self.driver``; calling it twice is safe."""
        driver = getattr(self, 'driver', None)
        self.driver = None
        if driver is not None:
            driver.quit()

    def _build_url(self, params):
        """Return ``base_url`` plus ``params`` as a query string.

        Spaces are written as ``%20`` instead of the ``+`` that urlencode
        produces.
        """
        # Local import so the method works on both Python 2 and Python 3.
        try:
            from urllib import urlencode  # Python 2
        except ImportError:
            from urllib.parse import urlencode  # Python 3
        return "%s?%s" % (self.base_url, urlencode(params).replace('+', '%20'))

    def __do_request(self, params):
        """Load the page for ``params`` and parse it into ``self.tree``.

        ``self.tree`` is set to None when waiting (up to 60 seconds) for an
        element of class ``result-tweet`` raises.
        """
        url = self._build_url(params)
        print('Requesting %s' % url)
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, "result-tweet"))
            )
        except Exception as e:
            # Best effort: a failed wait only marks the page as unavailable,
            # which ends the paging in __next_page.
            print('Exception requesting %s : %s' % (url, e))
            self.tree = None
        else:
            self.tree = html.fromstring(self.driver.page_source)

    def __check_last(self):
        """Return True when there is no further page to request.

        That is the case once a page has been requested and either the last
        request failed (``tree`` is None) or the pager's "next" item carries
        the ``disabled`` class.
        """
        if self.page < 0:
            return False
        return self.tree is None or bool(self.tree.xpath(
            "//*[@id=\"module-pager\"]/div/ul/li[@data-page=\"next\"and @class=\"disabled\"]"))

    def __next_page(self):
        """Request the next page; return True when it has been loaded."""
        if self.__check_last():
            return False
        self.page += 1
        params = dict(self.options)
        if self.page:
            # The first page is requested without any offset.
            params['offset'] = self.page * self.options.get('perpage', 10)
        self.__do_request(params)
        return self.tree is not None

    def __iter__(self):
        """Yield a ``user``/``content``/``url`` dict for each result found.

        Result nodes for which no url can be extracted are skipped.
        """
        result_xpath = "//*[@id=\"results\"]/div"
        while self.__next_page():
            for res_node in self.tree.xpath(result_xpath):
                res_obj = {
                    'user': "".join(res_node.xpath("./div/div/h5/a/text()")),
                    'content': "".join(res_node.xpath("./div/div/div/text()")),
                    'url': "".join(res_node.xpath("./div/div/ul/li[1]/small/a/@href")),
                }
                if res_obj['url']:
                    yield res_obj
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    90
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
    91
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
    92
def get_options(args=None):
    """Parse the command line of the Topsy search script.

    :param args: list of arguments to parse; None (the default) lets
        argparse read them from ``sys.argv``.
    :return: the namespace built by argparse, with ``conn_str``, ``query``,
        ``consumer_key``, ``consumer_secret``, ``token_filename`` and
        ``topsy_apikey``, plus whatever ``utils.set_logging_options`` adds.
    """
    # argparse prepends "usage: " itself, so the text must not repeat it.
    usage = "%(prog)s [options] <connection_str_or_filepath>"

    parser = argparse.ArgumentParser(usage=usage)

    parser.add_argument(dest="conn_str",
                        help="write tweet to DATABASE. This is a connection string",
                        metavar="CONNECTION_STR")
    parser.add_argument("-Q", dest="query",
                        help="query", metavar="QUERY")
    parser.add_argument("-k", "--key", dest="consumer_key",
                        help="Twitter consumer key", metavar="CONSUMER_KEY")
    parser.add_argument("-s", "--secret", dest="consumer_secret",
                        help="Twitter consumer secret", metavar="CONSUMER_SECRET")
    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME",
                        default=".oauth_token",
                        help="Token file name")
    parser.add_argument("-T", dest="topsy_apikey", metavar="TOPSY_APIKEY",
                        default=None,
                        help="Topsy apikey")

    # Registers the logging related options of the project on the parser.
    utils.set_logging_options(parser)

    return parser.parse_args(args)
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   114
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   115
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   116
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   117
if __name__ == "__main__":
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   118
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
   119
    options = get_options()
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   120
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   121
    utils.set_logging(options);
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   122
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   123
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
   124
    acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   125
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
   126
    t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   127
    t.secure = True
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   128
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
   129
    conn_str = options.conn_str.strip()
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   130
    if not re.match("^\w+://.+", conn_str):
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   131
        conn_str = 'sqlite:///' + conn_str
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   132
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   133
    engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   134
    session = None
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   135
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   136
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   137
    topsy_parameters = {
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   138
        'perpage': 10,
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   139
        'window': 'a',
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   140
        'type': 'tweet',
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   141
        'hidden': 1,
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   142
        'sort': 'date'
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   143
    }
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   144
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   145
    term = Terminal()
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   146
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   147
    try:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   148
        session = Session()
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   149
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   150
        results = None
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   151
        page = 1
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   152
        print options.query
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   153
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   154
        tr = TopsyResource(options.query, **topsy_parameters)
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   155
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   156
        move_up = 0
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   157
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   158
        for i,item in enumerate(tr):
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   159
            # get id
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   160
            url = item.get("url")
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   161
            tweet_id = url.split("/")[-1]
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   162
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   163
            if move_up > 0:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   164
                print((move_up+1)*term.move_up())
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   165
                move_up = 0
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   166
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   167
            print ("%d: %s - %r" % (i+1, tweet_id, item.get("content") ) + term.clear_eol())
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   168
            move_up += 1
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   169
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   170
            count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   171
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   172
            if count_tweet:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   173
                continue
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   174
            try:
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   175
                tweet = t.statuses.show(id=tweet_id, include_entities=True)
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   176
            except twitter.api.TwitterHTTPError as e:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   177
                if e.e.code == 404 or e.e.code == 403:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   178
                    continue
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   179
                else:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   180
                    raise
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   181
888
6fc6637d8403 update listener. add support for twitter regulation messages. update virtualenv
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 693
diff changeset
   182
            processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   183
            processor.process()
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   184
            session.flush()
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   185
            session.commit()
982
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
   186
11c1322cffe6 correct search twitter and topsy
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 888
diff changeset
   187
            print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   188
            move_up += 1
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   189
            rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   190
            rate_limit_remaining = int(tweet.rate_limit_remaining)
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   191
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   192
            if rate_limit_remaining < rate_limit_limit:
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   193
                time_to_sleep = 0
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   194
            else:
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   195
                time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   196
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   197
            for i in xrange(time_to_sleep):
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   198
                if i:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   199
                    print(2*term.move_up())
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   200
                else:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   201
                    move_up += 1
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   202
                print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   203
                time.sleep(1)
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   204
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   205
    except twitter.api.TwitterHTTPError as e:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   206
        fmt = ("." + e.format) if e.format else ""
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   207
        print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))
1137
5c757e167687 improve topsy search
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents: 982
diff changeset
   208
693
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   209
    finally:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   210
        if session:
2ef837069108 Starting 'listener_update' branch
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff changeset
   211
            session.close()