script/utils/search_twitter_json.py
changeset 1496 184372ec27e2
parent 1334 e1d3c1469691
child 1497 14a9bed2e3cd
equal deleted inserted replaced
1495:efbda157eb57 1496:184372ec27e2
    11 import twitter
    11 import twitter
    12 
    12 
    13 from iri_tweet import models, utils
    13 from iri_tweet import models, utils
    14 from iri_tweet.processor import TwitterProcessorStatus
    14 from iri_tweet.processor import TwitterProcessorStatus
    15 
    15 
    16 from lxml import html
       
    17 import json
    16 import json
    18 from pyquery import PyQuery
    17 from pyquery import PyQuery
    19 
    18 
    20 logger = logging.getLogger(__name__)
    19 logger = logging.getLogger(__name__)
    21 
    20 
    33         self.refresh_cursor = ''
    32         self.refresh_cursor = ''
    34         pass
    33         pass
    35 
    34 
    36     def __iter__(self):
    35     def __iter__(self):
    37 
    36 
    38         results = []
       
    39 
       
    40         while True:
    37         while True:
    41             json = self.get_json_response()
    38             json = self.get_json_response()
    42             if len(json['items_html'].strip()) == 0:
    39             if len(json['items_html'].strip()) == 0:
    43                 break
    40                 break
    44 
    41 
    49                 break
    46                 break
    50 
    47 
    51             for tweetHTML in tweets:
    48             for tweetHTML in tweets:
    52                 tweet_pq = PyQuery(tweetHTML)
    49                 tweet_pq = PyQuery(tweetHTML)
    53 
    50 
    54                 username = tweet_pq("span.username.js-action-profile-name b").text();
    51                 username = tweet_pq("span.username.js-action-profile-name b").text()
    55                 txt = re.sub(r"\s+", " ", re.sub(r"[^\x00-\x7F]", "", tweet_pq("p.js-tweet-text").text()).replace('# ', '#').replace('@ ', '@'));
    52                 txt = re.sub(r"\s+", " ", re.sub(r"[^\x00-\x7F]", "", tweet_pq("p.js-tweet-text").text()).replace('# ', '#').replace('@ ', '@'))
    56                 retweets = int(tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""));
    53                 retweets = int(tweet_pq("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
    57                 favorites = int(tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""));
    54                 favorites = int(tweet_pq("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
    58                 date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"));
    55                 date_sec = int(tweet_pq("small.time span.js-short-timestamp").attr("data-time"))
    59                 id = tweet_pq.attr("data-tweet-id");
    56                 id = tweet_pq.attr("data-tweet-id")
    60                 permalink = tweet_pq.attr("data-permalink-path");
    57                 permalink = tweet_pq.attr("data-permalink-path")
    61 
    58 
    62                 geo = ''
    59                 geo = ''
    63                 geo_span = tweet_pq('span.Tweet-geo')
    60                 geo_span = tweet_pq('span.Tweet-geo')
    64                 if len(geo_span) > 0:
    61                 if len(geo_span) > 0:
    65                     geo = geo_span.attr('title')
    62                     geo = geo_span.attr('title')
   127 
   124 
   128 if __name__ == "__main__":
   125 if __name__ == "__main__":
   129 
   126 
   130     options = get_options()
   127     options = get_options()
   131 
   128 
   132     utils.set_logging(options);
   129     utils.set_logging(options)
   133 
   130 
   134 
   131 
   135     acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
   132     acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
   136 
   133 
   137     t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
   134     t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
   138     t.secure = True
   135     t.secure = True
   139 
   136 
   140     conn_str = options.conn_str.strip()
   137     conn_str = options.conn_str.strip()
   141     if not re.match("^\w+://.+", conn_str):
   138     if not re.match(r"^\w+://.+", conn_str):
   142         conn_str = 'sqlite:///' + conn_str
   139         conn_str = 'sqlite:///' + conn_str
   143 
   140 
   144     engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
   141     engine, metadata, Session = models.setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all=True)
   145     session = None
   142     session = None
   146 
   143 
   149 
   146 
   150     try:
   147     try:
   151         session = Session()
   148         session = Session()
   152 
   149 
   153         results = None
   150         results = None
   154         print options.query
   151         print(options.query)
   155 
   152 
   156         tm = TweetManager(options.query)
   153         tm = TweetManager(options.query)
   157 
   154 
   158         move_up = 0
   155         move_up = 0
   159 
   156 
   186             processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
   183             processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
   187             processor.process()
   184             processor.process()
   188             session.flush()
   185             session.flush()
   189             session.commit()
   186             session.commit()
   190 
   187 
   191             print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers.getheader('X-Rate-Limit-Limit'))) + term.clear_eol())
   188             print("rate limit remaining %s of %s" % (str(tweet.rate_limit_remaining), str(tweet.headers['X-Rate-Limit-Limit'])) + term.clear_eol())
   192             move_up += 1
   189             move_up += 1
   193             rate_limit_limit = int(tweet.headers.getheader('X-Rate-Limit-Limit'))
   190             rate_limit_limit = int(tweet.headers['X-Rate-Limit-Limit'])
   194             rate_limit_remaining = int(tweet.rate_limit_remaining)
   191             rate_limit_remaining = int(tweet.rate_limit_remaining)
   195 
   192 
   196             if rate_limit_remaining > rate_limit_limit:
   193             if rate_limit_remaining > rate_limit_limit:
   197                 time_to_sleep = 0
   194                 time_to_sleep = 0
   198             else:
   195             else:
   199                 time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))
   196                 time_to_sleep = int(math.ceil((tweet.rate_limit_reset - time.mktime(time.gmtime())) / tweet.rate_limit_remaining))
   200 
   197 
   201             for i in xrange(time_to_sleep):
   198             for i in range(time_to_sleep):
   202                 if i:
   199                 if i:
   203                     print(2*term.move_up())
   200                     print(2*term.move_up())
   204                 else:
   201                 else:
   205                     move_up += 1
   202                     move_up += 1
   206                 print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
   203                 print(("Sleeping for %d seconds, %d remaining" % (time_to_sleep, time_to_sleep-i)) + term.clear_eol())
   207                 time.sleep(1)
   204                 time.sleep(1)
   208 
   205 
   209     except twitter.api.TwitterHTTPError as e:
   206     except twitter.api.TwitterHTTPError as e:
   210         fmt = ("." + e.format) if e.format else ""
   207         fmt = ("." + e.format) if e.format else ""
   211         print "Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data))
   208         print("Twitter sent status %s for URL: %s%s using parameters: (%s)\ndetails: %s" % (repr(e.e.code), repr(e.uri), repr(fmt), repr(e.uriparts), repr(e.response_data)))
   212 
   209 
   213     finally:
   210     finally:
   214         if session:
   211         if session:
   215             session.close()
   212             session.close()