script/utils/search_twitter_api.py
changeset 1497 14a9bed2e3cd
parent 1496 184372ec27e2
child 1523 53f1b28188f0
equal deleted inserted replaced
1496:184372ec27e2 1497:14a9bed2e3cd
     1 import argparse
     1 import argparse
       
     2 import datetime
       
     3 import functools
       
     4 import json
     2 import logging
     5 import logging
     3 import math
     6 import math
     4 import re
     7 import re
     5 import time
     8 import time
     6 import datetime
       
     7 import urllib
     9 import urllib
       
    10 from enum import Enum
     8 
    11 
     9 from blessings import Terminal
       
    10 import requests
    12 import requests
    11 import twitter
    13 import twitter
       
    14 from blessings import Terminal
    12 
    15 
    13 from iri_tweet import models, utils
    16 from iri_tweet import models, utils
    14 from iri_tweet.processor import TwitterProcessorStatus
    17 from iri_tweet.processor import TwitterProcessorStatus
    15 
       
    16 import json
       
    17 
    18 
    18 logger = logging.getLogger(__name__)
    19 logger = logging.getLogger(__name__)
    19 
    20 
    20 APPLICATION_NAME = "Tweet seach json"
    21 APPLICATION_NAME = "Tweet seach json"
    21 
    22 
       
    23 
       
    24 class SearchType(Enum):
       
    25     standard = 'standard'
       
    26     _30day = '30day'
       
    27     full = 'full'
       
    28 
       
    29     def __str__(self):
       
    30         return self.value
       
    31 
       
    32 def pass_kwargs_as_json(f):
       
    33     def kwargs_json_wrapper(*args, **kwargs):
       
    34         normal_kwargs = { k:v for k,v in kwargs.items() if k[0] != "_" }
       
    35         special_kwargs = { k:v for k,v in kwargs.items() if k[0] == "_" }
       
    36         new_kwargs = { **special_kwargs, '_json': normal_kwargs }
       
    37         return f(*args, **new_kwargs)
       
    38     return kwargs_json_wrapper
    22 
    39 
    23 # TODO: implement some more parameters
    40 # TODO: implement some more parameters
    24 # script to "scrap twitter results"
    41 # script to "scrap twitter results"
    25 # Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
    42 # Shamelessly taken from https://github.com/Jefferson-Henrique/GetOldTweets-python
    26 # pyquery cssselect
    43 # pyquery cssselect
    27 class TweetManager:
    44 class TweetManager:
    28 
    45 
    29     def __init__(self, query, twitter_con):
    46     def __init__(self, twitter_con, query, search_type, api_env):
    30         self.query = query
    47         self.query = query
    31         self.max_id = 0
    48         self.search_type = search_type
       
    49         self.next = ""
    32         self.t = twitter_con
    50         self.t = twitter_con
    33         pass
    51         self.api_env = api_env
       
    52         self.twitter_api = self.get_twitter_api()
       
    53         self.rate_limit_remaining = 0
       
    54         self.rate_limit_limit = 0
       
    55         self.rate_limit_reset = 0
       
    56         self.i = 0
       
    57 
       
    58     def get_twitter_api(self):
       
    59         return {
       
    60             SearchType.standard: lambda t: t.search.tweets,
       
    61             SearchType._30day:   lambda t: pass_kwargs_as_json(functools.partial(getattr(getattr(t.tweets.search,'30day'),self.api_env), _method="POST")),
       
    62             SearchType.full:     lambda t: pass_kwargs_as_json(functools.partial(getattr(t.tweets.search.fullarchive, self.api_env), _method="POST")),
       
    63         }[self.search_type](self.t)
    34 
    64 
    35     def __iter__(self):
    65     def __iter__(self):
    36         while True:
    66         while True:
    37             if self.max_id < 0:
    67             if self.next is None:
    38                 break
    68                 break
    39             json = self.get_json_response()
    69             self.i = self.i+1
    40 
    70 
    41             next_results = json['search_metadata'].get('next_results', "?")[1:]
    71             # with open("json_dump_%s.json" % self.i, 'r') as fp:
    42             self.max_id = int(urllib.parse.parse_qs(next_results).get('max_id', [-1])[0])
    72             #     jsondata = json.load(fp)
       
    73             jsondata = self.get_json_response()
    43 
    74 
    44             tweet_list = json['statuses']
    75             self.rate_limit_remaining = jsondata.rate_limit_remaining
       
    76             self.rate_limit_limit = jsondata.rate_limit_limit
       
    77             self.rate_limit_reset = jsondata.rate_limit_reset
       
    78 
       
    79             with open("json_dump_%s.json" % self.i, 'w') as fp:
       
    80                 json.dump(jsondata, fp)
       
    81 
       
    82             if self.search_type == SearchType.standard:
       
    83                 next_results = jsondata['search_metadata'].get('next_results', "?")[1:]
       
    84                 self.next = urllib.parse.parse_qs(next_results).get('max_id', [None])[0]
       
    85                 tweet_list = jsondata['statuses']
       
    86             else:
       
    87                 self.next = jsondata.get('next')
       
    88                 tweet_list = jsondata['results']
    45 
    89 
    46             if len(tweet_list) == 0:
    90             if len(tweet_list) == 0:
    47                 break
    91                 break
    48 
    92 
    49             for tweet in tweet_list:
    93             for tweet in tweet_list:
    50                 yield tweet
    94                 yield tweet
    51 
    95 
    52     def get_json_response(self):
    96     def get_json_response(self):
    53         return self.t.search.tweets(q=self.query, include_entities=True, max_id=self.max_id)
    97         if self.search_type == SearchType.standard:
    54 
    98             return self.twitter_api(q=self.query, include_entities=True, max_id=int(self.next) if self.next else 0)
       
    99         else:
       
   100             kwargs = { "query": self.query, "maxResults": 100 }
       
   101             if self.next:
       
   102                 kwargs["next"] = self.next
       
   103             return self.twitter_api(**kwargs)
    55 
   104 
    56 def get_options():
   105 def get_options():
    57 
   106 
    58     usage = "usage: %(prog)s [options] <connection_str_or_filepath>"
   107     usage = "usage: %(prog)s [options] <connection_str_or_filepath>"
    59 
   108 
    60     parser = argparse.ArgumentParser(usage=usage)
   109     parser = argparse.ArgumentParser(usage=usage)
    61 
   110 
    62     parser.add_argument(dest="conn_str",
   111     parser.add_argument(dest="conn_str",
    63                         help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
   112                         help="write tweet to DATABASE. This is a connection string", metavar="CONNECTION_STR")
    64     parser.add_argument("-Q", dest="query",
   113     parser.add_argument("-Q", dest="query",
    65                       help="query", metavar="QUERY")
   114                         help="query", metavar="QUERY")
    66     parser.add_argument("-k", "--key", dest="consumer_key",
   115     parser.add_argument("-k", "--key", dest="consumer_key",
    67                         help="Twitter consumer key", metavar="CONSUMER_KEY")
   116                         help="Twitter consumer key", metavar="CONSUMER_KEY")
    68     parser.add_argument("-s", "--secret", dest="consumer_secret",
   117     parser.add_argument("-s", "--secret", dest="consumer_secret",
    69                         help="Twitter consumer secret", metavar="CONSUMER_SECRET")
   118                         help="Twitter consumer secret", metavar="CONSUMER_SECRET")
    70     parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
   119     parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
    71                       help="Token file name")
   120                         help="Token file name")
       
   121     parser.add_argument("-a", dest="search_type", metavar="SEARCH_TYPE", default=SearchType.standard, choices=list(SearchType), type=SearchType,
       
   122                         help="Twitter search type ('standard', '30days', 'full')")
       
   123     parser.add_argument("-e", dest="api_env", metavar="API_ENV", default="dev",
       
   124                         help="Twitter api dev environment")
       
   125 
    72 
   126 
    73     utils.set_logging_options(parser)
   127     utils.set_logging_options(parser)
    74 
   128 
    75     return parser.parse_args()
   129     return parser.parse_args()
    76 
   130 
    77 
   131 
    78 
       
    79 if __name__ == "__main__":
   132 if __name__ == "__main__":
    80 
   133 
    81     options = get_options()
   134     options = get_options()
    82 
   135 
       
   136     print("the search type is : %s" % options.search_type)
       
   137 
    83     utils.set_logging(options)
   138     utils.set_logging(options)
    84 
   139 
       
   140     bearer_token = utils.get_oauth2_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
       
   141     twitter_auth = twitter.OAuth2(options.consumer_key, options.consumer_secret, bearer_token)
    85 
   142 
    86     acess_token_key, access_token_secret = utils.get_oauth_token(consumer_key=options.consumer_key, consumer_secret=options.consumer_secret, token_file_path=options.token_filename, application_name=APPLICATION_NAME)
   143     t = twitter.Twitter(domain="api.twitter.com", auth=twitter_auth, secure=True)
    87 
   144     t.secure = True
    88     t = twitter.Twitter(domain="api.twitter.com", auth=twitter.OAuth(acess_token_key, access_token_secret, options.consumer_key, options.consumer_secret), secure=True)
       
    89     t.secure = True    
       
    90 
   145 
    91     conn_str = options.conn_str.strip()
   146     conn_str = options.conn_str.strip()
    92     if not re.match(r"^\w+://.+", conn_str):
   147     if not re.match(r"^\w+://.+", conn_str):
    93         conn_str = 'sqlite:///' + conn_str
   148         conn_str = 'sqlite:///' + conn_str
    94 
   149 
   102         session = Session()
   157         session = Session()
   103 
   158 
   104         results = None
   159         results = None
   105         print(options.query)
   160         print(options.query)
   106 
   161 
   107         tm = TweetManager(options.query, t)
   162         tm = TweetManager(t, options.query, options.search_type, options.api_env)
   108 
   163 
   109         move_up = 0
   164         move_up = 0
   110 
   165 
   111         for i,tweet in enumerate(tm):
   166         for i,tweet in enumerate(tm):
   112             # get id
   167             # get id
   125             count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
   180             count_tweet = session.query(models.Tweet).filter_by(id_str=tweet_id).count()
   126 
   181 
   127             if count_tweet:
   182             if count_tweet:
   128                 continue
   183                 continue
   129 
   184 
   130             processor = TwitterProcessorStatus(tweet, None, None, session, None, options.token_filename, logger)
   185             processor = TwitterProcessorStatus(tweet, None, None, session, twitter_auth=twitter_auth, logger=logger)
   131             processor.process()
   186             processor.process()
   132             session.flush()
   187             session.flush()
   133             session.commit()
   188             session.commit()
   134 
   189 
   135     except twitter.api.TwitterHTTPError as e:
   190     except twitter.api.TwitterHTTPError as e: