script/utils/export_twitter_alchemy.py
changeset 1153 02722ce55cf8
parent 1024 44636bcf3ea8
child 1295 03d2aa7b4967
Legend: equal | deleted | inserted | replaced
1152:8a5ed4265209 1153:02722ce55cf8
    14 import os.path
    14 import os.path
    15 import re
    15 import re
    16 import sys
    16 import sys
    17 import time
    17 import time
    18 import uuid #@UnresolvedImport
    18 import uuid #@UnresolvedImport
    19 from dateutil.parser import parse as parse_date
    19 from dateutil.parser import parse as parse_date_raw
       
    20 from dateutil.tz import tzutc
    20 import bisect
    21 import bisect
    21 
    22 
    22 #class TweetExclude(object):
    23 #class TweetExclude(object):
    23 #    def __init__(self, id):
    24 #    def __init__(self, id):
    24 #        self.id = id
    25 #        self.id = id
    26 #    def __repr__(self):
    27 #    def __repr__(self):
    27 #        return "<TweetExclude(id=%d)>" % (self.id)
    28 #        return "<TweetExclude(id=%d)>" % (self.id)
    28 
    29 
    29 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
    30 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
    30 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
    31 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
       
    32 
       
    33 def parse_date(datestr):
       
    34     res = parse_date_raw(datestr)
       
    35     if res.tzinfo is None:
       
    36         res = res.replace(tzinfo=tzutc())
       
    37     return res
    31 
    38 
    32 
    39 
    33 def re_fn(expr, item):
    40 def re_fn(expr, item):
    34     reg = re.compile(expr, re.I)
    41     reg = re.compile(expr, re.I)
    35     res = reg.search(item)
    42     res = reg.search(item)
    63     else:
    70     else:
    64         return None
    71         return None
    65 
    72 
    66 def get_options():
    73 def get_options():
    67 
    74 
    68     usage = "usage: %(prog)s [options]"
    75     parser = argparse.ArgumentParser(description="All date should be given using iso8601 format. If no timezone is used, the date is considered as UTC")
    69 
       
    70     parser = argparse.ArgumentParser(usage)
       
    71 
    76 
    72     parser.add_argument("-f", "--file", dest="filename",
    77     parser.add_argument("-f", "--file", dest="filename",
    73                       help="write export to file", metavar="FILE", default="project.ldt")
    78                       help="write export to file", metavar="FILE", default="project.ldt")
    74     parser.add_argument("-d", "--database", dest="database",
    79     parser.add_argument("-d", "--database", dest="database",
    75                       help="Input database", metavar="DATABASE")
    80                       help="Input database", metavar="DATABASE")
   136         time_params = {
   141         time_params = {
   137             'hours': int(parts[0]),
   142             'hours': int(parts[0]),
   138             'minutes': int(parts[1]),
   143             'minutes': int(parts[1]),
   139             'seconds': int(parts[2]) if len(parts)>2 else 0
   144             'seconds': int(parts[2]) if len(parts)>2 else 0
   140         }
   145         }
   141         return int(datetime.timedelta(**time_params).total_seconds()*1000)
   146         return int(round(datetime.timedelta(**time_params).total_seconds()*1000))
   142 
   147 
   143 
   148 
   144 if __name__ == "__main__" :
   149 if __name__ == "__main__" :
   145 
   150 
   146     (options, parser) = get_options()
   151     (options, parser) = get_options()
   265                 start_date_str = params.get("start_date",None)
   270                 start_date_str = params.get("start_date",None)
   266                 end_date_str = params.get("end_date", None)
   271                 end_date_str = params.get("end_date", None)
   267                 duration = params.get("duration", None)
   272                 duration = params.get("duration", None)
   268                 content_file = params.get("content_file", None)
   273                 content_file = params.get("content_file", None)
   269                 content_file_write = params.get("content_file_write", None)
   274                 content_file_write = params.get("content_file_write", None)
   270                 hashtags = params.get('hashtags', [])
   275                 hashtags = list(set(params.get('hashtags', [])))
   271 
   276 
   272                 if user_whitelist_file:
   277                 if user_whitelist_file:
   273                     with open(user_whitelist_file, 'r+') as f:
   278                     with open(user_whitelist_file, 'r+') as f:
   274                         user_whitelist = list(set([s.strip() for s in f]))
   279                         user_whitelist = list(set([s.strip() for s in f]))
   275 
   280 
   276                 start_date = None
   281                 start_date = None
   277                 ts = None
       
   278                 if start_date_str:
   282                 if start_date_str:
   279                     start_date = parse_date(start_date_str)
   283                     start_date = parse_date(start_date_str)
   280                     ts = time.mktime(start_date.timetuple())
       
   281 
       
   282 
   284 
   283                 root = None
   285                 root = None
   284                 ensemble_parent = None
   286                 ensemble_parent = None
   285 
   287 
   286                 #to do : analyse situation ldt or iri ? filename set or not ?
   288                 #to do : analyse situation ldt or iri ? filename set or not ?
   420                 query_res = query.all()
   422                 query_res = query.all()
   421 
   423 
   422 
   424 
   423                 for tw in query_res:
   425                 for tw in query_res:
   424                     tweet_ts_dt = tw.created_at
   426                     tweet_ts_dt = tw.created_at
   425                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
   427                     if tweet_ts_dt.tzinfo is None:
   426                     if ts is None:
   428                         tweet_ts_dt = tweet_ts_dt.replace(tzinfo=tzutc())
   427                         ts = tweet_ts
   429                     if start_date is None:
   428                     tweet_ts_rel = (tweet_ts-ts) * 1000
   430                         start_date = tweet_ts_dt
       
   431                     tweet_ts_rel = tweet_ts_dt-start_date
       
   432                     tweet_ts_rel_milli = int(round(tweet_ts_rel.total_seconds() * 1000))
   429                     if deltas:
   433                     if deltas:
   430                         d = find_delta(deltas, tweet_ts_rel)
   434                         d = find_delta(deltas, tweet_ts_rel_milli)
   431                         if d[1] < 0:
   435                         if d[1] < 0:
   432                             continue
   436                             continue
   433                         else :
   437                         else :
   434                             tweet_ts_rel -= d[1]
   438                             tweet_ts_rel_milli -= d[1]
   435 
   439 
   436                     username = None
   440                     username = None
   437                     profile_url = ""
   441                     profile_url = ""
   438                     if tw.user is not None:
   442                     if tw.user is not None:
   439                         username = tw.user.screen_name
   443                         username = tw.user.screen_name
   440                         profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
   444                         profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
   441                     if not username:
   445                     if not username:
   442                         username = "anon."
   446                         username = "anon."
   443 
   447 
   444                     element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
   448                     element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel_milli), u"dur":u"0", u"src":unicode(profile_url)})
   445                     etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
   449                     etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
   446                     etree.SubElement(element, u"abstract").text = unicode(tw.text)
   450                     etree.SubElement(element, u"abstract").text = unicode(tw.text)
   447 
   451 
   448                     tags_node = etree.SubElement(element, u"tags")
   452                     tags_node = etree.SubElement(element, u"tags")
   449 
   453