diff -r 8a5ed4265209 -r 02722ce55cf8 script/utils/export_twitter_alchemy.py --- a/script/utils/export_twitter_alchemy.py Tue Oct 07 11:02:27 2014 +0200 +++ b/script/utils/export_twitter_alchemy.py Thu Oct 16 15:13:01 2014 +0200 @@ -16,7 +16,8 @@ import sys import time import uuid #@UnresolvedImport -from dateutil.parser import parse as parse_date +from dateutil.parser import parse as parse_date_raw +from dateutil.tz import tzutc import bisect #class TweetExclude(object): @@ -29,6 +30,12 @@ LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/" LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/" +def parse_date(datestr): + res = parse_date_raw(datestr) + if res.tzinfo is None: + res = res.replace(tzinfo=tzutc()) + return res + def re_fn(expr, item): reg = re.compile(expr, re.I) @@ -65,9 +72,7 @@ def get_options(): - usage = "usage: %(prog)s [options]" - - parser = argparse.ArgumentParser(usage) + parser = argparse.ArgumentParser(description="All date should be given using iso8601 format. If no timezone is used, the date is considered as UTC") parser.add_argument("-f", "--file", dest="filename", help="write export to file", metavar="FILE", default="project.ldt") @@ -138,7 +143,7 @@ 'minutes': int(parts[1]), 'seconds': int(parts[2]) if len(parts)>2 else 0 } - return int(datetime.timedelta(**time_params).total_seconds()*1000) + return int(round(datetime.timedelta(**time_params).total_seconds()*1000)) if __name__ == "__main__" : @@ -267,18 +272,15 @@ duration = params.get("duration", None) content_file = params.get("content_file", None) content_file_write = params.get("content_file_write", None) - hashtags = params.get('hashtags', []) + hashtags = list(set(params.get('hashtags', []))) if user_whitelist_file: with open(user_whitelist_file, 'r+') as f: user_whitelist = list(set([s.strip() for s in f])) start_date = None - ts = None if start_date_str: start_date = parse_date(start_date_str) - ts = time.mktime(start_date.timetuple()) - root = None ensemble_parent = None @@ -422,16 +424,18 @@ for tw in query_res: tweet_ts_dt = tw.created_at - tweet_ts = int(time.mktime(tweet_ts_dt.timetuple())) - if ts is None: - ts = tweet_ts - tweet_ts_rel = (tweet_ts-ts) * 1000 + if tweet_ts_dt.tzinfo is None: + tweet_ts_dt = tweet_ts_dt.replace(tzinfo=tzutc()) + if start_date is None: + start_date = tweet_ts_dt + tweet_ts_rel = tweet_ts_dt-start_date + tweet_ts_rel_milli = int(round(tweet_ts_rel.total_seconds() * 1000)) if deltas: - d = find_delta(deltas, tweet_ts_rel) + d = find_delta(deltas, tweet_ts_rel_milli) if d[1] < 0: continue else : - tweet_ts_rel -= d[1] + tweet_ts_rel_milli -= d[1] username = None profile_url = "" @@ -441,7 +445,7 @@ if not username: username = "anon." - element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)}) + element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel_milli), u"dur":u"0", u"src":unicode(profile_url)}) etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text) etree.SubElement(element, u"abstract").text = unicode(tw.text)