script/utils/export_twitter_alchemy.py
changeset 1153 02722ce55cf8
parent 1024 44636bcf3ea8
child 1295 03d2aa7b4967
--- a/script/utils/export_twitter_alchemy.py	Tue Oct 07 11:02:27 2014 +0200
+++ b/script/utils/export_twitter_alchemy.py	Thu Oct 16 15:13:01 2014 +0200
@@ -16,7 +16,8 @@
 import sys
 import time
 import uuid #@UnresolvedImport
-from dateutil.parser import parse as parse_date
+from dateutil.parser import parse as parse_date_raw
+from dateutil.tz import tzutc
 import bisect
 
 #class TweetExclude(object):
@@ -29,6 +30,12 @@
 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
 
+def parse_date(datestr):  # parse datestr with dateutil; the returned datetime always carries a tzinfo
+    res = parse_date_raw(datestr)
+    if res.tzinfo is None:  # input had no timezone information: treat it as UTC
+        res = res.replace(tzinfo=tzutc())  # replace() only attaches tzinfo, the clock fields are not shifted
+    return res
+
 
 def re_fn(expr, item):
     reg = re.compile(expr, re.I)
@@ -65,9 +72,7 @@
 
 def get_options():
 
-    usage = "usage: %(prog)s [options]"
-
-    parser = argparse.ArgumentParser(usage)
+    parser = argparse.ArgumentParser(description="All date should be given using iso8601 format. If no timezone is used, the date is considered as UTC")
 
     parser.add_argument("-f", "--file", dest="filename",
                       help="write export to file", metavar="FILE", default="project.ldt")
@@ -138,7 +143,7 @@
             'minutes': int(parts[1]),
             'seconds': int(parts[2]) if len(parts)>2 else 0
         }
-        return int(datetime.timedelta(**time_params).total_seconds()*1000)
+        return int(round(datetime.timedelta(**time_params).total_seconds()*1000))
 
 
 if __name__ == "__main__" :
@@ -267,18 +272,15 @@
                 duration = params.get("duration", None)
                 content_file = params.get("content_file", None)
                 content_file_write = params.get("content_file_write", None)
-                hashtags = params.get('hashtags', [])
+                hashtags = list(set(params.get('hashtags', [])))
 
                 if user_whitelist_file:
                     with open(user_whitelist_file, 'r+') as f:
                         user_whitelist = list(set([s.strip() for s in f]))
 
                 start_date = None
-                ts = None
                 if start_date_str:
                     start_date = parse_date(start_date_str)
-                    ts = time.mktime(start_date.timetuple())
-
 
                 root = None
                 ensemble_parent = None
@@ -422,16 +424,18 @@
 
                 for tw in query_res:
                     tweet_ts_dt = tw.created_at
-                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
-                    if ts is None:
-                        ts = tweet_ts
-                    tweet_ts_rel = (tweet_ts-ts) * 1000
+                    if tweet_ts_dt.tzinfo is None:
+                        tweet_ts_dt = tweet_ts_dt.replace(tzinfo=tzutc())
+                    if start_date is None:
+                        start_date = tweet_ts_dt
+                    tweet_ts_rel = tweet_ts_dt-start_date
+                    tweet_ts_rel_milli = int(round(tweet_ts_rel.total_seconds() * 1000))
                     if deltas:
-                        d = find_delta(deltas, tweet_ts_rel)
+                        d = find_delta(deltas, tweet_ts_rel_milli)
                         if d[1] < 0:
                             continue
                         else :
-                            tweet_ts_rel -= d[1]
+                            tweet_ts_rel_milli -= d[1]
 
                     username = None
                     profile_url = ""
@@ -441,7 +445,7 @@
                     if not username:
                         username = "anon."
 
-                    element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
+                    element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel_milli), u"dur":u"0", u"src":unicode(profile_url)})
                     etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
                     etree.SubElement(element, u"abstract").text = unicode(tw.text)