Add the capacities to have cuts.
author Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
Mon, 02 Dec 2013 16:38:41 +0100
changeset 1023 7d87ba8cc268
parent 1022 92429e14ca48
child 1024 44636bcf3ea8
Add the capacities to have cuts.
script/utils/export_twitter_alchemy.py
--- a/script/utils/export_twitter_alchemy.py	Fri Nov 29 18:14:45 2013 +0100
+++ b/script/utils/export_twitter_alchemy.py	Mon Dec 02 16:38:41 2013 +0100
@@ -5,7 +5,7 @@
 from iri_tweet.models import setup_database, Tweet, User
 from sqlalchemy import Table, Column, BigInteger, event, bindparam
 from sqlalchemy.sql import select, func
-from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
+from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
     get_logger)
 import argparse
 import anyjson
@@ -17,29 +17,31 @@
 import time
 import uuid #@UnresolvedImport
 from dateutil.parser import parse as parse_date
+import bisect
 
 #class TweetExclude(object):
 #    def __init__(self, id):
 #        self.id = id
-#        
+#
 #    def __repr__(self):
 #        return "<TweetExclude(id=%d)>" % (self.id)
 
 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
 
-def re_fn(expr, item):    
+
+def re_fn(expr, item):
     reg = re.compile(expr, re.I)
     res = reg.search(item)
     if res:
         get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
-    return res is not None 
+    return res is not None
 
 def parse_polemics(tw, extended_mode):
     """
     parse polemics in text and return a list of polemic code. None if not polemic found
     """
-    polemics = {} 
+    polemics = {}
     for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
         pol_link = {
             '++' : u'OK',
@@ -47,26 +49,26 @@
             '??' : u'Q',
             '==' : u'REF'}[m.group(1)]
         polemics[pol_link] = pol_link
-    
+
     if extended_mode:
         if "?" in tw.text:
             polemics["Q"] = "Q"
-        
+
         for entity in tw.entity_list:
             if entity.type == "entity_url":
-                polemics["REF"] = "REF" 
-    
+                polemics["REF"] = "REF"
+
     if len(polemics) > 0:
         return polemics.keys()
     else:
         return None
 
 def get_options():
-    
+
     usage = "usage: %(prog)s [options]"
-    
+
     parser = argparse.ArgumentParser(usage)
-    
+
     parser.add_argument("-f", "--file", dest="filename",
                       help="write export to file", metavar="FILE", default="project.ldt")
     parser.add_argument("-d", "--database", dest="database",
@@ -88,7 +90,7 @@
     parser.add_argument("-C", "--color", dest="color",
                       help="Color code", metavar="COLOR", default="16763904")
     parser.add_argument("-H", "--hashtag", dest="hashtag",
-                      help="Hashtag", metavar="HASHTAG", default=[], action="append")                      
+                      help="Hashtag", metavar="HASHTAG", default=[], action="append")
     parser.add_argument("-D", "--duration", dest="duration", type=int,
                       help="Duration", metavar="DURATION", default=None)
     parser.add_argument("-n", "--name", dest="name",
@@ -103,50 +105,84 @@
                       help="Trigger polemic extended mode", default=False)
     parser.add_argument("-b", "--base-url", dest="base_url",
                       help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
-    parser.add_argument("-p", "--project", dest="project_id", 
+    parser.add_argument("-p", "--project", dest="project_id",
                       help="Project id", metavar="PROJECT_ID", default=None)
-    parser.add_argument("-P", "--post-param", dest="post_param", 
-                      help="Post param", metavar="POST_PARAM", default=None)        
+    parser.add_argument("-P", "--post-param", dest="post_param",
+                      help="Post param", metavar="POST_PARAM", default=None)
     parser.add_argument("--user-whitelist", dest="user_whitelist", action="store",
                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
-    
-    
+    parser.add_argument("--cut", dest="cuts", action="append",
+                      help="A cut with the format <ts in ms>::<duration>", metavar="CUT", default=[])
+
     set_logging_options(parser)
 
-    
     return (parser.parse_args(), parser)
 
 
+def find_delta(deltas, ts):
+    i = bisect.bisect_right(deltas, (ts+1,0))
+    if i:
+        return deltas[i-1]
+    return (0,0)
+
+
+def parse_duration(s):
+    try:
+        return int(s)
+    except ValueError:
+        parts = s.split(":")
+        if len(parts) < 2:
+            raise ValueError("Bad duration format")
+        time_params = {
+            'hours': int(parts[0]),
+            'minutes': int(parts[1]),
+            'seconds': int(parts[2]) if len(parts)>2 else 0
+        }
+        return int(datetime.timedelta(**time_params).total_seconds()*1000)
+
+
 if __name__ == "__main__" :
 
     (options, parser) = get_options()
-    
+
     set_logging(options)
-        
+
     get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
-    
+
+
+    deltas = [(0,0)]
+    total_delta = 0
+    if options.cuts:
+        cuts_raw = sorted([tuple([parse_duration(s) for s in c.split("::")]) for c in options.cuts])
+        for c, d in cuts_raw:
+            deltas.append((c+total_delta, -1))
+            total_delta += d
+            deltas.append((c+total_delta, total_delta))
+
     if len(sys.argv) == 1 or options.database is None:
         parser.print_help()
         sys.exit(1)
-    
+
     conn_str = options.database.strip()
     if not re.match("^\w+://.+", conn_str):
         conn_str = 'sqlite:///' + conn_str
 
-    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
+    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+
+
     conn = None
     try :
         conn = engine.connect()
         @event.listens_for(conn, "begin")
         def do_begin(conn):
-            conn.connection.create_function('regexp', 2, re_fn)    
+            conn.connection.create_function('regexp', 2, re_fn)
         session = None
         try :
-            session = Session(bind=conn)         
+            session = Session(bind=conn)
             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
             #mapper(TweetExclude, tweet_exclude_table)
             metadata.create_all(bind=conn, tables=[tweet_exclude_table])
-            
+
             if options.exclude and os.path.exists(options.exclude):
                 with open(options.exclude, 'r+') as f:
                     tei = tweet_exclude_table.insert()
@@ -154,7 +190,7 @@
                     for line in f:
                         res = ex_regexp.match(line.strip())
                         if res:
-                            if res.group('field') == "id":                                
+                            if res.group('field') == "id":
                                 conn.execute(tei.values(id=res.group('value')))
                             else:
                                 exclude_query = session.query(Tweet)
@@ -163,24 +199,24 @@
                                 if filter_field.startswith("user__"):
                                     exclude_query = exclude_query.outerjoin(User, Tweet.user_id==User.id)
                                     filter_obj = User
-                                    filter_field = filter_field[len("user__"):]                                    
+                                    filter_field = filter_field[len("user__"):]
 
                                 if res.group('op') == "=":
                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value'))
                                 else:
                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).op('regexp')(res.group('value')))
-                                
+
                                 test_query = select([func.count()]).where(tweet_exclude_table.c.id==bindparam('t_id'))
                                 for t in exclude_query.all():
                                     get_logger().debug("t : " + repr(t))
                                     if conn.execute(test_query, t_id=t.id).fetchone()[0] == 0:
                                         conn.execute(tei.values(id=t.id))
-                                
+
             user_whitelist_file = options.user_whitelist
             user_whitelist = None
-            
+
             if options.listconf:
-                
+
                 parameters = []
                 confdoc = etree.parse(options.listconf)
                 for node in confdoc.xpath("/twitter_export/file"):
@@ -208,7 +244,7 @@
                 if options.project_id:
                     content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
                 else:
-                    content_file = options.content_file                                          
+                    content_file = options.content_file
                 parameters = [{
                     'start_date': options.start_date,
                     'end_date' : options.end_date,
@@ -216,72 +252,72 @@
                     'content_file' : content_file,
                     'content_file_write' : content_file,
                     'hashtags' : options.hashtag,
-                    'project_id' : options.project_id 
+                    'project_id' : options.project_id
                 }]
             post_param = {}
             if options.post_param:
                 post_param = anyjson.loads(options.post_param)
 
             for params in parameters:
-                
+
                 get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
-                
+
                 start_date_str = params.get("start_date",None)
                 end_date_str = params.get("end_date", None)
                 duration = params.get("duration", None)
                 content_file = params.get("content_file", None)
                 content_file_write = params.get("content_file_write", None)
                 hashtags = params.get('hashtags', [])
-                  
+
                 if user_whitelist_file:
                     with open(user_whitelist_file, 'r+') as f:
                         user_whitelist = list(set([s.strip() for s in f]))
-                
+
                 start_date = None
                 ts = None
                 if start_date_str:
-                    start_date = parse_date(start_date_str) 
+                    start_date = parse_date(start_date_str)
                     ts = time.mktime(start_date.timetuple())
-            
-                                 
+
+
                 root = None
                 ensemble_parent = None
-                
+
                 #to do : analyse situation ldt or iri ? filename set or not ?
-                
+
                 if content_file and content_file.find("http") == 0:
-                    
+
                     get_logger().debug("url : " + content_file) #@UndefinedVariable
-                    
-                    r = requests.get(content_file, params=post_param)                    
-                    get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
+
+                    r = requests.get(content_file, params=post_param)
+                    get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
                     project = r.json()
                     text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
                     root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
-                
+
                 elif content_file and os.path.exists(content_file):
 
                     doc = etree.parse(content_file)
                     root = doc.getroot()
-                
-                content_id = None    
-                
+
+                content_id = None
+
                 if root is None:
-                
+
                     root = etree.Element(u"iri")
-                        
+
                     project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
-                
+
                     medias = etree.SubElement(root, u"medias")
                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
-                    
-                    annotations = etree.SubElement(root, u"annotations")    
+
+                    annotations = etree.SubElement(root, u"annotations")
                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
                     ensemble_parent = content
-                    
+
                     content_id = options.content_id
-                    
-                
+
+
                 if ensemble_parent is None:
                     file_type = None
                     for node in root:
@@ -291,7 +327,7 @@
                         elif node.tag == "head":
                             file_type = "iri"
                             break
-                    
+
                     if file_type == "ldt":
                         media_nodes = root.xpath("//media")
                         if len(media_nodes) > 0:
@@ -309,8 +345,8 @@
                             get_logger().info("No display node found. Will not update display")
                             display_content_node = None
                         else:
-                            display_content_node = display_nodes[0] 
-                        
+                            display_content_node = display_nodes[0]
+
                     elif file_type == "iri":
                         body_node = root.find(u"body")
                         if body_node is None:
@@ -321,45 +357,45 @@
                         ensemble_parent = ensembles_node
                         content_id = root.xpath("head/meta[@name='id']/@content")[0]
                         display_content_node = None
-                    
-                    
+
+
                 if ensemble_parent is None:
                     get_logger().error("Can not process file") #@UndefinedVariable
                     sys.exit()
-            
+
                 if options.replace:
                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
-                        ens_id = ens.get("id","") 
-                        if ens_id.startswith("tweet_"):                            
+                        ens_id = ens.get("id","")
+                        if ens_id.startswith("tweet_"):
                             ensemble_parent.remove(ens)
                             # remove in display nodes
                             if display_content_node is not None:
                                 for cut_display in display_content_node.iterchildren():
                                     if cut_display.get('idens','') == ens_id:
                                         display_content_node.remove(cut_display)
-                
+
                 ensemble = None
                 elements = None
-                
+
                 if options.merge:
                     for ens in ensemble_parent.findall(u"ensemble"):
                         if ens.get('id',"").startswith("tweet_"):
                             ensemble = ens
                             break
-                    if ensemble is not None:                            
+                    if ensemble is not None:
                         elements = ensemble.find(u".//elements")
                         decoupage = ensemble.find(u"decoupage")
-                    
+
                 if ensemble is None or elements is None:
                     ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
                     decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
-                
+
                     etree.SubElement(decoupage, u"title").text = unicode(options.name)
                     etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
-                
+
                     elements = etree.SubElement(decoupage, u"elements")
 
-                ensemble_id = ensemble.get('id', '')                
+                ensemble_id = ensemble.get('id', '')
                 decoupage_id = decoupage.get('id', '') if decoupage is not None else None
 
                 end_date = None
@@ -367,7 +403,7 @@
                     end_date = parse_date(end_date_str)
                 elif start_date and duration:
                     end_date = start_date + datetime.timedelta(seconds=duration)
-                elif start_date and options.base_url:                    
+                elif start_date and options.base_url:
                     # get duration from api
                     content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
                     r = requests.get(content_url)
@@ -376,18 +412,27 @@
                     get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
 
                     end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
-                
+
+                if end_date and deltas:
+                    end_date = end_date + datetime.timedelta(milliseconds=deltas[-1][1])
                 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
-                    
+
                 query_res = query.all()
 
-                
+
                 for tw in query_res:
                     tweet_ts_dt = tw.created_at
                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
                     if ts is None:
                         ts = tweet_ts
                     tweet_ts_rel = (tweet_ts-ts) * 1000
+                    if deltas:
+                        d = find_delta(deltas, tweet_ts_rel)
+                        if d[1] < 0:
+                            continue
+                        else :
+                            tweet_ts_rel -= d[1]
+
                     username = None
                     profile_url = ""
                     if tw.user is not None:
@@ -395,19 +440,19 @@
                         profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
                     if not username:
                         username = "anon."
-                    
+
                     element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
                     etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
                     etree.SubElement(element, u"abstract").text = unicode(tw.text)
-            
+
                     tags_node = etree.SubElement(element, u"tags")
-                    
+
                     for entity in tw.entity_list:
-                        if entity.type == u'entity_hashtag': 
+                        if entity.type == u'entity_hashtag':
                             etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
-                            
+
                     meta_element = etree.SubElement(element, u'meta')
-                    
+
                     polemics_list = parse_polemics(tw, options.extended_mode)
                     if polemics_list:
                         polemics_element = etree.Element(u'polemics')
@@ -416,15 +461,15 @@
                         meta_element.append(polemics_element)
 
                     etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
-                    
+
                 # sort by tc in
                 if options.merge :
                     # remove all elements and put them in a array
                     # sort them with tc
                     #put them back
                     elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
-                    
-                #add to display node    
+
+                #add to display node
                 if display_content_node is not None:
                     display_dec = None
                     for dec in display_content_node.iterchildren(tag=u"decoupage"):
@@ -433,15 +478,15 @@
                             break
                     if display_dec is None and ensemble_id and decoupage_id:
                         etree.SubElement(display_content_node, u"decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})
-                
-                output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
-                
+
+                output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)
+
                 if content_file_write and content_file_write.find("http") == 0:
-                    
+
                     project["ldt"] = output_data
                     project['owner'] = project['owner'].replace('%7E','~')
                     project['contents'] = [c_url.replace('%7E','~') for c_url in project['contents']]
-                          
+
                     post_param = {}
                     if options.post_param:
                         post_param = anyjson.loads(options.post_param)
@@ -458,13 +503,13 @@
                         dest_file_name = content_file_write
                     else:
                         dest_file_name = options.filename
-            
+
                     get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
                     output = open(dest_file_name, "w")
                     output.write(output_data)
                     output.flush()
                     output.close()
-                
+
         finally:
             if session:
                 session.close()