script/utils/export_twitter_alchemy.py
changeset 1023 7d87ba8cc268
parent 957 e4d0094f097b
child 1024 44636bcf3ea8
equal deleted inserted replaced
1022:92429e14ca48 1023:7d87ba8cc268
     3 
     3 
     4 from lxml import etree
     4 from lxml import etree
     5 from iri_tweet.models import setup_database, Tweet, User
     5 from iri_tweet.models import setup_database, Tweet, User
     6 from sqlalchemy import Table, Column, BigInteger, event, bindparam
     6 from sqlalchemy import Table, Column, BigInteger, event, bindparam
     7 from sqlalchemy.sql import select, func
     7 from sqlalchemy.sql import select, func
     8 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
     8 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
     9     get_logger)
     9     get_logger)
    10 import argparse
    10 import argparse
    11 import anyjson
    11 import anyjson
    12 import datetime
    12 import datetime
    13 import requests
    13 import requests
    15 import re
    15 import re
    16 import sys
    16 import sys
    17 import time
    17 import time
    18 import uuid #@UnresolvedImport
    18 import uuid #@UnresolvedImport
    19 from dateutil.parser import parse as parse_date
    19 from dateutil.parser import parse as parse_date
       
    20 import bisect
    20 
    21 
    21 #class TweetExclude(object):
    22 #class TweetExclude(object):
    22 #    def __init__(self, id):
    23 #    def __init__(self, id):
    23 #        self.id = id
    24 #        self.id = id
    24 #        
    25 #
    25 #    def __repr__(self):
    26 #    def __repr__(self):
    26 #        return "<TweetExclude(id=%d)>" % (self.id)
    27 #        return "<TweetExclude(id=%d)>" % (self.id)
    27 
    28 
    28 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
    29 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
    29 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
    30 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
    30 
    31 
    31 def re_fn(expr, item):    
    32 
       
    33 def re_fn(expr, item):
    32     reg = re.compile(expr, re.I)
    34     reg = re.compile(expr, re.I)
    33     res = reg.search(item)
    35     res = reg.search(item)
    34     if res:
    36     if res:
    35         get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
    37         get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
    36     return res is not None 
    38     return res is not None
    37 
    39 
    38 def parse_polemics(tw, extended_mode):
    40 def parse_polemics(tw, extended_mode):
    39     """
    41     """
    40     parse polemics in text and return a list of polemic code. None if not polemic found
    42     parse polemics in text and return a list of polemic code. None if not polemic found
    41     """
    43     """
    42     polemics = {} 
    44     polemics = {}
    43     for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
    45     for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
    44         pol_link = {
    46         pol_link = {
    45             '++' : u'OK',
    47             '++' : u'OK',
    46             '--' : u'KO',
    48             '--' : u'KO',
    47             '??' : u'Q',
    49             '??' : u'Q',
    48             '==' : u'REF'}[m.group(1)]
    50             '==' : u'REF'}[m.group(1)]
    49         polemics[pol_link] = pol_link
    51         polemics[pol_link] = pol_link
    50     
    52 
    51     if extended_mode:
    53     if extended_mode:
    52         if "?" in tw.text:
    54         if "?" in tw.text:
    53             polemics["Q"] = "Q"
    55             polemics["Q"] = "Q"
    54         
    56 
    55         for entity in tw.entity_list:
    57         for entity in tw.entity_list:
    56             if entity.type == "entity_url":
    58             if entity.type == "entity_url":
    57                 polemics["REF"] = "REF" 
    59                 polemics["REF"] = "REF"
    58     
    60 
    59     if len(polemics) > 0:
    61     if len(polemics) > 0:
    60         return polemics.keys()
    62         return polemics.keys()
    61     else:
    63     else:
    62         return None
    64         return None
    63 
    65 
    64 def get_options():
    66 def get_options():
    65     
    67 
    66     usage = "usage: %(prog)s [options]"
    68     usage = "usage: %(prog)s [options]"
    67     
    69 
    68     parser = argparse.ArgumentParser(usage)
    70     parser = argparse.ArgumentParser(usage)
    69     
    71 
    70     parser.add_argument("-f", "--file", dest="filename",
    72     parser.add_argument("-f", "--file", dest="filename",
    71                       help="write export to file", metavar="FILE", default="project.ldt")
    73                       help="write export to file", metavar="FILE", default="project.ldt")
    72     parser.add_argument("-d", "--database", dest="database",
    74     parser.add_argument("-d", "--database", dest="database",
    73                       help="Input database", metavar="DATABASE")
    75                       help="Input database", metavar="DATABASE")
    74     parser.add_argument("-s", "--start-date", dest="start_date",
    76     parser.add_argument("-s", "--start-date", dest="start_date",
    86     parser.add_argument("-x", "--exclude", dest="exclude",
    88     parser.add_argument("-x", "--exclude", dest="exclude",
    87                       help="file containing the id to exclude", metavar="EXCLUDE")
    89                       help="file containing the id to exclude", metavar="EXCLUDE")
    88     parser.add_argument("-C", "--color", dest="color",
    90     parser.add_argument("-C", "--color", dest="color",
    89                       help="Color code", metavar="COLOR", default="16763904")
    91                       help="Color code", metavar="COLOR", default="16763904")
    90     parser.add_argument("-H", "--hashtag", dest="hashtag",
    92     parser.add_argument("-H", "--hashtag", dest="hashtag",
    91                       help="Hashtag", metavar="HASHTAG", default=[], action="append")                      
    93                       help="Hashtag", metavar="HASHTAG", default=[], action="append")
    92     parser.add_argument("-D", "--duration", dest="duration", type=int,
    94     parser.add_argument("-D", "--duration", dest="duration", type=int,
    93                       help="Duration", metavar="DURATION", default=None)
    95                       help="Duration", metavar="DURATION", default=None)
    94     parser.add_argument("-n", "--name", dest="name",
    96     parser.add_argument("-n", "--name", dest="name",
    95                       help="Cutting name", metavar="NAME", default=u"Tweets")
    97                       help="Cutting name", metavar="NAME", default=u"Tweets")
    96     parser.add_argument("-R", "--replace", dest="replace", action="store_true",
    98     parser.add_argument("-R", "--replace", dest="replace", action="store_true",
   101                       help="list of file to process", metavar="LIST_CONF", default=None)
   103                       help="list of file to process", metavar="LIST_CONF", default=None)
   102     parser.add_argument("-E", "--extended", dest="extended_mode", action="store_true",
   104     parser.add_argument("-E", "--extended", dest="extended_mode", action="store_true",
   103                       help="Trigger polemic extended mode", default=False)
   105                       help="Trigger polemic extended mode", default=False)
   104     parser.add_argument("-b", "--base-url", dest="base_url",
   106     parser.add_argument("-b", "--base-url", dest="base_url",
   105                       help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
   107                       help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
   106     parser.add_argument("-p", "--project", dest="project_id", 
   108     parser.add_argument("-p", "--project", dest="project_id",
   107                       help="Project id", metavar="PROJECT_ID", default=None)
   109                       help="Project id", metavar="PROJECT_ID", default=None)
   108     parser.add_argument("-P", "--post-param", dest="post_param", 
   110     parser.add_argument("-P", "--post-param", dest="post_param",
   109                       help="Post param", metavar="POST_PARAM", default=None)        
   111                       help="Post param", metavar="POST_PARAM", default=None)
   110     parser.add_argument("--user-whitelist", dest="user_whitelist", action="store",
   112     parser.add_argument("--user-whitelist", dest="user_whitelist", action="store",
   111                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
   113                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
   112     
   114     parser.add_argument("--cut", dest="cuts", action="append",
   113     
   115                       help="A cut with the forma <ts in ms>::<duration>", metavar="CUT", default=[])
       
   116 
   114     set_logging_options(parser)
   117     set_logging_options(parser)
   115 
   118 
   116     
       
   117     return (parser.parse_args(), parser)
   119     return (parser.parse_args(), parser)
   118 
   120 
   119 
   121 
       
   122 def find_delta(deltas, ts):
       
   123     i = bisect.bisect_right(deltas, (ts+1,0))
       
   124     if i:
       
   125         return deltas[i-1]
       
   126     return (0,0)
       
   127 
       
   128 
       
   129 def parse_duration(s):
       
   130     try:
       
   131         return int(s)
       
   132     except ValueError:
       
   133         parts = s.split(":")
       
   134         if len(parts) < 2:
       
   135             raise ValueError("Bad duration format")
       
   136         time_params = {
       
   137             'hours': int(parts[0]),
       
   138             'minutes': int(parts[1]),
       
   139             'seconds': int(parts[2]) if len(parts)>2 else 0
       
   140         }
       
   141         return int(datetime.timedelta(**time_params).total_seconds()*1000)
       
   142 
       
   143 
   120 if __name__ == "__main__" :
   144 if __name__ == "__main__" :
   121 
   145 
   122     (options, parser) = get_options()
   146     (options, parser) = get_options()
   123     
   147 
   124     set_logging(options)
   148     set_logging(options)
   125         
   149 
   126     get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
   150     get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
   127     
   151 
       
   152 
       
   153     deltas = [(0,0)]
       
   154     total_delta = 0
       
   155     if options.cuts:
       
   156         cuts_raw = sorted([tuple([parse_duration(s) for s in c.split("::")]) for c in options.cuts])
       
   157         for c, d in cuts_raw:
       
   158             deltas.append((c+total_delta, -1))
       
   159             total_delta += d
       
   160             deltas.append((c+total_delta, total_delta))
       
   161 
   128     if len(sys.argv) == 1 or options.database is None:
   162     if len(sys.argv) == 1 or options.database is None:
   129         parser.print_help()
   163         parser.print_help()
   130         sys.exit(1)
   164         sys.exit(1)
   131     
   165 
   132     conn_str = options.database.strip()
   166     conn_str = options.database.strip()
   133     if not re.match("^\w+://.+", conn_str):
   167     if not re.match("^\w+://.+", conn_str):
   134         conn_str = 'sqlite:///' + conn_str
   168         conn_str = 'sqlite:///' + conn_str
   135 
   169 
   136     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
   170     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
       
   171 
       
   172 
   137     conn = None
   173     conn = None
   138     try :
   174     try :
   139         conn = engine.connect()
   175         conn = engine.connect()
   140         @event.listens_for(conn, "begin")
   176         @event.listens_for(conn, "begin")
   141         def do_begin(conn):
   177         def do_begin(conn):
   142             conn.connection.create_function('regexp', 2, re_fn)    
   178             conn.connection.create_function('regexp', 2, re_fn)
   143         session = None
   179         session = None
   144         try :
   180         try :
   145             session = Session(bind=conn)         
   181             session = Session(bind=conn)
   146             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
   182             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
   147             #mapper(TweetExclude, tweet_exclude_table)
   183             #mapper(TweetExclude, tweet_exclude_table)
   148             metadata.create_all(bind=conn, tables=[tweet_exclude_table])
   184             metadata.create_all(bind=conn, tables=[tweet_exclude_table])
   149             
   185 
   150             if options.exclude and os.path.exists(options.exclude):
   186             if options.exclude and os.path.exists(options.exclude):
   151                 with open(options.exclude, 'r+') as f:
   187                 with open(options.exclude, 'r+') as f:
   152                     tei = tweet_exclude_table.insert()
   188                     tei = tweet_exclude_table.insert()
   153                     ex_regexp = re.compile("(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)
   189                     ex_regexp = re.compile("(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)
   154                     for line in f:
   190                     for line in f:
   155                         res = ex_regexp.match(line.strip())
   191                         res = ex_regexp.match(line.strip())
   156                         if res:
   192                         if res:
   157                             if res.group('field') == "id":                                
   193                             if res.group('field') == "id":
   158                                 conn.execute(tei.values(id=res.group('value')))
   194                                 conn.execute(tei.values(id=res.group('value')))
   159                             else:
   195                             else:
   160                                 exclude_query = session.query(Tweet)
   196                                 exclude_query = session.query(Tweet)
   161                                 filter_obj = Tweet
   197                                 filter_obj = Tweet
   162                                 filter_field = res.group('field')
   198                                 filter_field = res.group('field')
   163                                 if filter_field.startswith("user__"):
   199                                 if filter_field.startswith("user__"):
   164                                     exclude_query = exclude_query.outerjoin(User, Tweet.user_id==User.id)
   200                                     exclude_query = exclude_query.outerjoin(User, Tweet.user_id==User.id)
   165                                     filter_obj = User
   201                                     filter_obj = User
   166                                     filter_field = filter_field[len("user__"):]                                    
   202                                     filter_field = filter_field[len("user__"):]
   167 
   203 
   168                                 if res.group('op') == "=":
   204                                 if res.group('op') == "=":
   169                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value'))
   205                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value'))
   170                                 else:
   206                                 else:
   171                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).op('regexp')(res.group('value')))
   207                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).op('regexp')(res.group('value')))
   172                                 
   208 
   173                                 test_query = select([func.count()]).where(tweet_exclude_table.c.id==bindparam('t_id'))
   209                                 test_query = select([func.count()]).where(tweet_exclude_table.c.id==bindparam('t_id'))
   174                                 for t in exclude_query.all():
   210                                 for t in exclude_query.all():
   175                                     get_logger().debug("t : " + repr(t))
   211                                     get_logger().debug("t : " + repr(t))
   176                                     if conn.execute(test_query, t_id=t.id).fetchone()[0] == 0:
   212                                     if conn.execute(test_query, t_id=t.id).fetchone()[0] == 0:
   177                                         conn.execute(tei.values(id=t.id))
   213                                         conn.execute(tei.values(id=t.id))
   178                                 
   214 
   179             user_whitelist_file = options.user_whitelist
   215             user_whitelist_file = options.user_whitelist
   180             user_whitelist = None
   216             user_whitelist = None
   181             
   217 
   182             if options.listconf:
   218             if options.listconf:
   183                 
   219 
   184                 parameters = []
   220                 parameters = []
   185                 confdoc = etree.parse(options.listconf)
   221                 confdoc = etree.parse(options.listconf)
   186                 for node in confdoc.xpath("/twitter_export/file"):
   222                 for node in confdoc.xpath("/twitter_export/file"):
   187                     params = {}
   223                     params = {}
   188                     for snode in node:
   224                     for snode in node:
   206                     parameters.append(params)
   242                     parameters.append(params)
   207             else:
   243             else:
   208                 if options.project_id:
   244                 if options.project_id:
   209                     content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
   245                     content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
   210                 else:
   246                 else:
   211                     content_file = options.content_file                                          
   247                     content_file = options.content_file
   212                 parameters = [{
   248                 parameters = [{
   213                     'start_date': options.start_date,
   249                     'start_date': options.start_date,
   214                     'end_date' : options.end_date,
   250                     'end_date' : options.end_date,
   215                     'duration' : options.duration,
   251                     'duration' : options.duration,
   216                     'content_file' : content_file,
   252                     'content_file' : content_file,
   217                     'content_file_write' : content_file,
   253                     'content_file_write' : content_file,
   218                     'hashtags' : options.hashtag,
   254                     'hashtags' : options.hashtag,
   219                     'project_id' : options.project_id 
   255                     'project_id' : options.project_id
   220                 }]
   256                 }]
   221             post_param = {}
   257             post_param = {}
   222             if options.post_param:
   258             if options.post_param:
   223                 post_param = anyjson.loads(options.post_param)
   259                 post_param = anyjson.loads(options.post_param)
   224 
   260 
   225             for params in parameters:
   261             for params in parameters:
   226                 
   262 
   227                 get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
   263                 get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
   228                 
   264 
   229                 start_date_str = params.get("start_date",None)
   265                 start_date_str = params.get("start_date",None)
   230                 end_date_str = params.get("end_date", None)
   266                 end_date_str = params.get("end_date", None)
   231                 duration = params.get("duration", None)
   267                 duration = params.get("duration", None)
   232                 content_file = params.get("content_file", None)
   268                 content_file = params.get("content_file", None)
   233                 content_file_write = params.get("content_file_write", None)
   269                 content_file_write = params.get("content_file_write", None)
   234                 hashtags = params.get('hashtags', [])
   270                 hashtags = params.get('hashtags', [])
   235                   
   271 
   236                 if user_whitelist_file:
   272                 if user_whitelist_file:
   237                     with open(user_whitelist_file, 'r+') as f:
   273                     with open(user_whitelist_file, 'r+') as f:
   238                         user_whitelist = list(set([s.strip() for s in f]))
   274                         user_whitelist = list(set([s.strip() for s in f]))
   239                 
   275 
   240                 start_date = None
   276                 start_date = None
   241                 ts = None
   277                 ts = None
   242                 if start_date_str:
   278                 if start_date_str:
   243                     start_date = parse_date(start_date_str) 
   279                     start_date = parse_date(start_date_str)
   244                     ts = time.mktime(start_date.timetuple())
   280                     ts = time.mktime(start_date.timetuple())
   245             
   281 
   246                                  
   282 
   247                 root = None
   283                 root = None
   248                 ensemble_parent = None
   284                 ensemble_parent = None
   249                 
   285 
   250                 #to do : analyse situation ldt or iri ? filename set or not ?
   286                 #to do : analyse situation ldt or iri ? filename set or not ?
   251                 
   287 
   252                 if content_file and content_file.find("http") == 0:
   288                 if content_file and content_file.find("http") == 0:
   253                     
   289 
   254                     get_logger().debug("url : " + content_file) #@UndefinedVariable
   290                     get_logger().debug("url : " + content_file) #@UndefinedVariable
   255                     
   291 
   256                     r = requests.get(content_file, params=post_param)                    
   292                     r = requests.get(content_file, params=post_param)
   257                     get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
   293                     get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
   258                     project = r.json()
   294                     project = r.json()
   259                     text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
   295                     text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
   260                     root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
   296                     root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
   261                 
   297 
   262                 elif content_file and os.path.exists(content_file):
   298                 elif content_file and os.path.exists(content_file):
   263 
   299 
   264                     doc = etree.parse(content_file)
   300                     doc = etree.parse(content_file)
   265                     root = doc.getroot()
   301                     root = doc.getroot()
   266                 
   302 
   267                 content_id = None    
   303                 content_id = None
   268                 
   304 
   269                 if root is None:
   305                 if root is None:
   270                 
   306 
   271                     root = etree.Element(u"iri")
   307                     root = etree.Element(u"iri")
   272                         
   308 
   273                     project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
   309                     project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
   274                 
   310 
   275                     medias = etree.SubElement(root, u"medias")
   311                     medias = etree.SubElement(root, u"medias")
   276                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   312                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   277                     
   313 
   278                     annotations = etree.SubElement(root, u"annotations")    
   314                     annotations = etree.SubElement(root, u"annotations")
   279                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
   315                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
   280                     ensemble_parent = content
   316                     ensemble_parent = content
   281                     
   317 
   282                     content_id = options.content_id
   318                     content_id = options.content_id
   283                     
   319 
   284                 
   320 
   285                 if ensemble_parent is None:
   321                 if ensemble_parent is None:
   286                     file_type = None
   322                     file_type = None
   287                     for node in root:
   323                     for node in root:
   288                         if node.tag == "project":
   324                         if node.tag == "project":
   289                             file_type = "ldt"
   325                             file_type = "ldt"
   290                             break
   326                             break
   291                         elif node.tag == "head":
   327                         elif node.tag == "head":
   292                             file_type = "iri"
   328                             file_type = "iri"
   293                             break
   329                             break
   294                     
   330 
   295                     if file_type == "ldt":
   331                     if file_type == "ldt":
   296                         media_nodes = root.xpath("//media")
   332                         media_nodes = root.xpath("//media")
   297                         if len(media_nodes) > 0:
   333                         if len(media_nodes) > 0:
   298                             media = media_nodes[0]
   334                             media = media_nodes[0]
   299                         annotations_node = root.find(u"annotations")
   335                         annotations_node = root.find(u"annotations")
   307                         display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id)
   343                         display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id)
   308                         if len(display_nodes) == 0:
   344                         if len(display_nodes) == 0:
   309                             get_logger().info("No display node found. Will not update display")
   345                             get_logger().info("No display node found. Will not update display")
   310                             display_content_node = None
   346                             display_content_node = None
   311                         else:
   347                         else:
   312                             display_content_node = display_nodes[0] 
   348                             display_content_node = display_nodes[0]
   313                         
   349 
   314                     elif file_type == "iri":
   350                     elif file_type == "iri":
   315                         body_node = root.find(u"body")
   351                         body_node = root.find(u"body")
   316                         if body_node is None:
   352                         if body_node is None:
   317                             body_node = etree.SubElement(root, u"body")
   353                             body_node = etree.SubElement(root, u"body")
   318                         ensembles_node = body_node.find(u"ensembles")
   354                         ensembles_node = body_node.find(u"ensembles")
   319                         if ensembles_node is None:
   355                         if ensembles_node is None:
   320                             ensembles_node = etree.SubElement(body_node, u"ensembles")
   356                             ensembles_node = etree.SubElement(body_node, u"ensembles")
   321                         ensemble_parent = ensembles_node
   357                         ensemble_parent = ensembles_node
   322                         content_id = root.xpath("head/meta[@name='id']/@content")[0]
   358                         content_id = root.xpath("head/meta[@name='id']/@content")[0]
   323                         display_content_node = None
   359                         display_content_node = None
   324                     
   360 
   325                     
   361 
   326                 if ensemble_parent is None:
   362                 if ensemble_parent is None:
   327                     get_logger().error("Can not process file") #@UndefinedVariable
   363                     get_logger().error("Can not process file") #@UndefinedVariable
   328                     sys.exit()
   364                     sys.exit()
   329             
   365 
   330                 if options.replace:
   366                 if options.replace:
   331                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
   367                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
   332                         ens_id = ens.get("id","") 
   368                         ens_id = ens.get("id","")
   333                         if ens_id.startswith("tweet_"):                            
   369                         if ens_id.startswith("tweet_"):
   334                             ensemble_parent.remove(ens)
   370                             ensemble_parent.remove(ens)
   335                             # remove in display nodes
   371                             # remove in display nodes
   336                             if display_content_node is not None:
   372                             if display_content_node is not None:
   337                                 for cut_display in display_content_node.iterchildren():
   373                                 for cut_display in display_content_node.iterchildren():
   338                                     if cut_display.get('idens','') == ens_id:
   374                                     if cut_display.get('idens','') == ens_id:
   339                                         display_content_node.remove(cut_display)
   375                                         display_content_node.remove(cut_display)
   340                 
   376 
   341                 ensemble = None
   377                 ensemble = None
   342                 elements = None
   378                 elements = None
   343                 
   379 
   344                 if options.merge:
   380                 if options.merge:
   345                     for ens in ensemble_parent.findall(u"ensemble"):
   381                     for ens in ensemble_parent.findall(u"ensemble"):
   346                         if ens.get('id',"").startswith("tweet_"):
   382                         if ens.get('id',"").startswith("tweet_"):
   347                             ensemble = ens
   383                             ensemble = ens
   348                             break
   384                             break
   349                     if ensemble is not None:                            
   385                     if ensemble is not None:
   350                         elements = ensemble.find(u".//elements")
   386                         elements = ensemble.find(u".//elements")
   351                         decoupage = ensemble.find(u"decoupage")
   387                         decoupage = ensemble.find(u"decoupage")
   352                     
   388 
   353                 if ensemble is None or elements is None:
   389                 if ensemble is None or elements is None:
   354                     ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
   390                     ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
   355                     decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
   391                     decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
   356                 
   392 
   357                     etree.SubElement(decoupage, u"title").text = unicode(options.name)
   393                     etree.SubElement(decoupage, u"title").text = unicode(options.name)
   358                     etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
   394                     etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
   359                 
   395 
   360                     elements = etree.SubElement(decoupage, u"elements")
   396                     elements = etree.SubElement(decoupage, u"elements")
   361 
   397 
   362                 ensemble_id = ensemble.get('id', '')                
   398                 ensemble_id = ensemble.get('id', '')
   363                 decoupage_id = decoupage.get('id', '') if decoupage is not None else None
   399                 decoupage_id = decoupage.get('id', '') if decoupage is not None else None
   364 
   400 
   365                 end_date = None
   401                 end_date = None
   366                 if end_date_str:
   402                 if end_date_str:
   367                     end_date = parse_date(end_date_str)
   403                     end_date = parse_date(end_date_str)
   368                 elif start_date and duration:
   404                 elif start_date and duration:
   369                     end_date = start_date + datetime.timedelta(seconds=duration)
   405                     end_date = start_date + datetime.timedelta(seconds=duration)
   370                 elif start_date and options.base_url:                    
   406                 elif start_date and options.base_url:
   371                     # get duration from api
   407                     # get duration from api
   372                     content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
   408                     content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
   373                     r = requests.get(content_url)
   409                     r = requests.get(content_url)
   374                     duration = int(r.json()['duration'])
   410                     duration = int(r.json()['duration'])
   375                     get_logger().debug("get duration " + content_url) #@UndefinedVariable
   411                     get_logger().debug("get duration " + content_url) #@UndefinedVariable
   376                     get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
   412                     get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
   377 
   413 
   378                     end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
   414                     end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
   379                 
   415 
       
   416                 if end_date and deltas:
       
   417                     end_date = end_date + datetime.timedelta(milliseconds=deltas[-1][1])
   380                 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
   418                 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
   381                     
   419 
   382                 query_res = query.all()
   420                 query_res = query.all()
   383 
   421 
   384                 
   422 
   385                 for tw in query_res:
   423                 for tw in query_res:
   386                     tweet_ts_dt = tw.created_at
   424                     tweet_ts_dt = tw.created_at
   387                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
   425                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
   388                     if ts is None:
   426                     if ts is None:
   389                         ts = tweet_ts
   427                         ts = tweet_ts
   390                     tweet_ts_rel = (tweet_ts-ts) * 1000
   428                     tweet_ts_rel = (tweet_ts-ts) * 1000
       
   429                     if deltas:
       
   430                         d = find_delta(tweet_ts_rel, deltas)
       
   431                         if d[1] < 0:
       
   432                             continue
       
   433                         else :
       
   434                             tweet_ts_rel -= d[1]
       
   435 
   391                     username = None
   436                     username = None
   392                     profile_url = ""
   437                     profile_url = ""
   393                     if tw.user is not None:
   438                     if tw.user is not None:
   394                         username = tw.user.screen_name
   439                         username = tw.user.screen_name
   395                         profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
   440                         profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
   396                     if not username:
   441                     if not username:
   397                         username = "anon."
   442                         username = "anon."
   398                     
   443 
   399                     element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
   444                     element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
   400                     etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
   445                     etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
   401                     etree.SubElement(element, u"abstract").text = unicode(tw.text)
   446                     etree.SubElement(element, u"abstract").text = unicode(tw.text)
   402             
   447 
   403                     tags_node = etree.SubElement(element, u"tags")
   448                     tags_node = etree.SubElement(element, u"tags")
   404                     
   449 
   405                     for entity in tw.entity_list:
   450                     for entity in tw.entity_list:
   406                         if entity.type == u'entity_hashtag': 
   451                         if entity.type == u'entity_hashtag':
   407                             etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
   452                             etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
   408                             
   453 
   409                     meta_element = etree.SubElement(element, u'meta')
   454                     meta_element = etree.SubElement(element, u'meta')
   410                     
   455 
   411                     polemics_list = parse_polemics(tw, options.extended_mode)
   456                     polemics_list = parse_polemics(tw, options.extended_mode)
   412                     if polemics_list:
   457                     if polemics_list:
   413                         polemics_element = etree.Element(u'polemics')
   458                         polemics_element = etree.Element(u'polemics')
   414                         for pol in polemics_list:
   459                         for pol in polemics_list:
   415                             etree.SubElement(polemics_element, u'polemic').text = pol
   460                             etree.SubElement(polemics_element, u'polemic').text = pol
   416                         meta_element.append(polemics_element)
   461                         meta_element.append(polemics_element)
   417 
   462 
   418                     etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
   463                     etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
   419                     
   464 
   420                 # sort by tc in
   465                 # sort by tc in
   421                 if options.merge :
   466                 if options.merge :
   423                     # remove all elements and put them in an array
   468                     # remove all elements and put them in an array
   423                     # sort them with tc
   468                     # sort them with tc
   424                     #put them back
   469                     #put them back
   425                     elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
   470                     elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
   426                     
   471 
   427                 #add to display node    
   472                 #add to display node
   428                 if display_content_node is not None:
   473                 if display_content_node is not None:
   429                     display_dec = None
   474                     display_dec = None
   430                     for dec in display_content_node.iterchildren(tag=u"decoupage"):
   475                     for dec in display_content_node.iterchildren(tag=u"decoupage"):
   431                         if dec.get('idens','') == ensemble_id and dec.get('id', '') == decoupage_id:
   476                         if dec.get('idens','') == ensemble_id and dec.get('id', '') == decoupage_id:
   432                             display_dec = dec
   477                             display_dec = dec
   433                             break
   478                             break
   434                     if display_dec is None and ensemble_id and decoupage_id:
   479                     if display_dec is None and ensemble_id and decoupage_id:
   435                         etree.SubElement(display_content_node, u"decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})
   480                         etree.SubElement(display_content_node, u"decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})
   436                 
   481 
   437                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
   482                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)
   438                 
   483 
   439                 if content_file_write and content_file_write.find("http") == 0:
   484                 if content_file_write and content_file_write.find("http") == 0:
   440                     
   485 
   441                     project["ldt"] = output_data
   486                     project["ldt"] = output_data
   442                     project['owner'] = project['owner'].replace('%7E','~')
   487                     project['owner'] = project['owner'].replace('%7E','~')
   443                     project['contents'] = [c_url.replace('%7E','~') for c_url in project['contents']]
   488                     project['contents'] = [c_url.replace('%7E','~') for c_url in project['contents']]
   444                           
   489 
   445                     post_param = {}
   490                     post_param = {}
   446                     if options.post_param:
   491                     if options.post_param:
   447                         post_param = anyjson.loads(options.post_param)
   492                         post_param = anyjson.loads(options.post_param)
   448 
   493 
   449                     get_logger().debug("write http " + content_file_write) #@UndefinedVariable
   494                     get_logger().debug("write http " + content_file_write) #@UndefinedVariable
   456                 else:
   501                 else:
   457                     if content_file_write and os.path.exists(content_file_write):
   502                     if content_file_write and os.path.exists(content_file_write):
   458                         dest_file_name = content_file_write
   503                         dest_file_name = content_file_write
   459                     else:
   504                     else:
   460                         dest_file_name = options.filename
   505                         dest_file_name = options.filename
   461             
   506 
   462                     get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
   507                     get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
   463                     output = open(dest_file_name, "w")
   508                     output = open(dest_file_name, "w")
   464                     output.write(output_data)
   509                     output.write(output_data)
   465                     output.flush()
   510                     output.flush()
   466                     output.close()
   511                     output.close()
   467                 
   512 
   468         finally:
   513         finally:
   469             if session:
   514             if session:
   470                 session.close()
   515                 session.close()
   471     finally:
   516     finally:
   472         if conn:
   517         if conn: