utils/export_annotations.py
changeset 30 c2294ac6e875
parent 26 ebfd0d3cffab
child 43 e27c3c1c57f1
equal deleted inserted replaced
29:5007c248fbad 30:c2294ac6e875
     9 import os.path
     9 import os.path
    10 import re
    10 import re
    11 import sys
    11 import sys
    12 import time
    12 import time
    13 import uuid #@UnresolvedImport
    13 import uuid #@UnresolvedImport
    14 from dateutil.parser import parse as parse_date
    14 from dateutil.parser import parse as parse_date_raw
       
    15 from dateutil.tz import tzutc
    15 import bisect
    16 import bisect
    16 import logging
    17 import logging
    17 
    18 
    18 #class TweetExclude(object):
    19 #class TweetExclude(object):
    19 #    def __init__(self, id):
    20 #    def __init__(self, id):
    23 #        return "<TweetExclude(id=%d)>" % (self.id)
    24 #        return "<TweetExclude(id=%d)>" % (self.id)
    24 
    25 
    25 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
    26 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
    26 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
    27 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
    27 DEFAULT_ANNOTATION_CHANNEL = 'ANNOT'
    28 DEFAULT_ANNOTATION_CHANNEL = 'ANNOT'
       
    29 
       
    30 def parse_date(datestr):
       
    31     res = parse_date_raw(datestr)
       
    32     if res.tzinfo is None:
       
    33         res = res.replace(tzinfo=tzutc())
       
    34     return res
    28 
    35 
    29 def get_logger():
    36 def get_logger():
    30     return logging.getLogger(__name__)
    37     return logging.getLogger(__name__)
    31 
    38 
    32 
    39 
   121                       help="quiet", default=0)
   128                       help="quiet", default=0)
   122 
   129 
   123 
   130 
   124 def get_options():
   131 def get_options():
   125 
   132 
   126     usage = "usage: %(prog)s [options]"
   133 
   127 
   134     parser = argparse.ArgumentParser(description="All dates should be given in ISO 8601 format. If no timezone is specified, the date is assumed to be UTC.")
   128     parser = argparse.ArgumentParser(usage)
       
   129 
   135 
   130     parser.add_argument("-f", "--file", dest="filename",
   136     parser.add_argument("-f", "--file", dest="filename",
   131                       help="write export to file", metavar="FILE", default="project.ldt")
   137                       help="write export to file", metavar="FILE", default="project.ldt")
   132     parser.add_argument("-a", "--annot-url", dest="annot_url",
   138     parser.add_argument("-a", "--annot-url", dest="annot_url",
   133                       help="annotation server url", metavar="ANNOT-URL", required=True)
   139                       help="annotation server url", metavar="ANNOT-URL", required=True)
   196         time_params = {
   202         time_params = {
   197             'hours': int(parts[0]),
   203             'hours': int(parts[0]),
   198             'minutes': int(parts[1]),
   204             'minutes': int(parts[1]),
   199             'seconds': int(parts[2]) if len(parts)>2 else 0
   205             'seconds': int(parts[2]) if len(parts)>2 else 0
   200         }
   206         }
   201         return int(datetime.timedelta(**time_params).total_seconds()*1000)
   207         return int(round(datetime.timedelta(**time_params).total_seconds()*1000))
   202 
   208 
   203 def build_annotation_iterator(url, params, headers):
   209 def build_annotation_iterator(url, params, headers):
   204     page = 0
   210     page = 0
   205     page_nb = 1
   211     page_nb = 1
   206     while page < page_nb:
   212     while page < page_nb:
   299         start_date_str = params.get("start_date",None)
   305         start_date_str = params.get("start_date",None)
   300         end_date_str = params.get("end_date", None)
   306         end_date_str = params.get("end_date", None)
   301         duration = params.get("duration", None)
   307         duration = params.get("duration", None)
   302         content_file = params.get("content_file", None)
   308         content_file = params.get("content_file", None)
   303         content_file_write = params.get("content_file_write", None)
   309         content_file_write = params.get("content_file_write", None)
   304         channels = params.get('channels', [DEFAULT_ANNOTATION_CHANNEL])
   310         channels = list(set(params.get('channels', [DEFAULT_ANNOTATION_CHANNEL])))
   305         events = params.get('events', [])
   311         events = list(set(params.get('events', [])))
   306 
   312 
   307         if user_whitelist_file:
   313         if user_whitelist_file:
   308             with open(user_whitelist_file, 'r+') as f:
   314             with open(user_whitelist_file, 'r+') as f:
   309                 user_whitelist = list(set([s.strip() for s in f]))
   315                 user_whitelist = list(set([s.strip() for s in f]))
   310 
   316 
   311         start_date = None
   317         start_date = None
   312         ts = None
       
   313         if start_date_str:
   318         if start_date_str:
   314             start_date = parse_date(start_date_str)
   319             start_date= parse_date(start_date_str)
   315             ts = time.mktime(start_date.timetuple())
       
   316 
   320 
   317 
   321 
   318         root = None
   322         root = None
   319         ensemble_parent = None
   323         ensemble_parent = None
   320 
   324 
   339 
   343 
   340         if root is None:
   344         if root is None:
   341 
   345 
   342             root = etree.Element(u"iri")
   346             root = etree.Element(u"iri")
   343 
   347 
   344             project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
   348             project = etree.SubElement(root, u"project", {u"abstract":u"Annotations",u"title":u"Annotations", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
   345 
   349 
   346             medias = etree.SubElement(root, u"medias")
   350             medias = etree.SubElement(root, u"medias")
   347             media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   351             media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   348 
   352 
   349             annotations = etree.SubElement(root, u"annotations")
   353             annotations = etree.SubElement(root, u"annotations")
   459 
   463 
   460         params = { 'q':json.dumps({'filters':filters}), 'results_per_page': options.batch_size}
   464         params = { 'q':json.dumps({'filters':filters}), 'results_per_page': options.batch_size}
   461 
   465 
   462 
   466 
   463         for annot in build_annotation_iterator(annotation_url, params, headers):
   467         for annot in build_annotation_iterator(annotation_url, params, headers):
   464             #TODO : check timezone !!!
   468             annot_ts = parse_date(annot['ts'])
   465             annot_ts_dt = parse_date(annot['ts'])
   469             if start_date is None:
   466             annot_ts = int(time.mktime(annot_ts_dt.timetuple()))
   470                 start_date = annot_ts
   467             if ts is None:
   471             annot_ts_rel = annot_ts-start_date
   468                 ts = annot_ts
   472             annot_ts_rel_milli = int(round(annot_ts_rel.total_seconds()*1000))
   469             annot_ts_rel = (annot_ts-ts) * 1000
       
   470             if deltas:
   473             if deltas:
   471                 d = find_delta(deltas, annot_ts_rel)
   474                 d = find_delta(deltas, annot_ts_rel_milli)
   472                 if d[1] < 0:
   475                 if d[1] < 0:
   473                     continue
   476                     continue
   474                 else :
   477                 else :
   475                     annot_ts_rel -= d[1]
   478                     annot_ts_rel_milli -= d[1]
   476             annot_content = annot.get('content',{'category':'', 'user':None})
   479             annot_content = annot.get('content',{'category':'', 'user':None})
   477 
   480 
   478             username = annot_content.get('user', 'anon.') or 'anon.'
   481             username = annot_content.get('user', 'anon.') or 'anon.'
   479 
   482 
   480             category = annot_content.get('category', None)
   483             category = annot_content.get('category', None)
   481             if category is None:
   484             if category is None:
   482                 continue
   485                 continue
   483 
   486 
   484             element = etree.SubElement(elements, u"element" , {u"id":annot.get('uuid', uuid.uuid4()), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(annot_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(annot_ts_rel), u"dur":u"0"})
   487             element = etree.SubElement(elements, u"element" , {u"id":annot.get('uuid', uuid.uuid4()), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(annot_ts.strftime("%Y/%m/%d")), u"begin": unicode(annot_ts_rel_milli), u"dur":u"0"})
   485             etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(category.get('label', category.get('code', '')))
   488             etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(category.get('label', category.get('code', '')))
   486             etree.SubElement(element, u"abstract").text = unicode(category.get('label', category.get('code', '')))
   489             etree.SubElement(element, u"abstract").text = unicode(category.get('label', category.get('code', '')))
   487 
   490 
   488             tags_node = etree.SubElement(element, u"tags")
   491             tags_node = etree.SubElement(element, u"tags")
   489             etree.SubElement(tags_node,u"tag").text = category.get('code', '')
   492             etree.SubElement(tags_node,u"tag").text = category.get('code', '')