script/utils/export_chat_zoom_cloud.py
changeset 1542 82b5f22448f6
equal deleted inserted replaced
1541:61423ca4e0af 1542:82b5f22448f6
       
     1 #!/usr/bin/env python
       
     2 # coding=utf-8
       
     3 
       
     4 import argparse
       
     5 import bisect
       
     6 import datetime
       
     7 import json
       
     8 import os.path
       
     9 import re
       
    10 import sys
       
    11 import uuid  # @UnresolvedImport
       
    12 
       
    13 import requests
       
    14 
       
    15 from dateutil.parser import parse as parse_date
       
    16 from iri_tweet.utils import get_logger, set_logging, set_logging_options
       
    17 from lxml import etree
       
    18 
       
    19 
       
    20 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
       
    21 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
       
    22 
       
    23 
       
    24 def re_fn(expr, item):
       
    25     reg = re.compile(expr, re.I)
       
    26     res = reg.search(item)
       
    27     if res:
       
    28         get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
       
    29     return res is not None
       
    30 
       
    31 def parse_polemics_1(tw_text, extended_mode):
       
    32     """
       
    33     parse polemics in text and return a list of polemic code. None if not polemic found
       
    34     """
       
    35     polemics = {}
       
    36     for m in re.finditer(r"(\+\+|\-\-|\?\?|\=\=)",tw_text):
       
    37         pol_link = {
       
    38             '++' : 'OK',
       
    39             '--' : 'KO',
       
    40             '??' : 'Q',
       
    41             '==' : 'REF'}[m.group(1)]
       
    42         polemics[pol_link] = pol_link
       
    43 
       
    44     if extended_mode:
       
    45         if "?" in tw_text:
       
    46             polemics["Q"] = "Q"
       
    47 
       
    48     if len(polemics) > 0:
       
    49         return polemics.keys()
       
    50     else:
       
    51         return None
       
    52 
       
    53 def parse_polemics_2(tw_text, extended_mode):
       
    54     """
       
    55     parse polemics in text and return a list of polemic code. None if not polemic found
       
    56     """
       
    57     polemics = {}
       
    58     for m in re.finditer(r"(\+\+|\!\!|\?\?|\=\=)",tw_text):
       
    59         pol_link = {
       
    60             '++' : 'OK',
       
    61             '!!' : 'KO',
       
    62             '??' : 'Q',
       
    63             '==' : 'REF'}[m.group(1)]
       
    64         polemics[pol_link] = pol_link
       
    65 
       
    66     if extended_mode:
       
    67         if "?" in tw_text:
       
    68             polemics["Q"] = "Q"
       
    69 
       
    70 
       
    71     if len(polemics) > 0:
       
    72         return polemics.keys()
       
    73     else:
       
    74         return None
       
    75 
       
    76 def parse_polemics_3(tw_text, extended_mode):
       
    77     """
       
    78     parse polemics in text and return a list of polemic code. None if not polemic found
       
    79     """
       
    80     polemics = {}
       
    81     for m in re.finditer(r"(\+\+|\?\?|\*\*|\=\=)",tw_text):
       
    82         pol_link = {
       
    83             '++' : 'OK',
       
    84             '??' : 'KO',
       
    85             '**' : 'REF',
       
    86             '==' : 'Q'}[m.group(1)]
       
    87         polemics[pol_link] = pol_link
       
    88 
       
    89     if len(polemics) > 0:
       
    90         return polemics.keys()
       
    91     else:
       
    92         return None
       
    93 
       
    94 
       
    95 protocol_version_map = {
       
    96     "1" : parse_polemics_1,
       
    97     "2" : parse_polemics_2,
       
    98     "3" : parse_polemics_3
       
    99 }
       
   100 
       
   101 def get_options():
       
   102 
       
   103     parser = argparse.ArgumentParser(description="All date should be given using iso8601 format. If no timezone is used, the date is considered as UTC")
       
   104 
       
   105     parser.add_argument("-f", "--file", dest="filename",
       
   106                       help="write export to file", metavar="FILE", default="project.ldt")
       
   107     parser.add_argument("-d", "--chat-database", dest="database",
       
   108                       help="Input chat file", metavar="CHAT_DATABASE")
       
   109     parser.add_argument("-s", "--start-date", dest="start_date",
       
   110                       help="start date", metavar="START_DATE", default=None)
       
   111     parser.add_argument("-a", "--annotation-protocol", dest="protocol_version",
       
   112                       help="annotation protocol version", metavar="PROTOCOL_VERSION",
       
   113                       default="2")
       
   114     parser.add_argument("-I", "--content-file", dest="content_file",
       
   115                       help="Content file", metavar="CONTENT_FILE")
       
   116     parser.add_argument("-c", "--content", dest="content",
       
   117                       help="Content url", metavar="CONTENT")
       
   118     parser.add_argument("-V", "--video-url", dest="video",
       
   119                       help="video url", metavar="VIDEO")
       
   120     parser.add_argument("-i", "--content-id", dest="content_id",
       
   121                       help="Content id", metavar="CONTENT_ID")
       
   122     parser.add_argument("-C", "--color", dest="color",
       
   123                       help="Color code", metavar="COLOR", default="16763904")
       
   124     parser.add_argument("-D", "--duration", dest="duration", type=int,
       
   125                       help="Duration", metavar="DURATION", default=None)
       
   126     parser.add_argument("-n", "--name", dest="name",
       
   127                       help="Cutting name", metavar="NAME", default="Chats")
       
   128     parser.add_argument("-R", "--replace", dest="replace", action="store_true",
       
   129                       help="Replace tweet ensemble", default=False)
       
   130     parser.add_argument("-m", "--merge", dest="merge", action="store_true",
       
   131                       help="merge tweet ensemble, choose the first ensemble", default=False)
       
   132     parser.add_argument("-E", "--extended", dest="extended_mode", action="store_true",
       
   133                       help="Trigger polemic extended mode", default=False)
       
   134     parser.add_argument("-b", "--base-url", dest="base_url",
       
   135                       help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
       
   136     parser.add_argument("-p", "--project", dest="project_id",
       
   137                       help="Project id", metavar="PROJECT_ID", default=None)
       
   138     parser.add_argument("-P", "--post-param", dest="post_param",
       
   139                       help="Post param", metavar="POST_PARAM", default=None)
       
   140     parser.add_argument("--user-whitelist", dest="user_whitelist", action="store",
       
   141                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
       
   142     parser.add_argument("--cut", dest="cuts", action="append",
       
   143                       help="A cut with the forma <ts in ms>::<duration>", metavar="CUT", default=[])
       
   144 
       
   145     set_logging_options(parser)
       
   146 
       
   147     return (parser.parse_args(), parser)
       
   148 
       
   149 
       
   150 def find_delta(deltas, ts):
       
   151     i = bisect.bisect_right(deltas, (ts+1,0))
       
   152     if i:
       
   153         return deltas[i-1]
       
   154     return (0,0)
       
   155 
       
   156 
       
   157 def parse_duration(s):
       
   158     try:
       
   159         return int(s)
       
   160     except ValueError:
       
   161         parts = s.split(":")
       
   162         if len(parts) < 2:
       
   163             raise ValueError("Bad duration format")
       
   164         time_params = {
       
   165             'hours': int(parts[0]),
       
   166             'minutes': int(parts[1]),
       
   167             'seconds': int(parts[2]) if len(parts)>2 else 0
       
   168         }
       
   169         return int(round(datetime.timedelta(**time_params).total_seconds()*1000))
       
   170 
       
   171 CHAT_REGEXP = re.compile(r"^(?P<created_at>\d{2}:\d{2}:\d{2})\t(?P<user>.+?)\s?:\s(?P<text>.*)$", re.DOTALL)
       
   172 CHAT_LINE_REGEXP = re.compile(r"^\d{2}:\d{2}:\d{2}\t.+?:\s")
       
   173 
       
   174 def parse_chat_line(chat_id, chat_line):
       
   175     if (m := CHAT_REGEXP.match(chat_line)) is not None:
       
   176         res = {k: v.replace('\r','\n') if k == 'text' else v for k,v in m.groupdict().items()}
       
   177         res['id'] = chat_id
       
   178         res['tags'] = re.findall('#(\w+)',res['text'])
       
   179         return res
       
   180     else:
       
   181         return {}
       
   182 
       
   183 def read_chat_file(chat_file_path):
       
   184     current_line = ""
       
   185     chat_content = []
       
   186     with open(chat_file_path, "r") as chat_file:
       
   187         for chat_line in chat_file:
       
   188             if CHAT_LINE_REGEXP.match(chat_line) is not None:
       
   189                 if current_line:
       
   190                     chat_content.append(current_line)
       
   191                 current_line = chat_line
       
   192             else:
       
   193                 current_line = current_line + "\n" + chat_line
       
   194     if current_line:
       
   195         chat_content.append(current_line)
       
   196     return chat_content
       
   197 
       
   198 
       
   199 if __name__ == "__main__" :
       
   200 
       
   201     (options, parser) = get_options()
       
   202 
       
   203     set_logging(options)
       
   204 
       
   205     get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
       
   206 
       
   207 
       
   208     deltas = [(0,0)]
       
   209     total_delta = 0
       
   210     if options.cuts:
       
   211         cuts_raw = sorted([tuple([parse_duration(s) for s in c.split("::")]) for c in options.cuts])
       
   212         for c, d in cuts_raw:
       
   213             deltas.append((c+total_delta, -1))
       
   214             total_delta += d
       
   215             deltas.append((c+total_delta, total_delta))
       
   216 
       
   217     if len(sys.argv) == 1 or options.database is None:
       
   218         parser.print_help()
       
   219         sys.exit(1)
       
   220 
       
   221     user_whitelist_file = options.user_whitelist
       
   222     user_whitelist = None
       
   223 
       
   224     if options.project_id:
       
   225         content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
       
   226     else:
       
   227         content_file = options.content_file
       
   228 
       
   229     params = {
       
   230         'start_date': options.start_date,
       
   231         'duration' : options.duration,
       
   232         'content_file' : content_file,
       
   233         'content_file_write' : content_file,
       
   234         'project_id' : options.project_id
       
   235     }
       
   236     post_param = {}
       
   237 
       
   238     if options.post_param:
       
   239         post_param = json.loads(options.post_param)
       
   240 
       
   241     display_content_node = None
       
   242 
       
   243     get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
       
   244 
       
   245     start_date_str = params.get("start_date",None)
       
   246     duration = params.get("duration", None)
       
   247     content_file = params.get("content_file", None)
       
   248     content_file_write = params.get("content_file_write", None)
       
   249     if user_whitelist_file:
       
   250         with open(user_whitelist_file, 'r+') as f:
       
   251             user_whitelist = list(set([s.strip() for s in f]))
       
   252 
       
   253     start_date = datetime.datetime.now()
       
   254     if start_date_str:
       
   255         start_date = parse_date(start_date_str)
       
   256 
       
   257     root = None
       
   258     ensemble_parent = None
       
   259     project = None
       
   260 
       
   261     #to do : analyse situation ldt or iri ? filename set or not ?
       
   262 
       
   263     if content_file and content_file.find("http") == 0:
       
   264 
       
   265         get_logger().debug("url : " + content_file) #@UndefinedVariable
       
   266 
       
   267         r = requests.get(content_file, params=post_param)
       
   268         get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
       
   269         project = r.json()
       
   270         text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
       
   271         root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
       
   272 
       
   273     elif content_file and os.path.exists(content_file):
       
   274 
       
   275         doc = etree.parse(content_file)
       
   276         root = doc.getroot()
       
   277         for child in root:
       
   278             if child.tag == "project":
       
   279                 project = child
       
   280                 break
       
   281         if project is None:
       
   282             root = None
       
   283 
       
   284     content_id = None
       
   285 
       
   286     if root is None:
       
   287 
       
   288         root = etree.Element("iri")
       
   289 
       
   290         project = etree.SubElement(root, "project", {"abstract":"Polemics Chat","title":"Polemic Chat", "user":"IRI Web", "id":str(uuid.uuid4())})
       
   291 
       
   292         medias = etree.SubElement(root, "medias")
       
   293         media = etree.SubElement(medias, "media", {"pict":"", "src":options.content, "video":options.video, "id":options.content_id, "extra":""})
       
   294 
       
   295         annotations = etree.SubElement(root, "annotations")
       
   296         content = etree.SubElement(annotations, "content", {"id":options.content_id})
       
   297         ensemble_parent = content
       
   298 
       
   299         content_id = options.content_id
       
   300 
       
   301 
       
   302     if ensemble_parent is None:
       
   303         file_type = None
       
   304         for node in root:
       
   305             if node.tag == "project":
       
   306                 file_type = "ldt"
       
   307                 break
       
   308             elif node.tag == "head":
       
   309                 file_type = "iri"
       
   310                 break
       
   311 
       
   312         if file_type == "ldt":
       
   313             media_nodes = root.xpath("//media")
       
   314             media = None
       
   315             if len(media_nodes) > 0:
       
   316                 media = media_nodes[0]
       
   317             annotations_node = root.find("annotations")
       
   318             if annotations_node is None:
       
   319                 annotations_node = etree.SubElement(root, "annotations")
       
   320             content_node = annotations_node.find("content")
       
   321             if content_node is None and media is not None:
       
   322                 content_node = etree.SubElement(annotations_node,"content", id=media.get("id"))
       
   323             ensemble_parent = content_node
       
   324             content_id = content_node.get("id")
       
   325             display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id)
       
   326             if len(display_nodes) == 0:
       
   327                 get_logger().info("No display node found. Will not update display")
       
   328                 display_content_node = None
       
   329             else:
       
   330                 display_content_node = display_nodes[0]
       
   331 
       
   332         elif file_type == "iri":
       
   333             body_node = root.find("body")
       
   334             if body_node is None:
       
   335                 body_node = etree.SubElement(root, "body")
       
   336             ensembles_node = body_node.find("ensembles")
       
   337             if ensembles_node is None:
       
   338                 ensembles_node = etree.SubElement(body_node, "ensembles")
       
   339             ensemble_parent = ensembles_node
       
   340             content_id = root.xpath("head/meta[@name='id']/@content")[0]
       
   341             display_content_node = None
       
   342 
       
   343 
       
   344     if ensemble_parent is None:
       
   345         get_logger().error("Can not process file") #@UndefinedVariable
       
   346         sys.exit()
       
   347 
       
   348     if options.replace:
       
   349         for ens in ensemble_parent.iterchildren(tag="ensemble"):
       
   350             ens_id = ens.get("id","")
       
   351             if ens_id.startswith("chat_"):
       
   352                 ensemble_parent.remove(ens)
       
   353                 # remove in display nodes
       
   354                 if display_content_node is not None:
       
   355                     for cut_display in display_content_node.iterchildren():
       
   356                         if cut_display.get('idens','') == ens_id:
       
   357                             display_content_node.remove(cut_display)
       
   358 
       
   359     ensemble = None
       
   360     elements = None
       
   361     decoupage = None
       
   362 
       
   363     if options.merge:
       
   364         for ens in ensemble_parent.findall("ensemble"):
       
   365             if ens.get('id',"").startswith("chat_"):
       
   366                 ensemble = ens
       
   367                 break
       
   368         if ensemble is not None:
       
   369             elements = ensemble.find(".//elements")
       
   370             decoupage = ensemble.find("decoupage")
       
   371 
       
   372     if ensemble is None or elements is None:
       
   373         ensemble = etree.SubElement(ensemble_parent, "ensemble", {"id":"chat_" + str(uuid.uuid4()), "title":"Ensemble Chat", "author":"IRI Web", "abstract":"Ensemble Chat"})
       
   374         decoupage = etree.SubElement(ensemble, "decoupage", {"id": str(uuid.uuid4()), "author": "IRI Web"})
       
   375 
       
   376         etree.SubElement(decoupage, "title").text = options.name
       
   377         etree.SubElement(decoupage, "abstract").text = options.name
       
   378 
       
   379         elements = etree.SubElement(decoupage, "elements")
       
   380 
       
   381     ensemble_id = ensemble.get('id', '')
       
   382     decoupage_id = decoupage.get('id', '') if decoupage is not None else None
       
   383 
       
   384     if not duration and options.base_url:
       
   385         content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
       
   386         r = requests.get(content_url)
       
   387         duration = int(r.json()['duration'])
       
   388         get_logger().debug("get duration " + content_url) #@UndefinedVariable
       
   389         get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
       
   390 
       
   391     chat_content_lines = read_chat_file(options.database.strip())
       
   392     for i,chat_line in enumerate(chat_content_lines):
       
   393 
       
   394         cht = parse_chat_line("%04d" % (i+1) ,chat_line.strip())
       
   395 
       
   396         #TODO parse chat line
       
   397         cht_ts_dt = cht['created_at']
       
   398         cht_ts_rel_milli = parse_duration(cht_ts_dt)
       
   399         element_date = start_date + datetime.timedelta(milliseconds=cht_ts_rel_milli)
       
   400         if deltas:
       
   401             d = find_delta(deltas, cht_ts_rel_milli)
       
   402             if d[1] < 0:
       
   403                 continue
       
   404             else :
       
   405                 cht_ts_rel_milli -= d[1]
       
   406 
       
   407         username = cht['user'] or "anon."
       
   408 
       
   409         element = etree.SubElement(elements, "element" , {"id": "%s-%s" % (uuid.uuid4(),cht['id']), "color":options.color, "author":username, "date":element_date.strftime("%Y/%m/%d"), "begin": str(cht_ts_rel_milli), "dur":"0", "src":"zoom"})
       
   410         etree.SubElement(element, "title").text = username + ": " + cht['text'][:255]
       
   411         etree.SubElement(element, "abstract").text = cht['text']
       
   412 
       
   413         tags_node = etree.SubElement(element, "tags")
       
   414 
       
   415         for tag in cht['tags']:
       
   416             etree.SubElement(tags_node,"tag").text = tag
       
   417 
       
   418         meta_element = etree.SubElement(element, 'meta')
       
   419 
       
   420         etree.SubElement(meta_element, "polemic_version").text = options.protocol_version
       
   421         parse_polemics = protocol_version_map.get(options.protocol_version, parse_polemics_2)
       
   422         polemics_list = parse_polemics(cht['text'], options.extended_mode)
       
   423         if polemics_list:
       
   424             polemics_element = etree.Element('polemics')
       
   425             for pol in polemics_list:
       
   426                 etree.SubElement(polemics_element, 'polemic').text = pol
       
   427             meta_element.append(polemics_element)
       
   428 
       
   429         etree.SubElement(meta_element, "source", attrib={"url":"http://zoom.io", "mimetype":"text/plain"}).text = etree.CDATA(json.dumps({'chat': chat_line}))
       
   430 
       
   431     # sort by tc in
       
   432     if options.merge :
       
   433         # remove all elements and put them in a array
       
   434         # sort them with tc
       
   435         #put them back
       
   436         elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
       
   437 
       
   438     #add to display node
       
   439     if display_content_node is not None:
       
   440         display_dec = None
       
   441         for dec in display_content_node.iterchildren(tag="decoupage"):
       
   442             if dec.get('idens','') == ensemble_id and dec.get('id', '') == decoupage_id:
       
   443                 display_dec = dec
       
   444                 break
       
   445         if display_dec is None and ensemble_id and decoupage_id:
       
   446             etree.SubElement(display_content_node, "decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})
       
   447 
       
   448     output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True).decode('utf-8')
       
   449 
       
   450     if content_file_write and content_file_write.find("http") == 0:
       
   451 
       
   452         project["ldt"] = output_data
       
   453         project['owner'] = project['owner'].replace('%7E','~')
       
   454         project['contents'] = [c_url.replace('%7E','~') for c_url in project['contents']]
       
   455 
       
   456         post_param = {}
       
   457         if options.post_param:
       
   458             post_param = json.loads(options.post_param)
       
   459 
       
   460         get_logger().debug("write http " + content_file_write) #@UndefinedVariable
       
   461         get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
       
   462         get_logger().debug("write http " + repr(project)) #@UndefinedVariable
       
   463         r = requests.put(content_file_write, data=json.dumps(project), headers={'content-type':'application/json'}, params=post_param)
       
   464         get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
       
   465         if r.status_code != requests.codes.ok:  # pylint: disable=E1101
       
   466             r.raise_for_status()
       
   467     else:
       
   468         if content_file_write and os.path.exists(content_file_write):
       
   469             dest_file_name = content_file_write
       
   470         else:
       
   471             dest_file_name = options.filename
       
   472 
       
   473         get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
       
   474         output = open(dest_file_name, "w")
       
   475         output.write(output_data)
       
   476         output.flush()
       
   477         output.close()