script/utils/export_pad.py
changeset 893 10a19dd4e1c9
parent 891 8628c590f608
equal deleted inserted replaced
877:41ce1c341abe 893:10a19dd4e1c9
       
     1 #!/usr/bin/env python
       
     2 # coding=utf-8
       
     3 
       
     4 from dateutil.parser import parse as parse_date
       
     5 from iri_tweet.utils import set_logging_options, set_logging, get_logger
       
     6 from lxml import etree
       
     7 from optparse import OptionParser
       
     8 import anyjson
       
     9 import datetime
       
    10 import functools
       
    11 import httplib2
       
    12 import os.path
       
    13 import requests
       
    14 import sys
       
    15 import time
       
    16 import uuid
       
    17 
       
    18 
       
    19 class EtherpadRequestException(Exception):
       
    20     def __init__(self, original_resp):
       
    21         super(EtherpadRequestException, self).__init__(original_resp["message"])
       
    22         self.status = original_resp["status"]
       
    23         self.original_resp = original_resp
       
    24 
       
    25 
       
    26 class EtherpadRequest():
       
    27     
       
    28     def __init__(self, base_url, api_key):
       
    29         self.base_url = base_url
       
    30         self.api_key = api_key
       
    31         self.__request = None
       
    32 
       
    33     def __getattr__(self, name):
       
    34         return functools.partial(self.__action, name)
       
    35 
       
    36     def __action(self, action, **kwargs):
       
    37         url = "%s/%s" % (self.base_url, action)
       
    38         params = dict(kwargs)
       
    39         params['apikey'] = self.api_key
       
    40         
       
    41         r = requests.get(url, params)
       
    42         
       
    43         resp = anyjson.deserialize(r.text)
       
    44         
       
    45         if resp["code"] == 0:
       
    46             return resp["data"]
       
    47         else:
       
    48             raise EtherpadRequestException(resp)
       
    49         
       
    50         return resp
       
    51     
       
    52     def getRevisionsCount(self, padID):
       
    53         f = self.__getattr__("getRevisionsCount")
       
    54         res = f(padID=padID)
       
    55         
       
    56         return res["revisions"]
       
    57     
       
    58     def getPadUrl(self, padID):
       
    59         
       
    60         return "%s/%s" % (self.base_url,padID)
       
    61     
       
    62     
       
    63 
       
    64 def abort(message, parser):
       
    65     if message is not None:
       
    66         sys.stderr.write(message + "\n")
       
    67     parser.print_help()
       
    68     sys.exit(1)
       
    69 
       
    70 def get_options():
       
    71     
       
    72     parser = OptionParser()
       
    73     parser.add_option("-u", "--api-url", dest="api_url",
       
    74                       help="Base etherpad-lite api url", metavar="API_URL", default=None)
       
    75     parser.add_option("-k", "--api-key", dest="api_key",
       
    76                       help="Base etherpad-lite api url", metavar="API_KEY", default=None)
       
    77     parser.add_option("-p", "--pad-id", dest="pad_id",
       
    78                       help="pad id", metavar="PADID")
       
    79     parser.add_option("-s", "--start-date", dest="start_date",
       
    80                       help="start date", metavar="START_DATE", default=None)
       
    81     parser.add_option("-e", "--end-date", dest="end_date",
       
    82                       help="end date", metavar="END_DATE", default=None)
       
    83     parser.add_option("-f", "--format", dest="format", type="choice",
       
    84                       help="format", metavar="FORMAT", choice=['html', 'text'], default='html')
       
    85     parser.add_option("-I", "--content-file", dest="content_file",
       
    86                       help="Content file", metavar="CONTENT_FILE")
       
    87     parser.add_option("-C", "--color", dest="color",
       
    88                       help="Color code", metavar="COLOR", default="16763904")
       
    89     parser.add_option("-D", "--duration", dest="duration", type="int",
       
    90                       help="Duration", metavar="DURATION", default=None)
       
    91     parser.add_option("-n", "--name", dest="name",
       
    92                       help="Cutting name", metavar="NAME", default=u"pads")
       
    93     parser.add_option("-R", "--replace", dest="replace", action="store_true",
       
    94                       help="Replace tweet ensemble", metavar="REPLACE", default=False)
       
    95     parser.add_option("-m", "--merge", dest="merge", action="store_true",
       
    96                       help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
       
    97     parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
       
    98                       help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
       
    99     parser.add_option("-S", "--step", dest="step", type=1,
       
   100                       help="step for version", metavar="STEP", default=False)
       
   101 
       
   102     
       
   103     
       
   104     set_logging_options(parser)
       
   105 
       
   106     
       
   107     return parser.parse_args() + (parser,)
       
   108 
       
   109 
       
   110 if __name__ == "__main__" :
       
   111 
       
   112     (options, args, parser) = get_options()
       
   113     
       
   114     set_logging(options)
       
   115     get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
       
   116     
       
   117     if len(sys.argv) == 1:
       
   118         abort(None)
       
   119 
       
   120     base_url = options.get("api_url", None)
       
   121     if not base_url:
       
   122         abort("No base url")
       
   123 
       
   124     api_key = options.get("api_key", None)
       
   125     if not api_key:
       
   126         abort("api key missing")
       
   127         
       
   128     pad_id = options.get("pad_id", None)
       
   129     if not pad_id:
       
   130         abort("No pad id")
       
   131 
       
   132     start_date_str = options.get("start_date",None)
       
   133     end_date_str = options.get("end_date", None)
       
   134     duration = options.get("duration", None)
       
   135     
       
   136     start_date = None
       
   137     start_ts = None
       
   138     if start_date_str:
       
   139         start_date = parse_date(start_date_str) 
       
   140         start_ts = time.mktime(start_date.timetuple())*1000
       
   141 
       
   142     end_date = None
       
   143     if end_date_str:
       
   144         end_date = parse_date(end_date_str)
       
   145     elif start_date and duration:
       
   146         end_date = start_date + datetime.timedelta(seconds=duration)
       
   147         
       
   148     if start_date is None or end_date is None:
       
   149         abort("No start date found")
       
   150 
       
   151     end_ts = None
       
   152     if end_date is not None:
       
   153         end_ts = time.mktime(end_date.timetuple())*1000
       
   154 
       
   155     content_file = options.get("content_file", None)
       
   156     
       
   157     if not content_file:
       
   158         abort("No content file")        
       
   159 
       
   160     root = None
       
   161 
       
   162     if content_file.find("http") == 0:
       
   163 
       
   164         get_logger().debug("url : " + content_file) #@UndefinedVariable
       
   165         
       
   166         h = httplib2.Http()
       
   167         resp, content = h.request(content_file)
       
   168         
       
   169         get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
       
   170         
       
   171         project = anyjson.deserialize(content)
       
   172         root = etree.fromstring(project["ldt"])
       
   173                 
       
   174     elif os.path.exists(content_file):
       
   175 
       
   176         doc = etree.parse(content_file)
       
   177         root = doc.getroot()
       
   178             
       
   179     if root is None:
       
   180         abort("No content file, file not found")
       
   181 
       
   182     cutting_name = options.get("name", None)
       
   183     if cutting_name is None:
       
   184         cutting_name = "pad_%s" % pad_id
       
   185 
       
   186     output_format = options.get('format','html')
       
   187     ensemble_parent = None
       
   188                 
       
   189     file_type = None
       
   190     for node in root:
       
   191         if node.tag == "project":
       
   192             file_type = "ldt"
       
   193             break
       
   194         elif node.tag == "head":
       
   195             file_type = "iri"
       
   196             break
       
   197     if file_type is None:
       
   198         abort("Unknown file type")
       
   199 
       
   200     if file_type == "ldt":
       
   201         media_nodes = root.xpath("//media")
       
   202         if len(media_nodes) > 0:
       
   203             media = media_nodes[0]
       
   204         annotations_node = root.find(u"annotations")
       
   205         if annotations_node is None:
       
   206             annotations_node = etree.SubElement(root, u"annotations")
       
   207         content_node = annotations_node.find(u"content")
       
   208         if content_node is None:
       
   209             content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
       
   210         ensemble_parent = content_node
       
   211     elif file_type == "iri":
       
   212         body_node = root.find(u"body")
       
   213         if body_node is None:
       
   214             body_node = etree.SubElement(root, u"body")
       
   215         ensembles_node = body_node.find(u"ensembles")
       
   216         if ensembles_node is None:
       
   217             ensembles_node = etree.SubElement(body_node, u"ensembles")
       
   218         ensemble_parent = ensembles_node
       
   219 
       
   220     if ensemble_parent is None:
       
   221         abort("Can not add cutting")
       
   222 
       
   223     if options.replace:
       
   224         for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
       
   225             if ens.get("id","").startswith(cutting_name):
       
   226                 ensemble_parent.remove(ens)
       
   227                 
       
   228     ensemble = None
       
   229     elements = None
       
   230                 
       
   231     if options.merge:
       
   232         ensemble = ensemble_parent.find(u"ensemble")
       
   233         if ensemble is not None:
       
   234             elements = ensemble.find(u".//elements")                
       
   235         
       
   236     if ensemble is None or elements is None:
       
   237         ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble pad", u"author":u"IRI Web", u"abstract":u"Ensemble Pad"})
       
   238         decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
       
   239     
       
   240         etree.SubElement(decoupage, u"title").text = unicode(cutting_name)
       
   241         etree.SubElement(decoupage, u"abstract").text = unicode(cutting_name)
       
   242     
       
   243         elements = etree.SubElement(decoupage, u"elements")
       
   244 
       
   245 
       
   246     etp_req = EtherpadRequest(base_url, api_key)
       
   247     rev_count = etp_req.getRevisionCount(pad_id)
       
   248     
       
   249     
       
   250     version_range = range(1,rev_count+1, 1)
       
   251     #make sure that teh last version is exported
       
   252     if rev_count not in version_range:
       
   253         version_range.append(rev_count)
       
   254     for rev in version_range:
       
   255         
       
   256         data = None
       
   257         text = ""
       
   258         
       
   259         if output_format == "html":
       
   260             data = etp_req.getHtml(padID=pad_id, rev=rev)
       
   261             text = data.get("html", "")
       
   262         else:
       
   263             data = etp_req.getText(padID=pad_id, rev=rev)
       
   264             text = data.get("text","")
       
   265 
       
   266         pad_ts = data['timestamp']
       
   267         
       
   268         if pad_ts < start_ts:
       
   269             continue
       
   270         
       
   271         if end_ts is not None and pad_ts > end_ts:
       
   272             break
       
   273 
       
   274         pad_dt = datetime.datetime.fromtimestamp(float(pad_ts)/1000.0)
       
   275         pad_ts_rel = pad_ts - start_ts
       
   276         
       
   277         username = None
       
   278         color = ""
       
   279         if 'author' in data:
       
   280             username = data['author']['name'] if ('name' in data['author'] and data['author']['name']) else data['author']['id']
       
   281             color =  data['author']['color'] if ('color' in data['author'] and data['author']['color']) else ""
       
   282         
       
   283         if not username:
       
   284             username = "anon."
       
   285             
       
   286             
       
   287         element = etree.SubElement(elements, u"element" , {u"id":"%s-%s-%d" %(unicode(uuid.uuid4()),unicode(pad_id),rev), u"color":unicode(color), u"author":unicode(username), u"date":unicode(pad_dt.strftime("%Y/%m/%d")), u"begin": unicode(pad_ts_rel), u"dur":u"0", u"src":""})
       
   288         etree.SubElement(element, u"title").text = "%s: %s - rev %d" % (unicode(username), unicode(pad_id), rev)
       
   289         etree.SubElement(element, u"abstract").text = unicode(text)
       
   290         
       
   291         meta_element = etree.SubElement(element, u'meta')
       
   292         etree.SubElement(meta_element, "pad_url").text = etree.CDATA(unicode(etp_req.getPadUrl(pad_id)))
       
   293         etree.SubElement(meta_element, "revision").text = etree.CDATA(unicode(rev))
       
   294 
       
   295     # sort by tc in
       
   296     if options.merge :
       
   297         elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
       
   298     
       
   299     output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
       
   300     
       
   301     if content_file and content_file.find("http") == 0:
       
   302         
       
   303         project["ldt"] = output_data
       
   304         body = anyjson.serialize(project)
       
   305         h = httplib2.Http()
       
   306         resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
       
   307         if resp.status != 200:
       
   308             raise Exception("Error writing content : %d : %s"%(resp.status, resp.reason))                        
       
   309     else:
       
   310         if content_file and os.path.exists(content_file):
       
   311             dest_file_name = content_file 
       
   312         else:
       
   313             dest_file_name = options.filename
       
   314 
       
   315         output = open(dest_file_name, "w")
       
   316         output.write(output_data)
       
   317         output.flush()
       
   318         output.close()
       
   319         
       
   320