script/utils/export_pad.py
changeset 693 2ef837069108
child 891 8628c590f608
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/export_pad.py	Mon Oct 15 17:01:50 2012 +0200
@@ -0,0 +1,324 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+from lxml import etree
+from iri_tweet.models import setup_database
+from optparse import OptionParser #@UnresolvedImport
+from sqlalchemy import Table, Column, BigInteger
+from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
+    get_logger)
+import anyjson
+import datetime
+import httplib2
+import os.path
+import re
+import sys
+import time
+import uuid #@UnresolvedImport
+from dateutil.parser import parse as parse_date
+import json
+import functools
+
+
class EtherpadRequestException(Exception):
    """Raised when the etherpad API answers with a non-zero ``code``.

    Attributes:
        status: value of the ``status`` key of the response when present,
            otherwise the value of its ``code`` key (``None`` if neither
            is there).
        original_resp: the decoded response dict, kept untouched so that
            callers can inspect everything the server sent back.
    """

    def __init__(self, original_resp):
        # The only key the caller is known to have read is "code"; indexing
        # "message"/"status" directly would turn a missing key into a
        # KeyError that hides the real API error.
        message = original_resp.get("message") or "Unknown etherpad error"
        super(EtherpadRequestException, self).__init__(message)
        self.status = original_resp.get("status", original_resp.get("code"))
        self.original_resp = original_resp
+
+
class EtherpadRequest(object):
    """Thin client for an etherpad HTTP API.

    Any public attribute that is not defined on the class is turned into an
    API call named after the attribute: ``client.getText(padID="x", rev=3)``
    requests ``<base_url>/getText`` with ``padID``, ``rev`` and the api key
    as query parameters and returns the ``data`` member of the JSON answer.
    Arguments must be passed by keyword.
    """

    def __init__(self, base_url, api_key):
        """Store the API base url and the api key sent with every call."""
        self.base_url = base_url
        self.api_key = api_key

    def __getattr__(self, name):
        """Return a callable performing the API action called *name*.

        Raises:
            AttributeError: for names starting with an underscore, so that
                private/dunder lookups (copy, pickle, hasattr probes...)
                are not silently turned into HTTP calls.
        """
        if name.startswith("_"):
            raise AttributeError(name)
        return functools.partial(self.__action, name)

    def __action(self, action, **kwargs):
        """Call *action* with *kwargs* as query parameters.

        Returns:
            The ``data`` member of the decoded JSON response.

        Raises:
            EtherpadRequestException: when the response ``code`` is not 0.
        """
        # Local import: the function moved between Python 2 and Python 3.
        try:
            from urllib import urlencode
        except ImportError:
            from urllib.parse import urlencode

        params = dict(kwargs)
        params['apikey'] = self.api_key
        url = "%s/%s?%s" % (self.base_url, action, urlencode(params))

        # httplib2 is the HTTP client already used by the rest of the file.
        _, content = httplib2.Http().request(url)
        if isinstance(content, bytes):
            content = content.decode("utf-8")

        resp = anyjson.deserialize(content)
        if resp["code"] != 0:
            raise EtherpadRequestException(resp)
        return resp["data"]

    def getRevisionsCount(self, padID):
        """Return the ``revisions`` value of the ``getRevisionsCount`` call."""
        return self.__action("getRevisionsCount", padID=padID)["revisions"]

    def getPadUrl(self, padID):
        """Return ``<base_url>/<padID>``; no request is made."""
        return "%s/%s" % (self.base_url, padID)
+    
+    
+
def abort(message, parser=None):
    """Report a fatal command line error and exit with status 1.

    Args:
        message: text written to stderr (followed by a newline); nothing is
            written when it is ``None``.
        parser: option parser whose help is printed before exiting. It is
            optional so that callers passing only a message still terminate
            cleanly instead of failing with a ``TypeError``.

    Raises:
        SystemExit: always, with exit code 1.
    """
    if message is not None:
        sys.stderr.write(message + "\n")
    if parser is not None:
        parser.print_help()
    sys.exit(1)
+
def get_options():
    """Build the command line parser and parse ``sys.argv``.

    Logging options are appended by ``set_logging_options``.

    Returns:
        A ``(options, args, parser)`` tuple: the two values returned by
        ``parser.parse_args()`` followed by the parser itself, so that the
        caller can print the help when validation fails.
    """
    parser = OptionParser()
    parser.add_option("-u", "--api-url", dest="api_url",
                      help="Base etherpad-lite api url", metavar="API_URL", default=None)
    parser.add_option("-k", "--api-key", dest="api_key",
                      help="etherpad-lite api key", metavar="API_KEY", default=None)
    parser.add_option("-p", "--pad-id", dest="pad_id",
                      help="pad id", metavar="PADID")
    parser.add_option("-s", "--start-date", dest="start_date",
                      help="start date", metavar="START_DATE", default=None)
    parser.add_option("-e", "--end-date", dest="end_date",
                      help="end date", metavar="END_DATE", default=None)
    # optparse spells the keyword "choices"; "choice" is rejected when the
    # option is created.
    parser.add_option("-f", "--format", dest="format", type="choice",
                      help="format", metavar="FORMAT", choices=['html', 'text'], default='html')
    parser.add_option("-I", "--content-file", dest="content_file",
                      help="Content file", metavar="CONTENT_FILE")
    parser.add_option("-C", "--color", dest="color",
                      help="Color code", metavar="COLOR", default="16763904")
    parser.add_option("-D", "--duration", dest="duration", type="int",
                      help="Duration", metavar="DURATION", default=None)
    parser.add_option("-n", "--name", dest="name",
                      help="Cutting name", metavar="NAME", default=u"pads")
    parser.add_option("-R", "--replace", dest="replace", action="store_true",
                      help="Replace tweet ensemble", default=False)
    parser.add_option("-m", "--merge", dest="merge", action="store_true",
                      help="merge tweet ensemble, choose the first ensemble", default=False)
    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
                      help="Trigger polemic extended mode", default=False)
    # The step is used as the stride of a range(): it has to be an int and
    # defaults to 1 (every revision).
    parser.add_option("-S", "--step", dest="step", type="int",
                      help="step for version", metavar="STEP", default=1)

    set_logging_options(parser)

    return parser.parse_args() + (parser,)
+
+
if __name__ == "__main__":
    # Export the revisions of one etherpad pad as <element> nodes of a
    # cutting ("decoupage") inside an ldt/iri content document. The document
    # is read from a local file or from an http url and written back to the
    # same place.

    (options, args, parser) = get_options()

    set_logging(options)
    get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable

    if len(sys.argv) == 1:
        abort(None, parser)

    # optparse stores parsed values as attributes; there is no dict-like
    # "get" on the options object.
    base_url = options.api_url
    if not base_url:
        abort("No base url", parser)

    api_key = options.api_key
    if not api_key:
        abort("api key missing", parser)

    pad_id = options.pad_id
    if not pad_id:
        abort("No pad id", parser)

    # Time window, as timestamps in milliseconds.
    start_date = None
    start_ts = None
    if options.start_date:
        start_date = parse_date(options.start_date)
        start_ts = time.mktime(start_date.timetuple()) * 1000

    end_date = None
    if options.end_date:
        end_date = parse_date(options.end_date)
    elif start_date and options.duration:
        end_date = start_date + datetime.timedelta(seconds=options.duration)

    if start_date is None or start_ts is None:
        abort("No start date found", parser)

    end_ts = None
    if end_date is not None:
        end_ts = time.mktime(end_date.timetuple()) * 1000

    content_file = options.content_file
    if not content_file:
        abort("No content file", parser)

    # Load the document to enrich.
    is_remote = content_file.startswith("http")
    root = None
    project = None

    if is_remote:

        get_logger().debug("url : " + content_file) #@UndefinedVariable

        h = httplib2.Http()
        resp, content = h.request(content_file)

        get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable

        project = anyjson.deserialize(content)
        root = etree.fromstring(project["ldt"])

    elif os.path.exists(content_file):

        root = etree.parse(content_file).getroot()

    if root is None:
        abort("No content file, file not found", parser)

    cutting_name = options.name or "pad_%s" % pad_id
    export_format = options.format or 'html'

    # The first "project" or "head" child tells which flavour of document
    # this is.
    file_type = None
    for node in root:
        if node.tag == "project":
            file_type = "ldt"
            break
        elif node.tag == "head":
            file_type = "iri"
            break
    if file_type is None:
        abort("Unknown file type", parser)

    # Find (or create) the node under which ensembles are stored.
    ensemble_parent = None

    if file_type == "ldt":
        media_nodes = root.xpath("//media")
        media = media_nodes[0] if len(media_nodes) > 0 else None
        annotations_node = root.find(u"annotations")
        if annotations_node is None:
            annotations_node = etree.SubElement(root, u"annotations")
        content_node = annotations_node.find(u"content")
        if content_node is None:
            # Only set the id when a media node actually provides one.
            content_attrs = {}
            if media is not None and media.get(u"id") is not None:
                content_attrs[u"id"] = media.get(u"id")
            content_node = etree.SubElement(annotations_node, u"content", content_attrs)
        ensemble_parent = content_node
    elif file_type == "iri":
        body_node = root.find(u"body")
        if body_node is None:
            body_node = etree.SubElement(root, u"body")
        ensembles_node = body_node.find(u"ensembles")
        if ensembles_node is None:
            ensembles_node = etree.SubElement(body_node, u"ensembles")
        ensemble_parent = ensembles_node

    if ensemble_parent is None:
        abort("Can not add cutting", parser)

    if options.replace:
        # Iterate over a copy: children are removed during the loop.
        # NOTE(review): ensembles created below get an id starting with
        # "tweet_", so this only matches when the cutting name does too -
        # confirm which prefix is intended.
        for ens in list(ensemble_parent.iterchildren(tag=u"ensemble")):
            if ens.get("id", "").startswith(cutting_name):
                ensemble_parent.remove(ens)

    ensemble = None
    elements = None

    if options.merge:
        ensemble = ensemble_parent.find(u"ensemble")
        if ensemble is not None:
            elements = ensemble.find(u".//elements")

    if ensemble is None or elements is None:
        ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble pad", u"author":u"IRI Web", u"abstract":u"Ensemble Pad"})
        decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})

        etree.SubElement(decoupage, u"title").text = unicode(cutting_name)
        etree.SubElement(decoupage, u"abstract").text = unicode(cutting_name)

        elements = etree.SubElement(decoupage, u"elements")

    etp_req = EtherpadRequest(base_url, api_key)
    rev_count = etp_req.getRevisionsCount(pad_id)

    # A missing or non positive step would break range().
    step = options.step if options.step and options.step > 0 else 1

    version_range = list(range(1, rev_count + 1, step))
    # make sure that the last version is exported
    if rev_count not in version_range:
        version_range.append(rev_count)

    for rev in version_range:

        if export_format == "html":
            data = etp_req.getHtml(padID=pad_id, rev=rev)
            text = data.get("html", "")
        else:
            data = etp_req.getText(padID=pad_id, rev=rev)
            text = data.get("text", "")

        # NOTE(review): this relies on the server returning "timestamp" (and
        # optionally "author") next to the content - confirm that the
        # deployed API does so.
        pad_ts = data['timestamp']

        if pad_ts < start_ts:
            continue

        if end_ts is not None and pad_ts > end_ts:
            break

        pad_dt = datetime.datetime.fromtimestamp(float(pad_ts) / 1000.0)
        # Stored as an integer so that the sort below can parse it back.
        pad_ts_rel = int(pad_ts - start_ts)

        author = data.get('author') or {}
        username = author.get('name') or author.get('id') or "anon."
        color = author.get('color') or ""

        element = etree.SubElement(elements, u"element" , {u"id":"%s-%s-%d" %(unicode(uuid.uuid4()),unicode(pad_id),rev), u"color":unicode(color), u"author":unicode(username), u"date":unicode(pad_dt.strftime("%Y/%m/%d")), u"begin": unicode(pad_ts_rel), u"dur":u"0", u"src":""})
        etree.SubElement(element, u"title").text = "%s: %s - rev %d" % (unicode(username), unicode(pad_id), rev)
        etree.SubElement(element, u"abstract").text = unicode(text)

        meta_element = etree.SubElement(element, u'meta')
        etree.SubElement(meta_element, "pad_url").text = etree.CDATA(unicode(etp_req.getPadUrl(pad_id)))
        etree.SubElement(meta_element, "revision").text = etree.CDATA(unicode(rev))

    # sort by tc in
    if options.merge:
        # float() first: tolerates "begin" values written with a decimal part.
        elements[:] = sorted(elements, key=lambda n: int(float(n.get('begin'))))

    output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)

    if is_remote:

        # tostring() returned utf-8 bytes; hand text to the json serializer.
        project["ldt"] = output_data.decode("utf-8")
        body = anyjson.serialize(project)
        h = httplib2.Http()
        resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
        if resp.status != 200:
            raise Exception("Error writing content : %d : %s"%(resp.status, resp.reason))
    else:
        # A non remote document was loaded from content_file, so the path
        # exists; binary mode because output_data is already encoded.
        with open(content_file, "wb") as output:
            output.write(output_data)
+        
+