#!/usr/bin/env python
# coding=utf-8
"""Export the revisions of an etherpad-lite pad into a content file.

Originally added to the repository as ``script/utils/export_pad.py``.

The script reads a content document (either a local XML file, or a URL
returning a JSON project whose ``"ldt"`` key holds the XML), locates or creates
an ``ensemble/decoupage/elements`` node, appends one ``element`` per exported
pad revision and finally writes the document back (HTTP ``PUT`` for a URL,
in-place overwrite for a local file).
"""

import datetime
import functools
import json
import os.path
import re
import sys
import time
import uuid  #@UnresolvedImport
from optparse import OptionParser  #@UnresolvedImport

try:
    from urllib import urlencode  # Python 2
except ImportError:
    from urllib.parse import urlencode  # Python 3

import anyjson
import httplib2
from dateutil.parser import parse as parse_date
from lxml import etree
from sqlalchemy import Table, Column, BigInteger

from iri_tweet.models import setup_database
from iri_tweet.utils import (set_logging_options, set_logging,
                             get_filter_query, get_logger)

try:
    unicode
except NameError:
    # Python 3: text strings are ``str``.
    unicode = str


def _deserialize_json(content):
    """Decode an HTTP body (bytes or text) and parse it as JSON."""
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    return anyjson.deserialize(content)


class EtherpadRequestException(Exception):
    """Raised when the etherpad API answers with a non-zero ``code``.

    Attributes:
        status: the ``status`` field of the response when present, otherwise
            its ``code`` field (the only error indicator this script reads).
        original_resp: the full decoded response dictionary.
    """

    def __init__(self, original_resp):
        super(EtherpadRequestException, self).__init__(
            original_resp.get("message"))
        # The responses handled below are keyed by "code"/"message"/"data";
        # indexing "status" directly would raise KeyError while raising.
        self.status = original_resp.get("status", original_resp.get("code"))
        self.original_resp = original_resp


class EtherpadRequest(object):
    """Thin client for an etherpad-lite HTTP API.

    Any public attribute that is not defined on the class is turned into an
    API call: ``req.getText(padID="x", rev=3)`` requests
    ``<base_url>/getText?padID=x&rev=3&apikey=<api_key>``.
    """

    def __init__(self, base_url, api_key):
        self.base_url = base_url
        self.api_key = api_key

    def __getattr__(self, name):
        # Only public names map to API actions; private/dunder lookups
        # (copy, pickle, hasattr probes...) must fail normally.
        if name.startswith("_"):
            raise AttributeError(name)
        return functools.partial(self.__action, name)

    def __action(self, action, **kwargs):
        """Call API function *action* and return the ``data`` of the answer.

        Raises:
            EtherpadRequestException: if the response ``code`` is not 0.
        """
        params = dict(kwargs)
        params['apikey'] = self.api_key
        url = "%s/%s?%s" % (self.base_url, action, urlencode(params))

        _, content = httplib2.Http().request(url)
        resp = _deserialize_json(content)

        if resp["code"] == 0:
            return resp["data"]
        raise EtherpadRequestException(resp)

    def getRevisionsCount(self, padID):
        """Return the ``revisions`` value reported for *padID*."""
        return self.__action("getRevisionsCount", padID=padID)["revisions"]

    def getPadUrl(self, padID):
        """Return ``<base_url>/<padID>``."""
        return "%s/%s" % (self.base_url, padID)


def abort(message, parser=None):
    """Print *message* (if any) and the parser help, then exit with status 1.

    Args:
        message: text written to stderr, or None for no message.
        parser: option parser whose help is printed; skipped when None.
    """
    if message is not None:
        sys.stderr.write(message + "\n")
    if parser is not None:
        parser.print_help()
    sys.exit(1)


def get_options():
    """Build the option parser and parse the command line.

    Returns:
        A ``(options, args, parser)`` tuple.
    """
    parser = OptionParser()
    parser.add_option("-u", "--api-url", dest="api_url",
                      help="Base etherpad-lite api url", metavar="API_URL", default=None)
    parser.add_option("-k", "--api-key", dest="api_key",
                      help="etherpad-lite api key", metavar="API_KEY", default=None)
    parser.add_option("-p", "--pad-id", dest="pad_id",
                      help="pad id", metavar="PADID")
    parser.add_option("-s", "--start-date", dest="start_date",
                      help="start date", metavar="START_DATE", default=None)
    parser.add_option("-e", "--end-date", dest="end_date",
                      help="end date", metavar="END_DATE", default=None)
    parser.add_option("-f", "--format", dest="format", type="choice",
                      help="format", metavar="FORMAT", choices=['html', 'text'], default='html')
    parser.add_option("-I", "--content-file", dest="content_file",
                      help="Content file", metavar="CONTENT_FILE")
    parser.add_option("-C", "--color", dest="color",
                      help="Color code", metavar="COLOR", default="16763904")
    parser.add_option("-D", "--duration", dest="duration", type="int",
                      help="Duration", metavar="DURATION", default=None)
    parser.add_option("-n", "--name", dest="name",
                      help="Cutting name", metavar="NAME", default=u"pads")
    parser.add_option("-R", "--replace", dest="replace", action="store_true",
                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
    parser.add_option("-m", "--merge", dest="merge", action="store_true",
                      help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
                      help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
    parser.add_option("-S", "--step", dest="step", type="int",
                      help="step for version", metavar="STEP", default=1)

    set_logging_options(parser)

    return parser.parse_args() + (parser,)


def _to_millis(date):
    """Convert *date* to integer epoch milliseconds via ``time.mktime``."""
    # Kept as an int so that relative offsets serialise as "123", not "123.0".
    return int(time.mktime(date.timetuple()) * 1000)


def _load_content(content_file):
    """Load the content document.

    Returns:
        ``(root, project)``: the XML root element and, for an http source, the
        decoded JSON project it was embedded in (None for a local file).
        ``root`` is None when *content_file* is neither a URL nor an existing
        file.
    """
    if content_file.startswith("http"):
        get_logger().debug("url : " + content_file) #@UndefinedVariable

        resp, content = httplib2.Http().request(content_file)

        get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable

        project = _deserialize_json(content)
        ldt = project["ldt"]
        # lxml refuses text strings that carry an encoding declaration, so
        # always hand it bytes.
        if isinstance(ldt, unicode):
            ldt = ldt.encode("utf-8")
        return etree.fromstring(ldt), project

    if os.path.exists(content_file):
        return etree.parse(content_file).getroot(), None

    return None, None


def _detect_file_type(root):
    """Return "ldt" or "iri" from the first matching child tag, else None."""
    for node in root:
        if node.tag == "project":
            return "ldt"
        if node.tag == "head":
            return "iri"
    return None


def _find_ensemble_parent(root, file_type):
    """Return (creating it if needed) the node that holds the ensembles.

    Returns None for an "ldt" document that contains no ``media`` node.
    """
    if file_type == "ldt":
        media_nodes = root.xpath("//media")
        if len(media_nodes) == 0:
            return None
        media = media_nodes[0]
        annotations_node = root.find(u"annotations")
        if annotations_node is None:
            annotations_node = etree.SubElement(root, u"annotations")
        content_node = annotations_node.find(u"content")
        if content_node is None:
            content_node = etree.SubElement(annotations_node, u"content", id=media.get(u"id"))
        return content_node

    if file_type == "iri":
        body_node = root.find(u"body")
        if body_node is None:
            body_node = etree.SubElement(root, u"body")
        ensembles_node = body_node.find(u"ensembles")
        if ensembles_node is None:
            ensembles_node = etree.SubElement(body_node, u"ensembles")
        return ensembles_node

    return None


def _get_elements_node(ensemble_parent, cutting_name, merge):
    """Return the ``elements`` node that will receive the pad revisions.

    With *merge*, the ``elements`` node of the first existing ensemble is
    reused when there is one; otherwise a new ensemble/decoupage is created.
    """
    if merge:
        ensemble = ensemble_parent.find(u"ensemble")
        if ensemble is not None:
            elements = ensemble.find(u".//elements")
            if elements is not None:
                return elements

    ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble pad", u"author":u"IRI Web", u"abstract":u"Ensemble Pad"})
    decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})

    etree.SubElement(decoupage, u"title").text = unicode(cutting_name)
    etree.SubElement(decoupage, u"abstract").text = unicode(cutting_name)

    return etree.SubElement(decoupage, u"elements")


def _export_revisions(etp_req, pad_id, elements, export_format, start_ts, end_ts, step):
    """Append one ``element`` to *elements* per exported revision of the pad.

    Every *step*-th revision starting at 1 is fetched, plus the last one.
    Revisions older than *start_ts* are skipped; the export stops at the first
    revision newer than *end_ts* (when given). Timestamps are milliseconds.
    """
    rev_count = etp_req.getRevisionsCount(pad_id)

    version_range = list(range(1, rev_count + 1, max(1, step or 1)))
    # make sure that the last version is exported
    if rev_count not in version_range:
        version_range.append(rev_count)

    for rev in version_range:
        if export_format == "html":
            data = etp_req.getHTML(padID=pad_id, rev=rev)
            text = data.get("html", "")
        else:
            data = etp_req.getText(padID=pad_id, rev=rev)
            text = data.get("text", "")

        # NOTE(review): this relies on the API answer carrying "timestamp"
        # (and optionally "author") next to the content - confirm that the
        # targeted etherpad instance provides them.
        pad_ts = data['timestamp']

        if pad_ts < start_ts:
            continue

        if end_ts is not None and pad_ts > end_ts:
            break

        pad_dt = datetime.datetime.fromtimestamp(float(pad_ts) / 1000.0)
        pad_ts_rel = int(pad_ts - start_ts)

        author = data.get('author') or {}
        username = author.get('name') or author.get('id') or "anon."
        color = author.get('color') or ""

        element = etree.SubElement(elements, u"element" , {u"id":"%s-%s-%d" %(unicode(uuid.uuid4()),unicode(pad_id),rev), u"color":unicode(color), u"author":unicode(username), u"date":unicode(pad_dt.strftime("%Y/%m/%d")), u"begin": unicode(pad_ts_rel), u"dur":u"0", u"src":""})
        etree.SubElement(element, u"title").text = "%s: %s - rev %d" % (unicode(username), unicode(pad_id), rev)
        etree.SubElement(element, u"abstract").text = unicode(text)

        meta_element = etree.SubElement(element, u'meta')
        etree.SubElement(meta_element, "pad_url").text = etree.CDATA(unicode(etp_req.getPadUrl(pad_id)))
        etree.SubElement(meta_element, "revision").text = etree.CDATA(unicode(rev))


def _begin_of(node):
    """Sort key: the ``begin`` attribute of *node* as an integer."""
    # float() first so that values such as "123.0" are accepted too.
    return int(float(node.get('begin', 0)))


def _write_content(root, content_file, project):
    """Serialise *root* and write it back to where it was loaded from.

    Raises:
        Exception: if the HTTP ``PUT`` does not answer with status 200.
    """
    output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)

    if content_file.startswith("http"):
        project["ldt"] = output_data.decode("utf-8")
        body = anyjson.serialize(project)
        resp, _ = httplib2.Http().request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
        if resp.status != 200:
            raise Exception("Error writing content : %d : %s"%(resp.status, resp.reason))
    else:
        # A non-http content file was read from disk to get here, so it is
        # the destination. Bytes are written as-is, hence binary mode.
        with open(content_file, "wb") as output:
            output.write(output_data)


def main():
    """Run the export described in the module docstring."""
    (options, args, parser) = get_options()

    set_logging(options)
    get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable

    if len(sys.argv) == 1:
        abort(None, parser)

    base_url = options.api_url
    if not base_url:
        abort("No base url", parser)

    api_key = options.api_key
    if not api_key:
        abort("api key missing", parser)

    pad_id = options.pad_id
    if not pad_id:
        abort("No pad id", parser)

    start_date = None
    start_ts = None
    if options.start_date:
        start_date = parse_date(options.start_date)
        start_ts = _to_millis(start_date)

    end_date = None
    if options.end_date:
        end_date = parse_date(options.end_date)
    elif start_date and options.duration:
        end_date = start_date + datetime.timedelta(seconds=options.duration)

    if start_date is None or start_ts is None:
        abort("No start date found", parser)

    end_ts = None
    if end_date is not None:
        end_ts = _to_millis(end_date)

    content_file = options.content_file
    if not content_file:
        abort("No content file", parser)

    root, project = _load_content(content_file)
    if root is None:
        abort("No content file, file not found", parser)

    cutting_name = options.name
    if cutting_name is None:
        cutting_name = "pad_%s" % pad_id

    file_type = _detect_file_type(root)
    if file_type is None:
        abort("Unknown file type", parser)

    ensemble_parent = _find_ensemble_parent(root, file_type)
    if ensemble_parent is None:
        abort("Can not add cutting", parser)

    if options.replace:
        # NOTE(review): ensembles created by this script get an id starting
        # with "tweet_", so this only matches them when the cutting name
        # starts with "tweet_" - confirm the intended prefix.
        # Iterate over a copy: children are removed during the loop.
        for ens in list(ensemble_parent.iterchildren(tag=u"ensemble")):
            if ens.get("id","").startswith(cutting_name):
                ensemble_parent.remove(ens)

    elements = _get_elements_node(ensemble_parent, cutting_name, options.merge)

    etp_req = EtherpadRequest(base_url, api_key)
    _export_revisions(etp_req, pad_id, elements, options.format,
                      start_ts, end_ts, options.step)

    # sort by tc in
    if options.merge:
        elements[:] = sorted(elements, key=_begin_of)

    _write_content(root, content_file, project)


if __name__ == "__main__":
    main()