script/utils/export_pad.py
changeset 693 2ef837069108
child 891 8628c590f608
equal deleted inserted replaced
692:51072e5e6ea9 693:2ef837069108
       
     1 #!/usr/bin/env python
       
     2 # coding=utf-8
       
     3 
       
     4 from lxml import etree
       
     5 from iri_tweet.models import setup_database
       
     6 from optparse import OptionParser #@UnresolvedImport
       
     7 from sqlalchemy import Table, Column, BigInteger
       
     8 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
       
     9     get_logger)
       
    10 import anyjson
       
    11 import datetime
       
    12 import httplib2
       
    13 import os.path
       
    14 import re
       
    15 import sys
       
    16 import time
       
    17 import uuid #@UnresolvedImport
       
    18 from dateutil.parser import parse as parse_date
       
    19 import json
       
    20 import functools
       
    21 
       
    22 
       
    23 class EtherpadRequestException(Exception):
       
    24     def __init__(self, original_resp):
       
    25         super(EtherpadRequestException, self).__init__(original_resp["message"])
       
    26         self.status = original_resp["status"]
       
    27         self.original_resp = original_resp
       
    28 
       
    29 
       
    30 class EtherpadRequest():
       
    31     
       
    32     def __init__(self, base_url, api_key):
       
    33         self.base_url = base_url
       
    34         self.api_key = api_key
       
    35         self.__request = None
       
    36 
       
    37     def __getattr__(self, name):
       
    38         return functools.partial(self.__action, name)
       
    39 
       
    40     def __action(self, action, **kwargs):
       
    41         url = "%s/%s" % (self.base_url, action)
       
    42         params = dict(kwargs)
       
    43         params['apikey'] = self.api_key
       
    44         
       
    45         r = requests.get(url, params)
       
    46         
       
    47         resp = anyjson.deserialize(r.text)
       
    48         
       
    49         if resp["code"] == 0:
       
    50             return resp["data"]
       
    51         else:
       
    52             raise EtherpadRequestException(resp)
       
    53         
       
    54         return resp
       
    55     
       
    56     def getRevisionsCount(self, padID):
       
    57         f = self.__getattr__("getRevisionsCount")
       
    58         res = f(padID=padID)
       
    59         
       
    60         return res["revisions"]
       
    61     
       
    62     def getPadUrl(self, padID):
       
    63         
       
    64         return "%s/%s" % (self.base_url,padID)
       
    65     
       
    66     
       
    67 
       
    68 def abort(message, parser):
       
    69     if message is not None:
       
    70         sys.stderr.write(message + "\n")
       
    71     parser.print_help()
       
    72     sys.exit(1)
       
    73 
       
    74 def get_options():
       
    75     
       
    76     parser = OptionParser()
       
    77     parser.add_option("-u", "--api-url", dest="api_url",
       
    78                       help="Base etherpad-lite api url", metavar="API_URL", default=None)
       
    79     parser.add_option("-k", "--api-key", dest="api_key",
       
    80                       help="Base etherpad-lite api url", metavar="API_KEY", default=None)
       
    81     parser.add_option("-p", "--pad-id", dest="pad_id",
       
    82                       help="pad id", metavar="PADID")
       
    83     parser.add_option("-s", "--start-date", dest="start_date",
       
    84                       help="start date", metavar="START_DATE", default=None)
       
    85     parser.add_option("-e", "--end-date", dest="end_date",
       
    86                       help="end date", metavar="END_DATE", default=None)
       
    87     parser.add_option("-f", "--format", dest="format", type="choice",
       
    88                       help="format", metavar="FORMAT", choice=['html', 'text'], default='html')
       
    89     parser.add_option("-I", "--content-file", dest="content_file",
       
    90                       help="Content file", metavar="CONTENT_FILE")
       
    91     parser.add_option("-C", "--color", dest="color",
       
    92                       help="Color code", metavar="COLOR", default="16763904")
       
    93     parser.add_option("-D", "--duration", dest="duration", type="int",
       
    94                       help="Duration", metavar="DURATION", default=None)
       
    95     parser.add_option("-n", "--name", dest="name",
       
    96                       help="Cutting name", metavar="NAME", default=u"pads")
       
    97     parser.add_option("-R", "--replace", dest="replace", action="store_true",
       
    98                       help="Replace tweet ensemble", metavar="REPLACE", default=False)
       
    99     parser.add_option("-m", "--merge", dest="merge", action="store_true",
       
   100                       help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
       
   101     parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
       
   102                       help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
       
   103     parser.add_option("-S", "--step", dest="step", type=1,
       
   104                       help="step for version", metavar="STEP", default=False)
       
   105 
       
   106     
       
   107     
       
   108     set_logging_options(parser)
       
   109 
       
   110     
       
   111     return parser.parse_args() + (parser,)
       
   112 
       
   113 
       
   114 if __name__ == "__main__" :
       
   115 
       
   116     (options, args, parser) = get_options()
       
   117     
       
   118     set_logging(options)
       
   119     get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
       
   120     
       
   121     if len(sys.argv) == 1:
       
   122         abort(None)
       
   123 
       
   124     base_url = options.get("api_url", None)
       
   125     if not base_url:
       
   126         abort("No base url")
       
   127 
       
   128     api_key = options.get("api_key", None)
       
   129     if not api_key:
       
   130         abort("api key missing")
       
   131         
       
   132     pad_id = options.get("pad_id", None)
       
   133     if not pad_id:
       
   134         abort("No pad id")
       
   135 
       
   136     start_date_str = options.get("start_date",None)
       
   137     end_date_str = options.get("end_date", None)
       
   138     duration = options.get("duration", None)
       
   139     
       
   140     start_date = None
       
   141     start_ts = None
       
   142     if start_date_str:
       
   143         start_date = parse_date(start_date_str) 
       
   144         start_ts = time.mktime(start_date.timetuple())*1000
       
   145 
       
   146     end_date = None
       
   147     if end_date_str:
       
   148         end_date = parse_date(end_date_str)
       
   149     elif start_date and duration:
       
   150         end_date = start_date + datetime.timedelta(seconds=duration)
       
   151         
       
   152     if start_date is None or ts is None:
       
   153         abort("No start date found")
       
   154 
       
   155     end_ts = None
       
   156     if end_date is not None:
       
   157         end_ts = time.mktime(end_date.timetuple())*1000
       
   158 
       
   159     content_file = options.get("content_file", None)
       
   160     
       
   161     if not content_file:
       
   162         abort("No content file")        
       
   163 
       
   164     root = None
       
   165 
       
   166     if content_file.find("http") == 0:
       
   167 
       
   168         get_logger().debug("url : " + content_file) #@UndefinedVariable
       
   169         
       
   170         h = httplib2.Http()
       
   171         resp, content = h.request(content_file)
       
   172         
       
   173         get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
       
   174         
       
   175         project = anyjson.deserialize(content)
       
   176         root = etree.fromstring(project["ldt"])
       
   177                 
       
   178     elif os.path.exists(content_file):
       
   179 
       
   180         doc = etree.parse(content_file)
       
   181         root = doc.getroot()
       
   182             
       
   183     if root is None:
       
   184         abort("No content file, file not found")
       
   185 
       
   186     cutting_name = options.get("name", None)
       
   187     if cutting_name is None:
       
   188         cutting_name = "pad_%s" % pad_id
       
   189 
       
   190     format = options.get('format','html')
       
   191     ensemble_parent = None
       
   192                 
       
   193     file_type = None
       
   194     for node in root:
       
   195         if node.tag == "project":
       
   196             file_type = "ldt"
       
   197             break
       
   198         elif node.tag == "head":
       
   199             file_type = "iri"
       
   200             break
       
   201     if file_type is None:
       
   202         abort("Unknown file type")
       
   203 
       
   204     if file_type == "ldt":
       
   205         media_nodes = root.xpath("//media")
       
   206         if len(media_nodes) > 0:
       
   207             media = media_nodes[0]
       
   208         annotations_node = root.find(u"annotations")
       
   209         if annotations_node is None:
       
   210             annotations_node = etree.SubElement(root, u"annotations")
       
   211         content_node = annotations_node.find(u"content")
       
   212         if content_node is None:
       
   213             content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
       
   214         ensemble_parent = content_node
       
   215     elif file_type == "iri":
       
   216         body_node = root.find(u"body")
       
   217         if body_node is None:
       
   218             body_node = etree.SubElement(root, u"body")
       
   219         ensembles_node = body_node.find(u"ensembles")
       
   220         if ensembles_node is None:
       
   221             ensembles_node = etree.SubElement(body_node, u"ensembles")
       
   222         ensemble_parent = ensembles_node
       
   223 
       
   224     if ensemble_parent is None:
       
   225         abort("Can not add cutting")
       
   226 
       
   227     if options.replace:
       
   228         for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
       
   229             if ens.get("id","").startswith(cutting_name):
       
   230                 ensemble_parent.remove(ens)
       
   231                 
       
   232     ensemble = None
       
   233     elements = None
       
   234                 
       
   235     if options.merge:
       
   236         ensemble = ensemble_parent.find(u"ensemble")
       
   237         if ensemble is not None:
       
   238             elements = ensemble.find(u".//elements")                
       
   239         
       
   240     if ensemble is None or elements is None:
       
   241         ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble pad", u"author":u"IRI Web", u"abstract":u"Ensemble Pad"})
       
   242         decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
       
   243     
       
   244         etree.SubElement(decoupage, u"title").text = unicode(cutting_name)
       
   245         etree.SubElement(decoupage, u"abstract").text = unicode(cutting_name)
       
   246     
       
   247         elements = etree.SubElement(decoupage, u"elements")
       
   248 
       
   249 
       
   250     etp_req = EtherpadRequest(base_url, api_key)
       
   251     rev_count = et_req.getRevisionCount(pad_id)
       
   252     
       
   253     
       
   254     version_range = range(1,rev_count+1, step)
       
   255     #make sure that teh last version is exported
       
   256     if rev_count not in version_range:
       
   257         version_range.append(rev_count)
       
   258     for rev in version_range:
       
   259         
       
   260         data = None
       
   261         text = ""
       
   262 
       
   263         if format == "html":
       
   264             data = etp_req.getHtml(padID=padID, rev=rev)
       
   265             text = data.get("html", "")
       
   266         else:
       
   267             data = etp_req.getText(padID=padID, rev=rev)
       
   268             text = data.get("text","")
       
   269 
       
   270         pad_ts = data['timestamp']
       
   271         
       
   272         if pad_ts < start_ts:
       
   273             continue
       
   274         
       
   275         if end_ts is not None and pad_ts > end_ts:
       
   276              break
       
   277 
       
   278         pad_dt = datetime.fromtimestamp(float(pad_ts)/1000.0)
       
   279         pad_ts_rel = pad_ts - start_ts
       
   280         
       
   281         username = None
       
   282         color = ""
       
   283         if 'author' in data:
       
   284             username = data['author']['name'] if ('name' in data['author'] and data['author']['name']) else data['author']['id']
       
   285             color =  data['author']['color'] if ('color' in data['author'] and data['author']['color']) else ""
       
   286         
       
   287         if not username:
       
   288             username = "anon."
       
   289             
       
   290             
       
   291         element = etree.SubElement(elements, u"element" , {u"id":"%s-%s-%d" %(unicode(uuid.uuid4()),unicode(pad_id),rev), u"color":unicode(color), u"author":unicode(username), u"date":unicode(pad_dt.strftime("%Y/%m/%d")), u"begin": unicode(pad_ts_rel), u"dur":u"0", u"src":""})
       
   292         etree.SubElement(element, u"title").text = "%s: %s - rev %d" % (unicode(username), unicode(pad_id), rev)
       
   293         etree.SubElement(element, u"abstract").text = unicode(text)
       
   294         
       
   295         meta_element = etree.SubElement(element, u'meta')
       
   296         etree.SubElement(meta_element, "pad_url").text = etree.CDATA(unicode(etp_req.getPadUrl(padID)))
       
   297         etree.SubElement(meta_element, "revision").text = etree.CDATA(unicode(rev))
       
   298 
       
   299     # sort by tc in
       
   300     if options.merge :
       
   301         elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
       
   302     
       
   303     output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
       
   304     
       
   305     if content_file and content_file.find("http") == 0:
       
   306         
       
   307         project["ldt"] = output_data
       
   308         body = anyjson.serialize(project)
       
   309         h = httplib2.Http()
       
   310         resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
       
   311         if resp.status != 200:
       
   312             raise Exception("Error writing content : %d : %s"%(resp.status, resp.reason))                        
       
   313     else:
       
   314         if content_file and os.path.exists(content_file):
       
   315             dest_file_name = content_file 
       
   316         else:
       
   317             dest_file_name = options.filename
       
   318 
       
   319         output = open(dest_file_name, "w")
       
   320         output.write(output_data)
       
   321         output.flush()
       
   322         output.close()
       
   323         
       
   324