src/cm/converters/oo_converters.py
changeset 0 40c8f766c9b8
child 24 c8a95e540b79
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cm/converters/oo_converters.py	Mon Nov 23 15:14:29 2009 +0100
@@ -0,0 +1,437 @@
+# warning : oo server autolaunch is tricky
+# make sure .qt .kde .openoffice.org2 should be writable in home directory
+# for instance, if working user is www-data
+#   mkdir /var/www/.openoffice.org2 ; chown www-data:www-data /var/www/.openoffice.org2
+#   mkdir /var/www/.qt ; chown www-data:www-data /var/www/.qt
+#   mkdir /var/www/.kde ; chown www-data:www-data /var/www/.kde
+
+UNO_IMPORT = True
+
+if UNO_IMPORT:
+    import uno
+
+# old ubuntu bug left for the record
+#    print "#### Uno import failed ! #### "
+#    print "#### https://bugs.launchpad.net/ubuntu/+source/openoffice.org2/+bug/139077 #### "
+#    print "#### launch : sudo ldconfig -v /usr/lib/openoffice/program #### "
+
+from cm.utils.thread import synchronized, daemonize
+if UNO_IMPORT:
+    from com.sun.star.beans import PropertyValue
+from datetime import datetime
+from subprocess import Popen,call
+from tempfile import mkstemp,mkdtemp
+
+if UNO_IMPORT:
+    from unohelper import systemPathToFileUrl, absolutize
+
+from xml.dom.minidom import parseString
+import cStringIO
+import chardet
+import sys
+import magic
+import os,re
+import random
+import threading
+import time
+import logging
+
+CONN_STRING = "uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext" 
+
+KILL = 'killall -KILL xvfb-run ; killall -KILL soffice; killall -KILL soffice.bin; killall -KILL Xvfb'
+RM = 'rm -f /tmp/.X99-lock'
+LAUNCH = 'xvfb-run soffice -headless "-accept=socket,port=2002;urp;"'
+
+# xvfb-run soffice -headless "-accept=socket,port=2002;urp;";
+#  soffice "-accept=socket,port=2002;urp;";
+
+
+ms = magic.open(magic.MAGIC_NONE)
+ms.load()
+
+def is_text(buffer):
+    type = ms.buffer(buffer)
+    return ' text, ' in type
+
+def fix_text_encoding(buffer, to_encoding = 'utf-8'):
+    detected = chardet.detect(buffer)
+    encoding = detected['encoding']
+    if encoding != to_encoding:
+        return buffer.decode(encoding).encode(to_encoding)
+    return buffer
+# $$$ RBE TODO fix_content a call should be made before oo_convert call when importing text file with non utf-8 encoding todo test that to make it crash     
+def fix_content(buffer):
+    """
+    Fix content fixes :
+    - encoding to utf8 to txt files
+    """
+    try:
+        if is_text(buffer):
+            return fix_text_encoding(buffer)
+        return buffer
+    except:
+        return buffer
+
+processing = 0
+
+# timeout : kill oo
+PROCESSING_TIMEOUT = 20.0
+
+def oo_process_controller(code):
+    """
+    If 'code' process is still active : kill oo 
+    """
+    global processing
+    logging.info('oo_process_controller')
+    if processing == code:
+        logging.error('--> oo_process_controller : killing !')
+        kill_oo()        
+
+def kill_oo():
+    logging.info('killing')
+    p = Popen(KILL, shell=True)
+    sts = os.waitpid(p.pid, 0)
+    p = Popen(RM, shell=True)
+    sts = os.waitpid(p.pid, 0)
+
+def launch_oo():
+    logging.info('launching')
+    p = Popen(LAUNCH, shell=True)
+
+def kill_and_relaunch_oo():
+    kill_oo()
+    launch_oo()
+
+get_connection_lock = threading.RLock() 
+
+def start_processing():
+    global processing
+    logging.info('start_processing')
+    code = random.random()
+    processing = code
+    t = threading.Timer(PROCESSING_TIMEOUT, oo_process_controller, args = [code,])
+    t.start()
+
+def end_processing():
+    logging.info('end_processing')
+    global processing
+    processing = 0 
+    
+@synchronized(get_connection_lock)
+def get_connection(retry = 2):
+    while retry > 0:
+        try:
+            localContext = uno.getComponentContext()
+            
+            resolver = localContext.ServiceManager.createInstanceWithContext(
+                            "com.sun.star.bridge.UnoUrlResolver", localContext )
+            
+            ctx = resolver.resolve(CONN_STRING)
+            return ctx
+        except:
+            retry -= 1
+            kill_and_relaunch_oo()
+            time.sleep(8)
+            
+    raise Exception('could not launch oo, please read README.txt section Openoffice for troubleshooting')
+
+def get_desktop():
+    ctx = get_connection()
+    smgr = ctx.ServiceManager    
+    # get the central desktop object
+    desktop = smgr.createInstanceWithContext( "com.sun.star.frame.Desktop",ctx)
+    
+    return desktop
+
+
+class FmtList:    
+    def __init__(self):
+        self._list = []
+
+    def add(self, name, extension, summary, filter, export = False, mimetype = None):
+        dd = {
+              'name' : name,
+              'extension' : extension,
+              'summary' : summary,
+              'filter' : filter,
+              'export' : export,
+              'mimetype' : mimetype,
+              }
+        self._list.append(dd)
+        
+    def get_filter_by_summary(self, value):
+        return self.get_filter_by('summary', value)
+
+    def get_filter_by_name(self, value):
+        return self.get_filter_by('name', value)
+
+    def get_filter_by(self, name, value):
+        res = self.get_by(name, value)
+        if res:
+            return res['filter']
+        return None
+
+    def get_by_name(self, value):
+        return self.get_by('name', value)
+    
+    def get_by(self, name, value):
+        for fmt in self._list:
+            if fmt[name] == value:
+                return fmt
+        return None
+    
+    def get_export_formats_tuple(self):
+        return [(f['summary'],f['name']) for f in self._list if f['export']]
+
+    def ids_by_summary(self):
+        return self.ids_by('summary')
+    
+    def ids_by(self, name):
+        return dict([(r[name],r['name']) for r in self._list])
+
+fmts = None
+if UNO_IMPORT:
+    fmts = FmtList()
+    fmts.add('bib', 'bib', 'BibTeX', 'BibTeX_Writer')
+    fmts.add('doc', 'doc', 'Microsoft Word 97/2000/XP', 'MS Word 97', True, 'application/msword')
+    fmts.add('doc6', 'doc', 'Microsoft Word 6.0', 'MS WinWord 6.0')
+    fmts.add('doc95', 'doc', 'Microsoft Word 95', 'MS Word 95')
+    fmts.add('docbook', 'xml', 'DocBook', 'DocBook File')
+    fmts.add('html', 'html', 'HTML Document (OpenOffice.org Writer)', 'HTML (StarWriter)')
+    fmts.add('odt', 'odt', 'Open Document Text', 'writer8', True, 'application/vnd.oasis.opendocument.text')
+    fmts.add('ott', 'ott', 'Open Document Text', 'writer8_template')
+    fmts.add('ooxml', 'xml', 'Microsoft Office Open XML', 'MS Word 2003 XML')
+    fmts.add('pdb', 'pdb', 'AportisDoc (Palm)', 'AportisDoc Palm DB')
+    fmts.add('pdf', 'pdf', 'Portable Document Format', 'writer_pdf_Export', True, 'application/pdf')
+    fmts.add('psw', 'psw', 'Pocket Word', 'PocketWord File')
+    fmts.add('rtf', 'rtf', 'Rich Text Format', 'Rich Text Format', True, 'application/rtf')
+    fmts.add('latex', 'ltx', 'LaTeX 2e', 'LaTeX_Writer')
+    fmts.add('sdw', 'sdw', 'StarWriter 5.0', 'StarWriter 5.0')
+    fmts.add('sdw4', 'sdw', 'StarWriter 4.0', 'StarWriter 4.0')
+    fmts.add('sdw3', 'sdw', 'StarWriter 3.0', 'StarWriter 3.0')
+    fmts.add('stw', 'stw', 'Open Office.org 1.0 Text Document Template', 'writer_StarOffice_XML_Writer_Template')
+    fmts.add('sxw', 'sxw', 'Open Office.org 1.0 Text Document', 'StarOffice XML (Writer)')
+    fmts.add('text', 'txt', 'Text Encoded', 'Text (encoded)', True, 'application/txt')
+    fmts.add('txt', 'txt', 'Plain Text', 'Text')
+    fmts.add('vor', 'vor', 'StarWriter 5.0 Template', 'StarWriter 5.0 Vorlage/Template')
+    fmts.add('vor4', 'vor', 'StarWriter 4.0 Template', 'StarWriter 4.0 Vorlage/Template')
+    fmts.add('vor3', 'vor', 'StarWriter 3.0 Template', 'StarWriter 3.0 Vorlage/Template')
+    fmts.add('xhtml', 'html', 'XHTML Document', 'XHTML Writer File')
+
+THE_OUTDIR = "outdir"
+THE_OUTFILE = "outfile"
+
+THE_INDIR = "indir"
+THE_INFILE = "infile"
+
+def fix_img_path(html,xhtml,imgs):
+    """
+    imgs : name --> path
+    """
+    finder_re = 'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"'
+    len_res_html = len(re.findall(finder_re,html,re.IGNORECASE))
+    len_res_xhtml = len(re.findall(finder_re,xhtml,re.IGNORECASE))
+    res_html = re.finditer(finder_re,html,re.IGNORECASE)
+    res_xhtml = re.finditer(finder_re,xhtml,re.IGNORECASE)
+    result = []
+    last_index = 0
+    for match_xhtml in res_xhtml:
+        img_path = '' 
+        try:
+            match_html = res_html.next()
+            if match_html:
+                img_name = match_html.group(1)
+                img_path = imgs[img_name]
+        except StopIteration:
+            # TODO : report pb
+            pass 
+        offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1))
+        result.append(xhtml[last_index:match_xhtml.start() + offset - 1])
+        result.append(img_path)
+        last_index = match_xhtml.end() - 1 # -1 because trailing "
+    result.append(xhtml[last_index:len(xhtml)])
+    return u''.join(result)
+
+  
+def extract_css_body(xhtml):
+    dom = parseString(xhtml.encode('utf8'))
+    style = dom.getElementsByTagName("style")[0].toxml()
+    body = dom.getElementsByTagName("body")[0].toxml()
+    # cleanup initial/final tags
+    style_clean = style[style.find('>')+1:style.rfind('</')]
+    body_clean = body[body.find('>')+1:body.rfind('</')]
+    return style_clean,body_clean
+    
+convert_lock = threading.RLock() 
+
+def combine_css_body(body, css):
+    return """
+<html xmlns="http://www.w3.org/1999/xhtml">
+    <head>
+        <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+        <style type="text/css">
+            %s
+        </style>    
+    </head>
+    <body>
+        %s
+    </body>
+</html>
+""" %(css,body)
+
+def to_string(input):
+    if type(input) == unicode:
+        input = input.encode('utf8')
+    return input
+    
+@synchronized(convert_lock)    
+def convert_html(input, format_name, images = None):
+    out_filter = fmts.get_filter_by_name(format_name)    
+    if not out_filter:
+        raise Exception("Unsupported format name %s" %(format_name)) 
+    infile = None
+    outfile = None
+    out_f = None
+    try:
+        desktop = get_desktop()
+        
+        start_processing()
+                        
+        # create in/out files
+        temp_dir = mkdtemp(prefix="cm_")
+        
+        # in
+        indir_name = os.path.join(temp_dir, THE_INDIR)
+        os.mkdir(indir_name)
+        infile_name = os.path.join(indir_name, THE_INFILE  + '.html')
+        
+        # out
+        outdir_name = os.path.join(temp_dir, THE_OUTDIR)
+        os.mkdir(outdir_name)
+        outfile_name = os.path.join(outdir_name, THE_OUTFILE)
+
+        # write infile 
+        infile = open(infile_name,'w')
+        input = to_string(input)
+        infile.write(input)
+        infile.close()
+
+        # fix perms
+        # TODO: group permission should suffice
+        os.chmod(temp_dir, 0755) # read    
+        os.chmod(indir_name, 0755) # read
+        os.chmod(infile_name, 0755) # read
+        os.chmod(outdir_name, 0777) # read / write
+
+        inProps = PropertyValue( "Hidden" , 0 , True, 0 ),        
+        doc = desktop.loadComponentFromURL( "private:factory/swriter", "_blank", 0, inProps )
+        text   = doc.Text
+        cursor = text.createTextCursor()
+
+        fileUrl = systemPathToFileUrl(infile_name)
+        cursor.insertDocumentFromURL(fileUrl, ())
+        
+        properties= (PropertyValue("Hidden", 0, True, 0), PropertyValue("FilterName", 0, out_filter, 0))        
+        doc.storeToURL('file://%s' %outfile_name,tuple(properties))
+        
+        out_f = open(outfile_name,'r')
+
+        output = out_f.read()
+        return output
+    finally:
+        end_processing()
+        try:
+            if out_f:
+                out_f.close()
+            if infile:
+                infile.close()
+            top = temp_dir
+            for root, dirs, files in os.walk(top, topdown=False):
+                for name in files:
+                    os.remove(os.path.join(root, name))
+                for name in dirs:
+                    os.rmdir(os.path.join(root, name))
+            os.rmdir(top)            
+        except:
+            # TODO : warn
+            pass
+
+@synchronized(convert_lock)    
+def convert(input, format_name, unicode = False):
+    
+    logging.info('convert')
+    out_filter = fmts.get_filter_by_name(format_name)    
+    if not out_filter:
+        raise Exception("Unsupported format name %s" %(format_name)) 
+    infile = None
+    outfile = None
+    out_f = None
+    try:
+        desktop = get_desktop() 
+	    
+        start_processing()
+    
+        # create in/out files
+        temp_dir = mkdtemp(prefix="cm_")
+        
+        # in
+        indir_name = os.path.join(temp_dir, THE_INDIR)
+        os.mkdir(indir_name)
+        infile_name = os.path.join(indir_name, THE_INFILE)
+        
+        # out
+        outdir_name = os.path.join(temp_dir, THE_OUTDIR)
+        os.mkdir(outdir_name)
+        outfile_name = os.path.join(outdir_name, THE_OUTFILE)
+
+        # write infile 
+        infile = open(infile_name,'w')
+        input = to_string(input)
+        infile.write(input)
+        infile.close()
+
+        # fix perms
+        # TODO group permission should suffice
+        os.chmod(temp_dir, 0755) # read        
+        os.chmod(indir_name, 0755) # read        
+        os.chmod(infile_name, 0755) # read
+        os.chmod(outdir_name, 0777) # read / write
+                
+        properties = PropertyValue("Hidden", 0, True, 0),
+                       
+        #import pdb;pdb.set_trace()   
+        doc=desktop.loadComponentFromURL("file://%s" % infile_name, "_blank", 0, properties)
+        
+        properties= (PropertyValue("Hidden", 0, True, 0), PropertyValue("FilterName", 0, out_filter, 0))        
+        doc.storeToURL('file://%s' %outfile_name,tuple(properties))
+        
+        out_f = open(outfile_name,'r')
+
+        output = out_f.read()
+        # load other files (useful only for html)
+        image_names = [name for name in os.listdir(outdir_name) if name != THE_OUTFILE]
+        img_res = [] 
+        for image_name in image_names:
+              img_res.append(os.path.join(outdir_name, image_name))
+        if unicode:
+            output = output.decode('utf8')
+        return output,img_res
+    finally:
+        end_processing()
+        try:
+            if out_f:
+                out_f.close()
+            if infile:
+                infile.close()
+# Do not remove dir: we only return images path to avoid 
+# mem overload             
+#            top = temp_dir
+#            for root, dirs, files in os.walk(top, topdown=False):
+#                for name in files:
+#                    os.remove(os.path.join(root, name))
+#                for name in dirs:
+#                    os.rmdir(os.path.join(root, name))
+#            os.rmdir(top)            
+        except:
+            # TODO : warn
+            pass