# Reconstructed from Mercurial changeset 40c8f766c9b8
# (src/cm/converters/oo_converters.py, Mon Nov 23 15:14:29 2009 +0100).
"""
Document conversion through a headless OpenOffice.org server driven over UNO.

The module (re)launches ``soffice`` on demand, connects to it through the
socket described by CONN_STRING and exposes two entry points:

- convert_html(): turn an HTML fragment into one of the formats listed in
  ``fmts``;
- convert(): turn an arbitrary document into one of the formats listed in
  ``fmts`` and return the paths of the side files (images) that were produced.
"""

# warning : oo server autolaunch is tricky
# make sure .qt .kde .openoffice.org2 are writable in the home directory
# for instance, if working user is www-data
# mkdir /var/www/.openoffice.org2 ; chown www-data:www-data /var/www/.openoffice.org2
# mkdir /var/www/.qt ; chown www-data:www-data /var/www/.qt
# mkdir /var/www/.kde ; chown www-data:www-data /var/www/.kde

UNO_IMPORT = True

if UNO_IMPORT:
    import uno

# old ubuntu bug left for the record
# print "#### Uno import failed ! #### "
# print "#### https://bugs.launchpad.net/ubuntu/+source/openoffice.org2/+bug/139077 #### "
# print "#### launch : sudo ldconfig -v /usr/lib/openoffice/program #### "

from cm.utils.thread import synchronized, daemonize
if UNO_IMPORT:
    from com.sun.star.beans import PropertyValue
from datetime import datetime
from subprocess import Popen,call
from tempfile import mkstemp,mkdtemp

if UNO_IMPORT:
    from unohelper import systemPathToFileUrl, absolutize

from xml.dom.minidom import parseString
import cStringIO
import chardet
import sys
import magic
import os,re
import random
import shutil
import stat
import threading
import time
import logging

CONN_STRING = "uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext"

KILL = 'killall -KILL xvfb-run ; killall -KILL soffice; killall -KILL soffice.bin; killall -KILL Xvfb'
RM = 'rm -f /tmp/.X99-lock'
LAUNCH = 'xvfb-run soffice -headless "-accept=socket,port=2002;urp;"'

# alternative launch commands, kept for the record:
# xvfb-run soffice -headless "-accept=socket,port=2002;urp;";
# soffice "-accept=socket,port=2002;urp;";

# Permission bits applied to the conversion workspace (same values as the
# former octal literals, spelled with `stat` so they parse on any Python).
_MODE_READABLE = (stat.S_IRWXU
                  | stat.S_IRGRP | stat.S_IXGRP
                  | stat.S_IROTH | stat.S_IXOTH)                 # 0755
_MODE_WORLD_WRITABLE = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO  # 0777


ms = magic.open(magic.MAGIC_NONE)
ms.load()

def is_text(buffer):
    """
    Return True when libmagic's description of 'buffer' contains ' text, '.
    """
    description = ms.buffer(buffer)
    return ' text, ' in description

def fix_text_encoding(buffer, to_encoding = 'utf-8'):
    """
    Re-encode 'buffer' to 'to_encoding' using the encoding guessed by chardet.

    The buffer is returned untouched when chardet cannot guess an encoding
    (it then reports None, which used to crash in decode()) or when the
    guessed encoding already is 'to_encoding'.
    """
    detected = chardet.detect(buffer)
    encoding = detected['encoding']
    if not encoding or encoding.lower() == to_encoding.lower():
        return buffer
    return buffer.decode(encoding).encode(to_encoding)

# $$$ RBE TODO: fix_content should be called before the oo convert call when
# importing a text file with a non utf-8 encoding; test that (try to make it crash)
def fix_content(buffer):
    """
    Fix content fixes :
    - encoding to utf8 to txt files

    Best effort: on any error the original buffer is returned unchanged.
    """
    try:
        if is_text(buffer):
            return fix_text_encoding(buffer)
        return buffer
    except Exception:
        # deliberate best effort, but leave a trace and let
        # KeyboardInterrupt / SystemExit propagate
        logging.exception('fix_content failed, content left untouched')
        return buffer

# code of the conversion currently running, 0 when idle
processing = 0

# timeout : kill oo
PROCESSING_TIMEOUT = 20.0

def oo_process_controller(code):
    """
    If 'code' process is still active : kill oo
    """
    global processing
    logging.info('oo_process_controller')
    if processing == code:
        logging.error('--> oo_process_controller : killing !')
        kill_oo()

def kill_oo():
    """
    Run the KILL then the RM shell commands, waiting for each to finish.
    """
    logging.info('killing')
    # call() spawns the command and waits for it, replacing the former
    # Popen() + os.waitpid() pairs
    call(KILL, shell=True)
    call(RM, shell=True)

def launch_oo():
    """
    Start the LAUNCH shell command without waiting for it.
    """
    logging.info('launching')
    Popen(LAUNCH, shell=True)

def kill_and_relaunch_oo():
    """
    Kill any running office server, then start a new one.
    """
    kill_oo()
    launch_oo()

get_connection_lock = threading.RLock()

def start_processing():
    """
    Mark a conversion as running and arm the timeout watchdog.

    A random code identifies the conversion; after PROCESSING_TIMEOUT seconds
    oo_process_controller() kills the office server if that same code is still
    the active one.
    """
    global processing
    logging.info('start_processing')
    code = random.random()
    processing = code
    t = threading.Timer(PROCESSING_TIMEOUT, oo_process_controller, args = [code,])
    t.start()

def end_processing():
    """
    Mark the current conversion as finished (disarms the pending watchdog).
    """
    global processing
    logging.info('end_processing')
    processing = 0

@synchronized(get_connection_lock)
def get_connection(retry = 2):
    """
    Return the remote component context resolved from CONN_STRING.

    On failure the office server is killed and relaunched, the function sleeps
    8 seconds and tries again, up to 'retry' attempts in total.
    Raises Exception when every attempt failed.
    """
    while retry > 0:
        try:
            localContext = uno.getComponentContext()

            resolver = localContext.ServiceManager.createInstanceWithContext(
                "com.sun.star.bridge.UnoUrlResolver", localContext )

            ctx = resolver.resolve(CONN_STRING)
            return ctx
        except Exception:
            # was a bare except: do not swallow KeyboardInterrupt / SystemExit
            logging.exception('could not connect to oo, relaunching')
            retry -= 1
            kill_and_relaunch_oo()
            time.sleep(8)

    raise Exception('could not launch oo, please read README.txt section Openoffice for troubleshooting')

def get_desktop():
    """
    Return the com.sun.star.frame.Desktop service of the connected office.
    """
    ctx = get_connection()
    smgr = ctx.ServiceManager
    # get the central desktop object
    desktop = smgr.createInstanceWithContext( "com.sun.star.frame.Desktop",ctx)

    return desktop


class FmtList(object):
    """
    Registry of document formats; each format is a dict with the keys
    'name', 'extension', 'summary', 'filter', 'export' and 'mimetype'.
    """
    def __init__(self):
        self._list = []

    def add(self, name, extension, summary, filter, export = False, mimetype = None):
        """Register a format."""
        dd = {
           'name' : name,
           'extension' : extension,
           'summary' : summary,
           'filter' : filter,
           'export' : export,
           'mimetype' : mimetype,
           }
        self._list.append(dd)

    def get_filter_by_summary(self, value):
        """Return the filter of the first format whose summary is 'value', else None."""
        return self.get_filter_by('summary', value)

    def get_filter_by_name(self, value):
        """Return the filter of the first format whose name is 'value', else None."""
        return self.get_filter_by('name', value)

    def get_filter_by(self, name, value):
        """Return the filter of the first format whose key 'name' equals 'value', else None."""
        res = self.get_by(name, value)
        if res:
            return res['filter']
        return None

    def get_by_name(self, value):
        """Return the first format dict whose name is 'value', else None."""
        return self.get_by('name', value)

    def get_by(self, name, value):
        """Return the first format dict whose key 'name' equals 'value', else None."""
        for fmt in self._list:
            if fmt[name] == value:
                return fmt
        return None

    def get_export_formats_tuple(self):
        """Return the list of (summary, name) pairs of the formats flagged 'export'."""
        return [(f['summary'],f['name']) for f in self._list if f['export']]

    def ids_by_summary(self):
        """Return a dict mapping each summary to a format name."""
        return self.ids_by('summary')

    def ids_by(self, name):
        """
        Return a dict mapping the value of key 'name' to the format name.
        When several formats share the same value, the last registered wins.
        """
        return dict([(r[name],r['name']) for r in self._list])

fmts = None
if UNO_IMPORT:
    fmts = FmtList()
    fmts.add('bib', 'bib', 'BibTeX', 'BibTeX_Writer')
    fmts.add('doc', 'doc', 'Microsoft Word 97/2000/XP', 'MS Word 97', True, 'application/msword')
    fmts.add('doc6', 'doc', 'Microsoft Word 6.0', 'MS WinWord 6.0')
    fmts.add('doc95', 'doc', 'Microsoft Word 95', 'MS Word 95')
    fmts.add('docbook', 'xml', 'DocBook', 'DocBook File')
    fmts.add('html', 'html', 'HTML Document (OpenOffice.org Writer)', 'HTML (StarWriter)')
    fmts.add('odt', 'odt', 'Open Document Text', 'writer8', True, 'application/vnd.oasis.opendocument.text')
    fmts.add('ott', 'ott', 'Open Document Text', 'writer8_template')
    fmts.add('ooxml', 'xml', 'Microsoft Office Open XML', 'MS Word 2003 XML')
    fmts.add('pdb', 'pdb', 'AportisDoc (Palm)', 'AportisDoc Palm DB')
    fmts.add('pdf', 'pdf', 'Portable Document Format', 'writer_pdf_Export', True, 'application/pdf')
    fmts.add('psw', 'psw', 'Pocket Word', 'PocketWord File')
    fmts.add('rtf', 'rtf', 'Rich Text Format', 'Rich Text Format', True, 'application/rtf')
    fmts.add('latex', 'ltx', 'LaTeX 2e', 'LaTeX_Writer')
    fmts.add('sdw', 'sdw', 'StarWriter 5.0', 'StarWriter 5.0')
    fmts.add('sdw4', 'sdw', 'StarWriter 4.0', 'StarWriter 4.0')
    fmts.add('sdw3', 'sdw', 'StarWriter 3.0', 'StarWriter 3.0')
    fmts.add('stw', 'stw', 'Open Office.org 1.0 Text Document Template', 'writer_StarOffice_XML_Writer_Template')
    fmts.add('sxw', 'sxw', 'Open Office.org 1.0 Text Document', 'StarOffice XML (Writer)')
    fmts.add('text', 'txt', 'Text Encoded', 'Text (encoded)', True, 'application/txt')
    fmts.add('txt', 'txt', 'Plain Text', 'Text')
    fmts.add('vor', 'vor', 'StarWriter 5.0 Template', 'StarWriter 5.0 Vorlage/Template')
    fmts.add('vor4', 'vor', 'StarWriter 4.0 Template', 'StarWriter 4.0 Vorlage/Template')
    fmts.add('vor3', 'vor', 'StarWriter 3.0 Template', 'StarWriter 3.0 Vorlage/Template')
    fmts.add('xhtml', 'html', 'XHTML Document', 'XHTML Writer File')

THE_OUTDIR = "outdir"
THE_OUTFILE = "outfile"

THE_INDIR = "indir"
THE_INFILE = "infile"

def fix_img_path(html,xhtml,imgs):
    """
    Rewrite the local 'src' attributes of 'xhtml'.

    The n-th local src found in 'xhtml' is replaced by imgs[name], where name
    is the value of the n-th local src found in 'html'. When 'html' holds
    fewer srcs than 'xhtml', the extra ones are replaced by an empty string.
    Values starting with http/https are not matched by the pattern.

    imgs : name --> path (KeyError if a name found in 'html' is missing)
    """
    finder_re = r'src[\s]*=[\s]*"((?:(?!https?))[^"]*)"'
    html_names = [m.group(1) for m in re.finditer(finder_re, html, re.IGNORECASE)]
    result = []
    last_index = 0
    for index, match_xhtml in enumerate(re.finditer(finder_re, xhtml, re.IGNORECASE)):
        if index < len(html_names):
            img_path = imgs[html_names[index]]
        else:
            logging.warning('fix_img_path: no html image matching xhtml image #%d', index)
            img_path = ''
        # keep everything up to and including the opening quote ...
        result.append(xhtml[last_index:match_xhtml.start(1)])
        result.append(img_path)
        # ... and resume at the closing quote
        last_index = match_xhtml.end(1)
    result.append(xhtml[last_index:])
    return u''.join(result)


def extract_css_body(xhtml):
    """
    Return the inner content of the first <style> and first <body> elements
    of 'xhtml' (a unicode string), as a (style, body) pair.

    NOTE(review): the end of this function was destroyed in the copy this
    file was reconstructed from (markup stripped); the closing-tag search
    strings and the return statement are a reconstruction - confirm against
    the repository.
    """
    dom = parseString(xhtml.encode('utf8'))
    style = dom.getElementsByTagName("style")[0].toxml()
    body = dom.getElementsByTagName("body")[0].toxml()
    # cleanup initial/final tags
    style_clean = style[style.find('>')+1:style.rfind('</')]
    body_clean = body[body.find('>')+1:body.rfind('</')]
    return style_clean,body_clean

# NOTE(review): this definition was lost in the damaged region as well; it is
# required by the @synchronized(convert_lock) decorators below and mirrors
# get_connection_lock - confirm against the repository.
convert_lock = threading.RLock()

def combine_css_body(body, css):
    """
    Wrap 'css' and 'body' into a complete HTML document.

    NOTE(review): only the tail of this function ('%(css,body)' and the
    position of one '%s') survived in the copy this file was reconstructed
    from. The function name, the parameter order and the exact template below
    are a best-effort reconstruction - confirm against the repository before
    relying on them.
    """
    return u"""
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<style type="text/css"><!-- %s --></style>
</head>
<body>
%s
</body>
</html>
""" %(css,body)

def to_string(input):
    """
    Return 'input' as a byte string: unicode objects (including subclasses)
    are encoded to utf8, anything else is returned unchanged.
    """
    if isinstance(input, unicode):
        input = input.encode('utf8')
    return input

def _create_workspace(infile_basename, data):
    """
    Create the temp directory tree of a conversion and write the input file.

    Returns (temp_dir, infile_name, outdir_name, outfile_name). If anything
    fails the partially built tree is removed before the error propagates.
    """
    temp_dir = mkdtemp(prefix="cm_")
    try:
        # in
        indir_name = os.path.join(temp_dir, THE_INDIR)
        os.mkdir(indir_name)
        infile_name = os.path.join(indir_name, infile_basename)

        # out
        outdir_name = os.path.join(temp_dir, THE_OUTDIR)
        os.mkdir(outdir_name)
        outfile_name = os.path.join(outdir_name, THE_OUTFILE)

        # write infile (bytes: to_string() encodes unicode to utf8)
        infile = open(infile_name, 'wb')
        try:
            infile.write(to_string(data))
        finally:
            infile.close()

        # fix perms
        # TODO: group permission should suffice
        os.chmod(temp_dir, _MODE_READABLE) # read
        os.chmod(indir_name, _MODE_READABLE) # read
        os.chmod(infile_name, _MODE_READABLE) # read
        os.chmod(outdir_name, _MODE_WORLD_WRITABLE) # read / write
    except:
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
    return temp_dir, infile_name, outdir_name, outfile_name

def _store_document(doc, out_filter, outfile_name):
    """Export 'doc' to 'outfile_name' through the 'out_filter' export filter."""
    properties = (PropertyValue("Hidden", 0, True, 0),
                  PropertyValue("FilterName", 0, out_filter, 0))
    doc.storeToURL(systemPathToFileUrl(outfile_name), properties)

def _read_file(path):
    """Return the whole content of 'path' as bytes."""
    out_f = open(path, 'rb')
    try:
        return out_f.read()
    finally:
        out_f.close()

def _close_document(doc):
    """
    Close a loaded office document so that it does not pile up inside the
    soffice process; best effort (the office may already have been killed).
    """
    if doc is None:
        return
    try:
        doc.close(True)
    except Exception:
        logging.exception('could not close oo document')

@synchronized(convert_lock)
def convert_html(input, format_name, images = None):
    """
    Convert the HTML content 'input' to the format named 'format_name'.

    'input' may be a byte string or a unicode string (encoded to utf8).
    'images' is accepted but not used.
    Returns the converted document as a byte string.
    Raises Exception when 'format_name' is not registered in 'fmts'.
    """
    out_filter = fmts.get_filter_by_name(format_name)
    if not out_filter:
        raise Exception("Unsupported format name %s" %(format_name))
    temp_dir = None
    doc = None
    try:
        desktop = get_desktop()

        start_processing()

        # create in/out files
        temp_dir, infile_name, outdir_name, outfile_name = \
            _create_workspace(THE_INFILE + '.html', input)

        # load the html inside a new, hidden, writer document
        inProps = (PropertyValue("Hidden", 0, True, 0),)
        doc = desktop.loadComponentFromURL("private:factory/swriter", "_blank", 0, inProps)
        cursor = doc.Text.createTextCursor()
        cursor.insertDocumentFromURL(systemPathToFileUrl(infile_name), ())

        _store_document(doc, out_filter, outfile_name)

        return _read_file(outfile_name)
    finally:
        # close while the watchdog is still armed, so a hung office gets killed
        _close_document(doc)
        end_processing()
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)

@synchronized(convert_lock)
def convert(input, format_name, unicode = False):
    """
    Convert the document content 'input' to the format named 'format_name'.

    Returns (output, img_res): 'output' is the converted document (decoded
    from utf8 when 'unicode' is true, a byte string otherwise) and 'img_res'
    the list of paths of the other files written next to it (useful only for
    html). On success the temp directory holding those files is NOT removed:
    only paths are returned to avoid memory overload, and the files are left
    in place. On failure the temp directory is removed.
    Raises Exception when 'format_name' is not registered in 'fmts'.
    """
    logging.info('convert')
    out_filter = fmts.get_filter_by_name(format_name)
    if not out_filter:
        raise Exception("Unsupported format name %s" %(format_name))
    temp_dir = None
    doc = None
    succeeded = False
    try:
        desktop = get_desktop()

        start_processing()

        # create in/out files
        temp_dir, infile_name, outdir_name, outfile_name = \
            _create_workspace(THE_INFILE, input)

        properties = (PropertyValue("Hidden", 0, True, 0),)
        doc = desktop.loadComponentFromURL(systemPathToFileUrl(infile_name), "_blank", 0, properties)

        _store_document(doc, out_filter, outfile_name)

        output = _read_file(outfile_name)
        # load other files (useful only for html)
        img_res = [os.path.join(outdir_name, name)
                   for name in os.listdir(outdir_name)
                   if name != THE_OUTFILE]
        if unicode:
            output = output.decode('utf8')
        succeeded = True
        return output,img_res
    finally:
        # close while the watchdog is still armed, so a hung office gets killed
        _close_document(doc)
        end_processing()
        # Do not remove dir on success: we only return images path to avoid
        # mem overload. On failure nobody will ever read it: clean up.
        if temp_dir is not None and not succeeded:
            shutil.rmtree(temp_dir, ignore_errors=True)