src/cm/converters/oo_converters.py
changeset 0 40c8f766c9b8
child 24 c8a95e540b79
equal deleted inserted replaced
-1:000000000000 0:40c8f766c9b8
       
     1 # warning : oo server autolaunch is tricky
       
     2 # make sure .qt, .kde and .openoffice.org2 are writable in the home directory
       
     3 # for instance, if working user is www-data
       
     4 #   mkdir /var/www/.openoffice.org2 ; chown www-data:www-data /var/www/.openoffice.org2
       
     5 #   mkdir /var/www/.qt ; chown www-data:www-data /var/www/.qt
       
     6 #   mkdir /var/www/.kde ; chown www-data:www-data /var/www/.kde
       
     7 
       
     8 UNO_IMPORT = True
       
     9 
       
    10 if UNO_IMPORT:
       
    11     import uno
       
    12 
       
    13 # old ubuntu bug left for the record
       
    14 #    print "#### Uno import failed ! #### "
       
    15 #    print "#### https://bugs.launchpad.net/ubuntu/+source/openoffice.org2/+bug/139077 #### "
       
    16 #    print "#### launch : sudo ldconfig -v /usr/lib/openoffice/program #### "
       
    17 
       
    18 from cm.utils.thread import synchronized, daemonize
       
    19 if UNO_IMPORT:
       
    20     from com.sun.star.beans import PropertyValue
       
    21 from datetime import datetime
       
    22 from subprocess import Popen,call
       
    23 from tempfile import mkstemp,mkdtemp
       
    24 
       
    25 if UNO_IMPORT:
       
    26     from unohelper import systemPathToFileUrl, absolutize
       
    27 
       
    28 from xml.dom.minidom import parseString
       
    29 import cStringIO
       
    30 import chardet
       
    31 import sys
       
    32 import magic
       
    33 import os,re
       
    34 import random
       
    35 import threading
       
    36 import time
       
    37 import logging
       
    38 
       
    39 CONN_STRING = "uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext" 
       
    40 
       
    41 KILL = 'killall -KILL xvfb-run ; killall -KILL soffice; killall -KILL soffice.bin; killall -KILL Xvfb'
       
    42 RM = 'rm -f /tmp/.X99-lock'
       
    43 LAUNCH = 'xvfb-run soffice -headless "-accept=socket,port=2002;urp;"'
       
    44 
       
    45 # xvfb-run soffice -headless "-accept=socket,port=2002;urp;";
       
    46 #  soffice "-accept=socket,port=2002;urp;";
       
    47 
       
    48 
       
    49 ms = magic.open(magic.MAGIC_NONE)
       
    50 ms.load()
       
    51 
       
    52 def is_text(buffer):
       
    53     type = ms.buffer(buffer)
       
    54     return ' text, ' in type
       
    55 
       
    56 def fix_text_encoding(buffer, to_encoding = 'utf-8'):
       
    57     detected = chardet.detect(buffer)
       
    58     encoding = detected['encoding']
       
    59     if encoding != to_encoding:
       
    60         return buffer.decode(encoding).encode(to_encoding)
       
    61     return buffer
       
    62 # $$$ RBE TODO: fix_content should be called before the oo_convert call when importing a text file with a non utf-8 encoding; TODO: test this (try to make it crash)
       
    63 def fix_content(buffer):
       
    64     """
       
    65     Fix content fixes :
       
    66     - encoding to utf8 to txt files
       
    67     """
       
    68     try:
       
    69         if is_text(buffer):
       
    70             return fix_text_encoding(buffer)
       
    71         return buffer
       
    72     except:
       
    73         return buffer
       
    74 
       
    75 processing = 0
       
    76 
       
    77 # timeout : kill oo
       
    78 PROCESSING_TIMEOUT = 20.0
       
    79 
       
    80 def oo_process_controller(code):
       
    81     """
       
    82     If 'code' process is still active : kill oo 
       
    83     """
       
    84     global processing
       
    85     logging.info('oo_process_controller')
       
    86     if processing == code:
       
    87         logging.error('--> oo_process_controller : killing !')
       
    88         kill_oo()        
       
    89 
       
    90 def kill_oo():
       
    91     logging.info('killing')
       
    92     p = Popen(KILL, shell=True)
       
    93     sts = os.waitpid(p.pid, 0)
       
    94     p = Popen(RM, shell=True)
       
    95     sts = os.waitpid(p.pid, 0)
       
    96 
       
    97 def launch_oo():
       
    98     logging.info('launching')
       
    99     p = Popen(LAUNCH, shell=True)
       
   100 
       
   101 def kill_and_relaunch_oo():
       
   102     kill_oo()
       
   103     launch_oo()
       
   104 
       
   105 get_connection_lock = threading.RLock() 
       
   106 
       
   107 def start_processing():
       
   108     global processing
       
   109     logging.info('start_processing')
       
   110     code = random.random()
       
   111     processing = code
       
   112     t = threading.Timer(PROCESSING_TIMEOUT, oo_process_controller, args = [code,])
       
   113     t.start()
       
   114 
       
   115 def end_processing():
       
   116     logging.info('end_processing')
       
   117     global processing
       
   118     processing = 0 
       
   119     
       
   120 @synchronized(get_connection_lock)
       
   121 def get_connection(retry = 2):
       
   122     while retry > 0:
       
   123         try:
       
   124             localContext = uno.getComponentContext()
       
   125             
       
   126             resolver = localContext.ServiceManager.createInstanceWithContext(
       
   127                             "com.sun.star.bridge.UnoUrlResolver", localContext )
       
   128             
       
   129             ctx = resolver.resolve(CONN_STRING)
       
   130             return ctx
       
   131         except:
       
   132             retry -= 1
       
   133             kill_and_relaunch_oo()
       
   134             time.sleep(8)
       
   135             
       
   136     raise Exception('could not launch oo, please read README.txt section Openoffice for troubleshooting')
       
   137 
       
   138 def get_desktop():
       
   139     ctx = get_connection()
       
   140     smgr = ctx.ServiceManager    
       
   141     # get the central desktop object
       
   142     desktop = smgr.createInstanceWithContext( "com.sun.star.frame.Desktop",ctx)
       
   143     
       
   144     return desktop
       
   145 
       
   146 
       
   147 class FmtList:    
       
   148     def __init__(self):
       
   149         self._list = []
       
   150 
       
   151     def add(self, name, extension, summary, filter, export = False, mimetype = None):
       
   152         dd = {
       
   153               'name' : name,
       
   154               'extension' : extension,
       
   155               'summary' : summary,
       
   156               'filter' : filter,
       
   157               'export' : export,
       
   158               'mimetype' : mimetype,
       
   159               }
       
   160         self._list.append(dd)
       
   161         
       
   162     def get_filter_by_summary(self, value):
       
   163         return self.get_filter_by('summary', value)
       
   164 
       
   165     def get_filter_by_name(self, value):
       
   166         return self.get_filter_by('name', value)
       
   167 
       
   168     def get_filter_by(self, name, value):
       
   169         res = self.get_by(name, value)
       
   170         if res:
       
   171             return res['filter']
       
   172         return None
       
   173 
       
   174     def get_by_name(self, value):
       
   175         return self.get_by('name', value)
       
   176     
       
   177     def get_by(self, name, value):
       
   178         for fmt in self._list:
       
   179             if fmt[name] == value:
       
   180                 return fmt
       
   181         return None
       
   182     
       
   183     def get_export_formats_tuple(self):
       
   184         return [(f['summary'],f['name']) for f in self._list if f['export']]
       
   185 
       
   186     def ids_by_summary(self):
       
   187         return self.ids_by('summary')
       
   188     
       
   189     def ids_by(self, name):
       
   190         return dict([(r[name],r['name']) for r in self._list])
       
   191 
       
   192 fmts = None
       
   193 if UNO_IMPORT:
       
   194     fmts = FmtList()
       
   195     fmts.add('bib', 'bib', 'BibTeX', 'BibTeX_Writer')
       
   196     fmts.add('doc', 'doc', 'Microsoft Word 97/2000/XP', 'MS Word 97', True, 'application/msword')
       
   197     fmts.add('doc6', 'doc', 'Microsoft Word 6.0', 'MS WinWord 6.0')
       
   198     fmts.add('doc95', 'doc', 'Microsoft Word 95', 'MS Word 95')
       
   199     fmts.add('docbook', 'xml', 'DocBook', 'DocBook File')
       
   200     fmts.add('html', 'html', 'HTML Document (OpenOffice.org Writer)', 'HTML (StarWriter)')
       
   201     fmts.add('odt', 'odt', 'Open Document Text', 'writer8', True, 'application/vnd.oasis.opendocument.text')
       
   202     fmts.add('ott', 'ott', 'Open Document Text', 'writer8_template')
       
   203     fmts.add('ooxml', 'xml', 'Microsoft Office Open XML', 'MS Word 2003 XML')
       
   204     fmts.add('pdb', 'pdb', 'AportisDoc (Palm)', 'AportisDoc Palm DB')
       
   205     fmts.add('pdf', 'pdf', 'Portable Document Format', 'writer_pdf_Export', True, 'application/pdf')
       
   206     fmts.add('psw', 'psw', 'Pocket Word', 'PocketWord File')
       
   207     fmts.add('rtf', 'rtf', 'Rich Text Format', 'Rich Text Format', True, 'application/rtf')
       
   208     fmts.add('latex', 'ltx', 'LaTeX 2e', 'LaTeX_Writer')
       
   209     fmts.add('sdw', 'sdw', 'StarWriter 5.0', 'StarWriter 5.0')
       
   210     fmts.add('sdw4', 'sdw', 'StarWriter 4.0', 'StarWriter 4.0')
       
   211     fmts.add('sdw3', 'sdw', 'StarWriter 3.0', 'StarWriter 3.0')
       
   212     fmts.add('stw', 'stw', 'Open Office.org 1.0 Text Document Template', 'writer_StarOffice_XML_Writer_Template')
       
   213     fmts.add('sxw', 'sxw', 'Open Office.org 1.0 Text Document', 'StarOffice XML (Writer)')
       
   214     fmts.add('text', 'txt', 'Text Encoded', 'Text (encoded)', True, 'application/txt')
       
   215     fmts.add('txt', 'txt', 'Plain Text', 'Text')
       
   216     fmts.add('vor', 'vor', 'StarWriter 5.0 Template', 'StarWriter 5.0 Vorlage/Template')
       
   217     fmts.add('vor4', 'vor', 'StarWriter 4.0 Template', 'StarWriter 4.0 Vorlage/Template')
       
   218     fmts.add('vor3', 'vor', 'StarWriter 3.0 Template', 'StarWriter 3.0 Vorlage/Template')
       
   219     fmts.add('xhtml', 'html', 'XHTML Document', 'XHTML Writer File')
       
   220 
       
   221 THE_OUTDIR = "outdir"
       
   222 THE_OUTFILE = "outfile"
       
   223 
       
   224 THE_INDIR = "indir"
       
   225 THE_INFILE = "infile"
       
   226 
       
   227 def fix_img_path(html,xhtml,imgs):
       
   228     """
       
   229     imgs : name --> path
       
   230     """
       
   231     finder_re = 'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"'
       
   232     len_res_html = len(re.findall(finder_re,html,re.IGNORECASE))
       
   233     len_res_xhtml = len(re.findall(finder_re,xhtml,re.IGNORECASE))
       
   234     res_html = re.finditer(finder_re,html,re.IGNORECASE)
       
   235     res_xhtml = re.finditer(finder_re,xhtml,re.IGNORECASE)
       
   236     result = []
       
   237     last_index = 0
       
   238     for match_xhtml in res_xhtml:
       
   239         img_path = '' 
       
   240         try:
       
   241             match_html = res_html.next()
       
   242             if match_html:
       
   243                 img_name = match_html.group(1)
       
   244                 img_path = imgs[img_name]
       
   245         except StopIteration:
       
   246             # TODO : report pb
       
   247             pass 
       
   248         offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1))
       
   249         result.append(xhtml[last_index:match_xhtml.start() + offset - 1])
       
   250         result.append(img_path)
       
   251         last_index = match_xhtml.end() - 1 # -1 because trailing "
       
   252     result.append(xhtml[last_index:len(xhtml)])
       
   253     return u''.join(result)
       
   254 
       
   255   
       
   256 def extract_css_body(xhtml):
       
   257     dom = parseString(xhtml.encode('utf8'))
       
   258     style = dom.getElementsByTagName("style")[0].toxml()
       
   259     body = dom.getElementsByTagName("body")[0].toxml()
       
   260     # cleanup initial/final tags
       
   261     style_clean = style[style.find('>')+1:style.rfind('</')]
       
   262     body_clean = body[body.find('>')+1:body.rfind('</')]
       
   263     return style_clean,body_clean
       
   264     
       
   265 convert_lock = threading.RLock() 
       
   266 
       
   267 def combine_css_body(body, css):
       
   268     return """
       
   269 <html xmlns="http://www.w3.org/1999/xhtml">
       
   270     <head>
       
   271         <meta http-equiv="content-type" content="text/html; charset=utf-8" />
       
   272         <style type="text/css">
       
   273             %s
       
   274         </style>    
       
   275     </head>
       
   276     <body>
       
   277         %s
       
   278     </body>
       
   279 </html>
       
   280 """ %(css,body)
       
   281 
       
   282 def to_string(input):
       
   283     if type(input) == unicode:
       
   284         input = input.encode('utf8')
       
   285     return input
       
   286     
       
   287 @synchronized(convert_lock)    
       
   288 def convert_html(input, format_name, images = None):
       
   289     out_filter = fmts.get_filter_by_name(format_name)    
       
   290     if not out_filter:
       
   291         raise Exception("Unsupported format name %s" %(format_name)) 
       
   292     infile = None
       
   293     outfile = None
       
   294     out_f = None
       
   295     try:
       
   296         desktop = get_desktop()
       
   297         
       
   298         start_processing()
       
   299                         
       
   300         # create in/out files
       
   301         temp_dir = mkdtemp(prefix="cm_")
       
   302         
       
   303         # in
       
   304         indir_name = os.path.join(temp_dir, THE_INDIR)
       
   305         os.mkdir(indir_name)
       
   306         infile_name = os.path.join(indir_name, THE_INFILE  + '.html')
       
   307         
       
   308         # out
       
   309         outdir_name = os.path.join(temp_dir, THE_OUTDIR)
       
   310         os.mkdir(outdir_name)
       
   311         outfile_name = os.path.join(outdir_name, THE_OUTFILE)
       
   312 
       
   313         # write infile 
       
   314         infile = open(infile_name,'w')
       
   315         input = to_string(input)
       
   316         infile.write(input)
       
   317         infile.close()
       
   318 
       
   319         # fix perms
       
   320         # TODO: group permission should suffice
       
   321         os.chmod(temp_dir, 0755) # read    
       
   322         os.chmod(indir_name, 0755) # read
       
   323         os.chmod(infile_name, 0755) # read
       
   324         os.chmod(outdir_name, 0777) # read / write
       
   325 
       
   326         inProps = PropertyValue( "Hidden" , 0 , True, 0 ),        
       
   327         doc = desktop.loadComponentFromURL( "private:factory/swriter", "_blank", 0, inProps )
       
   328         text   = doc.Text
       
   329         cursor = text.createTextCursor()
       
   330 
       
   331         fileUrl = systemPathToFileUrl(infile_name)
       
   332         cursor.insertDocumentFromURL(fileUrl, ())
       
   333         
       
   334         properties= (PropertyValue("Hidden", 0, True, 0), PropertyValue("FilterName", 0, out_filter, 0))        
       
   335         doc.storeToURL('file://%s' %outfile_name,tuple(properties))
       
   336         
       
   337         out_f = open(outfile_name,'r')
       
   338 
       
   339         output = out_f.read()
       
   340         return output
       
   341     finally:
       
   342         end_processing()
       
   343         try:
       
   344             if out_f:
       
   345                 out_f.close()
       
   346             if infile:
       
   347                 infile.close()
       
   348             top = temp_dir
       
   349             for root, dirs, files in os.walk(top, topdown=False):
       
   350                 for name in files:
       
   351                     os.remove(os.path.join(root, name))
       
   352                 for name in dirs:
       
   353                     os.rmdir(os.path.join(root, name))
       
   354             os.rmdir(top)            
       
   355         except:
       
   356             # TODO : warn
       
   357             pass
       
   358 
       
   359 @synchronized(convert_lock)    
       
   360 def convert(input, format_name, unicode = False):
       
   361     
       
   362     logging.info('convert')
       
   363     out_filter = fmts.get_filter_by_name(format_name)    
       
   364     if not out_filter:
       
   365         raise Exception("Unsupported format name %s" %(format_name)) 
       
   366     infile = None
       
   367     outfile = None
       
   368     out_f = None
       
   369     try:
       
   370         desktop = get_desktop() 
       
   371 	    
       
   372         start_processing()
       
   373     
       
   374         # create in/out files
       
   375         temp_dir = mkdtemp(prefix="cm_")
       
   376         
       
   377         # in
       
   378         indir_name = os.path.join(temp_dir, THE_INDIR)
       
   379         os.mkdir(indir_name)
       
   380         infile_name = os.path.join(indir_name, THE_INFILE)
       
   381         
       
   382         # out
       
   383         outdir_name = os.path.join(temp_dir, THE_OUTDIR)
       
   384         os.mkdir(outdir_name)
       
   385         outfile_name = os.path.join(outdir_name, THE_OUTFILE)
       
   386 
       
   387         # write infile 
       
   388         infile = open(infile_name,'w')
       
   389         input = to_string(input)
       
   390         infile.write(input)
       
   391         infile.close()
       
   392 
       
   393         # fix perms
       
   394         # TODO group permission should suffice
       
   395         os.chmod(temp_dir, 0755) # read        
       
   396         os.chmod(indir_name, 0755) # read        
       
   397         os.chmod(infile_name, 0755) # read
       
   398         os.chmod(outdir_name, 0777) # read / write
       
   399                 
       
   400         properties = PropertyValue("Hidden", 0, True, 0),
       
   401                        
       
   402         #import pdb;pdb.set_trace()   
       
   403         doc=desktop.loadComponentFromURL("file://%s" % infile_name, "_blank", 0, properties)
       
   404         
       
   405         properties= (PropertyValue("Hidden", 0, True, 0), PropertyValue("FilterName", 0, out_filter, 0))        
       
   406         doc.storeToURL('file://%s' %outfile_name,tuple(properties))
       
   407         
       
   408         out_f = open(outfile_name,'r')
       
   409 
       
   410         output = out_f.read()
       
   411         # load other files (useful only for html)
       
   412         image_names = [name for name in os.listdir(outdir_name) if name != THE_OUTFILE]
       
   413         img_res = [] 
       
   414         for image_name in image_names:
       
   415               img_res.append(os.path.join(outdir_name, image_name))
       
   416         if unicode:
       
   417             output = output.decode('utf8')
       
   418         return output,img_res
       
   419     finally:
       
   420         end_processing()
       
   421         try:
       
   422             if out_f:
       
   423                 out_f.close()
       
   424             if infile:
       
   425                 infile.close()
       
   426 # Do not remove dir: we only return images path to avoid 
       
   427 # mem overload             
       
   428 #            top = temp_dir
       
   429 #            for root, dirs, files in os.walk(top, topdown=False):
       
   430 #                for name in files:
       
   431 #                    os.remove(os.path.join(root, name))
       
   432 #                for name in dirs:
       
   433 #                    os.rmdir(os.path.join(root, name))
       
   434 #            os.rmdir(top)            
       
   435         except:
       
   436             # TODO : warn
       
   437             pass