Add abiword as an alternative to open office for conversions
authorgibus
Thu, 21 Jul 2011 09:42:33 +0200
changeset 360 bfaab8740995
parent 359 0bab4ef95bfe
child 361 5f2a1237050a
Add abiword as an alternative to open office for conversions
README.txt
src/cm/cm_settings.py
src/cm/converters/__init__.py
src/cm/converters/abi_converters.py
src/cm/converters/abi_error.py
src/cm/main.py
src/cm/utils/comment_positioning.py
src/cm/views/export.py
--- a/README.txt	Thu Jun 16 09:27:56 2011 +0200
+++ b/README.txt	Thu Jul 21 09:42:33 2011 +0200
@@ -23,7 +23,8 @@
 -------------
 - Postgresql 8.3 or Mysql 5+ or sqlite
 - Python 2.5+
-- Openoffice 3.0+ (headless) & Pandoc
+- Abiword or Openoffice 3.0+ (headless)
+- Pandoc
 
 
 Requirements
@@ -32,9 +33,8 @@
 - python magic
 - python development headers
 - python setuptools
-- python uno
 - pandoc
-- headless openoffice
+- abiword (or headless openoffice and python uno)
 - git
 - libyaml
 (all other python dependencies will be downloaded by buildout)
@@ -45,7 +45,9 @@
 	(ubuntu users : 'sudo apt-get install python python-magic python-setuptools python-uno libyaml-0-1 python-yaml python-dev git-core python-utidylib')
 2. Install pandoc
 	(ubuntu users : 'sudo apt-get install pandoc')
-3. Install openoffice (headless mode) [used for document conversion]
+3. Install abiword
+  (ubuntu users: 'sudo apt-get install abiword')
+   Alternatively, install openoffice (headless mode) [used for document conversion]
 	(ubuntu users : 'sudo apt-get install sun-java6-jre openoffice.org openoffice.org-headless xvfb')
 4. Install/configure database [skip this step if you plan to use a sqlite database]
 	4 a) Postgresql
@@ -105,11 +107,11 @@
    - `./bin/django migrate cm 0001_initial --fake`
    - `./bin/django migrate`
 
-Openoffice
-==========
-Comt uses openoffice to convert documents from ODT, MS Word, etc. to html.
-On a development setup, you should make sure no openoffice process is left and launch
-`soffice -headless "-accept=socket,port=2002;urp;"` to start openoffice in background mode.
+Abiword or Openoffice
+=====================
+Comt uses either abiword or openoffice to convert documents from ODT, MS Word, etc. to html.
+Abiword is a lighter and more performant solution. You have to add the configuration parameter `USE_ABI = True` in your settings_local.py to use Abiword. Otherwise openoffice is used.
+To use openoffice, on a development setup, you should make sure no openoffice process is left and launch `soffice -headless "-accept=socket,port=2002;urp;"` to start openoffice in background mode.
 
 Comt uses
 ============
@@ -141,8 +143,24 @@
 
 FAQ
 ====
-Q: I get 'import error' when starting the server (step #9)
-R: Make sure you installed all required python dependencies
+Q1: How can I check the distribution for errors (libraries, etc.)?
+R1: After configuring a database and access in your settings_local.py, you can launch the unit test suite with the following command: `./bin/django test cm`
+
+Q2: I'm getting the following error when launching the migrate command:
+`
+line 62, in handle
+    __import__(app_name + '.management', {}, {}, [''])
+  File "/usr/lib/python2.5/site-packages/uno.py", line 300, in _uno_import
+    raise ImportError( "type "+ name + "." +x + " is unknown" )
+ImportError: type django.contrib.sessions.management. is unknown
+`
+R2: This is due to a bug in uno (python openoffice bridge) that monkey patches the import system and messes with django's dynamic module loading system. A workaround to launch the migrate command is to set: `UNO_IMPORT = False` in file src/cm/converters/oo_converters.py and then to launch the migrate command. Set the value back to True and relaunch the server to use openoffice as a conversion backend.
+
+Q3: When using the co-ment Drupal module, I want the commentators' names to be the same as their Drupal usernames
+R3: For this feature (commentator name = drupal login name) to be available, a configuration parameter should be set in settings_local.py: `DECORATED_CREATORS = True`
+
+Q4: I get 'import error' when starting the server (step #9)
+R4: Make sure you installed all required python dependencies
                       
 Community
 =========
--- a/src/cm/cm_settings.py	Thu Jun 16 09:27:56 2011 +0200
+++ b/src/cm/cm_settings.py	Thu Jul 21 09:42:33 2011 +0200
@@ -31,4 +31,7 @@
 STORE_ACTIVITY_IP = get_setting('STORE_ACTIVITY_IP', True)
 
 # Show 'decorated' users in comments (not structural creator id) 
-DECORATED_CREATORS = get_setting('DECORATED_CREATORS', False)
\ No newline at end of file
+DECORATED_CREATORS = get_setting('DECORATED_CREATORS', False)
+
+# Use AbiWord for conversions
+USE_ABI = get_setting('USE_ABI', False)
--- a/src/cm/converters/__init__.py	Thu Jun 16 09:27:56 2011 +0200
+++ b/src/cm/converters/__init__.py	Thu Jul 21 09:42:33 2011 +0200
@@ -3,7 +3,7 @@
 from cm.utils.string_utils import to_unicode 
 import re
 import os
-from cm.converters.oo_converters import extract_css_body
+from oo_converters import extract_css_body
 
 
 # TODO: move that in text_base: save images
@@ -18,16 +18,26 @@
     attachs = []
     attachs_dir = None
     ##############################
+    # OO/MS-Word
     if mime_type in ['application/vnd.oasis.opendocument.text',
                      'application/msword',
+                     'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                      ]:
         
-        html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
-        if format == 'html':
+        from cm.cm_settings import USE_ABI
+        if USE_ABI:
+          from abi_converters import AbiFileConverter
+          converter = AbiFileConverter()
+          html_input, attachs = converter.convert_to_html(input)
+          html_input = re.sub(r' awml:style="[^"]*"', '', html_input)
+          converted_input = pandoc_convert(html_input, 'html', format)
+        else:
+          html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
+          if format == 'html':
                 _not_used_css, converted_input = extract_css_body(xhtml_input)
                 #converted_input = xhtml_input
         
-        converted_input = pandoc_convert(html_input, 'html', format)
+          converted_input = pandoc_convert(html_input, 'html', format)
         
     ##############################
     # latex
@@ -136,4 +146,4 @@
     CODE_INDICATOR = "    " # 4 spaces
     return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
 
-        
\ No newline at end of file
+        
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cm/converters/abi_converters.py	Thu Jul 21 09:42:33 2011 +0200
@@ -0,0 +1,283 @@
+import os
+import tempfile
+import re
+
+import pexpect
+
+from abi_error import AbiConverterError, AbiCommandError
+
+# Identity map of input type names; presumably what abiword can read (unused in this module)
+TYPES_IN  = {'602': '602',       'abw': 'abw',       'aw': 'aw',
+             'awt': 'awt',       'cwk': 'cwk',       'dbk': 'dbk',
+             'doc': 'doc',       'docm': 'docm',     'docx': 'docx',
+             'dot': 'dot',       'dotm': 'dotm',     'dotx': 'dotx',
+             'fo': 'fo',         'htm': 'htm',       'html': 'html',
+             'hwp': 'hwp',       'isc': 'isc',       'iscii': 'iscii',
+             'kwd': 'kwd',       'mif': 'mif',       'odt': 'odt',
+             'opml': 'opml',     'ott': 'ott',       'pdb': 'pdb',
+             'pdf': 'pdf',       'rtf': 'rtf',       'sdw': 'sdw',
+             'stw': 'stw',       'sxw': 'sxw',       'text': 'text',
+             'txt': 'txt',       'wml': 'wml',       'wp': 'wp',
+             'wpd': 'wpd',       'wri': 'wri',       'xhtml': 'xhtml',
+             'xml': 'xml',       'zabw': 'zabw'}
+# Output type names accepted by convert_file(); any other value falls back to 'txt'
+TYPES_OUT = {'abw': 'abw',       'aw': 'aw',         'awt': 'awt',
+             'dbk': 'dbk',       'doc': 'doc',       'eml': 'eml',
+             'fo': 'fo',         'html': 'html',     'isc': 'isc',
+             'iscii': 'iscii',   'kwd': 'kwd',       'latex': 'latex',
+             'mht': 'mht',       'mif': 'mif',       'nroff': 'nroff',
+             'nws': 'nws',       'odt': 'odt',       'pdb': 'pdb',
+             'pdf': 'pdf',       'ps': 'ps',         'rtf': 'rtf',
+             'sxw': 'sxw',       'text': 'text',     'txt': 'txt',
+             'wml': 'wml',       'xml': 'xml',       'xml2ps': 'xml2ps',
+             'zabw': 'zabw'}
+
+class AbiFileConverter(object):
+    """This lets you convert between all file types supported by the
+    AbiWord program. Import type isn't checked, as AbiWord doesn't check
+    on extension, but on metadata.
+    """
+
+    def __init__(self, timeout=60):
+        self.id = None  # not referenced anywhere else in this module
+        self.timeout = timeout  # presumably seconds (pexpect convention) - TODO confirm
+        self._start_abiword()  # spawns the abiword child process immediately
+
+    def _start_abiword(self):
+        """
+        Spawn the first abiword found on the PATH with the AbiCommand plugin.
+        Raises AbiConverterError if it cannot be found or started.
+        """
+        abicommand = None
+        for path_dir in os.environ.get('PATH', os.defpath).split(os.pathsep):
+            candidate = os.path.join(path_dir, 'abiword')
+            if os.path.isfile(candidate):
+                abicommand = candidate
+                break
+        if not abicommand:
+            raise AbiConverterError('Can not find abiword executable')
+        # spawn abiword and wait up to 10 seconds for the plugin banner
+        try:
+            self.child = pexpect.spawn(abicommand + ' --plugin AbiCommand')
+            self.child.expect(
+                    'AbiWord command line plugin: Type "quit" to exit', 10)
+        except Exception:
+            raise AbiConverterError('Can not open abiword executable')
+
+    def stop_abiword(self):
+        """
+        Ask abiword to quit; send signal 9 to the child if it still answers
+        """
+        self.child.sendline('quit')
+        if self._is_running():
+            os.kill(self.child.pid, 9)
+
+    def _is_running(self):
+        """
+        Return True if abiword answers a 'writepid' probe within 1 second
+        """
+        try:
+            self.child.sendline('writepid /dev/null')
+            self.child.expect('OK', 1)
+            return True
+        except Exception:
+            return False
+
+    def convert_file(self, in_file, out_file=None, type=None):
+        """
+        Convert a file. If out_file is not specified, the converted content
+        is returned as a byte string (otherwise None is returned). If type is
+        not specified, the extension of out_file is used; a missing or unknown
+        type falls back to 'txt'. AbiCommandError is raised on failure.
+        """
+        return_bytes = out_file is None
+        if return_bytes:
+            # mkstemp creates the file itself, unlike the race-prone mktemp
+            fd, out_file = tempfile.mkstemp(prefix="abiconvert_")
+            os.close(fd)
+        try:
+            # resolve the output type ('type' is part of the public signature)
+            type = TYPES_OUT.get(
+                type or os.path.splitext(out_file)[1][1:], 'txt')
+            self._perform_conversion(in_file, out_file, type)
+            if return_bytes:
+                fp = open(out_file, 'rb')
+                try:
+                    return fp.read()
+                finally:
+                    fp.close()
+        finally:
+            # the temporary file must not outlive the call, even on error
+            if return_bytes and os.path.exists(out_file):
+                os.remove(out_file)
+
+    def _perform_conversion(self, in_file, out_file, type):
+        """
+        Send a 'convert' command to abiword and wait for its 'OK' answer.
+        Raises AbiCommandError on timeout or if abiword exits.
+        """
+        # _is_running must be called: the bare bound method is always true,
+        # which meant a dead abiword was never restarted
+        if not self._is_running():
+            self._start_abiword()
+        # NOTE(review): paths are not quoted; spaces in them may break this
+        cmd = 'convert %s %s %s' % (os.path.abspath(in_file),
+                                    os.path.abspath(out_file), type)
+        self.child.sendline(cmd)
+        # a timeout and a dead child (EOF) both count as a failed conversion
+        i = self.child.expect(['OK', pexpect.TIMEOUT, pexpect.EOF], timeout=self.timeout)
+        if i != 0:
+            raise AbiCommandError('Error performing AbiCommand: %s' %cmd)
+
+    def convert_to_html(self, input):
+        """
+        Convert a document (its content, as str or unicode) to HTML with abiword.
+        Return (html, paths); paths lists the image files abiword wrote, if any.
+        """
+        from tempfile import mkstemp,mkdtemp
+
+        THE_OUTDIR = "outdir"
+        THE_OUTFILE = "outfile"
+        THE_INDIR = "indir"
+        THE_INFILE = "infile"
+
+        infile = None
+        outfile = None  # not used below
+        out_f = None
+        try:
+          # create a work directory holding indir/infile and outdir/outfile
+          temp_dir = mkdtemp(prefix="cm_")
+
+          # in
+          indir_name = os.path.join(temp_dir, THE_INDIR)
+          os.mkdir(indir_name)
+          infile_name = os.path.join(indir_name, THE_INFILE)
+
+          # out
+          outdir_name = os.path.join(temp_dir, THE_OUTDIR)
+          os.mkdir(outdir_name)
+          outfile_name = os.path.join(outdir_name, THE_OUTFILE)
+
+          # write infile (unicode input is stored utf8-encoded)
+          infile = open(infile_name,'w')
+          if type(input) == unicode:
+            input = input.encode('utf8')
+          infile.write(input)
+          infile.close()
+
+          # fix perms
+          # TODO group permission should suffice
+          os.chmod(temp_dir, 0755) # read
+          os.chmod(indir_name, 0755) # read
+          os.chmod(infile_name, 0755) # read
+          os.chmod(outdir_name, 0777) # read / write
+
+          # Do the job
+          self.convert_file(infile_name, outfile_name, 'html')
+
+          out_f = open(outfile_name,'r')
+          output = out_f.read()
+
+          # collect the files of the 'outfile_files' directory, if it exists
+          img_res = []
+          if os.path.isdir(outdir_name + '/' + THE_OUTFILE + '_files'):
+            image_names = [name for name in os.listdir(outdir_name + '/' + THE_OUTFILE + '_files') if name != THE_OUTFILE]
+            for image_name in image_names:
+              img_res.append(os.path.join(outdir_name + '/' + THE_OUTFILE + '_files', image_name))
+
+            # strip the 'outfile_files/' prefix and the mm width style from <img> tags
+            output = re.sub(r'<img(.+src=")outfile_files/([^"]+")', r'<img\1\2', output);
+            output = re.sub(r'<img(.+)style="width:[\d\.]+mm"', r'<img\1', output);
+          return output,img_res
+        # NOTE(review): temp_dir is not removed; the returned paths point into it
+        finally:
+          try:
+            if out_f:
+                out_f.close()
+            if infile:
+                infile.close()
+          except:
+            pass
+
+    def convert_from_html(self, input, format):
+        """
+        Convert HTML content (str or unicode) to the given format with abiword
+        and return the content of the converted file.
+        """
+        from tempfile import mkstemp,mkdtemp
+
+        THE_OUTDIR = "outdir"
+        THE_OUTFILE = "outfile"
+        THE_INDIR = "indir"
+        THE_INFILE = "infile"
+
+        infile = None
+        outfile = None  # not used below
+        out_f = None
+        try:
+          # create a work directory holding indir/infile.html and outdir/outfile
+          temp_dir = mkdtemp(prefix="cm_")
+
+          # in
+          indir_name = os.path.join(temp_dir, THE_INDIR)
+          os.mkdir(indir_name)
+          infile_name = os.path.join(indir_name, THE_INFILE + '.html')
+
+          # out
+          outdir_name = os.path.join(temp_dir, THE_OUTDIR)
+          os.mkdir(outdir_name)
+          outfile_name = os.path.join(outdir_name, THE_OUTFILE)
+
+          # write infile (unicode input is stored utf8-encoded)
+          infile = open(infile_name,'w')
+          if type(input) == unicode:
+            input = input.encode('utf8')
+          infile.write(input)
+          infile.close()
+
+          # fix perms
+          # TODO group permission should suffice
+          os.chmod(temp_dir, 0755) # read
+          os.chmod(indir_name, 0755) # read
+          os.chmod(infile_name, 0755) # read
+          os.chmod(outdir_name, 0777) # read / write
+
+          # Do the job (format is checked against TYPES_OUT by convert_file)
+          self.convert_file(infile_name, outfile_name, format)
+
+          out_f = open(outfile_name,'r')
+          output = out_f.read()
+          return output
+
+        finally:
+          try:
+            if out_f:
+                out_f.close()
+            if infile:
+                infile.close()
+            # NOTE(review): temp_dir is never removed; this cleanup is disabled:
+            #for root, dirs, files in os.walk(temp_dir, topdown=False):
+            #    for name in files:
+            #        os.remove(os.path.join(root, name))
+            #    for name in dirs:
+            #        os.rmdir(os.path.join(root, name))
+            #os.rmdir(temp_dir)
+          except:
+            pass
+
+    def add_html_header(self, body):
+        """
+        Wrap an HTML body in a minimal XHTML document declared as utf-8
+        """
+        template = """
+<html xmlns="http://www.w3.org/1999/xhtml">
+    <head>
+        <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+    </head>
+    <body>
+        %s
+    </body>
+</html>
+"""
+        return template % body
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cm/converters/abi_error.py	Thu Jul 21 09:42:33 2011 +0200
@@ -0,0 +1,9 @@
+
+class AbiConverterError(Exception):
+    pass
+
+class AbiCommandError(Exception):
+    pass
+
+class ToolsConverterError(Exception):
+    pass
--- a/src/cm/main.py	Thu Jun 16 09:27:56 2011 +0200
+++ b/src/cm/main.py	Thu Jul 21 09:42:33 2011 +0200
@@ -12,4 +12,4 @@
     # add ch to logger
     logger.addHandler(ch)
 
-logger_config()
\ No newline at end of file
+logger_config()
--- a/src/cm/utils/comment_positioning.py	Thu Jun 16 09:27:56 2011 +0200
+++ b/src/cm/utils/comment_positioning.py	Thu Jul 21 09:42:33 2011 +0200
@@ -127,7 +127,7 @@
     
     if with_markers:
         end_ids.reverse()
-        ret = "%s%s%s"%(''.join(["[%s>"%start_id for start_id in start_ids]), ret, ''.join(["<%s]"%end_id for end_id in end_ids]))
+        ret = "%s%s%s"%(''.join(["[%s&gt;"%start_id for start_id in start_ids]), ret, ''.join(["&lt;%s]"%end_id for end_id in end_ids]))
      
     if with_colors and color != 0 :
         ret = "<span style='background-color:%s;'>%s</span>"%(BCKCOLORS[color], ret)
@@ -243,4 +243,4 @@
 #        top_comment_cpt = top_comment_cpt + 1
 #    
 #    ret = "%s%s%s"%("""<div class="pagebreakhere">""", html_comments, """</div>""")
-#    return ret
\ No newline at end of file
+#    return ret
--- a/src/cm/views/export.py	Thu Jun 16 09:27:56 2011 +0200
+++ b/src/cm/views/export.py	Thu Jul 21 09:42:33 2011 +0200
@@ -8,6 +8,7 @@
 from cm.models import Text, TextVersion, Attachment, Comment
 import mimetypes
 import simplejson
+from cm.cm_settings import USE_ABI
 EXPORT2_INFOS = {
 # key -> { mimetype, extension}
 's5' :   {},
@@ -34,10 +35,20 @@
         else :
             fix_content = content
             if content_format == 'html':
-                from cm.converters.oo_converters import combine_css_body                
-                fix_content = combine_css_body(content, '')
-            from cm.converters.oo_converters import convert_html as oo_convert                
-            export_content = oo_convert(fix_content, format)
+                if USE_ABI:
+                  from cm.converters.abi_converters import AbiFileConverter
+                  converter = AbiFileConverter()
+                  fix_content = converter.add_html_header(content)
+                else:
+                  from cm.converters.oo_converters import combine_css_body                
+                  fix_content = combine_css_body(content, '')
+            if USE_ABI:
+              from cm.converters.abi_converters import AbiFileConverter
+              converter = AbiFileConverter()
+              export_content = converter.convert_from_html(fix_content, format)
+            else:
+              from cm.converters.oo_converters import convert_html as oo_convert                
+              export_content = oo_convert(fix_content, format)
     
     export_infos = EXPORT2_INFOS[format]
      
@@ -189,4 +200,4 @@
     return content_export(request, text_version.content, text_version.title, text_version.format, format)
 
 def text_feed(request, key):
-    return ""
\ No newline at end of file
+    return ""