# HG changeset patch # User gibus # Date 1311234153 -7200 # Node ID bfaab87409955b76b2116993da412b5d0e8fc98b # Parent 0bab4ef95bfee150a2b4f0ab650f2d7814b8537c Add abiword as an alternative to open office for conversions diff -r 0bab4ef95bfe -r bfaab8740995 README.txt --- a/README.txt Thu Jun 16 09:27:56 2011 +0200 +++ b/README.txt Thu Jul 21 09:42:33 2011 +0200 @@ -23,7 +23,8 @@ ------------- - Postgresql 8.3 or Mysql 5+ or sqlite - Python 2.5+ -- Openoffice 3.0+ (headless) & Pandoc +- Abiword or Openoffice 3.0+ (headless) +- Pandoc Requirements @@ -32,9 +33,8 @@ - python magic - python development headers - python setuptools -- python uno - pandoc -- headless openoffice +- abiword (or headless openoffice and python uno) - git - libyaml (all other python dependencies will be downloaded by buildout) @@ -45,7 +45,9 @@ (ubuntu users : 'sudo apt-get install python python-magic python-setuptools python-uno libyaml-0-1 python-yaml python-dev git-core python-utidylib') 2. Install pandoc (ubuntu users : 'sudo apt-get install pandoc') -3. Install openoffice (headless mode) [used for document conversion] +3. Install abiword + (ubuntu users: 'sudo apt-get install abiword') + Alternatively, install openoffice (headless mode) [used for document conversion] (ubuntu users : 'sudo apt-get install sun-java6-jre openoffice.org openoffice.org-headless xvfb') 4. Install/configure database [skip this step if you plan to use a sqlite database] 4 a) Postgresql @@ -105,11 +107,11 @@ - `./bin/django migrate cm 0001_initial --fake` - `./bin/django migrate` -Openoffice -========== -Comt uses openoffice to convert documents from ODT, MS Word, etc. to html. -On a development setup, you should make sure no openoffice process is left and launch -`soffice -headless "-accept=socket,port=2002;urp;"` to start openoffice in background mode. +Abiword or Openoffice +===================== +Comt uses either abiword or openoffice to convert documents from ODT, MS Word, etc. to html. +Abiword is a lighter and more performant solution. You have to add the configuration parameter `USE_ABI = True` in your settings_local.py to use Abiword. Otherwise openoffice is used. +To use openoffice, on a development setup, you should make sure no openoffice process is left and launch `soffice -headless "-accept=socket,port=2002;urp;"` to start openoffice in background mode. Comt uses ============ @@ -141,8 +143,24 @@ FAQ ==== -Q: I get 'import error' when starting the server (step #9) -R: Make sure you installed all required python dependencies +Q1: How can I check the distribution for errors (libraries etc.): +R1: After configuring a database and access in your settings_local.py, you can launch the unit test suite with the following command: `./bin/django test cm` + +Q2: I'm getting the following error when launching the migrate command: +` +line 62, in handle + __import__(app_name + '.management', {}, {}, ['']) + File "/usr/lib/python2.5/site-packages/uno.py", line 300, in _uno_import + raise ImportError( "type "+ name + "." +x + " is unknown" ) +ImportError: type django.contrib.sessions.management. is unknown +` +R2: This is due to a bug in uno (python openoffice bridge) that monkey patches the import system and messes with django's dynamic module loading system. A workaround to launch the migrate command is to set: `UNO_IMPORT = False` in file src/cm/converters/oo_converters.py and then to launch the migrate command. Set the value back to True and relaunch the server to use openoffice as a conversion backend. + +Q3: When using co-ment Drupal module, I want that the name of commentators to be the same as the Drupal username +R3: For this feature (commentator name = drupal login name) to be available, a configuration parameter should be set in settings_local.py: `DECORATED_CREATORS = True` + +Q4: I get 'import error' when starting the server (step #9) +R4: Make sure you installed all required python dependencies Community ========= diff -r 0bab4ef95bfe -r bfaab8740995 src/cm/cm_settings.py --- a/src/cm/cm_settings.py Thu Jun 16 09:27:56 2011 +0200 +++ b/src/cm/cm_settings.py Thu Jul 21 09:42:33 2011 +0200 @@ -31,4 +31,7 @@ STORE_ACTIVITY_IP = get_setting('STORE_ACTIVITY_IP', True) # Show 'decorated' users in comments (not structural creator id) -DECORATED_CREATORS = get_setting('DECORATED_CREATORS', False) \ No newline at end of file +DECORATED_CREATORS = get_setting('DECORATED_CREATORS', False) + +# Use AbiWord for conversions +USE_ABI = get_setting('USE_ABI', False) diff -r 0bab4ef95bfe -r bfaab8740995 src/cm/converters/__init__.py --- a/src/cm/converters/__init__.py Thu Jun 16 09:27:56 2011 +0200 +++ b/src/cm/converters/__init__.py Thu Jul 21 09:42:33 2011 +0200 @@ -3,7 +3,7 @@ from cm.utils.string_utils import to_unicode import re import os -from cm.converters.oo_converters import extract_css_body +from oo_converters import extract_css_body # TODO: move that in text_base: save images @@ -18,16 +18,26 @@ attachs = [] attachs_dir = None ############################## + # OO/MS-Word if mime_type in ['application/vnd.oasis.opendocument.text', 'application/msword', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ]: - html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input) - if format == 'html': + from cm.cm_settings import USE_ABI + if USE_ABI: + from abi_converters import AbiFileConverter + converter = AbiFileConverter() + html_input, attachs = converter.convert_to_html(input) + html_input = re.sub(r' awml:style="[^"]*"', '', html_input) + converted_input = pandoc_convert(html_input, 'html', format) + else: + html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input) + if format == 'html': _not_used_css, converted_input = extract_css_body(xhtml_input) #converted_input = xhtml_input - converted_input = pandoc_convert(html_input, 'html', format) + converted_input = pandoc_convert(html_input, 'html', format) ############################## # latex @@ -136,4 +146,4 @@ CODE_INDICATOR = " " # 4 spaces return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')]) - \ No newline at end of file + diff -r 0bab4ef95bfe -r bfaab8740995 src/cm/converters/abi_converters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/cm/converters/abi_converters.py Thu Jul 21 09:42:33 2011 +0200 @@ -0,0 +1,283 @@ +import os +import tempfile +import re + +import pexpect + +from abi_error import AbiConverterError, AbiCommandError + + +TYPES_IN = {'602': '602', 'abw': 'abw', 'aw': 'aw', + 'awt': 'awt', 'cwk': 'cwk', 'dbk': 'dbk', + 'doc': 'doc', 'docm': 'docm', 'docx': 'docx', + 'dot': 'dot', 'dotm': 'dotm', 'dotx': 'dotx', + 'fo': 'fo', 'htm': 'htm', 'html': 'html', + 'hwp': 'hwp', 'isc': 'isc', 'iscii': 'iscii', + 'kwd': 'kwd', 'mif': 'mif', 'odt': 'odt', + 'opml': 'opml', 'ott': 'ott', 'pdb': 'pdb', + 'pdf': 'pdf', 'rtf': 'rtf', 'sdw': 'sdw', + 'stw': 'stw', 'sxw': 'sxw', 'text': 'text', + 'txt': 'txt', 'wml': 'wml', 'wp': 'wp', + 'wpd': 'wpd', 'wri': 'wri', 'xhtml': 'xhtml', + 'xml': 'xml', 'zabw': 'zabw'} + +TYPES_OUT = {'abw': 'abw', 'aw': 'aw', 'awt': 'awt', + 'dbk': 'dbk', 'doc': 'doc', 'eml': 'eml', + 'fo': 'fo', 'html': 'html', 'isc': 'isc', + 'iscii': 'iscii', 'kwd': 'kwd', 'latex': 'latex', + 'mht': 'mht', 'mif': 'mif', 'nroff': 'nroff', + 'nws': 'nws', 'odt': 'odt', 'pdb': 'pdb', + 'pdf': 'pdf', 'ps': 'ps', 'rtf': 'rtf', + 'sxw': 'sxw', 'text': 'text', 'txt': 'txt', + 'wml': 'wml', 'xml': 'xml', 'xml2ps': 'xml2ps', + 'zabw': 'zabw'} + +class AbiFileConverter(object): + """This let's you convert between all filetypes supperted by the + AbiWord program. Import type isn't checked, as AbiWord doesn't check + on extension, but on metadata. + """ + + def __init__(self, timeout=60): + self.id = None + self.timeout = timeout + self._start_abiword() + + def _start_abiword(self): + """ + Start abiword with the AbiCommand plugin, if not already started + """ + + # find the abiword executable + abicommand = None + for dir in os.environ['PATH'].split(':'): + if os.path.isfile(os.path.join(dir, 'abiword')): + abicommand = os.path.join(dir, 'abiword') + if not abicommand: + raise AbiConverterError('Can not find abiword executable') + + # start the abiword executable + try: + self.child = pexpect.spawn(abicommand + ' --plugin AbiCommand') + self.child.expect( + 'AbiWord command line plugin: Type "quit" to exit', 10) + except: + raise AbiConverterError('Can not open abiword executable') + + def stop_abiword(self): + """ + Stop the running abiword, kill it if necessary + """ + self.child.sendline('quit') + if self._is_running(): + os.kill(self.child.pid, 9) + + def _is_running(self): + """ + Test to see if abiword is running + """ + try: + self.child.sendline('writepid /dev/null') + self.child.expect('OK', 1) + return True + except: + return False + + def convert_file(self, in_file, out_file=None, type=None): + """ + Convert a file. If out_file is not specified, a byte string is + returned. If type is not specified, the file extension from out_file is + used to determine the type. If this fails, the type 'text' is used. + Return value is -1 if an error occurred. + """ + # is the out_file specified? + return_bytes = False + if out_file is None: + out_file = tempfile.mktemp(prefix="abiconvert_") + return_bytes = True + + # is the type specified + type = TYPES_OUT.get( + type or os.path.splitext(out_file)[1][1:], 'txt') + + # do the coversion + self._perform_conversion(in_file, out_file, type) + + # return a byte string if no out_file is specified + if return_bytes: + fp = open(out_file, 'r') + bytes = fp.read() + fp.close() + os.remove(out_file) + return bytes + + def _perform_conversion(self, in_file, out_file, type): + """ + Do the actual conversion + """ + # make sure we are up and running + if not self._is_running: + self._start_abiword() + + # convert the file + cmd = 'convert %s %s %s' % (os.path.abspath(in_file), + os.path.abspath(out_file), type) + self.child.sendline(cmd) + + # Check for errors + i = self.child.expect(['OK', pexpect.TIMEOUT]) + if i != 0: + raise AbiCommandError('Error performing AbiCommand: %s' %cmd) + + def convert_to_html(self, input): + """ + Convert input file to HTML + """ + + from tempfile import mkstemp,mkdtemp + + THE_OUTDIR = "outdir" + THE_OUTFILE = "outfile" + THE_INDIR = "indir" + THE_INFILE = "infile" + + infile = None + outfile = None + out_f = None + try: + # create in/out files + temp_dir = mkdtemp(prefix="cm_") + + # in + indir_name = os.path.join(temp_dir, THE_INDIR) + os.mkdir(indir_name) + infile_name = os.path.join(indir_name, THE_INFILE) + + # out + outdir_name = os.path.join(temp_dir, THE_OUTDIR) + os.mkdir(outdir_name) + outfile_name = os.path.join(outdir_name, THE_OUTFILE) + + # write infile + infile = open(infile_name,'w') + if type(input) == unicode: + input = input.encode('utf8') + infile.write(input) + infile.close() + + # fix perms + # TODO group permission should suffice + os.chmod(temp_dir, 0755) # read + os.chmod(indir_name, 0755) # read + os.chmod(infile_name, 0755) # read + os.chmod(outdir_name, 0777) # read / write + + # Do the job + self.convert_file(infile_name, outfile_name, 'html') + + out_f = open(outfile_name,'r') + output = out_f.read() + + # load other files (useful only for html) + img_res = [] + if os.path.isdir(outdir_name + '/' + THE_OUTFILE + '_files'): + image_names = [name for name in os.listdir(outdir_name + '/' + THE_OUTFILE + '_files') if name != THE_OUTFILE] + for image_name in image_names: + img_res.append(os.path.join(outdir_name + '/' + THE_OUTFILE + '_files', image_name)) + + # clean images paths + output = re.sub(r' + + + + + %s + + +""" %body + diff -r 0bab4ef95bfe -r bfaab8740995 src/cm/converters/abi_error.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/cm/converters/abi_error.py Thu Jul 21 09:42:33 2011 +0200 @@ -0,0 +1,9 @@ + +class AbiConverterError(Exception): + pass + +class AbiCommandError(Exception): + pass + +class ToolsConverterError(Exception): + pass diff -r 0bab4ef95bfe -r bfaab8740995 src/cm/main.py --- a/src/cm/main.py Thu Jun 16 09:27:56 2011 +0200 +++ b/src/cm/main.py Thu Jul 21 09:42:33 2011 +0200 @@ -12,4 +12,4 @@ # add ch to logger logger.addHandler(ch) -logger_config() \ No newline at end of file +logger_config() diff -r 0bab4ef95bfe -r bfaab8740995 src/cm/utils/comment_positioning.py --- a/src/cm/utils/comment_positioning.py Thu Jun 16 09:27:56 2011 +0200 +++ b/src/cm/utils/comment_positioning.py Thu Jul 21 09:42:33 2011 +0200 @@ -127,7 +127,7 @@ if with_markers: end_ids.reverse() - ret = "%s%s%s"%(''.join(["[%s>"%start_id for start_id in start_ids]), ret, ''.join(["<%s]"%end_id for end_id in end_ids])) + ret = "%s%s%s"%(''.join(["[%s>"%start_id for start_id in start_ids]), ret, ''.join(["<%s]"%end_id for end_id in end_ids])) if with_colors and color != 0 : ret = "%s"%(BCKCOLORS[color], ret) @@ -243,4 +243,4 @@ # top_comment_cpt = top_comment_cpt + 1 # # ret = "%s%s%s"%("""
""", html_comments, """
""") -# return ret \ No newline at end of file +# return ret diff -r 0bab4ef95bfe -r bfaab8740995 src/cm/views/export.py --- a/src/cm/views/export.py Thu Jun 16 09:27:56 2011 +0200 +++ b/src/cm/views/export.py Thu Jul 21 09:42:33 2011 +0200 @@ -8,6 +8,7 @@ from cm.models import Text, TextVersion, Attachment, Comment import mimetypes import simplejson +from cm.cm_settings import USE_ABI EXPORT2_INFOS = { # key -> { mimetype, extension} 's5' : {}, @@ -34,10 +35,20 @@ else : fix_content = content if content_format == 'html': - from cm.converters.oo_converters import combine_css_body - fix_content = combine_css_body(content, '') - from cm.converters.oo_converters import convert_html as oo_convert - export_content = oo_convert(fix_content, format) + if USE_ABI: + from cm.converters.abi_converters import AbiFileConverter + converter = AbiFileConverter() + fix_content = converter.add_html_header(content) + else: + from cm.converters.oo_converters import combine_css_body + fix_content = combine_css_body(content, '') + if USE_ABI: + from cm.converters.abi_converters import AbiFileConverter + converter = AbiFileConverter() + export_content = converter.convert_from_html(fix_content, format) + else: + from cm.converters.oo_converters import convert_html as oo_convert + export_content = oo_convert(fix_content, format) export_infos = EXPORT2_INFOS[format] @@ -189,4 +200,4 @@ return content_export(request, text_version.content, text_version.title, text_version.format, format) def text_feed(request, key): - return "" \ No newline at end of file + return ""