src/cm/converters/pandoc_converters.py
changeset 0 40c8f766c9b8
child 119 5e8dda1b7631
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cm/converters/pandoc_converters.py	Mon Nov 23 15:14:29 2009 +0100
@@ -0,0 +1,232 @@
+# python 2.5 compat
+from __future__ import with_statement
+from cm.utils.cache import memoize
+######
+## This module requires pandoc v > 1.0 (pandoc & markdown2pdf executables)
+######
+
+from subprocess import Popen, PIPE, call
+import os
+from tempfile import mkstemp
+import StringIO
+import tidy
+
+
+# name of the pandoc executable (used to build the command line in pandoc_pandoc)
+PANDOC_BIN = "pandoc"
+# options put on every pandoc command line (see pandoc_pandoc)
+PANDOC_OPTIONS = "--sanitize-html "
+
+# name of the executable used for pdf output (see pandoc_markdown2pdf)
+MARKDOWN2PDF_BIN = "markdown2pdf"
+
+# make sure binaries are available
+# NOTE(review): presumably bin_search fails at import time when a binary
+# cannot be found - confirm in cm.utils.system
+from cm.utils.system import bin_search
+bin_search(PANDOC_BIN)
+bin_search(MARKDOWN2PDF_BIN)
+
+# formats pandoc_pandoc accepts as from_format
+INPUT_FORMATS = [
+    'native',
+    'markdown',
+    'rst',
+    'html',
+    'latex',
+]
+
+# formats pandoc_pandoc accepts as to_format
+# 'pdf' is not written by pandoc itself: it is added at the end because
+# pdf output is done with markdown2pdf
+OUTPUT_FORMATS = [
+    'native', 'html', 's5', 'docbook', 'opendocument', 'odt', 'latex',
+    'context', 'texinfo', 'man', 'markdown', 'rst', 'mediawiki', 'rtf',
+] + ['pdf']
+
+# input formats, as (format, format) pairs
+# (presumably the choices offered to the user - verify against callers)
+CHOICES_INPUT_FORMATS = [(f, f) for f in ('markdown', 'rst', 'html')]
+
+DEFAULT_INPUT_FORMAT = 'markdown'
+
+# encoding of the files exchanged with pandoc / markdown2pdf
+_PANDOC_ENCODING = 'utf8'
+
+@memoize
+def pandoc_convert(content, from_format, to_format, full=False):
+    """
+    Convert content from from_format to to_format
+
+    html content is cleaned with tidy first.
+    pdf output is done with markdown2pdf: content that is not markdown is
+    converted to (full) markdown with pandoc beforehand.
+    Any other output is done with pandoc (full: see pandoc_pandoc).
+
+    Returns the converted content (for pdf: what pandoc_markdown2pdf returns)
+
+    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
+    """
+    # pandoc does not react well when html is not valid
+    # use tidy to clean html
+    if from_format == 'html':
+        content = do_tidy(content)
+
+    if to_format != 'pdf':
+        return pandoc_pandoc(content, from_format, to_format, full)
+
+    # if to_format is pdf: use markdown2pdf (which only reads markdown)
+    if from_format != 'markdown':
+        # content is already tidied at this point: call pandoc directly
+        # instead of pandoc_convert, which would run tidy a second time
+        content = pandoc_pandoc(content, from_format, 'markdown', True)
+    return pandoc_markdown2pdf(content)
+
+def content_or_file_name(content, file_name):
+    """
+    Return content, or what is read from file_name if no content is given
+
+    Exactly one of content / file_name should be provided (an empty
+    content or file_name counts as not provided).
+    Raises Exception if none or both of them are provided.
+    """
+    if not content and not file_name:
+        raise Exception('You should provide either a content or a file_name')
+    if content and file_name:
+        raise Exception('You should not provide a content AND a file_name')
+
+    if file_name:
+        # with: the file gets closed even if reading it fails
+        with open(file_name) as fp:
+            content = fp.read()
+
+    return content
+
+@memoize
+def do_tidy(content=None, file_name=None):
+    """
+    Tidy (html) content
+
+    Takes either content or file_name (see content_or_file_name).
+    Byte strings (such as what is read from file_name) are handed to tidy
+    as they are and should be utf8.
+    Returns the xhtml produced by tidy (unicode)
+    Raises Exception if tidy produces nothing
+
+    >>> res = do_tidy('<span>sdd')
+    """
+    content = content_or_file_name(content, file_name)
+
+    # tidy is fed utf8 bytes: only unicode has to be encoded (calling
+    # encode on a non ascii byte string raises UnicodeDecodeError)
+    if isinstance(content, unicode):
+        content = content.encode('utf8')
+
+    tidy_options = dict(output_xhtml=1,
+                        add_xml_decl=0,
+                        indent=0,
+                        tidy_mark=0,
+                        input_encoding='utf8',
+                        output_encoding='utf8',
+                        )
+    tidied_content = str(tidy.parseString(content, **tidy_options))
+    if content and not tidied_content.strip():
+        raise Exception('Content could not be tidyfied')
+    return tidied_content.decode('utf8')
+
+
+def get_filetemp(mode="r"):
+    """
+    Create a temporary file (mkstemp)
+
+    Returns (file object opened with mode, name of the file)
+    Closing and removing the file is left to the caller.
+    """
+    handle, temp_name = mkstemp()
+    temp_fp = os.fdopen(handle, mode)
+    return (temp_fp, temp_name)
+
+# build absolute address for latex header file (latex_header.txt, located
+# in the directory of this module)
+# abspath: __file__ may be relative (e.g. when the module is run as a
+# script), splitting it on os.path.sep would then give a wrong path
+LATEX_HEADER_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                 'latex_header.txt')
+
+# fail at import time: the header is needed by pandoc_markdown2pdf
+if not os.path.isfile(LATEX_HEADER_PATH):
+    raise Exception('LATEX_HEADER_PATH is not a file!')
+
+@memoize
+def pandoc_markdown2pdf(content=None, file_name=None):
+    """
+    Convert markdown content to pdf
+
+    Takes either content or file_name (see content_or_file_name).
+    unicode content is encoded (_PANDOC_ENCODING) before it is written to
+    disk, byte strings are written as they are.
+    Returns the content of the pdf file (byte string)
+    Raises Exception (with what markdown2pdf wrote on stderr) if
+    markdown2pdf fails
+
+    >>> pdf_content = pandoc_markdown2pdf('# dssd')
+    """
+    content = content_or_file_name(content, file_name)
+    # calling encode on a non ascii byte string (file content) raises
+    # UnicodeDecodeError: only encode unicode
+    if isinstance(content, unicode):
+        content = content.encode(_PANDOC_ENCODING)
+
+    temp_file, input_temp_name = get_filetemp('w')
+    fp_error, error_temp_name = get_filetemp('w')
+    # markdown2pdf output file: name of the input file + '.pdf'
+    output_temp_name = input_temp_name + '.pdf'
+
+    try:
+        # write file to disk
+        try:
+            temp_file.write(content)
+        finally:
+            temp_file.close()
+
+        # use markdown2pdf with the custom latex header
+        # arguments are given as a list (no shell): paths containing
+        # spaces or shell special characters are passed untouched
+        cmd = [MARKDOWN2PDF_BIN,
+               '--custom-header=%s' % LATEX_HEADER_PATH,
+               input_temp_name]
+        try:
+            retcode = call(cmd, stderr=fp_error)
+        finally:
+            fp_error.close()
+
+        if retcode:
+            with open(error_temp_name) as fp_error:
+                raise Exception(fp_error.read())
+
+        # pdf is binary data
+        with open(output_temp_name, 'rb') as fp_output:
+            return fp_output.read()
+    finally:
+        # cleanup, whatever happened (the pdf does not exist on failure)
+        for temp_name in (input_temp_name, error_temp_name, output_temp_name):
+            if os.path.exists(temp_name):
+                os.remove(temp_name)
+    
+# TODO: manage images in pandoc (?)
+# TODO: use tidy to cleanup html
+
+@memoize
+def pandoc_pandoc(content, from_format, to_format, full=False):
+    """
+    Convert content (should be unicode) from from_format to to_format
+    (if full: includes header & co [html, latex])
+    Returns the output of pandoc: unicode, or the raw byte string when it
+    cannot be decoded (binary output formats such as odt)
+    Raises Exception if a format is not supported, if content is not
+    unicode, or (with what pandoc wrote on stderr) if pandoc fails
+
+    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', False)
+    >>> res.replace("\\n","")
+    u'<h1 id="sdsd">sdsd</h1>'
+    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', True)
+    """
+    # verify formats
+    if from_format not in INPUT_FORMATS:
+        raise Exception("Input format [%s] is not a supported format [%s]" % (from_format, ' '.join(INPUT_FORMATS)))
+    if to_format not in OUTPUT_FORMATS:
+        raise Exception("Output format [%s] is not a supported format [%s]" % (to_format, ' '.join(OUTPUT_FORMATS)))
+    # isinstance: subclasses of unicode are fine too
+    if not isinstance(content, unicode):
+        raise Exception('Content is not in unicode format!')
+
+    # temp files (the output file is only created here: pandoc writes it)
+    input_file, input_temp_name = get_filetemp('w')
+    output_temp_fp, output_temp_name = get_filetemp()
+    output_temp_fp.close()
+    fp_error, error_temp_name = get_filetemp('w')
+
+    try:
+        try:
+            input_file.write(content.encode(_PANDOC_ENCODING))
+        finally:
+            input_file.close()
+
+        # pandoc arguments and command line
+        # arguments are given as a list (no shell): paths containing
+        # spaces or shell special characters are passed untouched
+        cmd = [PANDOC_BIN] + PANDOC_OPTIONS.split() + ['-o', output_temp_name]
+        if full:
+            cmd.append('-s')
+        cmd += ['-f', from_format, '-t', to_format, input_temp_name]
+
+        try:
+            retcode = call(cmd, stderr=fp_error)
+        finally:
+            fp_error.close()
+
+        if retcode:
+            with open(error_temp_name) as fp_error:
+                raise Exception(fp_error.read())
+
+        # output may be binary (odt)
+        with open(output_temp_name, 'rb') as fp_output:
+            stdoutdata = fp_output.read()
+    finally:
+        # cleanup, whatever happened
+        for temp_name in (output_temp_name, input_temp_name, error_temp_name):
+            if os.path.exists(temp_name):
+                os.remove(temp_name)
+
+    # try converting to unicode
+    try:
+        return stdoutdata.decode(_PANDOC_ENCODING)
+    except UnicodeDecodeError:
+        # this will fail for binary output formats such as odt
+        # return result without conversion then
+        return stdoutdata
+    
+# when run as a script: run the doctests of this module
+# (they call the pandoc / markdown2pdf binaries)
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
+