src/cm/converters/pandoc_converters.py
changeset 0 40c8f766c9b8
child 119 5e8dda1b7631
equal deleted inserted replaced
-1:000000000000 0:40c8f766c9b8
       
     1 # python 2.5 compat
       
     2 from __future__ import with_statement
       
     3 from cm.utils.cache import memoize
       
     4 ######
       
## This module requires pandoc v > 1.0 (pandoc & markdown2pdf executables)
       
     6 ######
       
     7 
       
     8 from subprocess import Popen, PIPE, call
       
     9 import os
       
    10 from tempfile import mkstemp
       
    11 import StringIO
       
    12 import tidy
       
    13 
       
    14 
       
# Name of the pandoc executable used to build the conversion command line.
PANDOC_BIN = "pandoc"
# Options inserted into every pandoc command line built by this module.
PANDOC_OPTIONS = "--sanitize-html "

# Name of the executable used to produce pdf output from markdown.
MARKDOWN2PDF_BIN = "markdown2pdf"

# make sure binaries are available
# NOTE(review): presumably bin_search fails loudly when a binary is missing;
# confirm against cm.utils.system.
from cm.utils.system import bin_search
bin_search(PANDOC_BIN)
bin_search(MARKDOWN2PDF_BIN)
       
    24 
       
    25 # pandoc capabilities
       
    26 INPUT_FORMATS = ['native', 'markdown', 'rst', 'html', 'latex']
       
    27 OUTPUT_FORMATS = ['native', 'html', 's5', 'docbook', 'opendocument', 'odt', 'latex', 'context', 'texinfo', 'man', 'markdown', 'rst', 'mediawiki', 'rtf']
       
    28 
       
    29 # add pdf output using markdown2pdf
       
    30 OUTPUT_FORMATS.append('pdf')
       
    31 
       
    32 # input formats
       
    33 CHOICES_INPUT_FORMATS = [(f, f) for f in ['markdown', 'rst', 'html']]
       
    34 
       
    35 DEFAULT_INPUT_FORMAT = 'markdown'
       
    36 
       
    37 _PANDOC_ENCODING = 'utf8'
       
    38 
       
    39 @memoize
       
    40 def pandoc_convert(content, from_format, to_format, full=False):
       
    41     """
       
    42     Convert markdown content to pdf
       
    43     
       
    44     >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
       
    45     """
       
    46     # pandoc does not react well when html is not valid
       
    47     # use tidy to clean html  
       
    48     if from_format == 'html':
       
    49         content = do_tidy(content)
       
    50     # if to_format is pdf: use markdown2pdf
       
    51     if to_format == 'pdf':        
       
    52         if from_format != 'markdown':
       
    53             content = pandoc_convert(content, from_format, 'markdown', True)
       
    54         return pandoc_markdown2pdf(content)
       
    55     return pandoc_pandoc(content, from_format, to_format, full)
       
    56 
       
    57 def content_or_file_name(content, file_name):
       
    58     if not content and not file_name:
       
    59         raise Exception('You should provide either a content or a file_name')
       
    60     if content and file_name:
       
    61         raise Exception('You should not provide a content AND a file_name')
       
    62 
       
    63     if file_name:
       
    64         fp = file(file_name)
       
    65         content = fp.read()
       
    66         fp.close()
       
    67 
       
    68     return content
       
    69 
       
    70 @memoize
       
    71 def do_tidy(content=None, file_name=None):
       
    72     """
       
    73     Tidy (html) content
       
    74     
       
    75     >>> res = do_tidy('<span>sdd')
       
    76     """
       
    77     content = content_or_file_name(content, file_name)
       
    78     
       
    79     tidy_options = dict(output_xhtml=1, 
       
    80                         add_xml_decl=0, 
       
    81                         indent=0, 
       
    82                         tidy_mark=0,
       
    83                         input_encoding='utf8',
       
    84                         output_encoding='utf8',
       
    85                         )
       
    86     tidyied_content = tidy.parseString(content.encode('utf8'), **tidy_options)
       
    87     tidyied_content = str(tidyied_content)
       
    88     if content and not tidyied_content.strip():
       
    89         raise Exception('Content could not be tidyfied') 
       
    90     return str(tidyied_content).decode('utf8')
       
    91 
       
    92 
       
    93 def get_filetemp(mode="r"):
       
    94     (fd, fname) = mkstemp()
       
    95     return (os.fdopen(fd, mode), fname)
       
    96 
       
    97 # build absolute address for latex header file
       
    98 _tmp_ = __file__.split(os.path.sep)[:-1]
       
    99 _tmp_.append('latex_header.txt')
       
   100 _tmp_.insert(0, os.path.sep)
       
   101 
       
   102 LATEX_HEADER_PATH = os.path.join(*_tmp_)
       
   103 
       
   104 if not os.path.isfile(LATEX_HEADER_PATH):
       
   105     raise Exception('LATEX_HEADER_PATH is not a file!')
       
   106 
       
   107 @memoize
       
   108 def pandoc_markdown2pdf(content=None, file_name=None):
       
   109     """
       
   110     Convert markdown content to pdf
       
   111     
       
   112     >>> pdf_content = pandoc_markdown2pdf('# dssd')
       
   113     """
       
   114     content = content_or_file_name(content, file_name)
       
   115         
       
   116     # write file to disk
       
   117     temp_file, input_temp_name = get_filetemp('w')
       
   118     fp_error, error_temp_name = get_filetemp('w')
       
   119     
       
   120     temp_file.write(content.encode(_PANDOC_ENCODING))
       
   121     temp_file.close()
       
   122     
       
   123     # custom latex header
       
   124     cust_head_tex = " --custom-header=%s " %LATEX_HEADER_PATH
       
   125     
       
   126     # use markdown2pdf
       
   127     retcode = call(MARKDOWN2PDF_BIN + cust_head_tex + ' ' + input_temp_name, shell=True, stderr=fp_error)
       
   128     fp_error.close()
       
   129     
       
   130     fp_error = file(error_temp_name)
       
   131     error = fp_error.read()
       
   132     fp_error.close()
       
   133 
       
   134     os.remove(input_temp_name)
       
   135     os.remove(error_temp_name)
       
   136     
       
   137     if retcode:
       
   138         raise Exception(error)
       
   139     
       
   140     output_temp_name = input_temp_name + '.pdf'
       
   141     fp_output = file(output_temp_name)
       
   142     pdf_content = fp_output.read()
       
   143     fp_output.close()
       
   144     
       
   145     os.remove(output_temp_name)
       
   146     
       
   147     return pdf_content
       
   148     
       
   149 # TODO: manage images in pandoc (?)
       
   150 # TODO: use tidy to cleanup html
       
   151 
       
   152 @memoize
       
   153 def pandoc_pandoc(content, from_format, to_format, full=False):
       
   154     """
       
   155     Convert content (should be unicode) from from_format to to_format
       
   156     (if full: includes header & co [html, latex])
       
   157     Returns out (unicode), err
       
   158     
       
   159     >>> res, err = pandoc_pandoc(u'# sdsd', 'markdown', 'html', False)
       
   160     >>> print err
       
   161     None
       
   162     >>> res.replace("\\n","")
       
   163     u'<h1 id="sdsd">sdsd</h1>'
       
   164     >>> res, err = pandoc_pandoc(u'# sdsd', 'markdown', 'html', True)
       
   165     >>> print err
       
   166     None
       
   167     """
       
   168     # verify formats
       
   169     if from_format not in INPUT_FORMATS:
       
   170         raise Exception("Input format [%s] is not a supported format [%s]" % (from_format, ' '.join(INPUT_FORMATS)))
       
   171     if to_format not in OUTPUT_FORMATS:
       
   172         raise Exception("Output format [%s] is not a supported format [%s]" % (to_format, ' '.join(OUTPUT_FORMATS)))
       
   173     if type(content) != unicode:
       
   174         raise Exception('Content is not in unicode format!')
       
   175 
       
   176     # temp file
       
   177     input_file, input_temp_name = get_filetemp('w')
       
   178     output_temp_fp, output_temp_name = get_filetemp()
       
   179     output_temp_fp.close()
       
   180     
       
   181     error_temp_fp, error_temp_name = get_filetemp('w')
       
   182     error_temp_fp.close()
       
   183     
       
   184     input_file.write(content.encode(_PANDOC_ENCODING))
       
   185     input_file.close()
       
   186     
       
   187     # pandoc arguments and command line
       
   188     cmd_args = ' %s -o %s ' %(PANDOC_OPTIONS,output_temp_name) 
       
   189     if full:
       
   190         cmd_args += ' -s '
       
   191     cmd_args += ' -f %s ' % from_format
       
   192     cmd_args += ' -t %s ' % to_format
       
   193     cmd_args += ' %s ' % input_temp_name
       
   194     cmd = PANDOC_BIN + ' ' + cmd_args
       
   195 
       
   196     #from socommons.converters.new_conv import controlled_Popen 
       
   197     #controlled_Popen(cmd, stderr=file(error_temp_name,'w'))
       
   198     fp_error = file(error_temp_name,'w')
       
   199     retcode = call(cmd, shell=True, stderr=fp_error)
       
   200     fp_error.close()
       
   201     
       
   202     fp_error = file(error_temp_name)
       
   203     error = fp_error.read()
       
   204     fp_error.close()
       
   205     
       
   206     fp_output = file(output_temp_name)
       
   207     stdoutdata = fp_output.read()
       
   208     fp_output.close()
       
   209     
       
   210     
       
   211     # cleanup
       
   212     os.remove(output_temp_name)
       
   213     os.remove(input_temp_name)
       
   214     os.remove(error_temp_name)
       
   215     
       
   216     if retcode:
       
   217         raise Exception(error)
       
   218 
       
   219     # try converting to unicode
       
   220     try:
       
   221         stdoutdata = stdoutdata.decode(_PANDOC_ENCODING)
       
   222     except UnicodeDecodeError:
       
   223         # this will fail for binary output formats such as odt
       
   224         # return result without conversion then
       
   225         pass
       
   226     
       
   227     return stdoutdata
       
   228     
       
# When executed as a script, run the doctests embedded in the docstrings of
# this module.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
       
   232