src/cm/converters/pandoc_converters.py
author gibus
Thu, 09 Aug 2012 16:55:25 +0200
changeset 459 a69421197502
parent 458 ba7e05582435
child 517 e7bc083fa9dc
permissions -rw-r--r--
Fixed renderContents on BeautifulSoup if there is no body in source html

# python 2.5 compat
from __future__ import with_statement
from cm.utils.cache import memoize, dj_memoize
######
## This module requires pandoc v > 1.0 (pandoc & markdown executables) 
######

from subprocess import Popen, PIPE, call
import os
from tempfile import mkstemp
import StringIO
import tidy
from cm.utils.string_utils import to_unicode
from BeautifulSoup import BeautifulSoup
import re
from distutils.version import LooseVersion

# Module-level configuration. Everything below runs at import time: the
# installed pandoc is probed through the shell and the command line options
# used by the converters are chosen from the reported version.

# name of the pandoc executable, as passed to the shell
PANDOC_BIN = "pandoc"
import commands
# second whitespace-separated field of the first line printed by `pandoc -v`
PANDOC_VERSION = commands.getstatusoutput(PANDOC_BIN + " -v|head -n 1|awk '{print $2;}'")[1]
# versions before 1.8 additionally get --sanitize-html
if LooseVersion(PANDOC_VERSION) < '1.8':
  PANDOC_OPTIONS = " --sanitize-html --email-obfuscation=none "
else:
  PANDOC_OPTIONS = " --email-obfuscation=none "

# options used instead of PANDOC_OPTIONS when a "raw" conversion is requested
PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none "

# markdown2pdf is only used with pandoc versions before 1.9; None means
# "no separate markdown2pdf executable"
if LooseVersion(PANDOC_VERSION) < '1.9':
  MARKDOWN2PDF_BIN = "markdown2pdf"
else:
  MARKDOWN2PDF_BIN = None

# make sure binaries are available
# NOTE(review): bin_search is defined in cm.utils.system; presumably it fails
# loudly when the executable cannot be found -- confirm there.
from cm.utils.system import bin_search
bin_search(PANDOC_BIN)
if MARKDOWN2PDF_BIN:
  bin_search(MARKDOWN2PDF_BIN)

# pandoc capabilities
# format names accepted by pandoc_pandoc as from_format / to_format
INPUT_FORMATS = ['native', 'markdown', 'rst', 'html', 'latex']
OUTPUT_FORMATS = ['native', 'html', 's5', 'docbook', 'opendocument', 'odt', 'latex', 'context', 'texinfo', 'man', 'markdown', 'rst', 'mediawiki', 'rtf', 'pdf', 'epub']

# input formats
# (value, label) pairs with identical value and label
# (presumably consumed as form field choices -- verify against callers)
CHOICES_INPUT_FORMATS = [(f, f) for f in ['markdown', 'rst', 'html']]

DEFAULT_INPUT_FORMAT = 'markdown'

# encoding used for the temporary files exchanged with pandoc / markdown2pdf
_PANDOC_ENCODING = 'utf8'

@dj_memoize
def pandoc_convert(content, from_format, to_format, full=False, raw=False):
    """
    Convert content from from_format to to_format.

    - html input is first cleaned with do_tidy (pandoc does not react well
      to invalid html); if tidy fails, the content is only passed through
      to_unicode and the conversion is attempted anyway.
    - pdf output goes through markdown2pdf when MARKDOWN2PDF_BIN is set
      (converting to markdown first if needed); every other conversion is
      delegated to pandoc_pandoc.

    content     -- text to convert
    from_format -- input format name
    to_format   -- output format name
    full        -- forwarded to pandoc_pandoc (include header & co)
    raw         -- accepted for interface compatibility but NOT used: the
                   raw flag given to pandoc_pandoc is true exactly for
                   html -> html conversions

    Returns whatever pandoc_markdown2pdf / pandoc_pandoc return.

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    """
    # pandoc does not react well when html is not valid
    # use tidy to clean html
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # (Exception rather than a bare except, so that
            # KeyboardInterrupt / SystemExit are not swallowed)
            content = to_unicode(content)

    # if to_format is pdf: use markdown2pdf
    if MARKDOWN2PDF_BIN and to_format == 'pdf':
        if from_format != 'markdown':
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)

    # use raw pandoc conversion if html->html
    html_to_html = from_format == to_format == 'html'
    return pandoc_pandoc(content, from_format, to_format, full, html_to_html)

def content_or_file_name(content, file_name):
    """
    Return the text to work on, given either a content or a file name.

    Exactly one of the two arguments must be truthy:
    content   -- the text itself; returned unchanged
    file_name -- path of a file whose whole content is read and returned

    Raises Exception if both or neither of them are provided.
    """
    if not content and not file_name:
        raise Exception('You should provide either a content or a file_name')
    if content and file_name:
        raise Exception('You should not provide a content AND a file_name')

    if file_name:
        # the with block closes the file even if read() fails
        with open(file_name) as fp:
            return fp.read()

    return content

@dj_memoize
def do_tidy(content=None, file_name=None):
    """
    Run (html) content through tidy and return the cleaned markup.

    The input is taken from content or read from file_name (see
    content_or_file_name), encoded to utf8 for tidy, and the tidied
    document is decoded from utf8 before being returned.

    Raises Exception if there was some content but tidy produced nothing.

    >>> res = do_tidy('<span>sdd')
    """
    content = content_or_file_name(content, file_name)

    utf8_source = to_unicode(content).encode('utf8')
    document = tidy.parseString(
        utf8_source,
        output_xhtml=1,
        add_xml_decl=0,
        indent=0,
        tidy_mark=0,
        logical_emphasis=1,
        wrap=0,
        input_encoding='utf8',
        output_encoding='utf8',
    )

    tidied = str(document)
    # non-empty input giving an empty (whitespace only) result means failure
    if content and not tidied.strip():
        raise Exception('Content could not be tidyfied')
    return tidied.decode('utf8')


def get_filetemp(mode="r", suffix=''):
    """
    Create a temporary file with mkstemp.

    mode   -- mode used to open the file object
    suffix -- suffix given to mkstemp for the file name

    Returns (file object, file name). The caller has to close the file
    object and remove the file.
    """
    descriptor, path = mkstemp(suffix)
    handle = os.fdopen(descriptor, mode)
    return (handle, path)

@dj_memoize
def pandoc_markdown2pdf(content=None, file_name=None):
    """
    Convert markdown content to pdf with the markdown2pdf executable.

    The markdown is taken from content or read from file_name (see
    content_or_file_name) and written to a temporary file. markdown2pdf is
    first run with --xetex; if that fails it is run again with the custom
    latex header 'latex_header.txt' located next to this module.

    Returns the content of the generated pdf file (byte string).

    Raises Exception if the latex header file is missing when it is needed,
    or if markdown2pdf fails (the message is what it wrote on stderr).
    Requires MARKDOWN2PDF_BIN to be set.

    >>> pdf_content = pandoc_markdown2pdf('# dssd')
    """
    content = content_or_file_name(content, file_name)

    # write file to disk
    temp_file, input_temp_name = get_filetemp('w', 'input')
    fp_error, error_temp_name = get_filetemp('w', 'err')
    # markdown2pdf writes its result next to the input file
    output_temp_name = input_temp_name + '.pdf'

    try:
        try:
            try:
                temp_file.write(content.encode(_PANDOC_ENCODING))
            finally:
                temp_file.close()

            cust_tex = " --xetex "

            # use markdown2pdf
            retcode = call(MARKDOWN2PDF_BIN + cust_tex + ' ' + input_temp_name, shell=True, stderr=fp_error)

            # xetex seems to randomly cause "Invalid or incomplete multibyte or wide character" errors, try without it
            if retcode:
                # build absolute address for latex header file
                # (abspath: __file__ may be a relative path)
                module_dir = os.path.dirname(os.path.abspath(__file__))
                latex_header_path = os.path.join(module_dir, 'latex_header.txt')

                if not os.path.isfile(latex_header_path):
                    raise Exception('LATEX_HEADER_PATH is not a file!')

                # custom latex header
                cust_head_tex = " --custom-header=%s " % latex_header_path

                retcode = call(MARKDOWN2PDF_BIN + cust_head_tex + ' ' + input_temp_name, shell=True, stderr=fp_error)
        finally:
            fp_error.close()

        if retcode:
            with open(error_temp_name) as fp_error:
                error = fp_error.read()
            raise Exception(error)

        # pdf is binary data
        with open(output_temp_name, 'rb') as fp_output:
            return fp_output.read()
    finally:
        # always remove the temporary files, also when something failed
        for temp_name in (input_temp_name, error_temp_name, output_temp_name):
            if os.path.exists(temp_name):
                os.remove(temp_name)
    
# TODO: manage images in pandoc (?)
# TODO: use tidy to cleanup html

@dj_memoize
def pandoc_pandoc(content, from_format, to_format, full=False, raw=False):
    """
    Convert content (should be unicode) from from_format to to_format

    content     -- unicode text to convert
    from_format -- one of INPUT_FORMATS
    to_format   -- one of OUTPUT_FORMATS
    full        -- if true, pandoc is called with -s
                   (includes header & co [html, latex])
    raw         -- if true, PANDOC_OPTIONS_RAW is used instead of
                   PANDOC_OPTIONS

    Returns the converted document: unicode if pandoc's output can be
    decoded as utf8, the undecoded byte string otherwise (binary output
    formats such as odt).
    html -> html is a special case: pandoc is not run, the content of the
    <body> (or the whole content if there is no body) is reformatted and
    returned as a utf8 encoded byte string.

    Raises Exception if a format is not supported, if content is not
    unicode, or if pandoc fails (the message is what pandoc wrote on
    stderr).

    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', False)
    >>> res.replace("\\n","")
    u'<h1 id="sdsd">sdsd</h1>'
    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', True)
    """
    # verify formats
    if from_format not in INPUT_FORMATS:
        raise Exception("Input format [%s] is not a supported format [%s]" % (from_format, ' '.join(INPUT_FORMATS)))
    if to_format not in OUTPUT_FORMATS:
        raise Exception("Output format [%s] is not a supported format [%s]" % (to_format, ' '.join(OUTPUT_FORMATS)))
    # isinstance also accepts subclasses of unicode
    if not isinstance(content, unicode):
        raise Exception('Content is not in unicode format!')

    # do not use pandoc to convert from html to html
    # (handled before any temporary file is created, so nothing is left
    # behind on disk)
    if from_format == to_format == 'html':
        # get body content
        stdoutdata = content.encode('utf8')
        body = BeautifulSoup(stdoutdata).body
        if body:
            stdoutdata = body.renderContents()
        # strip leading spaces
        stdoutdata = re.sub(r"^\s+", '', stdoutdata)
        # add new line before closing bracket
        stdoutdata = re.sub(r"(\/?)>", r"\n\1>", stdoutdata)
        # do not split closing tag with following opening tag
        stdoutdata = re.sub(r">\n<", r"><", stdoutdata)
        return stdoutdata

    # pandoc arguments and command line
    p_options = PANDOC_OPTIONS
    if raw:
        p_options = PANDOC_OPTIONS_RAW

    # For some reason when pandoc > 1.9 converts to PDF, '-t' shouldn't be used but output file name extension has to be '.pdf'
    if to_format != 'pdf':
        output_suffix = 'output'
    else:
        output_suffix = 'output.pdf'

    temp_names = []
    try:
        # temp files
        input_file, input_temp_name = get_filetemp('w', 'input')
        temp_names.append(input_temp_name)
        try:
            input_file.write(content.encode(_PANDOC_ENCODING))
        finally:
            input_file.close()

        # only the name is needed: pandoc writes the file itself (-o)
        output_temp_fp, output_temp_name = get_filetemp('r', output_suffix)
        temp_names.append(output_temp_name)
        output_temp_fp.close()

        fp_error, error_temp_name = get_filetemp('w', 'err')
        temp_names.append(error_temp_name)

        try:
            cmd_args = ' %s -o %s ' % (p_options, output_temp_name)
            if full:
                cmd_args += ' -s '
            cmd_args += ' -f %s ' % from_format
            if to_format != 'pdf':
                cmd_args += ' -t %s ' % to_format
            cmd_args += ' %s ' % input_temp_name
            cmd = PANDOC_BIN + ' ' + cmd_args

            retcode = call(cmd, shell=True, stderr=fp_error)
        finally:
            fp_error.close()

        if retcode:
            with open(error_temp_name) as fp_error:
                error = fp_error.read()
            raise Exception(error)

        # output may be binary (odt, pdf, epub ...)
        with open(output_temp_name, 'rb') as fp_output:
            stdoutdata = fp_output.read()
    finally:
        # cleanup, also when something failed
        for temp_name in temp_names:
            if os.path.exists(temp_name):
                os.remove(temp_name)

    # try converting to unicode
    try:
        stdoutdata = stdoutdata.decode(_PANDOC_ENCODING)
    except UnicodeDecodeError:
        # this will fail for binary output formats such as odt
        # return result without conversion then
        pass

    return stdoutdata
    
# When run as a script, execute the doctests embedded in the docstrings of
# this module.
if __name__ == "__main__":
    import doctest
    doctest.testmod()