src/cm/converters/pandoc_converters.py
changeset 119 5e8dda1b7631
parent 0 40c8f766c9b8
child 149 0f2c5744b39b
equal deleted inserted replaced
118:75d94dd14511 119:5e8dda1b7631
     8 from subprocess import Popen, PIPE, call
     8 from subprocess import Popen, PIPE, call
     9 import os
     9 import os
    10 from tempfile import mkstemp
    10 from tempfile import mkstemp
    11 import StringIO
    11 import StringIO
    12 import tidy
    12 import tidy
    13 
    13 from cm.utils.string import to_unicode
    14 
    14 
    15 PANDOC_BIN = "pandoc"
    15 PANDOC_BIN = "pandoc"
    16 PANDOC_OPTIONS = "--sanitize-html "
    16 PANDOC_OPTIONS = "--sanitize-html "
    17 
    17 
    18 MARKDOWN2PDF_BIN = "markdown2pdf"
    18 MARKDOWN2PDF_BIN = "markdown2pdf"
    44     >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    44     >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    45     """
    45     """
    46     # pandoc does not react well when html is not valid
    46     # pandoc does not react well when html is not valid
    47     # use tidy to clean html  
    47     # use tidy to clean html  
    48     if from_format == 'html':
    48     if from_format == 'html':
    49         content = do_tidy(content)
    49         try:
       
    50             content = do_tidy(content)
       
    51         except:
       
    52             # tidy fails ... try pandoc anyway...
       
    53             content = to_unicode(content)
    50     # if to_format is pdf: use markdown2pdf
    54     # if to_format is pdf: use markdown2pdf
    51     if to_format == 'pdf':        
    55     if to_format == 'pdf':        
    52         if from_format != 'markdown':
    56         if from_format != 'markdown':
    53             content = pandoc_convert(content, from_format, 'markdown', True)
    57             content = pandoc_convert(content, from_format, 'markdown', True)
    54         return pandoc_markdown2pdf(content)
    58         return pandoc_markdown2pdf(content)
    81                         indent=0, 
    85                         indent=0, 
    82                         tidy_mark=0,
    86                         tidy_mark=0,
    83                         input_encoding='utf8',
    87                         input_encoding='utf8',
    84                         output_encoding='utf8',
    88                         output_encoding='utf8',
    85                         )
    89                         )
    86     tidyied_content = tidy.parseString(content.encode('utf8'), **tidy_options)
    90     tidyied_content = tidy.parseString(to_unicode(content).encode('utf8'), **tidy_options)
    87     tidyied_content = str(tidyied_content)
    91     tidyied_content = str(tidyied_content)
    88     if content and not tidyied_content.strip():
    92     if content and not tidyied_content.strip():
    89         raise Exception('Content could not be tidyfied') 
    93         raise Exception('Content could not be tidyfied') 
    90     return str(tidyied_content).decode('utf8')
    94     return str(tidyied_content).decode('utf8')
    91 
    95