src/cm/converters/pandoc_converters.py
changeset 352 07a1fba18fff
parent 351 9245a73f5787
child 355 c926868cf7e6
equal deleted inserted replaced
351:9245a73f5787 352:07a1fba18fff
     9 import os
     9 import os
    10 from tempfile import mkstemp
    10 from tempfile import mkstemp
    11 import StringIO
    11 import StringIO
    12 import tidy
    12 import tidy
    13 from cm.utils.string_utils import to_unicode
    13 from cm.utils.string_utils import to_unicode
       
    14 from xml.dom.minidom import parseString
       
    15 import re
    14 
    16 
    15 PANDOC_BIN = "pandoc"
    17 PANDOC_BIN = "pandoc"
    16 PANDOC_OPTIONS = " --sanitize-html "
    18 PANDOC_OPTIONS = " --sanitize-html --email-obfuscation=none  "
    17 PANDOC_OPTIONS_RAW = " -R "
    19 PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none "
    18 
    20 
    19 MARKDOWN2PDF_BIN = "markdown2pdf"
    21 MARKDOWN2PDF_BIN = "markdown2pdf"
    20 
    22 
    21 # make sure binaries are available
    23 # make sure binaries are available
    22 from cm.utils.system import bin_search
    24 from cm.utils.system import bin_search
    83     
    85     
    84     tidy_options = dict(output_xhtml=1, 
    86     tidy_options = dict(output_xhtml=1, 
    85                         add_xml_decl=0, 
    87                         add_xml_decl=0, 
    86                         indent=0, 
    88                         indent=0, 
    87                         tidy_mark=0,
    89                         tidy_mark=0,
       
    90                         logical_emphasis=1,
       
    91                         wrap=0,
    88                         input_encoding='utf8',
    92                         input_encoding='utf8',
    89                         output_encoding='utf8',
    93                         output_encoding='utf8',
    90                         )
    94                         )
    91     tidyied_content = tidy.parseString(to_unicode(content).encode('utf8'), **tidy_options)
    95     tidyied_content = tidy.parseString(to_unicode(content).encode('utf8'), **tidy_options)
    92     tidyied_content = str(tidyied_content)
    96     tidyied_content = str(tidyied_content)
   181     # pandoc arguments and command line
   185     # pandoc arguments and command line
   182     p_options = PANDOC_OPTIONS
   186     p_options = PANDOC_OPTIONS
   183     if raw:
   187     if raw:
   184         p_options = PANDOC_OPTIONS_RAW
   188         p_options = PANDOC_OPTIONS_RAW
   185                 
   189                 
       
   190     # do not use pandoc to convert from html to html
       
   191     if from_format==to_format=='html':
       
   192       # get body content
       
   193       stdoutdata = (content.encode('utf8'))
       
   194       # if for some reason tidy has not guessed the doctype, make xml.dom.minidom happy with HTML entities (&nbsp;)
       
   195       stdoutdata = re.sub(r" ", '\xc2\xa0', stdoutdata)
       
   196       dom = parseString(stdoutdata)
       
   197       body = dom.getElementsByTagName("body")[0].toxml()
       
   198       stdoutdata = body[body.find('>')+1:body.rfind('</')]
       
   199       # strip leading spaces
       
   200       stdoutdata = re.sub(r"^\s+", '', stdoutdata)
       
   201       # add new line before closing bracket
       
   202       stdoutdata = re.sub(r"(\/?)>", r"\n\1>", stdoutdata)
       
   203       # do not split closing tag with following opening tag
       
   204       stdoutdata = re.sub(r">\n<", r"><", stdoutdata)
       
   205       # nest header tags
       
   206       #stdoutdata = re.sub(r'<h(\d) id="([^"]+)"\n>', r'<div id="\2"><h\1>', stdoutdata)
       
   207       #stdoutdata = re.sub(r'<\/h(\d)\n>', r'</h\1></div>', stdoutdata)
       
   208       return stdoutdata
       
   209 
   186     cmd_args = ' %s -o %s ' %(p_options,output_temp_name) 
   210     cmd_args = ' %s -o %s ' %(p_options,output_temp_name) 
   187     if full:
   211     if full:
   188         cmd_args += ' -s '
   212         cmd_args += ' -s '
   189     cmd_args += ' -f %s ' % from_format
   213     cmd_args += ' -f %s ' % from_format
   190     cmd_args += ' -t %s ' % to_format
   214     cmd_args += ' -t %s ' % to_format