# HG changeset patch # User Production Moz # Date 1306858194 -7200 # Node ID 07a1fba18fffbf7943811582906bbfe3ebb794ef # Parent 9245a73f578750d69db1b729875ce551152f3db0 do not use pandoc to convert from html to html diff -r 9245a73f5787 -r 07a1fba18fff src/cm/converters/pandoc_converters.py --- a/src/cm/converters/pandoc_converters.py Tue May 31 18:07:46 2011 +0200 +++ b/src/cm/converters/pandoc_converters.py Tue May 31 18:09:54 2011 +0200 @@ -11,10 +11,12 @@ import StringIO import tidy from cm.utils.string_utils import to_unicode +from xml.dom.minidom import parseString +import re PANDOC_BIN = "pandoc" -PANDOC_OPTIONS = " --sanitize-html " -PANDOC_OPTIONS_RAW = " -R " +PANDOC_OPTIONS = " --sanitize-html --email-obfuscation=none " +PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none " MARKDOWN2PDF_BIN = "markdown2pdf" @@ -85,6 +87,8 @@ add_xml_decl=0, indent=0, tidy_mark=0, + logical_emphasis=1, + wrap=0, input_encoding='utf8', output_encoding='utf8', ) @@ -183,6 +187,26 @@ if raw: p_options = PANDOC_OPTIONS_RAW + # do not use pandoc to convert from html to html + if from_format==to_format=='html': + # get body content + stdoutdata = (content.encode('utf8')) + # if for some reason, tidy has not guess the doctype, make xml.dom.minidom happy with HTML entities ( ) + stdoutdata = re.sub(r" ", '\xc2\xa0', stdoutdata) + dom = parseString(stdoutdata) + body = dom.getElementsByTagName("body")[0].toxml() + stdoutdata = body[body.find('>')+1:body.rfind('", r"\n\1>", stdoutdata) + # do not split closing tag with following opening tag + stdoutdata = re.sub(r">\n<", r"><", stdoutdata) + # nest headers tags + #stdoutdata = re.sub(r'', r'

', stdoutdata) + #stdoutdata = re.sub(r'<\/h(\d)\n>', r'

', stdoutdata) + return stdoutdata + cmd_args = ' %s -o %s ' %(p_options,output_temp_name) if full: cmd_args += ' -s '