9 import os |
9 import os |
10 from tempfile import mkstemp |
10 from tempfile import mkstemp |
11 import StringIO |
11 import StringIO |
12 import tidy |
12 import tidy |
13 from cm.utils.string_utils import to_unicode |
13 from cm.utils.string_utils import to_unicode |
|
14 from xml.dom.minidom import parseString |
|
15 import re |
14 |
16 |
15 PANDOC_BIN = "pandoc" |
17 PANDOC_BIN = "pandoc" |
16 PANDOC_OPTIONS = " --sanitize-html " |
18 PANDOC_OPTIONS = " --sanitize-html --email-obfuscation=none " |
17 PANDOC_OPTIONS_RAW = " -R " |
19 PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none " |
18 |
20 |
19 MARKDOWN2PDF_BIN = "markdown2pdf" |
21 MARKDOWN2PDF_BIN = "markdown2pdf" |
20 |
22 |
21 # make sure binaries are available |
23 # make sure binaries are available |
22 from cm.utils.system import bin_search |
24 from cm.utils.system import bin_search |
83 |
85 |
84 tidy_options = dict(output_xhtml=1, |
86 tidy_options = dict(output_xhtml=1, |
85 add_xml_decl=0, |
87 add_xml_decl=0, |
86 indent=0, |
88 indent=0, |
87 tidy_mark=0, |
89 tidy_mark=0, |
|
90 logical_emphasis=1, |
|
91 wrap=0, |
88 input_encoding='utf8', |
92 input_encoding='utf8', |
89 output_encoding='utf8', |
93 output_encoding='utf8', |
90 ) |
94 ) |
91 tidyied_content = tidy.parseString(to_unicode(content).encode('utf8'), **tidy_options) |
95 tidyied_content = tidy.parseString(to_unicode(content).encode('utf8'), **tidy_options) |
92 tidyied_content = str(tidyied_content) |
96 tidyied_content = str(tidyied_content) |
181 # pandoc arguments and command line |
185 # pandoc arguments and command line |
182 p_options = PANDOC_OPTIONS |
186 p_options = PANDOC_OPTIONS |
183 if raw: |
187 if raw: |
184 p_options = PANDOC_OPTIONS_RAW |
188 p_options = PANDOC_OPTIONS_RAW |
185 |
189 |
|
190 # do not use pandoc to convert from html to html |
|
191 if from_format==to_format=='html': |
|
192 # get body content |
|
193 stdoutdata = (content.encode('utf8')) |
|
194 # if for some reason tidy has not guessed the doctype, make xml.dom.minidom happy with HTML entities (&nbsp;) |
|
195 stdoutdata = re.sub(r" ", '\xc2\xa0', stdoutdata) |
|
196 dom = parseString(stdoutdata) |
|
197 body = dom.getElementsByTagName("body")[0].toxml() |
|
198 stdoutdata = body[body.find('>')+1:body.rfind('</')] |
|
199 # strip leading spaces |
|
200 stdoutdata = re.sub(r"^\s+", '', stdoutdata) |
|
201 # add new line before closing bracket |
|
202 stdoutdata = re.sub(r"(\/?)>", r"\n\1>", stdoutdata) |
|
203 # do not split closing tag with following opening tag |
|
204 stdoutdata = re.sub(r">\n<", r"><", stdoutdata) |
|
205 # nest header tags |
|
206 #stdoutdata = re.sub(r'<h(\d) id="([^"]+)"\n>', r'<div id="\2"><h\1>', stdoutdata) |
|
207 #stdoutdata = re.sub(r'<\/h(\d)\n>', r'</h\1></div>', stdoutdata) |
|
208 return stdoutdata |
|
209 |
186 cmd_args = ' %s -o %s ' %(p_options,output_temp_name) |
210 cmd_args = ' %s -o %s ' %(p_options,output_temp_name) |
187 if full: |
211 if full: |
188 cmd_args += ' -s ' |
212 cmd_args += ' -s ' |
189 cmd_args += ' -f %s ' % from_format |
213 cmd_args += ' -f %s ' % from_format |
190 cmd_args += ' -t %s ' % to_format |
214 cmd_args += ' -t %s ' % to_format |