| author | Production Moz <dev@sopinspace.com> |
| Tue, 31 May 2011 18:09:54 +0200 | |
| changeset 352 | 07a1fba18fff |
| parent 351 | 9245a73f5787 |
| child 355 | c926868cf7e6 |
| permissions | -rw-r--r-- |
| 0 | 1 |
# python 2.5 compat |
2 |
from __future__ import with_statement |
|
| 261 | 3 |
from cm.utils.cache import memoize, dj_memoize |
| 0 | 4 |
###### |
5 |
## This module requires pandoc v > 1.0 (pandoc & markdown2pdf executables) |
|
6 |
###### |
|
7 |
||
8 |
from subprocess import Popen, PIPE, call |
|
9 |
import os |
|
10 |
from tempfile import mkstemp |
|
11 |
import StringIO |
|
12 |
import tidy |
|
| 149 | 13 |
from cm.utils.string_utils import to_unicode |
|
352
07a1fba18fff
do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents:
351
diff
changeset
|
14 |
from xml.dom.minidom import parseString |
|
07a1fba18fff
do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents:
351
diff
changeset
|
15 |
import re |
| 0 | 16 |
|
17 |
# executables this module shells out to
PANDOC_BIN = "pandoc"
MARKDOWN2PDF_BIN = "markdown2pdf"

# pandoc command line options:
# - PANDOC_OPTIONS is the default set (sanitizes html)
# - PANDOC_OPTIONS_RAW is used instead when a raw conversion is requested
PANDOC_OPTIONS = " --sanitize-html --email-obfuscation=none "
PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none "

# make sure binaries are available
from cm.utils.system import bin_search
bin_search(PANDOC_BIN)
bin_search(MARKDOWN2PDF_BIN)
|
27 |
||
28 |
# formats pandoc can read
INPUT_FORMATS = ['native', 'markdown', 'rst', 'html', 'latex']

# formats pandoc can write, followed by 'pdf' which is produced
# through markdown2pdf rather than by pandoc itself
OUTPUT_FORMATS = [
    'native', 'html', 's5', 'docbook', 'opendocument', 'odt', 'latex',
    'context', 'texinfo', 'man', 'markdown', 'rst', 'mediawiki', 'rtf',
] + ['pdf']

# input formats offered as (value, label) choices
CHOICES_INPUT_FORMATS = [(f, f) for f in ['markdown', 'rst', 'html']]

DEFAULT_INPUT_FORMAT = 'markdown'

# encoding used for the temp files exchanged with pandoc
_PANDOC_ENCODING = 'utf8'
|
41 |
||
| 261 | 42 |
@dj_memoize
def pandoc_convert(content, from_format, to_format, full=False, raw=False):
    """
    Convert content from from_format to to_format

    - html input is first cleaned with do_tidy (pandoc does not react well
      when html is not valid); if tidy fails, the content is only converted
      to unicode and the conversion is attempted anyway
    - pdf output is produced by pandoc_markdown2pdf, after a first
      conversion to (full) markdown when the input is not markdown already
    - every other conversion is delegated to pandoc_pandoc

    The raw argument is accepted but not used: the raw mode of pandoc_pandoc
    is requested exactly when converting html -> html.

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    """
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # (Exception rather than a bare except, so that
            # KeyboardInterrupt / SystemExit are not swallowed)
            content = to_unicode(content)

    # if to_format is pdf: use markdown2pdf
    if to_format == 'pdf':
        if from_format != 'markdown':
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)

    # use raw pandoc conversion if html->html
    use_raw = (from_format == to_format == 'html')
    return pandoc_pandoc(content, from_format, to_format, full, use_raw)
| 0 | 63 |
|
64 |
def content_or_file_name(content, file_name):
    """
    Return content, or the content of the file file_name

    Exactly one of content / file_name must be provided (both are tested
    for truthiness, so an empty content counts as not provided).

    Raises Exception when none or both are provided.
    """
    if not content and not file_name:
        raise Exception('You should provide either a content or a file_name')
    if content and file_name:
        raise Exception('You should not provide a content AND a file_name')

    if file_name:
        # the with block closes the file even if read() fails
        with open(file_name) as fp:
            content = fp.read()

    return content
|
76 |
||
| 261 | 77 |
@dj_memoize
def do_tidy(content=None, file_name=None):
    """
    Clean (html) content with tidy and return the result as unicode

    The html is taken from content or read from file_name
    (see content_or_file_name).

    Raises Exception when tidy returns nothing for a non empty content.

    >>> res = do_tidy('<span>sdd')
    """
    content = content_or_file_name(content, file_name)

    tidy_options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
        'logical_emphasis': 1,
        'wrap': 0,
        'input_encoding': 'utf8',
        'output_encoding': 'utf8',
    }

    # tidy is fed utf8 encoded bytes and its result is serialized with str()
    utf8_html = to_unicode(content).encode('utf8')
    tidied = str(tidy.parseString(utf8_html, **tidy_options))

    if content and not tidied.strip():
        raise Exception('Content could not be tidyfied')
    return tidied.decode('utf8')
|
100 |
||
101 |
||
|
351
9245a73f5787
use different tempfiles for in and out + use xtex for markdown2pdf
Production Moz <dev@sopinspace.com>
parents:
261
diff
changeset
|
102 |
def get_filetemp(mode="r", suffix=''):
    """
    Create a temp file (mkstemp) and return (file object, file name)

    mode is the mode used to open the file object, suffix is the suffix
    of the temp file name.

    The caller is responsible for closing the file object and removing
    the file. If the file object cannot be created (e.g. invalid mode),
    the descriptor is closed and the temp file removed before the error
    propagates.
    """
    (fd, fname) = mkstemp(suffix)
    opened = False
    try:
        fp = os.fdopen(fd, mode)
        opened = True
        return (fp, fname)
    finally:
        if not opened:
            # do not leak the descriptor nor the file on disk
            os.close(fd)
            os.remove(fname)
105 |
||
| 261 | 106 |
@dj_memoize
def pandoc_markdown2pdf(content=None, file_name=None):
    """
    Convert markdown content to pdf (using markdown2pdf with xetex)

    The markdown is taken from content or read from file_name
    (see content_or_file_name). Returns the content of the pdf file.

    Raises Exception (with what markdown2pdf wrote on stderr) when
    markdown2pdf exits with a non zero return code.

    >>> pdf_content = pandoc_markdown2pdf('# dssd')
    """
    content = content_or_file_name(content, file_name)

    temp_file, input_temp_name = get_filetemp('w', 'input')
    fp_error, error_temp_name = get_filetemp('w', 'err')
    # the pdf is expected next to the input file, with '.pdf' appended
    output_temp_name = input_temp_name + '.pdf'

    try:
        # write file to disk
        try:
            temp_file.write(content.encode(_PANDOC_ENCODING))
        finally:
            temp_file.close()

        # use markdown2pdf
        # (argument list rather than a shell string: no quoting issue
        # whatever the temp file path looks like)
        try:
            retcode = call([MARKDOWN2PDF_BIN, '--xetex', input_temp_name],
                           stderr=fp_error)
        finally:
            fp_error.close()

        if retcode:
            with open(error_temp_name) as fp:
                raise Exception(fp.read())

        # pdf is binary data
        with open(output_temp_name, 'rb') as fp_output:
            return fp_output.read()
    finally:
        # cleanup, whatever happened above
        fp_error.close()
        for temp_name in (input_temp_name, error_temp_name, output_temp_name):
            if os.path.exists(temp_name):
                os.remove(temp_name)
|
146 |
||
147 |
# TODO: manage images in pandoc (?) |
|
148 |
# TODO: use tidy to cleanup html |
|
149 |
||
| 261 | 150 |
def _html_body_content(content):
    """
    Return the inner content of the <body> element of an (x)html document

    content is unicode; the result has its leading spaces stripped and
    a new line inserted before each closing bracket.
    """
    html = content.encode('utf8')
    # if for some reason, tidy has not guess the doctype, make
    # xml.dom.minidom happy with HTML entities (&nbsp;): replace them with
    # the utf8 encoded non breaking space
    html = re.sub(r"&nbsp;", '\xc2\xa0', html)
    dom = parseString(html)
    # get body content
    body = dom.getElementsByTagName("body")[0].toxml()
    inner = body[body.find('>')+1:body.rfind('</')]
    # strip leading spaces
    inner = re.sub(r"^\s+", '', inner)
    # add new line before closing bracket
    inner = re.sub(r"(\/?)>", r"\n\1>", inner)
    # do not split closing tag with following opening tag
    inner = re.sub(r">\n<", r"><", inner)
    # nest headers tags (disabled)
    #inner = re.sub(r'<h(\d) id="([^"]+)"\n>', r'<div id="\2"><h\1>', inner)
    #inner = re.sub(r'<\/h(\d)\n>', r'</h\1></div>', inner)
    return inner


@dj_memoize
def pandoc_pandoc(content, from_format, to_format, full=False, raw=False):
    """
    Convert content (should be unicode) from from_format to to_format
    (if full: includes header & co [html, latex])
    (if raw: PANDOC_OPTIONS_RAW is used instead of PANDOC_OPTIONS)

    html -> html is not handed to pandoc: the content of the <body>
    element is extracted and returned.

    Returns the converted content, as unicode when the output of pandoc
    can be decoded (binary formats such as odt are returned undecoded).

    Raises Exception when a format is not supported, when content is not
    unicode, or (with what pandoc wrote on stderr) when pandoc exits with
    a non zero return code.

    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', False)
    >>> res.replace("\\n","")
    u'<h1 id="sdsd">sdsd</h1>'
    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', True)
    >>> isinstance(res, unicode)
    True
    """
    # verify formats
    if from_format not in INPUT_FORMATS:
        raise Exception("Input format [%s] is not a supported format [%s]" % (from_format, ' '.join(INPUT_FORMATS)))
    if to_format not in OUTPUT_FORMATS:
        raise Exception("Output format [%s] is not a supported format [%s]" % (to_format, ' '.join(OUTPUT_FORMATS)))
    if not isinstance(content, unicode):
        raise Exception('Content is not in unicode format!')

    # do not use pandoc to convert from html to html
    # (handled before any temp file is created: nothing to clean up)
    if from_format == to_format == 'html':
        return _html_body_content(content)

    # pandoc arguments and command line
    p_options = PANDOC_OPTIONS
    if raw:
        p_options = PANDOC_OPTIONS_RAW

    # temp files
    input_file, input_temp_name = get_filetemp('w', 'input')
    output_temp_fp, output_temp_name = get_filetemp('r', 'output')
    output_temp_fp.close()
    fp_error, error_temp_name = get_filetemp('w', 'err')

    try:
        try:
            input_file.write(content.encode(_PANDOC_ENCODING))
        finally:
            input_file.close()

        # argument list rather than a shell string: no quoting issue
        # whatever the temp file paths look like
        cmd = [PANDOC_BIN] + p_options.split() + ['-o', output_temp_name]
        if full:
            cmd.append('-s')
        cmd += ['-f', from_format, '-t', to_format, input_temp_name]

        try:
            retcode = call(cmd, stderr=fp_error)
        finally:
            fp_error.close()

        if retcode:
            with open(error_temp_name) as fp:
                raise Exception(fp.read())

        # output may be binary (odt...)
        with open(output_temp_name, 'rb') as fp_output:
            stdoutdata = fp_output.read()
    finally:
        # cleanup, whatever happened above
        fp_error.close()
        for temp_name in (output_temp_name, input_temp_name, error_temp_name):
            if os.path.exists(temp_name):
                os.remove(temp_name)

    # try converting to unicode
    try:
        stdoutdata = stdoutdata.decode(_PANDOC_ENCODING)
    except UnicodeDecodeError:
        # this will fail for binary output formats such as odt
        # return result without conversion then
        pass

    return stdoutdata
|
250 |
||
251 |
if __name__ == "__main__":
    # executed as a script: run the doctests of this module
    from doctest import testmod
    testmod()
|
254 |