| author | gibus |
| Mon, 14 May 2012 15:24:57 +0200 | |
| changeset 427 | ffb767e0f7c0 |
| parent 419 | 34f5b9e4c4f9 |
| child 428 | 9591c651391d |
| permissions | -rw-r--r-- |
| 0 | 1 |
# python 2.5 compat |
2 |
from __future__ import with_statement |
|
| 261 | 3 |
from cm.utils.cache import memoize, dj_memoize |
| 0 | 4 |
###### |
5 |
## This module requires pandoc v > 1.0 (pandoc & markdown executables) |
|
6 |
###### |
|
7 |
||
8 |
from subprocess import Popen, PIPE, call |
|
9 |
import os |
|
10 |
from tempfile import mkstemp |
|
11 |
import StringIO |
|
12 |
import tidy |
|
| 149 | 13 |
from cm.utils.string_utils import to_unicode |
|
352
07a1fba18fff
do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents:
351
diff
changeset
|
14 |
from xml.dom.minidom import parseString |
|
07a1fba18fff
do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents:
351
diff
changeset
|
15 |
import re |
| 0 | 16 |
|
17 |
# --- external executables and their command line options ---
PANDOC_BIN = "pandoc"
MARKDOWN2PDF_BIN = "markdown2pdf"

# default options, and the variant used when a "raw" conversion is requested
PANDOC_OPTIONS = " --email-obfuscation=none "
PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none "

# make sure binaries are available
from cm.utils.system import bin_search
bin_search(PANDOC_BIN)
bin_search(MARKDOWN2PDF_BIN)

# --- pandoc capabilities ---
INPUT_FORMATS = ['native', 'markdown', 'rst', 'html', 'latex']
OUTPUT_FORMATS = [
    'native', 'html', 's5', 'docbook', 'opendocument', 'odt', 'latex',
    'context', 'texinfo', 'man', 'markdown', 'rst', 'mediawiki', 'rtf',
]

# pdf is not produced by pandoc itself: it is added because markdown2pdf
# is used for that output
OUTPUT_FORMATS.append('pdf')

# (value, label) pairs for the input formats
CHOICES_INPUT_FORMATS = [(f, f) for f in ['markdown', 'rst', 'html']]
DEFAULT_INPUT_FORMAT = 'markdown'

# encoding used for the files exchanged with the external converters
_PANDOC_ENCODING = 'utf8'
|
41 |
||
| 261 | 42 |
@dj_memoize
def pandoc_convert(content, from_format, to_format, full=False, raw=False):
    """
    Convert content from from_format to to_format.

    - html input is first cleaned with tidy (pandoc does not react well to
      invalid html); if tidy fails the content is only converted to unicode
      and handed over as is.
    - pdf output is produced by markdown2pdf, going through an intermediate
      markdown document when the input is not markdown already.
    - every other conversion is delegated to pandoc_pandoc.

    full is forwarded to pandoc_pandoc.
    raw is accepted but currently has no effect: raw mode is requested from
    pandoc_pandoc only (and always) for html -> html conversions.

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    """
    # pandoc does not react well when html is not valid
    # use tidy to clean html
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # (only ordinary errors are swallowed: KeyboardInterrupt and
            # SystemExit must not be hidden by this best-effort fallback)
            content = to_unicode(content)

    # if to_format is pdf: use markdown2pdf
    if to_format == 'pdf':
        if from_format != 'markdown':
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)

    # use raw pandoc conversion if html->html
    html_to_html = from_format == to_format == 'html'
    return pandoc_pandoc(content, from_format, to_format, full, html_to_html)
| 0 | 63 |
|
64 |
def content_or_file_name(content, file_name):
    """
    Return the text to work on, taken from exactly one of two sources.

    content   -- the text itself; returned unchanged when it is provided
    file_name -- path of a file whose whole content is read and returned

    An empty content / file_name counts as "not provided".
    Raises Exception when neither or both of them are provided.
    """
    if not content and not file_name:
        raise Exception('You should provide either a content or a file_name')
    if content and file_name:
        raise Exception('You should not provide a content AND a file_name')

    if file_name:
        # open() rather than the Python 2-only file() builtin; the with
        # block closes the handle even when read() raises
        with open(file_name) as fp:
            return fp.read()

    return content
|
76 |
||
| 261 | 77 |
@dj_memoize
def do_tidy(content=None, file_name=None):
    """
    Clean up html with tidy and return the resulting xhtml, decoded
    from utf8.

    The html is taken either from content or from the file file_name
    (see content_or_file_name).
    Raises Exception when a non-empty input tidies down to nothing.

    >>> res = do_tidy('<span>sdd')
    """
    raw_html = content_or_file_name(content, file_name)

    options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
        'logical_emphasis': 1,
        'wrap': 0,
        'input_encoding': 'utf8',
        'output_encoding': 'utf8',
    }

    # tidy is fed utf8 bytes, matching the input_encoding option above
    utf8_html = to_unicode(raw_html).encode('utf8')
    tidied = str(tidy.parseString(utf8_html, **options))

    if raw_html and not tidied.strip():
        raise Exception('Content could not be tidyfied')
    return tidied.decode('utf8')
|
100 |
||
101 |
||
|
351
9245a73f5787
use different tempfiles for in and out + use xtex for markdown2pdf
Production Moz <dev@sopinspace.com>
parents:
261
diff
changeset
|
102 |
def get_filetemp(mode="r", suffix=''):
    """
    Create a temporary file and return (file object, file name).

    mode   -- mode the file object is opened with (as for os.fdopen)
    suffix -- suffix of the temporary file name (as for mkstemp)

    The caller is responsible for closing the file object and for removing
    the file. If the descriptor cannot be wrapped (e.g. invalid mode) the
    temporary file is discarded before the error is re-raised.
    """
    fd, fname = mkstemp(suffix)
    try:
        return (os.fdopen(fd, mode), fname)
    except Exception:
        # nobody will ever get hold of fd / fname: do not leak them
        _discard_tempfile(fd, fname)
        raise


def _discard_tempfile(fd, fname):
    """Best-effort close of fd and removal of fname (errors are ignored)."""
    for release, target in ((os.close, fd), (os.remove, fname)):
        try:
            release(target)
        except OSError:
            # already closed / already gone: nothing left to clean up
            pass
105 |
||
| 261 | 106 |
@dj_memoize
def pandoc_markdown2pdf(content=None, file_name=None):
    """
    Convert markdown content to pdf and return the pdf file content.

    The markdown is taken either from content or from the file file_name
    (see content_or_file_name). markdown2pdf is first run with --xetex;
    if that fails it is run again with the custom latex header
    (latex_header.txt, located next to this module) instead.

    Raises Exception with what markdown2pdf wrote on stderr when the
    conversion fails, or when the latex header file is missing.
    All temporary files are removed, whatever the outcome.

    >>> pdf_content = pandoc_markdown2pdf('# dssd')
    """
    content = content_or_file_name(content, file_name)

    temp_file, input_temp_name = get_filetemp('w', 'input')
    # the pdf is read back from <input file name>.pdf
    output_temp_name = input_temp_name + '.pdf'
    temp_names = [input_temp_name, output_temp_name]
    try:
        # write file to disk
        try:
            temp_file.write(content.encode(_PANDOC_ENCODING))
        finally:
            temp_file.close()

        fp_error, error_temp_name = get_filetemp('w', 'err')
        temp_names.append(error_temp_name)
        try:
            # use markdown2pdf (argument list: no shell quoting issues)
            retcode = call([MARKDOWN2PDF_BIN, '--xetex', input_temp_name],
                           stderr=fp_error)

            # xetex seems to randomly cause "Invalid or incomplete multibyte
            # or wide character" errors, try without it
            if retcode:
                # absolute path of the latex header file shipped with this module
                module_dir = os.path.dirname(os.path.abspath(__file__))
                latex_header_path = os.path.join(module_dir, 'latex_header.txt')
                if not os.path.isfile(latex_header_path):
                    raise Exception('LATEX_HEADER_PATH is not a file!')

                # custom latex header
                retcode = call([MARKDOWN2PDF_BIN,
                                '--custom-header=%s' % latex_header_path,
                                input_temp_name],
                               stderr=fp_error)
        finally:
            fp_error.close()

        if retcode:
            with open(error_temp_name) as fp_error:
                raise Exception(fp_error.read())

        # a pdf is binary data: never read it in text mode
        with open(output_temp_name, 'rb') as fp_output:
            return fp_output.read()
    finally:
        for temp_name in temp_names:
            if os.path.exists(temp_name):
                os.remove(temp_name)
|
164 |
||
165 |
# TODO: manage images in pandoc (?) |
|
166 |
# TODO: use tidy to cleanup html |
|
167 |
||
| 261 | 168 |
@dj_memoize
def pandoc_pandoc(content, from_format, to_format, full=False, raw=False):
    """
    Convert content (should be unicode) from from_format to to_format

    html -> html is handled without running pandoc: the markup found
    inside the <body> element is returned. Every other conversion runs
    the pandoc executable on temporary files.

    from_format -- one of INPUT_FORMATS
    to_format   -- one of OUTPUT_FORMATS
    full        -- if true, pandoc is run with -s (includes header & co
                   [html, latex])
    raw         -- if true, PANDOC_OPTIONS_RAW is used instead of
                   PANDOC_OPTIONS

    Returns the converted content as unicode; pandoc output which cannot
    be decoded (binary formats such as odt) is returned undecoded.
    Raises Exception for an unsupported format, for non unicode content,
    and with what pandoc wrote on stderr when pandoc fails.

    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', False)
    >>> res.replace("\\n","")
    u'<h1 id="sdsd">sdsd</h1>'
    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', True)
    """
    # verify formats
    if from_format not in INPUT_FORMATS:
        raise Exception("Input format [%s] is not a supported format [%s]" % (from_format, ' '.join(INPUT_FORMATS)))
    if to_format not in OUTPUT_FORMATS:
        raise Exception("Output format [%s] is not a supported format [%s]" % (to_format, ' '.join(OUTPUT_FORMATS)))
    # isinstance: subclasses of unicode are unicode content too
    if not isinstance(content, unicode):
        raise Exception('Content is not in unicode format!')

    # do not use pandoc to convert from html to html
    # (checked before any temporary file is created, so none is left behind)
    if from_format == to_format == 'html':
        return _html_body_markup(content)

    # pandoc options
    if raw:
        p_options = PANDOC_OPTIONS_RAW
    else:
        p_options = PANDOC_OPTIONS

    temp_names = []
    try:
        # temp files
        input_file, input_temp_name = get_filetemp('w', 'input')
        temp_names.append(input_temp_name)
        try:
            input_file.write(content.encode(_PANDOC_ENCODING))
        finally:
            input_file.close()

        # only the name is needed: pandoc writes the file itself (-o)
        output_temp_fp, output_temp_name = get_filetemp('r', 'output')
        temp_names.append(output_temp_name)
        output_temp_fp.close()

        fp_error, error_temp_name = get_filetemp('w', 'err')
        temp_names.append(error_temp_name)

        # pandoc command line (argument list: no shell quoting issues)
        cmd = [PANDOC_BIN] + p_options.split() + ['-o', output_temp_name]
        if full:
            cmd.append('-s')
        cmd += ['-f', from_format, '-t', to_format, input_temp_name]

        try:
            retcode = call(cmd, stderr=fp_error)
        finally:
            fp_error.close()

        if retcode:
            with open(error_temp_name) as fp_error:
                raise Exception(fp_error.read())

        # binary mode: some output formats (odt) are not text
        with open(output_temp_name, 'rb') as fp_output:
            stdoutdata = fp_output.read()
    finally:
        # cleanup
        for temp_name in temp_names:
            if os.path.exists(temp_name):
                os.remove(temp_name)

    # try converting to unicode
    try:
        return stdoutdata.decode(_PANDOC_ENCODING)
    except UnicodeDecodeError:
        # this will fail for binary output formats such as odt
        # return result without conversion then
        return stdoutdata


def _html_body_markup(content):
    """Return the markup located inside the <body> element of an xhtml document."""
    stdoutdata = content.encode('utf8')
    # if for some reason, tidy has not guessed the doctype, make
    # xml.dom.minidom happy with HTML entities: &nbsp; is replaced by the
    # utf8 encoded non-breaking space
    stdoutdata = re.sub(r"&nbsp;", '\xc2\xa0', stdoutdata)
    dom = parseString(stdoutdata)
    body = dom.getElementsByTagName("body")[0].toxml()
    # get body content: drop the opening <body ...> and the closing </body>
    stdoutdata = body[body.find('>') + 1:body.rfind('</')]
    # strip leading spaces
    stdoutdata = re.sub(r"^\s+", '', stdoutdata)
    # add new line before closing bracket
    stdoutdata = re.sub(r"(\/?)>", r"\n\1>", stdoutdata)
    # do not split closing tag with following opening tag
    stdoutdata = re.sub(r">\n<", r"><", stdoutdata)
    return stdoutdata
|
270 |
||
271 |
if __name__ == "__main__":
    # executed as a script: run the doctests found in this module
    from doctest import testmod
    testmod()
|
274 |