| author | gibus |
| Thu, 24 May 2012 12:48:39 +0200 | |
| changeset 442 | b6e443be2a9b |
| parent 428 | 9591c651391d |
| child 443 | cacd524f5279 |
| permissions | -rw-r--r-- |
| 0 | 1 |
# python 2.5 compat |
2 |
from __future__ import with_statement |
|
| 261 | 3 |
from cm.utils.cache import memoize, dj_memoize |
| 0 | 4 |
###### |
5 |
## This module requires pandoc v > 1.0 (pandoc & markdown executables) |
|
6 |
###### |
|
7 |
||
8 |
from subprocess import Popen, PIPE, call |
|
9 |
import os |
|
10 |
from tempfile import mkstemp |
|
11 |
import StringIO |
|
12 |
import tidy |
|
| 149 | 13 |
from cm.utils.string_utils import to_unicode |
|
352
07a1fba18fff
do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents:
351
diff
changeset
|
14 |
from xml.dom.minidom import parseString |
|
07a1fba18fff
do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents:
351
diff
changeset
|
15 |
import re |
| 442 | 16 |
from distutils.version import LooseVersion |
| 0 | 17 |
|
18 |
# name of the pandoc executable (checked with bin_search below)
PANDOC_BIN = "pandoc"

import commands
# second whitespace-separated field of the first line printed by `pandoc -v`
# NOTE: this runs a shell pipeline every time the module is imported
PANDOC_VERSION = commands.getstatusoutput(PANDOC_BIN + " -v|head -n 1|awk '{print $2;}'")[1]
# --sanitize-html is only passed to pandoc versions older than 1.8
if LooseVersion(PANDOC_VERSION) < '1.8':
    PANDOC_OPTIONS = " --sanitize-html --email-obfuscation=none "
else:
    PANDOC_OPTIONS = " --email-obfuscation=none "

# options used instead of PANDOC_OPTIONS when a "raw" conversion is requested
PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none "

# markdown2pdf is only used with pandoc versions older than 1.9;
# None means pdf output is produced by calling pandoc itself
if LooseVersion(PANDOC_VERSION) < '1.9':
    MARKDOWN2PDF_BIN = "markdown2pdf"
else:
    MARKDOWN2PDF_BIN = None

# make sure binaries are available
from cm.utils.system import bin_search
bin_search(PANDOC_BIN)
if MARKDOWN2PDF_BIN:
    bin_search(MARKDOWN2PDF_BIN)

# pandoc capabilities
# (pandoc_pandoc rejects any format that is not listed here)
INPUT_FORMATS = ['native', 'markdown', 'rst', 'html', 'latex']
OUTPUT_FORMATS = ['native', 'html', 's5', 'docbook', 'opendocument', 'odt', 'latex', 'context', 'texinfo', 'man', 'markdown', 'rst', 'mediawiki', 'rtf', 'pdf']

# input formats
# (value, label) pairs -- presumably consumed by form/model choice fields; confirm against callers
CHOICES_INPUT_FORMATS = [(f, f) for f in ['markdown', 'rst', 'html']]

DEFAULT_INPUT_FORMAT = 'markdown'

# encoding used for the temp files exchanged with pandoc / markdown2pdf
_PANDOC_ENCODING = 'utf8'
|
49 |
||
| 261 | 50 |
@dj_memoize
def pandoc_convert(content, from_format, to_format, full=False, raw=False):
    """
    Convert content from from_format to to_format

    - html input is first cleaned with do_tidy (pandoc does not react well
      to invalid html); if tidy fails the content is only converted to
      unicode and handed to pandoc anyway
    - pdf output goes through markdown2pdf when MARKDOWN2PDF_BIN is set
      (content is first converted to markdown if needed)
    - every other conversion is delegated to pandoc_pandoc

    full: passed on to pandoc_pandoc (standalone document with header & co)
    raw: accepted for backward compatibility but NOT used: the raw flag
         given to pandoc_pandoc is always "html -> html conversion"

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    """
    # pandoc does not react well when html is not valid
    # use tidy to clean html
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # (Exception, not a bare except: do not swallow
            # KeyboardInterrupt / SystemExit)
            content = to_unicode(content)

    # if to_format is pdf: use markdown2pdf (when available)
    if MARKDOWN2PDF_BIN and to_format == 'pdf':
        if from_format != 'markdown':
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)

    # use raw pandoc conversion if html->html
    html_to_html = from_format == to_format == 'html'
    return pandoc_pandoc(content, from_format, to_format, full, html_to_html)
| 0 | 71 |
|
72 |
def content_or_file_name(content, file_name):
    """
    Return content, or what is read from file_name when no content is given

    Exactly one of content / file_name must be provided (truthy).

    Raises Exception when both or neither are provided.
    """
    if not content and not file_name:
        raise Exception('You should provide either a content or a file_name')
    if content and file_name:
        raise Exception('You should not provide a content AND a file_name')

    if file_name:
        # 'with' guarantees the file is closed even if read() fails
        with open(file_name) as fp:
            content = fp.read()

    return content
|
84 |
||
| 261 | 85 |
@dj_memoize
def do_tidy(content=None, file_name=None):
    """
    Clean up (html) content with tidy and return it as unicode xhtml

    The markup comes either from content or from the file file_name
    (see content_or_file_name).

    Raises Exception when tidy gives back nothing for a non-empty input.

    >>> res = do_tidy('<span>sdd')
    """
    content = content_or_file_name(content, file_name)

    # tidy is fed with (and gives back) utf8 encoded byte strings
    utf8_source = to_unicode(content).encode('utf8')
    options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
        'logical_emphasis': 1,
        'wrap': 0,
        'input_encoding': 'utf8',
        'output_encoding': 'utf8',
    }
    tidied = str(tidy.parseString(utf8_source, **options))

    if content and not tidied.strip():
        raise Exception('Content could not be tidyfied')
    return tidied.decode('utf8')
|
108 |
||
109 |
||
|
351
9245a73f5787
use different tempfiles for in and out + use xtex for markdown2pdf
Production Moz <dev@sopinspace.com>
parents:
261
diff
changeset
|
110 |
def get_filetemp(mode="r", suffix=''):
    """
    Create a temp file (mkstemp) and return (file object, file name)

    mode: mode the file object is opened with
    suffix: suffix of the temp file name

    The caller has to close the file object and to remove the file.
    If the file object cannot be created (invalid mode for instance) the
    temp file is removed before the error is re-raised.
    """
    fd, fname = mkstemp(suffix)
    try:
        return (os.fdopen(fd, mode), fname)
    except Exception:
        # do not leak the descriptor nor the file on disk
        try:
            os.close(fd)
        except OSError:
            # descriptor has already been closed
            pass
        os.remove(fname)
        raise
113 |
||
| 261 | 114 |
@dj_memoize
def pandoc_markdown2pdf(content=None, file_name=None):
    """
    Convert markdown content to pdf (using markdown2pdf)

    The markdown comes either from content or from the file file_name
    (see content_or_file_name).

    markdown2pdf is first run with --xetex; if that fails it is run again
    with the custom latex header 'latex_header.txt' (expected in the
    directory of this module) instead.

    Returns the content of the generated pdf file.
    Raises Exception (with what markdown2pdf wrote on stderr) when both
    runs fail, or when the latex header file is missing.

    >>> pdf_content = pandoc_markdown2pdf('# dssd')
    """
    content = content_or_file_name(content, file_name)

    temp_names = []
    try:
        # write file to disk
        input_fp, input_temp_name = get_filetemp('w', 'input')
        temp_names.append(input_temp_name)
        try:
            input_fp.write(content.encode(_PANDOC_ENCODING))
        finally:
            input_fp.close()

        # markdown2pdf writes its result next to the input file
        output_temp_name = input_temp_name + '.pdf'
        temp_names.append(output_temp_name)

        error_fp, error_temp_name = get_filetemp('w', 'err')
        temp_names.append(error_temp_name)
        try:
            # use markdown2pdf
            # (argument list, no shell: paths with spaces are handled)
            retcode = call([MARKDOWN2PDF_BIN, '--xetex', input_temp_name], stderr=error_fp)

            # xetex seems to randomly cause "Invalid or incomplete multibyte or wide character" errors, try without it
            if retcode:
                # build absolute address for latex header file
                # (abspath: also correct when __file__ is a relative path)
                latex_header_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'latex_header.txt')
                if not os.path.isfile(latex_header_path):
                    raise Exception('LATEX_HEADER_PATH is not a file!')

                # custom latex header
                retcode = call([MARKDOWN2PDF_BIN, '--custom-header=%s' % latex_header_path, input_temp_name], stderr=error_fp)
        finally:
            error_fp.close()

        if retcode:
            with open(error_temp_name) as fp:
                error = fp.read()
            raise Exception(error)

        # pdf is binary data
        with open(output_temp_name, 'rb') as fp:
            return fp.read()
    finally:
        # cleanup, whatever happened above
        for temp_name in temp_names:
            if os.path.exists(temp_name):
                os.remove(temp_name)
|
172 |
||
173 |
# TODO: manage images in pandoc (?) |
|
174 |
# TODO: use tidy to cleanup html |
|
175 |
||
| 261 | 176 |
def _html_body_fragment(content):
    """
    Return the markup found inside the <body> element of an xhtml document

    content (unicode) has to be well formed xml (it is parsed with
    xml.dom.minidom).
    """
    markup = content.encode('utf8')
    # if for some reason, tidy has not guess the doctype, make xml.dom.minidom happy with HTML entities (&nbsp;)
    markup = re.sub(r"&nbsp;", '\xc2\xa0', markup)
    body = parseString(markup).getElementsByTagName("body")[0].toxml()
    # get body content: drop the <body ...> and </body> tags themselves
    fragment = body[body.find('>')+1:body.rfind('</')]
    # strip leading spaces
    fragment = re.sub(r"^\s+", '', fragment)
    # add new line before closing bracket
    fragment = re.sub(r"(\/?)>", r"\n\1>", fragment)
    # do not split closing tag with following opening tag
    fragment = re.sub(r">\n<", r"><", fragment)
    return fragment


@dj_memoize
def pandoc_pandoc(content, from_format, to_format, full=False, raw=False):
    """
    Convert content (should be unicode) from from_format to to_format
    (if full: includes header & co [html, latex])
    (if raw: PANDOC_OPTIONS_RAW is used instead of PANDOC_OPTIONS)

    html -> html conversion does not use pandoc at all: the content of
    the <body> element is extracted and returned.

    Returns out (unicode, or the raw output when it cannot be decoded,
    for binary output formats such as odt)
    Raises Exception for an unsupported format, for a non unicode content
    or, with what pandoc wrote on stderr, when pandoc fails.

    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', False)
    >>> res.replace("\\n","")
    u'<h1 id="sdsd">sdsd</h1>'
    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', True)
    """
    # verify formats
    if from_format not in INPUT_FORMATS:
        raise Exception("Input format [%s] is not a supported format [%s]" % (from_format, ' '.join(INPUT_FORMATS)))
    if to_format not in OUTPUT_FORMATS:
        raise Exception("Output format [%s] is not a supported format [%s]" % (to_format, ' '.join(OUTPUT_FORMATS)))
    if not isinstance(content, unicode):
        raise Exception('Content is not in unicode format!')

    # do not use pandoc to convert from html to html
    # (done before any temp file is created: nothing to clean up)
    if from_format == to_format == 'html':
        return _html_body_fragment(content)

    temp_names = []
    try:
        # temp files
        input_fp, input_temp_name = get_filetemp('w', 'input')
        temp_names.append(input_temp_name)
        try:
            input_fp.write(content.encode(_PANDOC_ENCODING))
        finally:
            input_fp.close()

        # For some reason when pandoc > 1.9 converts to PDF, '-t' shouldn't be used but output file name extension has to be '.pdf'
        output_suffix = 'output'
        if to_format == 'pdf':
            output_suffix = 'output.pdf'
        output_fp, output_temp_name = get_filetemp('r', output_suffix)
        temp_names.append(output_temp_name)
        output_fp.close()

        error_fp, error_temp_name = get_filetemp('w', 'err')
        temp_names.append(error_temp_name)

        # pandoc arguments and command line
        # (argument list, no shell: paths with spaces are handled)
        p_options = PANDOC_OPTIONS
        if raw:
            p_options = PANDOC_OPTIONS_RAW
        cmd = [PANDOC_BIN] + p_options.split() + ['-o', output_temp_name]
        if full:
            cmd.append('-s')
        cmd.extend(['-f', from_format])
        if to_format != 'pdf':
            cmd.extend(['-t', to_format])
        cmd.append(input_temp_name)

        try:
            retcode = call(cmd, stderr=error_fp)
        finally:
            error_fp.close()

        if retcode:
            with open(error_temp_name) as fp:
                error = fp.read()
            raise Exception(error)

        # output may be binary (odt, pdf...)
        with open(output_temp_name, 'rb') as fp:
            stdoutdata = fp.read()
    finally:
        # cleanup, whatever happened above
        for temp_name in temp_names:
            if os.path.exists(temp_name):
                os.remove(temp_name)

    # try converting to unicode
    try:
        stdoutdata = stdoutdata.decode(_PANDOC_ENCODING)
    except UnicodeDecodeError:
        # this will fail for binary output formats such as odt
        # return result without conversion then
        pass

    return stdoutdata
|
283 |
||
284 |
if __name__ == "__main__":
    # executed as a script: run the doctests found in this module's docstrings
    import doctest
    doctest.testmod()
|
287 |