|
0
|
1 |
# python 2.5 compat |
|
|
2 |
from __future__ import with_statement |
|
|
3 |
from cm.utils.cache import memoize |
|
|
4 |
###### |
|
|
5 |
## This module requires pandoc v > 1.0 (pandoc & markdown executables) |
|
|
6 |
###### |
|
|
7 |
|
|
|
8 |
from subprocess import Popen, PIPE, call |
|
|
9 |
import os |
|
|
10 |
from tempfile import mkstemp |
|
|
11 |
import StringIO |
|
|
12 |
import tidy |
|
149
|
13 |
from cm.utils.string_utils import to_unicode |
|
0
|
14 |
|
|
|
15 |
# name of the pandoc executable (run by pandoc_pandoc)
PANDOC_BIN = 'pandoc'
# options put on every pandoc command line (trailing space is significant
# for code concatenating this value into a command string)
PANDOC_OPTIONS = '--sanitize-html '

# name of the markdown2pdf executable (run by pandoc_markdown2pdf)
MARKDOWN2PDF_BIN = 'markdown2pdf'
|
|
19 |
|
|
|
20 |
# make sure binaries are available
# NOTE(review): this runs at import time; presumably bin_search raises when
# the executable cannot be found - confirm against cm.utils.system
from cm.utils.system import bin_search
bin_search(PANDOC_BIN)
bin_search(MARKDOWN2PDF_BIN)
|
|
24 |
|
|
|
25 |
# formats pandoc_pandoc accepts as input
INPUT_FORMATS = [
    'native',
    'markdown',
    'rst',
    'html',
    'latex',
]
# formats pandoc_pandoc accepts as output
OUTPUT_FORMATS = [
    'native',
    'html',
    's5',
    'docbook',
    'opendocument',
    'odt',
    'latex',
    'context',
    'texinfo',
    'man',
    'markdown',
    'rst',
    'mediawiki',
    'rtf',
]

# pdf is not produced by pandoc itself: it is added here because
# pandoc_convert renders it with markdown2pdf
OUTPUT_FORMATS.append('pdf')

# (value, label) pairs for the input formats offered as choices
CHOICES_INPUT_FORMATS = [(f, f) for f in ('markdown', 'rst', 'html')]

DEFAULT_INPUT_FORMAT = 'markdown'

# encoding of the temporary files exchanged with pandoc / markdown2pdf
_PANDOC_ENCODING = 'utf8'
|
|
38 |
|
|
|
39 |
@memoize
def pandoc_convert(content, from_format, to_format, full=False):
    """
    Convert content from from_format to to_format

    html input is first cleaned with tidy (pandoc does not react well when
    html is not valid); when tidy fails, the content is converted to unicode
    and given to pandoc anyway.

    pdf output is rendered by markdown2pdf: input in any other format than
    markdown is first converted to a full markdown document.
    Every other conversion is delegated to pandoc_pandoc, which receives
    the `full` flag unchanged.

    Returns whatever pandoc_markdown2pdf (pdf) or pandoc_pandoc (other
    formats) returns.

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    """
    # pandoc does not react well when html is not valid
    # use tidy to clean html
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # (Exception, not a bare except: KeyboardInterrupt and
            # SystemExit must not be swallowed by this best-effort fallback)
            content = to_unicode(content)

    # if to_format is pdf: use markdown2pdf
    if to_format == 'pdf':
        if from_format != 'markdown':
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)

    return pandoc_pandoc(content, from_format, to_format, full)
|
|
60 |
|
|
|
61 |
def content_or_file_name(content, file_name):
    """
    Return the text to work on, given either a content or a file_name

    content   -- the text itself, returned unchanged
    file_name -- path of a file; its whole content is read and returned

    Exactly one of the two must be provided (empty values count as not
    provided). Raises Exception when none or both are given.
    """
    if not content and not file_name:
        raise Exception('You should provide either a content or a file_name')
    if content and file_name:
        raise Exception('You should not provide a content AND a file_name')

    if not file_name:
        return content

    # 'with' guarantees the file is closed even when read() fails
    with open(file_name) as fp:
        return fp.read()
|
|
73 |
|
|
|
74 |
@memoize
def do_tidy(content=None, file_name=None):
    """
    Clean up html with tidy and return the result as unicode

    The html is taken either from content or from the file file_name
    (exactly one of them, see content_or_file_name). tidy is asked for
    xhtml output, without xml declaration, indentation or tidy mark.

    Raises Exception when tidy produces nothing for a non-empty input.

    >>> res = do_tidy('<span>sdd')
    """
    html = content_or_file_name(content, file_name)

    options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
        'input_encoding': 'utf8',
        'output_encoding': 'utf8',
    }

    # tidy is fed utf8 bytes and hands utf8 bytes back
    utf8_html = to_unicode(html).encode('utf8')
    cleaned = str(tidy.parseString(utf8_html, **options))

    if html and not cleaned.strip():
        raise Exception('Content could not be tidyfied')
    return cleaned.decode('utf8')
|
|
95 |
|
|
|
96 |
|
|
|
97 |
def get_filetemp(mode="r"):
    """
    Create a temporary file with mkstemp

    Returns a (file object, file name) tuple, the file object being opened
    with the given mode. Closing and deleting the file is up to the caller.
    """
    descriptor, path = mkstemp()
    handle = os.fdopen(descriptor, mode)
    return (handle, path)
|
|
100 |
|
|
|
101 |
# build absolute address for latex header file: latex_header.txt, stored in
# the same directory as this module
# (abspath + dirname instead of splitting __file__ by hand: __file__ may be a
# relative path, which a manual split/join anchored at the filesystem root
# turns into a wrong location)
LATEX_HEADER_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'latex_header.txt')

# fail at import time rather than at the first pdf conversion
if not os.path.isfile(LATEX_HEADER_PATH):
    raise Exception('LATEX_HEADER_PATH is not a file!')
|
|
110 |
|
|
|
111 |
@memoize
def pandoc_markdown2pdf(content=None, file_name=None):
    """
    Convert markdown content to pdf

    The markdown is taken either from content or from the file file_name
    (exactly one of them, see content_or_file_name). It is written to a
    temporary file and rendered by the markdown2pdf executable, with
    LATEX_HEADER_PATH given as custom latex header.

    Returns the content of the generated pdf file.
    Raises Exception (its message is what markdown2pdf wrote on stderr)
    when markdown2pdf exits with a non-zero status.
    Temporary files are removed whatever the outcome.

    >>> pdf_content = pandoc_markdown2pdf('# dssd')
    """
    content = content_or_file_name(content, file_name)

    temp_file, input_temp_name = get_filetemp('w')
    fp_error, error_temp_name = get_filetemp('w')
    # the generated pdf is read from the input file name + '.pdf'
    output_temp_name = input_temp_name + '.pdf'

    try:
        # write file to disk
        temp_file.write(content.encode(_PANDOC_ENCODING))
        temp_file.close()

        # use markdown2pdf with the custom latex header
        # (argument list, no shell: paths containing spaces or shell
        # metacharacters are passed through untouched)
        cmd = [MARKDOWN2PDF_BIN,
               '--custom-header=%s' % LATEX_HEADER_PATH,
               input_temp_name]
        retcode = call(cmd, stderr=fp_error)
        fp_error.close()

        if retcode:
            fp_error = open(error_temp_name)
            try:
                error = fp_error.read()
            finally:
                fp_error.close()
            raise Exception(error)

        # pdf is binary data: read it in binary mode
        fp_output = open(output_temp_name, 'rb')
        try:
            pdf_content = fp_output.read()
        finally:
            fp_output.close()

        return pdf_content
    finally:
        # close() does nothing on an already closed file
        temp_file.close()
        fp_error.close()
        # the pdf may or may not exist, depending on where things stopped
        for temp_name in (input_temp_name, error_temp_name, output_temp_name):
            if os.path.exists(temp_name):
                os.remove(temp_name)
|
|
152 |
|
|
|
153 |
# TODO: manage images in pandoc (?) |
|
|
154 |
# TODO: use tidy to cleanup html |
|
|
155 |
|
|
|
156 |
@memoize
def pandoc_pandoc(content, from_format, to_format, full=False):
    """
    Convert content (should be unicode) from from_format to to_format
    (if full: includes header & co [html, latex])

    Returns the converted document: unicode when pandoc's output can be
    decoded with _PANDOC_ENCODING, the raw byte string otherwise (binary
    output formats such as odt).

    Raises Exception when a format is not supported, when content is not
    unicode, or when pandoc exits with a non-zero status (the message is
    then what pandoc wrote on stderr).
    Temporary files are removed whatever the outcome.

    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', False)
    >>> res.replace("\\n","")
    u'<h1 id="sdsd">sdsd</h1>'
    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', True)
    """
    # verify formats
    if from_format not in INPUT_FORMATS:
        raise Exception("Input format [%s] is not a supported format [%s]" % (from_format, ' '.join(INPUT_FORMATS)))
    if to_format not in OUTPUT_FORMATS:
        raise Exception("Output format [%s] is not a supported format [%s]" % (to_format, ' '.join(OUTPUT_FORMATS)))
    # isinstance rather than an exact type comparison: subclasses of unicode
    # are valid content too
    if not isinstance(content, unicode):
        raise Exception('Content is not in unicode format!')

    # temp files (the output one is only created to reserve a name for -o)
    input_file, input_temp_name = get_filetemp('w')
    output_temp_fp, output_temp_name = get_filetemp()
    output_temp_fp.close()
    fp_error, error_temp_name = get_filetemp('w')

    try:
        input_file.write(content.encode(_PANDOC_ENCODING))
        input_file.close()

        # pandoc arguments and command line
        # (argument list, no shell: nothing in it gets interpreted by a shell)
        cmd = [PANDOC_BIN] + PANDOC_OPTIONS.split() + ['-o', output_temp_name]
        if full:
            cmd.append('-s')
        cmd += ['-f', from_format, '-t', to_format, input_temp_name]

        retcode = call(cmd, stderr=fp_error)
        fp_error.close()

        if retcode:
            fp_error = open(error_temp_name)
            try:
                error = fp_error.read()
            finally:
                fp_error.close()
            raise Exception(error)

        # binary mode: some output formats (odt...) are not text
        fp_output = open(output_temp_name, 'rb')
        try:
            stdoutdata = fp_output.read()
        finally:
            fp_output.close()
    finally:
        # cleanup (close() does nothing on an already closed file)
        input_file.close()
        fp_error.close()
        for temp_name in (output_temp_name, input_temp_name, error_temp_name):
            if os.path.exists(temp_name):
                os.remove(temp_name)

    # try converting to unicode
    try:
        stdoutdata = stdoutdata.decode(_PANDOC_ENCODING)
    except UnicodeDecodeError:
        # this will fail for binary output formats such as odt
        # return result without conversion then
        pass

    return stdoutdata
|
|
232 |
|
|
|
233 |
if __name__ == "__main__":
    # executed as a script: run the doctests found in this module
    from doctest import testmod
    testmod()
|
|
236 |
|