|
1 # python 2.5 compat |
|
2 from __future__ import with_statement |
|
3 from cm.utils.cache import memoize |
|
4 ###### |
|
## This module requires pandoc v > 1.0 (pandoc & markdown2pdf executables)
|
6 ###### |
|
7 |
|
8 from subprocess import Popen, PIPE, call |
|
9 import os |
|
10 from tempfile import mkstemp |
|
11 import StringIO |
|
12 import tidy |
|
13 |
|
14 |
|
# Names of the external executables this module runs through subprocess.
PANDOC_BIN = "pandoc"
# Extra command-line options added to every pandoc invocation.
PANDOC_OPTIONS = "--sanitize-html "

# Executable used for pdf output (pandoc itself is not asked to produce pdf).
MARKDOWN2PDF_BIN = "markdown2pdf"

# make sure binaries are available
# NOTE(review): bin_search is defined in cm.utils.system; presumably it fails
# loudly at import time when a binary is missing -- confirm there.
from cm.utils.system import bin_search
bin_search(PANDOC_BIN)
bin_search(MARKDOWN2PDF_BIN)

# pandoc capabilities
# Formats accepted by pandoc_pandoc as from_format / to_format; anything else
# is rejected with an Exception before pandoc is run.
INPUT_FORMATS = ['native', 'markdown', 'rst', 'html', 'latex']
OUTPUT_FORMATS = ['native', 'html', 's5', 'docbook', 'opendocument', 'odt', 'latex', 'context', 'texinfo', 'man', 'markdown', 'rst', 'mediawiki', 'rtf']

# add pdf output using markdown2pdf
OUTPUT_FORMATS.append('pdf')

# input formats
# (value, label) pairs with identical value and label for a subset of the
# input formats -- presumably consumed by a form/choice field elsewhere.
CHOICES_INPUT_FORMATS = [(f, f) for f in ['markdown', 'rst', 'html']]

DEFAULT_INPUT_FORMAT = 'markdown'

# Encoding used when writing content to the temp input files and when
# decoding pandoc's output back to unicode.
_PANDOC_ENCODING = 'utf8'
|
38 |
|
@memoize
def pandoc_convert(content, from_format, to_format, full=False):
    """
    Convert content from from_format to to_format

    html input is first cleaned with do_tidy (pandoc does not react well
    to invalid html). pdf output is produced by markdown2pdf: content that
    is not already markdown is converted to standalone markdown first.
    Every other target format is handled by pandoc_pandoc, which receives
    the `full` flag unchanged.

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    """
    # clean html up front, leave every other input format untouched
    source = do_tidy(content) if from_format == 'html' else content

    # anything but pdf goes straight to pandoc
    if to_format != 'pdf':
        return pandoc_pandoc(source, from_format, to_format, full)

    # pdf: markdown2pdf only understands markdown input
    markdown = source
    if from_format != 'markdown':
        markdown = pandoc_convert(source, from_format, 'markdown', True)
    return pandoc_markdown2pdf(markdown)
|
56 |
|
def content_or_file_name(content, file_name):
    """
    Return the content to work on, given either a content or a file_name

    Exactly one of the two arguments must be truthy:
    - content given: it is returned unchanged
    - file_name given: the file is read and its whole content returned

    Raises Exception when both or neither are provided. Note that an
    empty content ('' or None) counts as "not provided".
    """
    if not content and not file_name:
        raise Exception('You should provide either a content or a file_name')
    if content and file_name:
        raise Exception('You should not provide a content AND a file_name')

    if file_name:
        # `with` guarantees the handle is closed even if read() raises
        with open(file_name) as fp:
            content = fp.read()

    return content
|
69 |
|
@memoize
def do_tidy(content=None, file_name=None):
    """
    Tidy (html) content

    Exactly one of content / file_name must be given (checked by
    content_or_file_name). The markup is run through tidy with xhtml
    output and utf8 input/output encodings.

    Returns the tidied markup as unicode.
    Raises Exception if non-empty content is tidied to nothing.

    >>> res = do_tidy('<span>sdd')
    """
    content = content_or_file_name(content, file_name)

    # tidy is fed utf8 bytes. Only unicode needs encoding: calling .encode()
    # on a byte string makes Python 2 decode it as ascii first, which raises
    # on any non-ascii byte (e.g. content read from file_name).
    # Byte strings are assumed to be utf8 already.
    if isinstance(content, unicode):
        content = content.encode('utf8')

    tidy_options = dict(output_xhtml=1,
                        add_xml_decl=0,
                        indent=0,
                        tidy_mark=0,
                        input_encoding='utf8',
                        output_encoding='utf8',
                        )
    tidyied_content = str(tidy.parseString(content, **tidy_options))
    if content and not tidyied_content.strip():
        raise Exception('Content could not be tidyfied')
    return tidyied_content.decode('utf8')
|
91 |
|
92 |
|
def get_filetemp(mode="r"):
    """
    Create a new temporary file with mkstemp

    Returns a (file object, file name) tuple; the file object wraps the
    descriptor returned by mkstemp and is opened with `mode`.
    The caller has to close the file object and remove the file.
    """
    descriptor, path = mkstemp()
    handle = os.fdopen(descriptor, mode)
    return (handle, path)
|
96 |
|
# build absolute address for latex header file
# The header lives in the same directory as this module. abspath() is needed
# because __file__ may be relative (e.g. when the module is run as a script);
# splitting it by hand and prepending the root separator would then point at
# a wrong location under the filesystem root.
LATEX_HEADER_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'latex_header.txt')

# fail at import time rather than on the first pdf conversion
if not os.path.isfile(LATEX_HEADER_PATH):
    raise Exception('LATEX_HEADER_PATH is not a file!')
|
106 |
|
@memoize
def pandoc_markdown2pdf(content=None, file_name=None):
    """
    Convert markdown content to pdf

    Exactly one of content / file_name must be given (checked by
    content_or_file_name). The markdown is written to a temp file and
    converted by the markdown2pdf executable, using the custom latex
    header at LATEX_HEADER_PATH.

    Returns the raw bytes of the generated pdf.
    Raises Exception (carrying markdown2pdf's stderr output) when the
    command exits with a non-zero status.

    >>> pdf_content = pandoc_markdown2pdf('# dssd')
    """
    content = content_or_file_name(content, file_name)

    # Only unicode needs encoding: .encode() on a byte string makes Python 2
    # decode it as ascii first and fail on non-ascii bytes (e.g. content read
    # from file_name). Byte strings are assumed to be encoded already.
    if isinstance(content, unicode):
        content = content.encode(_PANDOC_ENCODING)

    # write file to disk
    temp_file, input_temp_name = get_filetemp('wb')
    # markdown2pdf writes its result next to the input file
    output_temp_name = input_temp_name + '.pdf'
    try:
        try:
            temp_file.write(content)
        finally:
            temp_file.close()

        # use markdown2pdf with the custom latex header; an argument list
        # (no shell) keeps paths containing spaces or shell metacharacters
        # intact
        cmd = [MARKDOWN2PDF_BIN,
               '--custom-header=%s' % LATEX_HEADER_PATH,
               input_temp_name]
        process = Popen(cmd, stderr=PIPE)
        error = process.communicate()[1]
        if process.returncode:
            raise Exception(error)

        # pdf is binary data: read it untranslated
        fp_output = open(output_temp_name, 'rb')
        try:
            pdf_content = fp_output.read()
        finally:
            fp_output.close()
    finally:
        # cleanup on success and on failure (the pdf may not exist then)
        for temp_name in (input_temp_name, output_temp_name):
            if os.path.exists(temp_name):
                os.remove(temp_name)

    return pdf_content
|
148 |
|
149 # TODO: manage images in pandoc (?) |
|
150 # TODO: use tidy to cleanup html |
|
151 |
|
@memoize
def pandoc_pandoc(content, from_format, to_format, full=False):
    """
    Convert content (should be unicode) from from_format to to_format
    (if full: includes header & co [html, latex])

    Returns pandoc's output as unicode; when the output cannot be decoded
    (binary formats such as odt) the raw byte string is returned instead.

    Raises Exception when a format is not supported, when content is not
    unicode, or (carrying pandoc's stderr output) when pandoc exits with
    a non-zero status.

    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', False)
    >>> res.replace("\\n","")
    u'<h1 id="sdsd">sdsd</h1>'
    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', True)
    """
    # verify formats
    if from_format not in INPUT_FORMATS:
        raise Exception("Input format [%s] is not a supported format [%s]" % (from_format, ' '.join(INPUT_FORMATS)))
    if to_format not in OUTPUT_FORMATS:
        raise Exception("Output format [%s] is not a supported format [%s]" % (to_format, ' '.join(OUTPUT_FORMATS)))
    # isinstance (not an exact type test) so unicode subclasses are accepted
    if not isinstance(content, unicode):
        raise Exception('Content is not in unicode format!')

    temp_names = []
    try:
        # temp file holding the encoded input
        input_file, input_temp_name = get_filetemp('wb')
        temp_names.append(input_temp_name)
        try:
            input_file.write(content.encode(_PANDOC_ENCODING))
        finally:
            input_file.close()

        # reserve a temp file name for pandoc's -o option
        output_temp_fp, output_temp_name = get_filetemp()
        temp_names.append(output_temp_name)
        output_temp_fp.close()

        # pandoc arguments, passed as a list so that no shell is involved
        cmd = [PANDOC_BIN] + PANDOC_OPTIONS.split() + ['-o', output_temp_name]
        if full:
            cmd.append('-s')
        cmd += ['-f', from_format, '-t', to_format, input_temp_name]

        process = Popen(cmd, stderr=PIPE)
        error = process.communicate()[1]
        if process.returncode:
            raise Exception(error)

        # output may be binary (odt...): read it untranslated
        fp_output = open(output_temp_name, 'rb')
        try:
            stdoutdata = fp_output.read()
        finally:
            fp_output.close()
    finally:
        # cleanup, on success and on failure
        for temp_name in temp_names:
            if os.path.exists(temp_name):
                os.remove(temp_name)

    # try converting to unicode
    try:
        stdoutdata = stdoutdata.decode(_PANDOC_ENCODING)
    except UnicodeDecodeError:
        # this will fail for binary output formats such as odt
        # return result without conversion then
        pass

    return stdoutdata
|
228 |
|
# When this module is executed as a script, run the doctests embedded in its
# docstrings.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
|
232 |