1 from pandoc_converters import pandoc_convert |
1 from pandoc_converters import pandoc_convert |
2 import chardet |
2 import chardet |
3 from cm.utils.string_utils import to_unicode |
3 from cm.utils.string_utils import to_unicode |
4 import re |
4 import re |
|
5 import os |
|
6 from cm.converters.oo_converters import extract_css_body |
|
7 |
5 |
8 |
6 # TODO: move that in text_base: save images |
9 # TODO: move that in text_base: save images |
def convert_from_mimetype(file_name, mime_type, format):
    """Read ``file_name`` and convert its content to ``format``.

    The whole file is read at once and handed, together with ``mime_type``
    and ``format``, to ``_convert_from_mimetype``; that function's result
    is returned unchanged.
    """
    # The ``with`` block closes the handle even when ``read()`` raises; the
    # former ``open(...).read()`` one-liner left closing to the garbage
    # collector.
    with open(file_name, 'r') as source_file:
        content = source_file.read()
    return _convert_from_mimetype(content, mime_type, format)
17 ############################## |
20 ############################## |
18 if mime_type in ['application/vnd.oasis.opendocument.text', |
21 if mime_type in ['application/vnd.oasis.opendocument.text', |
19 'application/msword', |
22 'application/msword', |
20 ]: |
23 ]: |
21 |
24 |
22 xhtml_input, attachs = convert_oo_to_html(input) |
25 html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input) |
23 converted_input = pandoc_convert(xhtml_input, 'html', format) |
26 if format == 'html': |
|
27 _not_used_css, converted_input = extract_css_body(xhtml_input) |
|
28 #converted_input = xhtml_input |
|
29 |
|
30 converted_input = pandoc_convert(html_input, 'html', format) |
24 |
31 |
25 ############################## |
32 ############################## |
26 # latex |
33 # latex |
27 elif mime_type in ['application/x-latex','text/x-tex',]: |
34 elif mime_type in ['application/x-latex','text/x-tex',]: |
28 converted_input = pandoc_convert(to_unicode(input), 'latex', format) |
35 converted_input = pandoc_convert(to_unicode(input), 'latex', format) |
35 ############################## |
42 ############################## |
36 # html |
43 # html |
37 elif mime_type in ['text/html', 'application/xhtml+xml']: |
44 elif mime_type in ['text/html', 'application/xhtml+xml']: |
38 if format == 'html': |
45 if format == 'html': |
39 converted_input = input |
46 converted_input = input |
40 else: |
47 |
41 converted_input = pandoc_convert(input, 'html', format) |
48 converted_input = pandoc_convert(input, 'html', format) |
42 ############################## |
49 ############################## |
43 # anything looks like text -> markdown |
50 # anything looks like text -> markdown |
44 elif mime_type in ['text/plain', |
51 elif mime_type in ['text/plain', |
45 'text/english', |
52 'text/english', |
46 'text/enriched' |
53 'text/enriched' |
86 from oo_converters import convert |
93 from oo_converters import convert |
87 html_input, images = convert(input, 'html') |
94 html_input, images = convert(input, 'html') |
88 |
95 |
89 enc = chardet.detect(html_input)['encoding'] |
96 enc = chardet.detect(html_input)['encoding'] |
90 try_encodings = [enc, 'utf8', 'latin1'] |
97 try_encodings = [enc, 'utf8', 'latin1'] |
91 res_content = None |
|
92 for encoding in try_encodings: |
98 for encoding in try_encodings: |
93 try: |
99 try: |
94 res_content_html = unicode(html_input, encoding) |
100 res_content_html = unicode(html_input, encoding) |
95 break; |
101 break; |
96 except UnicodeDecodeError: |
102 except UnicodeDecodeError: |
97 pass |
103 pass |
98 if not res_content_html: |
104 if not res_content_html: |
99 raise Exception('UnicodeDecodeError: could not decode') |
105 raise Exception('UnicodeDecodeError: could not decode') |
100 return res_content_html, images |
106 return res_content_html, images |
101 |
107 |
102 def old_convert_oo_to_html(input): |
def fix_html_img_path(html):
    """Return ``html`` with the '../outdir/' prefix dropped from IMG SRC.

    Only the exact upper-case ``IMG SRC="../outdir/`` spelling is rewritten;
    any other text is returned untouched.
    """
    prefixed_src = 'IMG SRC="../outdir/'
    bare_src = 'IMG SRC="'
    return html.replace(prefixed_src, bare_src)
|
110 |
|
def convert_oo_to_html_and_xhtml(input):
    """Convert an office document to decoded HTML and XHTML.

    ``input`` is passed to ``oo_converters.convert`` twice, once for 'html'
    and once for 'xhtml'.  Both outputs are decoded with the same encoding,
    trying in order: the encoding chardet detects on the XHTML output,
    'utf8', then 'latin1'.  Image paths are then rewritten with
    ``fix_img_path`` (XHTML) and ``fix_html_img_path`` (HTML), and the XHTML
    goes through ``cleanup``.

    Returns a ``(html, xhtml, images)`` tuple, where ``images`` is the
    second value returned by the 'html' conversion.

    Raises ``Exception`` when no candidate encoding decodes both outputs,
    or when either resulting document is empty.
    """
    from oo_converters import convert
    html_input, images = convert(input, 'html')
    xhtml_input, _not_used_ = convert(input, 'xhtml')

    enc = chardet.detect(xhtml_input)['encoding']

    # Start from None so that a failed decode is reported by the explicit
    # check below rather than by an UnboundLocalError further down.
    res_content_html = None
    res_content_xhtml = None
    for encoding in (enc, 'utf8', 'latin1'):
        if not encoding:
            # chardet can report no encoding at all (None); handing that
            # to unicode() would raise an uncaught TypeError.
            continue
        try:
            decoded_html = unicode(html_input, encoding)
            decoded_xhtml = unicode(xhtml_input, encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detected name is not a codec known to
            # Python; fall through to the next candidate.
            continue
        res_content_html = decoded_html
        res_content_xhtml = decoded_xhtml
        break

    # Fail before post-processing: the path fixers need decoded text.
    if res_content_html is None or res_content_xhtml is None:
        raise Exception('UnicodeDecodeError: could not decode')

    res_content_xhtml = fix_img_path(res_content_html, res_content_xhtml, images)
    res_content_html = fix_html_img_path(res_content_html)

    # Empty documents are still rejected with the same error as before.
    if not res_content_html or not res_content_xhtml:
        raise Exception('UnicodeDecodeError: could not decode')
    return res_content_html, cleanup(res_content_xhtml), images
124 |
131 |
|
def cleanup(string):
    """Return ``string`` with every U+00C2 U+00A0 character pair removed.

    A lone U+00A0 (non-breaking space) is left in place; only the exact
    two-character sequence is stripped.
    """
    # NOTE(review): this pair looks like what a UTF-8 non-breaking space
    # becomes when decoded as latin1 -- confirm that is the intent.
    unwanted_pair = u'\xc2\xa0'
    return string.replace(unwanted_pair, u'')
|
134 |
def markdown_from_code(code):
    """Return ``code`` formatted as a Markdown indented code block.

    ``code`` is split on newline characters and every resulting line,
    including empty ones, is prefixed with four spaces, which is the
    indentation Markdown requires for a code block.
    """
    # Must be exactly four spaces: a shorter indent is rendered by Markdown
    # as an ordinary paragraph instead of code.
    CODE_INDICATOR = "    "  # 4 spaces
    return '\n'.join(CODE_INDICATOR + line for line in code.split('\n'))
128 |
138 |
129 |
139 |