1 from pandoc_converters import pandoc_convert |
1 from pandoc_converters import pandoc_convert |
2 import chardet |
2 import chardet |
3 from cm.utils.string_utils import to_unicode |
3 from cm.utils.string_utils import to_unicode |
4 import re |
4 import re |
|
5 from cm.converters.oo_converters import extract_css_body |
|
6 |
5 |
7 |
6 # TODO: move that in text_base: save images |
8 # TODO: move that in text_base: save images |
def convert_from_mimetype(file_name, mime_type, format):
    """Read the file at ``file_name`` and convert its contents.

    The raw contents are handed, together with ``mime_type`` and
    ``format``, to ``_convert_from_mimetype``; whatever that function
    returns is returned unchanged.

    file_name -- path of the file to read
    mime_type -- mime type describing the file contents
    format    -- target format, passed through as-is
    """
    # Use a context manager so the file handle is closed even when the
    # read fails (the handle used to be left for the garbage collector).
    # NOTE(review): the file is still opened in text mode ('r') as before;
    # some inputs look like binary office documents, so 'rb' may be more
    # appropriate on platforms that translate newlines -- confirm.
    with open(file_name, 'r') as source_file:
        raw_content = source_file.read()
    return _convert_from_mimetype(raw_content, mime_type, format)
17 ############################## |
19 ############################## |
18 if mime_type in ['application/vnd.oasis.opendocument.text', |
20 if mime_type in ['application/vnd.oasis.opendocument.text', |
19 'application/msword', |
21 'application/msword', |
20 ]: |
22 ]: |
21 |
23 |
22 xhtml_input, attachs = convert_oo_to_html(input) |
24 html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input) |
23 converted_input = pandoc_convert(xhtml_input, 'html', format) |
25 if format == 'html': |
|
26 _not_used_css, converted_input = extract_css_body(xhtml_input) |
|
27 #converted_input = xhtml_input |
|
28 else: |
|
29 converted_input = pandoc_convert(html_input, 'html', format) |
24 |
30 |
25 ############################## |
31 ############################## |
26 # latex |
32 # latex |
27 elif mime_type in ['application/x-latex','text/x-tex',]: |
33 elif mime_type in ['application/x-latex','text/x-tex',]: |
28 converted_input = pandoc_convert(to_unicode(input), 'latex', format) |
34 converted_input = pandoc_convert(to_unicode(input), 'latex', format) |
86 from oo_converters import convert |
92 from oo_converters import convert |
87 html_input, images = convert(input, 'html') |
93 html_input, images = convert(input, 'html') |
88 |
94 |
89 enc = chardet.detect(html_input)['encoding'] |
95 enc = chardet.detect(html_input)['encoding'] |
90 try_encodings = [enc, 'utf8', 'latin1'] |
96 try_encodings = [enc, 'utf8', 'latin1'] |
91 res_content = None |
|
92 for encoding in try_encodings: |
97 for encoding in try_encodings: |
93 try: |
98 try: |
94 res_content_html = unicode(html_input, encoding) |
99 res_content_html = unicode(html_input, encoding) |
95 break; |
100 break; |
96 except UnicodeDecodeError: |
101 except UnicodeDecodeError: |
97 pass |
102 pass |
98 if not res_content_html: |
103 if not res_content_html: |
99 raise Exception('UnicodeDecodeError: could not decode') |
104 raise Exception('UnicodeDecodeError: could not decode') |
100 return res_content_html, images |
105 return res_content_html, images |
101 |
106 |
102 def old_convert_oo_to_html(input): |
107 def convert_oo_to_html_and_xhtml(input): |
103 from oo_converters import convert |
108 from oo_converters import convert |
104 html_input, images = convert(input, 'html') |
109 html_input, images = convert(input, 'html') |
105 xhtml_input, _not_used_ = convert(input, 'xhtml') |
110 xhtml_input, _not_used_ = convert(input, 'xhtml') |
106 |
111 |
107 enc = chardet.detect(xhtml_input)['encoding'] |
112 enc = chardet.detect(xhtml_input)['encoding'] |
108 try_encodings = [enc, 'utf8', 'latin1'] |
113 try_encodings = [enc, 'utf8', 'latin1'] |
109 res_content = None |
|
110 for encoding in try_encodings: |
114 for encoding in try_encodings: |
111 try: |
115 try: |
112 # TODO: fix path and manage images |
116 # TODO: fix path and manage images |
113 #res_content = fix_img_path(unicode(html_res_content,encoding), |
117 #res_content = fix_img_path(unicode(html_res_content,encoding), |
114 # unicode(xhtml_res_content,encoding), |
118 # unicode(xhtml_res_content,encoding), |
118 break; |
122 break; |
119 except UnicodeDecodeError: |
123 except UnicodeDecodeError: |
120 pass |
124 pass |
121 if not res_content_html or not res_content_xhtml: |
125 if not res_content_html or not res_content_xhtml: |
122 raise Exception('UnicodeDecodeError: could not decode') |
126 raise Exception('UnicodeDecodeError: could not decode') |
123 return res_content_html, res_content_xhtml, images |
127 return res_content_html, cleanup(res_content_xhtml), images |
124 |
128 |
|
def cleanup(string):
    """Return ``string`` with every u'\\xc2\\xa0' pair removed.

    The pattern is the two-character sequence U+00C2 followed by U+00A0;
    a lone U+00A0 is left untouched.
    """
    unwanted = u'\xc2\xa0'
    cleaned = string.replace(unwanted, u'')
    return cleaned
|
131 |
def markdown_from_code(code):
    """Return ``code`` formatted as a Markdown indented code block.

    Every line of ``code`` (split on '\\n') is prefixed with four spaces,
    which is what Markdown requires for an indented code block. Empty
    lines are prefixed as well, so an empty input yields four spaces.
    """
    # Built with a repeat count rather than a literal run of spaces so the
    # width stays explicit and cannot be silently collapsed by tooling.
    CODE_INDICATOR = ' ' * 4
    return '\n'.join(CODE_INDICATOR + line for line in code.split('\n'))
128 |
135 |
129 |
136 |