|
1 from oo_converters import convert |
|
2 from pandoc_converters import pandoc_convert |
|
3 import chardet |
|
4 |
|
# TODO: move this into text_base: save images
|
def convert_from_mimetype(file_name, mime_type, format):
    """Read ``file_name`` and convert its content to ``format``.

    The whole file is read into memory and handed, together with
    ``mime_type`` and ``format``, to ``_convert_from_mimetype``; that
    function's return value is returned unchanged.
    """
    # The context manager closes the handle even when read() raises; a bare
    # open(...).read() leaves closing to the garbage collector.
    # NOTE(review): mode 'r' is kept as-is. Office documents are binary, so
    # 'rb' may be the right mode on platforms that distinguish text from
    # binary files -- confirm before changing.
    with open(file_name, 'r') as source:
        content = source.read()
    return _convert_from_mimetype(content, mime_type, format)
|
9 |
|
def to_unicode(input):
    """Return ``input`` decoded to a unicode string.

    Byte strings are decoded with the first candidate encoding that works:
    the one guessed by ``chardet``, then ``utf8``, then ``latin1``. Any
    value that is not a byte string is returned unchanged.

    Raises:
        Exception: if none of the candidate encodings can decode ``input``.
    """
    # On Python 2 ``bytes`` is an alias of ``str``, so this selects the same
    # values a ``str`` type check does there.
    if not isinstance(input, bytes):
        return input
    if not input:
        # Nothing to detect or decode in an empty buffer.
        return u''

    guessed = chardet.detect(input)['encoding']
    for encoding in (guessed, 'utf8', 'latin1'):
        if not encoding:
            # No usable guess from chardet: move on to the fixed fallbacks.
            continue
        try:
            return input.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess, or a codec name Python does not know: try the
            # next candidate.
            continue
    raise Exception('UnicodeDecodeError: could not decode')
|
23 |
|
24 def _convert_from_mimetype(input, mime_type, format): |
|
25 #input = to_unicode(input) |
|
26 |
|
27 attachs = [] |
|
28 attachs_dir = None |
|
29 ############################## |
|
30 if mime_type in ['application/vnd.oasis.opendocument.text', |
|
31 'application/msword', |
|
32 ]: |
|
33 |
|
34 xhtml_input, attachs = convert_oo_to_html(input) |
|
35 converted_input = pandoc_convert(xhtml_input, 'html', format) |
|
36 |
|
37 ############################## |
|
38 # anything looks like text -> markdown |
|
39 elif mime_type in ['text/plain', |
|
40 'text/english', |
|
41 'text/enriched' |
|
42 ]: |
|
43 converted_input = input |
|
44 |
|
45 ############################## |
|
46 # anything looks like code: put them into markdown citation |
|
47 elif mime_type.startswith('text/x-') or mime_type in ['application/x-latex', |
|
48 'application/x-ruby', |
|
49 ]: |
|
50 converted_input = markdown_from_code(input) |
|
51 |
|
52 ############################## |
|
53 # html |
|
54 elif mime_type in ['text/html', 'application/xhtml+xml']: |
|
55 if format == 'html': |
|
56 converted_input = input |
|
57 else: |
|
58 converted_input = pandoc_convert(input, 'html', format) |
|
59 |
|
60 return converted_input, attachs |
|
61 |
|
def fix_img_path(html, xhtml, imgs):
    """Rewrite the ``src`` attribute values of ``xhtml``.

    The n-th ``src`` attribute matched in ``xhtml`` has its value replaced
    by ``imgs[name]``, where ``name`` is the value of the n-th ``src``
    attribute matched in ``html``. Attribute values starting with ``http``
    are not matched in either document and are left untouched. When
    ``html`` has fewer matches than ``xhtml``, the remaining ``src`` values
    of ``xhtml`` are replaced by an empty string.

    Args:
        html: markup whose ``src`` values are looked up in ``imgs``.
        xhtml: markup to rewrite.
        imgs: mapping name --> path.

    Returns:
        The rewritten ``xhtml`` as a unicode string.

    Raises:
        KeyError: if a ``src`` value matched in ``html`` is not in ``imgs``.
    """
    # Local import: ``re`` is not among the file-level imports.
    import re

    src_re = re.compile(r'src[\s]*=[\s]*"((?:(?!https?))[^"]*)"',
                        re.IGNORECASE)
    html_matches = src_re.finditer(html)

    pieces = []
    last_index = 0
    for match_xhtml in src_re.finditer(xhtml):
        match_html = next(html_matches, None)
        if match_html is None:
            # TODO: report the problem; html ran out of src attributes.
            img_path = ''
        else:
            img_path = imgs[match_html.group(1)]
        # Keep everything up to and including the opening quote, then
        # substitute the new path for the quoted value.
        pieces.append(xhtml[last_index:match_xhtml.start(1)])
        pieces.append(img_path)
        # Resume at the closing quote so it is copied with the next slice.
        last_index = match_xhtml.end(1)
    pieces.append(xhtml[last_index:])
    return u''.join(pieces)
|
89 |
|
def convert_oo_to_html(input):
    """Convert an office document to HTML and decode the result.

    ``convert(input, 'html')`` produces the HTML and the images. The HTML
    is then decoded with the first candidate encoding that works: the one
    guessed by ``chardet``, then ``utf8``, then ``latin1``.

    Returns:
        A ``(html, images)`` tuple: the decoded HTML and the images exactly
        as returned by ``convert``.

    Raises:
        Exception: if no candidate encoding can decode the HTML.
    """
    html_input, images = convert(input, 'html')

    guessed = chardet.detect(html_input)['encoding']
    res_content_html = None
    for encoding in (guessed, 'utf8', 'latin1'):
        if not encoding:
            # No usable guess from chardet: move on to the fixed fallbacks.
            continue
        try:
            res_content_html = unicode(html_input, encoding)
            break
        except (UnicodeDecodeError, LookupError):
            # Wrong guess, or a codec name Python does not know.
            pass
    # Compare against None: an empty document decodes to an empty (falsy)
    # string, which is a valid result and not a decoding failure.
    if res_content_html is None:
        raise Exception('UnicodeDecodeError: could not decode')
    return res_content_html, images
|
105 |
|
def old_convert_oo_to_html(input):
    """Convert an office document to both HTML and XHTML and decode them.

    ``convert`` is called once for ``'html'`` and once for ``'xhtml'``;
    the images of the XHTML conversion are discarded. Both outputs are
    decoded with the same encoding, the first candidate that works for
    both: the one ``chardet`` guesses for the XHTML output, then ``utf8``,
    then ``latin1``.

    Returns:
        A ``(html, xhtml, images)`` tuple: the two decoded documents and the
        images returned by the HTML conversion.

    Raises:
        Exception: if no candidate encoding can decode both documents.
    """
    html_input, images = convert(input, 'html')
    xhtml_input, _not_used_ = convert(input, 'xhtml')

    guessed = chardet.detect(xhtml_input)['encoding']
    res_content_html = None
    res_content_xhtml = None
    for encoding in (guessed, 'utf8', 'latin1'):
        if not encoding:
            # No usable guess from chardet: move on to the fixed fallbacks.
            continue
        try:
            # TODO: fix path and manage images (see fix_img_path)
            decoded_html = unicode(html_input, encoding)
            decoded_xhtml = unicode(xhtml_input, encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess, or a codec name Python does not know.
            continue
        # Commit only when both documents decoded with this encoding, so the
        # returned pair never mixes two different encodings.
        res_content_html = decoded_html
        res_content_xhtml = decoded_xhtml
        break
    # Compare against None: an empty document decodes to an empty (falsy)
    # string, which is a valid result and not a decoding failure.
    if res_content_html is None or res_content_xhtml is None:
        raise Exception('UnicodeDecodeError: could not decode')
    return res_content_html, res_content_xhtml, images
|
127 |
|
def markdown_from_code(code):
    """Return ``code`` with every line prefixed by four spaces.

    Markdown treats lines indented by four spaces as a code block. The
    input is split on ``'\\n'`` only, so an empty string yields a single
    indented empty line.
    """
    CODE_INDICATOR = "    "  # 4 spaces
    return '\n'.join(CODE_INDICATOR + line for line in code.split('\n'))
|
131 |
|
132 |