| author | Yves-Marie Haussonne <ymh.work+github@gmail.com> |
| Fri, 09 May 2014 18:35:26 +0200 | |
| changeset 656 | a84519031134 |
| parent 555 | 5d79dc4e50a3 |
| permissions | -rw-r--r-- |
| 0 | 1 |
from pandoc_converters import pandoc_convert |
2 |
import chardet |
|
| 149 | 3 |
from cm.utils.string_utils import to_unicode |
|
77
fe91eb717a96
import oo_converters locally (not at module level) to avoid weird uno imports
raph
parents:
50
diff
changeset
|
4 |
import re |
| 253 | 5 |
import os |
|
360
bfaab8740995
Add abiword as an alternative to open office for conversions
gibus
parents:
259
diff
changeset
|
6 |
from oo_converters import extract_css_body |
| 252 | 7 |
|
| 0 | 8 |
|
9 |
# TODO: move that in text_base: save images |
|
10 |
def convert_from_mimetype(file_name, mime_type, format):
    """
    Read the file ``file_name`` and convert its content.

    file_name : path of the file to read
    mime_type : mime type of the content, selects the conversion
    format    : target format, passed through to _convert_from_mimetype

    Returns the value returned by _convert_from_mimetype for the content.
    """
    # Use a context manager so the file handle is always closed, even
    # when read() raises.
    with open(file_name, 'r') as source_file:
        input = source_file.read()
    return _convert_from_mimetype(input, mime_type, format)
|
13 |
||
14 |
||
15 |
def _convert_office_with_oo(input, format):
    """
    Convert an office document through convert_oo_to_html_and_xhtml.

    Returns a (converted_input, attachs) tuple.
    """
    html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
    if format == 'html':
        # NOTE(review): the value computed here is overwritten by the pandoc
        # call below. The call is kept so that any error it raises still
        # propagates as before -- confirm whether it can be dropped.
        _not_used_css, converted_input = extract_css_body(xhtml_input)
        #converted_input = xhtml_input

    converted_input = pandoc_convert(html_input, 'html', format)
    return converted_input, attachs


def _convert_from_mimetype(input, mime_type, format):
    """
    Convert ``input`` to ``format`` according to its mime type.

    input     : raw content of the document
    mime_type : mime type of ``input``, selects the conversion path
    format    : target format handed to pandoc_convert

    Returns a (converted_input, attachs) tuple; ``attachs`` is an empty list
    unless the office-document path produced attachments.
    """
    #input = to_unicode(input)

    attachs = []
    ##############################
    # OO/MS-Word
    if mime_type in ['application/vnd.oasis.opendocument.text',
                     'application/msword',
                     'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                     'application/rtf',
                     'text/rtf',
                     ]:

        from cm.cm_settings import USE_ABI
        if USE_ABI:
            from abi_converters import AbiFileConverter
            converter = AbiFileConverter()
            try:
                html_input, attachs = converter.convert_to_html(input)
                html_input = re.sub(r' awml:style="[^"]*"', '', html_input)
                converted_input = pandoc_convert(html_input, 'html', format)
            except Exception:
                # If Abiword fails for any reason, try libreoffice.
                # ``Exception`` rather than a bare except, so that
                # KeyboardInterrupt / SystemExit are not swallowed.
                converted_input, attachs = _convert_office_with_oo(input, format)
        else:
            converted_input, attachs = _convert_office_with_oo(input, format)

    ##############################
    # latex
    elif mime_type in ['application/x-latex', 'text/x-tex', ]:
        converted_input = pandoc_convert(to_unicode(input), 'latex', format)

    ##############################
    # anything looks like code: put them into markdown citation
    elif mime_type.startswith('text/x-') or mime_type in ['application/x-ruby', ]:
        converted_input = markdown_from_code(input)

    ##############################
    # html (html -> html also goes through pandoc)
    elif mime_type in ['text/html', 'application/xhtml+xml']:
        converted_input = pandoc_convert(input, 'html', format)

    ##############################
    # anything that looks like text ('text/plain', 'text/english',
    # 'text/enriched') and the default case: assume it's text
    else:
        converted_input = to_unicode(input)

    return converted_input, attachs
|
84 |
||
85 |
def fix_img_path(html, xhtml, imgs):
    """
    Rewrite the non-http ``src`` attribute values found in ``xhtml``.

    The n-th matching ``src`` value of ``xhtml`` is replaced by the last path
    component of the n-th matching ``src`` value of ``html``. When ``html``
    has fewer matches than ``xhtml``, the remaining values are replaced by
    an empty string.

    html  : markup providing the image paths
    xhtml : markup whose ``src`` values are rewritten
    imgs  : name --> path (not used by this function)

    Returns the rewritten xhtml.
    """
    # Group 1 is the attribute value; values starting with http/https are
    # excluded by the negative lookahead.
    finder_re = re.compile(r'src[\s]*=[\s]*"((?:(?!https?))[^"]*)"',
                           re.IGNORECASE)
    html_matches = finder_re.finditer(html)
    result = []
    last_index = 0
    for match_xhtml in finder_re.finditer(xhtml):
        # next() with a default works on Python 2.6+ and 3, unlike .next()
        match_html = next(html_matches, None)
        if match_html is None:
            # TODO : report pb (html has fewer images than xhtml)
            img_path = ''
        else:
            img_path = os.path.split(match_html.group(1))[-1]
        # Copy everything up to the opening quote, then the new value.
        result.append(xhtml[last_index:match_xhtml.start(1)])
        result.append(img_path)
        # Resume at the closing quote of the attribute.
        last_index = match_xhtml.end(1)
    result.append(xhtml[last_index:])
    return u''.join(result)
|
112 |
||
|
77
fe91eb717a96
import oo_converters locally (not at module level) to avoid weird uno imports
raph
parents:
50
diff
changeset
|
113 |
def convert_oo_to_html(input):
    """
    Convert an office document to html and decode the result.

    input : raw content of the document, handed to oo_converters.convert

    Returns a (html, images) tuple where ``html`` is the decoded markup and
    ``images`` is the second value returned by oo_converters.convert.
    Raises Exception when the markup could not be decoded or is empty.
    """
    # imported locally (not at module level) to avoid weird uno imports
    from oo_converters import convert
    html_input, images = convert(input, 'html')

    enc = chardet.detect(html_input)['encoding']
    res_content_html = None
    for encoding in [enc, 'utf8', 'latin1']:
        if not encoding:
            # chardet reports None when it cannot guess an encoding
            continue
        try:
            res_content_html = html_input.decode(encoding)
            break
        except (UnicodeDecodeError, LookupError):
            # wrong guess or codec unknown to Python: try the next one
            pass
    if not res_content_html:
        raise Exception('UnicodeDecodeError: could not decode')
    return res_content_html, images
|
128 |
||
| 253 | 129 |
def fix_html_img_path(html):
    """
    Strip the '../outdir/' prefix from every upper-case IMG SRC attribute.

    html : markup to rewrite

    Returns the markup with each 'IMG SRC="../outdir/' turned into
    'IMG SRC="' (the match is case-sensitive).
    """
    with_outdir = 'IMG SRC="../outdir/'
    without_outdir = 'IMG SRC="'
    pieces = html.split(with_outdir)
    return without_outdir.join(pieces)
|
131 |
||
| 252 | 132 |
def convert_oo_to_html_and_xhtml(input):
    """
    Convert an office document to both html and xhtml.

    input : raw content of the document, handed to oo_converters.convert

    Returns a (html, xhtml, images) tuple: the decoded html with its image
    paths fixed by fix_html_img_path, the decoded xhtml with its image paths
    fixed by fix_img_path and passed through cleanup, and the images
    returned by the html conversion.
    Raises Exception when either markup could not be decoded or is empty.
    """
    # imported locally (not at module level) to avoid weird uno imports
    from oo_converters import convert
    html_input, images = convert(input, 'html')
    xhtml_input, _not_used_ = convert(input, 'xhtml')

    # The encoding is guessed on the xhtml output and used for both.
    enc = chardet.detect(xhtml_input)['encoding']
    res_content_html = None
    res_content_xhtml = None
    for encoding in [enc, 'utf8', 'latin1']:
        if not encoding:
            # chardet reports None when it cannot guess an encoding
            continue
        try:
            decoded_html = html_input.decode(encoding)
            decoded_xhtml = xhtml_input.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # wrong guess or codec unknown to Python: try the next one
            continue
        res_content_html = decoded_html
        res_content_xhtml = decoded_xhtml
        break

    # Check before using the results, so a decoding failure is reported
    # with the intended message.
    if not res_content_html or not res_content_xhtml:
        raise Exception('UnicodeDecodeError: could not decode')

    res_content_xhtml = fix_img_path(res_content_html, res_content_xhtml, images)
    res_content_html = fix_html_img_path(res_content_html)

    return res_content_html, cleanup(res_content_xhtml), images
| 0 | 152 |
|
| 252 | 153 |
def cleanup(string):
    """
    Remove every occurrence of the two-character sequence U+00C2 U+00A0.

    string : text to clean

    Returns the text without those sequences; a lone U+00A0 is kept.
    """
    # presumably a UTF-8 non-breaking space that was decoded as latin1
    # -- TODO confirm
    unwanted = u'\xc2\xa0'
    return string.replace(unwanted, u'')
|
155 |
||
| 0 | 156 |
def markdown_from_code(code):
    """
    Turn ``code`` into a markdown code block.

    code : text to quote

    Returns the text with every line (split on newlines) prefixed by four
    spaces, the indentation markdown uses for code blocks.
    """
    CODE_INDICATOR = "    "  # 4 spaces
    return '\n'.join(CODE_INDICATOR + line for line in code.split('\n'))
|
159 |
||
|
360
bfaab8740995
Add abiword as an alternative to open office for conversions
gibus
parents:
259
diff
changeset
|
160 |