comt: src/cm/converters/__init__.py@a84519031134 (annotated)

0 40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	1	from pandoc_converters import pandoc_convert
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	2	import chardet
149 0f2c5744b39b cleanup diff files / add experimental diff raph parents: 119 diff changeset	3	from cm.utils.string_utils import to_unicode
77 fe91eb717a96 import oo_converters locally (not at module level) to avoid weird uno imports raph parents: 50 diff changeset	4	import re
253 a844469257b0 fix img path raph parents: 252 diff changeset	5	import os
360 bfaab8740995 Add abiword as an alternative to open office for conversions gibus parents: 259 diff changeset	6	from oo_converters import extract_css_body
252 0f0a79f7f213 do not use pandoc for html content raph parents: 149 diff changeset	7
0 40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	8
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	9	# TODO: move that in text_base: save images
def convert_from_mimetype(file_name, mime_type, format):
    """Read the file at ``file_name`` and convert its content.

    :param file_name: path of the document to convert.
    :param mime_type: MIME type used to select the conversion route.
    :param format: target format, passed through unchanged.
    :returns: whatever ``_convert_from_mimetype`` returns, i.e. a
        ``(converted_input, attachs)`` pair.
    """
    # Binary mode: several accepted inputs (ODT, MS-Word, ...) are not text,
    # and the context manager guarantees the handle is closed after reading.
    with open(file_name, 'rb') as source_file:
        input = source_file.read()
    return _convert_from_mimetype(input, mime_type, format)
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	13
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	14
def _convert_from_mimetype(input, mime_type, format):
    """Convert raw document content to ``format`` according to ``mime_type``.

    Routes:

    * office documents (ODT, MS-Word, DOCX, RTF): converted to HTML with
      AbiWord when ``USE_ABI`` is set (falling back to the OpenOffice route
      if that fails), otherwise with the OpenOffice route directly; the
      HTML is then handed to ``pandoc_convert``;
    * LaTeX: ``pandoc_convert`` from ``'latex'``;
    * other ``text/x-*`` types and ``application/x-ruby``: wrapped by
      ``markdown_from_code``;
    * HTML / XHTML: ``pandoc_convert`` from ``'html'``;
    * everything else: returned as unicode via ``to_unicode``.

    :param input: raw content of the document.
    :param mime_type: MIME type of ``input``.
    :param format: target format name given to ``pandoc_convert``.
    :returns: ``(converted_input, attachs)``; ``attachs`` is the attachment
        value returned by the office converters and an empty list for every
        other route.
    """
    attachs = []
    ##############################
    # OO/MS-Word
    if mime_type in ['application/vnd.oasis.opendocument.text',
                     'application/msword',
                     'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                     'application/rtf',
                     'text/rtf',
                     ]:

        from cm.cm_settings import USE_ABI
        if USE_ABI:
            from abi_converters import AbiFileConverter
            converter = AbiFileConverter()
            try:
                html_input, attachs = converter.convert_to_html(input)
                # Drop the awml:style attributes from the AbiWord output.
                html_input = re.sub(r' awml:style="[^"]*"', '', html_input)
                converted_input = pandoc_convert(html_input, 'html', format)
            except Exception:
                # If Abiword fails for any reason, try libreoffice.
                # ``Exception`` (not a bare ``except``) so that
                # KeyboardInterrupt / SystemExit still propagate.
                html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
                if format == 'html':
                    # NOTE(review): this value is overwritten by the pandoc
                    # call just below; kept to preserve existing behaviour -
                    # confirm whether it can be dropped.
                    _not_used_css, converted_input = extract_css_body(xhtml_input)

                converted_input = pandoc_convert(html_input, 'html', format)
        else:
            html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
            if format == 'html':
                # NOTE(review): overwritten by the pandoc call just below;
                # kept to preserve existing behaviour.
                _not_used_css, converted_input = extract_css_body(xhtml_input)

            converted_input = pandoc_convert(html_input, 'html', format)

    ##############################
    # latex
    elif mime_type in ['application/x-latex', 'text/x-tex']:
        converted_input = pandoc_convert(to_unicode(input), 'latex', format)

    ##############################
    # anything that looks like code: turn it into a markdown code block
    elif mime_type.startswith('text/x-') or mime_type in ['application/x-ruby']:
        converted_input = markdown_from_code(input)

    ##############################
    # html
    elif mime_type in ['text/html', 'application/xhtml+xml']:
        if format == 'html':
            # NOTE(review): overwritten by the pandoc call just below.
            converted_input = input

        converted_input = pandoc_convert(input, 'html', format)
    ##############################
    # anything that looks like text -> markdown
    elif mime_type in ['text/plain',
                       'text/english',
                       'text/enriched',
                       ]:
        converted_input = to_unicode(input)
    ##############################
    # default case: assume it's text
    else:
        converted_input = to_unicode(input)

    return converted_input, attachs
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	84
def fix_img_path(html, xhtml, imgs):
    """Rewrite local image paths in ``xhtml`` using the ones found in ``html``.

    The n-th non-http ``src="..."`` attribute of ``xhtml`` gets the base
    name (last path component) of the n-th non-http ``src`` attribute of
    ``html``. When ``html`` has fewer such attributes than ``xhtml``, the
    remaining ``xhtml`` paths are replaced by an empty string.

    :param html: HTML rendering of the document (source of the image names).
    :param xhtml: XHTML rendering of the same document (text being rewritten).
    :param imgs: name --> path mapping; currently not used by this function.
    :returns: the rewritten XHTML as a unicode string.
    """
    # ``\s*`` accepts any spacing around ``=`` (including none, as in
    # ``SRC="..."``); paths starting with http/https are left untouched.
    finder_re = r'src\s*=\s*"((?!https?)[^"]*)"'
    html_matches = re.finditer(finder_re, html, re.IGNORECASE)
    result = []
    last_index = 0
    for match_xhtml in re.finditer(finder_re, xhtml, re.IGNORECASE):
        match_html = next(html_matches, None)
        img_path = ''
        if match_html is not None:
            img_path = os.path.split(match_html.group(1))[-1]
        # TODO : report pb when html has no image left for this xhtml one
        # Copy everything up to the opening quote, then the new path; the
        # closing quote is picked up by the next slice.
        result.append(xhtml[last_index:match_xhtml.start(1)])
        result.append(img_path)
        last_index = match_xhtml.end(1)
    result.append(xhtml[last_index:])
    return u''.join(result)
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	112
def convert_oo_to_html(input):
    """Convert an office document to HTML and decode the result to unicode.

    The encoding guessed by ``chardet`` is tried first, then ``utf8``, then
    ``latin1``.

    :param input: raw content of the office document.
    :returns: ``(html, images)`` where ``html`` is a unicode string and
        ``images`` is the second value returned by ``oo_converters.convert``.
    :raises Exception: when no non-empty decoded content could be produced.
    """
    # Imported locally rather than at module level, as in the original code.
    from oo_converters import convert
    html_input, images = convert(input, 'html')

    enc = chardet.detect(html_input)['encoding']
    res_content_html = None
    for encoding in [enc, 'utf8', 'latin1']:
        if not encoding:
            # No guess available from chardet: skip to the fallbacks.
            continue
        try:
            res_content_html = unicode(html_input, encoding)
            break
        except (UnicodeDecodeError, LookupError):
            # Wrong guess, or a codec name Python does not know: try next.
            pass
    if not res_content_html:
        raise Exception('UnicodeDecodeError: could not decode')
    return res_content_html, images
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	128
def fix_html_img_path(html):
    """Drop the ``../outdir/`` prefix from every ``IMG SRC="`` attribute."""
    prefixed_attr = 'IMG SRC="../outdir/'
    bare_attr = 'IMG SRC="'
    # Splitting on the prefixed form and re-joining with the bare form
    # substitutes every occurrence.
    return bare_attr.join(html.split(prefixed_attr))
a844469257b0 fix img path raph parents: 252 diff changeset	131
def convert_oo_to_html_and_xhtml(input):
    """Convert an office document to both HTML and XHTML, decoded to unicode.

    Both renderings are decoded with the same encoding: the one guessed by
    ``chardet`` on the XHTML output first, then ``utf8``, then ``latin1``.
    Image paths are then fixed in both results.

    :param input: raw content of the office document.
    :returns: ``(html, xhtml, images)``; ``images`` comes from the HTML
        conversion.
    :raises Exception: when no non-empty decoded content could be produced.
    """
    # Imported locally rather than at module level, as in the original code.
    from oo_converters import convert
    html_input, images = convert(input, 'html')
    xhtml_input, _not_used_ = convert(input, 'xhtml')

    enc = chardet.detect(xhtml_input)['encoding']
    res_content_html = None
    res_content_xhtml = None
    for encoding in [enc, 'utf8', 'latin1']:
        if not encoding:
            # No guess available from chardet: skip to the fallbacks.
            continue
        try:
            decoded_html = unicode(html_input, encoding)
            decoded_xhtml = unicode(xhtml_input, encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess, or a codec name Python does not know: try next.
            continue
        res_content_html, res_content_xhtml = decoded_html, decoded_xhtml
        break

    # Check before using the values, so a failed decode raises the intended
    # error instead of a NameError further down.
    if not res_content_html or not res_content_xhtml:
        raise Exception('UnicodeDecodeError: could not decode')

    res_content_xhtml = fix_img_path(res_content_html, res_content_xhtml, images)
    res_content_html = fix_html_img_path(res_content_html)
    return res_content_html, cleanup(res_content_xhtml), images
0 40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	152
def cleanup(string):
    """Return ``string`` with each U+00C2, U+00A0 character pair removed."""
    stray_pair = u'\xc2\xa0'
    cleaned = string.replace(stray_pair, u'')
    return cleaned
0f0a79f7f213 do not use pandoc for html content raph parents: 149 diff changeset	155
def markdown_from_code(code):
    """Turn ``code`` into a Markdown code block.

    Every line of ``code`` (split on ``'\\n'``) is prefixed with four
    spaces, the indentation Markdown uses to mark a code block.

    :param code: source text to wrap.
    :returns: the indented text, lines joined with ``'\\n'``.
    """
    CODE_INDICATOR = ' ' * 4  # 4 spaces
    return '\n'.join(CODE_INDICATOR + line for line in code.split('\n'))
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	159
360 bfaab8740995 Add abiword as an alternative to open office for conversions gibus parents: 259 diff changeset	160

author	Yves-Marie Haussonne <ymh.work+github@gmail.com>
	Fri, 09 May 2014 18:35:26 +0200
changeset 656	a84519031134
parent 555	5d79dc4e50a3
permissions	-rw-r--r--