comt: src/cm/utils/string_utils.py@725653080973 (annotated)

119 5e8dda1b7631 recover when tidy trashes: try markdown anyway raph parents: diff changeset	1	import chardet
175 4f072edc51a1 BUG FIX : handling html rbernard parents: 149 diff changeset	2	import re
119 5e8dda1b7631 recover when tidy trashes: try markdown anyway raph parents: diff changeset	3
def to_unicode(input):
    """Decode a byte string to ``unicode``, guessing its encoding.

    The encoding reported by ``chardet`` (when it reports one) is tried
    first, followed by UTF-8 and then Latin-1. The first encoding that
    decodes ``input`` without error wins.

    Values that are not ``str`` instances are returned unchanged.

    Raises:
        Exception: if none of the candidate encodings can decode ``input``.
    """
    if not isinstance(input, str):
        return input

    encodings = ['utf8', 'latin1']
    doc_enc = chardet.detect(input)['encoding']
    if doc_enc:
        # Prefer the detected encoding, keep the defaults as fallbacks.
        encodings.insert(0, doc_enc)

    for encoding in encodings:
        try:
            # Return directly so that an empty (falsy) result is still
            # treated as a successful decode.
            return unicode(input, encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detected encoding name may have no matching
            # Python codec; fall through to the next candidate.
            continue

    raise Exception('UnicodeDecodeError: could not decode')
4f072edc51a1 BUG FIX : handling html rbernard parents: 149 diff changeset	21
def strip_cr(input):
    """Normalize line endings by stripping carriage returns.

    Every ``\\r\\n`` pair and every lone ``\\r`` is replaced with a single
    ``\\n``; existing ``\\n`` characters are left as they are.

    Returns:
        ``input`` with all line endings converted to ``\\n``.
    """
    # Python's ``re`` uses a bare ``|`` for alternation (``\|`` is a literal
    # pipe), so match CR with an optional following LF instead.
    return re.sub(r'\r\n?', '\n', input)
4f072edc51a1 BUG FIX : handling html rbernard parents: 149 diff changeset	25

author	raph
	Tue, 30 Mar 2010 14:13:45 +0200
changeset 236	725653080973
parent 175	4f072edc51a1
permissions	-rw-r--r--