comt: src/cm/utils/string_utils.py@e71ea24ff34c (annotated)

119 5e8dda1b7631 recover when tidy trashes: try markdown anyway raph parents: diff changeset	1	import chardet
175 4f072edc51a1 BUG FIX : handling html rbernard parents: 149 diff changeset	2	import re
119 5e8dda1b7631 recover when tidy trashes: try markdown anyway raph parents: diff changeset	3
def to_unicode(input):
    """Decode a byte string to ``unicode``; return anything else unchanged.

    Python 2 code: ``str`` is the byte-string type and ``unicode`` the text
    type.  Candidate encodings are tried in order: the one guessed by
    ``chardet``, then UTF-8, then Latin-1.

    :param input: value to convert.  Only objects whose exact type is
        ``str`` are decoded; every other object (including ``unicode``
        and ``str`` subclasses) is returned as-is.
    :returns: the decoded ``unicode`` string, or ``input`` itself when it
        is not a plain ``str``.
    :raises Exception: with message ``'UnicodeDecodeError: could not
        decode'`` when no candidate encoding can decode ``input``.
    """
    # Exact type check kept on purpose: subclasses of str were passed
    # through untouched by the original and callers may rely on that.
    if type(input) != str:
        return input

    candidates = [chardet.detect(input)['encoding'], 'utf8', 'latin1']
    for encoding in candidates:
        # chardet reports None when it cannot make a guess (e.g. for an
        # empty string); unicode(input, None) would raise TypeError and
        # bypass the fallbacks, so skip such candidates.
        if not encoding:
            continue
        try:
            # Return immediately on success.  Testing the truthiness of
            # the result would wrongly treat a decoded empty string as a
            # failure.
            return unicode(input, encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detected encoding name has no codec in
            # this Python; fall through to the next candidate.
            continue

    raise Exception('UnicodeDecodeError: could not decode')
4f072edc51a1 BUG FIX : handling html rbernard parents: 149 diff changeset	17
# strip carriage returns (normalise every line ending to '\n')
def strip_cr(input):
    """Return ``input`` with CRLF and lone CR line endings replaced by LF.

    :param input: string to normalise.
    :returns: a copy of ``input`` in which every ``'\\r\\n'`` pair and every
        remaining ``'\\r'`` has been replaced by a single ``'\\n'``.
    """
    # Python's ``re`` uses a bare ``|`` for alternation; ``\|`` (grep/sed
    # syntax) matches a literal pipe, which made the previous pattern a
    # no-op on real text.  ``\r\n`` is listed first so a CRLF pair becomes
    # one newline rather than two.
    return re.sub(r'\r\n|\r', '\n', input)
4f072edc51a1 BUG FIX : handling html rbernard parents: 149 diff changeset	21

author	raph
	Thu, 25 Mar 2010 17:19:04 +0100
changeset 231	e71ea24ff34c
parent 175	4f072edc51a1
child 236	725653080973
permissions	-rw-r--r--