| author | raph |
| Thu, 25 Mar 2010 17:19:04 +0100 | |
| changeset 231 | e71ea24ff34c |
| parent 175 | 4f072edc51a1 |
| child 236 | 725653080973 |
| permissions | -rw-r--r-- |
| 119 | 1 |
import chardet |
| 175 | 2 |
import re |
| 119 | 3 |
|
4 |
def to_unicode(input):
    """Decode a byte string to unicode text, guessing its encoding.

    Candidate encodings are tried in order: the one reported by
    ``chardet.detect``, then UTF-8, then Latin-1. The first candidate
    that decodes the input without error wins.

    Args:
        input: The value to convert. Only byte strings are decoded
            (on Python 2, ``bytes`` is an alias for ``str``).

    Returns:
        The decoded unicode text when ``input`` is a byte string;
        otherwise ``input`` itself, unchanged.

    Raises:
        Exception: If no candidate encoding could decode the input.
    """
    if not isinstance(input, bytes):
        return input
    if not input:
        # Empty input has nothing to detect or decode; skip detection.
        return u''

    guessed = chardet.detect(input)['encoding']
    for encoding in (guessed, 'utf8', 'latin1'):
        if encoding is None:
            # No guess was made; fall through to the fixed fallbacks.
            continue
        try:
            return input.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError covers a guessed name that matches no codec
            # known to Python; treat it like a failed decode.
            continue
    raise Exception('UnicodeDecodeError: could not decode')
17 |
||
18 |
# normalize line endings: CRLF pairs and lone CRs become a single LF |
|
19 |
def strip_cr(input):
    """Return *input* with every line ending rewritten as a line feed.

    CRLF pairs, lone carriage returns and lone line feeds are each
    replaced by a single line feed character.
    """
    line_ending = re.compile('\r\n|\r|\n')
    normalized = line_ending.sub('\n', input)
    return normalized
|
21 |