|
119
|
1 |
import chardet |
|
175
|
2 |
import re |
|
119
|
3 |
|
|
|
4 |
def to_unicode(input):
    """Decode a byte string to unicode text, guessing its encoding.

    Values that are not byte strings (including text that is already
    unicode) are returned unchanged.  For byte strings, the encoding
    reported by ``chardet.detect`` is tried first (when one is reported),
    followed by UTF-8 and then Latin-1.

    Args:
        input: the value to convert.

    Returns:
        The decoded unicode text, or ``input`` itself when it is not a
        byte string.

    Raises:
        Exception: if none of the candidate encodings can decode ``input``.
    """
    # ``bytes`` is an alias of ``str`` on Python 2 and the raw byte type on
    # Python 3, so this selects "undecoded data" on either interpreter.
    if not isinstance(input, bytes):
        return input

    if not input:
        # Nothing to detect: empty bytes decode to empty text under any codec.
        return u''

    candidates = ['utf8', 'latin1']
    detected = chardet.detect(input)['encoding']
    if detected:
        candidates.insert(0, detected)

    for encoding in candidates:
        try:
            # Return directly so an empty (but successful) decode result is
            # never mistaken for a failure.
            return input.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError covers a detected codec name that Python does not
            # know; fall through to the next candidate.
            continue

    raise Exception('UnicodeDecodeError: could not decode')
|
|
21 |
|
|
|
22 |
# Normalise every line-ending convention to a bare newline.
def strip_cr(input):
    """Return *input* with CRLF and lone CR line endings converted to LF."""
    line_break = re.compile('\r\n|\r|\n')
    return line_break.sub('\n', input)
|
|
25 |
|