src/cm/utils/string_utils.py
changeset 236 725653080973
parent 175 4f072edc51a1
equal deleted inserted replaced
235:b23aee4dd81d 236:725653080973
     2 import re
     2 import re
     3 
     3 
     4 def to_unicode(input):
     4 def to_unicode(input):
     5     if type(input) == str:
     5     if type(input) == str:
     6         res = None
     6         res = None
     7         for encoding in [chardet.detect(input)['encoding'], 'utf8', 'latin1']:
     7         encodings = ['utf8', 'latin1']
       
     8         doc_enc = chardet.detect(input)['encoding']
       
     9         if doc_enc:
       
    10             encodings = [doc_enc,] + encodings  
       
    11         for encoding in encodings:
     8             try:
    12             try:
     9                 res = unicode(input, encoding)
    13                 res = unicode(input, encoding)
    10                 break;
    14                 break;
    11             except UnicodeDecodeError:
    15             except UnicodeDecodeError:
    12                 pass
    16                 pass