src/cm/utils/string_utils.py
author gibus
Wed, 11 Sep 2013 23:13:01 +0200
changeset 532 0bad3613f59d
parent 236 725653080973
permissions -rw-r--r--
Revert to YUI 3.0.0: with YUI 3.10.3, comments whose content includes the words 'paragraph' or 'section' do not show up in Firefox. This is weird and has to be investigated.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
119
5e8dda1b7631 recover when tidy trashes: try markdown anyway
raph
parents:
diff changeset
     1
import chardet
175
4f072edc51a1 BUG FIX : handling html
rbernard
parents: 149
diff changeset
     2
import re
119
5e8dda1b7631 recover when tidy trashes: try markdown anyway
raph
parents:
diff changeset
     3
5e8dda1b7631 recover when tidy trashes: try markdown anyway
raph
parents:
diff changeset
     4
def to_unicode(input):
    """Decode a byte string to unicode, trying several candidate encodings.

    The encoding guessed by ``chardet`` (when it reports one) is tried
    first, then UTF-8, then Latin-1.

    Args:
        input: value to convert. Only plain byte strings are decoded
            (``str`` on Python 2, where ``bytes`` is an alias of ``str``).

    Returns:
        The decoded unicode text when ``input`` is a byte string, including
        an empty result for an empty input; any other value is returned
        unchanged.

    Raises:
        Exception: if none of the candidate encodings can decode ``input``.
    """
    # Exact type match is kept (rather than isinstance) so that subclasses
    # of the byte-string type keep passing through untouched, as before.
    if type(input) is not bytes:
        return input

    encodings = ['utf8', 'latin1']
    doc_enc = chardet.detect(input)['encoding']
    if doc_enc:
        encodings.insert(0, doc_enc)

    for encoding in encodings:
        try:
            # Returning straight away means an empty (falsy) decoded string
            # is no longer mistaken for a decoding failure.
            return input.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detected codec name is unknown to Python;
            # fall through to the next candidate instead of aborting.
            continue

    raise Exception('UnicodeDecodeError: could not decode')
4f072edc51a1 BUG FIX : handling html
rbernard
parents: 149
diff changeset
    21
4f072edc51a1 BUG FIX : handling html
rbernard
parents: 149
diff changeset
    22
# normalize line endings: convert CRLF and lone CR to LF
4f072edc51a1 BUG FIX : handling html
rbernard
parents: 149
diff changeset
    23
def strip_cr(input):
    """Return ``input`` with each CRLF, lone CR, or LF rewritten as one LF."""
    line_break = re.compile('\r\n|\r|\n')
    return line_break.sub('\n', input)
4f072edc51a1 BUG FIX : handling html
rbernard
parents: 149
diff changeset
    25