web/lib/django/utils/encoding.py
author ymh <ymh.work@gmail.com>
Thu, 05 Aug 2010 17:28:09 +0200
changeset 50 012451a812f1
parent 38 77b6da96e6f1
permissions -rw-r--r--
Merge with a2711e44ba5de8b1675d7e0ee6aaa4a6c56a9b46
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
38
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
import types
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
import urllib
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
import locale
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
import datetime
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
import codecs
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
     6
from decimal import Decimal
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
from django.utils.functional import Promise
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
class DjangoUnicodeDecodeError(UnicodeDecodeError):
    """
    A UnicodeDecodeError that also records the object that failed to decode.

    The object is stored on ``self.obj`` and is reported, together with its
    type, at the end of the error's string form.
    """
    def __init__(self, obj, *args):
        # Keep the offending object; the remaining positional arguments are
        # passed unchanged to the UnicodeDecodeError constructor.
        self.obj = obj
        UnicodeDecodeError.__init__(self, *args)

    def __str__(self):
        # Start from the standard message and append what the caller passed.
        base_message = UnicodeDecodeError.__str__(self)
        details = (base_message, self.obj, type(self.obj))
        return '%s. You passed in %r (%s)' % details
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    19
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    20
class StrAndUnicode(object):
    """
    Mix-in that derives __str__ from __unicode__.

    Subclasses provide __unicode__; __str__ then returns that value encoded
    as a UTF-8 bytestring.
    """
    def __str__(self):
        text = self.__unicode__()
        return text.encode('utf-8')
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    28
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    29
def smart_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
    """
    Returns a unicode object representing 's'. Bytestrings are interpreted
    using the 'encoding' codec.

    Promise instances (e.g. the result of a gettext_lazy() call) are handed
    back untouched; everything else is delegated to force_unicode().

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    if not isinstance(s, Promise):
        return force_unicode(s, encoding, strings_only, errors)
    # Lazy object: return it as-is rather than resolving it here.
    return s
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    40
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    41
def is_protected_type(obj):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    42
    """Determine if the object instance is of a protected type.
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    43
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    44
    Objects of protected types are preserved as-is when passed to
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    45
    force_unicode(strings_only=True).
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    46
    """
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    47
    return isinstance(obj, (
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    48
        types.NoneType,
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    49
        int, long,
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    50
        datetime.datetime, datetime.date, datetime.time,
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    51
        float, Decimal)
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    52
    )
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    53
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    54
def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    55
    """
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    56
    Similar to smart_unicode, except that lazy instances are resolved to
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    57
    strings, rather than kept as lazy objects.
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    58
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    59
    If strings_only is True, don't convert (some) non-string-like objects.
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    60
    """
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    61
    if strings_only and is_protected_type(s):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    62
        return s
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    63
    try:
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    64
        if not isinstance(s, basestring,):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    65
            if hasattr(s, '__unicode__'):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    66
                s = unicode(s)
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    67
            else:
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    68
                try:
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    69
                    s = unicode(str(s), encoding, errors)
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    70
                except UnicodeEncodeError:
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    71
                    if not isinstance(s, Exception):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    72
                        raise
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    73
                    # If we get to here, the caller has passed in an Exception
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    74
                    # subclass populated with non-ASCII data without special
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    75
                    # handling to display as a string. We need to handle this
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    76
                    # without raising a further exception. We do an
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    77
                    # approximation to what the Exception's standard str()
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    78
                    # output should be.
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    79
                    s = ' '.join([force_unicode(arg, encoding, strings_only,
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    80
                            errors) for arg in s])
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    81
        elif not isinstance(s, unicode):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    82
            # Note: We use .decode() here, instead of unicode(s, encoding,
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    83
            # errors), so that if s is a SafeString, it ends up being a
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    84
            # SafeUnicode at the end.
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    85
            s = s.decode(encoding, errors)
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    86
    except UnicodeDecodeError, e:
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    87
        if not isinstance(s, Exception):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    88
            raise DjangoUnicodeDecodeError(s, *e.args)
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    89
        else:
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    90
            # If we get to here, the caller has passed in an Exception
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    91
            # subclass populated with non-ASCII bytestring data without a
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    92
            # working unicode method. Try to handle this without raising a
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    93
            # further exception by individually forcing the exception args
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    94
            # to unicode.
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    95
            s = ' '.join([force_unicode(arg, encoding, strings_only,
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    96
                    errors) for arg in s])
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    97
    return s
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    98
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
    99
def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   100
    """
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   101
    Returns a bytestring version of 's', encoded as specified in 'encoding'.
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   102
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   103
    If strings_only is True, don't convert (some) non-string-like objects.
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   104
    """
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   105
    if strings_only and isinstance(s, (types.NoneType, int)):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   106
        return s
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   107
    if isinstance(s, Promise):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   108
        return unicode(s).encode(encoding, errors)
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   109
    elif not isinstance(s, basestring):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   110
        try:
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   111
            return str(s)
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   112
        except UnicodeEncodeError:
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   113
            if isinstance(s, Exception):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   114
                # An Exception subclass containing non-ASCII data that doesn't
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   115
                # know how to print itself properly. We shouldn't raise a
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   116
                # further exception.
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   117
                return ' '.join([smart_str(arg, encoding, strings_only,
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   118
                        errors) for arg in s])
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   119
            return unicode(s).encode(encoding, errors)
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   120
    elif isinstance(s, unicode):
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   121
        return s.encode(encoding, errors)
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   122
    elif s and encoding != 'utf-8':
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   123
        return s.decode('utf-8', errors).encode(encoding, errors)
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   124
    else:
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   125
        return s
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   126
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   127
def iri_to_uri(iri):
    """
    Convert an Internationalized Resource Identifier (IRI) portion to a URI
    portion that is suitable for inclusion in a URL.

    This follows the algorithm from section 3.1 of RFC 3987, simplified
    because the input is assumed to be either UTF-8 or unicode already.

    Returns an ASCII string containing the encoded result. None is passed
    through unchanged.
    """
    # The safe characters are the "reserved" and "unreserved" characters
    # from sections 2.2 and 2.3 of RFC 3986:
    #     reserved    = gen-delims / sub-delims
    #     gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    #     sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
    #                   / "*" / "+" / "," / ";" / "="
    #     unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
    # urllib.quote already treats every unreserved character except "~" as
    # safe, so only "~" needs listing from that group.
    # "%" is listed as well: the end of section 3.1 of RFC 3987 specifically
    # says that % must not be converted.
    safe_chars = "/#%[]=:;$&()+,!?*@'~"
    if iri is not None:
        return urllib.quote(smart_str(iri), safe=safe_chars)
    return iri
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   153
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   154
77b6da96e6f1 update django
ymh <ymh.work@gmail.com>
parents:
diff changeset
   155
# The encoding of the default system locale, falling back to 'ascii' if the
# encoding is unsupported by Python or could not be determined.
# See tickets #10335 and #5846.
try:
    DEFAULT_LOCALE_ENCODING = locale.getdefaultlocale()[1] or 'ascii'
    # Make sure Python actually has a codec for the reported encoding.
    codecs.lookup(DEFAULT_LOCALE_ENCODING)
except Exception:
    # Any failure to determine or look up the encoding selects the fallback.
    # Catching Exception (not a bare except) lets KeyboardInterrupt and
    # SystemExit propagate instead of being swallowed at import time.
    DEFAULT_LOCALE_ENCODING = 'ascii'