web/lib/django/utils/encoding.py
author ymh <ymh.work@gmail.com>
Tue, 25 May 2010 02:43:45 +0200
changeset 29 cc9b7e14412b
parent 0 0d40e90630ef
permissions -rw-r--r--
update django and lucene
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
import types
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
import urllib
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
import locale
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
import datetime
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
import codecs
29
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
     6
from decimal import Decimal
0
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
from django.utils.functional import Promise
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
class DjangoUnicodeDecodeError(UnicodeDecodeError):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    11
    def __init__(self, obj, *args):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    12
        self.obj = obj
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    13
        UnicodeDecodeError.__init__(self, *args)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    14
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    15
    def __str__(self):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    16
        original = UnicodeDecodeError.__str__(self)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    17
        return '%s. You passed in %r (%s)' % (original, self.obj,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    18
                type(self.obj))
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    19
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    20
class StrAndUnicode(object):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    21
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    22
    A class whose __str__ returns its __unicode__ as a UTF-8 bytestring.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    23
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    24
    Useful as a mix-in.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    25
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    26
    def __str__(self):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    27
        return self.__unicode__().encode('utf-8')
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    28
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    29
def smart_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    30
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    31
    Returns a unicode object representing 's'. Treats bytestrings using the
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    32
    'encoding' codec.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    33
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    34
    If strings_only is True, don't convert (some) non-string-like objects.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    35
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    36
    if isinstance(s, Promise):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    37
        # The input is the result of a gettext_lazy() call.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    38
        return s
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    39
    return force_unicode(s, encoding, strings_only, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    40
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    41
def is_protected_type(obj):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    42
    """Determine if the object instance is of a protected type.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    43
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    44
    Objects of protected types are preserved as-is when passed to
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    45
    force_unicode(strings_only=True).
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    46
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    47
    return isinstance(obj, (
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    48
        types.NoneType,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    49
        int, long,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    50
        datetime.datetime, datetime.date, datetime.time,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    51
        float, Decimal)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    52
    )
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    53
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    54
def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    55
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    56
    Similar to smart_unicode, except that lazy instances are resolved to
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    57
    strings, rather than kept as lazy objects.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    58
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    59
    If strings_only is True, don't convert (some) non-string-like objects.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    60
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    61
    if strings_only and is_protected_type(s):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    62
        return s
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    63
    try:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    64
        if not isinstance(s, basestring,):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    65
            if hasattr(s, '__unicode__'):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    66
                s = unicode(s)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    67
            else:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    68
                try:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    69
                    s = unicode(str(s), encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    70
                except UnicodeEncodeError:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    71
                    if not isinstance(s, Exception):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    72
                        raise
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    73
                    # If we get to here, the caller has passed in an Exception
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    74
                    # subclass populated with non-ASCII data without special
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    75
                    # handling to display as a string. We need to handle this
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    76
                    # without raising a further exception. We do an
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    77
                    # approximation to what the Exception's standard str()
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    78
                    # output should be.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    79
                    s = ' '.join([force_unicode(arg, encoding, strings_only,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    80
                            errors) for arg in s])
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    81
        elif not isinstance(s, unicode):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    82
            # Note: We use .decode() here, instead of unicode(s, encoding,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    83
            # errors), so that if s is a SafeString, it ends up being a
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    84
            # SafeUnicode at the end.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    85
            s = s.decode(encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    86
    except UnicodeDecodeError, e:
29
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
    87
        if not isinstance(s, Exception):
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
    88
            raise DjangoUnicodeDecodeError(s, *e.args)
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
    89
        else:
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
    90
            # If we get to here, the caller has passed in an Exception
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
    91
            # subclass populated with non-ASCII bytestring data without a
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
    92
            # working unicode method. Try to handle this without raising a
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
    93
            # further exception by individually forcing the exception args
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
    94
            # to unicode.
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
    95
            s = ' '.join([force_unicode(arg, encoding, strings_only,
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
    96
                    errors) for arg in s])
0
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    97
    return s
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    98
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    99
def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   100
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   101
    Returns a bytestring version of 's', encoded as specified in 'encoding'.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   102
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   103
    If strings_only is True, don't convert (some) non-string-like objects.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   104
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   105
    if strings_only and isinstance(s, (types.NoneType, int)):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   106
        return s
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   107
    if isinstance(s, Promise):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   108
        return unicode(s).encode(encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   109
    elif not isinstance(s, basestring):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   110
        try:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   111
            return str(s)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   112
        except UnicodeEncodeError:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   113
            if isinstance(s, Exception):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   114
                # An Exception subclass containing non-ASCII data that doesn't
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   115
                # know how to print itself properly. We shouldn't raise a
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   116
                # further exception.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   117
                return ' '.join([smart_str(arg, encoding, strings_only,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   118
                        errors) for arg in s])
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   119
            return unicode(s).encode(encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   120
    elif isinstance(s, unicode):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   121
        return s.encode(encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   122
    elif s and encoding != 'utf-8':
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   123
        return s.decode('utf-8', errors).encode(encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   124
    else:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   125
        return s
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   126
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   127
def iri_to_uri(iri):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   128
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   129
    Convert an Internationalized Resource Identifier (IRI) portion to a URI
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   130
    portion that is suitable for inclusion in a URL.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   131
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   132
    This is the algorithm from section 3.1 of RFC 3987.  However, since we are
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   133
    assuming input is either UTF-8 or unicode already, we can simplify things a
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   134
    little from the full method.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   135
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   136
    Returns an ASCII string containing the encoded result.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   137
    """
29
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   138
    # The list of safe characters here is constructed from the "reserved" and
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   139
    # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   140
    #     reserved    = gen-delims / sub-delims
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   141
    #     gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   142
    #     sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   143
    #                   / "*" / "+" / "," / ";" / "="
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   144
    #     unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   145
    # Of the unreserved characters, urllib.quote already considers all but
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   146
    # the ~ safe.
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   147
    # The % character is also added to the list of safe characters here, as the
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   148
    # end of section 3.1 of RFC 3987 specifically mentions that % must not be
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   149
    # converted.
0
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   150
    if iri is None:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   151
        return iri
29
cc9b7e14412b update django and lucene
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
   152
    return urllib.quote(smart_str(iri), safe="/#%[]=:;$&()+,!?*@'~")
0
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   153
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   154
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   155
# The encoding of the default system locale but falls back to the
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   156
# given fallback encoding if the encoding is unsupported by python or could
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   157
# not be determined.  See tickets #10335 and #5846
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   158
try:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   159
    DEFAULT_LOCALE_ENCODING = locale.getdefaultlocale()[1] or 'ascii'
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   160
    codecs.lookup(DEFAULT_LOCALE_ENCODING)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   161
except:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   162
    DEFAULT_LOCALE_ENCODING = 'ascii'