web/lib/django/utils/encoding.py
author ymh <ymh.work@gmail.com>
Wed, 20 Jan 2010 00:34:04 +0100
changeset 0 0d40e90630ef
child 29 cc9b7e14412b
permissions -rw-r--r--
Blinkster creation
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
import types
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
import urllib
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
import locale
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
import datetime
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
import codecs
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     6
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
from django.utils.functional import Promise
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
try:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
    from decimal import Decimal
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    11
except ImportError:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    12
    from django.utils._decimal import Decimal # Python 2.3 fallback
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    13
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    14
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    15
class DjangoUnicodeDecodeError(UnicodeDecodeError):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    16
    def __init__(self, obj, *args):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    17
        self.obj = obj
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    18
        UnicodeDecodeError.__init__(self, *args)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    19
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    20
    def __str__(self):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    21
        original = UnicodeDecodeError.__str__(self)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    22
        return '%s. You passed in %r (%s)' % (original, self.obj,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    23
                type(self.obj))
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    24
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    25
class StrAndUnicode(object):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    26
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    27
    A class whose __str__ returns its __unicode__ as a UTF-8 bytestring.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    28
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    29
    Useful as a mix-in.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    30
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    31
    def __str__(self):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    32
        return self.__unicode__().encode('utf-8')
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    33
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    34
def smart_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    35
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    36
    Returns a unicode object representing 's'. Treats bytestrings using the
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    37
    'encoding' codec.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    38
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    39
    If strings_only is True, don't convert (some) non-string-like objects.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    40
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    41
    if isinstance(s, Promise):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    42
        # The input is the result of a gettext_lazy() call.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    43
        return s
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    44
    return force_unicode(s, encoding, strings_only, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    45
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    46
def is_protected_type(obj):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    47
    """Determine if the object instance is of a protected type.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    48
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    49
    Objects of protected types are preserved as-is when passed to
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    50
    force_unicode(strings_only=True).
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    51
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    52
    return isinstance(obj, (
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    53
        types.NoneType,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    54
        int, long,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    55
        datetime.datetime, datetime.date, datetime.time,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    56
        float, Decimal)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    57
    )
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    58
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    59
def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    60
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    61
    Similar to smart_unicode, except that lazy instances are resolved to
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    62
    strings, rather than kept as lazy objects.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    63
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    64
    If strings_only is True, don't convert (some) non-string-like objects.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    65
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    66
    if strings_only and is_protected_type(s):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    67
        return s
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    68
    try:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    69
        if not isinstance(s, basestring,):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    70
            if hasattr(s, '__unicode__'):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    71
                s = unicode(s)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    72
            else:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    73
                try:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    74
                    s = unicode(str(s), encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    75
                except UnicodeEncodeError:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    76
                    if not isinstance(s, Exception):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    77
                        raise
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    78
                    # If we get to here, the caller has passed in an Exception
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    79
                    # subclass populated with non-ASCII data without special
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    80
                    # handling to display as a string. We need to handle this
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    81
                    # without raising a further exception. We do an
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    82
                    # approximation to what the Exception's standard str()
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    83
                    # output should be.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    84
                    s = ' '.join([force_unicode(arg, encoding, strings_only,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    85
                            errors) for arg in s])
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    86
        elif not isinstance(s, unicode):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    87
            # Note: We use .decode() here, instead of unicode(s, encoding,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    88
            # errors), so that if s is a SafeString, it ends up being a
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    89
            # SafeUnicode at the end.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    90
            s = s.decode(encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    91
    except UnicodeDecodeError, e:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    92
        raise DjangoUnicodeDecodeError(s, *e.args)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    93
    return s
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    94
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    95
def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    96
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    97
    Returns a bytestring version of 's', encoded as specified in 'encoding'.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    98
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    99
    If strings_only is True, don't convert (some) non-string-like objects.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   100
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   101
    if strings_only and isinstance(s, (types.NoneType, int)):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   102
        return s
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   103
    if isinstance(s, Promise):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   104
        return unicode(s).encode(encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   105
    elif not isinstance(s, basestring):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   106
        try:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   107
            return str(s)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   108
        except UnicodeEncodeError:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   109
            if isinstance(s, Exception):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   110
                # An Exception subclass containing non-ASCII data that doesn't
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   111
                # know how to print itself properly. We shouldn't raise a
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   112
                # further exception.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   113
                return ' '.join([smart_str(arg, encoding, strings_only,
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   114
                        errors) for arg in s])
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   115
            return unicode(s).encode(encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   116
    elif isinstance(s, unicode):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   117
        return s.encode(encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   118
    elif s and encoding != 'utf-8':
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   119
        return s.decode('utf-8', errors).encode(encoding, errors)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   120
    else:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   121
        return s
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   122
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   123
def iri_to_uri(iri):
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   124
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   125
    Convert an Internationalized Resource Identifier (IRI) portion to a URI
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   126
    portion that is suitable for inclusion in a URL.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   127
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   128
    This is the algorithm from section 3.1 of RFC 3987.  However, since we are
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   129
    assuming input is either UTF-8 or unicode already, we can simplify things a
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   130
    little from the full method.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   131
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   132
    Returns an ASCII string containing the encoded result.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   133
    """
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   134
    # The list of safe characters here is constructed from the printable ASCII
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   135
    # characters that are not explicitly excluded by the list at the end of
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   136
    # section 3.1 of RFC 3987.
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   137
    if iri is None:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   138
        return iri
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   139
    return urllib.quote(smart_str(iri), safe='/#%[]=:;$&()+,!?*')
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   140
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   141
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   142
# The encoding of the default system locale but falls back to the
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   143
# given fallback encoding if the encoding is unsupported by python or could
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   144
# not be determined.  See tickets #10335 and #5846
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   145
try:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   146
    DEFAULT_LOCALE_ENCODING = locale.getdefaultlocale()[1] or 'ascii'
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   147
    codecs.lookup(DEFAULT_LOCALE_ENCODING)
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   148
except:
0d40e90630ef Blinkster creation
ymh <ymh.work@gmail.com>
parents:
diff changeset
   149
    DEFAULT_LOCALE_ENCODING = 'ascii'