web/lib/django/utils/encoding.py
changeset 38 77b6da96e6f1
equal deleted inserted replaced
37:8d941af65caf 38:77b6da96e6f1
       
     1 import types
       
     2 import urllib
       
     3 import locale
       
     4 import datetime
       
     5 import codecs
       
     6 from decimal import Decimal
       
     7 
       
     8 from django.utils.functional import Promise
       
     9 
       
    10 class DjangoUnicodeDecodeError(UnicodeDecodeError):
       
    11     def __init__(self, obj, *args):
       
    12         self.obj = obj
       
    13         UnicodeDecodeError.__init__(self, *args)
       
    14 
       
    15     def __str__(self):
       
    16         original = UnicodeDecodeError.__str__(self)
       
    17         return '%s. You passed in %r (%s)' % (original, self.obj,
       
    18                 type(self.obj))
       
    19 
       
    20 class StrAndUnicode(object):
       
    21     """
       
    22     A class whose __str__ returns its __unicode__ as a UTF-8 bytestring.
       
    23 
       
    24     Useful as a mix-in.
       
    25     """
       
    26     def __str__(self):
       
    27         return self.__unicode__().encode('utf-8')
       
    28 
       
    29 def smart_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
       
    30     """
       
    31     Returns a unicode object representing 's'. Treats bytestrings using the
       
    32     'encoding' codec.
       
    33 
       
    34     If strings_only is True, don't convert (some) non-string-like objects.
       
    35     """
       
    36     if isinstance(s, Promise):
       
    37         # The input is the result of a gettext_lazy() call.
       
    38         return s
       
    39     return force_unicode(s, encoding, strings_only, errors)
       
    40 
       
    41 def is_protected_type(obj):
       
    42     """Determine if the object instance is of a protected type.
       
    43 
       
    44     Objects of protected types are preserved as-is when passed to
       
    45     force_unicode(strings_only=True).
       
    46     """
       
    47     return isinstance(obj, (
       
    48         types.NoneType,
       
    49         int, long,
       
    50         datetime.datetime, datetime.date, datetime.time,
       
    51         float, Decimal)
       
    52     )
       
    53 
       
    54 def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
       
    55     """
       
    56     Similar to smart_unicode, except that lazy instances are resolved to
       
    57     strings, rather than kept as lazy objects.
       
    58 
       
    59     If strings_only is True, don't convert (some) non-string-like objects.
       
    60     """
       
    61     if strings_only and is_protected_type(s):
       
    62         return s
       
    63     try:
       
    64         if not isinstance(s, basestring,):
       
    65             if hasattr(s, '__unicode__'):
       
    66                 s = unicode(s)
       
    67             else:
       
    68                 try:
       
    69                     s = unicode(str(s), encoding, errors)
       
    70                 except UnicodeEncodeError:
       
    71                     if not isinstance(s, Exception):
       
    72                         raise
       
    73                     # If we get to here, the caller has passed in an Exception
       
    74                     # subclass populated with non-ASCII data without special
       
    75                     # handling to display as a string. We need to handle this
       
    76                     # without raising a further exception. We do an
       
    77                     # approximation to what the Exception's standard str()
       
    78                     # output should be.
       
    79                     s = ' '.join([force_unicode(arg, encoding, strings_only,
       
    80                             errors) for arg in s])
       
    81         elif not isinstance(s, unicode):
       
    82             # Note: We use .decode() here, instead of unicode(s, encoding,
       
    83             # errors), so that if s is a SafeString, it ends up being a
       
    84             # SafeUnicode at the end.
       
    85             s = s.decode(encoding, errors)
       
    86     except UnicodeDecodeError, e:
       
    87         if not isinstance(s, Exception):
       
    88             raise DjangoUnicodeDecodeError(s, *e.args)
       
    89         else:
       
    90             # If we get to here, the caller has passed in an Exception
       
    91             # subclass populated with non-ASCII bytestring data without a
       
    92             # working unicode method. Try to handle this without raising a
       
    93             # further exception by individually forcing the exception args
       
    94             # to unicode.
       
    95             s = ' '.join([force_unicode(arg, encoding, strings_only,
       
    96                     errors) for arg in s])
       
    97     return s
       
    98 
       
    99 def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
       
   100     """
       
   101     Returns a bytestring version of 's', encoded as specified in 'encoding'.
       
   102 
       
   103     If strings_only is True, don't convert (some) non-string-like objects.
       
   104     """
       
   105     if strings_only and isinstance(s, (types.NoneType, int)):
       
   106         return s
       
   107     if isinstance(s, Promise):
       
   108         return unicode(s).encode(encoding, errors)
       
   109     elif not isinstance(s, basestring):
       
   110         try:
       
   111             return str(s)
       
   112         except UnicodeEncodeError:
       
   113             if isinstance(s, Exception):
       
   114                 # An Exception subclass containing non-ASCII data that doesn't
       
   115                 # know how to print itself properly. We shouldn't raise a
       
   116                 # further exception.
       
   117                 return ' '.join([smart_str(arg, encoding, strings_only,
       
   118                         errors) for arg in s])
       
   119             return unicode(s).encode(encoding, errors)
       
   120     elif isinstance(s, unicode):
       
   121         return s.encode(encoding, errors)
       
   122     elif s and encoding != 'utf-8':
       
   123         return s.decode('utf-8', errors).encode(encoding, errors)
       
   124     else:
       
   125         return s
       
   126 
       
   127 def iri_to_uri(iri):
       
   128     """
       
   129     Convert an Internationalized Resource Identifier (IRI) portion to a URI
       
   130     portion that is suitable for inclusion in a URL.
       
   131 
       
   132     This is the algorithm from section 3.1 of RFC 3987.  However, since we are
       
   133     assuming input is either UTF-8 or unicode already, we can simplify things a
       
   134     little from the full method.
       
   135 
       
   136     Returns an ASCII string containing the encoded result.
       
   137     """
       
   138     # The list of safe characters here is constructed from the "reserved" and
       
   139     # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
       
   140     #     reserved    = gen-delims / sub-delims
       
   141     #     gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
       
   142     #     sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
       
   143     #                   / "*" / "+" / "," / ";" / "="
       
   144     #     unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
       
   145     # Of the unreserved characters, urllib.quote already considers all but
       
   146     # the ~ safe.
       
   147     # The % character is also added to the list of safe characters here, as the
       
   148     # end of section 3.1 of RFC 3987 specifically mentions that % must not be
       
   149     # converted.
       
   150     if iri is None:
       
   151         return iri
       
   152     return urllib.quote(smart_str(iri), safe="/#%[]=:;$&()+,!?*@'~")
       
   153 
       
   154 
       
   155 # The encoding of the default system locale but falls back to the
       
   156 # given fallback encoding if the encoding is unsupported by python or could
       
   157 # not be determined.  See tickets #10335 and #5846
       
   158 try:
       
   159     DEFAULT_LOCALE_ENCODING = locale.getdefaultlocale()[1] or 'ascii'
       
   160     codecs.lookup(DEFAULT_LOCALE_ENCODING)
       
   161 except:
       
   162     DEFAULT_LOCALE_ENCODING = 'ascii'