|
1 import types |
|
2 import urllib |
|
3 import locale |
|
4 import datetime |
|
5 import codecs |
|
6 from decimal import Decimal |
|
7 |
|
8 from django.utils.functional import Promise |
|
9 |
|
class DjangoUnicodeDecodeError(UnicodeDecodeError):
    """
    A UnicodeDecodeError that also remembers the object that could not be
    decoded.

    The offending object is kept on the ``obj`` attribute and is appended,
    together with its type, to the standard error message.
    """
    def __init__(self, obj, *args):
        # Store the culprit first; the remaining positional arguments are the
        # regular UnicodeDecodeError arguments and go to the base class.
        self.obj = obj
        UnicodeDecodeError.__init__(self, *args)

    def __str__(self):
        # Base-class message, followed by the repr and type of the object.
        details = (UnicodeDecodeError.__str__(self), self.obj, type(self.obj))
        return '%s. You passed in %r (%s)' % details
|
19 |
|
class StrAndUnicode(object):
    """
    Mix-in that derives __str__ from __unicode__.

    Subclasses provide __unicode__; __str__ then returns that text encoded
    as a UTF-8 bytestring.
    """
    def __str__(self):
        text = self.__unicode__()
        return text.encode('utf-8')
|
28 |
|
def smart_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
    """
    Return a unicode object representing 's'; bytestrings are interpreted
    using the 'encoding' codec.

    Promise instances (e.g. the result of a gettext_lazy() call) are handed
    back untouched so that they stay lazy. Everything else is delegated to
    force_unicode().

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    if not isinstance(s, Promise):
        return force_unicode(s, encoding, strings_only, errors)
    # Lazy object: leave it unevaluated.
    return s
|
40 |
|
def is_protected_type(obj):
    """Tell whether 'obj' is an instance of one of the protected types.

    Protected types are None, the integer types, floats, Decimals and the
    datetime/date/time classes. Instances of them are preserved as-is when
    passed to force_unicode(strings_only=True).
    """
    protected_types = (
        types.NoneType,
        int, long,
        float, Decimal,
        datetime.datetime, datetime.date, datetime.time,
    )
    return isinstance(obj, protected_types)
|
53 |
|
def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
    """
    Similar to smart_unicode, except that lazy instances are resolved to
    strings, rather than kept as lazy objects.

    If strings_only is True, don't convert (some) non-string-like objects:
    anything for which is_protected_type() is true is returned unchanged.

    Conversion rules, in order:

    * Objects that are not basestring instances are converted with
      unicode(s) when they define __unicode__, and otherwise with
      unicode(str(s), encoding, errors).
    * Bytestrings are decoded with s.decode(encoding, errors).
    * unicode objects are returned as they are.

    'encoding' and 'errors' are passed straight to the codec machinery.

    Raises DjangoUnicodeDecodeError (carrying the offending object) when
    decoding fails and 's' is not an Exception instance. When 's' is an
    Exception instance whose conversion fails, the items obtained by
    iterating over it are converted one by one and joined with single
    spaces instead of raising.
    """
    # Protected values (see is_protected_type) bypass conversion entirely.
    if strings_only and is_protected_type(s):
        return s
    try:
        if not isinstance(s, basestring,):
            if hasattr(s, '__unicode__'):
                s = unicode(s)
            else:
                try:
                    s = unicode(str(s), encoding, errors)
                except UnicodeEncodeError:
                    # Only Exception instances get the lenient fallback
                    # below; for anything else the error propagates.
                    if not isinstance(s, Exception):
                        raise
                    # If we get to here, the caller has passed in an Exception
                    # subclass populated with non-ASCII data without special
                    # handling to display as a string. We need to handle this
                    # without raising a further exception. We do an
                    # approximation to what the Exception's standard str()
                    # output should be.
                    s = ' '.join([force_unicode(arg, encoding, strings_only,
                            errors) for arg in s])
        elif not isinstance(s, unicode):
            # Note: We use .decode() here, instead of unicode(s, encoding,
            # errors), so that if s is a SafeString, it ends up being a
            # SafeUnicode at the end.
            s = s.decode(encoding, errors)
    except UnicodeDecodeError, e:
        if not isinstance(s, Exception):
            # Re-raise with the original object attached so the error
            # message can show what was passed in.
            raise DjangoUnicodeDecodeError(s, *e.args)
        else:
            # If we get to here, the caller has passed in an Exception
            # subclass populated with non-ASCII bytestring data without a
            # working unicode method. Try to handle this without raising a
            # further exception by individually forcing the exception args
            # to unicode.
            s = ' '.join([force_unicode(arg, encoding, strings_only,
                    errors) for arg in s])
    return s
|
98 |
|
def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
    """
    Return a bytestring version of 's', encoded as specified in 'encoding'.

    Incoming bytestrings are taken to be UTF-8 and are only re-encoded when
    a different target 'encoding' is requested.

    If strings_only is True, don't convert (some) non-string-like objects:
    None and int instances are returned unchanged.
    """
    if strings_only and isinstance(s, (types.NoneType, int)):
        return s

    # Lazy objects are resolved to unicode first, then encoded.
    if isinstance(s, Promise):
        return unicode(s).encode(encoding, errors)

    if isinstance(s, unicode):
        return s.encode(encoding, errors)

    if isinstance(s, basestring):
        # A bytestring: nothing to do when it is empty or the target is
        # already UTF-8, otherwise transcode from UTF-8.
        if not s or encoding == 'utf-8':
            return s
        return s.decode('utf-8', errors).encode(encoding, errors)

    # Any other object: rely on its str() representation.
    try:
        return str(s)
    except UnicodeEncodeError:
        if not isinstance(s, Exception):
            return unicode(s).encode(encoding, errors)
        # An Exception subclass containing non-ASCII data that doesn't
        # know how to print itself properly. We shouldn't raise a
        # further exception.
        pieces = [smart_str(arg, encoding, strings_only, errors)
                  for arg in s]
        return ' '.join(pieces)
|
126 |
|
def iri_to_uri(iri):
    """
    Turn a portion of an Internationalized Resource Identifier (IRI) into a
    URI portion that is suitable for inclusion in a URL.

    This follows the algorithm from section 3.1 of RFC 3987. Because the
    input is assumed to be either UTF-8 or unicode already, the full method
    can be simplified a little.

    Returns an ASCII string containing the encoded result. None is passed
    through unchanged.
    """
    # Characters that must be left unescaped. The set is built from the
    # "reserved" and "unreserved" characters of RFC 3986, sections 2.2
    # and 2.3:
    #     reserved    = gen-delims / sub-delims
    #     gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    #     sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
    #                   / "*" / "+" / "," / ";" / "="
    #     unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
    # urllib.quote already treats every unreserved character except "~" as
    # safe, so only "~" needs listing from that group. "%" is included as
    # well, because the end of section 3.1 of RFC 3987 specifically says
    # that % must not be converted.
    safe_chars = "/#%[]=:;$&()+,!?*@'~"
    if iri is not None:
        return urllib.quote(smart_str(iri), safe=safe_chars)
    return None
|
153 |
|
154 |
|
# Encoding of the default system locale. Falls back to 'ascii' when the
# locale's encoding cannot be determined or is not supported by Python.
# See tickets #10335 and #5846.
try:
    DEFAULT_LOCALE_ENCODING = locale.getdefaultlocale()[1] or 'ascii'
    # Raises if Python has no codec registered under that name.
    codecs.lookup(DEFAULT_LOCALE_ENCODING)
except Exception:
    # Deliberately broad best-effort fallback: any failure while probing the
    # locale yields 'ascii'. Catching Exception instead of using a bare
    # ``except:`` keeps KeyboardInterrupt and SystemExit from being
    # swallowed (on Python 2.5 and later).
    DEFAULT_LOCALE_ENCODING = 'ascii'