|
1 <?php |
|
2 |
|
/**
 * IDNA URL encoder
 *
 * Converts internationalised hostnames to their ASCII-compatible encoding
 * (ACE) form by applying the ToASCII operation to every dot-separated label.
 *
 * Note: Not fully compliant, as nameprep does nothing yet.
 *
 * @package Requests
 * @subpackage Utilities
 * @see https://tools.ietf.org/html/rfc3490 IDNA specification
 * @see https://tools.ietf.org/html/rfc3492 Punycode/Bootstrap specification
 */
class Requests_IDNAEncoder {
    /**
     * ACE prefix used for IDNA
     *
     * @see https://tools.ietf.org/html/rfc3490#section-5
     * @var string
     */
    const ACE_PREFIX = 'xn--';

    /**#@+
     * Bootstrap constant for Punycode
     *
     * @see https://tools.ietf.org/html/rfc3492#section-5
     * @var int
     */
    const BOOTSTRAP_BASE = 36;
    const BOOTSTRAP_TMIN = 1;
    const BOOTSTRAP_TMAX = 26;
    const BOOTSTRAP_SKEW = 38;
    const BOOTSTRAP_DAMP = 700;
    const BOOTSTRAP_INITIAL_BIAS = 72;
    const BOOTSTRAP_INITIAL_N = 128;
    /**#@-*/

    /**
     * Encode a hostname using Punycode
     *
     * The hostname is split on "." and every label is passed through
     * {@see Requests_IDNAEncoder::to_ascii()} independently, so any exception
     * raised for a single label aborts the whole conversion.
     *
     * @param string $string Hostname
     * @return string Punycode-encoded hostname
     */
    public static function encode($string) {
        $parts = explode('.', $string);

        // Assign by key rather than iterating by reference, so no dangling
        // reference to the last element outlives the loop.
        foreach ($parts as $index => $part) {
            $parts[$index] = self::to_ascii($part);
        }

        return implode('.', $parts);
    }

    /**
     * Convert a UTF-8 string to an ASCII string using Punycode
     *
     * Implements the ToASCII steps of RFC 3490 section 4.1 for a single label.
     * The result must be shorter than 64 bytes, i.e. at most 63 bytes long.
     *
     * @throws Requests_Exception Provided string is 64 or more ASCII characters long (`idna.provided_too_long`)
     * @throws Requests_Exception Prepared string is 64 or more ASCII characters long (`idna.prepared_too_long`)
     * @throws Requests_Exception Provided string already begins with xn-- in any capitalization (`idna.provided_is_prefixed`)
     * @throws Requests_Exception Encoded string is 64 or more ASCII characters long (`idna.encoded_too_long`)
     *
     * @param string $string ASCII or UTF-8 string (result limited to 63 bytes)
     * @return string ASCII string
     */
    public static function to_ascii($string) {
        // Step 1: Check if the string is already ASCII
        if (self::is_ascii($string)) {
            // Skip to step 7
            if (strlen($string) < 64) {
                return $string;
            }

            throw new Requests_Exception('Provided string is too long', 'idna.provided_too_long', $string);
        }

        // Step 2: nameprep
        $string = self::nameprep($string);

        // Step 3: UseSTD3ASCIIRules is false, continue
        // Step 4: Check if it's ASCII now
        if (self::is_ascii($string)) {
            // Skip to step 7
            if (strlen($string) < 64) {
                return $string;
            }

            throw new Requests_Exception('Prepared string is too long', 'idna.prepared_too_long', $string);
        }

        // Step 5: Check ACE prefix
        // RFC 3490 section 5 defines the prefix as "xn--" or any
        // capitalization thereof, so the comparison is case-insensitive.
        if (strncasecmp($string, self::ACE_PREFIX, strlen(self::ACE_PREFIX)) === 0) {
            throw new Requests_Exception('Provided string begins with ACE prefix', 'idna.provided_is_prefixed', $string);
        }

        // Step 6: Encode with Punycode
        $string = self::punycode_encode($string);

        // Step 7: Prepend ACE prefix
        $string = self::ACE_PREFIX . $string;

        // Step 8: Check size
        if (strlen($string) < 64) {
            return $string;
        }

        throw new Requests_Exception('Encoded string is too long', 'idna.encoded_too_long', $string);
    }

    /**
     * Check whether a given string contains only ASCII characters
     *
     * @internal (Testing found regex was the fastest implementation)
     *
     * @param string $string
     * @return bool Is the string ASCII-only?
     */
    protected static function is_ascii($string) {
        // The pattern matches any single byte outside 0x00-0x7F; the string
        // is ASCII-only when no such byte is found.
        return (preg_match('/(?:[^\x00-\x7F])/', $string) !== 1);
    }

    /**
     * Prepare a string for use as an IDNA name
     *
     * Currently a no-op: the input is returned unchanged.
     *
     * @todo Implement this based on RFC 3491 and the newer 5891
     * @param string $string
     * @return string Prepared string
     */
    protected static function nameprep($string) {
        return $string;
    }

    /**
     * Convert a UTF-8 string to a UCS-4 codepoint array
     *
     * Based on Requests_IRI::replace_invalid_with_pct_encoding()
     *
     * Decodes one to four byte sequences and rejects truncated sequences,
     * bad continuation bytes, non-shortest forms and the codepoint ranges
     * listed in the validity check below.
     *
     * @throws Requests_Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
     * @param string $input
     * @return array Unicode code points
     */
    protected static function utf8_to_codepoints($input) {
        $codepoints = array();

        // Get number of bytes
        $strlen = strlen($input);

        for ($position = 0; $position < $strlen; $position++) {
            $value = ord($input[$position]);

            // One byte sequence:
            if ((~$value & 0x80) === 0x80) {
                $character = $value;
                $length    = 1;
                $remaining = 0;
            }
            // Two byte sequence:
            elseif (($value & 0xE0) === 0xC0) {
                $character = ($value & 0x1F) << 6;
                $length    = 2;
                $remaining = 1;
            }
            // Three byte sequence:
            elseif (($value & 0xF0) === 0xE0) {
                $character = ($value & 0x0F) << 12;
                $length    = 3;
                $remaining = 2;
            }
            // Four byte sequence:
            elseif (($value & 0xF8) === 0xF0) {
                $character = ($value & 0x07) << 18;
                $length    = 4;
                $remaining = 3;
            }
            // Invalid byte:
            else {
                throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $value);
            }

            if ($remaining > 0) {
                // The sequence would run past the end of the input
                if ($position + $length > $strlen) {
                    throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
                }
                for ($position++; $remaining > 0; $position++) {
                    $value = ord($input[$position]);

                    // Continuation bytes must look like 10xxxxxx; anything
                    // else makes the whole sequence invalid.
                    if (($value & 0xC0) !== 0x80) {
                        throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
                    }

                    $character |= ($value & 0x3F) << (--$remaining * 6);
                }
                // Step back so the outer loop's increment lands on the byte
                // following this sequence.
                $position--;
            }

            if (
                // Non-shortest form sequences are invalid
                $length > 1 && $character <= 0x7F
                || $length > 2 && $character <= 0x7FF
                || $length > 3 && $character <= 0xFFFF
                // Outside of range of ucschar codepoints
                // Noncharacters
                || ($character & 0xFFFE) === 0xFFFE
                || $character >= 0xFDD0 && $character <= 0xFDEF
                || (
                    // Everything else not in ucschar
                    $character > 0xD7FF && $character < 0xF900
                    || $character < 0x20
                    || $character > 0x7E && $character < 0xA0
                    || $character > 0xEFFFD
                )
            ) {
                throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
            }

            $codepoints[] = $character;
        }

        return $codepoints;
    }

    /**
     * RFC3492-compliant encoder
     *
     * The returned string does not include the ACE prefix; see
     * {@see Requests_IDNAEncoder::to_ascii()} for that.
     *
     * @internal Pseudo-code from Section 6.3 is commented with "#" next to relevant code
     * @throws Requests_Exception On character outside of the domain (never happens with Punycode) (`idna.character_outside_domain`)
     * @throws Requests_Exception On invalid UTF-8 input (`idna.invalidcodepoint`)
     *
     * @param string $input UTF-8 encoded string to encode
     * @return string Punycode-encoded string
     */
    public static function punycode_encode($input) {
        $output = '';
        # let n = initial_n
        $n = self::BOOTSTRAP_INITIAL_N;
        # let delta = 0
        $delta = 0;
        # let bias = initial_bias
        $bias = self::BOOTSTRAP_INITIAL_BIAS;
        # let h = b = the number of basic code points in the input
        $h = 0; // counted in the loop below
        # copy them to the output in order
        $codepoints = self::utf8_to_codepoints($input);
        // The input length never changes, so count it once.
        $total    = count($codepoints);
        $extended = array();

        foreach ($codepoints as $char) {
            if ($char < 128) {
                // Character is valid ASCII
                // TODO: this should also check if it's valid for a URL
                $output .= chr($char);
                $h++;
            }
            // Check if the character is non-ASCII, but below initial n
            // This never occurs for Punycode, so ignore in coverage
            // @codeCoverageIgnoreStart
            elseif ($char < $n) {
                throw new Requests_Exception('Invalid character', 'idna.character_outside_domain', $char);
            }
            // @codeCoverageIgnoreEnd
            else {
                // Keyed by codepoint so duplicates collapse to one entry
                $extended[$char] = true;
            }
        }
        $extended = array_keys($extended);
        sort($extended);
        $b = $h;
        # [copy them] followed by a delimiter if b > 0
        if ($b > 0) {
            $output .= '-';
        }
        # {if the input contains a non-basic code point < n then fail}
        # while h < length(input) do begin
        while ($h < $total) {
            # let m = the minimum code point >= n in the input
            $m = array_shift($extended);
            # let delta = delta + (m - n) * (h + 1), fail on overflow
            $delta += ($m - $n) * ($h + 1);
            # let n = m
            $n = $m;
            # for each code point c in the input (in order) do begin
            for ($num = 0; $num < $total; $num++) {
                $c = $codepoints[$num];
                # if c < n then increment delta, fail on overflow
                if ($c < $n) {
                    $delta++;
                }
                # if c == n then begin
                elseif ($c === $n) {
                    # let q = delta
                    $q = $delta;
                    # for k = base to infinity in steps of base do begin
                    for ($k = self::BOOTSTRAP_BASE; ; $k += self::BOOTSTRAP_BASE) {
                        # let t = tmin if k <= bias {+ tmin}, or
                        #     tmax if k >= bias + tmax, or k - bias otherwise
                        if ($k <= ($bias + self::BOOTSTRAP_TMIN)) {
                            $t = self::BOOTSTRAP_TMIN;
                        }
                        elseif ($k >= ($bias + self::BOOTSTRAP_TMAX)) {
                            $t = self::BOOTSTRAP_TMAX;
                        }
                        else {
                            $t = $k - $bias;
                        }
                        # if q < t then break
                        if ($q < $t) {
                            break;
                        }
                        # output the code point for digit t + ((q - t) mod (base - t))
                        $digit   = $t + (($q - $t) % (self::BOOTSTRAP_BASE - $t));
                        $output .= self::digit_to_char($digit);
                        # let q = (q - t) div (base - t)
                        // Cast so q stays an integer for the modulo above and
                        // for digit_to_char() below.
                        $q = (int) floor(($q - $t) / (self::BOOTSTRAP_BASE - $t));
                        # end
                    }
                    # output the code point for digit q
                    $output .= self::digit_to_char($q);
                    # let bias = adapt(delta, h + 1, test h equals b?)
                    $bias = self::adapt($delta, $h + 1, $h === $b);
                    # let delta = 0
                    $delta = 0;
                    # increment h
                    $h++;
                    # end
                }
                # end
            }
            # increment delta and n
            $delta++;
            $n++;
            # end
        }

        return $output;
    }

    /**
     * Convert a digit to its respective character
     *
     * Digits 0-25 map to "a"-"z" and digits 26-35 map to "0"-"9".
     *
     * @see https://tools.ietf.org/html/rfc3492#section-5
     * @throws Requests_Exception On invalid digit (`idna.invalid_digit`)
     *
     * @param int $digit Digit in the range 0-35
     * @return string Single character corresponding to digit
     */
    protected static function digit_to_char($digit) {
        // @codeCoverageIgnoreStart
        // As far as I know, this never happens, but still good to be sure.
        if ($digit < 0 || $digit > 35) {
            throw new Requests_Exception(sprintf('Invalid digit %d', $digit), 'idna.invalid_digit', $digit);
        }
        // @codeCoverageIgnoreEnd
        $digits = 'abcdefghijklmnopqrstuvwxyz0123456789';
        return substr($digits, $digit, 1);
    }

    /**
     * Adapt the bias
     *
     * All divisions are integer divisions ("div" in the RFC pseudo-code), so
     * every floor() result is cast back to int and the returned bias is a
     * real integer rather than an integral float.
     *
     * @see https://tools.ietf.org/html/rfc3492#section-6.1
     * @param int $delta
     * @param int $numpoints
     * @param bool $firsttime
     * @return int New bias
     */
    protected static function adapt($delta, $numpoints, $firsttime) {
        # function adapt(delta,numpoints,firsttime):
        # if firsttime then let delta = delta div damp
        if ($firsttime) {
            $delta = (int) floor($delta / self::BOOTSTRAP_DAMP);
        }
        # else let delta = delta div 2
        else {
            $delta = (int) floor($delta / 2);
        }
        # let delta = delta + (delta div numpoints)
        $delta += (int) floor($delta / $numpoints);
        # let k = 0
        $k = 0;
        # while delta > ((base - tmin) * tmax) div 2 do begin
        $max = (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN) * self::BOOTSTRAP_TMAX) / 2);
        while ($delta > $max) {
            # let delta = delta div (base - tmin)
            $delta = (int) floor($delta / (self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN));
            # let k = k + base
            $k += self::BOOTSTRAP_BASE;
            # end
        }
        # return k + (((base - tmin + 1) * delta) div (delta + skew))
        return $k + (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN + 1) * $delta) / ($delta + self::BOOTSTRAP_SKEW));
    }
}