<?php

/**
 * HTML API: WP_HTML_Decoder class
 *
 * Decodes spans of raw text found inside HTML content.
 *
 * @package WordPress
 * @subpackage HTML-API
 * @since 6.6.0
 */
class WP_HTML_Decoder {
	/**
	 * Indicates if an attribute value starts with a given raw string value.
	 *
	 * Use this method to determine if an attribute value starts with a given string, regardless
	 * of how it might be encoded in HTML. For instance, `http:` could be represented as `http:`
	 * or as `http&colon;` or as `&#x68;ttp:` or as `h&#116;tp&colon;`, or in many other ways.
	 *
	 * The comparison walks both strings in lockstep: plain bytes are compared directly,
	 * while every character reference in the attribute value is decoded first and its
	 * decoded bytes are compared against the search text.
	 *
	 * Example:
	 *
	 *     $value = 'http&colon;//wordpress.org/';
	 *     true   === WP_HTML_Decoder::attribute_starts_with( $value, 'http:', 'ascii-case-insensitive' );
	 *     false  === WP_HTML_Decoder::attribute_starts_with( $value, 'https:', 'ascii-case-insensitive' );
	 *
	 *     // A value shorter than the search text cannot start with it.
	 *     false  === WP_HTML_Decoder::attribute_starts_with( 'http', 'http:' );
	 *
	 * @since 6.6.0
	 *
	 * @param string $haystack         String containing the raw non-decoded attribute value.
	 * @param string $search_text      Does the attribute value start with this plain string.
	 * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching.
	 *                                 Default 'case-sensitive'.
	 * @return bool Whether the attribute value starts with the given string.
	 */
	public static function attribute_starts_with( $haystack, $search_text, $case_sensitivity = 'case-sensitive' ) {
		$search_length = strlen( $search_text );
		$loose_case    = 'ascii-case-insensitive' === $case_sensitivity;
		$haystack_end  = strlen( $haystack );
		$search_at     = 0;
		$haystack_at   = 0;
		$token_length  = 0;

		while ( $search_at < $search_length && $haystack_at < $haystack_end ) {
			$chars_match = $loose_case
				? strtolower( $haystack[ $haystack_at ] ) === strtolower( $search_text[ $search_at ] )
				: $haystack[ $haystack_at ] === $search_text[ $search_at ];

			// Only an ampersand can introduce a character reference.
			$is_introducer = '&' === $haystack[ $haystack_at ];
			$next_chunk    = $is_introducer
				? self::read_character_reference( 'attribute', $haystack, $haystack_at, $token_length )
				: null;

			// If there's no character reference and the characters don't match, the match fails.
			if ( null === $next_chunk && ! $chars_match ) {
				return false;
			}

			// If there's no character reference but the characters do match, then it could still match.
			if ( null === $next_chunk && $chars_match ) {
				++$haystack_at;
				++$search_at;
				continue;
			}

			// If there is a character reference, then the decoded value must exactly match what follows in the search string.
			if ( 0 !== substr_compare( $search_text, $next_chunk, $search_at, strlen( $next_chunk ), $loose_case ) ) {
				return false;
			}

			// The character reference matched, so continue checking.
			$haystack_at += $token_length;
			$search_at   += strlen( $next_chunk );
		}

		/*
		 * The loop stops either because the whole search text was matched or
		 * because the attribute value ran out first. Only the former is a
		 * match: a value that ends early cannot start with the search text.
		 */
		return $search_at === $search_length;
	}

	/**
	 * Returns a string containing the decoded value of a given HTML text node.
	 *
	 * Text nodes appear in HTML DATA sections, which are the text segments inside
	 * and around tags, excepting SCRIPT and STYLE elements (and some others),
	 * whose inner text is not decoded. Use this function to read the decoded
	 * value of such a text span in an HTML document.
	 *
	 * Example:
	 *
	 *     '“😄”' === WP_HTML_Decoder::decode_text_node( '&#x93;&#x1f604;&#x94;' );
	 *
	 * @since 6.6.0
	 *
	 * @param string $text Text containing raw and non-decoded text node to decode.
	 * @return string Decoded UTF-8 value of given text node.
	 */
	public static function decode_text_node( $text ) {
		return static::decode( 'data', $text );
	}

	/**
	 * Returns a string containing the decoded value of a given HTML attribute.
	 *
	 * Text found inside an HTML attribute has different parsing rules than for
	 * text found inside other markup, or DATA segments. Use this function to
	 * read the decoded value of an HTML string inside a quoted attribute.
	 *
	 * Example:
	 *
	 *     '“😄”' === WP_HTML_Decoder::decode_attribute( '&#x93;&#x1f604;&#x94;' );
	 *
	 * @since 6.6.0
	 *
	 * @param string $text Text containing raw and non-decoded attribute value to decode.
	 * @return string Decoded UTF-8 value of given attribute value.
	 */
	public static function decode_attribute( $text ) {
		return static::decode( 'attribute', $text );
	}

	/**
	 * Decodes a span of HTML text, depending on the context in which it's found.
	 *
	 * This is a low-level method; prefer calling WP_HTML_Decoder::decode_attribute() or
	 * WP_HTML_Decoder::decode_text_node() instead. It's provided for cases where this
	 * may be difficult to do from calling code.
	 *
	 * When the text contains no decodable character reference the input string
	 * is returned as-is, without building a copy.
	 *
	 * Example:
	 *
	 *     '©' === WP_HTML_Decoder::decode( 'data', '&copy;' );
	 *
	 * @since 6.6.0
	 *
	 * @access private
	 *
	 * @param string $context `attribute` for decoding attribute values, `data` otherwise.
	 * @param string $text    Text document containing span of text to decode.
	 * @return string Decoded UTF-8 string.
	 */
	public static function decode( $context, $text ) {
		$decoded = '';
		$end     = strlen( $text );
		$at      = 0;

		// Start of the pending plain-text run not yet copied into `$decoded`.
		$was_at = 0;

		while ( $at < $end ) {
			$next_character_reference_at = strpos( $text, '&', $at );
			if ( false === $next_character_reference_at ) {
				break;
			}

			$character_reference = self::read_character_reference( $context, $text, $next_character_reference_at, $token_length );
			if ( null === $character_reference ) {
				/*
				 * This ampersand doesn't start a character reference. Resume the
				 * search right after it so that it's not found again; it remains
				 * part of the pending plain-text run.
				 */
				$at = $next_character_reference_at + 1;
				continue;
			}

			// Flush the plain text preceding the reference, then the decoded reference.
			$decoded .= substr( $text, $was_at, $next_character_reference_at - $was_at );
			$decoded .= $character_reference;
			$at       = $next_character_reference_at + $token_length;
			$was_at   = $at;
		}

		// Nothing was decoded, so the input is already the decoded value.
		if ( 0 === $was_at ) {
			return $text;
		}

		// Copy whatever plain text follows the last decoded reference.
		if ( $was_at < $end ) {
			$decoded .= substr( $text, $was_at, $end - $was_at );
		}

		return $decoded;
	}

	/**
	 * Attempt to read a character reference at the given location in a given string,
	 * depending on the context in which it's found.
	 *
	 * If a character reference is found, this function will return the translated value
	 * that the reference maps to. It will then set `$match_byte_length` to the
	 * number of bytes of input it read while consuming the character reference. This
	 * gives calling code the opportunity to advance its cursor when traversing a string
	 * and decoding.
	 *
	 * Example:
	 *
	 *     null === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships&hellip;', 0 );
	 *     '…'  === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships&hellip;', 5, $token_length );
	 *     8    === $token_length; // `&hellip;`
	 *
	 *     null === WP_HTML_Decoder::read_character_reference( 'attribute', '&notin', 0 );
	 *     '∉'  === WP_HTML_Decoder::read_character_reference( 'attribute', '&notin;', 0, $token_length );
	 *     7    === $token_length; // `&notin;`
	 *
	 *     '¬'  === WP_HTML_Decoder::read_character_reference( 'data', '&notin', 0, $token_length );
	 *     4    === $token_length; // `&not`
	 *     '∉'  === WP_HTML_Decoder::read_character_reference( 'data', '&notin;', 0, $token_length );
	 *     7    === $token_length; // `&notin;`
	 *
	 * @since 6.6.0
	 *
	 * @global WP_Token_Map $html5_named_character_references Mappings for HTML5 named character references.
	 *
	 * @param string $context            `attribute` for decoding attribute values, `data` otherwise.
	 * @param string $text               Text document containing span of text to decode.
	 * @param int    $at                 Optional. Byte offset into text where span begins, defaults to the beginning (0).
	 * @param int    &$match_byte_length Optional. Set to byte-length of character reference if provided and if a match
	 *                                   is found, otherwise not set. Default null.
	 * @return string|null Decoded character reference in UTF-8 if found, otherwise `null`.
	 */
	public static function read_character_reference( $context, $text, $at = 0, &$match_byte_length = null ) {
		/**
		 * Mappings for HTML5 named character references.
		 *
		 * @var WP_Token_Map $html5_named_character_references
		 */
		global $html5_named_character_references;

		// A reference needs at least the `&` and one byte following it.
		$length = strlen( $text );
		if ( $at + 1 >= $length ) {
			return null;
		}

		if ( '&' !== $text[ $at ] ) {
			return null;
		}

		/*
		 * Numeric character references.
		 *
		 * When truncated, these will encode the code point found by parsing the
		 * digits that are available. For example, when `&#x1f170;` is truncated
		 * to `&#x1f1` it will encode `Ǳ`. It does not:
		 *  - know how to parse the original `🅰`.
		 *  - fail to parse and return plaintext `&#x1f1`.
		 *  - fail to parse and return the replacement character `�`
		 */
		if ( '#' === $text[ $at + 1 ] ) {
			if ( $at + 2 >= $length ) {
				return null;
			}

			/** Tracks inner parsing within the numeric character reference. */
			$digits_at = $at + 2;

			if ( 'x' === $text[ $digits_at ] || 'X' === $text[ $digits_at ] ) {
				$numeric_base   = 16;
				$numeric_digits = '0123456789abcdefABCDEF';
				$max_digits     = 6; // &#x10FFFF;
				++$digits_at;
			} else {
				$numeric_base   = 10;
				$numeric_digits = '0123456789';
				$max_digits     = 7; // &#1114111;
			}

			/*
			 * Cannot encode invalid Unicode code points. Max is to U+10FFFF.
			 * Leading zeros are skipped so that they don't count against the
			 * maximum number of significant digits.
			 */
			$zero_count    = strspn( $text, '0', $digits_at );
			$digit_count   = strspn( $text, $numeric_digits, $digits_at + $zero_count );
			$after_digits  = $digits_at + $zero_count + $digit_count;
			$has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ];
			$end_of_span   = $has_semicolon ? $after_digits + 1 : $after_digits;

			// `&#` or `&#x` without digits returns into plaintext.
			if ( 0 === $digit_count && 0 === $zero_count ) {
				return null;
			}

			// Whereas `&#` and only zeros is invalid.
			if ( 0 === $digit_count ) {
				$match_byte_length = $end_of_span - $at;
				return '�';
			}

			// If there are too many digits then it's not worth parsing. It's invalid.
			if ( $digit_count > $max_digits ) {
				$match_byte_length = $end_of_span - $at;
				return '�';
			}

			$digits     = substr( $text, $digits_at + $zero_count, $digit_count );
			$code_point = intval( $digits, $numeric_base );

			/*
			 * Noncharacters, 0x0D, and non-ASCII-whitespace control characters.
			 *
			 * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
			 * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
			 * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
			 * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
			 * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
			 * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
			 *
			 * A C0 control is a code point that is in the range of U+00 to U+1F,
			 * but ASCII whitespace includes U+09, U+0A, U+0C, and U+0D.
			 *
			 * These characters are invalid but still decode as any valid character.
			 * This comment is here to note and explain why there's no check to
			 * remove these characters or replace them.
			 *
			 * @see https://infra.spec.whatwg.org/#noncharacter
			 */

			/*
			 * Code points in the C1 controls area need to be remapped as if they
			 * were stored in Windows-1252. Note! This transformation only happens
			 * for numeric character references. The raw code points in the byte
			 * stream are not translated.
			 *
			 * > If the number is one of the numbers in the first column of
			 * > the following table, then find the row with that number in
			 * > the first column, and set the character reference code to
			 * > the number in the second column of that row.
			 */
			if ( $code_point >= 0x80 && $code_point <= 0x9F ) {
				$windows_1252_mapping = array(
					0x20AC, // 0x80 -> EURO SIGN (€).
					0x81,   // 0x81 -> (no change).
					0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚).
					0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ).
					0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („).
					0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…).
					0x2020, // 0x86 -> DAGGER (†).
					0x2021, // 0x87 -> DOUBLE DAGGER (‡).
					0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ).
					0x2030, // 0x89 -> PER MILLE SIGN (‰).
					0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š).
					0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹).
					0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ).
					0x8D,   // 0x8D -> (no change).
					0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž).
					0x8F,   // 0x8F -> (no change).
					0x90,   // 0x90 -> (no change).
					0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘).
					0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’).
					0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“).
					0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”).
					0x2022, // 0x95 -> BULLET (•).
					0x2013, // 0x96 -> EN DASH (–).
					0x2014, // 0x97 -> EM DASH (—).
					0x02DC, // 0x98 -> SMALL TILDE (˜).
					0x2122, // 0x99 -> TRADE MARK SIGN (™).
					0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š).
					0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›).
					0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ).
					0x9D,   // 0x9D -> (no change).
					0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž).
					0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ).
				);

				$code_point = $windows_1252_mapping[ $code_point - 0x80 ];
			}

			$match_byte_length = $end_of_span - $at;
			return self::code_point_to_utf8_bytes( $code_point );
		}

		/** Tracks inner parsing within the named character reference. */
		$name_at = $at + 1;
		// Minimum named character reference is two characters. E.g. `GT`.
		if ( $name_at + 2 > $length ) {
			return null;
		}

		$name_length = 0;
		$replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length );

		/*
		 * Anything other than a string means the lookup found no name. Both
		 * `false` and `null` are treated as "no match" so that a failed lookup
		 * can never fall through and report a match length.
		 */
		if ( ! is_string( $replacement ) ) {
			return null;
		}

		$after_name = $name_at + $name_length;

		// If the match ended with a semicolon then it should always be decoded.
		if ( ';' === $text[ $after_name - 1 ] ) {
			$match_byte_length = $after_name - $at;
			return $replacement;
		}

		/*
		 * At this point though there's a match for an entry in the named
		 * character reference table but the match doesn't end in `;`.
		 * It may be allowed if it's followed by something unambiguous.
		 */
		$ambiguous_follower = (
			$after_name < $length &&
			(
				ctype_alnum( $text[ $after_name ] ) ||
				'=' === $text[ $after_name ]
			)
		);

		// It's ambiguous, which isn't allowed inside attributes.
		if ( $ambiguous_follower && 'attribute' === $context ) {
			return null;
		}

		// Either it's non-ambiguous, or it's ambiguous but found outside of an attribute.
		$match_byte_length = $after_name - $at;
		return $replacement;
	}

	/**
	 * Encode a code point number into the UTF-8 encoding.
	 *
	 * This encoder implements the UTF-8 encoding algorithm for converting
	 * a code point into a byte sequence. If it receives an invalid code
	 * point it will return the Unicode Replacement Character U+FFFD `�`.
	 *
	 * Example:
	 *
	 *     '🅰' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0x1f170 );
	 *
	 *     // Half of a surrogate pair is an invalid code point.
	 *     '�' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0xd83c );
	 *
	 * @since 6.6.0
	 *
	 * @see https://www.rfc-editor.org/rfc/rfc3629 For the UTF-8 standard.
	 *
	 * @param int $code_point Which code point to convert.
	 * @return string Converted code point, or `�` if invalid.
	 */
	public static function code_point_to_utf8_bytes( $code_point ) {
		// Pre-check to ensure a valid code point: not zero or negative, not a surrogate, not past U+10FFFF.
		if (
			$code_point <= 0 ||
			( $code_point >= 0xD800 && $code_point <= 0xDFFF ) ||
			$code_point > 0x10FFFF
		) {
			return '�';
		}

		// One byte: 0xxxxxxx.
		if ( $code_point <= 0x7F ) {
			return chr( $code_point );
		}

		// Two bytes: 110xxxxx 10xxxxxx.
		if ( $code_point <= 0x7FF ) {
			$byte1 = ( $code_point >> 6 ) | 0xC0;
			$byte2 = ( $code_point & 0x3F ) | 0x80;

			return pack( 'CC', $byte1, $byte2 );
		}

		// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
		if ( $code_point <= 0xFFFF ) {
			$byte1 = ( $code_point >> 12 ) | 0xE0;
			$byte2 = ( ( $code_point >> 6 ) & 0x3F ) | 0x80;
			$byte3 = ( $code_point & 0x3F ) | 0x80;

			return pack( 'CCC', $byte1, $byte2, $byte3 );
		}

		// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
		// Any values above U+10FFFF are eliminated above in the pre-check.
		$byte1 = ( $code_point >> 18 ) | 0xF0;
		$byte2 = ( ( $code_point >> 12 ) & 0x3F ) | 0x80;
		$byte3 = ( ( $code_point >> 6 ) & 0x3F ) | 0x80;
		$byte4 = ( $code_point & 0x3F ) | 0x80;

		return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 );
	}
}