|
1 <?php |
|
2 |
|
3 /** |
|
4 * Class for efficiently looking up and mapping string keys to string values, with limits. |
|
5 * |
|
6 * @package WordPress |
|
7 * @since 6.6.0 |
|
8 */ |
|
9 |
|
10 /** |
|
11 * WP_Token_Map class. |
|
12 * |
|
13 * Use this class in specific circumstances with a static set of lookup keys which map to |
|
14 * a static set of transformed values. For example, this class is used to map HTML named |
|
15 * character references to their equivalent UTF-8 values. |
|
16 * |
|
17 * This class works differently than code calling `in_array()` and other methods. It |
|
18 * internalizes lookup logic and provides helper interfaces to optimize lookup and |
|
19 * transformation. It provides a method for precomputing the lookup tables and storing |
|
20 * them as PHP source code. |
|
21 * |
|
22 * All tokens and substitutions must be shorter than 256 bytes. |
|
23 * |
|
24 * Example: |
|
25 * |
|
 *     $smilies = WP_Token_Map::from_array( array(
 *         '8O' => '😯',
 *         ':(' => '🙁',
 *         ':)' => '🙂',
 *         ':?' => '😕',
 *     ) );
 *
 *     true  === $smilies->contains( ':)' );
 *     false === $smilies->contains( 'simile' );
 *
 *     '😕' === $smilies->read_token( 'Not sure :?.', 9, $length_of_smily_syntax );
 *     2    === $length_of_smily_syntax;
|
38 * |
|
39 * ## Precomputing the Token Map. |
|
40 * |
|
41 * Creating the class involves some work sorting and organizing the tokens and their |
|
42 * replacement values. In order to skip this, it's possible for the class to export |
|
43 * its state and be used as actual PHP source code. |
|
44 * |
|
45 * Example: |
|
46 * |
|
47 * // Export with four spaces as the indent, only for the sake of this docblock. |
|
48 * // The default indent is a tab character. |
|
49 * $indent = ' '; |
|
50 * echo $smilies->precomputed_php_source_table( $indent ); |
|
51 * |
|
52 * // Output, to be pasted into a PHP source file: |
|
 *     WP_Token_Map::from_precomputed_table(
 *         array(
 *             "storage_version" => "6.6.0-trunk",
 *             "key_length" => 2,
 *             "groups" => "",
 *             "large_words" => array(),
 *             "small_words" => "8O\x00:(\x00:)\x00:?\x00",
 *             "small_mappings" => array( "😯", "🙁", "🙂", "😕" )
 *         )
 *     );
|
63 * |
|
64 * ## Large vs. small words. |
|
65 * |
|
66 * This class uses a short prefix called the "key" to optimize lookup of its tokens. |
|
67 * This means that some tokens may be shorter than or equal in length to that key. |
|
68 * Those words that are longer than the key are called "large" while those shorter |
|
69 * than or equal to the key length are called "small." |
|
70 * |
|
71 * This separation of large and small words is incidental to the way this class |
|
72 * optimizes lookup, and should be considered an internal implementation detail |
|
73 * of the class. It may still be important to be aware of it, however. |
|
74 * |
|
75 * ## Determining Key Length. |
|
76 * |
|
77 * The choice of the size of the key length should be based on the data being stored in |
|
78 * the token map. It should divide the data as evenly as possible, but should not create |
|
79 * so many groups that a large fraction of the groups only contain a single token. |
|
80 * |
|
81 * For the HTML5 named character references, a key length of 2 was found to provide a |
|
82 * sufficient spread and should be a good default for relatively large sets of tokens. |
|
83 * |
|
84 * However, for some data sets this might be too long. For example, a list of smilies |
|
85 * may be too small for a key length of 2. Perhaps 1 would be more appropriate. It's |
|
86 * best to experiment and determine empirically which values are appropriate. |
|
87 * |
|
88 * ## Generate Pre-Computed Source Code. |
|
89 * |
|
90 * Since the `WP_Token_Map` is designed for relatively static lookups, it can be |
|
91 * advantageous to precompute the values and instantiate a table that has already |
|
92 * sorted and grouped the tokens and built the lookup strings. |
|
93 * |
|
94 * This can be done with `WP_Token_Map::precomputed_php_source_table()`. |
|
95 * |
|
96 * Note that if there is a leading character that all tokens need, such as `&` for |
|
97 * HTML named character references, it can be beneficial to exclude this from the |
|
98 * token map. Instead, find occurrences of the leading character and then use the |
|
99 * token map to see if the following characters complete the token. |
|
100 * |
|
101 * Example: |
|
102 * |
|
 *     $map = WP_Token_Map::from_array( array( 'simple_smile:' => '🙂', 'sob:' => '😭', 'soba:' => '🍜' ) );
 *     echo $map->precomputed_php_source_table();
 *     // Output
 *     WP_Token_Map::from_precomputed_table(
 *         array(
 *             "storage_version" => "6.6.0-trunk",
 *             "key_length" => 2,
 *             "groups" => "si\x00so\x00",
 *             "large_words" => array(
 *                 // simple_smile:[🙂].
 *                 "\x0bmple_smile:\x04🙂",
 *                 // soba:[🍜] sob:[😭].
 *                 "\x03ba:\x04🍜\x02b:\x04😭",
 *             ),
 *             "small_words" => "",
 *             "small_mappings" => array()
 *         )
 *     );
|
121 * |
|
122 * This precomputed value can be stored directly in source code and will skip the |
|
123 * startup cost of generating the lookup strings. See `$html5_named_character_entities`. |
|
124 * |
|
125 * Note that any updates to the precomputed format should update the storage version |
|
126 * constant. It would also be best to provide an update function to take older known |
|
127 * versions and upgrade them in place when loading into `from_precomputed_table()`. |
|
128 * |
|
129 * ## Future Direction. |
|
130 * |
|
131 * It may be viable to dynamically increase the length limits such that there's no need to impose them. |
|
132 * The limit appears because of the packing structure, which indicates how many bytes each segment of |
|
133 * text in the lookup tables spans. If, however, care were taken to track the longest word length, then |
|
134 * the packing structure could change its representation to allow for that. Each additional byte storing |
|
135 * length, however, increases the memory overhead and lookup runtime. |
|
136 * |
|
137 * An alternative approach could be to borrow the UTF-8 variable-length encoding and store lengths of less |
|
138 * than 127 as a single byte with the high bit unset, storing longer lengths as the combination of |
|
139 * continuation bytes. |
|
140 * |
|
141 * Since it has not been shown during the development of this class that longer strings are required, this |
|
142 * update is deferred until such a need is clear. |
|
143 * |
|
144 * @since 6.6.0 |
|
145 */ |
|
class WP_Token_Map {
	/**
	 * Denotes the version of the code which produces pre-computed source tables.
	 *
	 * `from_precomputed_table()` refuses to load state whose `storage_version`
	 * is not identical to this value. Choosing a name that corresponds to the
	 * WordPress release helps people identify where an old copy of data came from.
	 *
	 * @since 6.6.0
	 */
	const STORAGE_VERSION = '6.6.0-trunk';

	/**
	 * Exclusive upper bound, in bytes, on the length of each key and each
	 * transformed value in the table.
	 *
	 * Lengths are packed into a single unsigned byte, so the longest storable
	 * key or value is 255 bytes.
	 *
	 * @since 6.6.0
	 */
	const MAX_LENGTH = 256;

	/**
	 * How many bytes of each key are used to form a group key for lookup.
	 * This also determines whether a word is considered small or large.
	 *
	 * @since 6.6.0
	 *
	 * @var int
	 */
	private $key_length = 2;

	/**
	 * Stores an optimized form of the word set, where words are grouped
	 * by a prefix of the `$key_length` and then collapsed into a string.
	 *
	 * In each group, the keys and lookups form a packed data structure.
	 * The keys in the string are stripped of their "group key," which is
	 * the prefix of length `$this->key_length` shared by all of the items
	 * in the group. Each word in the string is prefixed by a single byte
	 * whose raw unsigned integer value represents how many bytes follow.
	 *
	 *     ┌────────────────┬───────────────┬─────────────────┬────────┐
	 *     │ Length of rest │ Rest of key   │ Length of value │ Value  │
	 *     │ of key (bytes) │               │ (bytes)         │        │
	 *     ├────────────────┼───────────────┼─────────────────┼────────┤
	 *     │ 0x08           │ nterDot;      │ 0x02            │ ·      │
	 *     └────────────────┴───────────────┴─────────────────┴────────┘
	 *
	 * In this example, the key `CenterDot;` has a group key `Ce`, leaving
	 * eight bytes for the rest of the key, `nterDot;`, and two bytes for
	 * the transformed value `·` (or U+B7 or "\xC2\xB7").
	 *
	 * Example:
	 *
	 *     // Stores array( 'CenterDot;' => '·', 'Cedilla;' => '¸' ).
	 *     $groups      = "Ce\x00";
	 *     $large_words = array( "\x08nterDot;\x02·\x06dilla;\x02¸" )
	 *
	 * The prefixes appear in the `$groups` string, each followed by a null
	 * byte. This makes for quick lookup of where in the group string the key
	 * is found, and then a simple division converts that offset into the index
	 * in the `$large_words` array where the group string is to be found.
	 *
	 * @since 6.6.0
	 *
	 * @var array
	 */
	private $large_words = array();

	/**
	 * Stores the group keys for sequential string lookup.
	 *
	 * Every entry is `$this->key_length + 1` bytes long: the group key followed
	 * by a null byte. The offset of an entry divided by that entry size is the
	 * index into `$large_words` where the rest of the group is stored.
	 *
	 * @since 6.6.0
	 *
	 * @var string
	 */
	private $groups = '';

	/**
	 * Stores an optimized row of small words, where every entry is
	 * `$this->key_length + 1` bytes long and zero-extended.
	 *
	 * This packing allows for direct lookup of a short word followed
	 * by the null byte, if extended to `$this->key_length + 1`.
	 *
	 * Example:
	 *
	 *     // Stores array( 'GT', 'LT', 'gt', 'lt' ).
	 *     "GT\x00LT\x00gt\x00lt\x00"
	 *
	 * @since 6.6.0
	 *
	 * @var string
	 */
	private $small_words = '';

	/**
	 * Replacements for the small words, in the same order they appear.
	 *
	 * With the position of a small word it's possible to index the translation
	 * directly, as its position in the `$small_words` string corresponds to
	 * the index of the replacement in the `$small_mappings` array.
	 *
	 * Example:
	 *
	 *     array( '>', '<', '>', '<' )
	 *
	 * @since 6.6.0
	 *
	 * @var string[]
	 */
	private $small_mappings = array();

	/**
	 * Create a token map using an associative array of key/value pairs as the input.
	 *
	 * Example:
	 *
	 *     $smilies = WP_Token_Map::from_array( array(
	 *         '8O' => '😯',
	 *         ':(' => '🙁',
	 *         ':)' => '🙂',
	 *         ':?' => '😕',
	 *     ) );
	 *
	 * @since 6.6.0
	 *
	 * @param array $mappings   The keys transform into the values, both are strings.
	 * @param int   $key_length Determines the group key length. Leave at the default value
	 *                          of 2 unless there's an empirical reason to change it.
	 *
	 * @return WP_Token_Map|null Token map, or `null` when any key or value
	 *                           is `MAX_LENGTH` bytes or longer.
	 */
	public static function from_array( $mappings, $key_length = 2 ) {
		$map             = new WP_Token_Map();
		$map->key_length = $key_length;

		// Start by grouping words.

		$groups = array();
		$shorts = array();
		foreach ( $mappings as $word => $mapping ) {
			/*
			 * PHP turns integer-like array keys such as '42' into integers.
			 * Restore the string form so the byte-wise work below is uniform.
			 */
			$word   = (string) $word;
			$length = strlen( $word );

			if ( self::MAX_LENGTH <= $length || self::MAX_LENGTH <= strlen( $mapping ) ) {
				_doing_it_wrong(
					__METHOD__,
					sprintf(
						/* translators: 1: maximum byte length (a count) */
						__( 'Token Map tokens and substitutions must all be shorter than %1$d bytes.' ),
						self::MAX_LENGTH
					),
					'6.6.0'
				);
				return null;
			}

			// Words no longer than the group key are stored in the small-word row.
			if ( $key_length >= $length ) {
				$shorts[] = $word;
				continue;
			}

			$group = substr( $word, 0, $key_length );

			if ( ! isset( $groups[ $group ] ) ) {
				$groups[ $group ] = array();
			}

			$groups[ $group ][] = array( substr( $word, $key_length ), $mapping );
		}

		/*
		 * Sort the words to ensure that no smaller substring of a match masks the full match.
		 * For example, `Cap` should not match before `CapitalDifferentialD`.
		 */
		usort( $shorts, 'WP_Token_Map::longest_first_then_alphabetical' );
		foreach ( $groups as $group_key => $group ) {
			usort(
				$groups[ $group_key ],
				static function ( $a, $b ) {
					return self::longest_first_then_alphabetical( $a[0], $b[0] );
				}
			);
		}

		// Finally construct the optimized lookups.

		foreach ( $shorts as $word ) {
			$map->small_words     .= str_pad( $word, $key_length + 1, "\x00", STR_PAD_RIGHT );
			$map->small_mappings[] = $mappings[ $word ];
		}

		// Byte-wise ordering keeps the table deterministic even for numeric-looking keys.
		$group_keys = array_keys( $groups );
		sort( $group_keys, SORT_STRING );

		foreach ( $group_keys as $group ) {
			$map->groups .= "{$group}\x00";

			$group_string = '';

			foreach ( $groups[ $group ] as $group_word ) {
				list( $word, $mapping ) = $group_word;

				// Each segment is prefixed by its byte length packed into a single byte.
				$group_string .= chr( strlen( $word ) ) . $word . chr( strlen( $mapping ) ) . $mapping;
			}

			$map->large_words[] = $group_string;
		}

		return $map;
	}

	/**
	 * Creates a token map from a pre-computed table.
	 * This skips the initialization cost of generating the table.
	 *
	 * This function should only be used to load data created with
	 * WP_Token_Map::precomputed_php_source_table().
	 *
	 * @since 6.6.0
	 *
	 * @param array $state {
	 *     Stores pre-computed state for directly loading into a Token Map.
	 *
	 *     @type string $storage_version Which version of the code produced this state.
	 *     @type int    $key_length      Group key length.
	 *     @type string $groups          Group lookup index.
	 *     @type array  $large_words     Large word groups and packed strings.
	 *     @type string $small_words     Small words packed string.
	 *     @type array  $small_mappings  Small word mappings.
	 * }
	 *
	 * @return WP_Token_Map|null Map with precomputed data loaded, or `null` when a
	 *                           required entry is missing or the storage version
	 *                           differs from `STORAGE_VERSION`.
	 */
	public static function from_precomputed_table( $state ) {
		$has_necessary_state = isset(
			$state['storage_version'],
			$state['key_length'],
			$state['groups'],
			$state['large_words'],
			$state['small_words'],
			$state['small_mappings']
		);

		if ( ! $has_necessary_state ) {
			_doing_it_wrong(
				__METHOD__,
				__( 'Missing required inputs to pre-computed WP_Token_Map.' ),
				'6.6.0'
			);
			return null;
		}

		if ( self::STORAGE_VERSION !== $state['storage_version'] ) {
			_doing_it_wrong(
				__METHOD__,
				/* translators: 1: version string, 2: version string. */
				sprintf( __( 'Loaded version \'%1$s\' incompatible with expected version \'%2$s\'.' ), $state['storage_version'], self::STORAGE_VERSION ),
				'6.6.0'
			);
			return null;
		}

		$map = new WP_Token_Map();

		$map->key_length     = $state['key_length'];
		$map->groups         = $state['groups'];
		$map->large_words    = $state['large_words'];
		$map->small_words    = $state['small_words'];
		$map->small_mappings = $state['small_mappings'];

		return $map;
	}

	/**
	 * Indicates if a given word is a lookup key in the map.
	 *
	 * Example:
	 *
	 *     true  === $smilies->contains( ':)' );
	 *     false === $smilies->contains( 'simile' );
	 *
	 * @since 6.6.0
	 *
	 * @param string $word             Determine if this word is a lookup key in the map.
	 * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
	 * @return bool Whether there's an entry for the given word in the map.
	 */
	public function contains( $word, $case_sensitivity = 'case-sensitive' ) {
		$ignore_case = 'ascii-case-insensitive' === $case_sensitivity;
		$stride      = $this->key_length + 1;

		if ( $this->key_length >= strlen( $word ) ) {
			if ( 0 === strlen( $this->small_words ) ) {
				return false;
			}

			$term = str_pad( $word, $stride, "\x00", STR_PAD_RIGHT );

			return false !== self::find_aligned( $this->small_words, $term, $stride, $ignore_case );
		}

		$group_key = substr( $word, 0, $this->key_length );
		$slug      = substr( $word, $this->key_length );
		$length    = strlen( $slug );

		/*
		 * When ignoring case more than one group may match the group key
		 * (e.g. `AB` and `ab`), so every aligned match has to be examined.
		 */
		$group_at = self::find_aligned( $this->groups, $group_key, $stride, $ignore_case );
		while ( false !== $group_at ) {
			$group        = $this->large_words[ (int) ( $group_at / $stride ) ];
			$group_length = strlen( $group );
			$at           = 0;

			while ( $at < $group_length ) {
				$token_length   = ord( $group[ $at++ ] );
				$token_at       = $at;
				$at            += $token_length;
				$mapping_length = ord( $group[ $at++ ] );

				if ( $token_length === $length && 0 === substr_compare( $group, $slug, $token_at, $token_length, $ignore_case ) ) {
					return true;
				}

				// Skip over the mapping to reach the next packed entry.
				$at += $mapping_length;
			}

			$group_at = self::find_aligned( $this->groups, $group_key, $stride, $ignore_case, $group_at + $stride );
		}

		return false;
	}

	/**
	 * If the text starting at a given offset is a lookup key in the map,
	 * return the corresponding transformation from the map, else `null`.
	 *
	 * This function returns the translated string, but accepts an optional
	 * parameter `$matched_token_byte_length`, which communicates how many
	 * bytes long the lookup key was, if it found one. This can be used to
	 * advance a cursor in calling code if a lookup key was found.
	 *
	 * Large words are tried before small words. Among large words the
	 * longest matching lookup key wins.
	 *
	 * Example:
	 *
	 *     null === $smilies->read_token( 'Not sure :?.', 0, $token_byte_length );
	 *     '😕' === $smilies->read_token( 'Not sure :?.', 9, $token_byte_length );
	 *     2    === $token_byte_length;
	 *
	 * Example:
	 *
	 *     $output = '';
	 *     $at     = 0;
	 *     $end    = strlen( $input );
	 *     while ( $at < $end ) {
	 *         $next_at = strpos( $input, ':', $at );
	 *         if ( false === $next_at ) {
	 *             break;
	 *         }
	 *
	 *         $smily = $smilies->read_token( $input, $next_at, $token_byte_length );
	 *         if ( null === $smily ) {
	 *             // Not a smily: keep the colon and move past it.
	 *             $output .= substr( $input, $at, $next_at + 1 - $at );
	 *             $at      = $next_at + 1;
	 *             continue;
	 *         }
	 *
	 *         $output .= substr( $input, $at, $next_at - $at ) . $smily;
	 *         $at      = $next_at + $token_byte_length;
	 *     }
	 *     $output .= substr( $input, $at );
	 *
	 * @since 6.6.0
	 *
	 * @param string $text                       String in which to search for a lookup key.
	 * @param int    $offset                     Optional. How many bytes into the string where the lookup key ought to start. Default 0.
	 * @param ?int   &$matched_token_byte_length Optional. Holds byte-length of found token matched, otherwise not set. Default null.
	 * @param string $case_sensitivity           Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
	 * @return string|null Mapped value of lookup key if found, otherwise `null`.
	 */
	public function read_token( $text, $offset = 0, &$matched_token_byte_length = null, $case_sensitivity = 'case-sensitive' ) {
		$ignore_case = 'ascii-case-insensitive' === $case_sensitivity;
		$remaining   = strlen( $text ) - $offset;

		// There is nothing to read before the start or at/after the end of the text.
		if ( $offset < 0 || $remaining <= 0 ) {
			return null;
		}

		/*
		 * Search for a long word first, if enough text remains after the offset,
		 * and if that fails, a short one. Measuring from the offset (rather than
		 * the whole text) guarantees a full-length group key and keeps the
		 * comparison offset below inside the text.
		 */
		if ( $remaining > $this->key_length ) {
			$stride       = $this->key_length + 1;
			$group_key    = substr( $text, $offset, $this->key_length );
			$rest_at      = $offset + $this->key_length;
			$best_length  = 0;
			$best_mapping = null;

			/*
			 * Only matches aligned to an entry boundary identify a group; a
			 * misaligned match can occur when the text contains a null byte.
			 * When ignoring case several groups may match, so all are examined
			 * and the longest matching token is kept.
			 */
			$group_at = self::find_aligned( $this->groups, $group_key, $stride, $ignore_case );
			while ( false !== $group_at ) {
				$group        = $this->large_words[ (int) ( $group_at / $stride ) ];
				$group_length = strlen( $group );
				$at           = 0;

				while ( $at < $group_length ) {
					$token_length   = ord( $group[ $at++ ] );
					$token          = substr( $group, $at, $token_length );
					$at            += $token_length;
					$mapping_length = ord( $group[ $at++ ] );

					if ( 0 === substr_compare( $text, $token, $rest_at, $token_length, $ignore_case ) ) {
						if ( $token_length > $best_length ) {
							$best_length  = $token_length;
							$best_mapping = substr( $group, $at, $mapping_length );
						}

						// Tokens are stored longest-first, so this is the group's best match.
						break;
					}

					$at += $mapping_length;
				}

				$group_at = self::find_aligned( $this->groups, $group_key, $stride, $ignore_case, $group_at + $stride );
			}

			if ( $best_length > 0 ) {
				$matched_token_byte_length = $this->key_length + $best_length;
				return $best_mapping;
			}
		}

		// Perhaps a short word then.
		return strlen( $this->small_words ) > 0
			? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity )
			: null;
	}

	/**
	 * Finds a match for a short word at the index.
	 *
	 * @since 6.6.0
	 *
	 * @param string $text                       String in which to search for a lookup key.
	 * @param int    $offset                     How many bytes into the string where the lookup key ought to start.
	 * @param ?int   &$matched_token_byte_length Holds byte-length of found lookup key if matched, otherwise not set.
	 * @param string $case_sensitivity           Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
	 * @return string|null Mapped value of lookup key if found, otherwise `null`.
	 */
	private function read_small_token( $text, $offset, &$matched_token_byte_length, $case_sensitivity = 'case-sensitive' ) {
		$ignore_case   = 'ascii-case-insensitive' === $case_sensitivity;
		$stride        = $this->key_length + 1;
		$small_length  = strlen( $this->small_words );
		$search_text   = (string) substr( $text, $offset, $this->key_length );
		$search_length = strlen( $search_text );

		// No text at the offset means no token can start there.
		if ( 0 === $search_length ) {
			return null;
		}

		if ( $ignore_case ) {
			$search_text = strtoupper( $search_text );
		}

		for ( $at = 0; $at < $small_length; $at += $stride ) {
			for ( $i = 0; $i < $this->key_length; $i++ ) {
				$stored = $this->small_words[ $at + $i ];

				// Padding after the first byte ends the stored word: all of it matched.
				if ( $i > 0 && "\x00" === $stored ) {
					break;
				}

				// The stored word is longer than the text that remains.
				if ( $i >= $search_length ) {
					continue 2;
				}

				if ( $ignore_case ) {
					$stored = strtoupper( $stored );
				}

				if ( $stored !== $search_text[ $i ] ) {
					continue 2;
				}
			}

			$matched_token_byte_length = $i;
			return $this->small_mappings[ (int) ( $at / $stride ) ];
		}

		return null;
	}

	/**
	 * Exports the token map into an associative array of key/value pairs.
	 *
	 * Small words are listed first, followed by the large words group by group.
	 *
	 * Example:
	 *
	 *     $smilies->to_array() === array(
	 *         '8O' => '😯',
	 *         ':(' => '🙁',
	 *         ':)' => '🙂',
	 *         ':?' => '😕',
	 *     );
	 *
	 * @since 6.6.0
	 *
	 * @return array The lookup key/substitution values as an associative array.
	 */
	public function to_array() {
		$tokens = array();
		$stride = $this->key_length + 1;

		$small_length = strlen( $this->small_words );
		for ( $at = 0, $index = 0; $at < $small_length; $at += $stride, $index++ ) {
			$key            = rtrim( substr( $this->small_words, $at, $stride ), "\x00" );
			$tokens[ $key ] = $this->small_mappings[ $index ];
		}

		foreach ( $this->large_words as $index => $group ) {
			// The group key is exactly `key_length` bytes; the byte after it is the separator.
			$prefix       = substr( $this->groups, $index * $stride, $this->key_length );
			$group_length = strlen( $group );
			$at           = 0;
			while ( $at < $group_length ) {
				$length = ord( $group[ $at++ ] );
				$key    = $prefix . substr( $group, $at, $length );
				$at    += $length;

				$length = ord( $group[ $at++ ] );
				$value  = substr( $group, $at, $length );
				$at    += $length;

				$tokens[ $key ] = $value;
			}
		}

		return $tokens;
	}

	/**
	 * Export the token map for quick loading in PHP source code.
	 *
	 * This function has a specific purpose, to make loading of static token maps fast.
	 * It's used to ensure that the HTML character reference lookups add a minimal cost
	 * to initializing the PHP process.
	 *
	 * Every string is emitted inside double quotes with control bytes, `"`, `\`
	 * and `$` escaped, so the output evaluates back to the exact stored bytes.
	 *
	 * Example:
	 *
	 *     echo $smilies->precomputed_php_source_table( '    ' );
	 *
	 *     // Output.
	 *     WP_Token_Map::from_precomputed_table(
	 *         array(
	 *             "storage_version" => "6.6.0-trunk",
	 *             "key_length" => 2,
	 *             "groups" => "",
	 *             "large_words" => array(
	 *             ),
	 *             "small_words" => "8O\x00:(\x00:)\x00:?\x00",
	 *             "small_mappings" => array(
	 *                 "😯",
	 *                 "🙁",
	 *                 "🙂",
	 *                 "😕",
	 *             )
	 *         )
	 *     )
	 *
	 * @since 6.6.0
	 *
	 * @param string $indent Optional. Use this string for indentation, or rely on the default horizontal tab character. Default "\t".
	 * @return string Value which can be pasted into a PHP source file for quick loading of table.
	 */
	public function precomputed_php_source_table( $indent = "\t" ) {
		$i1 = $indent;
		$i2 = $i1 . $indent;
		$i3 = $i2 . $indent;

		$stride        = $this->key_length + 1;
		$class_version = self::STORAGE_VERSION;

		$output  = self::class . "::from_precomputed_table(\n";
		$output .= "{$i1}array(\n";
		$output .= "{$i2}\"storage_version\" => \"{$class_version}\",\n";
		$output .= "{$i2}\"key_length\" => {$this->key_length},\n";

		$group_line = self::escape_for_double_quotes( $this->groups );
		$output    .= "{$i2}\"groups\" => \"{$group_line}\",\n";

		$output .= "{$i2}\"large_words\" => array(\n";

		foreach ( $this->large_words as $index => $group ) {
			$prefix       = self::escape_for_double_quotes( substr( $this->groups, $index * $stride, $this->key_length ) );
			$group_length = strlen( $group );
			$comment_line = "{$i3}//";
			$data_line    = "{$i3}\"";
			$at           = 0;
			while ( $at < $group_length ) {
				$token_length   = ord( $group[ $at++ ] );
				$token          = self::escape_for_double_quotes( substr( $group, $at, $token_length ) );
				$at            += $token_length;
				$mapping_length = ord( $group[ $at++ ] );
				$mapping        = self::escape_for_double_quotes( substr( $group, $at, $mapping_length ) );
				$at            += $mapping_length;

				/*
				 * Always two hex digits: PHP reads at most two after `\x`, so a
				 * token starting with a hex digit cannot be absorbed into the length.
				 */
				$token_digits   = str_pad( dechex( $token_length ), 2, '0', STR_PAD_LEFT );
				$mapping_digits = str_pad( dechex( $mapping_length ), 2, '0', STR_PAD_LEFT );

				$comment_line .= " {$prefix}{$token}[{$mapping}]";
				$data_line    .= "\\x{$token_digits}{$token}\\x{$mapping_digits}{$mapping}";
			}
			$comment_line .= ".\n";
			$data_line    .= "\",\n";

			$output .= $comment_line;
			$output .= $data_line;
		}

		$output .= "{$i2}),\n";

		$small_text = self::escape_for_double_quotes( $this->small_words );
		$output    .= "{$i2}\"small_words\" => \"{$small_text}\",\n";

		$output .= "{$i2}\"small_mappings\" => array(\n";
		foreach ( $this->small_mappings as $mapping ) {
			$small_mapping = self::escape_for_double_quotes( $mapping );
			$output       .= "{$i3}\"{$small_mapping}\",\n";
		}
		$output .= "{$i2})\n";
		$output .= "{$i1})\n";
		$output .= ')';

		return $output;
	}

	/**
	 * Compares two strings, returning the longest, or whichever
	 * is first alphabetically if they are the same length.
	 *
	 * This is an important sort when building the token map because
	 * it should not form a match on a substring of a longer potential
	 * match. For example, it should not detect `Cap` when matching
	 * against the string `CapitalDifferentialD`.
	 *
	 * @since 6.6.0
	 *
	 * @param string $a First string to compare.
	 * @param string $b Second string to compare.
	 * @return int -1 or lower if `$a` is less than `$b`; 1 or greater if `$a` is greater than `$b`, and 0 if they are equal.
	 */
	private static function longest_first_then_alphabetical( $a, $b ) {
		if ( $a === $b ) {
			return 0;
		}

		$length_a = strlen( $a );
		$length_b = strlen( $b );

		// Longer strings are less-than for comparison's sake.
		if ( $length_a !== $length_b ) {
			return $length_b - $length_a;
		}

		return strcmp( $a, $b );
	}

	/**
	 * Finds the next occurrence of a needle which starts on an entry boundary
	 * of a packed string made of fixed-size entries.
	 *
	 * A plain `strpos()` may report a match which straddles two entries; such
	 * a match does not identify any entry and must be skipped.
	 *
	 * @param string $haystack    Packed string of fixed-size entries.
	 * @param string $needle      Bytes to find.
	 * @param int    $stride      Size of each entry in bytes.
	 * @param bool   $ignore_case Whether to ignore ASCII case when matching.
	 * @param int    $start       Optional. Byte offset at which to start searching. Default 0.
	 * @return int|false Byte offset of the aligned match, or `false` if there is none.
	 */
	private static function find_aligned( $haystack, $needle, $stride, $ignore_case, $start = 0 ) {
		$limit = strlen( $haystack );
		$at    = $start;

		while ( $at < $limit ) {
			$found = $ignore_case ? stripos( $haystack, $needle, $at ) : strpos( $haystack, $needle, $at );
			if ( false === $found ) {
				return false;
			}

			$overshoot = $found % $stride;
			if ( 0 === $overshoot ) {
				return $found;
			}

			// Resume at the first entry boundary after the misaligned match.
			$at = $found + $stride - $overshoot;
		}

		return false;
	}

	/**
	 * Escapes raw bytes for placement inside a double-quoted PHP string literal.
	 *
	 * Control bytes become two-digit `\xNN` escapes, and `"`, `\` and `$` are
	 * backslash-escaped so the literal neither terminates early nor
	 * interpolates a variable. All other bytes pass through untouched.
	 *
	 * @param string $bytes Raw bytes to escape.
	 * @return string Escaped text, without the surrounding quotes.
	 */
	private static function escape_for_double_quotes( $bytes ) {
		return preg_replace_callback(
			'~[\x00-\x1f\x22\x24\x5c]~',
			static function ( $match_result ) {
				switch ( $match_result[0] ) {
					case '"':
						return '\\"';

					case '\\':
						return '\\\\';

					case '$':
						return '\\$';

					default:
						return sprintf( '\\x%02x', ord( $match_result[0] ) );
				}
			},
			$bytes
		);
	}
}