wp/wp-includes/class-wp-token-map.php
changeset 21 48c4eec2b7e6
child 22 8c2e4d02f4ef
equal deleted inserted replaced
20:7b1b88e27a20 21:48c4eec2b7e6
       
     1 <?php
       
     2 
       
     3 /**
       
     4  * Class for efficiently looking up and mapping string keys to string values, with limits.
       
     5  *
       
     6  * @package    WordPress
       
     7  * @since      6.6.0
       
     8  */
       
     9 
       
    10 /**
       
    11  * WP_Token_Map class.
       
    12  *
       
    13  * Use this class in specific circumstances with a static set of lookup keys which map to
       
    14  * a static set of transformed values. For example, this class is used to map HTML named
       
    15  * character references to their equivalent UTF-8 values.
       
    16  *
       
    17  * This class works differently than code calling `in_array()` and other methods. It
       
    18  * internalizes lookup logic and provides helper interfaces to optimize lookup and
       
    19  * transformation. It provides a method for precomputing the lookup tables and storing
       
    20  * them as PHP source code.
       
    21  *
       
    22  * All tokens and substitutions must be shorter than 256 bytes.
       
    23  *
       
    24  * Example:
       
    25  *
       
    26  *     $smilies = WP_Token_Map::from_array( array(
       
    27  *         '8O' => '😯',
       
    28  *         ':(' => '🙁',
       
    29  *         ':)' => '🙂',
       
    30  *         ':?' => '😕',
       
    31  *      ) );
       
    32  *
       
    33  *      true  === $smilies->contains( ':)' );
       
    34  *      false === $smilies->contains( 'simile' );
       
    35  *
       
    36  *      '😕' === $smilies->read_token( 'Not sure :?.', 9, $length_of_smily_syntax );
       
    37  *      2    === $length_of_smily_syntax;
       
    38  *
       
    39  * ## Precomputing the Token Map.
       
    40  *
       
    41  * Creating the class involves some work sorting and organizing the tokens and their
       
    42  * replacement values. In order to skip this, it's possible for the class to export
       
    43  * its state and be used as actual PHP source code.
       
    44  *
       
    45  * Example:
       
    46  *
       
    47  *      // Export with four spaces as the indent, only for the sake of this docblock.
       
    48  *      // The default indent is a tab character.
       
    49  *      $indent = '    ';
       
    50  *      echo $smilies->precomputed_php_source_table( $indent );
       
    51  *
       
    52  *      // Output, to be pasted into a PHP source file:
       
    53  *      WP_Token_Map::from_precomputed_table(
       
    54  *          array(
       
    55  *              "storage_version" => "6.6.0-trunk",
       
    56  *              "key_length" => 2,
       
    57  *              "groups" => "",
       
    58  *              "large_words" => array(),
       
    59  *              "small_words" => "8O\x00:)\x00:(\x00:?\x00",
       
    60  *              "small_mappings" => array( "😯", "🙂", "🙁", "😕" )
       
    61  *          )
       
    62  *      );
       
    63  *
       
    64  * ## Large vs. small words.
       
    65  *
       
    66  * This class uses a short prefix called the "key" to optimize lookup of its tokens.
       
    67  * This means that some tokens may be shorter than or equal in length to that key.
       
    68  * Those words that are longer than the key are called "large" while those shorter
       
    69  * than or equal to the key length are called "small."
       
    70  *
       
    71  * This separation of large and small words is incidental to the way this class
       
    72  * optimizes lookup, and should be considered an internal implementation detail
       
    73  * of the class. It may still be important to be aware of it, however.
       
    74  *
       
    75  * ## Determining Key Length.
       
    76  *
       
    77  * The choice of the size of the key length should be based on the data being stored in
       
    78  * the token map. It should divide the data as evenly as possible, but should not create
       
    79  * so many groups that a large fraction of the groups only contain a single token.
       
    80  *
       
    81  * For the HTML5 named character references, a key length of 2 was found to provide a
       
    82  * sufficient spread and should be a good default for relatively large sets of tokens.
       
    83  *
       
    84  * However, for some data sets this might be too long. For example, a list of smilies
       
    85  * may be too small for a key length of 2. Perhaps 1 would be more appropriate. It's
       
    86  * best to experiment and determine empirically which values are appropriate.
       
    87  *
       
    88  * ## Generate Pre-Computed Source Code.
       
    89  *
       
    90  * Since the `WP_Token_Map` is designed for relatively static lookups, it can be
       
    91  * advantageous to precompute the values and instantiate a table that has already
       
    92  * sorted and grouped the tokens and built the lookup strings.
       
    93  *
       
    94  * This can be done with `WP_Token_Map::precomputed_php_source_table()`.
       
    95  *
       
    96  * Note that if there is a leading character that all tokens need, such as `&` for
       
    97  * HTML named character references, it can be beneficial to exclude this from the
       
    98  * token map. Instead, find occurrences of the leading character and then use the
       
    99  * token map to see if the following characters complete the token.
       
   100  *
       
   101  * Example:
       
   102  *
       
   103  *     $map = WP_Token_Map::from_array( array( 'simple_smile:' => '🙂', 'sob:' => '😭', 'soba:' => '🍜' ) );
       
   104  *     echo $map->precomputed_php_source_table();
       
   105  *     // Output
       
   106  *     WP_Token_Map::from_precomputed_table(
       
   107  *         array(
       
   108  *             "storage_version" => "6.6.0-trunk",
       
   109  *             "key_length" => 2,
       
   110  *             "groups" => "si\x00so\x00",
       
   111  *             "large_words" => array(
       
   112  *                 // simple_smile:[🙂].
       
   113  *                 "\x0bmple_smile:\x04🙂",
       
   114  *                 // soba:[🍜] sob:[😭].
       
   115  *                 "\x03ba:\x04🍜\x02b:\x04😭",
       
   116  *             ),
       
   117  *             "small_words" => "",
       
   118  *             "small_mappings" => array()
       
   119  *         )
       
   120  *     );
       
   121  *
       
   122  * This precomputed value can be stored directly in source code and will skip the
       
   123  * startup cost of generating the lookup strings. See `$html5_named_character_entities`.
       
   124  *
       
   125  * Note that any updates to the precomputed format should update the storage version
       
   126  * constant. It would also be best to provide an update function to take older known
       
   127  * versions and upgrade them in place when loading into `from_precomputed_table()`.
       
   128  *
       
   129  * ## Future Direction.
       
   130  *
       
   131  * It may be viable to dynamically increase the length limits such that there's no need to impose them.
       
   132  * The limit appears because of the packing structure, which indicates how many bytes each segment of
       
   133  * text in the lookup tables spans. If, however, care were taken to track the longest word length, then
       
   134  * the packing structure could change its representation to allow for that. Each additional byte storing
       
   135  * length, however, increases the memory overhead and lookup runtime.
       
   136  *
       
   137  * An alternative approach could be to borrow the UTF-8 variable-length encoding and store lengths of less
       
   138  * than 128 as a single byte with the high bit unset, storing longer lengths as the combination of
       
   139  * continuation bytes.
       
   140  *
       
   141  * Since it has not been shown during the development of this class that longer strings are required, this
       
   142  * update is deferred until such a need is clear.
       
   143  *
       
   144  * @since 6.6.0
       
   145  */
       
   146 class WP_Token_Map {
       
   147 	/**
       
   148 	 * Denotes the version of the code which produces pre-computed source tables.
       
   149 	 *
       
   150 	 * This version will be used not only to verify pre-computed data, but also
       
   151 	 * to upgrade pre-computed data from older versions. Choosing a name that
       
   152 	 * corresponds to the WordPress release will help people identify where an
       
   153 	 * old copy of data came from.
       
   154 	 */
       
   155 	const STORAGE_VERSION = '6.6.0-trunk';
       
   156 
       
   157 	/**
       
   158 	 * Exclusive upper bound, in bytes, on the length of each key and each transformed value in the table.
       
   159 	 *
       
   160 	 * @since 6.6.0
       
   161 	 */
       
   162 	const MAX_LENGTH = 256;
       
   163 
       
   164 	/**
       
   165 	 * How many bytes of each key are used to form a group key for lookup.
       
   166 	 * This also determines whether a word is considered short or long.
       
   167 	 *
       
   168 	 * @since 6.6.0
       
   169 	 *
       
   170 	 * @var int
       
   171 	 */
       
   172 	private $key_length = 2;
       
   173 
       
   174 	/**
       
   175 	 * Stores an optimized form of the word set, where words are grouped
       
   176 	 * by a prefix of the `$key_length` and then collapsed into a string.
       
   177 	 *
       
   178 	 * In each group, the keys and lookups form a packed data structure.
       
   179 	 * The keys in the string are stripped of their "group key," which is
       
   180 	 * the prefix of length `$this->key_length` shared by all of the items
       
   181 	 * in the group. Each word in the string is prefixed by a single byte
       
   182 	 * whose raw unsigned integer value represents how many bytes follow.
       
   183 	 *
       
   184 	 *     ┌────────────────┬───────────────┬─────────────────┬────────┐
       
   185 	 *     │ Length of rest │ Rest of key   │ Length of value │ Value  │
       
   186 	 *     │ of key (bytes) │               │ (bytes)         │        │
       
   187 	 *     ├────────────────┼───────────────┼─────────────────┼────────┤
       
   188 	 *     │ 0x08           │ nterDot;      │ 0x02            │ ·      │
       
   189 	 *     └────────────────┴───────────────┴─────────────────┴────────┘
       
   190 	 *
       
   191 	 * In this example, the key `CenterDot;` has a group key `Ce`, leaving
       
   192 	 * eight bytes for the rest of the key, `nterDot;`, and two bytes for
       
   193 	 * the transformed value `·` (or U+B7 or "\xC2\xB7").
       
   194 	 *
       
   195 	 * Example:
       
   196 	 *
       
   197 	 *    // Stores array( 'CenterDot;' => '·', 'Cedilla;' => '¸' ).
       
   198 	 *    $groups      = "Ce\x00";
       
   199 	 *    $large_words = array( "\x08nterDot;\x02·\x06dilla;\x02¸" );
       
   200 	 *
       
   201 	 * The prefixes appear in the `$groups` string, each followed by a null
       
   202 	 * byte. This makes for quick lookup of where in the group string the key
       
   203 	 * is found, and then a simple division converts that offset into the index
       
   204 	 * in the `$large_words` array where the group string is to be found.
       
   205 	 *
       
   206 	 * This lookup data structure is designed to optimize cache locality and
       
   207 	 * minimize indirect memory reads when matching strings in the set.
       
   208 	 *
       
   209 	 * @since 6.6.0
       
   210 	 *
       
   211 	 * @var array
       
   212 	 */
       
   213 	private $large_words = array();
       
   214 
       
   215 	/**
       
   216 	 * Stores the group keys for sequential string lookup.
       
   217 	 *
       
   218 	 * The offset into this string where the group key appears corresponds with the index
       
   219 	 * into the group array where the rest of the group string appears. This is an optimization
       
   220 	 * to improve cache locality while searching and minimize indirect memory accesses.
       
   221 	 *
       
   222 	 * @since 6.6.0
       
   223 	 *
       
   224 	 * @var string
       
   225 	 */
       
   226 	private $groups = '';
       
   227 
       
   228 	/**
       
   229 	 * Stores an optimized row of small words, where every entry is
       
   230 	 * `$this->key_length + 1` bytes long and zero-extended.
       
   231 	 *
       
   232 	 * This packing allows for direct lookup of a short word followed
       
   233 	 * by the null byte, if extended to `$this->key_length + 1`.
       
   234 	 *
       
   235 	 * Example:
       
   236 	 *
       
   237 	 *     // Stores array( 'GT', 'LT', 'gt', 'lt' ).
       
   238 	 *     "GT\x00LT\x00gt\x00lt\x00"
       
   239 	 *
       
   240 	 * @since 6.6.0
       
   241 	 *
       
   242 	 * @var string
       
   243 	 */
       
   244 	private $small_words = '';
       
   245 
       
   246 	/**
       
   247 	 * Replacements for the small words, in the same order they appear.
       
   248 	 *
       
   249 	 * With the position of a small word it's possible to index the translation
       
   250 	 * directly, as its position in the `$small_words` string corresponds to
       
   251 	 * the index of the replacement in the `$small_mappings` array.
       
   252 	 *
       
   253 	 * Example:
       
   254 	 *
       
   255 	 *     array( '>', '<', '>', '<' )
       
   256 	 *
       
   257 	 * @since 6.6.0
       
   258 	 *
       
   259 	 * @var string[]
       
   260 	 */
       
   261 	private $small_mappings = array();
       
   262 
       
   263 	/**
       
   264 	 * Create a token map using an associative array of key/value pairs as the input.
       
   265 	 *
       
   266 	 * Example:
       
   267 	 *
       
   268 	 *     $smilies = WP_Token_Map::from_array( array(
       
   269 	 *          '8O' => '😯',
       
   270 	 *          ':(' => '🙁',
       
   271 	 *          ':)' => '🙂',
       
   272 	 *          ':?' => '😕',
       
   273 	 *       ) );
       
   274 	 *
       
   275 	 * @since 6.6.0
       
   276 	 *
       
   277 	 * @param array $mappings   The keys transform into the values, both are strings.
       
   278 	 * @param int   $key_length Determines the group key length. Leave at the default value
       
   279 	 *                          of 2 unless there's an empirical reason to change it.
       
   280 	 *
       
   281 	 * @return WP_Token_Map|null Token map, unless unable to create it.
       
   282 	 */
       
   283 	public static function from_array( $mappings, $key_length = 2 ) {
       
   284 		$map             = new WP_Token_Map();
       
   285 		$map->key_length = $key_length;
       
   286 
       
   287 		// Start by grouping words.
       
   288 
       
   289 		$groups = array();
       
   290 		$shorts = array();
       
   291 		foreach ( $mappings as $word => $mapping ) {
       
   292 			if (
       
   293 				self::MAX_LENGTH <= strlen( $word ) ||
       
   294 				self::MAX_LENGTH <= strlen( $mapping )
       
   295 			) {
       
   296 				_doing_it_wrong(
       
   297 					__METHOD__,
       
   298 					sprintf(
       
   299 						/* translators: 1: maximum byte length (a count) */
       
   300 						__( 'Token Map tokens and substitutions must all be shorter than %1$d bytes.' ),
       
   301 						self::MAX_LENGTH
       
   302 					),
       
   303 					'6.6.0'
       
   304 				);
       
   305 				return null;
       
   306 			}
       
   307 
       
   308 			$length = strlen( $word );
       
   309 
       
   310 			if ( $key_length >= $length ) {
       
   311 				$shorts[] = $word;
       
   312 			} else {
       
   313 				$group = substr( $word, 0, $key_length );
       
   314 
       
   315 				if ( ! isset( $groups[ $group ] ) ) {
       
   316 					$groups[ $group ] = array();
       
   317 				}
       
   318 
       
   319 				$groups[ $group ][] = array( substr( $word, $key_length ), $mapping );
       
   320 			}
       
   321 		}
       
   322 
       
   323 		/*
       
   324 		 * Sort the words to ensure that no smaller substring of a match masks the full match.
       
   325 		 * For example, `Cap` should not match before `CapitalDifferentialD`.
       
   326 		 */
       
   327 		usort( $shorts, 'WP_Token_Map::longest_first_then_alphabetical' );
       
   328 		foreach ( $groups as $group_key => $group ) {
       
   329 			usort(
       
   330 				$groups[ $group_key ],
       
   331 				static function ( $a, $b ) {
       
   332 					return self::longest_first_then_alphabetical( $a[0], $b[0] );
       
   333 				}
       
   334 			);
       
   335 		}
       
   336 
       
   337 		// Finally construct the optimized lookups.
       
   338 
       
   339 		foreach ( $shorts as $word ) {
       
   340 			$map->small_words     .= str_pad( $word, $key_length + 1, "\x00", STR_PAD_RIGHT );
       
   341 			$map->small_mappings[] = $mappings[ $word ];
       
   342 		}
       
   343 
       
   344 		$group_keys = array_keys( $groups );
       
   345 		sort( $group_keys );
       
   346 
       
   347 		foreach ( $group_keys as $group ) {
       
   348 			$map->groups .= "{$group}\x00";
       
   349 
       
   350 			$group_string = '';
       
   351 
       
   352 			foreach ( $groups[ $group ] as $group_word ) {
       
   353 				list( $word, $mapping ) = $group_word;
       
   354 
       
   355 				$word_length    = pack( 'C', strlen( $word ) );
       
   356 				$mapping_length = pack( 'C', strlen( $mapping ) );
       
   357 				$group_string  .= "{$word_length}{$word}{$mapping_length}{$mapping}";
       
   358 			}
       
   359 
       
   360 			$map->large_words[] = $group_string;
       
   361 		}
       
   362 
       
   363 		return $map;
       
   364 	}
       
   365 
       
   366 	/**
       
   367 	 * Creates a token map from a pre-computed table.
       
   368 	 * This skips the initialization cost of generating the table.
       
   369 	 *
       
   370 	 * This function should only be used to load data created with
       
   371 	 * WP_Token_Map::precomputed_php_source_table().
       
   372 	 *
       
   373 	 * @since 6.6.0
       
   374 	 *
       
   375 	 * @param array $state {
       
   376 	 *     Stores pre-computed state for directly loading into a Token Map.
       
   377 	 *
       
   378 	 *     @type string $storage_version Which version of the code produced this state.
       
   379 	 *     @type int    $key_length      Group key length.
       
   380 	 *     @type string $groups          Group lookup index.
       
   381 	 *     @type array  $large_words     Large word groups and packed strings.
       
   382 	 *     @type string $small_words     Small words packed string.
       
   383 	 *     @type array  $small_mappings  Small word mappings.
       
   384 	 * }
       
   385 	 *
       
   386 	 * @return WP_Token_Map|null Map with precomputed data loaded, or null if the state is incomplete or was produced by an incompatible storage version.
       
   387 	 */
       
   388 	public static function from_precomputed_table( $state ) {
       
   389 		$has_necessary_state = isset(
       
   390 			$state['storage_version'],
       
   391 			$state['key_length'],
       
   392 			$state['groups'],
       
   393 			$state['large_words'],
       
   394 			$state['small_words'],
       
   395 			$state['small_mappings']
       
   396 		);
       
   397 
       
   398 		if ( ! $has_necessary_state ) {
       
   399 			_doing_it_wrong(
       
   400 				__METHOD__,
       
   401 				__( 'Missing required inputs to pre-computed WP_Token_Map.' ),
       
   402 				'6.6.0'
       
   403 			);
       
   404 			return null;
       
   405 		}
       
   406 
       
   407 		if ( self::STORAGE_VERSION !== $state['storage_version'] ) {
       
   408 			_doing_it_wrong(
       
   409 				__METHOD__,
       
   410 				/* translators: 1: version string, 2: version string. */
       
   411 				sprintf( __( 'Loaded version \'%1$s\' incompatible with expected version \'%2$s\'.' ), $state['storage_version'], self::STORAGE_VERSION ),
       
   412 				'6.6.0'
       
   413 			);
       
   414 			return null;
       
   415 		}
       
   416 
       
   417 		$map = new WP_Token_Map();
       
   418 
       
   419 		$map->key_length     = $state['key_length'];
       
   420 		$map->groups         = $state['groups'];
       
   421 		$map->large_words    = $state['large_words'];
       
   422 		$map->small_words    = $state['small_words'];
       
   423 		$map->small_mappings = $state['small_mappings'];
       
   424 
       
   425 		return $map;
       
   426 	}
       
   427 
       
   428 	/**
       
   429 	 * Indicates if a given word is a lookup key in the map.
       
   430 	 *
       
   431 	 * Example:
       
   432 	 *
       
   433 	 *     true  === $smilies->contains( ':)' );
       
   434 	 *     false === $smilies->contains( 'simile' );
       
   435 	 *
       
   436 	 * @since 6.6.0
       
   437 	 *
       
   438 	 * @param string $word             Determine if this word is a lookup key in the map.
       
   439 	 * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
       
   440 	 * @return bool Whether there's an entry for the given word in the map.
       
   441 	 */
       
   442 	public function contains( $word, $case_sensitivity = 'case-sensitive' ) {
       
   443 		$ignore_case = 'ascii-case-insensitive' === $case_sensitivity;
       
   444 
       
   445 		if ( $this->key_length >= strlen( $word ) ) {
       
   446 			if ( 0 === strlen( $this->small_words ) ) {
       
   447 				return false;
       
   448 			}
       
   449 
       
   450 			$term    = str_pad( $word, $this->key_length + 1, "\x00", STR_PAD_RIGHT );
       
   451 			$word_at = $ignore_case ? stripos( $this->small_words, $term ) : strpos( $this->small_words, $term );
       
   452 			if ( false === $word_at ) {
       
   453 				return false;
       
   454 			}
       
   455 
       
   456 			return true;
       
   457 		}
       
   458 
       
   459 		$group_key = substr( $word, 0, $this->key_length );
       
   460 		$group_at  = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key );
       
   461 		if ( false === $group_at ) {
       
   462 			return false;
       
   463 		}
       
   464 		$group        = $this->large_words[ $group_at / ( $this->key_length + 1 ) ];
       
   465 		$group_length = strlen( $group );
       
   466 		$slug         = substr( $word, $this->key_length );
       
   467 		$length       = strlen( $slug );
       
   468 		$at           = 0;
       
   469 
       
   470 		while ( $at < $group_length ) {
       
   471 			$token_length   = unpack( 'C', $group[ $at++ ] )[1];
       
   472 			$token_at       = $at;
       
   473 			$at            += $token_length;
       
   474 			$mapping_length = unpack( 'C', $group[ $at++ ] )[1];
       
   475 			$mapping_at     = $at;
       
   476 
       
   477 			if ( $token_length === $length && 0 === substr_compare( $group, $slug, $token_at, $token_length, $ignore_case ) ) {
       
   478 				return true;
       
   479 			}
       
   480 
       
   481 			$at = $mapping_at + $mapping_length;
       
   482 		}
       
   483 
       
   484 		return false;
       
   485 	}
       
   486 
       
   487 	/**
       
   488 	 * If the text starting at a given offset is a lookup key in the map,
       
   489 	 * return the corresponding transformation from the map, else `null`.
       
   490 	 *
       
   491 	 * This function returns the translated string, but accepts an optional
       
   492 	 * parameter `$matched_token_byte_length`, which communicates how many
       
   493 	 * bytes long the lookup key was, if it found one. This can be used to
       
   494 	 * advance a cursor in calling code if a lookup key was found.
       
   495 	 *
       
   496 	 * Example:
       
   497 	 *
       
   498 	 *     null === $smilies->read_token( 'Not sure :?.', 0, $token_byte_length );
       
   499 	 *     '😕' === $smilies->read_token( 'Not sure :?.', 9, $token_byte_length );
       
   500 	 *     2    === $token_byte_length;
       
   501 	 *
       
   502 	 * Example:
       
   503 	 *
       
   504 	 *     while ( $at < strlen( $input ) ) {
       
   505 	 *         $next_at = strpos( $input, ':', $at );
       
   506 	 *         if ( false === $next_at ) {
       
   507 	 *             break;
       
   508 	 *         }
       
   509 	 *
       
   510 	 *         $smily = $smilies->read_token( $input, $next_at, $token_byte_length );
       
   511 	 *         if ( null === $smily ) {
       
   512 	 *             ++$at;
       
   513 	 *             continue;
       
   514 	 *         }
       
   515 	 *
       
   516 	 *         $prefix  = substr( $input, $at, $next_at - $at );
       
   517 	 *         $at     += $token_byte_length;
       
   518 	 *         $output .= "{$prefix}{$smily}";
       
   519 	 *     }
       
   520 	 *
       
   521 	 * @since 6.6.0
       
   522 	 *
       
   523 	 * @param string  $text                       String in which to search for a lookup key.
       
   524 	 * @param int     $offset                     Optional. How many bytes into the string where the lookup key ought to start. Default 0.
       
   525 	 * @param ?int    &$matched_token_byte_length Optional. Holds byte-length of found token matched, otherwise not set. Default null.
       
   526 	 * @param string  $case_sensitivity           Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
       
   527 	 * @return string|null Mapped value of lookup key if found, otherwise `null`.
       
   528 	 */
       
   529 	public function read_token( $text, $offset = 0, &$matched_token_byte_length = null, $case_sensitivity = 'case-sensitive' ) {
       
   530 		$ignore_case = 'ascii-case-insensitive' === $case_sensitivity;
       
   531 		$text_length = strlen( $text );
       
   532 
       
   533 		// Search for a long word first, if the text is long enough, and if that fails, a short one.
       
   534 		if ( $text_length > $this->key_length ) {
       
   535 			$group_key = substr( $text, $offset, $this->key_length );
       
   536 
       
   537 			$group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key );
       
   538 			if ( false === $group_at ) {
       
   539 				// Perhaps a short word then.
       
   540 				return strlen( $this->small_words ) > 0
       
   541 					? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity )
       
   542 					: null;
       
   543 			}
       
   544 
       
   545 			$group        = $this->large_words[ $group_at / ( $this->key_length + 1 ) ];
       
   546 			$group_length = strlen( $group );
       
   547 			$at           = 0;
       
   548 			while ( $at < $group_length ) {
       
   549 				$token_length   = unpack( 'C', $group[ $at++ ] )[1];
       
   550 				$token          = substr( $group, $at, $token_length );
       
   551 				$at            += $token_length;
       
   552 				$mapping_length = unpack( 'C', $group[ $at++ ] )[1];
       
   553 				$mapping_at     = $at;
       
   554 
       
   555 				if ( 0 === substr_compare( $text, $token, $offset + $this->key_length, $token_length, $ignore_case ) ) {
       
   556 					$matched_token_byte_length = $this->key_length + $token_length;
       
   557 					return substr( $group, $mapping_at, $mapping_length );
       
   558 				}
       
   559 
       
   560 				$at = $mapping_at + $mapping_length;
       
   561 			}
       
   562 		}
       
   563 
       
   564 		// Perhaps a short word then.
       
   565 		return strlen( $this->small_words ) > 0
       
   566 			? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity )
       
   567 			: null;
       
   568 	}
       
   569 
       
   570 	/**
       
   571 	 * Finds a match for a short word at the index.
       
   572 	 *
       
   573 	 * @since 6.6.0
       
   574 	 *
       
   575 	 * @param string $text                       String in which to search for a lookup key.
       
   576 	 * @param int    $offset                     How many bytes into the string where the lookup key ought to start.
       
   577 	 * @param ?int   &$matched_token_byte_length Holds byte-length of found lookup key if matched, otherwise not set.
       
   578 	 * @param string $case_sensitivity           Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
       
   579 	 * @return string|null Mapped value of lookup key if found, otherwise `null`.
       
   580 	 */
       
   581 	private function read_small_token( $text, $offset, &$matched_token_byte_length, $case_sensitivity = 'case-sensitive' ) {
       
   582 		$ignore_case  = 'ascii-case-insensitive' === $case_sensitivity;
       
   583 		$small_length = strlen( $this->small_words );
       
   584 		$search_text  = substr( $text, $offset, $this->key_length );
       
   585 		if ( $ignore_case ) {
       
   586 			$search_text = strtoupper( $search_text );
       
   587 		}
       
   588 		$starting_char = $search_text[0];
       
   589 
       
   590 		$at = 0;
       
   591 		while ( $at < $small_length ) {
       
   592 			if (
       
   593 				$starting_char !== $this->small_words[ $at ] &&
       
   594 				( ! $ignore_case || strtoupper( $this->small_words[ $at ] ) !== $starting_char )
       
   595 			) {
       
   596 				$at += $this->key_length + 1;
       
   597 				continue;
       
   598 			}
       
   599 
       
   600 			for ( $adjust = 1; $adjust < $this->key_length; $adjust++ ) {
       
   601 				if ( "\x00" === $this->small_words[ $at + $adjust ] ) {
       
   602 					$matched_token_byte_length = $adjust;
       
   603 					return $this->small_mappings[ $at / ( $this->key_length + 1 ) ];
       
   604 				}
       
   605 
       
   606 				if (
       
   607 					$search_text[ $adjust ] !== $this->small_words[ $at + $adjust ] &&
       
   608 					( ! $ignore_case || strtoupper( $this->small_words[ $at + $adjust ] !== $search_text[ $adjust ] ) )
       
   609 				) {
       
   610 					$at += $this->key_length + 1;
       
   611 					continue 2;
       
   612 				}
       
   613 			}
       
   614 
       
   615 			$matched_token_byte_length = $adjust;
       
   616 			return $this->small_mappings[ $at / ( $this->key_length + 1 ) ];
       
   617 		}
       
   618 
       
   619 		return null;
       
   620 	}
       
   621 
       
   622 	/**
       
   623 	 * Exports the token map into an associative array of key/value pairs.
       
   624 	 *
       
   625 	 * Example:
       
   626 	 *
       
   627 	 *     $smilies->to_array() === array(
       
   628 	 *         '8O' => '😯',
       
   629 	 *         ':(' => 'πŸ™',
       
   630 	 *         ':)' => 'πŸ™‚',
       
   631 	 *         ':?' => 'πŸ˜•',
       
   632 	 *     );
       
   633 	 *
       
   634 	 * @return array The lookup key/substitution values as an associate array.
       
   635 	 */
       
   636 	public function to_array() {
       
   637 		$tokens = array();
       
   638 
       
   639 		$at            = 0;
       
   640 		$small_mapping = 0;
       
   641 		$small_length  = strlen( $this->small_words );
       
   642 		while ( $at < $small_length ) {
       
   643 			$key            = rtrim( substr( $this->small_words, $at, $this->key_length + 1 ), "\x00" );
       
   644 			$value          = $this->small_mappings[ $small_mapping++ ];
       
   645 			$tokens[ $key ] = $value;
       
   646 
       
   647 			$at += $this->key_length + 1;
       
   648 		}
       
   649 
       
   650 		foreach ( $this->large_words as $index => $group ) {
       
   651 			$prefix       = substr( $this->groups, $index * ( $this->key_length + 1 ), 2 );
       
   652 			$group_length = strlen( $group );
       
   653 			$at           = 0;
       
   654 			while ( $at < $group_length ) {
       
   655 				$length = unpack( 'C', $group[ $at++ ] )[1];
       
   656 				$key    = $prefix . substr( $group, $at, $length );
       
   657 
       
   658 				$at    += $length;
       
   659 				$length = unpack( 'C', $group[ $at++ ] )[1];
       
   660 				$value  = substr( $group, $at, $length );
       
   661 
       
   662 				$tokens[ $key ] = $value;
       
   663 				$at            += $length;
       
   664 			}
       
   665 		}
       
   666 
       
   667 		return $tokens;
       
   668 	}
       
   669 
       
   670 	/**
       
   671 	 * Export the token map for quick loading in PHP source code.
       
   672 	 *
       
   673 	 * This function has a specific purpose, to make loading of static token maps fast.
       
   674 	 * It's used to ensure that the HTML character reference lookups add a minimal cost
       
   675 	 * to initializing the PHP process.
       
   676 	 *
       
   677 	 * Example:
       
   678 	 *
       
   679 	 *     echo $smilies->precomputed_php_source_table();
       
   680 	 *
       
   681 	 *     // Output.
       
   682 	 *     WP_Token_Map::from_precomputed_table(
       
   683 	 *         array(
       
   684 	 *             "storage_version" => "6.6.0",
       
   685 	 *             "key_length" => 2,
       
   686 	 *             "groups" => "",
       
   687 	 *             "long_words" => array(),
       
   688 	 *             "small_words" => "8O\x00:)\x00:(\x00:?\x00",
       
   689 	 *             "small_mappings" => array( "😯", "πŸ™‚", "πŸ™", "πŸ˜•" )
       
   690 	 *         )
       
   691 	 *     );
       
   692 	 *
       
   693 	 * @since 6.6.0
       
   694 	 *
       
   695 	 * @param string $indent Optional. Use this string for indentation, or rely on the default horizontal tab character. Default "\t".
       
   696 	 * @return string Value which can be pasted into a PHP source file for quick loading of table.
       
   697 	 */
       
   698 	public function precomputed_php_source_table( $indent = "\t" ) {
       
   699 		$i1 = $indent;
       
   700 		$i2 = $i1 . $indent;
       
   701 		$i3 = $i2 . $indent;
       
   702 
       
   703 		$class_version = self::STORAGE_VERSION;
       
   704 
       
   705 		$output  = self::class . "::from_precomputed_table(\n";
       
   706 		$output .= "{$i1}array(\n";
       
   707 		$output .= "{$i2}\"storage_version\" => \"{$class_version}\",\n";
       
   708 		$output .= "{$i2}\"key_length\" => {$this->key_length},\n";
       
   709 
       
   710 		$group_line = str_replace( "\x00", "\\x00", $this->groups );
       
   711 		$output    .= "{$i2}\"groups\" => \"{$group_line}\",\n";
       
   712 
       
   713 		$output .= "{$i2}\"large_words\" => array(\n";
       
   714 
       
   715 		$prefixes = explode( "\x00", $this->groups );
       
   716 		foreach ( $prefixes as $index => $prefix ) {
       
   717 			if ( '' === $prefix ) {
       
   718 				break;
       
   719 			}
       
   720 			$group        = $this->large_words[ $index ];
       
   721 			$group_length = strlen( $group );
       
   722 			$comment_line = "{$i3}//";
       
   723 			$data_line    = "{$i3}\"";
       
   724 			$at           = 0;
       
   725 			while ( $at < $group_length ) {
       
   726 				$token_length   = unpack( 'C', $group[ $at++ ] )[1];
       
   727 				$token          = substr( $group, $at, $token_length );
       
   728 				$at            += $token_length;
       
   729 				$mapping_length = unpack( 'C', $group[ $at++ ] )[1];
       
   730 				$mapping        = substr( $group, $at, $mapping_length );
       
   731 				$at            += $mapping_length;
       
   732 
       
   733 				$token_digits   = str_pad( dechex( $token_length ), 2, '0', STR_PAD_LEFT );
       
   734 				$mapping_digits = str_pad( dechex( $mapping_length ), 2, '0', STR_PAD_LEFT );
       
   735 
       
   736 				$mapping = preg_replace_callback(
       
   737 					"~[\\x00-\\x1f\\x22\\x5c]~",
       
   738 					static function ( $match_result ) {
       
   739 						switch ( $match_result[0] ) {
       
   740 							case '"':
       
   741 								return '\\"';
       
   742 
       
   743 							case '\\':
       
   744 								return '\\\\';
       
   745 
       
   746 							default:
       
   747 								$hex = dechex( ord( $match_result[0] ) );
       
   748 								return "\\x{$hex}";
       
   749 						}
       
   750 					},
       
   751 					$mapping
       
   752 				);
       
   753 
       
   754 				$comment_line .= " {$prefix}{$token}[{$mapping}]";
       
   755 				$data_line    .= "\\x{$token_digits}{$token}\\x{$mapping_digits}{$mapping}";
       
   756 			}
       
   757 			$comment_line .= ".\n";
       
   758 			$data_line    .= "\",\n";
       
   759 
       
   760 			$output .= $comment_line;
       
   761 			$output .= $data_line;
       
   762 		}
       
   763 
       
   764 		$output .= "{$i2}),\n";
       
   765 
       
   766 		$small_words  = array();
       
   767 		$small_length = strlen( $this->small_words );
       
   768 		$at           = 0;
       
   769 		while ( $at < $small_length ) {
       
   770 			$small_words[] = substr( $this->small_words, $at, $this->key_length + 1 );
       
   771 			$at           += $this->key_length + 1;
       
   772 		}
       
   773 
       
   774 		$small_text = str_replace( "\x00", '\x00', implode( '', $small_words ) );
       
   775 		$output    .= "{$i2}\"small_words\" => \"{$small_text}\",\n";
       
   776 
       
   777 		$output .= "{$i2}\"small_mappings\" => array(\n";
       
   778 		foreach ( $this->small_mappings as $mapping ) {
       
   779 			$output .= "{$i3}\"{$mapping}\",\n";
       
   780 		}
       
   781 		$output .= "{$i2})\n";
       
   782 		$output .= "{$i1})\n";
       
   783 		$output .= ')';
       
   784 
       
   785 		return $output;
       
   786 	}
       
   787 
       
   788 	/**
       
   789 	 * Compares two strings, returning the longest, or whichever
       
   790 	 * is first alphabetically if they are the same length.
       
   791 	 *
       
   792 	 * This is an important sort when building the token map because
       
   793 	 * it should not form a match on a substring of a longer potential
       
   794 	 * match. For example, it should not detect `Cap` when matching
       
   795 	 * against the string `CapitalDifferentialD`.
       
   796 	 *
       
   797 	 * @since 6.6.0
       
   798 	 *
       
   799 	 * @param string $a First string to compare.
       
   800 	 * @param string $b Second string to compare.
       
   801 	 * @return int -1 or lower if `$a` is less than `$b`; 1 or greater if `$a` is greater than `$b`, and 0 if they are equal.
       
   802 	 */
       
   803 	private static function longest_first_then_alphabetical( $a, $b ) {
       
   804 		if ( $a === $b ) {
       
   805 			return 0;
       
   806 		}
       
   807 
       
   808 		$length_a = strlen( $a );
       
   809 		$length_b = strlen( $b );
       
   810 
       
   811 		// Longer strings are less-than for comparison's sake.
       
   812 		if ( $length_a !== $length_b ) {
       
   813 			return $length_b - $length_a;
       
   814 		}
       
   815 
       
   816 		return strcmp( $a, $b );
       
   817 	}
       
   818 }