wp/wp-includes/html-api/class-wp-html-doctype-info.php
changeset 22 8c2e4d02f4ef
equal deleted inserted replaced
21:48c4eec2b7e6 22:8c2e4d02f4ef
       
     1 <?php
       
     2 /**
       
     3  * HTML API: WP_HTML_Doctype_Info class
       
     4  *
       
     5  * @package WordPress
       
     6  * @subpackage HTML-API
       
     7  * @since 6.7.0
       
     8  */
       
     9 
       
    10 /**
       
    11  * Core class used by the HTML API to represent a DOCTYPE declaration.
       
    12  *
       
    13  * This class parses DOCTYPE tokens for the full parser in the HTML Processor.
       
    14  * Most code interacting with HTML won't need to parse DOCTYPE declarations;
       
    15  * the HTML Processor is one exception. Consult the HTML Processor for proper
       
    16  * parsing of an HTML document.
       
    17  *
       
    18  * A DOCTYPE declaration may indicate its document compatibility mode, which impacts
       
    19  * the structure of the following HTML as well as the behavior of CSS class selectors.
       
    20  * There are three possible modes:
       
    21  *
       
    22  *  - "no-quirks" and "limited-quirks" modes (also called "standards mode").
       
    23  *  - "quirks" mode.
       
    24  *
       
    25  * These modes mostly determine whether CSS class name selectors match values in the
       
    26  * HTML `class` attribute in an ASCII-case-insensitive way (quirks mode), or whether
       
    27  * they match only when byte-for-byte identical (no-quirks mode).
       
    28  *
       
    29  * All HTML documents should start with the standard HTML5 DOCTYPE: `<!DOCTYPE html>`.
       
    30  *
       
    31  * > DOCTYPEs are required for legacy reasons. When omitted, browsers tend to use a different
       
    32  * > rendering mode that is incompatible with some specifications. Including the DOCTYPE in a
       
    33  * > document ensures that the browser makes a best-effort attempt at following the
       
    34  * > relevant specifications.
       
    35  *
       
    36  * @see https://html.spec.whatwg.org/#the-doctype
       
    37  *
       
    38  * DOCTYPE declarations comprise four properties: a name, public identifier, system identifier,
       
    39  * and an indication of which document compatibility mode they would imply if an HTML parser
       
    40  * hadn't already determined it from other information.
       
    41  *
       
    42  * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
       
    43  *
       
    44  * Historically, the DOCTYPE declaration was used in SGML documents to instruct a parser how
       
    45  * to interpret the various tags and entities within a document. Its role in HTML diverged
       
    46  * from how it was used in SGML and no meaning should be back-read into HTML based on how it
       
    47  * is used in SGML, XML, or XHTML documents.
       
    48  *
       
    49  * @see https://www.iso.org/standard/16387.html
       
    50  *
       
    51  * @since 6.7.0
       
    52  *
       
    53  * @see WP_HTML_Processor
       
    54  */
       
    55 class WP_HTML_Doctype_Info {
       
    56 	/**
       
    57 	 * Name of the DOCTYPE: should be "html" for HTML documents.
       
    58 	 *
       
    59 	 * This value should be considered "read only" and not modified.
       
    60 	 *
       
    61 	 * Historically the DOCTYPE name indicates name of the document's root element.
       
    62 	 *
       
    63 	 *     <!DOCTYPE html>
       
    64 	 *               ╰──┴── name is "html".
       
    65 	 *
       
    66 	 * @see https://html.spec.whatwg.org/#tokenization
       
    67 	 *
       
    68 	 * @since 6.7.0
       
    69 	 *
       
    70 	 * @var string|null
       
    71 	 */
       
    72 	public $name = null;
       
    73 
       
    74 	/**
       
    75 	 * Public identifier of the DOCTYPE.
       
    76 	 *
       
    77 	 * This value should be considered "read only" and not modified.
       
    78 	 *
       
    79 	 * The public identifier is optional and should not appear in HTML documents.
       
    80 	 * A `null` value indicates that no public identifier was present in the DOCTYPE.
       
    81 	 *
       
    82 	 * Historically the presence of the public identifier indicated that a document
       
    83 	 * was meant to be shared between computer systems and the value indicated to a
       
    84 	 * knowledgeable parser how to find the relevant document type definition (DTD).
       
    85 	 *
       
    86 	 *     <!DOCTYPE html PUBLIC "public id goes here in quotes">
       
    87 	 *               │  │         ╰─── public identifier ─────╯
       
    88 	 *               ╰──┴── name is "html".
       
    89 	 *
       
    90 	 * @see https://html.spec.whatwg.org/#tokenization
       
    91 	 *
       
    92 	 * @since 6.7.0
       
    93 	 *
       
    94 	 * @var string|null
       
    95 	 */
       
    96 	public $public_identifier = null;
       
    97 
       
    98 	/**
       
    99 	 * System identifier of the DOCTYPE.
       
   100 	 *
       
   101 	 * This value should be considered "read only" and not modified.
       
   102 	 *
       
   103 	 * The system identifier is optional and should not appear in HTML documents.
       
   104 	 * A `null` value indicates that no system identifier was present in the DOCTYPE.
       
   105 	 *
       
   106 	 * Historically the system identifier specified where a relevant document type
       
   107 	 * declaration for the given document is stored and may be retrieved.
       
   108 	 *
       
   109 	 *     <!DOCTYPE html SYSTEM "system id goes here in quotes">
       
   110 	 *               │  │         ╰──── system identifier ────╯
       
   111 	 *               ╰──┴── name is "html".
       
   112 	 *
       
   113 	 * If a public identifier were provided it would indicate to a knowledgeable
       
   114 	 * parser how to interpret the system identifier.
       
   115 	 *
       
   116 	 *     <!DOCTYPE html PUBLIC "public id goes here in quotes" "system id goes here in quotes">
       
   117 	 *               │  │         ╰─── public identifier ─────╯   ╰──── system identifier ────╯
       
   118 	 *               ╰──┴── name is "html".
       
   119 	 *
       
   120 	 * @see https://html.spec.whatwg.org/#tokenization
       
   121 	 *
       
   122 	 * @since 6.7.0
       
   123 	 *
       
   124 	 * @var string|null
       
   125 	 */
       
   126 	public $system_identifier = null;
       
   127 
       
   128 	/**
       
   129 	 * Which document compatability mode this DOCTYPE declaration indicates.
       
   130 	 *
       
   131 	 * This value should be considered "read only" and not modified.
       
   132 	 *
       
   133 	 * When an HTML parser has not already set the document compatability mode,
       
   134 	 * (e.g. "quirks" or "no-quirks" mode), it will infer if from the properties
       
   135 	 * of the appropriate DOCTYPE declaration, if one exists. The DOCTYPE can
       
   136 	 * indicate one of three possible document compatability modes:
       
   137 	 *
       
   138 	 *  - "no-quirks" and "limited-quirks" modes (also called "standards" mode).
       
   139 	 *  - "quirks" mode (also called `CSS1Compat` mode).
       
   140 	 *
       
   141 	 * An appropriate DOCTYPE is one encountered in the "initial" insertion mode,
       
   142 	 * before the HTML element has been opened and before finding any other
       
   143 	 * DOCTYPE declaration tokens.
       
   144 	 *
       
   145 	 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
       
   146 	 *
       
   147 	 * @since 6.7.0
       
   148 	 *
       
   149 	 * @var string One of "no-quirks", "limited-quirks", or "quirks".
       
   150 	 */
       
   151 	public $indicated_compatability_mode;
       
   152 
       
   153 	/**
       
   154 	 * Constructor.
       
   155 	 *
       
   156 	 * This class should not be instantiated directly.
       
   157 	 * Use the static {@see self::from_doctype_token} method instead.
       
   158 	 *
       
   159 	 * The arguments to this constructor correspond to the "DOCTYPE token"
       
   160 	 * as defined in the HTML specification.
       
   161 	 *
       
   162 	 * > DOCTYPE tokens have a name, a public identifier, a system identifier,
       
   163 	 * > and a force-quirks flag. When a DOCTYPE token is created, its name, public identifier,
       
   164 	 * > and system identifier must be marked as missing (which is a distinct state from the
       
   165 	 * > empty string), and the force-quirks flag must be set to off (its other state is on).
       
   166 	 *
       
   167 	 * @see https://html.spec.whatwg.org/multipage/parsing.html#tokenization
       
   168 	 *
       
   169 	 * @since 6.7.0
       
   170 	 *
       
   171 	 * @param string|null $name              Name of the DOCTYPE.
       
   172 	 * @param string|null $public_identifier Public identifier of the DOCTYPE.
       
   173 	 * @param string|null $system_identifier System identifier of the DOCTYPE.
       
   174 	 * @param bool        $force_quirks_flag Whether the force-quirks flag is set for the token.
       
   175 	 */
       
   176 	private function __construct(
       
   177 		?string $name,
       
   178 		?string $public_identifier,
       
   179 		?string $system_identifier,
       
   180 		bool $force_quirks_flag
       
   181 	) {
       
   182 		$this->name              = $name;
       
   183 		$this->public_identifier = $public_identifier;
       
   184 		$this->system_identifier = $system_identifier;
       
   185 
       
   186 		/*
       
   187 		 * > If the DOCTYPE token matches one of the conditions in the following list,
       
   188 		 * > then set the Document to quirks mode:
       
   189 		 */
       
   190 
       
   191 		/*
       
   192 		 * > The force-quirks flag is set to on.
       
   193 		 */
       
   194 		if ( $force_quirks_flag ) {
       
   195 			$this->indicated_compatability_mode = 'quirks';
       
   196 			return;
       
   197 		}
       
   198 
       
   199 		/*
       
   200 		 * Normative documents will contain the literal `<!DOCTYPE html>` with no
       
   201 		 * public or system identifiers; short-circuit to avoid extra parsing.
       
   202 		 */
       
   203 		if ( 'html' === $name && null === $public_identifier && null === $system_identifier ) {
       
   204 			$this->indicated_compatability_mode = 'no-quirks';
       
   205 			return;
       
   206 		}
       
   207 
       
   208 		/*
       
   209 		 * > The name is not "html".
       
   210 		 *
       
   211 		 * The tokenizer must report the name in lower case even if provided in
       
   212 		 * the document in upper case; thus no conversion is required here.
       
   213 		 */
       
   214 		if ( 'html' !== $name ) {
       
   215 			$this->indicated_compatability_mode = 'quirks';
       
   216 			return;
       
   217 		}
       
   218 
       
   219 		/*
       
   220 		 * Set up some variables to handle the rest of the conditions.
       
   221 		 *
       
   222 		 * > set...the public identifier...to...the empty string if the public identifier was missing.
       
   223 		 * > set...the system identifier...to...the empty string if the system identifier was missing.
       
   224 		 * >
       
   225 		 * > The system identifier and public identifier strings must be compared...
       
   226 		 * > in an ASCII case-insensitive manner.
       
   227 		 * >
       
   228 		 * > A system identifier whose value is the empty string is not considered missing
       
   229 		 * > for the purposes of the conditions above.
       
   230 		 */
       
   231 		$system_identifier_is_missing = null === $system_identifier;
       
   232 		$public_identifier            = null === $public_identifier ? '' : strtolower( $public_identifier );
       
   233 		$system_identifier            = null === $system_identifier ? '' : strtolower( $system_identifier );
       
   234 
       
   235 		/*
       
   236 		 * > The public identifier is set to…
       
   237 		 */
       
   238 		if (
       
   239 			'-//w3o//dtd w3 html strict 3.0//en//' === $public_identifier ||
       
   240 			'-/w3c/dtd html 4.0 transitional/en' === $public_identifier ||
       
   241 			'html' === $public_identifier
       
   242 		) {
       
   243 			$this->indicated_compatability_mode = 'quirks';
       
   244 			return;
       
   245 		}
       
   246 
       
   247 		/*
       
   248 		 * > The system identifier is set to…
       
   249 		 */
       
   250 		if ( 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' === $system_identifier ) {
       
   251 			$this->indicated_compatability_mode = 'quirks';
       
   252 			return;
       
   253 		}
       
   254 
       
   255 		/*
       
   256 		 * All of the following conditions depend on matching the public identifier.
       
   257 		 * If the public identifier is empty, none of the following conditions will match.
       
   258 		 */
       
   259 		if ( '' === $public_identifier ) {
       
   260 			$this->indicated_compatability_mode = 'no-quirks';
       
   261 			return;
       
   262 		}
       
   263 
       
   264 		/*
       
   265 		 * > The public identifier starts with…
       
   266 		 *
       
   267 		 * @todo Optimize this matching. It shouldn't be a large overall performance issue,
       
   268 		 *       however, as only a single DOCTYPE declaration token should ever be parsed,
       
   269 		 *       and normative documents will have exited before reaching this condition.
       
   270 		 */
       
   271 		if (
       
   272 			str_starts_with( $public_identifier, '+//silmaril//dtd html pro v0r11 19970101//' ) ||
       
   273 			str_starts_with( $public_identifier, '-//as//dtd html 3.0 aswedit + extensions//' ) ||
       
   274 			str_starts_with( $public_identifier, '-//advasoft ltd//dtd html 3.0 aswedit + extensions//' ) ||
       
   275 			str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 level 1//' ) ||
       
   276 			str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 level 2//' ) ||
       
   277 			str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict level 1//' ) ||
       
   278 			str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict level 2//' ) ||
       
   279 			str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict//' ) ||
       
   280 			str_starts_with( $public_identifier, '-//ietf//dtd html 2.0//' ) ||
       
   281 			str_starts_with( $public_identifier, '-//ietf//dtd html 2.1e//' ) ||
       
   282 			str_starts_with( $public_identifier, '-//ietf//dtd html 3.0//' ) ||
       
   283 			str_starts_with( $public_identifier, '-//ietf//dtd html 3.2 final//' ) ||
       
   284 			str_starts_with( $public_identifier, '-//ietf//dtd html 3.2//' ) ||
       
   285 			str_starts_with( $public_identifier, '-//ietf//dtd html 3//' ) ||
       
   286 			str_starts_with( $public_identifier, '-//ietf//dtd html level 0//' ) ||
       
   287 			str_starts_with( $public_identifier, '-//ietf//dtd html level 1//' ) ||
       
   288 			str_starts_with( $public_identifier, '-//ietf//dtd html level 2//' ) ||
       
   289 			str_starts_with( $public_identifier, '-//ietf//dtd html level 3//' ) ||
       
   290 			str_starts_with( $public_identifier, '-//ietf//dtd html strict level 0//' ) ||
       
   291 			str_starts_with( $public_identifier, '-//ietf//dtd html strict level 1//' ) ||
       
   292 			str_starts_with( $public_identifier, '-//ietf//dtd html strict level 2//' ) ||
       
   293 			str_starts_with( $public_identifier, '-//ietf//dtd html strict level 3//' ) ||
       
   294 			str_starts_with( $public_identifier, '-//ietf//dtd html strict//' ) ||
       
   295 			str_starts_with( $public_identifier, '-//ietf//dtd html//' ) ||
       
   296 			str_starts_with( $public_identifier, '-//metrius//dtd metrius presentational//' ) ||
       
   297 			str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 html strict//' ) ||
       
   298 			str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 html//' ) ||
       
   299 			str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 tables//' ) ||
       
   300 			str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 html strict//' ) ||
       
   301 			str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 html//' ) ||
       
   302 			str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 tables//' ) ||
       
   303 			str_starts_with( $public_identifier, '-//netscape comm. corp.//dtd html//' ) ||
       
   304 			str_starts_with( $public_identifier, '-//netscape comm. corp.//dtd strict html//' ) ||
       
   305 			str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html 2.0//" ) ||
       
   306 			str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html extended 1.0//" ) ||
       
   307 			str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html extended relaxed 1.0//" ) ||
       
   308 			str_starts_with( $public_identifier, '-//sq//dtd html 2.0 hotmetal + extensions//' ) ||
       
   309 			str_starts_with( $public_identifier, '-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//' ) ||
       
   310 			str_starts_with( $public_identifier, '-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//' ) ||
       
   311 			str_starts_with( $public_identifier, '-//spyglass//dtd html 2.0 extended//' ) ||
       
   312 			str_starts_with( $public_identifier, '-//sun microsystems corp.//dtd hotjava html//' ) ||
       
   313 			str_starts_with( $public_identifier, '-//sun microsystems corp.//dtd hotjava strict html//' ) ||
       
   314 			str_starts_with( $public_identifier, '-//w3c//dtd html 3 1995-03-24//' ) ||
       
   315 			str_starts_with( $public_identifier, '-//w3c//dtd html 3.2 draft//' ) ||
       
   316 			str_starts_with( $public_identifier, '-//w3c//dtd html 3.2 final//' ) ||
       
   317 			str_starts_with( $public_identifier, '-//w3c//dtd html 3.2//' ) ||
       
   318 			str_starts_with( $public_identifier, '-//w3c//dtd html 3.2s draft//' ) ||
       
   319 			str_starts_with( $public_identifier, '-//w3c//dtd html 4.0 frameset//' ) ||
       
   320 			str_starts_with( $public_identifier, '-//w3c//dtd html 4.0 transitional//' ) ||
       
   321 			str_starts_with( $public_identifier, '-//w3c//dtd html experimental 19960712//' ) ||
       
   322 			str_starts_with( $public_identifier, '-//w3c//dtd html experimental 970421//' ) ||
       
   323 			str_starts_with( $public_identifier, '-//w3c//dtd w3 html//' ) ||
       
   324 			str_starts_with( $public_identifier, '-//w3o//dtd w3 html 3.0//' ) ||
       
   325 			str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html 2.0//' ) ||
       
   326 			str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html//' )
       
   327 		) {
       
   328 			$this->indicated_compatability_mode = 'quirks';
       
   329 			return;
       
   330 		}
       
   331 
       
   332 		/*
       
   333 		 * > The system identifier is missing and the public identifier starts with…
       
   334 		 */
       
   335 		if (
       
   336 			$system_identifier_is_missing && (
       
   337 				str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) ||
       
   338 				str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
       
   339 			)
       
   340 		) {
       
   341 			$this->indicated_compatability_mode = 'quirks';
       
   342 			return;
       
   343 		}
       
   344 
       
   345 		/*
       
   346 		 * > Otherwise, if the DOCTYPE token matches one of the conditions in
       
   347 		 * > the following list, then set the Document to limited-quirks mode.
       
   348 		 */
       
   349 
       
   350 		/*
       
   351 		 * > The public identifier starts with…
       
   352 		 */
       
   353 		if (
       
   354 			str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 frameset//' ) ||
       
   355 			str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 transitional//' )
       
   356 		) {
       
   357 			$this->indicated_compatability_mode = 'limited-quirks';
       
   358 			return;
       
   359 		}
       
   360 
       
   361 		/*
       
   362 		 * > The system identifier is not missing and the public identifier starts with…
       
   363 		 */
       
   364 		if (
       
   365 			! $system_identifier_is_missing && (
       
   366 				str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) ||
       
   367 				str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
       
   368 			)
       
   369 		) {
       
   370 			$this->indicated_compatability_mode = 'limited-quirks';
       
   371 			return;
       
   372 		}
       
   373 
       
   374 		$this->indicated_compatability_mode = 'no-quirks';
       
   375 	}
       
   376 
       
   377 	/**
       
   378 	 * Creates a WP_HTML_Doctype_Info instance by parsing a raw DOCTYPE declaration token.
       
   379 	 *
       
   380 	 * Use this method to parse a DOCTYPE declaration token and get access to its properties
       
   381 	 * via the returned WP_HTML_Doctype_Info class instance. The provided input must parse
       
   382 	 * properly as a DOCTYPE declaration, though it must not represent a valid DOCTYPE.
       
   383 	 *
       
   384 	 * Example:
       
   385 	 *
       
   386 	 *     // Normative HTML DOCTYPE declaration.
       
   387 	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE html>' );
       
   388 	 *     'no-quirks' === $doctype->indicated_compatability_mode;
       
   389 	 *
       
   390 	 *     // A nonsensical DOCTYPE is still valid, and will indicate "quirks" mode.
       
   391 	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!doctypeJSON SILLY "nonsense\'>' );
       
   392 	 *     'quirks' === $doctype->indicated_compatability_mode;
       
   393 	 *
       
   394 	 *     // Textual quirks present in raw HTML are handled appropriately.
       
   395 	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( "<!DOCTYPE\nhtml\n>" );
       
   396 	 *     'no-quirks' === $doctype->indicated_compatability_mode;
       
   397 	 *
       
   398 	 *     // Anything other than a proper DOCTYPE declaration token fails to parse.
       
   399 	 *     null === WP_HTML_Doctype_Info::from_doctype_token( ' <!DOCTYPE>' );
       
   400 	 *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE ><p>' );
       
   401 	 *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!TYPEDOC>' );
       
   402 	 *     null === WP_HTML_Doctype_Info::from_doctype_token( 'html' );
       
   403 	 *     null === WP_HTML_Doctype_Info::from_doctype_token( '<?xml version="1.0" encoding="UTF-8" ?>' );
       
   404 	 *
       
   405 	 * @since 6.7.0
       
   406 	 *
       
   407 	 * @param string $doctype_html The complete raw DOCTYPE HTML string, e.g. `<!DOCTYPE html>`.
       
   408 	 *
       
   409 	 * @return WP_HTML_Doctype_Info|null A WP_HTML_Doctype_Info instance will be returned if the
       
   410 	 *                                   provided DOCTYPE HTML is a valid DOCTYPE. Otherwise, null.
       
   411 	 */
       
   412 	public static function from_doctype_token( string $doctype_html ): ?self {
       
   413 		$doctype_name      = null;
       
   414 		$doctype_public_id = null;
       
   415 		$doctype_system_id = null;
       
   416 
       
   417 		$end = strlen( $doctype_html ) - 1;
       
   418 
       
   419 		/*
       
   420 		 * This parser combines the rules for parsing DOCTYPE tokens found in the HTML
       
   421 		 * specification for the DOCTYPE related tokenizer states.
       
   422 		 *
       
   423 		 * @see https://html.spec.whatwg.org/#doctype-state
       
   424 		 */
       
   425 
       
   426 		/*
       
   427 		 * - Valid DOCTYPE HTML token must be at least `<!DOCTYPE>` assuming a complete token not
       
   428 		 *   ending in end-of-file.
       
   429 		 * - It must start with an ASCII case-insensitive match for `<!DOCTYPE`.
       
   430 		 * - The only occurrence of `>` must be the final byte in the HTML string.
       
   431 		 */
       
   432 		if (
       
   433 			$end < 9 ||
       
   434 			0 !== substr_compare( $doctype_html, '<!DOCTYPE', 0, 9, true )
       
   435 		) {
       
   436 			return null;
       
   437 		}
       
   438 
       
   439 		$at = 9;
       
   440 		// Is there one and only one `>`?
       
   441 		if ( '>' !== $doctype_html[ $end ] || ( strcspn( $doctype_html, '>', $at ) + $at ) < $end ) {
       
   442 			return null;
       
   443 		}
       
   444 
       
   445 		/*
       
   446 		 * Perform newline normalization and ensure the $end value is correct after normalization.
       
   447 		 *
       
   448 		 * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
       
   449 		 * @see https://infra.spec.whatwg.org/#normalize-newlines
       
   450 		 */
       
   451 		$doctype_html = str_replace( "\r\n", "\n", $doctype_html );
       
   452 		$doctype_html = str_replace( "\r", "\n", $doctype_html );
       
   453 		$end          = strlen( $doctype_html ) - 1;
       
   454 
       
   455 		/*
       
   456 		 * In this state, the doctype token has been found and its "content" optionally including the
       
   457 		 * name, public identifier, and system identifier is between the current position and the end.
       
   458 		 *
       
   459 		 *     "<!DOCTYPE...declaration...>"
       
   460 		 *               ╰─ $at           ╰─ $end
       
   461 		 *
       
   462 		 * It's also possible that the declaration part is empty.
       
   463 		 *
       
   464 		 *               ╭─ $at
       
   465 		 *     "<!DOCTYPE>"
       
   466 		 *               ╰─ $end
       
   467 		 *
       
   468 		 * Rules for parsing ">" which terminates the DOCTYPE do not need to be considered as they
       
   469 		 * have been handled above in the condition that the provided DOCTYPE HTML must contain
       
   470 		 * exactly one ">" character in the final position.
       
   471 		 */
       
   472 
       
   473 		/*
       
   474 		 *
       
   475 		 * Parsing effectively begins in "Before DOCTYPE name state". Ignore whitespace and
       
   476 		 * proceed to the next state.
       
   477 		 *
       
   478 		 * @see https://html.spec.whatwg.org/#before-doctype-name-state
       
   479 		 */
       
   480 		$at += strspn( $doctype_html, " \t\n\f\r", $at );
       
   481 
       
   482 		if ( $at >= $end ) {
       
   483 			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
       
   484 		}
       
   485 
       
   486 		$name_length  = strcspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
       
   487 		$doctype_name = str_replace( "\0", "\u{FFFD}", strtolower( substr( $doctype_html, $at, $name_length ) ) );
       
   488 
       
   489 		$at += $name_length;
       
   490 		$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
       
   491 		if ( $at >= $end ) {
       
   492 			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
       
   493 		}
       
   494 
       
   495 		/*
       
   496 		 * "After DOCTYPE name state"
       
   497 		 *
       
   498 		 * Find a case-insensitive match for "PUBLIC" or "SYSTEM" at this point.
       
   499 		 * Otherwise, set force-quirks and enter bogus DOCTYPE state (skip the rest of the doctype).
       
   500 		 *
       
   501 		 * @see https://html.spec.whatwg.org/#after-doctype-name-state
       
   502 		 */
       
   503 		if ( $at + 6 >= $end ) {
       
   504 			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
       
   505 		}
       
   506 
       
   507 		/*
       
   508 		 * > If the six characters starting from the current input character are an ASCII
       
   509 		 * > case-insensitive match for the word "PUBLIC", then consume those characters
       
   510 		 * > and switch to the after DOCTYPE public keyword state.
       
   511 		 */
       
   512 		if ( 0 === substr_compare( $doctype_html, 'PUBLIC', $at, 6, true ) ) {
       
   513 			$at += 6;
       
   514 			$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
       
   515 			if ( $at >= $end ) {
       
   516 				return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
       
   517 			}
       
   518 			goto parse_doctype_public_identifier;
       
   519 		}
       
   520 
       
   521 		/*
       
   522 		 * > Otherwise, if the six characters starting from the current input character are an ASCII
       
   523 		 * > case-insensitive match for the word "SYSTEM", then consume those characters and switch
       
   524 		 * > to the after DOCTYPE system keyword state.
       
   525 		 */
       
   526 		if ( 0 === substr_compare( $doctype_html, 'SYSTEM', $at, 6, true ) ) {
       
   527 			$at += 6;
       
   528 			$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
       
   529 			if ( $at >= $end ) {
       
   530 				return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
       
   531 			}
       
   532 			goto parse_doctype_system_identifier;
       
   533 		}
       
   534 
       
   535 		/*
       
   536 		 * > Otherwise, this is an invalid-character-sequence-after-doctype-name parse error.
       
   537 		 * > Set the current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus
       
   538 		 * > DOCTYPE state.
       
   539 		 */
       
   540 		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
       
   541 
       
   542 		parse_doctype_public_identifier:
       
   543 		/*
       
   544 		 * The parser should enter "DOCTYPE public identifier (double-quoted) state" or
       
   545 		 * "DOCTYPE public identifier (single-quoted) state" by finding one of the valid quotes.
       
   546 		 * Anything else forces quirks mode and ignores the rest of the contents.
       
   547 		 *
       
   548 		 * @see https://html.spec.whatwg.org/#doctype-public-identifier-(double-quoted)-state
       
   549 		 * @see https://html.spec.whatwg.org/#doctype-public-identifier-(single-quoted)-state
       
   550 		 */
       
   551 		$closer_quote = $doctype_html[ $at ];
       
   552 
       
   553 		/*
       
   554 		 * > This is a missing-quote-before-doctype-public-identifier parse error. Set the
       
   555 		 * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
       
   556 		 */
       
   557 		if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
       
   558 			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
       
   559 		}
       
   560 
       
   561 		++$at;
       
   562 
       
   563 		$identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
       
   564 		$doctype_public_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );
       
   565 
       
   566 		$at += $identifier_length;
       
   567 		if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
       
   568 			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
       
   569 		}
       
   570 
       
   571 		++$at;
       
   572 
       
   573 		/*
       
   574 		 * "Between DOCTYPE public and system identifiers state"
       
   575 		 *
       
   576 		 * Advance through whitespace between public and system identifiers.
       
   577 		 *
       
   578 		 * @see https://html.spec.whatwg.org/#between-doctype-public-and-system-identifiers-state
       
   579 		 */
       
   580 		$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
       
   581 		if ( $at >= $end ) {
       
   582 			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
       
   583 		}
       
   584 
       
   585 		parse_doctype_system_identifier:
       
   586 		/*
       
   587 		 * The parser should enter "DOCTYPE system identifier (double-quoted) state" or
       
   588 		 * "DOCTYPE system identifier (single-quoted) state" by finding one of the valid quotes.
       
   589 		 * Anything else forces quirks mode and ignores the rest of the contents.
       
   590 		 *
       
   591 		 * @see https://html.spec.whatwg.org/#doctype-system-identifier-(double-quoted)-state
       
   592 		 * @see https://html.spec.whatwg.org/#doctype-system-identifier-(single-quoted)-state
       
   593 		 */
       
   594 		$closer_quote = $doctype_html[ $at ];
       
   595 
       
   596 		/*
       
   597 		 * > This is a missing-quote-before-doctype-system-identifier parse error. Set the
       
   598 		 * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
       
   599 		 */
       
   600 		if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
       
   601 			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
       
   602 		}
       
   603 
       
   604 		++$at;
       
   605 
       
   606 		$identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
       
   607 		$doctype_system_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );
       
   608 
       
   609 		$at += $identifier_length;
       
   610 		if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
       
   611 			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
       
   612 		}
       
   613 
       
   614 		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
       
   615 	}
       
   616 }