|
1 <?php |
|
2 /** |
|
3 * HTML API: WP_HTML_Doctype_Info class |
|
4 * |
|
5 * @package WordPress |
|
6 * @subpackage HTML-API |
|
7 * @since 6.7.0 |
|
8 */ |
|
9 |
|
10 /** |
|
11 * Core class used by the HTML API to represent a DOCTYPE declaration. |
|
12 * |
|
13 * This class parses DOCTYPE tokens for the full parser in the HTML Processor. |
|
14 * Most code interacting with HTML won't need to parse DOCTYPE declarations; |
|
15 * the HTML Processor is one exception. Consult the HTML Processor for proper |
|
16 * parsing of an HTML document. |
|
17 * |
|
18 * A DOCTYPE declaration may indicate its document compatibility mode, which impacts |
|
19 * the structure of the following HTML as well as the behavior of CSS class selectors. |
|
20 * There are three possible modes: |
|
21 * |
|
22 * - "no-quirks" and "limited-quirks" modes (also called "standards mode"). |
|
23 * - "quirks" mode. |
|
24 * |
|
25 * These modes mostly determine whether CSS class name selectors match values in the |
|
26 * HTML `class` attribute in an ASCII-case-insensitive way (quirks mode), or whether |
|
27 * they match only when byte-for-byte identical (no-quirks mode). |
|
28 * |
|
29 * All HTML documents should start with the standard HTML5 DOCTYPE: `<!DOCTYPE html>`. |
|
30 * |
|
31 * > DOCTYPEs are required for legacy reasons. When omitted, browsers tend to use a different |
|
32 * > rendering mode that is incompatible with some specifications. Including the DOCTYPE in a |
|
33 * > document ensures that the browser makes a best-effort attempt at following the |
|
34 * > relevant specifications. |
|
35 * |
|
36 * @see https://html.spec.whatwg.org/#the-doctype |
|
37 * |
|
38 * DOCTYPE declarations comprise four properties: a name, public identifier, system identifier, |
|
39 * and an indication of which document compatibility mode they would imply if an HTML parser |
|
40 * hadn't already determined it from other information. |
|
41 * |
|
42 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode |
|
43 * |
|
44 * Historically, the DOCTYPE declaration was used in SGML documents to instruct a parser how |
|
45 * to interpret the various tags and entities within a document. Its role in HTML diverged |
|
46 * from how it was used in SGML and no meaning should be back-read into HTML based on how it |
|
47 * is used in SGML, XML, or XHTML documents. |
|
48 * |
|
49 * @see https://www.iso.org/standard/16387.html |
|
50 * |
|
51 * @since 6.7.0 |
|
52 * |
|
53 * @see WP_HTML_Processor |
|
54 */ |
|
class WP_HTML_Doctype_Info {
	/**
	 * Lower-cased public identifier prefixes which indicate "quirks" mode.
	 *
	 * A DOCTYPE whose name is "html" and whose ASCII-lower-cased public identifier
	 * starts with any of these values indicates "quirks" mode, regardless of
	 * its system identifier.
	 *
	 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
	 *
	 * @var string[]
	 */
	private const QUIRKS_PUBLIC_IDENTIFIER_PREFIXES = array(
		'+//silmaril//dtd html pro v0r11 19970101//',
		'-//as//dtd html 3.0 aswedit + extensions//',
		'-//advasoft ltd//dtd html 3.0 aswedit + extensions//',
		'-//ietf//dtd html 2.0 level 1//',
		'-//ietf//dtd html 2.0 level 2//',
		'-//ietf//dtd html 2.0 strict level 1//',
		'-//ietf//dtd html 2.0 strict level 2//',
		'-//ietf//dtd html 2.0 strict//',
		'-//ietf//dtd html 2.0//',
		'-//ietf//dtd html 2.1e//',
		'-//ietf//dtd html 3.0//',
		'-//ietf//dtd html 3.2 final//',
		'-//ietf//dtd html 3.2//',
		'-//ietf//dtd html 3//',
		'-//ietf//dtd html level 0//',
		'-//ietf//dtd html level 1//',
		'-//ietf//dtd html level 2//',
		'-//ietf//dtd html level 3//',
		'-//ietf//dtd html strict level 0//',
		'-//ietf//dtd html strict level 1//',
		'-//ietf//dtd html strict level 2//',
		'-//ietf//dtd html strict level 3//',
		'-//ietf//dtd html strict//',
		'-//ietf//dtd html//',
		'-//metrius//dtd metrius presentational//',
		'-//microsoft//dtd internet explorer 2.0 html strict//',
		'-//microsoft//dtd internet explorer 2.0 html//',
		'-//microsoft//dtd internet explorer 2.0 tables//',
		'-//microsoft//dtd internet explorer 3.0 html strict//',
		'-//microsoft//dtd internet explorer 3.0 html//',
		'-//microsoft//dtd internet explorer 3.0 tables//',
		'-//netscape comm. corp.//dtd html//',
		'-//netscape comm. corp.//dtd strict html//',
		"-//o'reilly and associates//dtd html 2.0//",
		"-//o'reilly and associates//dtd html extended 1.0//",
		"-//o'reilly and associates//dtd html extended relaxed 1.0//",
		'-//sq//dtd html 2.0 hotmetal + extensions//',
		'-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//',
		'-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//',
		'-//spyglass//dtd html 2.0 extended//',
		'-//sun microsystems corp.//dtd hotjava html//',
		'-//sun microsystems corp.//dtd hotjava strict html//',
		'-//w3c//dtd html 3 1995-03-24//',
		'-//w3c//dtd html 3.2 draft//',
		'-//w3c//dtd html 3.2 final//',
		'-//w3c//dtd html 3.2//',
		'-//w3c//dtd html 3.2s draft//',
		'-//w3c//dtd html 4.0 frameset//',
		'-//w3c//dtd html 4.0 transitional//',
		'-//w3c//dtd html experimental 19960712//',
		'-//w3c//dtd html experimental 970421//',
		'-//w3c//dtd w3 html//',
		'-//w3o//dtd w3 html 3.0//',
		'-//webtechs//dtd mozilla html 2.0//',
		'-//webtechs//dtd mozilla html//',
	);

	/**
	 * Name of the DOCTYPE: should be "html" for HTML documents.
	 *
	 * This value should be considered "read only" and not modified.
	 *
	 * Historically the DOCTYPE name indicates name of the document's root element.
	 *
	 *     <!DOCTYPE html>
	 *               ╰──┴── name is "html".
	 *
	 * @see https://html.spec.whatwg.org/#tokenization
	 *
	 * @since 6.7.0
	 *
	 * @var string|null
	 */
	public $name = null;

	/**
	 * Public identifier of the DOCTYPE.
	 *
	 * This value should be considered "read only" and not modified.
	 *
	 * The public identifier is optional and should not appear in HTML documents.
	 * A `null` value indicates that no public identifier was present in the DOCTYPE.
	 *
	 * Historically the presence of the public identifier indicated that a document
	 * was meant to be shared between computer systems and the value indicated to a
	 * knowledgeable parser how to find the relevant document type definition (DTD).
	 *
	 *     <!DOCTYPE html PUBLIC "public id goes here in quotes">
	 *               │  │         ╰─── public identifier ─────╯
	 *               ╰──┴── name is "html".
	 *
	 * @see https://html.spec.whatwg.org/#tokenization
	 *
	 * @since 6.7.0
	 *
	 * @var string|null
	 */
	public $public_identifier = null;

	/**
	 * System identifier of the DOCTYPE.
	 *
	 * This value should be considered "read only" and not modified.
	 *
	 * The system identifier is optional and should not appear in HTML documents.
	 * A `null` value indicates that no system identifier was present in the DOCTYPE.
	 *
	 * Historically the system identifier specified where a relevant document type
	 * declaration for the given document is stored and may be retrieved.
	 *
	 *     <!DOCTYPE html SYSTEM "system id goes here in quotes">
	 *               │  │         ╰──── system identifier ────╯
	 *               ╰──┴── name is "html".
	 *
	 * If a public identifier were provided it would indicate to a knowledgeable
	 * parser how to interpret the system identifier.
	 *
	 *     <!DOCTYPE html PUBLIC "public id goes here in quotes" "system id goes here in quotes">
	 *               │  │         ╰─── public identifier ─────╯   ╰──── system identifier ────╯
	 *               ╰──┴── name is "html".
	 *
	 * @see https://html.spec.whatwg.org/#tokenization
	 *
	 * @since 6.7.0
	 *
	 * @var string|null
	 */
	public $system_identifier = null;

	/**
	 * Which document compatibility mode this DOCTYPE declaration indicates.
	 *
	 * This value should be considered "read only" and not modified.
	 *
	 * When an HTML parser has not already set the document compatibility mode,
	 * (e.g. "quirks" or "no-quirks" mode), it will infer it from the properties
	 * of the appropriate DOCTYPE declaration, if one exists. The DOCTYPE can
	 * indicate one of three possible document compatibility modes:
	 *
	 *  - "no-quirks" and "limited-quirks" modes (also called "standards" mode;
	 *    reported by `document.compatMode` as `CSS1Compat`).
	 *  - "quirks" mode (reported by `document.compatMode` as `BackCompat`).
	 *
	 * An appropriate DOCTYPE is one encountered in the "initial" insertion mode,
	 * before the HTML element has been opened and before finding any other
	 * DOCTYPE declaration tokens.
	 *
	 * Note that the property name is spelled `compatability`; the spelling is
	 * part of the public interface and is kept as-is for existing callers.
	 *
	 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
	 *
	 * @since 6.7.0
	 *
	 * @var string One of "no-quirks", "limited-quirks", or "quirks".
	 */
	public $indicated_compatability_mode;

	/**
	 * Constructor.
	 *
	 * This class should not be instantiated directly.
	 * Use the static {@see self::from_doctype_token} method instead.
	 *
	 * The arguments to this constructor correspond to the "DOCTYPE token"
	 * as defined in the HTML specification.
	 *
	 * > DOCTYPE tokens have a name, a public identifier, a system identifier,
	 * > and a force-quirks flag. When a DOCTYPE token is created, its name, public identifier,
	 * > and system identifier must be marked as missing (which is a distinct state from the
	 * > empty string), and the force-quirks flag must be set to off (its other state is on).
	 *
	 * The identifiers are stored exactly as given; only the local copies used to
	 * determine the indicated compatibility mode are lower-cased.
	 *
	 * @see https://html.spec.whatwg.org/multipage/parsing.html#tokenization
	 *
	 * @since 6.7.0
	 *
	 * @param string|null $name              Name of the DOCTYPE.
	 * @param string|null $public_identifier Public identifier of the DOCTYPE.
	 * @param string|null $system_identifier System identifier of the DOCTYPE.
	 * @param bool        $force_quirks_flag Whether the force-quirks flag is set for the token.
	 */
	private function __construct(
		?string $name,
		?string $public_identifier,
		?string $system_identifier,
		bool $force_quirks_flag
	) {
		$this->name              = $name;
		$this->public_identifier = $public_identifier;
		$this->system_identifier = $system_identifier;

		/*
		 * > If the DOCTYPE token matches one of the conditions in the following list,
		 * > then set the Document to quirks mode:
		 */

		/*
		 * > The force-quirks flag is set to on.
		 */
		if ( $force_quirks_flag ) {
			$this->indicated_compatability_mode = 'quirks';
			return;
		}

		/*
		 * Normative documents will contain the literal `<!DOCTYPE html>` with no
		 * public or system identifiers; short-circuit to avoid extra parsing.
		 */
		if ( 'html' === $name && null === $public_identifier && null === $system_identifier ) {
			$this->indicated_compatability_mode = 'no-quirks';
			return;
		}

		/*
		 * > The name is not "html".
		 *
		 * The tokenizer must report the name in lower case even if provided in
		 * the document in upper case; thus no conversion is required here.
		 */
		if ( 'html' !== $name ) {
			$this->indicated_compatability_mode = 'quirks';
			return;
		}

		/*
		 * Set up some variables to handle the rest of the conditions.
		 *
		 * > set...the public identifier...to...the empty string if the public identifier was missing.
		 * > set...the system identifier...to...the empty string if the system identifier was missing.
		 * >
		 * > The system identifier and public identifier strings must be compared...
		 * > in an ASCII case-insensitive manner.
		 * >
		 * > A system identifier whose value is the empty string is not considered missing
		 * > for the purposes of the conditions above.
		 *
		 * The lower-casing is restricted to ASCII letters so that the comparison
		 * cannot be affected by the process locale.
		 */
		$system_identifier_is_missing = null === $system_identifier;
		$public_identifier            = null === $public_identifier ? '' : self::ascii_lowercase( $public_identifier );
		$system_identifier            = null === $system_identifier ? '' : self::ascii_lowercase( $system_identifier );

		/*
		 * > The public identifier is set to…
		 */
		if (
			'-//w3o//dtd w3 html strict 3.0//en//' === $public_identifier ||
			'-/w3c/dtd html 4.0 transitional/en' === $public_identifier ||
			'html' === $public_identifier
		) {
			$this->indicated_compatability_mode = 'quirks';
			return;
		}

		/*
		 * > The system identifier is set to…
		 */
		if ( 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' === $system_identifier ) {
			$this->indicated_compatability_mode = 'quirks';
			return;
		}

		/*
		 * All of the following conditions depend on matching the public identifier.
		 * If the public identifier is empty, none of the following conditions will match.
		 */
		if ( '' === $public_identifier ) {
			$this->indicated_compatability_mode = 'no-quirks';
			return;
		}

		/*
		 * > The public identifier starts with…
		 *
		 * @todo Optimize this matching. It shouldn't be a large overall performance issue,
		 *       however, as only a single DOCTYPE declaration token should ever be parsed,
		 *       and normative documents will have exited before reaching this condition.
		 */
		foreach ( self::QUIRKS_PUBLIC_IDENTIFIER_PREFIXES as $quirks_prefix ) {
			if ( str_starts_with( $public_identifier, $quirks_prefix ) ) {
				$this->indicated_compatability_mode = 'quirks';
				return;
			}
		}

		/*
		 * The HTML 4.01 Frameset and Transitional public identifiers indicate a
		 * different mode depending on whether a system identifier is present.
		 */
		$is_html_4_01_loose = (
			str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) ||
			str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
		);

		/*
		 * > The system identifier is missing and the public identifier starts with…
		 */
		if ( $system_identifier_is_missing && $is_html_4_01_loose ) {
			$this->indicated_compatability_mode = 'quirks';
			return;
		}

		/*
		 * > Otherwise, if the DOCTYPE token matches one of the conditions in
		 * > the following list, then set the Document to limited-quirks mode.
		 */

		/*
		 * > The public identifier starts with…
		 */
		if (
			str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 frameset//' ) ||
			str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 transitional//' )
		) {
			$this->indicated_compatability_mode = 'limited-quirks';
			return;
		}

		/*
		 * > The system identifier is not missing and the public identifier starts with…
		 */
		if ( ! $system_identifier_is_missing && $is_html_4_01_loose ) {
			$this->indicated_compatability_mode = 'limited-quirks';
			return;
		}

		$this->indicated_compatability_mode = 'no-quirks';
	}

	/**
	 * Converts the ASCII upper-case letters in a string to lower case.
	 *
	 * Only the bytes for `A` through `Z` are changed; every other byte is left
	 * untouched. Unlike `strtolower()` on PHP versions before 8.2, the result
	 * does not depend on the current locale, which is what the HTML specification
	 * means by "ASCII case-insensitive" and "ASCII lowercase".
	 *
	 * @param string $text Text to convert.
	 * @return string Text with ASCII upper-case letters replaced by their lower-case form.
	 */
	private static function ascii_lowercase( string $text ): string {
		return strtr( $text, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz' );
	}

	/**
	 * Creates a WP_HTML_Doctype_Info instance by parsing a raw DOCTYPE declaration token.
	 *
	 * Use this method to parse a DOCTYPE declaration token and get access to its properties
	 * via the returned WP_HTML_Doctype_Info class instance. The provided input must parse
	 * properly as a DOCTYPE declaration, though it need not represent a valid DOCTYPE.
	 *
	 * Example:
	 *
	 *     // Normative HTML DOCTYPE declaration.
	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE html>' );
	 *     'no-quirks' === $doctype->indicated_compatability_mode;
	 *
	 *     // A nonsensical DOCTYPE is still valid, and will indicate "quirks" mode.
	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!doctypeJSON SILLY "nonsense\'>' );
	 *     'quirks' === $doctype->indicated_compatability_mode;
	 *
	 *     // Textual quirks present in raw HTML are handled appropriately.
	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( "<!DOCTYPE\nhtml\n>" );
	 *     'no-quirks' === $doctype->indicated_compatability_mode;
	 *
	 *     // Anything other than a proper DOCTYPE declaration token fails to parse.
	 *     null === WP_HTML_Doctype_Info::from_doctype_token( ' <!DOCTYPE>' );
	 *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE ><p>' );
	 *     null === WP_HTML_Doctype_Info::from_doctype_token( '<!TYPEDOC>' );
	 *     null === WP_HTML_Doctype_Info::from_doctype_token( 'html' );
	 *     null === WP_HTML_Doctype_Info::from_doctype_token( '<?xml version="1.0" encoding="UTF-8" ?>' );
	 *
	 * @since 6.7.0
	 *
	 * @param string $doctype_html The complete raw DOCTYPE HTML string, e.g. `<!DOCTYPE html>`.
	 *
	 * @return WP_HTML_Doctype_Info|null A WP_HTML_Doctype_Info instance will be returned if the
	 *                                   provided DOCTYPE HTML parses as a DOCTYPE token. Otherwise, null.
	 */
	public static function from_doctype_token( string $doctype_html ): ?self {
		$doctype_name      = null;
		$doctype_public_id = null;
		$doctype_system_id = null;

		$end = strlen( $doctype_html ) - 1;

		/*
		 * This parser combines the rules for parsing DOCTYPE tokens found in the HTML
		 * specification for the DOCTYPE related tokenizer states.
		 *
		 * @see https://html.spec.whatwg.org/#doctype-state
		 */

		/*
		 * - Valid DOCTYPE HTML token must be at least `<!DOCTYPE>` assuming a complete token not
		 *   ending in end-of-file.
		 * - It must start with an ASCII case-insensitive match for `<!DOCTYPE`.
		 * - The only occurrence of `>` must be the final byte in the HTML string.
		 */
		if (
			$end < 9 ||
			0 !== substr_compare( $doctype_html, '<!DOCTYPE', 0, 9, true )
		) {
			return null;
		}

		$at = 9;
		// Is there one and only one `>`?
		if ( '>' !== $doctype_html[ $end ] || ( strcspn( $doctype_html, '>', $at ) + $at ) < $end ) {
			return null;
		}

		/*
		 * Perform newline normalization and ensure the $end value is correct after normalization.
		 *
		 * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
		 * @see https://infra.spec.whatwg.org/#normalize-newlines
		 */
		$doctype_html = str_replace( "\r\n", "\n", $doctype_html );
		$doctype_html = str_replace( "\r", "\n", $doctype_html );
		$end          = strlen( $doctype_html ) - 1;

		/*
		 * In this state, the doctype token has been found and its "content" optionally including the
		 * name, public identifier, and system identifier is between the current position and the end.
		 *
		 *     "<!DOCTYPE...declaration...>"
		 *               ╰─ $at           ╰─ $end
		 *
		 * It's also possible that the declaration part is empty.
		 *
		 *               ╭─ $at
		 *     "<!DOCTYPE>"
		 *               ╰─ $end
		 *
		 * Rules for parsing ">" which terminates the DOCTYPE do not need to be considered as they
		 * have been handled above in the condition that the provided DOCTYPE HTML must contain
		 * exactly one ">" character in the final position.
		 */

		/*
		 *
		 * Parsing effectively begins in "Before DOCTYPE name state". Ignore whitespace and
		 * proceed to the next state.
		 *
		 * @see https://html.spec.whatwg.org/#before-doctype-name-state
		 */
		$at += strspn( $doctype_html, " \t\n\f\r", $at );

		// No name was found before the closing ">": the token forces quirks mode.
		if ( $at >= $end ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		/*
		 * The name runs until the next whitespace or the closing ">". It is reported in
		 * ASCII lower case, with NULL bytes replaced by U+FFFD.
		 */
		$name_length  = strcspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
		$doctype_name = str_replace( "\0", "\u{FFFD}", self::ascii_lowercase( substr( $doctype_html, $at, $name_length ) ) );

		$at += $name_length;
		$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
		if ( $at >= $end ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
		}

		/*
		 * "After DOCTYPE name state"
		 *
		 * Find a case-insensitive match for "PUBLIC" or "SYSTEM" at this point.
		 * Otherwise, set force-quirks and enter bogus DOCTYPE state (skip the rest of the doctype).
		 *
		 * @see https://html.spec.whatwg.org/#after-doctype-name-state
		 */
		if ( $at + 6 >= $end ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		/*
		 * > If the six characters starting from the current input character are an ASCII
		 * > case-insensitive match for the word "PUBLIC", then consume those characters
		 * > and switch to the after DOCTYPE public keyword state.
		 */
		if ( 0 === substr_compare( $doctype_html, 'PUBLIC', $at, 6, true ) ) {
			$at += 6;
			$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
			if ( $at >= $end ) {
				return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
			}
			goto parse_doctype_public_identifier;
		}

		/*
		 * > Otherwise, if the six characters starting from the current input character are an ASCII
		 * > case-insensitive match for the word "SYSTEM", then consume those characters and switch
		 * > to the after DOCTYPE system keyword state.
		 */
		if ( 0 === substr_compare( $doctype_html, 'SYSTEM', $at, 6, true ) ) {
			$at += 6;
			$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
			if ( $at >= $end ) {
				return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
			}
			goto parse_doctype_system_identifier;
		}

		/*
		 * > Otherwise, this is an invalid-character-sequence-after-doctype-name parse error.
		 * > Set the current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus
		 * > DOCTYPE state.
		 */
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );

		parse_doctype_public_identifier:
		/*
		 * The parser should enter "DOCTYPE public identifier (double-quoted) state" or
		 * "DOCTYPE public identifier (single-quoted) state" by finding one of the valid quotes.
		 * Anything else forces quirks mode and ignores the rest of the contents.
		 *
		 * @see https://html.spec.whatwg.org/#doctype-public-identifier-(double-quoted)-state
		 * @see https://html.spec.whatwg.org/#doctype-public-identifier-(single-quoted)-state
		 */
		$closer_quote = $doctype_html[ $at ];

		/*
		 * > This is a missing-quote-before-doctype-public-identifier parse error. Set the
		 * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
		 */
		if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		++$at;

		$identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
		$doctype_public_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );

		// Reaching the closing ">" without finding the matching quote forces quirks mode.
		$at += $identifier_length;
		if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		++$at;

		/*
		 * "Between DOCTYPE public and system identifiers state"
		 *
		 * Advance through whitespace between public and system identifiers.
		 *
		 * @see https://html.spec.whatwg.org/#between-doctype-public-and-system-identifiers-state
		 */
		$at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at );
		if ( $at >= $end ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
		}

		parse_doctype_system_identifier:
		/*
		 * The parser should enter "DOCTYPE system identifier (double-quoted) state" or
		 * "DOCTYPE system identifier (single-quoted) state" by finding one of the valid quotes.
		 * Anything else forces quirks mode and ignores the rest of the contents.
		 *
		 * @see https://html.spec.whatwg.org/#doctype-system-identifier-(double-quoted)-state
		 * @see https://html.spec.whatwg.org/#doctype-system-identifier-(single-quoted)-state
		 */
		$closer_quote = $doctype_html[ $at ];

		/*
		 * > This is a missing-quote-before-doctype-system-identifier parse error. Set the
		 * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state.
		 */
		if ( '"' !== $closer_quote && "'" !== $closer_quote ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		++$at;

		$identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at );
		$doctype_system_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) );

		// Reaching the closing ">" without finding the matching quote forces quirks mode.
		$at += $identifier_length;
		if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) {
			return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true );
		}

		// Anything after the system identifier is ignored and does not force quirks mode.
		return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false );
	}
}