diff -r 48c4eec2b7e6 -r 8c2e4d02f4ef wp/wp-includes/html-api/class-wp-html-processor.php --- a/wp/wp-includes/html-api/class-wp-html-processor.php Fri Sep 05 18:40:08 2025 +0200 +++ b/wp/wp-includes/html-api/class-wp-html-processor.php Fri Sep 05 18:52:52 2025 +0200 @@ -97,22 +97,11 @@ * will abort early and stop all processing. This draconian measure ensures * that the HTML Processor won't break any HTML it doesn't fully understand. * - * The following list specifies the HTML tags that _are_ supported: + * The HTML Processor supports all elements other than a specific set: * - * - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY. - * - Custom elements: All custom elements are supported. :) - * - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, PROGRESS, SEARCH. - * - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR. - * - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP. - * - Links: A. - * - Lists: DD, DL, DT, LI, OL, UL. - * - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO. - * - Paragraph: BR, P. - * - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR. - * - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION. - * - Templating elements: SLOT. - * - Text decoration: RUBY. - * - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER. + * - Any element inside a TABLE. + * - Any element inside foreign content, including SVG and MATH. + * - Any element outside the IN BODY insertion mode, e.g. doctype declarations, meta, links. * * ### Supported markup * @@ -121,15 +110,30 @@ * may in fact belong _before_ the table in the DOM. If the HTML Processor encounters * such a case it will stop processing. 
* - * The following list specifies HTML markup that _is_ supported: + * The following list illustrates some common examples of unexpected HTML inputs that + * the HTML Processor properly parses and represents: + * + * - HTML with optional tags omitted, e.g. `

<p>one<p>

two`. + * - HTML with unexpected tag closers, e.g. `

<p>one </span> more</p>

`. + * - Non-void tags with self-closing flag, e.g. `
<div/>the DIV is still open.</div>
`. + * - Heading elements which close open heading elements of another level, e.g. `

<h1>Closed by </h2>

`. + * - Elements containing text that looks like other tags but isn't, e.g. `The <img> is plaintext`. + * - SCRIPT and STYLE tags containing text that looks like HTML but isn't, e.g. ``. + * - SCRIPT content which has been escaped, e.g. ``. * - * - Markup involving only those tags listed above. - * - Fully-balanced and non-overlapping tags. - * - HTML with unexpected tag closers. - * - Some unbalanced or overlapping tags. - * - P tags after unclosed P tags. - * - BUTTON tags after unclosed BUTTON tags. - * - A tags after unclosed A tags that don't involve any active formatting elements. + * ### Unsupported Features + * + * This parser does not report parse errors. + * + * Normally, when additional HTML or BODY tags are encountered in a document, if there + * are any additional attributes on them that aren't found on the previous elements, + * the existing HTML and BODY elements adopt those missing attribute values. This + * parser does not add those additional attributes. + * + * In certain situations, elements are moved to a different part of the document in + * a process called "adoption" and "fostering." Because the nodes move to a location + * in the document that the parser had already processed, this parser does not support + * these situations and will bail. * * @since 6.4.0 * @@ -159,7 +163,7 @@ * * @var WP_HTML_Processor_State */ - private $state = null; + private $state; /** * Used to create unique bookmark names. @@ -189,6 +193,17 @@ private $last_error = null; /** + * Stores context for why the parser bailed on unsupported HTML, if it did. + * + * @see self::get_unsupported_exception + * + * @since 6.7.0 + * + * @var WP_HTML_Unsupported_Exception|null + */ + private $unsupported_exception = null; + + /** * Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance. 
* * This function is created inside the class constructor so that it can be passed to @@ -197,7 +212,7 @@ * * @since 6.4.0 * - * @var closure + * @var Closure|null */ private $release_internal_bookmark_on_destruct = null; @@ -212,6 +227,15 @@ private $element_queue = array(); /** + * Stores the current breadcrumbs. + * + * @since 6.7.0 + * + * @var string[] + */ + private $breadcrumbs = array(); + + /** * Current stack event, if set, representing a matched token. * * Because the parser may internally point to a place further along in a document @@ -221,32 +245,17 @@ * * @since 6.6.0 * - * @var ?WP_HTML_Stack_Event + * @var WP_HTML_Stack_Event|null */ private $current_element = null; /** * Context node if created as a fragment parser. * - * @var ?WP_HTML_Token + * @var WP_HTML_Token|null */ private $context_node = null; - /** - * Whether the parser has yet processed the context node, - * if created as a fragment parser. - * - * The context node will be initially pushed onto the stack of open elements, - * but when created as a fragment parser, this context element (and the implicit - * HTML document node above it) should not be exposed as a matched token or node. - * - * This boolean indicates whether the processor should skip over the current - * node in its initial search for the first node created from the input HTML. - * - * @var bool - */ - private $has_seen_context_node = false; - /* * Public Interface Functions */ @@ -288,30 +297,52 @@ return null; } - $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); - $processor->state->context_node = array( 'BODY', array() ); - $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; - - // @todo Create "fake" bookmarks for non-existent but implied nodes. 
- $processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); - $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); - - $processor->state->stack_of_open_elements->push( - new WP_HTML_Token( - 'root-node', - 'HTML', - false - ) - ); - - $context_node = new WP_HTML_Token( - 'context-node', - $processor->state->context_node[0], - false - ); - - $processor->state->stack_of_open_elements->push( $context_node ); - $processor->context_node = $context_node; + $context_processor = static::create_full_parser( "{$context}", $encoding ); + if ( null === $context_processor ) { + return null; + } + + while ( $context_processor->next_tag() ) { + if ( ! $context_processor->is_virtual() ) { + $context_processor->set_bookmark( 'final_node' ); + } + } + + if ( + ! $context_processor->has_bookmark( 'final_node' ) || + ! $context_processor->seek( 'final_node' ) + ) { + _doing_it_wrong( __METHOD__, __( 'No valid context element was detected.' ), '6.8.0' ); + return null; + } + + return $context_processor->create_fragment_at_current_node( $html ); + } + + /** + * Creates an HTML processor in the full parsing mode. + * + * It's likely that a fragment parser is more appropriate, unless sending an + * entire HTML document from start to finish. Consider a fragment parser with + * a context node of ``. + * + * UTF-8 is the only allowed encoding. If working with a document that + * isn't UTF-8, first convert the document to UTF-8, then pass in the + * converted HTML. + * + * @param string $html Input HTML document to process. + * @param string|null $known_definite_encoding Optional. If provided, specifies the charset used + * in the input byte stream. Currently must be UTF-8. + * @return static|null The created processor if successful, otherwise null. 
+ */ + public static function create_full_parser( $html, $known_definite_encoding = 'UTF-8' ) { + if ( 'UTF-8' !== $known_definite_encoding ) { + return null; + } + + $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); + $processor->state->encoding = $known_definite_encoding; + $processor->state->encoding_confidence = 'certain'; return $processor; } @@ -348,20 +379,30 @@ $this->state = new WP_HTML_Processor_State(); $this->state->stack_of_open_elements->set_push_handler( - function ( WP_HTML_Token $token ) { + function ( WP_HTML_Token $token ): void { $is_virtual = ! isset( $this->state->current_token ) || $this->is_tag_closer(); $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH, $provenance ); + + $this->change_parsing_namespace( $token->integration_node_type ? 'html' : $token->namespace ); } ); $this->state->stack_of_open_elements->set_pop_handler( - function ( WP_HTML_Token $token ) { + function ( WP_HTML_Token $token ): void { $is_virtual = ! isset( $this->state->current_token ) || ! $this->is_tag_closer(); $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP, $provenance ); + + $adjusted_current_node = $this->get_adjusted_current_node(); + + if ( $adjusted_current_node ) { + $this->change_parsing_namespace( $adjusted_current_node->integration_node_type ? 'html' : $adjusted_current_node->namespace ); + } else { + $this->change_parsing_namespace( 'html' ); + } } ); @@ -370,12 +411,191 @@ * a private method into WP_HTML_Token classes without * exposing it to any public API. 
*/ - $this->release_internal_bookmark_on_destruct = function ( $name ) { + $this->release_internal_bookmark_on_destruct = function ( string $name ): void { parent::release_bookmark( $name ); }; } /** + * Creates a fragment processor at the current node. + * + * HTML Fragment parsing always happens with a context node. HTML Fragment Processors can be + * instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`. + * + * The context node may impact how a fragment of HTML is parsed. For example, consider the HTML + * fragment `Inside TD?`. + * + * A BODY context node will produce the following tree: + * + * └─#text Inside TD? + * + * Notice that the `` tags are completely ignored. + * + * Compare that with an SVG context node that produces the following tree: + * + * ├─svg:td + * └─#text Inside TD? + * + * Here, a `td` node in the `svg` namespace is created, and its self-closing flag is respected. + * This is a peculiarity of parsing HTML in foreign content like SVG. + * + * Finally, consider the tree produced with a TABLE context node: + * + * └─TBODY + * └─TR + * └─TD + * └─#text Inside TD? + * + * These examples demonstrate how important the context node may be when processing an HTML + * fragment. Special care must be taken when processing fragments that are expected to appear + * in specific contexts. SVG and TABLE are good examples, but there are others. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm + * + * @since 6.8.0 + * + * @param string $html Input HTML fragment to process. + * @return static|null The created processor if successful, otherwise null. + */ + private function create_fragment_at_current_node( string $html ) { + if ( $this->get_token_type() !== '#tag' || $this->is_tag_closer() ) { + _doing_it_wrong( + __METHOD__, + __( 'The context element must be a start tag.' 
), + '6.8.0' + ); + return null; + } + + $tag_name = $this->current_element->token->node_name; + $namespace = $this->current_element->token->namespace; + + if ( 'html' === $namespace && self::is_void( $tag_name ) ) { + _doing_it_wrong( + __METHOD__, + sprintf( + // translators: %s: A tag name like INPUT or BR. + __( 'The context element cannot be a void element, found "%s".' ), + $tag_name + ), + '6.8.0' + ); + return null; + } + + /* + * Prevent creating fragments at nodes that require a special tokenizer state. + * This is unsupported by the HTML Processor. + */ + if ( + 'html' === $namespace && + in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true ) + ) { + _doing_it_wrong( + __METHOD__, + sprintf( + // translators: %s: A tag name like IFRAME or TEXTAREA. + __( 'The context element "%s" is not supported.' ), + $tag_name + ), + '6.8.0' + ); + return null; + } + + $fragment_processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); + + $fragment_processor->compat_mode = $this->compat_mode; + + // @todo Create "fake" bookmarks for non-existent but implied nodes. 
+ $fragment_processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); + $root_node = new WP_HTML_Token( + 'root-node', + 'HTML', + false + ); + $fragment_processor->state->stack_of_open_elements->push( $root_node ); + + $fragment_processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); + $fragment_processor->context_node = clone $this->current_element->token; + $fragment_processor->context_node->bookmark_name = 'context-node'; + $fragment_processor->context_node->on_destroy = null; + + $fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name ); + + if ( 'TEMPLATE' === $fragment_processor->context_node->node_name ) { + $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; + } + + $fragment_processor->reset_insertion_mode_appropriately(); + + /* + * > Set the parser's form element pointer to the nearest node to the context element that + * > is a form element (going straight up the ancestor chain, and including the element + * > itself, if it is a form element), if any. (If there is no such form element, the + * > form element pointer keeps its initial value, null.) + */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) { + if ( 'FORM' === $element->node_name && 'html' === $element->namespace ) { + $fragment_processor->state->form_element = clone $element; + $fragment_processor->state->form_element->bookmark_name = null; + $fragment_processor->state->form_element->on_destroy = null; + break; + } + } + + $fragment_processor->state->encoding_confidence = 'irrelevant'; + + /* + * Update the parsing namespace near the end of the process. + * This is important so that any push/pop from the stack of open + * elements does not change the parsing namespace. + */ + $fragment_processor->change_parsing_namespace( + $this->current_element->token->integration_node_type ? 
'html' : $namespace + ); + + return $fragment_processor; + } + + /** + * Stops the parser and terminates its execution when encountering unsupported markup. + * + * @throws WP_HTML_Unsupported_Exception Halts execution of the parser. + * + * @since 6.7.0 + * + * @param string $message Explains support is missing in order to parse the current node. + */ + private function bail( string $message ) { + $here = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + $token = substr( $this->html, $here->start, $here->length ); + + $open_elements = array(); + foreach ( $this->state->stack_of_open_elements->stack as $item ) { + $open_elements[] = $item->node_name; + } + + $active_formats = array(); + foreach ( $this->state->active_formatting_elements->walk_down() as $item ) { + $active_formats[] = $item->node_name; + } + + $this->last_error = self::ERROR_UNSUPPORTED; + + $this->unsupported_exception = new WP_HTML_Unsupported_Exception( + $message, + $this->state->current_token->node_name, + $here->start, + $token, + $open_elements, + $active_formats + ); + + throw $this->unsupported_exception; + } + + /** * Returns the last error, if any. * * Various situations lead to parsing failure but this class will @@ -398,11 +618,26 @@ * * @return string|null The last error, if one exists, otherwise null. */ - public function get_last_error() { + public function get_last_error(): ?string { return $this->last_error; } /** + * Returns context for why the parser aborted due to unsupported HTML, if it did. + * + * This is meant for debugging purposes, not for production use. + * + * @since 6.7.0 + * + * @see self::$unsupported_exception + * + * @return WP_HTML_Unsupported_Exception|null + */ + public function get_unsupported_exception() { + return $this->unsupported_exception; + } + + /** * Finds the next tag matching the $query. * * @todo Support matching the class name and tag name. @@ -426,7 +661,7 @@ * } * @return bool Whether a tag was matched. 
*/ - public function next_tag( $query = null ) { + public function next_tag( $query = null ): bool { $visit_closers = isset( $query['tag_closers'] ) && 'visit' === $query['tag_closers']; if ( null === $query ) { @@ -456,6 +691,10 @@ return false; } + if ( isset( $query['tag_name'] ) ) { + $query['tag_name'] = strtoupper( $query['tag_name'] ); + } + $needs_class = ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) ) ? $query['class_name'] : null; @@ -466,6 +705,10 @@ continue; } + if ( isset( $query['tag_name'] ) && $query['tag_name'] !== $this->get_token_name() ) { + continue; + } + if ( isset( $needs_class ) && ! $this->has_class( $needs_class ) ) { continue; } @@ -499,71 +742,98 @@ } /** - * Ensures internal accounting is maintained for HTML semantic rules while - * the underlying Tag Processor class is seeking to a bookmark. + * Finds the next token in the HTML document. * * This doesn't currently have a way to represent non-tags and doesn't process * semantic rules for text nodes. For access to the raw tokens consider using * WP_HTML_Tag_Processor instead. * * @since 6.5.0 Added for internal support; do not use. + * @since 6.7.2 Refactored so subclasses may extend. + * + * @return bool Whether a token was parsed. + */ + public function next_token(): bool { + return $this->next_visitable_token(); + } + + /** + * Ensures internal accounting is maintained for HTML semantic rules while + * the underlying Tag Processor class is seeking to a bookmark. + * + * This doesn't currently have a way to represent non-tags and doesn't process + * semantic rules for text nodes. For access to the raw tokens consider using + * WP_HTML_Tag_Processor instead. + * + * Note that this method may call itself recursively. 
This is why it is not + * implemented as {@see WP_HTML_Processor::next_token()}, which instead calls + * this method similarly to how {@see WP_HTML_Tag_Processor::next_token()} + * calls the {@see WP_HTML_Tag_Processor::base_class_next_token()} method. + * + * @since 6.7.2 Added for internal support. * * @access private * * @return bool */ - public function next_token() { + private function next_visitable_token(): bool { $this->current_element = null; if ( isset( $this->last_error ) ) { return false; } - if ( 'done' !== $this->has_seen_context_node && 0 === count( $this->element_queue ) && ! $this->step() ) { - while ( 'context-node' !== $this->state->stack_of_open_elements->current_node()->bookmark_name && $this->state->stack_of_open_elements->pop() ) { + /* + * Prime the events if there are none. + * + * @todo In some cases, probably related to the adoption agency + * algorithm, this call to step() doesn't create any new + * events. Calling it again creates them. Figure out why + * this is and if it's inherent or if it's a bug. Looping + * until there are events or until there are no more + * tokens works in the meantime and isn't obviously wrong. + */ + if ( empty( $this->element_queue ) && $this->step() ) { + return $this->next_visitable_token(); + } + + // Process the next event on the queue. + $this->current_element = array_shift( $this->element_queue ); + if ( ! isset( $this->current_element ) ) { + // There are no tokens left, so close all remaining open elements. + while ( $this->state->stack_of_open_elements->pop() ) { continue; } - $this->has_seen_context_node = 'done'; - return $this->next_token(); - } - - $this->current_element = array_shift( $this->element_queue ); - while ( isset( $this->context_node ) && ! 
$this->has_seen_context_node ) { - if ( isset( $this->current_element ) ) { - if ( $this->context_node === $this->current_element->token && WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) { - $this->has_seen_context_node = true; - return $this->next_token(); - } - } - $this->current_element = array_shift( $this->element_queue ); + + return empty( $this->element_queue ) ? false : $this->next_visitable_token(); } - if ( ! isset( $this->current_element ) ) { - if ( 'done' === $this->has_seen_context_node ) { - return false; - } else { - return $this->next_token(); - } + $is_pop = WP_HTML_Stack_Event::POP === $this->current_element->operation; + + /* + * The root node only exists in the fragment parser, and closing it + * indicates that the parse is complete. Stop before popping it from + * the breadcrumbs. + */ + if ( 'root-node' === $this->current_element->token->bookmark_name ) { + return $this->next_visitable_token(); } - if ( isset( $this->context_node ) && WP_HTML_Stack_Event::POP === $this->current_element->operation && $this->context_node === $this->current_element->token ) { - $this->element_queue = array(); - $this->current_element = null; - return false; + // Adjust the breadcrumbs for this event. + if ( $is_pop ) { + array_pop( $this->breadcrumbs ); + } else { + $this->breadcrumbs[] = $this->current_element->token->node_name; } // Avoid sending close events for elements which don't expect a closing. - if ( - WP_HTML_Stack_Event::POP === $this->current_element->operation && - ! static::expects_closer( $this->current_element->token ) - ) { - return $this->next_token(); + if ( $is_pop && ! $this->expects_closer( $this->current_element->token ) ) { + return $this->next_visitable_token(); } return true; } - /** * Indicates if the current tag token is a tag closer. * @@ -580,7 +850,7 @@ * * @return bool Whether the current tag is a tag closer. 
*/ - public function is_tag_closer() { + public function is_tag_closer(): bool { return $this->is_virtual() ? ( WP_HTML_Stack_Event::POP === $this->current_element->operation && '#tag' === $this->get_token_type() ) : parent::is_tag_closer(); @@ -594,7 +864,7 @@ * * @return bool Whether the current token is virtual. */ - private function is_virtual() { + private function is_virtual(): bool { return ( isset( $this->current_element->provenance ) && 'virtual' === $this->current_element->provenance @@ -626,7 +896,7 @@ * May also contain the wildcard `*` which matches a single element, e.g. `array( 'SECTION', '*' )`. * @return bool Whether the currently-matched tag is found at the given nested structure. */ - public function matches_breadcrumbs( $breadcrumbs ) { + public function matches_breadcrumbs( $breadcrumbs ): bool { // Everything matches when there are zero constraints. if ( 0 === count( $breadcrumbs ) ) { return true; @@ -639,10 +909,11 @@ return false; } - foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + for ( $i = count( $this->breadcrumbs ) - 1; $i >= 0; $i-- ) { + $node = $this->breadcrumbs[ $i ]; $crumb = strtoupper( current( $breadcrumbs ) ); - if ( '*' !== $crumb && $node->node_name !== $crumb ) { + if ( '*' !== $crumb && $node !== $crumb ) { return false; } @@ -667,29 +938,32 @@ * * @since 6.6.0 * - * @todo When adding support for foreign content, ensure that - * this returns false for self-closing elements in the - * SVG and MathML namespace. - * - * @param ?WP_HTML_Token $node Node to examine instead of current node, if provided. - * @return bool Whether to expect a closer for the currently-matched node, - * or `null` if not matched on any token. + * @param WP_HTML_Token|null $node Optional. Node to examine, if provided. + * Default is to examine current node. + * @return bool|null Whether to expect a closer for the currently-matched node, + * or `null` if not matched on any token. 
*/ - public function expects_closer( $node = null ) { + public function expects_closer( ?WP_HTML_Token $node = null ): ?bool { $token_name = $node->node_name ?? $this->get_token_name(); + if ( ! isset( $token_name ) ) { return null; } + $token_namespace = $node->namespace ?? $this->get_namespace(); + $token_has_self_closing = $node->has_self_closing_flag ?? $this->has_self_closing_flag(); + return ! ( // Comments, text nodes, and other atomic tokens. '#' === $token_name[0] || // Doctype declarations. 'html' === $token_name || // Void elements. - self::is_void( $token_name ) || + ( 'html' === $token_namespace && self::is_void( $token_name ) ) || // Special atomic elements. - in_array( $token_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) + ( 'html' === $token_namespace && in_array( $token_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) || + // Self-closing elements in foreign content. + ( 'html' !== $token_namespace && $token_has_self_closing ) ); } @@ -706,7 +980,7 @@ * @param string $node_to_process Whether to parse the next node or reprocess the current node. * @return bool Whether a tag was matched. */ - public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { + public function step( $node_to_process = self::PROCESS_NEXT_NODE ): bool { // Refuse to proceed if there was a previous error. if ( null !== $this->last_error ) { return false; @@ -721,20 +995,18 @@ * * When moving on to the next node, therefore, if the bottom-most element * on the stack is a void element, it must be closed. - * - * @todo Once self-closing foreign elements and BGSOUND are supported, - * they must also be implicitly closed here too. BGSOUND is - * special since it's only self-closing if the self-closing flag - * is provided in the opening tag, otherwise it expects a tag closer. 
*/ $top_node = $this->state->stack_of_open_elements->current_node(); - if ( isset( $top_node ) && ! static::expects_closer( $top_node ) ) { + if ( isset( $top_node ) && ! $this->expects_closer( $top_node ) ) { $this->state->stack_of_open_elements->pop(); } } if ( self::PROCESS_NEXT_NODE === $node_to_process ) { parent::next_token(); + if ( WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ) { + parent::subdivide_text_appropriately(); + } } // Finish stepping when there are no more tokens in the document. @@ -745,21 +1017,116 @@ return false; } - $this->state->current_token = new WP_HTML_Token( - $this->bookmark_token(), - $this->get_token_name(), - $this->has_self_closing_flag(), - $this->release_internal_bookmark_on_destruct + $adjusted_current_node = $this->get_adjusted_current_node(); + $is_closer = $this->is_tag_closer(); + $is_start_tag = WP_HTML_Tag_Processor::STATE_MATCHED_TAG === $this->parser_state && ! $is_closer; + $token_name = $this->get_token_name(); + + if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) { + $this->state->current_token = new WP_HTML_Token( + $this->bookmark_token(), + $token_name, + $this->has_self_closing_flag(), + $this->release_internal_bookmark_on_destruct + ); + } + + $parse_in_current_insertion_mode = ( + 0 === $this->state->stack_of_open_elements->count() || + 'html' === $adjusted_current_node->namespace || + ( + 'math' === $adjusted_current_node->integration_node_type && + ( + ( $is_start_tag && ! in_array( $token_name, array( 'MGLYPH', 'MALIGNMARK' ), true ) ) || + '#text' === $token_name + ) + ) || + ( + 'math' === $adjusted_current_node->namespace && + 'ANNOTATION-XML' === $adjusted_current_node->node_name && + $is_start_tag && 'SVG' === $token_name + ) || + ( + 'html' === $adjusted_current_node->integration_node_type && + ( $is_start_tag || '#text' === $token_name ) + ) ); try { + if ( ! 
$parse_in_current_insertion_mode ) { + return $this->step_in_foreign_content(); + } + switch ( $this->state->insertion_mode ) { + case WP_HTML_Processor_State::INSERTION_MODE_INITIAL: + return $this->step_initial(); + + case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML: + return $this->step_before_html(); + + case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD: + return $this->step_before_head(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD: + return $this->step_in_head(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT: + return $this->step_in_head_noscript(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD: + return $this->step_after_head(); + case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY: return $this->step_in_body(); + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE: + return $this->step_in_table(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_TEXT: + return $this->step_in_table_text(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION: + return $this->step_in_caption(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP: + return $this->step_in_column_group(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY: + return $this->step_in_table_body(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_ROW: + return $this->step_in_row(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_CELL: + return $this->step_in_cell(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT: + return $this->step_in_select(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE: + return $this->step_in_select_in_table(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE: + return $this->step_in_template(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY: + return $this->step_after_body(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET: + return $this->step_in_frameset(); + + case 
WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET: + return $this->step_after_frameset(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY: + return $this->step_after_after_body(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET: + return $this->step_after_after_frameset(); + + // This should be unreachable but PHP doesn't have total type checking on switch. default: - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." ); + $this->bail( "Unaware of the requested parsing mode: '{$this->state->insertion_mode}'." ); } } catch ( WP_HTML_Unsupported_Exception $e ) { /* @@ -776,11 +1143,7 @@ * Breadcrumbs start at the outermost parent and descend toward the matched element. * They always include the entire path from the root HTML node to the matched element. * - * @todo It could be more efficient to expose a generator-based version of this function - * to avoid creating the array copy on tag iteration. If this is done, it would likely - * be more useful to walk up the stack when yielding instead of starting at the top. - * - * Example + * Example: * * $processor = WP_HTML_Processor::create_fragment( '

' ); * $processor->next_tag( 'IMG' ); @@ -788,49 +1151,10 @@ * * @since 6.4.0 * - * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. + * @return string[] Array of tag names representing path to matched node. */ - public function get_breadcrumbs() { - $breadcrumbs = array(); - - foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { - $breadcrumbs[] = $stack_item->node_name; - } - - if ( ! $this->is_virtual() ) { - return $breadcrumbs; - } - - foreach ( $this->element_queue as $queue_item ) { - if ( $this->current_element->token->bookmark_name === $queue_item->token->bookmark_name ) { - break; - } - - if ( 'context-node' === $queue_item->token->bookmark_name ) { - break; - } - - if ( 'real' === $queue_item->provenance ) { - break; - } - - if ( WP_HTML_Stack_Event::PUSH === $queue_item->operation ) { - $breadcrumbs[] = $queue_item->token->node_name; - } else { - array_pop( $breadcrumbs ); - } - } - - if ( null !== parent::get_token_name() && ! parent::is_tag_closer() ) { - array_pop( $breadcrumbs ); - } - - // Add the virtual node we're at. - if ( WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) { - $breadcrumbs[] = $this->current_element->token->node_name; - } - - return $breadcrumbs; + public function get_breadcrumbs(): array { + return $this->breadcrumbs; } /** @@ -858,10 +1182,971 @@ * * @return int Nesting-depth of current location in the document. */ - public function get_current_depth() { - return $this->is_virtual() - ? count( $this->get_breadcrumbs() ) - : $this->state->stack_of_open_elements->count(); + public function get_current_depth(): int { + return count( $this->breadcrumbs ); + } + + /** + * Normalizes an HTML fragment by serializing it. + * + * This method assumes that the given HTML snippet is found in BODY context. 
+ * For normalizing full documents or fragments found in other contexts, create + * a new processor using {@see WP_HTML_Processor::create_fragment} or + * {@see WP_HTML_Processor::create_full_parser} and call {@see WP_HTML_Processor::serialize} + * on the created instances. + * + * Many aspects of an input HTML fragment may be changed during normalization. + * + * - Attribute values will be double-quoted. + * - Duplicate attributes will be removed. + * - Omitted tags will be added. + * - Tag and attribute name casing will be lower-cased, + * except for specific SVG and MathML tags or attributes. + * - Text will be re-encoded, null bytes handled, + * and invalid UTF-8 replaced with U+FFFD. + * - Any incomplete syntax trailing at the end will be omitted, + * for example, an unclosed comment opener will be removed. + * + * Example: + * + * echo WP_HTML_Processor::normalize( 'One syntax < <> "oddities" + * + * @since 6.7.0 + * + * @param string $html Input HTML to normalize. + * + * @return string|null Normalized output, or `null` if unable to normalize. + */ + public static function normalize( string $html ): ?string { + return static::create_fragment( $html )->serialize(); + } + + /** + * Returns normalized HTML for a fragment by serializing it. + * + * This differs from {@see WP_HTML_Processor::normalize} in that it starts with + * a specific HTML Processor, which _must_ not have already started scanning; + * it must be in the initial ready state and will be in the completed state once + * serialization is complete. + * + * Many aspects of an input HTML fragment may be changed during normalization. + * + * - Attribute values will be double-quoted. + * - Duplicate attributes will be removed. + * - Omitted tags will be added. + * - Tag and attribute name casing will be lower-cased, + * except for specific SVG and MathML tags or attributes. + * - Text will be re-encoded, null bytes handled, + * and invalid UTF-8 replaced with U+FFFD. 
+ * - Any incomplete syntax trailing at the end will be omitted, + * for example, an unclosed comment opener will be removed. + * + * Example: + * + * $processor = WP_HTML_Processor::create_fragment( 'One syntax < <> "oddities" + * + * @since 6.7.0 + * + * @return string|null Normalized HTML markup represented by processor, + * or `null` if unable to generate serialization. + */ + public function serialize(): ?string { + if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) { + wp_trigger_error( + __METHOD__, + 'An HTML Processor which has already started processing cannot serialize its contents. Serialize immediately after creating the instance.', + E_USER_WARNING + ); + return null; + } + + $html = ''; + while ( $this->next_token() ) { + $html .= $this->serialize_token(); + } + + if ( null !== $this->get_last_error() ) { + wp_trigger_error( + __METHOD__, + "Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.", + E_USER_WARNING + ); + return null; + } + + return $html; + } + + /** + * Serializes the currently-matched token. + * + * This method produces a fully-normative HTML string for the currently-matched token, + * if able. If not matched at any token or if the token doesn't correspond to any HTML + * it will return an empty string (for example, presumptuous end tags are ignored). + * + * @see static::serialize() + * + * @since 6.7.0 + * + * @return string Serialization of token, or empty string if no serialization exists. + */ + protected function serialize_token(): string { + $html = ''; + $token_type = $this->get_token_type(); + + switch ( $token_type ) { + case '#doctype': + $doctype = $this->get_doctype_info(); + if ( null === $doctype ) { + break; + } + + $html .= 'name ) { + $html .= " {$doctype->name}"; + } + + if ( null !== $doctype->public_identifier ) { + $quote = str_contains( $doctype->public_identifier, '"' ) ? 
"'" : '"'; + $html .= " PUBLIC {$quote}{$doctype->public_identifier}{$quote}"; + } + if ( null !== $doctype->system_identifier ) { + if ( null === $doctype->public_identifier ) { + $html .= ' SYSTEM'; + } + $quote = str_contains( $doctype->system_identifier, '"' ) ? "'" : '"'; + $html .= " {$quote}{$doctype->system_identifier}{$quote}"; + } + + $html .= '>'; + break; + + case '#text': + $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); + break; + + // Unlike the `<>` which is interpreted as plaintext, this is ignored entirely. + case '#presumptuous-tag': + break; + + case '#funky-comment': + case '#comment': + $html .= ""; + break; + + case '#cdata-section': + $html .= "get_modifiable_text()}]]>"; + break; + } + + if ( '#tag' !== $token_type ) { + return $html; + } + + $tag_name = str_replace( "\x00", "\u{FFFD}", $this->get_tag() ); + $in_html = 'html' === $this->get_namespace(); + $qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name(); + + if ( $this->is_tag_closer() ) { + $html .= ""; + return $html; + } + + $attribute_names = $this->get_attribute_names_with_prefix( '' ); + if ( ! isset( $attribute_names ) ) { + $html .= "<{$qualified_name}>"; + return $html; + } + + $html .= "<{$qualified_name}"; + foreach ( $attribute_names as $attribute_name ) { + $html .= " {$this->get_qualified_attribute_name( $attribute_name )}"; + $value = $this->get_attribute( $attribute_name ); + + if ( is_string( $value ) ) { + $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; + } + + $html = str_replace( "\x00", "\u{FFFD}", $html ); + } + + if ( ! $in_html && $this->has_self_closing_flag() ) { + $html .= ' /'; + } + + $html .= '>'; + + // Flush out self-contained elements. 
+ if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) { + $text = $this->get_modifiable_text(); + + switch ( $tag_name ) { + case 'IFRAME': + case 'NOEMBED': + case 'NOFRAMES': + $text = ''; + break; + + case 'SCRIPT': + case 'STYLE': + break; + + default: + $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); + } + + $html .= "{$text}"; + } + + return $html; + } + + /** + * Parses next element in the 'initial' insertion mode. + * + * This internal function performs the 'initial' insertion mode + * logic for the generalized WP_HTML_Processor::step() function. + * + * @since 6.7.0 + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/#the-initial-insertion-mode + * @see WP_HTML_Processor::step + * + * @return bool Whether an element was found. + */ + private function step_initial(): bool { + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * + * Parse error: ignore the token. 
+ */ + case '#text': + if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { + return $this->step(); + } + goto initial_anything_else; + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + $doctype = $this->get_doctype_info(); + if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) { + $this->compat_mode = WP_HTML_Tag_Processor::QUIRKS_MODE; + } + + /* + * > Then, switch the insertion mode to "before html". + */ + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML; + $this->insert_html_element( $this->state->current_token ); + return true; + } + + /* + * > Anything else + */ + initial_anything_else: + $this->compat_mode = WP_HTML_Tag_Processor::QUIRKS_MODE; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML; + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /** + * Parses next element in the 'before html' insertion mode. + * + * This internal function performs the 'before html' insertion mode + * logic for the generalized WP_HTML_Processor::step() function. + * + * @since 6.7.0 + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/#the-before-html-insertion-mode + * @see WP_HTML_Processor::step + * + * @return bool Whether an element was found. + */ + private function step_before_html(): bool { + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = parent::is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. 
+ return $this->step(); + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * + * Parse error: ignore the token. + */ + case '#text': + if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { + return $this->step(); + } + goto before_html_anything_else; + break; + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + $this->insert_html_element( $this->state->current_token ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD; + return true; + + /* + * > An end tag whose tag name is one of: "head", "body", "html", "br" + * + * Closing BR tags are always reported by the Tag Processor as opening tags. + */ + case '-HEAD': + case '-BODY': + case '-HTML': + /* + * > Act as described in the "anything else" entry below. + */ + goto before_html_anything_else; + break; + } + + /* + * > Any other end tag + */ + if ( $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Anything else. + * + * > Create an html element whose node document is the Document object. + * > Append it to the Document object. Put this element in the stack of open elements. + * > Switch the insertion mode to "before head", then reprocess the token. + */ + before_html_anything_else: + $this->insert_virtual_node( 'HTML' ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD; + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /** + * Parses next element in the 'before head' insertion mode. + * + * This internal function performs the 'before head' insertion mode + * logic for the generalized WP_HTML_Processor::step() function. 
+ * + * @since 6.7.0 Stub implementation. + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/#the-before-head-insertion-mode + * @see WP_HTML_Processor::step + * + * @return bool Whether an element was found. + */ + private function step_before_head(): bool { + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = parent::is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * + * Parse error: ignore the token. + */ + case '#text': + if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { + return $this->step(); + } + goto before_head_anything_else; + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > A start tag whose tag name is "head" + */ + case '+HEAD': + $this->insert_html_element( $this->state->current_token ); + $this->state->head_element = $this->state->current_token; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; + return true; + + /* + * > An end tag whose tag name is one of: "head", "body", "html", "br" + * > Act as described in the "anything else" entry below. + * + * Closing BR tags are always reported by the Tag Processor as opening tags. 
+ */ + case '-HEAD': + case '-BODY': + case '-HTML': + goto before_head_anything_else; + break; + } + + if ( $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Anything else + * + * > Insert an HTML element for a "head" start tag token with no attributes. + */ + before_head_anything_else: + $this->state->head_element = $this->insert_virtual_node( 'HEAD' ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /** + * Parses next element in the 'in head' insertion mode. + * + * This internal function performs the 'in head' insertion mode + * logic for the generalized WP_HTML_Processor::step() function. + * + * @since 6.7.0 + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead + * @see WP_HTML_Processor::step + * + * @return bool Whether an element was found. + */ + private function step_in_head(): bool { + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = parent::is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + case '#text': + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + */ + if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { + // Insert the character. 
+ $this->insert_html_element( $this->state->current_token ); + return true; + } + + goto in_head_anything_else; + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > A start tag whose tag name is one of: "base", "basefont", "bgsound", "link" + */ + case '+BASE': + case '+BASEFONT': + case '+BGSOUND': + case '+LINK': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "meta" + */ + case '+META': + $this->insert_html_element( $this->state->current_token ); + + /* + * > If the active speculative HTML parser is null, then: + * > - If the element has a charset attribute, and getting an encoding from + * > its value results in an encoding, and the confidence is currently + * > tentative, then change the encoding to the resulting encoding. + */ + $charset = $this->get_attribute( 'charset' ); + if ( is_string( $charset ) && 'tentative' === $this->state->encoding_confidence ) { + $this->bail( 'Cannot yet process META tags with charset to determine encoding.' ); + } + + /* + * > - Otherwise, if the element has an http-equiv attribute whose value is + * > an ASCII case-insensitive match for the string "Content-Type", and + * > the element has a content attribute, and applying the algorithm for + * > extracting a character encoding from a meta element to that attribute's + * > value returns an encoding, and the confidence is currently tentative, + * > then change the encoding to the extracted encoding. 
+ */ + $http_equiv = $this->get_attribute( 'http-equiv' ); + $content = $this->get_attribute( 'content' ); + if ( + is_string( $http_equiv ) && + is_string( $content ) && + 0 === strcasecmp( $http_equiv, 'Content-Type' ) && + 'tentative' === $this->state->encoding_confidence + ) { + $this->bail( 'Cannot yet process META tags with http-equiv Content-Type to determine encoding.' ); + } + + return true; + + /* + * > A start tag whose tag name is "title" + */ + case '+TITLE': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "noscript", if the scripting flag is enabled + * > A start tag whose tag name is one of: "noframes", "style" + * + * The scripting flag is never enabled in this parser. + */ + case '+NOFRAMES': + case '+STYLE': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "noscript", if the scripting flag is disabled + */ + case '+NOSCRIPT': + $this->insert_html_element( $this->state->current_token ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT; + return true; + + /* + * > A start tag whose tag name is "script" + * + * @todo Could the adjusted insertion location be anything other than the current location? + */ + case '+SCRIPT': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > An end tag whose tag name is "head" + */ + case '-HEAD': + $this->state->stack_of_open_elements->pop(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD; + return true; + + /* + * > An end tag whose tag name is one of: "body", "html", "br" + * + * BR tags are always reported by the Tag Processor as opening tags. + */ + case '-BODY': + case '-HTML': + /* + * > Act as described in the "anything else" entry below. 
+ */ + goto in_head_anything_else; + break; + + /* + * > A start tag whose tag name is "template" + * + * @todo Could the adjusted insertion location be anything other than the current location? + */ + case '+TEMPLATE': + $this->state->active_formatting_elements->insert_marker(); + $this->state->frameset_ok = false; + + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; + $this->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > An end tag whose tag name is "template" + */ + case '-TEMPLATE': + if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) { + // @todo Indicate a parse error once it's possible. + return $this->step(); + } + + $this->generate_implied_end_tags_thoroughly(); + if ( ! $this->state->stack_of_open_elements->current_node_is( 'TEMPLATE' ) ) { + // @todo Indicate a parse error once it's possible. + } + + $this->state->stack_of_open_elements->pop_until( 'TEMPLATE' ); + $this->state->active_formatting_elements->clear_up_to_last_marker(); + array_pop( $this->state->stack_of_template_insertion_modes ); + $this->reset_insertion_mode_appropriately(); + return true; + } + + /* + * > A start tag whose tag name is "head" + * > Any other end tag + */ + if ( '+HEAD' === $op || $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Anything else + */ + in_head_anything_else: + $this->state->stack_of_open_elements->pop(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD; + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /** + * Parses next element in the 'in head noscript' insertion mode. + * + * This internal function performs the 'in head noscript' insertion mode + * logic for the generalized WP_HTML_Processor::step() function. + * + * @since 6.7.0 Stub implementation. 
+ * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/#parsing-main-inheadnoscript + * @see WP_HTML_Processor::step + * + * @return bool Whether an element was found. + */ + private function step_in_head_noscript(): bool { + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = parent::is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * + * Parse error: ignore the token. + */ + case '#text': + if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { + return $this->step_in_head(); + } + + goto in_head_noscript_anything_else; + break; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > An end tag whose tag name is "noscript" + */ + case '-NOSCRIPT': + $this->state->stack_of_open_elements->pop(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; + return true; + + /* + * > A comment token + * > + * > A start tag whose tag name is one of: "basefont", "bgsound", + * > "link", "meta", "noframes", "style" + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + case '+BASEFONT': + case '+BGSOUND': + case '+LINK': + case '+META': + case '+NOFRAMES': + case '+STYLE': + return $this->step_in_head(); + + /* + * > An end tag whose tag name is "br" + * + * This should never happen, as the Tag Processor prevents showing a BR closing tag. 
+ */ + } + + /* + * > A start tag whose tag name is one of: "head", "noscript" + * > Any other end tag + */ + if ( '+HEAD' === $op || '+NOSCRIPT' === $op || $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Anything else + * + * Anything here is a parse error. + */ + in_head_noscript_anything_else: + $this->state->stack_of_open_elements->pop(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /** + * Parses next element in the 'after head' insertion mode. + * + * This internal function performs the 'after head' insertion mode + * logic for the generalized WP_HTML_Processor::step() function. + * + * @since 6.7.0 Stub implementation. + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/#the-after-head-insertion-mode + * @see WP_HTML_Processor::step + * + * @return bool Whether an element was found. + */ + private function step_after_head(): bool { + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $is_closer = parent::is_tag_closer(); + $op_sigil = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, + * > U+000A LINE FEED (LF), U+000C FORM FEED (FF), + * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + */ + case '#text': + if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { + // Insert the character. 
+ $this->insert_html_element( $this->state->current_token ); + return true; + } + goto after_head_anything_else; + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > A start tag whose tag name is "body" + */ + case '+BODY': + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return true; + + /* + * > A start tag whose tag name is "frameset" + */ + case '+FRAMESET': + $this->insert_html_element( $this->state->current_token ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET; + return true; + + /* + * > A start tag whose tag name is one of: "base", "basefont", "bgsound", + * > "link", "meta", "noframes", "script", "style", "template", "title" + * + * Anything here is a parse error. + */ + case '+BASE': + case '+BASEFONT': + case '+BGSOUND': + case '+LINK': + case '+META': + case '+NOFRAMES': + case '+SCRIPT': + case '+STYLE': + case '+TEMPLATE': + case '+TITLE': + /* + * > Push the node pointed to by the head element pointer onto the stack of open elements. + * > Process the token using the rules for the "in head" insertion mode. + * > Remove the node pointed to by the head element pointer from the stack of open elements. (It might not be the current node at this point.) + */ + $this->bail( 'Cannot process elements after HEAD which reopen the HEAD element.' 
); + /* + * Do not leave this break in when adding support; it's here to prevent + * WPCS from getting confused at the switch structure without a return, + * because it doesn't know that `bail()` always throws. + */ + break; + + /* + * > An end tag whose tag name is "template" + */ + case '-TEMPLATE': + return $this->step_in_head(); + + /* + * > An end tag whose tag name is one of: "body", "html", "br" + * + * Closing BR tags are always reported by the Tag Processor as opening tags. + */ + case '-BODY': + case '-HTML': + /* + * > Act as described in the "anything else" entry below. + */ + goto after_head_anything_else; + break; + } + + /* + * > A start tag whose tag name is "head" + * > Any other end tag + */ + if ( '+HEAD' === $op || $is_closer ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Anything else + * > Insert an HTML element for a "body" start tag token with no attributes. + */ + after_head_anything_else: + $this->insert_virtual_node( 'BODY' ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -879,24 +2164,14 @@ * * @return bool Whether an element was found. */ - private function step_in_body() { + private function step_in_body(): bool { $token_name = $this->get_token_name(); $token_type = $this->get_token_type(); $op_sigil = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : ''; $op = "{$op_sigil}{$token_name}"; switch ( $op ) { - case '#comment': - case '#funky-comment': - case '#presumptuous-tag': - $this->insert_html_element( $this->state->current_token ); - return true; - case '#text': - $this->reconstruct_active_formatting_elements(); - - $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; - /* * > A character token that is U+0000 NULL * @@ -906,50 +2181,171 @@ * here, but if there are any other characters in the stream * the active formats should be reconstructed. 
*/ - if ( - 1 <= $current_token->length && - "\x00" === $this->html[ $current_token->start ] && - strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length - ) { + if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) { // Parse error: ignore the token. return $this->step(); } + $this->reconstruct_active_formatting_elements(); + /* * Whitespace-only text does not affect the frameset-ok flag. * It is probably inter-element whitespace, but it may also * contain character references which decode only to whitespace. */ - $text = $this->get_modifiable_text(); - if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) { + if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) { $this->state->frameset_ok = false; } $this->insert_html_element( $this->state->current_token ); return true; + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + * > Parse error. Ignore the token. + */ case 'html': + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) { + /* + * > Otherwise, for each attribute on the token, check to see if the attribute + * > is already present on the top element of the stack of open elements. If + * > it is not, add the attribute and its corresponding value to that element. + * + * This parser does not currently support this behavior: ignore the token. + */ + } + + // Ignore the token. 
+ return $this->step(); + + /* + * > A start tag whose tag name is one of: "base", "basefont", "bgsound", "link", + * > "meta", "noframes", "script", "style", "template", "title" + * > + * > An end tag whose tag name is "template" + */ + case '+BASE': + case '+BASEFONT': + case '+BGSOUND': + case '+LINK': + case '+META': + case '+NOFRAMES': + case '+SCRIPT': + case '+STYLE': + case '+TEMPLATE': + case '+TITLE': + case '-TEMPLATE': + return $this->step_in_head(); + + /* + * > A start tag whose tag name is "body" + * + * This tag in the IN BODY insertion mode is a parse error. + */ + case '+BODY': + if ( + 1 === $this->state->stack_of_open_elements->count() || + 'BODY' !== ( $this->state->stack_of_open_elements->at( 2 )->node_name ?? null ) || + $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) + ) { + // Ignore the token. + return $this->step(); + } + /* - * > A DOCTYPE token - * > Parse error. Ignore the token. + * > Otherwise, set the frameset-ok flag to "not ok"; then, for each attribute + * > on the token, check to see if the attribute is already present on the body + * > element (the second element) on the stack of open elements, and if it is + * > not, add the attribute and its corresponding value to that element. + * + * This parser does not currently support this behavior: ignore the token. + */ + $this->state->frameset_ok = false; + return $this->step(); + + /* + * > A start tag whose tag name is "frameset" + * + * This tag in the IN BODY insertion mode is a parse error. + */ + case '+FRAMESET': + if ( + 1 === $this->state->stack_of_open_elements->count() || + 'BODY' !== ( $this->state->stack_of_open_elements->at( 2 )->node_name ?? null ) || + false === $this->state->frameset_ok + ) { + // Ignore the token. + return $this->step(); + } + + /* + * > Otherwise, run the following steps: + */ + $this->bail( 'Cannot process non-ignored FRAMESET tags.' ); + break; + + /* + * > An end tag whose tag name is "body" + */ + case '-BODY': + if ( ! 
$this->state->stack_of_open_elements->has_element_in_scope( 'BODY' ) ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Otherwise, if there is a node in the stack of open elements that is not either a + * > dd element, a dt element, an li element, an optgroup element, an option element, + * > a p element, an rb element, an rp element, an rt element, an rtc element, a tbody + * > element, a td element, a tfoot element, a th element, a thread element, a tr + * > element, the body element, or the html element, then this is a parse error. + * + * There is nothing to do for this parse error, so don't check for it. + */ + + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY; + /* + * The BODY element is not removed from the stack of open elements. + * Only internal state has changed, this does not qualify as a "step" + * in terms of advancing through the document to another token. + * Nothing has been pushed or popped. + * Proceed to parse the next item. */ return $this->step(); /* - * > A start tag whose tag name is "button" - */ - case '+BUTTON': - if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) { - // @todo Indicate a parse error once it's possible. This error does not impact the logic here. - $this->generate_implied_end_tags(); - $this->state->stack_of_open_elements->pop_until( 'BUTTON' ); + * > An end tag whose tag name is "html" + */ + case '-HTML': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( 'BODY' ) ) { + // Parse error: ignore the token. 
+ return $this->step(); } - $this->reconstruct_active_formatting_elements(); - $this->insert_html_element( $this->state->current_token ); - $this->state->frameset_ok = false; - - return true; + /* + * > Otherwise, if there is a node in the stack of open elements that is not either a + * > dd element, a dt element, an li element, an optgroup element, an option element, + * > a p element, an rb element, an rp element, an rt element, an rtc element, a tbody + * > element, a td element, a tfoot element, a th element, a thread element, a tr + * > element, the body element, or the html element, then this is a parse error. + * + * There is nothing to do for this parse error, so don't check for it. + */ + + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); /* * > A start tag whose tag name is one of: "address", "article", "aside", @@ -990,6 +2386,163 @@ return true; /* + * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" + */ + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + if ( + in_array( + $this->state->stack_of_open_elements->current_node()->node_name, + array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), + true + ) + ) { + // @todo Indicate a parse error once it's possible. + $this->state->stack_of_open_elements->pop(); + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is one of: "pre", "listing" + */ + case '+PRE': + case '+LISTING': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + /* + * > If the next token is a U+000A LINE FEED (LF) character token, + * > then ignore that token and move on to the next one. 
(Newlines + * > at the start of pre blocks are ignored as an authoring convenience.) + * + * This is handled in `get_modifiable_text()`. + */ + + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + return true; + + /* + * > A start tag whose tag name is "form" + */ + case '+FORM': + $stack_contains_template = $this->state->stack_of_open_elements->contains( 'TEMPLATE' ); + + if ( isset( $this->state->form_element ) && ! $stack_contains_template ) { + // Parse error: ignore the token. + return $this->step(); + } + + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + $this->insert_html_element( $this->state->current_token ); + if ( ! $stack_contains_template ) { + $this->state->form_element = $this->state->current_token; + } + + return true; + + /* + * > A start tag whose tag name is "li" + * > A start tag whose tag name is one of: "dd", "dt" + */ + case '+DD': + case '+DT': + case '+LI': + $this->state->frameset_ok = false; + $node = $this->state->stack_of_open_elements->current_node(); + $is_li = 'LI' === $token_name; + + in_body_list_loop: + /* + * The logic for LI and DT/DD is the same except for one point: LI elements _only_ + * close other LI elements, but a DT or DD element closes _any_ open DT or DD element. + */ + if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) { + $node_name = $is_li ? 'LI' : $node->node_name; + $this->generate_implied_end_tags( $node_name ); + if ( ! $this->state->stack_of_open_elements->current_node_is( $node_name ) ) { + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. 
+ } + + $this->state->stack_of_open_elements->pop_until( $node_name ); + goto in_body_list_done; + } + + if ( + 'ADDRESS' !== $node->node_name && + 'DIV' !== $node->node_name && + 'P' !== $node->node_name && + self::is_special( $node ) + ) { + /* + * > If node is in the special category, but is not an address, div, + * > or p element, then jump to the step labeled done below. + */ + goto in_body_list_done; + } else { + /* + * > Otherwise, set node to the previous entry in the stack of open elements + * > and return to the step labeled loop. + */ + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { + $node = $item; + break; + } + goto in_body_list_loop; + } + + in_body_list_done: + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + case '+PLAINTEXT': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + /* + * @todo This may need to be handled in the Tag Processor and turn into + * a single self-contained tag like TEXTAREA, whose modifiable text + * is the rest of the input document as plaintext. + */ + $this->bail( 'Cannot process PLAINTEXT elements.' ); + break; + + /* + * > A start tag whose tag name is "button" + */ + case '+BUTTON': + if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) { + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. 
+ $this->generate_implied_end_tags(); + $this->state->stack_of_open_elements->pop_until( 'BUTTON' ); + } + + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + + return true; + + /* * > An end tag whose tag name is one of: "address", "article", "aside", "blockquote", * > "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset", * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", @@ -1029,134 +2582,75 @@ } $this->generate_implied_end_tags(); - if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { + if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) { // @todo Record parse error: this error doesn't impact parsing. } $this->state->stack_of_open_elements->pop_until( $token_name ); return true; /* - * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" - */ - case '+H1': - case '+H2': - case '+H3': - case '+H4': - case '+H5': - case '+H6': - if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { - $this->close_a_p_element(); - } - - if ( - in_array( - $this->state->stack_of_open_elements->current_node()->node_name, - array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), - true - ) - ) { - // @todo Indicate a parse error once it's possible. 
- $this->state->stack_of_open_elements->pop(); - } - - $this->insert_html_element( $this->state->current_token ); - return true; - - /* - * > A start tag whose tag name is one of: "pre", "listing" - */ - case '+PRE': - case '+LISTING': - if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { - $this->close_a_p_element(); - } - $this->insert_html_element( $this->state->current_token ); - $this->state->frameset_ok = false; - return true; - - /* - * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" - */ - case '-H1': - case '-H2': - case '-H3': - case '-H4': - case '-H5': - case '-H6': - if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) { + * > An end tag whose tag name is "form" + */ + case '-FORM': + if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) { + $node = $this->state->form_element; + $this->state->form_element = null; + /* - * This is a parse error; ignore the token. + * > If node is null or if the stack of open elements does not have node + * > in scope, then this is a parse error; return and ignore the token. * - * @todo Indicate a parse error once it's possible. + * @todo It's necessary to check if the form token itself is in scope, not + * simply whether any FORM is in scope. */ - return $this->step(); - } - - $this->generate_implied_end_tags(); - - if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { - // @todo Record parse error: this error doesn't impact parsing. 
- } - - $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' ); - return true; - - /* - * > A start tag whose tag name is "li" - * > A start tag whose tag name is one of: "dd", "dt" - */ - case '+DD': - case '+DT': - case '+LI': - $this->state->frameset_ok = false; - $node = $this->state->stack_of_open_elements->current_node(); - $is_li = 'LI' === $token_name; - - in_body_list_loop: - /* - * The logic for LI and DT/DD is the same except for one point: LI elements _only_ - * close other LI elements, but a DT or DD element closes _any_ open DT or DD element. - */ - if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) { - $node_name = $is_li ? 'LI' : $node->node_name; - $this->generate_implied_end_tags( $node_name ); - if ( $node_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { + if ( + null === $node || + ! $this->state->stack_of_open_elements->has_element_in_scope( 'FORM' ) + ) { + // Parse error: ignore the token. + return $this->step(); + } + + $this->generate_implied_end_tags(); + if ( $node !== $this->state->stack_of_open_elements->current_node() ) { + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. + $this->bail( 'Cannot close a FORM when other elements remain open as this would throw off the breadcrumbs for the following tokens.' ); + } + + $this->state->stack_of_open_elements->remove_node( $node ); + return true; + } else { + /* + * > If the stack of open elements does not have a form element in scope, + * > then this is a parse error; return and ignore the token. + * + * Note that unlike in the clause above, this is checking for any FORM in scope. + */ + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( 'FORM' ) ) { + // Parse error: ignore the token. + return $this->step(); + } + + $this->generate_implied_end_tags(); + + if ( ! 
$this->state->stack_of_open_elements->current_node_is( 'FORM' ) ) { // @todo Indicate a parse error once it's possible. This error does not impact the logic here. } - $this->state->stack_of_open_elements->pop_until( $node_name ); - goto in_body_list_done; + $this->state->stack_of_open_elements->pop_until( 'FORM' ); + return true; } - - if ( - 'ADDRESS' !== $node->node_name && - 'DIV' !== $node->node_name && - 'P' !== $node->node_name && - $this->is_special( $node->node_name ) - ) { - /* - * > If node is in the special category, but is not an address, div, - * > or p element, then jump to the step labeled done below. - */ - goto in_body_list_done; - } else { - /* - * > Otherwise, set node to the previous entry in the stack of open elements - * > and return to the step labeled loop. - */ - foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { - $node = $item; - break; - } - goto in_body_list_loop; + break; + + /* + * > An end tag whose tag name is "p" + */ + case '-P': + if ( ! $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->insert_html_element( $this->state->current_token ); } - in_body_list_done: - if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { - $this->close_a_p_element(); - } - - $this->insert_html_element( $this->state->current_token ); + $this->close_a_p_element(); return true; /* @@ -1197,7 +2691,7 @@ $this->generate_implied_end_tags( $token_name ); - if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { + if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) { // @todo Indicate a parse error once it's possible. This error does not impact the logic here. } @@ -1205,28 +2699,46 @@ return true; /* - * > An end tag whose tag name is "p" - */ - case '-P': - if ( ! 
$this->state->stack_of_open_elements->has_p_in_button_scope() ) { - $this->insert_html_element( $this->state->current_token ); + * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" + */ + case '-H1': + case '-H2': + case '-H3': + case '-H4': + case '-H5': + case '-H6': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) { + /* + * This is a parse error; ignore the token. + * + * @todo Indicate a parse error once it's possible. + */ + return $this->step(); } - $this->close_a_p_element(); + $this->generate_implied_end_tags(); + + if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) { + // @todo Record parse error: this error doesn't impact parsing. + } + + $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' ); return true; - // > A start tag whose tag name is "a" + /* + * > A start tag whose tag name is "a" + */ case '+A': foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { switch ( $item->node_name ) { case 'marker': - break; + break 2; case 'A': $this->run_adoption_agency_algorithm(); $this->state->active_formatting_elements->remove_node( $item ); $this->state->stack_of_open_elements->remove_node( $item ); - break; + break 2; } } @@ -1257,6 +2769,22 @@ return true; /* + * > A start tag whose tag name is "nobr" + */ + case '+NOBR': + $this->reconstruct_active_formatting_elements(); + + if ( $this->state->stack_of_open_elements->has_element_in_scope( 'NOBR' ) ) { + // Parse error. 
+ $this->run_adoption_agency_algorithm(); + $this->reconstruct_active_formatting_elements(); + } + + $this->insert_html_element( $this->state->current_token ); + $this->state->active_formatting_elements->push( $this->state->current_token ); + return true; + + /* * > An end tag whose tag name is one of: "a", "b", "big", "code", "em", "font", "i", * > "nobr", "s", "small", "strike", "strong", "tt", "u" */ @@ -1267,6 +2795,7 @@ case '-EM': case '-FONT': case '-I': + case '-NOBR': case '-S': case '-SMALL': case '-STRIKE': @@ -1277,14 +2806,63 @@ return true; /* + * > A start tag whose tag name is one of: "applet", "marquee", "object" + */ + case '+APPLET': + case '+MARQUEE': + case '+OBJECT': + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + $this->state->active_formatting_elements->insert_marker(); + $this->state->frameset_ok = false; + return true; + + /* + * > A end tag token whose tag name is one of: "applet", "marquee", "object" + */ + case '-APPLET': + case '-MARQUEE': + case '-OBJECT': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) { + // Parse error: ignore the token. + return $this->step(); + } + + $this->generate_implied_end_tags(); + if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) { + // This is a parse error. + } + + $this->state->stack_of_open_elements->pop_until( $token_name ); + $this->state->active_formatting_elements->clear_up_to_last_marker(); + return true; + + /* + * > A start tag whose tag name is "table" + */ + case '+TABLE': + /* + * > If the Document is not set to quirks mode, and the stack of open elements + * > has a p element in button scope, then close a p element. 
+ */ + if ( + WP_HTML_Tag_Processor::QUIRKS_MODE !== $this->compat_mode && + $this->state->stack_of_open_elements->has_p_in_button_scope() + ) { + $this->close_a_p_element(); + } + + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE; + return true; + + /* * > An end tag whose tag name is "br" - * > Parse error. Drop the attributes from the token, and act as described in the next - * > entry; i.e. act as if this was a "br" start tag token with no attributes, rather - * > than the end tag token that it actually is. - */ - case '-BR': - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' ); + * + * This is prevented from happening because the Tag Processor + * reports all closing BR tags as if they were opening tags. + */ /* * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr" @@ -1306,15 +2884,26 @@ case '+INPUT': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); - $type_attribute = $this->get_attribute( 'type' ); + /* * > If the token does not have an attribute with the name "type", or if it does, * > but that attribute's value is not an ASCII case-insensitive match for the * > string "hidden", then: set the frameset-ok flag to "not ok". */ + $type_attribute = $this->get_attribute( 'type' ); if ( ! 
is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) { $this->state->frameset_ok = false; } + + return true; + + /* + * > A start tag whose tag name is one of: "param", "source", "track" + */ + case '+PARAM': + case '+SOURCE': + case '+TRACK': + $this->insert_html_element( $this->state->current_token ); return true; /* @@ -1329,80 +2918,213 @@ return true; /* - * > A start tag whose tag name is one of: "param", "source", "track" - */ - case '+PARAM': - case '+SOURCE': - case '+TRACK': + * > A start tag whose tag name is "image" + */ + case '+IMAGE': + /* + * > Parse error. Change the token's tag name to "img" and reprocess it. (Don't ask.) + * + * Note that this is handled elsewhere, so it should not be possible to reach this code. + */ + $this->bail( "Cannot process an IMAGE tag. (Don't ask.)" ); + break; + + /* + * > A start tag whose tag name is "textarea" + */ + case '+TEXTAREA': + $this->insert_html_element( $this->state->current_token ); + + /* + * > If the next token is a U+000A LINE FEED (LF) character token, then ignore + * > that token and move on to the next one. (Newlines at the start of + * > textarea elements are ignored as an authoring convenience.) + * + * This is handled in `get_modifiable_text()`. + */ + + $this->state->frameset_ok = false; + + /* + * > Switch the insertion mode to "text". + * + * As a self-contained node, this behavior is handled in the Tag Processor. + */ + return true; + + /* + * > A start tag whose tag name is "xmp" + */ + case '+XMP': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + $this->reconstruct_active_formatting_elements(); + $this->state->frameset_ok = false; + + /* + * > Follow the generic raw text element parsing algorithm. + * + * As a self-contained node, this behavior is handled in the Tag Processor. 
+ */ + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * A start tag whose tag name is "iframe" + */ + case '+IFRAME': + $this->state->frameset_ok = false; + + /* + * > Follow the generic raw text element parsing algorithm. + * + * As a self-contained node, this behavior is handled in the Tag Processor. + */ + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "noembed" + * > A start tag whose tag name is "noscript", if the scripting flag is enabled + * + * The scripting flag is never enabled in this parser. + */ + case '+NOEMBED': $this->insert_html_element( $this->state->current_token ); return true; - } - - /* - * These tags require special handling in the 'in body' insertion mode - * but that handling hasn't yet been implemented. - * - * As the rules for each tag are implemented, the corresponding tag - * name should be removed from this list. An accompanying test should - * help ensure this list is maintained. - * - * @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags - * - * Since this switch structure throws a WP_HTML_Unsupported_Exception, it's - * possible to handle "any other start tag" and "any other end tag" below, - * as that guarantees execution doesn't proceed for the unimplemented tags. 
- * - * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody - */ - switch ( $token_name ) { - case 'APPLET': - case 'BASE': - case 'BASEFONT': - case 'BGSOUND': - case 'BODY': - case 'CAPTION': - case 'COL': - case 'COLGROUP': - case 'FORM': - case 'FRAME': - case 'FRAMESET': - case 'HEAD': - case 'HTML': - case 'IFRAME': - case 'LINK': - case 'MARQUEE': - case 'MATH': - case 'META': - case 'NOBR': - case 'NOEMBED': - case 'NOFRAMES': - case 'NOSCRIPT': - case 'OBJECT': - case 'OPTGROUP': - case 'OPTION': - case 'PLAINTEXT': - case 'RB': - case 'RP': - case 'RT': - case 'RTC': - case 'SARCASM': - case 'SCRIPT': - case 'SELECT': - case 'STYLE': - case 'SVG': - case 'TABLE': - case 'TBODY': - case 'TD': - case 'TEMPLATE': - case 'TEXTAREA': - case 'TFOOT': - case 'TH': - case 'THEAD': - case 'TITLE': - case 'TR': - case 'XMP': - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." ); + + /* + * > A start tag whose tag name is "select" + */ + case '+SELECT': + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + + switch ( $this->state->insertion_mode ) { + /* + * > If the insertion mode is one of "in table", "in caption", "in table body", "in row", + * > or "in cell", then switch the insertion mode to "in select in table". + */ + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE: + case WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION: + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY: + case WP_HTML_Processor_State::INSERTION_MODE_IN_ROW: + case WP_HTML_Processor_State::INSERTION_MODE_IN_CELL: + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE; + break; + + /* + * > Otherwise, switch the insertion mode to "in select". 
+ */ + default: + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT; + break; + } + return true; + + /* + * > A start tag whose tag name is one of: "optgroup", "option" + */ + case '+OPTGROUP': + case '+OPTION': + if ( $this->state->stack_of_open_elements->current_node_is( 'OPTION' ) ) { + $this->state->stack_of_open_elements->pop(); + } + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is one of: "rb", "rtc" + */ + case '+RB': + case '+RTC': + if ( $this->state->stack_of_open_elements->has_element_in_scope( 'RUBY' ) ) { + $this->generate_implied_end_tags(); + + if ( $this->state->stack_of_open_elements->current_node_is( 'RUBY' ) ) { + // @todo Indicate a parse error once it's possible. + } + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is one of: "rp", "rt" + */ + case '+RP': + case '+RT': + if ( $this->state->stack_of_open_elements->has_element_in_scope( 'RUBY' ) ) { + $this->generate_implied_end_tags( 'RTC' ); + + $current_node_name = $this->state->stack_of_open_elements->current_node()->node_name; + if ( 'RTC' === $current_node_name || 'RUBY' === $current_node_name ) { + // @todo Indicate a parse error once it's possible. + } + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "math" + */ + case '+MATH': + $this->reconstruct_active_formatting_elements(); + + /* + * @todo Adjust MathML attributes for the token. (This fixes the case of MathML attributes that are not all lowercase.) + * @todo Adjust foreign attributes for the token. (This fixes the use of namespaced attributes, in particular XLink.) + * + * These ought to be handled in the attribute methods. 
+ */ + $this->state->current_token->namespace = 'math'; + $this->insert_html_element( $this->state->current_token ); + if ( $this->state->current_token->has_self_closing_flag ) { + $this->state->stack_of_open_elements->pop(); + } + return true; + + /* + * > A start tag whose tag name is "svg" + */ + case '+SVG': + $this->reconstruct_active_formatting_elements(); + + /* + * @todo Adjust SVG attributes for the token. (This fixes the case of SVG attributes that are not all lowercase.) + * @todo Adjust foreign attributes for the token. (This fixes the use of namespaced attributes, in particular XLink in SVG.) + * + * These ought to be handled in the attribute methods. + */ + $this->state->current_token->namespace = 'svg'; + $this->insert_html_element( $this->state->current_token ); + if ( $this->state->current_token->has_self_closing_flag ) { + $this->state->stack_of_open_elements->pop(); + } + return true; + + /* + * > A start tag whose tag name is one of: "caption", "col", "colgroup", + * > "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr" + */ + case '+CAPTION': + case '+COL': + case '+COLGROUP': + case '+FRAME': + case '+HEAD': + case '+TBODY': + case '+TD': + case '+TFOOT': + case '+TH': + case '+THEAD': + case '+TR': + // Parse error. Ignore the token. + return $this->step(); } if ( ! parent::is_tag_closer() ) { @@ -1424,11 +3146,11 @@ * close anything beyond its containing `P` or `DIV` element. */ foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { - if ( $token_name === $node->node_name ) { + if ( 'html' === $node->namespace && $token_name === $node->node_name ) { break; } - if ( self::is_special( $node->node_name ) ) { + if ( self::is_special( $node ) ) { // This is a parse error, ignore the token. return $this->step(); } @@ -1446,6 +3168,1904 @@ } } } + + $this->bail( 'Should not have been able to reach end of IN BODY processing. Check HTML API code.' 
);
+		// This unnecessary return prevents tools from inaccurately reporting type errors.
+		return false;
+	}
+
+	/**
+	 * Parses next element in the 'in table' insertion mode.
+	 *
+	 * This internal function performs the 'in table' insertion mode
+	 * logic for the generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-intable
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_table(): bool {
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A character token, if the current node is table,
+			 * > tbody, template, tfoot, thead, or tr element
+			 */
+			case '#text':
+				$current_node      = $this->state->stack_of_open_elements->current_node();
+				$current_node_name = $current_node ? $current_node->node_name : null;
+				if (
+					$current_node_name && (
+						'TABLE' === $current_node_name ||
+						'TBODY' === $current_node_name ||
+						'TEMPLATE' === $current_node_name ||
+						'TFOOT' === $current_node_name ||
+						'THEAD' === $current_node_name ||
+						'TR' === $current_node_name
+					)
+				) {
+					/*
+					 * If the text is empty after processing HTML entities and stripping
+					 * U+0000 NULL bytes then ignore the token.
+					 */
+					if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
+						return $this->step();
+					}
+
+					/*
+					 * This follows the rules for "in table text" insertion mode.
+					 *
+					 * Whitespace-only text nodes are inserted in-place. Otherwise
+					 * foster parenting is enabled and the nodes would be
+					 * inserted out-of-place.
+					 *
+					 * > If any of the tokens in the pending table character tokens
+					 * > list are character tokens that are not ASCII whitespace,
+					 * > then this is a parse error: reprocess the character tokens
+					 * > in the pending table character tokens list using the rules
+					 * > given in the "anything else" entry in the "in table"
+					 * > insertion mode.
+					 * >
+					 * > Otherwise, insert the characters given by the pending table
+					 * > character tokens list.
+					 *
+					 * @see https://html.spec.whatwg.org/#parsing-main-intabletext
+					 */
+					if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
+						$this->insert_html_element( $this->state->current_token );
+						return true;
+					}
+
+					// Non-whitespace would trigger fostering, unsupported at this time.
+					$this->bail( 'Foster parenting is not supported.' );
+					break;
+				}
+				break;
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "caption"
+			 */
+			case '+CAPTION':
+				$this->state->stack_of_open_elements->clear_to_table_context();
+				$this->state->active_formatting_elements->insert_marker();
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "colgroup"
+			 */
+			case '+COLGROUP':
+				$this->state->stack_of_open_elements->clear_to_table_context();
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "col"
+			 */
+			case '+COL':
+				$this->state->stack_of_open_elements->clear_to_table_context();
+
+				/*
+				 * > Insert an HTML element for a "colgroup" start tag token with no attributes,
+				 * > then switch the insertion mode to "in column group".
+				 */
+				$this->insert_virtual_node( 'COLGROUP' );
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP;
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > A start tag whose tag name is one of: "tbody", "tfoot", "thead"
+			 */
+			case '+TBODY':
+			case '+TFOOT':
+			case '+THEAD':
+				$this->state->stack_of_open_elements->clear_to_table_context();
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is one of: "td", "th", "tr"
+			 */
+			case '+TD':
+			case '+TH':
+			case '+TR':
+				$this->state->stack_of_open_elements->clear_to_table_context();
+				/*
+				 * > Insert an HTML element for a "tbody" start tag token with no attributes,
+				 * > then switch the insertion mode to "in table body".
+				 */
+				$this->insert_virtual_node( 'TBODY' );
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY;
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > A start tag whose tag name is "table"
+			 *
+			 * This tag in the IN TABLE insertion mode is a parse error.
+			 */
+			case '+TABLE':
+				if ( ! $this->state->stack_of_open_elements->has_element_in_table_scope( 'TABLE' ) ) {
+					return $this->step();
+				}
+
+				$this->state->stack_of_open_elements->pop_until( 'TABLE' );
+				$this->reset_insertion_mode_appropriately();
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > An end tag whose tag name is "table"
+			 */
+			case '-TABLE':
+				if ( ! $this->state->stack_of_open_elements->has_element_in_table_scope( 'TABLE' ) ) {
+					// @todo Indicate a parse error once it's possible.
+					return $this->step();
+				}
+
+				$this->state->stack_of_open_elements->pop_until( 'TABLE' );
+				$this->reset_insertion_mode_appropriately();
+				return true;
+
+			/*
+			 * > An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"
+			 */
+			case '-BODY':
+			case '-CAPTION':
+			case '-COL':
+			case '-COLGROUP':
+			case '-HTML':
+			case '-TBODY':
+			case '-TD':
+			case '-TFOOT':
+			case '-TH':
+			case '-THEAD':
+			case '-TR':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is one of: "style", "script", "template"
+			 * > An end tag whose tag name is "template"
+			 */
+			case '+STYLE':
+			case '+SCRIPT':
+			case '+TEMPLATE':
+			case '-TEMPLATE':
+				/*
+				 * > Process the token using the rules for the "in head" insertion mode.
+				 */
+				return $this->step_in_head();
+
+			/*
+			 * > A start tag whose tag name is "input"
+			 *
+			 * > If the token does not have an attribute with the name "type", or if it does, but
+			 * > that attribute's value is not an ASCII case-insensitive match for the string
+			 * > "hidden", then: act as described in the "anything else" entry below.
+			 */
+			case '+INPUT':
+				$type_attribute = $this->get_attribute( 'type' );
+				if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) {
+					goto anything_else;
+				}
+				// @todo Indicate a parse error once it's possible.
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "form"
+			 *
+			 * This tag in the IN TABLE insertion mode is a parse error.
+			 */
+			case '+FORM':
+				/*
+				 * > If there is a template element on the stack of open elements, or if the
+				 * > form element pointer is not null, ignore the token.
+				 *
+				 * The TEMPLATE may sit anywhere on the stack of open elements, so the whole
+				 * stack is checked rather than performing a scope-limited search, matching
+				 * how FORM tags are handled in the IN BODY insertion mode.
+				 */
+				if (
+					$this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ||
+					isset( $this->state->form_element )
+				) {
+					return $this->step();
+				}
+
+				// This FORM is special because it immediately closes and cannot have other children.
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->form_element = $this->state->current_token;
+				$this->state->stack_of_open_elements->pop();
+				return true;
+		}
+
+		/*
+		 * > Anything else
+		 * > Parse error. Enable foster parenting, process the token using the rules for the
+		 * > "in body" insertion mode, and then disable foster parenting.
+		 *
+		 * @todo Indicate a parse error once it's possible.
+		 */
+		anything_else:
+		$this->bail( 'Foster parenting is not supported.' );
+	}
+
+	/**
+	 * Parses next element in the 'in table text' insertion mode.
+	 *
+	 * This internal function performs the 'in table text' insertion mode
+	 * logic for the generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0 Stub implementation.
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-intabletext
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_table_text(): bool {
+		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_TEXT . ' state.' );
+	}
+
+	/**
+	 * Parses next element in the 'in caption' insertion mode.
+	 *
+	 * Applies the "in caption" tree-construction rules on behalf of the
+	 * generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-incaption
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_caption(): bool {
+		$tag_name  = $this->get_tag();
+		$sigil     = $this->is_tag_closer() ? '-' : '+';
+		$operation = "{$sigil}{$tag_name}";
+
+		switch ( $operation ) {
+			/*
+			 * > An end tag whose tag name is "caption"
+			 * > A start tag whose tag name is one of: "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"
+			 * > An end tag whose tag name is "table"
+			 *
+			 * All of these close the open CAPTION in the same way. Only the
+			 * CAPTION end tag is consumed here; every other token is handed
+			 * back for reprocessing once the insertion mode has changed.
+			 */
+			case '-CAPTION':
+			case '+CAPTION':
+			case '+COL':
+			case '+COLGROUP':
+			case '+TBODY':
+			case '+TD':
+			case '+TFOOT':
+			case '+TH':
+			case '+THEAD':
+			case '+TR':
+			case '-TABLE':
+				if ( ! $this->state->stack_of_open_elements->has_element_in_table_scope( 'CAPTION' ) ) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+
+				$this->generate_implied_end_tags();
+				if ( ! $this->state->stack_of_open_elements->current_node_is( 'CAPTION' ) ) {
+					// @todo Indicate a parse error once it's possible.
+				}
+
+				$this->state->stack_of_open_elements->pop_until( 'CAPTION' );
+				$this->state->active_formatting_elements->clear_up_to_last_marker();
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE;
+
+				// Anything other than the CAPTION end tag is reprocessed in the new mode.
+				return '-CAPTION' === $operation
+					? true
+					: $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > An end tag whose tag name is one of: "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"
+			 */
+			case '-BODY':
+			case '-COL':
+			case '-COLGROUP':
+			case '-HTML':
+			case '-TBODY':
+			case '-TD':
+			case '-TFOOT':
+			case '-TH':
+			case '-THEAD':
+			case '-TR':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > Anything else
+			 * > Process the token using the rules for the "in body" insertion mode.
+			 */
+			default:
+				return $this->step_in_body();
+		}
+	}
+
+	/**
+	 * Parses next element in the 'in column group' insertion mode.
+	 *
+	 * Applies the "in column group" tree-construction rules on behalf of the
+	 * generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-incolgroup
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_column_group(): bool {
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+
+		// Only tags carry an opener/closer sigil; other tokens are matched by name alone.
+		$sigil = '';
+		if ( '#tag' === $token_type ) {
+			$sigil = parent::is_tag_closer() ? '-' : '+';
+		}
+		$operation = "{$sigil}{$token_name}";
+
+		switch ( $operation ) {
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
+			 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 */
+			case '#text':
+				if ( parent::TEXT_IS_WHITESPACE !== $this->text_node_classification ) {
+					// Any other text is handled by the "anything else" rule after the switch.
+					break;
+				}
+
+				// Insert the character.
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// @todo Indicate a parse error once it's possible.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > A start tag whose tag name is "col"
+			 */
+			case '+COL':
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->stack_of_open_elements->pop();
+				return true;
+
+			/*
+			 * > An end tag whose tag name is "col"
+			 */
+			case '-COL':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "template"
+			 * > An end tag whose tag name is "template"
+			 */
+			case '+TEMPLATE':
+			case '-TEMPLATE':
+				return $this->step_in_head();
+		}
+
+		/*
+		 * > An end tag whose tag name is "colgroup"
+		 * > Anything else
+		 *
+		 * Both rules require the current node to be a COLGROUP, pop it, and
+		 * switch to the "in table" insertion mode. They differ only in that
+		 * the COLGROUP end tag is consumed while any other token is reprocessed.
+		 */
+		if ( ! $this->state->stack_of_open_elements->current_node_is( 'COLGROUP' ) ) {
+			// @todo Indicate a parse error once it's possible.
+			return $this->step();
+		}
+
+		$this->state->stack_of_open_elements->pop();
+		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE;
+
+		if ( '-COLGROUP' === $operation ) {
+			return true;
+		}
+
+		return $this->step( self::REPROCESS_CURRENT_NODE );
+	}
+
+	/**
+	 * Parses next element in the 'in table body' insertion mode.
+	 *
+	 * Applies the "in table body" tree-construction rules on behalf of the
+	 * generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-intbody
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_table_body(): bool {
+		$tag_name  = $this->get_tag();
+		$sigil     = $this->is_tag_closer() ? '-' : '+';
+		$operation = "{$sigil}{$tag_name}";
+
+		switch ( $operation ) {
+			/*
+			 * > A start tag whose tag name is "tr"
+			 */
+			case '+TR':
+				$this->state->stack_of_open_elements->clear_to_table_body_context();
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is one of: "th", "td"
+			 *
+			 * A cell without a row: a virtual TR is created and the cell
+			 * token is reprocessed in the "in row" insertion mode.
+			 */
+			case '+TH':
+			case '+TD':
+				// @todo Indicate a parse error once it's possible.
+				$this->state->stack_of_open_elements->clear_to_table_body_context();
+				$this->insert_virtual_node( 'TR' );
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW;
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > An end tag whose tag name is one of: "tbody", "tfoot", "thead"
+			 */
+			case '-TBODY':
+			case '-TFOOT':
+			case '-THEAD':
+				$section_is_open = $this->state->stack_of_open_elements->has_element_in_table_scope( $tag_name );
+				if ( ! $section_is_open ) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+
+				$this->state->stack_of_open_elements->clear_to_table_body_context();
+				$this->state->stack_of_open_elements->pop();
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is one of: "caption", "col", "colgroup", "tbody", "tfoot", "thead"
+			 * > An end tag whose tag name is "table"
+			 */
+			case '+CAPTION':
+			case '+COL':
+			case '+COLGROUP':
+			case '+TBODY':
+			case '+TFOOT':
+			case '+THEAD':
+			case '-TABLE':
+				$any_section_is_open = (
+					$this->state->stack_of_open_elements->has_element_in_table_scope( 'TBODY' ) ||
+					$this->state->stack_of_open_elements->has_element_in_table_scope( 'THEAD' ) ||
+					$this->state->stack_of_open_elements->has_element_in_table_scope( 'TFOOT' )
+				);
+				if ( ! $any_section_is_open ) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+
+				// Close the open table section, then let the "in table" rules see this token.
+				$this->state->stack_of_open_elements->clear_to_table_body_context();
+				$this->state->stack_of_open_elements->pop();
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE;
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html", "td", "th", "tr"
+			 */
+			case '-BODY':
+			case '-CAPTION':
+			case '-COL':
+			case '-COLGROUP':
+			case '-HTML':
+			case '-TD':
+			case '-TH':
+			case '-TR':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > Anything else
+			 * > Process the token using the rules for the "in table" insertion mode.
+			 */
+			default:
+				return $this->step_in_table();
+		}
+	}
+
+	/**
+	 * Parses next element in the 'in row' insertion mode.
+	 *
+	 * This internal function performs the 'in row' insertion mode
+	 * logic for the generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-intr
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_row(): bool {
+		$tag_name  = $this->get_tag();
+		$sigil     = $this->is_tag_closer() ? '-' : '+';
+		$operation = "{$sigil}{$tag_name}";
+
+		switch ( $operation ) {
+			/*
+			 * > A start tag whose tag name is one of: "th", "td"
+			 */
+			case '+TH':
+			case '+TD':
+				$this->state->stack_of_open_elements->clear_to_table_row_context();
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_CELL;
+				$this->state->active_formatting_elements->insert_marker();
+				return true;
+
+			/*
+			 * > An end tag whose tag name is "tr"
+			 * > A start tag whose tag name is one of: "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr"
+			 * > An end tag whose tag name is "table"
+			 * > An end tag whose tag name is one of: "tbody", "tfoot", "thead"
+			 *
+			 * Each of these closes the open TR and returns to the "in table body"
+			 * insertion mode. The TR end tag is consumed; all other tokens are
+			 * reprocessed. The table-section end tags are additionally ignored
+			 * when no matching section is in table scope.
+			 */
+			case '-TBODY':
+			case '-TFOOT':
+			case '-THEAD':
+				if ( ! $this->state->stack_of_open_elements->has_element_in_table_scope( $tag_name ) ) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+				// The remaining handling is shared with the cases below.
+
+			case '-TR':
+			case '+CAPTION':
+			case '+COL':
+			case '+COLGROUP':
+			case '+TBODY':
+			case '+TFOOT':
+			case '+THEAD':
+			case '+TR':
+			case '-TABLE':
+				if ( ! $this->state->stack_of_open_elements->has_element_in_table_scope( 'TR' ) ) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+
+				$this->state->stack_of_open_elements->clear_to_table_row_context();
+				$this->state->stack_of_open_elements->pop();
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY;
+
+				if ( '-TR' === $operation ) {
+					return true;
+				}
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html", "td", "th"
+			 */
+			case '-BODY':
+			case '-CAPTION':
+			case '-COL':
+			case '-COLGROUP':
+			case '-HTML':
+			case '-TD':
+			case '-TH':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > Anything else
+			 * > Process the token using the rules for the "in table" insertion mode.
+			 */
+			default:
+				return $this->step_in_table();
+		}
+	}
+
+	/**
+	 * Parses next element in the 'in cell' insertion mode.
+	 *
+	 * Applies the "in cell" tree-construction rules on behalf of the
+	 * generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-intd
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_cell(): bool {
+		$tag_name  = $this->get_tag();
+		$sigil     = $this->is_tag_closer() ? '-' : '+';
+		$operation = "{$sigil}{$tag_name}";
+
+		switch ( $operation ) {
+			/*
+			 * > An end tag whose tag name is one of: "td", "th"
+			 */
+			case '-TD':
+			case '-TH':
+				if ( ! $this->state->stack_of_open_elements->has_element_in_table_scope( $tag_name ) ) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+
+				$this->generate_implied_end_tags();
+
+				/*
+				 * @todo This needs to check if the current node is an HTML element, meaning that
+				 *       when SVG and MathML support is added, this needs to differentiate between
+				 *       an HTML element of the given name and a foreign element of the same
+				 *       given name.
+				 */
+				if ( ! $this->state->stack_of_open_elements->current_node_is( $tag_name ) ) {
+					// @todo Indicate a parse error once it's possible.
+				}
+
+				$this->state->stack_of_open_elements->pop_until( $tag_name );
+				$this->state->active_formatting_elements->clear_up_to_last_marker();
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is one of: "caption", "col", "colgroup", "tbody", "td",
+			 * > "tfoot", "th", "thead", "tr"
+			 *
+			 * > Assert: The stack of open elements has a td or th element in table scope.
+			 *
+			 * Nothing to do for the assertion, except to verify in tests that this never appears.
+			 */
+			case '+CAPTION':
+			case '+COL':
+			case '+COLGROUP':
+			case '+TBODY':
+			case '+TD':
+			case '+TFOOT':
+			case '+TH':
+			case '+THEAD':
+			case '+TR':
+				$this->close_cell();
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > An end tag whose tag name is one of: "body", "caption", "col", "colgroup", "html"
+			 */
+			case '-BODY':
+			case '-CAPTION':
+			case '-COL':
+			case '-COLGROUP':
+			case '-HTML':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > An end tag whose tag name is one of: "table", "tbody", "tfoot", "thead", "tr"
+			 */
+			case '-TABLE':
+			case '-TBODY':
+			case '-TFOOT':
+			case '-THEAD':
+			case '-TR':
+				$target_is_open = $this->state->stack_of_open_elements->has_element_in_table_scope( $tag_name );
+				if ( ! $target_is_open ) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+
+				$this->close_cell();
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > Anything else
+			 * > Process the token using the rules for the "in body" insertion mode.
+			 */
+			default:
+				return $this->step_in_body();
+		}
+	}
+
+	/**
+	 * Parses next element in the 'in select' insertion mode.
+	 *
+	 * This internal function performs the 'in select' insertion mode
+	 * logic for the generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_select(): bool {
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
+
+		switch ( $op ) {
+			/*
+			 * > Any other character token
+			 */
+			case '#text':
+				/*
+				 * > A character token that is U+0000 NULL
+				 *
+				 * If a text node only comprises null bytes then it should be
+				 * entirely ignored and should not return to calling code.
+				 */
+				if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > A start tag whose tag name is "option"
+			 */
+			case '+OPTION':
+				if ( $this->state->stack_of_open_elements->current_node_is( 'OPTION' ) ) {
+					$this->state->stack_of_open_elements->pop();
+				}
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "optgroup"
+			 * > A start tag whose tag name is "hr"
+			 *
+			 * These rules are identical except for the treatment of the self-closing flag and
+			 * the subsequent pop of the HR void element, all of which is handled elsewhere in the processor.
+			 */
+			case '+OPTGROUP':
+			case '+HR':
+				if ( $this->state->stack_of_open_elements->current_node_is( 'OPTION' ) ) {
+					$this->state->stack_of_open_elements->pop();
+				}
+
+				if ( $this->state->stack_of_open_elements->current_node_is( 'OPTGROUP' ) ) {
+					$this->state->stack_of_open_elements->pop();
+				}
+
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > An end tag whose tag name is "optgroup"
+			 */
+			case '-OPTGROUP':
+				$current_node = $this->state->stack_of_open_elements->current_node();
+				if ( $current_node && 'OPTION' === $current_node->node_name ) {
+					/*
+					 * Only the first node yielded by walk_up() is inspected.
+					 * Start from null so that an empty walk leaves a defined
+					 * value instead of reading an unset variable.
+					 */
+					$parent = null;
+					foreach ( $this->state->stack_of_open_elements->walk_up( $current_node ) as $ancestor ) {
+						$parent = $ancestor;
+						break;
+					}
+
+					// An OPTION directly inside an OPTGROUP is closed along with the group.
+					if ( $parent && 'OPTGROUP' === $parent->node_name ) {
+						$this->state->stack_of_open_elements->pop();
+					}
+				}
+
+				if ( $this->state->stack_of_open_elements->current_node_is( 'OPTGROUP' ) ) {
+					$this->state->stack_of_open_elements->pop();
+					return true;
+				}
+
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > An end tag whose tag name is "option"
+			 */
+			case '-OPTION':
+				if ( $this->state->stack_of_open_elements->current_node_is( 'OPTION' ) ) {
+					$this->state->stack_of_open_elements->pop();
+					return true;
+				}
+
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > An end tag whose tag name is "select"
+			 * > A start tag whose tag name is "select"
+			 *
+			 * > It just gets treated like an end tag.
+			 */
+			case '-SELECT':
+			case '+SELECT':
+				if ( ! $this->state->stack_of_open_elements->has_element_in_select_scope( 'SELECT' ) ) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+				$this->state->stack_of_open_elements->pop_until( 'SELECT' );
+				$this->reset_insertion_mode_appropriately();
+				return true;
+
+			/*
+			 * > A start tag whose tag name is one of: "input", "keygen", "textarea"
+			 *
+			 * All three of these tags are considered a parse error when found in this insertion mode.
+			 * The open SELECT is closed and the token is reprocessed in the resulting mode.
+			 */
+			case '+INPUT':
+			case '+KEYGEN':
+			case '+TEXTAREA':
+				if ( ! $this->state->stack_of_open_elements->has_element_in_select_scope( 'SELECT' ) ) {
+					// Ignore the token.
+					return $this->step();
+				}
+				$this->state->stack_of_open_elements->pop_until( 'SELECT' );
+				$this->reset_insertion_mode_appropriately();
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > A start tag whose tag name is one of: "script", "template"
+			 * > An end tag whose tag name is "template"
+			 */
+			case '+SCRIPT':
+			case '+TEMPLATE':
+			case '-TEMPLATE':
+				return $this->step_in_head();
+		}
+
+		/*
+		 * > Anything else
+		 * > Parse error: ignore the token.
+		 */
+		return $this->step();
+	}
+
+	/**
+	 * Parses next element in the 'in select in table' insertion mode.
+	 *
+	 * This internal function performs the 'in select in table' insertion mode
+	 * logic for the generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-inselectintable
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_select_in_table(): bool {
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A start tag whose tag name is one of: "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"
+			 */
+			case '+CAPTION':
+			case '+TABLE':
+			case '+TBODY':
+			case '+TFOOT':
+			case '+THEAD':
+			case '+TR':
+			case '+TD':
+			case '+TH':
+				// @todo Indicate a parse error once it's possible.
+				$this->state->stack_of_open_elements->pop_until( 'SELECT' );
+				$this->reset_insertion_mode_appropriately();
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+
+			/*
+			 * > An end tag whose tag name is one of: "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"
+			 *
+			 * The end tag only closes the SELECT when an element of the same
+			 * name is in table scope; otherwise the token is ignored.
+			 */
+			case '-CAPTION':
+			case '-TABLE':
+			case '-TBODY':
+			case '-TFOOT':
+			case '-THEAD':
+			case '-TR':
+			case '-TD':
+			case '-TH':
+				// @todo Indicate a parse error once it's possible.
+				if ( ! $this->state->stack_of_open_elements->has_element_in_table_scope( $token_name ) ) {
+					return $this->step();
+				}
+				$this->state->stack_of_open_elements->pop_until( 'SELECT' );
+				$this->reset_insertion_mode_appropriately();
+				return $this->step( self::REPROCESS_CURRENT_NODE );
+		}
+
+		/*
+		 * > Anything else
+		 */
+		return $this->step_in_select();
+	}
+
+	/**
+	 * Parses next element in the 'in template' insertion mode.
+	 *
+	 * This internal function performs the 'in template' insertion mode
+	 * logic for the generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-intemplate
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_template(): bool {
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$is_closer  = $this->is_tag_closer();
+		$op_sigil   = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
+
+		/*
+		 * Most rules in this mode replace the current template insertion mode
+		 * with another mode and reprocess the token there. The switch selects
+		 * that replacement mode; the swap itself happens once, after the switch.
+		 */
+		switch ( $op ) {
+			/*
+			 * > A character token
+			 * > A comment token
+			 * > A DOCTYPE token
+			 */
+			case '#text':
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+			case 'html':
+				return $this->step_in_body();
+
+			/*
+			 * > A start tag whose tag name is one of: "base", "basefont", "bgsound", "link",
+			 * > "meta", "noframes", "script", "style", "template", "title"
+			 * > An end tag whose tag name is "template"
+			 */
+			case '+BASE':
+			case '+BASEFONT':
+			case '+BGSOUND':
+			case '+LINK':
+			case '+META':
+			case '+NOFRAMES':
+			case '+SCRIPT':
+			case '+STYLE':
+			case '+TEMPLATE':
+			case '+TITLE':
+			case '-TEMPLATE':
+				return $this->step_in_head();
+
+			/*
+			 * > A start tag whose tag name is one of: "caption", "colgroup", "tbody", "tfoot", "thead"
+			 */
+			case '+CAPTION':
+			case '+COLGROUP':
+			case '+TBODY':
+			case '+TFOOT':
+			case '+THEAD':
+				$next_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE;
+				break;
+
+			/*
+			 * > A start tag whose tag name is "col"
+			 */
+			case '+COL':
+				$next_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP;
+				break;
+
+			/*
+			 * > A start tag whose tag name is "tr"
+			 */
+			case '+TR':
+				$next_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY;
+				break;
+
+			/*
+			 * > A start tag whose tag name is one of: "td", "th"
+			 */
+			case '+TD':
+			case '+TH':
+				$next_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW;
+				break;
+
+			default:
+				/*
+				 * > Any other end tag
+				 */
+				if ( $is_closer ) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+
+				/*
+				 * > Any other start tag
+				 */
+				$next_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+				break;
+		}
+
+		/*
+		 * @todo The "end-of-file token" rule of this insertion mode is not handled
+		 *       here. Every token that reaches this point has been classified as
+		 *       either a closer or a non-closer, so code placed after those two
+		 *       checks could never run. Handling end-of-file needs an explicit
+		 *       way to detect it.
+		 */
+
+		// Replace the current template insertion mode and reprocess the token in it.
+		array_pop( $this->state->stack_of_template_insertion_modes );
+		$this->state->stack_of_template_insertion_modes[] = $next_mode;
+		$this->state->insertion_mode                      = $next_mode;
+		return $this->step( self::REPROCESS_CURRENT_NODE );
+	}
+
+	/**
+	 * Parses next element in the 'after body' insertion mode.
+	 *
+	 * This internal function performs the 'after body' insertion mode
+	 * logic for the generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0 Stub implementation.
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-afterbody
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_after_body(): bool {
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+
+		// Only tags carry an opener/closer sigil; other tokens are matched by name alone.
+		$sigil = '';
+		if ( '#tag' === $token_type ) {
+			$sigil = $this->is_tag_closer() ? '-' : '+';
+		}
+		$operation = "{$sigil}{$token_name}";
+
+		switch ( $operation ) {
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
+			 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 *
+			 * > Process the token using the rules for the "in body" insertion mode.
+			 */
+			case '#text':
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
+					return $this->step_in_body();
+				}
+				// Any other text is handled by the "anything else" rule after the switch.
+				break;
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->bail( 'Content outside of BODY is unsupported.' );
+				break;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > An end tag whose tag name is "html"
+			 *
+			 * > If the parser was created as part of the HTML fragment parsing algorithm,
+			 * > this is a parse error; ignore the token. (fragment case)
+			 * >
+			 * > Otherwise, switch the insertion mode to "after after body".
+			 */
+			case '-HTML':
+				if ( ! isset( $this->context_node ) ) {
+					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY;
+				}
+
+				/*
+				 * The HTML element is not removed from the stack of open elements.
+				 * At most the insertion mode has changed; nothing has been pushed
+				 * or popped, so this does not count as a "step" through the
+				 * document. Proceed to parse the next item.
+				 */
+				return $this->step();
+		}
+
+		/*
+		 * > Anything else
+		 * > Parse error. Switch the insertion mode to "in body" and reprocess the token.
+		 */
+		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+		return $this->step( self::REPROCESS_CURRENT_NODE );
+	}
+
+	/**
+	 * Parses next element in the 'in frameset' insertion mode.
+	 *
+	 * Applies the "in frameset" tree-construction rules on behalf of the
+	 * generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
+	 *
+	 * @see https://html.spec.whatwg.org/#parsing-main-inframeset
+	 * @see WP_HTML_Processor::step
+	 *
+	 * @return bool Whether an element was found.
+	 */
+	private function step_in_frameset(): bool {
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+
+		// Only tags carry an opener/closer sigil; other tokens are matched by name alone.
+		$sigil = '';
+		if ( '#tag' === $token_type ) {
+			$sigil = $this->is_tag_closer() ? '-' : '+';
+		}
+		$operation = "{$sigil}{$token_name}";
+
+		switch ( $operation ) {
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
+			 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 * >
+			 * > Insert the character.
+			 *
+			 * This algorithm effectively strips non-whitespace characters from text and inserts
+			 * them under HTML. This is not supported at this time.
+			 */
+			case '#text':
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
+					return $this->step_in_body();
+				}
+				$this->bail( 'Non-whitespace characters cannot be handled in frameset.' );
+				break;
+
+			/*
+			 * > A comment token
+			 * > A start tag whose tag name is "frameset"
+			 *
+			 * Both rules insert the token and report it as found.
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+			case '+FRAMESET':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > An end tag whose tag name is "frameset"
+			 */
+			case '-FRAMESET':
+				/*
+				 * > If the current node is the root html element, then this is a parse error;
+				 * > ignore the token. (fragment case)
+				 */
+				if ( $this->state->stack_of_open_elements->current_node_is( 'HTML' ) ) {
+					return $this->step();
+				}
+
+				/*
+				 * > Otherwise, pop the current node from the stack of open elements.
+				 */
+				$this->state->stack_of_open_elements->pop();
+
+				/*
+				 * > If the parser was not created as part of the HTML fragment parsing algorithm
+				 * > (fragment case), and the current node is no longer a frameset element, then
+				 * > switch the insertion mode to "after frameset".
+				 */
+				$stays_in_frameset = (
+					isset( $this->context_node ) ||
+					$this->state->stack_of_open_elements->current_node_is( 'FRAMESET' )
+				);
+				if ( ! $stays_in_frameset ) {
+					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET;
+				}
+
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "frame"
+			 *
+			 * > Insert an HTML element for the token. Immediately pop the
+			 * > current node off the stack of open elements.
+			 * >
+			 * > Acknowledge the token's self-closing flag, if it is set.
+			 */
+			case '+FRAME':
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->stack_of_open_elements->pop();
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "noframes"
+			 */
+			case '+NOFRAMES':
+				return $this->step_in_head();
+		}
+
+		// Parse error: ignore the token.
+		return $this->step();
+	}
+
+	/**
+	 * Parses next element in the 'after frameset' insertion mode.
+	 *
+	 * This internal function performs the 'after frameset' insertion mode
+	 * logic for the generalized WP_HTML_Processor::step() function.
+	 *
+	 * @since 6.7.0 Stub implementation.
+ * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/#parsing-main-afterframeset + * @see WP_HTML_Processor::step + * + * @return bool Whether an element was found. + */ + private function step_after_frameset(): bool { + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Insert the character. + * + * This algorithm effectively strips non-whitespace characters from text and inserts + * them under HTML. This is not supported at this time. + */ + case '#text': + if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { + return $this->step_in_body(); + } + $this->bail( 'Non-whitespace characters cannot be handled in after frameset' ); + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > An end tag whose tag name is "html" + */ + case '-HTML': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET; + /* + * The HTML element is not removed from the stack of open elements. + * Only internal state has changed, this does not qualify as a "step" + * in terms of advancing through the document to another token. + * Nothing has been pushed or popped. + * Proceed to parse the next item. 
+ */ + return $this->step(); + + /* + * > A start tag whose tag name is "noframes" + */ + case '+NOFRAMES': + return $this->step_in_head(); + } + + // Parse error: ignore the token. + return $this->step(); + } + + /** + * Parses next element in the 'after after body' insertion mode. + * + * This internal function performs the 'after after body' insertion mode + * logic for the generalized WP_HTML_Processor::step() function. + * + * @since 6.7.0 Stub implementation. + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/#the-after-after-body-insertion-mode + * @see WP_HTML_Processor::step + * + * @return bool Whether an element was found. + */ + private function step_after_after_body(): bool { + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->bail( 'Content outside of HTML is unsupported.' ); + break; + + /* + * > A DOCTYPE token + * > A start tag whose tag name is "html" + * + * > Process the token using the rules for the "in body" insertion mode. + */ + case 'html': + case '+HTML': + return $this->step_in_body(); + + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Process the token using the rules for the "in body" insertion mode. + */ + case '#text': + if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { + return $this->step_in_body(); + } + goto after_after_body_anything_else; + break; + } + + /* + * > Parse error. Switch the insertion mode to "in body" and reprocess the token. 
+ */ + after_after_body_anything_else: + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /** + * Parses next element in the 'after after frameset' insertion mode. + * + * This internal function performs the 'after after frameset' insertion mode + * logic for the generalized WP_HTML_Processor::step() function. + * + * @since 6.7.0 Stub implementation. + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/#the-after-after-frameset-insertion-mode + * @see WP_HTML_Processor::step + * + * @return bool Whether an element was found. + */ + private function step_after_after_frameset(): bool { + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->bail( 'Content outside of HTML is unsupported.' ); + break; + + /* + * > A DOCTYPE token + * > A start tag whose tag name is "html" + * + * > Process the token using the rules for the "in body" insertion mode. + */ + case 'html': + case '+HTML': + return $this->step_in_body(); + + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Process the token using the rules for the "in body" insertion mode. + * + * This algorithm effectively strips non-whitespace characters from text and inserts + * them under HTML. This is not supported at this time. + */ + case '#text': + if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) { + return $this->step_in_body(); + } + $this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' 
); + break; + + /* + * > A start tag whose tag name is "noframes" + */ + case '+NOFRAMES': + return $this->step_in_head(); + } + + // Parse error: ignore the token. + return $this->step(); + } + + /** + * Parses next element in the 'in foreign content' insertion mode. + * + * This internal function performs the 'in foreign content' insertion mode + * logic for the generalized WP_HTML_Processor::step() function. + * + * @since 6.7.0 Stub implementation. + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/#parsing-main-inforeign + * @see WP_HTML_Processor::step + * + * @return bool Whether an element was found. + */ + private function step_in_foreign_content(): bool { + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + /* + * > A start tag whose name is "font", if the token has any attributes named "color", "face", or "size" + * + * This section drawn out above the switch to more easily incorporate + * the additional rules based on the presence of the attributes. + */ + if ( + '+FONT' === $op && + ( + null !== $this->get_attribute( 'color' ) || + null !== $this->get_attribute( 'face' ) || + null !== $this->get_attribute( 'size' ) + ) + ) { + $op = '+FONT with attributes'; + } + + switch ( $op ) { + case '#text': + /* + * > A character token that is U+0000 NULL + * + * This is handled by `get_modifiable_text()`. + */ + + /* + * Whitespace-only text does not affect the frameset-ok flag. + * It is probably inter-element whitespace, but it may also + * contain character references which decode only to whitespace. 
+ */ + if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) { + $this->state->frameset_ok = false; + } + + $this->insert_foreign_element( $this->state->current_token, false ); + return true; + + /* + * CDATA sections are alternate wrappers for text content and therefore + * ought to follow the same rules as text nodes. + */ + case '#cdata-section': + /* + * NULL bytes and whitespace do not change the frameset-ok flag. + */ + $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + $cdata_content_start = $current_token->start + 9; + $cdata_content_length = $current_token->length - 12; + if ( strspn( $this->html, "\0 \t\n\f\r", $cdata_content_start, $cdata_content_length ) !== $cdata_content_length ) { + $this->state->frameset_ok = false; + } + + $this->insert_foreign_element( $this->state->current_token, false ); + return true; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_foreign_element( $this->state->current_token, false ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "b", "big", "blockquote", "body", "br", "center", + * > "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", + * > "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol", + * > "p", "pre", "ruby", "s", "small", "span", "strong", "strike", "sub", "sup", + * > "table", "tt", "u", "ul", "var" + * + * > A start tag whose name is "font", if the token has any attributes named "color", "face", or "size" + * + * > An end tag whose tag name is "br", "p" + * + * Closing BR tags are always reported by the Tag Processor as opening tags. 
+ */ + case '+B': + case '+BIG': + case '+BLOCKQUOTE': + case '+BODY': + case '+BR': + case '+CENTER': + case '+CODE': + case '+DD': + case '+DIV': + case '+DL': + case '+DT': + case '+EM': + case '+EMBED': + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + case '+HEAD': + case '+HR': + case '+I': + case '+IMG': + case '+LI': + case '+LISTING': + case '+MENU': + case '+META': + case '+NOBR': + case '+OL': + case '+P': + case '+PRE': + case '+RUBY': + case '+S': + case '+SMALL': + case '+SPAN': + case '+STRONG': + case '+STRIKE': + case '+SUB': + case '+SUP': + case '+TABLE': + case '+TT': + case '+U': + case '+UL': + case '+VAR': + case '+FONT with attributes': + case '-BR': + case '-P': + // @todo Indicate a parse error once it's possible. + foreach ( $this->state->stack_of_open_elements->walk_up() as $current_node ) { + if ( + 'math' === $current_node->integration_node_type || + 'html' === $current_node->integration_node_type || + 'html' === $current_node->namespace + ) { + break; + } + + $this->state->stack_of_open_elements->pop(); + } + goto in_foreign_content_process_in_current_insertion_mode; + } + + /* + * > Any other start tag + */ + if ( ! $this->is_tag_closer() ) { + $this->insert_foreign_element( $this->state->current_token, false ); + + /* + * > If the token has its self-closing flag set, then run + * > the appropriate steps from the following list: + * > + * > ↪ the token's tag name is "script", and the new current node is in the SVG namespace + * > Acknowledge the token's self-closing flag, and then act as + * > described in the steps for a "script" end tag below. + * > + * > ↪ Otherwise + * > Pop the current node off the stack of open elements and + * > acknowledge the token's self-closing flag. + * + * Since the rules for SCRIPT below indicate to pop the element off of the stack of + * open elements, which is the same for the Otherwise condition, there's no need to + * separate these checks. 
The difference comes when a parser operates with the scripting + * flag enabled, and executes the script, which this parser does not support. + */ + if ( $this->state->current_token->has_self_closing_flag ) { + $this->state->stack_of_open_elements->pop(); + } + return true; + } + + /* + * > An end tag whose name is "script", if the current node is an SVG script element. + */ + if ( $this->is_tag_closer() && 'SCRIPT' === $this->state->current_token->node_name && 'svg' === $this->state->current_token->namespace ) { + $this->state->stack_of_open_elements->pop(); + return true; + } + + /* + * > Any other end tag + */ + if ( $this->is_tag_closer() ) { + $node = $this->state->stack_of_open_elements->current_node(); + if ( $tag_name !== $node->node_name ) { + // @todo Indicate a parse error once it's possible. + } + in_foreign_content_end_tag_loop: + if ( $node === $this->state->stack_of_open_elements->at( 1 ) ) { + return true; + } + + /* + * > If node's tag name, converted to ASCII lowercase, is the same as the tag name + * > of the token, pop elements from the stack of open elements until node has + * > been popped from the stack, and then return. 
+ */ + if ( 0 === strcasecmp( $node->node_name, $tag_name ) ) { + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + $this->state->stack_of_open_elements->pop(); + if ( $node === $item ) { + return true; + } + } + } + + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { + $node = $item; + break; + } + + if ( 'html' !== $node->namespace ) { + goto in_foreign_content_end_tag_loop; + } + + in_foreign_content_process_in_current_insertion_mode: + switch ( $this->state->insertion_mode ) { + case WP_HTML_Processor_State::INSERTION_MODE_INITIAL: + return $this->step_initial(); + + case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML: + return $this->step_before_html(); + + case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD: + return $this->step_before_head(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD: + return $this->step_in_head(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT: + return $this->step_in_head_noscript(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD: + return $this->step_after_head(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY: + return $this->step_in_body(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE: + return $this->step_in_table(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_TEXT: + return $this->step_in_table_text(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION: + return $this->step_in_caption(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP: + return $this->step_in_column_group(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY: + return $this->step_in_table_body(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_ROW: + return $this->step_in_row(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_CELL: + return $this->step_in_cell(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT: + return $this->step_in_select(); + + case 
WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE: + return $this->step_in_select_in_table(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE: + return $this->step_in_template(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY: + return $this->step_after_body(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET: + return $this->step_in_frameset(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET: + return $this->step_after_frameset(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY: + return $this->step_after_after_body(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET: + return $this->step_after_after_frameset(); + + // This should be unreachable but PHP doesn't have total type checking on switch. + default: + $this->bail( "Unaware of the requested parsing mode: '{$this->state->insertion_mode}'." ); + } + } + + $this->bail( 'Should not have been able to reach end of IN FOREIGN CONTENT processing. Check HTML API code.' ); + // This unnecessary return prevents tools from inaccurately reporting type errors. + return false; } /* @@ -1476,6 +5096,19 @@ */ /** + * Indicates the namespace of the current token, or "html" if there is none. + * + * @return string One of "html", "math", or "svg". + */ + public function get_namespace(): string { + if ( ! isset( $this->current_element ) ) { + return parent::get_namespace(); + } + + return $this->current_element->token->namespace; + } + + /** * Returns the uppercase name of the matched tag. * * The semantic rules for HTML specify that certain tags be reprocessed @@ -1496,7 +5129,7 @@ * * @return string|null Name of currently matched tag in input HTML, or `null` if none found. 
*/ - public function get_tag() { + public function get_tag(): ?string { if ( null !== $this->last_error ) { return null; } @@ -1507,17 +5140,13 @@ $tag_name = parent::get_tag(); - switch ( $tag_name ) { - case 'IMAGE': - /* - * > A start tag whose tag name is "image" - * > Change the token's tag name to "img" and reprocess it. (Don't ask.) - */ - return 'IMG'; - - default: - return $tag_name; - } + /* + * > A start tag whose tag name is "image" + * > Change the token's tag name to "img" and reprocess it. (Don't ask.) + */ + return ( 'IMAGE' === $tag_name && 'html' === $this->get_namespace() ) + ? 'IMG' + : $tag_name; } /** @@ -1537,7 +5166,7 @@ * * @return bool Whether the currently matched tag contains the self-closing flag. */ - public function has_self_closing_flag() { + public function has_self_closing_flag(): bool { return $this->is_virtual() ? false : parent::has_self_closing_flag(); } @@ -1561,7 +5190,7 @@ * * @return string|null Name of the matched token. */ - public function get_token_name() { + public function get_token_name(): ?string { return $this->is_virtual() ? $this->current_element->token->node_name : parent::get_token_name(); @@ -1589,7 +5218,7 @@ * * @return string|null What kind of token is matched, or null. */ - public function get_token_type() { + public function get_token_type(): ?string { if ( $this->is_virtual() ) { /* * This logic comes from the Tag Processor. @@ -1651,7 +5280,7 @@ * @param string|bool $value The new attribute value. * @return bool Whether an attribute value was set. */ - public function set_attribute( $name, $value ) { + public function set_attribute( $name, $value ): bool { return $this->is_virtual() ? false : parent::set_attribute( $name, $value ); } @@ -1663,7 +5292,7 @@ * @param string $name The attribute name to remove. * @return bool Whether an attribute was removed. */ - public function remove_attribute( $name ) { + public function remove_attribute( $name ): bool { return $this->is_virtual() ? 
false : parent::remove_attribute( $name ); } @@ -1693,7 +5322,7 @@ * @param string $prefix Prefix of requested attribute names. * @return array|null List of attribute names, or `null` when no tag opener is matched. */ - public function get_attribute_names_with_prefix( $prefix ) { + public function get_attribute_names_with_prefix( $prefix ): ?array { return $this->is_virtual() ? null : parent::get_attribute_names_with_prefix( $prefix ); } @@ -1705,7 +5334,7 @@ * @param string $class_name The class name to add. * @return bool Whether the class was set to be added. */ - public function add_class( $class_name ) { + public function add_class( $class_name ): bool { return $this->is_virtual() ? false : parent::add_class( $class_name ); } @@ -1717,7 +5346,7 @@ * @param string $class_name The class name to remove. * @return bool Whether the class was set to be removed. */ - public function remove_class( $class_name ) { + public function remove_class( $class_name ): bool { return $this->is_virtual() ? false : parent::remove_class( $class_name ); } @@ -1726,10 +5355,14 @@ * * @since 6.6.0 Subclassed for the HTML Processor. * + * @todo When reconstructing active formatting elements with attributes, find a way + * to indicate if the virtually-reconstructed formatting elements contain the + * wanted class name. + * * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive. * @return bool|null Whether the matched tag contains the given class name, or null if not matched. */ - public function has_class( $wanted_class ) { + public function has_class( $wanted_class ): ?bool { return $this->is_virtual() ? null : parent::has_class( $wanted_class ); } @@ -1773,7 +5406,7 @@ * * @return string */ - public function get_modifiable_text() { + public function get_modifiable_text(): string { return $this->is_virtual() ? 
'' : parent::get_modifiable_text(); } @@ -1796,7 +5429,7 @@ * * @return string|null */ - public function get_comment_type() { + public function get_comment_type(): ?string { return $this->is_virtual() ? null : parent::get_comment_type(); } @@ -1811,7 +5444,7 @@ * @param string $bookmark_name Name of the bookmark to remove. * @return bool Whether the bookmark already existed before removal. */ - public function release_bookmark( $bookmark_name ) { + public function release_bookmark( $bookmark_name ): bool { return parent::release_bookmark( "_{$bookmark_name}" ); } @@ -1832,7 +5465,7 @@ * @param string $bookmark_name Jump to the place in the document identified by this bookmark name. * @return bool Whether the internal cursor was successfully moved to the bookmark's location. */ - public function seek( $bookmark_name ) { + public function seek( $bookmark_name ): bool { // Flush any pending updates to the document before beginning. $this->get_updated_html(); @@ -1841,7 +5474,6 @@ ? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start : 0; $bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start; - $bookmark_length = $this->bookmarks[ $actual_bookmark_name ]->length; $direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward'; /* @@ -1874,46 +5506,92 @@ * and computation time. */ if ( 'backward' === $direction ) { - /* - * Instead of clearing the parser state and starting fresh, calling the stack methods - * maintains the proper flags in the parser. + + /* + * When moving backward, stateful stacks should be cleared. 
*/ foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { - if ( 'context-node' === $item->bookmark_name ) { - break; - } - $this->state->stack_of_open_elements->remove_node( $item ); } foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { - if ( 'context-node' === $item->bookmark_name ) { - break; - } - $this->state->active_formatting_elements->remove_node( $item ); } - parent::seek( 'context-node' ); - $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; - $this->state->frameset_ok = true; - $this->element_queue = array(); - $this->current_element = null; + /* + * **After** clearing stacks, more processor state can be reset. + * This must be done after clearing the stack because those stacks generate events that + * would appear on a subsequent call to `next_token()`. + */ + $this->state->frameset_ok = true; + $this->state->stack_of_template_insertion_modes = array(); + $this->state->head_element = null; + $this->state->form_element = null; + $this->state->current_token = null; + $this->current_element = null; + $this->element_queue = array(); + + /* + * The absence of a context node indicates a full parse. + * The presence of a context node indicates a fragment parser. + */ + if ( null === $this->context_node ) { + $this->change_parsing_namespace( 'html' ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_INITIAL; + $this->breadcrumbs = array(); + + $this->bookmarks['initial'] = new WP_HTML_Span( 0, 0 ); + parent::seek( 'initial' ); + unset( $this->bookmarks['initial'] ); + } else { + + /* + * Push the root-node (HTML) back onto the stack of open elements. + * + * Fragment parsers require this extra bit of setup. + * It's handled in full parsers by advancing the processor state. + */ + $this->state->stack_of_open_elements->push( + new WP_HTML_Token( + 'root-node', + 'HTML', + false + ) + ); + + $this->change_parsing_namespace( + $this->context_node->integration_node_type + ? 
'html' + : $this->context_node->namespace + ); + + if ( 'TEMPLATE' === $this->context_node->node_name ) { + $this->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; + } + + $this->reset_insertion_mode_appropriately(); + $this->breadcrumbs = array_slice( $this->breadcrumbs, 0, 2 ); + parent::seek( $this->context_node->bookmark_name ); + } } - // When moving forwards, reparse the document until reaching the same location as the original bookmark. - if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { - return true; - } - - while ( $this->next_token() ) { + /* + * Here, the processor moves forward through the document until it matches the bookmark. + * do-while is used here because the processor is expected to already be stopped on + * a token that may match the bookmarked location. + */ + do { + /* + * The processor will stop on virtual tokens, but bookmarks may not be set on them. + * They should not be matched when seeking a bookmark, skip them. + */ + if ( $this->is_virtual() ) { + continue; + } if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { - while ( isset( $this->current_element ) && WP_HTML_Stack_Event::POP === $this->current_element->operation ) { - $this->current_element = array_shift( $this->element_queue ); - } return true; } - } + } while ( $this->next_token() ); return false; } @@ -1993,12 +5671,25 @@ * reaching for it, as inappropriate use could lead to broken * HTML structure or unwanted processing overhead. * + * Bookmarks cannot be set on tokens that do not appear in the original + * HTML text. For example, the HTML `<table><td>
` stops at tags `TABLE`, + * `TBODY`, `TR`, and `TD`. The `TBODY` and `TR` tags do not appear in + * the original HTML and cannot be used as bookmarks. + * * @since 6.4.0 * * @param string $bookmark_name Identifies this particular bookmark. * @return bool Whether the bookmark was successfully created. */ - public function set_bookmark( $bookmark_name ) { + public function set_bookmark( $bookmark_name ): bool { + if ( $this->is_virtual() ) { + _doing_it_wrong( + __METHOD__, + __( 'Cannot set bookmarks on tokens that do no appear in the original HTML text.' ), + '6.8.0' + ); + return false; + } return parent::set_bookmark( "_{$bookmark_name}" ); } @@ -2010,7 +5701,7 @@ * @param string $bookmark_name Name to identify a bookmark that potentially exists. * @return bool Whether that bookmark exists. */ - public function has_bookmark( $bookmark_name ) { + public function has_bookmark( $bookmark_name ): bool { return parent::has_bookmark( "_{$bookmark_name}" ); } @@ -2027,7 +5718,7 @@ * * @see https://html.spec.whatwg.org/#close-a-p-element */ - private function close_a_p_element() { + private function close_a_p_element(): void { $this->generate_implied_end_tags( 'P' ); $this->state->stack_of_open_elements->pop_until( 'P' ); } @@ -2036,23 +5727,31 @@ * Closes elements that have implied end tags. * * @since 6.4.0 + * @since 6.7.0 Full spec support. * * @see https://html.spec.whatwg.org/#generate-implied-end-tags * * @param string|null $except_for_this_element Perform as if this element doesn't exist in the stack of open elements. */ - private function generate_implied_end_tags( $except_for_this_element = null ) { + private function generate_implied_end_tags( ?string $except_for_this_element = null ): void { $elements_with_implied_end_tags = array( 'DD', 'DT', 'LI', + 'OPTGROUP', + 'OPTION', 'P', + 'RB', + 'RP', + 'RT', + 'RTC', ); - $current_node = $this->state->stack_of_open_elements->current_node(); + $no_exclusions = ! 
isset( $except_for_this_element ); + while ( - $current_node && $current_node->node_name !== $except_for_this_element && - in_array( $this->state->stack_of_open_elements->current_node(), $elements_with_implied_end_tags, true ) + ( $no_exclusions || ! $this->state->stack_of_open_elements->current_node_is( $except_for_this_element ) ) && + in_array( $this->state->stack_of_open_elements->current_node()->node_name, $elements_with_implied_end_tags, true ) ) { $this->state->stack_of_open_elements->pop(); } @@ -2065,24 +5764,61 @@ * different from generating end tags in the normal sense. * * @since 6.4.0 + * @since 6.7.0 Full spec support. * * @see WP_HTML_Processor::generate_implied_end_tags * @see https://html.spec.whatwg.org/#generate-implied-end-tags */ - private function generate_implied_end_tags_thoroughly() { + private function generate_implied_end_tags_thoroughly(): void { $elements_with_implied_end_tags = array( + 'CAPTION', + 'COLGROUP', 'DD', 'DT', 'LI', + 'OPTGROUP', + 'OPTION', 'P', + 'RB', + 'RP', + 'RT', + 'RTC', + 'TBODY', + 'TD', + 'TFOOT', + 'TH', + 'THEAD', + 'TR', ); - while ( in_array( $this->state->stack_of_open_elements->current_node(), $elements_with_implied_end_tags, true ) ) { + while ( in_array( $this->state->stack_of_open_elements->current_node()->node_name, $elements_with_implied_end_tags, true ) ) { $this->state->stack_of_open_elements->pop(); } } /** + * Returns the adjusted current node. + * + * > The adjusted current node is the context element if the parser was created as + * > part of the HTML fragment parsing algorithm and the stack of open elements + * > has only one element in it (fragment case); otherwise, the adjusted current + * > node is the current node. + * + * @see https://html.spec.whatwg.org/#adjusted-current-node + * + * @since 6.7.0 + * + * @return WP_HTML_Token|null The adjusted current node. 
+ */ + private function get_adjusted_current_node(): ?WP_HTML_Token { + if ( isset( $this->context_node ) && 1 === $this->state->stack_of_open_elements->count() ) { + return $this->context_node; + } + + return $this->state->stack_of_open_elements->current_node(); + } + + /** * Reconstructs the active formatting elements. * * > This has the effect of reopening all the formatting elements that were opened @@ -2097,7 +5833,7 @@ * * @return bool Whether any formatting elements needed to be reconstructed. */ - private function reconstruct_active_formatting_elements() { + private function reconstruct_active_formatting_elements(): bool { /* * > If there are no entries in the list of active formatting elements, then there is nothing * > to reconstruct; stop this algorithm. @@ -2125,8 +5861,199 @@ return false; } - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' ); + $this->bail( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' ); + } + + /** + * Runs the reset the insertion mode appropriately algorithm. + * + * @since 6.7.0 + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately + */ + private function reset_insertion_mode_appropriately(): void { + // Set the first node. + $first_node = null; + foreach ( $this->state->stack_of_open_elements->walk_down() as $first_node ) { + break; + } + + /* + * > 1. Let _last_ be false. + */ + $last = false; + foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + /* + * > 2. Let _node_ be the last node in the stack of open elements. + * > 3. _Loop_: If _node_ is the first node in the stack of open elements, then set _last_ + * > to true, and, if the parser was created as part of the HTML fragment parsing + * > algorithm (fragment case), set node to the context element passed to + * > that algorithm. 
+ * > … + */ + if ( $node === $first_node ) { + $last = true; + if ( isset( $this->context_node ) ) { + $node = $this->context_node; + } + } + + // All of the following rules are for matching HTML elements. + if ( 'html' !== $node->namespace ) { + continue; + } + + switch ( $node->node_name ) { + /* + * > 4. If node is a `select` element, run these substeps: + * > 1. If _last_ is true, jump to the step below labeled done. + * > 2. Let _ancestor_ be _node_. + * > 3. _Loop_: If _ancestor_ is the first node in the stack of open elements, + * > jump to the step below labeled done. + * > 4. Let ancestor be the node before ancestor in the stack of open elements. + * > … + * > 7. Jump back to the step labeled _loop_. + * > 8. _Done_: Switch the insertion mode to "in select" and return. + */ + case 'SELECT': + if ( ! $last ) { + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $ancestor ) { + if ( 'html' !== $ancestor->namespace ) { + continue; + } + + switch ( $ancestor->node_name ) { + /* + * > 5. If _ancestor_ is a `template` node, jump to the step below + * > labeled _done_. + */ + case 'TEMPLATE': + break 2; + + /* + * > 6. If _ancestor_ is a `table` node, switch the insertion mode to + * > "in select in table" and return. + */ + case 'TABLE': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE; + return; + } + } + } + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT; + return; + + /* + * > 5. If _node_ is a `td` or `th` element and _last_ is false, then switch the + * > insertion mode to "in cell" and return. + */ + case 'TD': + case 'TH': + if ( ! $last ) { + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_CELL; + return; + } + break; + + /* + * > 6. If _node_ is a `tr` element, then switch the insertion mode to "in row" + * > and return. 
+ */ + case 'TR': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW; + return; + + /* + * > 7. If _node_ is a `tbody`, `thead`, or `tfoot` element, then switch the + * > insertion mode to "in table body" and return. + */ + case 'TBODY': + case 'THEAD': + case 'TFOOT': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY; + return; + + /* + * > 8. If _node_ is a `caption` element, then switch the insertion mode to + * > "in caption" and return. + */ + case 'CAPTION': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION; + return; + + /* + * > 9. If _node_ is a `colgroup` element, then switch the insertion mode to + * > "in column group" and return. + */ + case 'COLGROUP': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP; + return; + + /* + * > 10. If _node_ is a `table` element, then switch the insertion mode to + * > "in table" and return. + */ + case 'TABLE': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE; + return; + + /* + * > 11. If _node_ is a `template` element, then switch the insertion mode to the + * > current template insertion mode and return. + */ + case 'TEMPLATE': + $this->state->insertion_mode = end( $this->state->stack_of_template_insertion_modes ); + return; + + /* + * > 12. If _node_ is a `head` element and _last_ is false, then switch the + * > insertion mode to "in head" and return. + */ + case 'HEAD': + if ( ! $last ) { + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD; + return; + } + break; + + /* + * > 13. If _node_ is a `body` element, then switch the insertion mode to "in body" + * > and return. + */ + case 'BODY': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return; + + /* + * > 14. If _node_ is a `frameset` element, then switch the insertion mode to + * > "in frameset" and return. 
(fragment case) + */ + case 'FRAMESET': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET; + return; + + /* + * > 15. If _node_ is an `html` element, run these substeps: + * > 1. If the head element pointer is null, switch the insertion mode to + * > "before head" and return. (fragment case) + * > 2. Otherwise, the head element pointer is not null, switch the insertion + * > mode to "after head" and return. + */ + case 'HTML': + $this->state->insertion_mode = isset( $this->state->head_element ) + ? WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD + : WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD; + return; + } + } + + /* + * > 16. If _last_ is true, then switch the insertion mode to "in body" + * > and return. (fragment case) + * + * This is only reachable if `$last` is true, as per the fragment parsing case. + */ + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; } /** @@ -2138,7 +6065,7 @@ * * @see https://html.spec.whatwg.org/#adoption-agency-algorithm */ - private function run_adoption_agency_algorithm() { + private function run_adoption_agency_algorithm(): void { $budget = 1000; $subject = $this->get_tag(); $current_node = $this->state->stack_of_open_elements->current_node(); @@ -2179,8 +6106,7 @@ // > If there is no such element, then return and instead act as described in the "any other end tag" entry above. if ( null === $formatting_element ) { - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when "any other end tag" is required.' ); + $this->bail( 'Cannot run adoption agency when "any other end tag" is required.' ); } // > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return. 
@@ -2210,7 +6136,7 @@ continue; } - if ( self::is_special( $item->node_name ) ) { + if ( self::is_special( $item ) ) { $furthest_block = $item; break; } @@ -2232,12 +6158,37 @@ } } - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( 'Cannot extract common ancestor in adoption agency algorithm.' ); + $this->bail( 'Cannot extract common ancestor in adoption agency algorithm.' ); } - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when looping required.' ); + $this->bail( 'Cannot run adoption agency when looping required.' ); + } + + /** + * Runs the "close the cell" algorithm. + * + * > Where the steps above say to close the cell, they mean to run the following algorithm: + * > 1. Generate implied end tags. + * > 2. If the current node is not now a td element or a th element, then this is a parse error. + * > 3. Pop elements from the stack of open elements stack until a td element or a th element has been popped from the stack. + * > 4. Clear the list of active formatting elements up to the last marker. + * > 5. Switch the insertion mode to "in row". + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell + * + * @since 6.7.0 + */ + private function close_cell(): void { + $this->generate_implied_end_tags(); + // @todo Parse error if the current node is not a "td" or "th" element. + foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) { + $this->state->stack_of_open_elements->pop(); + if ( 'TD' === $element->node_name || 'TH' === $element->node_name ) { + break; + } + } + $this->state->active_formatting_elements->clear_up_to_last_marker(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW; } /** @@ -2249,26 +6200,178 @@ * * @param WP_HTML_Token $token Name of bookmark pointing to element in original input HTML.
*/ - private function insert_html_element( $token ) { + private function insert_html_element( WP_HTML_Token $token ): void { $this->state->stack_of_open_elements->push( $token ); } + /** + * Inserts a foreign element on to the stack of open elements. + * + * @since 6.7.0 + * + * @see https://html.spec.whatwg.org/#insert-a-foreign-element + * + * @param WP_HTML_Token $token Insert this token. The token's namespace and + * insertion point will be updated correctly. + * @param bool $only_add_to_element_stack Whether to skip the "insert an element at the adjusted + * insertion location" algorithm when adding this element. + */ + private function insert_foreign_element( WP_HTML_Token $token, bool $only_add_to_element_stack ): void { + $adjusted_current_node = $this->get_adjusted_current_node(); + + $token->namespace = $adjusted_current_node ? $adjusted_current_node->namespace : 'html'; + + if ( $this->is_mathml_integration_point() ) { + $token->integration_node_type = 'math'; + } elseif ( $this->is_html_integration_point() ) { + $token->integration_node_type = 'html'; + } + + if ( false === $only_add_to_element_stack ) { + /* + * @todo Implement the "appropriate place for inserting a node" and the + * "insert an element at the adjusted insertion location" algorithms. + * + * These algorithms mostly impact DOM tree construction and not the HTML API. + * Here, there's no DOM node onto which the element will be appended, so the + * parser will skip this step. + * + * @see https://html.spec.whatwg.org/#insert-an-element-at-the-adjusted-insertion-location + */ + } + + $this->insert_html_element( $token ); + } + + /** + * Inserts a virtual element on the stack of open elements. + * + * @since 6.7.0 + * + * @param string $token_name Name of token to create and insert into the stack of open elements. + * @param string|null $bookmark_name Optional. Name to give bookmark for created virtual node. + * Defaults to auto-creating a bookmark name.
+ * @return WP_HTML_Token Newly-created virtual token. + */ + private function insert_virtual_node( $token_name, $bookmark_name = null ): WP_HTML_Token { + $here = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + $name = $bookmark_name ?? $this->bookmark_token(); + + $this->bookmarks[ $name ] = new WP_HTML_Span( $here->start, 0 ); + + $token = new WP_HTML_Token( $name, $token_name, false ); + $this->insert_html_element( $token ); + return $token; + } + /* * HTML Specification Helpers */ /** + * Indicates if the current token is a MathML integration point. + * + * @since 6.7.0 + * + * @see https://html.spec.whatwg.org/#mathml-text-integration-point + * + * @return bool Whether the current token is a MathML integration point. + */ + private function is_mathml_integration_point(): bool { + $current_token = $this->state->current_token; + if ( ! isset( $current_token ) ) { + return false; + } + + if ( 'math' !== $current_token->namespace || 'M' !== $current_token->node_name[0] ) { + return false; + } + + $tag_name = $current_token->node_name; + + return ( + 'MI' === $tag_name || + 'MO' === $tag_name || + 'MN' === $tag_name || + 'MS' === $tag_name || + 'MTEXT' === $tag_name + ); + } + + /** + * Indicates if the current token is an HTML integration point. + * + * Note that this method must be an instance method with access + * to the current token, since it needs to examine the attributes + * of the currently-matched tag, if it's in the MathML namespace. + * Otherwise it would be required to scan the HTML and ensure that + * no other accounting is overlooked. + * + * @since 6.7.0 + * + * @see https://html.spec.whatwg.org/#html-integration-point + * + * @return bool Whether the current token is an HTML integration point. + */ + private function is_html_integration_point(): bool { + $current_token = $this->state->current_token; + if ( ! 
isset( $current_token ) ) { + return false; + } + + if ( 'html' === $current_token->namespace ) { + return false; + } + + $tag_name = $current_token->node_name; + + if ( 'svg' === $current_token->namespace ) { + return ( + 'DESC' === $tag_name || + 'FOREIGNOBJECT' === $tag_name || + 'TITLE' === $tag_name + ); + } + + if ( 'math' === $current_token->namespace ) { + if ( 'ANNOTATION-XML' !== $tag_name ) { + return false; + } + + $encoding = $this->get_attribute( 'encoding' ); + + return ( + is_string( $encoding ) && + ( + 0 === strcasecmp( $encoding, 'application/xhtml+xml' ) || + 0 === strcasecmp( $encoding, 'text/html' ) + ) + ); + } + + $this->bail( 'Should not have reached end of HTML Integration Point detection: check HTML API code.' ); + // This unnecessary return prevents tools from inaccurately reporting type errors. + return false; + } + + /** * Returns whether an element of a given name is in the HTML special category. * * @since 6.4.0 * * @see https://html.spec.whatwg.org/#special * - * @param string $tag_name Name of element to check. + * @param WP_HTML_Token|string $tag_name Node to check, or only its name if in the HTML namespace. * @return bool Whether the element of the given name is in the special category. */ - public static function is_special( $tag_name ) { - $tag_name = strtoupper( $tag_name ); + public static function is_special( $tag_name ): bool { + if ( is_string( $tag_name ) ) { + $tag_name = strtoupper( $tag_name ); + } else { + $tag_name = 'html' === $tag_name->namespace + ? strtoupper( $tag_name->node_name ) + : "{$tag_name->namespace} {$tag_name->node_name}"; + } return ( 'ADDRESS' === $tag_name || @@ -2356,17 +6459,17 @@ 'XMP' === $tag_name || // MathML. 
- 'MI' === $tag_name || - 'MO' === $tag_name || - 'MN' === $tag_name || - 'MS' === $tag_name || - 'MTEXT' === $tag_name || - 'ANNOTATION-XML' === $tag_name || + 'math MI' === $tag_name || + 'math MO' === $tag_name || + 'math MN' === $tag_name || + 'math MS' === $tag_name || + 'math MTEXT' === $tag_name || + 'math ANNOTATION-XML' === $tag_name || // SVG. - 'FOREIGNOBJECT' === $tag_name || - 'DESC' === $tag_name || - 'TITLE' === $tag_name + 'svg DESC' === $tag_name || + 'svg FOREIGNOBJECT' === $tag_name || + 'svg TITLE' === $tag_name ); } @@ -2382,7 +6485,7 @@ * @param string $tag_name Name of HTML tag to check. * @return bool Whether the given tag is an HTML Void Element. */ - public static function is_void( $tag_name ) { + public static function is_void( $tag_name ): bool { $tag_name = strtoupper( $tag_name ); return ( @@ -2407,6 +6510,53 @@ ); } + /** + * Gets an encoding from a given string. + * + * This is an algorithm defined in the WHAT-WG specification. + * + * Example: + * + * 'UTF-8' === self::get_encoding( 'utf8' ); + * 'UTF-8' === self::get_encoding( " \tUTF-8 " ); + * null === self::get_encoding( 'UTF-7' ); + * null === self::get_encoding( 'utf8; charset=' ); + * + * @see https://encoding.spec.whatwg.org/#concept-encoding-get + * + * @todo As this parser only supports UTF-8, only the UTF-8 + * encodings are detected. Add more as desired, but the + * parser will bail on non-UTF-8 encodings. + * + * @since 6.7.0 + * + * @param string $label A string which may specify a known encoding. + * @return string|null Known encoding if matched, otherwise null. + */ + protected static function get_encoding( string $label ): ?string { + /* + * > Remove any leading and trailing ASCII whitespace from label. + */ + $label = trim( $label, " \t\f\r\n" ); + + /* + * > If label is an ASCII case-insensitive match for any of the labels listed in the + * > table below, then return the corresponding encoding; otherwise return failure. 
+ */ + switch ( strtolower( $label ) ) { + case 'unicode-1-1-utf-8': + case 'unicode11utf8': + case 'unicode20utf8': + case 'utf-8': + case 'utf8': + case 'x-unicode20utf8': + return 'UTF-8'; + + default: + return null; + } + } + /* * Constants that would pollute the top of the class if they were found there. */