From 6badd91e7bebba1cb548777ea7d1edb028334cd9 Mon Sep 17 00:00:00 2001 From: dmsnell Date: Tue, 12 Mar 2024 00:24:09 +0000 Subject: [PATCH] HTML API: Trigger active format reconstruction when reaching text nodes. When encountering text nodes in an HTML document, the HTML parser needs to run the active format reconstruction algorithm, even if it doesn't stop to visit those text nodes. This is because the formats, which might need reconstructing, will impact the breadcrumbs of all downstream nodes from the text node. In this patch, this process is triggered, which properly triggers the active format reconstruction. It also enables the visiting of other token types as is possible in the Tag Processor. Developed in https://github.com/WordPress/wordpress-develop/pull/6054 Discussed in https://core.trac.wordpress.org/ticket/60170 Props: dmsnell, jonsurrell, westonruter. Fixes: #60455. Follow-up to: [57348]. Built from https://develop.svn.wordpress.org/trunk@57806 git-svn-id: http://core.svn.wordpress.org/trunk@57307 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- .../html-api/class-wp-html-processor.php | 147 ++++++++++++------ wp-includes/version.php | 2 +- 2 files changed, 102 insertions(+), 47 deletions(-) diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php index 3021bee3a7..c76cc192b1 100644 --- a/wp-includes/html-api/class-wp-html-processor.php +++ b/wp-includes/html-api/class-wp-html-processor.php @@ -361,6 +361,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { public function next_tag( $query = null ) { if ( null === $query ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -384,6 +388,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -405,6 +413,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { $match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1; while ( $match_offset > 0 && $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) { return true; } @@ -428,13 +440,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool */ public function next_token() { - $found_a_token = parent::next_token(); - - if ( '#tag' === $this->get_token_type() ) { - $this->step( self::PROCESS_CURRENT_NODE ); - } - - return $found_a_token; + return $this->step(); } /** @@ -463,10 +469,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether the currently-matched tag is found at the given nested structure. */ public function matches_breadcrumbs( $breadcrumbs ) { - if ( ! $this->get_tag() ) { - return false; - } - // Everything matches when there are zero constraints. if ( 0 === count( $breadcrumbs ) ) { return true; @@ -529,25 +531,35 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * is provided in the opening tag, otherwise it expects a tag closer. */ $top_node = $this->state->stack_of_open_elements->current_node(); - if ( $top_node && self::is_void( $top_node->node_name ) ) { + if ( + $top_node && ( + // Void elements. + self::is_void( $top_node->node_name ) || + // Comments, text nodes, and other atomic tokens. + '#' === $top_node->node_name[0] || + // Doctype declarations. + 'html' === $top_node->node_name + ) + ) { $this->state->stack_of_open_elements->pop(); } } if ( self::PROCESS_NEXT_NODE === $node_to_process ) { - while ( parent::next_token() && '#tag' !== $this->get_token_type() ) { - continue; - } + parent::next_token(); } // Finish stepping when there are no more tokens in the document. - if ( null === $this->get_tag() ) { + if ( + WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || + WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state + ) { return false; } $this->state->current_token = new WP_HTML_Token( - $this->bookmark_tag(), - $this->get_tag(), + $this->bookmark_token(), + $this->get_token_name(), $this->has_self_closing_flag(), $this->release_internal_bookmark_on_destruct ); @@ -591,10 +603,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { - if ( ! $this->get_tag() ) { - return null; - } - $breadcrumbs = array(); foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { $breadcrumbs[] = $stack_item->node_name; @@ -619,11 +627,61 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether an element was found. */ private function step_in_body() { - $tag_name = $this->get_tag(); - $op_sigil = $this->is_tag_closer() ? '-' : '+'; - $op = "{$op_sigil}{$tag_name}"; + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; switch ( $op ) { + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + case '#text': + $this->reconstruct_active_formatting_elements(); + + $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + + /* + * > A character token that is U+0000 NULL + * + * Any successive sequence of NULL bytes is ignored and won't + * trigger active format reconstruction. Therefore, if the text + * only comprises NULL bytes then the token should be ignored + * here, but if there are any other characters in the stream + * the active formats should be reconstructed. + */ + if ( + 1 <= $current_token->length && + "\x00" === $this->html[ $current_token->start ] && + strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length + ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * Whitespace-only text does not affect the frameset-ok flag. + * It is probably inter-element whitespace, but it may also + * contain character references which decode only to whitespace. + */ + $text = $this->get_modifiable_text(); + if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) { + $this->state->frameset_ok = false; + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + case 'html': + /* + * > A DOCTYPE token + * > Parse error. Ignore the token. + */ + return $this->step(); + /* * > A start tag whose tag name is "button" */ @@ -711,17 +769,17 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { case '-SECTION': case '-SUMMARY': case '-UL': - if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) { + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) { // @todo Report parse error. // Ignore the token. return $this->step(); } $this->generate_implied_end_tags(); - if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) { + if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { // @todo Record parse error: this error doesn't impact parsing. } - $this->state->stack_of_open_elements->pop_until( $tag_name ); + $this->state->stack_of_open_elements->pop_until( $token_name ); return true; /* @@ -783,7 +841,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { $this->generate_implied_end_tags(); - if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) { + if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { // @todo Record parse error: this error doesn't impact parsing. } @@ -799,7 +857,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { case '+LI': $this->state->frameset_ok = false; $node = $this->state->stack_of_open_elements->current_node(); - $is_li = 'LI' === $tag_name; + $is_li = 'LI' === $token_name; in_body_list_loop: /* @@ -862,7 +920,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * then this is a parse error; ignore the token. */ ( - 'LI' === $tag_name && + 'LI' === $token_name && ! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' ) ) || /* @@ -872,8 +930,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * parse error; ignore the token. */ ( - 'LI' !== $tag_name && - ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) + 'LI' !== $token_name && + ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) ) { /* @@ -884,13 +942,13 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return $this->step(); } - $this->generate_implied_end_tags( $tag_name ); + $this->generate_implied_end_tags( $token_name ); - if ( $tag_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { + if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { // @todo Indicate a parse error once it's possible. This error does not impact the logic here. } - $this->state->stack_of_open_elements->pop_until( $tag_name ); + $this->state->stack_of_open_elements->pop_until( $token_name ); return true; /* @@ -1043,7 +1101,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody */ - switch ( $tag_name ) { + switch ( $token_name ) { case 'APPLET': case 'BASE': case 'BASEFONT': @@ -1091,7 +1149,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { case 'TR': case 'XMP': $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." ); + throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." ); } if ( ! $this->is_tag_closer() ) { @@ -1113,7 +1171,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * close anything beyond its containing `P` or `DIV` element. */ foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { - if ( $tag_name === $node->node_name ) { + if ( $token_name === $node->node_name ) { break; } @@ -1123,7 +1181,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { } } - $this->generate_implied_end_tags( $tag_name ); + $this->generate_implied_end_tags( $token_name ); if ( $node !== $this->state->stack_of_open_elements->current_node() ) { // @todo Record parse error: this error doesn't impact parsing. } @@ -1142,19 +1200,16 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ /** - * Creates a new bookmark for the currently-matched tag and returns the generated name. + * Creates a new bookmark for the currently-matched token and returns the generated name. * * @since 6.4.0 + * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token(). * * @throws Exception When unable to allocate requested bookmark. * * @return string|false Name of created bookmark, or false if unable to create. */ - private function bookmark_tag() { - if ( ! $this->get_tag() ) { - return false; - } - + private function bookmark_token() { if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) { $this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS; throw new Exception( 'could not allocate bookmark' ); diff --git a/wp-includes/version.php b/wp-includes/version.php index 76116aa6ff..4f56ef13e4 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -16,7 +16,7 @@ * * @global string $wp_version */ -$wp_version = '6.6-alpha-57805'; +$wp_version = '6.6-alpha-57806'; /** * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.