From b342d5c7b883b620ee4b1929926dd0ffca391d3b Mon Sep 17 00:00:00 2001 From: dmsnell Date: Tue, 6 Feb 2024 19:23:13 +0000 Subject: [PATCH] HTML API: Join text nodes on invalid-tag-name boundaries. A fix was introduced to the Tag Processor to ensure that contiguous text in an HTML document emerges as a single text node spanning the full sequence. Unfortunately, that patch was marginally over-zealous in checking if a "<" started a syntax token or not. It used the following: {{{ if ( 'A' <= $c && 'z' >= $c ) { ... } }}} This was based on the assumption that the A-Z and a-z letters are contiguous in the ASCII range; they aren't, and there's a gap of several characters in between. The result of this is that in some cases the parser created a text boundary when it didn't need to. Text boundaries can be surprising and can be created when reaching invalid syntax, HTML comments, and more hidden elements, so semantically this wasn't a major bug, but it was an aesthetic challenge. In this patch the check is properly compared for both upper- and lower-case variants that could potentially form tag names. {{{ if ( ( 'A' <= $c && 'Z' >= $c ) || ( 'a' <= $c && 'z' >= $c ) ) { ... } }}} This solves the problem and ensures that contiguous text appears as a single text node when scanning tokens. 
Developed in https://github.com/WordPress/wordpress-develop/pull/6041 Discussed in https://core.trac.wordpress.org/ticket/60385 Follow-up to [57489] Props dmsnell, jonsurrell Fixes #60385 Built from https://develop.svn.wordpress.org/trunk@57542 git-svn-id: http://core.svn.wordpress.org/trunk@57043 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- .../html-api/class-wp-html-tag-processor.php | 18 ++++++++++++------ wp-includes/version.php | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php index b437595bd9..447f4dac1b 100644 --- a/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1528,20 +1528,26 @@ class WP_HTML_Tag_Processor { if ( $at > $was_at ) { /* - * A "<" has been found in the document. That may be the start of another node, or - * it may be an "ivalid-first-character-of-tag-name" error. If this is not the start - * of another node the "<" should be included in this text node and another - * termination point should be found for the text node. + * A "<" normally starts a new HTML tag or syntax token, but in cases where the + * following character can't produce a valid token, the "<" is instead treated + * as plaintext and the parser should skip over it. This avoids a problem when + * following earlier practices of typing emoji with text, e.g. "<3". This + * should be a heart, not a tag. It's supposed to be rendered, not hidden. + * + * At this point the parser checks if this is one of those cases and if it is + * will continue searching for the next "<" in search of a token boundary. * * @see https://html.spec.whatwg.org/#tag-open-state */ if ( strlen( $html ) > $at + 1 ) { $next_character = $html[ $at + 1 ]; - $at_another_node = + $at_another_node = ( '!' === $next_character || '/' === $next_character || '?' 
=== $next_character || - ( 'A' <= $next_character && $next_character <= 'z' ); + ( 'A' <= $next_character && $next_character <= 'Z' ) || + ( 'a' <= $next_character && $next_character <= 'z' ) + ); if ( ! $at_another_node ) { ++$at; continue; diff --git a/wp-includes/version.php b/wp-includes/version.php index 138d449087..1b722b937c 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -16,7 +16,7 @@ * * @global string $wp_version */ -$wp_version = '6.5-alpha-57541'; +$wp_version = '6.5-alpha-57542'; /** * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.