From 9ccb882ff0a2448fb2525f2fdbf74d23a6936f2f Mon Sep 17 00:00:00 2001 From: dmsnell Date: Sat, 15 Jun 2024 06:33:13 +0000 Subject: [PATCH] KSES: Preserve some additional invalid HTML comment syntaxes. When `wp_kses_split` processes a document it attempts to leave HTML comments alone. It makes minor adjustments, but leaves the comments in the document in its output. Unfortunately it only recognizes one kind of HTML comment and rejects many others. This patch makes a minor adjustment to the algorithm in `wp_kses_split` to recognize and preserve an additional kind of HTML comment: closing tags with an invalid tag name, e.g. `</%dangerous>`. These invalid closing tags must be interpreted as comments by a browser. This bug fix aligns the implementation of `wp_kses_split()` more closely with its stated goal of leaving HTML comments as comments. It doesn't attempt to fully fix the mis-parsed comments, but it does propose a minor fix that hopefully won't break any existing code or projects. Developed in https://github.com/WordPress/wordpress-develop/pull/6395 Discussed in https://core.trac.wordpress.org/ticket/61009 Props ellatrix, dmsnell, joemcgill, jorbin, westonruter, zieladam. See #61009. Built from https://develop.svn.wordpress.org/trunk@58418 git-svn-id: http://core.svn.wordpress.org/trunk@57867 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- wp-includes/kses.php | 56 ++++++++++++++++++++++++++++++++++++++--- wp-includes/version.php | 2 +- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/wp-includes/kses.php b/wp-includes/kses.php index 5a1ae2de84..a9e8bbdd3e 100644 --- a/wp-includes/kses.php +++ b/wp-includes/kses.php @@ -963,6 +963,7 @@ function wp_kses_version() { * It also matches stray `>` characters. * * @since 1.0.0 + * @since 6.6.0 Recognize additional forms of invalid HTML which convert into comments. * * @global array[]|string $pass_allowed_html An array of allowed HTML elements and attributes, * or a context name such as 'post'. 
@@ -981,7 +982,18 @@ function wp_kses_split( $content, $allowed_html, $allowed_protocols ) { $pass_allowed_html = $allowed_html; $pass_allowed_protocols = $allowed_protocols; - return preg_replace_callback( '%(|$))|(<[^>]*(>|$)|>)%', '_wp_kses_split_callback', $content ); + $token_pattern = <<|$)) # - Normative HTML comments. + | + ]*> # - Closing tags with invalid tag names. + ) + | + (<[^>]*(>|$)|>) # Tag-like spans of text. +~x +REGEX; + return preg_replace_callback( $token_pattern, '_wp_kses_split_callback', $content ); } /** @@ -1069,23 +1081,61 @@ function _wp_kses_split_callback( $matches ) { * @access private * @ignore * @since 1.0.0 + * @since 6.6.0 Recognize additional forms of invalid HTML which convert into comments. * * @param string $content Content to filter. * @param array[]|string $allowed_html An array of allowed HTML elements and attributes, * or a context name such as 'post'. See wp_kses_allowed_html() * for the list of accepted context names. * @param string[] $allowed_protocols Array of allowed URL protocols. + * * @return string Fixed HTML element */ function wp_kses_split2( $content, $allowed_html, $allowed_protocols ) { $content = wp_kses_stripslashes( $content ); - // It matched a ">" character. + /* + * The regex pattern used to split HTML into chunks attempts + * to split on HTML token boundaries. This function should + * thus receive chunks that _either_ start with meaningful + * syntax tokens, like a tag `
` or a comment ``. + * + * If the first character of the `$content` chunk _isn't_ one + * of these syntax elements, which always starts with `<`, then + * the match had to be for the final alternation of `>`. In such + * case, it's probably standing on its own and could be encoded + * with a character reference to remove ambiguity. + * + * In other words, if this chunk isn't from a match of a syntax + * token, it's just a plaintext greater-than (`>`) sign. + */ if ( ! str_starts_with( $content, '<' ) ) { return '>'; } - // Allow HTML comments. + /* + * When a closing tag appears with a name that isn't a valid tag name, + * it must be interpreted as an HTML comment. It extends until the + * first `>` character after the initial opening `]*>$~', $content ) ) { + $content = substr( $content, 2, -1 ); + $transformed = null; + + while ( $transformed !== $content ) { + $transformed = wp_kses( $content, $allowed_html, $allowed_protocols ); + $content = $transformed; + } + + return ""; + } + + /* + * Normative HTML comments should be handled separately as their + * parsing rules differ from those for tags and text nodes. + */ if ( str_starts_with( $content, '' ), '', $content ); diff --git a/wp-includes/version.php b/wp-includes/version.php index 1a52b9a9ff..ca74c6cb62 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -16,7 +16,7 @@ * * @global string $wp_version */ -$wp_version = '6.6-beta2-58417'; +$wp_version = '6.6-beta2-58418'; /** * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.