From df598e1d981f579cd0eebeec8eb5a343ef7fd03e Mon Sep 17 00:00:00 2001 From: dmsnell Date: Mon, 1 Jul 2024 23:36:15 +0000 Subject: [PATCH] HTML API: Optimize low-level parsing details in Tag Processor. Introduces a number of micro-level optimizations in the Tag Processor to improve token-scanning performance. Should contain no functional changes. Based on benchmarking against a list of the 100 most-visited websites, these changes result in an average improvement in performance of the Tag Processor for scanning tags from between 3.5% and 7.5%. Developed in https://github.com/WordPress/wordpress-develop/pull/6890 Discussed in https://core.trac.wordpress.org/ticket/61545 Follow-up to [55203]. See #61545. Built from https://develop.svn.wordpress.org/trunk@58613 git-svn-id: http://core.svn.wordpress.org/trunk@58046 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- .../html-api/class-wp-html-decoder.php | 26 ++-- .../html-api/class-wp-html-tag-processor.php | 118 ++++++------------ wp-includes/version.php | 2 +- 3 files changed, 53 insertions(+), 93 deletions(-) diff --git a/wp-includes/html-api/class-wp-html-decoder.php b/wp-includes/html-api/class-wp-html-decoder.php index 78976002b4..42c6424423 100644 --- a/wp-includes/html-api/class-wp-html-decoder.php +++ b/wp-includes/html-api/class-wp-html-decoder.php @@ -141,7 +141,7 @@ class WP_HTML_Decoder { while ( $at < $end ) { $next_character_reference_at = strpos( $text, '&', $at ); - if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) { + if ( false === $next_character_reference_at ) { break; } @@ -436,26 +436,26 @@ class WP_HTML_Decoder { } if ( $code_point <= 0x7FF ) { - $byte1 = ( $code_point >> 6 ) | 0xC0; - $byte2 = $code_point & 0x3F | 0x80; + $byte1 = chr( ( $code_point >> 6 ) | 0xC0 ); + $byte2 = chr( $code_point & 0x3F | 0x80 ); - return pack( 'CC', $byte1, $byte2 ); + return "{$byte1}{$byte2}"; } if ( $code_point <= 0xFFFF ) { - $byte1 = ( $code_point >> 12 ) | 0xE0; - $byte2 = ( 
$code_point >> 6 ) & 0x3F | 0x80; - $byte3 = $code_point & 0x3F | 0x80; + $byte1 = chr( ( $code_point >> 12 ) | 0xE0 ); + $byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 ); + $byte3 = chr( $code_point & 0x3F | 0x80 ); - return pack( 'CCC', $byte1, $byte2, $byte3 ); + return "{$byte1}{$byte2}{$byte3}"; } // Any values above U+10FFFF are eliminated above in the pre-check. - $byte1 = ( $code_point >> 18 ) | 0xF0; - $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80; - $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80; - $byte4 = $code_point & 0x3F | 0x80; + $byte1 = chr( ( $code_point >> 18 ) | 0xF0 ); + $byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 ); + $byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 ); + $byte4 = chr( $code_point & 0x3F | 0x80 ); - return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 ); + return "{$byte1}{$byte2}{$byte3}{$byte4}"; } } diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php index 8fc75938c9..a388af1ef7 100644 --- a/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1524,21 +1524,10 @@ class WP_HTML_Tag_Processor { $was_at = $this->bytes_already_parsed; $at = $was_at; - while ( false !== $at && $at < $doc_length ) { + while ( $at < $doc_length ) { $at = strpos( $html, '<', $at ); - - /* - * This does not imply an incomplete parse; it indicates that there - * can be nothing left in the document other than a #text node. 
- */ if ( false === $at ) { - $this->parser_state = self::STATE_TEXT_NODE; - $this->token_starts_at = $was_at; - $this->token_length = strlen( $html ) - $was_at; - $this->text_starts_at = $was_at; - $this->text_length = $this->token_length; - $this->bytes_already_parsed = strlen( $html ); - return true; + break; } if ( $at > $was_at ) { @@ -1554,19 +1543,9 @@ class WP_HTML_Tag_Processor { * * @see https://html.spec.whatwg.org/#tag-open-state */ - if ( strlen( $html ) > $at + 1 ) { - $next_character = $html[ $at + 1 ]; - $at_another_node = ( - '!' === $next_character || - '/' === $next_character || - '?' === $next_character || - ( 'A' <= $next_character && $next_character <= 'Z' ) || - ( 'a' <= $next_character && $next_character <= 'z' ) - ); - if ( ! $at_another_node ) { - ++$at; - continue; - } + if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) { + ++$at; + continue; } $this->parser_state = self::STATE_TEXT_NODE; @@ -1630,11 +1609,7 @@ class WP_HTML_Tag_Processor { * `