From 7a0f8602f3112681a44294d908f18efae9610b7d Mon Sep 17 00:00:00 2001 From: Scott Taylor Date: Fri, 2 Oct 2015 04:26:25 +0000 Subject: [PATCH] Shortcodes/Formatting: Add PCRE Performance Testing * Move pattern from `wptexturize()` into a separate function. * Move pattern from `wp_html_split()` into a separate function. * Beautify code for `wp_html_split()`. * Remove unnecessary instances of `/s` modifier in patterns that don't use dots. * Add `tests/phpunit/data/formatting/whole-posts.php` for testing larger strings. * Add function `benchmark_pcre_backtracking()`. * Add tests for `wp_html_split()`. * Add tests for `wptexturize()`. * Add tests for `get_shortcode_regex()`. Props miqrogroove. Fixes #34121. Built from https://develop.svn.wordpress.org/trunk@34761 git-svn-id: http://core.svn.wordpress.org/trunk@34726 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- wp-includes/formatting.php | 152 +++++++++++++++++++++++++------------ wp-includes/shortcodes.php | 12 +-- wp-includes/version.php | 2 +- 3 files changed, 110 insertions(+), 56 deletions(-) diff --git a/wp-includes/formatting.php b/wp-includes/formatting.php index ef60cce3c2..9b075895f8 100644 --- a/wp-includes/formatting.php +++ b/wp-includes/formatting.php @@ -219,43 +219,8 @@ function wptexturize( $text, $reset = false ) { preg_match_all( '@\[/?([^<>&/\[\]\x00-\x20]++)@', $text, $matches ); $tagnames = array_intersect( array_keys( $shortcode_tags ), $matches[1] ); $found_shortcodes = ! empty( $tagnames ); - if ( $found_shortcodes ) { - $tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) ); - $tagregexp = "(?:$tagregexp)(?![\\w-])"; // Excerpt of get_shortcode_regex(). - $shortcode_regex = - '\[' // Find start of shortcode. - . '[\/\[]?' // Shortcodes may begin with [/ or [[ - . $tagregexp // Only match registered shortcodes, because performance. - . '(?:' - . '[^\[\]<>]+' // Shortcodes do not contain other shortcodes. Quantifier critical. - . '|' - . '<[^\[\]>]*>' // HTML elements permitted. Prevents matching ] before >. - . ')*+' // Possessive critical. - . '\]' // Find end of shortcode. - . '\]?'; // Shortcodes may end with ]] - } - - $comment_regex = - '!' // Start of comment, after the <. - . '(?:' // Unroll the loop: Consume everything until --> is found. - . '-(?!->)' // Dash not followed by end of comment. - . '[^\-]*+' // Consume non-dashes. - . ')*+' // Loop possessively. - . '(?:-->)?'; // End of comment. If not found, match all input. - - $html_regex = // Needs replaced with wp_html_split() per Shortcode API Roadmap. - '<' // Find start of element. - . '(?(?=!--)' // Is this a comment? - . $comment_regex // Find end of comment. - . '|' - . '[^>]*>?' // Find end of element. If not found, match all input. - . ')'; - - if ( $found_shortcodes ) { - $regex = '/(' . $html_regex . '|' . $shortcode_regex . ')/s'; - } else { - $regex = '/(' . $html_regex . ')/s'; - } + $shortcode_regex = $found_shortcodes ? _get_wptexturize_shortcode_regex( $tagnames ) : ''; + $regex = _get_wptexturize_split_regex( $shortcode_regex ); $textarr = preg_split( $regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); @@ -264,7 +229,7 @@ function wptexturize( $text, $reset = false ) { $first = $curl[0]; if ( '<' === $first ) { if ( ' is found. + . '-(?!->)' // Dash not followed by end of comment. + . '[^\-]*+' // Consume non-dashes. + . ')*+' // Loop possessively. + . '(?:-->)?'; // End of comment. If not found, match all input. + + $html_regex = // Needs replaced with wp_html_split() per Shortcode API Roadmap. + '<' // Find start of element. + . '(?(?=!--)' // Is this a comment? + . $comment_regex // Find end of comment. + . '|' + . '[^>]*>?' // Find end of element. If not found, match all input. + . ')'; + } + + if ( empty( $shortcode_regex ) ) { + $regex = '/(' . $html_regex . ')/'; + } else { + $regex = '/(' . $html_regex . '|' . $shortcode_regex . ')/'; + } + + return $regex; +} + +/** + * Retrieve the regular expression for shortcodes. + * + * @access private + * @ignore + * @internal This function will be removed in 4.5.0 per Shortcode API Roadmap. + * @since 4.4.0 + * + * @param array $tagnames List of shortcodes to find. + * @return string The regular expression + */ +function _get_wptexturize_shortcode_regex( $tagnames ) { + $tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) ); + $tagregexp = "(?:$tagregexp)(?=[\\s\\]\\/])"; // Excerpt of get_shortcode_regex(). + $regex = + '\[' // Find start of shortcode. + . '[\/\[]?' // Shortcodes may begin with [/ or [[ + . $tagregexp // Only match registered shortcodes, because performance. + . '(?:' + . '[^\[\]<>]+' // Shortcodes do not contain other shortcodes. Quantifier critical. + . '|' + . '<[^\[\]>]*>' // HTML elements permitted. Prevents matching ] before >. + . ')*+' // Possessive critical. + . '\]' // Find end of shortcode. + . '\]?'; // Shortcodes may end with ]] + + return $regex; } /** @@ -768,7 +822,7 @@ function shortcode_unautop( $pee ) { . ')' . '(?:' . $spaces . ')*+' // optional trailing whitespace . '<\\/p>' // closing paragraph - . '/s'; + . '/'; return preg_replace( $pattern, '$1', $pee ); } diff --git a/wp-includes/shortcodes.php b/wp-includes/shortcodes.php index f7ce887827..3959818575 100644 --- a/wp-includes/shortcodes.php +++ b/wp-includes/shortcodes.php @@ -168,7 +168,7 @@ function has_shortcode( $content, $tag ) { } if ( shortcode_exists( $tag ) ) { - preg_match_all( '/' . get_shortcode_regex() . '/s', $content, $matches, PREG_SET_ORDER ); + preg_match_all( '/' . get_shortcode_regex() . '/', $content, $matches, PREG_SET_ORDER ); if ( empty( $matches ) ) return false; @@ -219,7 +219,7 @@ function do_shortcode( $content, $ignore_html = false ) { $content = do_shortcodes_in_html_tags( $content, $ignore_html, $tagnames ); $pattern = get_shortcode_regex( $tagnames ); - $content = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $content ); + $content = preg_replace_callback( "/$pattern/", 'do_shortcode_tag', $content ); // Always restore square braces so we don't break things like