Shortcodes: Fix PCRE performance bugs in get_shortcode_regex() and in the related functions wptexturize(), do_shortcode(), and strip_shortcodes()

Alters unit tests. Props miqrogroove. Fixes #33517. Built from https://develop.svn.wordpress.org/trunk@34747 git-svn-id: http://core.svn.wordpress.org/trunk@34712 1a063a9b-81f0-0310-95a4-ce76da25c4cd
2025-03-02 11:21:57 +01:00 · 2015-10-01 18:05:25 +00:00 · 2015-10-01 18:05:25 +00:00 · 99347fd96e
commit 99347fd96e
parent a985977856
3 changed files with 64 additions and 49 deletions
--- a/wp-includes/formatting.php
+++ b/wp-includes/formatting.php
@@ -216,9 +216,24 @@ function wptexturize( $text, $reset = false ) {

 	// Look for shortcodes and HTML elements.

-	$tagnames = array_keys( $shortcode_tags );
-	$tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) );
-	$tagregexp = "(?:$tagregexp)(?![\\w-])"; // Excerpt of get_shortcode_regex().
+	preg_match_all( '@\[/?([^<>&/\[\]\x00-\x20]++)@', $text, $matches );
+	$tagnames = array_intersect( array_keys( $shortcode_tags ), $matches[1] );
+	$found_shortcodes = ! empty( $tagnames );
+	if ( $found_shortcodes ) {
+		$tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) );
+		$tagregexp = "(?:$tagregexp)(?![\\w-])"; // Excerpt of get_shortcode_regex().
+		$shortcode_regex =
+			  '\['              // Find start of shortcode.
+			. '[\/\[]?'         // Shortcodes may begin with [/ or [[
+			. $tagregexp        // Only match registered shortcodes, because performance.
+			. '(?:'
+			.     '[^\[\]<>]+'  // Shortcodes do not contain other shortcodes. Quantifier critical.
+			. '|'
+			.     '<[^\[\]>]*>' // HTML elements permitted. Prevents matching ] before >.
+			. ')*+'             // Possessive critical.
+			. '\]'              // Find end of shortcode.
+			. '\]?';            // Shortcodes may end with ]]
+	}

 	$comment_regex =
 		  '!'           // Start of comment, after the <.
@@ -228,51 +243,39 @@ function wptexturize( $text, $reset = false ) {
 		. ')*+'         // Loop possessively.
 		. '(?:-->)?';   // End of comment. If not found, match all input.

-	$shortcode_regex =
-		  '\['              // Find start of shortcode.
-		. '[\/\[]?'         // Shortcodes may begin with [/ or [[
-		. $tagregexp        // Only match registered shortcodes, because performance.
-		. '(?:'
-		.     '[^\[\]<>]+'  // Shortcodes do not contain other shortcodes. Quantifier critical.
+	$html_regex =			 // Needs to be replaced with wp_html_split() per Shortcode API Roadmap.
+		  '<'                // Find start of element.
+		. '(?(?=!--)'        // Is this a comment?
+		.     $comment_regex // Find end of comment.
 		. '|'
-		.     '<[^\[\]>]*>' // HTML elements permitted. Prevents matching ] before >.
-		. ')*+'             // Possessive critical.
-		. '\]'              // Find end of shortcode.
-		. '\]?';            // Shortcodes may end with ]]
+		.     '[^>]*>?'      // Find end of element. If not found, match all input.
+		. ')';

-	$regex =
-		  '/('                   // Capture the entire match.
-		.     '<'                // Find start of element.
-		.     '(?(?=!--)'        // Is this a comment?
-		.         $comment_regex // Find end of comment.
-		.     '|'
-		.         '[^>]*>'       // Find end of element.
-		.     ')'
-		. '|'
-		.     $shortcode_regex   // Find shortcodes.
-		. ')/s';
+	if ( $found_shortcodes ) {
+		$regex = '/(' . $html_regex . '|' . $shortcode_regex . ')/s';
+	} else {
+		$regex = '/(' . $html_regex . ')/s';
+	}

 	$textarr = preg_split( $regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );

 	foreach ( $textarr as &$curl ) {
 		// Only call _wptexturize_pushpop_element if $curl is a delimiter.
 		$first = $curl[0];
-		if ( '<' === $first && '<!--' === substr( $curl, 0, 4 ) ) {
-			// This is an HTML comment delimiter.
-
-			continue;
-
-		} elseif ( '<' === $first && '>' === substr( $curl, -1 ) ) {
-			// This is an HTML element delimiter.
-
-			_wptexturize_pushpop_element( $curl, $no_texturize_tags_stack, $no_texturize_tags );
+		if ( '<' === $first ) {
+			if ( '<!--' === substr( $curl, 0, 4 ) ) {
+				// This is an HTML comment delimiter.
+				continue;
+			} else {
+				// This is an HTML element delimiter.
+				_wptexturize_pushpop_element( $curl, $no_texturize_tags_stack, $no_texturize_tags );
+			}

 		} elseif ( '' === trim( $curl ) ) {
 			// This is a newline between delimiters.  Performance improves when we check this.
-
 			continue;

-		} elseif ( '[' === $first && 1 === preg_match( '/^' . $shortcode_regex . '$/', $curl ) ) {
+		} elseif ( '[' === $first && $found_shortcodes && 1 === preg_match( '/^' . $shortcode_regex . '$/', $curl ) ) {
 			// This is a shortcode delimiter.

 			if ( '[[' !== substr( $curl, 0, 2 ) && ']]' !== substr( $curl, -2 ) ) {
--- a/wp-includes/shortcodes.php
+++ b/wp-includes/shortcodes.php
@@ -208,18 +208,17 @@ function do_shortcode( $content, $ignore_html = false ) {
 	if (empty($shortcode_tags) || !is_array($shortcode_tags))
 		return $content;

-	$tagnames = array_keys($shortcode_tags);
-	$tagregexp = join( '|', array_map('preg_quote', $tagnames) );
-	$pattern = "/\\[($tagregexp)/s";
+	// Find all registered tag names in $content.
+	preg_match_all( '@\[([^<>&/\[\]\x00-\x20]++)@', $content, $matches );
+	$tagnames = array_intersect( array_keys( $shortcode_tags ), $matches[1] );

-	if ( 1 !== preg_match( $pattern, $content ) ) {
-		// Avoids parsing HTML when there are no shortcodes or embeds anyway.
+	if ( empty( $tagnames ) ) {
 		return $content;
 	}

-	$content = do_shortcodes_in_html_tags( $content, $ignore_html );
+	$content = do_shortcodes_in_html_tags( $content, $ignore_html, $tagnames );

-	$pattern = get_shortcode_regex();
+	$pattern = get_shortcode_regex( $tagnames );
 	$content = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $content );

 	// Always restore square braces so we don't break things like <!--[if IE ]>
@@ -247,11 +246,15 @@ function do_shortcode( $content, $ignore_html = false ) {
 *
 * @global array $shortcode_tags
 *
+ * @param array $tagnames List of shortcodes to find. Optional. Defaults to all registered shortcodes.
 * @return string The shortcode search regular expression
 */
-function get_shortcode_regex() {
+function get_shortcode_regex( $tagnames = null ) {
 	global $shortcode_tags;
-	$tagnames = array_keys($shortcode_tags);
+
+	if ( empty( $tagnames ) ) {
+		$tagnames = array_keys( $shortcode_tags );
+	}
 	$tagregexp = join( '|', array_map('preg_quote', $tagnames) );

 	// WARNING! Do not change this regex without changing do_shortcode_tag() and strip_shortcode_tag()
@@ -337,15 +340,16 @@ function do_shortcode_tag( $m ) {
 *
 * @param string $content Content to search for shortcodes
 * @param bool $ignore_html When true, all square braces inside elements will be encoded.
+ * @param array $tagnames List of shortcodes to find.
 * @return string Content with shortcodes filtered out.
 */
-function do_shortcodes_in_html_tags( $content, $ignore_html ) {
+function do_shortcodes_in_html_tags( $content, $ignore_html, $tagnames ) {
 	// Normalize entities in unfiltered HTML before adding placeholders.
 	$trans = array( '&#91;' => '&#091;', '&#93;' => '&#093;' );
 	$content = strtr( $content, $trans );
 	$trans = array( '[' => '&#91;', ']' => '&#93;' );

-	$pattern = get_shortcode_regex();
+	$pattern = get_shortcode_regex( $tagnames );
 	$textarr = wp_html_split( $content );

 	foreach ( $textarr as &$element ) {
@@ -557,9 +561,17 @@ function strip_shortcodes( $content ) {
 	if (empty($shortcode_tags) || !is_array($shortcode_tags))
 		return $content;

-	$content = do_shortcodes_in_html_tags( $content, true );
+	// Find all registered tag names in $content.
+	preg_match_all( '@\[([^<>&/\[\]\x00-\x20]++)@', $content, $matches );
+	$tagnames = array_intersect( array_keys( $shortcode_tags ), $matches[1] );

-	$pattern = get_shortcode_regex();
+	if ( empty( $tagnames ) ) {
+		return $content;
+	}
+
+	$content = do_shortcodes_in_html_tags( $content, true, $tagnames );
+
+	$pattern = get_shortcode_regex( $tagnames );
 	$content = preg_replace_callback( "/$pattern/s", 'strip_shortcode_tag', $content );

 	// Always restore square braces so we don't break things like <!--[if IE ]>
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@@ -4,7 +4,7 @@
 *
 * @global string $wp_version
 */
-$wp_version = '4.4-alpha-34746';
+$wp_version = '4.4-alpha-34747';

 /**
 * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.