From aff3c77aad2206286548d035716ee78f947a8c37 Mon Sep 17 00:00:00 2001
From: Gary Pendergast <gary@pento.net>
Date: Thu, 23 Jul 2015 05:01:14 +0000
Subject: [PATCH] Shortcodes: Improve the reliability of shortcodes inside HTML
 tags.

Merge of [33359] to the 3.9 branch.

Props miqrogroove.

See #15694.


Built from https://develop.svn.wordpress.org/branches/3.9@33386


git-svn-id: http://core.svn.wordpress.org/branches/3.9@33354 1a063a9b-81f0-0310-95a4-ce76da25c4cd
---
 wp-includes/class-wp-embed.php |   6 +-
 wp-includes/formatting.php     |  71 +++++++++
 wp-includes/kses.php           | 268 ++++++++++++++++++++++++++++-----
 wp-includes/shortcodes.php     | 166 +++++++++++++++++++-
 4 files changed, 468 insertions(+), 43 deletions(-)
diff --git a/wp-includes/class-wp-embed.php b/wp-includes/class-wp-embed.php
index 66b2820f3f..8a44d99524 100644
--- a/wp-includes/class-wp-embed.php
+++ b/wp-includes/class-wp-embed.php
@@ -57,7 +57,7 @@ class WP_Embed {
 		add_shortcode( 'embed', array( $this, 'shortcode' ) );
 
 		// Do the shortcode (only the [embed] one is registered)
-		$content = do_shortcode( $content );
+		$content = do_shortcode( $content, true );
 
 		// Put the original shortcodes back
 		$shortcode_tags = $orig_shortcode_tags;
@@ -291,6 +291,10 @@ class WP_Embed {
 	 * @return string Potentially modified $content.
 	 */
 	function autoembed( $content ) {
+		// Strip newlines from all elements.
+		$content = wp_replace_in_html_tags( $content, array( "\n" => " " ) );
+
+		// Find URLs that are on their own line.
 		return preg_replace_callback( '|^\s*(https?://[^\s"]+)\s*$|im', array( $this, 'autoembed_callback' ), $content );
 	}
 
diff --git a/wp-includes/formatting.php b/wp-includes/formatting.php
index c58bb39b0d..f80c02e67a 100644
--- a/wp-includes/formatting.php
+++ b/wp-includes/formatting.php
@@ -291,6 +291,9 @@ function wpautop($pee, $br = true) {
 	$pee = preg_replace('!(</' . $allblocks . '>)!', "$1\n\n", $pee);
 	$pee = str_replace(array("\r\n", "\r"), "\n", $pee); // cross-platform newlines
 
+	// Strip newlines from all elements.
+	$pee = wp_replace_in_html_tags( $pee, array( "\n" => " " ) );
+
 	if ( strpos( $pee, '</object>' ) !== false ) {
 		// no P/BR around param and embed
 		$pee = preg_replace( '|(<object[^>]*>)\s*|', '$1', $pee );
@@ -339,6 +342,74 @@ function wpautop($pee, $br = true) {
 	return $pee;
 }
 
+/**
+ * Replace characters or phrases inside HTML tags and comments only; text between them is left untouched.
+ *
+ * @since 4.2.3
+ *
+ * @param string $haystack The text which has to be formatted.
+ * @param array $replace_pairs In the form array('from' => 'to', ...).
+ * @return string The formatted text, or $haystack as given when no replacement was needed.
+ */
+function wp_replace_in_html_tags( $haystack, $replace_pairs ) {
+	// Build a pattern that matches every HTML comment and every tag ("element").
+	$comments =
+		  '!'           // Start of comment, after the <.
+		. '(?:'         // Unroll the loop: Consume everything until --> is found.
+		.     '-(?!->)' // Dash not followed by end of comment.
+		.     '[^\-]*+' // Consume non-dashes.
+		. ')*+'         // Loop possessively.
+		. '(?:-->)?';   // End of comment. If not found, match all input.
+
+	$regex =
+		  '/('              // Capture the entire match.
+		.     '<'           // Find start of element.
+		.     '(?(?=!--)'   // Is this a comment?
+		.         $comments // Find end of comment.
+		.     '|'
+		.         '[^>]*>?' // Find end of element. If not found, match all input.
+		.     ')'
+		. ')/s';
+
+	$textarr = preg_split( $regex, $haystack, -1, PREG_SPLIT_DELIM_CAPTURE ); // Elements land on the odd indexes.
+	$changed = false;
+
+	// Optimize when searching for one item: a plain str_replace() is enough.
+	if ( 1 === count( $replace_pairs ) ) {
+		// Extract $needle and $replace; the body-less foreach leaves them set to the only pair.
+		foreach ( $replace_pairs as $needle => $replace );
+
+		// Loop through delimiters (elements) only, skipping the text between them.
+		for ( $i = 1, $c = count( $textarr ); $i < $c; $i += 2 ) { 
+			if ( false !== strpos( $textarr[$i], $needle ) ) {
+				$textarr[$i] = str_replace( $needle, $replace, $textarr[$i] );
+				$changed = true;
+			}
+		}
+	} else {
+		// Extract all $needles so each element can be probed before paying for strtr().
+		$needles = array_keys( $replace_pairs );
+
+		// Loop through delimiters (elements) only, skipping the text between them.
+		for ( $i = 1, $c = count( $textarr ); $i < $c; $i += 2 ) { 
+			foreach ( $needles as $needle ) {
+				if ( false !== strpos( $textarr[$i], $needle ) ) {
+					$textarr[$i] = strtr( $textarr[$i], $replace_pairs );
+					$changed = true;
+					// After one strtr() break out of the foreach loop and look at next element.
+					break;
+				}
+			}
+		}
+	}
+
+	if ( $changed ) {
+		$haystack = implode( $textarr );
+	}
+
+	return $haystack;
+}
+
 /**
  * Newline preservation help function for wpautop
  *
diff --git a/wp-includes/kses.php b/wp-includes/kses.php
index 70623201b5..7e1210d967 100644
--- a/wp-includes/kses.php
+++ b/wp-includes/kses.php
@@ -489,6 +489,82 @@ function wp_kses( $string, $allowed_html, $allowed_protocols = array() ) {
 	return wp_kses_split($string, $allowed_html, $allowed_protocols);
 }
 
+/**
+ * Filters one attribute only and ensures its value is allowed.
+ *
+ * This function has the advantage of being more secure than esc_attr() and can
+ * escape data in some situations where wp_kses() must strip the whole attribute.
+ *
+ * @since 4.2.3
+ *
+ * @param string $string The 'whole' attribute, including name and value.
+ * @param string $element The element name to which the attribute belongs.
+ * @return string Filtered attribute, or '' when a quoted value does not end with its opening quote.
+ */
+function wp_kses_one_attr( $string, $element ) {
+	$uris = array('xmlns', 'profile', 'href', 'src', 'cite', 'classid', 'codebase', 'data', 'usemap', 'longdesc', 'action');
+	$allowed_html = wp_kses_allowed_html( 'post' );
+	$allowed_protocols = wp_allowed_protocols();
+	$string = wp_kses_no_null( $string, array( 'slash_zero' => 'keep' ) );
+	$string = wp_kses_js_entities( $string );
+	$string = wp_kses_normalize_entities( $string );
+
+	// Set aside leading and trailing whitespace; it is re-attached to the result at the end.
+	$matches = array();
+	preg_match('/^\s*/', $string, $matches);
+	$lead = $matches[0];
+	preg_match('/\s*$/', $string, $matches);
+	$trail = $matches[0];
+	if ( empty( $trail ) ) {
+		$string = substr( $string, strlen( $lead ) );
+	} else {
+		$string = substr( $string, strlen( $lead ), -strlen( $trail ) );
+	}
+	
+	// Parse attribute name and value from input, splitting on the first "=" only.
+	$split = preg_split( '/\s*=\s*/', $string, 2 );
+	$name = $split[0];
+	if ( count( $split ) == 2 ) {
+		$value = $split[1];
+
+		// Remove quotes surrounding $value.
+		// Also guarantee correct quoting in $string for this one attribute.
+		if ( '' == $value ) {
+			$quote = '';
+		} else {
+			$quote = $value[0];
+		}
+		if ( '"' == $quote || "'" == $quote ) {
+			if ( substr( $value, -1 ) != $quote ) {
+				return ''; // The value does not end with the quote it started with.
+			}
+			$value = substr( $value, 1, -1 );
+		} else {
+			$quote = '"'; // Unquoted or empty value: the rebuilt attribute is wrapped in double quotes.
+		}
+
+		// Sanitize quotes and angle braces; the final false argument avoids double-encoding existing entities.
+		$value = htmlspecialchars( $value, ENT_QUOTES, null, false );
+
+		// Sanitize URI values.
+		if ( in_array( strtolower( $name ), $uris ) ) {
+			$value = wp_kses_bad_protocol( $value, $allowed_protocols );
+		}
+
+		$string = "$name=$quote$value$quote";
+		$vless = 'n';
+	} else {
+		$value = '';
+		$vless = 'y';
+	}
+	
+	// Sanitize attribute by name. Arguments are taken by reference, so $string is emptied on rejection.
+	wp_kses_attr_check( $name, $value, $string, $vless, $element, $allowed_html );
+
+	// Restore whitespace.
+	return $lead . $string . $trail;
+}
+
 /**
  * Return a list of allowed tags and attributes for a given context.
  *
@@ -710,45 +786,11 @@ function wp_kses_attr($element, $attr, $allowed_html, $allowed_protocols) {
 	# Go through $attrarr, and save the allowed attributes for this element
 	# in $attr2
 	$attr2 = '';
-
-	$allowed_attr = $allowed_html[strtolower($element)];
-	foreach ($attrarr as $arreach) {
-		if ( ! isset( $allowed_attr[strtolower($arreach['name'])] ) )
-			continue; # the attribute is not allowed
-
-		$current = $allowed_attr[strtolower($arreach['name'])];
-		if ( $current == '' )
-			continue; # the attribute is not allowed
-
-		if ( strtolower( $arreach['name'] ) == 'style' ) {
-			$orig_value = $arreach['value'];
-			$value = safecss_filter_attr( $orig_value );
-
-			if ( empty( $value ) )
-				continue;
-
-			$arreach['value'] = $value;
-			$arreach['whole'] = str_replace( $orig_value, $value, $arreach['whole'] );
-		}
-
-		if ( ! is_array($current) ) {
+	foreach ( $attrarr as $arreach ) {
+		if ( wp_kses_attr_check( $arreach['name'], $arreach['value'], $arreach['whole'], $arreach['vless'], $element, $allowed_html ) ) {
 			$attr2 .= ' '.$arreach['whole'];
-		# there are no checks
-
-		} else {
-			# there are some checks
-			$ok = true;
-			foreach ($current as $currkey => $currval) {
-				if ( ! wp_kses_check_attr_val($arreach['value'], $arreach['vless'], $currkey, $currval) ) {
-					$ok = false;
-					break;
-				}
-			}
-
-			if ( $ok )
-				$attr2 .= ' '.$arreach['whole']; # it passed them
-		} # if !is_array($current)
-	} # foreach
+		}
+	}
 
 	# Remove any "<" or ">" characters
 	$attr2 = preg_replace('/[<>]/', '', $attr2);
@@ -756,6 +798,53 @@ function wp_kses_attr($element, $attr, $allowed_html, $allowed_protocols) {
 	return "<$element$attr2$xhtml_slash>";
 }
 
+/**
+ * Determine whether an attribute is allowed. An element absent from $allowed_html allows no attributes.
+ *
+ * @since 4.2.3
+ *
+ * @param string $name The attribute name, by reference. Set to an empty string when not allowed.
+ * @param string $value The attribute value, by reference. Receives the filtered value, or '' when not allowed.
+ * @param string $whole The name=value input, by reference. Receives the filtered input, or '' when not allowed.
+ * @param string $vless 'y' when attribute like "enabled", otherwise 'n'.
+ * @param string $element The name of the element to which this attribute belongs.
+ * @param array $allowed_html The full list of allowed elements and attributes.
+ * @return bool Is the attribute allowed?
+ */
+function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowed_html ) {
+	// Elements missing from $allowed_html have no allowed attributes; isset() avoids an undefined index notice.
+	$element_low = strtolower( $element );
+	$allowed_attr = isset( $allowed_html[$element_low] ) ? $allowed_html[$element_low] : array();
+	$name_low = strtolower( $name );
+	if ( ! isset( $allowed_attr[$name_low] ) || '' == $allowed_attr[$name_low] ) {
+		$name = $value = $whole = '';
+		return false;
+	}
+
+	if ( 'style' == $name_low ) {
+		$new_value = safecss_filter_attr( $value );
+		if ( empty( $new_value ) ) { // Nothing survived the CSS filter, so the attribute is rejected.
+			$name = $value = $whole = '';
+			return false;
+		}
+
+		$whole = str_replace( $value, $new_value, $whole );
+		$value = $new_value;
+	}
+
+	if ( is_array( $allowed_attr[$name_low] ) ) {
+		// The attribute carries checks; every one of them has to pass.
+		foreach ( $allowed_attr[$name_low] as $currkey => $currval ) {
+			if ( ! wp_kses_check_attr_val( $value, $vless, $currkey, $currval ) ) {
+				$name = $value = $whole = '';
+				return false;
+			}
+		}
+	}
+
+	return true;
+}
+
 /**
  * Builds an attribute list from string containing attributes.
  *
@@ -885,6 +974,109 @@ function wp_kses_hair($attr, $allowed_protocols) {
 	return $attrarr;
 }
 
+/**
+ * Finds all attributes of an HTML element.
+ *
+ * Does not modify input.  May return "evil" output.
+ *
+ * Based on wp_kses_split2() and wp_kses_attr()
+ *
+ * @since 4.2.3
+ *
+ * @param string $element HTML element/tag
+ * @return array|bool Front matter, each attribute, then back matter. Returns false on failure or for closing tags.
+ */
+function wp_kses_attr_parse( $element ) {
+	$valid = preg_match('%^(<\s*)(/\s*)?([a-zA-Z0-9]+\s*)([^>]*)(>?)$%', $element, $matches);
+	if ( 1 !== $valid ) {
+		return false; // Input does not have the overall shape of a tag.
+	}
+
+	$begin =  $matches[1];
+	$slash =  $matches[2];
+	$elname = $matches[3];
+	$attr =   $matches[4];
+	$end =    $matches[5];
+
+	if ( '' !== $slash ) {
+		// Closing elements do not get parsed.
+		return false;
+	}
+
+	// Is there a closing XHTML slash at the end of the attributes? If so, move it out of $attr.
+	if ( 1 === preg_match( '%\s*/\s*$%', $attr, $matches ) ) {
+		$xhtml_slash = $matches[0];
+		$attr = substr( $attr, 0, -strlen( $xhtml_slash ) );
+	} else {
+		$xhtml_slash = '';
+	}
+	
+	// Split the attribute string into one string per attribute.
+	$attrarr = wp_kses_hair_parse( $attr );
+	if ( false === $attrarr ) {
+		return false;
+	}
+
+	// Make sure all input is returned by adding front and back matter as the first and last items.
+	array_unshift( $attrarr, $begin . $slash . $elname );
+	array_push( $attrarr, $xhtml_slash . $end );
+	
+	return $attrarr;
+}
+
+/**
+ * Builds an attribute list from string containing attributes.
+ *
+ * Does not modify input.  May return "evil" output.
+ * In case of unexpected input, returns false instead of stripping things.
+ *
+ * Based on wp_kses_hair() but does not return a multi-dimensional array.
+ *
+ * @since 4.2.3
+ *
+ * @param string $attr Attribute list from HTML element to closing HTML element tag
+ * @return array|bool Attribute strings found in $attr, each with its trailing whitespace. Returns false on failure.
+ */
+function wp_kses_hair_parse( $attr ) {
+	if ( '' === $attr ) {
+		return array(); // No attributes at all is valid input, not a failure.
+	}
+
+	$regex =
+	  '(?:'
+	.     '[-a-zA-Z:]+'   // Attribute name.
+	. '|'
+	.     '\[\[?[^\[\]]+\]\]?' // Shortcode in the name position implies unfiltered_html.
+	. ')'
+	. '(?:'               // Attribute value.
+	.     '\s*=\s*'       // All values begin with '='
+	.     '(?:'
+	.         '"[^"]*"'   // Double-quoted
+	.     '|'
+	.         "'[^']*'"   // Single-quoted
+	.     '|'
+	.         '[^\s"\']+' // Non-quoted
+	.         '(?:\s|$)'  // Must have a space
+	.     ')'
+	. '|'
+	.     '(?:\s|$)'      // If attribute has no value, space is required.
+	. ')'
+	. '\s*';              // Trailing space is optional except as mentioned above.
+
+	// The same pattern runs twice: anchored, to validate that the whole string consists of attributes,
+	// and unanchored, to extract each attribute as its own match.
+
+	$validation = "%^($regex)+$%";
+	$extraction = "%$regex%";
+
+	if ( 1 === preg_match( $validation, $attr ) ) {
+		preg_match_all( $extraction, $attr, $attrarr );
+		return $attrarr[0];
+	} else {
+		return false;
+	}
+}
+
 /**
  * Performs different checks for attribute values.
  *
diff --git a/wp-includes/shortcodes.php b/wp-includes/shortcodes.php
index 15f8561d5b..48f96d1eb6 100644
--- a/wp-includes/shortcodes.php
+++ b/wp-includes/shortcodes.php
@@ -185,9 +185,10 @@ function has_shortcode( $content, $tag ) {
  * @uses get_shortcode_regex() Gets the search pattern for searching shortcodes.
  *
  * @param string $content Content to search for shortcodes
+ * @param bool $ignore_html When true, shortcodes inside HTML elements will be skipped.
  * @return string Content with shortcodes filtered out.
  */
-function do_shortcode($content) {
+function do_shortcode( $content, $ignore_html = false ) {
 	global $shortcode_tags;
 
 	if ( false === strpos( $content, '[' ) ) {
@@ -197,8 +198,24 @@ function do_shortcode($content) {
 	if (empty($shortcode_tags) || !is_array($shortcode_tags))
 		return $content;
 
+	$tagnames = array_keys($shortcode_tags);
+	$tagregexp = join( '|', array_map('preg_quote', $tagnames) );
+	$pattern = "/\\[($tagregexp)/s";
+
+	if ( 1 !== preg_match( $pattern, $content ) ) {
+		// Avoids parsing HTML when there are no shortcodes or embeds anyway.
+		return $content;
+	}
+
+	$content = do_shortcodes_in_html_tags( $content, $ignore_html );
+
 	$pattern = get_shortcode_regex();
-	return preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $content );
+	$content = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $content );
+	
+	// Always restore square braces so we don't break things like <!--[if IE ]>
+	$content = unescape_invalid_shortcodes( $content );
+	
+	return $content;
 }
 
 /**
@@ -291,6 +308,141 @@ function do_shortcode_tag( $m ) {
 	}
 }
 
+/**
+ * Search only inside HTML elements for shortcodes and process them.
+ *
+ * Any [ or ] characters remaining inside elements will be HTML encoded
+ * to prevent interference with shortcodes that are outside the elements.
+ * Assumes $content processed by KSES already.  Users with unfiltered_html
+ * capability may get unexpected output if angle braces are nested in tags.
+ *
+ * @since 4.2.3
+ *
+ * @param string $content Content to search for shortcodes
+ * @param bool $ignore_html When true, all square braces inside elements will be encoded.
+ * @return string Content with shortcodes in elements processed and leftover braces in elements encoded as &#91; / &#93;.
+ */
+function do_shortcodes_in_html_tags( $content, $ignore_html ) {
+	// Rewrite pre-existing &#91; / &#93; entities as &#091; / &#093; so they differ from the placeholders added below.
+	$trans = array( '&#91;' => '&#091;', '&#93;' => '&#093;' );
+	$content = strtr( $content, $trans );
+	$trans = array( '[' => '&#91;', ']' => '&#93;' ); // Placeholder map used for the rest of this function.
+	
+	$pattern = get_shortcode_regex();
+
+	$comment_regex =
+		  '!'           // Start of comment, after the <.
+		. '(?:'         // Unroll the loop: Consume everything until --> is found.
+		.     '-(?!->)' // Dash not followed by end of comment.
+		.     '[^\-]*+' // Consume non-dashes.
+		. ')*+'         // Loop possessively.
+		. '(?:-->)?';   // End of comment. If not found, match all input.
+
+	$regex =
+		  '/('                   // Capture the entire match.
+		.     '<'                // Find start of element.
+		.     '(?(?=!--)'        // Is this a comment?
+		.         $comment_regex // Find end of comment.
+		.     '|'
+		.         '[^>]*>?'      // Find end of element. If not found, match all input.
+		.     ')'
+		. ')/s';
+
+	$textarr = preg_split( $regex, $content, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
+
+	foreach ( $textarr as &$element ) {
+		if ( '<' !== $element[0] ) {
+			continue; // Text between elements is not touched here.
+		}
+
+		$noopen = false === strpos( $element, '[' );
+		$noclose = false === strpos( $element, ']' );
+		if ( $noopen || $noclose ) {
+			// This element does not contain shortcodes.
+			if ( $noopen xor $noclose ) {
+				// Need to encode stray [ or ] chars.
+				$element = strtr( $element, $trans );
+			}
+			continue;
+		}
+
+		if ( $ignore_html || '<!--' === substr( $element, 0, 4 ) ) {
+			// Encode all [ and ] chars.
+			$element = strtr( $element, $trans );
+			continue;
+		}
+
+		$attributes = wp_kses_attr_parse( $element );
+		if ( false === $attributes ) {
+			// Looks like we found some crazy unfiltered HTML.  Skipping it for sanity.
+			$element = strtr( $element, $trans );
+			continue;
+		}
+		
+		// Split off the front matter (opening and element name) and back matter (closing), then get element name.
+		$front = array_shift( $attributes );
+		$back = array_pop( $attributes );
+		$matches = array();
+		preg_match('%[a-zA-Z0-9]+%', $front, $matches);
+		$elname = $matches[0];
+		
+		// Look for shortcodes in each attribute separately.
+		foreach ( $attributes as &$attr ) {
+			$open = strpos( $attr, '[' );
+			$close = strpos( $attr, ']' );
+			if ( false === $open || false === $close ) {
+				continue; // Go to next attribute.  Square braces will be escaped at end of loop.
+			}
+			$double = strpos( $attr, '"' );
+			$single = strpos( $attr, "'" );
+			if ( ( false === $single || $open < $single ) && ( false === $double || $open < $double ) ) {
+				// $attr like '[shortcode]' or 'name = [shortcode]' implies unfiltered_html.
+				// In this specific situation we assume KSES did not run because the input
+				// was written by an administrator, so we should avoid changing the output
+				// and we do not need to run KSES here.
+				$attr = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $attr );
+			} else {
+				// $attr like 'name = "[shortcode]"' or "name = '[shortcode]'"
+				// We do not know if $content was unfiltered. Assume KSES ran before shortcodes.
+				$count = 0;
+				$new_attr = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $attr, -1, $count );
+				if ( $count > 0 ) {
+					// Sanitize the shortcode output using KSES.
+					$new_attr = wp_kses_one_attr( $new_attr, $elname );
+					if ( '' !== $new_attr ) {
+						// The shortcode is safe to use now. An empty result keeps the original, unprocessed attribute.
+						$attr = $new_attr;
+					}
+				}
+			}
+		}
+		$element = $front . implode( '', $attributes ) . $back;
+		
+		// Now encode any remaining [ or ] chars.
+		$element = strtr( $element, $trans );
+	}
+	
+	$content = implode( '', $textarr );
+	
+	return $content;
+}
+
+/**
+ * Turn the placeholders written by do_shortcodes_in_html_tags() back into square braces.
+ *
+ * @since 4.2.3
+ *
+ * @param string $content Content that may contain &#91; and &#93; placeholders.
+ * @return string Content with every placeholder replaced by [ or ].
+ */
+function unescape_invalid_shortcodes( $content ) {
+	// A single pass over the whole string avoids re-parsing the HTML.
+	$placeholders = array( '&#91;' => '[', '&#93;' => ']' );
+
+	// strtr() swaps each placeholder for its literal square brace.
+	return strtr( $content, $placeholders );
+}
+
 /**
  * Retrieve all attributes from the shortcodes tag.
  *
@@ -390,9 +542,15 @@ function strip_shortcodes( $content ) {
 	if (empty($shortcode_tags) || !is_array($shortcode_tags))
 		return $content;
 
-	$pattern = get_shortcode_regex();
+	$content = do_shortcodes_in_html_tags( $content, true );
 
-	return preg_replace_callback( "/$pattern/s", 'strip_shortcode_tag', $content );
+	$pattern = get_shortcode_regex();
+	$content = preg_replace_callback( "/$pattern/s", 'strip_shortcode_tag', $content );
+
+	// Always restore square braces so we don't break things like <!--[if IE ]>
+	$content = unescape_invalid_shortcodes( $content );
+	
+	return $content;
 }
 
 function strip_shortcode_tag( $m ) {