WPDB: When checking that a string can be sent to MySQL, we shouldn't use `mb_convert_encoding()`, as it behaves differently to MySQL's character encoding conversion.

Merge of [32364] to the 4.0 branch. Props mdawaffe, pento, nbachiyski, jorbin, johnjamesjacoby, jeremyfelt. See #32165. Built from https://develop.svn.wordpress.org/branches/4.0@32388 git-svn-id: http://core.svn.wordpress.org/branches/4.0@32358 1a063a9b-81f0-0310-95a4-ce76da25c4cd
2015-05-06 19:09:39 +00:00 · 2015-05-06 19:09:39 +00:00 · 7e35f92679
parent d6df175451
commit 7e35f92679
4 changed files with 253 additions and 78 deletions
--- a/wp-admin/includes/upgrade.php
+++ b/wp-admin/includes/upgrade.php
@ -440,8 +440,8 @@ function upgrade_all() {
 	if ( $wp_current_db_version < 29630 )
 		upgrade_400();

-	if ( $wp_current_db_version < 29631 )
-		upgrade_404();
+	if ( $wp_current_db_version < 29632 )
+		upgrade_405();

 	maybe_disable_link_manager();

@ -1335,19 +1335,43 @@ function upgrade_400() {
 * @since 4.0.4
 */
 function upgrade_404() {
+}
+
+/**
+ * Execute changes made in WordPress 4.0.5.
+ *
+ * @since 4.0.5
+ */
+function upgrade_405() {
 	global $wp_current_db_version, $wpdb;

-	if ( $wp_current_db_version < 29631 ) {
+	if ( $wp_current_db_version < 29632 ) {
 		$content_length = $wpdb->get_col_length( $wpdb->comments, 'comment_content' );
-		if ( ! $content_length ) {
-			$content_length = 65535;
+		if ( false === $content_length ) {
+			$content_length = array(
+				'type'   => 'byte',
+				'length' => 65535,
+			);
+		} elseif ( ! is_array( $content_length ) ) {
+			$length = (int) $content_length > 0 ? (int) $content_length : 65535;
+			$content_length = array(
+				'type'	 => 'byte',
+				'length' => $length
+			);
 		}

+		if ( 'byte' !== $content_length['type'] ) {
+			// Sites with malformed DB schemas are on their own.
+			return;
+		}
+
+		$allowed_length = intval( $content_length['length'] ) - 10;
+
 		$comments = $wpdb->get_results(
-			"SELECT comment_ID FROM $wpdb->comments
-			WHERE comment_date_gmt > '2015-04-26'
-			AND CHAR_LENGTH( comment_content ) >= $content_length
-			AND ( comment_content LIKE '%<%' OR comment_content LIKE '%>%' )"
+			"SELECT `comment_ID` FROM `{$wpdb->comments}`
+				WHERE `comment_date_gmt` > '2015-04-26'
+				AND LENGTH( `comment_content` ) >= {$allowed_length}
+				AND ( `comment_content` LIKE '%<%' OR `comment_content` LIKE '%>%' )"
 		);

 		foreach ( $comments as $comment ) {
--- a/wp-includes/compat.php
+++ b/wp-includes/compat.php
@ -13,23 +13,141 @@ if ( !function_exists('_') ) {
 	}
 }

-if ( !function_exists('mb_substr') ):
-	function mb_substr( $str, $start, $length=null, $encoding=null ) {
-		return _mb_substr($str, $start, $length, $encoding);
+/**
+ * Returns whether PCRE/u (PCRE_UTF8 modifier) is available for use.
+ *
+ * @ignore
+ * @since 4.2.2
+ * @access private
+ *
+ * @param bool $set - Used for testing only
+ *             null   : default - get PCRE/u capability
+ *             false  : Used for testing - return false for future calls to this function
+ *             'reset': Used for testing - restore default behavior of this function
+ */
+function _wp_can_use_pcre_u( $set = null ) {
+	static $utf8_pcre = 'reset';
+
+	if ( null !== $set ) {
+		$utf8_pcre = $set;
+	}
+
+	if ( 'reset' === $utf8_pcre ) {
+		$utf8_pcre = @preg_match( '/^./u', 'a' );
+	}
+
+	return $utf8_pcre;
+}
+
+if ( ! function_exists( 'mb_substr' ) ) :
+	function mb_substr( $str, $start, $length = null, $encoding = null ) {
+		return _mb_substr( $str, $start, $length, $encoding );
 	}
 endif;

-function _mb_substr( $str, $start, $length=null, $encoding=null ) {
-	// the solution below, works only for utf-8, so in case of a different
-	// charset, just use built-in substr
-	$charset = get_option( 'blog_charset' );
-	if ( !in_array( $charset, array('utf8', 'utf-8', 'UTF8', 'UTF-8') ) ) {
-		return is_null( $length )? substr( $str, $start ) : substr( $str, $start, $length);
+/*
+ * Only understands UTF-8 and 8bit.  All other character sets will be treated as 8bit.
+ * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence.
+ * The behavior of this function for invalid inputs is undefined.
+ */
+function _mb_substr( $str, $start, $length = null, $encoding = null ) {
+	if ( null === $encoding ) {
+		$encoding = get_option( 'blog_charset' );
 	}
-	// use the regex unicode support to separate the UTF-8 characters into an array
-	preg_match_all( '/./us', $str, $match );
-	$chars = is_null( $length )? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length );
-	return implode( '', $chars );
+
+	// The solution below works only for UTF-8,
+	// so in case of a different charset just use built-in substr()
+	if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) {
+		return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
+	}
+
+	if ( _wp_can_use_pcre_u() ) {
+		// Use the regex unicode support to separate the UTF-8 characters into an array
+		preg_match_all( '/./us', $str, $match );
+		$chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length );
+		return implode( '', $chars );
+	}
+
+	$regex = '/(
+		  [\x00-\x7F]                  # single-byte sequences   0xxxxxxx
+		| [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
+		| \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
+		| [\xE1-\xEC][\x80-\xBF]{2}
+		| \xED[\x80-\x9F][\x80-\xBF]
+		| [\xEE-\xEF][\x80-\xBF]{2}
+		| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
+		| [\xF1-\xF3][\x80-\xBF]{3}
+		| \xF4[\x80-\x8F][\x80-\xBF]{2}
+	)/x';
+
+	$chars = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop
+	do {
+		// We had some string left over from the last round, but we counted it in that last round.
+		array_pop( $chars );
+
+		// Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string)
+		$pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
+
+		$chars = array_merge( $chars, $pieces );
+	} while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop.
+
+	return join( '', array_slice( $chars, $start, $length ) );
+}
+
+if ( ! function_exists( 'mb_strlen' ) ) :
+	function mb_strlen( $str, $encoding = null ) {
+		return _mb_strlen( $str, $encoding );
+	}
+endif;
+
+/*
+ * Only understands UTF-8 and 8bit.  All other character sets will be treated as 8bit.
+ * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence.
+ * The behavior of this function for invalid inputs is undefined.
+ */
+function _mb_strlen( $str, $encoding = null ) {
+	if ( null === $encoding ) {
+		$encoding = get_option( 'blog_charset' );
+	}
+
+	// The solution below works only for UTF-8,
+	// so in case of a different charset just use built-in strlen()
+	if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) {
+		return strlen( $str );
+	}
+
+	if ( _wp_can_use_pcre_u() ) {
+		// Use the regex unicode support to separate the UTF-8 characters into an array
+		preg_match_all( '/./us', $str, $match );
+		return count( $match[0] );
+	}
+
+	$regex = '/(?:
+		  [\x00-\x7F]                  # single-byte sequences   0xxxxxxx
+		| [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
+		| \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
+		| [\xE1-\xEC][\x80-\xBF]{2}
+		| \xED[\x80-\x9F][\x80-\xBF]
+		| [\xEE-\xEF][\x80-\xBF]{2}
+		| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
+		| [\xF1-\xF3][\x80-\xBF]{3}
+		| \xF4[\x80-\x8F][\x80-\xBF]{2}
+	)/x';
+
+	$count = 1; // Start at 1 instead of 0 since the first thing we do is decrement
+	do {
+		// We had some string left over from the last round, but we counted it in that last round.
+		$count--;
+
+		// Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string)
+		$pieces = preg_split( $regex, $str, 1000 );
+
+		// Increment
+		$count += count( $pieces );
+	} while ( $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop.
+
+	// Fencepost: preg_split() always returns one extra item in the array
+	return --$count;
 }

 if ( !function_exists('hash_hmac') ):
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@ -11,7 +11,7 @@ $wp_version = '4.0.4';
 *
 * @global int $wp_db_version
 */
-$wp_db_version = 29631;
+$wp_db_version = 29632;

 /**
 * Holds the TinyMCE version
--- a/wp-includes/wp-db.php
+++ b/wp-includes/wp-db.php
@ -1787,6 +1787,8 @@ class wpdb {
 	 * @return int|false The number of rows affected, or false on error.
 	 */
 	function _insert_replace_helper( $table, $data, $format = null, $type = 'INSERT' ) {
+		$this->insert_id = 0;
+
 		if ( ! in_array( strtoupper( $type ), array( 'REPLACE', 'INSERT' ) ) ) {
 			return false;
 		}
@ -1807,7 +1809,6 @@ class wpdb {

 		$sql = "$type INTO `$table` ($fields) VALUES ($formats)";

-		$this->insert_id = 0;
 		$this->check_current_query = false;
 		return $this->query( $this->prepare( $sql, $values ) );
 	}
@ -2003,17 +2004,11 @@ class wpdb {
 				// We can skip this field if we know it isn't a string.
 				// This checks %d/%f versus ! %s because it's sprintf() could take more.
 				$value['charset'] = false;
-			} elseif ( $this->check_ascii( $value['value'] ) ) {
-				// If it's ASCII, then we don't need the charset. We can skip this field.
-				$value['charset'] = false;
 			} else {
 				$value['charset'] = $this->get_col_charset( $table, $field );
 				if ( is_wp_error( $value['charset'] ) ) {
 					return false;
 				}
-
-				// This isn't ASCII. Don't have strip_invalid_text() re-check.
-				$value['ascii'] = false;
 			}

 			$data[ $field ] = $value;
@ -2046,10 +2041,6 @@ class wpdb {
 				}
 			}

-			if ( false !== $value['length'] && strlen( $value['value'] ) > $value['length'] ) {
-				return false;
-			}
-
 			$data[ $field ] = $value;
 		}

@ -2379,14 +2370,16 @@ class wpdb {

 	/**
 	 * Retrieve the maximum string length allowed in a given column.
+	 * The length may either be specified as a byte length or a character length.
 	 *
 	 * @since 4.2.1
 	 * @access public
 	 *
 	 * @param string $table  Table name.
 	 * @param string $column Column name.
-	 * @return mixed Max column length as an int. False if the column has no
-	 *               length. WP_Error object if there was an error.
+	 * @return mixed array( 'length' => (int), 'type' => 'byte' | 'char' )
+	 *               false if the column has no length (for example, numeric column)
+	 *               WP_Error object if there was an error.
 	 */
 	public function get_col_length( $table, $column ) {
 		$tablekey = strtolower( $table );
@ -2419,27 +2412,47 @@ class wpdb {
 		}

 		switch( $type ) {
-			case 'binary':
 			case 'char':
-			case 'varbinary':
 			case 'varchar':
-				return $length;
+				return array(
+					'type'   => 'char',
+					'length' => (int) $length,
+				);
+				break;
+			case 'binary':
+			case 'varbinary':
+				return array(
+					'type'   => 'byte',
+					'length' => (int) $length,
+				);
 				break;
 			case 'tinyblob':
 			case 'tinytext':
-				return 255; // 2^8 - 1
+				return array(
+					'type'   => 'byte',
+					'length' => 255,        // 2^8 - 1
+				);
 				break;
 			case 'blob':
 			case 'text':
-				return 65535; // 2^16 - 1
+				return array(
+					'type'   => 'byte',
+					'length' => 65535,      // 2^16 - 1
+				);
 				break;
 			case 'mediumblob':
 			case 'mediumtext':
-				return 16777215; // 2^24 - 1
+				return array(
+					'type'   => 'byte',
+					'length' => 16777215,   // 2^24 - 1
+				);
 				break;
 			case 'longblob':
 			case 'longtext':
-				return 4294967295; // 2^32 - 1
+				return array(
+					'type'   => 'byte',
+					'length' => 4294967295, // 2^32 - 1
+				);
 				break;
 			default:
 				return false;
@ -2546,50 +2559,55 @@ class wpdb {
 	 */
 		// If any of the columns don't have one of these collations, it needs more sanity checking.
 	protected function strip_invalid_text( $data ) {
-		// Some multibyte character sets that we can check in PHP.
-		$mb_charsets = array(
-			'ascii'   => 'ASCII',
-			'big5'    => 'BIG-5',
-			'eucjpms' => 'eucJP-win',
-			'gb2312'  => 'EUC-CN',
-			'ujis'    => 'EUC-JP',
-			'utf32'   => 'UTF-32',
-		);
-
-		$supported_charsets = array();
-		if ( function_exists( 'mb_list_encodings' ) ) {
-			$supported_charsets = mb_list_encodings();
-		}
-
 		$db_check_string = false;

 		foreach ( $data as &$value ) {
 			$charset = $value['charset'];

-			// Column isn't a string, or is latin1, which will will happily store anything.
-			if ( false === $charset || 'latin1' === $charset ) {
+			if ( is_array( $value['length'] ) ) {
+				$length = $value['length']['length'];
+			} else {
+				$length = false;
+			}
+
+			// There's no charset to work with.
+			if ( false === $charset ) {
 				continue;
 			}

+			// Column isn't a string.
 			if ( ! is_string( $value['value'] ) ) {
 				continue;
 			}

-			// ASCII is always OK.
-			if ( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) ) {
-				continue;
+			$truncate_by_byte_length = 'byte' === $value['length']['type'];
+
+			$needs_validation = true;
+			if (
+				// latin1 can store any byte sequence
+				'latin1' === $charset
+			||
+				// ASCII is always OK.
+				( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) )
+			) {
+				$truncate_by_byte_length = true;
+				$needs_validation = false;
 			}

-			// Convert the text locally.
-			if ( $supported_charsets ) {
-				if ( isset( $mb_charsets[ $charset ] ) && in_array( $mb_charsets[ $charset ], $supported_charsets ) ) {
-					$value['value'] = mb_convert_encoding( $value['value'], $mb_charsets[ $charset ], $mb_charsets[ $charset ] );
+			if ( $truncate_by_byte_length ) {
+				mbstring_binary_safe_encoding();
+				if ( false !== $length && strlen( $value['value'] ) > $length ) {
+					$value['value'] = substr( $value['value'], 0, $length );
+				}
+				reset_mbstring_encoding();
+
+				if ( ! $needs_validation ) {
 					continue;
 				}
 			}

 			// utf8 can be handled by regex, which is a bunch faster than a DB lookup.
-			if ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) {
+			if ( ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) && function_exists( 'mb_strlen' ) ) {
 				$regex = '/
 					(
 						(?: [\x00-\x7F]                  # single-byte sequences   0xxxxxxx
@ -2599,7 +2617,7 @@ class wpdb {
 						|   \xED[\x80-\x9F][\x80-\xBF]
 						|   [\xEE-\xEF][\x80-\xBF]{2}';

-				if ( 'utf8mb4' === $charset) {
+				if ( 'utf8mb4' === $charset ) {
 					$regex .= '
 						|    \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
 						|    [\xF1-\xF3][\x80-\xBF]{3}
@ -2612,6 +2630,11 @@ class wpdb {
 					| .                                  # anything else
 					/x';
 				$value['value'] = preg_replace( $regex, '$1', $value['value'] );
+
+
+				if ( false !== $length && mb_strlen( $value['value'], 'UTF-8' ) > $length ) {
+					$value['value'] = mb_substr( $value['value'], 0, $length, 'UTF-8' );
+				}
 				continue;
 			}

@ -2628,8 +2651,14 @@ class wpdb {
 						$queries[ $value['charset'] ] = array();
 					}

-					// Split the CONVERT() calls by charset, so we can make sure the connection is right
-					$queries[ $value['charset'] ][ $col ] = $this->prepare( "CONVERT( %s USING {$value['charset']} )", $value['value'] );
+					// We're going to need to truncate by characters or bytes, depending on the length value we have.
+					if ( 'byte' === $value['length']['type'] ) {
+						// Split the CONVERT() calls by charset, so we can make sure the connection is right
+						$queries[ $value['charset'] ][ $col ] = $this->prepare( "CONVERT( LEFT( CONVERT( %s USING binary ), %d ) USING {$value['charset']} )", $value['value'], $value['length']['length'] );
+					} else {
+						$queries[ $value['charset'] ][ $col ] = $this->prepare( "LEFT( CONVERT( %s USING {$value['charset']} ), %d )", $value['value'], $value['length']['length'] );
+					}
+
 					unset( $data[ $col ]['db'] );
 				}
 			}
@ -2648,16 +2677,19 @@ class wpdb {

 				$this->check_current_query = false;

-				$row = $this->get_row( "SELECT " . implode( ', ', $query ), ARRAY_N );
+				$sql = array();
+				foreach ( $query as $column => $column_query ) {
+					$sql[] = $column_query . " AS x_$column";
+				}
+
+				$row = $this->get_row( "SELECT " . implode( ', ', $sql ), ARRAY_A );
 				if ( ! $row ) {
 					$this->set_charset( $this->dbh, $connection_charset );
 					return new WP_Error( 'wpdb_strip_invalid_text_failure' );
 				}

-				$cols = array_keys( $query );
-				$col_count = count( $cols );
-				for ( $ii = 0; $ii < $col_count; $ii++ ) {
-					$data[ $cols[ $ii ] ]['value'] = $row[ $ii ];
+				foreach ( array_keys( $query ) as $column ) {
+					$data[ $column ]['value'] = $row["x_$column"];
 				}
 			}

@ -2699,6 +2731,7 @@ class wpdb {
 			'value'   => $query,
 			'charset' => $charset,
 			'ascii'   => false,
+			'length'  => false,
 		);

 		$data = $this->strip_invalid_text( array( $data ) );
@ -2721,7 +2754,7 @@ class wpdb {
 	 * @return string|WP_Error The converted string, or a `WP_Error` object if the conversion fails.
 	 */
 	public function strip_invalid_text_for_column( $table, $column, $value ) {
-		if ( ! is_string( $value ) || $this->check_ascii( $value ) ) {
+		if ( ! is_string( $value ) ) {
 			return $value;
 		}

@ -2738,7 +2771,7 @@ class wpdb {
 			$column => array(
 				'value'   => $value,
 				'charset' => $charset,
-				'ascii'   => false,
+				'length'  => $this->get_col_length( $table, $column ),
 			)
 		);