From 1c86df8bbfe6ea955ae087615ee4304cf07d2609 Mon Sep 17 00:00:00 2001 From: Michael Adams Date: Wed, 6 May 2015 19:12:16 +0000 Subject: [PATCH] WPDB: When checking that a string can be sent to MySQL, we shouldn't use `mb_convert_encoding()`, as it behaves differently to MySQL's character encoding conversion. Merge of [32364] to the 3.9 branch. Props mdawaffe, pento, nbachiyski, jorbin, johnjamesjacoby, jeremyfelt. See #32165. Built from https://develop.svn.wordpress.org/branches/3.9@32389 git-svn-id: http://core.svn.wordpress.org/branches/3.9@32359 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- wp-admin/includes/upgrade.php | 42 +++++++--- wp-includes/compat.php | 144 +++++++++++++++++++++++++++++++--- wp-includes/version.php | 2 +- wp-includes/wp-db.php | 143 ++++++++++++++++++++------------- 4 files changed, 253 insertions(+), 78 deletions(-) diff --git a/wp-admin/includes/upgrade.php b/wp-admin/includes/upgrade.php index 2f4f53144a..59bec0a989 100644 --- a/wp-admin/includes/upgrade.php +++ b/wp-admin/includes/upgrade.php @@ -430,8 +430,8 @@ function upgrade_all() { if ( $wp_current_db_version < 26691 ) upgrade_380(); - if ( $wp_current_db_version < 27917 ) - upgrade_396(); + if ( $wp_current_db_version < 27918 ) + upgrade_397(); maybe_disable_link_manager(); @@ -1280,19 +1280,43 @@ function upgrade_380() { * @since 3.9.6 */ function upgrade_396() { +} + +/** + * Execute changes made in WordPress 3.9.7. + * + * @since 3.9.7 + */ +function upgrade_397() { global $wp_current_db_version, $wpdb; - if ( $wp_current_db_version < 27917 ) { + if ( $wp_current_db_version < 27918 ) { $content_length = $wpdb->get_col_length( $wpdb->comments, 'comment_content' ); - if ( ! $content_length ) { - $content_length = 65535; + if ( false === $content_length ) { + $content_length = array( + 'type' => 'byte', + 'length' => 65535, + ); + } elseif ( ! is_array( $content_length ) ) { + $length = (int) $content_length > 0 ? (int) $content_length : 65535; + $content_length = array( + 'type' => 'byte', + 'length' => $length + ); } + if ( 'byte' !== $content_length['type'] ) { + // Sites with malformed DB schemas are on their own. + return; + } + + $allowed_length = intval( $content_length['length'] ) - 10; + $comments = $wpdb->get_results( - "SELECT comment_ID FROM $wpdb->comments - WHERE comment_date_gmt > '2015-04-26' - AND CHAR_LENGTH( comment_content ) >= $content_length - AND ( comment_content LIKE '%<%' OR comment_content LIKE '%>%' )" + "SELECT `comment_ID` FROM `{$wpdb->comments}` + WHERE `comment_date_gmt` > '2015-04-26' + AND LENGTH( `comment_content` ) >= {$allowed_length} + AND ( `comment_content` LIKE '%<%' OR `comment_content` LIKE '%>%' )" ); foreach ( $comments as $comment ) { diff --git a/wp-includes/compat.php b/wp-includes/compat.php index 83a8c64652..1a79733d53 100644 --- a/wp-includes/compat.php +++ b/wp-includes/compat.php @@ -13,23 +13,141 @@ if ( !function_exists('_') ) { } } -if ( !function_exists('mb_substr') ): - function mb_substr( $str, $start, $length=null, $encoding=null ) { - return _mb_substr($str, $start, $length, $encoding); +/** + * Returns whether PCRE/u (PCRE_UTF8 modifier) is available for use. + * + * @ignore + * @since 4.2.2 + * @access private + * + * @param bool $set - Used for testing only + * null : default - get PCRE/u capability + * false : Used for testing - return false for future calls to this function + * 'reset': Used for testing - restore default behavior of this function + */ +function _wp_can_use_pcre_u( $set = null ) { + static $utf8_pcre = 'reset'; + + if ( null !== $set ) { + $utf8_pcre = $set; + } + + if ( 'reset' === $utf8_pcre ) { + $utf8_pcre = @preg_match( '/^./u', 'a' ); + } + + return $utf8_pcre; +} + +if ( ! function_exists( 'mb_substr' ) ) : + function mb_substr( $str, $start, $length = null, $encoding = null ) { + return _mb_substr( $str, $start, $length, $encoding ); } endif; -function _mb_substr( $str, $start, $length=null, $encoding=null ) { - // the solution below, works only for utf-8, so in case of a different - // charset, just use built-in substr - $charset = get_option( 'blog_charset' ); - if ( !in_array( $charset, array('utf8', 'utf-8', 'UTF8', 'UTF-8') ) ) { - return is_null( $length )? substr( $str, $start ) : substr( $str, $start, $length); +/* + * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. + * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence. + * The behavior of this function for invalid inputs is undefined. + */ +function _mb_substr( $str, $start, $length = null, $encoding = null ) { + if ( null === $encoding ) { + $encoding = get_option( 'blog_charset' ); } - // use the regex unicode support to separate the UTF-8 characters into an array - preg_match_all( '/./us', $str, $match ); - $chars = is_null( $length )? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length ); - return implode( '', $chars ); + + // The solution below works only for UTF-8, + // so in case of a different charset just use built-in substr() + if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { + return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length ); + } + + if ( _wp_can_use_pcre_u() ) { + // Use the regex unicode support to separate the UTF-8 characters into an array + preg_match_all( '/./us', $str, $match ); + $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length ); + return implode( '', $chars ); + } + + $regex = '/( + [\x00-\x7F] # single-byte sequences 0xxxxxxx + | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx + | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 + | [\xE1-\xEC][\x80-\xBF]{2} + | \xED[\x80-\x9F][\x80-\xBF] + | [\xEE-\xEF][\x80-\xBF]{2} + | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 + | [\xF1-\xF3][\x80-\xBF]{3} + | \xF4[\x80-\x8F][\x80-\xBF]{2} + )/x'; + + $chars = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop + do { + // We had some string left over from the last round, but we counted it in that last round. + array_pop( $chars ); + + // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) + $pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); + + $chars = array_merge( $chars, $pieces ); + } while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. + + return join( '', array_slice( $chars, $start, $length ) ); +} + +if ( ! function_exists( 'mb_strlen' ) ) : + function mb_strlen( $str, $encoding = null ) { + return _mb_strlen( $str, $encoding ); + } +endif; + +/* + * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. + * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence. + * The behavior of this function for invalid inputs is undefined. + */ +function _mb_strlen( $str, $encoding = null ) { + if ( null === $encoding ) { + $encoding = get_option( 'blog_charset' ); + } + + // The solution below works only for UTF-8, + // so in case of a different charset just use built-in strlen() + if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { + return strlen( $str ); + } + + if ( _wp_can_use_pcre_u() ) { + // Use the regex unicode support to separate the UTF-8 characters into an array + preg_match_all( '/./us', $str, $match ); + return count( $match[0] ); + } + + $regex = '/(?: + [\x00-\x7F] # single-byte sequences 0xxxxxxx + | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx + | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 + | [\xE1-\xEC][\x80-\xBF]{2} + | \xED[\x80-\x9F][\x80-\xBF] + | [\xEE-\xEF][\x80-\xBF]{2} + | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 + | [\xF1-\xF3][\x80-\xBF]{3} + | \xF4[\x80-\x8F][\x80-\xBF]{2} + )/x'; + + $count = 1; // Start at 1 instead of 0 since the first thing we do is decrement + do { + // We had some string left over from the last round, but we counted it in that last round. + $count--; + + // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) + $pieces = preg_split( $regex, $str, 1000 ); + + // Increment + $count += count( $pieces ); + } while ( $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. + + // Fencepost: preg_split() always returns one extra item in the array + return --$count; } if ( !function_exists('hash_hmac') ): diff --git a/wp-includes/version.php b/wp-includes/version.php index 7559910857..dcfe54d3ad 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -11,7 +11,7 @@ $wp_version = '3.9.6'; * * @global int $wp_db_version */ -$wp_db_version = 27917; +$wp_db_version = 27918; /** * Holds the TinyMCE version diff --git a/wp-includes/wp-db.php b/wp-includes/wp-db.php index b7f803bdca..69d9362ce2 100644 --- a/wp-includes/wp-db.php +++ b/wp-includes/wp-db.php @@ -1751,6 +1751,8 @@ class wpdb { * @return int|false The number of rows affected, or false on error. */ function _insert_replace_helper( $table, $data, $format = null, $type = 'INSERT' ) { + $this->insert_id = 0; + if ( ! in_array( strtoupper( $type ), array( 'REPLACE', 'INSERT' ) ) ) { return false; } @@ -1771,7 +1773,6 @@ class wpdb { $sql = "$type INTO `$table` ($fields) VALUES ($formats)"; - $this->insert_id = 0; $this->check_current_query = false; return $this->query( $this->prepare( $sql, $values ) ); } @@ -1968,17 +1969,11 @@ class wpdb { // We can skip this field if we know it isn't a string. // This checks %d/%f versus ! %s because it's sprintf() could take more. $value['charset'] = false; - } elseif ( $this->check_ascii( $value['value'] ) ) { - // If it's ASCII, then we don't need the charset. We can skip this field. - $value['charset'] = false; } else { $value['charset'] = $this->get_col_charset( $table, $field ); if ( is_wp_error( $value['charset'] ) ) { return false; } - - // This isn't ASCII. Don't have strip_invalid_text() re-check. - $value['ascii'] = false; } $data[ $field ] = $value; @@ -2011,10 +2006,6 @@ class wpdb { } } - if ( false !== $value['length'] && strlen( $value['value'] ) > $value['length'] ) { - return false; - } - $data[ $field ] = $value; } @@ -2343,14 +2334,16 @@ class wpdb { /** * Retrieve the maximum string length allowed in a given column. + * The length may either be specified as a byte length or a character length. * * @since 4.2.1 * @access public * * @param string $table Table name. * @param string $column Column name. - * @return mixed Max column length as an int. False if the column has no - * length. WP_Error object if there was an error. + * @return mixed array( 'length' => (int), 'type' => 'byte' | 'char' ) + * false if the column has no length (for example, numeric column) + * WP_Error object if there was an error. */ public function get_col_length( $table, $column ) { $tablekey = strtolower( $table ); @@ -2383,27 +2376,47 @@ class wpdb { } switch( $type ) { - case 'binary': case 'char': - case 'varbinary': case 'varchar': - return $length; + return array( + 'type' => 'char', + 'length' => (int) $length, + ); + break; + case 'binary': + case 'varbinary': + return array( + 'type' => 'byte', + 'length' => (int) $length, + ); break; case 'tinyblob': case 'tinytext': - return 255; // 2^8 - 1 + return array( + 'type' => 'byte', + 'length' => 255, // 2^8 - 1 + ); break; case 'blob': case 'text': - return 65535; // 2^16 - 1 + return array( + 'type' => 'byte', + 'length' => 65535, // 2^16 - 1 + ); break; case 'mediumblob': case 'mediumtext': - return 16777215; // 2^24 - 1 + return array( + 'type' => 'byte', + 'length' => 16777215, // 2^24 - 1 + ); break; case 'longblob': case 'longtext': - return 4294967295; // 2^32 - 1 + return array( + 'type' => 'byte', + 'length' => 4294967295, // 2^32 - 1 + ); break; default: return false; @@ -2510,50 +2523,55 @@ class wpdb { */ // If any of the columns don't have one of these collations, it needs more sanity checking. protected function strip_invalid_text( $data ) { - // Some multibyte character sets that we can check in PHP. - $mb_charsets = array( - 'ascii' => 'ASCII', - 'big5' => 'BIG-5', - 'eucjpms' => 'eucJP-win', - 'gb2312' => 'EUC-CN', - 'ujis' => 'EUC-JP', - 'utf32' => 'UTF-32', - ); - - $supported_charsets = array(); - if ( function_exists( 'mb_list_encodings' ) ) { - $supported_charsets = mb_list_encodings(); - } - $db_check_string = false; foreach ( $data as &$value ) { $charset = $value['charset']; - // Column isn't a string, or is latin1, which will will happily store anything. - if ( false === $charset || 'latin1' === $charset ) { + if ( is_array( $value['length'] ) ) { + $length = $value['length']['length']; + } else { + $length = false; + } + + // There's no charset to work with. + if ( false === $charset ) { continue; } + // Column isn't a string. if ( ! is_string( $value['value'] ) ) { continue; } - // ASCII is always OK. - if ( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) ) { - continue; + $truncate_by_byte_length = 'byte' === $value['length']['type']; + + $needs_validation = true; + if ( + // latin1 can store any byte sequence + 'latin1' === $charset + || + // ASCII is always OK. + ( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) ) + ) { + $truncate_by_byte_length = true; + $needs_validation = false; } - // Convert the text locally. - if ( $supported_charsets ) { - if ( isset( $mb_charsets[ $charset ] ) && in_array( $mb_charsets[ $charset ], $supported_charsets ) ) { - $value['value'] = mb_convert_encoding( $value['value'], $mb_charsets[ $charset ], $mb_charsets[ $charset ] ); + if ( $truncate_by_byte_length ) { + mbstring_binary_safe_encoding(); + if ( false !== $length && strlen( $value['value'] ) > $length ) { + $value['value'] = substr( $value['value'], 0, $length ); + } + reset_mbstring_encoding(); + + if ( ! $needs_validation ) { continue; } } // utf8 can be handled by regex, which is a bunch faster than a DB lookup. - if ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) { + if ( ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) && function_exists( 'mb_strlen' ) ) { $regex = '/ ( (?: [\x00-\x7F] # single-byte sequences 0xxxxxxx @@ -2563,7 +2581,7 @@ class wpdb { | \xED[\x80-\x9F][\x80-\xBF] | [\xEE-\xEF][\x80-\xBF]{2}'; - if ( 'utf8mb4' === $charset) { + if ( 'utf8mb4' === $charset ) { $regex .= ' | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 | [\xF1-\xF3][\x80-\xBF]{3} @@ -2576,6 +2594,11 @@ class wpdb { | . # anything else /x'; $value['value'] = preg_replace( $regex, '$1', $value['value'] ); + + + if ( false !== $length && mb_strlen( $value['value'], 'UTF-8' ) > $length ) { + $value['value'] = mb_substr( $value['value'], 0, $length, 'UTF-8' ); + } continue; } @@ -2592,8 +2615,14 @@ class wpdb { $queries[ $value['charset'] ] = array(); } - // Split the CONVERT() calls by charset, so we can make sure the connection is right - $queries[ $value['charset'] ][ $col ] = $this->prepare( "CONVERT( %s USING {$value['charset']} )", $value['value'] ); + // We're going to need to truncate by characters or bytes, depending on the length value we have. + if ( 'byte' === $value['length']['type'] ) { + // Split the CONVERT() calls by charset, so we can make sure the connection is right + $queries[ $value['charset'] ][ $col ] = $this->prepare( "CONVERT( LEFT( CONVERT( %s USING binary ), %d ) USING {$value['charset']} )", $value['value'], $value['length']['length'] ); + } else { + $queries[ $value['charset'] ][ $col ] = $this->prepare( "LEFT( CONVERT( %s USING {$value['charset']} ), %d )", $value['value'], $value['length']['length'] ); + } + unset( $data[ $col ]['db'] ); } } @@ -2612,16 +2641,19 @@ class wpdb { $this->check_current_query = false; - $row = $this->get_row( "SELECT " . implode( ', ', $query ), ARRAY_N ); + $sql = array(); + foreach ( $query as $column => $column_query ) { + $sql[] = $column_query . " AS x_$column"; + } + + $row = $this->get_row( "SELECT " . implode( ', ', $sql ), ARRAY_A ); if ( ! $row ) { $this->set_charset( $this->dbh, $connection_charset ); return new WP_Error( 'wpdb_strip_invalid_text_failure' ); } - $cols = array_keys( $query ); - $col_count = count( $cols ); - for ( $ii = 0; $ii < $col_count; $ii++ ) { - $data[ $cols[ $ii ] ]['value'] = $row[ $ii ]; + foreach ( array_keys( $query ) as $column ) { + $data[ $column ]['value'] = $row["x_$column"]; } } @@ -2663,6 +2695,7 @@ class wpdb { 'value' => $query, 'charset' => $charset, 'ascii' => false, + 'length' => false, ); $data = $this->strip_invalid_text( array( $data ) ); @@ -2685,7 +2718,7 @@ class wpdb { * @return string|WP_Error The converted string, or a `WP_Error` object if the conversion fails. */ public function strip_invalid_text_for_column( $table, $column, $value ) { - if ( ! is_string( $value ) || $this->check_ascii( $value ) ) { + if ( ! is_string( $value ) ) { return $value; } @@ -2702,7 +2735,7 @@ class wpdb { $column => array( 'value' => $value, 'charset' => $charset, - 'ascii' => false, + 'length' => $this->get_col_length( $table, $column ), ) );