From 364886a5be7a05004734ccc75908946b8aadaaa0 Mon Sep 17 00:00:00 2001 From: Gary Pendergast Date: Wed, 6 May 2015 03:00:25 +0000 Subject: [PATCH] WPDB: When checking that a string can be sent to MySQL, we shouldn't use `mb_convert_encoding()`, as it behaves differently to MySQL's character encoding conversion. Props mdawaffe, pento, nbachiyski, jorbin, johnjamesjacoby, jeremyfelt. See #32165. Built from https://develop.svn.wordpress.org/trunk@32364 git-svn-id: http://core.svn.wordpress.org/trunk@32335 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- wp-admin/includes/upgrade.php | 32 ++++++-- wp-includes/comment.php | 31 +++++--- wp-includes/compat.php | 122 ++++++++++++++++++++++++++--- wp-includes/version.php | 4 +- wp-includes/wp-db.php | 143 +++++++++++++++++++++------------- 5 files changed, 244 insertions(+), 88 deletions(-) diff --git a/wp-admin/includes/upgrade.php b/wp-admin/includes/upgrade.php index 146d6a8750..ece8daf349 100644 --- a/wp-admin/includes/upgrade.php +++ b/wp-admin/includes/upgrade.php @@ -527,7 +527,7 @@ function upgrade_all() { if ( $wp_current_db_version < 31351 ) upgrade_420(); - if ( $wp_current_db_version < 32308 ) + if ( $wp_current_db_version < 32364 ) upgrade_430(); maybe_disable_link_manager(); @@ -1446,17 +1446,33 @@ function upgrade_420() { function upgrade_430() { global $wp_current_db_version, $wpdb; - if ( $wp_current_db_version < 32308 ) { + if ( $wp_current_db_version < 32364 ) { $content_length = $wpdb->get_col_length( $wpdb->comments, 'comment_content' ); - if ( ! $content_length ) { - $content_length = 65535; + if ( false === $content_length ) { + $content_length = array( + 'type' => 'byte', + 'length' => 65535, + ); + } elseif ( ! is_array( $content_length ) ) { + $length = (int) $content_length > 0 ? (int) $content_length : 65535; + $content_length = array( + 'type' => 'byte', + 'length' => $length + ); } + if ( 'byte' !== $content_length['type'] ) { + // Sites with malformed DB schemas are on their own. + return; + } + + $allowed_length = intval( $content_length['length'] ) - 10; + $comments = $wpdb->get_results( - "SELECT comment_ID FROM $wpdb->comments - WHERE comment_date_gmt > '2015-04-26' - AND CHAR_LENGTH( comment_content ) >= $content_length - AND ( comment_content LIKE '%<%' OR comment_content LIKE '%>%' )" + "SELECT `comment_ID` FROM `{$wpdb->comments}` + WHERE `comment_date_gmt` > '2015-04-26' + AND LENGTH( `comment_content` ) >= {$allowed_length} + AND ( `comment_content` LIKE '%<%' OR `comment_content` LIKE '%>%' )" ); foreach ( $comments as $comment ) { diff --git a/wp-includes/comment.php b/wp-includes/comment.php index 75c77b1d91..e2ed55386a 100644 --- a/wp-includes/comment.php +++ b/wp-includes/comment.php @@ -2118,17 +2118,7 @@ function wp_insert_comment( $commentdata ) { $compacted = compact( 'comment_post_ID', 'comment_author', 'comment_author_email', 'comment_author_url', 'comment_author_IP', 'comment_date', 'comment_date_gmt', 'comment_content', 'comment_karma', 'comment_approved', 'comment_agent', 'comment_type', 'comment_parent', 'user_id' ); if ( ! $wpdb->insert( $wpdb->comments, $compacted ) ) { - $fields = array( 'comment_author', 'comment_author_email', 'comment_author_url', 'comment_content' ); - - foreach( $fields as $field ) { - if ( isset( $compacted[ $field ] ) ) { - $compacted[ $field ] = $wpdb->strip_invalid_text_for_column( $wpdb->comments, $field, $compacted[ $field ] ); - } - } - - if ( ! $wpdb->insert( $wpdb->comments, $compacted ) ) { - return false; - } + return false; } $id = (int) $wpdb->insert_id; @@ -2252,6 +2242,8 @@ function wp_throttle_comment_flood($block, $time_lastcomment, $time_newcomment) * @return int|bool The ID of the comment on success, false on failure. */ function wp_new_comment( $commentdata ) { + global $wpdb; + if ( isset( $commentdata['user_ID'] ) ) { $commentdata['user_id'] = $commentdata['user_ID'] = (int) $commentdata['user_ID']; } @@ -2295,7 +2287,22 @@ function wp_new_comment( $commentdata ) { $comment_ID = wp_insert_comment($commentdata); if ( ! $comment_ID ) { - return false; + $fields = array( 'comment_author', 'comment_author_email', 'comment_author_url', 'comment_content' ); + + foreach( $fields as $field ) { + if ( isset( $commentdata[ $field ] ) ) { + $commentdata[ $field ] = $wpdb->strip_invalid_text_for_column( $wpdb->comments, $field, $commentdata[ $field ] ); + } + } + + $commentdata = wp_filter_comment( $commentdata ); + + $commentdata['comment_approved'] = wp_allow_comment( $commentdata ); + + $comment_ID = wp_insert_comment( $commentdata ); + if ( ! $comment_ID ) { + return false; + } } /** diff --git a/wp-includes/compat.php b/wp-includes/compat.php index 43667053f1..4317eb689b 100644 --- a/wp-includes/compat.php +++ b/wp-includes/compat.php @@ -13,23 +13,85 @@ if ( !function_exists('_') ) { } } +/** + * Returns whether PCRE/u (PCRE_UTF8 modifier) is available for use. + * + * @ignore + * @since 4.2.2 + * @access private + * + * @param bool $set - Used for testing only + * null : default - get PCRE/u capability + * false : Used for testing - return false for future calls to this function + * 'reset': Used for testing - restore default behavior of this function + */ +function _wp_can_use_pcre_u( $set = null ) { + static $utf8_pcre = 'reset'; + + if ( null !== $set ) { + $utf8_pcre = $set; + } + + if ( 'reset' === $utf8_pcre ) { + $utf8_pcre = @preg_match( '/^./u', 'a' ); + } + + return $utf8_pcre; +} + if ( ! function_exists( 'mb_substr' ) ) : function mb_substr( $str, $start, $length = null, $encoding = null ) { return _mb_substr( $str, $start, $length, $encoding ); } endif; +/* + * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. + * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence. + * The behavior of this function for invalid inputs is undefined. + */ function _mb_substr( $str, $start, $length = null, $encoding = null ) { + if ( null === $encoding ) { + $encoding = get_option( 'blog_charset' ); + } + // The solution below works only for UTF-8, // so in case of a different charset just use built-in substr() - $charset = get_option( 'blog_charset' ); - if ( ! in_array( $charset, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { + if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length ); } - // Use the regex unicode support to separate the UTF-8 characters into an array - preg_match_all( '/./us', $str, $match ); - $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length ); - return implode( '', $chars ); + + if ( _wp_can_use_pcre_u() ) { + // Use the regex unicode support to separate the UTF-8 characters into an array + preg_match_all( '/./us', $str, $match ); + $chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length ); + return implode( '', $chars ); + } + + $regex = '/( + [\x00-\x7F] # single-byte sequences 0xxxxxxx + | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx + | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 + | [\xE1-\xEC][\x80-\xBF]{2} + | \xED[\x80-\x9F][\x80-\xBF] + | [\xEE-\xEF][\x80-\xBF]{2} + | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 + | [\xF1-\xF3][\x80-\xBF]{3} + | \xF4[\x80-\x8F][\x80-\xBF]{2} + )/x'; + + $chars = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop + do { + // We had some string left over from the last round, but we counted it in that last round. + array_pop( $chars ); + + // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) + $pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); + + $chars = array_merge( $chars, $pieces ); + } while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. + + return join( '', array_slice( $chars, $start, $length ) ); } if ( ! function_exists( 'mb_strlen' ) ) : @@ -38,16 +100,54 @@ if ( ! function_exists( 'mb_strlen' ) ) : } endif; +/* + * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit. + * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence. + * The behavior of this function for invalid inputs is undefined. + */ function _mb_strlen( $str, $encoding = null ) { + if ( null === $encoding ) { + $encoding = get_option( 'blog_charset' ); + } + // The solution below works only for UTF-8, // so in case of a different charset just use built-in strlen() - $charset = get_option( 'blog_charset' ); - if ( ! in_array( $charset, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { + if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) { return strlen( $str ); } - // Use the regex unicode support to separate the UTF-8 characters into an array - preg_match_all( '/./us', $str, $match ); - return count( $match[0] ); + + if ( _wp_can_use_pcre_u() ) { + // Use the regex unicode support to separate the UTF-8 characters into an array + preg_match_all( '/./us', $str, $match ); + return count( $match[0] ); + } + + $regex = '/(?: + [\x00-\x7F] # single-byte sequences 0xxxxxxx + | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx + | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 + | [\xE1-\xEC][\x80-\xBF]{2} + | \xED[\x80-\x9F][\x80-\xBF] + | [\xEE-\xEF][\x80-\xBF]{2} + | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 + | [\xF1-\xF3][\x80-\xBF]{3} + | \xF4[\x80-\x8F][\x80-\xBF]{2} + )/x'; + + $count = 1; // Start at 1 instead of 0 since the first thing we do is decrement + do { + // We had some string left over from the last round, but we counted it in that last round. + $count--; + + // Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string) + $pieces = preg_split( $regex, $str, 1000 ); + + // Increment + $count += count( $pieces ); + } while ( $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop. + + // Fencepost: preg_split() always returns one extra item in the array + return --$count; } if ( !function_exists('hash_hmac') ): diff --git a/wp-includes/version.php b/wp-includes/version.php index acedc6e2c9..a026def48b 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -4,14 +4,14 @@ * * @global string $wp_version */ -$wp_version = '4.3-alpha-32359'; +$wp_version = '4.3-alpha-32364'; /** * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema. * * @global int $wp_db_version */ -$wp_db_version = 32308; +$wp_db_version = 32364; /** * Holds the TinyMCE version diff --git a/wp-includes/wp-db.php b/wp-includes/wp-db.php index 0bd7a1882b..4be05e6ec6 100644 --- a/wp-includes/wp-db.php +++ b/wp-includes/wp-db.php @@ -1809,6 +1809,8 @@ class wpdb { * @return int|false The number of rows affected, or false on error. */ function _insert_replace_helper( $table, $data, $format = null, $type = 'INSERT' ) { + $this->insert_id = 0; + if ( ! in_array( strtoupper( $type ), array( 'REPLACE', 'INSERT' ) ) ) { return false; } @@ -1829,7 +1831,6 @@ class wpdb { $sql = "$type INTO `$table` ($fields) VALUES ($formats)"; - $this->insert_id = 0; $this->check_current_query = false; return $this->query( $this->prepare( $sql, $values ) ); } @@ -2021,17 +2022,11 @@ class wpdb { // We can skip this field if we know it isn't a string. // This checks %d/%f versus ! %s because it's sprintf() could take more. $value['charset'] = false; - } elseif ( $this->check_ascii( $value['value'] ) ) { - // If it's ASCII, then we don't need the charset. We can skip this field. - $value['charset'] = false; } else { $value['charset'] = $this->get_col_charset( $table, $field ); if ( is_wp_error( $value['charset'] ) ) { return false; } - - // This isn't ASCII. Don't have strip_invalid_text() re-check. - $value['ascii'] = false; } $data[ $field ] = $value; @@ -2064,10 +2059,6 @@ class wpdb { } } - if ( false !== $value['length'] && mb_strlen( $value['value'] ) > $value['length'] ) { - return false; - } - $data[ $field ] = $value; } @@ -2406,14 +2397,16 @@ class wpdb { /** * Retrieve the maximum string length allowed in a given column. + * The length may either be specified as a byte length or a character length. * * @since 4.2.1 * @access public * * @param string $table Table name. * @param string $column Column name. - * @return mixed Max column length as an int. False if the column has no - * length. WP_Error object if there was an error. + * @return mixed array( 'length' => (int), 'type' => 'byte' | 'char' ) + * false if the column has no length (for example, numeric column) + * WP_Error object if there was an error. */ public function get_col_length( $table, $column ) { $tablekey = strtolower( $table ); @@ -2446,27 +2439,47 @@ class wpdb { } switch( $type ) { - case 'binary': case 'char': - case 'varbinary': case 'varchar': - return $length; + return array( + 'type' => 'char', + 'length' => (int) $length, + ); + break; + case 'binary': + case 'varbinary': + return array( + 'type' => 'byte', + 'length' => (int) $length, + ); break; case 'tinyblob': case 'tinytext': - return 255; // 2^8 - 1 + return array( + 'type' => 'byte', + 'length' => 255, // 2^8 - 1 + ); break; case 'blob': case 'text': - return 65535; // 2^16 - 1 + return array( + 'type' => 'byte', + 'length' => 65535, // 2^16 - 1 + ); break; case 'mediumblob': case 'mediumtext': - return 16777215; // 2^24 - 1 + return array( + 'type' => 'byte', + 'length' => 16777215, // 2^24 - 1 + ); break; case 'longblob': case 'longtext': - return 4294967295; // 2^32 - 1 + return array( + 'type' => 'byte', + 'length' => 4294967295, // 2^32 - 1 + ); break; default: return false; @@ -2572,50 +2585,55 @@ class wpdb { * remove invalid characters, a WP_Error object is returned. */ protected function strip_invalid_text( $data ) { - // Some multibyte character sets that we can check in PHP. - $mb_charsets = array( - 'ascii' => 'ASCII', - 'big5' => 'BIG-5', - 'eucjpms' => 'eucJP-win', - 'gb2312' => 'EUC-CN', - 'ujis' => 'EUC-JP', - 'utf32' => 'UTF-32', - ); - - $supported_charsets = array(); - if ( function_exists( 'mb_list_encodings' ) ) { - $supported_charsets = mb_list_encodings(); - } - $db_check_string = false; foreach ( $data as &$value ) { $charset = $value['charset']; - // Column isn't a string, or is latin1, which will will happily store anything. - if ( false === $charset || 'latin1' === $charset ) { + if ( is_array( $value['length'] ) ) { + $length = $value['length']['length']; + } else { + $length = false; + } + + // There's no charset to work with. + if ( false === $charset ) { continue; } + // Column isn't a string. if ( ! is_string( $value['value'] ) ) { continue; } - // ASCII is always OK. - if ( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) ) { - continue; + $truncate_by_byte_length = 'byte' === $value['length']['type']; + + $needs_validation = true; + if ( + // latin1 can store any byte sequence + 'latin1' === $charset + || + // ASCII is always OK. + ( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) ) + ) { + $truncate_by_byte_length = true; + $needs_validation = false; } - // Convert the text locally. - if ( $supported_charsets ) { - if ( isset( $mb_charsets[ $charset ] ) && in_array( $mb_charsets[ $charset ], $supported_charsets ) ) { - $value['value'] = mb_convert_encoding( $value['value'], $mb_charsets[ $charset ], $mb_charsets[ $charset ] ); + if ( $truncate_by_byte_length ) { + mbstring_binary_safe_encoding(); + if ( false !== $length && strlen( $value['value'] ) > $length ) { + $value['value'] = substr( $value['value'], 0, $length ); + } + reset_mbstring_encoding(); + + if ( ! $needs_validation ) { continue; } } // utf8 can be handled by regex, which is a bunch faster than a DB lookup. - if ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) { + if ( ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) && function_exists( 'mb_strlen' ) ) { $regex = '/ ( (?: [\x00-\x7F] # single-byte sequences 0xxxxxxx @@ -2625,7 +2643,7 @@ class wpdb { | \xED[\x80-\x9F][\x80-\xBF] | [\xEE-\xEF][\x80-\xBF]{2}'; - if ( 'utf8mb4' === $charset) { + if ( 'utf8mb4' === $charset ) { $regex .= ' | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 | [\xF1-\xF3][\x80-\xBF]{3} @@ -2638,6 +2656,11 @@ class wpdb { | . # anything else /x'; $value['value'] = preg_replace( $regex, '$1', $value['value'] ); + + + if ( false !== $length && mb_strlen( $value['value'], 'UTF-8' ) > $length ) { + $value['value'] = mb_substr( $value['value'], 0, $length, 'UTF-8' ); + } continue; } @@ -2654,8 +2677,14 @@ class wpdb { $queries[ $value['charset'] ] = array(); } - // Split the CONVERT() calls by charset, so we can make sure the connection is right - $queries[ $value['charset'] ][ $col ] = $this->prepare( "CONVERT( %s USING {$value['charset']} )", $value['value'] ); + // We're going to need to truncate by characters or bytes, depending on the length value we have. + if ( 'byte' === $value['length']['type'] ) { + // Split the CONVERT() calls by charset, so we can make sure the connection is right + $queries[ $value['charset'] ][ $col ] = $this->prepare( "CONVERT( LEFT( CONVERT( %s USING binary ), %d ) USING {$value['charset']} )", $value['value'], $value['length']['length'] ); + } else { + $queries[ $value['charset'] ][ $col ] = $this->prepare( "LEFT( CONVERT( %s USING {$value['charset']} ), %d )", $value['value'], $value['length']['length'] ); + } + unset( $data[ $col ]['db'] ); } } @@ -2674,16 +2703,19 @@ class wpdb { $this->check_current_query = false; - $row = $this->get_row( "SELECT " . implode( ', ', $query ), ARRAY_N ); + $sql = array(); + foreach ( $query as $column => $column_query ) { + $sql[] = $column_query . " AS x_$column"; + } + + $row = $this->get_row( "SELECT " . implode( ', ', $sql ), ARRAY_A ); if ( ! $row ) { $this->set_charset( $this->dbh, $connection_charset ); return new WP_Error( 'wpdb_strip_invalid_text_failure' ); } - $cols = array_keys( $query ); - $col_count = count( $cols ); - for ( $ii = 0; $ii < $col_count; $ii++ ) { - $data[ $cols[ $ii ] ]['value'] = $row[ $ii ]; + foreach ( array_keys( $query ) as $column ) { + $data[ $column ]['value'] = $row["x_$column"]; } } @@ -2725,6 +2757,7 @@ class wpdb { 'value' => $query, 'charset' => $charset, 'ascii' => false, + 'length' => false, ); $data = $this->strip_invalid_text( array( $data ) ); @@ -2747,7 +2780,7 @@ class wpdb { * @return string|WP_Error The converted string, or a WP_Error object if the conversion fails. */ public function strip_invalid_text_for_column( $table, $column, $value ) { - if ( ! is_string( $value ) || $this->check_ascii( $value ) ) { + if ( ! is_string( $value ) ) { return $value; } @@ -2764,7 +2797,7 @@ class wpdb { $column => array( 'value' => $value, 'charset' => $charset, - 'ascii' => false, + 'length' => $this->get_col_length( $table, $column ), ) );