Normalize UTF-8 charset slug detection.

There are several existing places in Core that attempt to detect if a blog charset
is UTF-8. Each place attempts to perform the same check, except the logic is
spread throughout and there's no single method provided to make this
determination in a consistent way. The `_canonical_charset()` method exists,
but is marked for private use.

In this patch the new `unicode` module provides `is_utf8_charset()` as a method
taking an optional charset slug and indicating if it represents UTF-8,
examining all of the allowable variants of that slug. Associated code is
updated to use this new function, including `_canonical_charset()`. If no slug
is provided, it will look up the current `get_option( 'blog_charset' )`.

Finally, the test functions governing `_canonical_charset()` have been
rewritten as a single test with a data provider instead of as separate test
functions.

Developed in https://github.com/WordPress/wordpress-develop/pull/6535
Discussed in https://core.trac.wordpress.org/ticket/61182

Fixes #61182.
Props dmsnell, jonsurrell.

Built from https://develop.svn.wordpress.org/trunk@58147


git-svn-id: http://core.svn.wordpress.org/trunk@57612 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
dmsnell 2024-05-14 18:05:11 +00:00
parent 79045fa10e
commit d30cd41ed4
7 changed files with 23 additions and 24 deletions

View File

@ -64,7 +64,7 @@ require_once ABSPATH . 'wp-admin/admin-header.php';
<?php
settings_fields( 'reading' );
if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
if ( ! is_utf8_charset() ) {
add_settings_field( 'blog_charset', __( 'Encoding for pages and feeds' ), 'options_reading_blog_charset', 'reading', 'default', array( 'label_for' => 'blog_charset' ) );
}
?>

View File

@ -160,7 +160,7 @@ $allowed_options['privacy'] = array();
$mail_options = array( 'mailserver_url', 'mailserver_port', 'mailserver_login', 'mailserver_pass' );
if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
if ( ! is_utf8_charset() ) {
$allowed_options['reading'][] = 'blog_charset';
}

View File

@ -91,7 +91,7 @@ function _mb_substr( $str, $start, $length = null, $encoding = null ) {
* The solution below works only for UTF-8, so in case of a different
* charset just use built-in substr().
*/
if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
if ( ! is_utf8_charset( $encoding ) ) {
return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
}
@ -176,7 +176,7 @@ function _mb_strlen( $str, $encoding = null ) {
* The solution below works only for UTF-8, so in case of a different charset
* just use built-in strlen().
*/
if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
if ( ! is_utf8_charset( $encoding ) ) {
return strlen( $str );
}

View File

@ -960,19 +960,7 @@ function _wp_specialchars( $text, $quote_style = ENT_NOQUOTES, $charset = false,
$quote_style = ENT_QUOTES;
}
// Store the site charset as a static to avoid multiple calls to wp_load_alloptions().
if ( ! $charset ) {
static $_charset = null;
if ( ! isset( $_charset ) ) {
$alloptions = wp_load_alloptions();
$_charset = isset( $alloptions['blog_charset'] ) ? $alloptions['blog_charset'] : '';
}
$charset = $_charset;
}
if ( in_array( $charset, array( 'utf8', 'utf-8', 'UTF8' ), true ) ) {
$charset = 'UTF-8';
}
$charset = _canonical_charset( $charset ? $charset : get_option( 'blog_charset' ) );
$_quote_style = $quote_style;
@ -1114,7 +1102,7 @@ function wp_check_invalid_utf8( $text, $strip = false ) {
// Store the site charset as a static to avoid multiple calls to get_option().
static $is_utf8 = null;
if ( ! isset( $is_utf8 ) ) {
$is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true );
$is_utf8 = is_utf8_charset();
}
if ( ! $is_utf8 ) {
return $text;

View File

@ -7474,17 +7474,27 @@ function get_tag_regex( $tag ) {
*
* @see https://core.trac.wordpress.org/ticket/23688
*
* @param string $charset A charset name.
* @param string $charset A charset name, e.g. "UTF-8", "Windows-1252", "SJIS".
* @return string The canonical form of the charset.
*/
function _canonical_charset( $charset ) {
if ( 'utf-8' === strtolower( $charset ) || 'utf8' === strtolower( $charset ) ) {
if ( is_utf8_charset( $charset ) ) {
return 'UTF-8';
}
if ( 'iso-8859-1' === strtolower( $charset ) || 'iso8859-1' === strtolower( $charset ) ) {
/*
* Normalize the ISO-8859-1 family of languages.
*
* This is not required for htmlspecialchars(), as it properly recognizes all of
* the input character sets that here are transformed into "ISO-8859-1".
*
* @todo Should this entire check be removed since it's not required for the stated purpose?
* @todo Should WordPress transform other potential charset equivalents, such as "latin1"?
*/
if (
( 0 === strcasecmp( 'iso-8859-1', $charset ) ) ||
( 0 === strcasecmp( 'iso8859-1', $charset ) )
) {
return 'ISO-8859-1';
}

View File

@ -16,7 +16,7 @@
*
* @global string $wp_version
*/
$wp_version = '6.6-alpha-58146';
$wp_version = '6.6-alpha-58147';
/**
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.

View File

@ -106,6 +106,7 @@ if ( WP_CACHE && apply_filters( 'enable_loading_advanced_cache_dropin', true ) &
wp_set_lang_dir();
// Load early WordPress files.
require ABSPATH . WPINC . '/unicode.php';
require ABSPATH . WPINC . '/class-wp-list-util.php';
require ABSPATH . WPINC . '/formatting.php';
require ABSPATH . WPINC . '/meta.php';