Formatting: Account for HTML entities in wp_extract_urls().

Prevent `wp_extract_urls()` from trimming HTML entities within URLs. Correctly escaped URLs such as `https://youtube.com/watch?v=dQw4w9WgXcQ&amp;t=1` will now be extracted as `https://youtube.com/watch?v=dQw4w9WgXcQ&t=1` rather than being truncated at the `;` that terminates the entity. Props trex005, voldemortensen, johnbillion, ironprogrammer, costdev, hellofromtonya. Fixes #30580. Built from https://develop.svn.wordpress.org/trunk@53044 git-svn-id: http://core.svn.wordpress.org/trunk@52633 1a063a9b-81f0-0310-95a4-ce76da25c4cd
2024-06-26 06:45:07 +02:00 · 2022-04-01 03:40:02 +00:00 · 2022-04-01 03:40:02 +00:00 · 33b6697191
commit 33b6697191
parent be3aa9a149
2 changed files with 14 additions and 3 deletions
--- a/wp-includes/functions.php
+++ b/wp-includes/functions.php
@@ -820,6 +820,7 @@ function xmlrpc_removepostdata( $content ) {
 * Use RegEx to extract URLs from arbitrary content.
 *
 * @since 3.7.0
+ * @since 6.0.0 Fixes support for HTML entities (Trac 30580).
 *
 * @param string $content Content to extract URLs from.
 * @return string[] Array of URLs found in passed string.
@@ -833,7 +834,7 @@ function wp_extract_urls( $content ) {
 			. '(?:'
 				. '\([\w\d]+\)|'
 				. '(?:'
-					. "[^`!()\[\]{};:'\".,<>«»“”‘’\s]|"
+					. "[^`!()\[\]{}:'\".,<>«»“”‘’\s]|"
 					. '(?:[:]\d+)?/?'
 				. ')+'
 			. ')'
@@ -842,7 +843,17 @@ function wp_extract_urls( $content ) {
 		$post_links
 	);

-	$post_links = array_unique( array_map( 'html_entity_decode', $post_links[2] ) );
+	$post_links = array_unique(
+		array_map(
+			static function( $link ) {
+				// Decode to replace valid entities, like &amp;.
+				$link = html_entity_decode( $link );
+				// Maintain backward compatibility by removing extraneous semi-colons (`;`).
+				return str_replace( ';', '', $link );
+			},
+			$post_links[2]
+		)
+	);

 	return array_values( $post_links );
 }
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@@ -16,7 +16,7 @@
 *
 * @global string $wp_version
 */
-$wp_version = '6.0-alpha-53043';
+$wp_version = '6.0-alpha-53044';

 /**
 * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.