Improve efficiency of make_clickable(). Props mdawaffe. Fixes #16892.

Not only does this improve general performance, but also helps to prevent
segfaults caused by malicious input to the regular expression. The regular
expression is also simplified to help readability and maintenance.


git-svn-id: http://svn.automattic.com/wordpress/trunk@19899 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
duck_ 2012-02-10 13:42:15 +00:00
parent 70a6d8fb9c
commit d8a694d8ac
1 changed files with 103 additions and 13 deletions

View File

@ -1387,9 +1387,17 @@ function antispambot($emailaddy, $mailto=0) {
*/
function _make_url_clickable_cb($matches) {
$url = $matches[2];
$suffix = '';
/** Include parentheses in the URL only if paired **/
if ( ')' == $matches[3] && strpos( $url, '(' ) ) {
// If the trailing character is a closing parethesis, and the URL has an opening parenthesis in it, add the closing parenthesis to the URL.
// Then we can let the parenthesis balancer do its thing below.
$url .= $matches[3];
$suffix = '';
} else {
$suffix = $matches[3];
}
// Include parentheses in the URL only if paired
while ( substr_count( $url, '(' ) < substr_count( $url, ')' ) ) {
$suffix = strrchr( $url, ')' ) . $suffix;
$url = substr( $url, 0, strrpos( $url, ')' ) );
@ -1458,20 +1466,102 @@ function _make_email_clickable_cb($matches) {
* @param string $ret Content to convert URIs.
* @return string Content with converted URIs.
*/
function make_clickable($ret) {
$ret = ' ' . $ret;
// in testing, using arrays here was found to be faster
$save = @ini_set('pcre.recursion_limit', 10000);
$retval = preg_replace_callback('#(?<!=[\'"])(?<=[*\')+.,;:!&$\s>])(\()?([\w]+?://(?:[\w\\x80-\\xff\#%~/?@\[\]-]{1,2000}|[\'*(+.,;:!=&$](?![\b\)]|(\))?([\s]|$))|(?(1)\)(?![\s<.,;:]|$)|\)))+)#is', '_make_url_clickable_cb', $ret);
if (null !== $retval )
$ret = $retval;
@ini_set('pcre.recursion_limit', $save);
function make_clickable( $ret ) {
// Long strings might contain expensive edge cases ...
if ( 10000 < strlen( $ret ) ) {
$r = '';
// ... break it up
foreach ( _split_str_by_whitespace( $ret, 2100 ) as $chunk ) { // 2100: Extra room for scheme and leading and trailing paretheses
if ( 2101 < strlen( $chunk ) ) {
$r .= $chunk; // Too big, no whitespace: bail.
} else {
$r .= make_clickable( $chunk );
}
}
return $r;
}
$ret = " $ret "; // Pad with whitespace to simplify the regexes
$url_clickable = '~
([\\s(<.,;:!?]) # 1: Leading whitespace, or punctuation
( # 2: URL
[\\w]{1,20}+:// # Scheme and hier-part prefix
(?=\S{1,2000}\s) # Limit to URLs less than about 2000 characters long
[\\w\\x80-\\xff#%\\~/@\\[\\]*(+=&$-]*+ # Non-punctuation URL character
(?: # Unroll the Loop: Only allow puctuation URL character if followed by a non-punctuation URL character
[\'.,;:!?)] # Punctuation URL character
[\\w\\x80-\\xff#%\\~/@\\[\\]*(+=&$-]++ # Non-punctuation URL character
)*
)
(\)?) # 3: Trailing closing parenthesis (for parethesis balancing post processing)
~xS'; // The regex is a non-anchored pattern and does not have a single fixed starting character.
// Tell PCRE to spend more time optimizing since, when used on a page load, it will probably be used several times.
$ret = preg_replace_callback( $url_clickable, '_make_url_clickable_cb', $ret );
$ret = preg_replace_callback('#([\s>])((www|ftp)\.[\w\\x80-\\xff\#$%&~/.\-;:=,?@\[\]+]+)#is', '_make_web_ftp_clickable_cb', $ret);
$ret = preg_replace_callback('#([\s>])([.0-9a-z_+-]+)@(([0-9a-z-]+\.)+[0-9a-z]{2,})#i', '_make_email_clickable_cb', $ret);
// this one is not in an array because we need it to run last, for cleanup of accidental links within links
// Cleanup of accidental links within links
$ret = preg_replace("#(<a( [^>]+?>|>))<a [^>]+?>([^>]+?)</a></a>#i", "$1$3</a>", $ret);
$ret = trim($ret);
return $ret;
return substr( $ret, 1, -1 ); // Remove our whitespace padding.
}
/**
* Breaks a string into chunks by splitting at whitespace characters.
* The length of each returned chunk is as close to the specified length goal as possible,
* with the caveat that each chunk includes its trailing delimiter.
* Chunks longer than the goal are guaranteed to not have any inner whitespace.
*
* Joining the returned chunks with empty delimiters reconstructs the input string losslessly.
*
* Input string must have no null characters (or eventual transformations on output chunks must not care about null characters)
*
* <code>
* _split_str_by_whitespace( "1234 67890 1234 67890a cd 1234 890 123456789 1234567890a 45678 1 3 5 7 90 ", 10 ) ==
* array (
* 0 => '1234 67890 ', // 11 characters: Perfect split
* 1 => '1234 ', // 5 characters: '1234 67890a' was too long
* 2 => '67890a cd ', // 10 characters: '67890a cd 1234' was too long
* 3 => '1234 890 ', // 11 characters: Perfect split
* 4 => '123456789 ', // 10 characters: '123456789 1234567890a' was too long
* 5 => '1234567890a ', // 12 characters: Too long, but no inner whitespace on which to split
* 6 => ' 45678 ', // 11 characters: Perfect split
* 7 => '1 3 5 7 9', // 9 characters: End of $string
* );
* </code>
*
* @param string $string The string to split
* @param int $goal The desired chunk length.
*
* @return array Numeric array of chunks.
*/
function _split_str_by_whitespace( $string, $goal ) {
$chunks = array();
$string_nullspace = strtr( $string, "\r\n\t\v\f ", "\000\000\000\000\000\000" );
while ( $goal < strlen( $string_nullspace ) ) {
$pos = strrpos( substr( $string_nullspace, 0, $goal + 1 ), "\000" );
if ( false === $pos ) {
$pos = strpos( $string_nullspace, "\000", $goal + 1 );
if ( false === $pos ) {
break;
}
}
$chunks[] = substr( $string, 0, $pos + 1 );
$string = substr( $string, $pos + 1 );
$string_nullspace = substr( $string_nullspace, $pos + 1 );
}
if ( $string ) {
$chunks[] = $string;
}
return $chunks;
}
/**