HTML API: Track spans of text with (offset, length) instead of (start, end).

Updates the internal representation of the text span coordinates. The mixture of (offset, length) and (start, end) coordinates becomes confusing, so this commit replaces it with an (offset, length) pair. There should be no functional or behavioral changes in this patch. For the internal helper classes this patch introduces breaking changes, but those classes are marked private and should not be used outside of the HTML API itself.

Props dmsnell.
Fixes #59993.


Built from https://develop.svn.wordpress.org/trunk@57179


git-svn-id: http://core.svn.wordpress.org/trunk@56690 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
zieladam 2023-12-10 13:19:28 +00:00
parent 6af9b0dcfe
commit 760fae6c43
5 changed files with 137 additions and 71 deletions

View File

@ -15,6 +15,7 @@
*
* @access private
* @since 6.2.0
* @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
*
* @see WP_HTML_Tag_Processor
*/
@ -23,6 +24,7 @@ class WP_HTML_Attribute_Token {
* Attribute name.
*
* @since 6.2.0
*
* @var string
*/
public $name;
@ -31,6 +33,7 @@ class WP_HTML_Attribute_Token {
* Byte offset in the input HTML where the attribute value starts.
*
* @since 6.2.0
*
* @var int
*/
public $value_starts_at;
@ -39,6 +42,7 @@ class WP_HTML_Attribute_Token {
* How many bytes the value occupies in the input HTML.
*
* @since 6.2.0
*
* @var int
*/
public $value_length;
@ -47,22 +51,43 @@ class WP_HTML_Attribute_Token {
* The string offset where the attribute name starts.
*
* @since 6.2.0
*
* @var int
*/
public $start;
/**
* The string offset after the attribute value or its name.
* Byte length of text spanning the attribute inside a tag.
*
* This span starts at the first character of the attribute name
* and it ends after one of three cases:
*
* - at the end of the attribute name for boolean attributes.
* - at the end of the value for unquoted attributes.
* - at the final single or double quote for quoted attributes.
*
* Example:
*
* <div class="post">
* ------------ length is 12, including quotes
*
* <input type="checked" checked id="selector">
* ------- length is 7
*
* <a rel=noopener>
* ------------ length is 12
*
* @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
*
* @since 6.2.0
* @var int
*/
public $end;
public $length;
/**
* Whether the attribute is a boolean attribute with value `true`.
*
* @since 6.2.0
*
* @var bool
*/
public $is_true;
@ -71,20 +96,21 @@ class WP_HTML_Attribute_Token {
* Constructor.
*
* @since 6.2.0
* @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
*
* @param string $name Attribute name.
* @param int $value_start Byte offset in the input HTML where the attribute value starts.
* @param int $value_length Number of bytes attribute value spans.
* @param int $start The string offset where the attribute name starts.
* @param int $end The string offset after the attribute value or its name.
* @param int $length Byte length of the entire attribute name or name and value pair expression.
* @param bool $is_true Whether the attribute is a boolean attribute with true value.
*/
public function __construct( $name, $value_start, $value_length, $start, $end, $is_true ) {
public function __construct( $name, $value_start, $value_length, $start, $length, $is_true ) {
$this->name = $name;
$this->value_starts_at = $value_start;
$this->value_length = $value_length;
$this->start = $start;
$this->end = $end;
$this->length = $length;
$this->is_true = $is_true;
}
}

View File

@ -18,6 +18,7 @@
*
* @access private
* @since 6.2.0
* @since 6.5.0 Replaced `end` with `length` to more closely align with `substr()`.
*
* @see WP_HTML_Tag_Processor
*/
@ -26,28 +27,30 @@ class WP_HTML_Span {
* Byte offset into document where span begins.
*
* @since 6.2.0
*
* @var int
*/
public $start;
/**
* Byte offset into document where span ends.
* Byte length of this span.
*
* @since 6.5.0
*
* @since 6.2.0
* @var int
*/
public $end;
public $length;
/**
* Constructor.
*
* @since 6.2.0
*
* @param int $start Byte offset into document where replacement span begins.
* @param int $end Byte offset into document where replacement span ends.
* @param int $start Byte offset into document where replacement span begins.
* @param int $length Byte length of span.
*/
public function __construct( $start, $end ) {
$this->start = $start;
$this->end = $end;
public function __construct( $start, $length ) {
$this->start = $start;
$this->length = $length;
}
}

View File

@ -328,6 +328,40 @@ class WP_HTML_Tag_Processor {
*/
private $bytes_already_parsed = 0;
/**
* Byte offset in input document where current token starts.
*
* Example:
*
* <div id="test">...
* 01234
* - token starts at 0
*
* @since 6.5.0
*
* @var int|null
*/
private $token_starts_at;
/**
* Byte length of current token.
*
* Example:
*
* <div id="test">...
* 012345678901234
* - token length is 14 - 0 = 14
*
* a <!-- comment --> is a token.
* 0123456789 123456789 123456789
* - token length is 17 - 2 = 15
*
* @since 6.5.0
*
* @var int|null
*/
private $token_length;
/**
* Byte offset in input document where current tag name starts.
*
@ -338,6 +372,7 @@ class WP_HTML_Tag_Processor {
* - tag name starts at 1
*
* @since 6.2.0
*
* @var int|null
*/
private $tag_name_starts_at;
@ -352,25 +387,11 @@ class WP_HTML_Tag_Processor {
* --- tag name length is 3
*
* @since 6.2.0
*
* @var int|null
*/
private $tag_name_length;
/**
* Byte offset in input document where current tag token ends.
*
* Example:
*
* <div id="test">...
* 0 1 |
* 01234567890123456
* --- tag name ends at 14
*
* @since 6.2.0
* @var int|null
*/
private $tag_ends_at;
/**
* Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>.
*
@ -388,14 +409,14 @@ class WP_HTML_Tag_Processor {
* // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8">
* // ^ parsing will continue from this point.
* $this->attributes = array(
* 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 )
* 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false )
* );
*
* // When picking up parsing again, or when asking to find the
* // `class` attribute we will continue and add to this array.
* $this->attributes = array(
* 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ),
* 'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 )
* 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ),
* 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false )
* );
*
* // Note that only the `class` attribute value is stored in the index.
@ -484,9 +505,9 @@ class WP_HTML_Tag_Processor {
*
* // Replace an attribute stored with a new value, indices
* // sourced from the lazily-parsed HTML recognizer.
* $start = $attributes['src']->start;
* $end = $attributes['src']->end;
* $modifications[] = new WP_HTML_Text_Replacement( $start, $end, $new_value );
* $start = $attributes['src']->start;
* $length = $attributes['src']->length;
* $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value );
*
* // Correspondingly, something like this will appear in this array.
* $lexical_updates = array(
@ -566,7 +587,7 @@ class WP_HTML_Tag_Processor {
if ( false === $tag_ends_at ) {
return false;
}
$this->tag_ends_at = $tag_ends_at;
$this->token_length = $tag_ends_at - $this->token_starts_at;
$this->bytes_already_parsed = $tag_ends_at;
// Finally, check if the parsed tag and its attributes match the search query.
@ -808,10 +829,7 @@ class WP_HTML_Tag_Processor {
return false;
}
$this->bookmarks[ $name ] = new WP_HTML_Span(
$this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ),
$this->tag_ends_at
);
$this->bookmarks[ $name ] = new WP_HTML_Span( $this->token_starts_at, $this->token_length );
return true;
}
@ -875,7 +893,7 @@ class WP_HTML_Tag_Processor {
while ( false !== $at && $at < $doc_length ) {
$at = strpos( $this->html, '</', $at );
// If there is no possible tag closer then fail.
// Fail if there is no possible tag closer.
if ( false === $at || ( $at + $tag_length ) >= $doc_length ) {
$this->bytes_already_parsed = $doc_length;
return false;
@ -1093,6 +1111,8 @@ class WP_HTML_Tag_Processor {
return false;
}
$this->token_starts_at = $at;
if ( '/' === $this->html[ $at + 1 ] ) {
$this->is_closing_tag = true;
++$at;
@ -1381,7 +1401,7 @@ class WP_HTML_Tag_Processor {
$value_start,
$value_length,
$attribute_start,
$attribute_end,
$attribute_end - $attribute_start,
! $has_value
);
@ -1396,7 +1416,7 @@ class WP_HTML_Tag_Processor {
* an array when encountering duplicates avoids needless allocations in the
* normative case of parsing tags with no duplicate attributes.
*/
$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end );
$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start );
if ( null === $this->duplicate_attributes ) {
$this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
} elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
@ -1424,9 +1444,10 @@ class WP_HTML_Tag_Processor {
*/
private function after_tag() {
$this->get_updated_html();
$this->token_starts_at = null;
$this->token_length = null;
$this->tag_name_starts_at = null;
$this->tag_name_length = null;
$this->tag_ends_at = null;
$this->is_closing_tag = null;
$this->attributes = array();
$this->duplicate_attributes = null;
@ -1606,7 +1627,7 @@ class WP_HTML_Tag_Processor {
$bytes_already_copied = 0;
$output_buffer = '';
foreach ( $this->lexical_updates as $diff ) {
$shift = strlen( $diff->text ) - ( $diff->end - $diff->start );
$shift = strlen( $diff->text ) - $diff->length;
// Adjust the cursor position by however much an update affects it.
if ( $diff->start <= $this->bytes_already_parsed ) {
@ -1620,7 +1641,7 @@ class WP_HTML_Tag_Processor {
$output_buffer .= substr( $this->html, $bytes_already_copied, $diff->start - $bytes_already_copied );
$output_buffer .= $diff->text;
$bytes_already_copied = $diff->end;
$bytes_already_copied = $diff->start + $diff->length;
}
$this->html = $output_buffer . substr( $this->html, $bytes_already_copied );
@ -1630,6 +1651,8 @@ class WP_HTML_Tag_Processor {
* replacements adjust offsets in the input document.
*/
foreach ( $this->bookmarks as $bookmark_name => $bookmark ) {
$bookmark_end = $bookmark->start + $bookmark->length;
/*
* Each lexical update which appears before the bookmark's endpoints
* might shift the offsets for those endpoints. Loop through each change
@ -1640,28 +1663,30 @@ class WP_HTML_Tag_Processor {
$tail_delta = 0;
foreach ( $this->lexical_updates as $diff ) {
if ( $bookmark->start < $diff->start && $bookmark->end < $diff->start ) {
$diff_end = $diff->start + $diff->length;
if ( $bookmark->start < $diff->start && $bookmark_end < $diff->start ) {
break;
}
if ( $bookmark->start >= $diff->start && $bookmark->end < $diff->end ) {
if ( $bookmark->start >= $diff->start && $bookmark_end < $diff_end ) {
$this->release_bookmark( $bookmark_name );
continue 2;
}
$delta = strlen( $diff->text ) - ( $diff->end - $diff->start );
$delta = strlen( $diff->text ) - $diff->length;
if ( $bookmark->start >= $diff->start ) {
$head_delta += $delta;
}
if ( $bookmark->end >= $diff->end ) {
if ( $bookmark_end >= $diff_end ) {
$tail_delta += $delta;
}
}
$bookmark->start += $head_delta;
$bookmark->end += $tail_delta;
$bookmark->start += $head_delta;
$bookmark->length += $tail_delta - $head_delta;
}
$this->lexical_updates = array();
@ -1743,7 +1768,7 @@ class WP_HTML_Tag_Processor {
* This code should be unreachable, because it implies the two replacements
* start at the same location and contain the same text.
*/
return $a->end - $b->end;
return $a->length - $b->length;
}
/**
@ -1971,7 +1996,15 @@ class WP_HTML_Tag_Processor {
return false;
}
return '/' === $this->html[ $this->tag_ends_at - 1 ];
/*
* The self-closing flag is the solidus at the _end_ of the tag, not the beginning.
*
* Example:
*
* <figure />
* ^ this appears one character before the end of the closing ">".
*/
return '/' === $this->html[ $this->token_starts_at + $this->token_length - 1 ];
}
/**
@ -2101,7 +2134,7 @@ class WP_HTML_Tag_Processor {
$existing_attribute = $this->attributes[ $comparable_name ];
$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
$existing_attribute->start,
$existing_attribute->end,
$existing_attribute->length,
$updated_attribute
);
} else {
@ -2119,7 +2152,7 @@ class WP_HTML_Tag_Processor {
*/
$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
$this->tag_name_starts_at + $this->tag_name_length,
$this->tag_name_starts_at + $this->tag_name_length,
0,
' ' . $updated_attribute
);
}
@ -2194,7 +2227,7 @@ class WP_HTML_Tag_Processor {
*/
$this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement(
$this->attributes[ $name ]->start,
$this->attributes[ $name ]->end,
$this->attributes[ $name ]->length,
''
);
@ -2203,7 +2236,7 @@ class WP_HTML_Tag_Processor {
foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
$this->lexical_updates[] = new WP_HTML_Text_Replacement(
$attribute_token->start,
$attribute_token->end,
$attribute_token->length,
''
);
}
@ -2289,7 +2322,7 @@ class WP_HTML_Tag_Processor {
* Keep track of the position right before the current tag. This will
* be necessary for reparsing the current tag after updating the HTML.
*/
$before_current_tag = $this->tag_name_starts_at - 1;
$before_current_tag = $this->token_starts_at;
/*
* 1. Apply the enqueued edits and update all the pointers to reflect those changes.
@ -2325,7 +2358,7 @@ class WP_HTML_Tag_Processor {
}
$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
$this->tag_ends_at = $tag_ends_at;
$this->token_length = $tag_ends_at - $this->token_starts_at;
$this->bytes_already_parsed = $tag_ends_at;
return $this->html;

View File

@ -15,6 +15,7 @@
*
* @access private
* @since 6.2.0
* @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
*
* @see WP_HTML_Tag_Processor
*/
@ -23,22 +24,25 @@ class WP_HTML_Text_Replacement {
* Byte offset into document where replacement span begins.
*
* @since 6.2.0
*
* @var int
*/
public $start;
/**
* Byte offset into document where replacement span ends.
* Byte length of span being replaced.
*
* @since 6.5.0
*
* @since 6.2.0
* @var int
*/
public $end;
public $length;
/**
* Span of text to insert in document to replace the existing content covered by the span (`length` bytes beginning at `start`).
*
* @since 6.2.0
*
* @var string
*/
public $text;
@ -48,13 +52,13 @@ class WP_HTML_Text_Replacement {
*
* @since 6.2.0
*
* @param int $start Byte offset into document where replacement span begins.
* @param int $end Byte offset into document where replacement span ends.
* @param string $text Span of text to insert in document to replace existing content from start to end.
* @param int $start Byte offset into document where replacement span begins.
* @param int $length Byte length of span in document being replaced.
* @param string $text Span of text to insert in document to replace the existing content covered by the span (`length` bytes beginning at `start`).
*/
public function __construct( $start, $end, $text ) {
$this->start = $start;
$this->end = $end;
$this->text = $text;
public function __construct( $start, $length, $text ) {
$this->start = $start;
$this->length = $length;
$this->text = $text;
}
}

View File

@ -16,7 +16,7 @@
*
* @global string $wp_version
*/
$wp_version = '6.5-alpha-57178';
$wp_version = '6.5-alpha-57179';
/**
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.