HTML API: Update documentation and rename internal variable on HTML Processor

This patch updates documentation and an internal variable name within the
HTML Processor class so that they are more helpful and complete to a reader.

There should be no functional or visual changes in this patch.

Props dmsnell, mukesh27.
Fixes #59267.
Built from https://develop.svn.wordpress.org/trunk@56565


git-svn-id: http://core.svn.wordpress.org/trunk@56077 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
Bernhard Reiter 2023-09-13 13:02:17 +00:00
parent deaa431a90
commit 15c3f036fd
2 changed files with 77 additions and 29 deletions

View File

@ -27,6 +27,7 @@
* - Unwrapping a tag by removing its parent.
* - Inserting and removing nodes.
* - Reading and changing inner content.
* - Navigating up or around HTML structure.
*
* ## Usage
*
@ -66,7 +67,7 @@
* `array( 'IMG' )` matches all IMG elements and `array( 'P', 'IMG' )`
* matches all IMG elements directly inside a P element. To ensure that no
* partial matches erroneously match it's possible to specify in a query
* the full breadcrumb match.
* the full breadcrumb match all the way down from the root HTML element.
*
* Example:
*
@ -76,7 +77,7 @@
*
* $html = '<figure><img><figcaption>A <em>lovely</em> day outside</figcaption></figure>';
* // ---- Matches here.
* $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'IMG', 'FIGCAPTION', 'EM' ) ) );
* $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'FIGCAPTION', 'EM' ) ) );
*
* $html = '<div><img></div><img>';
* // ----- Matches here, because IMG must be a direct child of the implicit BODY.
@ -100,7 +101,8 @@
*
* - Links: A.
* - The formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
* - Containers: DIV, FIGCAPTION, FIGURE.
* - Containers: DIV, FIGCAPTION, FIGURE, SPAN.
* - Form elements: BUTTON.
* - Paragraph: P.
* - Void elements: IMG.
*
@ -116,7 +118,9 @@
* - Markup involving only those tags listed above.
* - Fully-balanced and non-overlapping tags.
* - HTML with unexpected tag closers.
* - Some unbalanced or overlapping tags.
* - P tags after unclosed P tags.
* - BUTTON tags after unclosed BUTTON tags.
* - A tags after unclosed A tags that don't involve any active formatting elements.
*
* @since 6.4.0
@ -126,7 +130,10 @@
*/
class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* HTML processing requires more bookmarks than basic tag processing.
* The maximum number of bookmarks allowed to exist at any given time.
*
* HTML processing requires more bookmarks than basic tag processing,
* so this class constant from the Tag Processor is overwritten.
*
* @since 6.4.0
*
@ -137,6 +144,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* Static query for instructing the Tag Processor to visit every token.
*
* @access private
*
* @since 6.4.0
*
* @var array
@ -158,8 +167,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* Used to create unique bookmark names.
*
* This class sets a bookmark for every tag in the HTML document that it encounters.
* The bookmark name is auto-generated and increments, starting with `1`. These are
* internal bookmarks and are automatically released when the referring WP_HTML_Token
* goes out of scope and is garbage-collected.
*
* @since 6.4.0
*
* @see WP_HTML_Processor::$release_internal_bookmark_on_destruct
*
* @var int
*/
private $bookmark_counter = 0;
@ -186,7 +202,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*
* @var closure
*/
private $release_internal_bookmark = null;
private $release_internal_bookmark_on_destruct = null;
/*
* Public Interface Functions
@ -258,10 +274,14 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* Constructor.
*
* Do not use this method. Use the static creator methods instead.
*
* @access private
*
* @since 6.4.0
*
* @see WP_HTML_Processor::createFragment()
*
* @param string $html HTML to process.
* @param string|null $use_the_static_create_methods_instead This constructor should not be called manually.
*/
@ -287,7 +307,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* a private method into WP_HTML_Token classes without
* exposing it to any public API.
*/
$this->release_internal_bookmark = function ( $name ) {
$this->release_internal_bookmark_on_destruct = function ( $name ) {
parent::release_bookmark( $name );
};
}
@ -298,13 +318,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* Various situations lead to parsing failure but this class will
* return `false` in all those cases. To determine why something
* failed it's possible to request the last error. This can be
* helpful to know if it's possible to fix something or to give up.
* helpful to know to distinguish whether a given tag couldn't
* be found or if content in the document caused the processor
* to give up and abort processing.
*
* Example
*
* $p = WP_HTML_Processor::createFragment( '<template><strong><button><em><p><em>' );
* false === $p->next_tag();
* WP_HTML_Processor::ERROR_UNSUPPORTED === $p->get_last_error();
* $processor = WP_HTML_Processor::createFragment( '<template><strong><button><em><p><em>' );
* false === $processor->next_tag();
* WP_HTML_Processor::ERROR_UNSUPPORTED === $processor->get_last_error();
*
* @since 6.4.0
*
@ -429,6 +451,20 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
}
if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
/*
* Void elements still hop onto the stack of open elements even though
* there's no corresponding closing tag. This is important for managing
* stack-based operations such as "navigate to parent node" or checking
* on an element's breadcrumbs.
*
* When moving on to the next node, therefore, if the bottom-most element
* on the stack is a void element, it must be closed.
*
* @TODO: Once self-closing foreign elements and BGSOUND are supported,
* they must also be implicitly closed here too. BGSOUND is
* special since it's only self-closing if the self-closing flag
* is provided in the opening tag, otherwise it expects a tag closer.
*/
$top_node = $this->state->stack_of_open_elements->current_node();
if ( $top_node && self::is_void( $top_node->node_name ) ) {
$this->state->stack_of_open_elements->pop();
@ -446,7 +482,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
$this->bookmark_tag(),
$this->get_tag(),
$this->is_tag_closer(),
$this->release_internal_bookmark
$this->release_internal_bookmark_on_destruct
);
try {
@ -456,7 +492,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
default:
$this->last_error = self::ERROR_UNSUPPORTED;
throw new WP_HTML_Unsupported_Exception( 'Cannot parse outside of the IN BODY insertion mode.' );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
}
} catch ( WP_HTML_Unsupported_Exception $e ) {
/*
@ -470,17 +506,22 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* Computes the HTML breadcrumbs for the currently-matched node, if matched.
*
* Breadcrumbs start at the outer-most parent and descend toward the matched element.
* Breadcrumbs start at the outermost parent and descend toward the matched element.
* They always include the entire path from the root HTML node to the matched element.
*
* @TODO: It could be more efficient to expose a generator-based version of this function
* to avoid creating the array copy on tag iteration. If this is done, it would likely
* be more useful to walk up the stack when yielding instead of starting at the top.
*
* Example
*
* $p = WP_HTML_Processor::createFragment( '<p><strong><em><img></em></strong></p>' );
* $p->next_tag( 'IMG' );
* $p->get_breadcrumbs() === array( 'HTML', 'BODY', 'P', 'STRONG', 'EM', 'IMG' );
* $processor = WP_HTML_Processor::createFragment( '<p><strong><em><img></em></strong></p>' );
* $processor->next_tag( 'IMG' );
* $processor->get_breadcrumbs() === array( 'HTML', 'BODY', 'P', 'STRONG', 'EM', 'IMG' );
*
* @since 6.4.0
*
* @return string[]|null Array of tag-names representing path to matched node, if matched, otherwise null.
* @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
*/
public function get_breadcrumbs() {
if ( ! $this->get_tag() ) {
@ -499,16 +540,14 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* Parses next element in the 'in body' insertion mode.
*
* This internal function performs the 'in body' insertion mode
* logic for the generalized `self::step()` function.
*
* @access private
* logic for the generalized WP_HTML_Processor::step() function.
*
* @since 6.4.0
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#parsing-main-inbody
* @see self::step
* @see WP_HTML_Processor::step
*
* @return bool Whether an element was found.
*/
@ -698,7 +737,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*/
/**
* Creates a new bookmark for the currently-matched tag and returns generated name.
* Creates a new bookmark for the currently-matched tag and returns the generated name.
*
* @since 6.4.0
*
@ -726,14 +765,19 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* Returns the uppercase name of the matched tag.
*
* The semantic rules for HTML specify that certain tags be reprocessed
* with a different tag name. Because of this, the tag name presented
* by the HTML Processor may differ from the one reported by the HTML
* Tag Processor, which doesn't apply these semantic rules.
*
* Example:
*
* $p = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
* $p->next_tag() === true;
* $p->get_tag() === 'DIV';
* $processor = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
* $processor->next_tag() === true;
* $processor->get_tag() === 'DIV';
*
* $p->next_tag() === false;
* $p->get_tag() === null;
* $processor->next_tag() === false;
* $processor->get_tag() === null;
*
* @since 6.4.0
*
@ -775,11 +819,13 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
}
/**
* Moves the internal cursor in the Tag Processor to a given bookmark's location.
* Moves the internal cursor in the HTML Processor to a given bookmark's location.
*
* In order to prevent accidental infinite loops, there's a
* maximum limit on the number of times seek() can be called.
*
* @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
*
* @since 6.4.0
*
* @param string $bookmark_name Jump to the place in the document identified by this bookmark name.
@ -1348,6 +1394,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*
* This unlock code is used to ensure that anyone calling the constructor is
* doing so with a full understanding that it's intended to be a private API.
*
* @access private
*/
const CONSTRUCTOR_UNLOCK_CODE = 'Use WP_HTML_Processor::createFragment instead of calling the class constructor directly.';
}

View File

@ -16,7 +16,7 @@
*
* @global string $wp_version
*/
$wp_version = '6.4-alpha-56563';
$wp_version = '6.4-alpha-56565';
/**
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.