From d8d6efaf0495312559419d3d88dbe3563f203db0 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 3 Jul 2024 05:32:59 -0700 Subject: [PATCH] Compat: Update HTML API with changes from 6.6 (#63089) Updates the HTML API components in the compatability layer with updates from each of the 6.4, 6.5, and 6.6 branches from Core. Notably, because of the breaking change in 6.5 to the `WP_HTML_Span` and `WP_HTML_Text_Replacement` classes, which changed the range format from `(start, end)` to `(start, length)`, the 6.6 classes still reference the `Gutenberg_HTML_Span_6_5` and `Gutenberg_HTML_Text_Replacement_6_5` substitutions. While updating it was noticed that the 6.4 compatability code accidentally pulled in updates from the 6.5 branch for the HTML Processor. These additions have been removed for the sake of closer correspondance with the Core code. Co-authored-by: ockham --- ...class-gutenberg-html-tag-processor-6-4.php | 13 +- ...ass-wp-html-active-formatting-elements.php | 20 +- .../html-api/class-wp-html-open-elements.php | 36 +- .../class-wp-html-processor-state.php | 35 +- .../html-api/class-wp-html-processor.php | 244 +- .../html-api/class-wp-html-token.php | 9 + .../class-gutenberg-html-processor-6-5.php | 255 +- ...class-gutenberg-html-tag-processor-6-5.php | 89 +- .../class-gutenberg-token-map-6-6.php | 818 ++++ .../class-gutenberg-html-decoder-6-6.php | 461 +++ ...class-gutenberg-html-open-elements-6-6.php | 541 +++ .../class-gutenberg-html-processor-6-6.php | 2472 ++++++++++++ ...ass-gutenberg-html-processor-state-6-6.php | 143 + .../class-gutenberg-html-stack-event-6-6.php | 82 + ...class-gutenberg-html-tag-processor-6-6.php | 3559 +++++++++++++++++ ...g-html5-named-character-references-6-6.php | 1315 ++++++ lib/load.php | 11 + 17 files changed, 9823 insertions(+), 280 deletions(-) create mode 100644 lib/compat/wordpress-6.6/class-gutenberg-token-map-6-6.php create mode 100644 lib/compat/wordpress-6.6/html-api/class-gutenberg-html-decoder-6-6.php create mode 100644 lib/compat/wordpress-6.6/html-api/class-gutenberg-html-open-elements-6-6.php create mode 100644 lib/compat/wordpress-6.6/html-api/class-gutenberg-html-processor-6-6.php create mode 100644 lib/compat/wordpress-6.6/html-api/class-gutenberg-html-processor-state-6-6.php create mode 100644 lib/compat/wordpress-6.6/html-api/class-gutenberg-html-stack-event-6-6.php create mode 100644 lib/compat/wordpress-6.6/html-api/class-gutenberg-html-tag-processor-6-6.php create mode 100644 lib/compat/wordpress-6.6/html-api/gutenberg-html5-named-character-references-6-6.php diff --git a/lib/compat/wordpress-6.4/html-api/class-gutenberg-html-tag-processor-6-4.php b/lib/compat/wordpress-6.4/html-api/class-gutenberg-html-tag-processor-6-4.php index e22b4fb17b902e..40e3ce425f8b05 100644 --- a/lib/compat/wordpress-6.4/html-api/class-gutenberg-html-tag-processor-6-4.php +++ b/lib/compat/wordpress-6.4/html-api/class-gutenberg-html-tag-processor-6-4.php @@ -2031,8 +2031,8 @@ public function set_attribute( $name, $value ) { * * @see https://html.spec.whatwg.org/#attributes-2 * - * @todo As the only regex pattern maybe we should take it out? - * Are Unicode patterns available broadly in Core? + * @TODO as the only regex pattern maybe we should take it out? are + * Unicode patterns available broadly in Core? */ if ( preg_match( '~[' . @@ -2071,7 +2071,14 @@ public function set_attribute( $name, $value ) { if ( true === $value ) { $updated_attribute = $name; } else { - $escaped_new_value = esc_attr( $value ); + $comparable_name = strtolower( $name ); + + /* + * Escape URL attributes. + * + * @see https://html.spec.whatwg.org/#attributes-3 + */ + $escaped_new_value = in_array( $comparable_name, wp_kses_uri_attributes() ) ? esc_url( $value ) : esc_attr( $value ); $updated_attribute = "{$name}=\"{$escaped_new_value}\""; } diff --git a/lib/compat/wordpress-6.4/html-api/class-wp-html-active-formatting-elements.php b/lib/compat/wordpress-6.4/html-api/class-wp-html-active-formatting-elements.php index 43a5eab77f09fa..4bf726bd39ec95 100644 --- a/lib/compat/wordpress-6.4/html-api/class-wp-html-active-formatting-elements.php +++ b/lib/compat/wordpress-6.4/html-api/class-wp-html-active-formatting-elements.php @@ -99,16 +99,16 @@ public function current_node() { */ public function push( $token ) { /* - * > If there are already three elements in the list of active formatting elements after the last marker, - * > if any, or anywhere in the list if there are no markers, that have the same tag name, namespace, and - * > attributes as element, then remove the earliest such element from the list of active formatting - * > elements. For these purposes, the attributes must be compared as they were when the elements were - * > created by the parser; two elements have the same attributes if all their parsed attributes can be - * > paired such that the two attributes in each pair have identical names, namespaces, and values - * > (the order of the attributes does not matter). - * - * @todo Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack. - */ + * > If there are already three elements in the list of active formatting elements after the last marker, + * > if any, or anywhere in the list if there are no markers, that have the same tag name, namespace, and + * > attributes as element, then remove the earliest such element from the list of active formatting + * > elements. For these purposes, the attributes must be compared as they were when the elements were + * > created by the parser; two elements have the same attributes if all their parsed attributes can be + * > paired such that the two attributes in each pair have identical names, namespaces, and values + * > (the order of the attributes does not matter). + * + * @TODO: Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack. + */ // > Add element to the list of active formatting elements. $this->stack[] = $token; } diff --git a/lib/compat/wordpress-6.4/html-api/class-wp-html-open-elements.php b/lib/compat/wordpress-6.4/html-api/class-wp-html-open-elements.php index 51386e180bf46b..f8f6f43119390b 100644 --- a/lib/compat/wordpress-6.4/html-api/class-wp-html-open-elements.php +++ b/lib/compat/wordpress-6.4/html-api/class-wp-html-open-elements.php @@ -112,7 +112,7 @@ public function current_node() { * @param string[] $termination_list List of elements that terminate the search. * @return bool Whether the element was found in a specific scope. */ - public function has_element_in_specific_scope( $tag_name, $termination_list ) { // phpcs:ignore VariableAnalysis.CodeAnalysis.VariableAnalysis.UnusedVariable + public function has_element_in_specific_scope( $tag_name, $termination_list ) { foreach ( $this->walk_up() as $node ) { if ( $node->node_name === $tag_name ) { return true; @@ -147,12 +147,12 @@ public function has_element_in_scope( $tag_name ) { array( /* - * Because it's not currently possible to encounter - * one of the termination elements, they don't need - * to be listed here. If they were, they would be - * unreachable and only waste CPU cycles while - * scanning through HTML. - */ + * Because it's not currently possible to encounter + * one of the termination elements, they don't need + * to be listed here. If they were, they would be + * unreachable and only waste CPU cycles while + * scanning through HTML. + */ ) ); } @@ -169,7 +169,7 @@ public function has_element_in_scope( $tag_name ) { * @param string $tag_name Name of tag to check. * @return bool Whether given element is in scope. */ - public function has_element_in_list_item_scope( $tag_name ) { // phpcs:ignore VariableAnalysis.CodeAnalysis.VariableAnalysis.UnusedVariable + public function has_element_in_list_item_scope( $tag_name ) { throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on list item scope.' ); return false; // The linter requires this unreachable code until the function is implemented and can return. @@ -201,7 +201,7 @@ public function has_element_in_button_scope( $tag_name ) { * @param string $tag_name Name of tag to check. * @return bool Whether given element is in scope. */ - public function has_element_in_table_scope( $tag_name ) { // phpcs:ignore VariableAnalysis.CodeAnalysis.VariableAnalysis.UnusedVariable + public function has_element_in_table_scope( $tag_name ) { throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on table scope.' ); return false; // The linter requires this unreachable code until the function is implemented and can return. @@ -219,7 +219,7 @@ public function has_element_in_table_scope( $tag_name ) { // phpcs:ignore Variab * @param string $tag_name Name of tag to check. * @return bool Whether given element is in scope. */ - public function has_element_in_select_scope( $tag_name ) { // phpcs:ignore VariableAnalysis.CodeAnalysis.VariableAnalysis.UnusedVariable + public function has_element_in_select_scope( $tag_name ) { throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on select scope.' ); return false; // The linter requires this unreachable code until the function is implemented and can return. @@ -371,8 +371,8 @@ public function walk_up() { } /* - * Internal helpers. - */ + * Internal helpers. + */ /** * Updates internal flags after adding an element. @@ -389,9 +389,9 @@ public function walk_up() { */ public function after_element_push( $item ) { /* - * When adding support for new elements, expand this switch to trap - * cases where the precalculated value needs to change. - */ + * When adding support for new elements, expand this switch to trap + * cases where the precalculated value needs to change. + */ switch ( $item->node_name ) { case 'BUTTON': $this->has_p_in_button_scope = false; @@ -418,9 +418,9 @@ public function after_element_push( $item ) { */ public function after_element_pop( $item ) { /* - * When adding support for new elements, expand this switch to trap - * cases where the precalculated value needs to change. - */ + * When adding support for new elements, expand this switch to trap + * cases where the precalculated value needs to change. + */ switch ( $item->node_name ) { case 'BUTTON': $this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' ); diff --git a/lib/compat/wordpress-6.4/html-api/class-wp-html-processor-state.php b/lib/compat/wordpress-6.4/html-api/class-wp-html-processor-state.php index 5a36bc9bd6bf25..03443c2b3d3261 100644 --- a/lib/compat/wordpress-6.4/html-api/class-wp-html-processor-state.php +++ b/lib/compat/wordpress-6.4/html-api/class-wp-html-processor-state.php @@ -23,19 +23,19 @@ */ class WP_HTML_Processor_State { /* - * Insertion mode constants. - * - * These constants exist and are named to make it easier to - * discover and recognize the supported insertion modes in - * the parser. - * - * Out of all the possible insertion modes, only those - * supported by the parser are listed here. As support - * is added to the parser for more modes, add them here - * following the same naming and value pattern. - * - * @see https://html.spec.whatwg.org/#the-insertion-mode - */ + * Insertion mode constants. + * + * These constants exist and are named to make it easier to + * discover and recognize the supported insertion modes in + * the parser. + * + * Out of all the possible insertion modes, only those + * supported by the parser are listed here. As support + * is added to the parser for more modes, add them here + * following the same naming and value pattern. + * + * @see https://html.spec.whatwg.org/#the-insertion-mode + */ /** * Initial insertion mode for full HTML parser. @@ -87,6 +87,15 @@ class WP_HTML_Processor_State { */ public $active_formatting_elements = null; + /** + * Refers to the currently-matched tag, if any. + * + * @since 6.4.0 + * + * @var WP_HTML_Token|null + */ + public $current_token = null; + /** * Tree construction insertion mode. * diff --git a/lib/compat/wordpress-6.4/html-api/class-wp-html-processor.php b/lib/compat/wordpress-6.4/html-api/class-wp-html-processor.php index de75d0f2ae55e8..4a2714218229ef 100644 --- a/lib/compat/wordpress-6.4/html-api/class-wp-html-processor.php +++ b/lib/compat/wordpress-6.4/html-api/class-wp-html-processor.php @@ -101,16 +101,12 @@ * * The following list specifies the HTML tags that _are_ supported: * - * - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY. - * - Form elements: BUTTON, FIELDSET, SEARCH. - * - Formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U. - * - Heading elements: HGROUP. * - Links: A. - * - Lists: DL. - * - Media elements: FIGCAPTION, FIGURE, IMG. + * - The formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U. + * - Containers: DIV, FIGCAPTION, FIGURE, SPAN. + * - Form elements: BUTTON. * - Paragraph: P. - * - Sectioning elements: ARTICLE, ASIDE, NAV, SECTION - * - Deprecated elements: CENTER, DIR + * - Void elements: IMG. * * ### Supported markup * @@ -211,8 +207,8 @@ class WP_HTML_Processor extends Gutenberg_HTML_Tag_Processor_6_4 { private $release_internal_bookmark_on_destruct = null; /* - * Public Interface Functions - */ + * Public Interface Functions + */ /** * Creates an HTML processor in the fragment parsing mode. @@ -254,7 +250,7 @@ public static function create_fragment( $html, $context = '', $encoding = $p->state->context_node = array( 'BODY', array() ); $p->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; - // @todo Create "fake" bookmarks for non-existent but implied nodes. + // @TODO: Create "fake" bookmarks for non-existent but implied nodes. $p->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); $p->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); @@ -309,10 +305,10 @@ public function __construct( $html, $use_the_static_create_methods_instead = nul $this->state = new WP_HTML_Processor_State(); /* - * Create this wrapper so that it's possible to pass - * a private method into WP_HTML_Token classes without - * exposing it to any public API. - */ + * Create this wrapper so that it's possible to pass + * a private method into WP_HTML_Token classes without + * exposing it to any public API. + */ $this->release_internal_bookmark_on_destruct = function ( $name ) { parent::release_bookmark( $name ); }; @@ -348,7 +344,7 @@ public function get_last_error() { /** * Finds the next tag matching the $query. * - * @todo Support matching the class name and tag name. + * @TODO: Support matching the class name and tag name. * * @since 6.4.0 * @@ -500,19 +496,19 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { if ( self::PROCESS_NEXT_NODE === $node_to_process ) { /* - * Void elements still hop onto the stack of open elements even though - * there's no corresponding closing tag. This is important for managing - * stack-based operations such as "navigate to parent node" or checking - * on an element's breadcrumbs. - * - * When moving on to the next node, therefore, if the bottom-most element - * on the stack is a void element, it must be closed. - * - * @todo Once self-closing foreign elements and BGSOUND are supported, - * they must also be implicitly closed here too. BGSOUND is - * special since it's only self-closing if the self-closing flag - * is provided in the opening tag, otherwise it expects a tag closer. - */ + * Void elements still hop onto the stack of open elements even though + * there's no corresponding closing tag. This is important for managing + * stack-based operations such as "navigate to parent node" or checking + * on an element's breadcrumbs. + * + * When moving on to the next node, therefore, if the bottom-most element + * on the stack is a void element, it must be closed. + * + * @TODO: Once self-closing foreign elements and BGSOUND are supported, + * they must also be implicitly closed here too. BGSOUND is + * special since it's only self-closing if the self-closing flag + * is provided in the opening tag, otherwise it expects a tag closer. + */ $top_node = $this->state->stack_of_open_elements->current_node(); if ( $top_node && self::is_void( $top_node->node_name ) ) { $this->state->stack_of_open_elements->pop(); @@ -544,9 +540,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { } } catch ( WP_HTML_Unsupported_Exception $e ) { /* - * Exceptions are used in this class to escape deep call stacks that - * otherwise might involve messier calling and return conventions. - */ + * Exceptions are used in this class to escape deep call stacks that + * otherwise might involve messier calling and return conventions. + */ return false; } } @@ -557,9 +553,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * Breadcrumbs start at the outermost parent and descend toward the matched element. * They always include the entire path from the root HTML node to the matched element. * - * @todo It could be more efficient to expose a generator-based version of this function - * to avoid creating the array copy on tag iteration. If this is done, it would likely - * be more useful to walk up the stack when yielding instead of starting at the top. + * @TODO: It could be more efficient to expose a generator-based version of this function + * to avoid creating the array copy on tag iteration. If this is done, it would likely + * be more useful to walk up the stack when yielding instead of starting at the top. * * Example * @@ -606,11 +602,11 @@ private function step_in_body() { switch ( $op ) { /* - * > A start tag whose tag name is "button" - */ + * > A start tag whose tag name is "button" + */ case '+BUTTON': if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) { - // @todo Indicate a parse error once it's possible. This error does not impact the logic here. + // @TODO: Indicate a parse error once it's possible. This error does not impact the logic here. $this->generate_implied_end_tags(); $this->state->stack_of_open_elements->pop_until( 'BUTTON' ); } @@ -622,34 +618,16 @@ private function step_in_body() { return true; /* - * > A start tag whose tag name is one of: "address", "article", "aside", - * > "blockquote", "center", "details", "dialog", "dir", "div", "dl", - * > "fieldset", "figcaption", "figure", "footer", "header", "hgroup", - * > "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul" - */ - case '+ADDRESS': - case '+ARTICLE': - case '+ASIDE': + * > A start tag whose tag name is one of: "address", "article", "aside", + * > "blockquote", "center", "details", "dialog", "dir", "div", "dl", + * > "fieldset", "figcaption", "figure", "footer", "header", "hgroup", + * > "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul" + */ case '+BLOCKQUOTE': - case '+CENTER': - case '+DETAILS': - case '+DIALOG': - case '+DIR': case '+DIV': - case '+DL': - case '+FIELDSET': case '+FIGCAPTION': case '+FIGURE': - case '+FOOTER': - case '+HEADER': - case '+HGROUP': - case '+MAIN': - case '+MENU': - case '+NAV': case '+P': - case '+SEARCH': - case '+SECTION': - case '+SUMMARY': if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->close_a_p_element(); } @@ -658,50 +636,32 @@ private function step_in_body() { return true; /* - * > An end tag whose tag name is one of: "address", "article", "aside", "blockquote", - * > "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset", - * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", - * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul" - */ - case '-ADDRESS': - case '-ARTICLE': - case '-ASIDE': + * > An end tag whose tag name is one of: "address", "article", "aside", "blockquote", + * > "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset", + * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", + * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul" + */ case '-BLOCKQUOTE': case '-BUTTON': - case '-CENTER': - case '-DETAILS': - case '-DIALOG': - case '-DIR': case '-DIV': - case '-DL': - case '-FIELDSET': case '-FIGCAPTION': case '-FIGURE': - case '-FOOTER': - case '-HEADER': - case '-HGROUP': - case '-MAIN': - case '-MENU': - case '-NAV': - case '-SEARCH': - case '-SECTION': - case '-SUMMARY': if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) { - // @todo Report parse error. + // @TODO: Report parse error. // Ignore the token. return $this->step(); } $this->generate_implied_end_tags(); if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) { - // @todo Record parse error: this error doesn't impact parsing. + // @TODO: Record parse error: this error doesn't impact parsing. } $this->state->stack_of_open_elements->pop_until( $tag_name ); return true; /* - * > An end tag whose tag name is "p" - */ + * > An end tag whose tag name is "p" + */ case '-P': if ( ! $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->insert_html_element( $this->state->current_token ); @@ -731,9 +691,9 @@ private function step_in_body() { return true; /* - * > A start tag whose tag name is one of: "b", "big", "code", "em", "font", "i", - * > "s", "small", "strike", "strong", "tt", "u" - */ + * > A start tag whose tag name is one of: "b", "big", "code", "em", "font", "i", + * > "s", "small", "strike", "strong", "tt", "u" + */ case '+B': case '+BIG': case '+CODE': @@ -752,9 +712,9 @@ private function step_in_body() { return true; /* - * > An end tag whose tag name is one of: "a", "b", "big", "code", "em", "font", "i", - * > "nobr", "s", "small", "strike", "strong", "tt", "u" - */ + * > An end tag whose tag name is one of: "a", "b", "big", "code", "em", "font", "i", + * > "nobr", "s", "small", "strike", "strong", "tt", "u" + */ case '-A': case '-B': case '-BIG': @@ -772,24 +732,24 @@ private function step_in_body() { return true; /* - * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr" - */ + * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr" + */ case '+IMG': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); return true; /* - * > Any other start tag - */ + * > Any other start tag + */ case '+SPAN': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); return true; /* - * Any other end tag - */ + * Any other end tag + */ case '-SPAN': foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { // > If node is an HTML element with the same tag name as the token, then: @@ -817,8 +777,8 @@ private function step_in_body() { } /* - * Internal helpers - */ + * Internal helpers + */ /** * Creates a new bookmark for the currently-matched tag and returns the generated name. @@ -843,8 +803,8 @@ private function bookmark_tag() { } /* - * HTML semantic overrides for Tag Processor - */ + * HTML semantic overrides for Tag Processor + */ /** * Returns the uppercase name of the matched tag. @@ -877,9 +837,9 @@ public function get_tag() { switch ( $tag_name ) { case 'IMAGE': /* - * > A start tag whose tag name is "image" - * > Change the token's tag name to "img" and reprocess it. (Don't ask.) - */ + * > A start tag whose tag name is "image" + * > Change the token's tag name to "img" and reprocess it. (Don't ask.) + */ return 'IMG'; default: @@ -936,13 +896,13 @@ public function seek( $bookmark_name ) { case 'backward': /* - * When moving backwards, clear out all existing stack entries which appear after the destination - * bookmark. These could be stored for later retrieval, but doing so would require additional - * memory overhead and also demand that references and bookmarks are updated as the document - * changes. In time this could be a valuable optimization, but it's okay to give up that - * optimization in exchange for more CPU time to recompute the stack, to re-parse the - * document that may have already been parsed once. - */ + * When moving backwards, clear out all existing stack entries which appear after the destination + * bookmark. These could be stored for later retrieval, but doing so would require additional + * memory overhead and also demand that references and bookmarks are updated as the document + * changes. In time this could be a valuable optimization, but it's okay to give up that + * optimization in exchange for more CPU time to recompute the stack, to re-parse the + * document that may have already been parsed once. + */ foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) { break; @@ -1048,8 +1008,8 @@ public function set_bookmark( $bookmark_name ) { } /* - * HTML Parsing Algorithms - */ + * HTML Parsing Algorithms + */ /** * Closes a P element. @@ -1126,9 +1086,9 @@ private function generate_implied_end_tags_thoroughly() { */ private function reconstruct_active_formatting_elements() { /* - * > If there are no entries in the list of active formatting elements, then there is nothing - * > to reconstruct; stop this algorithm. - */ + * > If there are no entries in the list of active formatting elements, then there is nothing + * > to reconstruct; stop this algorithm. + */ if ( 0 === $this->state->active_formatting_elements->count() ) { return false; } @@ -1137,16 +1097,16 @@ private function reconstruct_active_formatting_elements() { if ( /* - * > If the last (most recently added) entry in the list of active formatting elements is a marker; - * > stop this algorithm. - */ + * > If the last (most recently added) entry in the list of active formatting elements is a marker; + * > stop this algorithm. + */ 'marker' === $last_entry->node_name || /* - * > If the last (most recently added) entry in the list of active formatting elements is an - * > element that is in the stack of open elements, then there is nothing to reconstruct; - * > stop this algorithm. - */ + * > If the last (most recently added) entry in the list of active formatting elements is an + * > element that is in the stack of open elements, then there is nothing to reconstruct; + * > stop this algorithm. + */ $this->state->stack_of_open_elements->contains_node( $last_entry ) ) { return false; @@ -1187,11 +1147,11 @@ private function run_adoption_agency_algorithm() { } /* - * > Let formatting element be the last element in the list of active formatting elements that: - * > - is between the end of the list and the last marker in the list, - * > if any, or the start of the list otherwise, - * > - and has the tag name subject. - */ + * > Let formatting element be the last element in the list of active formatting elements that: + * > - is between the end of the list and the last marker in the list, + * > if any, or the start of the list otherwise, + * > - and has the tag name subject. + */ $formatting_element = null; foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { if ( 'marker' === $item->node_name ) { @@ -1222,9 +1182,9 @@ private function run_adoption_agency_algorithm() { } /* - * > Let furthest block be the topmost node in the stack of open elements that is lower in the stack - * > than formatting element, and is an element in the special category. There might not be one. - */ + * > Let furthest block be the topmost node in the stack of open elements that is lower in the stack + * > than formatting element, and is an element in the special category. There might not be one. + */ $is_above_formatting_element = true; $furthest_block = null; foreach ( $this->state->stack_of_open_elements->walk_down() as $item ) { @@ -1244,10 +1204,10 @@ private function run_adoption_agency_algorithm() { } /* - * > If there is no furthest block, then the UA must first pop all the nodes from the bottom of the - * > stack of open elements, from the current node up to and including formatting element, then - * > remove formatting element from the list of active formatting elements, and finally return. - */ + * > If there is no furthest block, then the UA must first pop all the nodes from the bottom of the + * > stack of open elements, from the current node up to and including formatting element, then + * > remove formatting element from the list of active formatting elements, and finally return. + */ if ( null === $furthest_block ) { foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { $this->state->stack_of_open_elements->pop(); @@ -1281,8 +1241,8 @@ private function insert_html_element( $token ) { } /* - * HTML Specification Helpers - */ + * HTML Specification Helpers + */ /** * Returns whether an element of a given name is in the HTML special category. @@ -1430,8 +1390,8 @@ public static function is_void( $tag_name ) { } /* - * Constants that would pollute the top of the class if they were found there. - */ + * Constants that would pollute the top of the class if they were found there. + */ /** * Indicates that the next HTML token should be parsed and processed. diff --git a/lib/compat/wordpress-6.4/html-api/class-wp-html-token.php b/lib/compat/wordpress-6.4/html-api/class-wp-html-token.php index 50eb36efa26a37..87446f4533572b 100644 --- a/lib/compat/wordpress-6.4/html-api/class-wp-html-token.php +++ b/lib/compat/wordpress-6.4/html-api/class-wp-html-token.php @@ -96,5 +96,14 @@ public function __destruct() { call_user_func( $this->on_destroy, $this->bookmark_name ); } } + + /** + * Wakeup magic method. + * + * @since 6.4.2 + */ + public function __wakeup() { + throw new \LogicException( __CLASS__ . ' should never be unserialized' ); + } } } diff --git a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php index 30d7ea52d4b544..02879b72ea9cbb 100644 --- a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php +++ b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php @@ -361,6 +361,10 @@ public function get_last_error() { public function next_tag( $query = null ) { if ( null === $query ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -384,6 +388,10 @@ public function next_tag( $query = null ) { if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -405,6 +413,10 @@ public function next_tag( $query = null ) { $match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1; while ( $match_offset > 0 && $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) { return true; } @@ -428,13 +440,7 @@ public function next_tag( $query = null ) { * @return bool */ public function next_token() { - $found_a_token = parent::next_token(); - - if ( '#tag' === $this->get_token_type() ) { - $this->step( self::PROCESS_CURRENT_NODE ); - } - - return $found_a_token; + return $this->step(); } /** @@ -463,10 +469,6 @@ public function next_token() { * @return bool Whether the currently-matched tag is found at the given nested structure. */ public function matches_breadcrumbs( $breadcrumbs ) { - if ( ! $this->get_tag() ) { - return false; - } - // Everything matches when there are zero constraints. if ( 0 === count( $breadcrumbs ) ) { return true; @@ -529,25 +531,35 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * is provided in the opening tag, otherwise it expects a tag closer. */ $top_node = $this->state->stack_of_open_elements->current_node(); - if ( $top_node && self::is_void( $top_node->node_name ) ) { + if ( + $top_node && ( + // Void elements. + self::is_void( $top_node->node_name ) || + // Comments, text nodes, and other atomic tokens. + '#' === $top_node->node_name[0] || + // Doctype declarations. + 'html' === $top_node->node_name + ) + ) { $this->state->stack_of_open_elements->pop(); } } if ( self::PROCESS_NEXT_NODE === $node_to_process ) { - while ( parent::next_token() && '#tag' !== $this->get_token_type() ) { - continue; - } + parent::next_token(); } // Finish stepping when there are no more tokens in the document. - if ( null === $this->get_tag() ) { + if ( + WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || + WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state + ) { return false; } $this->state->current_token = new WP_HTML_Token( - $this->bookmark_tag(), - $this->get_tag(), + $this->bookmark_token(), + $this->get_token_name(), $this->has_self_closing_flag(), $this->release_internal_bookmark_on_destruct ); @@ -591,10 +603,6 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { - if ( ! $this->get_tag() ) { - return null; - } - $breadcrumbs = array(); foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { $breadcrumbs[] = $stack_item->node_name; @@ -619,11 +627,61 @@ public function get_breadcrumbs() { * @return bool Whether an element was found. */ private function step_in_body() { - $tag_name = $this->get_tag(); - $op_sigil = $this->is_tag_closer() ? '-' : '+'; - $op = "{$op_sigil}{$tag_name}"; + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; switch ( $op ) { + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + case '#text': + $this->reconstruct_active_formatting_elements(); + + $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + + /* + * > A character token that is U+0000 NULL + * + * Any successive sequence of NULL bytes is ignored and won't + * trigger active format reconstruction. Therefore, if the text + * only comprises NULL bytes then the token should be ignored + * here, but if there are any other characters in the stream + * the active formats should be reconstructed. + */ + if ( + 1 <= $current_token->length && + "\x00" === $this->html[ $current_token->start ] && + strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length + ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * Whitespace-only text does not affect the frameset-ok flag. + * It is probably inter-element whitespace, but it may also + * contain character references which decode only to whitespace. + */ + $text = $this->get_modifiable_text(); + if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) { + $this->state->frameset_ok = false; + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + case 'html': + /* + * > A DOCTYPE token + * > Parse error. Ignore the token. + */ + return $this->step(); + /* * > A start tag whose tag name is "button" */ @@ -711,17 +769,17 @@ private function step_in_body() { case '-SECTION': case '-SUMMARY': case '-UL': - if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) { + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) { // @todo Report parse error. // Ignore the token. return $this->step(); } $this->generate_implied_end_tags(); - if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) { + if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { // @todo Record parse error: this error doesn't impact parsing. } - $this->state->stack_of_open_elements->pop_until( $tag_name ); + $this->state->stack_of_open_elements->pop_until( $token_name ); return true; /* @@ -783,7 +841,7 @@ private function step_in_body() { $this->generate_implied_end_tags(); - if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) { + if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { // @todo Record parse error: this error doesn't impact parsing. } @@ -799,7 +857,7 @@ private function step_in_body() { case '+LI': $this->state->frameset_ok = false; $node = $this->state->stack_of_open_elements->current_node(); - $is_li = 'LI' === $tag_name; + $is_li = 'LI' === $token_name; in_body_list_loop: /* @@ -862,7 +920,7 @@ private function step_in_body() { * then this is a parse error; ignore the token. */ ( - 'LI' === $tag_name && + 'LI' === $token_name && ! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' ) ) || /* @@ -872,8 +930,8 @@ private function step_in_body() { * parse error; ignore the token. */ ( - 'LI' !== $tag_name && - ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) + 'LI' !== $token_name && + ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) ) { /* @@ -884,13 +942,13 @@ private function step_in_body() { return $this->step(); } - $this->generate_implied_end_tags( $tag_name ); + $this->generate_implied_end_tags( $token_name ); - if ( $tag_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { + if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { // @todo Indicate a parse error once it's possible. This error does not impact the logic here. } - $this->state->stack_of_open_elements->pop_until( $tag_name ); + $this->state->stack_of_open_elements->pop_until( $token_name ); return true; /* @@ -1043,7 +1101,7 @@ private function step_in_body() { * * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody */ - switch ( $tag_name ) { + switch ( $token_name ) { case 'APPLET': case 'BASE': case 'BASEFONT': @@ -1091,7 +1149,7 @@ private function step_in_body() { case 'TR': case 'XMP': $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." ); + throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." ); } if ( ! $this->is_tag_closer() ) { @@ -1113,7 +1171,7 @@ private function step_in_body() { * close anything beyond its containing `P` or `DIV` element. */ foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { - if ( $tag_name === $node->node_name ) { + if ( $token_name === $node->node_name ) { break; } @@ -1123,7 +1181,7 @@ private function step_in_body() { } } - $this->generate_implied_end_tags( $tag_name ); + $this->generate_implied_end_tags( $token_name ); if ( $node !== $this->state->stack_of_open_elements->current_node() ) { // @todo Record parse error: this error doesn't impact parsing. } @@ -1142,19 +1200,16 @@ private function step_in_body() { */ /** - * Creates a new bookmark for the currently-matched tag and returns the generated name. + * Creates a new bookmark for the currently-matched token and returns the generated name. * * @since 6.4.0 + * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token(). * * @throws Exception When unable to allocate requested bookmark. * * @return string|false Name of created bookmark, or false if unable to create. */ - private function bookmark_tag() { - if ( ! $this->get_tag() ) { - return false; - } - + private function bookmark_token() { if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) { $this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS; throw new Exception( 'could not allocate bookmark' ); @@ -1226,6 +1281,10 @@ public function release_bookmark( $bookmark_name ) { /** * Moves the internal cursor in the HTML Processor to a given bookmark's location. * + * Be careful! Seeking backwards to a previous location resets the parser to the + * start of the document and reparses the entire contents up until it finds the + * sought-after bookmarked location. + * * In order to prevent accidental infinite loops, there's a * maximum limit on the number of times seek() can be called. * @@ -1237,6 +1296,9 @@ public function release_bookmark( $bookmark_name ) { * @return bool Whether the internal cursor was successfully moved to the bookmark's location. */ public function seek( $bookmark_name ) { + // Flush any pending updates to the document before beginning. + $this->get_updated_html(); + $actual_bookmark_name = "_{$bookmark_name}"; $processor_started_at = $this->state->current_token ? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start @@ -1244,44 +1306,73 @@ public function seek( $bookmark_name ) { $bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start; $direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward'; - switch ( $direction ) { - case 'forward': - // When moving forwards, re-parse the document until reaching the same location as the original bookmark. - while ( $this->step() ) { - if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { - return true; - } + /* + * If seeking backwards, it's possible that the sought-after bookmark exists within an element + * which has been closed before the current cursor; in other words, it has already been removed + * from the stack of open elements. This means that it's insufficient to simply pop off elements + * from the stack of open elements which appear after the bookmarked location and then jump to + * that location, as the elements which were open before won't be re-opened. + * + * In order to maintain consistency, the HTML Processor rewinds to the start of the document + * and reparses everything until it finds the sought-after bookmark. + * + * There are potentially better ways to do this: cache the parser state for each bookmark and + * restore it when seeking; store an immutable and idempotent register of where elements open + * and close. + * + * If caching the parser state it will be essential to properly maintain the cached stack of + * open elements and active formatting elements when modifying the document. This could be a + * tedious and time-consuming process as well, and so for now will not be performed. + * + * It may be possible to track bookmarks for where elements open and close, and in doing so + * be able to quickly recalculate breadcrumbs for any element in the document. It may even + * be possible to remove the stack of open elements and compute it on the fly this way. + * If doing this, the parser would need to track the opening and closing locations for all + * tokens in the breadcrumb path for any and all bookmarks. By utilizing bookmarks themselves + * this list could be automatically maintained while modifying the document. Finding the + * breadcrumbs would then amount to traversing that list from the start until the token + * being inspected. Once an element closes, if there are no bookmarks pointing to locations + * within that element, then all of these locations may be forgotten to save on memory use + * and computation time. + */ + if ( 'backward' === $direction ) { + /* + * Instead of clearing the parser state and starting fresh, calling the stack methods + * maintains the proper flags in the parser. + */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + if ( 'context-node' === $item->bookmark_name ) { + break; } - return false; - - case 'backward': - /* - * When moving backwards, clear out all existing stack entries which appear after the destination - * bookmark. These could be stored for later retrieval, but doing so would require additional - * memory overhead and also demand that references and bookmarks are updated as the document - * changes. In time this could be a valuable optimization, but it's okay to give up that - * optimization in exchange for more CPU time to recompute the stack, to re-parse the - * document that may have already been parsed once. - */ - foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { - if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) { - break; - } + $this->state->stack_of_open_elements->remove_node( $item ); + } - $this->state->stack_of_open_elements->remove_node( $item ); + foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { + if ( 'context-node' === $item->bookmark_name ) { + break; } - foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { - if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) { - break; - } + $this->state->active_formatting_elements->remove_node( $item ); + } - $this->state->active_formatting_elements->remove_node( $item ); - } + parent::seek( 'context-node' ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + $this->state->frameset_ok = true; + } - return parent::seek( $actual_bookmark_name ); + // When moving forwards, reparse the document until reaching the same location as the original bookmark. + if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { + return true; } + + while ( $this->step() ) { + if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { + return true; + } + } + + return false; } /** @@ -1368,6 +1459,18 @@ public function set_bookmark( $bookmark_name ) { return parent::set_bookmark( "_{$bookmark_name}" ); } + /** + * Checks whether a bookmark with the given name exists. + * + * @since 6.5.0 + * + * @param string $bookmark_name Name to identify a bookmark that potentially exists. + * @return bool Whether that bookmark exists. + */ + public function has_bookmark( $bookmark_name ) { + return parent::has_bookmark( "_{$bookmark_name}" ); + } + /* * HTML Parsing Algorithms */ diff --git a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php index f93a48515d93b6..3bbcc5047a076f 100644 --- a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php +++ b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php @@ -298,8 +298,8 @@ * * The special elements are: * - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy - * style of including JavaScript inside of HTML comments to avoid accidentally - * closing the SCRIPT from inside a JavaScript string. E.g. `console.log( '' )`. + * style of including Javascript inside of HTML comments to avoid accidentally + * closing the SCRIPT from inside a Javascript string. E.g. `console.log( '' )`. * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any * character references are decoded. E.g. `1 < 2 < 3` becomes `1 < 2 < 3`. * - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAME`, `STYLE` whose contents are treated as @@ -837,8 +837,27 @@ public function next_tag( $query = null ) { * @return bool Whether a token was parsed. */ public function next_token() { + return $this->base_class_next_token(); + } + + /** + * Internal method which finds the next token in the HTML document. + * + * This method is a protected internal function which implements the logic for + * finding the next token in a document. It exists so that the parser can update + * its state without affecting the location of the cursor in the document and + * without triggering subclass methods for things like `next_token()`, e.g. when + * applying patches before searching for the next token. + * + * @since 6.5.0 + * + * @access private + * + * @return bool Whether a token was parsed. + */ + private function base_class_next_token() { $was_at = $this->bytes_already_parsed; - $this->get_updated_html(); + $this->after_tag(); // Don't proceed if there's nothing more to scan. if ( @@ -2041,6 +2060,45 @@ private function skip_whitespace() { * @since 6.2.0 */ private function after_tag() { + /* + * There could be lexical updates enqueued for an attribute that + * also exists on the next tag. In order to avoid conflating the + * attributes across the two tags, lexical updates with names + * need to be flushed to raw lexical updates. + */ + $this->class_name_updates_to_attributes_updates(); + + /* + * Purge updates if there are too many. The actual count isn't + * scientific, but a few values from 100 to a few thousand were + * tests to find a practially-useful limit. + * + * If the update queue grows too big, then the Tag Processor + * will spend more time iterating through them and lose the + * efficiency gains of deferring applying them. + */ + if ( 1000 < count( $this->lexical_updates ) ) { + $this->get_updated_html(); + } + + foreach ( $this->lexical_updates as $name => $update ) { + /* + * Any updates appearing after the cursor should be applied + * before proceeding, otherwise they may be overlooked. + */ + if ( $update->start >= $this->bytes_already_parsed ) { + $this->get_updated_html(); + break; + } + + if ( is_int( $name ) ) { + continue; + } + + $this->lexical_updates[] = $update; + unset( $this->lexical_updates[ $name ] ); + } + $this->token_starts_at = null; $this->token_length = null; $this->tag_name_starts_at = null; @@ -2230,7 +2288,7 @@ private function apply_attributes_updates( $shift_this_point = 0 ) { $shift = strlen( $diff->text ) - $diff->length; // Adjust the cursor position by however much an update affects it. - if ( $diff->start <= $this->bytes_already_parsed ) { + if ( $diff->start < $this->bytes_already_parsed ) { $this->bytes_already_parsed += $shift; } @@ -2810,10 +2868,6 @@ public function get_modifiable_text() { $decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE ); - if ( empty( $decoded ) ) { - return ''; - } - /* * TEXTAREA skips a leading newline, but this newline may appear not only as the * literal character `\n`, but also as a character reference, such as in the @@ -2914,7 +2968,14 @@ public function set_attribute( $name, $value ) { if ( true === $value ) { $updated_attribute = $name; } else { - $escaped_new_value = esc_attr( $value ); + $comparable_name = strtolower( $name ); + + /* + * Escape URL attributes. + * + * @see https://html.spec.whatwg.org/#attributes-3 + */ + $escaped_new_value = in_array( $comparable_name, wp_kses_uri_attributes() ) ? esc_url( $value ) : esc_attr( $value ); $updated_attribute = "{$name}=\"{$escaped_new_value}\""; } @@ -3168,15 +3229,7 @@ public function get_updated_html() { * └←─┘ back up by strlen("em") + 1 ==> 3 */ $this->bytes_already_parsed = $before_current_tag; - $this->parse_next_tag(); - // Reparse the attributes. - while ( $this->parse_next_attribute() ) { - continue; - } - - $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); - $this->token_length = $tag_ends_at - $this->token_starts_at; - $this->bytes_already_parsed = $tag_ends_at; + $this->base_class_next_token(); return $this->html; } diff --git a/lib/compat/wordpress-6.6/class-gutenberg-token-map-6-6.php b/lib/compat/wordpress-6.6/class-gutenberg-token-map-6-6.php new file mode 100644 index 00000000000000..1df1fe91dab141 --- /dev/null +++ b/lib/compat/wordpress-6.6/class-gutenberg-token-map-6-6.php @@ -0,0 +1,818 @@ + '😯', + * ':(' => '🙁', + * ':)' => '🙂', + * ':?' => '😕', + * ) ); + * + * true === $smilies->contains( ':)' ); + * false === $smilies->contains( 'simile' ); + * + * '😕' === $smilies->read_token( 'Not sure :?.', 9, $length_of_smily_syntax ); + * 2 === $length_of_smily_syntax; + * + * ## Precomputing the Token Map. + * + * Creating the class involves some work sorting and organizing the tokens and their + * replacement values. In order to skip this, it's possible for the class to export + * its state and be used as actual PHP source code. + * + * Example: + * + * // Export with four spaces as the indent, only for the sake of this docblock. + * // The default indent is a tab character. + * $indent = ' '; + * echo $smilies->precomputed_php_source_table( $indent ); + * + * // Output, to be pasted into a PHP source file: + * WP_Token_Map::from_precomputed_table( + * array( + * "storage_version" => "6.6.0", + * "key_length" => 2, + * "groups" => "", + * "long_words" => array(), + * "small_words" => "8O\x00:)\x00:(\x00:?\x00", + * "small_mappings" => array( "😯", "🙂", "🙁", "😕" ) + * ) + * ); + * + * ## Large vs. small words. + * + * This class uses a short prefix called the "key" to optimize lookup of its tokens. + * This means that some tokens may be shorter than or equal in length to that key. + * Those words that are longer than the key are called "large" while those shorter + * than or equal to the key length are called "small." + * + * This separation of large and small words is incidental to the way this class + * optimizes lookup, and should be considered an internal implementation detail + * of the class. It may still be important to be aware of it, however. + * + * ## Determining Key Length. + * + * The choice of the size of the key length should be based on the data being stored in + * the token map. It should divide the data as evenly as possible, but should not create + * so many groups that a large fraction of the groups only contain a single token. + * + * For the HTML5 named character references, a key length of 2 was found to provide a + * sufficient spread and should be a good default for relatively large sets of tokens. + * + * However, for some data sets this might be too long. For example, a list of smilies + * may be too small for a key length of 2. Perhaps 1 would be more appropriate. It's + * best to experiment and determine empirically which values are appropriate. + * + * ## Generate Pre-Computed Source Code. + * + * Since the `WP_Token_Map` is designed for relatively static lookups, it can be + * advantageous to precompute the values and instantiate a table that has already + * sorted and grouped the tokens and built the lookup strings. + * + * This can be done with `WP_Token_Map::precomputed_php_source_table()`. + * + * Note that if there is a leading character that all tokens need, such as `&` for + * HTML named character references, it can be beneficial to exclude this from the + * token map. Instead, find occurrences of the leading character and then use the + * token map to see if the following characters complete the token. + * + * Example: + * + * $map = WP_Token_Map::from_array( array( 'simple_smile:' => '🙂', 'sob:' => '😭', 'soba:' => '🍜' ) ); + * echo $map->precomputed_php_source_table(); + * // Output + * WP_Token_Map::from_precomputed_table( + * array( + * "storage_version" => "6.6.0", + * "key_length" => 2, + * "groups" => "si\x00so\x00", + * "long_words" => array( + * // simple_smile:[🙂]. + * "\x0bmple_smile:\x04🙂", + * // soba:[🍜] sob:[😭]. + * "\x03ba:\x04🍜\x02b:\x04😭", + * ), + * "short_words" => "", + * "short_mappings" => array() + * } + * ); + * + * This precomputed value can be stored directly in source code and will skip the + * startup cost of generating the lookup strings. See `$html5_named_character_entities`. + * + * Note that any updates to the precomputed format should update the storage version + * constant. It would also be best to provide an update function to take older known + * versions and upgrade them in place when loading into `from_precomputed_table()`. + * + * ## Future Direction. + * + * It may be viable to dynamically increase the length limits such that there's no need to impose them. + * The limit appears because of the packing structure, which indicates how many bytes each segment of + * text in the lookup tables spans. If, however, care were taken to track the longest word length, then + * the packing structure could change its representation to allow for that. Each additional byte storing + * length, however, increases the memory overhead and lookup runtime. + * + * An alternative approach could be to borrow the UTF-8 variable-length encoding and store lengths of less + * than 127 as a single byte with the high bit unset, storing longer lengths as the combination of + * continuation bytes. + * + * Since it has not been shown during the development of this class that longer strings are required, this + * update is deferred until such a need is clear. + * + * @since 6.6.0 + */ +class Gutenberg_Token_Map_6_6 { + /** + * Denotes the version of the code which produces pre-computed source tables. + * + * This version will be used not only to verify pre-computed data, but also + * to upgrade pre-computed data from older versions. Choosing a name that + * corresponds to the WordPress release will help people identify where an + * old copy of data came from. + */ + const STORAGE_VERSION = '6.6.0-trunk'; + + /** + * Maximum length for each key and each transformed value in the table (in bytes). + * + * @since 6.6.0 + */ + const MAX_LENGTH = 256; + + /** + * How many bytes of each key are used to form a group key for lookup. + * This also determines whether a word is considered short or long. + * + * @since 6.6.0 + * + * @var int + */ + private $key_length = 2; + + /** + * Stores an optimized form of the word set, where words are grouped + * by a prefix of the `$key_length` and then collapsed into a string. + * + * In each group, the keys and lookups form a packed data structure. + * The keys in the string are stripped of their "group key," which is + * the prefix of length `$this->key_length` shared by all of the items + * in the group. Each word in the string is prefixed by a single byte + * whose raw unsigned integer value represents how many bytes follow. + * + * ┌────────────────┬───────────────┬─────────────────┬────────┐ + * │ Length of rest │ Rest of key │ Length of value │ Value │ + * │ of key (bytes) │ │ (bytes) │ │ + * ├────────────────┼───────────────┼─────────────────┼────────┤ + * │ 0x08 │ nterDot; │ 0x02 │ · │ + * └────────────────┴───────────────┴─────────────────┴────────┘ + * + * In this example, the key `CenterDot;` has a group key `Ce`, leaving + * eight bytes for the rest of the key, `nterDot;`, and two bytes for + * the transformed value `·` (or U+B7 or "\xC2\xB7"). + * + * Example: + * + * // Stores array( 'CenterDot;' => '·', 'Cedilla;' => '¸' ). + * $groups = "Ce\x00"; + * $large_words = array( "\x08nterDot;\x02·\x06dilla;\x02¸" ) + * + * The prefixes appear in the `$groups` string, each followed by a null + * byte. This makes for quick lookup of where in the group string the key + * is found, and then a simple division converts that offset into the index + * in the `$large_words` array where the group string is to be found. + * + * This lookup data structure is designed to optimize cache locality and + * minimize indirect memory reads when matching strings in the set. + * + * @since 6.6.0 + * + * @var array + */ + private $large_words = array(); + + /** + * Stores the group keys for sequential string lookup. + * + * The offset into this string where the group key appears corresponds with the index + * into the group array where the rest of the group string appears. This is an optimization + * to improve cache locality while searching and minimize indirect memory accesses. + * + * @since 6.6.0 + * + * @var string + */ + private $groups = ''; + + /** + * Stores an optimized row of small words, where every entry is + * `$this->key_size + 1` bytes long and zero-extended. + * + * This packing allows for direct lookup of a short word followed + * by the null byte, if extended to `$this->key_size + 1`. + * + * Example: + * + * // Stores array( 'GT', 'LT', 'gt', 'lt' ). + * "GT\x00LT\x00gt\x00lt\x00" + * + * @since 6.6.0 + * + * @var string + */ + private $small_words = ''; + + /** + * Replacements for the small words, in the same order they appear. + * + * With the position of a small word it's possible to index the translation + * directly, as its position in the `$small_words` string corresponds to + * the index of the replacement in the `$small_mapping` array. + * + * Example: + * + * array( '>', '<', '>', '<' ) + * + * @since 6.6.0 + * + * @var string[] + */ + private $small_mappings = array(); + + /** + * Create a token map using an associative array of key/value pairs as the input. + * + * Example: + * + * $smilies = WP_Token_Map::from_array( array( + * '8O' => '😯', + * ':(' => '🙁', + * ':)' => '🙂', + * ':?' => '😕', + * ) ); + * + * @since 6.6.0 + * + * @param array $mappings The keys transform into the values, both are strings. + * @param int $key_length Determines the group key length. Leave at the default value + * of 2 unless there's an empirical reason to change it. + * + * @return WP_Token_Map|null Token map, unless unable to create it. + */ + public static function from_array( $mappings, $key_length = 2 ) { + $map = new static(); + $map->key_length = $key_length; + + // Start by grouping words. + + $groups = array(); + $shorts = array(); + foreach ( $mappings as $word => $mapping ) { + if ( + self::MAX_LENGTH <= strlen( $word ) || + self::MAX_LENGTH <= strlen( $mapping ) + ) { + _doing_it_wrong( + __METHOD__, + sprintf( + /* translators: 1: maximum byte length (a count) */ + __( 'Token Map tokens and substitutions must all be shorter than %1$d bytes.' ), + self::MAX_LENGTH + ), + '6.6.0' + ); + return null; + } + + $length = strlen( $word ); + + if ( $key_length >= $length ) { + $shorts[] = $word; + } else { + $group = substr( $word, 0, $key_length ); + + if ( ! isset( $groups[ $group ] ) ) { + $groups[ $group ] = array(); + } + + $groups[ $group ][] = array( substr( $word, $key_length ), $mapping ); + } + } + + /* + * Sort the words to ensure that no smaller substring of a match masks the full match. + * For example, `Cap` should not match before `CapitalDifferentialD`. + */ + usort( $shorts, 'static::longest_first_then_alphabetical' ); + foreach ( $groups as $group_key => $group ) { + usort( + $groups[ $group_key ], + static function ( $a, $b ) { + return self::longest_first_then_alphabetical( $a[0], $b[0] ); + } + ); + } + + // Finally construct the optimized lookups. + + foreach ( $shorts as $word ) { + $map->small_words .= str_pad( $word, $key_length + 1, "\x00", STR_PAD_RIGHT ); + $map->small_mappings[] = $mappings[ $word ]; + } + + $group_keys = array_keys( $groups ); + sort( $group_keys ); + + foreach ( $group_keys as $group ) { + $map->groups .= "{$group}\x00"; + + $group_string = ''; + + foreach ( $groups[ $group ] as $group_word ) { + list( $word, $mapping ) = $group_word; + + $word_length = pack( 'C', strlen( $word ) ); + $mapping_length = pack( 'C', strlen( $mapping ) ); + $group_string .= "{$word_length}{$word}{$mapping_length}{$mapping}"; + } + + $map->large_words[] = $group_string; + } + + return $map; + } + + /** + * Creates a token map from a pre-computed table. + * This skips the initialization cost of generating the table. + * + * This function should only be used to load data created with + * WP_Token_Map::precomputed_php_source_tag(). + * + * @since 6.6.0 + * + * @param array $state { + * Stores pre-computed state for directly loading into a Token Map. + * + * @type string $storage_version Which version of the code produced this state. + * @type int $key_length Group key length. + * @type string $groups Group lookup index. + * @type array $large_words Large word groups and packed strings. + * @type string $small_words Small words packed string. + * @type array $small_mappings Small word mappings. + * } + * + * @return WP_Token_Map Map with precomputed data loaded. + */ + public static function from_precomputed_table( $state ) { + $has_necessary_state = isset( + $state['storage_version'], + $state['key_length'], + $state['groups'], + $state['large_words'], + $state['small_words'], + $state['small_mappings'] + ); + + if ( ! $has_necessary_state ) { + _doing_it_wrong( + __METHOD__, + __( 'Missing required inputs to pre-computed WP_Token_Map.' ), + '6.6.0' + ); + return null; + } + + if ( self::STORAGE_VERSION !== $state['storage_version'] ) { + _doing_it_wrong( + __METHOD__, + /* translators: 1: version string, 2: version string. */ + sprintf( __( 'Loaded version \'%1$s\' incompatible with expected version \'%2$s\'.' ), $state['storage_version'], self::STORAGE_VERSION ), + '6.6.0' + ); + return null; + } + + $map = new static(); + + $map->key_length = $state['key_length']; + $map->groups = $state['groups']; + $map->large_words = $state['large_words']; + $map->small_words = $state['small_words']; + $map->small_mappings = $state['small_mappings']; + + return $map; + } + + /** + * Indicates if a given word is a lookup key in the map. + * + * Example: + * + * true === $smilies->contains( ':)' ); + * false === $smilies->contains( 'simile' ); + * + * @since 6.6.0 + * + * @param string $word Determine if this word is a lookup key in the map. + * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. + * @return bool Whether there's an entry for the given word in the map. + */ + public function contains( $word, $case_sensitivity = 'case-sensitive' ) { + $ignore_case = 'ascii-case-insensitive' === $case_sensitivity; + + if ( $this->key_length >= strlen( $word ) ) { + if ( 0 === strlen( $this->small_words ) ) { + return false; + } + + $term = str_pad( $word, $this->key_length + 1, "\x00", STR_PAD_RIGHT ); + $word_at = $ignore_case ? stripos( $this->small_words, $term ) : strpos( $this->small_words, $term ); + if ( false === $word_at ) { + return false; + } + + return true; + } + + $group_key = substr( $word, 0, $this->key_length ); + $group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key ); + if ( false === $group_at ) { + return false; + } + $group = $this->large_words[ $group_at / ( $this->key_length + 1 ) ]; + $group_length = strlen( $group ); + $slug = substr( $word, $this->key_length ); + $length = strlen( $slug ); + $at = 0; + + while ( $at < $group_length ) { + $token_length = unpack( 'C', $group[ $at++ ] )[1]; + $token_at = $at; + $at += $token_length; + $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; + $mapping_at = $at; + + if ( $token_length === $length && 0 === substr_compare( $group, $slug, $token_at, $token_length, $ignore_case ) ) { + return true; + } + + $at = $mapping_at + $mapping_length; + } + + return false; + } + + /** + * If the text starting at a given offset is a lookup key in the map, + * return the corresponding transformation from the map, else `false`. + * + * This function returns the translated string, but accepts an optional + * parameter `$matched_token_byte_length`, which communicates how many + * bytes long the lookup key was, if it found one. This can be used to + * advance a cursor in calling code if a lookup key was found. + * + * Example: + * + * false === $smilies->read_token( 'Not sure :?.', 0, $token_byte_length ); + * '😕' === $smilies->read_token( 'Not sure :?.', 9, $token_byte_length ); + * 2 === $token_byte_length; + * + * Example: + * + * while ( $at < strlen( $input ) ) { + * $next_at = strpos( $input, ':', $at ); + * if ( false === $next_at ) { + * break; + * } + * + * $smily = $smilies->read_token( $input, $next_at, $token_byte_length ); + * if ( false === $next_at ) { + * ++$at; + * continue; + * } + * + * $prefix = substr( $input, $at, $next_at - $at ); + * $at += $token_byte_length; + * $output .= "{$prefix}{$smily}"; + * } + * + * @since 6.6.0 + * + * @param string $text String in which to search for a lookup key. + * @param int $offset Optional. How many bytes into the string where the lookup key ought to start. Default 0. + * @param ?int &$matched_token_byte_length Optional. Holds byte-length of found token matched, otherwise not set. Default null. + * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. + * @return string|null Mapped value of lookup key if found, otherwise `null`. + */ + public function read_token( $text, $offset = 0, &$matched_token_byte_length = null, $case_sensitivity = 'case-sensitive' ) { + $ignore_case = 'ascii-case-insensitive' === $case_sensitivity; + $text_length = strlen( $text ); + + // Search for a long word first, if the text is long enough, and if that fails, a short one. + if ( $text_length > $this->key_length ) { + $group_key = substr( $text, $offset, $this->key_length ); + + $group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key ); + if ( false === $group_at ) { + // Perhaps a short word then. + return strlen( $this->small_words ) > 0 + ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity ) + : null; + } + + $group = $this->large_words[ $group_at / ( $this->key_length + 1 ) ]; + $group_length = strlen( $group ); + $at = 0; + while ( $at < $group_length ) { + $token_length = unpack( 'C', $group[ $at++ ] )[1]; + $token = substr( $group, $at, $token_length ); + $at += $token_length; + $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; + $mapping_at = $at; + + if ( 0 === substr_compare( $text, $token, $offset + $this->key_length, $token_length, $ignore_case ) ) { + $matched_token_byte_length = $this->key_length + $token_length; + return substr( $group, $mapping_at, $mapping_length ); + } + + $at = $mapping_at + $mapping_length; + } + } + + // Perhaps a short word then. + return strlen( $this->small_words ) > 0 + ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity ) + : null; + } + + /** + * Finds a match for a short word at the index. + * + * @since 6.6.0. + * + * @param string $text String in which to search for a lookup key. + * @param int $offset Optional. How many bytes into the string where the lookup key ought to start. Default 0. + * @param ?int &$matched_token_byte_length Optional. Holds byte-length of found lookup key if matched, otherwise not set. Default null. + * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. + * @return string|null Mapped value of lookup key if found, otherwise `null`. + */ + private function read_small_token( $text, $offset, &$matched_token_byte_length, $case_sensitivity = 'case-sensitive' ) { + $ignore_case = 'ascii-case-insensitive' === $case_sensitivity; + $small_length = strlen( $this->small_words ); + $search_text = substr( $text, $offset, $this->key_length ); + if ( $ignore_case ) { + $search_text = strtoupper( $search_text ); + } + $starting_char = $search_text[0]; + + $at = 0; + while ( $at < $small_length ) { + if ( + $starting_char !== $this->small_words[ $at ] && + ( ! $ignore_case || strtoupper( $this->small_words[ $at ] ) !== $starting_char ) + ) { + $at += $this->key_length + 1; + continue; + } + + for ( $adjust = 1; $adjust < $this->key_length; $adjust++ ) { + if ( "\x00" === $this->small_words[ $at + $adjust ] ) { + $matched_token_byte_length = $adjust; + return $this->small_mappings[ $at / ( $this->key_length + 1 ) ]; + } + + if ( + $search_text[ $adjust ] !== $this->small_words[ $at + $adjust ] && + ( ! $ignore_case || strtoupper( $this->small_words[ $at + $adjust ] !== $search_text[ $adjust ] ) ) + ) { + $at += $this->key_length + 1; + continue 2; + } + } + + $matched_token_byte_length = $adjust; + return $this->small_mappings[ $at / ( $this->key_length + 1 ) ]; + } + + return null; + } + + /** + * Exports the token map into an associate array of key/value pairs. + * + * Example: + * + * $smilies->to_array() === array( + * '8O' => '😯', + * ':(' => '🙁', + * ':)' => '🙂', + * ':?' => '😕', + * ); + * + * @return array The lookup key/substitution values as an associate array. + */ + public function to_array() { + $tokens = array(); + + $at = 0; + $small_mapping = 0; + $small_length = strlen( $this->small_words ); + while ( $at < $small_length ) { + $key = rtrim( substr( $this->small_words, $at, $this->key_length + 1 ), "\x00" ); + $value = $this->small_mappings[ $small_mapping++ ]; + $tokens[ $key ] = $value; + + $at += $this->key_length + 1; + } + + foreach ( $this->large_words as $index => $group ) { + $prefix = substr( $this->groups, $index * ( $this->key_length + 1 ), 2 ); + $group_length = strlen( $group ); + $at = 0; + while ( $at < $group_length ) { + $length = unpack( 'C', $group[ $at++ ] )[1]; + $key = $prefix . substr( $group, $at, $length ); + + $at += $length; + $length = unpack( 'C', $group[ $at++ ] )[1]; + $value = substr( $group, $at, $length ); + + $tokens[ $key ] = $value; + $at += $length; + } + } + + return $tokens; + } + + /** + * Export the token map for quick loading in PHP source code. + * + * This function has a specific purpose, to make loading of static token maps fast. + * It's used to ensure that the HTML character reference lookups add a minimal cost + * to initializing the PHP process. + * + * Example: + * + * echo $smilies->precomputed_php_source_table(); + * + * // Output. + * WP_Token_Map::from_precomputed_table( + * array( + * "storage_version" => "6.6.0", + * "key_length" => 2, + * "groups" => "", + * "long_words" => array(), + * "small_words" => "8O\x00:)\x00:(\x00:?\x00", + * "small_mappings" => array( "😯", "🙂", "🙁", "😕" ) + * ) + * ); + * + * @since 6.6.0 + * + * @param string $indent Optional. Use this string for indentation, or rely on the default horizontal tab character. Default "\t". + * @return string Value which can be pasted into a PHP source file for quick loading of table. + */ + public function precomputed_php_source_table( $indent = "\t" ) { + $i1 = $indent; + $i2 = $i1 . $indent; + $i3 = $i2 . $indent; + + $class_version = self::STORAGE_VERSION; + + $output = self::class . "::from_precomputed_table(\n"; + $output .= "{$i1}array(\n"; + $output .= "{$i2}\"storage_version\" => \"{$class_version}\",\n"; + $output .= "{$i2}\"key_length\" => {$this->key_length},\n"; + + $group_line = str_replace( "\x00", "\\x00", $this->groups ); + $output .= "{$i2}\"groups\" => \"{$group_line}\",\n"; + + $output .= "{$i2}\"large_words\" => array(\n"; + + $prefixes = explode( "\x00", $this->groups ); + foreach ( $prefixes as $index => $prefix ) { + if ( '' === $prefix ) { + break; + } + $group = $this->large_words[ $index ]; + $group_length = strlen( $group ); + $comment_line = "{$i3}//"; + $data_line = "{$i3}\""; + $at = 0; + while ( $at < $group_length ) { + $token_length = unpack( 'C', $group[ $at++ ] )[1]; + $token = substr( $group, $at, $token_length ); + $at += $token_length; + $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; + $mapping = substr( $group, $at, $mapping_length ); + $at += $mapping_length; + + $token_digits = str_pad( dechex( $token_length ), 2, '0', STR_PAD_LEFT ); + $mapping_digits = str_pad( dechex( $mapping_length ), 2, '0', STR_PAD_LEFT ); + + $mapping = preg_replace_callback( + "~[\\x00-\\x1f\\x22\\x5c]~", + static function ( $match_result ) { + switch ( $match_result[0] ) { + case '"': + return '\\"'; + + case '\\': + return '\\\\'; + + default: + $hex = dechex( ord( $match_result[0] ) ); + return "\\x{$hex}"; + } + }, + $mapping + ); + + $comment_line .= " {$prefix}{$token}[{$mapping}]"; + $data_line .= "\\x{$token_digits}{$token}\\x{$mapping_digits}{$mapping}"; + } + $comment_line .= ".\n"; + $data_line .= "\",\n"; + + $output .= $comment_line; + $output .= $data_line; + } + + $output .= "{$i2}),\n"; + + $small_words = array(); + $small_length = strlen( $this->small_words ); + $at = 0; + while ( $at < $small_length ) { + $small_words[] = substr( $this->small_words, $at, $this->key_length + 1 ); + $at += $this->key_length + 1; + } + + $small_text = str_replace( "\x00", '\x00', implode( '', $small_words ) ); + $output .= "{$i2}\"small_words\" => \"{$small_text}\",\n"; + + $output .= "{$i2}\"small_mappings\" => array(\n"; + foreach ( $this->small_mappings as $mapping ) { + $output .= "{$i3}\"{$mapping}\",\n"; + } + $output .= "{$i2})\n"; + $output .= "{$i1})\n"; + $output .= ')'; + + return $output; + } + + /** + * Compares two strings, returning the longest, or whichever + * is first alphabetically if they are the same length. + * + * This is an important sort when building the token map because + * it should not form a match on a substring of a longer potential + * match. For example, it should not detect `Cap` when matching + * against the string `CapitalDifferentialD`. + * + * @since 6.6.0 + * + * @param string $a First string to compare. + * @param string $b Second string to compare. + * @return int -1 or lower if `$a` is less than `$b`; 1 or greater if `$a` is greater than `$b`, and 0 if they are equal. + */ + private static function longest_first_then_alphabetical( $a, $b ) { + if ( $a === $b ) { + return 0; + } + + $length_a = strlen( $a ); + $length_b = strlen( $b ); + + // Longer strings are less-than for comparison's sake. + if ( $length_a !== $length_b ) { + return $length_b - $length_a; + } + + return strcmp( $a, $b ); + } +} diff --git a/lib/compat/wordpress-6.6/html-api/class-gutenberg-html-decoder-6-6.php b/lib/compat/wordpress-6.6/html-api/class-gutenberg-html-decoder-6-6.php new file mode 100644 index 00000000000000..2999921be52259 --- /dev/null +++ b/lib/compat/wordpress-6.6/html-api/class-gutenberg-html-decoder-6-6.php @@ -0,0 +1,461 @@ += $end ) { + break; + } + + $character_reference = self::read_character_reference( $context, $text, $next_character_reference_at, $token_length ); + if ( isset( $character_reference ) ) { + $at = $next_character_reference_at; + $decoded .= substr( $text, $was_at, $at - $was_at ); + $decoded .= $character_reference; + $at += $token_length; + $was_at = $at; + continue; + } + + ++$at; + } + + if ( 0 === $was_at ) { + return $text; + } + + if ( $was_at < $end ) { + $decoded .= substr( $text, $was_at, $end - $was_at ); + } + + return $decoded; + } + + /** + * Attempt to read a character reference at the given location in a given string, + * depending on the context in which it's found. + * + * If a character reference is found, this function will return the translated value + * that the reference maps to. It will then set `$match_byte_length` the + * number of bytes of input it read while consuming the character reference. This + * gives calling code the opportunity to advance its cursor when traversing a string + * and decoding. + * + * Example: + * + * null === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships…', 0 ); + * '…' === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships…', 5, $token_length ); + * 8 === $token_length; // `…` + * + * null === WP_HTML_Decoder::read_character_reference( 'attribute', '¬in', 0 ); + * '∉' === WP_HTML_Decoder::read_character_reference( 'attribute', '∉', 0, $token_length ); + * 7 === $token_length; // `∉` + * + * '¬' === WP_HTML_Decoder::read_character_reference( 'data', '¬in', 0, $token_length ); + * 4 === $token_length; // `¬` + * '∉' === WP_HTML_Decoder::read_character_reference( 'data', '∉', 0, $token_length ); + * 7 === $token_length; // `∉` + * + * @since 6.6.0 + * + * @param string $context `attribute` for decoding attribute values, `data` otherwise. + * @param string $text Text document containing span of text to decode. + * @param int $at Optional. Byte offset into text where span begins, defaults to the beginning (0). + * @param int &$match_byte_length Optional. Set to byte-length of character reference if provided and if a match + * is found, otherwise not set. Default null. + * @return string|false Decoded character reference in UTF-8 if found, otherwise `false`. + */ + public static function read_character_reference( $context, $text, $at = 0, &$match_byte_length = null ) { + /** + * Mappings for HTML5 named character references. + * + * @var WP_Token_Map $html5_named_character_references + */ + global $html5_named_character_references; + + $length = strlen( $text ); + if ( $at + 1 >= $length ) { + return null; + } + + if ( '&' !== $text[ $at ] ) { + return null; + } + + /* + * Numeric character references. + * + * When truncated, these will encode the code point found by parsing the + * digits that are available. For example, when `🅰` is truncated + * to `DZ` it will encode `DZ`. It does not: + * - know how to parse the original `🅰`. + * - fail to parse and return plaintext `DZ`. + * - fail to parse and return the replacement character `�` + */ + if ( '#' === $text[ $at + 1 ] ) { + if ( $at + 2 >= $length ) { + return null; + } + + /** Tracks inner parsing within the numeric character reference. */ + $digits_at = $at + 2; + + if ( 'x' === $text[ $digits_at ] || 'X' === $text[ $digits_at ] ) { + $numeric_base = 16; + $numeric_digits = '0123456789abcdefABCDEF'; + $max_digits = 6; // 􏿿 + ++$digits_at; + } else { + $numeric_base = 10; + $numeric_digits = '0123456789'; + $max_digits = 7; // 􏿿 + } + + // Cannot encode invalid Unicode code points. Max is to U+10FFFF. + $zero_count = strspn( $text, '0', $digits_at ); + $digit_count = strspn( $text, $numeric_digits, $digits_at + $zero_count ); + $after_digits = $digits_at + $zero_count + $digit_count; + $has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ]; + $end_of_span = $has_semicolon ? $after_digits + 1 : $after_digits; + + // `&#` or `&#x` without digits returns into plaintext. + if ( 0 === $digit_count && 0 === $zero_count ) { + return null; + } + + // Whereas `&#` and only zeros is invalid. + if ( 0 === $digit_count ) { + $match_byte_length = $end_of_span - $at; + return '�'; + } + + // If there are too many digits then it's not worth parsing. It's invalid. + if ( $digit_count > $max_digits ) { + $match_byte_length = $end_of_span - $at; + return '�'; + } + + $digits = substr( $text, $digits_at + $zero_count, $digit_count ); + $code_point = intval( $digits, $numeric_base ); + + /* + * Noncharacters, 0x0D, and non-ASCII-whitespace control characters. + * + * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF, + * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, + * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, + * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, + * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, + * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF. + * + * A C0 control is a code point that is in the range of U+00 to U+1F, + * but ASCII whitespace includes U+09, U+0A, U+0C, and U+0D. + * + * These characters are invalid but still decode as any valid character. + * This comment is here to note and explain why there's no check to + * remove these characters or replace them. + * + * @see https://infra.spec.whatwg.org/#noncharacter + */ + + /* + * Code points in the C1 controls area need to be remapped as if they + * were stored in Windows-1252. Note! This transformation only happens + * for numeric character references. The raw code points in the byte + * stream are not translated. + * + * > If the number is one of the numbers in the first column of + * > the following table, then find the row with that number in + * > the first column, and set the character reference code to + * > the number in the second column of that row. + */ + if ( $code_point >= 0x80 && $code_point <= 0x9F ) { + $windows_1252_mapping = array( + 0x20AC, // 0x80 -> EURO SIGN (€). + 0x81, // 0x81 -> (no change). + 0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚). + 0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ). + 0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („). + 0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…). + 0x2020, // 0x86 -> DAGGER (†). + 0x2021, // 0x87 -> DOUBLE DAGGER (‡). + 0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ). + 0x2030, // 0x89 -> PER MILLE SIGN (‰). + 0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š). + 0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹). + 0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ). + 0x8D, // 0x8D -> (no change). + 0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž). + 0x8F, // 0x8F -> (no change). + 0x90, // 0x90 -> (no change). + 0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘). + 0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’). + 0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“). + 0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”). + 0x2022, // 0x95 -> BULLET (•). + 0x2013, // 0x96 -> EN DASH (–). + 0x2014, // 0x97 -> EM DASH (—). + 0x02DC, // 0x98 -> SMALL TILDE (˜). + 0x2122, // 0x99 -> TRADE MARK SIGN (™). + 0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š). + 0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›). + 0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ). + 0x9D, // 0x9D -> (no change). + 0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž). + 0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ). + ); + + $code_point = $windows_1252_mapping[ $code_point - 0x80 ]; + } + + $match_byte_length = $end_of_span - $at; + return self::code_point_to_utf8_bytes( $code_point ); + } + + /** Tracks inner parsing within the named character reference. */ + $name_at = $at + 1; + // Minimum named character reference is two characters. E.g. `GT`. + if ( $name_at + 2 > $length ) { + return null; + } + + $name_length = 0; + $replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length ); + if ( false === $replacement ) { + return null; + } + + $after_name = $name_at + $name_length; + + // If the match ended with a semicolon then it should always be decoded. + if ( ';' === $text[ $name_at + $name_length - 1 ] ) { + $match_byte_length = $after_name - $at; + return $replacement; + } + + /* + * At this point though there's a match for an entry in the named + * character reference table but the match doesn't end in `;`. + * It may be allowed if it's followed by something unambiguous. + */ + $ambiguous_follower = ( + $after_name < $length && + $name_at < $length && + ( + ctype_alnum( $text[ $after_name ] ) || + '=' === $text[ $after_name ] + ) + ); + + // It's non-ambiguous, safe to leave it in. + if ( ! $ambiguous_follower ) { + $match_byte_length = $after_name - $at; + return $replacement; + } + + // It's ambiguous, which isn't allowed inside attributes. + if ( 'attribute' === $context ) { + return null; + } + + $match_byte_length = $after_name - $at; + return $replacement; + } + + /** + * Encode a code point number into the UTF-8 encoding. + * + * This encoder implements the UTF-8 encoding algorithm for converting + * a code point into a byte sequence. If it receives an invalid code + * point it will return the Unicode Replacement Character U+FFFD `�`. + * + * Example: + * + * '🅰' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0x1f170 ); + * + * // Half of a surrogate pair is an invalid code point. + * '�' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0xd83c ); + * + * @since 6.6.0 + * + * @see https://www.rfc-editor.org/rfc/rfc3629 For the UTF-8 standard. + * + * @param int $code_point Which code point to convert. + * @return string Converted code point, or `�` if invalid. + */ + public static function code_point_to_utf8_bytes( $code_point ) { + // Pre-check to ensure a valid code point. + if ( + $code_point <= 0 || + ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) || + $code_point > 0x10FFFF + ) { + return '�'; + } + + if ( $code_point <= 0x7F ) { + return chr( $code_point ); + } + + if ( $code_point <= 0x7FF ) { + $byte1 = ( $code_point >> 6 ) | 0xC0; + $byte2 = $code_point & 0x3F | 0x80; + + return pack( 'CC', $byte1, $byte2 ); + } + + if ( $code_point <= 0xFFFF ) { + $byte1 = ( $code_point >> 12 ) | 0xE0; + $byte2 = ( $code_point >> 6 ) & 0x3F | 0x80; + $byte3 = $code_point & 0x3F | 0x80; + + return pack( 'CCC', $byte1, $byte2, $byte3 ); + } + + // Any values above U+10FFFF are eliminated above in the pre-check. + $byte1 = ( $code_point >> 18 ) | 0xF0; + $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80; + $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80; + $byte4 = $code_point & 0x3F | 0x80; + + return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 ); + } +} diff --git a/lib/compat/wordpress-6.6/html-api/class-gutenberg-html-open-elements-6-6.php b/lib/compat/wordpress-6.6/html-api/class-gutenberg-html-open-elements-6-6.php new file mode 100644 index 00000000000000..da237e02bc63ba --- /dev/null +++ b/lib/compat/wordpress-6.6/html-api/class-gutenberg-html-open-elements-6-6.php @@ -0,0 +1,541 @@ + Initially, the stack of open elements is empty. The stack grows + * > downwards; the topmost node on the stack is the first one added + * > to the stack, and the bottommost node of the stack is the most + * > recently added node in the stack (notwithstanding when the stack + * > is manipulated in a random access fashion as part of the handling + * > for misnested tags). + * + * @since 6.4.0 + * + * @access private + * + * @see https://html.spec.whatwg.org/#stack-of-open-elements + * @see WP_HTML_Processor + */ +class Gutenberg_HTML_Open_Elements_6_6 { + /** + * Holds the stack of open element references. + * + * @since 6.4.0 + * + * @var WP_HTML_Token[] + */ + public $stack = array(); + + /** + * Whether a P element is in button scope currently. + * + * This class optimizes scope lookup by pre-calculating + * this value when elements are added and removed to the + * stack of open elements which might change its value. + * This avoids frequent iteration over the stack. + * + * @since 6.4.0 + * + * @var bool + */ + private $has_p_in_button_scope = false; + + /** + * A function that will be called when an item is popped off the stack of open elements. + * + * The function will be called with the popped item as its argument. + * + * @since 6.6.0 + * + * @var Closure + */ + private $pop_handler = null; + + /** + * A function that will be called when an item is pushed onto the stack of open elements. + * + * The function will be called with the pushed item as its argument. + * + * @since 6.6.0 + * + * @var Closure + */ + private $push_handler = null; + + /** + * Sets a pop handler that will be called when an item is popped off the stack of + * open elements. + * + * The function will be called with the pushed item as its argument. + * + * @since 6.6.0 + * + * @param Closure $handler The handler function. + */ + public function set_pop_handler( Closure $handler ) { + $this->pop_handler = $handler; + } + + /** + * Sets a push handler that will be called when an item is pushed onto the stack of + * open elements. + * + * The function will be called with the pushed item as its argument. + * + * @since 6.6.0 + * + * @param Closure $handler The handler function. + */ + public function set_push_handler( Closure $handler ) { + $this->push_handler = $handler; + } + + /** + * Reports if a specific node is in the stack of open elements. + * + * @since 6.4.0 + * + * @param WP_HTML_Token $token Look for this node in the stack. + * @return bool Whether the referenced node is in the stack of open elements. + */ + public function contains_node( $token ) { + foreach ( $this->walk_up() as $item ) { + if ( $token->bookmark_name === $item->bookmark_name ) { + return true; + } + } + + return false; + } + + /** + * Returns how many nodes are currently in the stack of open elements. + * + * @since 6.4.0 + * + * @return int How many node are in the stack of open elements. + */ + public function count() { + return count( $this->stack ); + } + + /** + * Returns the node at the end of the stack of open elements, + * if one exists. If the stack is empty, returns null. + * + * @since 6.4.0 + * + * @return WP_HTML_Token|null Last node in the stack of open elements, if one exists, otherwise null. + */ + public function current_node() { + $current_node = end( $this->stack ); + + return $current_node ? $current_node : null; + } + + /** + * Returns whether an element is in a specific scope. + * + * ## HTML Support + * + * This function skips checking for the termination list because there + * are no supported elements which appear in the termination list. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-the-specific-scope + * + * @param string $tag_name Name of tag check. + * @param string[] $termination_list List of elements that terminate the search. + * @return bool Whether the element was found in a specific scope. + */ + public function has_element_in_specific_scope( $tag_name, $termination_list ) { + foreach ( $this->walk_up() as $node ) { + if ( $node->node_name === $tag_name ) { + return true; + } + + if ( + '(internal: H1 through H6 - do not use)' === $tag_name && + in_array( $node->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true ) + ) { + return true; + } + + switch ( $node->node_name ) { + case 'HTML': + return false; + } + + if ( in_array( $node->node_name, $termination_list, true ) ) { + return false; + } + } + + return false; + } + + /** + * Returns whether a particular element is in scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-scope + * + * @param string $tag_name Name of tag to check. + * @return bool Whether given element is in scope. + */ + public function has_element_in_scope( $tag_name ) { + return $this->has_element_in_specific_scope( + $tag_name, + array( + + /* + * Because it's not currently possible to encounter + * one of the termination elements, they don't need + * to be listed here. If they were, they would be + * unreachable and only waste CPU cycles while + * scanning through HTML. + */ + ) + ); + } + + /** + * Returns whether a particular element is in list item scope. + * + * @since 6.4.0 + * @since 6.5.0 Implemented: no longer throws on every invocation. + * + * @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope + * + * @param string $tag_name Name of tag to check. + * @return bool Whether given element is in scope. + */ + public function has_element_in_list_item_scope( $tag_name ) { + return $this->has_element_in_specific_scope( + $tag_name, + array( + // There are more elements that belong here which aren't currently supported. + 'OL', + 'UL', + ) + ); + } + + /** + * Returns whether a particular element is in button scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-button-scope + * + * @param string $tag_name Name of tag to check. + * @return bool Whether given element is in scope. + */ + public function has_element_in_button_scope( $tag_name ) { + return $this->has_element_in_specific_scope( $tag_name, array( 'BUTTON' ) ); + } + + /** + * Returns whether a particular element is in table scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-table-scope + * + * @throws WP_HTML_Unsupported_Exception Always until this function is implemented. + * + * @param string $tag_name Name of tag to check. + * @return bool Whether given element is in scope. + */ + public function has_element_in_table_scope( $tag_name ) { + throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on table scope.' ); + + return false; // The linter requires this unreachable code until the function is implemented and can return. + } + + /** + * Returns whether a particular element is in select scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-select-scope + * + * @throws WP_HTML_Unsupported_Exception Always until this function is implemented. + * + * @param string $tag_name Name of tag to check. + * @return bool Whether given element is in scope. + */ + public function has_element_in_select_scope( $tag_name ) { + throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on select scope.' ); + + return false; // The linter requires this unreachable code until the function is implemented and can return. + } + + /** + * Returns whether a P is in BUTTON scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-button-scope + * + * @return bool Whether a P is in BUTTON scope. + */ + public function has_p_in_button_scope() { + return $this->has_p_in_button_scope; + } + + /** + * Pops a node off of the stack of open elements. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#stack-of-open-elements + * + * @return bool Whether a node was popped off of the stack. + */ + public function pop() { + $item = array_pop( $this->stack ); + if ( null === $item ) { + return false; + } + + if ( 'context-node' === $item->bookmark_name ) { + $this->stack[] = $item; + return false; + } + + $this->after_element_pop( $item ); + return true; + } + + /** + * Pops nodes off of the stack of open elements until one with the given tag name has been popped. + * + * @since 6.4.0 + * + * @see WP_HTML_Open_Elements::pop + * + * @param string $tag_name Name of tag that needs to be popped off of the stack of open elements. + * @return bool Whether a tag of the given name was found and popped off of the stack of open elements. + */ + public function pop_until( $tag_name ) { + foreach ( $this->walk_up() as $item ) { + if ( 'context-node' === $item->bookmark_name ) { + return true; + } + + $this->pop(); + + if ( + '(internal: H1 through H6 - do not use)' === $tag_name && + in_array( $item->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true ) + ) { + return true; + } + + if ( $tag_name === $item->node_name ) { + return true; + } + } + + return false; + } + + /** + * Pushes a node onto the stack of open elements. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#stack-of-open-elements + * + * @param WP_HTML_Token $stack_item Item to add onto stack. + */ + public function push( $stack_item ) { + $this->stack[] = $stack_item; + $this->after_element_push( $stack_item ); + } + + /** + * Removes a specific node from the stack of open elements. + * + * @since 6.4.0 + * + * @param WP_HTML_Token $token The node to remove from the stack of open elements. + * @return bool Whether the node was found and removed from the stack of open elements. + */ + public function remove_node( $token ) { + if ( 'context-node' === $token->bookmark_name ) { + return false; + } + + foreach ( $this->walk_up() as $position_from_end => $item ) { + if ( $token->bookmark_name !== $item->bookmark_name ) { + continue; + } + + $position_from_start = $this->count() - $position_from_end - 1; + array_splice( $this->stack, $position_from_start, 1 ); + $this->after_element_pop( $item ); + return true; + } + + return false; + } + + + /** + * Steps through the stack of open elements, starting with the top element + * (added first) and walking downwards to the one added last. + * + * This generator function is designed to be used inside a "foreach" loop. + * + * Example: + * + * $html = 'We are here'; + * foreach ( $stack->walk_down() as $node ) { + * echo "{$node->node_name} -> "; + * } + * > EM -> STRONG -> A -> + * + * To start with the most-recently added element and walk towards the top, + * see WP_HTML_Open_Elements::walk_up(). + * + * @since 6.4.0 + */ + public function walk_down() { + $count = count( $this->stack ); + + for ( $i = 0; $i < $count; $i++ ) { + yield $this->stack[ $i ]; + } + } + + /** + * Steps through the stack of open elements, starting with the bottom element + * (added last) and walking upwards to the one added first. + * + * This generator function is designed to be used inside a "foreach" loop. + * + * Example: + * + * $html = 'We are here'; + * foreach ( $stack->walk_up() as $node ) { + * echo "{$node->node_name} -> "; + * } + * > A -> STRONG -> EM -> + * + * To start with the first added element and walk towards the bottom, + * see WP_HTML_Open_Elements::walk_down(). + * + * @since 6.4.0 + * @since 6.5.0 Accepts $above_this_node to start traversal above a given node, if it exists. + * + * @param ?WP_HTML_Token $above_this_node Start traversing above this node, if provided and if the node exists. + */ + public function walk_up( $above_this_node = null ) { + $has_found_node = null === $above_this_node; + + for ( $i = count( $this->stack ) - 1; $i >= 0; $i-- ) { + $node = $this->stack[ $i ]; + + if ( ! $has_found_node ) { + $has_found_node = $node === $above_this_node; + continue; + } + + yield $node; + } + } + + /* + * Internal helpers. + */ + + /** + * Updates internal flags after adding an element. + * + * Certain conditions (such as "has_p_in_button_scope") are maintained here as + * flags that are only modified when adding and removing elements. This allows + * the HTML Processor to quickly check for these conditions instead of iterating + * over the open stack elements upon each new tag it encounters. These flags, + * however, need to be maintained as items are added and removed from the stack. + * + * @since 6.4.0 + * + * @param WP_HTML_Token $item Element that was added to the stack of open elements. + */ + public function after_element_push( $item ) { + /* + * When adding support for new elements, expand this switch to trap + * cases where the precalculated value needs to change. + */ + switch ( $item->node_name ) { + case 'BUTTON': + $this->has_p_in_button_scope = false; + break; + + case 'P': + $this->has_p_in_button_scope = true; + break; + } + + if ( null !== $this->push_handler ) { + ( $this->push_handler )( $item ); + } + } + + /** + * Updates internal flags after removing an element. + * + * Certain conditions (such as "has_p_in_button_scope") are maintained here as + * flags that are only modified when adding and removing elements. This allows + * the HTML Processor to quickly check for these conditions instead of iterating + * over the open stack elements upon each new tag it encounters. These flags, + * however, need to be maintained as items are added and removed from the stack. + * + * @since 6.4.0 + * + * @param WP_HTML_Token $item Element that was removed from the stack of open elements. + */ + public function after_element_pop( $item ) { + /* + * When adding support for new elements, expand this switch to trap + * cases where the precalculated value needs to change. + */ + switch ( $item->node_name ) { + case 'BUTTON': + $this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' ); + break; + + case 'P': + $this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' ); + break; + } + + if ( null !== $this->pop_handler ) { + ( $this->pop_handler )( $item ); + } + } + + /** + * Wakeup magic method. + * + * @since 6.6.0 + */ + public function __wakeup() { + throw new \LogicException( __CLASS__ . ' should never be unserialized' ); + } +} diff --git a/lib/compat/wordpress-6.6/html-api/class-gutenberg-html-processor-6-6.php b/lib/compat/wordpress-6.6/html-api/class-gutenberg-html-processor-6-6.php new file mode 100644 index 00000000000000..634aaab01707cd --- /dev/null +++ b/lib/compat/wordpress-6.6/html-api/class-gutenberg-html-processor-6-6.php @@ -0,0 +1,2472 @@ +next_tag( array( 'breadcrumbs' => array( 'DIV', 'FIGURE', 'IMG' ) ) ) ) { + * $processor->add_class( 'responsive-image' ); + * } + * + * #### Breadcrumbs + * + * Breadcrumbs represent the stack of open elements from the root + * of the document or fragment down to the currently-matched node, + * if one is currently selected. Call WP_HTML_Processor::get_breadcrumbs() + * to inspect the breadcrumbs for a matched tag. + * + * Breadcrumbs can specify nested HTML structure and are equivalent + * to a CSS selector comprising tag names separated by the child + * combinator, such as "DIV > FIGURE > IMG". + * + * Since all elements find themselves inside a full HTML document + * when parsed, the return value from `get_breadcrumbs()` will always + * contain any implicit outermost elements. For example, when parsing + * with `create_fragment()` in the `BODY` context (the default), any + * tag in the given HTML document will contain `array( 'HTML', 'BODY', … )` + * in its breadcrumbs. + * + * Despite containing the implied outermost elements in their breadcrumbs, + * tags may be found with the shortest-matching breadcrumb query. That is, + * `array( 'IMG' )` matches all IMG elements and `array( 'P', 'IMG' )` + * matches all IMG elements directly inside a P element. To ensure that no + * partial matches erroneously match it's possible to specify in a query + * the full breadcrumb match all the way down from the root HTML element. + * + * Example: + * + * $html = '
A lovely day outside
'; + * // ----- Matches here. + * $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'IMG' ) ) ); + * + * $html = '
A lovely day outside
'; + * // ---- Matches here. + * $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'FIGCAPTION', 'EM' ) ) ); + * + * $html = '
'; + * // ----- Matches here, because IMG must be a direct child of the implicit BODY. + * $processor->next_tag( array( 'breadcrumbs' => array( 'BODY', 'IMG' ) ) ); + * + * ## HTML Support + * + * This class implements a small part of the HTML5 specification. + * It's designed to operate within its support and abort early whenever + * encountering circumstances it can't properly handle. This is + * the principle way in which this class remains as simple as possible + * without cutting corners and breaking compliance. + * + * ### Supported elements + * + * If any unsupported element appears in the HTML input the HTML Processor + * will abort early and stop all processing. This draconian measure ensures + * that the HTML Processor won't break any HTML it doesn't fully understand. + * + * The following list specifies the HTML tags that _are_ supported: + * + * - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY. + * - Custom elements: All custom elements are supported. :) + * - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, PROGRESS, SEARCH. + * - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR. + * - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP. + * - Links: A. + * - Lists: DD, DL, DT, LI, OL, UL. + * - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO. + * - Paragraph: BR, P. + * - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR. + * - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION. + * - Templating elements: SLOT. + * - Text decoration: RUBY. + * - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER. + * + * ### Supported markup + * + * Some kinds of non-normative HTML involve reconstruction of formatting elements and + * re-parenting of mis-nested elements. For example, a DIV tag found inside a TABLE + * may in fact belong _before_ the table in the DOM. If the HTML Processor encounters + * such a case it will stop processing. + * + * The following list specifies HTML markup that _is_ supported: + * + * - Markup involving only those tags listed above. + * - Fully-balanced and non-overlapping tags. + * - HTML with unexpected tag closers. + * - Some unbalanced or overlapping tags. + * - P tags after unclosed P tags. + * - BUTTON tags after unclosed BUTTON tags. + * - A tags after unclosed A tags that don't involve any active formatting elements. + * + * @since 6.4.0 + * + * @see WP_HTML_Tag_Processor + * @see https://html.spec.whatwg.org/ + */ +class Gutenberg_HTML_Processor_6_6 extends Gutenberg_HTML_Tag_Processor_6_6 { + /** + * The maximum number of bookmarks allowed to exist at any given time. + * + * HTML processing requires more bookmarks than basic tag processing, + * so this class constant from the Tag Processor is overwritten. + * + * @since 6.4.0 + * + * @var int + */ + const MAX_BOOKMARKS = 100; + + /** + * Holds the working state of the parser, including the stack of + * open elements and the stack of active formatting elements. + * + * Initialized in the constructor. + * + * @since 6.4.0 + * + * @var WP_HTML_Processor_State + */ + private $state = null; + + /** + * Used to create unique bookmark names. + * + * This class sets a bookmark for every tag in the HTML document that it encounters. + * The bookmark name is auto-generated and increments, starting with `1`. These are + * internal bookmarks and are automatically released when the referring WP_HTML_Token + * goes out of scope and is garbage-collected. + * + * @since 6.4.0 + * + * @see WP_HTML_Processor::$release_internal_bookmark_on_destruct + * + * @var int + */ + private $bookmark_counter = 0; + + /** + * Stores an explanation for why something failed, if it did. + * + * @see self::get_last_error + * + * @since 6.4.0 + * + * @var string|null + */ + private $last_error = null; + + /** + * Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance. + * + * This function is created inside the class constructor so that it can be passed to + * the stack of open elements and the stack of active formatting elements without + * exposing it as a public method on the class. + * + * @since 6.4.0 + * + * @var closure + */ + private $release_internal_bookmark_on_destruct = null; + + /** + * Stores stack events which arise during parsing of the + * HTML document, which will then supply the "match" events. + * + * @since 6.6.0 + * + * @var WP_HTML_Stack_Event[] + */ + private $element_queue = array(); + + /** + * Current stack event, if set, representing a matched token. + * + * Because the parser may internally point to a place further along in a document + * than the nodes which have already been processed (some "virtual" nodes may have + * appeared while scanning the HTML document), this will point at the "current" node + * being processed. It comes from the front of the element queue. + * + * @since 6.6.0 + * + * @var ?WP_HTML_Stack_Event + */ + private $current_element = null; + + /** + * Context node if created as a fragment parser. + * + * @var ?WP_HTML_Token + */ + private $context_node = null; + + /** + * Whether the parser has yet processed the context node, + * if created as a fragment parser. + * + * The context node will be initially pushed onto the stack of open elements, + * but when created as a fragment parser, this context element (and the implicit + * HTML document node above it) should not be exposed as a matched token or node. + * + * This boolean indicates whether the processor should skip over the current + * node in its initial search for the first node created from the input HTML. + * + * @var bool + */ + private $has_seen_context_node = false; + + /* + * Public Interface Functions + */ + + /** + * Creates an HTML processor in the fragment parsing mode. + * + * Use this for cases where you are processing chunks of HTML that + * will be found within a bigger HTML document, such as rendered + * block output that exists within a post, `the_content` inside a + * rendered site layout. + * + * Fragment parsing occurs within a context, which is an HTML element + * that the document will eventually be placed in. It becomes important + * when special elements have different rules than others, such as inside + * a TEXTAREA or a TITLE tag where things that look like tags are text, + * or inside a SCRIPT tag where things that look like HTML syntax are JS. + * + * The context value should be a representation of the tag into which the + * HTML is found. For most cases this will be the body element. The HTML + * form is provided because a context element may have attributes that + * impact the parse, such as with a SCRIPT tag and its `type` attribute. + * + * ## Current HTML Support + * + * - The only supported context is ``, which is the default value. + * - The only supported document encoding is `UTF-8`, which is the default value. + * + * @since 6.4.0 + * @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances. + * + * @param string $html Input HTML fragment to process. + * @param string $context Context element for the fragment, must be default of ``. + * @param string $encoding Text encoding of the document; must be default of 'UTF-8'. + * @return static|null The created processor if successful, otherwise null. + */ + public static function create_fragment( $html, $context = '', $encoding = 'UTF-8' ) { + if ( '' !== $context || 'UTF-8' !== $encoding ) { + return null; + } + + $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); + $processor->state->context_node = array( 'BODY', array() ); + $processor->state->insertion_mode = Gutenberg_HTML_Processor_State_6_6::INSERTION_MODE_IN_BODY; + + // @todo Create "fake" bookmarks for non-existent but implied nodes. + $processor->bookmarks['root-node'] = new Gutenberg_HTML_Span_6_5( 0, 0 ); + $processor->bookmarks['context-node'] = new Gutenberg_HTML_Span_6_5( 0, 0 ); + + $processor->state->stack_of_open_elements->push( + new WP_HTML_Token( + 'root-node', + 'HTML', + false + ) + ); + + $context_node = new WP_HTML_Token( + 'context-node', + $processor->state->context_node[0], + false + ); + + $processor->state->stack_of_open_elements->push( $context_node ); + $processor->context_node = $context_node; + + return $processor; + } + + /** + * Constructor. + * + * Do not use this method. Use the static creator methods instead. + * + * @access private + * + * @since 6.4.0 + * + * @see WP_HTML_Processor::create_fragment() + * + * @param string $html HTML to process. + * @param string|null $use_the_static_create_methods_instead This constructor should not be called manually. + */ + public function __construct( $html, $use_the_static_create_methods_instead = null ) { + parent::__construct( $html ); + + if ( self::CONSTRUCTOR_UNLOCK_CODE !== $use_the_static_create_methods_instead ) { + _doing_it_wrong( + __METHOD__, + sprintf( + /* translators: %s: WP_HTML_Processor::create_fragment(). */ + __( 'Call %s to create an HTML Processor instead of calling the constructor directly.' ), + 'WP_HTML_Processor::create_fragment()' + ), + '6.4.0' + ); + } + + $this->state = new Gutenberg_HTML_Processor_State_6_6(); + + $this->state->stack_of_open_elements->set_push_handler( + function ( WP_HTML_Token $token ) { + $is_virtual = ! isset( $this->state->current_token ) || $this->is_tag_closer(); + $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; + $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; + $this->element_queue[] = new Gutenberg_HTML_Stack_Event_6_6( $token, Gutenberg_HTML_Stack_Event_6_6::PUSH, $provenance ); + } + ); + + $this->state->stack_of_open_elements->set_pop_handler( + function ( WP_HTML_Token $token ) { + $is_virtual = ! isset( $this->state->current_token ) || ! $this->is_tag_closer(); + $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; + $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; + $this->element_queue[] = new Gutenberg_HTML_Stack_Event_6_6( $token, Gutenberg_HTML_Stack_Event_6_6::POP, $provenance ); + } + ); + + /* + * Create this wrapper so that it's possible to pass + * a private method into WP_HTML_Token classes without + * exposing it to any public API. + */ + $this->release_internal_bookmark_on_destruct = function ( $name ) { + parent::release_bookmark( $name ); + }; + } + + /** + * Returns the last error, if any. + * + * Various situations lead to parsing failure but this class will + * return `false` in all those cases. To determine why something + * failed it's possible to request the last error. This can be + * helpful to know to distinguish whether a given tag couldn't + * be found or if content in the document caused the processor + * to give up and abort processing. + * + * Example + * + * $processor = WP_HTML_Processor::create_fragment( '