diff --git a/src/wp-includes/html-api/class-wp-html-attribute-token.php b/src/wp-includes/html-api/class-wp-html-attribute-token.php new file mode 100644 index 0000000000000..2c52164a979f0 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-attribute-token.php @@ -0,0 +1,89 @@ +name = $name; + $this->value_starts_at = $value_start; + $this->value_length = $value_length; + $this->start = $start; + $this->end = $end; + $this->is_true = $is_true; + } +} diff --git a/src/wp-includes/html-api/class-wp-html-span.php b/src/wp-includes/html-api/class-wp-html-span.php new file mode 100644 index 0000000000000..d92778cd3a222 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-span.php @@ -0,0 +1,52 @@ +start = $start; + $this->end = $end; + } +} diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php new file mode 100644 index 0000000000000..bbc98ba7219e4 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -0,0 +1,2264 @@ + "c" not " c". + * This would increase the size of the changes for some operations but leave more + * natural-looking output HTML. + * - Decode HTML character references within class names when matching. E.g. match having + * class `1<"2` needs to recognize `class="1<"2"`. Currently the Tag Processor + * will fail to find the right tag if the class name is encoded as such. + * - Properly decode HTML character references in `get_attribute()`. PHP's + * `html_entity_decode()` is wrong in a couple ways: it doesn't account for the + * no-ambiguous-ampersand rule, and it improperly handles the way semicolons may + * or may not terminate a character reference. + * + * @package WordPress + * @subpackage HTML-API + * @since 6.2.0 + */ + +/** + * Modifies attributes in an HTML document for tags matching a query. + * + * ## Usage + * + * Use of this class requires three steps: + * + * 1. Create a new class instance with your input HTML document. + * 2. Find the tag(s) you are looking for. + * 3. Request changes to the attributes in those tag(s). + * + * Example: + * ```php + * $tags = new WP_HTML_Tag_Processor( $html ); + * if ( $tags->next_tag( [ 'tag_name' => 'option' ] ) ) { + * $tags->set_attribute( 'selected', true ); + * } + * ``` + * + * ### Finding tags + * + * The `next_tag()` function moves the internal cursor through + * your input HTML document until it finds a tag meeting any of + * the supplied restrictions in the optional query argument. If + * no argument is provided then it will find the next HTML tag, + * regardless of what kind it is. + * + * If you want to _find whatever the next tag is_: + * ```php + * $tags->next_tag(); + * ``` + * + * | Goal | Query | + * |-----------------------------------------------------------|----------------------------------------------------------------------------| + * | Find any tag. | `$tags->next_tag();` | + * | Find next image tag. | `$tags->next_tag( [ 'tag_name' => 'img' ] );` | + * | Find next tag containing the `fullwidth` CSS class. | `$tags->next_tag( [ 'class_name' => 'fullwidth' ] );` | + * | Find next image tag containing the `fullwidth` CSS class. | `$tags->next_tag( [ 'tag_name' => 'img', 'class_name' => 'fullwidth' ] );` | + * + * If a tag was found meeting your criteria then `next_tag()` + * will return `true` and you can proceed to modify it. If it + * returns `false`, however, it failed to find the tag and + * moved the cursor to the end of the file. + * + * Once the cursor reaches the end of the file the processor + * is done and if you want to reach an earlier tag you will + * need to recreate the processor and start over, as it's + * unable to back up or move in reverse. + * + * See the section on bookmarks for an exception to this + * no-backing-up rule. + * + * #### Custom queries + * + * Sometimes it's necessary to further inspect an HTML tag than + * the query syntax here permits. In these cases one may further + * inspect the search results using the read-only functions + * provided by the processor or external state or variables. + * + * Example: + * ```php + * // Paint up to the first five DIV or SPAN tags marked with the "jazzy" style. + * $remaining_count = 5; + * while ( $remaining_count > 0 && $tags->next_tag() ) { + * if ( + * ( 'DIV' === $tags->get_tag() || 'SPAN' === $tags->get_tag() ) && + * 'jazzy' === $tags->get_attribute( 'data-style' ) + * ) { + * $tags->add_class( 'theme-style-everest-jazz' ); + * $remaining_count--; + * } + * } + * ``` + * + * `get_attribute()` will return `null` if the attribute wasn't present + * on the tag when it was called. It may return `""` (the empty string) + * in cases where the attribute was present but its value was empty. + * For boolean attributes, those whose name is present but no value is + * given, it will return `true` (the only way to set `false` for an + * attribute is to remove it). + * + * ### Modifying HTML attributes for a found tag + * + * Once you've found the start of an opening tag you can modify + * any number of the attributes on that tag. You can set a new + * value for an attribute, remove the entire attribute, or do + * nothing and move on to the next opening tag. + * + * Example: + * ```php + * if ( $tags->next_tag( [ 'class' => 'wp-group-block' ] ) ) { + * $tags->set_attribute( 'title', 'This groups the contained content.' ); + * $tags->remove_attribute( 'data-test-id' ); + * } + * ``` + * + * If `set_attribute()` is called for an existing attribute it will + * overwrite the existing value. Similarly, calling `remove_attribute()` + * for a non-existing attribute has no effect on the document. Both + * of these methods are safe to call without knowing if a given attribute + * exists beforehand. + * + * ### Modifying CSS classes for a found tag + * + * The tag processor treats the `class` attribute as a special case. + * Because it's a common operation to add or remove CSS classes, this + * interface adds helper methods to make that easier. + * + * As with attribute values, adding or removing CSS classes is a safe + * operation that doesn't require checking if the attribute or class + * exists before making changes. If removing the only class then the + * entire `class` attribute will be removed. + * + * Example: + * ```php + * // from `Yippee!` + * // to `Yippee!` + * $tags->add_class( 'is-active' ); + * + * // from `Yippee!` + * // to `Yippee!` + * $tags->add_class( 'is-active' ); + * + * // from `Yippee!` + * // to `Yippee!` + * $tags->add_class( 'is-active' ); + * + * // from `` + * // to ` + * $tags->remove_class( 'rugby' ); + * + * // from `` + * // to ` + * $tags->remove_class( 'rugby' ); + * + * // from `` + * // to ` + * $tags->remove_class( 'rugby' ); + * ``` + * + * When class changes are enqueued but a direct change to `class` is made via + * `set_attribute` then the changes to `set_attribute` (or `remove_attribute`) + * will take precedence over those made through `add_class` and `remove_class`. + * + * ### Bookmarks + * + * While scanning through the input HTMl document it's possible to set + * a named bookmark when a particular tag is found. Later on, after + * continuing to scan other tags, it's possible to `seek` to one of + * the set bookmarks and then proceed again from that point forward. + * + * Because bookmarks create processing overhead one should avoid + * creating too many of them. As a rule, create only bookmarks + * of known string literal names; avoid creating "mark_{$index}" + * and so on. It's fine from a performance standpoint to create a + * bookmark and update it frequently, such as within a loop. + * + * ```php + * $total_todos = 0; + * while ( $p->next_tag( [ 'tag_name' => 'UL', 'class_name' => 'todo' ] ) ) { + * $p->set_bookmark( 'list-start' ); + * while ( $p->next_tag( [ 'tag_closers' => 'visit' ] ) ) { + * if ( 'UL' === $p->get_tag() && $p->is_tag_closer() ) { + * $p->set_bookmark( 'list-end' ); + * $p->seek( 'list-start' ); + * $p->set_attribute( 'data-contained-todos', (string) $total_todos ); + * $total_todos = 0; + * $p->seek( 'list-end' ); + * break; + * } + * + * if ( 'LI' === $p->get_tag() && ! $p->is_tag_closer() ) { + * $total_todos++; + * } + * } + * } + * ``` + * + * ## Design and limitations + * + * The Tag Processor is designed to linearly scan HTML documents and tokenize + * HTML tags and their attributes. It's designed to do this as efficiently as + * possible without compromising parsing integrity. Therefore it will be + * slower than some methods of modifying HTML, such as those incorporating + * over-simplified PCRE patterns, but will not introduce the defects and + * failures that those methods bring in, which lead to broken page renders + * and often to security vulnerabilities. On the other hand, it will be faster + * than full-blown HTML parsers such as DOMDocument and use considerably + * less memory. It requires a negligible memory overhead, enough to consider + * it a zero-overhead system. + * + * The performance characteristics are maintained by avoiding tree construction + * and semantic cleanups which are specified in HTML5. Because of this, for + * example, it's not possible for the Tag Processor to associate any given + * opening tag with its corresponding closing tag, or to return the inner markup + * inside an element. Systems may be built on top of the Tag Processor to do + * this, but the Tag Processor is and should be constrained so it can remain an + * efficient, low-level, and reliable HTML scanner. + * + * The Tag Processor's design incorporates a "garbage-in-garbage-out" philosophy. + * HTML5 specifies that certain invalid content be transformed into different forms + * for display, such as removing null bytes from an input document and replacing + * invalid characters with the Unicode replacement character U+FFFD �. Where errors + * or transformations exist within the HTML5 specification, the Tag Processor leaves + * those invalid inputs untouched, passing them through to the final browser to handle. + * While this implies that certain operations will be non-spec-compliant, such as + * reading the value of an attribute with invalid content, it also preserves a + * simplicity and efficiency for handling those error cases. + * + * Most operations within the Tag Processor are designed to minimize the difference + * between an input and output document for any given change. For example, the + * `add_class` and `remove_class` methods preserve whitespace and the class ordering + * within the `class` attribute; and when encountering tags with duplicated attributes, + * the Tag Processor will leave those invalid duplicate attributes where they are but + * update the proper attribute which the browser will read for parsing its value. An + * exception to this rule is that all attribute updates store their values as + * double-quoted strings, meaning that attributes on input with single-quoted or + * unquoted values will appear in the output with double-quotes. + * + * @since 6.2.0 + */ +class WP_HTML_Tag_Processor { + /** + * The maximum number of bookmarks allowed to exist at + * any given time. + * + * @see set_bookmark() + * @since 6.2.0 + * @var int + */ + const MAX_BOOKMARKS = 10; + + /** + * Maximum number of times seek() can be called. + * Prevents accidental infinite loops. + * + * @see seek() + * @since 6.2.0 + * @var int + */ + const MAX_SEEK_OPS = 1000; + + /** + * The HTML document to parse. + * + * @since 6.2.0 + * @var string + */ + private $html; + + /** + * The last query passed to next_tag(). + * + * @since 6.2.0 + * @var array|null + */ + private $last_query; + + /** + * The tag name this processor currently scans for. + * + * @since 6.2.0 + * @var string|null + */ + private $sought_tag_name; + + /** + * The CSS class name this processor currently scans for. + * + * @since 6.2.0 + * @var string|null + */ + private $sought_class_name; + + /** + * The match offset this processor currently scans for. + * + * @since 6.2.0 + * @var int|null + */ + private $sought_match_offset; + + /** + * Whether to visit tag closers, e.g. , when walking an input document. + * + * @since 6.2.0 + * @var bool + */ + private $stop_on_tag_closers; + + /** + * Holds updated HTML as updates are applied. + * + * Updates and unmodified portions of the input document are + * appended to this value as they are applied. It will hold + * a copy of the updated document up until the point of the + * latest applied update. The fully-updated HTML document + * will comprise this value plus the part of the input document + * which follows that latest update. + * + * @see $bytes_already_copied + * + * @since 6.2.0 + * @var string + */ + private $output_buffer = ''; + + /** + * How many bytes from the original HTML document have been read and parsed. + * + * This value points to the latest byte offset in the input document which + * has been already parsed. It is the internal cursor for the Tag Processor + * and updates while scanning through the HTML tokens. + * + * @since 6.2.0 + * @var int + */ + private $bytes_already_parsed = 0; + + /** + * How many bytes from the input HTML document have already been + * copied into the output buffer. + * + * Lexical updates are enqueued and processed in batches. Prior + * to any given update in the input document, there might exist + * a span of HTML unaffected by any changes. This span ought to + * be copied verbatim into the output buffer before applying the + * following update. This value will point to the starting byte + * offset in the input document where that unaffected span of + * HTML starts. + * + * @since 6.2.0 + * @var int + */ + private $bytes_already_copied = 0; + + /** + * Byte offset in input document where current tag name starts. + * + * Example: + * ``` + *
... + * 01234 + * - tag name starts at 1 + * ``` + * + * @since 6.2.0 + * @var int|null + */ + private $tag_name_starts_at; + + /** + * Byte length of current tag name. + * + * Example: + * ``` + *
... + * 01234 + * --- tag name length is 3 + * ``` + * + * @since 6.2.0 + * @var int|null + */ + private $tag_name_length; + + /** + * Byte offset in input document where current tag token ends. + * + * Example: + * ``` + *
... + * 0 1 | + * 01234567890123456 + * --- tag name ends at 14 + * ``` + * + * @since 6.2.0 + * @var int|null + */ + private $tag_ends_at; + + /** + * Whether the current tag is an opening tag, e.g.
, or a closing tag, e.g.
. + * + * @var bool + */ + private $is_closing_tag; + + /** + * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name. + * + * Example: + * ```php + * // supposing the parser is working through this content + * // and stops after recognizing the `id` attribute + * //
+ * // ^ parsing will continue from this point + * $this->attributes = [ + * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ) + * ]; + * + * // when picking up parsing again, or when asking to find the + * // `class` attribute we will continue and add to this array + * $this->attributes = [ + * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ), + * 'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 ) + * ]; + * + * // Note that only the `class` attribute value is stored in the index. + * // That's because it is the only value used by this class at the moment. + * ``` + * + * @since 6.2.0 + * @var WP_HTML_Attribute_Token[] + */ + private $attributes = array(); + + /** + * Which class names to add or remove from a tag. + * + * These are tracked separately from attribute updates because they are + * semantically distinct, whereas this interface exists for the common + * case of adding and removing class names while other attributes are + * generally modified as with DOM `setAttribute` calls. + * + * When modifying an HTML document these will eventually be collapsed + * into a single `set_attribute( 'class', $changes )` call. + * + * Example: + * ```php + * // Add the `wp-block-group` class, remove the `wp-group` class. + * $classname_updates = [ + * // Indexed by a comparable class name + * 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS, + * 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS + * ]; + * ``` + * + * @since 6.2.0 + * @var bool[] + */ + private $classname_updates = array(); + + /** + * Tracks a semantic location in the original HTML which + * shifts with updates as they are applied to the document. + * + * @since 6.2.0 + * @var WP_HTML_Span[] + */ + protected $bookmarks = array(); + + const ADD_CLASS = true; + const REMOVE_CLASS = false; + const SKIP_CLASS = null; + + /** + * Lexical replacements to apply to input HTML document. + * + * "Lexical" in this class refers to the part of this class which + * operates on pure text _as text_ and not as HTML. There's a line + * between the public interface, with HTML-semantic methods like + * `set_attribute` and `add_class`, and an internal state that tracks + * text offsets in the input document. + * + * When higher-level HTML methods are called, those have to transform their + * operations (such as setting an attribute's value) into text diffing + * operations (such as replacing the sub-string from indices A to B with + * some given new string). These text-diffing operations are the lexical + * updates. + * + * As new higher-level methods are added they need to collapse their + * operations into these lower-level lexical updates since that's the + * Tag Processor's internal language of change. Any code which creates + * these lexical updates must ensure that they do not cross HTML syntax + * boundaries, however, so these should never be exposed outside of this + * class or any classes which intentionally expand its functionality. + * + * These are enqueued while editing the document instead of being immediately + * applied to avoid processing overhead, string allocations, and string + * copies when applying many updates to a single document. + * + * Example: + * ```php + * // Replace an attribute stored with a new value, indices + * // sourced from the lazily-parsed HTML recognizer. + * $start = $attributes['src']->start; + * $end = $attributes['src']->end; + * $modifications[] = new WP_HTML_Text_Replacement( $start, $end, $new_value ); + * + * // Correspondingly, something like this will appear in this array. + * $lexical_updates = [ + * WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' ) + * ]; + * ``` + * + * @since 6.2.0 + * @var WP_HTML_Text_Replacement[] + */ + protected $lexical_updates = array(); + + /** + * Tracks and limits `seek()` calls to prevent accidental infinite loops. + * + * @see seek + * @since 6.2.0 + * @var int + */ + protected $seek_count = 0; + + /** + * Constructor. + * + * @since 6.2.0 + * + * @param string $html HTML to process. + */ + public function __construct( $html ) { + $this->html = $html; + } + + /** + * Finds the next tag matching the $query. + * + * @since 6.2.0 + * + * @param array|string|null $query { + * Optional. Which tag name to find, having which class, etc. Default is to find any tag. + * + * @type string|null $tag_name Which tag to find, or `null` for "any tag." + * @type int|null $match_offset Find the Nth tag matching all search criteria. + * 0 for "first" tag, 2 for "third," etc. + * Defaults to first tag. + * @type string|null $class_name Tag must contain this whole class name to match. + * @type string|null $tag_closers "visit" or "skip": whether to stop on tag closers, e.g.
. + * } + * @return boolean Whether a tag was matched. + */ + public function next_tag( $query = null ) { + $this->parse_query( $query ); + $already_found = 0; + + do { + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + // Find the next tag if it exists. + if ( false === $this->parse_next_tag() ) { + $this->bytes_already_parsed = strlen( $this->html ); + + return false; + } + + // Parse all of its attributes. + while ( $this->parse_next_attribute() ) { + continue; + } + + // Ensure that the tag closes before the end of the document. + $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); + if ( false === $tag_ends_at ) { + return false; + } + $this->tag_ends_at = $tag_ends_at; + $this->bytes_already_parsed = $tag_ends_at; + + // Finally, check if the parsed tag and its attributes match the search query. + if ( $this->matches() ) { + ++$already_found; + } + + /* + * For non-DATA sections which might contain text that looks like HTML tags but + * isn't, scan with the appropriate alternative mode. Looking at the first letter + * of the tag name as a pre-check avoids a string allocation when it's not needed. + */ + $t = $this->html[ $this->tag_name_starts_at ]; + if ( ! $this->is_closing_tag && ( 's' === $t || 'S' === $t || 't' === $t || 'T' === $t ) ) { + $tag_name = $this->get_tag(); + + if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { + $this->bytes_already_parsed = strlen( $this->html ); + return false; + } elseif ( + ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && + ! $this->skip_rcdata( $tag_name ) + ) { + $this->bytes_already_parsed = strlen( $this->html ); + return false; + } + } + } while ( $already_found < $this->sought_match_offset ); + + return true; + } + + + /** + * Sets a bookmark in the HTML document. + * + * Bookmarks represent specific places or tokens in the HTML + * document, such as a tag opener or closer. When applying + * edits to a document, such as setting an attribute, the + * text offsets of that token may shift; the bookmark is + * kept updated with those shifts and remains stable unless + * the entire span of text in which the token sits is removed. + * + * Release bookmarks when they are no longer needed. + * + * Example: + * ``` + *

Surprising fact you may not know!

+ * ^ ^ + * \-|-- this `H2` opener bookmark tracks the token + * + *

Surprising fact you may no… + * ^ ^ + * \-|-- it shifts with edits + * ``` + * + * Bookmarks provide the ability to seek to a previously-scanned + * place in the HTML document. This avoids the need to re-scan + * the entire document. + * + * Example: + * ``` + *
  • One
  • Two
  • Three
+ * ^^^^ + * want to note this last item + * + * $p = new WP_HTML_Tag_Processor( $html ); + * $in_list = false; + * while ( $p->next_tag( [ 'tag_closers' => $in_list ? 'visit' : 'skip' ] ) ) { + * if ( 'UL' === $p->get_tag() ) { + * if ( $p->is_tag_closer() ) { + * $in_list = false; + * $p->set_bookmark( 'resume' ); + * if ( $p->seek( 'last-li' ) ) { + * $p->add_class( 'last-li' ); + * } + * $p->seek( 'resume' ); + * $p->release_bookmark( 'last-li' ); + * $p->release_bookmark( 'resume' ); + * } else { + * $in_list = true; + * } + * } + * + * if ( 'LI' === $p->get_tag() ) { + * $p->set_bookmark( 'last-li' ); + * } + * } + * ``` + * + * Bookmarks intentionally hide the internal string offsets + * to which they refer. They are maintained internally as + * updates are applied to the HTML document and therefore + * retain their "position" - the location to which they + * originally pointed. The inability to use bookmarks with + * functions like `substr` is therefore intentional to guard + * against accidentally breaking the HTML. + * + * Because bookmarks allocate memory and require processing + * for every applied update, they are limited and require + * a name. They should not be created with programmatically-made + * names, such as "li_{$index}" with some loop. As a general + * rule they should only be created with string-literal names + * like "start-of-section" or "last-paragraph". + * + * Bookmarks are a powerful tool to enable complicated behavior. + * Consider double-checking that you need this tool if you are + * reaching for it, as inappropriate use could lead to broken + * HTML structure or unwanted processing overhead. + * + * @since 6.2.0 + * + * @param string $name Identifies this particular bookmark. + * @return bool Whether the bookmark was successfully created. + */ + public function set_bookmark( $name ) { + if ( null === $this->tag_name_starts_at ) { + return false; + } + + if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= self::MAX_BOOKMARKS ) { + _doing_it_wrong( + __METHOD__, + __( 'Too many bookmarks: cannot create any more.' ), + '6.2.0' + ); + return false; + } + + $this->bookmarks[ $name ] = new WP_HTML_Span( + $this->tag_name_starts_at - 1, + $this->tag_ends_at + ); + + return true; + } + + + /** + * Removes a bookmark that is no longer needed. + * + * Releasing a bookmark frees up the small + * performance overhead it requires. + * + * @param string $name Name of the bookmark to remove. + * @return bool Whether the bookmark already existed before removal. + */ + public function release_bookmark( $name ) { + if ( ! array_key_exists( $name, $this->bookmarks ) ) { + return false; + } + + unset( $this->bookmarks[ $name ] ); + + return true; + } + + + /** + * Skips contents of title and textarea tags. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state + * @since 6.2.0 + * + * @param string $tag_name – the lowercase tag name which will close the RCDATA region. + * @return bool Whether an end to the RCDATA region was found before the end of the document. + */ + private function skip_rcdata( $tag_name ) { + $html = $this->html; + $doc_length = strlen( $html ); + $tag_length = strlen( $tag_name ); + + $at = $this->bytes_already_parsed; + + while ( false !== $at && $at < $doc_length ) { + $at = strpos( $this->html, '= $doc_length ) { + $this->bytes_already_parsed = $doc_length; + return false; + } + + $at += 2; + + /* + * Find a case-insensitive match to the tag name. + * + * Because tag names are limited to US-ASCII there is no + * need to perform any kind of Unicode normalization when + * comparing; any character which could be impacted by such + * normalization could not be part of a tag name. + */ + for ( $i = 0; $i < $tag_length; $i++ ) { + $tag_char = $tag_name[ $i ]; + $html_char = $html[ $at + $i ]; + + if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) { + $at += $i; + continue 2; + } + } + + $at += $tag_length; + $this->bytes_already_parsed = $at; + + /* + * Ensure that the tag name terminates to avoid matching on + * substrings of a longer tag name. For example, the sequence + * "' !== $c ) { + continue; + } + + while ( $this->parse_next_attribute() ) { + continue; + } + $at = $this->bytes_already_parsed; + if ( $at >= strlen( $this->html ) ) { + return false; + } + + if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { + ++$this->bytes_already_parsed; + return true; + } + } + + return false; + } + + /** + * Skips contents of script tags. + * + * @since 6.2.0 + * + * @return bool Whether the script tag was closed before the end of the document. + */ + private function skip_script_data() { + $state = 'unescaped'; + $html = $this->html; + $doc_length = strlen( $html ); + $at = $this->bytes_already_parsed; + + while ( false !== $at && $at < $doc_length ) { + $at += strcspn( $html, '-<', $at ); + + /* + * For all script states a "-->" transitions + * back into the normal unescaped script mode, + * even if that's the current state. + */ + if ( + $at + 2 < $doc_length && + '-' === $html[ $at ] && + '-' === $html[ $at + 1 ] && + '>' === $html[ $at + 2 ] + ) { + $at += 3; + $state = 'unescaped'; + continue; + } + + // Everything of interest past here starts with "<". + if ( $at + 1 >= $doc_length || '<' !== $html[ $at++ ] ) { + continue; + } + + /* + * Unlike with "-->", the " + * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + if ( + strlen( $html ) > $at + 3 && + '-' === $html[ $at + 2 ] && + '-' === $html[ $at + 3 ] + ) { + $closer_at = strpos( $html, '-->', $at + 4 ); + if ( false === $closer_at ) { + return false; + } + + $at = $closer_at + 3; + continue; + } + + /* + * + * The CDATA is case-sensitive. + * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + if ( + strlen( $html ) > $at + 8 && + '[' === $html[ $at + 2 ] && + 'C' === $html[ $at + 3 ] && + 'D' === $html[ $at + 4 ] && + 'A' === $html[ $at + 5 ] && + 'T' === $html[ $at + 6 ] && + 'A' === $html[ $at + 7 ] && + '[' === $html[ $at + 8 ] + ) { + $closer_at = strpos( $html, ']]>', $at + 9 ); + if ( false === $closer_at ) { + return false; + } + + $at = $closer_at + 3; + continue; + } + + /* + * + * These are ASCII-case-insensitive. + * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + if ( + strlen( $html ) > $at + 8 && + ( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) && + ( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) && + ( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) && + ( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) && + ( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) && + ( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) && + ( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] ) + ) { + $closer_at = strpos( $html, '>', $at + 9 ); + if ( false === $closer_at ) { + return false; + } + + $at = $closer_at + 1; + continue; + } + + /* + * Anything else here is an incorrectly-opened comment and transitions + * to the bogus comment state - skip to the nearest >. + */ + $at = strpos( $html, '>', $at + 1 ); + continue; + } + + /* + * + * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + if ( '?' === $html[ $at + 1 ] ) { + $closer_at = strpos( $html, '>', $at + 2 ); + if ( false === $closer_at ) { + return false; + } + + $at = $closer_at + 1; + continue; + } + + ++$at; + } + + return false; + } + + /** + * Parses the next attribute. + * + * @since 6.2.0 + * + * @return bool Whether an attribute was found before the end of the document. + */ + private function parse_next_attribute() { + // Skip whitespace and slashes. + $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + /* + * Treat the equal sign as a part of the attribute + * name if it is the first encountered byte. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state + */ + $name_length = '=' === $this->html[ $this->bytes_already_parsed ] + ? 1 + strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed + 1 ) + : strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed ); + + // No attribute, just tag closer. + if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) { + return false; + } + + $attribute_start = $this->bytes_already_parsed; + $attribute_name = substr( $this->html, $attribute_start, $name_length ); + $this->bytes_already_parsed += $name_length; + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + $this->skip_whitespace(); + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + $has_value = '=' === $this->html[ $this->bytes_already_parsed ]; + if ( $has_value ) { + ++$this->bytes_already_parsed; + $this->skip_whitespace(); + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + switch ( $this->html[ $this->bytes_already_parsed ] ) { + case "'": + case '"': + $quote = $this->html[ $this->bytes_already_parsed ]; + $value_start = $this->bytes_already_parsed + 1; + $value_length = strcspn( $this->html, $quote, $value_start ); + $attribute_end = $value_start + $value_length + 1; + $this->bytes_already_parsed = $attribute_end; + break; + + default: + $value_start = $this->bytes_already_parsed; + $value_length = strcspn( $this->html, "> \t\f\r\n", $value_start ); + $attribute_end = $value_start + $value_length; + $this->bytes_already_parsed = $attribute_end; + } + } else { + $value_start = $this->bytes_already_parsed; + $value_length = 0; + $attribute_end = $attribute_start + $name_length; + } + + if ( $attribute_end >= strlen( $this->html ) ) { + return false; + } + + if ( $this->is_closing_tag ) { + return true; + } + + /* + * > There must never be two or more attributes on + * > the same start tag whose names are an ASCII + * > case-insensitive match for each other. + * - HTML 5 spec + * + * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive + */ + $comparable_name = strtolower( $attribute_name ); + + // If an attribute is listed many times, only use the first declaration and ignore the rest. + if ( ! array_key_exists( $comparable_name, $this->attributes ) ) { + $this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token( + $attribute_name, + $value_start, + $value_length, + $attribute_start, + $attribute_end, + ! $has_value + ); + } + + return true; + } + + /** + * Move the internal cursor past any immediate successive whitespace. + * + * @since 6.2.0 + * + * @return void + */ + private function skip_whitespace() { + $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n", $this->bytes_already_parsed ); + } + + /** + * Applies attribute updates and cleans up once a tag is fully parsed. + * + * @since 6.2.0 + * + * @return void + */ + private function after_tag() { + $this->class_name_updates_to_attributes_updates(); + $this->apply_attributes_updates(); + $this->tag_name_starts_at = null; + $this->tag_name_length = null; + $this->tag_ends_at = null; + $this->is_closing_tag = null; + $this->attributes = array(); + } + + /** + * Converts class name updates into tag attributes updates + * (they are accumulated in different data formats for performance). + * + * @see $lexical_updates + * @see $classname_updates + * + * @since 6.2.0 + * + * @return void + */ + private function class_name_updates_to_attributes_updates() { + if ( count( $this->classname_updates ) === 0 ) { + return; + } + + $existing_class = $this->get_enqueued_attribute_value( 'class' ); + if ( null === $existing_class || true === $existing_class ) { + $existing_class = ''; + } + + if ( false === $existing_class && isset( $this->attributes['class'] ) ) { + $existing_class = substr( + $this->html, + $this->attributes['class']->value_starts_at, + $this->attributes['class']->value_length + ); + } + + if ( false === $existing_class ) { + $existing_class = ''; + } + + /** + * Updated "class" attribute value. + * + * This is incrementally built while scanning through the existing class + * attribute, skipping removed classes on the way, and then appending + * added classes at the end. Only when finished processing will the + * value contain the final new value. + + * @var string $class + */ + $class = ''; + + /** + * Tracks the cursor position in the existing + * class attribute value while parsing. + * + * @var int $at + */ + $at = 0; + + /** + * Indicates if there's any need to modify the existing class attribute. + * + * If a call to `add_class()` and `remove_class()` wouldn't impact + * the `class` attribute value then there's no need to rebuild it. + * For example, when adding a class that's already present or + * removing one that isn't. + * + * This flag enables a performance optimization when none of the enqueued + * class updates would impact the `class` attribute; namely, that the + * processor can continue without modifying the input document, as if + * none of the `add_class()` or `remove_class()` calls had been made. + * + * This flag is set upon the first change that requires a string update. + * + * @var bool $modified + */ + $modified = false; + + // Remove unwanted classes by only copying the new ones. + $existing_class_length = strlen( $existing_class ); + while ( $at < $existing_class_length ) { + // Skip to the first non-whitespace character. + $ws_at = $at; + $ws_length = strspn( $existing_class, " \t\f\r\n", $ws_at ); + $at += $ws_length; + + // Capture the class name – it's everything until the next whitespace. + $name_length = strcspn( $existing_class, " \t\f\r\n", $at ); + if ( 0 === $name_length ) { + // If no more class names are found then that's the end. + break; + } + + $name = substr( $existing_class, $at, $name_length ); + $at += $name_length; + + // If this class is marked for removal, start processing the next one. + $remove_class = ( + isset( $this->classname_updates[ $name ] ) && + self::REMOVE_CLASS === $this->classname_updates[ $name ] + ); + + // If a class has already been seen then skip it; it should not be added twice. + if ( ! $remove_class ) { + $this->classname_updates[ $name ] = self::SKIP_CLASS; + } + + if ( $remove_class ) { + $modified = true; + continue; + } + + /* + * Otherwise, append it to the new "class" attribute value. + * + * There are options for handling whitespace between tags. + * Preserving the existing whitespace produces fewer changes + * to the HTML content and should clarify the before/after + * content when debugging the modified output. + * + * This approach contrasts normalizing the inter-class + * whitespace to a single space, which might appear cleaner + * in the output HTML but produce a noisier change. + */ + $class .= substr( $existing_class, $ws_at, $ws_length ); + $class .= $name; + } + + // Add new classes by appending those which haven't already been seen. + foreach ( $this->classname_updates as $name => $operation ) { + if ( self::ADD_CLASS === $operation ) { + $modified = true; + + $class .= strlen( $class ) > 0 ? ' ' : ''; + $class .= $name; + } + } + + $this->classname_updates = array(); + if ( ! $modified ) { + return; + } + + if ( strlen( $class ) > 0 ) { + $this->set_attribute( 'class', $class ); + } else { + $this->remove_attribute( 'class' ); + } + } + + /** + * Applies attribute updates to HTML document. + * + * @since 6.2.0 + * + * @return void + */ + private function apply_attributes_updates() { + if ( ! count( $this->lexical_updates ) ) { + return; + } + + /* + * Attribute updates can be enqueued in any order but updates + * to the document must occur in lexical order; that is, each + * replacement must be made before all others which follow it + * at later string indices in the input document. + * + * Sorting avoid making out-of-order replacements which + * can lead to mangled output, partially-duplicated + * attributes, and overwritten attributes. + */ + usort( $this->lexical_updates, array( self::class, 'sort_start_ascending' ) ); + + foreach ( $this->lexical_updates as $diff ) { + $this->output_buffer .= substr( $this->html, $this->bytes_already_copied, $diff->start - $this->bytes_already_copied ); + $this->output_buffer .= $diff->text; + $this->bytes_already_copied = $diff->end; + } + + /* + * Adjust bookmark locations to account for how the text + * replacements adjust offsets in the input document. + */ + foreach ( $this->bookmarks as $bookmark ) { + /* + * Each lexical update which appears before the bookmark's endpoints + * might shift the offsets for those endpoints. Loop through each change + * and accumulate the total shift for each bookmark, then apply that + * shift after tallying the full delta. + */ + $head_delta = 0; + $tail_delta = 0; + + foreach ( $this->lexical_updates as $diff ) { + $update_head = $bookmark->start >= $diff->start; + $update_tail = $bookmark->end >= $diff->start; + + if ( ! $update_head && ! $update_tail ) { + break; + } + + $delta = strlen( $diff->text ) - ( $diff->end - $diff->start ); + + if ( $update_head ) { + $head_delta += $delta; + } + + if ( $update_tail ) { + $tail_delta += $delta; + } + } + + $bookmark->start += $head_delta; + $bookmark->end += $tail_delta; + } + + $this->lexical_updates = array(); + } + + /** + * Move the internal cursor in the Tag Processor to a given bookmark's location. + * + * In order to prevent accidental infinite loops, there's a + * maximum limit on the number of times seek() can be called. + * + * @since 6.2.0 + * + * @param string $bookmark_name Jump to the place in the document identified by this bookmark name. + * @return bool Whether the internal cursor was successfully moved to the bookmark's location. + */ + public function seek( $bookmark_name ) { + if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) { + _doing_it_wrong( + __METHOD__, + __( 'Unknown bookmark name.' ), + '6.2.0' + ); + return false; + } + + if ( ++$this->seek_count > self::MAX_SEEK_OPS ) { + _doing_it_wrong( + __METHOD__, + __( 'Too many calls to seek() - this can lead to performance issues.' ), + '6.2.0' + ); + return false; + } + + // Flush out any pending updates to the document. + $this->get_updated_html(); + + // Point this tag processor before the sought tag opener and consume it. + $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; + $this->bytes_already_copied = $this->bytes_already_parsed; + $this->output_buffer = substr( $this->html, 0, $this->bytes_already_copied ); + return $this->next_tag(); + } + + /** + * Compare two WP_HTML_Text_Replacement objects. + * + * @since 6.2.0 + * + * @param WP_HTML_Text_Replacement $a First attribute update. + * @param WP_HTML_Text_Replacement $b Second attribute update. + * @return int Comparison value for string order. + */ + private static function sort_start_ascending( $a, $b ) { + $by_start = $a->start - $b->start; + if ( 0 !== $by_start ) { + return $by_start; + } + + $by_text = isset( $a->text, $b->text ) ? strcmp( $a->text, $b->text ) : 0; + if ( 0 !== $by_text ) { + return $by_text; + } + + /* + * This code should be unreachable, because it implies the two replacements + * start at the same location and contain the same text. + */ + return $a->end - $b->end; + } + + /** + * Return the enqueued value for a given attribute, if one exists. + * + * Enqueued updates can take different data types: + * - If an update is enqueued and is boolean, the return will be `true` + * - If an update is otherwise enqueued, the return will be the string value of that update. + * - If an attribute is enqueued to be removed, the return will be `null` to indicate that. + * - If no updates are enqueued, the return will be `false` to differentiate from "removed." + * + * @since 6.2.0 + * + * @param string $comparable_name The attribute name in its comparable form. + * @return string|boolean|null Value of enqueued update if present, otherwise false. + */ + private function get_enqueued_attribute_value( $comparable_name ) { + if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) { + return false; + } + + $enqueued_text = $this->lexical_updates[ $comparable_name ]->text; + + // Removed attributes erase the entire span. + if ( '' === $enqueued_text ) { + return null; + } + + /* + * Boolean attribute updates are just the attribute name without a corresponding value. + * + * This value might differ from the given comparable name in that there could be leading + * or trailing whitespace, and that the casing follows the name given in `set_attribute`. + * + * Example: + * ``` + * $p->set_attribute( 'data-TEST-id', 'update' ); + * 'update' === $p->get_enqueued_attribute_value( 'data-test-id' ); + * ``` + * + * Detect this difference based on the absence of the `=`, which _must_ exist in any + * attribute containing a value, e.g. ``. + * ¹ ² + * 1. Attribute with a string value. + * 2. Boolean attribute whose value is `true`. + */ + $equals_at = strpos( $enqueued_text, '=' ); + if ( false === $equals_at ) { + return true; + } + + /* + * Finally, a normal update's value will appear after the `=` and + * be double-quoted, as performed incidentally by `set_attribute`. + * + * e.g. `type="text"` + * ¹² ³ + * 1. Equals is here. + * 2. Double-quoting starts one after the equals sign. + * 3. Double-quoting ends at the last character in the update. + */ + $enqueued_value = substr( $enqueued_text, $equals_at + 2, -1 ); + return html_entity_decode( $enqueued_value ); + } + + /** + * Returns the value of a requested attribute from a matched tag opener if that attribute exists. + * + * Example: + * ```php + * $p = new WP_HTML_Tag_Processor( '
Test
' ); + * $p->next_tag( [ 'class_name' => 'test' ] ) === true; + * $p->get_attribute( 'data-test-id' ) === '14'; + * $p->get_attribute( 'enabled' ) === true; + * $p->get_attribute( 'aria-label' ) === null; + * + * $p->next_tag( [] ) === false; + * $p->get_attribute( 'class' ) === null; + * ``` + * + * @since 6.2.0 + * + * @param string $name Name of attribute whose value is requested. + * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. + */ + public function get_attribute( $name ) { + if ( null === $this->tag_name_starts_at ) { + return null; + } + + $comparable = strtolower( $name ); + + /* + * For every attribute other than `class` it's possible to perform a quick check if + * there's an enqueued lexical update whose value takes priority over what's found in + * the input document. + * + * The `class` attribute is special though because of the exposed helpers `add_class` + * and `remove_class`. These form a builder for the `class` attribute, so an additional + * check for enqueued class changes is required in addition to the check for any enqueued + * attribute values. If any exist, those enqueued class changes must first be flushed out + * into an attribute value update. + */ + if ( 'class' === $name ) { + $this->class_name_updates_to_attributes_updates(); + } + + // Return any enqueued attribute value updates if they exist. + $enqueued_value = $this->get_enqueued_attribute_value( $comparable ); + if ( false !== $enqueued_value ) { + return $enqueued_value; + } + + if ( ! isset( $this->attributes[ $comparable ] ) ) { + return null; + } + + $attribute = $this->attributes[ $comparable ]; + + /* + * This flag distinguishes an attribute with no value + * from an attribute with an empty string value. For + * unquoted attributes this could look very similar. + * It refers to whether an `=` follows the name. + * + * e.g.
+ * ¹ ² + * 1. Attribute `boolean-attribute` is `true`. + * 2. Attribute `empty-attribute` is `""`. + */ + if ( true === $attribute->is_true ) { + return true; + } + + $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); + + return html_entity_decode( $raw_value ); + } + + /** + * Gets lowercase names of all attributes matching a given prefix in the current tag. + * + * Note that matching is case-insensitive. This is in accordance with the spec: + * + * > There must never be two or more attributes on + * > the same start tag whose names are an ASCII + * > case-insensitive match for each other. + * - HTML 5 spec + * + * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive + * + * Example: + * ```php + * $p = new WP_HTML_Tag_Processor( '
Test
' ); + * $p->next_tag( [ 'class_name' => 'test' ] ) === true; + * $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' ); + * + * $p->next_tag( [] ) === false; + * $p->get_attribute_names_with_prefix( 'data-' ) === null; + * ``` + * + * @since 6.2.0 + * + * @param string $prefix Prefix of requested attribute names. + * @return array|null List of attribute names, or `null` when no tag opener is matched. + */ + function get_attribute_names_with_prefix( $prefix ) { + if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) { + return null; + } + + $comparable = strtolower( $prefix ); + + $matches = array(); + foreach ( array_keys( $this->attributes ) as $attr_name ) { + if ( str_starts_with( $attr_name, $comparable ) ) { + $matches[] = $attr_name; + } + } + return $matches; + } + + /** + * Returns the uppercase name of the matched tag. + * + * Example: + * ```php + * $p = new WP_HTML_Tag_Processor( '
Test
' ); + * $p->next_tag( [] ) === true; + * $p->get_tag() === 'DIV'; + * + * $p->next_tag( [] ) === false; + * $p->get_tag() === null; + * ``` + * + * @since 6.2.0 + * + * @return string|null Name of currently matched tag in input HTML, or `null` if none found. + */ + public function get_tag() { + if ( null === $this->tag_name_starts_at ) { + return null; + } + + $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); + + return strtoupper( $tag_name ); + } + + /** + * Indicates if the current tag token is a tag closer. + * + * Example: + * ```php + * $p = new WP_HTML_Tag_Processor( '
' ); + * $p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] ); + * $p->is_tag_closer() === false; + * + * $p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] ); + * $p->is_tag_closer() === true; + * ``` + * + * @since 6.2.0 + * + * @return bool Whether the current tag is a tag closer. + */ + public function is_tag_closer() { + return $this->is_closing_tag; + } + + /** + * Updates or creates a new attribute on the currently matched tag with the passed value. + * + * For boolean attributes special handling is provided: + * - When `true` is passed as the value, then only the attribute name is added to the tag. + * - When `false` is passed, the attribute gets removed if it existed before. + * + * For string attributes, the value is escaped using the `esc_attr` function. + * + * @since 6.2.0 + * + * @param string $name The attribute name to target. + * @param string|bool $value The new attribute value. + * @return bool Whether an attribute value was set. + */ + public function set_attribute( $name, $value ) { + if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) { + return false; + } + + /* + * WordPress rejects more characters than are strictly forbidden + * in HTML5. This is to prevent additional security risks deeper + * in the WordPress and plugin stack. Specifically the + * less-than (<) greater-than (>) and ampersand (&) aren't allowed. + * + * The use of a PCRE match enables looking for specific Unicode + * code points without writing a UTF-8 decoder. Whereas scanning + * for one-byte characters is trivial (with `strcspn`), scanning + * for the longer byte sequences would be more complicated. Given + * that this shouldn't be in the hot path for execution, it's a + * reasonable compromise in efficiency without introducing a + * noticeable impact on the overall system. + * + * @see https://html.spec.whatwg.org/#attributes-2 + * + * @TODO as the only regex pattern maybe we should take it out? are + * Unicode patterns available broadly in Core? + */ + if ( preg_match( + '~[' . + // Syntax-like characters. + '"\'>& The values "true" and "false" are not allowed on boolean attributes. + * > To represent a false value, the attribute has to be omitted altogether. + * - HTML5 spec, https://html.spec.whatwg.org/#boolean-attributes + */ + if ( false === $value ) { + return $this->remove_attribute( $name ); + } + + if ( true === $value ) { + $updated_attribute = $name; + } else { + $escaped_new_value = esc_attr( $value ); + $updated_attribute = "{$name}=\"{$escaped_new_value}\""; + } + + /* + * > There must never be two or more attributes on + * > the same start tag whose names are an ASCII + * > case-insensitive match for each other. + * - HTML 5 spec + * + * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive + */ + $comparable_name = strtolower( $name ); + + if ( isset( $this->attributes[ $comparable_name ] ) ) { + /* + * Update an existing attribute. + * + * Example – set attribute id to "new" in
: + *
+ * ^-------------^ + * start end + * replacement: `id="new"` + * + * Result:
+ */ + $existing_attribute = $this->attributes[ $comparable_name ]; + $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( + $existing_attribute->start, + $existing_attribute->end, + $updated_attribute + ); + } else { + /* + * Create a new attribute at the tag's name end. + * + * Example – add attribute id="new" to
: + *
+ * ^ + * start and end + * replacement: ` id="new"` + * + * Result:
+ */ + $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( + $this->tag_name_starts_at + $this->tag_name_length, + $this->tag_name_starts_at + $this->tag_name_length, + ' ' . $updated_attribute + ); + } + + /* + * Any calls to update the `class` attribute directly should wipe out any + * enqueued class changes from `add_class` and `remove_class`. + */ + if ( 'class' === $comparable_name && ! empty( $this->classname_updates ) ) { + $this->classname_updates = array(); + } + + return true; + } + + /** + * Remove an attribute from the currently-matched tag. + * + * @since 6.2.0 + * + * @param string $name The attribute name to remove. + * @return bool Whether an attribute was removed. + */ + public function remove_attribute( $name ) { + if ( $this->is_closing_tag ) { + return false; + } + + /* + * > There must never be two or more attributes on + * > the same start tag whose names are an ASCII + * > case-insensitive match for each other. + * - HTML 5 spec + * + * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive + */ + $name = strtolower( $name ); + + /* + * Any calls to update the `class` attribute directly should wipe out any + * enqueued class changes from `add_class` and `remove_class`. + */ + if ( 'class' === $name && count( $this->classname_updates ) !== 0 ) { + $this->classname_updates = array(); + } + + /* + * If updating an attribute that didn't exist in the input + * document, then remove the enqueued update and move on. + * + * For example, this might occur when calling `remove_attribute()` + * after calling `set_attribute()` for the same attribute + * and when that attribute wasn't originally present. + */ + if ( ! isset( $this->attributes[ $name ] ) ) { + if ( isset( $this->lexical_updates[ $name ] ) ) { + unset( $this->lexical_updates[ $name ] ); + } + return false; + } + + /* + * Removes an existing tag attribute. + * + * Example – remove the attribute id from
: + *
+ * ^-------------^ + * start end + * replacement: `` + * + * Result:
+ */ + $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( + $this->attributes[ $name ]->start, + $this->attributes[ $name ]->end, + '' + ); + + return true; + } + + /** + * Adds a new class name to the currently matched tag. + * + * @since 6.2.0 + * + * @param string $class_name The class name to add. + * @return bool Whether the class was set to be added. + */ + public function add_class( $class_name ) { + if ( $this->is_closing_tag ) { + return false; + } + + if ( null !== $this->tag_name_starts_at ) { + $this->classname_updates[ $class_name ] = self::ADD_CLASS; + } + + return true; + } + + /** + * Removes a class name from the currently matched tag. + * + * @since 6.2.0 + * + * @param string $class_name The class name to remove. + * @return bool Whether the class was set to be removed. + */ + public function remove_class( $class_name ) { + if ( $this->is_closing_tag ) { + return false; + } + + if ( null !== $this->tag_name_starts_at ) { + $this->classname_updates[ $class_name ] = self::REMOVE_CLASS; + } + + return true; + } + + /** + * Returns the string representation of the HTML Tag Processor. + * + * @since 6.2.0 + * @see get_updated_html + * + * @return string The processed HTML. + */ + public function __toString() { + return $this->get_updated_html(); + } + + /** + * Returns the string representation of the HTML Tag Processor. + * + * @since 6.2.0 + * + * @return string The processed HTML. + */ + public function get_updated_html() { + $requires_no_updating = 0 === count( $this->classname_updates ) && 0 === count( $this->lexical_updates ); + + /* + * When there is nothing more to update and nothing has already been + * updated, return the original document and avoid a string copy. + */ + if ( $requires_no_updating && 0 === $this->bytes_already_copied ) { + return $this->html; + } + + /* + * If there are no updates left to apply, but some have already + * been applied, then finish by copying the rest of the input + * to the end of the updated document and return. + */ + if ( $requires_no_updating && $this->bytes_already_copied > 0 ) { + return $this->output_buffer . substr( $this->html, $this->bytes_already_copied ); + } + + // Apply the updates, rewind to before the current tag, and reparse the attributes. + $content_up_to_opened_tag_name = $this->output_buffer . substr( + $this->html, + $this->bytes_already_copied, + $this->tag_name_starts_at + $this->tag_name_length - $this->bytes_already_copied + ); + + /* + * 1. Apply the edits by flushing them to the output buffer and updating the copied byte count. + * + * Note: `apply_attributes_updates()` modifies `$this->output_buffer`. + */ + $this->class_name_updates_to_attributes_updates(); + $this->apply_attributes_updates(); + + /* + * 2. Replace the original HTML with the now-updated HTML so that it's possible to + * seek to a previous location and have a consistent view of the updated document. + */ + $this->html = $this->output_buffer . substr( $this->html, $this->bytes_already_copied ); + $this->output_buffer = $content_up_to_opened_tag_name; + $this->bytes_already_copied = strlen( $this->output_buffer ); + + /* + * 3. Point this tag processor at the original tag opener and consume it + * + * At this point the internal cursor points to the end of the tag name. + * Rewind before the tag name starts so that it's as if the cursor didn't + * move; a call to `next_tag()` will reparse the recently-updated attributes + * and additional calls to modify the attributes will apply at this same + * location. + * + *

Previous HTMLMore HTML

+ * ^ | back up by the length of the tag name plus the opening < + * \<-/ back up by strlen("em") + 1 ==> 3 + */ + $this->bytes_already_parsed = strlen( $content_up_to_opened_tag_name ) - $this->tag_name_length - 1; + $this->next_tag(); + + return $this->html; + } + + /** + * Parses tag query input into internal search criteria. + * + * @since 6.2.0 + * + * @param array|string|null $query { + * Optional. Which tag name to find, having which class, etc. Default is to find any tag. + * + * @type string|null $tag_name Which tag to find, or `null` for "any tag." + * @type int|null $match_offset Find the Nth tag matching all search criteria. + * 0 for "first" tag, 2 for "third," etc. + * Defaults to first tag. + * @type string|null $class_name Tag must contain this class name to match. + * @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g.
. + * } + * @return void + */ + private function parse_query( $query ) { + if ( null !== $query && $query === $this->last_query ) { + return; + } + + $this->last_query = $query; + $this->sought_tag_name = null; + $this->sought_class_name = null; + $this->sought_match_offset = 1; + $this->stop_on_tag_closers = false; + + // A single string value means "find the tag of this name". + if ( is_string( $query ) ) { + $this->sought_tag_name = $query; + return; + } + + // An empty query parameter applies no restrictions on the search. + if ( null === $query ) { + return; + } + + // If not using the string interface, an associative array is required. + if ( ! is_array( $query ) ) { + _doing_it_wrong( + __METHOD__, + __( 'The query argument must be an array or a tag name.' ), + '6.2.0' + ); + return; + } + + if ( isset( $query['tag_name'] ) && is_string( $query['tag_name'] ) ) { + $this->sought_tag_name = $query['tag_name']; + } + + if ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) ) { + $this->sought_class_name = $query['class_name']; + } + + if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) { + $this->sought_match_offset = $query['match_offset']; + } + + if ( isset( $query['tag_closers'] ) ) { + $this->stop_on_tag_closers = 'visit' === $query['tag_closers']; + } + } + + + /** + * Checks whether a given tag and its attributes match the search criteria. + * + * @since 6.2.0 + * + * @return boolean Whether the given tag and its attribute match the search criteria. + */ + private function matches() { + if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) { + return false; + } + + // Does the tag name match the requested tag name in a case-insensitive manner? + if ( null !== $this->sought_tag_name ) { + /* + * String (byte) length lookup is fast. If they aren't the + * same length then they can't be the same string values. + */ + if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) { + return false; + } + + /* + * Check each character to determine if they are the same. + * Defer calls to `strtoupper()` to avoid them when possible. + * Calling `strcasecmp()` here tested slowed than comparing each + * character, so unless benchmarks show otherwise, it should + * not be used. + * + * It's expected that most of the time that this runs, a + * lower-case tag name will be supplied and the input will + * contain lower-case tag names, thus normally bypassing + * the case comparison code. + */ + for ( $i = 0; $i < $this->tag_name_length; $i++ ) { + $html_char = $this->html[ $this->tag_name_starts_at + $i ]; + $tag_char = $this->sought_tag_name[ $i ]; + + if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) { + return false; + } + } + } + + $needs_class_name = null !== $this->sought_class_name; + + if ( $needs_class_name && ! isset( $this->attributes['class'] ) ) { + return false; + } + + /* + * Match byte-for-byte (case-sensitive and encoding-form-sensitive) on the class name. + * + * This will overlook certain classes that exist in other lexical variations + * than was supplied to the search query, but requires more complicated searching. + */ + if ( $needs_class_name ) { + $class_start = $this->attributes['class']->value_starts_at; + $class_end = $class_start + $this->attributes['class']->value_length; + $class_at = $class_start; + + /* + * Ensure that boundaries surround the class name to avoid matching on + * substrings of a longer name. For example, the sequence "not-odd" + * should not match for the class "odd" even though "odd" is found + * within the class attribute text. + * + * See https://html.spec.whatwg.org/#attributes-3 + * See https://html.spec.whatwg.org/#space-separated-tokens + */ + while ( + // phpcs:ignore WordPress.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition + false !== ( $class_at = strpos( $this->html, $this->sought_class_name, $class_at ) ) && + $class_at < $class_end + ) { + /* + * Verify this class starts at a boundary. + */ + if ( $class_at > $class_start ) { + $character = $this->html[ $class_at - 1 ]; + + if ( ' ' !== $character && "\t" !== $character && "\f" !== $character && "\r" !== $character && "\n" !== $character ) { + $class_at += strlen( $this->sought_class_name ); + continue; + } + } + + /* + * Verify this class ends at a boundary as well. + */ + if ( $class_at + strlen( $this->sought_class_name ) < $class_end ) { + $character = $this->html[ $class_at + strlen( $this->sought_class_name ) ]; + + if ( ' ' !== $character && "\t" !== $character && "\f" !== $character && "\r" !== $character && "\n" !== $character ) { + $class_at += strlen( $this->sought_class_name ); + continue; + } + } + + return true; + } + + return false; + } + + return true; + } +} diff --git a/src/wp-includes/html-api/class-wp-html-text-replacement.php b/src/wp-includes/html-api/class-wp-html-text-replacement.php new file mode 100644 index 0000000000000..912b4a56a5eb4 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-text-replacement.php @@ -0,0 +1,59 @@ +start = $start; + $this->end = $end; + $this->text = $text; + } +} diff --git a/src/wp-settings.php b/src/wp-settings.php index 3ed93b2f36629..e867fc3009a9c 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -234,6 +234,10 @@ require ABSPATH . WPINC . '/class-wp-oembed-controller.php'; require ABSPATH . WPINC . '/media.php'; require ABSPATH . WPINC . '/http.php'; +require ABSPATH . WPINC . '/html-api/class-wp-html-attribute-token.php'; +require ABSPATH . WPINC . '/html-api/class-wp-html-span.php'; +require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php'; +require ABSPATH . WPINC . '/html-api/class-wp-html-tag-processor.php'; require ABSPATH . WPINC . '/class-wp-http.php'; require ABSPATH . WPINC . '/class-wp-http-streams.php'; require ABSPATH . WPINC . '/class-wp-http-curl.php'; diff --git a/tests/phpunit/tests/html/wpHtmlTagProcessor-bookmark.php b/tests/phpunit/tests/html/wpHtmlTagProcessor-bookmark.php new file mode 100644 index 0000000000000..7702ade5a48f7 --- /dev/null +++ b/tests/phpunit/tests/html/wpHtmlTagProcessor-bookmark.php @@ -0,0 +1,381 @@ +
  • One
  • Two
  • Three
  • ' ); + $p->next_tag( 'li' ); + $this->assertTrue( $p->set_bookmark( 'first li' ), 'Could not allocate a "first li" bookmark' ); + $p->next_tag( 'li' ); + $this->assertTrue( $p->set_bookmark( 'second li' ), 'Could not allocate a "second li" bookmark' ); + $this->assertTrue( $p->set_bookmark( 'first li' ), 'Could not move the "first li" bookmark' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::release_bookmark + */ + public function test_release_bookmark() { + $p = new WP_HTML_Tag_Processor( '
    • One
    • Two
    • Three
    ' ); + $p->next_tag( 'li' ); + $this->assertFalse( $p->release_bookmark( 'first li' ), 'Released a non-existing bookmark' ); + $p->set_bookmark( 'first li' ); + $this->assertTrue( $p->release_bookmark( 'first li' ), 'Could not release a bookmark' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::seek + */ + public function test_seek() { + $p = new WP_HTML_Tag_Processor( '
    • One
    • Two
    • Three
    ' ); + $p->next_tag( 'li' ); + $p->set_bookmark( 'first li' ); + + $p->next_tag( 'li' ); + $p->set_attribute( 'foo-2', 'bar-2' ); + + $p->seek( 'first li' ); + $p->set_attribute( 'foo-1', 'bar-1' ); + + $this->assertSame( + '
    • One
    • Two
    • Three
    ', + $p->get_updated_html(), + 'Did not seek to the intended bookmark locations' + ); + } + + /** + * WP_HTML_Tag_Processor used to test for the diffs affecting + * the adjusted bookmark position while simultaneously adjusting + * the bookmark in question. As a result, updating the bookmarks + * of a next tag while removing two subsequent attributes in + * a previous tag unfolded like this: + * + * 1. Check if the first removed attribute is before the bookmark: + * + * + * ^-------------------^ ^ + * diff applied here the bookmark is here + * + * (Yes it is) + * + * 2. Move the bookmark to the left by the attribute length: + * + * + * ^ + * the bookmark is here + * + * 3. Check if the second removed attribute is before the bookmark: + * + * + * ^ ^-----^ + * bookmark diff + * + * This time, it isn't! + * + * The fix in the WP_HTML_Tag_Processor involves doing all the checks + * before moving the bookmark. This test is here to guard us from + * the erroneous behavior accidentally returning one day. + * + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::seek + * @covers WP_HTML_Tag_Processor::set_bookmark + */ + public function test_removing_long_attributes_doesnt_break_seek() { + $input = << +HTML; + $p = new WP_HTML_Tag_Processor( $input ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'first' ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'second' ); + + $this->assertTrue( + $p->seek( 'first' ), + 'Seek() to the first button has failed' + ); + $p->remove_attribute( 'twenty_one_characters' ); + $p->remove_attribute( '7_chars' ); + + $this->assertTrue( + $p->seek( 'second' ), + 'Seek() to the second button has failed' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::seek + * @covers WP_HTML_Tag_Processor::set_bookmark + */ + public function test_bookmarks_complex_use_case() { + $input = << +
    +
    +
    + + + + + + + +
    +
    +
    +HTML; + $expected_output = << +
    +
    +
    + + + + + + + +
    +
    +
    +HTML; + $p = new WP_HTML_Tag_Processor( $input ); + $p->next_tag( 'div' ); + $p->next_tag( 'div' ); + $p->next_tag( 'div' ); + $p->set_bookmark( 'first div' ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'first button' ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'second button' ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'third button' ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'fourth button' ); + + $p->seek( 'first button' ); + $p->set_attribute( 'type', 'submit' ); + + $this->assertTrue( + $p->seek( 'third button' ), + 'Seek() to the third button failed' + ); + $p->remove_attribute( 'class' ); + $p->remove_attribute( 'type' ); + $p->remove_attribute( 'aria-expanded' ); + $p->set_attribute( 'id', 'rebase-and-merge' ); + $p->remove_attribute( 'data-details-container' ); + + $this->assertTrue( + $p->seek( 'first div' ), + 'Seek() to the first div failed' + ); + $p->set_attribute( 'checked', false ); + + $this->assertTrue( + $p->seek( 'fourth button' ), + 'Seek() to the fourth button failed' + ); + $p->set_attribute( 'id', 'last-button' ); + $p->remove_attribute( 'class' ); + $p->remove_attribute( 'type' ); + $p->remove_attribute( 'checked' ); + $p->remove_attribute( 'aria-label' ); + $p->remove_attribute( 'disabled' ); + $p->remove_attribute( 'data-view-component' ); + + $this->assertTrue( + $p->seek( 'second button' ), + 'Seek() to the second button failed' + ); + $p->remove_attribute( 'type' ); + $p->set_attribute( 'class', 'hx_create-pr-button' ); + + $this->assertSame( + $expected_output, + $p->get_updated_html(), + 'Performing several attribute updates on different tags does not produce the expected HTML snippet' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::seek + */ + public function test_updates_bookmark_for_additions_after_both_sides() { + $p = new WP_HTML_Tag_Processor( '
    First
    Second
    ' ); + $p->next_tag(); + $p->set_bookmark( 'first' ); + $p->next_tag(); + $p->add_class( 'second' ); + + $p->seek( 'first' ); + $p->add_class( 'first' ); + + $this->assertSame( + '
    First
    Second
    ', + $p->get_updated_html(), + 'The bookmark was updated incorrectly in response to HTML markup updates' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::seek + */ + public function test_updates_bookmark_for_additions_before_both_sides() { + $p = new WP_HTML_Tag_Processor( '
    First
    Second
    ' ); + $p->next_tag(); + $p->set_bookmark( 'first' ); + $p->next_tag(); + $p->set_bookmark( 'second' ); + + $p->seek( 'first' ); + $p->add_class( 'first' ); + + $p->seek( 'second' ); + $p->add_class( 'second' ); + + $this->assertSame( + '
    First
    Second
    ', + $p->get_updated_html(), + 'The bookmark was updated incorrectly in response to HTML markup updates' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::seek + */ + public function test_updates_bookmark_for_deletions_after_both_sides() { + $p = new WP_HTML_Tag_Processor( '
    First
    Second
    ' ); + $p->next_tag(); + $p->set_bookmark( 'first' ); + $p->next_tag(); + $p->remove_attribute( 'disabled' ); + + $p->seek( 'first' ); + $p->set_attribute( 'untouched', true ); + + $this->assertSame( + /* + * It shouldn't be necessary to assert the extra space after the tag + * following the attribute removal, but doing so makes the test easier + * to see than it would be if parsing the output HTML for proper + * validation. If the Tag Processor changes so that this space no longer + * appears then this test should be updated to reflect that. The space + * is not required. + */ + '
    First
    Second
    ', + $p->get_updated_html(), + 'The bookmark was incorrectly in response to HTML markup updates' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::seek + */ + public function test_updates_bookmark_for_deletions_before_both_sides() { + $p = new WP_HTML_Tag_Processor( '
    First
    Second
    ' ); + $p->next_tag(); + $p->set_bookmark( 'first' ); + $p->next_tag(); + $p->set_bookmark( 'second' ); + + $p->seek( 'first' ); + $p->remove_attribute( 'disabled' ); + + $p->seek( 'second' ); + $p->set_attribute( 'safe', true ); + + $this->assertSame( + /* + * It shouldn't be necessary to assert the extra space after the tag + * following the attribute removal, but doing so makes the test easier + * to see than it would be if parsing the output HTML for proper + * validation. If the Tag Processor changes so that this space no longer + * appears then this test should be updated to reflect that. The space + * is not required. + */ + '
    First
    Second
    ', + $p->get_updated_html(), + 'The bookmark was updated incorrectly in response to HTML markup updates' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_bookmark + */ + public function test_limits_the_number_of_bookmarks() { + $p = new WP_HTML_Tag_Processor( '
    • One
    • Two
    • Three
    ' ); + $p->next_tag( 'li' ); + + for ( $i = 0; $i < WP_HTML_Tag_Processor::MAX_BOOKMARKS; $i++ ) { + $this->assertTrue( $p->set_bookmark( "bookmark $i" ), "Could not allocate the bookmark #$i" ); + } + + $this->setExpectedIncorrectUsage( 'WP_HTML_Tag_Processor::set_bookmark' ); + $this->assertFalse( $p->set_bookmark( 'final bookmark' ), "Allocated $i bookmarks, which is one above the limit" ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::seek + */ + public function test_limits_the_number_of_seek_calls() { + $p = new WP_HTML_Tag_Processor( '
    • One
    • Two
    • Three
    ' ); + $p->next_tag( 'li' ); + $p->set_bookmark( 'bookmark' ); + + for ( $i = 0; $i < WP_HTML_Tag_Processor::MAX_SEEK_OPS; $i++ ) { + $this->assertTrue( $p->seek( 'bookmark' ), 'Could not seek to the "bookmark"' ); + } + + $this->setExpectedIncorrectUsage( 'WP_HTML_Tag_Processor::seek' ); + $this->assertFalse( $p->seek( 'bookmark' ), "$i-th seek() to the bookmark succeeded, even though it should exceed the allowed limit" ); + } +} diff --git a/tests/phpunit/tests/html/wpHtmlTagProcessor.php b/tests/phpunit/tests/html/wpHtmlTagProcessor.php new file mode 100644 index 0000000000000..957f90b860d5a --- /dev/null +++ b/tests/phpunit/tests/html/wpHtmlTagProcessor.php @@ -0,0 +1,2016 @@ +Text
    '; + const HTML_WITH_CLASSES = '
    Text
    '; + const HTML_MALFORMED = '
    Back to notifications
    '; + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_tag + */ + public function test_get_tag_returns_null_before_finding_tags() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + + $this->assertNull( $p->get_tag(), 'Calling get_tag() without selecting a tag did not return null' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_tag + */ + public function test_get_tag_returns_null_when_not_in_open_tag() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + + $this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); + $this->assertNull( $p->get_tag(), 'Accessing a non-existing tag did not return null' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_tag + */ + public function test_get_tag_returns_open_tag_name() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + + $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' ); + $this->assertSame( 'DIV', $p->get_tag(), 'Accessing an existing tag name did not return "div"' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_returns_null_before_finding_tags() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + + $this->assertNull( $p->get_attribute( 'class' ), 'Accessing an attribute without selecting a tag did not return null' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_returns_null_when_not_in_open_tag() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + + $this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); + $this->assertNull( $p->get_attribute( 'class' ), 'Accessing an attribute of a non-existing tag did not return null' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_returns_null_when_in_closing_tag() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + + $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' ); + $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Querying an existing closing tag did not return true' ); + $this->assertNull( $p->get_attribute( 'class' ), 'Accessing an attribute of a closing tag did not return null' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_returns_null_when_attribute_missing() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + + $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' ); + $this->assertNull( $p->get_attribute( 'test-id' ), 'Accessing a non-existing attribute did not return null' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_returns_attribute_value() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + + $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' ); + $this->assertSame( 'test', $p->get_attribute( 'class' ), 'Accessing a class="test" attribute value did not return "test"' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_returns_true_for_boolean_attribute() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + + $this->assertTrue( $p->next_tag( array( 'class_name' => 'test' ) ), 'Querying an existing tag did not return true' ); + $this->assertTrue( $p->get_attribute( 'enabled' ), 'Accessing a boolean "enabled" attribute value did not return true' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_returns_string_for_truthy_attributes() { + $p = new WP_HTML_Tag_Processor( '' ); + + $this->assertTrue( $p->next_tag(), 'Querying an existing tag did not return true' ); + $this->assertSame( 'enabled', $p->get_attribute( 'enabled' ), 'Accessing a boolean "enabled" attribute value did not return true' ); + $this->assertSame( '1', $p->get_attribute( 'checked' ), 'Accessing a checked=1 attribute value did not return "1"' ); + $this->assertSame( 'true', $p->get_attribute( 'hidden' ), 'Accessing a hidden="true" attribute value did not return "true"' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_decodes_html_character_references() { + $p = new WP_HTML_Tag_Processor( '
    ' ); + $p->next_tag(); + + $this->assertSame( 'the "grande" is < 32oz†', $p->get_attribute( 'id' ), 'HTML Attribute value was returned without decoding character references' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_attributes_parser_treats_slash_as_attribute_separator() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + + $this->assertTrue( $p->next_tag(), 'Querying an existing tag did not return true' ); + $this->assertTrue( $p->get_attribute( 'a' ), 'Accessing an existing attribute did not return true' ); + $this->assertTrue( $p->get_attribute( 'b' ), 'Accessing an existing attribute did not return true' ); + $this->assertTrue( $p->get_attribute( 'c' ), 'Accessing an existing attribute did not return true' ); + $this->assertTrue( $p->get_attribute( 'd' ), 'Accessing an existing attribute did not return true' ); + $this->assertSame( 'test', $p->get_attribute( 'e' ), 'Accessing an existing e="test" did not return "test"' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + * + * @dataProvider data_attribute_name_case_variants + * + * @param string $attribute_name Name of data-enabled attribute with case variations. + */ + public function test_get_attribute_is_case_insensitive_for_attributes_with_values( $attribute_name ) { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag(); + + $this->assertSame( + 'true', + $p->get_attribute( $attribute_name ), + 'Accessing an attribute by a differently-cased name did not return its value' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + * + * @dataProvider data_attribute_name_case_variants + * + * @param string $attribute_name Name of data-enabled attribute with case variations. + */ + public function test_attributes_parser_is_case_insensitive_for_attributes_without_values( $attribute_name ) { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag(); + + $this->assertTrue( + $p->get_attribute( $attribute_name ), + 'Accessing an attribute by a differently-cased name did not return its value' + ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public function data_attribute_name_case_variants() { + return array( + array( 'DATA-enabled' ), + array( 'data-enabled' ), + array( 'DATA-ENABLED' ), + array( 'DatA-EnABled' ), + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::remove_attribute + */ + public function test_remove_attribute_is_case_insensitive() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag(); + $p->remove_attribute( 'data-enabled' ); + + $this->assertSame( '
    Test
    ', $p->get_updated_html(), 'A case-insensitive remove_attribute call did not remove the attribute' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_set_attribute_is_case_insensitive() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag(); + $p->set_attribute( 'data-enabled', 'abc' ); + + $this->assertSame( '
    Test
    ', $p->get_updated_html(), 'A case-insensitive set_attribute call did not update the existing attribute' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_null_before_finding_tags() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $this->assertNull( + $p->get_attribute_names_with_prefix( 'data-' ), + 'Accessing attributes by their prefix did not return null when no tag was selected' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_null_when_not_in_open_tag() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag( 'p' ); + $this->assertNull( $p->get_attribute_names_with_prefix( 'data-' ), 'Accessing attributes of a non-existing tag did not return null' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_null_when_in_closing_tag() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag( 'div' ); + $p->next_tag( array( 'tag_closers' => 'visit' ) ); + + $this->assertNull( $p->get_attribute_names_with_prefix( 'data-' ), 'Accessing attributes of a closing tag did not return null' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_empty_array_when_no_attributes_present() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag( 'div' ); + + $this->assertSame( array(), $p->get_attribute_names_with_prefix( 'data-' ), 'Accessing the attributes on a tag without any did not return an empty array' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_matching_attribute_names_in_lowercase() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag(); + + $this->assertSame( + array( 'data-enabled', 'data-test-id' ), + $p->get_attribute_names_with_prefix( 'data-' ), + 'Accessing attributes by their prefix did not return their lowercase names' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_attribute_added_by_set_attribute() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag(); + $p->set_attribute( 'data-test-id', '14' ); + + $this->assertSame( + '
    Test
    ', + $p->get_updated_html(), + "Updated HTML doesn't include attribute added via set_attribute" + ); + $this->assertSame( + array( 'data-test-id', 'data-foo' ), + $p->get_attribute_names_with_prefix( 'data-' ), + "Accessing attribute names doesn't find attribute added via set_attribute" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::__toString + */ + public function test_to_string_returns_updated_html() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag(); + $p->remove_attribute( 'id' ); + + $p->next_tag(); + $p->set_attribute( 'id', 'div-id-1' ); + $p->add_class( 'new_class_1' ); + + $this->assertSame( + $p->get_updated_html(), + (string) $p, + 'get_updated_html() returned a different value than __toString()' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_updated_html + */ + public function test_get_updated_html_applies_the_updates_so_far_and_keeps_the_processor_on_the_current_tag() { + $p = new WP_HTML_Tag_Processor( '
    Test
    ' ); + $p->next_tag(); + $p->remove_attribute( 'id' ); + + $p->next_tag(); + $p->set_attribute( 'id', 'div-id-1' ); + $p->add_class( 'new_class_1' ); + + $this->assertSame( + '
    Test
    ', + $p->get_updated_html(), + 'Calling get_updated_html after updating the attributes of the second tag returned different HTML than expected' + ); + + $p->set_attribute( 'id', 'div-id-2' ); + $p->add_class( 'new_class_2' ); + + $this->assertSame( + '
    Test
    ', + $p->get_updated_html(), + 'Calling get_updated_html after updating the attributes of the second tag for the second time returned different HTML than expected' + ); + + $p->next_tag(); + $p->remove_attribute( 'id' ); + + $this->assertSame( + '
    Test
    ', + $p->get_updated_html(), + 'Calling get_updated_html after removing the id attribute of the third tag returned different HTML than expected' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_updated_html + */ + public function test_get_updated_html_without_updating_any_attributes_returns_the_original_html() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + + $this->assertSame( + self::HTML_SIMPLE, + $p->get_updated_html(), + 'Casting WP_HTML_Tag_Processor to a string without performing any updates did not return the initial HTML snippet' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::next_tag + */ + public function test_next_tag_with_no_arguments_should_find_the_next_existing_tag() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + + $this->assertTrue( $p->next_tag(), 'Querying an existing tag did not return true' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::next_tag + */ + public function test_next_tag_should_return_false_for_a_non_existing_tag() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + + $this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::next_tag + * @covers WP_HTML_Tag_Processor::is_tag_closer + */ + public function test_next_tag_should_stop_on_closers_only_when_requested() { + $p = new WP_HTML_Tag_Processor( '
    ' ); + + $this->assertTrue( $p->next_tag( array( 'tag_name' => 'div' ) ), 'Did not find desired tag opener' ); + $this->assertFalse( $p->next_tag( array( 'tag_name' => 'div' ) ), 'Visited an unwanted tag, a tag closer' ); + + $p = new WP_HTML_Tag_Processor( '
    ' ); + $p->next_tag( + array( + 'tag_name' => 'div', + 'tag_closers' => 'visit', + ) + ); + + $this->assertFalse( $p->is_tag_closer(), 'Indicated a tag opener is a tag closer' ); + $this->assertTrue( + $p->next_tag( + array( + 'tag_name' => 'div', + 'tag_closers' => 'visit', + ) + ), + 'Did not stop at desired tag closer' + ); + $this->assertTrue( $p->is_tag_closer(), 'Indicated a tag closer is a tag opener' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_set_attribute_on_a_non_existing_tag_does_not_change_the_markup() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + + $this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); + $this->assertFalse( $p->next_tag( 'div' ), 'Querying a non-existing tag did not return false' ); + + $p->set_attribute( 'id', 'primary' ); + + $this->assertSame( + self::HTML_SIMPLE, + $p->get_updated_html(), + 'Calling get_updated_html after updating a non-existing tag returned an HTML that was different from the original HTML' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + * @covers WP_HTML_Tag_Processor::remove_attribute + * @covers WP_HTML_Tag_Processor::add_class + * @covers WP_HTML_Tag_Processor::remove_class + */ + public function test_attribute_ops_on_tag_closer_do_not_change_the_markup() { + $p = new WP_HTML_Tag_Processor( '
    ' ); + $p->next_tag( + array( + 'tag_name' => 'div', + 'tag_closers' => 'visit', + ) + ); + + $this->assertFalse( $p->is_tag_closer(), 'Skipped tag opener' ); + + $p->next_tag( + array( + 'tag_name' => 'div', + 'tag_closers' => 'visit', + ) + ); + + $this->assertTrue( $p->is_tag_closer(), 'Skipped tag closer' ); + $this->assertFalse( $p->set_attribute( 'id', 'test' ), "Allowed setting an attribute on a tag closer when it shouldn't have" ); + $this->assertFalse( $p->remove_attribute( 'invalid-id' ), "Allowed removing an attribute on a tag closer when it shouldn't have" ); + $this->assertFalse( $p->add_class( 'sneaky' ), "Allowed adding a class on a tag closer when it shouldn't have" ); + $this->assertFalse( $p->remove_class( 'not-appearing-in-this-test' ), "Allowed removing a class on a tag closer when it shouldn't have" ); + $this->assertSame( + '
    ', + $p->get_updated_html(), + 'Calling get_updated_html after updating a non-existing tag returned an HTML that was different from the original HTML' + ); + } + + /** + * Passing a double quote inside of an attribute value could lead to an XSS attack as follows: + * + * ```php + * $p = new WP_HTML_Tag_Processor( '
    ' ); + * $p->next_tag(); + * $p->set_attribute('class', '" onclick="alert'); + * echo $p; + * //
    + * ``` + * + * To prevent it, `set_attribute` calls `esc_attr()` on its given values. + * + * ```php + *
    + * ``` + * + * @ticket 56299 + * + * @dataProvider data_set_attribute_prevents_xss + * @covers WP_HTML_Tag_Processor::set_attribute + * + * @param string $attribute_value A value with potential XSS exploit. + */ + public function test_set_attribute_prevents_xss( $attribute_value ) { + $p = new WP_HTML_Tag_Processor( '
    ' ); + $p->next_tag(); + $p->set_attribute( 'test', $attribute_value ); + + /* + * Testing the escaping is hard using tools that properly parse + * HTML because they might interpret the escaped values. It's hard + * with tools that don't understand HTML because they might get + * confused by improperly-escaped values. + * + * Since the input HTML is known, the test will do what looks like + * the opposite of what is expected to be done with this library. + * But by doing so, the test (a) has full control over the + * content and (b) looks at the raw values. + */ + $match = null; + preg_match( '~^
    $~', $p->get_updated_html(), $match ); + list( , $actual_value ) = $match; + + $this->assertSame( '"' . esc_attr( $attribute_value ) . '"', $actual_value, 'Entities were not properly escaped in the attribute value' ); + } + + /** + * Data provider. + * + * @return string[][]. + */ + public function data_set_attribute_prevents_xss() { + return array( + array( '"' ), + array( '"' ), + array( '&' ), + array( '&' ), + array( '€' ), + array( "'" ), + array( '<>' ), + array( '"";' ), + array( '" onclick="alert(\'1\');">' ), + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_set_attribute_with_a_non_existing_attribute_adds_a_new_attribute_to_the_markup() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->set_attribute( 'test-attribute', 'test-value' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not include attribute added via set_attribute()' + ); + $this->assertSame( + 'test-value', + $p->get_attribute( 'test-attribute' ), + 'get_attribute() (called after get_updated_html()) did not return attribute added via set_attribute()' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_returns_updated_values_before_they_are_applied() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->set_attribute( 'test-attribute', 'test-value' ); + + $this->assertSame( + 'test-value', + $p->get_attribute( 'test-attribute' ), + 'get_attribute() (called before get_updated_html()) did not return attribute added via set_attribute()' + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not include attribute added via set_attribute()' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_returns_updated_values_before_they_are_applied_with_different_name_casing() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->set_attribute( 'test-ATTribute', 'test-value' ); + + $this->assertSame( + 'test-value', + $p->get_attribute( 'test-attribute' ), + 'get_attribute() (called before get_updated_html()) did not return attribute added via set_attribute()' + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not include attribute added via set_attribute()' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_reflects_added_class_names_before_they_are_applied() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->add_class( 'my-class' ); + + $this->assertSame( + 'my-class', + $p->get_attribute( 'class' ), + 'get_attribute() (called before get_updated_html()) did not return class name added via add_class()' + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not include class name added via add_class()' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_reflects_added_class_names_before_they_are_applied_and_retains_classes_from_previous_add_class_calls() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->add_class( 'my-class' ); + + $this->assertSame( + 'my-class', + $p->get_attribute( 'class' ), + 'get_attribute() (called before get_updated_html()) did not return class name added via add_class()' + ); + + $p->add_class( 'my-other-class' ); + + $this->assertSame( + 'my-class my-other-class', + $p->get_attribute( 'class' ), + 'get_attribute() (called before get_updated_html()) did not return class names added via subsequent add_class() calls' + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not include class names added via subsequent add_class() calls' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_reflects_removed_attribute_before_it_is_applied() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->remove_attribute( 'id' ); + + $this->assertNull( + $p->get_attribute( 'id' ), + 'get_attribute() (called before get_updated_html()) returned attribute that was removed by remove_attribute()' + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML includes attribute that was removed by remove_attribute()' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_reflects_adding_and_then_removing_an_attribute_before_those_updates_are_applied() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->set_attribute( 'test-attribute', 'test-value' ); + $p->remove_attribute( 'test-attribute' ); + + $this->assertNull( + $p->get_attribute( 'test-attribute' ), + 'get_attribute() (called before get_updated_html()) returned attribute that was added via set_attribute() and then removed by remove_attribute()' + ); + $this->assertSame( + self::HTML_SIMPLE, + $p->get_updated_html(), + 'Updated HTML includes attribute that was added via set_attribute() and then removed by remove_attribute()' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_reflects_setting_and_then_removing_an_existing_attribute_before_those_updates_are_applied() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->set_attribute( 'id', 'test-value' ); + $p->remove_attribute( 'id' ); + + $this->assertNull( + $p->get_attribute( 'id' ), + 'get_attribute() (called before get_updated_html()) returned attribute that was overwritten by set_attribute() and then removed by remove_attribute()' + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML includes attribute that was overwritten by set_attribute() and then removed by remove_attribute()' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_reflects_removed_class_names_before_they_are_applied() { + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->remove_class( 'with-border' ); + + $this->assertSame( + 'main', + $p->get_attribute( 'class' ), + 'get_attribute() (called before get_updated_html()) returned the wrong attribute after calling remove_attribute()' + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML includes wrong attribute after calling remove_attribute()' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_reflects_setting_and_then_removing_a_class_name_before_those_updates_are_applied() { + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->add_class( 'foo-class' ); + $p->remove_class( 'foo-class' ); + + $this->assertSame( + 'main with-border', + $p->get_attribute( 'class' ), + 'get_attribute() (called before get_updated_html()) returned class name that was added via add_class() and then removed by remove_class()' + ); + $this->assertSame( + self::HTML_WITH_CLASSES, + $p->get_updated_html(), + 'Updated HTML includes class that was added via add_class() and then removed by remove_class()' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::get_attribute + */ + public function test_get_attribute_reflects_duplicating_and_then_removing_an_existing_class_name_before_those_updates_are_applied() { + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->add_class( 'with-border' ); + $p->remove_class( 'with-border' ); + + $this->assertSame( + 'main', + $p->get_attribute( 'class' ), + 'get_attribute() (called before get_updated_html()) returned class name that was duplicated via add_class() and then removed by remove_class()' + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML includes class that was duplicated via add_class() and then removed by remove_class()' + ); + } + + /** + * According to HTML spec, only the first instance of an attribute counts. + * The other ones are ignored. + * + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_update_first_attribute_when_duplicated_attributes_exist() { + $p = new WP_HTML_Tag_Processor( '
    Text
    ' ); + $p->next_tag(); + $p->set_attribute( 'id', 'updated-id' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Proper (first) appearance of attribute was not updated when duplicates exist' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_set_attribute_with_an_existing_attribute_name_updates_its_value_in_the_markup() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->set_attribute( 'id', 'new-id' ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Existing attribute was not updated' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::next_tag + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_next_tag_and_set_attribute_in_a_loop_update_all_tags_in_the_markup() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + while ( $p->next_tag() ) { + $p->set_attribute( 'data-foo', 'bar' ); + } + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Not all tags were updated when looping with next_tag() and set_attribute()' + ); + } + + /** + * Removing an attribute that's listed many times, e.g. `
    ` should remove + * all its instances and output just `
    `. + * + * Today, however, WP_HTML_Tag_Processor only removes the first such attribute. It seems like a corner case + * and introducing additional complexity to correctly handle this scenario doesn't seem to be worth it. + * Let's revisit if and when this becomes a problem. + * + * This test is in place to confirm this behavior, which while incorrect, is well-defined. + * A later fix introduced to the Tag Processor should update this test to reflect the + * wanted and correct behavior. + * + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::remove_attribute + */ + public function test_remove_first_when_duplicated_attribute() { + $p = new WP_HTML_Tag_Processor( '
    Text
    ' ); + $p->next_tag(); + $p->remove_attribute( 'id' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'First attribute (when duplicates exist) was not removed' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::remove_attribute + */ + public function test_remove_attribute_with_an_existing_attribute_name_removes_it_from_the_markup() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->remove_attribute( 'id' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Attribute was not removed' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::remove_attribute + */ + public function test_remove_attribute_with_a_non_existing_attribute_name_does_not_change_the_markup() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->remove_attribute( 'no-such-attribute' ); + + $this->assertSame( + self::HTML_SIMPLE, + $p->get_updated_html(), + 'Content was changed when attempting to remove an attribute that did not exist' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + */ + public function test_add_class_creates_a_class_attribute_when_there_is_none() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->add_class( 'foo-class' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not include class name added via add_class()' + ); + $this->assertSame( + 'foo-class', + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) did not return class name added via add_class()" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + */ + public function test_calling_add_class_twice_creates_a_class_attribute_with_both_class_names_when_there_is_no_class_attribute() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->add_class( 'foo-class' ); + $p->add_class( 'bar-class' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not include class names added via subsequent add_class() calls' + ); + $this->assertSame( + 'foo-class bar-class', + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) did not return class names added via subsequent add_class() calls" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::remove_class + */ + public function test_remove_class_does_not_change_the_markup_when_there_is_no_class_attribute() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->remove_class( 'foo-class' ); + + $this->assertSame( + self::HTML_SIMPLE, + $p->get_updated_html(), + 'Updated HTML includes class name that was removed by remove_class()' + ); + $this->assertNull( + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) did not return null for class name that was removed by remove_class()" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + */ + public function test_add_class_appends_class_names_to_the_existing_class_attribute_when_one_already_exists() { + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->add_class( 'foo-class' ); + $p->add_class( 'bar-class' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not reflect class names added to existing class attribute via subsequent add_class() calls' + ); + $this->assertSame( + 'main with-border foo-class bar-class', + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) does not reflect class names added to existing class attribute via subsequent add_class() calls" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::remove_class + */ + public function test_remove_class_removes_a_single_class_from_the_class_attribute_when_one_exists() { + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->remove_class( 'main' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not reflect class name removed from existing class attribute via remove_class()' + ); + $this->assertSame( + ' with-border', + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) does not reflect class name removed from existing class attribute via remove_class()" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::remove_class + */ + public function test_calling_remove_class_with_all_listed_class_names_removes_the_existing_class_attribute_from_the_markup() { + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->remove_class( 'main' ); + $p->remove_class( 'with-border' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not reflect class attribute removed via subesequent remove_class() calls' + ); + $this->assertNull( + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) did not return null for class attribute removed via subesequent remove_class() calls" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + */ + public function test_add_class_does_not_add_duplicate_class_names() { + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->add_class( 'with-border' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not reflect deduplicated class name added via add_class()' + ); + $this->assertSame( + 'main with-border', + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) does not reflect deduplicated class name added via add_class()" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + */ + public function test_add_class_preserves_class_name_order_when_a_duplicate_class_name_is_added() { + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->add_class( 'main' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not reflect class name order after adding duplicated class name via add_class()' + ); + $this->assertSame( + 'main with-border', + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) does not reflect class name order after adding duplicated class name added via add_class()" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + */ + public function test_add_class_when_there_is_a_class_attribute_with_excessive_whitespaces() { + $p = new WP_HTML_Tag_Processor( + '
    Text
    ' + ); + $p->next_tag(); + $p->add_class( 'foo-class' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not reflect existing excessive whitespace after adding class name via add_class()' + ); + $this->assertSame( + ' main with-border foo-class', + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) does not reflect existing excessive whitespace after adding class name via add_class()" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::remove_class + */ + public function test_remove_class_preserves_whitespaces_when_there_is_a_class_attribute_with_excessive_whitespaces() { + $p = new WP_HTML_Tag_Processor( + '
    Text
    ' + ); + $p->next_tag(); + $p->remove_class( 'with-border' ); + + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not reflect existing excessive whitespace after removing class name via remove_class()' + ); + $this->assertSame( + ' main', + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) does not reflect existing excessive whitespace after removing class name via removing_class()" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::remove_class + */ + public function test_removing_all_classes_removes_the_existing_class_attribute_from_the_markup_even_when_excessive_whitespaces_are_present() { + $p = new WP_HTML_Tag_Processor( + '
    Text
    ' + ); + $p->next_tag(); + $p->remove_class( 'main' ); + $p->remove_class( 'with-border' ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Updated HTML does not reflect removed class attribute after removing all class names via remove_class()' + ); + $this->assertNull( + $p->get_attribute( 'class' ), + "get_attribute( 'class' ) did not return null after removing all class names via remove_class()" + ); + } + + /** + * When add_class( $different_value ) is called _after_ set_attribute( 'class', $value ), the + * final class name should be "$value $different_value". In other words, the `add_class` call + * should append its class to the one(s) set by `set_attribute`. When `add_class( $different_value )` + * is called _before_ `set_attribute( 'class', $value )`, however, the final class name should be + * "$value" instead, as any direct updates to the `class` attribute supersede any changes enqueued + * via the class builder methods. + * + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_set_attribute_takes_priority_over_add_class() { + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->add_class( 'add_class' ); + $p->set_attribute( 'class', 'set_attribute' ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + "Calling get_updated_html after updating first tag's attributes did not return the expected HTML" + ); + $this->assertSame( + 'set_attribute', + $p->get_attribute( 'class' ), + "Calling get_attribute after updating first tag's attributes did not return the expected class name" + ); + + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->set_attribute( 'class', 'set_attribute' ); + $p->add_class( 'add_class' ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + "Calling get_updated_html after updating first tag's attributes did not return the expected HTML" + ); + $this->assertSame( + 'set_attribute add_class', + $p->get_attribute( 'class' ), + "Calling get_attribute after updating first tag's attributes did not return the expected class name" + ); + } + + /** + * When add_class( $different_value ) is called _after_ set_attribute( 'class', $value ), the + * final class name should be "$value $different_value". In other words, the `add_class` call + * should append its class to the one(s) set by `set_attribute`. When `add_class( $different_value )` + * is called _before_ `set_attribute( 'class', $value )`, however, the final class name should be + * "$value" instead, as any direct updates to the `class` attribute supersede any changes enqueued + * via the class builder methods. + * + * This is still true when reading enqueued updates before calling `get_updated_html()`. + * + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_set_attribute_takes_priority_over_add_class_even_before_updating() { + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->add_class( 'add_class' ); + $p->set_attribute( 'class', 'set_attribute' ); + $this->assertSame( + 'set_attribute', + $p->get_attribute( 'class' ), + "Calling get_attribute after updating first tag's attributes did not return the expected class name" + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + "Calling get_updated_html after updating first tag's attributes did not return the expected HTML" + ); + + $p = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES ); + $p->next_tag(); + $p->set_attribute( 'class', 'set_attribute' ); + $p->add_class( 'add_class' ); + $this->assertSame( + 'set_attribute add_class', + $p->get_attribute( 'class' ), + "Calling get_attribute after updating first tag's attributes did not return the expected class name" + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + "Calling get_updated_html after updating first tag's attributes did not return the expected HTML" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + */ + public function test_add_class_overrides_boolean_class_attribute() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->set_attribute( 'class', true ); + $p->add_class( 'add_class' ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + "Updated HTML doesn't reflect class added via add_class that was originally set as boolean attribute" + ); + $this->assertSame( + 'add_class', + $p->get_attribute( 'class' ), + "get_attribute (called after get_updated_html()) doesn't reflect class added via add_class that was originally set as boolean attribute" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + */ + public function test_add_class_overrides_boolean_class_attribute_even_before_updating() { + $p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE ); + $p->next_tag(); + $p->set_attribute( 'class', true ); + $p->add_class( 'add_class' ); + $this->assertSame( + 'add_class', + $p->get_attribute( 'class' ), + "get_attribute (called before get_updated_html()) doesn't reflect class added via add_class that was originally set as boolean attribute" + ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + "Updated HTML doesn't reflect class added via add_class that was originally set as boolean attribute" + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + * @covers WP_HTML_Tag_Processor::remove_attribute + * @covers WP_HTML_Tag_Processor::add_class + * @covers WP_HTML_Tag_Processor::remove_class + * @covers WP_HTML_Tag_Processor::get_updated_html + */ + public function test_advanced_use_case() { + $input = << +
    +
    +
    + + + + + + + +
    +
    +
    +HTML; + + $expected_output = << +
    +
    +
    + + + + + + + +
    +
    +
    +HTML; + + $p = new WP_HTML_Tag_Processor( $input ); + $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' ); + $p->set_attribute( 'data-details', '{ "key": "value" }' ); + $p->add_class( 'is-processed' ); + $this->assertTrue( + $p->next_tag( + array( + 'tag_name' => 'div', + 'class_name' => 'BtnGroup', + ) + ), + 'Querying an existing tag did not return true' + ); + $p->remove_class( 'BtnGroup' ); + $p->add_class( 'button-group' ); + $p->add_class( 'Another-Mixed-Case' ); + $this->assertTrue( + $p->next_tag( + array( + 'tag_name' => 'div', + 'class_name' => 'BtnGroup', + ) + ), + 'Querying an existing tag did not return true' + ); + $p->remove_class( 'BtnGroup' ); + $p->add_class( 'button-group' ); + $p->add_class( 'Another-Mixed-Case' ); + $this->assertTrue( + $p->next_tag( + array( + 'tag_name' => 'button', + 'class_name' => 'btn', + 'match_offset' => 3, + ) + ), + 'Querying an existing tag did not return true' + ); + $p->remove_attribute( 'class' ); + $this->assertFalse( $p->next_tag( 'non-existent' ), 'Querying a non-existing tag did not return false' ); + $p->set_attribute( 'class', 'test' ); + $this->assertSame( $expected_output, $p->get_updated_html(), 'Calling get_updated_html after updating the attributes did not return the expected HTML' ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::next_tag + */ + public function test_correctly_parses_html_attributes_wrapped_in_single_quotation_marks() { + $p = new WP_HTML_Tag_Processor( + '
    Text
    ' + ); + $p->next_tag( + array( + 'tag_name' => 'div', + 'id' => 'first', + ) + ); + $p->remove_attribute( 'id' ); + $p->next_tag( + array( + 'tag_name' => 'span', + 'id' => 'second', + ) + ); + $p->set_attribute( 'id', 'single-quote' ); + $this->assertSame( + '
    Text
    ', + $p->get_updated_html(), + 'Did not remove single-quoted attribute' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_set_attribute_with_value_equal_to_true_adds_a_boolean_html_attribute_with_implicit_value() { + $p = new WP_HTML_Tag_Processor( + '
    ' + ); + $p->next_tag( 'input' ); + $p->set_attribute( 'checked', true ); + $this->assertSame( + '
    ', + $p->get_updated_html(), + 'Did not add "checked" as an expected boolean attribute' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_setting_a_boolean_attribute_to_false_removes_it_from_the_markup() { + $p = new WP_HTML_Tag_Processor( + '
    ' + ); + $p->next_tag( 'input' ); + $p->set_attribute( 'checked', false ); + $this->assertSame( + '
    ', + $p->get_updated_html(), + 'Did not remove boolean attribute when set to false' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_setting_a_missing_attribute_to_false_does_not_change_the_markup() { + $html_input = '
    '; + $p = new WP_HTML_Tag_Processor( $html_input ); + $p->next_tag( 'input' ); + $p->set_attribute( 'checked', false ); + $this->assertSame( + $html_input, + $p->get_updated_html(), + 'Changed the markup unexpectedly when setting a non-existing attribute to false' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_setting_a_boolean_attribute_to_a_string_value_adds_explicit_value_to_the_markup() { + $p = new WP_HTML_Tag_Processor( + '
    ' + ); + $p->next_tag( 'input' ); + $p->set_attribute( 'checked', 'checked' ); + $this->assertSame( + '
    ', + $p->get_updated_html(), + 'Did not add string value to existing boolean attribute' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::next_tag + */ + public function test_unclosed_script_tag_should_not_cause_an_infinite_loop() { + $p = new WP_HTML_Tag_Processor( '
    ', + ), + + 'Simple uppercase script tag' => array( + '
    ', + ), + + 'Script with a comment opener inside should end at the next script tag closer (dash dash escaped state)' => array( + '
    -->', + ), + + 'Script with a comment opener and a script tag opener inside should end two script tag closer later (double escaped state)' => array( + '
    -->', + ), + + 'Double escaped script with a tricky opener' => array( + '">
    ', + ), + + 'Double escaped script with a tricky closer' => array( + '">
    ', + ), + + 'Double escaped, then escaped, then double escaped' => array( + '
    ', + ), + + 'Script with a commented a script tag opener inside should at the next tag closer (dash dash escaped state)' => array( + '
    -->', + ), + + 'Script closer with another script tag in closer attributes' => array( + '
    ', + ), + + 'Script closer with attributes' => array( + '
    ', + ), + + 'Script opener with title closer inside' => array( + '
    ', + ), + + 'Complex script with many parsing states' => array( + '-->
    -->', + ), + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::next_tag + * + * @dataProvider data_next_tag_ignores_contents_of_rcdata_tag + * + * @param string $rcdata_then_div HTML with RCDATA before a DIV. + * @param string $rcdata_tag RCDATA tag. + */ + public function test_next_tag_ignores_contents_of_rcdata_tag( $rcdata_then_div, $rcdata_tag ) { + $p = new WP_HTML_Tag_Processor( $rcdata_then_div ); + $p->next_tag(); + $this->assertSame( $rcdata_tag, $p->get_tag(), "The first found tag was not '$rcdata_tag'" ); + $p->next_tag(); + $this->assertSame( 'DIV', $p->get_tag(), "The second found tag was not 'div'" ); + } + + /** + * Data provider. + * + * @return array[] + */ + public function data_next_tag_ignores_contents_of_rcdata_tag() { + return array( + 'simple textarea' => array( + 'rcdata_then_div' => '
    ', + 'rcdata_tag' => 'TEXTAREA', + ), + 'simple title' => array( + 'rcdata_then_div' => '<span class="d-none d-md-inline">Back to notifications</title</span>
    ', + 'rcdata_tag' => 'TITLE', + ), + 'comment opener inside a textarea tag should be ignored' => array( + 'rcdata_then_div' => '
    -->', + 'rcdata_tag' => 'TEXTAREA', + ), + 'textarea closer with another textarea tag in closer attributes' => array( + 'rcdata_then_div' => '
    ', + 'rcdata_tag' => 'TEXTAREA', + ), + 'textarea closer with attributes' => array( + 'rcdata_then_div' => '
    ', + 'rcdata_tag' => 'TEXTAREA', + ), + 'textarea opener with title closer inside' => array( + 'rcdata_then_div' => '
    ', + 'rcdata_tag' => 'TEXTAREA', + ), + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::next_tag + * + * @dataProvider data_skips_contents_of_script_and_rcdata_regions + * + * @param $input_html HTML with multiple divs, one of which carries the "target" attribute. + */ + public function test_skips_contents_of_script_and_rcdata_regions( $input_html ) { + $p = new WP_HTML_Tag_Processor( $input_html ); + $p->next_tag( 'div' ); + + $this->assertTrue( + $p->get_attribute( 'target' ), + 'Did not properly skip over script and rcdata regions; incorrectly found tags inside' + ); + } + + /** + * Data provider + * + * @return array[] + */ + public function data_skips_contents_of_script_and_rcdata_regions() { + return array( + 'Balanced SCRIPT tags' => array( '
    ' ), + 'Unexpected SCRIPT closer after DIV' => array( 'console.log("
    ")
    ' ), + 'Unexpected SCRIPT closer before DIV' => array( 'console.log("")
    ' ), + 'Missing SCRIPT closer' => array( '
    ' ), + 'TITLE before DIV' => array( '<div>
    ' ), + 'SCRIPT inside TITLE' => array( '<script><div>
    ' ), + 'TITLE in TEXTAREA' => array( '
    ' ), + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::next_tag + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_can_query_and_update_wrongly_nested_tags() { + $p = new WP_HTML_Tag_Processor( + '123

    456789

    ' + ); + $p->next_tag( 'span' ); + $p->set_attribute( 'class', 'span-class' ); + $p->next_tag( 'p' ); + $p->set_attribute( 'class', 'p-class' ); + $this->assertSame( + '123

    456789

    ', + $p->get_updated_html(), + 'Did not find overlapping p tag' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::next_tag + * @covers WP_HTML_Tag_Processor::remove_attribute + */ + public function test_removing_specific_attributes_in_malformed_html() { + $p = new WP_HTML_Tag_Processor( self::HTML_MALFORMED ); + $p->next_tag( 'span' ); + $p->remove_attribute( 'Notifications<' ); + $this->assertSame( + '
    Back to notifications
    ', + $p->get_updated_html(), + 'Did not remove "Notifications<" attribute in malformed input' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::set_attribute + */ + public function test_updating_specific_attributes_in_malformed_html() { + $p = new WP_HTML_Tag_Processor( self::HTML_MALFORMED ); + $p->next_tag( 'span' ); + $p->set_attribute( 'id', 'first' ); + $p->next_tag( 'span' ); + $p->set_attribute( 'id', 'second' ); + $this->assertSame( + '
    Back to notifications
    ', + $p->get_updated_html(), + 'Did not add id attributes properly to malformed input' + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + * @covers WP_HTML_Tag_Processor::set_attribute + * + * @dataProvider data_updating_attributes + * + * @param string $html HTML to process. + * @param string $expected Expected updated HTML. + */ + public function test_updating_attributes( $html, $expected ) { + $p = new WP_HTML_Tag_Processor( $html ); + $p->next_tag(); + $p->set_attribute( 'foo', 'bar' ); + $p->add_class( 'firstTag' ); + $p->next_tag(); + $p->add_class( 'secondTag' ); + + $this->assertSame( + $expected, + $p->get_updated_html(), + 'Did not properly add attributes and class names' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public function data_updating_attributes() { + return array( + 'tags inside of a comment' => array( + 'input' => 'test', + 'expected' => 'test', + ), + 'does not parse <3' => array( + 'input' => '<3 is a heart but is a tag.test', + 'expected' => '<3 is a heart but is a tag.test', + ), + 'does not parse <*' => array( + 'input' => 'The applicative operator <* works well in Haskell; is what?test', + 'expected' => 'The applicative operator <* works well in Haskell; is what?test', + ), + ' in content' => array( + 'input' => 'test', + 'expected' => 'test', + ), + 'custom asdf attribute' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'custom data-* attribute' => array( + 'input' => '

    Some content for a test

    ', + 'expected' => '

    Some content for a test

    ', + ), + 'tag inside of CDATA' => array( + 'input' => ' a HTML Tag]]>test', + 'expected' => ' a HTML Tag]]>test', + ), + ); + } + + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::add_class + * @covers WP_HTML_Tag_Processor::set_attribute + * + * @dataProvider data_updating_attributes_in_malformed_html + * + * @param string $html HTML to process. + * @param string $expected Expected updated HTML. + */ + public function test_updating_attributes_in_malformed_html( $html, $expected ) { + $p = new WP_HTML_Tag_Processor( $html ); + $p->next_tag(); + $p->set_attribute( 'foo', 'bar' ); + $p->add_class( 'firstTag' ); + $p->next_tag(); + $p->add_class( 'secondTag' ); + + $this->assertSame( + $expected, + $p->get_updated_html(), + 'Did not properly update attributes and classnames given malformed input' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public function data_updating_attributes_in_malformed_html() { + $null_byte = chr( 0 ); + + return array( + 'Invalid entity inside attribute value' => array( + 'input' => 'test', + 'expected' => 'test', + ), + 'HTML tag opening inside attribute value' => array( + 'input' => '
    This <is> a <strong is="true">thing.
    test', + 'expected' => '
    This <is> a <strong is="true">thing.
    test', + ), + 'HTML tag brackets in attribute values and data markup' => array( + 'input' => '
    This <is> a <strong is="true">thing.
    test', + 'expected' => '
    This <is> a <strong is="true">thing.
    test', + ), + 'Single and double quotes in attribute value' => array( + 'input' => '

    test', + 'expected' => '

    test', + ), + 'Unquoted attribute values' => array( + 'input' => '


    test', + 'expected' => '
    test', + ), + 'Double-quotes escaped in double-quote attribute value' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'Unquoted attribute value' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'Unquoted attribute value with tag-like value' => array( + 'input' => '
    >test', + 'expected' => '
    >test', + ), + 'Unquoted attribute value with tag-like value followed by tag-like data' => array( + 'input' => '
    >test', + 'expected' => '
    >test', + ), + 'id=&quo;code' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'id/test=5' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + '
    as the id value' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'id=>code' => array( + 'input' => '
    code>test', + 'expected' => '
    code>test', + ), + 'id"quo="test"' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'id without double quotation marks around null byte' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'Unexpected > before an attribute' => array( + 'input' => '
    id="test">test', + 'expected' => '
    id="test">test', + ), + 'Unexpected = before an attribute' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'Unexpected === before an attribute' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'Missing closing data-tag tag' => array( + 'input' => 'The applicative operator <* works well in Haskell; is what?test', + 'expected' => 'The applicative operator <* works well in Haskell; is what?test', + ), + 'Missing closing t3 tag' => array( + 'input' => '<3 is a heart but is a tag.test', + 'expected' => '<3 is a heart but is a tag.test', + ), + 'invalid comment opening tag' => array( + 'input' => 'test', + 'expected' => 'test', + ), + '=asdf as attribute name' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + '== as attribute name with value' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + '=5 as attribute' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + '= as attribute' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + '== as attribute' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + '=== as attribute' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'unsupported disabled attribute' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'malformed custom attributes' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + 'Multiple unclosed tags treated as a single tag' => array( + 'input' => << + test +HTML + , + 'expected' => << + test +HTML + , + ), + '
    ' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + '
    ' => array( + 'input' => '
    test', + 'expected' => '
    test', + ), + ); + } +}