diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php index 9ab7a0dd86..cc1ce63fd7 100644 --- a/packages/playground/data-liberation/bootstrap.php +++ b/packages/playground/data-liberation/bootstrap.php @@ -30,7 +30,6 @@ require_once __DIR__ . '/src/WP_URL.php'; require_once __DIR__ . '/src/xml-api/WP_XML_Decoder.php'; -require_once __DIR__ . '/src/xml-api/WP_XML_Tag_Processor.php'; require_once __DIR__ . '/src/xml-api/WP_XML_Processor.php'; require_once __DIR__ . '/src/WP_WXR_URL_Rewrite_Processor.php'; @@ -38,8 +37,7 @@ // Polyfill WordPress core functions -function _doing_it_wrong() { - +function _doing_it_wrong($method, $message, $version) { } function __($input) { diff --git a/packages/playground/data-liberation/phpunit.xml b/packages/playground/data-liberation/phpunit.xml index a3e66da030..50553590e5 100644 --- a/packages/playground/data-liberation/phpunit.xml +++ b/packages/playground/data-liberation/phpunit.xml @@ -9,7 +9,6 @@ tests/WPBlockMarkupUrlProcessorTests.php tests/URLParserWHATWGComplianceTests.php tests/WPXMLProcessorTests.php - tests/WPXMLTagProcessorTests.php tests/UrldecodeNTests.php diff --git a/packages/playground/data-liberation/src/WP_WXR_URL_Rewrite_Processor.php b/packages/playground/data-liberation/src/WP_WXR_URL_Rewrite_Processor.php index 34caa67513..2b6bb5b343 100644 --- a/packages/playground/data-liberation/src/WP_WXR_URL_Rewrite_Processor.php +++ b/packages/playground/data-liberation/src/WP_WXR_URL_Rewrite_Processor.php @@ -3,8 +3,8 @@ class WP_WXR_URL_Rewrite_Processor { - public static function stream( $current_site_url, $new_site_url ) { - return WP_XML_Processor::stream( + public static function create_stream_processor( $current_site_url, $new_site_url ) { + return WP_XML_Processor::create_stream_processor( function ( $processor ) use ( $current_site_url, $new_site_url ) { if ( static::is_wxr_content_node( $processor ) ) { $text = $processor->get_modifiable_text(); diff 
--git a/packages/playground/data-liberation/src/stream-api/WP_File_Byte_Stream.php b/packages/playground/data-liberation/src/stream-api/WP_File_Byte_Stream.php index 284e7c37a5..b2710f84ec 100644 --- a/packages/playground/data-liberation/src/stream-api/WP_File_Byte_Stream.php +++ b/packages/playground/data-liberation/src/stream-api/WP_File_Byte_Stream.php @@ -11,6 +11,7 @@ public function __construct( $file_path, $chunk_size = 8096 ) { $this->file_path = $file_path; $this->chunk_size = $chunk_size; parent::__construct(); + $this->append_eof(); } public function pause() { diff --git a/packages/playground/data-liberation/src/stream-api/WP_Stream_Chain.php b/packages/playground/data-liberation/src/stream-api/WP_Stream_Chain.php index e128471a75..b6a71cfab9 100644 --- a/packages/playground/data-liberation/src/stream-api/WP_Stream_Chain.php +++ b/packages/playground/data-liberation/src/stream-api/WP_Stream_Chain.php @@ -16,6 +16,11 @@ * Consult it for reasoning and usage examples: * * https://github.com/adamziel/wxr-normalize/pull/1 + * + * @TODO: Allow each stream to indicate its output reached EOF + * and propagate that information downstream. Otherwise, + * WP_XML_Processor will always end in an "incomplete input" + * state. 
*/ class WP_Stream_Chain extends WP_Byte_Stream implements ArrayAccess, Iterator { private $first_stream; diff --git a/packages/playground/data-liberation/src/stream-api/WP_Stream_Processor.php b/packages/playground/data-liberation/src/stream-api/WP_Stream_Processor.php index a954a31cf0..d07b092120 100644 --- a/packages/playground/data-liberation/src/stream-api/WP_Stream_Processor.php +++ b/packages/playground/data-liberation/src/stream-api/WP_Stream_Processor.php @@ -2,6 +2,7 @@ interface WP_Stream_Processor { public function append_bytes( string $bytes ); + public function input_finished(): void; public function is_finished(): bool; public function is_paused_at_incomplete_input(): bool; public function get_last_error(): ?string; diff --git a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php index bef650e33c..237419866e 100644 --- a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php +++ b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php @@ -2,15 +2,581 @@ /** * XML API: WP_XML_Processor class * + * Scans through an XML document to find specific tags, then + * transforms those tags by adding, removing, or updating the + * values of the XML attributes within that tag (opener). + * + * It implements a subset of the XML 1.0 specification (https://www.w3.org/TR/xml/) + * and supports XML documents with the following characteristics: + * + * * XML 1.0 + * * Well-formed + * * UTF-8 encoded + * * Not standalone (so can use external entities) + * * No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections + * + * ### Possible future direction for this module + * + * The final goal is to support both 1.0 and 1.1 depending on the + * initial processing instruction (<?xml version="1.0" ?>). We're + * starting with 1.0, however, because that's what most WXR + * files declare. 
+ * + * @TODO: Track specific error states, expose informative messages, line + * numbers, indexes, and other debugging info. + * + * @TODO: Skip over the following syntax elements: + * * + * + * or + * + * + * + * ' > + * %xx; + * ]> + * + * @TODO: Support XML 1.1. * @package WordPress * @subpackage HTML-API * @since WP_VERSION */ /** + * Core class used to modify attributes in an XML document for tags matching a query. + * + * ## Usage + * + * Use of this class requires three steps: + * + * 1. Create a new class instance with your input XML document. + * 2. Find the tag(s) you are looking for. + * 3. Request changes to the attributes in those tag(s). + * + * Example: + * + * $tags = new WP_XML_Processor( $xml ); + * if ( $tags->next_tag( 'wp:option' ) ) { + * $tags->set_attribute( 'selected', 'yes' ); + * } + * + * ### Finding tags + * + * The `next_tag()` function moves the internal cursor through + * your input XML document until it finds a tag meeting any of + * the supplied restrictions in the optional query argument. If + * no argument is provided then it will find the next XML tag, + * regardless of what kind it is. + * + * If you want to _find whatever the next tag is_: + * + * $tags->next_tag(); + * + * | Goal | Query | + * |-----------------------------------------------------------|---------------------------------------------------------------------------------| + * | Find any tag. | `$tags->next_tag();` | + * | Find next image tag. | `$tags->next_tag( array( 'tag_name' => 'wp:image' ) );` | + * | Find next image tag (without passing the array). | `$tags->next_tag( 'wp:image' );` | + * + * If a tag was found meeting your criteria then `next_tag()` + * will return `true` and you can proceed to modify it. If it + * returns `false`, however, it failed to find the tag and + * moved the cursor to the end of the file. 
+ * + * Once the cursor reaches the end of the file the processor + * is done and if you want to reach an earlier tag you will + * need to recreate the processor and start over, as it's + * unable to back up or move in reverse. + * + * See the section on bookmarks for an exception to this + * no-backing-up rule. + * + * #### Custom queries + * + * Sometimes it's necessary to further inspect an XML tag than + * the query syntax here permits. In these cases one may further + * inspect the search results using the read-only functions + * provided by the processor or external state or variables. + * + * Example: + * + * // Paint up to the first five `wp:musician` or `wp:actor` tags marked with the "jazzy" style. + * $remaining_count = 5; + * while ( $remaining_count > 0 && $tags->next_tag() ) { + * if ( + * ( 'wp:musician' === $tags->get_tag() || 'wp:actor' === $tags->get_tag() ) && + * 'jazzy' === $tags->get_attribute( 'data-style' ) + * ) { + * $tags->set_attribute( 'wp:theme-style', 'theme-style-everest-jazz' ); + * $remaining_count--; + * } + * } + * + * `get_attribute()` will return `null` if the attribute wasn't present + * on the tag when it was called. It may return `""` (the empty string) + * in cases where the attribute was present but its value was empty. + * For boolean attributes, those whose name is present but no value is + * given, it will return `true` (the only way to set `false` for an + * attribute is to remove it). + * + * #### When matching fails + * + * When `next_tag()` returns `false` it could mean different things: + * + * - The requested tag wasn't found in the input document. + * - The input document ended in the middle of an XML syntax element. + * + * When a document ends in the middle of a syntax element it will pause + * the processor. This is to make it possible in the future to extend the + * input document and proceed - an important requirement for chunked + * streaming parsing of a document. 
+ * + * Example: + * + * $processor = new WP_XML_Processor( 'This next_tag( array( 'tag_name' => 'wp:todo-list' ) ) ) { + * $p->set_bookmark( 'list-start' ); + * while ( $p->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + * if ( 'wp:todo' === $p->get_tag() && $p->is_tag_closer() ) { + * $p->set_bookmark( 'list-end' ); + * $p->seek( 'list-start' ); + * $p->set_attribute( 'data-contained-todos', (string) $total_todos ); + * $total_todos = 0; + * $p->seek( 'list-end' ); + * break; + * } + * + * if ( 'wp:todo-item' === $p->get_tag() && ! $p->is_tag_closer() ) { + * $total_todos++; + * } + * } + * } + * + * ## Tokens and finer-grained processing. + * + * It's possible to scan through every lexical token in the + * XML document using the `next_token()` function. This + * alternative form takes no argument and provides no built-in + * query syntax. + * + * Example: + * + * $title = '(untitled)'; + * $text = ''; + * while ( $processor->next_token() ) { + * switch ( $processor->get_token_name() ) { + * case '#text': + * $text .= $processor->get_modifiable_text(); + * break; + * + * case 'wp:new-line': + * $text .= "\n"; + * break; + * + * case 'wp:title': + * $title = $processor->get_modifiable_text(); + * break; + * } + * } + * return trim( "# {$title}\n\n{$text}" ); + * + * ### Tokens and _modifiable text_. + * + * #### Other tokens with modifiable text. + * + * There are also non-elements which are void/self-closing in nature and contain + * modifiable text that is part of that individual syntax token itself. + * + * - `#text` nodes, whose entire token _is_ the modifiable text. + * - XML comments and tokens that become comments due to some syntax error. The + * text for these tokens is the portion of the comment inside of the syntax. + * E.g. for `` the text is `" comment "` (note the spaces are included). + * - `CDATA` sections, whose text is the content inside of the section itself. E.g. for + * `` the text is `"some content"`. 
+ * - XML Processing instruction nodes like `` (with restrictions [1]). + * + * [1]: XML requires "xml" as a processing instruction name. The Tag Processor captures the entire + * processing instruction as a single token up to the closing `?>`. + * + * ## Design and limitations + * + * The Tag Processor is designed to linearly scan XML documents and tokenize + * XML tags and their attributes. It's designed to do this as efficiently as + * possible without compromising parsing integrity. Therefore it will be + * slower than some methods of modifying XML, such as those incorporating + * over-simplified PCRE patterns, but will not introduce the defects and + * failures that those methods bring in, which lead to broken page renders + * and often to security vulnerabilities. On the other hand, it will be faster + * than full-blown XML parsers such as DOMDocument and use considerably + * less memory. It requires a negligible memory overhead, enough to consider + * it a zero-overhead system. + * + * The performance characteristics are maintained by avoiding tree construction. + * + * The Tag Processor checks the most important aspects of XML integrity as it scans + * through the document. It verifies that a single root element exists, that there are + * no unclosed tags, and that each opener tag has a corresponding closer. It also + * ensures no duplicate attributes exist on a single tag. + * + * At the same time, the Tag Processor also skips expensive validation of XML entities + * in the document. The Tag Processor will initially pass through the invalid entity references + * and only fail when the developer attempts to read their value. If that doesn't happen, + * the invalid values will be left untouched in the final document. + * + * Most operations within the Tag Processor are designed to minimize the difference + * between an input and output document for any given change. 
For example, the + * `set_attribute` and `remove_attribute` methods preserve whitespace and the attribute + * ordering within the element definition. An exception to this rule is that all attribute + * updates store their values as double-quoted strings, meaning that attributes on input with + * single-quoted or unquoted values will appear in the output with double-quotes. + * + * ### Text Encoding + * + * The Tag Processor assumes that the input XML document is encoded with a + * UTF-8 encoding and will refuse to process documents that declare other encodings. + * * @since WP_VERSION */ -class WP_XML_Processor extends WP_XML_Tag_Processor implements WP_Stream_Processor { +class WP_XML_Processor { + /** + * The maximum number of bookmarks allowed to exist at + * any given time. + * + * @since WP_VERSION + * @var int + * + * @see WP_XML_Processor::set_bookmark() + */ + const MAX_BOOKMARKS = 10; + + /** + * Maximum number of times seek() can be called. + * Prevents accidental infinite loops. + * + * @since WP_VERSION + * @var int + * + * @see WP_XML_Processor::seek() + */ + const MAX_SEEK_OPS = 1000; + + /** + * The XML document to parse. + * + * @since WP_VERSION + * @var string + */ + public $xml; + + /** + * Specifies mode of operation of the parser at any given time. + * + * | State | Meaning | + * | ----------------|------------------------------------------------------------------------| + * | *Ready* | The parser is ready to run. | + * | *Complete* | There is nothing left to parse. | + * | *Incomplete* | The XML ended in the middle of a token; nothing more can be parsed. | + * | *Matched tag* | Found an XML tag; it's possible to modify its attributes. | + * | *Text node* | Found a #text node; this is plaintext and modifiable. | + * | *CDATA node* | Found a CDATA section; this is modifiable. | + * | *PI node* | Found a processing instruction; this is modifiable. | + * | *XML declaration* | Found an XML declaration; this is modifiable. 
| + * | *Comment* | Found a comment or bogus comment; this is modifiable. | + * + * @since WP_VERSION + * + * @see WP_XML_Processor::STATE_READY + * @see WP_XML_Processor::STATE_COMPLETE + * @see WP_XML_Processor::STATE_INCOMPLETE_INPUT + * @see WP_XML_Processor::STATE_MATCHED_TAG + * @see WP_XML_Processor::STATE_TEXT_NODE + * @see WP_XML_Processor::STATE_CDATA_NODE + * @see WP_XML_Processor::STATE_PI_NODE + * @see WP_XML_Processor::STATE_XML_DECLARATION + * @see WP_XML_Processor::STATE_COMMENT + * + * @var string + */ + protected $parser_state = self::STATE_READY; + + /** + * Whether the input has been finished. + * + * @var bool + */ + protected $expecting_more_input = true; + + /** + * How many bytes from the original XML document have been read and parsed. + * + * This value points to the latest byte offset in the input document which + * has been already parsed. It is the internal cursor for the Tag Processor + * and updates while scanning through the XML tokens. + * + * @since WP_VERSION + * @var int + */ + public $bytes_already_parsed = 0; + + /** + * Byte offset in input document where current token starts. + * + * Example: + * + * ... + * 01234 + * - token starts at 0 + * + * @since WP_VERSION + * + * @var int|null + */ + protected $token_starts_at; + + /** + * Byte length of current token. + * + * Example: + * + * ... + * 012345678901234 + * - token length is 14 - 0 = 14 + * + * a is a token. + * 0123456789 123456789 123456789 + * - token length is 17 - 2 = 15 + * + * @since WP_VERSION + * + * @var int|null + */ + private $token_length; + + /** + * Byte offset in input document where current tag name starts. + * + * Example: + * + * ... + * 01234 + * - tag name starts at 1 + * + * @since WP_VERSION + * + * @var int|null + */ + private $tag_name_starts_at; + + /** + * Byte length of current tag name. + * + * Example: + * + * ... 
+ * 01234 + * --- tag name length is 3 + * + * @since WP_VERSION + * + * @var int|null + */ + private $tag_name_length; + + /** + * Byte offset into input document where current modifiable text starts. + * + * @since WP_VERSION + * + * @var int + */ + private $text_starts_at; + + /** + * Byte length of modifiable text. + * + * @since WP_VERSION + * + * @var string + */ + private $text_length; + + /** + * Whether the current tag is an opening tag, e.g. , or a closing tag, e.g. . + * + * @var bool + */ + private $is_closing_tag; + + /** + * Stores an explanation for why something failed, if it did. + * + * @see self::get_last_error + * + * @since WP_VERSION + * + * @var string|null + */ + protected $last_error = null; + + /** + * Lazily-built index of attributes found within an XML tag, keyed by the attribute name. + * + * Example: + * + * // Supposing the parser is working through this content + * // and stops after recognizing the `id` attribute. + * // + * // ^ parsing will continue from this point. + * $this->attributes = array( + * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ) + * ); + * + * // When picking up parsing again, or when asking to find the + * // `class` attribute we will continue and add to this array. + * $this->attributes = array( + * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ), + * 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false ) + * ); + * + * @since WP_VERSION + * @var WP_HTML_Attribute_Token[] + */ + private $attributes = array(); + + /** + * Tracks a semantic location in the original XML which + * shifts with updates as they are applied to the document. + * + * @since WP_VERSION + * @var WP_HTML_Span[] + */ + protected $bookmarks = array(); + + /** + * Lexical replacements to apply to input XML document. + * + * "Lexical" in this class refers to the part of this class which + * operates on pure text _as text_ and not as XML. 
There's a line + * between the public interface, with XML-semantic methods like + * `set_attribute` and `add_class`, and an internal state that tracks + * text offsets in the input document. + * + * When higher-level XML methods are called, those have to transform their + * operations (such as setting an attribute's value) into text diffing + * operations (such as replacing the sub-string from indices A to B with + * some given new string). These text-diffing operations are the lexical + * updates. + * + * As new higher-level methods are added they need to collapse their + * operations into these lower-level lexical updates since that's the + * Tag Processor's internal language of change. Any code which creates + * these lexical updates must ensure that they do not cross XML syntax + * boundaries, however, so these should never be exposed outside of this + * class or any classes which intentionally expand its functionality. + * + * These are enqueued while editing the document instead of being immediately + * applied to avoid processing overhead, string allocations, and string + * copies when applying many updates to a single document. + * + * Example: + * + * // Replace an attribute stored with a new value, indices + * // sourced from the lazily-parsed XML recognizer. + * $start = $attributes['src']->start; + * $length = $attributes['src']->length; + * $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value ); + * + * // Correspondingly, something like this will appear in this array. + * $lexical_updates = array( + * WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' ) + * ); + * + * @since WP_VERSION + * @var WP_HTML_Text_Replacement[] + */ + protected $lexical_updates = array(); + + /** + * Tracks and limits `seek()` calls to prevent accidental infinite loops. 
+ * + * @since WP_VERSION + * @var int + * + * @see WP_XML_Processor::seek() + */ + protected $seek_count = 0; /** * Indicates the current parsing stage. @@ -32,9 +598,9 @@ class WP_XML_Processor extends WP_XML_Tag_Processor implements WP_Stream_Process * | *Element* | The parser is parsing the root element. | * | *Misc* | The parser is parsing miscellaneous content. | * - * @see WP_XML_Tag_Processor::IN_PROLOG_CONTEXT - * @see WP_XML_Tag_Processor::IN_ELEMENT_CONTEXT - * @see WP_XML_Tag_Processor::IN_MISC_CONTEXT + * @see WP_XML_Processor::IN_PROLOG_CONTEXT + * @see WP_XML_Processor::IN_ELEMENT_CONTEXT + * @see WP_XML_Processor::IN_MISC_CONTEXT * * @since WP_VERSION * @var bool @@ -50,13 +616,51 @@ class WP_XML_Processor extends WP_XML_Tag_Processor implements WP_Stream_Process */ public $stack_of_open_elements = array(); - public static function stream( $node_visitor_callback ) { - $xml_processor = new WP_XML_Processor( '', array(), WP_XML_Processor::IN_PROLOG_CONTEXT ); + public $had_previous_chunks = false; + + /** + * + */ + public static function from_string( $xml, $known_definite_encoding = 'UTF-8' ) { + if ( 'UTF-8' !== $known_definite_encoding ) { + return null; + } + + $processor = new WP_XML_Processor( $xml ); + $processor->input_finished(); + return $processor; + } + + public static function from_stream( $xml, $known_definite_encoding = 'UTF-8' ) { + if ( 'UTF-8' !== $known_definite_encoding ) { + return null; + } + return new WP_XML_Processor( $xml ); + } + + /** + * Constructor. + * + * Do not use this method. Use the static creator methods instead. + * + * @access private + * + * @since 6.4.0 + * + * @see WP_XML_Processor::create_fragment() + * @see WP_XML_Processor::create_stream() + * + * @param string $xml XML to process. 
+ */ + protected function __construct( $xml ) { + $this->xml = $xml; + } + + public static function create_stream_processor( $node_visitor_callback ) { + $xml_processor = WP_XML_Processor::from_stream( '' ); return new ProcessorByteStream( $xml_processor, function ( $state ) use ( $xml_processor, $node_visitor_callback ) { - $buffer = $xml_processor->flush_processed_xml(); - $new_bytes = $state->consume_input_bytes(); if ( null !== $new_bytes ) { $xml_processor->append_bytes( $new_bytes ); @@ -67,16 +671,16 @@ function ( $state ) use ( $xml_processor, $node_visitor_callback ) { $node_visitor_callback( $xml_processor ); } + $buffer = ''; if ( $tokens_found > 0 ) { $buffer .= $xml_processor->flush_processed_xml(); } elseif ( - $tokens_found === 0 && - ! $xml_processor->is_paused_at_incomplete_input() && - $xml_processor->get_current_depth() === 0 + $tokens_found === 0 && + ! $xml_processor->is_paused_at_incomplete_input() && + $xml_processor->get_current_depth() === 0 ) { - // We've reached the end of the document, let's finish up. 
- // @TODO: Fix this so it doesn't return the entire XML - $buffer .= $xml_processor->get_unprocessed_xml(); + $buffer .= $xml_processor->flush_processed_xml(); + $buffer .= $xml_processor->get_updated_xml(); $state->finish(); } @@ -90,6 +694,8 @@ function ( $state ) use ( $xml_processor, $node_visitor_callback ) { ); } + /* + @TODO: implement these methods for re-entrancy public function pause() { return array( @@ -97,7 +703,7 @@ public function pause() { // @TODO: Include all the information below in the bookmark: 'bytes_already_parsed' => $this->token_starts_at, 'breadcrumbs' => $this->get_breadcrumbs(), - 'parser_context' => $this->get_parser_context(), + 'parser_context' => $this->parser_context, 'stack_of_open_elements' => $this->stack_of_open_elements, ); } @@ -107,8 +713,9 @@ public function resume( $paused ) { $this->stack_of_open_elements = $paused['stack_of_open_elements']; $this->parser_context = $paused['parser_context']; $this->bytes_already_parsed = $paused['bytes_already_parsed']; - $this->base_class_next_token(); + $this->parse_next_token(); } + */ /** * Wipes out the processed XML and appends the next chunk of XML to @@ -117,17 +724,41 @@ public function resume( $paused ) { * @param string $next_chunk XML to append. */ public function append_bytes( string $next_chunk ) { - $this->xml .= $next_chunk; + if ( ! $this->expecting_more_input ) { + _doing_it_wrong( + __METHOD__, + __( 'Cannot append bytes after the last input chunk was provided and input_finished() was called.' ), + 'WP_VERSION' + ); + return false; + } + $this->xml .= $next_chunk; + $this->had_previous_chunks = true; + if ( $this->parser_state === self::STATE_INCOMPLETE_INPUT ) { + $this->parser_state = self::STATE_READY; + } + return true; + } + + /** + * Indicates that all the XML document bytes have been provided. + * + * After calling this method, the processor will emit errors where + * previously it would have entered the STATE_INCOMPLETE_INPUT state. 
+ */ + public function input_finished() { + $this->expecting_more_input = false; } public function flush_processed_xml() { + // Flush updates $this->get_updated_xml(); - $processed_xml = $this->get_processed_xml(); - $unprocessed_xml = $this->get_unprocessed_xml(); + $processed_xml = substr( $this->xml, 0, $this->bytes_already_parsed ); + $unprocessed_xml = substr( $this->xml, $this->bytes_already_parsed ); $breadcrumbs = $this->get_breadcrumbs(); - $parser_context = $this->get_parser_context(); + $parser_context = $this->parser_context; $this->reset_state(); @@ -140,29 +771,486 @@ public function flush_processed_xml() { } /** - * Constructor. + * Internal method which finds the next token in the XML document. * - * @since WP_VERSION + * This method is a protected internal function which implements the logic for + * finding the next token in a document. It exists so that the parser can update + * its state without affecting the location of the cursor in the document and + * without triggering subclass methods for things like `next_token()`, e.g. when + * applying patches before searching for the next token. + * + * @since 6.5.0 * - * @param string $xml XML to process. + * @access private + * + * @return bool Whether a token was parsed. */ - public function __construct( $xml, $breadcrumbs = array(), $parser_context = self::IN_PROLOG_CONTEXT ) { - parent::__construct( $xml ); - $this->stack_of_open_elements = $breadcrumbs; - $this->parser_context = $parser_context; - } + protected function parse_next_token() { + $was_at = $this->bytes_already_parsed; + $this->after_tag(); - public function get_parser_context() { - return $this->parser_context; - } + // Don't proceed if there's nothing more to scan. + if ( + self::STATE_COMPLETE === $this->parser_state || + self::STATE_INCOMPLETE_INPUT === $this->parser_state || + null !== $this->last_error + ) { + return false; + } - /** - * Finds the next element matching the $query. 
- * - * This doesn't currently have a way to represent non-tags and doesn't process - * semantic rules for text nodes. For access to the raw tokens consider using - * WP_XML_Tag_Processor instead. - * + /* + * The next step in the parsing loop determines the parsing state; + * clear it so that state doesn't linger from the previous step. + */ + $this->parser_state = self::STATE_READY; + + if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) { + if ( $this->expecting_more_input ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + } else { + $this->parser_state = self::STATE_COMPLETE; + } + return false; + } + + // Find the next tag if it exists. + if ( false === $this->parse_next_tag() ) { + if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { + $this->bytes_already_parsed = $was_at; + } + + return false; + } + + if ( null !== $this->last_error ) { + return false; + } + + /* + * For legacy reasons the rest of this function handles tags and their + * attributes. If the processor has reached the end of the document + * or if it matched any other token then it should return here to avoid + * attempting to process tag-specific syntax. + */ + if ( + self::STATE_INCOMPLETE_INPUT !== $this->parser_state && + self::STATE_COMPLETE !== $this->parser_state && + self::STATE_MATCHED_TAG !== $this->parser_state + ) { + return true; + } + + if ( $this->is_closing_tag ) { + $this->skip_whitespace(); + } else { + // Parse all of its attributes. + while ( $this->parse_next_attribute() ) { + continue; + } + } + + if ( null !== $this->last_error ) { + return false; + } + + // Ensure that the tag closes before the end of the document. + if ( + self::STATE_INCOMPLETE_INPUT === $this->parser_state || + $this->bytes_already_parsed >= strlen( $this->xml ) + ) { + // Does this appropriately clear state (parsed attributes)? 
+ $this->set_incomplete_input_or_parse_error(); + $this->bytes_already_parsed = $was_at; + + return false; + } + + $tag_ends_at = strpos( $this->xml, '>', $this->bytes_already_parsed ); + if ( false === $tag_ends_at ) { + $this->set_incomplete_input_or_parse_error(); + $this->bytes_already_parsed = $was_at; + + return false; + } + + if ( $this->is_closing_tag && $tag_ends_at !== $this->bytes_already_parsed ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'Invalid closing tag encountered.' ), + 'WP_VERSION' + ); + return false; + } + + $this->parser_state = self::STATE_MATCHED_TAG; + $this->bytes_already_parsed = $tag_ends_at + 1; + $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; + + /* + * If we are in a PCData element, everything until the closer + * is considered text. + */ + if ( ! $this->is_pcdata_element() ) { + return true; + } + + /* + * Preserve the opening tag pointers, as these will be overwritten + * when finding the closing tag. They will be reset after finding + * the closing to tag to point to the opening of the special atomic + * tag sequence. + */ + $tag_name_starts_at = $this->tag_name_starts_at; + $tag_name_length = $this->tag_name_length; + $tag_ends_at = $this->token_starts_at + $this->token_length; + $attributes = $this->attributes; + + $found_closer = $this->skip_pcdata( $this->get_tag() ); + + // Closer not found, the document is incomplete. + if ( false === $found_closer ) { + $this->set_incomplete_input_or_parse_error(); + $this->bytes_already_parsed = $was_at; + return false; + } + + /* + * The values here look like they reference the opening tag but they reference + * the closing tag instead. This is why the opening tag values were stored + * above in a variable. It reads confusingly here, but that's because the + * functions that skip the contents have moved all the internal cursors past + * the inner content of the tag. 
+ */ + $this->token_starts_at = $was_at; + $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; + $this->text_starts_at = $tag_ends_at; + $this->text_length = $this->tag_name_starts_at - $this->text_starts_at; + $this->tag_name_starts_at = $tag_name_starts_at; + $this->tag_name_length = $tag_name_length; + $this->attributes = $attributes; + + return true; + } + + /** + * Whether the processor paused because the input XML document ended + * in the middle of a syntax element, such as in the middle of a tag. + * + * Example: + * + * $processor = new WP_XML_Processor( '

Surprising fact you may no… + * ^ ^ + * \-|-- it shifts with edits + * + * Bookmarks provide the ability to seek to a previously-scanned + * place in the XML document. This avoids the need to re-scan + * the entire document. + * + * Example: + * + *
  • One
  • Two
  • Three
+ * ^^^^ + * want to note this last item + * + * $p = new WP_XML_Processor( $xml ); + * $in_list = false; + * while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) { + * if ( 'UL' === $p->get_tag() ) { + * if ( $p->is_tag_closer() ) { + * $in_list = false; + * $p->set_bookmark( 'resume' ); + * if ( $p->seek( 'last-li' ) ) { + * $p->add_class( 'last-li' ); + * } + * $p->seek( 'resume' ); + * $p->release_bookmark( 'last-li' ); + * $p->release_bookmark( 'resume' ); + * } else { + * $in_list = true; + * } + * } + * + * if ( 'LI' === $p->get_tag() ) { + * $p->set_bookmark( 'last-li' ); + * } + * } + * + * Bookmarks intentionally hide the internal string offsets + * to which they refer. They are maintained internally as + * updates are applied to the XML document and therefore + * retain their "position" - the location to which they + * originally pointed. The inability to use bookmarks with + * functions like `substr` is therefore intentional to guard + * against accidentally breaking the XML. + * + * Because bookmarks allocate memory and require processing + * for every applied update, they are limited and require + * a name. They should not be created with programmatically-made + * names, such as "li_{$index}" with some loop. As a general + * rule they should only be created with string-literal names + * like "start-of-section" or "last-paragraph". + * + * Bookmarks are a powerful tool to enable complicated behavior. + * Consider double-checking that you need this tool if you are + * reaching for it, as inappropriate use could lead to broken + * XML structure or unwanted processing overhead. + * + * @since WP_VERSION + * + * @param string $name Identifies this particular bookmark. + * @return bool Whether the bookmark was successfully created. + */ + public function set_bookmark( $name ) { + // It only makes sense to set a bookmark if the parser has paused on a concrete token. 
+ if ( + self::STATE_COMPLETE === $this->parser_state || + self::STATE_INCOMPLETE_INPUT === $this->parser_state + ) { + return false; + } + + if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= static::MAX_BOOKMARKS ) { + _doing_it_wrong( + __METHOD__, + __( 'Too many bookmarks: cannot create any more.' ), + 'WP_VERSION' + ); + return false; + } + + $this->bookmarks[ $name ] = new WP_HTML_Span( $this->token_starts_at, $this->token_length ); + + return true; + } + + + /** + * Removes a bookmark that is no longer needed. + * + * Releasing a bookmark frees up the small + * performance overhead it requires. + * + * @param string $name Name of the bookmark to remove. + * @return bool Whether the bookmark already existed before removal. + */ + public function release_bookmark( $name ) { + if ( ! array_key_exists( $name, $this->bookmarks ) ) { + return false; + } + + unset( $this->bookmarks[ $name ] ); + + return true; + } + + /** + * Skips contents of PCDATA element. + * + * @since WP_VERSION + * + * @see https://www.w3.org/TR/xml/#sec-mixed-content + * + * @param string $tag_name The tag name which will close the PCDATA region. + * @return false|int Byte offset of the closing tag, or false if not found. + */ + private function skip_pcdata( $tag_name ) { + $xml = $this->xml; + $doc_length = strlen( $xml ); + $tag_length = strlen( $tag_name ); + + $at = $this->bytes_already_parsed; + while ( false !== $at && $at < $doc_length ) { + $at = strpos( $this->xml, 'tag_name_starts_at = $at; + + // Fail if there is no possible tag closer. + if ( false === $at ) { + return false; + } + + $at += 2 + $tag_length; + $at += strspn( $this->xml, " \t\f\r\n", $at ); + $this->bytes_already_parsed = $at; + + /* + * Ensure that the tag name terminates to avoid matching on + * substrings of a longer tag name. 
For example, the sequence + * "= strlen( $xml ) ) { + return false; + } + if ( '>' === $xml[ $at ] ) { + $this->bytes_already_parsed = $at + 1; + return true; + } + } + + return false; + } + + /** + * Returns the last error, if any. + * + * Various situations lead to parsing failure but this class will + * return `false` in all those cases. To determine why something + * failed it's possible to request the last error. This can be + * helpful to know to distinguish whether a given tag couldn't + * be found or if content in the document caused the processor + * to give up and abort processing. + * + * Example + * + * $processor = WP_XML_Processor::create_fragment( '' ); + * false === $processor->next_tag(); + * WP_XML_Processor::ERROR_SYNTAX === $processor->get_last_error(); + * + * @since WP_VERSION + * + * @see self::ERROR_UNSUPPORTED + * @see self::ERROR_EXCEEDED_MAX_BOOKMARKS + * + * @return string|null The last error, if one exists, otherwise null. + */ + public function get_last_error(): ?string { + return $this->last_error; + } + + /** + * Tag names declared as PCDATA elements. + * + * PCDATA elements are elements in which everything is treated as + * text, even syntax that may look like other elements, closers, + * processing instructions, etc. + * + * Example: + * + * + * + * This text contains syntax that seems + * like XML nodes: + * + * + * + * + * + * + * &<>"' + * + * But! It's all treated as text. + * + * + * + * @var array + */ + private $pcdata_elements = array(); + + /** + * Declares an element as PCDATA. + * + * PCDATA elements are elements in which everything is treated as + * text, even syntax that may look like other elements, closers, + * processing instructions, etc. + * + * For example: + * + * $processor = new WP_XML_Processor( + * << + * + * This text uses syntax that may seem + * like XML nodes: + * + * + * + * + * + * + * &<>"' + * + * But! It's all treated as text. 
+ * + * + * XML + * ); + * + * $processor->declare_element_as_pcdata('my-pcdata'); + * $processor->next_tag('my-pcdata'); + * $processor->next_token(); + * + * // Returns everything inside the + * // element as text: + * $processor->get_modifiable_text(); + * + * @param string $element_name The name of the element to declare as PCDATA. + * @return void + */ + public function declare_element_as_pcdata( $element_name ) { + $this->pcdata_elements[ $element_name ] = true; + } + + /** + * Indicates if the currently matched tag is a PCDATA element. + * + * @since WP_VERSION + * + * @return bool Whether the currently matched tag is a PCDATA element. + */ + public function is_pcdata_element() { + return array_key_exists( $this->get_tag(), $this->pcdata_elements ); + } + + + /** + * Finds the next element matching the $query. + * + * This doesn't currently have a way to represent non-tags and doesn't process + * semantic rules for text nodes. For access to the raw tokens consider using + * WP_XML_Processor instead. + * * @since WP_VERSION * * @param array|string|null $query { @@ -244,159 +1332,1462 @@ public function next_tag( $query = null ) { return false; } - /* - * Sets a bookmark in the XML document. - * - * Bookmarks represent specific places or tokens in the HTML - * document, such as a tag opener or closer. When applying - * edits to a document, such as setting an attribute, the - * text offsets of that token may shift; the bookmark is - * kept updated with those shifts and remains stable unless - * the entire span of text in which the token sits is removed. - * - * Release bookmarks when they are no longer needed. + /** + * Parses the next tag. * - * Example: + * This will find and start parsing the next tag, including + * the opening `<`, the potential closer `/`, and the tag + * name. It does not parse the attributes or scan to the + * closing `>`; these are left for other methods. * - *

Surprising fact you may not know!

- * ^ ^ - * \-|-- this `H2` opener bookmark tracks the token + * @since WP_VERSION * - *

Surprising fact you may no… - * ^ ^ - * \-|-- it shifts with edits + * @return bool Whether a tag was found before the end of the document. + */ + private function parse_next_tag() { + $this->after_tag(); + + $xml = $this->xml; + $doc_length = strlen( $xml ); + $was_at = $this->bytes_already_parsed; + $at = $was_at; + + while ( false !== $at && $at < $doc_length ) { + $at = strpos( $xml, '<', $at ); + if ( false === $at ) { + break; + } + + if ( $at > $was_at ) { + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->token_length = $at - $was_at; + $this->text_starts_at = $was_at; + $this->text_length = $this->token_length; + $this->bytes_already_parsed = $at; + + return true; + } + + $this->token_starts_at = $at; + + if ( $at + 1 < $doc_length && '/' === $this->xml[ $at + 1 ] ) { + $this->is_closing_tag = true; + ++$at; + } else { + $this->is_closing_tag = false; + } + + if ( $at + 1 >= $doc_length ) { + $this->set_incomplete_input_or_parse_error(); + return false; + } + + /* + * XML tag names are defined by the same `Name` grammar rule as attribute + * names. + * + * Reference: + * * https://www.w3.org/TR/xml/#NT-STag + * * https://www.w3.org/TR/xml/#NT-Name + */ + $tag_name_length = $this->parse_name( $at + 1 ); + if ( $tag_name_length > 0 ) { + ++$at; + $this->parser_state = self::STATE_MATCHED_TAG; + $this->tag_name_starts_at = $at; + $this->tag_name_length = $tag_name_length; + $this->token_length = $this->tag_name_length; + $this->bytes_already_parsed = $at + $this->tag_name_length; + + return true; + } + + /* + * Abort if no tag is found before the end of + * the document. There is nothing left to parse. + */ + if ( $at + 1 >= $doc_length ) { + $this->set_incomplete_input_or_parse_error(); + + return false; + } + + /* + * `is_closing_tag && '!' === $xml[ $at + 1 ] ) { + /* + * ` sequence. + */ + --$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping. 
+ while ( ++$closer_at < $doc_length ) { + $closer_at = strpos( $xml, '--', $closer_at ); + if ( false === $closer_at || $closer_at + 2 === $doc_length ) { + $this->set_incomplete_input_or_parse_error(); + return false; + } + + /* + * The string " -- " (double-hyphen) must not occur within comments + * See https://www.w3.org/TR/xml/#sec-comments + */ + if ( '>' !== $xml[ $closer_at + 2 ] ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'Invalid comment syntax encountered.' ), + 'WP_VERSION' + ); + return false; + } + + $this->parser_state = self::STATE_COMMENT; + $this->token_length = $closer_at + 3 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 4; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 3; + return true; + } + } + + /* + * Identify CDATA sections. + * + * Within a CDATA section, everything until the ]]> string is treated + * as data, not markup. Left angle brackets and ampersands may occur in + * their literal form; they need not (and cannot) be escaped using "<" + * and "&". CDATA sections cannot nest. + * + * See https://www.w3.org/TR/xml11.xml/#sec-cdata-sect + */ + if ( + ! 
$this->is_closing_tag && + $doc_length > $this->token_starts_at + 8 && + '[' === $xml[ $this->token_starts_at + 2 ] && + 'C' === $xml[ $this->token_starts_at + 3 ] && + 'D' === $xml[ $this->token_starts_at + 4 ] && + 'A' === $xml[ $this->token_starts_at + 5 ] && + 'T' === $xml[ $this->token_starts_at + 6 ] && + 'A' === $xml[ $this->token_starts_at + 7 ] && + '[' === $xml[ $this->token_starts_at + 8 ] + ) { + $closer_at = strpos( $xml, ']]>', $at + 1 ); + if ( false === $closer_at ) { + $this->set_incomplete_input_or_parse_error(); + + return false; + } + + $this->parser_state = self::STATE_CDATA_NODE; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 9; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 3; + return true; + } + + /* + * Anything else here is either unsupported at this point or invalid + * syntax. See the class-level @TODO annotations for more information. + */ + $this->set_incomplete_input_or_parse_error(); + + return false; + } + + /* + * An `had_previous_chunks && + ! $this->is_closing_tag && + '?' === $xml[ $at + 1 ] && + 'x' === $xml[ $at + 2 ] && + 'm' === $xml[ $at + 3 ] && + 'l' === $xml[ $at + 4 ] + ) { + // Setting the parser state early for the get_attribute() calls later in this + // branch. + $this->parser_state = self::STATE_XML_DECLARATION; + + $at += 5; + + // Skip whitespace. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + $this->bytes_already_parsed = $at; + + /* + * Reuse parse_next_attribute() to parse the XML declaration attributes. + * Technically, only "version", "encoding", and "standalone" are accepted + * and, unlike regular tag attributes, their values can contain any character + * other than the opening quote. However, the "<" and "&" characters are very + * unlikely to be encountered and cause trouble, so this code path liberally + * does not provide a dedicated parsing logic. 
+ */ + while ( false !== $this->parse_next_attribute() ) { + $this->skip_whitespace(); + // Parse until the XML declaration closer. + if ( '?' === $xml[ $this->bytes_already_parsed ] ) { + break; + } + } + + if ( null !== $this->last_error ) { + return false; + } + + foreach ( $this->attributes as $name => $attribute ) { + if ( 'version' !== $name && 'encoding' !== $name && 'standalone' !== $name ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'Invalid attribute found in XML declaration.' ), + 'WP_VERSION' + ); + return false; + } + } + + if ( '1.0' !== $this->get_attribute( 'version' ) ) { + $this->last_error = self::ERROR_UNSUPPORTED; + _doing_it_wrong( + __METHOD__, + __( 'Unsupported XML version declared' ), + 'WP_VERSION' + ); + return false; + } + + /** + * Standalone XML documents have no external dependencies, + * including predefined entities like ` ` and `©`. + * + * See https://www.w3.org/TR/xml/#sec-predefined-ent. + */ + if ( null !== $this->get_attribute( 'encoding' ) + && 'UTF-8' !== strtoupper( $this->get_attribute( 'encoding' ) ) + ) { + $this->last_error = self::ERROR_UNSUPPORTED; + _doing_it_wrong( + __METHOD__, + __( 'Unsupported XML encoding declared, only UTF-8 is supported.' ), + 'WP_VERSION' + ); + return false; + } + if ( null !== $this->get_attribute( 'standalone' ) + && 'YES' !== strtoupper( $this->get_attribute( 'standalone' ) ) + ) { + $this->last_error = self::ERROR_UNSUPPORTED; + _doing_it_wrong( + __METHOD__, + __( 'Standalone XML documents are not supported.' ), + 'WP_VERSION' + ); + return false; + } + + $at = $this->bytes_already_parsed; + + // Skip whitespace. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + // Consume the closer. + if ( ! ( + $at + 2 <= $doc_length && + '?' === $xml[ $at ] && + '>' === $xml[ $at + 1 ] + ) ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'XML declaration closer not found.' 
), + 'WP_VERSION' + ); + return false; + } + + $this->token_length = $at + 2 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 2; + $this->text_length = $at - $this->text_starts_at; + $this->bytes_already_parsed = $at + 2; + $this->parser_state = self::STATE_XML_DECLARATION; + + return true; + } + + /* + * `is_closing_tag && + '?' === $xml[ $at + 1 ] + ) { + if ( $at + 4 >= $doc_length ) { + $this->set_incomplete_input_or_parse_error(); + + return false; + } + + if ( ! ( + ( 'x' === $xml[ $at + 2 ] || 'X' === $xml[ $at + 2 ] ) && + ( 'm' === $xml[ $at + 3 ] || 'M' === $xml[ $at + 3 ] ) && + ( 'l' === $xml[ $at + 4 ] || 'L' === $xml[ $at + 4 ] ) + ) ) { + _doing_it_wrong( + __METHOD__, + __( 'Invalid processing instruction target.' ), + 'WP_VERSION' + ); + return false; + } + + $at += 5; + + // Skip whitespace. + $this->skip_whitespace(); + + /* + * Find the closer. + * + * We could, at this point, only consume the bytes allowed by the specification, that is: + * + * [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] // any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. + * + * However, that would require running a slow regular-expression engine for, seemingly, + * little benefit. For now, we are going to pretend that all bytes are allowed until the + * closing ?> is found. Some failures may pass unnoticed. That may not be a problem in practice, + * but if it is then this code path will require a stricter implementation. 
+ */ + $closer_at = strpos( $xml, '?>', $at ); + if ( false === $closer_at ) { + $this->set_incomplete_input_or_parse_error(); + + return false; + } + + $this->parser_state = self::STATE_PI_NODE; + $this->token_length = $closer_at + 5 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 5; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 2; + + return true; + } + + ++$at; + } + + // There's no more tag openers and we're not expecting more data – + // this mist be a trailing text node. + if ( ! $this->expecting_more_input ) { + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->token_length = $doc_length - $was_at; + $this->text_starts_at = $was_at; + $this->text_length = $doc_length - $was_at; + $this->bytes_already_parsed = $doc_length; + return true; + } + + /* + * This does not imply an incomplete parse; it indicates that there + * can be nothing left in the document other than a #text node. + */ + $this->set_incomplete_input_or_parse_error(); + $this->token_starts_at = $was_at; + $this->token_length = $doc_length - $was_at; + $this->text_starts_at = $was_at; + $this->text_length = $doc_length - $was_at; + return false; + } + + /** + * Parses the next attribute. * - * Bookmarks provide the ability to seek to a previously-scanned - * place in the HTML document. This avoids the need to re-scan - * the entire document. + * @since WP_VERSION + * + * @return bool Whether an attribute was found before the end of the document. + */ + private function parse_next_attribute() { + // Skip whitespace and slashes. + $this->bytes_already_parsed += strspn( $this->xml, " \t\f\r\n/", $this->bytes_already_parsed ); + if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) { + $this->set_incomplete_input_or_parse_error(); + return false; + } + + // No more attributes to parse. 
+ if ( '>' === $this->xml[ $this->bytes_already_parsed ] ) { + return false; + } + + $attribute_start = $this->bytes_already_parsed; + $attribute_name_length = $this->parse_name( $this->bytes_already_parsed ); + if ( 0 === $attribute_name_length ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'Invalid attribute name encountered.' ), + 'WP_VERSION' + ); + } + $this->bytes_already_parsed += $attribute_name_length; + $attribute_name = substr( $this->xml, $attribute_start, $attribute_name_length ); + $this->skip_whitespace(); + + // Parse attribute value. + ++$this->bytes_already_parsed; + $this->skip_whitespace(); + if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) { + $this->set_incomplete_input_or_parse_error(); + return false; + } + switch ( $this->xml[ $this->bytes_already_parsed ] ) { + case "'": + case '"': + $quote = $this->xml[ $this->bytes_already_parsed ]; + $value_start = $this->bytes_already_parsed + 1; + /** + * XML attributes cannot contain the characters "<" or "&". + * + * This only checks for "<" because it's reasonably fast. + * Ampersands are actually allowed when used as the start + * of an entity reference, but enforcing that would require + * an expensive and complex check. It doesn't seem to be + * worth it. + * + * @TODO: Discuss enforcing or abandoning the ampersand rule + * and document the rationale. + */ + $value_length = strcspn( $this->xml, "<$quote", $value_start ); + $attribute_end = $value_start + $value_length + 1; + + if ( $attribute_end - 1 >= strlen( $this->xml ) ) { + $this->set_incomplete_input_or_parse_error(); + + return false; + } + + if ( $this->xml[ $attribute_end - 1 ] !== $quote ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'A disallowed character encountered in an attribute value (either < or &).' 
), + 'WP_VERSION' + ); + } + $this->bytes_already_parsed = $attribute_end; + break; + + default: + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'Unquoted attribute value encountered.' ), + 'WP_VERSION' + ); + return false; + } + + if ( $attribute_end >= strlen( $this->xml ) ) { + $this->set_incomplete_input_or_parse_error(); + return false; + } + + if ( $this->is_closing_tag ) { + return true; + } + + if ( array_key_exists( $attribute_name, $this->attributes ) ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'Duplicate attribute found in an XML tag.' ), + 'WP_VERSION' + ); + return false; + } + + $this->attributes[ $attribute_name ] = new WP_HTML_Attribute_Token( + $attribute_name, + $value_start, + $value_length, + $attribute_start, + $attribute_end - $attribute_start, + false + ); + + return true; + } + + /** + * Move the internal cursor past any immediate successive whitespace. + * + * @since WP_VERSION + */ + private function skip_whitespace() { + $this->bytes_already_parsed += strspn( $this->xml, " \t\f\r\n", $this->bytes_already_parsed ); + } + + // Describes the first character of the attribute name: + // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] + // See https://www.w3.org/TR/xml/#NT-Name + const NAME_START_CHAR_PATTERN = ':a-z_A-Z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}'; + const NAME_CHAR_PATTERN = 
'\-\.0-9\x{B7}\x{0300}-\x{036F}\x{203F}-\x{2040}:a-z_A-Z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}'; + private function parse_name( $offset ) { + if ( 1 !== preg_match( + '~[' . self::NAME_START_CHAR_PATTERN . ']~Ssu', + $this->xml[ $offset ], + $matches + ) ) { + return 0; + } + + $name_length = 1; + + // Consume the rest of the name + preg_match( + '~\G([' . self::NAME_CHAR_PATTERN . ']+)~Ssu', + $this->xml, + $matches, + 0, + $offset + 1 + ); + + if ( is_array( $matches ) && count( $matches ) > 0 ) { + $name_length += strlen( $matches[0] ); + } + + return $name_length; + } + + /** + * Applies attribute updates and cleans up once a tag is fully parsed. + * + * @since WP_VERSION + */ + private function after_tag() { + /* + * Purge updates if there are too many. The actual count isn't + * scientific, but a few values from 100 to a few thousand were + * tests to find a practically-useful limit. + * + * If the update queue grows too big, then the Tag Processor + * will spend more time iterating through them and lose the + * efficiency gains of deferring applying them. + */ + if ( 1000 < count( $this->lexical_updates ) ) { + $this->get_updated_xml(); + } + + foreach ( $this->lexical_updates as $name => $update ) { + /* + * Any updates appearing after the cursor should be applied + * before proceeding, otherwise they may be overlooked. 
+ */ + if ( $update->start >= $this->bytes_already_parsed ) { + $this->get_updated_xml(); + break; + } + + if ( is_int( $name ) ) { + continue; + } + + $this->lexical_updates[] = $update; + unset( $this->lexical_updates[ $name ] ); + } + + $this->token_starts_at = null; + $this->token_length = null; + $this->tag_name_starts_at = null; + $this->tag_name_length = null; + $this->text_starts_at = null; + $this->text_length = null; + $this->is_closing_tag = null; + $this->attributes = array(); + } + + protected function reset_state() { + $this->xml = ''; + $this->parser_state = self::STATE_READY; + $this->bytes_already_parsed = 0; + $this->token_starts_at = null; + $this->token_length = null; + $this->tag_name_starts_at = null; + $this->tag_name_length = null; + $this->text_starts_at = null; + $this->text_length = null; + $this->is_closing_tag = null; + $this->last_error = null; + $this->attributes = array(); + $this->bookmarks = array(); + $this->lexical_updates = array(); + $this->seek_count = 0; + $this->had_previous_chunks = false; + } + + /** + * Applies lexical updates to XML document. + * + * @since WP_VERSION + * + * @param int $shift_this_point Accumulate and return shift for this position. + * @return int How many bytes the given pointer moved in response to the updates. + */ + private function apply_lexical_updates( $shift_this_point = 0 ) { + if ( ! count( $this->lexical_updates ) ) { + return 0; + } + + $accumulated_shift_for_given_point = 0; + + /* + * Attribute updates can be enqueued in any order but updates + * to the document must occur in lexical order; that is, each + * replacement must be made before all others which follow it + * at later string indices in the input document. + * + * Sorting avoid making out-of-order replacements which + * can lead to mangled output, partially-duplicated + * attributes, and overwritten attributes. 
+ */ + usort( $this->lexical_updates, array( self::class, 'sort_start_ascending' ) ); + + $bytes_already_copied = 0; + $output_buffer = ''; + foreach ( $this->lexical_updates as $diff ) { + $shift = strlen( $diff->text ) - $diff->length; + + // Adjust the cursor position by however much an update affects it. + if ( $diff->start < $this->bytes_already_parsed ) { + $this->bytes_already_parsed += $shift; + } + + // Accumulate shift of the given pointer within this function call. + if ( $diff->start <= $shift_this_point ) { + $accumulated_shift_for_given_point += $shift; + } + + $output_buffer .= substr( $this->xml, $bytes_already_copied, $diff->start - $bytes_already_copied ); + $output_buffer .= $diff->text; + $bytes_already_copied = $diff->start + $diff->length; + } + + $this->xml = $output_buffer . substr( $this->xml, $bytes_already_copied ); + + /* + * Adjust bookmark locations to account for how the text + * replacements adjust offsets in the input document. + */ + foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { + $bookmark_end = $bookmark->start + $bookmark->length; + + /* + * Each lexical update which appears before the bookmark's endpoints + * might shift the offsets for those endpoints. Loop through each change + * and accumulate the total shift for each bookmark, then apply that + * shift after tallying the full delta. 
+ */ + $head_delta = 0; + $tail_delta = 0; + + foreach ( $this->lexical_updates as $diff ) { + $diff_end = $diff->start + $diff->length; + + if ( $bookmark->start < $diff->start && $bookmark_end < $diff->start ) { + break; + } + + if ( $bookmark->start >= $diff->start && $bookmark_end < $diff_end ) { + $this->release_bookmark( $bookmark_name ); + continue 2; + } + + $delta = strlen( $diff->text ) - $diff->length; + + if ( $bookmark->start >= $diff->start ) { + $head_delta += $delta; + } + + if ( $bookmark_end >= $diff_end ) { + $tail_delta += $delta; + } + } + + $bookmark->start += $head_delta; + $bookmark->length += $tail_delta - $head_delta; + } + + $this->lexical_updates = array(); + + return $accumulated_shift_for_given_point; + } + + /** + * Checks whether a bookmark with the given name exists. + * + * @since WP_VERSION + * + * @param string $bookmark_name Name to identify a bookmark that potentially exists. + * @return bool Whether that bookmark exists. + */ + public function has_bookmark( $bookmark_name ) { + return array_key_exists( $bookmark_name, $this->bookmarks ); + } + + /** + * Move the internal cursor in the Tag Processor to a given bookmark's location. + * + * Be careful! Seeking backwards to a previous location resets the parser to the + * start of the document and reparses the entire contents up until it finds the + * sought-after bookmarked location. + * + * In order to prevent accidental infinite loops, there's a + * maximum limit on the number of times seek() can be called. + * + * @since WP_VERSION + * + * @param string $bookmark_name Jump to the place in the document identified by this bookmark name. + * @return bool Whether the internal cursor was successfully moved to the bookmark's location. + */ + public function seek( $bookmark_name ) { + if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) { + _doing_it_wrong( + __METHOD__, + __( 'Unknown bookmark name.' 
), + 'WP_VERSION' + ); + return false; + } + + if ( ++$this->seek_count > static::MAX_SEEK_OPS ) { + _doing_it_wrong( + __METHOD__, + __( 'Too many calls to seek() - this can lead to performance issues.' ), + 'WP_VERSION' + ); + return false; + } + + // Flush out any pending updates to the document. + $this->get_updated_xml(); + + // Point this tag processor before the sought tag opener and consume it. + $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; + $this->parser_state = self::STATE_READY; + return $this->parse_next_token(); + } + + /** + * Compare two WP_HTML_Text_Replacement objects. + * + * @since WP_VERSION + * + * @param WP_HTML_Text_Replacement $a First attribute update. + * @param WP_HTML_Text_Replacement $b Second attribute update. + * @return int Comparison value for string order. + */ + private static function sort_start_ascending( $a, $b ) { + $by_start = $a->start - $b->start; + if ( 0 !== $by_start ) { + return $by_start; + } + + $by_text = isset( $a->text, $b->text ) ? strcmp( $a->text, $b->text ) : 0; + if ( 0 !== $by_text ) { + return $by_text; + } + + /* + * This code should be unreachable, because it implies the two replacements + * start at the same location and contain the same text. + */ + return $a->length - $b->length; + } + + /** + * Return the enqueued value for a given attribute, if one exists. + * + * Enqueued updates can take different data types: + * - If an update is enqueued and is boolean, the return will be `true` + * - If an update is otherwise enqueued, the return will be the string value of that update. + * - If an attribute is enqueued to be removed, the return will be `null` to indicate that. + * - If no updates are enqueued, the return will be `false` to differentiate from "removed." + * + * @since WP_VERSION + * + * @param string $comparable_name The attribute name in its comparable form. + * @return string|boolean|null Value of enqueued update if present, otherwise false. 
+ */ + private function get_enqueued_attribute_value( $comparable_name ) { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { + return false; + } + + if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) { + return false; + } + + $enqueued_text = $this->lexical_updates[ $comparable_name ]->text; + + // Removed attributes erase the entire span. + if ( '' === $enqueued_text ) { + return null; + } + + /* + * Boolean attribute updates are just the attribute name without a corresponding value. + * + * This value might differ from the given comparable name in that there could be leading + * or trailing whitespace, and that the casing follows the name given in `set_attribute`. + * + * Example: + * + * $p->set_attribute( 'data-TEST-id', 'update' ); + * 'update' === $p->get_enqueued_attribute_value( 'data-test-id' ); + * + * Detect this difference based on the absence of the `=`, which _must_ exist in any + * attribute containing a value, e.g. ``. + * ¹ ² + * 1. Attribute with a string value. + * 2. Boolean attribute whose value is `true`. + */ + $equals_at = strpos( $enqueued_text, '=' ); + if ( false === $equals_at ) { + return true; + } + + /* + * Finally, a normal update's value will appear after the `=` and + * be double-quoted, as performed incidentally by `set_attribute`. + * + * e.g. `type="text"` + * ¹² ³ + * 1. Equals is here. + * 2. Double-quoting starts one after the equals sign. + * 3. Double-quoting ends at the last character in the update. + */ + $enqueued_value = substr( $enqueued_text, $equals_at + 2, -1 ); + /* + * We're deliberately not decoding entities in attribute values: + * + * Attribute values must not contain direct or indirect entity references to external entities. + * + * See https://www.w3.org/TR/xml/#sec-starttags. + */ + return $enqueued_value; + } + + /** + * Returns the value of a requested attribute from a matched tag opener if that attribute exists. * * Example: * - *
  • One
  • Two
  • Three
*     $p = new WP_XML_Processor( '<content class="test" data-test-id="14">Test</content>' );
	 *     $p->next_tag() === true;
	 *     $p->get_attribute( 'data-test-id' ) === '14';
	 *     $p->get_attribute( 'aria-label' ) === null;
	 *
	 *     $p->next_tag() === false;
	 *     $p->get_attribute( 'class' ) === null;
	 *
	 * Pending updates enqueued for the attribute take precedence over the
	 * value found in the document. Those are returned as enqueued, while
	 * values read from the document are decoded with WP_XML_Decoder.
	 *
	 * @since WP_VERSION
	 *
	 * @param string $name Name of attribute whose value is requested.
	 * @return string|true|false|null Value of the attribute, `true` for an enqueued update that
	 *                                carries a name without a value, `null` when the attribute is
	 *                                not available, or `false` when the stored value cannot be
	 *                                decoded (in which case `last_error` is set to a syntax error).
	 */
	public function get_attribute( $name ) {
		// Attributes are only readable on tags and on the XML declaration.
		if (
			self::STATE_MATCHED_TAG !== $this->parser_state &&
			self::STATE_XML_DECLARATION !== $this->parser_state
		) {
			return null;
		}

		// Return any enqueued attribute value updates if they exist.
		$enqueued_value = $this->get_enqueued_attribute_value( $name );
		if ( false !== $enqueued_value ) {
			return $enqueued_value;
		}

		if ( ! isset( $this->attributes[ $name ] ) ) {
			return null;
		}

		$attribute = $this->attributes[ $name ];
		$raw_value = substr( $this->xml, $attribute->value_starts_at, $attribute->value_length );

		$decoded = WP_XML_Decoder::decode( $raw_value );
		if ( ! isset( $decoded ) ) {
			/**
			 * If the attribute contained an invalid value, it's
			 * a fatal error.
			 *
			 * @see WP_XML_Decoder::decode()
			 */
			$this->last_error = self::ERROR_SYNTAX;
			_doing_it_wrong(
				__METHOD__,
				__( 'Invalid attribute value encountered.' ),
				'WP_VERSION'
			);
			return false;
		}

		return $decoded;
	}

	/**
	 * Gets names of all attributes matching a given prefix in the current tag.
	 *
	 * Note that matching is case-sensitive. This is in accordance with the spec.
	 *
	 * Example:
	 *
	 *     $p = new WP_XML_Processor( '<content data-ENABLED="1" class="test" DATA-test-id="14">Test</content>' );
	 *     $p->next_tag() === true;
	 *     $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-ENABLED' );
	 *     $p->get_attribute_names_with_prefix( 'DATA-' ) === array( 'DATA-test-id' );
	 *     $p->get_attribute_names_with_prefix( 'DAta-' ) === array();
	 *
	 * @since WP_VERSION
	 *
	 * @param string $prefix Prefix of requested attribute names.
	 * @return array|null List of attribute names, or `null` when no tag opener is matched.
	 */
	public function get_attribute_names_with_prefix( $prefix ) {
		if (
			self::STATE_MATCHED_TAG !== $this->parser_state ||
			$this->is_closing_tag
		) {
			return null;
		}

		$matches = array();
		foreach ( array_keys( $this->attributes ) as $attr_name ) {
			if ( str_starts_with( $attr_name, $prefix ) ) {
				$matches[] = $attr_name;
			}
		}
		return $matches;
	}

	/**
	 * Returns the name of the matched tag, exactly as it is written in the document.
	 *
	 * Example:
	 *
	 *     $p = new WP_XML_Processor( '<content class="test">Test</content>' );
	 *     $p->next_tag() === true;
	 *     $p->get_tag() === 'content';
	 *
	 *     $p->next_tag() === false;
	 *     $p->get_tag() === null;
	 *
	 * @since WP_VERSION
	 *
	 * @return string|null Name of currently matched tag in input XML, or `null` if none found.
*/
	public function get_tag() {
		// A name is only available while the processor is matched on a tag.
		if (
			null === $this->tag_name_starts_at ||
			self::STATE_MATCHED_TAG !== $this->parser_state
		) {
			return null;
		}

		return substr( $this->xml, $this->tag_name_starts_at, $this->tag_name_length );
	}

	/**
	 * Indicates if the currently matched tag is an empty element tag.
	 *
	 * XML tags ending with a solidus ("/") are parsed as empty elements. They have no
	 * content and no matching closer is expected.
	 *
	 * @since WP_VERSION
	 *
	 * @return bool Whether the currently matched tag is an empty element tag.
	 */
	public function is_empty_element() {
		if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
			return false;
		}

		/*
		 * An empty element tag is defined by the solidus at the _end_ of the tag, not the beginning.
		 *
		 * Example:
		 *
		 *     <figure />
		 *             ^ this appears one character before the end of the closing ">".
		 */
		return '/' === $this->xml[ $this->token_starts_at + $this->token_length - 2 ];
	}

	/**
	 * Indicates if the current tag token is a tag closer.
	 *
	 * Example:
	 *
	 *     $p = new WP_XML_Processor( '<wp:content></wp:content>' );
	 *     $p->next_tag( array( 'tag_name' => 'wp:content', 'tag_closers' => 'visit' ) );
	 *     $p->is_tag_closer() === false;
	 *
	 *     $p->next_tag( array( 'tag_name' => 'wp:content', 'tag_closers' => 'visit' ) );
	 *     $p->is_tag_closer() === true;
	 *
	 * @since WP_VERSION
	 *
	 * @return bool Whether the current tag is a tag closer.
	 */
	public function is_tag_closer() {
		return (
			self::STATE_MATCHED_TAG === $this->parser_state &&
			$this->is_closing_tag
		);
	}

	/**
	 * Indicates the kind of matched token, if any.
	 *
	 * This differs from `get_token_name()` in that it always
	 * returns a static string indicating the type, whereas
	 * `get_token_name()` may return values derived from the
	 * token itself, such as a tag name.
	 *
	 * Possible values:
	 *  - `#tag` when matched on a tag.
	 *  - `#text` when matched on a text node.
	 *  - `#cdata-section` when matched on a CDATA node.
	 *  - `#xml-declaration` when matched on the XML declaration.
	 *  - `#processing-instructions` when matched on a processing instruction.
	 *  - `#comment` when matched on a comment.
	 *
	 * @since WP_VERSION
	 *
	 * @return string|null What kind of token is matched, or null.
*/
	public function get_token_type() {
		// Tags report a fixed type; every other token shares its name with its type.
		if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
			return '#tag';
		}

		return $this->get_token_name();
	}

	/**
	 * Returns the node name represented by the token.
	 *
	 * This matches the DOM API value `nodeName`. Some values
	 * are static, such as `#text` for a text node, while others
	 * are dynamically generated from the token itself.
	 *
	 * Dynamic names:
	 *  - The tag name, as written in the document, for tag matches.
	 *
	 * Note that if the processor is not matched on a token
	 * then this function will return `null`, either because it
	 * hasn't yet found a token or because it reached the end
	 * of the document without matching a token.
	 *
	 * @since WP_VERSION
	 *
	 * @return string|null Name of the matched token.
*/
	public function get_token_name() {
		// Tags are the only tokens whose name is derived from the document itself.
		if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
			return $this->get_tag();
		}

		// Every other supported token has a fixed name.
		$static_names = array(
			self::STATE_TEXT_NODE       => '#text',
			self::STATE_CDATA_NODE      => '#cdata-section',
			self::STATE_XML_DECLARATION => '#xml-declaration',
			self::STATE_PI_NODE         => '#processing-instructions',
			self::STATE_COMMENT         => '#comment',
		);

		// Any other parser state means no named token is matched.
		return $static_names[ $this->parser_state ] ?? null;
	}

	/**
	 * Returns the modifiable text for a matched token, or an empty string.
	 *
	 * Modifiable text is text content that may be read and changed without
	 * changing the XML structure of the document around it. This includes
	 * the contents of `#text` nodes in the XML as well as the inner
	 * contents of XML comments, Processing Instructions, and others, even
	 * though these nodes aren't part of a parsed DOM tree. They also contain
	 * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
	 * other section in an XML document which cannot contain XML markup (DATA).
	 *
	 * If a token has no modifiable text then an empty string is returned to
	 * avoid needless crashing or type errors. An empty string does not mean
	 * that a token has modifiable text, and a token with modifiable text may
	 * have an empty string (e.g. a comment with no contents).
*
	 * @since WP_VERSION
	 *
	 * @return string|false The modifiable text, or `false` when the text cannot be
	 *                      decoded (in which case `last_error` is set to a syntax error).
	 */
	public function get_modifiable_text() {
		if ( null === $this->text_starts_at ) {
			return '';
		}

		$text = substr( $this->xml, $this->text_starts_at, $this->text_length );

		/*
		 * > the XML processor must behave as if it normalized all line breaks in external parsed
		 * > entities (including the document entity) on input, before parsing, by translating both
		 * > the two-character sequence #xD #xA and any #xD that is not followed by #xA to a single
		 * > #xA character.
		 *
		 * See https://www.w3.org/TR/xml/#sec-line-ends
		 */
		$text = str_replace( array( "\r\n", "\r" ), "\n", $text );

		// Comment data, CDATA sections, and PCData tags contents are not decoded any further.
		if (
			self::STATE_CDATA_NODE === $this->parser_state ||
			self::STATE_COMMENT === $this->parser_state ||
			$this->is_pcdata_element()
		) {
			return $text;
		}

		$decoded = WP_XML_Decoder::decode( $text );
		if ( ! isset( $decoded ) ) {
			/**
			 * If the text contained an invalid value, it's
			 * a fatal error.
			 *
			 * @see WP_XML_Decoder::decode()
			 */

			$this->last_error = self::ERROR_SYNTAX;
			_doing_it_wrong(
				__METHOD__,
				__( 'Invalid text content encountered.' ),
				'WP_VERSION'
			);
			return false;
		}
		return $decoded;
	}

	/**
	 * Enqueues a replacement of the modifiable text of the matched token.
	 *
	 * Only text nodes, comments, and CDATA sections can be updated. The
	 * replacement is applied when the enqueued updates are flushed, e.g.
	 * by `get_updated_xml()`.
	 *
	 * @since WP_VERSION
	 *
	 * @param string $new_value New text for the matched token.
	 * @return bool Whether the update was enqueued.
	 */
	public function set_modifiable_text( $new_value ) {
		switch ( $this->parser_state ) {
			case self::STATE_TEXT_NODE:
			case self::STATE_COMMENT:
				/*
				 * NOTE(review): comments are escaped the same way as text nodes here, although
				 * `get_modifiable_text()` returns comment contents without decoding them, and
				 * a `--` sequence inside a comment is not guarded against. Confirm the intent.
				 */
				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
					$this->text_starts_at,
					$this->text_length,
					// @TODO This is naive, let's rethink this.
					htmlspecialchars( $new_value, ENT_XML1, 'UTF-8' )
				);
				return true;

			case self::STATE_CDATA_NODE:
				/*
				 * A literal `]]>` would terminate the section early. Character references
				 * are not recognized inside CDATA, so the terminator cannot be escaped;
				 * instead the section is split in two so that `]]` ends the first part
				 * and `>` starts the second one. The combined text is unchanged.
				 */
				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
					$this->text_starts_at,
					$this->text_length,
					str_replace( ']]>', ']]]]><![CDATA[>', $new_value )
				);
				return true;

			default:
				_doing_it_wrong(
					__METHOD__,
					__( 'Cannot set text content on a non-text node.' ),
					'WP_VERSION'
				);
				return false;
		}
	}

	/**
	 * Updates or creates a new attribute on the currently matched tag with the passed value.
	 *
	 * Only string values are accepted; any other type is rejected and
	 * reported via `_doing_it_wrong()`. The value is escaped for use inside
	 * a double-quoted XML attribute: `&`, `<`, `>` and `"` are replaced with
	 * their predefined entities.
	 *
	 * @since WP_VERSION
	 *
	 * @param string $name  The attribute name to target.
	 * @param string $value The new attribute value.
	 * @return bool Whether an attribute value was set.
	 */
	public function set_attribute( $name, $value ) {
		if ( ! is_string( $value ) ) {
			_doing_it_wrong(
				__METHOD__,
				__( 'Non-string attribute values cannot be passed to set_attribute().' ),
				'WP_VERSION'
			);
			return false;
		}
		if (
			self::STATE_MATCHED_TAG !== $this->parser_state ||
			$this->is_closing_tag
		) {
			return false;
		}

		/*
		 * ENT_XML1 on its own carries no quote flag, which leaves `"` untouched and
		 * would let a value such as `a"b` break out of the double-quoted attribute
		 * written below. ENT_COMPAT makes sure double quotes become `&quot;`.
		 */
		$value             = htmlspecialchars( $value, ENT_XML1 | ENT_COMPAT, 'UTF-8' );
		$updated_attribute = "{$name}=\"{$value}\"";

		/*
		 * > An attribute name must not appear more than once
		 * > in the same start-tag or empty-element tag.
		 *     - XML 1.0 spec
		 *
		 * @see https://www.w3.org/TR/xml/#sec-starttags
		 */
		if ( isset( $this->attributes[ $name ] ) ) {
			/*
			 * Update an existing attribute.
			 *
			 * Example – set attribute id to "new" in <wp:content id="initial_id" />:
			 *
			 *     <wp:content id="initial_id"/>
			 *                 ^-------------^
			 *                 start         end
			 *     replacement: `id="new"`
			 *
			 *     Result: <wp:content id="new"/>
			 */
			$existing_attribute             = $this->attributes[ $name ];
			$this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement(
				$existing_attribute->start,
				$existing_attribute->length,
				$updated_attribute
			);
		} else {
			/*
			 * Create a new attribute at the tag's name end.
			 *
			 * Example – add attribute id="new" to <wp:content />:
			 *
			 *     <wp:content/>
			 *                ^
			 *                start and end
			 *     replacement: ` id="new"`
			 *
			 *     Result: <wp:content id="new"/>
			 */
			$this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement(
				$this->tag_name_starts_at + $this->tag_name_length,
				0,
				' ' . $updated_attribute
			);
		}

		return true;
	}

	/**
	 * Remove an attribute from the currently-matched tag.
	 *
	 * @since WP_VERSION
	 *
	 * @param string $name The attribute name to remove.
	 * @return bool Whether an attribute was removed.
	 */
	public function remove_attribute( $name ) {
		if (
			self::STATE_MATCHED_TAG !== $this->parser_state ||
			$this->is_closing_tag
		) {
			return false;
		}

		/*
		 * If updating an attribute that didn't exist in the input
		 * document, then remove the enqueued update and move on.
		 *
		 * For example, this might occur when calling `remove_attribute()`
		 * after calling `set_attribute()` for the same attribute
		 * and when that attribute wasn't originally present.
		 */
		if ( ! isset( $this->attributes[ $name ] ) ) {
			// Unsetting a missing key is a no-op, so no existence check is needed.
			unset( $this->lexical_updates[ $name ] );
			return false;
		}

		/*
		 * Removes an existing tag attribute.
		 *
		 * Example – remove the attribute id from <wp:content id="initial_id"/>:
		 *
		 *     <wp:content id="initial_id"/>
		 *                 ^-------------^
		 *                 start         end
		 *     replacement: ``
		 *
		 *     Result: <wp:content />
		 */
		$this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement(
			$this->attributes[ $name ]->start,
			$this->attributes[ $name ]->length,
			''
		);

		return true;
	}

	/**
	 * Returns the string representation of the XML Tag Processor.
	 *
	 * @since WP_VERSION
	 *
	 * @see WP_XML_Processor::get_updated_xml()
	 *
	 * @return string The processed XML.
	 */
	public function __toString() {
		return $this->get_updated_xml();
	}

	/**
	 * Returns the string representation of the XML Tag Processor.
	 *
	 * Applies all enqueued updates first and then re-parses the current
	 * token so that later reads and edits see the updated document.
	 *
	 * @since WP_VERSION
	 *
	 * @return string The processed XML.
	 */
	public function get_updated_xml() {
		$requires_no_updating = 0 === count( $this->lexical_updates );

		/*
		 * When there is nothing more to update and nothing has already been
		 * updated, return the original document and avoid a string copy.
		 */
		if ( $requires_no_updating ) {
			return $this->xml;
		}

		/*
		 * Keep track of the position right before the current token. This will
		 * be necessary for reparsing the current token after updating the XML.
		 */
		$before_current_token = $this->token_starts_at ?? 0;

		/*
		 * 1. Apply the enqueued edits and update all the pointers to reflect those changes.
		 */
		$before_current_token += $this->apply_lexical_updates( $before_current_token );

		/*
		 * 2. Rewind to before the current tag and reparse to get updated attributes.
		 *
		 * At this point the internal cursor points to the end of the tag name.
		 * Rewind before the tag name starts so that it's as if the cursor didn't
		 * move; a call to `next_tag()` will reparse the recently-updated attributes
		 * and additional calls to modify the attributes will apply at this same
		 * location, but in order to avoid issues with subclasses that might add
		 * behaviors to `next_tag()`, the internal methods should be called here
		 * instead.
		 *
		 * It's important to note that in this specific place there will be no change
		 * because the processor was already at a tag when this was called and it's
		 * rewinding only to the beginning of this very tag before reprocessing it
		 * and its attributes.
		 *
		 *     <p>Previous XML<em>More XML</em></p>
		 *                    ↑  │ back up by the length of the tag name plus the opening <
		 *                    └←─┘ back up by strlen("em") + 1 ==> 3
		 */
		$this->bytes_already_parsed = $before_current_token;
		$this->parse_next_token();

		return $this->xml;
	}

	/**
	 * Finds the next token in the XML document.
	 *
	 * An XML document can be viewed as a stream of tokens,
	 * where tokens are things like XML tags, XML comments,
	 * text nodes, etc. This method finds the next token in
	 * the XML document and returns whether it found one.
	 *
	 * If it starts parsing a token and reaches the end of the
	 * document then it will seek to the start of the last
	 * token and pause, returning `false` to indicate that it
	 * failed to find a complete token.
	 *
	 * Possible token types, based on the XML specification:
	 *
	 *  - an XML tag
	 *  - a text node - the plaintext inside tags.
	 *  - a CData section
	 *  - an XML comment.
	 *  - a DOCTYPE declaration.
	 *  - a processing instruction, e.g. `<?xml-stylesheet type="text/css" href="style.css"?>`.
	 *
	 * @return bool Whether a token was parsed.
	 */
	public function next_token() {
		return $this->step();
	}

	/**
	 * Moves the internal cursor to the next token in the XML document
	 * according to the XML specification.
	 *
	 * It considers the current XML context (prolog, element, or misc)
	 * and only expects the nodes that are allowed in that context.
	 *
	 * @since WP_VERSION
	 *
	 * @access private
	 *
	 * @param int $node_to_process Whether to process the next node or
	 *                             reprocess the current node, e.g. using another parser context.
	 * @return bool Whether a token was parsed.
*/ private function step( $node_to_process = self::PROCESS_NEXT_NODE ) { // Refuse to proceed if there was a previous error. @@ -406,8 +2797,8 @@ private function step( $node_to_process = self::PROCESS_NEXT_NODE ) { // Finish stepping when there are no more tokens in the document. if ( - WP_XML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || - WP_XML_Tag_Processor::STATE_COMPLETE === $this->parser_state + WP_XML_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || + WP_XML_Processor::STATE_COMPLETE === $this->parser_state ) { return false; } @@ -416,17 +2807,15 @@ private function step( $node_to_process = self::PROCESS_NEXT_NODE ) { if ( $this->is_empty_element() ) { $this->pop_open_element(); } - $this->base_class_next_token(); } - static $i = 0; switch ( $this->parser_context ) { case self::IN_PROLOG_CONTEXT: - return $this->step_in_prolog(); + return $this->step_in_prolog( $node_to_process ); case self::IN_ELEMENT_CONTEXT: - return $this->step_in_element(); + return $this->step_in_element( $node_to_process ); case self::IN_MISC_CONTEXT: - return $this->step_in_misc(); + return $this->step_in_misc( $node_to_process ); default: $this->last_error = self::ERROR_UNSUPPORTED; return false; @@ -439,19 +2828,31 @@ private function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * @since WP_VERSION * * @see https://www.w3.org/TR/xml/#NT-document. - * @see WP_XML_Tag_Processor::step + * @see WP_XML_Processor::step * * @return bool Whether a node was found. */ - private function step_in_prolog() { + private function step_in_prolog( $node_to_process = self::PROCESS_NEXT_NODE ) { + if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + $has_next_node = $this->parse_next_token(); + if ( + false === $has_next_node && + ! $this->expecting_more_input + ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( __METHOD__, 'The root element was not found.', 'WP_VERSION' ); + return false; + } + } + // XML requires a root element. 
If we've reached the end of data in the prolog stage, // before finding a root element, then the document is incomplete. - if ( WP_XML_Tag_Processor::STATE_COMPLETE === $this->parser_state ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; + if ( WP_XML_Processor::STATE_COMPLETE === $this->parser_state ) { + $this->set_incomplete_input_or_parse_error(); return false; } // Do not step if we paused due to an incomplete input. - if ( WP_XML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ) { + if ( WP_XML_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ) { return false; } switch ( $this->get_token_type() ) { @@ -484,20 +2885,25 @@ private function step_in_prolog() { * @since WP_VERSION * * @see https://www.w3.org/TR/xml/#NT-document. - * @see WP_XML_Tag_Processor::step + * @see WP_XML_Processor::step * * @return bool Whether a node was found. */ - private function step_in_element() { - // An XML document isn't complete until the root element is closed. - if ( self::STATE_COMPLETE === $this->parser_state && - count( $this->stack_of_open_elements ) > 0 - ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; + private function step_in_element( $node_to_process = self::PROCESS_NEXT_NODE ) { + if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + $has_next_node = $this->parse_next_token(); + if ( + false === $has_next_node && + ! $this->expecting_more_input + ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( __METHOD__, 'A tag was not closed.', 'WP_VERSION' ); + return false; + } } + // Do not step if we paused due to an incomplete input. - if ( WP_XML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ) { + if ( WP_XML_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ) { return false; } @@ -541,11 +2947,28 @@ private function step_in_element() { * @since WP_VERSION * * @see https://www.w3.org/TR/xml/#NT-document. 
- * @see WP_XML_Tag_Processor::step + * @see WP_XML_Processor::step * * @return bool Whether a node was found. */ - private function step_in_misc() { + private function step_in_misc( $node_to_process = self::PROCESS_NEXT_NODE ) { + if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + $has_next_node = $this->parse_next_token(); + if ( + false === $has_next_node && + ! $this->expecting_more_input + ) { + // Parsing is complete. + $this->parser_state = self::STATE_COMPLETE; + return true; + } + } + + // Do not step if we paused due to an incomplete input. + if ( WP_XML_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ) { + return false; + } + if ( self::STATE_COMPLETE === $this->parser_state ) { return true; } @@ -559,33 +2982,13 @@ private function step_in_misc() { $whitespaces = strspn( $text, " \t\n\r" ); if ( strlen( $text ) !== $whitespaces ) { $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( __METHOD__, 'Unexpected token type in prolog stage.', 'WP_VERSION' ); + _doing_it_wrong( __METHOD__, 'Unexpected token type "' . $this->get_token_type() . '" in misc stage.', 'WP_VERSION' ); return false; } return $this->step(); default: - /* - * If we're at the end of the document, we can never be sure - * whether it's complete or are we still waiting for a comment - * or a processing directive. Let's mark the parse as complete - * and let the API consumer decide whether they want to re-parse - * once more data becomes available in. - */ - if ( - WP_XML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state && - $this->is_incomplete_text_node - ) { - $text = $this->get_modifiable_text(); - // Non-whitespace characters are not allowed after the root element was closed. 
- $contains_only_whitespace = strlen( $text ) === strspn( $text, " \t\n\r" ); - if ( $contains_only_whitespace ) { - $this->parser_state = self::STATE_COMPLETE; - return false; - } - } - $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( __METHOD__, 'Unexpected token type in misc stage.', 'WP_VERSION' ); + _doing_it_wrong( __METHOD__, 'Unexpected token type "' . $this->get_token_type() . '" in misc stage.', 'WP_VERSION' ); return false; } } @@ -622,7 +3025,7 @@ public function get_breadcrumbs() { * * Example: * - * $processor = new WP_XML_Tag_Processor( '' ); + * $processor = new WP_XML_Processor( '' ); * $processor->next_tag( 'img' ); * true === $processor->matches_breadcrumbs( array( 'content', 'image' ) ); * true === $processor->matches_breadcrumbs( array( 'wp:post', 'content', 'image' ) ); @@ -707,10 +3110,165 @@ private function push_open_element( $tag_name ) { ); } - private function last_open_element() { - return end( $this->stack_of_open_elements ); + private function set_incomplete_input_or_parse_error() { + if ( $this->expecting_more_input ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + } else { + $this->parser_state = self::STATE_INVALID_DOCUMENT; + $this->last_error = self::ERROR_SYNTAX; + // @TODO: Add a more specific error message. + _doing_it_wrong( __METHOD__, 'Unexpected syntax encountered.', 'WP_VERSION' ); + } } + /** + * Parser Ready State. + * + * Indicates that the parser is ready to run and waiting for a state transition. + * It may not have started yet, or it may have just finished parsing a token and + * is ready to find the next one. + * + * @since WP_VERSION + * + * @access private + */ + const STATE_READY = 'STATE_READY'; + + /** + * Parser Complete State. + * + * Indicates that the parser has reached the end of the document and there is + * nothing left to scan. It finished parsing the last token completely. 
+ * + * @since WP_VERSION + * + * @access private + */ + const STATE_COMPLETE = 'STATE_COMPLETE'; + + /** + * Parser Incomplete Input State. + * + * Indicates that the parser has reached the end of the document before finishing + * a token. It started parsing a token but there is a possibility that the input + * XML document was truncated in the middle of a token. + * + * The parser is reset at the start of the incomplete token and has paused. There + * is nothing more than can be scanned unless provided a more complete document. + * + * @since WP_VERSION + * + * @access private + */ + const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT'; + + /** + * Parser Invalid Input State. + * + * Indicates that the parsed xml document contains malformed input and cannot be parsed. + * + * @since WP_VERSION + * + * @access private + */ + const STATE_INVALID_DOCUMENT = 'STATE_INVALID_DOCUMENT'; + + /** + * Parser Matched Tag State. + * + * Indicates that the parser has found an XML tag and it's possible to get + * the tag name and read or modify its attributes (if it's not a closing tag). + * + * @since WP_VERSION + * + * @access private + */ + const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; + + /** + * Parser Text Node State. + * + * Indicates that the parser has found a text node and it's possible + * to read and modify that text. + * + * @since WP_VERSION + * + * @access private + */ + const STATE_TEXT_NODE = 'STATE_TEXT_NODE'; + + /** + * Parser CDATA Node State. + * + * Indicates that the parser has found a CDATA node and it's possible + * to read and modify its modifiable text. Note that in XML there are + * no CDATA nodes outside of foreign content (SVG and MathML). Outside + * of foreign content, they are treated as XML comments. + * + * @since WP_VERSION + * + * @access private + */ + const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; + + /** + * Indicates that the parser has found an XML processing instruction. 
+ * + * @since WP_VERSION + * + * @access private + */ + const STATE_PI_NODE = 'STATE_PI_NODE'; + + /** + * Indicates that the parser has found an XML declaration + * + * @since WP_VERSION + * + * @access private + */ + const STATE_XML_DECLARATION = 'STATE_XML_DECLARATION'; + + /** + * Indicates that the parser has found an XML comment and it's + * possible to read and modify its modifiable text. + * + * @since WP_VERSION + * + * @access private + */ + const STATE_COMMENT = 'STATE_COMMENT'; + + /** + * Indicates that the parser encountered unsupported syntax and has bailed. + * + * @since WP_VERSION + * + * @var string + */ + const ERROR_SYNTAX = 'syntax'; + + /** + * Indicates that the provided XML document contains a declaration that is + * unsupported by the parser. + * + * @since WP_VERSION + * + * @var string + */ + const ERROR_UNSUPPORTED = 'unsupported'; + + /** + * Indicates that the parser encountered more XML tokens than it + * was able to process and has bailed. + * + * @since WP_VERSION + * + * @var string + */ + const ERROR_EXCEEDED_MAX_BOOKMARKS = 'exceeded-max-bookmarks'; + + /** * Indicates that we're parsing the `prolog` part of the XML * document. diff --git a/packages/playground/data-liberation/src/xml-api/WP_XML_Tag_Processor.php b/packages/playground/data-liberation/src/xml-api/WP_XML_Tag_Processor.php deleted file mode 100644 index 26ac382e2e..0000000000 --- a/packages/playground/data-liberation/src/xml-api/WP_XML_Tag_Processor.php +++ /dev/null @@ -1,2837 +0,0 @@ -). We're - * starting with 1.0, however, because most that's what most WXR - * files declare. - * - * ## Future work - * - * @TODO: Skip over the following syntax elements: - * * - * - * or - * - * - * - * ' > - * %xx; - * ]> - * - * @TODO: Support XML 1.1. - * @package WordPress - * @subpackage HTML-API - * @since WP_VERSION - */ - -/** - * Core class used to modify attributes in an XML document for tags matching a query. 
- * - * ## Usage - * - * Use of this class requires three steps: - * - * 1. Create a new class instance with your input XML document. - * 2. Find the tag(s) you are looking for. - * 3. Request changes to the attributes in those tag(s). - * - * Example: - * - * $tags = new WP_XML_Tag_Processor( $xml ); - * if ( $tags->next_tag( 'wp:option' ) ) { - * $tags->set_attribute( 'selected', 'yes' ); - * } - * - * ### Finding tags - * - * The `next_tag()` function moves the internal cursor through - * your input XML document until it finds a tag meeting any of - * the supplied restrictions in the optional query argument. If - * no argument is provided then it will find the next XML tag, - * regardless of what kind it is. - * - * If you want to _find whatever the next tag is_: - * - * $tags->next_tag(); - * - * | Goal | Query | - * |-----------------------------------------------------------|---------------------------------------------------------------------------------| - * | Find any tag. | `$tags->next_tag();` | - * | Find next image tag. | `$tags->next_tag( array( 'tag_name' => 'wp:image' ) );` | - * | Find next image tag (without passing the array). | `$tags->next_tag( 'wp:image' );` | - * - * If a tag was found meeting your criteria then `next_tag()` - * will return `true` and you can proceed to modify it. If it - * returns `false`, however, it failed to find the tag and - * moved the cursor to the end of the file. - * - * Once the cursor reaches the end of the file the processor - * is done and if you want to reach an earlier tag you will - * need to recreate the processor and start over, as it's - * unable to back up or move in reverse. - * - * See the section on bookmarks for an exception to this - * no-backing-up rule. - * - * #### Custom queries - * - * Sometimes it's necessary to further inspect an XML tag than - * the query syntax here permits. 
In these cases one may further - * inspect the search results using the read-only functions - * provided by the processor or external state or variables. - * - * Example: - * - * // Paint up to the first five `wp:musician` or `wp:actor` tags marked with the "jazzy" style. - * $remaining_count = 5; - * while ( $remaining_count > 0 && $tags->next_tag() ) { - * if ( - * ( 'wp:musician' === $tags->get_tag() || 'wp:actor' === $tags->get_tag() ) && - * 'jazzy' === $tags->get_attribute( 'data-style' ) - * ) { - * $tags->set_attribute( 'wp:theme-style', 'theme-style-everest-jazz' ); - * $remaining_count--; - * } - * } - * - * `get_attribute()` will return `null` if the attribute wasn't present - * on the tag when it was called. It may return `""` (the empty string) - * in cases where the attribute was present but its value was empty. - * For boolean attributes, those whose name is present but no value is - * given, it will return `true` (the only way to set `false` for an - * attribute is to remove it). - * - * #### When matching fails - * - * When `next_tag()` returns `false` it could mean different things: - * - * - The requested tag wasn't found in the input document. - * - The input document ended in the middle of an XML syntax element. - * - * When a document ends in the middle of a syntax element it will pause - * the processor. This is to make it possible in the future to extend the - * input document and proceed - an important requirement for chunked - * streaming parsing of a document. 
- * - * Example: - * - * $processor = new WP_XML_Tag_Processor( 'This next_tag( array( 'tag_name' => 'wp:todo-list' ) ) ) { - * $p->set_bookmark( 'list-start' ); - * while ( $p->next_tag( array( 'tag_closers' => 'visit' ) ) ) { - * if ( 'wp:todo' === $p->get_tag() && $p->is_tag_closer() ) { - * $p->set_bookmark( 'list-end' ); - * $p->seek( 'list-start' ); - * $p->set_attribute( 'data-contained-todos', (string) $total_todos ); - * $total_todos = 0; - * $p->seek( 'list-end' ); - * break; - * } - * - * if ( 'wp:todo-item' === $p->get_tag() && ! $p->is_tag_closer() ) { - * $total_todos++; - * } - * } - * } - * - * ## Tokens and finer-grained processing. - * - * It's possible to scan through every lexical token in the - * XML document using the `next_token()` function. This - * alternative form takes no argument and provides no built-in - * query syntax. - * - * Example: - * - * $title = '(untitled)'; - * $text = ''; - * while ( $processor->next_token() ) { - * switch ( $processor->get_token_name() ) { - * case '#text': - * $text .= $processor->get_modifiable_text(); - * break; - * - * case 'wp:new-line': - * $text .= "\n"; - * break; - * - * case 'wp:title': - * $title = $processor->get_modifiable_text(); - * break; - * } - * } - * return trim( "# {$title}\n\n{$text}" ); - * - * ### Tokens and _modifiable text_. - * - * #### Other tokens with modifiable text. - * - * There are also non-elements which are void/self-closing in nature and contain - * modifiable text that is part of that individual syntax token itself. - * - * - `#text` nodes, whose entire token _is_ the modifiable text. - * - XML comments and tokens that become comments due to some syntax error. The - * text for these tokens is the portion of the comment inside of the syntax. - * E.g. for `` the text is `" comment "` (note the spaces are included). - * - `CDATA` sections, whose text is the content inside of the section itself. E.g. for - * `` the text is `"some content"`. 
- * - XML Processing instruction nodes like `` (with restrictions [1]). - * - * [1]: XML requires "xml" as a processing instruction name. The Tag Processor captures the entire - * processing instruction as a single token up to the closing `?>`. - * - * ## Design and limitations - * - * The Tag Processor is designed to linearly scan XML documents and tokenize - * XML tags and their attributes. It's designed to do this as efficiently as - * possible without compromising parsing integrity. Therefore it will be - * slower than some methods of modifying XML, such as those incorporating - * over-simplified PCRE patterns, but will not introduce the defects and - * failures that those methods bring in, which lead to broken page renders - * and often to security vulnerabilities. On the other hand, it will be faster - * than full-blown XML parsers such as DOMDocument and use considerably - * less memory. It requires a negligible memory overhead, enough to consider - * it a zero-overhead system. - * - * The performance characteristics are maintained by avoiding tree construction. - * - * The Tag Processor's checks the most important aspects of XML integrity as it scans - * through the document. It verifies that a single root element exists, that are - * no unclosed tags, and that each opener tag has a corresponding closer. It also - * ensures no duplicate attributes exist on a single tag. - * - * At the same time, The Tag Processor also skips expensive validation of XML entities - * in the document. The Tag Processor will initially pass through the invalid entity references - * and only fail when the developer attempts to read their value. If that doesn't happen, - * the invalid values will be left untouched in the final document. - * - * Most operations within the Tag Processor are designed to minimize the difference - * between an input and output document for any given change. 
For example, the - * `set_attribure` and `remove_attribute` methods preserve whitespace and the attribute - * ordering within the element definition. An exception to this rule is that all attribute - * updates store their values as double-quoted strings, meaning that attributes on input with - * single-quoted or unquoted values will appear in the output with double-quotes. - * - * ### Text Encoding - * - * The Tag Processor assumes that the input XML document is encoded with a - * UTF-8 encoding and will refuse to process documents that declare other encodings. - * - * @since WP_VERSION - */ -class WP_XML_Tag_Processor { - /** - * The maximum number of bookmarks allowed to exist at - * any given time. - * - * @since WP_VERSION - * @var int - * - * @see WP_XML_Tag_Processor::set_bookmark() - */ - const MAX_BOOKMARKS = 10; - - /** - * Maximum number of times seek() can be called. - * Prevents accidental infinite loops. - * - * @since WP_VERSION - * @var int - * - * @see WP_XML_Tag_Processor::seek() - */ - const MAX_SEEK_OPS = 1000; - - /** - * The XML document to parse. - * - * @since WP_VERSION - * @var string - */ - public $xml; - - /** - * The last query passed to next_tag(). - * - * @since WP_VERSION - * @var array|null - */ - private $last_query; - - /** - * The tag name this processor currently scans for. - * - * @since WP_VERSION - * @var string|null - */ - private $sought_tag_name; - - /** - * The match offset this processor currently scans for. - * - * @since WP_VERSION - * @var int|null - */ - private $sought_match_offset; - - /** - * Whether to visit tag closers, e.g. , when walking an input document. - * - * @since WP_VERSION - * @var bool - */ - private $stop_on_tag_closers; - - /** - * Specifies mode of operation of the parser at any given time. - * - * | State | Meaning | - * | ----------------|------------------------------------------------------------------------| - * | *Ready* | The parser is ready to run. 
| - * | *Complete* | There is nothing left to parse. | - * | *Incomplete* | The XML ended in the middle of a token; nothing more can be parsed. | - * | *Matched tag* | Found an XML tag; it's possible to modify its attributes. | - * | *Text node* | Found a #text node; this is plaintext and modifiable. | - * | *CDATA node* | Found a CDATA section; this is modifiable. | - * | *PI node* | Found a processing instruction; this is modifiable. | - * | *XML declaration* | Found an XML declaration; this is modifiable. | - * | *Comment* | Found a comment or bogus comment; this is modifiable. | - * - * @since WP_VERSION - * - * @see WP_XML_Tag_Processor::STATE_READY - * @see WP_XML_Tag_Processor::STATE_COMPLETE - * @see WP_XML_Tag_Processor::STATE_INCOMPLETE_INPUT - * @see WP_XML_Tag_Processor::STATE_MATCHED_TAG - * @see WP_XML_Tag_Processor::STATE_TEXT_NODE - * @see WP_XML_Tag_Processor::STATE_CDATA_NODE - * @see WP_XML_Tag_Processor::STATE_PI_NODE - * @see WP_XML_Tag_Processor::STATE_XML_DECLARATION - * @see WP_XML_Tag_Processor::STATE_COMMENT - * - * @var string - */ - protected $parser_state = self::STATE_READY; - - /** - * Whether we stopped at an incomplete text node. - * - * If we are before the last tag in the document, every text - * node is incomplete until we find the next tag. However, - * if we are after the last tag, an incomplete all-whitespace - * node may either mean we're the end of the document or - * that we're still waiting for more data/ - * - * This flag allows us to differentiate between these two - * cases in context-aware APIs such as WP_XML_Processor. - * - * @var bool - */ - protected $is_incomplete_text_node = false; - - /** - * How many bytes from the original XML document have been read and parsed. - * - * This value points to the latest byte offset in the input document which - * has been already parsed. It is the internal cursor for the Tag Processor - * and updates while scanning through the XML tokens. 
- * - * @since WP_VERSION - * @var int - */ - public $bytes_already_parsed = 0; - - /** - * Byte offset in input document where current token starts. - * - * Example: - * - * ... - * 01234 - * - token starts at 0 - * - * @since WP_VERSION - * - * @var int|null - */ - protected $token_starts_at; - - /** - * Byte length of current token. - * - * Example: - * - * ... - * 012345678901234 - * - token length is 14 - 0 = 14 - * - * a is a token. - * 0123456789 123456789 123456789 - * - token length is 17 - 2 = 15 - * - * @since WP_VERSION - * - * @var int|null - */ - private $token_length; - - /** - * Byte offset in input document where current tag name starts. - * - * Example: - * - * ... - * 01234 - * - tag name starts at 1 - * - * @since WP_VERSION - * - * @var int|null - */ - private $tag_name_starts_at; - - /** - * Byte length of current tag name. - * - * Example: - * - * ... - * 01234 - * --- tag name length is 3 - * - * @since WP_VERSION - * - * @var int|null - */ - private $tag_name_length; - - /** - * Byte offset into input document where current modifiable text starts. - * - * @since WP_VERSION - * - * @var int - */ - private $text_starts_at; - - /** - * Byte length of modifiable text. - * - * @since WP_VERSION - * - * @var string - */ - private $text_length; - - /** - * Whether the current tag is an opening tag, e.g. , or a closing tag, e.g. . - * - * @var bool - */ - private $is_closing_tag; - - /** - * Stores an explanation for why something failed, if it did. - * - * @see self::get_last_error - * - * @since WP_VERSION - * - * @var string|null - */ - protected $last_error = null; - - /** - * Lazily-built index of attributes found within an XML tag, keyed by the attribute name. - * - * Example: - * - * // Supposing the parser is working through this content - * // and stops after recognizing the `id` attribute. - * // - * // ^ parsing will continue from this point. 
- * $this->attributes = array( - * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ) - * ); - * - * // When picking up parsing again, or when asking to find the - * // `class` attribute we will continue and add to this array. - * $this->attributes = array( - * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ), - * 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false ) - * ); - * - * @since WP_VERSION - * @var WP_HTML_Attribute_Token[] - */ - private $attributes = array(); - - /** - * Tracks a semantic location in the original XML which - * shifts with updates as they are applied to the document. - * - * @since WP_VERSION - * @var WP_HTML_Span[] - */ - protected $bookmarks = array(); - - /** - * Lexical replacements to apply to input XML document. - * - * "Lexical" in this class refers to the part of this class which - * operates on pure text _as text_ and not as XML. There's a line - * between the public interface, with XML-semantic methods like - * `set_attribute` and `add_class`, and an internal state that tracks - * text offsets in the input document. - * - * When higher-level XML methods are called, those have to transform their - * operations (such as setting an attribute's value) into text diffing - * operations (such as replacing the sub-string from indices A to B with - * some given new string). These text-diffing operations are the lexical - * updates. - * - * As new higher-level methods are added they need to collapse their - * operations into these lower-level lexical updates since that's the - * Tag Processor's internal language of change. Any code which creates - * these lexical updates must ensure that they do not cross XML syntax - * boundaries, however, so these should never be exposed outside of this - * class or any classes which intentionally expand its functionality. 
- * - * These are enqueued while editing the document instead of being immediately - * applied to avoid processing overhead, string allocations, and string - * copies when applying many updates to a single document. - * - * Example: - * - * // Replace an attribute stored with a new value, indices - * // sourced from the lazily-parsed XML recognizer. - * $start = $attributes['src']->start; - * $length = $attributes['src']->length; - * $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value ); - * - * // Correspondingly, something like this will appear in this array. - * $lexical_updates = array( - * WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' ) - * ); - * - * @since WP_VERSION - * @var WP_HTML_Text_Replacement[] - */ - protected $lexical_updates = array(); - - /** - * Tracks and limits `seek()` calls to prevent accidental infinite loops. - * - * @since WP_VERSION - * @var int - * - * @see WP_XML_Tag_Processor::seek() - */ - protected $seek_count = 0; - - public $had_previous_chunks = false; - - /** - * Constructor. - * - * @since WP_VERSION - * - * @param string $xml XML to process. - */ - public function __construct( $xml ) { - $this->xml = $xml; - } - - /** - * Finds the next element matching the $query. - * - * This doesn't currently have a way to represent non-tags and doesn't process - * semantic rules for text nodes. - * - * @since WP_VERSION - * - * @param array|string|null $query { - * Optional. Which element name to find. Default is to find any tag. - * - * @type string|null $tag_name Which tag to find, or `null` for "any tag." - * @type int|null $match_offset Find the Nth tag matching all search criteria. - * 1 for "first" tag, 3 for "third," etc. - * Defaults to first tag. - * @type string|null $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. . - * } - * @return bool Whether a tag was matched. 
- */ - public function next_tag( $query = null ) { - $this->parse_query( $query ); - $already_found = 0; - - do { - if ( false === $this->base_class_next_token() ) { - return false; - } - - if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { - continue; - } - - if ( $this->matches() ) { - ++$already_found; - } - } while ( $already_found < $this->sought_match_offset ); - - return true; - } - - /** - * Finds the next token in the XML document. - * - * An XML document can be viewed as a stream of tokens, - * where tokens are things like XML tags, XML comments, - * text nodes, etc. This method finds the next token in - * the XML document and returns whether it found one. - * - * If it starts parsing a token and reaches the end of the - * document then it will seek to the start of the last - * token and pause, returning `false` to indicate that it - * failed to find a complete token. - * - * Possible token types, based on the XML specification: - * - * - an XML tag, whether opening, closing, or void. - * - a text node - the plaintext inside tags. - * - an XML comment. - * - a processing instruction, e.g. ``. - * - * The Tag Processor currently only supports the tag token. - * - * @since WP_VERSION - * - * @access private - * - * @return bool Whether a token was parsed. - */ - public function next_token() { - return $this->base_class_next_token(); - } - - /** - * Internal method which finds the next token in the HTML document. - * - * This method is a protected internal function which implements the logic for - * finding the next token in a document. It exists so that the parser can update - * its state without affecting the location of the cursor in the document and - * without triggering subclass methods for things like `next_token()`, e.g. when - * applying patches before searching for the next token. - * - * @since 6.5.0 - * - * @access private - * - * @return bool Whether a token was parsed. 
- */ - protected function base_class_next_token() { - $was_at = $this->bytes_already_parsed; - $this->after_tag(); - - // Don't proceed if there's nothing more to scan. - if ( - self::STATE_COMPLETE === $this->parser_state || - self::STATE_INCOMPLETE_INPUT === $this->parser_state || - null !== $this->last_error - ) { - return false; - } - - /* - * The next step in the parsing loop determines the parsing state; - * clear it so that state doesn't linger from the previous step. - */ - $this->parser_state = self::STATE_READY; - - if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) { - $this->parser_state = self::STATE_COMPLETE; - return false; - } - - // Find the next tag if it exists. - if ( false === $this->parse_next_tag() ) { - if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { - $this->bytes_already_parsed = $was_at; - } - - return false; - } - - if ( null !== $this->last_error ) { - return false; - } - - /* - * For legacy reasons the rest of this function handles tags and their - * attributes. If the processor has reached the end of the document - * or if it matched any other token then it should return here to avoid - * attempting to process tag-specific syntax. - */ - if ( - self::STATE_INCOMPLETE_INPUT !== $this->parser_state && - self::STATE_COMPLETE !== $this->parser_state && - self::STATE_MATCHED_TAG !== $this->parser_state - ) { - return true; - } - - if ( $this->is_closing_tag ) { - $this->skip_whitespace(); - } else { - // Parse all of its attributes. - while ( $this->parse_next_attribute() ) { - continue; - } - } - - if ( null !== $this->last_error ) { - return false; - } - - // Ensure that the tag closes before the end of the document. - if ( - self::STATE_INCOMPLETE_INPUT === $this->parser_state || - $this->bytes_already_parsed >= strlen( $this->xml ) - ) { - // Does this appropriately clear state (parsed attributes)? 
- $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $was_at; - - return false; - } - - $tag_ends_at = strpos( $this->xml, '>', $this->bytes_already_parsed ); - if ( false === $tag_ends_at ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $was_at; - - return false; - } - - if ( $this->is_closing_tag && $tag_ends_at !== $this->bytes_already_parsed ) { - $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( - __METHOD__, - __( 'Invalid closing tag encountered.' ), - 'WP_VERSION' - ); - return false; - } - - $this->parser_state = self::STATE_MATCHED_TAG; - $this->bytes_already_parsed = $tag_ends_at + 1; - $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; - - /* - * If we are in a PCData element, everything until the closer - * is considered text. - */ - if ( ! $this->is_pcdata_element() ) { - return true; - } - - /* - * Preserve the opening tag pointers, as these will be overwritten - * when finding the closing tag. They will be reset after finding - * the closing to tag to point to the opening of the special atomic - * tag sequence. - */ - $tag_name_starts_at = $this->tag_name_starts_at; - $tag_name_length = $this->tag_name_length; - $tag_ends_at = $this->token_starts_at + $this->token_length; - $attributes = $this->attributes; - - $found_closer = $this->skip_pcdata( $this->get_tag() ); - - // Closer not found, the document is incomplete. - if ( false === $found_closer ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->bytes_already_parsed = $was_at; - return false; - } - - /* - * The values here look like they reference the opening tag but they reference - * the closing tag instead. This is why the opening tag values were stored - * above in a variable. It reads confusingly here, but that's because the - * functions that skip the contents have moved all the internal cursors past - * the inner content of the tag. 
- */ - $this->token_starts_at = $was_at; - $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; - $this->text_starts_at = $tag_ends_at; - $this->text_length = $this->tag_name_starts_at - $this->text_starts_at; - $this->tag_name_starts_at = $tag_name_starts_at; - $this->tag_name_length = $tag_name_length; - $this->attributes = $attributes; - - return true; - } - - /** - * Whether the processor paused because the input XML document ended - * in the middle of a syntax element, such as in the middle of a tag. - * - * Example: - * - * $processor = new WP_XML_Tag_Processor( '

Surprising fact you may no… - * ^ ^ - * \-|-- it shifts with edits - * - * Bookmarks provide the ability to seek to a previously-scanned - * place in the XML document. This avoids the need to re-scan - * the entire document. - * - * Example: - * - *
  • One
  • Two
  • Three
- * ^^^^ - * want to note this last item - * - * $p = new WP_XML_Tag_Processor( $xml ); - * $in_list = false; - * while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) { - * if ( 'UL' === $p->get_tag() ) { - * if ( $p->is_tag_closer() ) { - * $in_list = false; - * $p->set_bookmark( 'resume' ); - * if ( $p->seek( 'last-li' ) ) { - * $p->add_class( 'last-li' ); - * } - * $p->seek( 'resume' ); - * $p->release_bookmark( 'last-li' ); - * $p->release_bookmark( 'resume' ); - * } else { - * $in_list = true; - * } - * } - * - * if ( 'LI' === $p->get_tag() ) { - * $p->set_bookmark( 'last-li' ); - * } - * } - * - * Bookmarks intentionally hide the internal string offsets - * to which they refer. They are maintained internally as - * updates are applied to the XML document and therefore - * retain their "position" - the location to which they - * originally pointed. The inability to use bookmarks with - * functions like `substr` is therefore intentional to guard - * against accidentally breaking the XML. - * - * Because bookmarks allocate memory and require processing - * for every applied update, they are limited and require - * a name. They should not be created with programmatically-made - * names, such as "li_{$index}" with some loop. As a general - * rule they should only be created with string-literal names - * like "start-of-section" or "last-paragraph". - * - * Bookmarks are a powerful tool to enable complicated behavior. - * Consider double-checking that you need this tool if you are - * reaching for it, as inappropriate use could lead to broken - * XML structure or unwanted processing overhead. - * - * @since WP_VERSION - * - * @param string $name Identifies this particular bookmark. - * @return bool Whether the bookmark was successfully created. - */ - public function set_bookmark( $name ) { - // It only makes sense to set a bookmark if the parser has paused on a concrete token. 
- if ( - self::STATE_COMPLETE === $this->parser_state || - self::STATE_INCOMPLETE_INPUT === $this->parser_state - ) { - return false; - } - - if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= static::MAX_BOOKMARKS ) { - _doing_it_wrong( - __METHOD__, - __( 'Too many bookmarks: cannot create any more.' ), - 'WP_VERSION' - ); - return false; - } - - $this->bookmarks[ $name ] = new WP_HTML_Span( $this->token_starts_at, $this->token_length ); - - return true; - } - - - /** - * Removes a bookmark that is no longer needed. - * - * Releasing a bookmark frees up the small - * performance overhead it requires. - * - * @param string $name Name of the bookmark to remove. - * @return bool Whether the bookmark already existed before removal. - */ - public function release_bookmark( $name ) { - if ( ! array_key_exists( $name, $this->bookmarks ) ) { - return false; - } - - unset( $this->bookmarks[ $name ] ); - - return true; - } - - /** - * Skips contents of PCDATA element. - * - * @since WP_VERSION - * - * @see https://www.w3.org/TR/xml/#sec-mixed-content - * - * @param string $tag_name The tag name which will close the PCDATA region. - * @return false|int Byte offset of the closing tag, or false if not found. - */ - private function skip_pcdata( $tag_name ) { - $xml = $this->xml; - $doc_length = strlen( $xml ); - $tag_length = strlen( $tag_name ); - - $at = $this->bytes_already_parsed; - while ( false !== $at && $at < $doc_length ) { - $at = strpos( $this->xml, 'tag_name_starts_at = $at; - - // Fail if there is no possible tag closer. - if ( false === $at ) { - return false; - } - - $at += 2 + $tag_length; - $at += strspn( $this->xml, " \t\f\r\n", $at ); - $this->bytes_already_parsed = $at; - - /* - * Ensure that the tag name terminates to avoid matching on - * substrings of a longer tag name. 
For example, the sequence - * "= strlen( $xml ) ) { - return false; - } - if ( '>' === $xml[ $at ] ) { - $this->bytes_already_parsed = $at + 1; - return true; - } - } - - return false; - } - - /** - * Returns the last error, if any. - * - * Various situations lead to parsing failure but this class will - * return `false` in all those cases. To determine why something - * failed it's possible to request the last error. This can be - * helpful to know to distinguish whether a given tag couldn't - * be found or if content in the document caused the processor - * to give up and abort processing. - * - * Example - * - * $processor = WP_XML_Tag_Processor::create_fragment( '' ); - * false === $processor->next_tag(); - * WP_XML_Tag_Processor::ERROR_SYNTAX === $processor->get_last_error(); - * - * @since WP_VERSION - * - * @see self::ERROR_UNSUPPORTED - * @see self::ERROR_EXCEEDED_MAX_BOOKMARKS - * - * @return string|null The last error, if one exists, otherwise null. - */ - public function get_last_error(): ?string { - return $this->last_error; - } - - /** - * Tag names declared as PCDATA elements. - * - * PCDATA elements are elements in which everything is treated as - * text, even syntax that may look like other elements, closers, - * processing instructions, etc. - * - * Example: - * - * - * - * This text contains syntax that seems - * like XML nodes: - * - * - * - * - * - * - * &<>"' - * - * But! It's all treated as text. - * - * - * - * @var array - */ - private $pcdata_elements = array(); - - /** - * Declares an element as PCDATA. - * - * PCDATA elements are elements in which everything is treated as - * text, even syntax that may look like other elements, closers, - * processing instructions, etc. - * - * For example: - * - * $processor = new WP_XML_Tag_Processor( - * << - * - * This text uses syntax that may seem - * like XML nodes: - * - * - * - * - * - * - * &<>"' - * - * But! It's all treated as text. 
- * - * - * XML - * ); - * - * $processor->declare_element_as_pcdata('my-pcdata'); - * $processor->next_tag('my-pcdata'); - * $processor->next_token(); - * - * // Returns everything inside the - * // element as text: - * $processor->get_modifiable_text(); - * - * @param string $element_name The name of the element to declare as PCDATA. - * @return void - */ - public function declare_element_as_pcdata( $element_name ) { - $this->pcdata_elements[ $element_name ] = true; - } - - /** - * Indicates if the currently matched tag is a PCDATA element. - * - * @since WP_VERSION - * - * @return bool Whether the currently matched tag is a PCDATA element. - */ - public function is_pcdata_element() { - return array_key_exists( $this->get_tag(), $this->pcdata_elements ); - } - - /** - * Parses the next tag. - * - * This will find and start parsing the next tag, including - * the opening `<`, the potential closer `/`, and the tag - * name. It does not parse the attributes or scan to the - * closing `>`; these are left for other methods. - * - * @since WP_VERSION - * - * @return bool Whether a tag was found before the end of the document. - */ - private function parse_next_tag() { - $this->after_tag(); - - $xml = $this->xml; - $doc_length = strlen( $xml ); - $was_at = $this->bytes_already_parsed; - $at = $was_at; - - while ( false !== $at && $at < $doc_length ) { - $at = strpos( $xml, '<', $at ); - - /* - * There may be no text nodes outside of elements. - * If this character sequence was encountered outside of - * the root element, it is a syntax error. WP_XML_Tag_Processor - * does not have that context – it is up to the API consumer, - * such as WP_Tag_Processor, to handle this scenario. 
- */ - if ( false === $at ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - $this->is_incomplete_text_node = true; - $this->text_starts_at = $was_at; - $this->text_length = $doc_length - $was_at; - return false; - } - - if ( $at > $was_at ) { - $this->parser_state = self::STATE_TEXT_NODE; - $this->token_starts_at = $was_at; - $this->token_length = $at - $was_at; - $this->text_starts_at = $was_at; - $this->text_length = $this->token_length; - $this->bytes_already_parsed = $at; - - return true; - } - - $this->token_starts_at = $at; - - if ( $at + 1 < $doc_length && '/' === $this->xml[ $at + 1 ] ) { - $this->is_closing_tag = true; - ++$at; - } else { - $this->is_closing_tag = false; - } - - if ( $at + 1 >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; - } - - /* - * XML tag names are defined by the same `Name` grammar rule as attribute - * names. - * - * Reference: - * * https://www.w3.org/TR/xml/#NT-STag - * * https://www.w3.org/TR/xml/#NT-Name - */ - $tag_name_length = $this->parse_name( $at + 1 ); - if ( $tag_name_length > 0 ) { - ++$at; - $this->parser_state = self::STATE_MATCHED_TAG; - $this->tag_name_starts_at = $at; - $this->tag_name_length = $tag_name_length; - $this->token_length = $this->tag_name_length; - $this->bytes_already_parsed = $at + $this->tag_name_length; - - return true; - } - - /* - * Abort if no tag is found before the end of - * the document. There is nothing left to parse. - */ - if ( $at + 1 >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - - return false; - } - - /* - * `is_closing_tag && '!' === $xml[ $at + 1 ] ) { - /* - * ` sequence. - */ - --$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping. 
- while ( ++$closer_at < $doc_length ) { - $closer_at = strpos( $xml, '--', $closer_at ); - if ( false === $closer_at || $closer_at + 2 === $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; - } - - /* - * The string " -- " (double-hyphen) must not occur within comments - * See https://www.w3.org/TR/xml/#sec-comments - */ - if ( '>' !== $xml[ $closer_at + 2 ] ) { - $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( - __METHOD__, - __( 'Invalid comment syntax encountered.' ), - 'WP_VERSION' - ); - return false; - } - - $this->parser_state = self::STATE_COMMENT; - $this->token_length = $closer_at + 3 - $this->token_starts_at; - $this->text_starts_at = $this->token_starts_at + 4; - $this->text_length = $closer_at - $this->text_starts_at; - $this->bytes_already_parsed = $closer_at + 3; - return true; - } - } - - /* - * Identify CDATA sections. - * - * Within a CDATA section, everything until the ]]> string is treated - * as data, not markup. Left angle brackets and ampersands may occur in - * their literal form; they need not (and cannot) be escaped using "<" - * and "&". CDATA sections cannot nest. - * - * See https://www.w3.org/TR/xml11.xml/#sec-cdata-sect - */ - if ( - ! 
$this->is_closing_tag && - $doc_length > $this->token_starts_at + 8 && - '[' === $xml[ $this->token_starts_at + 2 ] && - 'C' === $xml[ $this->token_starts_at + 3 ] && - 'D' === $xml[ $this->token_starts_at + 4 ] && - 'A' === $xml[ $this->token_starts_at + 5 ] && - 'T' === $xml[ $this->token_starts_at + 6 ] && - 'A' === $xml[ $this->token_starts_at + 7 ] && - '[' === $xml[ $this->token_starts_at + 8 ] - ) { - $closer_at = strpos( $xml, ']]>', $at + 1 ); - if ( false === $closer_at ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - - return false; - } - - $this->parser_state = self::STATE_CDATA_NODE; - $this->token_length = $closer_at + 1 - $this->token_starts_at; - $this->text_starts_at = $this->token_starts_at + 9; - $this->text_length = $closer_at - $this->text_starts_at; - $this->bytes_already_parsed = $closer_at + 3; - return true; - } - - /* - * Anything else here is either unsupported at this point or invalid - * syntax. See the class-level @TODO annotations for more information. - */ - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - - return false; - } - - /* - * An `had_previous_chunks && - ! $this->is_closing_tag && - '?' === $xml[ $at + 1 ] && - 'x' === $xml[ $at + 2 ] && - 'm' === $xml[ $at + 3 ] && - 'l' === $xml[ $at + 4 ] - ) { - // Setting the parser state early for the get_attribute() calls later in this - // branch. - $this->parser_state = self::STATE_XML_DECLARATION; - - $at += 5; - - // Skip whitespace. - $at += strspn( $this->xml, " \t\f\r\n", $at ); - - $this->bytes_already_parsed = $at; - - /* - * Reuse parse_next_attribute() to parse the XML declaration attributes. - * Technically, only "version", "encoding", and "standalone" are accepted - * and, unlike regular tag attributes, their values can contain any character - * other than the opening quote. However, the "<" and "&" characters are very - * unlikely to be encountered and cause trouble, so this code path liberally - * does not provide a dedicated parsing logic. 
- */ - while ( false !== $this->parse_next_attribute() ) { - $this->skip_whitespace(); - // Parse until the XML declaration closer. - if ( '?' === $xml[ $this->bytes_already_parsed ] ) { - break; - } - } - - if ( null !== $this->last_error ) { - return false; - } - - foreach ( $this->attributes as $name => $attribute ) { - if ( 'version' !== $name && 'encoding' !== $name && 'standalone' !== $name ) { - $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( - __METHOD__, - __( 'Invalid attribute found in XML declaration.' ), - 'WP_VERSION' - ); - return false; - } - } - - if ( '1.0' !== $this->get_attribute( 'version' ) ) { - $this->last_error = self::ERROR_UNSUPPORTED; - _doing_it_wrong( - __METHOD__, - __( 'Unsupported XML version declared' ), - 'WP_VERSION' - ); - return false; - } - - /** - * Standalone XML documents have no external dependencies, - * including predefined entities like ` ` and `©`. - * - * See https://www.w3.org/TR/xml/#sec-predefined-ent. - */ - if ( null !== $this->get_attribute( 'encoding' ) - && 'UTF-8' !== strtoupper( $this->get_attribute( 'encoding' ) ) - ) { - $this->last_error = self::ERROR_UNSUPPORTED; - _doing_it_wrong( - __METHOD__, - __( 'Unsupported XML encoding declared, only UTF-8 is supported.' ), - 'WP_VERSION' - ); - return false; - } - if ( null !== $this->get_attribute( 'standalone' ) - && 'YES' !== strtoupper( $this->get_attribute( 'standalone' ) ) - ) { - $this->last_error = self::ERROR_UNSUPPORTED; - _doing_it_wrong( - __METHOD__, - __( 'Standalone XML documents are not supported.' ), - 'WP_VERSION' - ); - return false; - } - - $at = $this->bytes_already_parsed; - - // Skip whitespace. - $at += strspn( $this->xml, " \t\f\r\n", $at ); - - // Consume the closer. - if ( ! ( - $at + 2 <= $doc_length && - '?' === $xml[ $at ] && - '>' === $xml[ $at + 1 ] - ) ) { - $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( - __METHOD__, - __( 'XML declaration closer not found.' 
), - 'WP_VERSION' - ); - return false; - } - - $this->token_length = $at + 2 - $this->token_starts_at; - $this->text_starts_at = $this->token_starts_at + 2; - $this->text_length = $at - $this->text_starts_at; - $this->bytes_already_parsed = $at + 2; - $this->parser_state = self::STATE_XML_DECLARATION; - - return true; - } - - /* - * `is_closing_tag && - '?' === $xml[ $at + 1 ] - ) { - if ( $at + 4 >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - - return false; - } - - if ( ! ( - ( 'x' === $xml[ $at + 2 ] || 'X' === $xml[ $at + 2 ] ) && - ( 'm' === $xml[ $at + 3 ] || 'M' === $xml[ $at + 3 ] ) && - ( 'l' === $xml[ $at + 4 ] || 'L' === $xml[ $at + 4 ] ) - ) ) { - _doing_it_wrong( - __METHOD__, - __( 'Invalid processing instruction target.' ), - 'WP_VERSION' - ); - return false; - } - - $at += 5; - - // Skip whitespace. - $this->skip_whitespace(); - - /* - * Find the closer. - * - * We could, at this point, only consume the bytes allowed by the specification, that is: - * - * [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] // any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. - * - * However, that would require running a slow regular-expression engine for, seemingly, - * little benefit. For now, we are going to pretend that all bytes are allowed until the - * closing ?> is found. Some failures may pass unnoticed. That may not be a problem in practice, - * but if it is then this code path will require a stricter implementation. 
- */ - $closer_at = strpos( $xml, '?>', $at ); - if ( false === $closer_at ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - - return false; - } - - $this->parser_state = self::STATE_PI_NODE; - $this->token_length = $closer_at + 5 - $this->token_starts_at; - $this->text_starts_at = $this->token_starts_at + 5; - $this->text_length = $closer_at - $this->text_starts_at; - $this->bytes_already_parsed = $closer_at + 2; - - return true; - } - - ++$at; - } - - return false; - } - - /** - * Parses the next attribute. - * - * @since WP_VERSION - * - * @return bool Whether an attribute was found before the end of the document. - */ - private function parse_next_attribute() { - // Skip whitespace and slashes. - $this->bytes_already_parsed += strspn( $this->xml, " \t\f\r\n/", $this->bytes_already_parsed ); - if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; - } - - // No more attributes to parse. - if ( '>' === $this->xml[ $this->bytes_already_parsed ] ) { - return false; - } - - $attribute_start = $this->bytes_already_parsed; - $attribute_name_length = $this->parse_name( $this->bytes_already_parsed ); - if ( 0 === $attribute_name_length ) { - $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( - __METHOD__, - __( 'Invalid attribute name encountered.' ), - 'WP_VERSION' - ); - } - $this->bytes_already_parsed += $attribute_name_length; - $attribute_name = substr( $this->xml, $attribute_start, $attribute_name_length ); - $this->skip_whitespace(); - - // Parse attribute value. 
- ++$this->bytes_already_parsed; - $this->skip_whitespace(); - if ( $this->bytes_already_parsed >= strlen( $this->xml ) ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; - } - switch ( $this->xml[ $this->bytes_already_parsed ] ) { - case "'": - case '"': - $quote = $this->xml[ $this->bytes_already_parsed ]; - $value_start = $this->bytes_already_parsed + 1; - /** - * XML attributes cannot contain the characters "<" or "&". - * - * This only checks for "<" because it's reasonably fast. - * Ampersands are actually allowed when used as the start - * of an entity reference, but enforcing that would require - * an expensive and complex check. It doesn't seem to be - * worth it. - * - * @TODO: Discuss enforcing or abandoning the ampersand rule - * and document the rationale. - */ - $value_length = strcspn( $this->xml, "<$quote", $value_start ); - $attribute_end = $value_start + $value_length + 1; - - if ( $attribute_end - 1 >= strlen( $this->xml ) ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - - return false; - } - - if ( $this->xml[ $attribute_end - 1 ] !== $quote ) { - $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( - __METHOD__, - __( 'A disallowed character encountered in an attribute value (either < or &).' ), - 'WP_VERSION' - ); - } - $this->bytes_already_parsed = $attribute_end; - break; - - default: - $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( - __METHOD__, - __( 'Unquoted attribute value encountered.' ), - 'WP_VERSION' - ); - return false; - } - - if ( $attribute_end >= strlen( $this->xml ) ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; - } - - if ( $this->is_closing_tag ) { - return true; - } - - if ( array_key_exists( $attribute_name, $this->attributes ) ) { - $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( - __METHOD__, - __( 'Duplicate attribute found in an XML tag.' 
), - 'WP_VERSION' - ); - return false; - } - - $this->attributes[ $attribute_name ] = new WP_HTML_Attribute_Token( - $attribute_name, - $value_start, - $value_length, - $attribute_start, - $attribute_end - $attribute_start, - false - ); - - return true; - } - - /** - * Move the internal cursor past any immediate successive whitespace. - * - * @since WP_VERSION - */ - private function skip_whitespace() { - $this->bytes_already_parsed += strspn( $this->xml, " \t\f\r\n", $this->bytes_already_parsed ); - } - - // Describes the first character of the attribute name: - // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] - // See https://www.w3.org/TR/xml/#NT-Name - const NAME_START_CHAR_PATTERN = ':a-z_A-Z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}'; - const NAME_CHAR_PATTERN = '\-\.0-9\x{B7}\x{0300}-\x{036F}\x{203F}-\x{2040}:a-z_A-Z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}'; - private function parse_name( $offset ) { - if ( 1 !== preg_match( - '~[' . self::NAME_START_CHAR_PATTERN . ']~Ssu', - $this->xml[ $offset ], - $matches - ) ) { - return 0; - } - - $name_length = 1; - - // Consume the rest of the name - preg_match( - '~\G([' . self::NAME_CHAR_PATTERN . ']+)~Ssu', - $this->xml, - $matches, - 0, - $offset + 1 - ); - - if ( is_array( $matches ) && count( $matches ) > 0 ) { - $name_length += strlen( $matches[0] ); - } - - return $name_length; - } - - /** - * Applies attribute updates and cleans up once a tag is fully parsed. 
- * - * @since WP_VERSION - */ - private function after_tag() { - /* - * Purge updates if there are too many. The actual count isn't - * scientific, but a few values from 100 to a few thousand were - * tests to find a practically-useful limit. - * - * If the update queue grows too big, then the Tag Processor - * will spend more time iterating through them and lose the - * efficiency gains of deferring applying them. - */ - if ( 1000 < count( $this->lexical_updates ) ) { - $this->get_updated_xml(); - } - - foreach ( $this->lexical_updates as $name => $update ) { - /* - * Any updates appearing after the cursor should be applied - * before proceeding, otherwise they may be overlooked. - */ - if ( $update->start >= $this->bytes_already_parsed ) { - $this->get_updated_xml(); - break; - } - - if ( is_int( $name ) ) { - continue; - } - - $this->lexical_updates[] = $update; - unset( $this->lexical_updates[ $name ] ); - } - - $this->is_incomplete_text_node = false; - $this->token_starts_at = null; - $this->token_length = null; - $this->tag_name_starts_at = null; - $this->tag_name_length = null; - $this->text_starts_at = 0; - $this->text_length = 0; - $this->is_closing_tag = null; - $this->attributes = array(); - } - - protected function reset_state() { - $this->xml = ''; - $this->last_query = null; - $this->sought_tag_name = null; - $this->sought_match_offset = 0; - $this->stop_on_tag_closers = false; - $this->parser_state = self::STATE_READY; - $this->is_incomplete_text_node = false; - $this->bytes_already_parsed = 0; - $this->token_starts_at = null; - $this->token_length = null; - $this->tag_name_starts_at = null; - $this->tag_name_length = null; - $this->text_starts_at = 0; - $this->text_length = 0; - $this->is_closing_tag = null; - $this->last_error = null; - $this->attributes = array(); - $this->bookmarks = array(); - $this->lexical_updates = array(); - $this->seek_count = 0; - $this->had_previous_chunks = false; - } - - /** - * Applies attribute updates to XML 
document. - * - * @since WP_VERSION - * - * @param int $shift_this_point Accumulate and return shift for this position. - * @return int How many bytes the given pointer moved in response to the updates. - */ - private function apply_attributes_updates( $shift_this_point = 0 ) { - if ( ! count( $this->lexical_updates ) ) { - return 0; - } - - $accumulated_shift_for_given_point = 0; - - /* - * Attribute updates can be enqueued in any order but updates - * to the document must occur in lexical order; that is, each - * replacement must be made before all others which follow it - * at later string indices in the input document. - * - * Sorting avoid making out-of-order replacements which - * can lead to mangled output, partially-duplicated - * attributes, and overwritten attributes. - */ - usort( $this->lexical_updates, array( self::class, 'sort_start_ascending' ) ); - - $bytes_already_copied = 0; - $output_buffer = ''; - foreach ( $this->lexical_updates as $diff ) { - $shift = strlen( $diff->text ) - $diff->length; - - // Adjust the cursor position by however much an update affects it. - if ( $diff->start < $this->bytes_already_parsed ) { - $this->bytes_already_parsed += $shift; - } - - // Accumulate shift of the given pointer within this function call. - if ( $diff->start <= $shift_this_point ) { - $accumulated_shift_for_given_point += $shift; - } - - $output_buffer .= substr( $this->xml, $bytes_already_copied, $diff->start - $bytes_already_copied ); - $output_buffer .= $diff->text; - $bytes_already_copied = $diff->start + $diff->length; - } - - $this->xml = $output_buffer . substr( $this->xml, $bytes_already_copied ); - - /* - * Adjust bookmark locations to account for how the text - * replacements adjust offsets in the input document. 
- */ - foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { - $bookmark_end = $bookmark->start + $bookmark->length; - - /* - * Each lexical update which appears before the bookmark's endpoints - * might shift the offsets for those endpoints. Loop through each change - * and accumulate the total shift for each bookmark, then apply that - * shift after tallying the full delta. - */ - $head_delta = 0; - $tail_delta = 0; - - foreach ( $this->lexical_updates as $diff ) { - $diff_end = $diff->start + $diff->length; - - if ( $bookmark->start < $diff->start && $bookmark_end < $diff->start ) { - break; - } - - if ( $bookmark->start >= $diff->start && $bookmark_end < $diff_end ) { - $this->release_bookmark( $bookmark_name ); - continue 2; - } - - $delta = strlen( $diff->text ) - $diff->length; - - if ( $bookmark->start >= $diff->start ) { - $head_delta += $delta; - } - - if ( $bookmark_end >= $diff_end ) { - $tail_delta += $delta; - } - } - - $bookmark->start += $head_delta; - $bookmark->length += $tail_delta - $head_delta; - } - - $this->lexical_updates = array(); - - return $accumulated_shift_for_given_point; - } - - /** - * Checks whether a bookmark with the given name exists. - * - * @since WP_VERSION - * - * @param string $bookmark_name Name to identify a bookmark that potentially exists. - * @return bool Whether that bookmark exists. - */ - public function has_bookmark( $bookmark_name ) { - return array_key_exists( $bookmark_name, $this->bookmarks ); - } - - public function get_processed_xml() { - // Flush updates - $this->get_updated_xml(); - return substr( $this->xml, 0, $this->bytes_already_parsed ); - } - - public function get_unprocessed_xml() { - // Flush updates - $this->get_updated_xml(); - return substr( $this->xml, $this->bytes_already_parsed ); - } - - - /** - * Move the internal cursor in the Tag Processor to a given bookmark's location. 
- * - * In order to prevent accidental infinite loops, there's a - * maximum limit on the number of times seek() can be called. - * - * @since WP_VERSION - * - * @param string $bookmark_name Jump to the place in the document identified by this bookmark name. - * @return bool Whether the internal cursor was successfully moved to the bookmark's location. - */ - public function seek( $bookmark_name ) { - if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) { - _doing_it_wrong( - __METHOD__, - __( 'Unknown bookmark name.' ), - 'WP_VERSION' - ); - return false; - } - - if ( ++$this->seek_count > static::MAX_SEEK_OPS ) { - _doing_it_wrong( - __METHOD__, - __( 'Too many calls to seek() - this can lead to performance issues.' ), - 'WP_VERSION' - ); - return false; - } - - // Flush out any pending updates to the document. - $this->get_updated_xml(); - - // Point this tag processor before the sought tag opener and consume it. - $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; - $this->parser_state = self::STATE_READY; - return $this->base_class_next_token(); - } - - /** - * Compare two WP_HTML_Text_Replacement objects. - * - * @since WP_VERSION - * - * @param WP_HTML_Text_Replacement $a First attribute update. - * @param WP_HTML_Text_Replacement $b Second attribute update. - * @return int Comparison value for string order. - */ - private static function sort_start_ascending( $a, $b ) { - $by_start = $a->start - $b->start; - if ( 0 !== $by_start ) { - return $by_start; - } - - $by_text = isset( $a->text, $b->text ) ? strcmp( $a->text, $b->text ) : 0; - if ( 0 !== $by_text ) { - return $by_text; - } - - /* - * This code should be unreachable, because it implies the two replacements - * start at the same location and contain the same text. - */ - return $a->length - $b->length; - } - - /** - * Return the enqueued value for a given attribute, if one exists. 
- * - * Enqueued updates can take different data types: - * - If an update is enqueued and is boolean, the return will be `true` - * - If an update is otherwise enqueued, the return will be the string value of that update. - * - If an attribute is enqueued to be removed, the return will be `null` to indicate that. - * - If no updates are enqueued, the return will be `false` to differentiate from "removed." - * - * @since WP_VERSION - * - * @param string $comparable_name The attribute name in its comparable form. - * @return string|boolean|null Value of enqueued update if present, otherwise false. - */ - private function get_enqueued_attribute_value( $comparable_name ) { - if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { - return false; - } - - if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) { - return false; - } - - $enqueued_text = $this->lexical_updates[ $comparable_name ]->text; - - // Removed attributes erase the entire span. - if ( '' === $enqueued_text ) { - return null; - } - - /* - * Boolean attribute updates are just the attribute name without a corresponding value. - * - * This value might differ from the given comparable name in that there could be leading - * or trailing whitespace, and that the casing follows the name given in `set_attribute`. - * - * Example: - * - * $p->set_attribute( 'data-TEST-id', 'update' ); - * 'update' === $p->get_enqueued_attribute_value( 'data-test-id' ); - * - * Detect this difference based on the absence of the `=`, which _must_ exist in any - * attribute containing a value, e.g. ``. - * ¹ ² - * 1. Attribute with a string value. - * 2. Boolean attribute whose value is `true`. - */ - $equals_at = strpos( $enqueued_text, '=' ); - if ( false === $equals_at ) { - return true; - } - - /* - * Finally, a normal update's value will appear after the `=` and - * be double-quoted, as performed incidentally by `set_attribute`. - * - * e.g. `type="text"` - * ¹² ³ - * 1. Equals is here. - * 2. 
Double-quoting starts one after the equals sign. - * 3. Double-quoting ends at the last character in the update. - */ - $enqueued_value = substr( $enqueued_text, $equals_at + 2, -1 ); - /* - * We're deliberately not decoding entities in attribute values: - * - * Attribute values must not contain direct or indirect entity references to external entities. - * - * See https://www.w3.org/TR/xml/#sec-starttags. - */ - return $enqueued_value; - } - - /** - * Returns the value of a requested attribute from a matched tag opener if that attribute exists. - * - * Example: - * - * $p = new WP_XML_Tag_Processor( 'Test' ); - * $p->next_tag( array( 'class_name' => 'test' ) ) === true; - * $p->get_attribute( 'data-test-id' ) === '14'; - * $p->get_attribute( 'enabled' ) === true; - * $p->get_attribute( 'aria-label' ) === null; - * - * $p->next_tag() === false; - * $p->get_attribute( 'class' ) === null; - * - * @since WP_VERSION - * - * @param string $name Name of attribute whose value is requested. - * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. - */ - public function get_attribute( $name ) { - if ( - self::STATE_MATCHED_TAG !== $this->parser_state && - self::STATE_XML_DECLARATION !== $this->parser_state - ) { - return null; - } - - // Return any enqueued attribute value updates if they exist. - $enqueued_value = $this->get_enqueued_attribute_value( $name ); - if ( false !== $enqueued_value ) { - return $enqueued_value; - } - - if ( ! isset( $this->attributes[ $name ] ) ) { - return null; - } - - $attribute = $this->attributes[ $name ]; - $raw_value = substr( $this->xml, $attribute->value_starts_at, $attribute->value_length ); - - $decoded = WP_XML_Decoder::decode( $raw_value ); - if ( ! isset( $decoded ) ) { - /** - * If the attribute contained an invalid value, it's - * a fatal error. 
- * - * @see WP_XML_Decoder::decode() - */ - $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( - __METHOD__, - __( 'Invalid attribute value encountered.' ), - 'WP_VERSION' - ); - return false; - } - - return $decoded; - } - - /** - * Gets names of all attributes matching a given prefix in the current tag. - * - * Note that matching is case-sensitive. This is in accordance with the spec. - * - * Example: - * - * $p = new WP_XML_Tag_Processor( 'Test' ); - * $p->next_tag( array( 'class_name' => 'test' ) ) === true; - * $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-ENABLED' ); - * $p->get_attribute_names_with_prefix( 'DATA-' ) === array( 'DATA-test-id' ); - * $p->get_attribute_names_with_prefix( 'DAta-' ) === array(); - * - * @since WP_VERSION - * - * @param string $prefix Prefix of requested attribute names. - * @return array|null List of attribute names, or `null` when no tag opener is matched. - */ - public function get_attribute_names_with_prefix( $prefix ) { - if ( - self::STATE_MATCHED_TAG !== $this->parser_state || - $this->is_closing_tag - ) { - return null; - } - - $matches = array(); - foreach ( array_keys( $this->attributes ) as $attr_name ) { - if ( str_starts_with( $attr_name, $prefix ) ) { - $matches[] = $attr_name; - } - } - return $matches; - } - - /** - * Returns the uppercase name of the matched tag. - * - * Example: - * - * $p = new WP_XML_Tag_Processor( 'Test' ); - * $p->next_tag() === true; - * $p->get_tag() === 'DIV'; - * - * $p->next_tag() === false; - * $p->get_tag() === null; - * - * @since WP_VERSION - * - * @return string|null Name of currently matched tag in input XML, or `null` if none found. 
- */ - public function get_tag() { - if ( null === $this->tag_name_starts_at ) { - return null; - } - - $tag_name = substr( $this->xml, $this->tag_name_starts_at, $this->tag_name_length ); - - if ( self::STATE_MATCHED_TAG === $this->parser_state ) { - return $tag_name; - } - - return null; - } - - /** - * Indicates if the currently matched tag is an empty element tag. - * - * XML tags ending with a solidus ("/") are parsed as empty elements. They have no - * content and no matching closer is expected. - - * @since WP_VERSION - * - * @return bool Whether the currently matched tag is an empty element tag. - */ - public function is_empty_element() { - if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { - return false; - } - - /* - * An empty element tag is defined by the solidus at the _end_ of the tag, not the beginning. - * - * Example: - * - *
- * ^ this appears one character before the end of the closing ">". - */ - return '/' === $this->xml[ $this->token_starts_at + $this->token_length - 2 ]; - } - - /** - * Indicates if the current tag token is a tag closer. - * - * Example: - * - * $p = new WP_XML_Tag_Processor( '' ); - * $p->next_tag( array( 'tag_name' => 'wp:content', 'tag_closers' => 'visit' ) ); - * $p->is_tag_closer() === false; - * - * $p->next_tag( array( 'tag_name' => 'wp:content', 'tag_closers' => 'visit' ) ); - * $p->is_tag_closer() === true; - * - * @since WP_VERSION - * - * @return bool Whether the current tag is a tag closer. - */ - public function is_tag_closer() { - return ( - self::STATE_MATCHED_TAG === $this->parser_state && - $this->is_closing_tag - ); - } - - /** - * Indicates the kind of matched token, if any. - * - * This differs from `get_token_name()` in that it always - * returns a static string indicating the type, whereas - * `get_token_name()` may return values derived from the - * token itself, such as a tag name or processing - * instruction tag. - * - * Possible values: - * - `#tag` when matched on a tag. - * - `#text` when matched on a text node. - * - `#cdata-section` when matched on a CDATA node. - * - `#comment` when matched on a comment. - * - `#presumptuous-tag` when matched on an empty tag closer. - * - * @since WP_VERSION - * - * @return string|null What kind of token is matched, or null. - */ - public function get_token_type() { - switch ( $this->parser_state ) { - case self::STATE_MATCHED_TAG: - return '#tag'; - - default: - return $this->get_token_name(); - } - } - - /** - * Returns the node name represented by the token. - * - * This matches the DOM API value `nodeName`. Some values - * are static, such as `#text` for a text node, while others - * are dynamically generated from the token itself. - * - * Dynamic names: - * - Uppercase tag name for tag matches. 
- * - * Note that if the Tag Processor is not matched on a token - * then this function will return `null`, either because it - * hasn't yet found a token or because it reached the end - * of the document without matching a token. - * - * @since WP_VERSION - * - * @return string|null Name of the matched token. - */ - public function get_token_name() { - switch ( $this->parser_state ) { - case self::STATE_MATCHED_TAG: - return $this->get_tag(); - - case self::STATE_TEXT_NODE: - return '#text'; - - case self::STATE_CDATA_NODE: - return '#cdata-section'; - - case self::STATE_XML_DECLARATION: - return '#xml-declaration'; - - case self::STATE_PI_NODE: - return '#processing-instructions'; - - case self::STATE_COMMENT: - return '#comment'; - } - } - - /** - * Returns the modifiable text for a matched token, or an empty string. - * - * Modifiable text is text content that may be read and changed without - * changing the XML structure of the document around it. This includes - * the contents of `#text` nodes in the XML as well as the inner - * contents of XML comments, Processing Instructions, and others, even - * though these nodes aren't part of a parsed DOM tree. They also contain - * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any - * other section in an XML document which cannot contain XML markup (DATA). - * - * If a token has no modifiable text then an empty string is returned to - * avoid needless crashing or type errors. An empty string does not mean - * that a token has modifiable text, and a token with modifiable text may - * have an empty string (e.g. a comment with no contents). 
- * - * @since WP_VERSION - * - * @return string - */ - public function get_modifiable_text() { - if ( null === $this->text_starts_at ) { - return ''; - } - - $text = substr( $this->xml, $this->text_starts_at, $this->text_length ); - - /* - * > the XML processor must behave as if it normalized all line breaks in external parsed - * > entities (including the document entity) on input, before parsing, by translating both - * > the two-character sequence #xD #xA and any #xD that is not followed by #xA to a single - * > #xA character. - * - * See https://www.w3.org/TR/xml/#sec-line-ends - */ - $text = str_replace( array( "\r\n", "\r" ), "\n", $text ); - - // Comment data, CDATA sections, and PCData tags contents are not decoded any further. - if ( - self::STATE_CDATA_NODE === $this->parser_state || - self::STATE_COMMENT === $this->parser_state || - $this->is_pcdata_element() - ) { - return $text; - } - - $decoded = WP_XML_Decoder::decode( $text ); - if ( ! isset( $decoded ) ) { - /** - * If the attribute contained an invalid value, it's - * a fatal error. - * - * @see WP_XML_Decoder::decode() - */ - - $this->last_error = self::ERROR_SYNTAX; - var_dump( $text ); - _doing_it_wrong( - __METHOD__, - __( 'Invalid text content encountered.' ), - 'WP_VERSION' - ); - return false; - } - return $decoded; - } - - public function set_modifiable_text( $new_value ) { - switch ( $this->parser_state ) { - case self::STATE_TEXT_NODE: - case self::STATE_COMMENT: - $this->lexical_updates[] = new WP_HTML_Text_Replacement( - $this->text_starts_at, - $this->text_length, - // @TODO This is naive, let's rethink this. - htmlspecialchars( $new_value, ENT_XML1, 'UTF-8' ) - ); - return true; - - case self::STATE_CDATA_NODE: - $this->lexical_updates[] = new WP_HTML_Text_Replacement( - $this->text_starts_at, - $this->text_length, - // @TODO This is naive, let's rethink this. 
- str_replace( ']]>', ']]>', $new_value ) - ); - return true; - default: - _doing_it_wrong( - __METHOD__, - __( 'Cannot set text content on a non-text node.' ), - 'WP_VERSION' - ); - return false; - } - } - - /** - * Updates or creates a new attribute on the currently matched tag with the passed value. - * - * For boolean attributes special handling is provided: - * - When `true` is passed as the value, then only the attribute name is added to the tag. - * - When `false` is passed, the attribute gets removed if it existed before. - * - * For string attributes, the value is escaped using the `esc_attr` function. - * - * @since WP_VERSION - * - * @param string $name The attribute name to target. - * @param string|bool $value The new attribute value. - * @return bool Whether an attribute value was set. - */ - public function set_attribute( $name, $value ) { - if ( ! is_string( $value ) ) { - _doing_it_wrong( - __METHOD__, - __( 'Non-string attribute values cannot be passed to set_attribute().' ), - 'WP_VERSION' - ); - return false; - } - if ( - self::STATE_MATCHED_TAG !== $this->parser_state || - $this->is_closing_tag - ) { - return false; - } - - $value = htmlspecialchars( $value, ENT_XML1, 'UTF-8' ); - $updated_attribute = "{$name}=\"{$value}\""; - - /* - * > An attribute name must not appear more than once - * > in the same start-tag or empty-element tag. - * - XML 1.0 spec - * - * @see https://www.w3.org/TR/xml/#sec-starttags - */ - if ( isset( $this->attributes[ $name ] ) ) { - /* - * Update an existing attribute. - * - * Example – set attribute id to "new" in : - * - * - * ^-------------^ - * start end - * replacement: `id="new"` - * - * Result: - */ - $existing_attribute = $this->attributes[ $name ]; - $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( - $existing_attribute->start, - $existing_attribute->length, - $updated_attribute - ); - } else { - /* - * Create a new attribute at the tag's name end. 
- * - * Example – add attribute id="new" to : - * - * - * ^ - * start and end - * replacement: ` id="new"` - * - * Result: - */ - $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( - $this->tag_name_starts_at + $this->tag_name_length, - 0, - ' ' . $updated_attribute - ); - } - - return true; - } - - /** - * Remove an attribute from the currently-matched tag. - * - * @since WP_VERSION - * - * @param string $name The attribute name to remove. - * @return bool Whether an attribute was removed. - */ - public function remove_attribute( $name ) { - if ( - self::STATE_MATCHED_TAG !== $this->parser_state || - $this->is_closing_tag - ) { - return false; - } - - /* - * If updating an attribute that didn't exist in the input - * document, then remove the enqueued update and move on. - * - * For example, this might occur when calling `remove_attribute()` - * after calling `set_attribute()` for the same attribute - * and when that attribute wasn't originally present. - */ - if ( ! isset( $this->attributes[ $name ] ) ) { - if ( isset( $this->lexical_updates[ $name ] ) ) { - unset( $this->lexical_updates[ $name ] ); - } - return false; - } - - /* - * Removes an existing tag attribute. - * - * Example – remove the attribute id from : - * - * ^-------------^ - * start end - * replacement: `` - * - * Result: - */ - $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( - $this->attributes[ $name ]->start, - $this->attributes[ $name ]->length, - '' - ); - - return true; - } - - /** - * Returns the string representation of the XML Tag Processor. - * - * @since WP_VERSION - * - * @see WP_XML_Tag_Processor::get_updated_xml() - * - * @return string The processed XML. - */ - public function __toString() { - return $this->get_updated_xml(); - } - - /** - * Returns the string representation of the XML Tag Processor. - * - * @since WP_VERSION - * - * @return string The processed XML. 
- */ - public function get_updated_xml() { - $requires_no_updating = 0 === count( $this->lexical_updates ); - - /* - * When there is nothing more to update and nothing has already been - * updated, return the original document and avoid a string copy. - */ - if ( $requires_no_updating ) { - return $this->xml; - } - - /* - * Keep track of the position right before the current tag. This will - * be necessary for reparsing the current tag after updating the XML. - */ - $before_current_tag = $this->token_starts_at; - - /* - * 1. Apply the enqueued edits and update all the pointers to reflect those changes. - */ - $before_current_tag += $this->apply_attributes_updates( $before_current_tag ); - - /* - * 2. Rewind to before the current tag and reparse to get updated attributes. - * - * At this point the internal cursor points to the end of the tag name. - * Rewind before the tag name starts so that it's as if the cursor didn't - * move; a call to `next_tag()` will reparse the recently-updated attributes - * and additional calls to modify the attributes will apply at this same - * location, but in order to avoid issues with subclasses that might add - * behaviors to `next_tag()`, the internal methods should be called here - * instead. - * - * It's important to note that in this specific place there will be no change - * because the processor was already at a tag when this was called and it's - * rewinding only to the beginning of this very tag before reprocessing it - * and its attributes. - * - *

Previous XMLMore XML

- * ↑ │ back up by the length of the tag name plus the opening < - * └←─┘ back up by strlen("em") + 1 ==> 3 - */ - $this->bytes_already_parsed = $before_current_tag; - $this->base_class_next_token(); - - return $this->xml; - } - - /** - * Parses tag query input into internal search criteria. - * - * @since WP_VERSION - * - * @param array|string|null $query { - * Optional. Which tag name to find, having which class, etc. Default is to find any tag. - * - * @type string|null $tag_name Which tag to find, or `null` for "any tag." - * @type int|null $match_offset Find the Nth tag matching all search criteria. - * 1 for "first" tag, 3 for "third," etc. - * Defaults to first tag. - * @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g.
. - * } - */ - private function parse_query( $query ) { - if ( null !== $query && $query === $this->last_query ) { - return; - } - - $this->last_query = $query; - $this->sought_tag_name = null; - $this->sought_match_offset = 1; - $this->stop_on_tag_closers = false; - - // A single string value means "find the tag of this name". - if ( is_string( $query ) ) { - $this->sought_tag_name = $query; - return; - } - - // An empty query parameter applies no restrictions on the search. - if ( null === $query ) { - return; - } - - // If not using the string interface, an associative array is required. - if ( ! is_array( $query ) ) { - _doing_it_wrong( - __METHOD__, - __( 'The query argument must be an array or a tag name.' ), - 'WP_VERSION' - ); - return; - } - - if ( isset( $query['tag_name'] ) && is_string( $query['tag_name'] ) ) { - $this->sought_tag_name = $query['tag_name']; - } - - if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) { - $this->sought_match_offset = $query['match_offset']; - } - - if ( isset( $query['tag_closers'] ) ) { - $this->stop_on_tag_closers = 'visit' === $query['tag_closers']; - } - } - - - /** - * Checks whether a given tag and its attributes match the search criteria. - * - * @since WP_VERSION - * - * @return bool Whether the given tag and its attribute match the search criteria. - */ - private function matches() { - if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) { - return false; - } - - // Does the tag name match the requested tag name in a case-insensitive manner? - if ( null !== $this->sought_tag_name ) { - /* - * String (byte) length lookup is fast. If they aren't the - * same length then they can't be the same string values. - */ - if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) { - return false; - } - - /* - * Check each character to determine if they are the same. 
- */ - for ( $i = 0; $i < $this->tag_name_length; $i++ ) { - if ( $this->xml[ $this->tag_name_starts_at + $i ] !== $this->sought_tag_name[ $i ] ) { - return false; - } - } - } - - return true; - } - - /** - * Parser Ready State. - * - * Indicates that the parser is ready to run and waiting for a state transition. - * It may not have started yet, or it may have just finished parsing a token and - * is ready to find the next one. - * - * @since WP_VERSION - * - * @access private - */ - const STATE_READY = 'STATE_READY'; - - /** - * Parser Complete State. - * - * Indicates that the parser has reached the end of the document and there is - * nothing left to scan. It finished parsing the last token completely. - * - * @since WP_VERSION - * - * @access private - */ - const STATE_COMPLETE = 'STATE_COMPLETE'; - - /** - * Parser Incomplete Input State. - * - * Indicates that the parser has reached the end of the document before finishing - * a token. It started parsing a token but there is a possibility that the input - * XML document was truncated in the middle of a token. - * - * The parser is reset at the start of the incomplete token and has paused. There - * is nothing more than can be scanned unless provided a more complete document. - * - * @since WP_VERSION - * - * @access private - */ - const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT'; - - /** - * Parser Invalid Input State. - * - * Indicates that the parsed xml document contains malformed input and cannot be parsed. - * - * @since WP_VERSION - * - * @access private - */ - const STATE_INVALID_DOCUMENT = 'STATE_INVALID_DOCUMENT'; - - /** - * Parser Matched Tag State. - * - * Indicates that the parser has found an XML tag and it's possible to get - * the tag name and read or modify its attributes (if it's not a closing tag). - * - * @since WP_VERSION - * - * @access private - */ - const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; - - /** - * Parser Text Node State. 
- * - * Indicates that the parser has found a text node and it's possible - * to read and modify that text. - * - * @since WP_VERSION - * - * @access private - */ - const STATE_TEXT_NODE = 'STATE_TEXT_NODE'; - - /** - * Parser CDATA Node State. - * - * Indicates that the parser has found a CDATA node and it's possible - * to read and modify its modifiable text. Note that in XML there are - * no CDATA nodes outside of foreign content (SVG and MathML). Outside - * of foreign content, they are treated as XML comments. - * - * @since WP_VERSION - * - * @access private - */ - const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; - - /** - * Indicates that the parser has found an XML processing instruction. - * - * @since WP_VERSION - * - * @access private - */ - const STATE_PI_NODE = 'STATE_PI_NODE'; - - /** - * Indicates that the parser has found an XML declaration - * - * @since WP_VERSION - * - * @access private - */ - const STATE_XML_DECLARATION = 'STATE_XML_DECLARATION'; - - /** - * Indicates that the parser has found an XML comment and it's - * possible to read and modify its modifiable text. - * - * @since WP_VERSION - * - * @access private - */ - const STATE_COMMENT = 'STATE_COMMENT'; - - /** - * Indicates that the parser encountered unsupported syntax and has bailed. - * - * @since WP_VERSION - * - * @var string - */ - const ERROR_SYNTAX = 'syntax'; - - /** - * Indicates that the provided XML document contains a declaration that is - * unsupported by the parser. - * - * @since WP_VERSION - * - * @var string - */ - const ERROR_UNSUPPORTED = 'unsupported'; - - /** - * Indicates that the parser encountered more XML tokens than it - * was able to process and has bailed. 
- * - * @since WP_VERSION - * - * @var string - */ - const ERROR_EXCEEDED_MAX_BOOKMARKS = 'exceeded-max-bookmarks'; -} diff --git a/packages/playground/data-liberation/tests/WPWXRURLRewriterTests.php b/packages/playground/data-liberation/tests/WPWXRURLRewriterTests.php index 3d66d628f7..3323fdf249 100644 --- a/packages/playground/data-liberation/tests/WPWXRURLRewriterTests.php +++ b/packages/playground/data-liberation/tests/WPWXRURLRewriterTests.php @@ -11,7 +11,7 @@ public function test_process($fixture_path, $expected_outcome_path) { $chain = new WP_Stream_Chain( [ 'file' => new WP_File_Byte_Stream($fixture_path, 100), - 'wxr' => WP_WXR_URL_Rewrite_Processor::stream( + 'wxr' => WP_WXR_URL_Rewrite_Processor::create_stream_processor( 'https://playground.internal/path', 'https://playground.wordpress.net/new-path' ), diff --git a/packages/playground/data-liberation/tests/WPXMLProcessorTests.php b/packages/playground/data-liberation/tests/WPXMLProcessorTests.php index 276eb7c311..974038ce73 100644 --- a/packages/playground/data-liberation/tests/WPXMLProcessorTests.php +++ b/packages/playground/data-liberation/tests/WPXMLProcessorTests.php @@ -5,7 +5,6 @@ * @package WordPress * @subpackage XML-API */ - use PHPUnit\Framework\TestCase; /** @@ -14,6 +13,1435 @@ * @coversDefaultClass WP_XML_Processor */ class WPXMLProcessorTests extends TestCase { + const XML_SIMPLE = 'Text'; + const XML_WITH_CLASSES = 'Text'; + const XML_MALFORMED = 'Back to notifications'; + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_tag + */ + public function test_get_tag_returns_null_before_finding_tags() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertNull( $processor->get_tag(), 'Calling get_tag() without selecting a tag did not return null' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_tag + */ + public function test_get_tag_returns_null_when_not_in_open_tag() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + 
$this->assertFalse( $processor->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); + $this->assertNull( $processor->get_tag(), 'Accessing a non-existing tag did not return null' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_tag + */ + public function test_get_tag_returns_open_tag_name() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertTrue( $processor->next_tag( 'wp:content' ), 'Querying an existing tag did not return true' ); + $this->assertSame( 'wp:content', $processor->get_tag(), 'Accessing an existing tag name did not return "div"' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::is_empty_element + * + * @dataProvider data_is_empty_element + * + * @param string $xml Input XML whose first tag might contain the self-closing flag `/`. + * @param bool $flag_is_set Whether the input XML's first tag contains the self-closing flag. + */ + public function test_is_empty_element_matches_input_xml( $xml, $flag_is_set ) { + $processor = WP_XML_Processor::from_string( $xml ); + $processor->next_tag( array( 'tag_closers' => 'visit' ) ); + + if ( $flag_is_set ) { + $this->assertTrue( $processor->is_empty_element(), 'Did not find the empty element tag when it was present.' ); + } else { + $this->assertFalse( $processor->is_empty_element(), 'Found the empty element tag when it was absent.' ); + } + } + + /** + * Data provider. XML tags which might have a self-closing flag, and an indicator if they do. + * + * @return array[] + */ + public static function data_is_empty_element() { + return array( + // These should not have a self-closer, and will leave an element un-closed if it's assumed they are self-closing. + 'Self-closing flag on non-void XML element' => array( '', true ), + 'No self-closing flag on non-void XML element' => array( '', false ), + // These should not have a self-closer, but are benign when used because the elements are void. 
+ 'Self-closing flag on void XML element' => array( '', true ), + 'No self-closing flag on void XML element' => array( '', false ), + 'Self-closing flag on void XML element without spacing' => array( '', true ), + // These should not have a self-closer, but as part of a tag closer they are entirely ignored. + 'No self-closing flag on tag closer' => array( '', false ), + // These can and should have self-closers, and will leave an element un-closed if it's assumed they aren't self-closing. + 'Self-closing flag on a foreign element' => array( '', true ), + 'No self-closing flag on a foreign element' => array( '', false ), + // These involve syntax peculiarities. + 'Self-closing flag after extra spaces' => array( '', true ), + 'Self-closing flag after quoted attribute' => array( '', true ), + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_get_attribute_returns_null_when_not_in_open_tag() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertFalse( $processor->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); + $this->assertNull( $processor->get_attribute( 'wp:post-type' ), 'Accessing an attribute of a non-existing tag did not return null' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_get_attribute_returns_null_when_in_closing_tag() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertTrue( $processor->next_tag( 'wp:content' ), 'Querying an existing tag did not return true' ); + $this->assertTrue( $processor->next_token(), 'Querying an existing closing tag did not return true' ); + $this->assertTrue( $processor->next_token(), 'Querying an existing closing tag did not return true' ); + $this->assertNull( $processor->get_attribute( 'wp:post-type' ), 'Accessing an attribute of a closing tag did not return null' ); + } + + /** + * @ticket 61365 + * + * @covers 
WP_XML_Processor::get_attribute + */ + public function test_get_attribute_returns_null_when_attribute_missing() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertTrue( $processor->next_tag( 'wp:content' ), 'Querying an existing tag did not return true' ); + $this->assertNull( $processor->get_attribute( 'test-id' ), 'Accessing a non-existing attribute did not return null' ); + } + + /** + * @ticket 61365 + * + * @expectedIncorrectUsage WP_XML_Processor::base_class_next_token + * @covers WP_XML_Processor::get_attribute + */ + public function test_attributes_are_rejected_in_tag_closers() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertTrue( $processor->next_tag( 'wp:content' ), 'Querying an existing tag did not return true' ); + $this->assertTrue( $processor->next_token(), 'Querying a text node did not return true.' ); + $this->assertFalse( $processor->next_token(), 'Querying an existing but invalid closing tag did not return false.' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_get_attribute_returns_attribute_value() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertTrue( $processor->next_tag( 'wp:content' ), 'Querying an existing tag did not return true' ); + $this->assertSame( 'test', $processor->get_attribute( 'wp:post-type' ), 'Accessing a wp:post-type="test" attribute value did not return "test"' ); + } + + /** + * @ticket 61365 + * @expectedIncorrectUsage WP_XML_Processor::parse_next_attribute + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_parsing_stops_on_malformed_attribute_value_no_value() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertFalse( $processor->next_tag(), 'Querying a malformed start tag did not return false' ); + } + + /** + * @ticket 61365 + * @expectedIncorrectUsage WP_XML_Processor::parse_next_attribute + * + * @covers WP_XML_Processor::get_attribute 
+ */ + public function test_parsing_stops_on_malformed_attribute_value_no_quotes() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertFalse( $processor->next_tag(), 'Querying a malformed start tag did not return false' ); + } + + /** + * @ticket 61365 + * @expectedIncorrectUsage WP_XML_Processor::get_attribute + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_malformed_attribute_value_containing_ampersand_is_treated_as_plaintext() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertTrue( $processor->next_tag(), 'Querying a tag did not return true' ); + $this->assertEquals('WordPress & WordPress', $processor->get_attribute('enabled')); + } + + /** + * @ticket 61365 + * @expectedIncorrectUsage WP_XML_Processor::get_attribute + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_malformed_attribute_value_containing_entity_without_semicolon_is_treated_as_plaintext() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertTrue( $processor->next_tag(), 'Querying a tag did not return true' ); + $this->assertEquals('”', $processor->get_attribute('enabled')); + } + + /** + * @ticket 61365 + * @expectedIncorrectUsage WP_XML_Processor::parse_next_attribute + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_parsing_stops_on_malformed_attribute_value_contains_lt_character() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertFalse( $processor->next_tag(), 'Querying a malformed start tag did not return false' ); + } + + /** + * @ticket 61365 + * @expectedIncorrectUsage WP_XML_Processor::parse_next_attribute + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_parsing_stops_on_malformed_tags_duplicate_attributes() { + $processor = WP_XML_Processor::from_string( 'Text' ); + + $this->assertFalse( $processor->next_tag() ); + } + + /** + * @ticket 61365 + * @expectedIncorrectUsage 
WP_XML_Processor::parse_next_attribute + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_parsing_stops_on_malformed_attribute_name_contains_slash() { + $processor = WP_XML_Processor::from_string( 'Test' ); + + $this->assertFalse( $processor->next_tag(), 'Querying a malformed start tag did not return false' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_get_modifiable_text_returns_a_decoded_value() { + $processor = WP_XML_Processor::from_string( '“😄”' ); + + $processor->next_tag( 'root' ); + $processor->next_token(); + + $this->assertEquals( + '“😄”', + $processor->get_modifiable_text(), + 'Reading an encoded text did not decode it.' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_get_attribute_returns_a_decoded_value() { + $processor = WP_XML_Processor::from_string( '' ); + + $this->assertTrue( $processor->next_tag( 'root' ), 'Querying a tag did not return true' ); + $this->assertEquals( + '“😄”', + $processor->get_attribute( 'encoded-data' ), + 'Reading an encoded attribute did not decode it.' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute + * + * @param string $attribute_name Name of data-enabled attribute with case variations. 
+ */ + public function test_get_attribute_is_case_sensitive() { + $processor = WP_XML_Processor::from_string( 'Test' ); + $processor->next_tag(); + + $this->assertEquals( + 'true', + $processor->get_attribute( 'DATA-enabled' ), + 'Accessing an attribute by a same-cased name did return not its value' + ); + + $this->assertNull( + $processor->get_attribute( 'data-enabled' ), + 'Accessing an attribute by a differently-cased name did return its value' + ); + } + + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::remove_attribute + */ + public function test_remove_attribute_is_case_sensitive() { + $processor = WP_XML_Processor::from_string( 'Test' ); + $processor->next_tag(); + $processor->remove_attribute( 'data-enabled' ); + + $this->assertSame( 'Test', $processor->get_updated_xml(), 'A case-sensitive remove_attribute call did remove the attribute' ); + + $processor->remove_attribute( 'DATA-enabled' ); + + $this->assertSame( 'Test', $processor->get_updated_xml(), 'A case-sensitive remove_attribute call did not remove the attribute' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::set_attribute + */ + public function test_set_attribute_is_case_sensitive() { + $processor = WP_XML_Processor::from_string( 'Test' ); + $processor->next_tag(); + $processor->set_attribute( 'data-enabled', 'abc' ); + + $this->assertSame( 'Test', $processor->get_updated_xml(), 'A case-insensitive set_attribute call did not update the existing attribute' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_null_before_finding_tags() { + $processor = WP_XML_Processor::from_string( 'Test' ); + $this->assertNull( + $processor->get_attribute_names_with_prefix( 'data-' ), + 'Accessing attributes by their prefix did not return null when no tag was selected' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute_names_with_prefix + */ + 
public function test_get_attribute_names_with_prefix_returns_null_when_not_in_open_tag() { + $processor = WP_XML_Processor::from_string( 'Test' ); + $processor->next_tag( 'p' ); + $this->assertNull( $processor->get_attribute_names_with_prefix( 'data-' ), 'Accessing attributes of a non-existing tag did not return null' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_null_when_in_closing_tag() { + $processor = WP_XML_Processor::from_string( 'Test' ); + $processor->next_tag( 'wp:content' ); + $processor->next_tag( array( 'tag_closers' => 'visit' ) ); + + $this->assertNull( $processor->get_attribute_names_with_prefix( 'data-' ), 'Accessing attributes of a closing tag did not return null' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_empty_array_when_no_attributes_present() { + $processor = WP_XML_Processor::from_string( 'Test' ); + $processor->next_tag( 'wp:content' ); + + $this->assertSame( array(), $processor->get_attribute_names_with_prefix( 'data-' ), 'Accessing the attributes on a tag without any did not return an empty array' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_matching_attribute_names_in_original_case() { + $processor = WP_XML_Processor::from_string( 'Test' ); + $processor->next_tag(); + + $this->assertSame( + array( 'data-test-ID' ), + $processor->get_attribute_names_with_prefix( 'data-' ), + 'Accessing attributes by their prefix did not return their lowercase names' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute_names_with_prefix + */ + public function test_get_attribute_names_with_prefix_returns_attribute_added_by_set_attribute() { + $processor = 
WP_XML_Processor::from_string( 'Test' ); + $processor->next_tag(); + $processor->set_attribute( 'data-test-id', '14' ); + + $this->assertSame( + 'Test', + $processor->get_updated_xml(), + "Updated XML doesn't include attribute added via set_attribute" + ); + $this->assertSame( + array( 'data-test-id', 'data-foo' ), + $processor->get_attribute_names_with_prefix( 'data-' ), + "Accessing attribute names doesn't find attribute added via set_attribute" + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::__toString + */ + public function test_to_string_returns_updated_xml() { + $processor = WP_XML_Processor::from_string( 'Test' ); + $processor->next_tag(); + $processor->remove_attribute( 'id' ); + + $processor->next_tag(); + $processor->set_attribute( 'id', 'wp:content-id-1' ); + + $this->assertSame( + $processor->get_updated_xml(), + (string) $processor, + 'get_updated_xml() returned a different value than __toString()' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_updated_xml + */ + public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_processor_on_the_current_tag() { + $processor = WP_XML_Processor::from_string( 'Test' ); + $processor->next_tag(); + $processor->remove_attribute( 'id' ); + + $processor->next_tag(); + $processor->set_attribute( 'id', 'wp:content-id-1' ); + + $this->assertSame( + 'Test', + $processor->get_updated_xml(), + 'Calling get_updated_xml after updating the attributes of the second tag returned different XML than expected' + ); + + $processor->set_attribute( 'id', 'wp:content-id-2' ); + + $this->assertSame( + 'Test', + $processor->get_updated_xml(), + 'Calling get_updated_xml after updating the attributes of the second tag for the second time returned different XML than expected' + ); + + $processor->next_tag(); + $processor->remove_attribute( 'id' ); + + $this->assertSame( + 'Test', + $processor->get_updated_xml(), + 'Calling get_updated_xml after removing the id attribute of 
the third tag returned different XML than expected' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_updated_xml + */ + public function test_get_updated_xml_without_updating_any_attributes_returns_the_original_xml() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + + $this->assertSame( + self::XML_SIMPLE, + $processor->get_updated_xml(), + 'Casting WP_XML_Processor to a string without performing any updates did not return the initial XML snippet' + ); + } + + /** + * Ensures that when seeking to an earlier spot in the document that + * all previously-enqueued updates are applied as they ought to be. + * + * @ticket 61365 + * @expectedIncorrectUsage WP_XML_Processor::parse_next_attribute + */ + public function test_get_updated_xml_applies_updates_to_content_after_seeking_to_before_parsed_bytes() { + $processor = WP_XML_Processor::from_string( '' ); + + $processor->next_tag(); + $processor->set_attribute( 'wonky', 'true' ); + $processor->next_tag(); + $processor->set_bookmark( 'here' ); + + $processor->next_tag( array( 'tag_closers' => 'visit' ) ); + $processor->seek( 'here' ); + + $this->assertSame( '', $processor->get_updated_xml() ); + } + + public function test_declare_element_as_pcdata() { + $text = ' + This text contains syntax that may seem + like XML nodes: + + + + + + + &<>"' + + But! It is all treated as text. + '; + $processor = WP_XML_Processor::from_string( + "$text" + ); + $processor->declare_element_as_pcdata( 'my-pcdata' ); + $processor->next_tag( 'my-pcdata' ); + + $this->assertEquals( + $text, + $processor->get_modifiable_text(), + 'get_modifiable_text() did not return the expected text' + ); + } + + /** + * Ensures that bookmarks start and length correctly describe a given token in XML. + * + * @ticket 61365 + * + * @dataProvider data_xml_nth_token_substring + * + * @param string $xml Input XML. + * @param int $match_nth_token Which token to inspect from input XML. 
+ * @param string $expected_match Expected full raw token bookmark should capture. + */ + public function test_token_bookmark_span( string $xml, int $match_nth_token, string $expected_match ) { + $processor = new class( $xml ) extends WP_XML_Processor { + public function __construct( $xml ) { + parent::__construct( $xml ); + } + + /** + * Returns the raw span of XML for the currently-matched + * token, or null if not paused on any token. + * + * @return string|null Raw XML content of currently-matched token, + * otherwise `null` if not matched. + */ + public function get_raw_token() { + if ( + WP_XML_Processor::STATE_READY === $this->parser_state || + WP_XML_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || + WP_XML_Processor::STATE_COMPLETE === $this->parser_state + ) { + return null; + } + + $this->set_bookmark( 'mark' ); + $mark = $this->bookmarks['mark']; + + return substr( $this->xml, $mark->start, $mark->length ); + } + }; + + for ( $i = 0; $i < $match_nth_token; $i++ ) { + $processor->next_token(); + } + + $raw_token = $processor->get_raw_token(); + $this->assertIsString( + $raw_token, + "Failed to find raw token at position {$match_nth_token}: check test data provider." + ); + + $this->assertSame( + $expected_match, + $raw_token, + 'Bookmarked wrong span of text for full matched token.' + ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_xml_nth_token_substring() { + return array( + // Tags. + 'DIV start tag' => array( '', 1, '' ), + 'DIV start tag with attributes' => array( '', 1, '' ), + 'Nested DIV' => array( '', 2, '' ), + 'Sibling DIV' => array( '', 3, '' ), + 'DIV before text' => array( ' text', 1, '' ), + 'DIV after comment' => array( '', 3, '' ), + 'DIV before comment' => array( ' ', 1, '' ), + 'Start "self-closing" tag' => array( '', 1, '' ), + 'Void tag' => array( '', 1, '' ), + 'Void tag w/self-closing flag' => array( '', 1, '' ), + 'Void tag inside DIV' => array( '', 2, '' ), + + // Text. 
+ 'Text' => array( 'Just text', 1, 'Just text' ), + 'Text in DIV' => array( 'Text', 2, 'Text' ), + 'Text before DIV' => array( 'Text', 1, 'Text' ), + 'Text after comment' => array( 'Text', 2, 'Text' ), + 'Text before comment' => array( 'Text ', 1, 'Text' ), + + // Comments. + 'Comment' => array( '', 1, '' ), + 'Comment in DIV' => array( '', 2, '' ), + 'Comment before DIV' => array( '', 1, '' ), + 'Comment after DIV' => array( '', 3, '' ), + 'Comment after comment' => array( '', 2, '' ), + 'Comment before comment' => array( ' ', 1, '' ), + 'Empty comment' => array( '', 1, '' ), + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::next_tag + */ + public function test_next_tag_with_no_arguments_should_find_the_next_existing_tag() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + + $this->assertTrue( $processor->next_tag(), 'Querying an existing tag did not return true' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::next_tag + */ + public function test_next_tag_should_return_false_for_a_non_existing_tag() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + + $this->assertFalse( $processor->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_modifiable_text + */ + public function test_normalizes_carriage_returns_in_text_nodes() { + $processor = WP_XML_Processor::from_string( + "We are\rnormalizing\r\n\nthe\n\r\r\r\ncarriage returns" + ); + $processor->next_tag(); + $processor->next_token(); + $this->assertEquals( + "We are\nnormalizing\n\nthe\n\n\n\ncarriage returns", + $processor->get_modifiable_text(), + 'get_raw_token() did not normalize the carriage return characters' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_modifiable_text + */ + public function test_normalizes_carriage_returns_in_cdata() { + $processor = WP_XML_Processor::from_string( + "" + ); + $processor->next_tag(); + 
$processor->next_token(); + $this->assertEquals( + "We are\nnormalizing\n\nthe\n\n\n\ncarriage returns", + $processor->get_modifiable_text(), + 'get_raw_token() did not normalize the carriage return characters' + ); + } + + /** + * @ticket 61365 + * @ticket 61365 + * + * @covers WP_XML_Processor::next_tag + * @covers WP_XML_Processor::is_tag_closer + */ + public function test_next_tag_should_not_stop_on_closers() { + $processor = WP_XML_Processor::from_string( '' ); + + $this->assertTrue( $processor->next_tag( array( 'breadcrumbs' => array( 'wp:content' ) ) ), 'Did not find desired tag opener' ); + $this->assertFalse( $processor->next_tag( array( 'breadcrumbs' => array( 'wp:content' ) ) ), 'Visited an unwanted tag, a tag closer' ); + } + + /** + * Verifies that updates to a document before calls to `get_updated_xml()` don't + * lead to the Tag Processor jumping to the wrong tag after the updates. + * + * @ticket 61365 + * + * @covers WP_XML_Processor::get_updated_xml + */ + public function test_internal_pointer_returns_to_original_spot_after_inserting_content_before_cursor() { + $tags = WP_XML_Processor::from_string( 'outside
inside
' ); + + $tags->next_tag(); + $tags->next_tag(); + $tags->set_attribute( 'wp:post-type', 'foo' ); + $tags->next_tag( 'section' ); + + // Return to this spot after moving ahead. + $tags->set_bookmark( 'here' ); + + // Move ahead. + $tags->next_tag( 'photo' ); + $tags->seek( 'here' ); + $this->assertSame( 'outside
inside
', $tags->get_updated_xml() ); + $this->assertSame( 'section', $tags->get_tag() ); + $this->assertFalse( $tags->is_tag_closer() ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::set_attribute + */ + public function test_set_attribute_on_a_non_existing_tag_does_not_change_the_markup() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + + $this->assertFalse( $processor->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); + $this->assertFalse( $processor->next_tag( 'wp:content' ), 'Querying a non-existing tag did not return false' ); + + $processor->set_attribute( 'id', 'primary' ); + + $this->assertSame( + self::XML_SIMPLE, + $processor->get_updated_xml(), + 'Calling get_updated_xml after updating a non-existing tag returned an XML that was different from the original XML' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::set_attribute + * @covers WP_XML_Processor::remove_attribute + * @covers WP_XML_Processor::add_class + * @covers WP_XML_Processor::remove_class + */ + public function test_attribute_ops_on_tag_closer_do_not_change_the_markup() { + $processor = WP_XML_Processor::from_string( '' ); + $processor->next_token(); + $this->assertFalse( $processor->is_tag_closer(), 'Skipped tag opener' ); + + $processor->next_token(); + $this->assertTrue( $processor->is_tag_closer(), 'Skipped tag closer' ); + $this->assertFalse( $processor->set_attribute( 'id', 'test' ), "Allowed setting an attribute on a tag closer when it shouldn't have" ); + $this->assertFalse( $processor->remove_attribute( 'invalid-id' ), "Allowed removing an attribute on a tag closer when it shouldn't have" ); + $this->assertSame( + '', + $processor->get_updated_xml(), + 'Calling get_updated_xml after updating a non-existing tag returned an XML that was different from the original XML' + ); + } + + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::set_attribute + */ + public function 
test_set_attribute_with_a_non_existing_attribute_adds_a_new_attribute_to_the_markup() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + $processor->next_tag(); + $processor->set_attribute( 'test-attribute', 'test-value' ); + + $this->assertSame( + 'Text', + $processor->get_updated_xml(), + 'Updated XML does not include attribute added via set_attribute()' + ); + $this->assertSame( + 'test-value', + $processor->get_attribute( 'test-attribute' ), + 'get_attribute() (called after get_updated_xml()) did not return attribute added via set_attribute()' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_get_attribute_returns_updated_values_before_they_are_applied() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + $processor->next_tag(); + $processor->set_attribute( 'test-attribute', 'test-value' ); + + $this->assertSame( + 'test-value', + $processor->get_attribute( 'test-attribute' ), + 'get_attribute() (called before get_updated_xml()) did not return attribute added via set_attribute()' + ); + $this->assertSame( + 'Text', + $processor->get_updated_xml(), + 'Updated XML does not include attribute added via set_attribute()' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_get_attribute_returns_updated_values_before_they_are_applied_with_different_name_casing() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + $processor->next_tag(); + $processor->set_attribute( 'test-ATTribute', 'test-value' ); + + $this->assertSame( + 'test-value', + $processor->get_attribute( 'test-ATTribute' ), + 'get_attribute() (called before get_updated_xml()) did not return attribute added via set_attribute()' + ); + $this->assertSame( + 'Text', + $processor->get_updated_xml(), + 'Updated XML does not include attribute added via set_attribute()' + ); + } + + + /** + * @ticket 61365 + * + * @covers 
WP_XML_Processor::get_attribute + */ + public function test_get_attribute_reflects_removed_attribute_before_it_is_applied() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + $processor->next_tag(); + $processor->remove_attribute( 'id' ); + + $this->assertNull( + $processor->get_attribute( 'id' ), + 'get_attribute() (called before get_updated_xml()) returned attribute that was removed by remove_attribute()' + ); + $this->assertSame( + 'Text', + $processor->get_updated_xml(), + 'Updated XML includes attribute that was removed by remove_attribute()' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_get_attribute_reflects_adding_and_then_removing_an_attribute_before_those_updates_are_applied() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + $processor->next_tag(); + $processor->set_attribute( 'test-attribute', 'test-value' ); + $processor->remove_attribute( 'test-attribute' ); + + $this->assertNull( + $processor->get_attribute( 'test-attribute' ), + 'get_attribute() (called before get_updated_xml()) returned attribute that was added via set_attribute() and then removed by remove_attribute()' + ); + $this->assertSame( + self::XML_SIMPLE, + $processor->get_updated_xml(), + 'Updated XML includes attribute that was added via set_attribute() and then removed by remove_attribute()' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::get_attribute + */ + public function test_get_attribute_reflects_setting_and_then_removing_an_existing_attribute_before_those_updates_are_applied() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + $processor->next_tag(); + $processor->set_attribute( 'id', 'test-value' ); + $processor->remove_attribute( 'id' ); + + $this->assertNull( + $processor->get_attribute( 'id' ), + 'get_attribute() (called before get_updated_xml()) returned attribute that was overwritten by set_attribute() and then removed by 
remove_attribute()' + ); + $this->assertSame( + 'Text', + $processor->get_updated_xml(), + 'Updated XML includes attribute that was overwritten by set_attribute() and then removed by remove_attribute()' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::set_attribute + */ + public function test_set_attribute_with_an_existing_attribute_name_updates_its_value_in_the_markup() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + $processor->next_tag(); + $processor->set_attribute( 'id', 'new-id' ); + $this->assertSame( + 'Text', + $processor->get_updated_xml(), + 'Existing attribute was not updated' + ); + } + + /** + * Ensures that when setting an attribute multiple times that only + * one update flushes out into the updated XML. + * + * @ticket 61365 + * + * @covers WP_XML_Processor::set_attribute + */ + public function test_set_attribute_with_case_variants_updates_only_the_original_first_copy() { + $processor = WP_XML_Processor::from_string( '' ); + $processor->next_tag(); + $processor->set_attribute( 'data-enabled', 'canary1' ); + $processor->set_attribute( 'data-enabled', 'canary2' ); + $processor->set_attribute( 'data-enabled', 'canary3' ); + + $this->assertSame( '', strtolower( $processor->get_updated_xml() ) ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::next_tag + * @covers WP_XML_Processor::set_attribute + */ + public function test_next_tag_and_set_attribute_in_a_loop_update_all_tags_in_the_markup() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + while ( $processor->next_tag() ) { + $processor->set_attribute( 'data-foo', 'bar' ); + } + + $this->assertSame( + 'Text', + $processor->get_updated_xml(), + 'Not all tags were updated when looping with next_tag() and set_attribute()' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::remove_attribute + */ + public function test_remove_attribute_with_an_existing_attribute_name_removes_it_from_the_markup() { + $processor = 
WP_XML_Processor::from_string( self::XML_SIMPLE ); + $processor->next_tag(); + $processor->remove_attribute( 'id' ); + + $this->assertSame( + 'Text', + $processor->get_updated_xml(), + 'Attribute was not removed' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::remove_attribute + */ + public function test_remove_attribute_with_a_non_existing_attribute_name_does_not_change_the_markup() { + $processor = WP_XML_Processor::from_string( self::XML_SIMPLE ); + $processor->next_tag(); + $processor->remove_attribute( 'no-such-attribute' ); + + $this->assertSame( + self::XML_SIMPLE, + $processor->get_updated_xml(), + 'Content was changed when attempting to remove an attribute that did not exist' + ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::next_tag + */ + public function test_correctly_parses_xml_attributes_wrapped_in_single_quotation_marks() { + $processor = WP_XML_Processor::from_string( + 'Text' + ); + $processor->next_tag( + array( + 'breadcrumbs' => array( 'wp:content' ), + 'id' => 'first', + ) + ); + $processor->remove_attribute( 'id' ); + $processor->next_tag( + array( + 'breadcrumbs' => array( 'wp:text' ), + 'id' => 'second', + ) + ); + $processor->set_attribute( 'id', 'single-quote' ); + $this->assertSame( + 'Text', + $processor->get_updated_xml(), + 'Did not remove single-quoted attribute' + ); + } + + /** + * @ticket 61365 + * @expectedIncorrectUsage WP_XML_Processor::parse_next_attribute + * @expectedIncorrectUsage WP_XML_Processor::set_attribute + * + * @covers WP_XML_Processor::set_attribute + */ + public function test_setting_an_attribute_to_false_is_rejected() { + $processor = WP_XML_Processor::from_string( + '
' + ); + $processor->next_tag( 'input' ); + $this->assertFalse( + $processor->set_attribute( 'checked', false ), + 'Accepted a boolean attribute name.' + ); + } + + /** + * @ticket 61365 + * @expectedIncorrectUsage WP_XML_Processor::set_attribute + * + * @covers WP_XML_Processor::set_attribute + */ + public function test_setting_a_missing_attribute_to_false_does_not_change_the_markup() { + $xml_input = '
'; + $processor = WP_XML_Processor::from_string( $xml_input ); + $processor->next_tag( 'input' ); + $processor->set_attribute( 'checked', false ); + $this->assertSame( + $xml_input, + $processor->get_updated_xml(), + 'Changed the markup unexpectedly when setting a non-existing attribute to false' + ); + } + + /** + * Ensures that unclosed and invalid comments trigger warnings or errors. + * + * @ticket 61365 + * + * @covers WP_XML_Processor::next_tag + * @covers WP_XML_Processor::paused_at_incomplete_token + * + * @dataProvider data_xml_with_unclosed_comments + * + * @param string $xml_ending_before_comment_close XML with opened comments that aren't closed. + */ + public function test_documents_may_end_with_unclosed_comment( $xml_ending_before_comment_close ) { + $processor = WP_XML_Processor::from_stream( $xml_ending_before_comment_close ); + + $this->assertFalse( + $processor->next_tag(), + "Should not have found any tag, but found {$processor->get_tag()}." + ); + + $this->assertTrue( + $processor->is_paused_at_incomplete_input(), + "Should have indicated that the parser found an incomplete token but didn't." + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_xml_with_unclosed_comments() { + return array( + 'Shortest open valid comment' => array( '' ); + $this->assertFalse( $processor->next_token(), 'Did not reject a malformed XML comment.' ); + } + + /** + * @covers WP_XML_Processor::next_tag + */ + public function test_handles_malformed_taglike_open_short_xml() { + $processor = WP_XML_Processor::from_string( '<' ); + $result = $processor->next_tag(); + $this->assertFalse( $result, 'Did not handle "<" xml properly.' 
); + } + + /** + * @covers WP_XML_Processor::next_tag + */ + public function test_handles_malformed_taglike_close_short_xml() { + $processor = WP_XML_Processor::from_string( 'next_tag(); + $this->assertFalse( $result, 'Did not handle " ' ); + $result = $processor->next_tag(); + $this->assertFalse( $result, 'Did not handle "
" xml properly.' ); + } + + /** + * Ensures that non-tag syntax starting with `<` is rejected. + * + * @ticket 61365 + */ + public function test_single_text_node_with_taglike_text() { + $processor = WP_XML_Processor::from_string( 'This is a text node< /A>' ); + $this->assertTrue( $processor->next_token(), 'A root node was not found.' ); + $this->assertTrue( $processor->next_token(), 'A valid text node was not found.' ); + $this->assertEquals( 'This is a text node', $processor->get_modifiable_text(), 'The contents of a valid text node were not correctly captured.' ); + $this->assertFalse( $processor->next_tag(), 'A malformed XML markup was not rejected.' ); + } + + /** + * Ensures that non-tag syntax starting with `<` is rejected. + * + * @ticket 61365 + */ + public function test_parses_CDATA() { + $processor = WP_XML_Processor::from_string( '' ); + $processor->next_tag(); + $this->assertTrue( $processor->next_token(), 'The first text node was not found.' ); $this->assertEquals( + 'This is a CDATA text node.', + $processor->get_modifiable_text(), + 'The contents of a a CDATA text node were not correctly captured.' + ); + } + + /** + * @ticket 61365 + */ + public function test_yields_CDATA_a_separate_text_node() { + $processor = WP_XML_Processor::from_string( 'This is the first text node and this is the third text node.' ); + + $processor->next_token(); + $this->assertTrue( $processor->next_token(), 'The first text node was not found.' ); + $this->assertEquals( + 'This is the first text node ', + $processor->get_modifiable_text(), + 'The contents of a valid text node were not correctly captured.' + ); + + $this->assertTrue( $processor->next_token(), 'The CDATA text node was not found.' ); + $this->assertEquals( + ' and this is a second text node ', + $processor->get_modifiable_text(), + 'The contents of a a CDATA text node were not correctly captured.' + ); + + $this->assertTrue( $processor->next_token(), 'The text node was not found.' 
); + $this->assertEquals( + ' and this is the third text node.', + $processor->get_modifiable_text(), + 'The contents of a valid text node were not correctly captured.' + ); + } + + /** + * + * @ticket 61365 + */ + public function test_xml_declaration() { + $processor = WP_XML_Processor::from_string( '' ); + $this->assertTrue( $processor->next_token(), 'The XML declaration was not found.' ); + $this->assertEquals( + '#xml-declaration', + $processor->get_token_type(), + 'The XML declaration was not correctly identified.' + ); + $this->assertEquals( '1.0', $processor->get_attribute( 'version' ), 'The version attribute was not correctly captured.' ); + $this->assertEquals( 'UTF-8', $processor->get_attribute( 'encoding' ), 'The encoding attribute was not correctly captured.' ); + } + + /** + * + * @ticket 61365 + */ + public function test_xml_declaration_with_single_quotes() { + $processor = WP_XML_Processor::from_string( "" ); + $this->assertTrue( $processor->next_token(), 'The XML declaration was not found.' ); + $this->assertEquals( + '#xml-declaration', + $processor->get_token_type(), + 'The XML declaration was not correctly identified.' + ); + $this->assertEquals( '1.0', $processor->get_attribute( 'version' ), 'The version attribute was not correctly captured.' ); + $this->assertEquals( 'UTF-8', $processor->get_attribute( 'encoding' ), 'The encoding attribute was not correctly captured.' ); + } + + /** + * + * @ticket 61365 + */ + public function test_processor_instructions() { + $processor = WP_XML_Processor::from_string( + // The first ' . + // The second ' + ); + $this->assertTrue( $processor->next_token(), 'The XML declaration was not found.' ); + $this->assertTrue( $processor->next_token(), 'The processing instruction was not found.' ); + $this->assertEquals( + '#processing-instructions', + $processor->get_token_type(), + 'The processing instruction was not correctly identified.' 
+ ); + $this->assertEquals( ' stylesheet type="text/xsl" href="style.xsl" ', $processor->get_modifiable_text(), 'The modifiable text was not correctly captured.' ); + } + + /** + * Ensures that updates which are enqueued in front of the cursor + * are applied before moving forward in the document. + * + * @ticket 61365 + */ + public function test_applies_updates_before_proceeding() { + $xml = ''; + + $subclass = new class( $xml ) extends WP_XML_Processor { + public function __construct( $xml ) { + parent::__construct( $xml ); + } + + /** + * Inserts raw text after the current token. + * + * @param string $new_xml Raw text to insert. + */ + public function insert_after( $new_xml ) { + $this->set_bookmark( 'here' ); + $this->lexical_updates[] = new WP_HTML_Text_Replacement( + $this->bookmarks['here']->start + $this->bookmarks['here']->length, + 0, + $new_xml + ); + } + }; + + $subclass->next_tag( 'photo' ); + $subclass->insert_after( '

snow-capped

' ); + + $subclass->next_tag(); + $this->assertSame( + 'p', + $subclass->get_tag(), + 'Should have matched inserted XML as next tag.' + ); + + $subclass->next_tag( 'photo' ); + $subclass->set_attribute( 'alt', 'mountain' ); + + $this->assertSame( + '

snow-capped

', + $subclass->get_updated_xml(), + 'Should have properly applied the update from in front of the cursor.' + ); + } + /** * @ticket 61365 @@ -22,7 +1450,7 @@ class WPXMLProcessorTests extends TestCase { * @covers WP_XML_Processor::get_breadcrumbs */ public function test_get_breadcrumbs() { - $processor = new WP_XML_Processor( + $processor = WP_XML_Processor::from_string( ' @@ -60,7 +1488,7 @@ public function test_get_breadcrumbs() { */ public function test_matches_breadcrumbs() { // Initialize the WP_XML_Processor with the given XML string - $processor = new WP_XML_Processor( '' ); + $processor = WP_XML_Processor::from_string( '' ); // Move to the next element with tag name 'img' $processor->next_tag( 'image' ); @@ -79,7 +1507,7 @@ public function test_matches_breadcrumbs() { */ public function test_next_tag_by_breadcrumbs() { // Initialize the WP_XML_Processor with the given XML string - $processor = new WP_XML_Processor( '' ); + $processor = WP_XML_Processor::from_string( '' ); // Move to the next element with tag name 'img' $processor->next_tag( @@ -98,7 +1526,7 @@ public function test_next_tag_by_breadcrumbs() { */ public function test_get_current_depth() { // Initialize the WP_XML_Processor with the given XML string - $processor = new WP_XML_Processor( '' ); + $processor = WP_XML_Processor::from_string( '' ); // Assert that the initial depth is 0 $this->assertEquals( 0, $processor->get_current_depth() ); @@ -130,11 +1558,11 @@ public function test_get_current_depth() { * @expectedIncorrectUsage WP_XML_Processor::step_in_misc */ public function test_no_text_allowed_after_root_element() { - $processor = new WP_XML_Processor( 'text' ); + $processor = WP_XML_Processor::from_string( 'text' ); $this->assertTrue( $processor->next_tag(), 'Did not find a tag.' ); $this->assertFalse( $processor->next_tag(), 'Found a non-existent tag.' 
); $this->assertEquals( - WP_XML_Tag_Processor::ERROR_SYNTAX, + WP_XML_Processor::ERROR_SYNTAX, $processor->get_last_error(), 'Did not run into a parse error after the root element' ); @@ -144,7 +1572,7 @@ public function test_no_text_allowed_after_root_element() { * @ticket 61365 */ public function test_whitespace_text_allowed_after_root_element() { - $processor = new WP_XML_Processor( ' ' ); + $processor = WP_XML_Processor::from_string( ' ' ); $this->assertTrue( $processor->next_tag(), 'Did not find a tag.' ); $this->assertFalse( $processor->next_tag(), 'Found a non-existent tag.' ); $this->assertNull( $processor->get_last_error(), 'Ran into a parse error after the root element' ); @@ -154,7 +1582,7 @@ public function test_whitespace_text_allowed_after_root_element() { * @ticket 61365 */ public function test_processing_directives_allowed_after_root_element() { - $processor = new WP_XML_Processor( '' ); + $processor = WP_XML_Processor::from_string( '' ); $this->assertTrue( $processor->next_tag(), 'Did not find a tag.' ); $this->assertFalse( $processor->next_tag(), 'Found a non-existent tag.' ); $this->assertNull( $processor->get_last_error(), 'Ran into a parse error after the root element' ); @@ -164,7 +1592,7 @@ public function test_processing_directives_allowed_after_root_element() { * @ticket 61365 */ public function test_mixed_misc_grammar_allowed_after_root_element() { - $processor = new WP_XML_Processor( ' ' ); + $processor = WP_XML_Processor::from_string( ' ' ); $processor->next_tag(); $this->assertEquals( 'root', $processor->get_tag(), 'Did not find a tag.' ); @@ -179,11 +1607,11 @@ public function test_mixed_misc_grammar_allowed_after_root_element() { * @expectedIncorrectUsage WP_XML_Processor::step_in_misc */ public function test_elements_not_allowed_after_root_element() { - $processor = new WP_XML_Processor( '' ); + $processor = WP_XML_Processor::from_string( '' ); $this->assertTrue( $processor->next_tag(), 'Did not find a tag.' 
); $this->assertFalse( $processor->next_tag(), 'Fount an illegal tag.' ); $this->assertEquals( - WP_XML_Tag_Processor::ERROR_SYNTAX, + WP_XML_Processor::ERROR_SYNTAX, $processor->get_last_error(), 'Did not run into a parse error after the root element' ); @@ -195,7 +1623,7 @@ public function test_elements_not_allowed_after_root_element() { * @return void */ public function test_comments_allowed_after_root_element() { - $processor = new WP_XML_Processor( '' ); + $processor = WP_XML_Processor::from_string( '' ); $this->assertTrue( $processor->next_tag(), 'Did not find a tag.' ); $this->assertFalse( $processor->next_tag(), 'Found an element node after the root element' ); $this->assertNull( $processor->get_last_error(), 'Ran into a parse error after the root element' ); @@ -208,11 +1636,11 @@ public function test_comments_allowed_after_root_element() { * @return void */ public function test_cdata_not_allowed_after_root_element() { - $processor = new WP_XML_Processor( '' ); + $processor = WP_XML_Processor::from_string( '' ); $this->assertTrue( $processor->next_tag(), 'Did not find a tag.' 
); $this->assertFalse( $processor->next_tag(), 'Did not reject a comment node after the root element' ); $this->assertEquals( - WP_XML_Tag_Processor::ERROR_SYNTAX, + WP_XML_Processor::ERROR_SYNTAX, $processor->get_last_error(), 'Did not run into a parse error after the root element' ); @@ -224,7 +1652,7 @@ public function test_cdata_not_allowed_after_root_element() { * @covers WP_XML_Processor::next_tag */ public function test_detects_invalid_document_no_root_tag() { - $processor = new WP_XML_Processor( + $processor = WP_XML_Processor::from_stream( ' ' ); @@ -238,7 +1666,7 @@ public function test_detects_invalid_document_no_root_tag() { * @covers WP_XML_Processor::next_tag */ public function test_unclosed_root_yields_incomplete_input() { - $processor = new WP_XML_Processor( + $processor = WP_XML_Processor::from_stream( ' @@ -249,4 +1677,4 @@ public function test_unclosed_root_yields_incomplete_input() { } $this->assertTrue( $processor->is_paused_at_incomplete_input(), 'Did not indicate that the XML input was incomplete.' 
); } -} +} \ No newline at end of file diff --git a/packages/playground/data-liberation/tests/WPXMLTagProcessorTests.php b/packages/playground/data-liberation/tests/WPXMLTagProcessorTests.php deleted file mode 100644 index c336371ec8..0000000000 --- a/packages/playground/data-liberation/tests/WPXMLTagProcessorTests.php +++ /dev/null @@ -1,1426 +0,0 @@ -Text'; - const XML_WITH_CLASSES = 'Text'; - const XML_MALFORMED = 'Back to notifications'; - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_tag - */ - public function test_get_tag_returns_null_before_finding_tags() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertNull( $processor->get_tag(), 'Calling get_tag() without selecting a tag did not return null' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_tag - */ - public function test_get_tag_returns_null_when_not_in_open_tag() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertFalse( $processor->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); - $this->assertNull( $processor->get_tag(), 'Accessing a non-existing tag did not return null' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_tag - */ - public function test_get_tag_returns_open_tag_name() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertTrue( $processor->next_tag( 'wp:content' ), 'Querying an existing tag did not return true' ); - $this->assertSame( 'wp:content', $processor->get_tag(), 'Accessing an existing tag name did not return "div"' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::is_empty_element - * - * @dataProvider data_is_empty_element - * - * @param string $xml Input XML whose first tag might contain the self-closing flag `/`. - * @param bool $flag_is_set Whether the input XML's first tag contains the self-closing flag. 
- */ - public function test_is_empty_element_matches_input_xml( $xml, $flag_is_set ) { - $processor = new WP_XML_Tag_Processor( $xml ); - $processor->next_tag( array( 'tag_closers' => 'visit' ) ); - - if ( $flag_is_set ) { - $this->assertTrue( $processor->is_empty_element(), 'Did not find the empty element tag when it was present.' ); - } else { - $this->assertFalse( $processor->is_empty_element(), 'Found the empty element tag when it was absent.' ); - } - } - - /** - * Data provider. XML tags which might have a self-closing flag, and an indicator if they do. - * - * @return array[] - */ - public static function data_is_empty_element() { - return array( - // These should not have a self-closer, and will leave an element un-closed if it's assumed they are self-closing. - 'Self-closing flag on non-void XML element' => array( '', true ), - 'No self-closing flag on non-void XML element' => array( '', false ), - // These should not have a self-closer, but are benign when used because the elements are void. - 'Self-closing flag on void XML element' => array( '', true ), - 'No self-closing flag on void XML element' => array( '', false ), - 'Self-closing flag on void XML element without spacing' => array( '', true ), - // These should not have a self-closer, but as part of a tag closer they are entirely ignored. - 'No self-closing flag on tag closer' => array( '', false ), - // These can and should have self-closers, and will leave an element un-closed if it's assumed they aren't self-closing. - 'Self-closing flag on a foreign element' => array( '', true ), - 'No self-closing flag on a foreign element' => array( '', false ), - // These involve syntax peculiarities. 
- 'Self-closing flag after extra spaces' => array( '', true ), - 'Self-closing flag after quoted attribute' => array( '', true ), - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_attribute_returns_null_when_not_in_open_tag() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertFalse( $processor->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); - $this->assertNull( $processor->get_attribute( 'wp:post-type' ), 'Accessing an attribute of a non-existing tag did not return null' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_attribute_returns_null_when_in_closing_tag() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertTrue( $processor->next_tag( 'wp:content' ), 'Querying an existing tag did not return true' ); - $this->assertTrue( $processor->next_tag( array( 'tag_closers' => 'visit' ) ), 'Querying an existing closing tag did not return true' ); - $this->assertNull( $processor->get_attribute( 'wp:post-type' ), 'Accessing an attribute of a closing tag did not return null' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_attribute_returns_null_when_attribute_missing() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertTrue( $processor->next_tag( 'wp:content' ), 'Querying an existing tag did not return true' ); - $this->assertNull( $processor->get_attribute( 'test-id' ), 'Accessing a non-existing attribute did not return null' ); - } - - /** - * @ticket 61365 - * - * @expectedIncorrectUsage WP_XML_Tag_Processor::base_class_next_token - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_attributes_are_rejected_in_tag_closers() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertTrue( $processor->next_tag( 'wp:content' ), 'Querying an existing tag did 
not return true' ); - $this->assertFalse( $processor->next_tag( array( 'tag_closers' => 'visit' ) ), 'Querying an existing but invalid closing tag did not return false.' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_attribute_returns_attribute_value() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertTrue( $processor->next_tag( 'wp:content' ), 'Querying an existing tag did not return true' ); - $this->assertSame( 'test', $processor->get_attribute( 'wp:post-type' ), 'Accessing a wp:post-type="test" attribute value did not return "test"' ); - } - - /** - * @ticket 61365 - * @expectedIncorrectUsage WP_XML_Tag_Processor::parse_next_attribute - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_parsing_stops_on_malformed_attribute_value_no_value() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertFalse( $processor->next_tag(), 'Querying a malformed start tag did not return false' ); - } - - /** - * @ticket 61365 - * @expectedIncorrectUsage WP_XML_Tag_Processor::parse_next_attribute - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_parsing_stops_on_malformed_attribute_value_no_quotes() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertFalse( $processor->next_tag(), 'Querying a malformed start tag did not return false' ); - } - - /** - * @ticket 61365 - * @expectedIncorrectUsage WP_XML_Tag_Processor::get_attribute - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_malformed_attribute_value_containing_ampersand_is_treated_as_plaintext() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertTrue( $processor->next_tag(), 'Querying a tag did not return true' ); - $this->assertEquals('WordPress & WordPress', $processor->get_attribute('enabled')); - } - - /** - * @ticket 61365 - * @expectedIncorrectUsage WP_XML_Tag_Processor::get_attribute - * - * 
@covers WP_XML_Tag_Processor::get_attribute - */ - public function test_malformed_attribute_value_containing_entity_without_semicolon_is_treated_as_plaintext() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertTrue( $processor->next_tag(), 'Querying a tag did not return true' ); - $this->assertEquals('”', $processor->get_attribute('enabled')); - } - - /** - * @ticket 61365 - * @expectedIncorrectUsage WP_XML_Tag_Processor::parse_next_attribute - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_parsing_stops_on_malformed_attribute_value_contains_lt_character() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertFalse( $processor->next_tag(), 'Querying a malformed start tag did not return false' ); - } - - /** - * @ticket 61365 - * @expectedIncorrectUsage WP_XML_Tag_Processor::parse_next_attribute - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_parsing_stops_on_malformed_tags_duplicate_attributes() { - $processor = new WP_XML_Tag_Processor( 'Text' ); - - $this->assertFalse( $processor->next_tag() ); - } - - /** - * @ticket 61365 - * @expectedIncorrectUsage WP_XML_Tag_Processor::parse_next_attribute - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_parsing_stops_on_malformed_attribute_name_contains_slash() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - - $this->assertFalse( $processor->next_tag(), 'Querying a malformed start tag did not return false' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_modifiable_text_returns_a_decoded_value() { - $processor = new WP_XML_Tag_Processor( '“😄”' ); - - $processor->next_tag( 'root' ); - $processor->next_token(); - - $this->assertEquals( - '“😄”', - $processor->get_modifiable_text(), - 'Reading an encoded text did not decode it.' 
- ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_attribute_returns_a_decoded_value() { - $processor = new WP_XML_Tag_Processor( '' ); - - $this->assertTrue( $processor->next_tag( 'root' ), 'Querying a tag did not return true' ); - $this->assertEquals( - '“😄”', - $processor->get_attribute( 'encoded-data' ), - 'Reading an encoded attribute did not decode it.' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - * - * @param string $attribute_name Name of data-enabled attribute with case variations. - */ - public function test_get_attribute_is_case_sensitive() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $processor->next_tag(); - - $this->assertEquals( - 'true', - $processor->get_attribute( 'DATA-enabled' ), - 'Accessing an attribute by a same-cased name did return not its value' - ); - - $this->assertNull( - $processor->get_attribute( 'data-enabled' ), - 'Accessing an attribute by a differently-cased name did return its value' - ); - } - - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::remove_attribute - */ - public function test_remove_attribute_is_case_sensitive() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $processor->next_tag(); - $processor->remove_attribute( 'data-enabled' ); - - $this->assertSame( 'Test', $processor->get_updated_xml(), 'A case-sensitive remove_attribute call did remove the attribute' ); - - $processor->remove_attribute( 'DATA-enabled' ); - - $this->assertSame( 'Test', $processor->get_updated_xml(), 'A case-sensitive remove_attribute call did not remove the attribute' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::set_attribute - */ - public function test_set_attribute_is_case_sensitive() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $processor->next_tag(); - $processor->set_attribute( 'data-enabled', 'abc' ); - - $this->assertSame( 'Test', $processor->get_updated_xml(), 
'A case-insensitive set_attribute call did not update the existing attribute' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute_names_with_prefix - */ - public function test_get_attribute_names_with_prefix_returns_null_before_finding_tags() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $this->assertNull( - $processor->get_attribute_names_with_prefix( 'data-' ), - 'Accessing attributes by their prefix did not return null when no tag was selected' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute_names_with_prefix - */ - public function test_get_attribute_names_with_prefix_returns_null_when_not_in_open_tag() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $processor->next_tag( 'p' ); - $this->assertNull( $processor->get_attribute_names_with_prefix( 'data-' ), 'Accessing attributes of a non-existing tag did not return null' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute_names_with_prefix - */ - public function test_get_attribute_names_with_prefix_returns_null_when_in_closing_tag() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $processor->next_tag( 'wp:content' ); - $processor->next_tag( array( 'tag_closers' => 'visit' ) ); - - $this->assertNull( $processor->get_attribute_names_with_prefix( 'data-' ), 'Accessing attributes of a closing tag did not return null' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute_names_with_prefix - */ - public function test_get_attribute_names_with_prefix_returns_empty_array_when_no_attributes_present() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $processor->next_tag( 'wp:content' ); - - $this->assertSame( array(), $processor->get_attribute_names_with_prefix( 'data-' ), 'Accessing the attributes on a tag without any did not return an empty array' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute_names_with_prefix - */ - public 
function test_get_attribute_names_with_prefix_returns_matching_attribute_names_in_original_case() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $processor->next_tag(); - - $this->assertSame( - array( 'data-test-ID' ), - $processor->get_attribute_names_with_prefix( 'data-' ), - 'Accessing attributes by their prefix did not return their lowercase names' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute_names_with_prefix - */ - public function test_get_attribute_names_with_prefix_returns_attribute_added_by_set_attribute() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $processor->next_tag(); - $processor->set_attribute( 'data-test-id', '14' ); - - $this->assertSame( - 'Test', - $processor->get_updated_xml(), - "Updated XML doesn't include attribute added via set_attribute" - ); - $this->assertSame( - array( 'data-test-id', 'data-foo' ), - $processor->get_attribute_names_with_prefix( 'data-' ), - "Accessing attribute names doesn't find attribute added via set_attribute" - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::__toString - */ - public function test_to_string_returns_updated_xml() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $processor->next_tag(); - $processor->remove_attribute( 'id' ); - - $processor->next_tag(); - $processor->set_attribute( 'id', 'wp:content-id-1' ); - - $this->assertSame( - $processor->get_updated_xml(), - (string) $processor, - 'get_updated_xml() returned a different value than __toString()' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_updated_xml - */ - public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_processor_on_the_current_tag() { - $processor = new WP_XML_Tag_Processor( 'Test' ); - $processor->next_tag(); - $processor->remove_attribute( 'id' ); - - $processor->next_tag(); - $processor->set_attribute( 'id', 'wp:content-id-1' ); - - $this->assertSame( - 'Test', - 
$processor->get_updated_xml(), - 'Calling get_updated_xml after updating the attributes of the second tag returned different XML than expected' - ); - - $processor->set_attribute( 'id', 'wp:content-id-2' ); - - $this->assertSame( - 'Test', - $processor->get_updated_xml(), - 'Calling get_updated_xml after updating the attributes of the second tag for the second time returned different XML than expected' - ); - - $processor->next_tag(); - $processor->remove_attribute( 'id' ); - - $this->assertSame( - 'Test', - $processor->get_updated_xml(), - 'Calling get_updated_xml after removing the id attribute of the third tag returned different XML than expected' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_updated_xml - */ - public function test_get_updated_xml_without_updating_any_attributes_returns_the_original_xml() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - - $this->assertSame( - self::XML_SIMPLE, - $processor->get_updated_xml(), - 'Casting WP_XML_Tag_Processor to a string without performing any updates did not return the initial XML snippet' - ); - } - - /** - * Ensures that when seeking to an earlier spot in the document that - * all previously-enqueued updates are applied as they ought to be. - * - * @ticket 61365 - * @expectedIncorrectUsage WP_XML_Tag_Processor::parse_next_attribute - */ - public function test_get_updated_xml_applies_updates_to_content_after_seeking_to_before_parsed_bytes() { - $processor = new WP_XML_Tag_Processor( '' ); - - $processor->next_tag(); - $processor->set_attribute( 'wonky', 'true' ); - $processor->next_tag(); - $processor->set_bookmark( 'here' ); - - $processor->next_tag( array( 'tag_closers' => 'visit' ) ); - $processor->seek( 'here' ); - - $this->assertSame( '', $processor->get_updated_xml() ); - } - - public function test_declare_element_as_pcdata() { - $text = ' - This text contains syntax that may seem - like XML nodes: - - - - - - - &<>"' - - But! It is all treated as text. 
- '; - $processor = new WP_XML_Tag_Processor( - "$text" - ); - $processor->declare_element_as_pcdata( 'my-pcdata' ); - $processor->next_tag( 'my-pcdata' ); - - $this->assertEquals( - $text, - $processor->get_modifiable_text(), - 'get_modifiable_text() did not return the expected text' - ); - } - - /** - * Ensures that bookmarks start and length correctly describe a given token in XML. - * - * @ticket 61365 - * - * @dataProvider data_xml_nth_token_substring - * - * @param string $xml Input XML. - * @param int $match_nth_token Which token to inspect from input XML. - * @param string $expected_match Expected full raw token bookmark should capture. - */ - public function test_token_bookmark_span( string $xml, int $match_nth_token, string $expected_match ) { - $processor = new class( $xml ) extends WP_XML_Tag_Processor { - /** - * Returns the raw span of XML for the currently-matched - * token, or null if not paused on any token. - * - * @return string|null Raw XML content of currently-matched token, - * otherwise `null` if not matched. - */ - public function get_raw_token() { - if ( - WP_XML_Tag_Processor::STATE_READY === $this->parser_state || - WP_XML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || - WP_XML_Tag_Processor::STATE_COMPLETE === $this->parser_state - ) { - return null; - } - - $this->set_bookmark( 'mark' ); - $mark = $this->bookmarks['mark']; - - return substr( $this->xml, $mark->start, $mark->length ); - } - }; - - for ( $i = 0; $i < $match_nth_token; $i++ ) { - $processor->next_token(); - } - - $raw_token = $processor->get_raw_token(); - $this->assertIsString( - $raw_token, - "Failed to find raw token at position {$match_nth_token}: check test data provider." - ); - - $this->assertSame( - $expected_match, - $raw_token, - 'Bookmarked wrong span of text for full matched token.' - ); - } - - /** - * Data provider. - * - * @return array - */ - public static function data_xml_nth_token_substring() { - return array( - // Tags. 
- 'DIV start tag' => array( '', 1, '' ), - 'DIV start tag with attributes' => array( '', 1, '' ), - 'Nested DIV' => array( '', 2, '' ), - 'Sibling DIV' => array( '', 3, '' ), - 'DIV after text' => array( 'text ', 2, '' ), - 'DIV before text' => array( ' text', 1, '' ), - 'DIV after comment' => array( '', 3, '' ), - 'DIV before comment' => array( ' ', 1, '' ), - 'Start "self-closing" tag' => array( '', 1, '' ), - 'Void tag' => array( '', 1, '' ), - 'Void tag w/self-closing flag' => array( '', 1, '' ), - 'Void tag inside DIV' => array( '', 2, '' ), - - // Text. - 'Text' => array( 'Just text', 1, 'Just text' ), - 'Text in DIV' => array( 'Text', 2, 'Text' ), - 'Text before DIV' => array( 'Text', 1, 'Text' ), - 'Text after comment' => array( 'Text', 2, 'Text' ), - 'Text before comment' => array( 'Text ', 1, 'Text' ), - - // Comments. - 'Comment' => array( '', 1, '' ), - 'Comment in DIV' => array( '', 2, '' ), - 'Comment before DIV' => array( '', 1, '' ), - 'Comment after DIV' => array( '', 3, '' ), - 'Comment after comment' => array( '', 2, '' ), - 'Comment before comment' => array( ' ', 1, '' ), - 'Empty comment' => array( '', 1, '' ), - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::next_tag - */ - public function test_next_tag_with_no_arguments_should_find_the_next_existing_tag() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - - $this->assertTrue( $processor->next_tag(), 'Querying an existing tag did not return true' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::next_tag - */ - public function test_next_tag_should_return_false_for_a_non_existing_tag() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - - $this->assertFalse( $processor->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_modifiable_text - */ - public function test_normalizes_carriage_returns_in_text_nodes() { - $processor = new 
WP_XML_Tag_Processor( - "We are\rnormalizing\r\n\nthe\n\r\r\r\ncarriage returns" - ); - $processor->next_tag(); - $processor->next_token(); - $this->assertEquals( - "We are\nnormalizing\n\nthe\n\n\n\ncarriage returns", - $processor->get_modifiable_text(), - 'get_raw_token() did not normalize the carriage return characters' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_modifiable_text - */ - public function test_normalizes_carriage_returns_in_cdata() { - $processor = new WP_XML_Tag_Processor( - "" - ); - $processor->next_tag(); - $processor->next_token(); - $this->assertEquals( - "We are\nnormalizing\n\nthe\n\n\n\ncarriage returns", - $processor->get_modifiable_text(), - 'get_raw_token() did not normalize the carriage return characters' - ); - } - - /** - * @ticket 61365 - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::next_tag - * @covers WP_XML_Tag_Processor::is_tag_closer - */ - public function test_next_tag_should_stop_on_closers_only_when_requested() { - $processor = new WP_XML_Tag_Processor( '' ); - - $this->assertTrue( $processor->next_tag( array( 'tag_name' => 'wp:content' ) ), 'Did not find desired tag opener' ); - $this->assertFalse( $processor->next_tag( array( 'tag_name' => 'wp:content' ) ), 'Visited an unwanted tag, a tag closer' ); - - $processor = new WP_XML_Tag_Processor( '' ); - $processor->next_tag( - array( - 'tag_name' => 'wp:content', - 'tag_closers' => 'visit', - ) - ); - - $this->assertFalse( $processor->is_tag_closer(), 'Indicated a tag opener is a tag closer' ); - $this->assertTrue( - $processor->next_tag( - array( - 'tag_name' => 'wp:content', - 'tag_closers' => 'visit', - ) - ), - 'Did not stop at desired tag closer' - ); - $this->assertTrue( $processor->is_tag_closer(), 'Indicated a tag closer is a tag opener' ); - - $processor = new WP_XML_Tag_Processor( '' ); - $this->assertTrue( $processor->next_tag( array( 'tag_closers' => 'visit' ) ), "Did not find a tag opener when tag_closers was set to 
'visit'" ); - $this->assertFalse( $processor->next_tag( array( 'tag_closers' => 'visit' ) ), "Found a closer where there wasn't one" ); - } - - /** - * Verifies that updates to a document before calls to `get_updated_xml()` don't - * lead to the Tag Processor jumping to the wrong tag after the updates. - * - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_updated_xml - */ - public function test_internal_pointer_returns_to_original_spot_after_inserting_content_before_cursor() { - $tags = new WP_XML_Tag_Processor( 'outside
inside
' ); - - $tags->next_tag(); - $tags->next_tag(); - $tags->set_attribute( 'wp:post-type', 'foo' ); - $tags->next_tag( 'section' ); - - // Return to this spot after moving ahead. - $tags->set_bookmark( 'here' ); - - // Move ahead. - $tags->next_tag( 'photo' ); - $tags->seek( 'here' ); - $this->assertSame( 'outside
inside
', $tags->get_updated_xml() ); - $this->assertSame( 'section', $tags->get_tag() ); - $this->assertFalse( $tags->is_tag_closer() ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::set_attribute - */ - public function test_set_attribute_on_a_non_existing_tag_does_not_change_the_markup() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - - $this->assertFalse( $processor->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); - $this->assertFalse( $processor->next_tag( 'wp:content' ), 'Querying a non-existing tag did not return false' ); - - $processor->set_attribute( 'id', 'primary' ); - - $this->assertSame( - self::XML_SIMPLE, - $processor->get_updated_xml(), - 'Calling get_updated_xml after updating a non-existing tag returned an XML that was different from the original XML' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::set_attribute - * @covers WP_XML_Tag_Processor::remove_attribute - * @covers WP_XML_Tag_Processor::add_class - * @covers WP_XML_Tag_Processor::remove_class - */ - public function test_attribute_ops_on_tag_closer_do_not_change_the_markup() { - $processor = new WP_XML_Tag_Processor( '' ); - $processor->next_tag( - array( - 'tag_name' => 'wp:content', - 'tag_closers' => 'visit', - ) - ); - - $this->assertFalse( $processor->is_tag_closer(), 'Skipped tag opener' ); - - $processor->next_tag( - array( - 'tag_name' => 'wp:content', - 'tag_closers' => 'visit', - ) - ); - - $this->assertTrue( $processor->is_tag_closer(), 'Skipped tag closer' ); - $this->assertFalse( $processor->set_attribute( 'id', 'test' ), "Allowed setting an attribute on a tag closer when it shouldn't have" ); - $this->assertFalse( $processor->remove_attribute( 'invalid-id' ), "Allowed removing an attribute on a tag closer when it shouldn't have" ); - $this->assertSame( - '', - $processor->get_updated_xml(), - 'Calling get_updated_xml after updating a non-existing tag returned an XML that was different from the 
original XML' - ); - } - - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::set_attribute - */ - public function test_set_attribute_with_a_non_existing_attribute_adds_a_new_attribute_to_the_markup() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - $processor->next_tag(); - $processor->set_attribute( 'test-attribute', 'test-value' ); - - $this->assertSame( - 'Text', - $processor->get_updated_xml(), - 'Updated XML does not include attribute added via set_attribute()' - ); - $this->assertSame( - 'test-value', - $processor->get_attribute( 'test-attribute' ), - 'get_attribute() (called after get_updated_xml()) did not return attribute added via set_attribute()' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_attribute_returns_updated_values_before_they_are_applied() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - $processor->next_tag(); - $processor->set_attribute( 'test-attribute', 'test-value' ); - - $this->assertSame( - 'test-value', - $processor->get_attribute( 'test-attribute' ), - 'get_attribute() (called before get_updated_xml()) did not return attribute added via set_attribute()' - ); - $this->assertSame( - 'Text', - $processor->get_updated_xml(), - 'Updated XML does not include attribute added via set_attribute()' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_attribute_returns_updated_values_before_they_are_applied_with_different_name_casing() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - $processor->next_tag(); - $processor->set_attribute( 'test-ATTribute', 'test-value' ); - - $this->assertSame( - 'test-value', - $processor->get_attribute( 'test-ATTribute' ), - 'get_attribute() (called before get_updated_xml()) did not return attribute added via set_attribute()' - ); - $this->assertSame( - 'Text', - $processor->get_updated_xml(), - 'Updated XML does not 
include attribute added via set_attribute()' - ); - } - - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_attribute_reflects_removed_attribute_before_it_is_applied() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - $processor->next_tag(); - $processor->remove_attribute( 'id' ); - - $this->assertNull( - $processor->get_attribute( 'id' ), - 'get_attribute() (called before get_updated_xml()) returned attribute that was removed by remove_attribute()' - ); - $this->assertSame( - 'Text', - $processor->get_updated_xml(), - 'Updated XML includes attribute that was removed by remove_attribute()' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_attribute_reflects_adding_and_then_removing_an_attribute_before_those_updates_are_applied() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - $processor->next_tag(); - $processor->set_attribute( 'test-attribute', 'test-value' ); - $processor->remove_attribute( 'test-attribute' ); - - $this->assertNull( - $processor->get_attribute( 'test-attribute' ), - 'get_attribute() (called before get_updated_xml()) returned attribute that was added via set_attribute() and then removed by remove_attribute()' - ); - $this->assertSame( - self::XML_SIMPLE, - $processor->get_updated_xml(), - 'Updated XML includes attribute that was added via set_attribute() and then removed by remove_attribute()' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::get_attribute - */ - public function test_get_attribute_reflects_setting_and_then_removing_an_existing_attribute_before_those_updates_are_applied() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - $processor->next_tag(); - $processor->set_attribute( 'id', 'test-value' ); - $processor->remove_attribute( 'id' ); - - $this->assertNull( - $processor->get_attribute( 'id' ), - 'get_attribute() (called before 
get_updated_xml()) returned attribute that was overwritten by set_attribute() and then removed by remove_attribute()' - ); - $this->assertSame( - 'Text', - $processor->get_updated_xml(), - 'Updated XML includes attribute that was overwritten by set_attribute() and then removed by remove_attribute()' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::set_attribute - */ - public function test_set_attribute_with_an_existing_attribute_name_updates_its_value_in_the_markup() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - $processor->next_tag(); - $processor->set_attribute( 'id', 'new-id' ); - $this->assertSame( - 'Text', - $processor->get_updated_xml(), - 'Existing attribute was not updated' - ); - } - - /** - * Ensures that when setting an attribute multiple times that only - * one update flushes out into the updated XML. - * - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::set_attribute - */ - public function test_set_attribute_with_case_variants_updates_only_the_original_first_copy() { - $processor = new WP_XML_Tag_Processor( '' ); - $processor->next_tag(); - $processor->set_attribute( 'data-enabled', 'canary1' ); - $processor->set_attribute( 'data-enabled', 'canary2' ); - $processor->set_attribute( 'data-enabled', 'canary3' ); - - $this->assertSame( '', strtolower( $processor->get_updated_xml() ) ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::next_tag - * @covers WP_XML_Tag_Processor::set_attribute - */ - public function test_next_tag_and_set_attribute_in_a_loop_update_all_tags_in_the_markup() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - while ( $processor->next_tag() ) { - $processor->set_attribute( 'data-foo', 'bar' ); - } - - $this->assertSame( - 'Text', - $processor->get_updated_xml(), - 'Not all tags were updated when looping with next_tag() and set_attribute()' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::remove_attribute - */ - public function 
test_remove_attribute_with_an_existing_attribute_name_removes_it_from_the_markup() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - $processor->next_tag(); - $processor->remove_attribute( 'id' ); - - $this->assertSame( - 'Text', - $processor->get_updated_xml(), - 'Attribute was not removed' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::remove_attribute - */ - public function test_remove_attribute_with_a_non_existing_attribute_name_does_not_change_the_markup() { - $processor = new WP_XML_Tag_Processor( self::XML_SIMPLE ); - $processor->next_tag(); - $processor->remove_attribute( 'no-such-attribute' ); - - $this->assertSame( - self::XML_SIMPLE, - $processor->get_updated_xml(), - 'Content was changed when attempting to remove an attribute that did not exist' - ); - } - - /** - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::next_tag - */ - public function test_correctly_parses_xml_attributes_wrapped_in_single_quotation_marks() { - $processor = new WP_XML_Tag_Processor( - 'Text' - ); - $processor->next_tag( - array( - 'tag_name' => 'wp:content', - 'id' => 'first', - ) - ); - $processor->remove_attribute( 'id' ); - $processor->next_tag( - array( - 'tag_name' => 'wp:text', - 'id' => 'second', - ) - ); - $processor->set_attribute( 'id', 'single-quote' ); - $this->assertSame( - 'Text', - $processor->get_updated_xml(), - 'Did not remove single-quoted attribute' - ); - } - - /** - * @ticket 61365 - * @expectedIncorrectUsage WP_XML_Tag_Processor::parse_next_attribute - * @expectedIncorrectUsage WP_XML_Tag_Processor::set_attribute - * - * @covers WP_XML_Tag_Processor::set_attribute - */ - public function test_setting_an_attribute_to_false_is_rejected() { - $processor = new WP_XML_Tag_Processor( - '
' - ); - $processor->next_tag( 'input' ); - $this->assertFalse( - $processor->set_attribute( 'checked', false ), - 'Accepted a boolean attribute name.' - ); - } - - /** - * @ticket 61365 - * @expectedIncorrectUsage WP_XML_Tag_Processor::set_attribute - * - * @covers WP_XML_Tag_Processor::set_attribute - */ - public function test_setting_a_missing_attribute_to_false_does_not_change_the_markup() { - $xml_input = '
'; - $processor = new WP_XML_Tag_Processor( $xml_input ); - $processor->next_tag( 'input' ); - $processor->set_attribute( 'checked', false ); - $this->assertSame( - $xml_input, - $processor->get_updated_xml(), - 'Changed the markup unexpectedly when setting a non-existing attribute to false' - ); - } - - /** - * Ensures that unclosed and invalid comments trigger warnings or errors. - * - * @ticket 61365 - * - * @covers WP_XML_Tag_Processor::next_tag - * @covers WP_XML_Tag_Processor::paused_at_incomplete_token - * - * @dataProvider data_xml_with_unclosed_comments - * - * @param string $xml_ending_before_comment_close XML with opened comments that aren't closed. - */ - public function test_documents_may_end_with_unclosed_comment( $xml_ending_before_comment_close ) { - $processor = new WP_XML_Tag_Processor( $xml_ending_before_comment_close ); - - $this->assertFalse( - $processor->next_tag(), - "Should not have found any tag, but found {$processor->get_tag()}." - ); - - $this->assertTrue( - $processor->is_paused_at_incomplete_input(), - "Should have indicated that the parser found an incomplete token but didn't." - ); - } - - /** - * Data provider. - * - * @return array[] - */ - public static function data_xml_with_unclosed_comments() { - return array( - 'Shortest open valid comment' => array( '' ); - $this->assertFalse( $processor->next_token(), 'Did not reject a malformed XML comment.' ); - } - - /** - * @covers WP_XML_Tag_Processor::next_tag - */ - public function test_handles_malformed_taglike_open_short_xml() { - $processor = new WP_XML_Tag_Processor( '<' ); - $result = $processor->next_tag(); - $this->assertFalse( $result, 'Did not handle "<" xml properly.' 
); - } - - /** - * @covers WP_XML_Tag_Processor::next_tag - */ - public function test_handles_malformed_taglike_close_short_xml() { - $processor = new WP_XML_Tag_Processor( 'next_tag(); - $this->assertFalse( $result, 'Did not handle " ' ); - $result = $processor->next_tag(); - $this->assertFalse( $result, 'Did not handle "
" xml properly.' ); - } - - /** - * Ensures that non-tag syntax starting with `<` is rejected. - * - * @ticket 61365 - */ - public function test_single_text_node_with_taglike_text() { - $processor = new WP_XML_Tag_Processor( 'This is a text node< /A>' ); - $this->assertTrue( $processor->next_token(), 'A valid text node was not found.' ); - $this->assertEquals( 'This is a text node', $processor->get_modifiable_text(), 'The contents of a valid text node were not correctly captured.' ); - $this->assertFalse( $processor->next_tag(), 'A malformed XML markup was not rejected.' ); - } - - /** - * Ensures that non-tag syntax starting with `<` is rejected. - * - * @ticket 61365 - */ - public function test_parses_CDATA() { - $processor = new WP_XML_Tag_Processor( '' ); - $processor->next_tag(); - $this->assertTrue( $processor->next_token(), 'The first text node was not found.' ); $this->assertEquals( - 'This is a CDATA text node.', - $processor->get_modifiable_text(), - 'The contents of a a CDATA text node were not correctly captured.' - ); - } - - /** - * @ticket 61365 - */ - public function test_yields_CDATA_a_separate_text_node() { - $processor = new WP_XML_Tag_Processor( 'This is the first text node and this is the third text node.' ); - - $processor->next_token(); - $this->assertTrue( $processor->next_token(), 'The first text node was not found.' ); - $this->assertEquals( - 'This is the first text node ', - $processor->get_modifiable_text(), - 'The contents of a valid text node were not correctly captured.' - ); - - $this->assertTrue( $processor->next_token(), 'The CDATA text node was not found.' ); - $this->assertEquals( - ' and this is a second text node ', - $processor->get_modifiable_text(), - 'The contents of a a CDATA text node were not correctly captured.' - ); - - $this->assertTrue( $processor->next_token(), 'The text node was not found.' 
); - $this->assertEquals( - ' and this is the third text node.', - $processor->get_modifiable_text(), - 'The contents of a valid text node were not correctly captured.' - ); - } - - /** - * - * @ticket 61365 - */ - public function test_xml_declaration() { - $processor = new WP_XML_Tag_Processor( '' ); - $this->assertTrue( $processor->next_token(), 'The XML declaration was not found.' ); - $this->assertEquals( - '#xml-declaration', - $processor->get_token_type(), - 'The XML declaration was not correctly identified.' - ); - $this->assertEquals( '1.0', $processor->get_attribute( 'version' ), 'The version attribute was not correctly captured.' ); - $this->assertEquals( 'UTF-8', $processor->get_attribute( 'encoding' ), 'The encoding attribute was not correctly captured.' ); - } - - /** - * - * @ticket 61365 - */ - public function test_xml_declaration_with_single_quotes() { - $processor = new WP_XML_Tag_Processor( "" ); - $this->assertTrue( $processor->next_token(), 'The XML declaration was not found.' ); - $this->assertEquals( - '#xml-declaration', - $processor->get_token_type(), - 'The XML declaration was not correctly identified.' - ); - $this->assertEquals( '1.0', $processor->get_attribute( 'version' ), 'The version attribute was not correctly captured.' ); - $this->assertEquals( 'UTF-8', $processor->get_attribute( 'encoding' ), 'The encoding attribute was not correctly captured.' ); - } - - /** - * - * @ticket 61365 - */ - public function test_processor_instructions() { - $processor = new WP_XML_Tag_Processor( - // The first ' . - // The second ' - ); - $this->assertTrue( $processor->next_token(), 'The XML declaration was not found.' ); - $this->assertTrue( $processor->next_token(), 'The processing instruction was not found.' ); - $this->assertEquals( - '#processing-instructions', - $processor->get_token_type(), - 'The processing instruction was not correctly identified.' 
- ); - $this->assertEquals( ' stylesheet type="text/xsl" href="style.xsl" ', $processor->get_modifiable_text(), 'The modifiable text was not correctly captured.' ); - } - - /** - * Ensures that updates which are enqueued in front of the cursor - * are applied before moving forward in the document. - * - * @ticket 61365 - */ - public function test_applies_updates_before_proceeding() { - $xml = ''; - - $subclass = new class( $xml ) extends WP_XML_Tag_Processor { - /** - * Inserts raw text after the current token. - * - * @param string $new_xml Raw text to insert. - */ - public function insert_after( $new_xml ) { - $this->set_bookmark( 'here' ); - $this->lexical_updates[] = new WP_HTML_Text_Replacement( - $this->bookmarks['here']->start + $this->bookmarks['here']->length, - 0, - $new_xml - ); - } - }; - - $subclass->next_tag( 'photo' ); - $subclass->insert_after( '

snow-capped

' ); - - $subclass->next_tag(); - $this->assertSame( - 'p', - $subclass->get_tag(), - 'Should have matched inserted XML as next tag.' - ); - - $subclass->next_tag( 'photo' ); - $subclass->set_attribute( 'alt', 'mountain' ); - - $this->assertSame( - '

snow-capped

', - $subclass->get_updated_xml(), - 'Should have properly applied the update from in front of the cursor.' - ); - } -} \ No newline at end of file diff --git a/packages/playground/data-liberation/tests/fixtures/wxr-simple.xml b/packages/playground/data-liberation/tests/fixtures/wxr-simple.xml index f7c4b13b07..2edf266984 100644 --- a/packages/playground/data-liberation/tests/fixtures/wxr-simple.xml +++ b/packages/playground/data-liberation/tests/fixtures/wxr-simple.xml @@ -92,5 +92,4 @@ https://playground.internal/path-not-taken was the second best choice. - - \ No newline at end of file + \ No newline at end of file