diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 7d04fd31d80d2..c619806525732 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2889,7 +2889,9 @@ public function get_modifiable_text(): string { return ''; } - $text = substr( $this->html, $this->text_starts_at, $this->text_length ); + $text = isset( $this->lexical_updates['modifiable text'] ) + ? $this->lexical_updates['modifiable text']->text + : substr( $this->html, $this->text_starts_at, $this->text_length ); /* * Pre-processing the input stream would normally happen before @@ -2956,6 +2958,157 @@ public function get_modifiable_text(): string { : str_replace( "\x00", "\u{FFFD}", $decoded ); } + /** + * Sets the modifiable text for the matched token, if matched. + * + * Modifiable text is text content that may be read and changed without + * changing the HTML structure of the document around it. This includes + * the contents of `#text` nodes in the HTML as well as the inner + * contents of HTML comments, Processing Instructions, and others, even + * though these nodes aren't part of a parsed DOM tree. It also includes + * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any + * other section in an HTML document which cannot contain HTML markup (DATA). + * + * Not all modifiable text may be set by this method, and not all content + * may be set as modifiable text. In the case that this fails it will return + * `false` indicating as much. For instance, it will not allow inserting the + * string `</script` into a SCRIPT element, nor text which would close an + * HTML comment early. + * + * Example: + * + * // Add a preface to all STYLE contents. + * while ( $processor->next_tag( 'STYLE' ) ) { + * $style = $processor->get_modifiable_text(); + * $processor->set_modifiable_text( "// Made with love on the World Wide Web\n{$style}" ); + * } + * + * // Replace smiley text with Emoji smilies. 
+ * while ( $processor->next_token() ) { + * if ( '#text' !== $processor->get_token_name() ) { + * continue; + * } + * + * $chunk = $processor->get_modifiable_text(); + * if ( ! str_contains( $chunk, ':)' ) ) { + * continue; + * } + * + * $processor->set_modifiable_text( str_replace( ':)', '🙂', $chunk ) ); + * } + * + * @since 6.7.0 + * + * @param string $plaintext_content New text content to represent in the matched token. + * + * @return bool Whether the modifiable text was updated. + */ + public function set_modifiable_text( string $plaintext_content ): bool { + if ( self::STATE_TEXT_NODE === $this->parser_state ) { + $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement( + $this->text_starts_at, + $this->text_length, + htmlspecialchars( $plaintext_content, ENT_QUOTES | ENT_HTML5 ) + ); + + return true; + } + + // Comment data is not encoded. + if ( + self::STATE_COMMENT === $this->parser_state && + self::COMMENT_AS_HTML_COMMENT === $this->comment_type + ) { + // Check if the text could close the comment. + if ( 1 === preg_match( '/--!?>/', $plaintext_content ) ) { + return false; + } + + $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement( + $this->text_starts_at, + $this->text_length, + $plaintext_content + ); + + return true; + } + + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { + return false; + } + + switch ( $this->get_tag() ) { + case 'SCRIPT': + /* + * This is over-protective, but ensures the update doesn't break + * out of the SCRIPT element. A more thorough check would need to + * ensure that the script closing tag doesn't exist, and isn't + * also "hidden" inside the script double-escaped state. 
+ * + * NOTE(review): this comment is never closed and no SCRIPT guard is visible + * before the update below; the text looks truncated - confirm against upstream. + * + * It may seem like replacing `lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement( + $this->text_starts_at, + $this->text_length, + $plaintext_content + ); + + return true; + + case 'STYLE': + $plaintext_content = preg_replace_callback( + '~style)~i', + static function ( $tag_match ) { + return "\\3c\\2f{$tag_match['TAG_NAME']}"; + }, + $plaintext_content + ); + + $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement( + $this->text_starts_at, + $this->text_length, + $plaintext_content + ); + + return true; + + case 'TEXTAREA': + case 'TITLE': + $plaintext_content = preg_replace_callback( + "~{$this->get_tag()})~i", + static function ( $tag_match ) { + return "</{$tag_match['TAG_NAME']}"; + }, + $plaintext_content + ); + + /* + * These don't _need_ to be escaped, but since they are decoded it's + * safe to leave them escaped and this can prevent other code from + * naively detecting tags within the contents. + * + * @todo It would be useful to prefix a multiline replacement text + * with a newline, but not necessary. This is for aesthetics. + */ + $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement( + $this->text_starts_at, + $this->text_length, + $plaintext_content + ); + + return true; + } + + return false; + } + /** * Updates or creates a new attribute on the currently matched tag with the passed value. * diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php index 2c8c07e410b74..717d061016a2d 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php @@ -39,6 +39,90 @@ public function test_get_modifiable_text_is_idempotent() { } } + /** + * Ensures that updates to modifiable text that are shorter than the + * original text do not cause the parser to lose its orientation. 
+ * + * @ticket 61617 + */ + public function test_setting_shorter_modifiable_text() { + $processor = new WP_HTML_Tag_Processor( '
' ); + + // Find the test node in the middle. + while ( 'TEXTAREA' !== $processor->get_token_name() && $processor->next_token() ) { + continue; + } + + $this->assertSame( + 'TEXTAREA', + $processor->get_token_name(), + 'Failed to find the test TEXTAREA node; check the test setup.' + ); + + $processor->set_modifiable_text( 'short' ); + // Flush the enqueued update; the returned HTML itself is not needed here. + $processor->get_updated_html(); + $this->assertSame( + 'short', + $processor->get_modifiable_text(), + 'Should have updated modifiable text to something shorter than the original.' + ); + + $this->assertTrue( + $processor->next_token(), + 'Should have advanced to the last token in the input.' + ); + + $this->assertSame( + 'DIV', + $processor->get_token_name(), + 'Should have recognized the final DIV in the input.' + ); + + $this->assertSame( + 'not a ', + $processor->get_attribute( 'id' ), + 'Should have read in the id from the last DIV as "not a "' + ); + } + + /** + * Ensures that reading modifiable text after setting it returns the updated, + * enqueued value and not the original value. + * + * @ticket 61617 + */ + public function test_modifiable_text_reads_updates_after_setting() { + $processor = new WP_HTML_Tag_Processor( 'This is text' ); + + $processor->next_token(); + $this->assertSame( + '#text', + $processor->get_token_name(), + 'Failed to find first text node: check test setup.' + ); + + $update = 'This is new text'; + $processor->set_modifiable_text( $update ); + $this->assertSame( + $update, + $processor->get_modifiable_text(), + 'Failed to read updated enqueued value of text node.' + ); + + $processor->next_token(); + $this->assertSame( + '#comment', + $processor->get_token_name(), + 'Failed to advance to comment: check test setup.' + ); + + $this->assertSame( + ' this is not ', + $processor->get_modifiable_text(), + 'Failed to read modifiable text for next token; did it read the old enqueued value from the previous token?' 
+ ); + } + /** * Ensures that when ignoring a newline after LISTING and PRE tags, that this * happens appropriately after seeking. @@ -108,4 +192,155 @@ public function test_get_modifiable_text_ignores_newlines_after_seeking() { 'Should not have removed the leading newline from the last DIV on its second traversal.' ); } + + /** + * Ensures that modifiable text updates are not applied where they aren't supported. + * + * @ticket 61617 + * + * @dataProvider data_tokens_not_supporting_modifiable_text_updates + * + * @param string $html Contains HTML with a token not supporting modifiable text updates. + * @param int $advance_n_tokens Count of times to run `next_token()` before reaching target node. + */ + public function test_rejects_updates_on_unsupported_match_locations( string $html, int $advance_n_tokens ) { + $processor = new WP_HTML_Tag_Processor( $html ); + while ( --$advance_n_tokens >= 0 ) { + $processor->next_token(); + } + + $this->assertFalse( + $processor->set_modifiable_text( 'Bazinga!' ), + 'Should have prevented modifying the text at the target node.' + ); + + $this->assertSame( + $html, + $processor->get_updated_html(), + 'Should not have modified the input document in any way.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_tokens_not_supporting_modifiable_text_updates() { + return array( + 'Before parsing' => array( 'nothing to see here', 0 ), + 'After parsing' => array( 'nothing here either', 2 ), + 'Incomplete document' => array( ' array( 'Text', 1, 'Blubber', 'Blubber' ), + 'Text node (middle)' => array( 'Bold move', 2, 'yo', 'yo' ), + 'Text node (end)' => array( 'of a dog', 2, 'of a cat', 'of a cat' ), + 'Encoded text node' => array( '
birds and dogs
', 2, ' & ', '
<birds> & <dogs>
' ), + 'SCRIPT tag' => array( 'beforeafter', 2, 'const img = " &
";', 'beforeafter' ), + 'STYLE tag' => array( '', 1, 'p::before { content: " & "; }', '' ), + 'TEXTAREA tag' => array( 'ab', 2, "so it ", "ab" ), + 'TEXTAREA (escape)' => array( 'ab', 2, 'but it does for ', 'ab' ), + 'TEXTAREA (escape+attrs)' => array( 'ab', 2, 'but it does for ', 'ab' ), + 'TITLE tag' => array( 'ahas no need to escapeb', 2, "so it ", "aso it <doesn't>b" ), + 'TITLE (escape)' => array( 'ahas no need to escapeb', 2, 'but it does for ', 'abut it does for </title>b' ), + 'TITLE (escape+attrs)' => array( 'ahas no need to escapeb', 2, 'but it does for ', 'abut it does for </title not an="attribute">b' ), + ); + } + + /** + * Ensures that updates with potentially-compromising values aren't accepted. + * + * For example, a modifiable text update should not be allowed which would break + * the structure of the containing element, such as in a script or comment. + * + * @ticket 61617 + * + * @dataProvider data_unallowed_modifiable_text_updates + * + * @param string $html_with_nonempty_modifiable_text Will be used to find the test element. + * @param string $invalid_update Update containing possibly-compromising text. + */ + public function test_rejects_updates_with_unallowed_substrings( string $html_with_nonempty_modifiable_text, string $invalid_update ) { + $processor = new WP_HTML_Tag_Processor( $html_with_nonempty_modifiable_text ); + + while ( '' === $processor->get_modifiable_text() && $processor->next_token() ) { + continue; + } + + $original_text = $processor->get_modifiable_text(); + $this->assertNotEmpty( $original_text, 'Should have found non-empty text: check test setup.' ); + + $this->assertFalse( + $processor->set_modifiable_text( $invalid_update ), + 'Should have reject possibly-compromising modifiable text update.' + ); + + // Flush updates. + $processor->get_updated_html(); + + $this->assertSame( + $original_text, + $processor->get_modifiable_text(), + 'Should have preserved the original modifiable text before the rejected update.' 
+ ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_unallowed_modifiable_text_updates() { + return array( + 'Comment with -->' => array( '', 'Comments end in -->' ), + 'Comment with --!>' => array( '', 'Invalid but legitimate comments end in --!>' ), + 'SCRIPT with ' => array( '', 'Just a ' ), + 'SCRIPT with ' => array( '', 'beforeafter' ), + ); + } }