WordPress · dmsnell · Jul 10, 2024 · Jul 12, 2024 · Jul 12, 2024 · Jul 12, 2024
diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -2881,6 +2881,157 @@ public function get_modifiable_text() {
 		return $decoded;
 	}
 
+	/**
+	 * Sets the modifiable text for the matched token, if matched.
+	 *
+	 * Modifiable text is text content that may be read and changed without
+	 * changing the HTML structure of the document around it. This includes
+	 * the contents of `#text` nodes in the HTML as well as the inner
+	 * contents of HTML comments, Processing Instructions, and others, even
+	 * though these nodes aren't part of a parsed DOM tree. It also includes
+	 * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
+	 * other section in an HTML document which cannot contain HTML markup (DATA).
+	 *
+	 * Not all modifiable text may be set by this method, and not all content
+	 * may be set as modifiable text. In the case that this fails it will return
+	 * `false` indicating as much. For instance, it will not allow inserting the
+	 * string `</script` into a SCRIPT element, because the rules for escaping
+	 * that safely are complicated. Similarly, it will not allow setting content
+	 * into a comment which would prematurely terminate the comment.
+	 *
+	 * How the replacement is written depends on the matched token:
+	 *
+	 *  - `#text` nodes are encoded with `htmlspecialchars()`.
+	 *  - HTML comments and SCRIPT contents are written as given, or refused.
+	 *  - STYLE contents have `</style` rewritten with CSS hex escapes.
+	 *  - TEXTAREA and TITLE contents have the `<` of their own closer encoded.
+	 *
+	 * Only one text update is pending at a time: calling this method again
+	 * replaces the previously enqueued update rather than stacking a second
+	 * replacement over the same span of the document.
+	 *
+	 * Example:
+	 *
+	 *     // Add a preface to all STYLE contents.
+	 *     while ( $processor->next_tag( 'STYLE' ) ) {
+	 *         $style = $processor->get_modifiable_text();
+	 *         $processor->set_modifiable_text( "// Made with love on the World Wide Web\n{$style}" );
+	 *     }
+	 *
+	 *     // Replace smiley text with Emoji smilies.
+	 *     while ( $processor->next_token() ) {
+	 *         if ( '#text' !== $processor->get_token_name() ) {
+	 *             continue;
+	 *         }
+	 *
+	 *         $chunk = $processor->get_modifiable_text();
+	 *         if ( ! str_contains( $chunk, ':)' ) ) {
+	 *             continue;
+	 *         }
+	 *
+	 *         $processor->set_modifiable_text( str_replace( ':)', '🙂', $chunk ) );
+	 *     }
+	 *
+	 * @since 6.7.0
+	 *
+	 * @param string $plaintext_content New text content to represent in the matched token.
+	 *
+	 * @return bool Whether the text was able to update.
+	 */
+	public function set_modifiable_text( string $plaintext_content ): bool {
+		if ( self::STATE_TEXT_NODE === $this->parser_state ) {
+			// Encode HTML syntax characters so the new text cannot introduce markup.
+			$raw_replacement = htmlspecialchars( $plaintext_content, ENT_QUOTES | ENT_HTML5 );
+		} elseif (
+			self::STATE_COMMENT === $this->parser_state &&
+			self::COMMENT_AS_HTML_COMMENT === $this->comment_type
+		) {
+			/*
+			 * Comment data is not encoded, so reject any text which could close
+			 * the comment: `-->` or `--!>` anywhere inside of it, as well as a
+			 * leading `>` or `->`, since `<!-->` and `<!--->` are complete, empty
+			 * comments and whatever follows them would spill into the document.
+			 */
+			if ( 1 === preg_match( '/^-?>|--!?>/', $plaintext_content ) ) {
+				return false;
+			}
+
+			$raw_replacement = $plaintext_content;
+		} elseif ( self::STATE_MATCHED_TAG === $this->parser_state ) {
+			switch ( $this->get_tag() ) {
+				case 'SCRIPT':
+					/*
+					 * This is over-protective, but ensures the update doesn't break
+					 * out of the SCRIPT element. A more thorough check would need to
+					 * ensure that the script closing tag doesn't exist, and isn't
+					 * also "hidden" inside the script double-escaped state.
+					 *
+					 * It may seem like replacing `</script` with `<\/script` would
+					 * properly escape these things, but this could mask regex patterns
+					 * that previously worked. Resolve this by not sending `</script`.
+					 */
+					if ( false !== stripos( $plaintext_content, '</script' ) ) {
+						return false;
+					}
+
+					$raw_replacement = $plaintext_content;
+					break;
+
+				case 'STYLE':
+					// Write `</` as the CSS hex escapes `\3c\2f` so no STYLE closer appears in the raw HTML.
+					$raw_replacement = preg_replace_callback(
+						'~</(?P<TAG_NAME>style)~i',
+						static function ( $tag_match ) {
+							return "\\3c\\2f{$tag_match['TAG_NAME']}";
+						},
+						$plaintext_content
+					);
+					break;
+
+				case 'TEXTAREA':
+				case 'TITLE':
+					/*
+					 * These don't _need_ to be escaped, but since they are decoded it's
+					 * safe to leave them escaped and this can prevent other code from
+					 * naively detecting tags within the contents.
+					 *
+					 * NOTE(review): `&` is written as given, so character references in the
+					 *               new text are presumably decoded when read back — confirm.
+					 *
+					 * @todo It would be useful to prefix a multiline replacement text
+					 *       with a newline, but not necessary. This is for aesthetics.
+					 */
+					$raw_replacement = preg_replace_callback(
+						"~</(?P<TAG_NAME>{$this->get_tag()})~i",
+						static function ( $tag_match ) {
+							return "&lt;/{$tag_match['TAG_NAME']}";
+						},
+						$plaintext_content
+					);
+					break;
+
+				default:
+					return false;
+			}
+		} else {
+			return false;
+		}
+
+		/*
+		 * Store the update under a fixed key so that a repeated call overwrites
+		 * the pending update instead of enqueueing a second replacement over the
+		 * same span. The key contains a space, which presumably keeps it distinct
+		 * from any keys derived from attribute names elsewhere in this class.
+		 */
+		$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
+			$this->text_starts_at,
+			$this->text_length,
+			$raw_replacement
+		);
+
+		return true;
+	}
+
 	/**
 	 * Updates or creates a new attribute on the currently matched tag with the passed value.
 	 *

diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php
@@ -0,0 +1,110 @@
+<?php
+/**
+ * Unit tests covering WP_HTML_Tag_Processor modifiable text functionality.
+ *
+ * @package WordPress
+ * @subpackage HTML-API
+ * @group html-api
+ *
+ * @coversDefaultClass WP_HTML_Tag_Processor
+ */
+class Tests_HtmlApi_WpHtmlTagProcessorModifiableText extends WP_UnitTestCase {
+	/**
+	 * Ensures that modifiable text updates are not applied where they aren't supported.
+	 *
+	 * @ticket 61617
+	 *
+	 * @dataProvider data_tokens_not_supporting_modifiable_text_updates
+	 *
+	 * @param string $html             Contains HTML with a token not supporting modifiable text updates.
+	 * @param int    $advance_n_tokens Count of times to run `next_token()` before reaching target node.
+	 */
+	public function test_rejects_updates_on_unsupported_match_locations( string $html, int $advance_n_tokens ) {
+		$processor = new WP_HTML_Tag_Processor( $html );
+		while ( --$advance_n_tokens >= 0 ) {
+			$processor->next_token();
+		}
+
+		$this->assertFalse(
+			$processor->set_modifiable_text( 'Bazinga!' ),
+			'Should have prevented modifying the text at the target node.'
+		);
+
+		$this->assertSame(
+			$html,
+			$processor->get_updated_html(),
+			'Should not have modified the input document in any way.'
+		);
+	}
+
+	/**
+	 * Data provider: documents and token offsets at which text updates must be refused.
+	 *
+	 * @return array[]
+	 */
+	public static function data_tokens_not_supporting_modifiable_text_updates() {
+		return array(
+			'Before parsing'               => array( 'nothing to see here', 0 ),
+			'After parsing'                => array( 'nothing here either', 2 ),
+			'Incomplete document'          => array( '<tag without="an end', 1 ),
+			'Presumptuous closer'          => array( 'before</>after', 2 ),
+			'Invalid (CDATA)'              => array( '<![CDATA[this is a comment]]>', 1 ),
+			'Invalid (shortest comment)'   => array( '<!-->', 1 ),
+			'Invalid (shorter comment)'    => array( '<!--->', 1 ),
+			'Invalid (markup declaration)' => array( '<!run>', 1 ),
+			'Invalid (PI-like node)'       => array( '<?xml is not html ?>', 1 ),
+		);
+	}
+
+	/**
+	 * Ensures that modifiable text updates are applied as expected to supported nodes.
+	 *
+	 * @ticket 61617
+	 *
+	 * @dataProvider data_tokens_with_basic_modifiable_text_updates
+	 *
+	 * @param string $html             Contains HTML with a token supporting modifiable text updates.
+	 * @param int    $advance_n_tokens Count of times to run `next_token()` before reaching target node.
+	 * @param string $raw_replacement  This should be escaped properly when replaced as modifiable text.
+	 * @param string $transformed      Expected output after updating modifiable text.
+	 */
+	public function test_updates_basic_modifiable_text_on_supported_nodes( string $html, int $advance_n_tokens, string $raw_replacement, string $transformed ) {
+		$processor = new WP_HTML_Tag_Processor( $html );
+		while ( --$advance_n_tokens >= 0 ) {
+			$processor->next_token();
+		}
+
+		$this->assertTrue(
+			$processor->set_modifiable_text( $raw_replacement ),
+			'Should have modified the text at the target node.'
+		);
+
+		$this->assertSame(
+			$transformed,
+			$processor->get_updated_html(),
+			"Should have transformed the HTML as expected when modifying the target node's modifiable text."
+		);
+	}
+
+	/**
+	 * Data provider: documents, token offsets, replacement text, and the expected updated HTML.
+	 *
+	 * @return array[]
+	 */
+	public static function data_tokens_with_basic_modifiable_text_updates() {
+		return array(
+			'Text node (start)'       => array( 'Text', 1, 'Blubber', 'Blubber' ),
+			'Text node (middle)'      => array( '<em>Bold move</em>', 2, 'yo', '<em>yo</em>' ),
+			'Text node (end)'         => array( '<img>of a dog', 2, 'of a cat', '<img>of a cat' ),
+			'Encoded text node'       => array( '<figcaption>birds and dogs</figcaption>', 2, '<birds> & <dogs>', '<figcaption>&lt;birds&gt; &amp; &lt;dogs&gt;</figcaption>' ),
+			'SCRIPT tag'              => array( 'before<script></script>after', 2, 'const img = "<img> & <br>";', 'before<script>const img = "<img> & <br>";</script>after' ),
+			'STYLE tag'               => array( '<style></style>', 1, 'p::before { content: "<img> & </style>"; }', '<style>p::before { content: "<img> & \3c\2fstyle>"; }</style>' ),
+			'TEXTAREA tag'            => array( 'a<textarea>has no need to escape</textarea>b', 2, "so it <doesn't>", "a<textarea>so it <doesn't></textarea>b" ),
+			'TEXTAREA (escape)'       => array( 'a<textarea>has no need to escape</textarea>b', 2, 'but it does for </textarea>', 'a<textarea>but it does for &lt;/textarea></textarea>b' ),
+			'TEXTAREA (escape+attrs)' => array( 'a<textarea>has no need to escape</textarea>b', 2, 'but it does for </textarea not an="attribute">', 'a<textarea>but it does for &lt;/textarea not an="attribute"></textarea>b' ),
+			'TITLE tag'               => array( 'a<title>has no need to escape</title>b', 2, "so it <doesn't>", "a<title>so it <doesn't></title>b" ),
+			'TITLE (escape)'          => array( 'a<title>has no need to escape</title>b', 2, 'but it does for </title>', 'a<title>but it does for &lt;/title></title>b' ),
+			'TITLE (escape+attrs)'    => array( 'a<title>has no need to escape</title>b', 2, 'but it does for </title not an="attribute">', 'a<title>but it does for &lt;/title not an="attribute"></title>b' ),
+		);
+	}
+}