Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTML API: Add set_modifiable_text() for replacing text nodes. #7007

Closed
151 changes: 151 additions & 0 deletions src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -2881,6 +2881,157 @@ public function get_modifiable_text() {
return $decoded;
}

/**
 * Sets the modifiable text for the matched token, if matched.
 *
 * Modifiable text is text content that may be read and changed without
 * changing the HTML structure of the document around it. This includes
 * the contents of `#text` nodes in the HTML as well as the inner
 * contents of HTML comments, Processing Instructions, and others, even
 * though these nodes aren't part of a parsed DOM tree. It also includes
 * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
 * other section in an HTML document which cannot contain HTML markup (DATA).
 *
 * Not all modifiable text may be set by this method, and not all content
 * may be set as modifiable text. In the case that this fails it will return
 * `false` indicating as much. For instance, it will not allow inserting the
 * string `</script` into a SCRIPT element, because the rules for escaping
 * that safely are complicated. Similarly, it will not allow setting content
 * into a comment which would prematurely terminate the comment.
 *
 * Updates are supported on `#text` nodes, on normative HTML comments, and
 * on SCRIPT, STYLE, TEXTAREA, and TITLE elements. Every other token is
 * rejected. Calling this method more than once on the same token replaces
 * the previously-enqueued text: the last call wins.
 *
 * Example:
 *
 *     // Prepend a rule to all STYLE contents.
 *     while ( $processor->next_tag( 'STYLE' ) ) {
 *         $style = $processor->get_modifiable_text();
 *         $processor->set_modifiable_text( "html { scroll-behavior: smooth; }\n{$style}" );
 *     }
 *
 *     // Replace smiley text with Emoji smilies.
 *     while ( $processor->next_token() ) {
 *         if ( '#text' !== $processor->get_token_name() ) {
 *             continue;
 *         }
 *
 *         $chunk = $processor->get_modifiable_text();
 *         if ( ! str_contains( $chunk, ':)' ) ) {
 *             continue;
 *         }
 *
 *         $processor->set_modifiable_text( str_replace( ':)', '🙂', $chunk ) );
 *     }
 *
 * @since 6.7.0
 *
 * @param string $plaintext_content New text content to represent in the matched token.
 *
 * @return bool Whether the text was able to update.
 */
public function set_modifiable_text( string $plaintext_content ): bool {
	if ( self::STATE_TEXT_NODE === $this->parser_state ) {
		// Encode the text so it is represented as character data and never as markup.
		$raw_text = htmlspecialchars( $plaintext_content, ENT_QUOTES | ENT_HTML5 );
	} elseif ( self::STATE_COMMENT === $this->parser_state ) {
		// Only normative HTML comments can be updated; other comment-like tokens are rejected.
		if ( self::COMMENT_AS_HTML_COMMENT !== $this->comment_type ) {
			return false;
		}

		/*
		 * Comment data is not encoded, so reject any text which could close
		 * the comment early:
		 *
		 *  - `-->` and `--!>` anywhere in the text end the comment.
		 *  - A leading `>` or `->` directly follows the `<!--` opener and
		 *    abruptly closes the comment as an empty comment.
		 */
		if ( 1 === preg_match( '/^-?>|--!?>/', $plaintext_content ) ) {
			return false;
		}

		$raw_text = $plaintext_content;
	} elseif ( self::STATE_MATCHED_TAG === $this->parser_state ) {
		$tag_name = $this->get_tag();

		switch ( $tag_name ) {
			case 'SCRIPT':
				/*
				 * This is over-protective, but ensures the update doesn't break
				 * out of the SCRIPT element. A more thorough check would need to
				 * ensure that the script closing tag doesn't exist, and isn't
				 * also "hidden" inside the script double-escaped state.
				 *
				 * It may seem like replacing `</script` with `<\/script` would
				 * properly escape these things, but this could mask regex patterns
				 * that previously worked. Resolve this by not sending `</script`.
				 *
				 * Review note: a unit test for this rejection was requested during
				 * review and was reported as added in 944c8cb.
				 */
				if ( false !== stripos( $plaintext_content, '</script' ) ) {
					return false;
				}

				$raw_text = $plaintext_content;
				break;

			case 'STYLE':
				/*
				 * Rewrite `</style` using the CSS escapes for `<` and `/` so that
				 * the text cannot close the STYLE element. The matched tag name
				 * is re-emitted with its original letter casing.
				 */
				$raw_text = preg_replace_callback(
					'~</(?P<TAG_NAME>style)~i',
					static function ( $tag_match ) {
						return "\\3c\\2f{$tag_match['TAG_NAME']}";
					},
					$plaintext_content
				);
				break;

			case 'TEXTAREA':
			case 'TITLE':
				/*
				 * Only the would-be closing tag is escaped, by encoding its `<`.
				 *
				 * Other syntax characters don't _need_ to be escaped, but since
				 * they are decoded it's safe to leave them escaped and this can
				 * prevent other code from naively detecting tags within the contents.
				 *
				 * NOTE(review): `&` is not escaped here, so plaintext which looks
				 * like a character reference (e.g. `&amp;`) will presumably be
				 * decoded when read back. Confirm whether that is intended.
				 *
				 * @todo It would be useful to prefix a multiline replacement text
				 *       with a newline, but not necessary. This is for aesthetics.
				 */
				$raw_text = preg_replace_callback(
					"~</(?P<TAG_NAME>{$tag_name})~i",
					static function ( $tag_match ) {
						return "&lt;/{$tag_match['TAG_NAME']}";
					},
					$plaintext_content
				);
				break;

			default:
				return false;
		}
	} else {
		return false;
	}

	// `preg_replace_callback()` returns `null` on failure; never enqueue that.
	if ( null === $raw_text ) {
		return false;
	}

	/*
	 * Store the update under a fixed key instead of appending it. Appending
	 * would leave two overlapping replacements for the same span if this
	 * method were called twice on one token, corrupting the output.
	 *
	 * NOTE(review): presumably the other entries in this array are keyed by
	 * attribute name; a key containing a space cannot be a valid attribute
	 * name, so it should not collide. Confirm against the rest of the class.
	 */
	$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
		$this->text_starts_at,
		$this->text_length,
		$raw_text
	);

	return true;
}

/**
* Updates or creates a new attribute on the currently matched tag with the passed value.
*
Expand Down
110 changes: 110 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
<?php
/**
* Unit tests covering WP_HTML_Tag_Processor modifiable text functionality.
*
* @package WordPress
* @subpackage HTML-API
* @group html-api
*
* @coversDefaultClass WP_HTML_Tag_Processor
*/
class Tests_HtmlApi_WpHtmlTagProcessorModifiableText extends WP_UnitTestCase {
	/**
	 * Ensures that modifiable text updates are not applied where they aren't supported.
	 *
	 * @ticket 61617
	 *
	 * @dataProvider data_tokens_not_supporting_modifiable_text_updates
	 *
	 * @param string $html             Contains HTML with a token not supporting modifiable text updates.
	 * @param int    $advance_n_tokens Count of times to run `next_token()` before reaching target node.
	 */
	public function test_rejects_updates_on_unsupported_match_locations( string $html, int $advance_n_tokens ) {
		$processor = new WP_HTML_Tag_Processor( $html );
		while ( --$advance_n_tokens >= 0 ) {
			$processor->next_token();
		}

		$this->assertFalse(
			$processor->set_modifiable_text( 'Bazinga!' ),
			'Should have prevented modifying the text at the target node.'
		);

		$this->assertSame(
			$html,
			$processor->get_updated_html(),
			'Should not have modified the input document in any way.'
		);
	}

	/**
	 * Data provider.
	 *
	 * Each dataset is an HTML document and the number of `next_token()` calls
	 * to make before attempting the update.
	 *
	 * @return array[]
	 */
	public static function data_tokens_not_supporting_modifiable_text_updates() {
		return array(
			'Before parsing'               => array( 'nothing to see here', 0 ),
			'After parsing'                => array( 'nothing here either', 2 ),
			'Incomplete document'          => array( '<tag without="an end', 1 ),
			'Presumptuous closer'          => array( 'before</>after', 2 ),
			'Invalid (CDATA)'              => array( '<![CDATA[this is a comment]]>', 1 ),
			'Invalid (shortest comment)'   => array( '<!-->', 1 ),
			'Invalid (shorter comment)'    => array( '<!--->', 1 ),
			'Invalid (markup declaration)' => array( '<!run>', 1 ),
			'Invalid (PI-like node)'       => array( '<?xml is not html ?>', 1 ),
		);
	}

	/**
	 * Ensures that modifiable text updates are applied as expected to supported nodes.
	 *
	 * @ticket 61617
	 *
	 * @dataProvider data_tokens_with_basic_modifiable_text_updates
	 *
	 * @param string $html             Contains HTML with a token supporting modifiable text updates.
	 * @param int    $advance_n_tokens Count of times to run `next_token()` before reaching target node.
	 * @param string $raw_replacement  This should be escaped properly when replaced as modifiable text.
	 * @param string $transformed      Expected output after updating modifiable text.
	 */
	public function test_updates_basic_modifiable_text_on_supported_nodes( string $html, int $advance_n_tokens, string $raw_replacement, string $transformed ) {
		$processor = new WP_HTML_Tag_Processor( $html );
		while ( --$advance_n_tokens >= 0 ) {
			$processor->next_token();
		}

		$this->assertTrue(
			$processor->set_modifiable_text( $raw_replacement ),
			'Should have modified the text at the target node.'
		);

		$this->assertSame(
			$transformed,
			$processor->get_updated_html(),
			"Should have transformed the HTML as expected when modifying the target node's modifiable text."
		);
	}

	/**
	 * Data provider.
	 *
	 * Each dataset is an HTML document, the number of `next_token()` calls to
	 * make before the update, the plaintext replacement, and the expected output.
	 *
	 * @return array[]
	 */
	public static function data_tokens_with_basic_modifiable_text_updates() {
		return array(
			'Text node (start)'       => array( 'Text', 1, 'Blubber', 'Blubber' ),
			'Text node (middle)'      => array( '<em>Bold move</em>', 2, 'yo', '<em>yo</em>' ),
			'Text node (end)'         => array( '<img>of a dog', 2, 'of a cat', '<img>of a cat' ),
			'Encoded text node'       => array( '<figcaption>birds and dogs</figcaption>', 2, '<birds> & <dogs>', '<figcaption>&lt;birds&gt; &amp; &lt;dogs&gt;</figcaption>' ),
			'Comment node'            => array( 'a<!-- this is a comment -->b', 2, ' <this> & that ', 'a<!-- <this> & that -->b' ),
			'SCRIPT tag'              => array( 'before<script></script>after', 2, 'const img = "<img> & <br>";', 'before<script>const img = "<img> & <br>";</script>after' ),
			'STYLE tag'               => array( '<style></style>', 1, 'p::before { content: "<img> & </style>"; }', '<style>p::before { content: "<img> & \3c\2fstyle>"; }</style>' ),
			'TEXTAREA tag'            => array( 'a<textarea>has no need to escape</textarea>b', 2, "so it <doesn't>", "a<textarea>so it <doesn't></textarea>b" ),
			'TEXTAREA (escape)'       => array( 'a<textarea>has no need to escape</textarea>b', 2, 'but it does for </textarea>', 'a<textarea>but it does for &lt;/textarea></textarea>b' ),
			'TEXTAREA (escape+attrs)' => array( 'a<textarea>has no need to escape</textarea>b', 2, 'but it does for </textarea not an="attribute">', 'a<textarea>but it does for &lt;/textarea not an="attribute"></textarea>b' ),
			'TITLE tag'               => array( 'a<title>has no need to escape</title>b', 2, "so it <doesn't>", "a<title>so it <doesn't></title>b" ),
			'TITLE (escape)'          => array( 'a<title>has no need to escape</title>b', 2, 'but it does for </title>', 'a<title>but it does for &lt;/title></title>b' ),
			'TITLE (escape+attrs)'    => array( 'a<title>has no need to escape</title>b', 2, 'but it does for </title not an="attribute">', 'a<title>but it does for &lt;/title not an="attribute"></title>b' ),
		);
	}

	/**
	 * Ensures that updates are rejected on supported nodes when the new content
	 * could break out of the node, such as `</script` inside a SCRIPT element
	 * or a comment terminator inside a comment.
	 *
	 * @ticket 61617
	 *
	 * @dataProvider data_unallowed_modifiable_text_updates
	 *
	 * @param string $html             Contains HTML with a token supporting modifiable text updates.
	 * @param int    $advance_n_tokens Count of times to run `next_token()` before reaching target node.
	 * @param string $invalid_update   Content which must not be accepted as the node's modifiable text.
	 */
	public function test_rejects_updates_with_unallowed_substrings( string $html, int $advance_n_tokens, string $invalid_update ) {
		$processor = new WP_HTML_Tag_Processor( $html );
		while ( --$advance_n_tokens >= 0 ) {
			$processor->next_token();
		}

		$this->assertFalse(
			$processor->set_modifiable_text( $invalid_update ),
			'Should have rejected content which could break out of the target node.'
		);

		$this->assertSame(
			$html,
			$processor->get_updated_html(),
			'Should not have modified the input document after rejecting the update.'
		);
	}

	/**
	 * Data provider.
	 *
	 * Each dataset is an HTML document, the number of `next_token()` calls to
	 * make before the update, and the content which should be rejected.
	 *
	 * @return array[]
	 */
	public static function data_unallowed_modifiable_text_updates() {
		return array(
			'Comment with -->'                 => array( '<!-- this is a comment -->', 1, 'Comments end in -->' ),
			'Comment with --!>'                => array( '<!-- this is a comment -->', 1, 'Invalid but legitimate comments end in --!>' ),
			'SCRIPT with </script>'            => array( '<script>Replace me</script>', 1, 'Just a </script>' ),
			'SCRIPT with </SCRIPT>'            => array( '<script>Replace me</script>', 1, 'Just a </SCRIPT>' ),
			'SCRIPT with </script attributes>' => array( '<script>Replace me</script>', 1, 'before</script id=sneak>after' ),
		);
	}
}
Loading