From 729e5b3e969ed83b337c7099ec17d272c4e07a84 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 8 Jun 2024 17:53:03 +0200 Subject: [PATCH] Blocks: Add functions to return PCRE pattern (regex) for finding blocks. In this patch two new functions are introduced for the purpose of returning a PCRE pattern that can be used to quickly and efficiently find blocks within an HTML document without having to parse the entire document and without building a full block tree. These new functions enable more efficient processing for work that only needs to examine document structure or know a few things about a document without knowing everything, including but not limited to: - Finding the URL of the first image block in a document. - Inserting hooked blocks. - Analyzing block counts. --- src/wp-includes/blocks.php | 117 ++++++++++++++++++++++ src/wp-includes/class-wp-block-parser.php | 8 +- 2 files changed, 118 insertions(+), 7 deletions(-) diff --git a/src/wp-includes/blocks.php b/src/wp-includes/blocks.php index 02b02b1188fdb..7dc9579e025f4 100644 --- a/src/wp-includes/blocks.php +++ b/src/wp-includes/blocks.php @@ -1288,6 +1288,123 @@ function make_after_block_visitor( $hooked_blocks, $context, $callback = 'insert }; } +/** + * Returns a regular expression which can be used to find + * block comment delimiters in a given HTML document. + * + * Returned matches contain named capture groups: + * - 'closer' is '/' if the delimiter is a block closer. + * - 'namespace' is non-empty if a block namespace was provided, + * otherwise the block name is assumed to be in the "core/" namespace. + * - 'name' is the block name, always non-empty. + * - 'attrs' contains the content which may be JSON, if non-empty. + * - 'void' is '/' if the delimiter indicates a void block. 
+ *
+ * Example:
+ *
+ *     if ( 1 === preg_match( get_block_delimiter_regex(), $block_content, $delimiter_match ) ) {
+ *         $is_closer  = '/' === $delimiter_match['closer'];
+ *         $is_void    = '/' === ( $delimiter_match['void'] ?? '' );
+ *         $block_name = ( '' !== $delimiter_match['namespace'] ? $delimiter_match['namespace'] : 'core/' ) . $delimiter_match['name'];
+ *         $attrs      = array();
+ *         if ( ! $is_closer ) {
+ *             $json = json_decode( $delimiter_match['attrs'] );
+ *             if ( JSON_ERROR_NONE === json_last_error() ) {
+ *                 $attrs = $json;
+ *             }
+ *         }
+ *     }
+ *
+ * @since {WP_VERSION}
+ *
+ * @return string PCRE pattern (extended mode, "~" delimited) which can be used to find and parse block delimiter HTML comments.
+ */
+function get_block_delimiter_regex(): string {
+	return <<<'REGEXP'
+~
+<!--
+    \s+
+    (?P<closer>/)?                                                  # This pattern also detects closing block delimiters.
+    wp:(?P<namespace>[a-z][a-z0-9_-]*/)?(?P<name>[a-z][a-z0-9_-]*)  # e.g. "core/paragraph", "paragraph".
+    \s+
+    (?P<attrs>{(?:(?:[^}]+|}+(?=})|(?!}\s+/?-->).)*+)?}\s+)?        # It's required to parse the JSON separately, if it exists.
+    (?P<void>/)?                                                    # Void blocks have no content and no closer.
+-->
+~sx
+REGEXP;
+}
+
+/**
+ * Returns a regular expression which can be used to find block comment
+ * delimiters for a given block type in a given HTML document.
+ *
+ * Returned matches contain named capture groups:
+ * - 'closer' is '/' if the delimiter is a block closer.
+ * - 'namespace' is non-empty if a block namespace was provided,
+ *   otherwise the block name is assumed to be in the "core/" namespace.
+ * - 'name' is the block name, always non-empty.
+ * - 'attrs' contains the content which may be JSON, if non-empty.
+ * - 'void' is '/' if the delimiter indicates a void block.
+ *
+ * Example:
+ *
+ *     if ( 1 === preg_match( get_named_block_delimiter_regex( 'core/image' ), $block_content, $delimiter_match ) ) {
+ *         $is_closer  = '/' === $delimiter_match['closer'];
+ *         $is_void    = '/' === ( $delimiter_match['void'] ?? '' );
+ *         $block_name = ( '' !== $delimiter_match['namespace'] ? $delimiter_match['namespace'] : 'core/' ) . $delimiter_match['name'];
+ *         $attrs      = array();
+ *         if ( ! 
$is_closer ) {
+ *             $json = json_decode( $delimiter_match['attrs'] );
+ *             if ( JSON_ERROR_NONE === json_last_error() ) {
+ *                 $attrs = $json;
+ *             }
+ *         }
+ *     }
+ *
+ * @since {WP_VERSION}
+ *
+ * @param string $block_name Namespace and name of block, e.g. "math-blocks/formula".
+ *                           A name without a namespace, e.g. "image", is treated as "core/image".
+ * @return string PCRE pattern which can be used to find and parse block delimiter HTML comments.
+ */
+function get_named_block_delimiter_regex( string $block_name ): string {
+	$slash_at  = strpos( $block_name, '/' );
+	$namespace = false === $slash_at ? 'core' : substr( $block_name, 0, $slash_at );
+	$name      = false === $slash_at ? $block_name : substr( $block_name, $slash_at + 1 );
+	$is_core   = 'core' === $namespace;
+
+	$namespace = preg_quote( $namespace, '~' );
+	$name      = preg_quote( $name, '~' );
+
+	if ( $is_core ) {
+		return <<<REGEXP
+~
+<!--
+    \s+
+    (?P<closer>/)?                                            # This pattern also detects closing block delimiters.
+    wp:(?P<namespace>core/)?(?P<name>{$name})                 # e.g. "core/paragraph", "paragraph".
+    \s+
+    (?P<attrs>{(?:(?:[^}]+|}+(?=})|(?!}\s+/?-->).)*+)?}\s+)?  # It's required to parse the JSON separately, if it exists.
+    (?P<void>/)?                                              # Void blocks have no content and no closer.
+-->
+~sx
+REGEXP;
+	}
+
+	return <<<REGEXP
+~
+<!--
+    \s+
+    (?P<closer>/)?                                            # This pattern also detects closing block delimiters.
+    wp:(?P<namespace>{$namespace}/)(?P<name>{$name})          # e.g. "math-blocks/formula".
+    \s+
+    (?P<attrs>{(?:(?:[^}]+|}+(?=})|(?!}\s+/?-->).)*+)?}\s+)?  # It's required to parse the JSON separately, if it exists.
+    (?P<void>/)?                                              # Void blocks have no content and no closer.
+-->
+~sx
+REGEXP;
+}
+
 /**
  * Given an array of attributes, returns a string in the serialized attributes
  * format prepared for post content.
diff --git a/src/wp-includes/class-wp-block-parser.php b/src/wp-includes/class-wp-block-parser.php
index 543f53691ccb1..589ae73f2171d 100644
--- a/src/wp-includes/class-wp-block-parser.php
+++ b/src/wp-includes/class-wp-block-parser.php
@@ -244,13 +244,7 @@ public function next_token() {
 		 * a closer has no attributes). 
we can trap them both and process the * match back in PHP to see which one it was. */ - $has_match = preg_match( - '/).)*+)?}\s+)?(?P\/)?-->/s', - $this->document, - $matches, - PREG_OFFSET_CAPTURE, - $this->offset - ); + $has_match = preg_match( get_block_delimiter_regex(), $this->document, $matches, PREG_OFFSET_CAPTURE, $this->offset ); // if we get here we probably have catastrophic backtracking or out-of-memory in the PCRE. if ( false === $has_match ) {