Skip to content

Commit

Permalink
Block Parser: Explore a streaming lazy interface
Browse files Browse the repository at this point in the history
For a 3 MB document which took 5 seconds and 14 GB to parse,
this version of the parser parsed it in 41 ms and 40 MB.
  • Loading branch information
dmsnell committed Nov 26, 2023
1 parent 5ff4794 commit 2585f14
Show file tree
Hide file tree
Showing 47 changed files with 323 additions and 0 deletions.
Binary file added src/wp-content/fonts/comic-neue_italic_300.ttf
Binary file not shown.
Binary file added src/wp-content/fonts/comic-neue_italic_400.ttf
Binary file not shown.
Binary file added src/wp-content/fonts/comic-neue_italic_700.ttf
Binary file not shown.
Binary file added src/wp-content/fonts/comic-neue_normal_300.ttf
Binary file not shown.
Binary file added src/wp-content/fonts/comic-neue_normal_400.ttf
Binary file not shown.
Binary file added src/wp-content/fonts/comic-neue_normal_700.ttf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added src/wp-content/fonts/ostrich-sans_normal_300.ttf
Binary file not shown.
Binary file added src/wp-content/fonts/ostrich-sans_normal_500.ttf
Binary file not shown.
Binary file added src/wp-content/fonts/ostrich-sans_normal_700.ttf
Binary file not shown.
Binary file added src/wp-content/fonts/ostrich-sans_normal_900.ttf
Binary file not shown.
Binary file not shown.
323 changes: 323 additions & 0 deletions src/wp-includes/class-wp-block-parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,329 @@
* @package WordPress
*/

/**
 * Describes a region of a string by where it starts and how long it is.
 *
 * Lets callers refer to a slice of a document without copying the slice.
 */
class WP_Span {
	/**
	 * Offset at which the region starts.
	 *
	 * @var int
	 */
	public $at;

	/**
	 * Length of the region.
	 *
	 * @var int
	 */
	public $length;

	/**
	 * Constructor.
	 *
	 * @param int $at     Offset at which the region starts.
	 * @param int $length Length of the region.
	 */
	public function __construct( $at, $length ) {
		$this->length = $length;
		$this->at     = $at;
	}
}

/**
 * One node of a singly-linked list: a payload plus a link to the following node.
 */
class WP_Linked_List_Item {
	/**
	 * Payload carried by this node.
	 *
	 * @var mixed
	 */
	public $data = null;

	/**
	 * Node which follows this one, or null when this is the final node.
	 *
	 * @var WP_Linked_List_Item|null
	 */
	public $next = null;

	/**
	 * Constructor.
	 *
	 * @param mixed $data Payload to store in the node.
	 */
	public function __construct( $data ) {
		$this->data = $data;
	}
}

/**
 * Append-only singly-linked list which remembers its final node
 * so that appending never has to walk the chain.
 */
class WP_Linked_List {
	/**
	 * First node in the list, or null while the list is empty.
	 *
	 * @var WP_Linked_List_Item|null
	 */
	public $head;

	/**
	 * Final node in the list; kept so appends can attach directly to it.
	 *
	 * @var WP_Linked_List_Item|null
	 */
	public $last;

	/**
	 * How many values have been appended.
	 *
	 * @var int
	 */
	public $length = 0;

	/**
	 * Adds a value to the end of the list.
	 *
	 * @param mixed $data Value to store.
	 */
	public function append( $data ) {
		$node = new WP_Linked_List_Item( $data );

		if ( null === $this->head ) {
			// First value: the node is both the front and the back.
			$this->head = $node;
		} else {
			$this->last->next = $node;
		}

		$this->last = $node;
		++$this->length;
	}

	/**
	 * Lazily walks the list from front to back.
	 *
	 * @return Generator Yields each stored value in insertion order.
	 */
	public function items() {
		for ( $node = $this->head; $node; $node = $node->next ) {
			yield $node->data;
		}
	}
}

/**
 * Lazy, array-like view of one parsed block.
 *
 * Instead of holding decoded values, a block stores offsets into data owned
 * by the parse it belongs to and only materializes values when they are read
 * through array access (`$block['blockName']`, `$block['attrs']`,
 * `$block['inner_content']`).
 */
class WP_Parsed_Block implements ArrayAccess {
	/**
	 * Offset into the owning parse's NUL-delimited name list where this
	 * block's name starts, or null when the block has no name.
	 *
	 * @var int|null
	 */
	public $name_at;

	/**
	 * Location of the block's attribute source within the owning parse's HTML.
	 *
	 * @var WP_Span|null
	 */
	public $attrs;

	/**
	 * List of inner content, created on demand.
	 *
	 * @var WP_Linked_List|null
	 */
	public $inner_content;

	/**
	 * Parse which owns this block; provides `html` and `seen_block_types`.
	 *
	 * @var WP_Parsed_Blocks
	 */
	public $post;

	/**
	 * Constructor.
	 *
	 * @param WP_Parsed_Blocks $post Parse which owns this block.
	 */
	public function __construct( $post ) {
		$this->post = $post;
	}

	/**
	 * Reads one of the block's lazily-computed fields.
	 *
	 * Signatures are left untyped and marked with `ReturnTypeWillChange` so the
	 * class loads on PHP versions without `mixed`/`void` while staying quiet
	 * about `ArrayAccess` return types on newer versions.
	 *
	 * @param mixed $offset One of 'blockName', 'attrs' or 'inner_content'.
	 * @return mixed The block name, or null when the block has none;
	 *               the JSON-decoded attributes, or null when there are none;
	 *               an empty array or a generator over the inner content;
	 *               null for any other offset.
	 */
	#[ReturnTypeWillChange]
	public function offsetGet( $offset ) {
		switch ( $offset ) {
			case 'blockName':
				if ( null === $this->name_at ) {
					return null;
				}

				// Names are stored back-to-back, each terminated by a NUL byte.
				$name_end = strpos( $this->post->seen_block_types, "\x00", $this->name_at );
				return substr( $this->post->seen_block_types, $this->name_at, $name_end - $this->name_at );

			case 'attrs':
				if ( null === $this->attrs ) {
					return null;
				}

				return json_decode( substr( $this->post->html, $this->attrs->at, $this->attrs->length ) );

			case 'inner_content':
				if ( null === $this->inner_content ) {
					return array();
				}

				return $this->inner_content->items();
		}

		return null;
	}

	/**
	 * Reports whether reading the given offset would produce a non-null value.
	 *
	 * Answers from the stored offsets alone so that `isset()` never has to
	 * decode attributes or build a block name.
	 *
	 * @param mixed $offset Offset to check.
	 * @return bool Whether the offset is known and currently has a value.
	 */
	#[ReturnTypeWillChange]
	public function offsetExists( $offset ) {
		switch ( $offset ) {
			case 'blockName':
				return null !== $this->name_at;

			case 'attrs':
				return null !== $this->attrs;

			case 'inner_content':
				return true;
		}

		return false;
	}

	/**
	 * Not yet supported: writes through array access are ignored.
	 *
	 * @param mixed $offset Offset to assign to.
	 * @param mixed $value  Value to assign.
	 */
	#[ReturnTypeWillChange]
	public function offsetSet( $offset, $value ) {
		// TODO: Implement offsetSet() method.
	}

	/**
	 * Not yet supported: unsetting through array access is ignored.
	 *
	 * @param mixed $offset Offset to unset.
	 */
	#[ReturnTypeWillChange]
	public function offsetUnset( $offset ) {
		// TODO: Implement offsetUnset() method.
	}
}

/**
 * One frame in the stack of open blocks, linked to the frames around it.
 */
class WP_Block_Stack_Item {
	/**
	 * Block held by this frame.
	 *
	 * @var WP_Parsed_Block|null
	 */
	public $data = null;

	/**
	 * Frame this one was opened inside, or null for the outermost frame.
	 *
	 * @var WP_Block_Stack_Item|null
	 */
	public $parent = null;

	/**
	 * Frame opened inside this one, or null when nothing has been opened in it.
	 *
	 * @var WP_Block_Stack_Item|null
	 */
	public $child = null;
}

/**
 * Incremental block parse over an HTML document.
 *
 * Each call to `step()` consumes at most one block delimiter and records what
 * it finds as offsets into the original HTML rather than as copied strings.
 */
class WP_Parsed_Blocks {
	/**
	 * Original HTML from which the blocks were parsed.
	 *
	 * @var string
	 */
	public $html;

	/**
	 * Tracks internal pointer into HTML.
	 *
	 * @var int
	 */
	public $at = 0;

	/**
	 * Concatenated block names, as parsed, each terminated by a NUL byte.
	 * Used for quick lookup of existing names.
	 *
	 * @var string
	 */
	public $seen_block_types = "\x00";

	/**
	 * Tracks the currently open blocks while parsing.
	 *
	 * @var WP_Block_Stack
	 */
	public $stack;

	/**
	 * Container block which collects all top-level content.
	 *
	 * @var WP_Parsed_Block
	 */
	public $root;

	/**
	 * Constructor.
	 *
	 * @param string $html Document to parse.
	 */
	public function __construct( $html ) {
		$this->html                = $html;
		$this->stack               = new WP_Block_Stack();
		$this->root                = new WP_Parsed_Block( $this );
		$this->root->inner_content = new WP_Linked_List();
		$this->stack->open( $this->root );
	}

	/**
	 * Advances the parse past the next block delimiter.
	 *
	 * HTML found before the delimiter is recorded as a span on the innermost
	 * open block. When no further delimiter exists, whatever HTML remains is
	 * recorded the same way so that trailing content is not lost.
	 *
	 * @return bool True when a delimiter was consumed; false once the document
	 *              is exhausted.
	 */
	public function step() {
		$end = strlen( $this->html );
		if ( $this->at >= $end ) {
			return false;
		}

		$has_match = preg_match(
			'/<!--\s+(?P<closer>\/)?wp:(?P<namespace>[a-z][a-z0-9_-]*\/)?(?P<name>[a-z][a-z0-9_-]*)\s+(?P<attrs>{(?:(?:[^}]+|}+(?=})|(?!}\s+\/?-->).)*+)?}\s+)?(?P<void>\/)?-->/s',
			$this->html,
			$matches,
			PREG_OFFSET_CAPTURE,
			$this->at
		);

		if ( ! $has_match ) {
			// Nothing but plain HTML is left: keep it before finishing.
			$this->stack->add_inner_chunk( new WP_Span( $this->at, $end - $this->at ) );
			$this->at = $end;
			return false;
		}

		list( $match, $started_at ) = $matches[0];

		// With offset capture, a group which did not participate reports offset -1.
		$length    = strlen( $match );
		$is_closer = isset( $matches['closer'] ) && -1 !== $matches['closer'][1];
		$is_void   = isset( $matches['void'] ) && -1 !== $matches['void'][1];
		$namespace = $matches['namespace'];
		$namespace = ( isset( $namespace ) && -1 !== $namespace[1] ) ? $namespace[0] : 'core/';
		$name      = $namespace . $matches['name'][0];
		$has_attrs = isset( $matches['attrs'] ) && -1 !== $matches['attrs'][1];

		if ( $started_at > $this->at ) {
			$this->stack->add_inner_chunk( new WP_Span( $this->at, $started_at - $this->at ) );
		}

		$this->at = $started_at + $length;

		if ( $is_closer ) {
			/*
			 * Never pop the root container: a closer with no matching opener
			 * would otherwise empty the stack and break every later chunk.
			 */
			if ( $this->stack->depth > 1 ) {
				$this->stack->close();
			}
			return true;
		}

		$block = new WP_Parsed_Block( $this );

		// Block name: reuse the stored copy if this name was seen before.
		$name_search  = "\x00{$name}\x00";
		$seen_name_at = strpos( $this->seen_block_types, $name_search );
		if ( false === $seen_name_at ) {
			$block->name_at          = strlen( $this->seen_block_types );
			$this->seen_block_types .= "{$name}\x00";
		} else {
			// Skip the leading NUL which was part of the search needle.
			$block->name_at = $seen_name_at + 1;
		}

		// Block attrs: remember where the source sits; decoding happens on read.
		if ( $has_attrs ) {
			$block->attrs = new WP_Span( $matches['attrs'][1], strlen( $matches['attrs'][0] ) );
		}

		$this->stack->add_inner_chunk( $block );

		if ( ! $is_void ) {
			$this->stack->open( $block );
		}
		return true;
	}
}

/**
 * Stack of the blocks which are currently open during a parse.
 */
class WP_Block_Stack {
	/**
	 * Most recently opened frame, or null when nothing is open.
	 *
	 * @var WP_Block_Stack_Item|null
	 */
	public $stack;

	/**
	 * Number of frames currently open.
	 *
	 * @var int
	 */
	public $depth = 0;

	/**
	 * Opens a new frame on top of the stack.
	 *
	 * @param mixed $data Value to hold in the new frame.
	 */
	public function open( $data ) {
		$frame       = new WP_Block_Stack_Item();
		$frame->data = $data;

		$top = $this->stack;
		if ( null !== $top ) {
			// Link both directions between the previous top and the new frame.
			$frame->parent = $top;
			$top->child    = $frame;
		}

		$this->stack = $frame;
		++$this->depth;
	}

	/**
	 * Removes the top frame and hands back what it held.
	 *
	 * @return mixed|null Value held by the removed frame, or null when the
	 *                    stack was already empty.
	 */
	public function close() {
		$top = $this->stack;
		if ( null === $top ) {
			return null;
		}

		$this->stack = $top->parent;
		--$this->depth;

		return $top->data;
	}

	/**
	 * Appends a chunk to the inner content of the block in the top frame,
	 * creating that block's content list on first use.
	 *
	 * @param mixed $data Chunk to append.
	 */
	public function add_inner_chunk( $data ) {
		$block = $this->stack->data;

		if ( null === $block->inner_content ) {
			$block->inner_content = new WP_Linked_List();
		}

		$block->inner_content->append( $data );
	}
}

/**
* Class WP_Block_Parser
*
Expand Down

0 comments on commit 2585f14

Please sign in to comment.