Skip to content

Commit

Permalink
Tokenize purely using regexes (#123)
Browse files Browse the repository at this point in the history
* Tokenize purely using regexes

* Improve critical matching loop a little

+7% speedup

* address review
  • Loading branch information
mvorisek authored Sep 11, 2024
1 parent 40ee470 commit 16ca9e3
Showing 1 changed file with 81 additions and 220 deletions.
301 changes: 81 additions & 220 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@

namespace Doctrine\SqlFormatter;

use function array_key_last;
use function array_map;
use function array_pop;
use function assert;
use function count;
use function implode;
use function is_int;
use function preg_match;
use function preg_quote;
use function reset;
use function str_replace;
use function str_starts_with;
use function strlen;
use function strpos;
use function strtoupper;
use function substr;
use function usort;
Expand Down Expand Up @@ -719,15 +722,8 @@ final class Tokenizer
'YEARWEEK',
];

// Regular expressions for tokenizing
// Each nextTokenRegex* property holds one delimited, \G-anchored pattern for a
// single token category; all of them are assigned once in the constructor.
// NOTE(review): this listing appears to be a diff view with removed and added
// lines interleaved - confirm whether these seven per-category properties are
// still needed next to $tokenizeRegex.

private readonly string $nextTokenRegexNumber;
private readonly string $nextTokenRegexBoundaryCharacter;
private readonly string $nextTokenRegexReservedToplevel;
private readonly string $nextTokenRegexReservedNewline;
private readonly string $nextTokenRegexReserved;
private readonly string $nextTokenRegexFunction;
private readonly string $nextTokenRegexNonReserved;
/**
* Regular expression for tokenizing: the single combined pattern returned by
* makeTokenizeRegex(), assigned once in the constructor.
*/
private readonly string $tokenizeRegex;

/**
* Punctuation that can be used as a boundary between other tokens
Expand Down Expand Up @@ -758,25 +754,11 @@ final class Tokenizer
];

/**
* Stuff that only needs to be done once. Builds the regular expressions used
* for tokenizing.
*
* Two kinds of patterns are assigned here: seven per-category patterns stored
* in the nextTokenRegex* properties, and the single combined pattern stored in
* $tokenizeRegex.
*
* NOTE(review): this listing appears to be a diff view with removed and added
* lines interleaved - confirm which of the assignments below are still live.
*/
public function __construct()
{
// Set up regular expressions
// Each list is turned into a regex fragment by makeRegexFromList(). For the
// top-level and newline keyword lists every literal space is replaced by \s+,
// so multi-word keywords match across any run of whitespace.
$regexBoundaries = $this->makeRegexFromList($this->boundaries);
$regexReserved = $this->makeRegexFromList($this->reserved);
$regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel));
$regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline));
$regexFunction = $this->makeRegexFromList($this->functions);

// All patterns start with \G, so they only match at the offset handed to preg_match().
// NOTE(review): in the number lookahead, "\'` is a literal three-character
// sequence, not a character class - compare ["\'`] in $nextTokenRegexNonReserved.
$this->nextTokenRegexNumber = '/\G(?:\d+(?:\.\d+)?|0x[\da-fA-F]+|0b[01]+)(?=$|\s|"\'`|' . $regexBoundaries . ')/';
$this->nextTokenRegexBoundaryCharacter = '/\G' . $regexBoundaries . '/';
// Keywords must be followed by end of input, whitespace or a boundary character.
$this->nextTokenRegexReservedToplevel = '/\G' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReservedNewline = '/\G' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')/';
$this->nextTokenRegexReserved = '/\G' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')/';
// A function name only counts when an opening parenthesis follows (optional whitespace allowed).
$this->nextTokenRegexFunction = '/\G' . $regexFunction . '(?=\s*\()/';
// Fallback: lazily consume up to end of input, whitespace, a quote character or a boundary.
$this->nextTokenRegexNonReserved = '/\G.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')/';
// Combined pattern: one named group per token type, built from makeTokenizeRegexes().
$this->tokenizeRegex = $this->makeTokenizeRegex($this->makeTokenizeRegexes());
}

/**
Expand Down Expand Up @@ -848,222 +830,101 @@ private function makeRegexFromList(array $values, bool $sorted = false): string
return $regex . ')';
}

/**
 * Builds one regex fragment per token type.
 *
 * The fragments carry neither delimiters nor a \G anchor; makeTokenizeRegex()
 * wraps each one in a named group "t_<type>" and joins them with "|" in array
 * order. Alternation is tried left to right, so the order of the entries
 * decides which token type wins when several fragments could match.
 *
 * @return array<Token::TOKEN_TYPE_*, string>
 */
private function makeTokenizeRegexes(): array
{
    // Set up regular expressions
    $regexBoundaries       = $this->makeRegexFromList($this->boundaries);
    $regexReserved         = $this->makeRegexFromList($this->reserved);
    // Multi-word keywords may be separated by any run of whitespace
    $regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel));
    $regexReservedNewline  = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline));
    $regexFunction         = $this->makeRegexFromList($this->functions);

    // A keyword must be followed by end of input, whitespace or a boundary character
    $keywordEnd = '(?=$|\s|' . $regexBoundaries . ')';

    return [
        Token::TOKEN_TYPE_WHITESPACE => '\s+',
        // line comment: runs up to (not including) the next newline
        Token::TOKEN_TYPE_COMMENT => '(?:--|#)[^\n]*+',
        // block comment: an unterminated comment runs to the end of input
        Token::TOKEN_TYPE_BLOCK_COMMENT => '/\*(?:[^*]+|\*(?!/))*+(?:\*|$)(?:/|$)',
        // 1. backtick quoted string using `` to escape
        // 2. square bracket quoted string (SQL Server) using ]] to escape
        Token::TOKEN_TYPE_BACKTICK_QUOTE => <<<'EOD'
            (?>(?x)
            `(?:[^`]+|`(?:`|$))*+(?:`|$)
            |\[(?:[^\]]+|\](?:\]|$))*+(?:\]|$)
            )
            EOD,
        // 3. double quoted string using "" or \" to escape
        // 4. single quoted string using '' or \' to escape
        Token::TOKEN_TYPE_QUOTE => <<<'EOD'
            (?>(?sx)
            '(?:[^'\\]+|\\(?:.|$)|'(?:'|$))*+(?:'|$)
            |"(?:[^"\\]+|\\(?:.|$)|"(?:"|$))*+(?:"|$)
            )
            EOD,
        // User-defined variable, possibly with quoted name
        // (the quoted forms reuse the two quote groups above via subroutine calls)
        Token::TOKEN_TYPE_VARIABLE => '[@:](?:[\w.$]++|(?&t_' . Token::TOKEN_TYPE_BACKTICK_QUOTE . ')|(?&t_' . Token::TOKEN_TYPE_QUOTE . '))',
        // decimal, binary, or hex
        // The 0x / 0b prefix is accepted in either case because tokenize() applies
        // the pattern to an upper-cased copy of the input, where "0x1f" reads "0X1F".
        // The lookahead uses the same quote character class as the word pattern.
        Token::TOKEN_TYPE_NUMBER => '(?:\d+(?:\.\d+)?|0[xX][\da-fA-F]+|0[bB][01]+)(?=$|\s|["\'`]|' . $regexBoundaries . ')',
        // punctuation and symbols
        Token::TOKEN_TYPE_BOUNDARY => $regexBoundaries,
        // A reserved word cannot be preceded by a '.'
        // this makes it so in "mytable.from", "from" is not considered a reserved word
        Token::TOKEN_TYPE_RESERVED_TOPLEVEL => '(?<!\.)' . $regexReservedToplevel . $keywordEnd,
        Token::TOKEN_TYPE_RESERVED_NEWLINE => '(?<!\.)' . $regexReservedNewline . $keywordEnd,
        Token::TOKEN_TYPE_RESERVED => '(?<!\.)' . $regexReserved . $keywordEnd
            // A function must be succeeded by '('
            // this makes it so "count(" is considered a function, but "count" alone is not function
            . '|' . $regexFunction . '(?=\s*\()',
        // anything else: lazily consume up to end of input, whitespace, a quote or a boundary
        Token::TOKEN_TYPE_WORD => '.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')',
    ];
}

/**
 * Combines the per-type fragments into the single tokenizing pattern.
 *
 * Every fragment becomes a named group "t_<type>"; the groups are joined as
 * alternatives in array order. The result is anchored with \G, so it only
 * matches at the offset given to preg_match(), and uses "~" as delimiter.
 *
 * @param array<Token::TOKEN_TYPE_*, string> $regexes
 */
private function makeTokenizeRegex(array $regexes): string
{
    $parts = [];

    foreach ($regexes as $type => $regex) {
        $parts[] = '(?<t_' . $type . '>' . $regex . ')';
    }

    return '~\G(?:' . implode('|', $parts) . ')~';
}

/**
 * Takes a SQL string and breaks it into tokens.
 * Each token is a Token object carrying a type and the matched text.
 *
 * The combined pattern is applied to an upper-cased copy of the input, while
 * the token text is cut from the original string at the same offset and
 * length, so the original casing is preserved in the tokens.
 *
 * @param string $string The SQL string
 *
 * @throws \RuntimeException If no token can be matched at the current offset.
 */
public function tokenize(string $string): Cursor
{
    $tokenizeRegex = $this->tokenizeRegex;
    $upper         = strtoupper($string);
    $length        = strlen($string);

    $tokens = [];
    $offset = 0;

    while ($offset < $length) {
        // Get the next token and the token type.
        // An explicit check (rather than assert(), which can be disabled) guarantees
        // that the offset always advances and that a PCRE failure is reported.
        if (preg_match($tokenizeRegex, $upper, $matches, 0, $offset) !== 1 || $matches[0] === '') {
            throw new \RuntimeException('Unable to tokenize SQL at byte offset ' . $offset);
        }

        // Drop trailing numeric keys until the last key is a group name;
        // the loop leaves that name ("t_<type>") in $lastMatchesKey.
        while (is_int($lastMatchesKey = array_key_last($matches))) {
            array_pop($matches);
        }

        assert(str_starts_with($lastMatchesKey, 't_'));

        /** @var Token::TOKEN_TYPE_* $tokenType */
        $tokenType = (int) substr($lastMatchesKey, 2);

        $token = new Token($tokenType, substr($string, $offset, strlen($matches[0])));

        $offset += strlen($token->value());

        $tokens[] = $token;
    }

    return new Cursor($tokens);
}
}

0 comments on commit 16ca9e3

Please sign in to comment.