Skip to content

Commit

Permalink
perf(syntax): further optimize is_identifier_name (#5426)
Browse files Browse the repository at this point in the history
Follow-on after #5425. Further optimize `oxc_syntax::identifier::is_identifier_name` by processing the string in blocks of 8 bytes (then a block of 4 bytes, then single bytes for the remainder), and checking whether all bytes in a block are ASCII in one go, rather than testing each byte individually.
  • Loading branch information
overlookmotel committed Sep 3, 2024
1 parent aeda84f commit bfabd8f
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 14 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/oxc_syntax/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ oxc_span = { workspace = true }
oxc_ast_macros = { workspace = true }
oxc_allocator = { workspace = true }

assert-unchecked = { workspace = true }
unicode-id-start = { workspace = true }
bitflags = { workspace = true }
rustc-hash = { workspace = true }
Expand Down
94 changes: 80 additions & 14 deletions crates/oxc_syntax/src/identifier.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use assert_unchecked::assert_unchecked;

use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};

pub const EOF: char = '\0';
Expand Down Expand Up @@ -143,6 +145,7 @@ pub fn is_identifier_name(name: &str) -> bool {
// the cheap `is_identifier_start_ascii` and `is_identifier_part_ascii` to test bytes.
// Only if a Unicode char is found, fall back to iterating over `char`s, and using the more
// expensive `is_identifier_start_unicode` and `is_identifier_part`.
// As a further optimization, we test if bytes are ASCII in blocks of 8 or 4 bytes, rather than 1 by 1.

// Get first byte. Exit if empty string.
let bytes = name.as_bytes();
Expand All @@ -154,26 +157,83 @@ pub fn is_identifier_name(name: &str) -> bool {
return false;
}

// `'outer` loop never actually loops - only here to allow breaking out of it when Unicode found
#[allow(clippy::never_loop)]
let index = 'outer: loop {
for (index, &b) in bytes[1..].iter().enumerate() {
if b.is_ascii() {
let mut index = 1;
'outer: loop {
// Check blocks of 8 bytes, then 4 bytes, then single bytes
let bytes_remaining = bytes.len() - index;
if bytes_remaining >= 8 {
// Process block of 8 bytes.
// Check that next 8 bytes are all ASCII.
// SAFETY: We checked above that there are at least 8 bytes to read starting at `index`
#[allow(clippy::cast_ptr_alignment)]
let next8_as_u64 = unsafe {
let ptr = bytes.as_ptr().add(index).cast::<u64>();
ptr.read_unaligned()
};
let high_bits = next8_as_u64 & 0x8080_8080_8080_8080;
if high_bits != 0 {
// Some chars in this block are non-ASCII
break;
}

let next8 = next8_as_u64.to_ne_bytes();
for b in next8 {
// SAFETY: We just checked all these bytes are ASCII
unsafe { assert_unchecked!(b.is_ascii()) };
if !is_identifier_part_ascii(b as char) {
return false;
}
} else {
// Unicode byte found
break 'outer index;
}

index += 8;
} else if bytes_remaining >= 4 {
// Process block of 4 bytes.
// Check that next 4 bytes are all ASCII.
// SAFETY: We checked above that there are at least 4 bytes to read starting at `index`
#[allow(clippy::cast_ptr_alignment)]
let next4_as_u32 = unsafe {
let ptr = bytes.as_ptr().add(index).cast::<u32>();
ptr.read_unaligned()
};
let high_bits = next4_as_u32 & 0x8080_8080;
if high_bits != 0 {
// Some chars in this block are non-ASCII
break;
}

let next4 = next4_as_u32.to_ne_bytes();
for b in next4 {
// SAFETY: We just checked all these bytes are ASCII
unsafe { assert_unchecked!(b.is_ascii()) };
if !is_identifier_part_ascii(b as char) {
return false;
}
}

index += 4;
} else {
loop {
let Some(&b) = bytes.get(index) else {
// We got to the end with no non-identifier chars found
return true;
};

if b.is_ascii() {
if !is_identifier_part_ascii(b as char) {
return false;
}
} else {
// Unicode byte found
break 'outer;
}

index += 1;
}
}
// We got to the end without finding any non-identifier or Unicode characters
return true;
};
}

// Unicode byte found - search rest of string (from this byte onwards) as Unicode.
// `index + 1` because `index` returned from the loop is relative to start of `bytes[1..]`.
name[index + 1..].chars()
// Unicode byte found - search rest of string (from this byte onwards) as Unicode
name[index..].chars()
} else {
// First char is Unicode.
// NB: `unwrap()` cannot fail because we already checked the string is not empty.
Expand Down Expand Up @@ -260,6 +320,12 @@ fn is_identifier_name_false() {
"A£",
"A৸",
"A𐄬",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc£",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc৸",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc𐄬",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc£abcdefghijklmnopqrstuvwxyz",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc৸abcdefghijklmnopqrstuvwxyz",
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$abc𐄬abcdefghijklmnopqrstuvwxyz",
// ASCII + Unicode, starting with Unicode
"£A",
"৸A",
Expand Down

0 comments on commit bfabd8f

Please sign in to comment.