Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(syntax): optimize is_identifier_name #5425

Merged
merged 1 commit into from
Sep 3, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 132 additions & 2 deletions crates/oxc_syntax/src/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,137 @@ pub fn is_identifier_part_unicode(c: char) -> bool {
is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ
}

/// Determine if a string is a valid JS identifier.
///
/// Returns `false` for the empty string. Otherwise the first character must pass the
/// identifier-start check and every following character must pass the identifier-part check.
///
/// This function contains a fast path for ASCII (common case), iterating over bytes and using
/// the cheap `is_identifier_start_ascii` and `is_identifier_part_ascii` to test bytes.
/// Only if a Unicode char is found does it fall back to iterating over `char`s, using the more
/// expensive `is_identifier_start_unicode` and `is_identifier_part`.
// The only `unwrap()` below is on the first char of a string already known to be non-empty.
#[allow(clippy::missing_panics_doc)]
pub fn is_identifier_name(name: &str) -> bool {
    // Get first byte. Exit if empty string.
    let bytes = name.as_bytes();
    let Some(&first_byte) = bytes.first() else { return false };

    let mut chars = if first_byte.is_ascii() {
        // First byte is ASCII
        if !is_identifier_start_ascii(first_byte as char) {
            return false;
        }

        // Scan the remaining bytes while they are ASCII. The labeled block yields the index
        // (relative to `bytes[1..]`) of the first non-ASCII byte, or returns from the function
        // if the whole string turns out to be ASCII.
        let index = 'ascii: {
            for (index, &b) in bytes[1..].iter().enumerate() {
                if b.is_ascii() {
                    if !is_identifier_part_ascii(b as char) {
                        return false;
                    }
                } else {
                    // Unicode byte found
                    break 'ascii index;
                }
            }
            // We got to end without finding any non-identifier or Unicode characters
            return true;
        };

        // Unicode byte found - search rest of string (from this byte onwards) as Unicode.
        // `index + 1` because `index` is relative to start of `bytes[1..]`.
        // Every byte before this position is ASCII, so the slice starts on a char boundary.
        name[index + 1..].chars()
    } else {
        // First char is Unicode.
        // NB: `unwrap()` cannot fail because we already checked the string is not empty.
        let mut chars = name.chars();
        let first_char = chars.next().unwrap();
        if !is_identifier_start_unicode(first_char) {
            return false;
        }
        // Search rest of string as Unicode
        chars
    };

    // A Unicode char was found - search rest of string as Unicode
    chars.all(is_identifier_part)
}

/// Checks that `is_identifier_name` accepts every string in the list below.
#[test]
fn is_identifier_name_true() {
    let cases = [
        // 1 char ASCII
        "a",
        "z",
        "A",
        "Z",
        "_",
        "$",
        // 1 char Unicode
        "µ", // 2 bytes
        "ख", // 3 bytes
        "𐀀", // 4 bytes
        // Multiple chars ASCII
        "az",
        "AZ",
        "_a",
        "$Z",
        "a0",
        "A9",
        "_0",
        "$9",
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$",
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_$",
        "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789$",
        "$abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_",
        // Multiple chars Unicode
        "µख𐀀",
        // ASCII + Unicode, starting with ASCII
        "AµBखC𐀀D",
        // ASCII + Unicode, starting with Unicode
        "µAखB𐀀",
    ];

    for name in cases {
        // Name the offending input so a failure identifies which case broke.
        assert!(is_identifier_name(name), "expected {name:?} to be accepted as an identifier name");
    }
}

/// Checks that `is_identifier_name` rejects every string in the list below.
#[test]
fn is_identifier_name_false() {
    let cases = [
        // Empty string
        "",
        // 1 char ASCII
        "0",
        "9",
        "-",
        "~",
        "+",
        // 1 char Unicode
        "£", // 2 bytes
        "৸", // 3 bytes
        "𐄬", // 4 bytes
        // Multiple chars ASCII
        "0a",
        "9a",
        "-a",
        "+a",
        "a-Z",
        "A+z",
        "a-",
        "a+",
        // Multiple chars Unicode
        "£৸𐄬",
        // ASCII + Unicode, starting with ASCII
        "A£",
        "A৸",
        "A𐄬",
        // ASCII + Unicode, starting with Unicode
        "£A",
        "৸A",
        "𐄬A",
    ];

    for name in cases {
        // Name the offending input so a failure identifies which case broke.
        assert!(!is_identifier_name(name), "expected {name:?} to be rejected as an identifier name");
    }
}