From aeda84f904a6d4ea001bfa021752e64283fc56c4 Mon Sep 17 00:00:00 2001 From: overlookmotel <557937+overlookmotel@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:48:32 +0000 Subject: [PATCH] perf(syntax): optimize `is_identifier_name` (#5425) Optimize `oxc_syntax::identifier::is_identifier_name`. Add a fast path for ASCII, which will be the common case. Only fall back to iterating over `char`s and using the more expensive test functions e.g. `is_identifier_start_unicode` if non-ASCII chars are found. --- crates/oxc_syntax/src/identifier.rs | 134 +++++++++++++++++++++++++++- 1 file changed, 132 insertions(+), 2 deletions(-) diff --git a/crates/oxc_syntax/src/identifier.rs b/crates/oxc_syntax/src/identifier.rs index bca43bdd94261..33616de72bff2 100644 --- a/crates/oxc_syntax/src/identifier.rs +++ b/crates/oxc_syntax/src/identifier.rs @@ -136,7 +136,137 @@ pub fn is_identifier_part_unicode(c: char) -> bool { is_id_continue_unicode(c) || c == ZWNJ || c == ZWJ } +/// Determine if a string is a valid JS identifier. +#[allow(clippy::missing_panics_doc)] pub fn is_identifier_name(name: &str) -> bool { - let mut chars = name.chars(); - chars.next().is_some_and(is_identifier_start) && chars.all(is_identifier_part) + // This function contains a fast path for ASCII (common case), iterating over bytes and using + // the cheap `is_identifier_start_ascii` and `is_identifier_part_ascii` to test bytes. + // Only if a Unicode char is found, fall back to iterating over `char`s, and using the more + // expensive `is_identifier_start_unicode` and `is_identifier_part`. + + // Get first byte. Exit if empty string. 
+ let bytes = name.as_bytes(); + let Some(&first_byte) = bytes.first() else { return false }; + + let mut chars = if first_byte.is_ascii() { + // First byte is ASCII + if !is_identifier_start_ascii(first_byte as char) { + return false; + } + + // `'outer` loop never actually loops - only here to allow breaking out of it when Unicode found + #[allow(clippy::never_loop)] + let index = 'outer: loop { + for (index, &b) in bytes[1..].iter().enumerate() { + if b.is_ascii() { + if !is_identifier_part_ascii(b as char) { + return false; + } + } else { + // Unicode byte found + break 'outer index; + } + } + // We got to end without finding any non-identifier or Unicode characters + return true; + }; + + // Unicode byte found - search rest of string (from this byte onwards) as Unicode. + // `index + 1` because `index` returned from the loop is relative to start of `bytes[1..]`. + name[index + 1..].chars() + } else { + // First char is Unicode. + // NB: `unwrap()` cannot fail because we already checked the string is not empty. 
+ let mut chars = name.chars(); + let first_char = chars.next().unwrap(); + if !is_identifier_start_unicode(first_char) { + return false; + } + // Search rest of string as Unicode + chars + }; + + // A Unicode char was found - search rest of string as Unicode + chars.all(is_identifier_part) +} + +#[test] +fn is_identifier_name_true() { + let cases = [ + // 1 char ASCII + "a", + "z", + "A", + "Z", + "_", + "$", + // 1 char Unicode + "µ", // 2 bytes + "ख", // 3 bytes + "𐀀", // 4 bytes + // Multiple chars ASCII + "az", + "AZ", + "_a", + "$Z", + "a0", + "A9", + "_0", + "$9", + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_$", + "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789$", + "$abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_", + // Multiple chars Unicode + "µख𐀀", + // ASCII + Unicode, starting with ASCII + "AµBखC𐀀D", + // ASCII + Unicode, starting with Unicode + "µAखB𐀀", + ]; + + for str in cases { + assert!(is_identifier_name(str)); + } +} + +#[test] +fn is_identifier_name_false() { + let cases = [ + // Empty string + "", + // 1 char ASCII + "0", + "9", + "-", + "~", + "+", + // 1 char Unicode + "£", // 2 bytes + "৸", // 3 bytes + "𐄬", // 4 bytes + // Multiple chars ASCII + "0a", + "9a", + "-a", + "+a", + "a-Z", + "A+z", + "a-", + "a+", + // Multiple chars Unicode + "£৸𐄬", + // ASCII + Unicode, starting with ASCII + "A£", + "A৸", + "A𐄬", + // ASCII + Unicode, starting with Unicode + "£A", + "৸A", + "𐄬A", + ]; + + for str in cases { + assert!(!is_identifier_name(str)); + } }