From 80773983877e3129ca59d16e8a6e08b4badd2960 Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Sun, 8 Sep 2024 12:05:23 +0100 Subject: [PATCH] perf(traverse): `generate_uid` cache available binding names --- Cargo.lock | 1 + crates/oxc_traverse/Cargo.toml | 1 + crates/oxc_traverse/src/context/scoping.rs | 229 +++++++++++++-------- 3 files changed, 147 insertions(+), 84 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b6b01085ffe5c..d86d75ea98e308 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1979,6 +1979,7 @@ dependencies = [ "oxc_semantic", "oxc_span", "oxc_syntax", + "rustc-hash", ] [[package]] diff --git a/crates/oxc_traverse/Cargo.toml b/crates/oxc_traverse/Cargo.toml index ff55057c989fa5..c06a97356db256 100644 --- a/crates/oxc_traverse/Cargo.toml +++ b/crates/oxc_traverse/Cargo.toml @@ -31,3 +31,4 @@ oxc_syntax = { workspace = true } compact_str = { workspace = true } memoffset = { workspace = true } +rustc-hash = { workspace = true } diff --git a/crates/oxc_traverse/src/context/scoping.rs b/crates/oxc_traverse/src/context/scoping.rs index ba6dac845b7dde..5a64e0a697d766 100644 --- a/crates/oxc_traverse/src/context/scoping.rs +++ b/crates/oxc_traverse/src/context/scoping.rs @@ -1,10 +1,12 @@ -use std::{cell::Cell, str}; +use std::{cell::Cell, collections::hash_map::Entry, str}; use compact_str::{format_compact, CompactString}; +use rustc_hash::FxHashMap; + #[allow(clippy::wildcard_imports)] use oxc_ast::{ast::*, visit::Visit}; use oxc_semantic::{AstNodeId, Reference, ScopeTree, SymbolTable}; -use oxc_span::{Atom, CompactStr, Span, SPAN}; +use oxc_span::{format_compact_str, Atom, CompactStr, Span, SPAN}; use oxc_syntax::{ reference::{ReferenceFlags, ReferenceId}, scope::{ScopeFlags, ScopeId}, @@ -23,6 +25,7 @@ use crate::scopes_collector::ChildScopeCollector; pub struct TraverseScoping { scopes: ScopeTree, symbols: SymbolTable, + uid_names: Option>>, current_scope_id: ScopeId, } @@ -174,20 +177,18 @@ impl TraverseScoping { /// /// # Potential improvements /// - /// TODO(improve-on-babel) + /// TODO(improve-on-babel): /// /// This function is fairly expensive, because it aims to replicate Babel's output. - /// `name_is_unique` method below searches through every single binding in the entire program - /// and does a string comparison on each. - /// If the first name tried is already in use, it will repeat that entire search with a new name, - /// potentially multiple times. /// - /// We could improve this in one of 2 ways: + /// `init_uid_names` iterates through every single binding and unresolved reference in the entire AST, + /// and builds a hashmap of `Vec`s of symbols which could clash with UIDs. + /// Once that's built, it's cached, but still `find_uid_name` has to do a hashmap lookup, and if the + /// base name is in the hash table, it has to iterate through all the existing symbols for that base. + /// + /// We could improve this in one of 3 ways: /// - /// 1. Semantic generate a hash set of all identifier names used in program. - /// Check for uniqueness would then be just 1 x hashset lookup for each name that's tried. - /// This would maintain output parity with Babel. - /// But building the hash set would add some overhead to semantic. + /// 1. Build the hash table in `SemanticBuilder` instead of iterating through all symbols again here. /// /// 2. Use a much simpler method: /// @@ -201,11 +202,21 @@ impl TraverseScoping { /// /// Minimal cost in semantic, and generating UIDs extremely cheap. /// - /// This is a slightly different method from Babel, but hopefully close enough that output will - /// match Babel for most (or maybe all) test cases. + /// This is a slightly different method from Babel, and unfortunately produces UID names + /// which differ from Babel for some of its test cases. + /// + /// 3. If output is being minified anyway, use a method which produces less debuggable output, + /// but is even simpler: + /// + /// * During initial semantic pass, check for any existing identifiers starting with `_`. + /// * Find the highest number of leading `_`s for any existing symbol. + /// * Generate UIDs with a counter starting at 0, prefixed with number of `_`s one greater than + /// what was found in AST. + /// i.e. if source contains identifiers `_foo` and `__bar`, create UIDs names `___0`, `___1`, + /// `___2` etc. They'll all be unique within the program. pub fn generate_uid(&mut self, name: &str, scope_id: ScopeId, flags: SymbolFlags) -> SymbolId { // Get name for UID - let name = CompactStr::new(&self.find_uid_name(name)); + let name = self.find_uid_name(name); // Add binding to scope let symbol_id = @@ -428,6 +439,7 @@ impl TraverseScoping { Self { scopes, symbols, + uid_names: None, // Dummy value. Immediately overwritten in `walk_program`. current_scope_id: ScopeId::new(0), } @@ -440,82 +452,70 @@ impl TraverseScoping { } /// Find a variable name which can be used as a UID - fn find_uid_name(&self, name: &str) -> CompactString { - let mut name = create_uid_name_base(name); - - // Try the name without a numerical postfix (i.e. plain `_temp`) - if self.name_is_unique(&name) { - return name; - } - - // It's fairly common that UIDs may need a numerical postfix, so we try to keep string - // operations to a minimum for postfixes up to 99 - using `replace_range` on a single - // `CompactStr`, rather than generating a new string on each attempt. - // Postfixes greater than 99 should be very uncommon, so don't bother optimizing. - - // Try single-digit postfixes (i.e. `_temp2`, `_temp3` ... `_temp9`) - name.push('2'); - if self.name_is_unique(&name) { - return name; + fn find_uid_name(&mut self, name: &str) -> CompactStr { + // If `uid_names` is not already populated, initialize it + if self.uid_names.is_none() { + self.init_uid_names(); } - for c in b'3'..=b'9' { - name.replace_range(name.len() - 1.., str::from_utf8(&[c]).unwrap()); - if self.name_is_unique(&name) { - return name; + let uid_names = self.uid_names.as_mut().unwrap(); + + let base = get_uid_name_base(name); + match uid_names.entry(CompactStr::from(base)) { + Entry::Occupied(mut entry) => { + let uid = CompactStr::from(get_unique_name(base, entry.get())); + entry.get_mut().push(uid.clone()); + uid + } + Entry::Vacant(entry) => { + let uid = format_compact_str!("_{base}"); + entry.insert(vec![uid.clone()]); + uid } } + } - // Try double-digit postfixes (i.e. `_temp10` ... `_temp99`) - name.replace_range(name.len() - 1.., "1"); - name.push('0'); - let mut c1 = b'1'; - loop { - if self.name_is_unique(&name) { - return name; + /// Initialize `uid_names`. + /// + /// Iterate through all symbols and unresolved references in AST and identify any var names + /// which could clash with UIDs (start with `_`). + /// + /// Compile a hashmap mapping var names (without leading `_`s or trailing digits) to any symbols + /// used in AST matching `<1 or more underscores>name`. + /// + /// Once this map is created, generating a UID is a relatively quick operation, rather than + /// iterating over all symbols and unresolved references every time generate a UID. + fn init_uid_names(&mut self) { + let mut uid_names = FxHashMap::>::default(); + for name in self.scopes.root_unresolved_references().keys().chain(self.symbols.names.iter()) + { + if name.as_bytes().first() != Some(&b'_') { + continue; } - for c2 in b'1'..=b'9' { - name.replace_range(name.len() - 1.., str::from_utf8(&[c2]).unwrap()); - if self.name_is_unique(&name) { - return name; + // SAFETY: We just checked 1st byte is `_`, so safe to trim it off + let base = unsafe { str::from_utf8_unchecked(&name.as_bytes()[1..]) }; + let base = get_uid_name_base(base); + + match uid_names.entry(CompactStr::from(base)) { + Entry::Occupied(mut entry) => { + if !entry.get().iter().any(|existing_name| existing_name == name) { + entry.get_mut().push(name.clone()); + } + } + Entry::Vacant(entry) => { + entry.insert(vec![name.clone()]); } - } - if c1 == b'9' { - break; - } - c1 += 1; - name.replace_range(name.len() - 2.., str::from_utf8(&[c1, b'0']).unwrap()); - } - - // Try longer postfixes (`_temp100` upwards) - let name_base = { - name.pop(); - name.pop(); - &*name - }; - for n in 100..=usize::MAX { - let name = format_compact!("{}{}", name_base, n); - if self.name_is_unique(&name) { - return name; } } - - panic!("Cannot generate UID"); - } - - fn name_is_unique(&self, name: &str) -> bool { - !self.scopes.root_unresolved_references().contains_key(name) - && !self.symbols.names.iter().any(|n| n.as_str() == name) + self.uid_names = Some(uid_names); } } /// Create base for UID name based on provided `name`. -/// i.e. if `name` is "foo", returns "_foo". -/// We use `CompactString` to avoid any allocations where `name` is less than 22 bytes (the common case). -fn create_uid_name_base(name: &str) -> CompactString { - // Trim `_`s from start, and `0-9`s from end. - // Code below is equivalent to - // `let name = name.trim_start_matches('_').trim_end_matches(|c: char| c.is_ascii_digit());` - // but more efficient as operates on bytes not chars. +/// Trim `_`s from start and digits from end. +/// i.e. `__foo123` -> `foo` +fn get_uid_name_base(name: &str) -> &str { + // Equivalent to `name.trim_start_matches('_').trim_end_matches(|c: char| c.is_ascii_digit())` + // but more efficient as operates on bytes not chars let mut bytes = name.as_bytes(); while bytes.first() == Some(&b'_') { bytes = &bytes[1..]; @@ -525,14 +525,75 @@ fn create_uid_name_base(name: &str) -> CompactString { } // SAFETY: We started with a valid UTF8 `&str` and have only trimmed off ASCII characters, // so remainder must still be valid UTF8 - let name = unsafe { str::from_utf8_unchecked(bytes) }; + unsafe { str::from_utf8_unchecked(bytes) } +} +fn get_unique_name(base: &str, used: &[CompactStr]) -> CompactString { // Create `CompactString` prepending name with `_`, and with 1 byte excess capacity. // The extra byte is to avoid reallocation if need to add a digit on the end later, // which will not be too uncommon. // Having to add 2 digits will be uncommon, so we don't allocate 2 extra bytes for 2 digits. - let mut str = CompactString::with_capacity(name.len() + 2); - str.push('_'); - str.push_str(name); - str + let mut name = CompactString::with_capacity(base.len() + 2); + name.push('_'); + name.push_str(base); + + let name_is_unique = |name: &str| !used.iter().any(|used_name| used_name == name); + + // Try the name without a numerical postfix (i.e. plain `_temp`) + if name_is_unique(&name) { + return name; + } + + // It's fairly common that UIDs may need a numerical postfix, so we try to keep string + // operations to a minimum for postfixes up to 99 - using `replace_range` on a single + // `CompactStr`, rather than generating a new string on each attempt. + // Postfixes greater than 99 should be very uncommon, so don't bother optimizing. + + // Try single-digit postfixes (i.e. `_temp2`, `_temp3` ... `_temp9`) + name.push('2'); + if name_is_unique(&name) { + return name; + } + for c in b'3'..=b'9' { + name.replace_range(name.len() - 1.., str::from_utf8(&[c]).unwrap()); + if name_is_unique(&name) { + return name; + } + } + + // Try double-digit postfixes (i.e. `_temp10` ... `_temp99`) + name.replace_range(name.len() - 1.., "1"); + name.push('0'); + let mut c1 = b'1'; + loop { + if name_is_unique(&name) { + return name; + } + for c2 in b'1'..=b'9' { + name.replace_range(name.len() - 1.., str::from_utf8(&[c2]).unwrap()); + if name_is_unique(&name) { + return name; + } + } + if c1 == b'9' { + break; + } + c1 += 1; + name.replace_range(name.len() - 2.., str::from_utf8(&[c1, b'0']).unwrap()); + } + + // Try longer postfixes (`_temp100` upwards) + let name_base = { + name.pop(); + name.pop(); + &*name + }; + for n in 100..=usize::MAX { + let name = format_compact!("{}{}", name_base, n); + if name_is_unique(&name) { + return name; + } + } + + panic!("Cannot generate UID"); }