perf(traverse): generate_uid cache available binding names

oxc-project · Sep 8, 2024 · c7c4d6f · c7c4d6f
1 parent afea8d5
commit c7c4d6f
Show file tree

Hide file tree

Showing 3 changed files with 147 additions and 84 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/oxc_traverse/Cargo.toml b/crates/oxc_traverse/Cargo.toml
@@ -31,3 +31,4 @@ oxc_syntax = { workspace = true }
 
 compact_str = { workspace = true }
 memoffset = { workspace = true }
+rustc-hash = { workspace = true }
diff --git a/crates/oxc_traverse/src/context/scoping.rs b/crates/oxc_traverse/src/context/scoping.rs
@@ -1,10 +1,12 @@
-use std::{cell::Cell, str};
+use std::{cell::Cell, collections::hash_map::Entry, str};
 
 use compact_str::{format_compact, CompactString};
+use rustc_hash::FxHashMap;
+
 #[allow(clippy::wildcard_imports)]
 use oxc_ast::{ast::*, visit::Visit};
 use oxc_semantic::{AstNodeId, Reference, ScopeTree, SymbolTable};
-use oxc_span::{Atom, CompactStr, Span, SPAN};
+use oxc_span::{format_compact_str, Atom, CompactStr, Span, SPAN};
 use oxc_syntax::{
     reference::{ReferenceFlags, ReferenceId},
     scope::{ScopeFlags, ScopeId},
@@ -23,6 +25,7 @@ use crate::scopes_collector::ChildScopeCollector;
 pub struct TraverseScoping {
     scopes: ScopeTree,
     symbols: SymbolTable,
+    uid_names: Option<FxHashMap<CompactStr, Vec<CompactStr>>>,
     current_scope_id: ScopeId,
 }
 
@@ -174,20 +177,18 @@ impl TraverseScoping {
     ///
     /// # Potential improvements
     ///
-    /// TODO(improve-on-babel)
+    /// TODO(improve-on-babel):
     ///
     /// This function is fairly expensive, because it aims to replicate Babel's output.
-    /// `name_is_unique` method below searches through every single binding in the entire program
-    /// and does a string comparison on each.
-    /// If the first name tried is already in use, it will repeat that entire search with a new name,
-    /// potentially multiple times.
     ///
-    /// We could improve this in one of 2 ways:
+    /// `init_uid_names` iterates through every single binding and unresolved reference in the entire AST,
+    /// and builds a hashmap of `Vec`s of symbols which could clash with UIDs.
+    /// Once that's built, it's cached, but still `find_uid_name` has to do a hashmap lookup, and if the
+    /// base name is in the hash table, it has to iterate through all the existing symbols for that base.
+    ///
+    /// We could improve this in one of 3 ways:
     ///
-    /// 1. Semantic generate a hash set of all identifier names used in program.
-    ///    Check for uniqueness would then be just 1 x hashset lookup for each name that's tried.
-    ///    This would maintain output parity with Babel.
-    ///    But building the hash set would add some overhead to semantic.
+    /// 1. Build the hash table in `SemanticBuilder` instead of iterating through all symbols again here.
     ///
     /// 2. Use a much simpler method:
     ///
@@ -201,11 +202,21 @@ impl TraverseScoping {
     ///
     /// Minimal cost in semantic, and generating UIDs extremely cheap.
     ///
-    /// This is a slightly different method from Babel, but hopefully close enough that output will
-    /// match Babel for most (or maybe all) test cases.
+    /// This is a slightly different method from Babel, and unfortunately produces UID names
+    /// which differ from Babel for some of its test cases.
+    ///
+    /// 3. If output is being minified anyway, use a method which produces less debuggable output,
+    /// but is even simpler:
+    ///
+    /// * During initial semantic pass, check for any existing identifiers starting with `_`.
+    /// * Find the highest number of leading `_`s for any existing symbol.
+    /// * Generate UIDs with a counter starting at 0, prefixed with number of `_`s one greater than
+    ///   what was found in AST.
+    /// i.e. if source contains identifiers `_foo` and `__bar`, create UIDs names `___0`, `___1`,
+    /// `___2` etc. They'll all be unique within the program.
     pub fn generate_uid(&mut self, name: &str, scope_id: ScopeId, flags: SymbolFlags) -> SymbolId {
         // Get name for UID
-        let name = CompactStr::new(&self.find_uid_name(name));
+        let name = self.find_uid_name(name);
 
         // Add binding to scope
         let symbol_id =
@@ -428,6 +439,7 @@ impl TraverseScoping {
         Self {
             scopes,
             symbols,
+            uid_names: None,
             // Dummy value. Immediately overwritten in `walk_program`.
             current_scope_id: ScopeId::new(0),
         }
@@ -440,82 +452,70 @@ impl TraverseScoping {
     }
 
     /// Find a variable name which can be used as a UID
-    fn find_uid_name(&self, name: &str) -> CompactString {
-        let mut name = create_uid_name_base(name);
-
-        // Try the name without a numerical postfix (i.e. plain `_temp`)
-        if self.name_is_unique(&name) {
-            return name;
-        }
-
-        // It's fairly common that UIDs may need a numerical postfix, so we try to keep string
-        // operations to a minimum for postfixes up to 99 - using `replace_range` on a single
-        // `CompactStr`, rather than generating a new string on each attempt.
-        // Postfixes greater than 99 should be very uncommon, so don't bother optimizing.
-
-        // Try single-digit postfixes (i.e. `_temp2`, `_temp3` ... `_temp9`)
-        name.push('2');
-        if self.name_is_unique(&name) {
-            return name;
+    fn find_uid_name(&mut self, name: &str) -> CompactStr {
+        // If `uid_names` is not already populated, initialize it
+        if self.uid_names.is_none() {
+            self.init_uid_names();
         }
-        for c in b'3'..=b'9' {
-            name.replace_range(name.len() - 1.., str::from_utf8(&[c]).unwrap());
-            if self.name_is_unique(&name) {
-                return name;
+        let uid_names = self.uid_names.as_mut().unwrap();
+
+        let base = get_uid_name_base(name);
+        match uid_names.entry(CompactStr::from(base)) {
+            Entry::Occupied(mut entry) => {
+                let uid = CompactStr::from(get_unique_name(base, entry.get()));
+                entry.get_mut().push(uid.clone());
+                uid
+            }
+            Entry::Vacant(entry) => {
+                let uid = format_compact_str!("_{base}");
+                entry.insert(vec![uid.clone()]);
+                uid
             }
         }
+    }
 
-        // Try double-digit postfixes (i.e. `_temp10` ... `_temp99`)
-        name.replace_range(name.len() - 1.., "1");
-        name.push('0');
-        let mut c1 = b'1';
-        loop {
-            if self.name_is_unique(&name) {
-                return name;
+    /// Initialize `uid_names`.
+    ///
+    /// Iterate through all symbols and unresolved references in AST and identify any var names
+    /// which could clash with UIDs (start with `_`).
+    ///
+    /// Compile a hashmap mapping var names (without leading `_`s or trailing digits) to any symbols
+    /// used in AST matching `<one or more underscores><name><zero or more digits>`.
+    ///
+    /// Once this map is created, generating a UID is a relatively quick operation, rather than
+    /// iterating over all symbols and unresolved references every time generate a UID.
+    fn init_uid_names(&mut self) {
+        let mut uid_names = FxHashMap::<CompactStr, Vec<CompactStr>>::default();
+        for name in self.scopes.root_unresolved_references().keys().chain(self.symbols.names.iter())
+        {
+            if name.as_bytes().first() != Some(&b'_') {
+                continue;
             }
-            for c2 in b'1'..=b'9' {
-                name.replace_range(name.len() - 1.., str::from_utf8(&[c2]).unwrap());
-                if self.name_is_unique(&name) {
-                    return name;
+            // SAFETY: We just checked 1st byte is `_`, so safe to trim it off
+            let base = unsafe { str::from_utf8_unchecked(&name.as_bytes()[1..]) };
+            let base = get_uid_name_base(base);
+
+            match uid_names.entry(CompactStr::from(base)) {
+                Entry::Occupied(mut entry) => {
+                    if !entry.get().iter().any(|existing_name| existing_name == name) {
+                        entry.get_mut().push(name.clone());
+                    }
+                }
+                Entry::Vacant(entry) => {
+                    entry.insert(vec![name.clone()]);
                 }
-            }
-            if c1 == b'9' {
-                break;
-            }
-            c1 += 1;
-            name.replace_range(name.len() - 2.., str::from_utf8(&[c1, b'0']).unwrap());
-        }
-
-        // Try longer postfixes (`_temp100` upwards)
-        let name_base = {
-            name.pop();
-            name.pop();
-            &*name
-        };
-        for n in 100..=usize::MAX {
-            let name = format_compact!("{}{}", name_base, n);
-            if self.name_is_unique(&name) {
-                return name;
             }
         }
-
-        panic!("Cannot generate UID");
-    }
-
-    fn name_is_unique(&self, name: &str) -> bool {
-        !self.scopes.root_unresolved_references().contains_key(name)
-            && !self.symbols.names.iter().any(|n| n.as_str() == name)
+        self.uid_names = Some(uid_names);
     }
 }
 
 /// Create base for UID name based on provided `name`.
-/// i.e. if `name` is "foo", returns "_foo".
-/// We use `CompactString` to avoid any allocations where `name` is less than 22 bytes (the common case).
-fn create_uid_name_base(name: &str) -> CompactString {
-    // Trim `_`s from start, and `0-9`s from end.
-    // Code below is equivalent to
-    // `let name = name.trim_start_matches('_').trim_end_matches(|c: char| c.is_ascii_digit());`
-    // but more efficient as operates on bytes not chars.
+/// Trim `_`s from start and digits from end.
+/// i.e. `__foo123` -> `foo`
+fn get_uid_name_base(name: &str) -> &str {
+    // Equivalent to `name.trim_start_matches('_').trim_end_matches(|c: char| c.is_ascii_digit())`
+    // but more efficient as operates on bytes not chars
     let mut bytes = name.as_bytes();
     while bytes.first() == Some(&b'_') {
         bytes = &bytes[1..];
@@ -525,14 +525,75 @@ fn create_uid_name_base(name: &str) -> CompactString {
     }
     // SAFETY: We started with a valid UTF8 `&str` and have only trimmed off ASCII characters,
     // so remainder must still be valid UTF8
-    let name = unsafe { str::from_utf8_unchecked(bytes) };
+    unsafe { str::from_utf8_unchecked(bytes) }
+}
 
+fn get_unique_name(base: &str, used: &[CompactStr]) -> CompactString {
     // Create `CompactString` prepending name with `_`, and with 1 byte excess capacity.
     // The extra byte is to avoid reallocation if need to add a digit on the end later,
     // which will not be too uncommon.
     // Having to add 2 digits will be uncommon, so we don't allocate 2 extra bytes for 2 digits.
-    let mut str = CompactString::with_capacity(name.len() + 2);
-    str.push('_');
-    str.push_str(name);
-    str
+    let mut name = CompactString::with_capacity(base.len() + 2);
+    name.push('_');
+    name.push_str(base);
+
+    let name_is_unique = |name: &str| !used.iter().any(|used_name| used_name == name);
+
+    // Try the name without a numerical postfix (i.e. plain `_temp`)
+    if name_is_unique(&name) {
+        return name;
+    }
+
+    // It's fairly common that UIDs may need a numerical postfix, so we try to keep string
+    // operations to a minimum for postfixes up to 99 - using `replace_range` on a single
+    // `CompactStr`, rather than generating a new string on each attempt.
+    // Postfixes greater than 99 should be very uncommon, so don't bother optimizing.
+
+    // Try single-digit postfixes (i.e. `_temp2`, `_temp3` ... `_temp9`)
+    name.push('2');
+    if name_is_unique(&name) {
+        return name;
+    }
+    for c in b'3'..=b'9' {
+        name.replace_range(name.len() - 1.., str::from_utf8(&[c]).unwrap());
+        if name_is_unique(&name) {
+            return name;
+        }
+    }
+
+    // Try double-digit postfixes (i.e. `_temp10` ... `_temp99`)
+    name.replace_range(name.len() - 1.., "1");
+    name.push('0');
+    let mut c1 = b'1';
+    loop {
+        if name_is_unique(&name) {
+            return name;
+        }
+        for c2 in b'1'..=b'9' {
+            name.replace_range(name.len() - 1.., str::from_utf8(&[c2]).unwrap());
+            if name_is_unique(&name) {
+                return name;
+            }
+        }
+        if c1 == b'9' {
+            break;
+        }
+        c1 += 1;
+        name.replace_range(name.len() - 2.., str::from_utf8(&[c1, b'0']).unwrap());
+    }
+
+    // Try longer postfixes (`_temp100` upwards)
+    let name_base = {
+        name.pop();
+        name.pop();
+        &*name
+    };
+    for n in 100..=usize::MAX {
+        let name = format_compact!("{}{}", name_base, n);
+        if name_is_unique(&name) {
+            return name;
+        }
+    }
+
+    panic!("Cannot generate UID");
 }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -31,3 +31,4 @@ oxc_syntax = { workspace = true }

		compact_str = { workspace = true }
		memoffset = { workspace = true }
		rustc-hash = { workspace = true }