Minor optimizations to the codegen of TaskFnInputFunction (#8304)
## What?

I noticed a number of small optimizations that could be applied to these hot and heavily-monomorphized functions:

- Using `with_context()` instead of `.context()`, to avoid evaluating the error message in the common case that it's unused. I also tried `concat!()` since this can be a static string, but the resulting binary is slightly larger, and we don't need to optimize for the unlikely error case.
- Extracting the parts of the monomorphized functions that don't require the type parameters into separate non-generic functions. While the goal here is mostly to reduce binary size and compilation time, this optimization on its own seems to help with the runtime benchmarks too (though I didn't test it rigorously in isolation). A rough sketch of both tricks is included at the end of this description.

Here's a section from [Rust for Rustaceans](https://rust-for-rustaceans.com/) explaining this "non-generic function" trick:

![Screenshot from 2024-06-04 20-26-20.png](https://graphite-user-uploaded-assets-prod.s3.amazonaws.com/HAZVitxRNnZz8QMiPn4a/91a2c00d-0e43-49c7-9e67-019b98c0ca55.png)

## Binary Size?

Slightly negative, at least for stripped debug builds:

```
time pnpm pack-next
```

```
-rw-r--r-- 1 bgw bgw 167895040 Jun  4 17:20 next-swc.after.tar
-rw-r--r-- 1 bgw bgw 168622080 Jun  4 15:37 next-swc.before.tar
```

## Runtime Performance?

Using https://github.com/bgw/benchmark-scripts/

### Microbenchmark (`turbo_tasks_memory_stress/fibonacci/200`)

```
$ TURBOPACK_BENCH_STRESS=yes cargo bench -p turbo-tasks-memory -- fibonacci/200
   Compiling turbo-tasks v0.1.0 (/home/bgw/turbo/crates/turbo-tasks)
   Compiling turbo-tasks-memory v0.1.0 (/home/bgw/turbo/crates/turbo-tasks-memory)
   Compiling turbo-tasks-testing v0.1.0 (/home/bgw/turbo/crates/turbo-tasks-testing)
    Finished `bench` profile [optimized] target(s) in 10.79s
     Running benches/mod.rs (target/release/deps/mod-8c0f970371f8713d)
turbo_tasks_memory_stress/fibonacci/200
                        time:   [64.420 ms 64.683 ms 64.941 ms]
                        thrpt:  [309.53 Kelem/s 310.76 Kelem/s 312.03 Kelem/s]
                 change:
                        time:   [-2.2828% -1.7587% -1.2206%] (p = 0.00 < 0.05)
                        thrpt:  [+1.2357% +1.7902% +2.3361%]
                        Performance has improved.
Found 1 outliers among 20 measurements (5.00%)
  1 (5.00%) low mild
```

### "Realistic" Benchmark (`bench_startup/Turbopack CSR/1000 modules`)

The difference is small. I patched the benchmark to increase the number of iterations so I could get something statistically significant.

```
diff --git a/crates/turbopack-bench/src/lib.rs b/crates/turbopack-bench/src/lib.rs
index 4e3df12db0..d950d76071 100644
--- a/crates/turbopack-bench/src/lib.rs
+++ b/crates/turbopack-bench/src/lib.rs
@@ -35,8 +35,8 @@ pub mod util;
 
 pub fn bench_startup(c: &mut Criterion, bundlers: &[Box<dyn Bundler>]) {
     let mut g = c.benchmark_group("bench_startup");
-    g.sample_size(10);
-    g.measurement_time(Duration::from_secs(60));
+    g.sample_size(100);
+    g.measurement_time(Duration::from_secs(600));
 
     bench_startup_internal(g, false, bundlers);
 }
```

```
cargo bench -p turbopack-cli -- bench_startup
```

```
    Finished `bench` profile [optimized] target(s) in 1.30s
     Running benches/mod.rs (target/release/deps/mod-2681e324dfd90da1)
bench_startup/Turbopack CSR/1000 modules
                        time:   [2.2684 s 2.2717 s 2.2750 s]
                        change: [-1.9365% -1.7387% -1.5602%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) low mild
  2 (2.00%) high mild
```

## Build Speed?

Not enough of a difference to measure.

```
rm -rf target/ && time cargo build -p turbopack-cli
```

Before:

```
real	10m42.174s
```

After:

```
real	10m40.735s
```
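## Sketch of the Two Tricks

The actual `TaskFnInputFunction` code isn't reproduced here, so the following is only a minimal, hypothetical sketch of the two techniques (the `get_raw_arg`/`get_arg` names are made up for illustration, not taken from turbo-tasks): the argument lookup and error construction live in a non-generic helper that is compiled once, while the thin generic wrapper is the only part duplicated per type, and both use `with_context()` so the error string is only built on the error path.

```rust
use anyhow::{Context, Result};
use std::str::FromStr;

// Non-generic helper: the bounds check and error construction don't depend on
// `T`, so this code is compiled once instead of being duplicated into every
// monomorphized copy of `get_arg`. (Hypothetical name, not the real code.)
fn get_raw_arg<'a>(args: &'a [String], index: usize) -> Result<&'a str> {
    args.get(index).map(String::as_str).with_context(|| {
        // `with_context` takes a closure, so this `format!` only runs if the
        // argument is actually missing.
        format!(
            "task was called with {} arguments; expected at least {}",
            args.len(),
            index + 1
        )
    })
}

// Generic wrapper: only this thin parsing layer is monomorphized per `T`.
fn get_arg<T>(args: &[String], index: usize) -> Result<T>
where
    T: FromStr,
    T::Err: std::error::Error + Send + Sync + 'static,
{
    get_raw_arg(args, index)?.parse::<T>().with_context(|| {
        format!(
            "failed to parse argument {index} as {}",
            std::any::type_name::<T>()
        )
    })
}

fn main() -> Result<()> {
    let args = vec!["42".to_string()];
    let n: u32 = get_arg(&args, 0)?;
    println!("parsed {n}");
    Ok(())
}
```

With `.context(format!(...))` the message would be allocated on every call, and with everything inlined into the generic function the lookup and error-formatting machinery would be codegen'd once per instantiation; the split plus `with_context()` avoids both.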