From ead4649b0c21a98534c36e7755edac68052b3c26 Mon Sep 17 00:00:00 2001 From: Cody Gunton Date: Mon, 30 Sep 2024 11:51:14 -0400 Subject: [PATCH] feat: new Tracy Time preset and more efficient univariate extension (#8789) We add a new script `benchmark_tracy_build_mainframe_run_local.sh` that will copy the executable to local and run it there instead. We also add a new tracy-time preset that reuses the BB_OP_COUNT_TIME statements around the codebase to mark zones. We add a new `self_extend_from` function that is more efficient than the existing `extend_to` function. From using the new `self_extend_from` function, we see improvements to the combiner round. ``` -------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... -------------------------------------------------------------------------------- ClientIVCBench/Full/6 32719 ms 30147 ms 1 Arithmetic::accumulate=3.89126M Arithmetic::accumulate(t)=7.36388G Auxiliary::accumulate=1.98134M Auxiliary::accumulate(t)=13.3567G COMMIT::databus=108 COMMIT::databus(t)=8.37903M COMMIT::databus_inverses=36 COMMIT::databus_inverses(t)=11.6069M COMMIT::ecc_op_wires=48 COMMIT::ecc_op_wires(t)=38.5805M COMMIT::lookup_counts_tags=12 COMMIT::lookup_counts_tags(t)=106.101M COMMIT::lookup_inverses=12 COMMIT::lookup_inverses(t)=257.04M COMMIT::wires=24 COMMIT::wires(t)=2.21684G COMMIT::z_perm=12 COMMIT::z_perm(t)=2.31257G DatabusRead::accumulate=447 DatabusRead::accumulate(t)=1.64643M Decider::construct_proof=1 Decider::construct_proof(t)=1.585G DeciderProvingKey(Circuit&)=12 DeciderProvingKey(Circuit&)(t)=2.60037G DeltaRange::accumulate=1.87876M DeltaRange::accumulate(t)=4.23593G ECCVMProver(CircuitBuilder&)=1 ECCVMProver(CircuitBuilder&)(t)=229.56M ECCVMProver::construct_proof=1 ECCVMProver::construct_proof(t)=2.59633G Elliptic::accumulate=183.692k Elliptic::accumulate(t)=464.215M Goblin::merge=23 Goblin::merge(t)=116.512M Lookup::accumulate=1.66363M Lookup::accumulate(t)=3.69157G
MegaFlavor::get_row=6.18564M MegaFlavor::get_row(t)=4.21736G OinkProver::execute_grand_product_computation_round=12 OinkProver::execute_grand_product_computation_round(t)=3.58134G OinkProver::execute_log_derivative_inverse_round=12 OinkProver::execute_log_derivative_inverse_round(t)=2.47144G OinkProver::execute_preamble_round=12 OinkProver::execute_preamble_round(t)=151.77k OinkProver::execute_sorted_list_accumulator_round=12 OinkProver::execute_sorted_list_accumulator_round(t)=681.46M OinkProver::execute_wire_commitments_round=12 OinkProver::execute_wire_commitments_round(t)=1.69618G OinkProver::generate_alphas_round=12 OinkProver::generate_alphas_round(t)=3.44188M Permutation::accumulate=10.6427M Permutation::accumulate(t)=40.6403G PoseidonExt::accumulate=30.452k PoseidonExt::accumulate(t)=77.2342M PoseidonInt::accumulate=210.454k PoseidonInt::accumulate(t)=374.93M ProtogalaxyProver::prove=11 ProtogalaxyProver::prove(t)=19.5955G ProtogalaxyProver_::combiner_quotient_round=11 ProtogalaxyProver_::combiner_quotient_round(t)=8.44199G ProtogalaxyProver_::compute_row_evaluations=11 ProtogalaxyProver_::compute_row_evaluations(t)=1.97625G ProtogalaxyProver_::perturbator_round=11 ProtogalaxyProver_::perturbator_round(t)=2.87543G ProtogalaxyProver_::run_oink_prover_on_each_incomplete_key=11 ProtogalaxyProver_::run_oink_prover_on_each_incomplete_key(t)=7.60574G ProtogalaxyProver_::update_target_sum_and_fold=11 ProtogalaxyProver_::update_target_sum_and_fold(t)=672.3M TranslatorCircuitBuilder::constructor=1 TranslatorCircuitBuilder::constructor(t)=33.0787M TranslatorProver=1 TranslatorProver(t)=46.9695M TranslatorProver::construct_proof=1 TranslatorProver::construct_proof(t)=838.946M batch_mul_with_endomorphism=16 batch_mul_with_endomorphism(t)=407.175M commit=543 commit(t)=6.55046G commit_sparse=36 commit_sparse(t)=11.5922M compute_combiner=11 compute_combiner(t)=8.04092G compute_perturbator=11 compute_perturbator(t)=2.87517G compute_univariate=51
compute_univariate(t)=2.18992G construct_circuits=12 construct_circuits(t)=4.20212G pippenger=215 pippenger(t)=101.133M pippenger_unsafe_optimized_for_non_dyadic_polys=543 pippenger_unsafe_optimized_for_non_dyadic_polys(t)=6.54615G Benchmarking lock deleted. client_ivc_bench.json 100% 6912 245.8KB/s 00:00 function ms % sum construct_circuits(t) 4202 13.23% DeciderProvingKey(Circuit&)(t) 2600 8.19% ProtogalaxyProver::prove(t) 19596 61.69% Decider::construct_proof(t) 1585 4.99% ECCVMProver(CircuitBuilder&)(t) 230 0.72% ECCVMProver::construct_proof(t) 2596 8.17% TranslatorProver::construct_proof(t) 839 2.64% Goblin::merge(t) 117 0.37% Total time accounted for: 31764ms/32719ms = 97.08% Major contributors: function ms % sum commit(t) 6550 20.62% compute_combiner(t) 8041 25.31% compute_perturbator(t) 2875 9.05% compute_univariate(t) 2190 6.89% Breakdown of ProtogalaxyProver::prove: ProtogalaxyProver_::run_oink_prover_on_each_incomplete_key(t) 7606 38.81% ProtogalaxyProver_::perturbator_round(t) 2875 14.67% ProtogalaxyProver_::combiner_quotient_round(t) 8442 43.08% ProtogalaxyProver_::update_target_sum_and_fold(t) 672 3.43% Relation contributions (times to be interpreted relatively): Total time accounted for (ms): 70206 operation ms % sum Arithmetic::accumulate(t) 7364 10.49% Permutation::accumulate(t) 40640 57.89% Lookup::accumulate(t) 3692 5.26% DeltaRange::accumulate(t) 4236 6.03% Elliptic::accumulate(t) 464 0.66% Auxiliary::accumulate(t) 13357 19.02% EccOp::accumulate(t) 0 0.00% DatabusRead::accumulate(t) 2 0.00% PoseidonExt::accumulate(t) 77 0.11% PoseidonInt::accumulate(t) 375 0.53% Commitment contributions: Total time accounted for (ms): 4951 operation ms % sum COMMIT::wires(t) 2217 44.77% COMMIT::z_perm(t) 2313 46.71% COMMIT::databus(t) 8 0.17% COMMIT::ecc_op_wires(t) 39 0.78% COMMIT::lookup_inverses(t) 257 5.19% COMMIT::databus_inverses(t) 12 0.23% ``` --------- Co-authored-by: lucasxia01 --- barretenberg/cpp/CMakeLists.txt | 9 ++++
barretenberg/cpp/CMakePresets.json | 33 +++++++++++-- ...nchmark_tracy_build_mainframe_run_local.sh | 49 +++++++++++++++++++ ...hmark_tracy_build_mainframe_view_local.sh} | 0 .../client_ivc_bench/client_ivc.bench.cpp | 2 +- .../protogalaxy_bench/protogalaxy.bench.cpp | 2 +- .../relations_bench/barycentric.bench.cpp | 42 ++++++++++++++-- .../benchmark/ultra_bench/mock_circuits.hpp | 2 + .../cpp/src/barretenberg/common/op_count.hpp | 12 ++++- .../execution_trace/execution_trace.cpp | 20 ++++++++ .../execution_trace/execution_trace.hpp | 9 ++++ .../cpp/src/barretenberg/flavor/flavor.hpp | 3 ++ .../cpp/src/barretenberg/goblin/goblin.hpp | 24 +++++++++ .../composer/permutation_lib.hpp | 9 ++++ .../polynomials/barycentric.test.cpp | 12 +++++ .../barretenberg/polynomials/univariate.hpp | 32 ++++++++++-- .../protogalaxy/protogalaxy_prover_impl.hpp | 6 +++ .../protogalaxy_prover_internal.hpp | 8 +-- .../srs/factories/file_crs_factory.hpp | 3 ++ .../stdlib_circuit_builders/ultra_flavor.hpp | 6 +++ .../src/barretenberg/sumcheck/sumcheck.hpp | 6 +++ .../barretenberg/sumcheck/sumcheck_round.hpp | 6 +++ .../barretenberg/ultra_honk/decider_keys.hpp | 11 ++--- .../ultra_honk/decider_prover.cpp | 3 ++ .../ultra_honk/decider_proving_key.hpp | 28 +++++++++++ .../barretenberg/ultra_honk/oink_prover.cpp | 15 ++++++ 26 files changed, 328 insertions(+), 24 deletions(-) create mode 100755 barretenberg/cpp/scripts/benchmark_tracy_build_mainframe_run_local.sh rename barretenberg/cpp/scripts/{benchmark_tracy.sh => benchmark_tracy_build_mainframe_view_local.sh} (100%) diff --git a/barretenberg/cpp/CMakeLists.txt b/barretenberg/cpp/CMakeLists.txt index 81b75435a15..db289b54800 100644 --- a/barretenberg/cpp/CMakeLists.txt +++ b/barretenberg/cpp/CMakeLists.txt @@ -61,6 +61,15 @@ else() SET(TRACY_LIBS) endif() +if(TRACY_PROFILE_MEMORY) + add_compile_options(-DTRACY_MEMORY) +endif() + +if(TRACY_PROFILE_TIME) + add_compile_options(-DTRACY_TIME) +endif() + + if(ENABLE_ASAN) 
add_compile_options(-fsanitize=address) add_link_options(-fsanitize=address) diff --git a/barretenberg/cpp/CMakePresets.json b/barretenberg/cpp/CMakePresets.json index 643e4590ae2..c525451b8c0 100644 --- a/barretenberg/cpp/CMakePresets.json +++ b/barretenberg/cpp/CMakePresets.json @@ -111,13 +111,31 @@ } }, { - "name": "tracy", + "name": "tracy-memory", "displayName": "Release build with tracy, optimized for memory tracking", "description": "Release build with tracy, optimized for memory tracking", "inherits": "clang16", - "binaryDir": "build-tracy", + "binaryDir": "build-tracy-memory", "cacheVariables": { - "ENABLE_TRACY": "ON" + "ENABLE_TRACY": "ON", + "TRACY_PROFILE_MEMORY": "ON" + } + }, + { + "name": "tracy-time", + "displayName": "Build for tracy time profiling", + "description": "Build for tracy time profiling", + "binaryDir": "build-tracy-time", + "inherits": "clang16", + "environment": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", + "CFLAGS": "-g -fno-omit-frame-pointer", + "CXXFLAGS": "-g -fno-omit-frame-pointer", + "LDFLAGS": "-g -fno-omit-frame-pointer -rdynamic" + }, + "cacheVariables": { + "ENABLE_TRACY": "ON", + "TRACY_PROFILE_TIME": "ON" } }, { @@ -472,9 +490,14 @@ "configurePreset": "clang16-dbg" }, { - "name": "tracy", + "name": "tracy-memory", + "inherits": "default", + "configurePreset": "tracy-memory" + }, + { + "name": "tracy-time", "inherits": "default", - "configurePreset": "tracy" + "configurePreset": "tracy-time" }, { "name": "clang16-pic", diff --git a/barretenberg/cpp/scripts/benchmark_tracy_build_mainframe_run_local.sh b/barretenberg/cpp/scripts/benchmark_tracy_build_mainframe_run_local.sh new file mode 100755 index 00000000000..b7004a0df26 --- /dev/null +++ b/barretenberg/cpp/scripts/benchmark_tracy_build_mainframe_run_local.sh @@ -0,0 +1,49 @@ + +# NOTE: intended to be run from one's external computer, connecting to Aztec mainframe +# IF ON YOUR LOCAL COMPUTER USE NORMAL INTERACTIVE TRACY WORKFLOW +# the benchmark runs with headless
capture and then we copy the trace file and run tracy profiler +# This is thus only really useful internally at Aztec, sorry external folks. It can be easily tweaked +# however for any SSH setup, especially an ubuntu one. +# on local machine run: +# export USER=... +# export PRESET=...tracy for memory or tracy-gates for circuit gates... +# ssh $USER-box "cat ~/aztec-packages/barretenberg/cpp/scripts/benchmark_tracy.sh" | bash /dev/stdin $USER +set -eux +USER=${1:-$USER} +BOX=$USER-box +BENCHMARK=${2:-protogalaxy_bench} +COMMAND=${3:-./bin/$BENCHMARK --benchmark_filter=fold_k/17} + +# Can also set PRESET=tracy-gates env variable +PRESET=${PRESET:-tracy-time} + +wait # TODO(AD) hack - not sure why needed +! [ -d ~/tracy ] && git clone https://github.com/wolfpld/tracy ~/tracy +cd ~/tracy +git checkout 075395620a504c0cdcaf9bab3d196db16a043de7 # release 0.11.0 +cmake -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release +cmake --build profiler/build --parallel +cd - + +ssh $BOX " + set -eux ; + cd ~/aztec-packages/barretenberg/cpp/ ; + cmake --preset $PRESET && cmake --build --preset $PRESET --target $BENCHMARK ; +" & +wait +if [ ! -d build-$PRESET/bin ]; then + echo build-$PRESET/bin; + mkdir -p build-$PRESET/bin; +fi +scp $BOX:/mnt/user-data/$USER/aztec-packages/barretenberg/cpp/build-$PRESET/bin/$BENCHMARK build-$PRESET/bin/. ; +! [ -d ~/tracy ] && git clone https://github.com/wolfpld/tracy ~/tracy ; +cd ~/tracy/capture ; + git checkout 075395620a504c0cdcaf9bab3d196db16a043de7 ; +mkdir -p build && cd build && cmake .. 
&& make -j ; + +./tracy-capture -a 127.0.0.1 -f -o ../trace-$BENCHMARK & +sleep 0.1 ; +cd ~/aztec-packages/barretenberg/cpp/build-$PRESET/ +$COMMAND ; + +~/tracy/profiler/build/tracy-profiler ~/tracy/capture/trace-$BENCHMARK diff --git a/barretenberg/cpp/scripts/benchmark_tracy.sh b/barretenberg/cpp/scripts/benchmark_tracy_build_mainframe_view_local.sh similarity index 100% rename from barretenberg/cpp/scripts/benchmark_tracy.sh rename to barretenberg/cpp/scripts/benchmark_tracy_build_mainframe_view_local.sh diff --git a/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp index 3410f03faf6..12cc8718749 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/client_ivc_bench/client_ivc.bench.cpp @@ -45,7 +45,7 @@ BENCHMARK_DEFINE_F(ClientIVCBench, Full)(benchmark::State& state) } } -#define ARGS Arg(ClientIVCBench::NUM_ITERATIONS_MEDIUM_COMPLEXITY) +#define ARGS Arg(ClientIVCBench::NUM_ITERATIONS_MEDIUM_COMPLEXITY)->Arg(2) BENCHMARK_REGISTER_F(ClientIVCBench, Full)->Unit(benchmark::kMillisecond)->ARGS; diff --git a/barretenberg/cpp/src/barretenberg/benchmark/protogalaxy_bench/protogalaxy.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/protogalaxy_bench/protogalaxy.bench.cpp index 105bbb564ce..864f8d7f383 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/protogalaxy_bench/protogalaxy.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/protogalaxy_bench/protogalaxy.bench.cpp @@ -75,7 +75,7 @@ void fold_k(State& state) noexcept } } -BENCHMARK(vector_of_evaluations)->DenseRange(15, 21)->Unit(kMillisecond); +BENCHMARK(vector_of_evaluations)->DenseRange(15, 21)->Unit(kMillisecond)->Iterations(1); BENCHMARK(compute_row_evaluations)->DenseRange(15, 21)->Unit(kMillisecond); // We stick to just k=1 for compile-time reasons. 
BENCHMARK(fold_k)->/* vary the circuit size */ DenseRange(14, 20)->Unit(kMillisecond); diff --git a/barretenberg/cpp/src/barretenberg/benchmark/relations_bench/barycentric.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/relations_bench/barycentric.bench.cpp index 64db936c71a..243f4f4137c 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/relations_bench/barycentric.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/relations_bench/barycentric.bench.cpp @@ -14,14 +14,50 @@ using bb::Univariate; namespace bb::benchmark { -void extend_2_to_6(State& state) noexcept +void extend_2_to_11(State& state) noexcept { auto univariate = Univariate::get_random(); for (auto _ : state) { - DoNotOptimize(univariate.extend_to<6>()); + DoNotOptimize(univariate.extend_to<11>()); } } -BENCHMARK(extend_2_to_6); + +// 93.9ns goes down to 62.7ns +// Theoretical min: 1 sub, 9 additions at about 3.8ns each, 38ns +void fake_extend_2_to_11(State& state) noexcept +{ + std::array univariate; + std::generate(univariate.begin(), univariate.end(), [&]() { return FF::random_element(); }); + + const auto extend_to_11 = [](auto& arr) { + FF tmp = arr[1]; + const FF delta = tmp - arr[0]; + for (size_t idx = 2; idx < 10; idx++) { + arr[idx] = (tmp += delta); // fused ~> 62.9ns; non-fused ~>69.5ns + } + arr[10] = tmp; // save one +=; + return arr; + }; + + for (auto _ : state) { + DoNotOptimize(extend_to_11(univariate)); + } +} + +// 93.9ns goes down to 62.7ns +// Theoretical min: 1 sub, 9 additions at about 3.8ns each, 38ns +void self_extend_2_to_11(State& state) noexcept +{ + auto univariate = Univariate::get_random(); + + for (auto _ : state) { + univariate.self_extend_from<2>(); + } +} + +BENCHMARK(extend_2_to_11); +BENCHMARK(fake_extend_2_to_11); +BENCHMARK(self_extend_2_to_11); } // namespace bb::benchmark diff --git a/barretenberg/cpp/src/barretenberg/benchmark/ultra_bench/mock_circuits.hpp b/barretenberg/cpp/src/barretenberg/benchmark/ultra_bench/mock_circuits.hpp index
27e53e13886..0d2787095f3 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/ultra_bench/mock_circuits.hpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/ultra_bench/mock_circuits.hpp @@ -54,7 +54,9 @@ Prover get_prover(void (*test_circuit_function)(typename Prover::Flavor::Circuit Composer composer; return composer.create_prover(builder); } else { +#ifdef TRACY_MEMORY ZoneScopedN("creating prover"); +#endif return Prover(builder); } }; diff --git a/barretenberg/cpp/src/barretenberg/common/op_count.hpp b/barretenberg/cpp/src/barretenberg/common/op_count.hpp index 8e4711d001a..af24ecb1e10 100644 --- a/barretenberg/cpp/src/barretenberg/common/op_count.hpp +++ b/barretenberg/cpp/src/barretenberg/common/op_count.hpp @@ -2,6 +2,7 @@ #pragma once #include +#include #ifndef BB_USE_OP_COUNT // require a semicolon to appease formatters // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) @@ -11,12 +12,19 @@ // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) #define BB_OP_COUNT_CYCLES_NAME(name) (void)0 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) -#define BB_OP_COUNT_TIME_NAME(name) (void)0 -// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) #define BB_OP_COUNT_CYCLES() (void)0 +#ifndef TRACY_TIME +// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) +#define BB_OP_COUNT_TIME_NAME(name) (void)0 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) #define BB_OP_COUNT_TIME() (void)0 #else +// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) +#define BB_OP_COUNT_TIME_NAME(name) ZoneScopedN(name) +// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) +#define BB_OP_COUNT_TIME() BB_OP_COUNT_TIME_NAME(__func__) +#endif +#else /** * Provides an abstraction that counts operations based on function names. * For efficiency, we spread out counts across threads. 
diff --git a/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.cpp b/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.cpp index add41ae4577..82e9def06ae 100644 --- a/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.cpp +++ b/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.cpp @@ -8,7 +8,9 @@ namespace bb { template void ExecutionTrace_::populate_public_inputs_block(Builder& builder) { +#ifdef TRACY_MEMORY ZoneScopedN("populate_public_inputs_block"); +#endif // Update the public inputs block for (const auto& idx : builder.public_inputs) { for (size_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) { @@ -27,7 +29,10 @@ template void ExecutionTrace_::populate_public_inputs_blo template void ExecutionTrace_::populate(Builder& builder, typename Flavor::ProvingKey& proving_key, bool is_structured) { + +#ifdef TRACY_MEMORY ZoneScopedN("trace populate"); +#endif // Share wire polynomials, selector polynomials between proving key and builder and copy cycles from raw circuit // data auto trace_data = construct_trace_data(builder, proving_key, is_structured); @@ -36,18 +41,27 @@ void ExecutionTrace_::populate(Builder& builder, typename Flavor::Provin proving_key.pub_inputs_offset = trace_data.pub_inputs_offset; } if constexpr (IsUltraPlonkOrHonk) { + +#ifdef TRACY_MEMORY ZoneScopedN("add_memory_records_to_proving_key"); +#endif add_memory_records_to_proving_key(trace_data, builder, proving_key); } if constexpr (IsGoblinFlavor) { + +#ifdef TRACY_MEMORY ZoneScopedN("add_ecc_op_wires_to_proving_key"); +#endif add_ecc_op_wires_to_proving_key(builder, proving_key); } // Compute the permutation argument polynomials (sigma/id) and add them to proving key { + +#ifdef TRACY_MEMORY ZoneScopedN("compute_permutation_argument_polynomials"); +#endif compute_permutation_argument_polynomials(builder, &proving_key, trace_data.copy_cycles); } } @@ -73,7 +87,10 @@ template typename ExecutionTrace_::TraceData 
ExecutionTrace_::construct_trace_data( Builder& builder, typename Flavor::ProvingKey& proving_key, bool is_structured) { + +#ifdef TRACY_MEMORY ZoneScopedN("construct_trace_data"); +#endif if constexpr (IsPlonkFlavor) { // Complete the public inputs execution trace block from builder.public_inputs @@ -91,7 +108,10 @@ typename ExecutionTrace_::TraceData ExecutionTrace_::construct_t // Update wire polynomials and copy cycles // NB: The order of row/column loops is arbitrary but needs to be row/column to match old copy_cycle code { + +#ifdef TRACY_MEMORY ZoneScopedN("populating wires and copy_cycles"); +#endif for (uint32_t block_row_idx = 0; block_row_idx < block_size; ++block_row_idx) { for (uint32_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) { uint32_t var_idx = block.wires[wire_idx][block_row_idx]; // an index into the variables array diff --git a/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp b/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp index c1206f1039c..045c41b56a7 100644 --- a/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp +++ b/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp @@ -28,7 +28,10 @@ template class ExecutionTrace_ { TraceData(Builder& builder, ProvingKey& proving_key) { + +#ifdef TRACY_MEMORY ZoneScopedN("TraceData constructor"); +#endif if constexpr (IsHonkFlavor) { // Initialize and share the wire and selector polynomials for (auto [wire, other_wire] : zip_view(wires, proving_key.polynomials.get_wires())) { @@ -45,7 +48,10 @@ template class ExecutionTrace_ { proving_key.polynomial_store.put(wire_tag, wires[idx].share()); } { + +#ifdef TRACY_MEMORY ZoneScopedN("selector initialization"); +#endif for (size_t idx = 0; idx < Builder::Arithmetization::NUM_SELECTORS; ++idx) { selectors[idx] = Polynomial(proving_key.circuit_size); std::string selector_tag = builder.selector_names[idx] + "_lagrange"; @@ -54,7 +60,10 @@ template class ExecutionTrace_ { } } { 
+ +#ifdef TRACY_MEMORY ZoneScopedN("copy cycle initialization"); +#endif copy_cycles.resize(builder.variables.size()); } } diff --git a/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp b/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp index fbdc13ddf8b..69433b0c865 100644 --- a/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp +++ b/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp @@ -129,7 +129,10 @@ template class ProvingKey_ { std::shared_ptr commitment_key = nullptr) { if (commitment_key == nullptr) { + +#ifdef TRACY_MEMORY ZoneScopedN("init commitment key"); +#endif this->commitment_key = std::make_shared(circuit_size); } else { // Don't create another commitment key if we already have one diff --git a/barretenberg/cpp/src/barretenberg/goblin/goblin.hpp b/barretenberg/cpp/src/barretenberg/goblin/goblin.hpp index 134db00a929..4ef27dbcd9c 100644 --- a/barretenberg/cpp/src/barretenberg/goblin/goblin.hpp +++ b/barretenberg/cpp/src/barretenberg/goblin/goblin.hpp @@ -171,17 +171,26 @@ class GoblinProver { void prove_eccvm() { { + +#ifdef TRACY_MEMORY ZoneScopedN("Create ECCVMBuilder and ECCVMProver"); +#endif auto eccvm_builder = std::make_unique(op_queue); eccvm_prover = std::make_unique(*eccvm_builder); } { + +#ifdef TRACY_MEMORY ZoneScopedN("Construct ECCVM Proof"); +#endif goblin_proof.eccvm_proof = eccvm_prover->construct_proof(); } { + +#ifdef TRACY_MEMORY ZoneScopedN("Assign Translation Evaluations"); +#endif goblin_proof.translation_evaluations = eccvm_prover->translation_evaluations; } } @@ -198,14 +207,20 @@ class GoblinProver { eccvm_key = eccvm_prover->key; eccvm_prover = nullptr; { + +#ifdef TRACY_MEMORY ZoneScopedN("Create TranslatorBuilder and TranslatorProver"); +#endif auto translator_builder = std::make_unique(translation_batching_challenge_v, evaluation_challenge_x, op_queue); translator_prover = std::make_unique(*translator_builder, transcript); } { + +#ifdef TRACY_MEMORY ZoneScopedN("Construct Translator Proof"); +#endif 
goblin_proof.translator_proof = translator_prover->construct_proof(); } } @@ -219,14 +234,23 @@ class GoblinProver { */ GoblinProof prove(MergeProof merge_proof_in = {}) { + +#ifdef TRACY_MEMORY ZoneScopedN("Goblin::prove"); +#endif goblin_proof.merge_proof = merge_proof_in.empty() ? std::move(merge_proof) : std::move(merge_proof_in); { + +#ifdef TRACY_MEMORY ZoneScopedN("prove_eccvm"); +#endif prove_eccvm(); } { + +#ifdef TRACY_MEMORY ZoneScopedN("prove_translator"); +#endif prove_translator(); } return goblin_proof; diff --git a/barretenberg/cpp/src/barretenberg/plonk_honk_shared/composer/permutation_lib.hpp b/barretenberg/cpp/src/barretenberg/plonk_honk_shared/composer/permutation_lib.hpp index 1590a4d5763..3a87d0efdab 100644 --- a/barretenberg/cpp/src/barretenberg/plonk_honk_shared/composer/permutation_lib.hpp +++ b/barretenberg/cpp/src/barretenberg/plonk_honk_shared/composer/permutation_lib.hpp @@ -62,7 +62,10 @@ template struct PermutationMapping { */ PermutationMapping(size_t circuit_size) { + +#ifdef TRACY_MEMORY ZoneScopedN("PermutationMapping constructor"); +#endif for (uint8_t col_idx = 0; col_idx < NUM_WIRES; ++col_idx) { sigmas[col_idx].reserve(circuit_size); if constexpr (generalized) { @@ -386,12 +389,18 @@ void compute_permutation_argument_polynomials(const typename Flavor::CircuitBuil } else if constexpr (IsUltraFlavor) { // any UltraHonk flavor // Compute Honk-style sigma and ID polynomials from the corresponding mappings { + +#ifdef TRACY_MEMORY ZoneScopedN("compute_honk_style_permutation_lagrange_polynomials_from_mapping"); +#endif compute_honk_style_permutation_lagrange_polynomials_from_mapping( key->polynomials.get_sigmas(), mapping.sigmas, key); } { + +#ifdef TRACY_MEMORY ZoneScopedN("compute_honk_style_permutation_lagrange_polynomials_from_mapping"); +#endif compute_honk_style_permutation_lagrange_polynomials_from_mapping( key->polynomials.get_ids(), mapping.ids, key); } diff --git 
a/barretenberg/cpp/src/barretenberg/polynomials/barycentric.test.cpp b/barretenberg/cpp/src/barretenberg/polynomials/barycentric.test.cpp index a481f9fc64e..2f708175430 100644 --- a/barretenberg/cpp/src/barretenberg/polynomials/barycentric.test.cpp +++ b/barretenberg/cpp/src/barretenberg/polynomials/barycentric.test.cpp @@ -36,6 +36,18 @@ TYPED_TEST(BarycentricDataTests, Extend) EXPECT_EQ(result, expected_result); } +TYPED_TEST(BarycentricDataTests, SelfExtend) +{ + BARYCENTIC_DATA_TESTS_TYPE_ALIASES + static constexpr size_t initial_size(2); + static constexpr size_t domain_size(10); + static constexpr size_t skip_count(0); + auto f = Univariate({ 1, 2, 0, 0, 0, 0, 0, 0, 0, 0 }); + auto expected_result = Univariate({ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }); + f.template self_extend_from(); + EXPECT_EQ(f, expected_result); +} + TYPED_TEST(BarycentricDataTests, Evaluate) { BARYCENTIC_DATA_TESTS_TYPE_ALIASES diff --git a/barretenberg/cpp/src/barretenberg/polynomials/univariate.hpp b/barretenberg/cpp/src/barretenberg/polynomials/univariate.hpp index 01fe32d72c6..f86fc99ae2e 100644 --- a/barretenberg/cpp/src/barretenberg/polynomials/univariate.hpp +++ b/barretenberg/cpp/src/barretenberg/polynomials/univariate.hpp @@ -83,8 +83,22 @@ template Univariate extend_to() const { - const size_t EXTENDED_LENGTH = EXTENDED_DOMAIN_END - domain_start; + static constexpr size_t EXTENDED_LENGTH = EXTENDED_DOMAIN_END - domain_start; using Data = BarycentricData; static_assert(EXTENDED_LENGTH >= LENGTH); @@ -467,6 +481,18 @@ template void self_extend_from() + { + if constexpr (INITIAL_LENGTH == 2) { + const Fr delta = value_at(1) - value_at(0); + Fr next = value_at(1); + for (size_t idx = 2; idx < LENGTH; idx++) { + next += delta; + value_at(idx) = next; + } + } + } + /** * @brief Evaluate a univariate at a point u not known at compile time * and assumed not to be in the domain (else we divide by zero). 
diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp index 0a376672a86..85266185d91 100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp @@ -11,7 +11,10 @@ template void ProtogalaxyProver_::run_oink_prover_on_one_incomplete_key(std::shared_ptr keys, const std::string& domain_separator) { + +#ifdef TRACY_MEMORY ZoneScopedN("ProtogalaxyProver::run_oink_prover_on_one_incomplete_key"); +#endif OinkProver oink_prover(keys, transcript, domain_separator + '_'); oink_prover.prove(); } @@ -159,7 +162,10 @@ FoldingResult ProtogalaxyProver_ FoldingResult ProtogalaxyProver_::prove() { + +#ifdef TRACY_MEMORY ZoneScopedN("ProtogalaxyProver::prove"); +#endif BB_OP_COUNT_TIME_NAME("ProtogalaxyProver::prove"); // Ensure keys are all of the same size for (size_t idx = 0; idx < DeciderProvingKeys::NUM - 1; ++idx) { diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp index 4fd90abed34..aec2aeaf2c3 100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp @@ -200,9 +200,11 @@ template class ProtogalaxyProverInternal { const DeciderPKs& keys, const size_t row_idx) { - const auto base_univariates = keys.template row_to_univariates(row_idx); - for (auto [extended_univariate, base_univariate] : zip_view(extended_univariates.get_all(), base_univariates)) { - extended_univariate = base_univariate.template extend_to(); + auto incoming_univariates = keys.template row_to_univariates(row_idx); + for (auto [extended_univariate, incoming_univariate] : + zip_view(extended_univariates.get_all(), incoming_univariates)) { + 
incoming_univariate.template self_extend_from(); + extended_univariate = std::move(incoming_univariate); } } diff --git a/barretenberg/cpp/src/barretenberg/srs/factories/file_crs_factory.hpp b/barretenberg/cpp/src/barretenberg/srs/factories/file_crs_factory.hpp index f3eca37a48f..3da7f409894 100644 --- a/barretenberg/cpp/src/barretenberg/srs/factories/file_crs_factory.hpp +++ b/barretenberg/cpp/src/barretenberg/srs/factories/file_crs_factory.hpp @@ -44,7 +44,10 @@ template class FileProverCrs : public ProverCrs { FileProverCrs(const size_t num_points, std::string const& path) : num_points(num_points) { + +#ifdef TRACY_MEMORY ZoneScopedN("FileProverCrs constructor"); +#endif monomials_ = scalar_multiplication::point_table_alloc(num_points); srs::IO::read_transcript_g1(monomials_.get(), num_points, path); diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_flavor.hpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_flavor.hpp index b11be2b68f3..427e2a2d9f2 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_flavor.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/ultra_flavor.hpp @@ -293,7 +293,10 @@ class UltraFlavor { ProverPolynomials() = default; ProverPolynomials(size_t circuit_size) { + +#ifdef TRACY_MEMORY ZoneScopedN("creating empty prover polys"); +#endif for (auto& poly : get_to_be_shifted()) { poly = Polynomial{ /*memory size*/ circuit_size - 1, /*largest possible index*/ circuit_size, @@ -558,7 +561,10 @@ class UltraFlavor { PartiallyEvaluatedMultivariates() = default; PartiallyEvaluatedMultivariates(const size_t circuit_size) { + +#ifdef TRACY_MEMORY ZoneScopedN("PartiallyEvaluatedMultivariates constructor"); +#endif // Storage is only needed after the first partial evaluation, hence polynomials of // size (n / 2) for (auto& poly : this->get_all()) { diff --git a/barretenberg/cpp/src/barretenberg/sumcheck/sumcheck.hpp 
b/barretenberg/cpp/src/barretenberg/sumcheck/sumcheck.hpp index ef46da4014c..5b7e8df06b2 100644 --- a/barretenberg/cpp/src/barretenberg/sumcheck/sumcheck.hpp +++ b/barretenberg/cpp/src/barretenberg/sumcheck/sumcheck.hpp @@ -209,7 +209,10 @@ template class SumcheckProver { auto round_univariate = round.compute_univariate( round_idx, full_polynomials, relation_parameters, gate_separators, alpha, zk_sumcheck_data); { + +#ifdef TRACY_MEMORY ZoneScopedN("rest of sumcheck round 1"); +#endif // Place the evaluations of the round univariate into transcript. transcript->send_to_verifier("Sumcheck:univariate_0", round_univariate); @@ -227,7 +230,10 @@ template class SumcheckProver { // We operate on partially_evaluated_polynomials in place. } for (size_t round_idx = 1; round_idx < multivariate_d; round_idx++) { + +#ifdef TRACY_MEMORY ZoneScopedN("sumcheck loop"); +#endif // Write the round univariate to the transcript round_univariate = round.compute_univariate(round_idx, partially_evaluated_polynomials, diff --git a/barretenberg/cpp/src/barretenberg/sumcheck/sumcheck_round.hpp b/barretenberg/cpp/src/barretenberg/sumcheck/sumcheck_round.hpp index 4e9d11108af..30ca1b0d536 100644 --- a/barretenberg/cpp/src/barretenberg/sumcheck/sumcheck_round.hpp +++ b/barretenberg/cpp/src/barretenberg/sumcheck/sumcheck_round.hpp @@ -66,7 +66,10 @@ template class SumcheckProverRound { SumcheckProverRound(size_t initial_round_size) : round_size(initial_round_size) { + +#ifdef TRACY_MEMORY ZoneScopedN("SumcheckProverRound constructor"); +#endif // Initialize univariate accumulators to 0 Utils::zero_univariates(univariate_accumulators); } @@ -161,7 +164,10 @@ template class SumcheckProverRound { const RelationSeparator alpha, std::optional> zk_sumcheck_data = std::nullopt) // only submitted when Flavor HasZK { + +#ifdef TRACY_MEMORY ZoneScopedN("compute_univariate"); +#endif BB_OP_COUNT_TIME(); // Determine number of threads for multithreading. 
diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/decider_keys.hpp b/barretenberg/cpp/src/barretenberg/ultra_honk/decider_keys.hpp index e6ef907c622..49b72bbd0e2 100644 --- a/barretenberg/cpp/src/barretenberg/ultra_honk/decider_keys.hpp +++ b/barretenberg/cpp/src/barretenberg/ultra_honk/decider_keys.hpp @@ -53,19 +53,18 @@ template struct DeciderProvingKeys_ { * @tparam skip_count Construct univariates that skip some of the indices when computing results * @return The univariates whose extensions will be used to construct the combiner. */ - template auto row_to_univariates(size_t row_idx) const + template auto row_to_univariates(size_t row_idx) const { auto prover_polynomials_views = get_polynomials_views(); - std::array, prover_polynomials_views[0].size()> results; + std::array, prover_polynomials_views[0].size()> results; // Set the size corresponding to the number of rows in the execution trace - size_t pk_idx = 0; // Iterate over the prover polynomials' views corresponding to each proving key - for (auto& get_all : prover_polynomials_views) { + for (size_t dpk_idx = 0; auto& get_all : prover_polynomials_views) { // Iterate over all columns in the trace execution of an proving key and extract their value at row_idx. 
for (auto [result, poly_ptr] : zip_view(results, get_all)) { - result.evaluations[pk_idx] = poly_ptr[row_idx]; + result.evaluations[dpk_idx] = poly_ptr[row_idx]; } - pk_idx++; + dpk_idx++; } return results; } diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/decider_prover.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/decider_prover.cpp index e8063431f7d..585e4ab60c5 100644 --- a/barretenberg/cpp/src/barretenberg/ultra_honk/decider_prover.cpp +++ b/barretenberg/cpp/src/barretenberg/ultra_honk/decider_prover.cpp @@ -31,7 +31,10 @@ template void DeciderProver_::execute_relation_ch size_t polynomial_size = proving_key->proving_key.circuit_size; auto sumcheck = Sumcheck(polynomial_size, transcript); { + +#ifdef TRACY_MEMORY ZoneScopedN("sumcheck.prove"); +#endif sumcheck_output = sumcheck.prove(proving_key->proving_key.polynomials, proving_key->relation_parameters, proving_key->alphas, diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/decider_proving_key.hpp b/barretenberg/cpp/src/barretenberg/ultra_honk/decider_proving_key.hpp index 8a71f0ef1e7..2bfd069a5a4 100644 --- a/barretenberg/cpp/src/barretenberg/ultra_honk/decider_proving_key.hpp +++ b/barretenberg/cpp/src/barretenberg/ultra_honk/decider_proving_key.hpp @@ -72,7 +72,10 @@ template class DeciderProvingKey_ { circuit.op_queue->append_nonzero_ops(); } { + +#ifdef TRACY_MEMORY ZoneScopedN("constructing proving key"); +#endif proving_key = ProvingKey(dyadic_circuit_size, circuit.public_inputs.size(), commitment_key); if (IsGoblinFlavor && !is_structured) { @@ -81,13 +84,17 @@ template class DeciderProvingKey_ { } else { // Allocate only a correct amount of memory for each polynomial // Allocate the wires and selectors polynomials { +#ifdef TRACY_MEMORY ZoneScopedN("allocating wires"); +#endif for (auto& wire : proving_key.polynomials.get_wires()) { wire = Polynomial::shiftable(proving_key.circuit_size); } } { +#ifdef TRACY_MEMORY ZoneScopedN("allocating gate selectors"); +#endif // Define gate 
selectors over the block they are isolated to for (auto [selector, block] : zip_view(proving_key.polynomials.get_gate_selectors(), circuit.blocks.get_gate_blocks())) { @@ -107,14 +114,18 @@ template class DeciderProvingKey_ { } } { +#ifdef TRACY_MEMORY ZoneScopedN("allocating non-gate selectors"); +#endif // Set the other non-gate selector polynomials to full size for (auto& selector : proving_key.polynomials.get_non_gate_selectors()) { selector = Polynomial(proving_key.circuit_size); } } if constexpr (IsGoblinFlavor) { +#ifdef TRACY_MEMORY ZoneScopedN("allocating ecc op wires and selector"); +#endif // Allocate the ecc op wires and selector const size_t ecc_op_block_size = circuit.blocks.ecc_op.get_fixed_size(is_structured); const size_t op_wire_offset = Flavor::has_zero_row ? 1 : 0; @@ -151,7 +162,9 @@ template class DeciderProvingKey_ { std::min(static_cast(MAX_LOOKUP_TABLES_SIZE), dyadic_circuit_size - 1); size_t table_offset = dyadic_circuit_size - max_tables_size; { +#ifdef TRACY_MEMORY ZoneScopedN("allocating table polynomials"); +#endif ASSERT(dyadic_circuit_size > max_tables_size); // Allocate the table polynomials @@ -162,7 +175,9 @@ template class DeciderProvingKey_ { } } { +#ifdef TRACY_MEMORY ZoneScopedN("allocating sigmas and ids"); +#endif for (auto& sigma : proving_key.polynomials.get_sigmas()) { sigma = typename Flavor::Polynomial(proving_key.circuit_size); } @@ -207,13 +222,17 @@ template class DeciderProvingKey_ { } } { +#ifdef TRACY_MEMORY ZoneScopedN("constructing z_perm"); +#endif // Allocate the z_perm polynomial proving_key.polynomials.z_perm = Polynomial::shiftable(proving_key.circuit_size); } { +#ifdef TRACY_MEMORY ZoneScopedN("allocating lagrange polynomials"); +#endif // First and last lagrange polynomials (in the full circuit size) proving_key.polynomials.lagrange_first = Polynomial(1, dyadic_circuit_size, 0); proving_key.polynomials.lagrange_last = Polynomial(1, dyadic_circuit_size, dyadic_circuit_size - 1); @@ -226,11 +245,16 @@ 
template class DeciderProvingKey_ { // Construct and add to proving key the wire, selector and copy constraint polynomials Trace::populate(circuit, proving_key, is_structured); + +#ifdef TRACY_MEMORY ZoneScopedN("constructing prover instance after trace populate"); +#endif // If Goblin, construct the databus polynomials if constexpr (IsGoblinFlavor) { +#ifdef TRACY_MEMORY ZoneScopedN("constructing databus polynomials"); +#endif construct_databus_polynomials(circuit); } @@ -239,13 +263,17 @@ template class DeciderProvingKey_ { proving_key.polynomials.lagrange_last.at(dyadic_circuit_size - 1) = 1; { +#ifdef TRACY_MEMORY ZoneScopedN("constructing lookup table polynomials"); +#endif construct_lookup_table_polynomials( proving_key.polynomials.get_tables(), circuit, dyadic_circuit_size); } { +#ifdef TRACY_MEMORY ZoneScopedN("constructing lookup read counts"); +#endif construct_lookup_read_counts(proving_key.polynomials.lookup_read_counts, proving_key.polynomials.lookup_read_tags, circuit, diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp index bd16fe7a243..8498c12a2f3 100644 --- a/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp +++ b/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp @@ -14,29 +14,44 @@ namespace bb { template void OinkProver::prove() { { + +#ifdef TRACY_MEMORY ZoneScopedN("execute_preamble_round"); +#endif // Add circuit size public input size and public inputs to transcript-> execute_preamble_round(); } { + +#ifdef TRACY_MEMORY ZoneScopedN("execute_wire_commitments_round"); +#endif // Compute first three wire commitments execute_wire_commitments_round(); } { + +#ifdef TRACY_MEMORY ZoneScopedN("execute_sorted_list_accumulator_round"); +#endif // Compute sorted list accumulator and commitment execute_sorted_list_accumulator_round(); } { + +#ifdef TRACY_MEMORY ZoneScopedN("execute_log_derivative_inverse_round"); +#endif // Fiat-Shamir: beta & 
gamma execute_log_derivative_inverse_round(); } { + +#ifdef TRACY_MEMORY ZoneScopedN("execute_grand_product_computation_round"); +#endif // Compute grand product(s) and commitments. execute_grand_product_computation_round(); }