Skip to content

Commit

Permalink
feat: new Tracy Time preset and more efficient univariate extension (#…
Browse files Browse the repository at this point in the history
…8789)

We add a new script `benchmark_tracy_build_mainframe_run_local.sh` that
will copy the executable to the local machine and run it there instead.

We also add a new tracy-time preset that reuses the BB_OP_COUNT_TIME
statements around the codebase to mark zones.

We add a new extend_from function that is more efficient than the
existing extend_to function. Using the new extend_from function, we
see improvements to the combiner round.

```
--------------------------------------------------------------------------------                                                                                                                                                                                         
Benchmark                      Time             CPU   Iterations UserCounters...                                                                                                                                                                                         
--------------------------------------------------------------------------------                                                                                                                                                                                         
ClientIVCBench/Full/6      32719 ms        30147 ms            1 Arithmetic::accumulate=3.89126M Arithmetic::accumulate(t)=7.36388G Auxiliary::accumulate=1.98134M Auxiliary::accumulate(t)=13.3567G COMMIT::databus=108 COMMIT::databus(t)=8.37903M COMMIT::databus_inve
rses=36 COMMIT::databus_inverses(t)=11.6069M COMMIT::ecc_op_wires=48 COMMIT::ecc_op_wires(t)=38.5805M COMMIT::lookup_counts_tags=12 COMMIT::lookup_counts_tags(t)=106.101M COMMIT::lookup_inverses=12 COMMIT::lookup_inverses(t)=257.04M COMMIT::wires=24 COMMIT::wires(t
)=2.21684G COMMIT::z_perm=12 COMMIT::z_perm(t)=2.31257G DatabusRead::accumulate=447 DatabusRead::accumulate(t)=1.64643M Decider::construct_proof=1 Decider::construct_proof(t)=1.585G DeciderProvingKey(Circuit&)=12 DeciderProvingKey(Circuit&)(t)=2.60037G DeltaRange::
accumulate=1.87876M DeltaRange::accumulate(t)=4.23593G ECCVMProver(CircuitBuilder&)=1 ECCVMProver(CircuitBuilder&)(t)=229.56M ECCVMProver::construct_proof=1 ECCVMProver::construct_proof(t)=2.59633G Elliptic::accumulate=183.692k Elliptic::accumulate(t)=464.215M Gobl
in::merge=23 Goblin::merge(t)=116.512M Lookup::accumulate=1.66363M Lookup::accumulate(t)=3.69157G MegaFlavor::get_row=6.18564M MegaFlavor::get_row(t)=4.21736G OinkProver::execute_grand_product_computation_round=12 OinkProver::execute_grand_product_computation_round
(t)=3.58134G OinkProver::execute_log_derivative_inverse_round=12 OinkProver::execute_log_derivative_inverse_round(t)=2.47144G OinkProver::execute_preamble_round=12 OinkProver::execute_preamble_round(t)=151.77k OinkProver::execute_sorted_list_accumulator_round=12 Oi
nkProver::execute_sorted_list_accumulator_round(t)=681.46M OinkProver::execute_wire_commitments_round=12 OinkProver::execute_wire_commitments_round(t)=1.69618G OinkProver::generate_alphas_round=12 OinkProver::generate_alphas_round(t)=3.44188M Permutation::accumulat
e=10.6427M Permutation::accumulate(t)=40.6403G PoseidonExt::accumulate=30.452k PoseidonExt::accumulate(t)=77.2342M PoseidonInt::accumulate=210.454k PoseidonInt::accumulate(t)=374.93M ProtogalaxyProver::prove=11 ProtogalaxyProver::prove(t)=19.5955G ProtogalaxyProver
_::combiner_quotient_round=11 ProtogalaxyProver_::combiner_quotient_round(t)=8.44199G ProtogalaxyProver_::compute_row_evaluations=11 ProtogalaxyProver_::compute_row_evaluations(t)=1.97625G ProtogalaxyProver_::perturbator_round=11 ProtogalaxyProver_::perturbator_rou
nd(t)=2.87543G ProtogalaxyProver_::run_oink_prover_on_each_incomplete_key=11 ProtogalaxyProver_::run_oink_prover_on_each_incomplete_key(t)=7.60574G ProtogalaxyProver_::update_target_sum_and_fold=11 ProtogalaxyProver_::update_target_sum_and_fold(t)=672.3M Translator
CircuitBuilder::constructor=1 TranslatorCircuitBuilder::constructor(t)=33.0787M TranslatorProver=1 TranslatorProver(t)=46.9695M TranslatorProver::construct_proof=1 TranslatorProver::construct_proof(t)=838.946M batch_mul_with_endomorphism=16 batch_mul_with_endomorph
ism(t)=407.175M commit=543 commit(t)=6.55046G commit_sparse=36 commit_sparse(t)=11.5922M compute_combiner=11 compute_combiner(t)=8.04092G compute_perturbator=11 compute_perturbator(t)=2.87517G compute_univariate=51 compute_univariate(t)=2.18992G construct_circuits=
12 construct_circuits(t)=4.20212G pippenger=215 pippenger(t)=101.133M pippenger_unsafe_optimized_for_non_dyadic_polys=543 pippenger_unsafe_optimized_for_non_dyadic_polys(t)=6.54615G                                                                                    
Benchmarking lock deleted.                                                                                                                                                                                                                                               
client_ivc_bench.json                                                                                                                                                                                                                 100% 6912   245.8KB/s   00:00      
function                                  ms     % sum                                                                                                                                                                                                                   
construct_circuits(t)                   4202    13.23%                                                                                                                                                                                                                   
DeciderProvingKey(Circuit&)(t)          2600     8.19%                                                                                                                                                                                                                   
ProtogalaxyProver::prove(t)            19596    61.69%                                                                                                                                                                                                                   
Decider::construct_proof(t)             1585     4.99%                                                                                                                                                                                                                   
ECCVMProver(CircuitBuilder&)(t)          230     0.72%                                                                                                                                                                                                                   
ECCVMProver::construct_proof(t)         2596     8.17%                                                                                                                                                                                                                   
TranslatorProver::construct_proof(t)     839     2.64%                                                                                                                                                                                                                   
Goblin::merge(t)                         117     0.37%                                                                                                                                                                                                                   
                                                                                                                                                                                                                                                                         
Total time accounted for: 31764ms/32719ms = 97.08%                                                                                                                                                                                                                       
                                                                                                                                                                                                                                                                         
Major contributors:                                                                                                                                                                                                                                                      
function                                  ms    % sum                                                                                                                                                                                                                    
commit(t)                               6550   20.62%                                                                                                                                                                                                                    
compute_combiner(t)                     8041   25.31%                                                                                                                                                                                                                    
compute_perturbator(t)                  2875    9.05%                                                                                                                                                                                                                    
compute_univariate(t)                   2190    6.89%                                                                                                                                                                                                                    
                                                                                                                                                                                                                                                                         
Breakdown of ProtogalaxyProver::prove:                                                                                                                                                                                                                                   
ProtogalaxyProver_::run_oink_prover_on_each_incomplete_key(t)    7606    38.81%                                                                                                                                                                                          
ProtogalaxyProver_::perturbator_round(t)                         2875    14.67%                                                                                                                                                                                          
ProtogalaxyProver_::combiner_quotient_round(t)                   8442    43.08%                                                                                                                                                                                          
ProtogalaxyProver_::update_target_sum_and_fold(t)                 672     3.43%                                                                                                                                                                                          
                                                                                                                                                                                                                                                                         
Relation contributions (times to be interpreted relatively):                                                                                                                                                                                                             
Total time accounted for (ms):    70206                                                                                                                                                                                                                                  
operation                       ms     % sum                                                                                                                                                                                                                             
Arithmetic::accumulate(t)     7364    10.49%                                                                                                                                                                                                                             
Permutation::accumulate(t)   40640    57.89%                                                                                                                                                                                                                             
Lookup::accumulate(t)         3692     5.26%                                                                                                                                                                                                                             
DeltaRange::accumulate(t)     4236     6.03%                                                                                                                                                                                                                             
Elliptic::accumulate(t)        464     0.66%                                                                                                                                                                                                                             
Auxiliary::accumulate(t)     13357    19.02%                                                                                                                                                                                                                             
EccOp::accumulate(t)             0     0.00%                                                                                                                                                                                                                             
DatabusRead::accumulate(t)       2     0.00%                                                                                                                                                                                                                             
PoseidonExt::accumulate(t)      77     0.11%                                                                                                                                                                                                                             
PoseidonInt::accumulate(t)     375     0.53%                                                                                                                                                                                                                             

Commitment contributions:
Total time accounted for (ms):     4951
operation                          ms     % sum
COMMIT::wires(t)                 2217    44.77%
COMMIT::z_perm(t)                2313    46.71%
COMMIT::databus(t)                  8     0.17%
COMMIT::ecc_op_wires(t)            39     0.78%
COMMIT::lookup_inverses(t)        257     5.19%
COMMIT::databus_inverses(t)        12     0.23%
```

---------

Co-authored-by: lucasxia01 <lucasxia01@gmail.com>
  • Loading branch information
codygunton and lucasxia01 authored Sep 30, 2024
1 parent b70a728 commit ead4649
Show file tree
Hide file tree
Showing 26 changed files with 328 additions and 24 deletions.
9 changes: 9 additions & 0 deletions barretenberg/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ else()
SET(TRACY_LIBS)
endif()

# Memory-profiling build: define TRACY_MEMORY so that the ZoneScopedN markers guarded by
# `#ifdef TRACY_MEMORY` in the sources are compiled in.
if(TRACY_PROFILE_MEMORY)
add_compile_options(-DTRACY_MEMORY)
endif()

# Time-profiling build: define TRACY_TIME so that (when BB_USE_OP_COUNT is not defined) the
# BB_OP_COUNT_TIME / BB_OP_COUNT_TIME_NAME macros in common/op_count.hpp expand to Tracy zones.
if(TRACY_PROFILE_TIME)
add_compile_options(-DTRACY_TIME)
endif()


if(ENABLE_ASAN)
add_compile_options(-fsanitize=address)
add_link_options(-fsanitize=address)
Expand Down
33 changes: 28 additions & 5 deletions barretenberg/cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,31 @@
}
},
{
"name": "tracy",
"name": "tracy-memory",
"displayName": "Release build with tracy, optimized for memory tracking",
"description": "Release build with tracy, optimized for memory tracking",
"inherits": "clang16",
"binaryDir": "build-tracy",
"binaryDir": "build-tracy-memory",
"cacheVariables": {
"ENABLE_TRACY": "ON"
"ENABLE_TRACY": "ON",
"TRACY_PROFILE_MEMORY": "ON"
}
},
{
"name": "tracy-time",
"displayName": "Build for tracy time profiling",
"description": "Build for tracy time profiling",
"binaryDir": "build-tracy-time",
"inherits": "clang16",
"environment": {
"CMAKE_BUILD_TYPE": "RelWithDebInfo",
"CFLAGS": "-g -fno-omit-frame-pointer",
"CXXFLAGS": "-g -fno-omit-frame-pointer",
"LDFLAGS": "-g -fno-omit-frame-pointer -rdynamic"
},
"cacheVariables": {
"ENABLE_TRACY": "ON",
"TRACY_PROFILE_TIME": "ON"
}
},
{
Expand Down Expand Up @@ -472,9 +490,14 @@
"configurePreset": "clang16-dbg"
},
{
"name": "tracy",
"name": "tracy-memory",
"inherits": "default",
"configurePreset": "tracy-memory"
},
{
"name": "tracy-time",
"inherits": "default",
"configurePreset": "tracy"
"configurePreset": "tracy-time"
},
{
"name": "clang16-pic",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@

# NOTE: intended to be run from one's external computer, connecting to Aztec mainframe
# IF ON YOUR LOCAL COMPUTER USE NORMAL INTERACTIVE TRACY WORKFLOW
# The benchmark is built on the remote box, the executable is copied to this machine and run here
# under a headless tracy capture, and then the resulting trace is opened in the tracy profiler.
# This is thus only really useful internally at Aztec, sorry external folks. It can be easily tweaked
# however for any SSH setup, especially an ubuntu one.
# on local machine run:
# export USER=...
# export PRESET=...tracy-time for timing zones, tracy-memory for memory or tracy-gates for circuit gates...
# ssh $USER-box "cat ~/aztec-packages/barretenberg/cpp/scripts/benchmark_tracy.sh" | bash /dev/stdin $USER
# NOTE(review): the line above still names benchmark_tracy.sh; confirm the script name to use for this file.
#
# Positional arguments (all optional):
#   $1  user name, used to derive the box name ($USER-box) and the remote checkout path (default: $USER)
#   $2  benchmark target to build and run (default: protogalaxy_bench)
#   $3  command line to run inside the local build directory (default: ./bin/$BENCHMARK --benchmark_filter=fold_k/17)
set -eux
USER=${1:-$USER}
BOX=$USER-box
BENCHMARK=${2:-protogalaxy_bench}
COMMAND=${3:-./bin/$BENCHMARK --benchmark_filter=fold_k/17}

# Can also set PRESET=tracy-gates env variable
PRESET=${PRESET:-tracy-time}

# Pinned tracy revision, shared by the profiler and capture builds below.
TRACY_COMMIT=075395620a504c0cdcaf9bab3d196db16a043de7 # release 0.11.0
# Local directory that receives the remote executable and in which COMMAND is run.
LOCAL_BUILD_DIR="$HOME/aztec-packages/barretenberg/cpp/build-$PRESET"

wait # TODO(AD) hack - not sure why needed
[ -d ~/tracy ] || git clone https://github.com/wolfpld/tracy ~/tracy
cd ~/tracy
git checkout "$TRACY_COMMIT"
cmake -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release
cmake --build profiler/build --parallel
cd -

# Build on the remote box. Run synchronously so that a failed build aborts this script via `set -e`
# (a backgrounded ssh followed by a bare `wait` would discard its exit status).
ssh "$BOX" "
set -eux ;
cd ~/aztec-packages/barretenberg/cpp/ ;
cmake --preset $PRESET && cmake --build --preset $PRESET --target $BENCHMARK ;
"

# Copy the executable to the same directory COMMAND is later run from.
mkdir -p "$LOCAL_BUILD_DIR/bin"
scp "$BOX:/mnt/user-data/$USER/aztec-packages/barretenberg/cpp/build-$PRESET/bin/$BENCHMARK" "$LOCAL_BUILD_DIR/bin/."

# Build the headless capture tool (the repository was already cloned and pinned above).
cd ~/tracy/capture
mkdir -p build && cd build && cmake .. && make -j

# Start the capture in the background, give it a moment to start listening, then run the benchmark.
./tracy-capture -a 127.0.0.1 -f -o "../trace-$BENCHMARK" &
CAPTURE_PID=$!
sleep 0.1
cd "$LOCAL_BUILD_DIR"
# COMMAND is intentionally unquoted: it holds the executable plus its arguments and must be word-split.
$COMMAND

# Let the capture process finish writing the trace before opening it.
wait "$CAPTURE_PID"

~/tracy/profiler/build/tracy-profiler ~/tracy/capture/"trace-$BENCHMARK"
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ BENCHMARK_DEFINE_F(ClientIVCBench, Full)(benchmark::State& state)
}
}

#define ARGS Arg(ClientIVCBench::NUM_ITERATIONS_MEDIUM_COMPLEXITY)
#define ARGS Arg(ClientIVCBench::NUM_ITERATIONS_MEDIUM_COMPLEXITY)->Arg(2)

BENCHMARK_REGISTER_F(ClientIVCBench, Full)->Unit(benchmark::kMillisecond)->ARGS;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ void fold_k(State& state) noexcept
}
}

BENCHMARK(vector_of_evaluations)->DenseRange(15, 21)->Unit(kMillisecond);
BENCHMARK(vector_of_evaluations)->DenseRange(15, 21)->Unit(kMillisecond)->Iterations(1);
BENCHMARK(compute_row_evaluations)->DenseRange(15, 21)->Unit(kMillisecond);
// We stick to just k=1 for compile-time reasons.
BENCHMARK(fold_k)->/* vary the circuit size */ DenseRange(14, 20)->Unit(kMillisecond);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,50 @@ using bb::Univariate;

namespace bb::benchmark {

void extend_2_to_6(State& state) noexcept
void extend_2_to_11(State& state) noexcept
{
auto univariate = Univariate<FF, 2>::get_random();
for (auto _ : state) {
DoNotOptimize(univariate.extend_to<6>());
DoNotOptimize(univariate.extend_to<11>());
}
}
BENCHMARK(extend_2_to_6);

// Hand-rolled baseline for extending 2 evaluations to 11 on a plain std::array.
// Author's measurement: 93.9 goes down to 62.7 (the original note wrote "s"; presumably ns per iteration).
// Theoretical min: 1 sub, 9 additions at about 3.8ns each, 38ns
void fake_extend_2_to_11(State& state) noexcept
{
    std::array<FF, 11> univariate;
    std::generate(univariate.begin(), univariate.end(), [&]() { return FF::random_element(); });

    // Treats arr[0] and arr[1] as the two known evaluations and fills arr[2..10] by repeatedly adding
    // their difference: 1 subtraction and 9 additions in total, so that every entry (including the
    // last one) continues the arithmetic progression.
    const auto extend_to_11 = [](auto& arr) {
        FF tmp = arr[1];
        const FF delta = tmp - arr[0];
        for (size_t idx = 2; idx < 11; idx++) {
            arr[idx] = (tmp += delta); // fused ~> 62.9ns; non-fused ~>69.5ns (author's measurements)
        }
        return arr;
    };

    for (auto _ : state) {
        DoNotOptimize(extend_to_11(univariate));
    }
}

// Benchmarks Univariate<FF, 11>::self_extend_from<2>() on a randomly initialised univariate.
// Author's measurement: 93.9 goes down to 62.7 (the original note wrote "s"; presumably ns per iteration).
// Theoretical min: 1 sub, 9 additions at about 3.8ns each, 38ns
void self_extend_2_to_11(State& state) noexcept
{
    auto univariate = Univariate<FF, 11>::get_random();

    for (auto _ : state) {
        univariate.self_extend_from<2>();
        // Nothing else reads `univariate`, so mark it as observed; otherwise the optimizer is free to
        // discard the work being measured.
        DoNotOptimize(univariate);
    }
}

BENCHMARK(extend_2_to_11);
BENCHMARK(fake_extend_2_to_11);
BENCHMARK(self_extend_2_to_11);

} // namespace bb::benchmark

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ Prover get_prover(void (*test_circuit_function)(typename Prover::Flavor::Circuit
Composer composer;
return composer.create_prover(builder);
} else {
#ifdef TRACY_MEMORY
ZoneScopedN("creating prover");
#endif
return Prover(builder);
}
};
Expand Down
12 changes: 10 additions & 2 deletions barretenberg/cpp/src/barretenberg/common/op_count.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#pragma once

#include <memory>
#include <tracy/Tracy.hpp>
#ifndef BB_USE_OP_COUNT
// require a semicolon to appease formatters
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
Expand All @@ -11,12 +12,19 @@
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_CYCLES_NAME(name) (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME_NAME(name) (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_CYCLES() (void)0
#ifndef TRACY_TIME
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME_NAME(name) (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME() (void)0
#else
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME_NAME(name) ZoneScopedN(name)
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME() BB_OP_COUNT_TIME_NAME(__func__)
#endif
#else
/**
* Provides an abstraction that counts operations based on function names.
* For efficiency, we spread out counts across threads.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ namespace bb {

template <class Flavor> void ExecutionTrace_<Flavor>::populate_public_inputs_block(Builder& builder)
{
#ifdef TRACY_MEMORY
ZoneScopedN("populate_public_inputs_block");
#endif
// Update the public inputs block
for (const auto& idx : builder.public_inputs) {
for (size_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) {
Expand All @@ -27,7 +29,10 @@ template <class Flavor> void ExecutionTrace_<Flavor>::populate_public_inputs_blo
template <class Flavor>
void ExecutionTrace_<Flavor>::populate(Builder& builder, typename Flavor::ProvingKey& proving_key, bool is_structured)
{

#ifdef TRACY_MEMORY
ZoneScopedN("trace populate");
#endif
// Share wire polynomials, selector polynomials between proving key and builder and copy cycles from raw circuit
// data
auto trace_data = construct_trace_data(builder, proving_key, is_structured);
Expand All @@ -36,18 +41,27 @@ void ExecutionTrace_<Flavor>::populate(Builder& builder, typename Flavor::Provin
proving_key.pub_inputs_offset = trace_data.pub_inputs_offset;
}
if constexpr (IsUltraPlonkOrHonk<Flavor>) {

#ifdef TRACY_MEMORY
ZoneScopedN("add_memory_records_to_proving_key");
#endif
add_memory_records_to_proving_key(trace_data, builder, proving_key);
}

if constexpr (IsGoblinFlavor<Flavor>) {

#ifdef TRACY_MEMORY
ZoneScopedN("add_ecc_op_wires_to_proving_key");
#endif
add_ecc_op_wires_to_proving_key(builder, proving_key);
}

// Compute the permutation argument polynomials (sigma/id) and add them to proving key
{

#ifdef TRACY_MEMORY
ZoneScopedN("compute_permutation_argument_polynomials");
#endif
compute_permutation_argument_polynomials<Flavor>(builder, &proving_key, trace_data.copy_cycles);
}
}
Expand All @@ -73,7 +87,10 @@ template <class Flavor>
typename ExecutionTrace_<Flavor>::TraceData ExecutionTrace_<Flavor>::construct_trace_data(
Builder& builder, typename Flavor::ProvingKey& proving_key, bool is_structured)
{

#ifdef TRACY_MEMORY
ZoneScopedN("construct_trace_data");
#endif

if constexpr (IsPlonkFlavor<Flavor>) {
// Complete the public inputs execution trace block from builder.public_inputs
Expand All @@ -91,7 +108,10 @@ typename ExecutionTrace_<Flavor>::TraceData ExecutionTrace_<Flavor>::construct_t
// Update wire polynomials and copy cycles
// NB: The order of row/column loops is arbitrary but needs to be row/column to match old copy_cycle code
{

#ifdef TRACY_MEMORY
ZoneScopedN("populating wires and copy_cycles");
#endif
for (uint32_t block_row_idx = 0; block_row_idx < block_size; ++block_row_idx) {
for (uint32_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) {
uint32_t var_idx = block.wires[wire_idx][block_row_idx]; // an index into the variables array
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ template <class Flavor> class ExecutionTrace_ {

TraceData(Builder& builder, ProvingKey& proving_key)
{

#ifdef TRACY_MEMORY
ZoneScopedN("TraceData constructor");
#endif
if constexpr (IsHonkFlavor<Flavor>) {
// Initialize and share the wire and selector polynomials
for (auto [wire, other_wire] : zip_view(wires, proving_key.polynomials.get_wires())) {
Expand All @@ -45,7 +48,10 @@ template <class Flavor> class ExecutionTrace_ {
proving_key.polynomial_store.put(wire_tag, wires[idx].share());
}
{

#ifdef TRACY_MEMORY
ZoneScopedN("selector initialization");
#endif
for (size_t idx = 0; idx < Builder::Arithmetization::NUM_SELECTORS; ++idx) {
selectors[idx] = Polynomial(proving_key.circuit_size);
std::string selector_tag = builder.selector_names[idx] + "_lagrange";
Expand All @@ -54,7 +60,10 @@ template <class Flavor> class ExecutionTrace_ {
}
}
{

#ifdef TRACY_MEMORY
ZoneScopedN("copy cycle initialization");
#endif
copy_cycles.resize(builder.variables.size());
}
}
Expand Down
3 changes: 3 additions & 0 deletions barretenberg/cpp/src/barretenberg/flavor/flavor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,10 @@ template <typename FF, typename CommitmentKey_> class ProvingKey_ {
std::shared_ptr<CommitmentKey_> commitment_key = nullptr)
{
if (commitment_key == nullptr) {

#ifdef TRACY_MEMORY
ZoneScopedN("init commitment key");
#endif
this->commitment_key = std::make_shared<CommitmentKey_>(circuit_size);
} else {
// Don't create another commitment key if we already have one
Expand Down
Loading

0 comments on commit ead4649

Please sign in to comment.