feat: track active accumulator rows and leverage in IVC folding (#9599)
If the IVC accumulator does not make use of the full capacity of a given
block, there will be some number of unused/empty rows at the end of the
block. We have a mechanism to skip performing relation work at such rows
but we sometimes still perform relatively expensive operations like
`get_row()` on them. This additional overhead can become very
significant if the total accumulator content is much smaller than the
structured trace size. We can avoid a lot of unnecessary work on these
empty rows by tracking the "active ranges" of the execution trace, i.e.
the regions of the accumulator which correspond to non-zero relation
contributions.

This PR introduces a class `ExecutionTraceUsageTracker` for tracking the
active regions of the accumulator and for computing an efficient distribution
of execution trace rows across threads according to the actual content. This
logic is leveraged in the combiner and perturbator computations but can
potentially be introduced in other places as well.
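
For illustration, a minimal usage sketch of the new class (simplified; `circuits`, `num_threads`, and `full_domain_size` are placeholders, and the real wiring lives in `ClientIVC::accumulate` and the Protogalaxy prover):

```cpp
// Track actual trace utilization across the circuits of an IVC
ExecutionTraceUsageTracker trace_usage_tracker(trace_structure);
for (const auto& circuit : circuits) {
    trace_usage_tracker.update(circuit); // record per-block utilization and active ranges
}
// Later (e.g. in the combiner/perturbator), split rows across threads by active content
trace_usage_tracker.construct_thread_ranges(num_threads, full_domain_size);
```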

Some high-level numbers for the current benchmark case, but with the 2^19
circuit replaced by another 2^17 circuit (i.e. roughly 2^17 rows of content
in a 2^19 ambient trace):

Master:
```
ClientIVCBench/Full/6      27137 ms        23345 ms

compute_combiner(t)                     5826   22.12%
compute_perturbator(t)                  2295    8.71%
```

Branch:
```
ClientIVCBench/Full/6      23247 ms        21532 ms

compute_combiner(t)                     2804   12.49%
compute_perturbator(t)                  1522    6.78%
```
ledwards2225 authored Nov 4, 2024
1 parent 379145a commit 76328eb
Showing 13 changed files with 475 additions and 168 deletions.
@@ -27,7 +27,7 @@ void vector_of_evaluations(State& state) noexcept

void compute_row_evaluations(State& state) noexcept
{
using Fun = ProtogalaxyProverInternal<DeciderProvingKeys_<Flavor, 2>>;
using PGInternal = ProtogalaxyProverInternal<DeciderProvingKeys_<Flavor, 2>>;
using Polys = Flavor::ProverPolynomials;
using Alphas = Flavor::RelationSeparator;
using Params = RelationParameters<FF>;
@@ -38,7 +38,8 @@ void compute_row_evaluations(State& state) noexcept
auto params = Params::get_random();

for (auto _ : state) {
auto result = Fun::compute_row_evaluations(polys, alphas, params);
PGInternal pg_internal;
auto result = pg_internal.compute_row_evaluations(polys, alphas, params);
DoNotOptimize(result);
}
}
11 changes: 6 additions & 5 deletions barretenberg/cpp/src/barretenberg/client_ivc/client_ivc.cpp
@@ -174,11 +174,15 @@ void ClientIVC::accumulate(ClientCircuit& circuit, const std::shared_ptr<Verific
std::shared_ptr<DeciderProvingKey> proving_key;
if (!initialized) {
proving_key = std::make_shared<DeciderProvingKey>(circuit, trace_structure);
trace_usage_tracker = ExecutionTraceUsageTracker(trace_structure);
} else {
proving_key = std::make_shared<DeciderProvingKey>(
circuit, trace_structure, fold_output.accumulator->proving_key.commitment_key);
}

// Update the accumulator trace usage based on the present circuit
trace_usage_tracker.update(circuit);

// Set the verification key from precomputed if available, else compute it
honk_vk = precomputed_vk ? precomputed_vk : std::make_shared<VerificationKey>(proving_key->proving_key);
if (mock_vk) {
@@ -201,15 +205,12 @@ void ClientIVC::accumulate(ClientCircuit& circuit, const std::shared_ptr<Verific

initialized = true;
} else { // Otherwise, fold the new key into the accumulator
FoldingProver folding_prover({ fold_output.accumulator, proving_key });
FoldingProver folding_prover({ fold_output.accumulator, proving_key }, trace_usage_tracker);
fold_output = folding_prover.prove();

// Add fold proof and corresponding verification key to the verification queue
verification_queue.push_back(bb::ClientIVC::VerifierInputs{ fold_output.proof, honk_vk, QUEUE_TYPE::PG });
}

// Track the maximum size of each block for all circuits processed (for debugging purposes only)
max_block_size_tracker.update(circuit);
}

/**
@@ -219,7 +220,7 @@ void ClientIVC::accumulate(ClientCircuit& circuit, const std::shared_ptr<Verific
*/
ClientIVC::Proof ClientIVC::prove()
{
max_block_size_tracker.print(); // print minimum structured sizes for each block
trace_usage_tracker.print(); // print minimum structured sizes for each block
ASSERT(verification_queue.size() == 1); // ensure only a single fold proof remains in the queue
ASSERT(merge_verification_queue.size() == 1); // ensure only a single merge proof remains in the queue
FoldProof& fold_proof = verification_queue[0].proof;
4 changes: 2 additions & 2 deletions barretenberg/cpp/src/barretenberg/client_ivc/client_ivc.hpp
@@ -2,7 +2,7 @@

#include "barretenberg/goblin/goblin.hpp"
#include "barretenberg/goblin/mock_circuits.hpp"
#include "barretenberg/plonk_honk_shared/arithmetization/max_block_size_tracker.hpp"
#include "barretenberg/plonk_honk_shared/arithmetization/execution_trace_usage_tracker.hpp"
#include "barretenberg/protogalaxy/protogalaxy_prover.hpp"
#include "barretenberg/protogalaxy/protogalaxy_verifier.hpp"
#include "barretenberg/stdlib/primitives/databus/databus.hpp"
@@ -83,7 +83,7 @@ class ClientIVC {
using StdlibVerificationQueue = std::vector<StdlibVerifierInputs>;

// Utility for tracking the max size of each block across the full IVC
MaxBlockSizeTracker max_block_size_tracker;
ExecutionTraceUsageTracker trace_usage_tracker;

private:
using ProverFoldOutput = FoldingResult<Flavor>;
@@ -0,0 +1,232 @@
#pragma once

#include "barretenberg/plonk_honk_shared/arithmetization/mega_arithmetization.hpp"
#include "barretenberg/stdlib_circuit_builders/mega_circuit_builder.hpp"

namespace bb {

/**
* @brief Tracks the cumulative usage of the execution trace across a series of circuits
* @details Primary uses are (1) determining the minimum required structured trace block sizes for a series of circuits
* in an IVC, and (2) determining the optimal distribution of rows across threads to evenly distribute work based on the
* fact that unused rows often do not require any computation.
*
*/
struct ExecutionTraceUsageTracker {
using Range = std::pair<size_t, size_t>;
using Builder = MegaCircuitBuilder;
using MegaTraceBlockSizes = MegaArithmetization::MegaTraceBlocks<size_t>;
using MegaTraceActiveRanges = MegaArithmetization::MegaTraceBlocks<Range>;
using MegaTraceFixedBlockSizes = MegaArithmetization::TraceBlocks;

MegaTraceBlockSizes max_sizes; // max utilization of each block
MegaTraceFixedBlockSizes fixed_sizes; // fixed size of each block prescribed by structuring
MegaTraceActiveRanges active_ranges; // ranges utilized by the accumulator within the ambient structured trace

std::vector<Range> thread_ranges; // ranges within the ambient space over which utilized space is evenly distributed

// Max sizes of the "tables" for databus and conventional lookups (distinct from the sizes of their gate blocks)
size_t max_databus_size = 0;
size_t max_tables_size = 0;

TraceStructure trace_structure = TraceStructure::NONE;

ExecutionTraceUsageTracker(const TraceStructure& trace_structure = TraceStructure::NONE)
: trace_structure(trace_structure)
{
for (auto& size : max_sizes.get()) {
size = 0; // init max sizes to zero
}
fixed_sizes.set_fixed_block_sizes(trace_structure);
fixed_sizes.compute_offsets(/*is_structured=*/true);
}

// Update the max block utilization and active trace ranges based on the data from a provided circuit
void update(const Builder& circuit)
{
// Update the max utilization of each gate block
for (auto [block, max_size] : zip_view(circuit.blocks.get(), max_sizes.get())) {
max_size = std::max(block.size(), max_size);
}

// Update the max size of the databus and lookup tables
max_databus_size = std::max({ max_databus_size,
circuit.get_calldata().size(),
circuit.get_secondary_calldata().size(),
circuit.get_return_data().size() });
max_tables_size = std::max(max_tables_size, circuit.get_tables_size());

// Update the active ranges of the trace based on max block utilization
for (auto [max_size, fixed_block, active_range] :
zip_view(max_sizes.get(), fixed_sizes.get(), active_ranges.get())) {
size_t start_idx = fixed_block.trace_offset;
size_t end_idx = start_idx + max_size;
active_range = Range{ start_idx, end_idx };
}

// The active ranges for the databus and lookup relations (both based on log-deriv lookup argument) must
// incorporate both the lookup/read gate blocks as well as the rows containing the data that is being read.
// Update the corresponding ranges accordingly. (Note: tables are constructed at the 'bottom' of the trace).
size_t dyadic_circuit_size = circuit.get_circuit_subgroup_size(fixed_sizes.get_total_structured_size());
active_ranges.busread.first = 0; // databus data is stored at the top of the trace
active_ranges.busread.second = std::max(max_databus_size, active_ranges.busread.second);
active_ranges.lookup.first = std::min(dyadic_circuit_size - max_tables_size, active_ranges.lookup.first);
active_ranges.lookup.second = dyadic_circuit_size; // lookups are stored at the bottom of the trace
}

// Check whether an index is contained within the active ranges
bool check_is_active(const size_t idx)
{
// If structured trace is not in use, assume the whole trace is active
if (trace_structure == TraceStructure::NONE) {
return true;
}
for (auto& range : active_ranges.get()) {
if (idx >= range.first && idx < range.second) {
return true;
}
}
return false;
}

// For printing only. Must match the order of the members in the arithmetization
std::vector<std::string> block_labels{ "ecc_op", "pub_inputs", "busread",
"arithmetic", "delta_range", "elliptic",
"aux", "poseidon2_external", "poseidon2_internal",
"lookup" };

void print()
{
info("Minimum required block sizes for structured trace: ");
for (auto [label, max_size] : zip_view(block_labels, max_sizes.get())) {
std::cout << std::left << std::setw(20) << (label + ":") << max_size << std::endl;
}
info("");
}

void print_active_ranges()
{
info("Active regions of accumulator: ");
for (auto [label, range] : zip_view(block_labels, active_ranges.get())) {
std::cout << std::left << std::setw(20) << (label + ":") << "(" << range.first << ", " << range.second
<< ")" << std::endl;
}
info("");
}

void print_thread_ranges()
{
info("Thread ranges: ");
for (auto range : thread_ranges) {
std::cout << "(" << range.first << ", " << range.second << ")" << std::endl;
}
info("");
}

/**
* @brief Construct ranges of execution trace rows that evenly distribute the active content of the trace across a
* given number of threads.
*
* @param num_threads Number of ranges (threads) over which to distribute the data
* @param full_domain_size Size of full domain; needed only for unstructured case
*/
void construct_thread_ranges(const size_t num_threads, const size_t full_domain_size)
{
// Copy the ranges into a simple std container for processing by subsequent methods (cheap)
std::vector<Range> active_ranges_copy;
for (const auto& range : active_ranges.get()) {
active_ranges_copy.push_back(range);
}

// Convert the active ranges for each gate type into a set of sorted non-overlapping ranges (union of the input)
std::vector<Range> simplified_active_ranges;
if (trace_structure == TraceStructure::NONE) {
// If not using a structured trace, set the active range to the whole domain
simplified_active_ranges.push_back(Range{ 0, full_domain_size });
} else {
simplified_active_ranges = construct_union_of_ranges(active_ranges_copy);
}

// Determine ranges in the structured trace that evenly distribute the active content across threads
thread_ranges = construct_ranges_for_equal_content_distribution(simplified_active_ranges, num_threads);
}

/**
* @brief Construct sorted disjoint ranges representing the union of an arbitrary set of ranges
* @details Used to convert the more complex set of active ranges for the gate types into a set of well formed
* ranges that can be more easily analyzed.
*
* @param ranges Arbitrary set of input ranges (in practice, active ranges of gate types)
* @return std::vector<Range>
*/
static std::vector<Range> construct_union_of_ranges(std::vector<Range>& ranges)
{
std::vector<Range> union_ranges;

// Sort the ranges by start index (secondarily by end_idx if start indices agree)
std::sort(ranges.begin(), ranges.end());

union_ranges.push_back(ranges.front());

for (const Range& range : ranges) {
Range& prev_range = union_ranges.back();

// If the two ranges overlap or are contiguous, merge them
if (range.first <= prev_range.second) {
prev_range.second = std::max(range.second, prev_range.second);
} else { // otherwise add the present range to the union
union_ranges.push_back(range);
}
}

return union_ranges;
}

/**
* @brief Given a set of ranges indicating "active" regions of an ambient space, define a given number of new ranges
* on the ambient space which evenly divide the content
* @details In practice this is used to determine an even distribution of execution trace rows across threads according
* to ranges describing the active rows of an IVC accumulator
*
* @param union_ranges A set of sorted, disjoint ranges
* @param num_threads
* @return std::vector<Range>
*/
static std::vector<Range> construct_ranges_for_equal_content_distribution(const std::vector<Range>& union_ranges,
const size_t num_threads)
{
// Compute the minimum content per thread (final thread will get the leftovers = total_content % num_threads)
size_t total_content = 0;
for (const Range& range : union_ranges) {
total_content += range.second - range.first;
}
size_t content_per_thread = total_content / num_threads;

std::vector<Range> thread_ranges;
size_t start_idx = union_ranges.front().first;
size_t thread_space_remaining = content_per_thread; // content space remaining in current thread
size_t leftovers = 0; // content from last range not yet placed in a thread range

for (const Range& range : union_ranges) {

size_t range_size = range.second - range.first;
size_t content_to_distribute = range_size + leftovers;
size_t num_full_threads = content_to_distribute / content_per_thread;
leftovers = content_to_distribute % content_per_thread;

size_t end_idx = range.first;
for (size_t i = 0; i < num_full_threads; ++i) {
end_idx += thread_space_remaining;
thread_ranges.push_back(Range{ start_idx, end_idx });
start_idx = end_idx;
thread_space_remaining = content_per_thread;
}
thread_space_remaining = content_per_thread - leftovers;
}
// Extend the final thread range to the end of the final union range
thread_ranges.back().second = union_ranges.back().second;

return thread_ranges;
}
};
} // namespace bb
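
For context, here is a minimal sketch (not part of this diff; `evaluate_row` is a hypothetical stand-in for the per-row relation work) of how the thread ranges produced by `construct_thread_ranges` might be consumed so that each thread receives a comparable amount of active content:

```cpp
#include "barretenberg/plonk_honk_shared/arithmetization/execution_trace_usage_tracker.hpp"

#include <cstddef>
#include <thread>
#include <vector>

void evaluate_rows_in_parallel(bb::ExecutionTraceUsageTracker& tracker, size_t num_threads, size_t full_domain_size)
{
    // Split the trace into ranges containing roughly equal amounts of active content
    tracker.construct_thread_ranges(num_threads, full_domain_size);

    std::vector<std::thread> threads;
    for (const bb::ExecutionTraceUsageTracker::Range& range : tracker.thread_ranges) {
        threads.emplace_back([&tracker, range] {
            for (size_t row = range.first; row < range.second; ++row) {
                if (tracker.check_is_active(row)) {
                    // evaluate_row(row); // hypothetical per-row relation work; inactive rows are skipped
                }
            }
        });
    }
    for (auto& thread : threads) {
        thread.join();
    }
}
```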
@@ -0,0 +1,42 @@
#include "barretenberg/plonk_honk_shared/arithmetization/execution_trace_usage_tracker.hpp"

#include <gtest/gtest.h>

using namespace bb;

/**
* @brief Tests for some of the utility methods in ExecutionTraceUsageTracker for equally distributing work across
* threads for the perturbator/combiner calculations
*
*/
class ExecutionTraceUsageTrackerTest : public ::testing::Test {};

// Test construction of the sorted disjoint union of active ranges from a more general set of ranges
TEST_F(ExecutionTraceUsageTrackerTest, ConstructUnionRanges)
{
using Range = ExecutionTraceUsageTracker::Range;

std::vector<Range> active_ranges = { { 4, 7 }, { 9, 13 }, { 1, 12 }, { 23, 40 }, { 17, 19 } };

std::vector<Range> expected_union_ranges = { { 1, 13 }, { 17, 19 }, { 23, 40 } };

std::vector<Range> union_ranges = ExecutionTraceUsageTracker::construct_union_of_ranges(active_ranges);

EXPECT_EQ(union_ranges, expected_union_ranges);
}

// Test construction of ranges of indices for each thread that evenly distribute work according to the provided ranges
TEST_F(ExecutionTraceUsageTrackerTest, ConstructThreadRanges)
{
using Range = ExecutionTraceUsageTracker::Range;

std::vector<Range> union_ranges = { { 2, 8 }, { 13, 34 }, { 36, 42 }, { 50, 57 } };

std::vector<Range> expected_thread_ranges = { { 2, 17 }, { 17, 27 }, { 27, 39 }, { 39, 57 } };

const size_t num_threads = 4;
std::vector<Range> thread_ranges =
ExecutionTraceUsageTracker::construct_ranges_for_equal_content_distribution(union_ranges, num_threads);

EXPECT_EQ(thread_ranges, expected_thread_ranges);
}
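
As a sanity check on the expected values in the second test (an informal breakdown, not part of the diff): the union ranges contain 6 + 21 + 6 + 7 = 40 active rows, so with 4 threads each output range should cover 10 active rows:

```
{2, 17}  -> rows [2, 8) and [13, 17):   6 + 4 = 10 active rows
{17, 27} -> rows [17, 27):                      10 active rows
{27, 39} -> rows [27, 34) and [36, 39): 7 + 3 = 10 active rows
{39, 57} -> rows [39, 42) and [50, 57): 3 + 7 = 10 active rows
```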