From 9f5773353aa0261fa07a81704bcadcee513d42c5 Mon Sep 17 00:00:00 2001 From: Innokentii Sennovskii Date: Tue, 30 Apr 2024 15:49:09 +0100 Subject: [PATCH 01/45] feat: Avoiding redundant computation in PG (#5844) This PR reduces PG computation time by removing computation on indices over which accumulated relation values are expected to be zero. This gives us a speedup of 4-5%. The PR also parallelises perturbator root construction. Before: x86_64: ![image](https://github.com/AztecProtocol/aztec-packages/assets/4798775/80247864-1c1c-4e34-8756-a8bd44bdbab2) wasm: ![image](https://github.com/AztecProtocol/aztec-packages/assets/4798775/649dfd97-d65c-48a5-8b8c-02fb3fbb9f47) After: x86_64: ![image](https://github.com/AztecProtocol/aztec-packages/assets/4798775/d453e026-7a88-4646-8094-b7102916a2af) wasm: ![image](https://github.com/AztecProtocol/aztec-packages/assets/4798775/549b89a3-d1dd-4058-84a2-92856180d15d) --- .../cpp/src/barretenberg/flavor/flavor.hpp | 15 +- .../barretenberg/polynomials/univariate.hpp | 214 ++++++++++------ .../protogalaxy/combiner.test.cpp | 44 ++-- .../protogalaxy/protogalaxy.test.cpp | 5 + .../protogalaxy/protogalaxy_prover.hpp | 242 +++++++++++++++--- .../relations/nested_containers.hpp | 24 +- .../barretenberg/relations/relation_types.hpp | 6 + .../goblin_ultra_flavor.hpp | 14 + .../stdlib_circuit_builders/ultra_flavor.hpp | 11 + .../sumcheck/instance/instances.hpp | 7 +- .../barretenberg/vm/generated/avm_flavor.hpp | 7 + 11 files changed, 445 insertions(+), 144 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp b/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp index 18eda8ecd0c..fb30168d58d 100644 --- a/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp +++ b/barretenberg/cpp/src/barretenberg/flavor/flavor.hpp @@ -251,18 +251,23 @@ template static constexpr size_t compute * @details The size of the outer tuple is equal to the number of relations. 
Each relation contributes an inner tuple of * univariates whose size is equal to the number of subrelations of the relation. The length of a univariate in an inner * tuple is determined by the corresponding subrelation length and the number of instances to be folded. + * @tparam optimised Enable optimised version with skipping some of the computation */ -template +template static constexpr auto create_protogalaxy_tuple_of_tuples_of_univariates() { if constexpr (Index >= std::tuple_size::value) { return std::tuple<>{}; // Return empty when reach end of the tuple } else { using UnivariateTuple = - typename std::tuple_element_t::template ProtogalaxyTupleOfUnivariatesOverSubrelations; - return std::tuple_cat(std::tuple{}, - create_protogalaxy_tuple_of_tuples_of_univariates()); + std::conditional_t:: + template OptimisedProtogalaxyTupleOfUnivariatesOverSubrelations, + typename std::tuple_element_t:: + template ProtogalaxyTupleOfUnivariatesOverSubrelations>; + return std::tuple_cat( + std::tuple{}, + create_protogalaxy_tuple_of_tuples_of_univariates()); } } diff --git a/barretenberg/cpp/src/barretenberg/polynomials/univariate.hpp b/barretenberg/cpp/src/barretenberg/polynomials/univariate.hpp index aedc1353787..6471ba85b56 100644 --- a/barretenberg/cpp/src/barretenberg/polynomials/univariate.hpp +++ b/barretenberg/cpp/src/barretenberg/polynomials/univariate.hpp @@ -13,17 +13,22 @@ namespace bb { * of the data in those univariates. We do that by taking a view of those elements and then, as needed, using this to * populate new containers. */ -template class UnivariateView; +template class UnivariateView; /** * @brief A univariate polynomial represented by its values on {domain_start, domain_start + 1,..., domain_end - 1}. For * memory efficiency purposes, we store the evaluations in an array starting from 0 and make the mapping to the right * domain under the hood. + * + * @tparam skip_count Skip computing the values of elements [domain_start+1,..,domain_start+skip_count]. 
Used for + * optimising computation in protogalaxy. The value at [domain_start] is the value from the accumulator instance, while + * the values in [domain_start+1, ... domain_start + skip_count] in the accumulator should be zero if the original + * instances are correct. */ -template class Univariate { +template class Univariate { public: static constexpr size_t LENGTH = domain_end - domain_start; - using View = UnivariateView; + using View = UnivariateView; using value_type = Fr; // used to get the type of the elements consistently with std::array @@ -40,8 +45,27 @@ template class Univariate Univariate(Univariate&& other) noexcept = default; Univariate& operator=(const Univariate& other) = default; Univariate& operator=(Univariate&& other) noexcept = default; - // Construct constant Univariate from scalar which represents the value that all the points in the domain evaluate - // to + + /** + * @brief Convert from a version with skipped evaluations to one without skipping (with zeroes in previously skipped + * locations) + * + * @return Univariate + */ + Univariate convert() const noexcept + { + Univariate result; + result.evaluations[0] = evaluations[0]; + for (size_t i = 1; i < skip_count + 1; i++) { + result.evaluations[i] = Fr::zero(); + } + for (size_t i = skip_count + 1; i < LENGTH; i++) { + result.evaluations[i] = evaluations[i]; + } + return result; + } + // Construct constant Univariate from scalar which represents the value that all the points in the domain + // evaluate to explicit Univariate(Fr value) : evaluations{} { @@ -50,7 +74,7 @@ template class Univariate } } // Construct Univariate from UnivariateView - explicit Univariate(UnivariateView in) + explicit Univariate(UnivariateView in) : evaluations{} { for (size_t i = 0; i < in.evaluations.size(); ++i) { @@ -77,7 +101,7 @@ template class Univariate static Univariate get_random() { - auto output = Univariate(); + auto output = Univariate(); for (size_t i = 0; i != LENGTH; ++i) { output.value_at(i) 
= Fr::random_element(); } @@ -86,7 +110,7 @@ template class Univariate static Univariate zero() { - auto output = Univariate(); + auto output = Univariate(); for (size_t i = 0; i != LENGTH; ++i) { output.value_at(i) = Fr::zero(); } @@ -100,21 +124,25 @@ template class Univariate Univariate& operator+=(const Univariate& other) { - for (size_t i = 0; i < LENGTH; ++i) { + evaluations[0] += other.evaluations[0]; + for (size_t i = skip_count + 1; i < LENGTH; ++i) { evaluations[i] += other.evaluations[i]; } return *this; } Univariate& operator-=(const Univariate& other) { - for (size_t i = 0; i < LENGTH; ++i) { + evaluations[0] -= other.evaluations[0]; + for (size_t i = skip_count + 1; i < LENGTH; ++i) { + evaluations[i] -= other.evaluations[i]; } return *this; } Univariate& operator*=(const Univariate& other) { - for (size_t i = 0; i < LENGTH; ++i) { + evaluations[0] *= other.evaluations[0]; + for (size_t i = skip_count + 1; i < LENGTH; ++i) { evaluations[i] *= other.evaluations[i]; } return *this; @@ -135,8 +163,12 @@ template class Univariate Univariate operator-() const { Univariate res(*this); + size_t i = 0; for (auto& eval : res.evaluations) { - eval = -eval; + if (i == 0 || i >= (skip_count + 1)) { + eval = -eval; + } + i++; } return res; } @@ -151,23 +183,39 @@ template class Univariate // Operations between Univariate and scalar Univariate& operator+=(const Fr& scalar) { + size_t i = 0; for (auto& eval : evaluations) { - eval += scalar; + if (i == 0 || i >= (skip_count + 1)) { + eval += scalar; + } + i++; } return *this; } Univariate& operator-=(const Fr& scalar) { + size_t i = 0; for (auto& eval : evaluations) { - eval -= scalar; + // If skip count is zero, will be enabled on every line, otherwise don't compute for [domain_start+1,.., + // domain_start + skip_count] + if (i == 0 || i >= (skip_count + 1)) { + eval -= scalar; + } + i++; } return *this; } Univariate& operator*=(const Fr& scalar) { + size_t i = 0; for (auto& eval : evaluations) { - eval *= scalar; 
+ // If skip count is zero, will be enabled on every line, otherwise don't compute for [domain_start+1,.., + // domain_start + skip_count] + if (i == 0 || i >= (skip_count + 1)) { + eval *= scalar; + } + i++; } return *this; } @@ -194,45 +242,48 @@ template class Univariate } // Operations between Univariate and UnivariateView - Univariate& operator+=(const UnivariateView& view) + Univariate& operator+=(const UnivariateView& view) { - for (size_t i = 0; i < LENGTH; ++i) { + evaluations[0] += view.evaluations[0]; + for (size_t i = skip_count + 1; i < LENGTH; ++i) { evaluations[i] += view.evaluations[i]; } return *this; } - Univariate& operator-=(const UnivariateView& view) + Univariate& operator-=(const UnivariateView& view) { - for (size_t i = 0; i < LENGTH; ++i) { + evaluations[0] -= view.evaluations[0]; + for (size_t i = skip_count + 1; i < LENGTH; ++i) { evaluations[i] -= view.evaluations[i]; } return *this; } - Univariate& operator*=(const UnivariateView& view) + Univariate& operator*=(const UnivariateView& view) { - for (size_t i = 0; i < LENGTH; ++i) { + evaluations[0] *= view.evaluations[0]; + for (size_t i = skip_count + 1; i < LENGTH; ++i) { evaluations[i] *= view.evaluations[i]; } return *this; } - Univariate operator+(const UnivariateView& view) const + Univariate operator+(const UnivariateView& view) const { Univariate res(*this); res += view; return res; } - Univariate operator-(const UnivariateView& view) const + Univariate operator-(const UnivariateView& view) const { Univariate res(*this); res -= view; return res; } - Univariate operator*(const UnivariateView& view) const + Univariate operator*(const UnivariateView& view) const { Univariate res(*this); res *= view; @@ -256,39 +307,42 @@ template class Univariate } /** - * @brief Given a univariate f represented by {f(domain_start), ..., f(domain_end - 1)}, compute the evaluations - * {f(domain_end),..., f(extended_domain_end -1)} and return the Univariate represented by {f(domain_start),..., - * 
f(extended_domain_end -1)} + * @brief Given a univariate f represented by {f(domain_start), ..., f(domain_end - 1)}, compute the + * evaluations {f(domain_end),..., f(extended_domain_end -1)} and return the Univariate represented by + * {f(domain_start),..., f(extended_domain_end -1)} * - * @details Write v_i = f(x_i) on a the domain {x_{domain_start}, ..., x_{domain_end-1}}. To efficiently compute the - * needed values of f, we use the barycentric formula + * @details Write v_i = f(x_i) on a the domain {x_{domain_start}, ..., x_{domain_end-1}}. To efficiently + * compute the needed values of f, we use the barycentric formula * - f(x) = B(x) Σ_{i=domain_start}^{domain_end-1} v_i / (d_i*(x-x_i)) * where * - B(x) = Π_{i=domain_start}^{domain_end-1} (x-x_i) - * - d_i = Π_{j ∈ {domain_start, ..., domain_end-1}, j≠i} (x_i-x_j) for i ∈ {domain_start, ..., domain_end-1} + * - d_i = Π_{j ∈ {domain_start, ..., domain_end-1}, j≠i} (x_i-x_j) for i ∈ {domain_start, ..., + * domain_end-1} * - * When the domain size is two, extending f = v0(1-X) + v1X to a new value involves just one addition and a - * subtraction: setting Δ = v1-v0, the values of f(X) are f(0)=v0, f(1)= v0 + Δ, v2 = f(1) + Δ, v3 = f(2) + Δ... + * When the domain size is two, extending f = v0(1-X) + v1X to a new value involves just one addition + * and a subtraction: setting Δ = v1-v0, the values of f(X) are f(0)=v0, f(1)= v0 + Δ, v2 = f(1) + Δ, v3 + * = f(2) + Δ... 
* */ - template Univariate extend_to() const + template + Univariate extend_to() const { const size_t EXTENDED_LENGTH = EXTENDED_DOMAIN_END - domain_start; using Data = BarycentricData; static_assert(EXTENDED_LENGTH >= LENGTH); - Univariate result; + Univariate result; std::copy(evaluations.begin(), evaluations.end(), result.evaluations.begin()); static constexpr Fr inverse_two = Fr(2).invert(); + static_assert(NUM_SKIPPED_INDICES < LENGTH); if constexpr (LENGTH == 2) { Fr delta = value_at(1) - value_at(0); static_assert(EXTENDED_LENGTH != 0); for (size_t idx = domain_end - 1; idx < EXTENDED_DOMAIN_END - 1; idx++) { result.value_at(idx + 1) = result.value_at(idx) + delta; } - return result; } else if constexpr (LENGTH == 3) { // Based off https://hackmd.io/@aztec-network/SyR45cmOq?type=view // The technique used here is the same as the length == 3 case below. @@ -304,7 +358,6 @@ template class Univariate result.value_at(idx + 1) = result.value_at(idx) + extra; extra += a2; } - return result; } else if constexpr (LENGTH == 4) { static constexpr Fr inverse_six = Fr(6).invert(); // computed at compile time for efficiency @@ -315,8 +368,8 @@ template class Univariate // a*1 + b*1 + c*1 + d = f(1) // a*2^3 + b*2^2 + c*2 + d = f(2) // a*3^3 + b*3^2 + c*3 + d = f(3) - // These equations can be rewritten as a matrix equation M * [a, b, c, d] = [f(0), f(1), f(2), f(3)], where - // M is: + // These equations can be rewritten as a matrix equation M * [a, b, c, d] = [f(0), f(1), f(2), + // f(3)], where M is: // 0, 0, 0, 1 // 1, 1, 1, 1 // 2^3, 2^2, 2, 1 @@ -326,9 +379,9 @@ template class Univariate // 1, -5/2, 2, -1/2 // -11/6, 3, -3/2, 1/3 // 1, 0, 0, 0 - // To compute these values, we can multiply everything by 6 and multiply by inverse_six at the end for each - // coefficient The resulting computation here does 18 field adds, 6 subtracts, 3 muls to compute a, b, c, - // and d. 
+ // To compute these values, we can multiply everything by 6 and multiply by inverse_six at the + // end for each coefficient The resulting computation here does 18 field adds, 6 subtracts, 3 + // muls to compute a, b, c, and d. Fr zero_times_3 = value_at(0) + value_at(0) + value_at(0); Fr zero_times_6 = zero_times_3 + zero_times_3; Fr zero_times_12 = zero_times_6 + zero_times_6; @@ -368,7 +421,6 @@ template class Univariate linear_term += three_a_plus_two_b; } - return result; } else { for (size_t k = domain_end; k != EXTENDED_DOMAIN_END; ++k) { result.value_at(k) = 0; @@ -381,8 +433,8 @@ template class Univariate // scale the sum by the the value of of B(x) result.value_at(k) *= Data::full_numerator_values[k]; } - return result; } + return result; } /** @@ -399,8 +451,8 @@ template class Univariate full_numerator_value *= u - i; } - // build set of domain size-many denominator inverses 1/(d_i*(x_k - x_j)). will multiply against each of - // these (rather than to divide by something) for each barycentric evaluation + // build set of domain size-many denominator inverses 1/(d_i*(x_k - x_j)). 
will multiply against + // each of these (rather than to divide by something) for each barycentric evaluation std::array denominator_inverses; for (size_t i = 0; i != LENGTH; ++i) { Fr inv = Data::lagrange_denominators[i]; @@ -443,7 +495,7 @@ inline void write(B& it, Univariate const& univari write(it, univariate.evaluations); } -template class UnivariateView { +template class UnivariateView { public: static constexpr size_t LENGTH = domain_end - domain_start; std::span evaluations; @@ -453,77 +505,84 @@ template class Univariate const Fr& value_at(size_t i) const { return evaluations[i]; }; template - explicit UnivariateView(const Univariate& univariate_in) + explicit UnivariateView(const Univariate& univariate_in) : evaluations(std::span(univariate_in.evaluations.data(), LENGTH)){}; - Univariate operator+(const UnivariateView& other) const + Univariate operator+(const UnivariateView& other) const { - Univariate res(*this); + Univariate res(*this); res += other; return res; } - Univariate operator-(const UnivariateView& other) const + Univariate operator-(const UnivariateView& other) const { - Univariate res(*this); + Univariate res(*this); res -= other; return res; } - Univariate operator-() const + Univariate operator-() const { - Univariate res(*this); + Univariate res(*this); + size_t i = 0; for (auto& eval : res.evaluations) { - eval = -eval; + if (i == 0 || i >= (skip_count + 1)) { + eval = -eval; + } + i++; } return res; } - Univariate operator*(const UnivariateView& other) const + Univariate operator*(const UnivariateView& other) const { - Univariate res(*this); + Univariate res(*this); res *= other; return res; } - Univariate operator*(const Univariate& other) const + Univariate operator*( + const Univariate& other) const { - Univariate res(*this); + Univariate res(*this); res *= other; return res; } - Univariate operator+(const Univariate& other) const + Univariate operator+( + const Univariate& other) const { - Univariate res(*this); + Univariate 
res(*this); res += other; return res; } - Univariate operator+(const Fr& other) const + Univariate operator+(const Fr& other) const { - Univariate res(*this); + Univariate res(*this); res += other; return res; } - Univariate operator-(const Fr& other) const + Univariate operator-(const Fr& other) const { - Univariate res(*this); + Univariate res(*this); res -= other; return res; } - Univariate operator*(const Fr& other) const + Univariate operator*(const Fr& other) const { - Univariate res(*this); + Univariate res(*this); res *= other; return res; } - Univariate operator-(const Univariate& other) const + Univariate operator-( + const Univariate& other) const { - Univariate res(*this); + Univariate res(*this); res -= other; return res; } @@ -546,8 +605,8 @@ template class Univariate }; /** - * @brief Create a sub-array of `elements` at the indices given in the template pack `Is`, converting them to the new - * type T. + * @brief Create a sub-array of `elements` at the indices given in the template pack `Is`, converting them + * to the new type T. * * @tparam T type to convert to * @tparam U type to convert from @@ -555,8 +614,8 @@ template class Univariate * @tparam Is list of indices we want in the returned array. When the second argument is called with * `std::make_index_sequence`, these will be `0, 1, ..., N-1`. * @param elements array to convert from - * @return std::array result array s.t. result[i] = T(elements[Is[i]]). By default, Is[i] = i when - * called with `std::make_index_sequence`. + * @return std::array result array s.t. result[i] = T(elements[Is[i]]). By default, Is[i] + * = i when called with `std::make_index_sequence`. */ template std::array array_to_array_aux(const std::array& elements, std::index_sequence) @@ -568,11 +627,12 @@ std::array array_to_array_aux(const std::array& elements * @brief Given an std::array, returns an std::array, by calling the (explicit) constructor T(U). 
* * @details https://stackoverflow.com/a/32175958 - * The main use case is to convert an array of `Univariate` into `UnivariateView`. The main use case would be to let - * Sumcheck decide the required degree of the relation evaluation, rather than hardcoding it inside the relation. The - * `_aux` version could also be used to create an array of only the polynomials required by the relation, and it could - * help us implement the optimization where we extend each edge only up to the maximum degree that is required over all - * relations (for example, `L_LAST` only needs degree 3). + * The main use case is to convert an array of `Univariate` into `UnivariateView`. The main use case would + * be to let Sumcheck decide the required degree of the relation evaluation, rather than hardcoding it + * inside the relation. The + * `_aux` version could also be used to create an array of only the polynomials required by the relation, + * and it could help us implement the optimization where we extend each edge only up to the maximum degree + * that is required over all relations (for example, `L_LAST` only needs degree 3). 
* * @tparam T Output type * @tparam U Input type (deduced from `elements`) diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/combiner.test.cpp b/barretenberg/cpp/src/barretenberg/protogalaxy/combiner.test.cpp index 72a6fa53233..4ff7f81cb51 100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/combiner.test.cpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/combiner.test.cpp @@ -44,6 +44,12 @@ TEST(Protogalaxy, CombinerOn2Instances) auto prover_polynomials = get_sequential_prover_polynomials( /*log_circuit_size=*/1, idx * 128); restrict_to_standard_arithmetic_relation(prover_polynomials); + // This ensures that the combiner accumulator for second instance = 0 + // The value is computed by generating the python script values, computing the resulting accumulator and + // taking the value at index 1 + if (idx == NUM_INSTANCES - 1) { + prover_polynomials.q_c[0] -= 13644570; + } instance->proving_key.polynomials = std::move(prover_polynomials); instance->proving_key.circuit_size = 2; instance_data[idx] = instance; @@ -52,22 +58,22 @@ TEST(Protogalaxy, CombinerOn2Instances) ProverInstances instances{ instance_data }; instances.alphas.fill(bb::Univariate(FF(0))); // focus on the arithmetic relation only auto pow_polynomial = PowPolynomial(std::vector{ 2 }); - auto result = prover.compute_combiner(instances, pow_polynomial); - auto expected_result = Univariate(std::array{ - 87706, - 13644570, - 76451738, - 226257946, - static_cast(500811930), - static_cast(937862426), - static_cast(1575158170), - static_cast(2450447898), - static_cast(3601480346), - static_cast(5066004250), - static_cast(6881768346), - static_cast(9086521370), - }); + auto result = prover.compute_combiner(instances, pow_polynomial); + auto optimised_result = prover.compute_combiner(instances, pow_polynomial); + auto expected_result = Univariate(std::array{ 87706, + 0, + 0x02ee2966, + 0x0b0bd2cc, + 0x00001a98fc32, + 0x000033d5a598, + 0x00005901cefe, + 0x00008c5d7864, + 0x0000d028a1ca, + 
0x000126a34b30UL, + 0x0001920d7496UL, + 0x000214a71dfcUL }); EXPECT_EQ(result, expected_result); + EXPECT_EQ(optimised_result, expected_result); } else { std::vector> instance_data(NUM_INSTANCES); ProtoGalaxyProver prover; @@ -130,11 +136,13 @@ TEST(Protogalaxy, CombinerOn2Instances) 0 0 0 0 0 0 0 0 0 6 18 36 60 90 */ auto pow_polynomial = PowPolynomial(std::vector{ 2 }); - auto result = prover.compute_combiner(instances, pow_polynomial); + auto result = prover.compute_combiner(instances, pow_polynomial); + auto optimised_result = prover.compute_combiner(instances, pow_polynomial); auto expected_result = Univariate(std::array{ 0, 0, 12, 36, 72, 120, 180, 252, 336, 432, 540, 660 }); EXPECT_EQ(result, expected_result); + EXPECT_EQ(optimised_result, expected_result); } }; run_test(true); @@ -181,11 +189,13 @@ TEST(Protogalaxy, CombinerOn4Instances) zero_all_selectors(instances[3]->proving_key.polynomials); auto pow_polynomial = PowPolynomial(std::vector{ 2 }); - auto result = prover.compute_combiner(instances, pow_polynomial); + auto result = prover.compute_combiner(instances, pow_polynomial); + auto optimised_result = prover.compute_combiner(instances, pow_polynomial); std::array zeroes; std::fill(zeroes.begin(), zeroes.end(), 0); auto expected_result = Univariate(zeroes); EXPECT_EQ(result, expected_result); + EXPECT_EQ(optimised_result, expected_result); }; run_test(); }; diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy.test.cpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy.test.cpp index 0b51c91f57b..3148c54cd40 100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy.test.cpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy.test.cpp @@ -279,6 +279,11 @@ template class ProtoGalaxyTests : public testing::Test { bb::Univariate expected_eta{ { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 } }; EXPECT_EQ(instances.relation_parameters.eta, expected_eta); + // Optimised relation parameters are the same, we just 
don't compute any values for non-used indices when + // deriving values from them + for (size_t i = 0; i < 11; i++) { + EXPECT_EQ(instances.optimised_relation_parameters.eta.evaluations[i], expected_eta.evaluations[i]); + } } /** diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.hpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.hpp index d9cffcca9c7..c03af2e5333 100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.hpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.hpp @@ -49,15 +49,26 @@ template class ProtoGalaxyProver_ { // The length of ExtendedUnivariate is the largest length (==max_relation_degree + 1) of a univariate polynomial // obtained by composing a relation with folded instance + relation parameters . using ExtendedUnivariate = Univariate; + // Same as ExtendedUnivariate, but uses optimised univariates which skip redundant computation in optimistic cases + // (when we know that the evaluation of all relations is 0 on a particular index, for example) + using OptimisedExtendedUnivariate = + Univariate; // Represents the total length of the combiner univariate, obtained by combining the already folded relations with // the folded relation batching challenge. 
using ExtendedUnivariateWithRandomization = Univariate; using ExtendedUnivariates = typename Flavor::template ProverUnivariates; + using OptimisedExtendedUnivariates = + typename Flavor::template OptimisedProverUnivariates; using TupleOfTuplesOfUnivariates = typename Flavor::template ProtogalaxyTupleOfTuplesOfUnivariates; + using OptimisedTupleOfTuplesOfUnivariates = + typename Flavor::template OptimisedProtogalaxyTupleOfTuplesOfUnivariates; using RelationEvaluations = typename Flavor::TupleOfArraysOfValues; static constexpr size_t NUM_SUBRELATIONS = ProverInstances::NUM_SUBRELATIONS; @@ -209,14 +220,20 @@ template class ProtoGalaxyProver_ { auto prev_level_width = prev_level_coeffs.size(); // we need degree + 1 terms to represent the intermediate polynomials std::vector> level_coeffs(prev_level_width >> 1, std::vector(degree + 1, 0)); - for (size_t node = 0; node < prev_level_width; node += 2) { - auto parent = node >> 1; - std::copy(prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin()); - for (size_t d = 0; d < degree; d++) { - level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level]; - level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level]; - } - } + run_loop_in_parallel( + prev_level_width >> 1, + [&](size_t start, size_t end) { + for (size_t node = start << 1; node < end << 1; node += 2) { + auto parent = node >> 1; + std::copy( + prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin()); + for (size_t d = 0; d < degree; d++) { + level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level]; + level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level]; + } + } + }, + /*no_multhreading_if_less_or_equal=*/8); return construct_coefficients_tree(betas, deltas, level_coeffs, level + 1); } @@ -236,11 +253,16 @@ template class ProtoGalaxyProver_ { { auto width = full_honk_evaluations.size(); std::vector> 
first_level_coeffs(width >> 1, std::vector(2, 0)); - for (size_t node = 0; node < width; node += 2) { - auto parent = node >> 1; - first_level_coeffs[parent][0] = full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0]; - first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0]; - } + run_loop_in_parallel(width >> 1, [&](size_t start, size_t end) { + // Run loop in parallel can divide the domain in such way that the indices are odd, which we can't tolerate + // here, so first we divide the width by two, enable parallelism and then reconstruct even start and end + for (size_t node = start << 1; node < end << 1; node += 2) { + auto parent = node >> 1; + first_level_coeffs[parent][0] = + full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0]; + first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0]; + } + }); return construct_coefficients_tree(betas, deltas, first_level_coeffs); } @@ -262,26 +284,38 @@ template class ProtoGalaxyProver_ { return Polynomial(coeffs); } + OptimisedTupleOfTuplesOfUnivariates optimised_univariate_accumulators; TupleOfTuplesOfUnivariates univariate_accumulators; /** * @brief Prepare a univariate polynomial for relation execution in one step of the main loop in folded instance * construction. - * @details For a fixed prover polynomial index, extract that polynomial from each instance in Instances. From each - * polynomial, extract the value at row_idx. Use these values to create a univariate polynomial, and then extend - * (i.e., compute additional evaluations at adjacent domain values) as needed. + * @details For a fixed prover polynomial index, extract that polynomial from each instance in Instances. From + *each polynomial, extract the value at row_idx. Use these values to create a univariate polynomial, and then + *extend (i.e., compute additional evaluations at adjacent domain values) as needed. 
* @todo TODO(https://github.com/AztecProtocol/barretenberg/issues/751) Optimize memory + * + * */ - void extend_univariates(ExtendedUnivariates& extended_univariates, - const ProverInstances& instances, - const size_t row_idx) + + template + void extend_univariates( + std::conditional_t& extended_univariates, + const ProverInstances& instances, + const size_t row_idx) { - auto base_univariates = instances.row_to_univariates(row_idx); + auto base_univariates = instances.template row_to_univariates(row_idx); for (auto [extended_univariate, base_univariate] : zip_view(extended_univariates.get_all(), base_univariates)) { - extended_univariate = base_univariate.template extend_to(); + extended_univariate = base_univariate.template extend_to(); } } + /** + * @brief Add the value of each relation over univariates to an appropriate accumulator + * + * @tparam Parameters relation parameters type + * @tparam relation_idx The index of the relation + */ template void accumulate_relation_univariates(TupleOfTuplesOfUnivariates& univariate_accumulators, const ExtendedUnivariates& extended_univariates, @@ -294,39 +328,73 @@ template class ProtoGalaxyProver_ { // Repeat for the next relation. if constexpr (relation_idx + 1 < Flavor::NUM_RELATIONS) { - accumulate_relation_univariates( - univariate_accumulators, extended_univariates, relation_parameters, scaling_factor); + accumulate_relation_univariates< + + Parameters, + relation_idx + 1>(univariate_accumulators, extended_univariates, relation_parameters, scaling_factor); } } /** - * @brief Compute the combiner polynomial $G$ in the Protogalaxy paper. 
+ * @brief Add the value of each relation over univariates to an appropriate accumulator with index skipping + * optimisation + * + * @tparam Parameters relation parameters type + * @tparam relation_idx The index of the relation + */ + template + void accumulate_relation_univariates(OptimisedTupleOfTuplesOfUnivariates& univariate_accumulators, + const OptimisedExtendedUnivariates& extended_univariates, + const Parameters& relation_parameters, + const FF& scaling_factor) + { + using Relation = std::tuple_element_t; + Relation::accumulate( + std::get(univariate_accumulators), extended_univariates, relation_parameters, scaling_factor); + + // Repeat for the next relation. + if constexpr (relation_idx + 1 < Flavor::NUM_RELATIONS) { + accumulate_relation_univariates< + + Parameters, + relation_idx + 1>(univariate_accumulators, extended_univariates, relation_parameters, scaling_factor); + } + } + /** + * @brief Compute the combiner polynomial $G$ in the Protogalaxy paper * */ + template = true> ExtendedUnivariateWithRandomization compute_combiner(const ProverInstances& instances, PowPolynomial& pow_betas) { - BB_OP_COUNT_TIME(); size_t common_instance_size = instances[0]->proving_key.circuit_size; pow_betas.compute_values(); // Determine number of threads for multithreading. // Note: Multithreading is "on" for every round but we reduce the number of threads from the max available based - // on a specified minimum number of iterations per thread. This eventually leads to the use of a single thread. - // For now we use a power of 2 number of threads simply to ensure the round size is evenly divided. + // on a specified minimum number of iterations per thread. This eventually leads to the use of a + // single thread. For now we use a power of 2 number of threads simply to ensure the round size is evenly + // divided. 
size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2) size_t min_iterations_per_thread = 1 << 6; // min number of iterations for which we'll spin up a unique thread size_t desired_num_threads = common_instance_size / min_iterations_per_thread; size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified num_threads = num_threads > 0 ? num_threads : 1; // ensure num threads is >= 1 size_t iterations_per_thread = common_instance_size / num_threads; // actual iterations per thread + + // Univariates are optimised for usual PG, but we need the unoptimised version for tests (it's a version that + // doesn't skip computation), so we need to define types depending on the template instantiation + using ThreadAccumulators = TupleOfTuplesOfUnivariates; + using ExtendedUnivatiatesType = ExtendedUnivariates; + // Construct univariate accumulator containers; one per thread - std::vector thread_univariate_accumulators(num_threads); + std::vector thread_univariate_accumulators(num_threads); for (auto& accum : thread_univariate_accumulators) { // just normal relation lengths Utils::zero_univariates(accum); } // Construct extended univariates containers; one per thread - std::vector extended_univariates; + std::vector extended_univariates; extended_univariates.resize(num_threads); // Accumulate the contribution from each sub-relation @@ -335,14 +403,15 @@ template class ProtoGalaxyProver_ { size_t end = (thread_idx + 1) * iterations_per_thread; for (size_t idx = start; idx < end; idx++) { - // No need to initialise extended_univariates to 0, it's assigned to + extend_univariates(extended_univariates[thread_idx], instances, idx); FF pow_challenge = pow_betas[idx]; - // Accumulate the i-th row's univariate contribution. Note that the relation parameters passed to this - // function have already been folded. 
Moreover, linear-dependent relations that act over the entire - // execution trace rather than on rows, will not be multiplied by the pow challenge. + // Accumulate the i-th row's univariate contribution. Note that the relation parameters passed to + // this function have already been folded. Moreover, linear-dependent relations that act over the + // entire execution trace rather than on rows, will not be multiplied by the pow challenge. + accumulate_relation_univariates( thread_univariate_accumulators[thread_idx], extended_univariates[thread_idx], @@ -350,19 +419,115 @@ template class ProtoGalaxyProver_ { pow_challenge); } }); - + Utils::zero_univariates(univariate_accumulators); // Accumulate the per-thread univariate accumulators into a single set of accumulators for (auto& accumulators : thread_univariate_accumulators) { Utils::add_nested_tuples(univariate_accumulators, accumulators); } - // Batch the univariate contributions from each sub-relation to obtain the round univariate + return batch_over_relations(univariate_accumulators, instances.alphas); } + /** + * @brief Compute the combiner polynomial $G$ in the Protogalaxy paper using the index-skipping optimisation + * + * @todo (https://github.com/AztecProtocol/barretenberg/issues/968) Make combiner tests better + * + */ + template = true> + ExtendedUnivariateWithRandomization compute_combiner(const ProverInstances& instances, PowPolynomial& pow_betas) + { + BB_OP_COUNT_TIME(); + size_t common_instance_size = instances[0]->proving_key.circuit_size; + pow_betas.compute_values(); + // Determine number of threads for multithreading. + // Note: Multithreading is "on" for every round but we reduce the number of threads from the max available based + // on a specified minimum number of iterations per thread. This eventually leads to the use of a + // single thread. For now we use a power of 2 number of threads simply to ensure the round size is evenly + // divided.
+ size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2) + size_t min_iterations_per_thread = 1 << 6; // min number of iterations for which we'll spin up a unique thread + size_t desired_num_threads = common_instance_size / min_iterations_per_thread; + size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified + num_threads = num_threads > 0 ? num_threads : 1; // ensure num threads is >= 1 + size_t iterations_per_thread = common_instance_size / num_threads; // actual iterations per thread + + // Univariates are optimised for usual PG, but we need the unoptimised version for tests (it's a version that + // doesn't skip computation), so we need to define types depending on the template instantiation + using ThreadAccumulators = OptimisedTupleOfTuplesOfUnivariates; + using ExtendedUnivatiatesType = OptimisedExtendedUnivariates; + + // Construct univariate accumulator containers; one per thread + std::vector thread_univariate_accumulators(num_threads); + for (auto& accum : thread_univariate_accumulators) { + // just normal relation lengths + Utils::zero_univariates(accum); + } + + // Construct extended univariates containers; one per thread + std::vector extended_univariates; + extended_univariates.resize(num_threads); + + // Accumulate the contribution from each sub-relation + parallel_for(num_threads, [&](size_t thread_idx) { + size_t start = thread_idx * iterations_per_thread; + size_t end = (thread_idx + 1) * iterations_per_thread; + + for (size_t idx = start; idx < end; idx++) { + // No need to initialise extended_univariates to 0, it's assigned to + // Instantiate univariates with skipping to ignore computation in those indices (they are still + // available for skipping relations, but all derived univariates will ignore those evaluations) + extend_univariates( + extended_univariates[thread_idx], instances, idx); + + FF pow_challenge = pow_betas[idx]; + + // Accumulate the i-th row's
univariate contribution. Note that the relation parameters passed to + // this function have already been folded. Moreover, linear-dependent relations that act over the + // entire execution trace rather than on rows, will not be multiplied by the pow challenge. + accumulate_relation_univariates( + thread_univariate_accumulators[thread_idx], + extended_univariates[thread_idx], + instances.optimised_relation_parameters, // these parameters have already been folded + pow_challenge); + } + }); + Utils::zero_univariates(optimised_univariate_accumulators); + // Accumulate the per-thread univariate accumulators into a single set of accumulators + for (auto& accumulators : thread_univariate_accumulators) { + Utils::add_nested_tuples(optimised_univariate_accumulators, accumulators); + } + + // Convert from optimised version to non-optimised + deoptimise_univariates(optimised_univariate_accumulators, univariate_accumulators); + // Batch the univariate contributions from each sub-relation to obtain the round univariate + return batch_over_relations(univariate_accumulators, instances.alphas); + } + + /** + * @brief Convert univariates from optimised form to regular + * + * @details We need to convert before we batch relations, since optimised versions don't have enough information to + * extend the univariates to maximum length + * + * @param optimised_univariate_accumulators + * @param new_univariate_accumulators + */ + static void deoptimise_univariates(const OptimisedTupleOfTuplesOfUnivariates& optimised_univariate_accumulators, + TupleOfTuplesOfUnivariates& new_univariate_accumulators + + ) + { + auto deoptimise = [&](auto& element) { + auto& optimised_element = std::get(std::get(optimised_univariate_accumulators)); + element = optimised_element.convert(); + }; + + Utils::template apply_to_tuple_of_tuples<0, 0>(new_univariate_accumulators, deoptimise); + } static ExtendedUnivariateWithRandomization batch_over_relations(TupleOfTuplesOfUnivariates& univariate_accumulators, 
const CombinedRelationSeparator& alpha) { - // First relation does not get multiplied by a batching challenge auto result = std::get<0>(std::get<0>(univariate_accumulators)) .template extend_to(); @@ -432,7 +597,8 @@ template class ProtoGalaxyProver_ { { size_t param_idx = 0; auto to_fold = instances.relation_parameters.get_to_fold(); - for (auto& folded_parameter : to_fold) { + auto to_fold_optimised = instances.optimised_relation_parameters.get_to_fold(); + for (auto [folded_parameter, optimised_folded_parameter] : zip_view(to_fold, to_fold_optimised)) { Univariate tmp(0); size_t instance_idx = 0; for (auto& instance : instances) { @@ -440,6 +606,8 @@ template class ProtoGalaxyProver_ { instance_idx++; } folded_parameter = tmp.template extend_to(); + optimised_folded_parameter = + tmp.template extend_to(); param_idx++; } } diff --git a/barretenberg/cpp/src/barretenberg/relations/nested_containers.hpp b/barretenberg/cpp/src/barretenberg/relations/nested_containers.hpp index 46f2d246303..36a522eb161 100644 --- a/barretenberg/cpp/src/barretenberg/relations/nested_containers.hpp +++ b/barretenberg/cpp/src/barretenberg/relations/nested_containers.hpp @@ -10,30 +10,42 @@ namespace bb { * * @details Credit: https://stackoverflow.com/a/60440611 */ -template