Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: parallelization update for polynomials #2311

Merged
merged 7 commits into from
Sep 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#include "thread_utils.hpp"

namespace barretenberg::thread_utils {
/**
lucasxia01 marked this conversation as resolved.
Show resolved Hide resolved
* @brief calculates number of threads to create based on minimum iterations per thread
* @details Finds the number of cpus with get_num_cpus(), and calculates `desired_num_threads`
* Returns the min of `desired_num_threads` and `max_num_threads`.
* Note that it will not calculate a power of 2 necessarily, use `calculate_num_threads_pow2` instead
*
* @param num_iterations
* @param min_iterations_per_thread
* @return size_t
*/
size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread)
{
    // A minimum of 0 would make the division below a division by zero (undefined behaviour);
    // treat it as "no minimum", i.e. one iteration is enough to justify a thread.
    const size_t min_iterations = (min_iterations_per_thread == 0) ? 1 : min_iterations_per_thread;

    const size_t max_num_threads = get_num_cpus();                      // number of available threads
    const size_t desired_num_threads = num_iterations / min_iterations; // threads the workload justifies

    // Use fewer threads than the maximum when the workload is small, but never fewer than one.
    const size_t num_threads = std::min(desired_num_threads, max_num_threads);
    return num_threads > 0 ? num_threads : 1;
}

/**
* @brief calculates number of threads to create based on minimum iterations per thread, guaranteed power of 2
* @details Same functionality as `calculate_num_threads` but guaranteed power of 2
* @param num_iterations
* @param min_iterations_per_thread
* @return size_t
*/
size_t calculate_num_threads_pow2(size_t num_iterations, size_t min_iterations_per_thread)
{
    // A minimum of 0 would make the division below a division by zero (undefined behaviour);
    // treat it as "no minimum", i.e. one iteration is enough to justify a thread.
    const size_t min_iterations = (min_iterations_per_thread == 0) ? 1 : min_iterations_per_thread;

    const size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2)
    size_t desired_num_threads = num_iterations / min_iterations;

    // Fewer iterations than the per-thread minimum: a single thread does all the work.
    // Returning here also avoids asking get_msb for the top bit of 0, which has no set bit.
    if (desired_num_threads == 0) {
        return 1;
    }

    // Round the desired thread count down to the nearest power of 2.
    desired_num_threads = static_cast<size_t>(1ULL << numeric::get_msb(desired_num_threads));

    // Use fewer threads than the maximum when the workload is small, but never fewer than one.
    const size_t num_threads = std::min(desired_num_threads, max_num_threads);
    return num_threads > 0 ? num_threads : 1;
}

} // namespace barretenberg::thread_utils
29 changes: 29 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once // this header defines a constant and default arguments, so it must be included at most once per TU
#include "thread.hpp"

#include <cstddef>

namespace barretenberg::thread_utils {

/// Default minimum number of iterations that justifies giving work to a separate thread (16).
constexpr size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4;

/**
 * @brief calculates number of threads to create based on minimum iterations per thread
 * @details Finds the number of cpus with get_num_cpus(), and calculates `desired_num_threads`
 * Returns the min of `desired_num_threads` and `max_num_threads`.
 * Note that it will not calculate a power of 2 necessarily, use `calculate_num_threads_pow2` instead
 *
 * @param num_iterations total number of loop iterations to distribute
 * @param min_iterations_per_thread minimum number of iterations each thread should receive
 * (defaults to `DEFAULT_MIN_ITERS_PER_THREAD`)
 * @return size_t number of threads to use
 */
size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);

/**
 * @brief calculates number of threads to create based on minimum iterations per thread, guaranteed power of 2
 * @details Same functionality as `calculate_num_threads` but guaranteed power of 2
 *
 * @param num_iterations total number of loop iterations to distribute
 * @param min_iterations_per_thread minimum number of iterations each thread should receive
 * (defaults to `DEFAULT_MIN_ITERS_PER_THREAD`)
 * @return size_t number of threads to use (a power of 2)
 */
size_t calculate_num_threads_pow2(size_t num_iterations,
                                  size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);

} // namespace barretenberg::thread_utils
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once
#include "barretenberg/common/log.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/common/thread_utils.hpp"
#include "barretenberg/polynomials/barycentric.hpp"
#include "barretenberg/polynomials/pow.hpp"
#include "barretenberg/proof_system/flavor/flavor.hpp"
Expand Down Expand Up @@ -140,12 +141,10 @@ template <typename Flavor> class SumcheckProverRound {
// Note: Multithreading is "on" for every round but we reduce the number of threads from the max available based
// on a specified minimum number of iterations per thread. This eventually leads to the use of a single thread.
// For now we use a power of 2 number of threads simply to ensure the round size is evenly divided.
size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2)
size_t min_iterations_per_thread = 1 << 6; // min number of iterations for which we'll spin up a unique thread
size_t desired_num_threads = round_size / min_iterations_per_thread;
size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
num_threads = num_threads > 0 ? num_threads : 1; // ensure num threads is >= 1
size_t iterations_per_thread = round_size / num_threads; // actual iterations per thread
size_t num_threads =
barretenberg::thread_utils::calculate_num_threads_pow2(round_size, min_iterations_per_thread);
size_t iterations_per_thread = round_size / num_threads; // actual iterations per thread

// Construct univariate accumulator containers; one per thread
std::vector<RelationUnivariates> thread_univariate_accumulators(num_threads);
Expand Down
67 changes: 45 additions & 22 deletions barretenberg/cpp/src/barretenberg/polynomials/polynomial.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include "polynomial.hpp"
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/slab_allocator.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/common/thread_utils.hpp"
#include "polynomial_arithmetic.hpp"
#include <cstddef>
#include <fcntl.h>
Expand Down Expand Up @@ -306,25 +308,34 @@ template <typename Fr> void Polynomial<Fr>::add_scaled(std::span<const Fr> other
const size_t other_size = other.size();
ASSERT(in_place_operation_viable(other_size));

/** TODO parallelize using some kind of generic evaluation domain
* we really only need to know the thread size, but we don't need all the FFT roots
*/
for (size_t i = 0; i < other_size; ++i) {
coefficients_.get()[i] += scaling_factor * other[i];
}
// Calculates number of threads with thread_utils::calculate_num_threads
size_t num_threads = thread_utils::calculate_num_threads(other_size);
size_t range_per_thread = other_size / num_threads;
size_t leftovers = other_size - (range_per_thread * num_threads);
parallel_for(num_threads, [&](size_t j) {
size_t offset = j * range_per_thread;
size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
for (size_t i = offset; i < end; ++i) {
coefficients_.get()[i] += scaling_factor * other[i];
}
});
}

/**
 * @brief Adds `other` to this polynomial in place, coefficient by coefficient.
 * @details The index range [0, other.size()) is split into one contiguous chunk per thread and the
 * chunks are processed through `parallel_for`. Each coefficient is updated exactly once.
 *
 * @param other coefficients to add; `in_place_operation_viable(other.size())` is asserted
 * @return reference to this polynomial
 */
template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator+=(std::span<const Fr> other)
{
    const size_t other_size = other.size();
    ASSERT(in_place_operation_viable(other_size));

    // Thread count is chosen by thread_utils::calculate_num_threads (always at least 1).
    const size_t num_threads = thread_utils::calculate_num_threads(other_size);
    const size_t range_per_thread = other_size / num_threads;
    parallel_for(num_threads, [&](size_t j) {
        const size_t offset = j * range_per_thread;
        // The last chunk also takes the remainder left over by the integer division above.
        const size_t end = (j == num_threads - 1) ? other_size : offset + range_per_thread;
        for (size_t i = offset; i < end; ++i) {
            coefficients_.get()[i] += other[i];
        }
    });

    return *this;
}
Expand All @@ -334,23 +345,35 @@ template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator-=(std::span<cons
const size_t other_size = other.size();
ASSERT(in_place_operation_viable(other_size));

/** TODO parallelize using some kind of generic evaluation domain
* we really only need to know the thread size, but we don't need all the FFT roots
*/
for (size_t i = 0; i < other_size; ++i) {
coefficients_.get()[i] -= other[i];
}
size_t num_threads = thread_utils::calculate_num_threads(other_size);
size_t range_per_thread = other_size / num_threads;
size_t leftovers = other_size - (range_per_thread * num_threads);
parallel_for(num_threads, [&](size_t j) {
size_t offset = j * range_per_thread;
size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
for (size_t i = offset; i < end; ++i) {
coefficients_.get()[i] -= other[i];
}
});

return *this;
}

/**
 * @brief Multiplies every coefficient of this polynomial by `scaling_factor` in place.
 * @details The index range [0, size_) is split into one contiguous chunk per thread and the chunks
 * are processed through `parallel_for`. Each coefficient is scaled exactly once.
 *
 * @param scaling_factor value every coefficient is multiplied by
 * @return reference to this polynomial
 */
template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator*=(const Fr scaling_factor)
{
    ASSERT(in_place_operation_viable());

    // Thread count is chosen by thread_utils::calculate_num_threads (always at least 1).
    const size_t num_threads = thread_utils::calculate_num_threads(size_);
    const size_t range_per_thread = size_ / num_threads;
    parallel_for(num_threads, [&](size_t j) {
        const size_t offset = j * range_per_thread;
        // The last chunk also takes the remainder left over by the integer division above.
        const size_t end = (j == num_threads - 1) ? size_ : offset + range_per_thread;
        for (size_t i = offset; i < end; ++i) {
            coefficients_.get()[i] *= scaling_factor;
        }
    });

    return *this;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ template <typename Fr> class Polynomial {
*
* @param scaling_factor s
*/
Polynomial& operator*=(const Fr scaling_facor);
Polynomial& operator*=(const Fr scaling_factor);

/**
* @brief evaluates p(X) = ∑ᵢ aᵢ⋅Xⁱ considered as multi-linear extension p(X₀,…,Xₘ₋₁) = ∑ᵢ aᵢ⋅Lᵢ(X₀,…,Xₘ₋₁)
Expand Down