AztecProtocol · codygunton · Sep 23, 2023 · Sep 14, 2023 · Sep 15, 2023 · Sep 15, 2023
diff --git a/barretenberg/cpp/src/barretenberg/common/thread_utils.cpp b/barretenberg/cpp/src/barretenberg/common/thread_utils.cpp
@@ -0,0 +1,40 @@
+#include "thread_utils.hpp"
+
+namespace barretenberg::thread_utils {
+/**
+ * @brief calculates number of threads to create based on minimum iterations per thread
+ * @details Finds the number of cpus with get_num_cpus(), and calculates `desired_num_threads`
+ * Returns the min of `desired_num_threads` and `max_num_threads`, clamped to be at least 1.
+ * Note that it will not calculate a power of 2 necessarily, use `calculate_num_threads_pow2` instead
+ *
+ * @param num_iterations total number of loop iterations to split across threads
+ * @param min_iterations_per_thread fewest iterations worth a thread of their own; 0 is treated as 1
+ * @return size_t number of threads to use, always >= 1
+ */
+size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread)
+{
+    size_t max_num_threads = get_num_cpus(); // number of available threads
+    size_t min_iterations = min_iterations_per_thread > 0 ? min_iterations_per_thread : 1; // avoid dividing by 0
+    size_t desired_num_threads = num_iterations / min_iterations;
+    size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
+    return num_threads > 0 ? num_threads : 1;                            // ensure num_threads is at least 1
+}
+
+/**
+ * @brief calculates number of threads to create based on minimum iterations per thread, guaranteed power of 2
+ * @details Same as `calculate_num_threads` but rounds `desired_num_threads` down to a power of 2 via get_msb
+ * @param num_iterations total number of loop iterations to split across threads
+ * @param min_iterations_per_thread fewest iterations worth a thread of their own; 0 is treated as 1
+ * @return size_t number of threads to use, always >= 1
+ */
+size_t calculate_num_threads_pow2(size_t num_iterations, size_t min_iterations_per_thread)
+{
+    size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2)
+    size_t min_iterations = min_iterations_per_thread > 0 ? min_iterations_per_thread : 1; // avoid dividing by 0
+    size_t desired_num_threads = num_iterations / min_iterations;
+    desired_num_threads = static_cast<size_t>(1ULL << numeric::get_msb(desired_num_threads));
+    size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
+    return num_threads > 0 ? num_threads : 1;                            // ensure num_threads is at least 1
+}
+
+} // namespace barretenberg::thread_utils
diff --git a/barretenberg/cpp/src/barretenberg/common/thread_utils.hpp b/barretenberg/cpp/src/barretenberg/common/thread_utils.hpp
@@ -0,0 +1,30 @@
+#pragma once
+#include "thread.hpp"
+
+namespace barretenberg::thread_utils {
+
+constexpr size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4;
+
+/**
+ * @brief calculates number of threads to create based on minimum iterations per thread
+ * @details Finds the number of cpus with get_num_cpus(), and calculates `desired_num_threads`
+ * Returns the min of `desired_num_threads` and `max_num_threads`, clamped to be at least 1.
+ * Note that it will not calculate a power of 2 necessarily, use `calculate_num_threads_pow2` instead
+ *
+ * @param num_iterations total number of loop iterations to split across threads
+ * @param min_iterations_per_thread fewest iterations worth a thread of their own (expected to be non-zero)
+ * @return size_t number of threads to use, always >= 1
+ */
+size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);
+
+/**
+ * @brief calculates number of threads to create based on minimum iterations per thread, guaranteed power of 2
+ * @details Same functionality as `calculate_num_threads` but guaranteed power of 2
+ * @param num_iterations total number of loop iterations to split across threads
+ * @param min_iterations_per_thread fewest iterations worth a thread of their own (expected to be non-zero)
+ * @return size_t number of threads to use, always >= 1
+ */
+size_t calculate_num_threads_pow2(size_t num_iterations,
+                                  size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);
+
+} // namespace barretenberg::thread_utils
diff --git a/barretenberg/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp b/barretenberg/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include "barretenberg/common/log.hpp"
 #include "barretenberg/common/thread.hpp"
+#include "barretenberg/common/thread_utils.hpp"
 #include "barretenberg/polynomials/barycentric.hpp"
 #include "barretenberg/polynomials/pow.hpp"
 #include "barretenberg/proof_system/flavor/flavor.hpp"
@@ -140,12 +141,10 @@ template <typename Flavor> class SumcheckProverRound {
         // Note: Multithreading is "on" for every round but we reduce the number of threads from the max available based
         // on a specified minimum number of iterations per thread. This eventually leads to the use of a single thread.
         // For now we use a power of 2 number of threads simply to ensure the round size is evenly divided.
-        size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2)
         size_t min_iterations_per_thread = 1 << 6; // min number of iterations for which we'll spin up a unique thread
-        size_t desired_num_threads = round_size / min_iterations_per_thread;
-        size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
-        num_threads = num_threads > 0 ? num_threads : 1;                     // ensure num threads is >= 1
-        size_t iterations_per_thread = round_size / num_threads;             // actual iterations per thread
+        size_t num_threads =
+            barretenberg::thread_utils::calculate_num_threads_pow2(round_size, min_iterations_per_thread);
+        size_t iterations_per_thread = round_size / num_threads; // actual iterations per thread
 
         // Constuct univariate accumulator containers; one per thread
         std::vector<RelationUnivariates> thread_univariate_accumulators(num_threads);

diff --git a/barretenberg/cpp/src/barretenberg/polynomials/polynomial.cpp b/barretenberg/cpp/src/barretenberg/polynomials/polynomial.cpp
@@ -1,6 +1,8 @@
 #include "polynomial.hpp"
 #include "barretenberg/common/assert.hpp"
 #include "barretenberg/common/slab_allocator.hpp"
+#include "barretenberg/common/thread.hpp"
+#include "barretenberg/common/thread_utils.hpp"
 #include "polynomial_arithmetic.hpp"
 #include <cstddef>
 #include <fcntl.h>
@@ -306,25 +308,34 @@ template <typename Fr> void Polynomial<Fr>::add_scaled(std::span<const Fr> other
     const size_t other_size = other.size();
     ASSERT(in_place_operation_viable(other_size));
 
-    /** TODO parallelize using some kind of generic evaluation domain
-     *  we really only need to know the thread size, but we don't need all the FFT roots
-     */
-    for (size_t i = 0; i < other_size; ++i) {
-        coefficients_.get()[i] += scaling_factor * other[i];
-    }
+    // Calculates number of threads with thread_utils::calculate_num_threads
+    size_t num_threads = thread_utils::calculate_num_threads(other_size);
+    size_t range_per_thread = other_size / num_threads;
+    size_t leftovers = other_size - (range_per_thread * num_threads);
+    parallel_for(num_threads, [&](size_t j) {
+        size_t offset = j * range_per_thread;
+        size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
+        for (size_t i = offset; i < end; ++i) {
+            coefficients_.get()[i] += scaling_factor * other[i];
+        }
+    });
 }
 
 template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator+=(std::span<const Fr> other)
 {
     const size_t other_size = other.size();
     ASSERT(in_place_operation_viable(other_size));
 
-    /** TODO parallelize using some kind of generic evaluation domain
-     *  we really only need to know the thread size, but we don't need all the FFT roots
-     */
-    for (size_t i = 0; i < other_size; ++i) {
-        coefficients_.get()[i] += other[i];
-    }
+    size_t num_threads = thread_utils::calculate_num_threads(other_size);
+    size_t range_per_thread = other_size / num_threads;
+    size_t leftovers = other_size - (range_per_thread * num_threads); // remainder; added to the last thread's range
+    parallel_for(num_threads, [&](size_t j) {
+        size_t offset = j * range_per_thread; // first index of the slice handled by thread j
+        size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
+        for (size_t i = offset; i < end; ++i) {
+            coefficients_.get()[i] += other[i];
+        }
+    });
 
     return *this;
 }
@@ -334,23 +345,35 @@ template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator-=(std::span<cons
     const size_t other_size = other.size();
     ASSERT(in_place_operation_viable(other_size));
 
-    /** TODO parallelize using some kind of generic evaluation domain
-     *  we really only need to know the thread size, but we don't need all the FFT roots
-     */
-    for (size_t i = 0; i < other_size; ++i) {
-        coefficients_.get()[i] -= other[i];
-    }
+    size_t num_threads = thread_utils::calculate_num_threads(other_size);
+    size_t range_per_thread = other_size / num_threads;
+    size_t leftovers = other_size - (range_per_thread * num_threads);
+    parallel_for(num_threads, [&](size_t j) {
+        size_t offset = j * range_per_thread;
+        size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
+        for (size_t i = offset; i < end; ++i) {
+            coefficients_.get()[i] -= other[i];
+        }
+    });
 
     return *this;
 }
 
-template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator*=(const Fr scaling_facor)
+template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator*=(const Fr scaling_factor)
 {
     ASSERT(in_place_operation_viable());
 
-    for (size_t i = 0; i < size_; ++i) {
-        coefficients_.get()[i] *= scaling_facor;
-    }
+    size_t num_threads = thread_utils::calculate_num_threads(size_);
+    size_t range_per_thread = size_ / num_threads;
+    size_t leftovers = size_ - (range_per_thread * num_threads); // remainder; added to the last thread's range
+    parallel_for(num_threads, [&](size_t j) {
+        size_t offset = j * range_per_thread; // first index of the slice handled by thread j
+        size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
+        for (size_t i = offset; i < end; ++i) {
+            coefficients_.get()[i] *= scaling_factor;
+        }
+    });
+
     return *this;
 }
 

diff --git a/barretenberg/cpp/src/barretenberg/polynomials/polynomial.hpp b/barretenberg/cpp/src/barretenberg/polynomials/polynomial.hpp
@@ -188,7 +188,7 @@ template <typename Fr> class Polynomial {
      *
      * @param scaling_factor s
      */
-    Polynomial& operator*=(const Fr scaling_facor);
+    Polynomial& operator*=(const Fr scaling_factor);
 
     /**
      * @brief evaluates p(X) = ∑ᵢ aᵢ⋅Xⁱ considered as multi-linear extension p(X₀,…,Xₘ₋₁) = ∑ᵢ aᵢ⋅Lᵢ(X₀,…,Xₘ₋₁)