From 9114ecb26978a6e29573e5a108b7c708d677749a Mon Sep 17 00:00:00 2001 From: ImmanuelSegol <3ditds@gmail.com> Date: Tue, 3 Oct 2023 15:22:28 +0300 Subject: [PATCH 1/8] fix memory error in single_stage_multi_reduction_kernel (#235) * refactor * refactor * revert * refactor: clang format * Update icicle/appUtils/msm/msm.cu --- icicle/appUtils/msm/msm.cu | 40 +++++++++++++++++---------- icicle/appUtils/msm/tests/msm_test.cu | 2 +- icicle/utils/error_handler.cuh | 32 +++++++++++++++++++++ 3 files changed, 59 insertions(+), 15 deletions(-) create mode 100644 icicle/utils/error_handler.cuh diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index f09b9fe85..c58381cc0 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -5,6 +5,7 @@ #include "../../primitives/field.cuh" #include "../../primitives/projective.cuh" #include "../../utils/cuda_utils.cuh" +#include "../../utils/error_handler.cuh" #include "msm.cuh" #include #include @@ -23,20 +24,29 @@ template __global__ void single_stage_multi_reduction_kernel( - P* v, P* v_r, unsigned block_size, unsigned write_stride, unsigned write_phase, unsigned padding) + P* v, + P* v_r, + unsigned block_size, + unsigned write_stride, + unsigned write_phase, + unsigned padding, + unsigned num_of_threads) { int tid = blockIdx.x * blockDim.x + threadIdx.x; - int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid; + if (tid >= num_of_threads) { return; } + int jump = block_size / 2; + int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid; int block_id = tid_p / jump; int block_tid = tid_p % jump; unsigned read_ind = block_size * block_id + block_tid; unsigned write_ind = tid; - v_r - [write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride - : write_ind] = - padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero() - : v[read_ind] + v[read_ind + jump]; + unsigned v_r_key = + write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride : write_ind; + P v_r_value = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero() + : v[read_ind] + v[read_ind + jump]; + + v_r[v_r_key] = v_r_value; } // this kernel performs single scalar multiplication @@ -388,7 +398,7 @@ void bucket_method_msm( NUM_THREADS = min(MAX_TH, s); NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<>>( - ones_results, ones_results, s * 2, 0, 0, 0); + ones_results, ones_results, s * 2, 0, 0, 0, s); } unsigned* bucket_indices; @@ -554,7 +564,9 @@ void bucket_method_msm( NUM_THREADS = min(MAX_TH, s); NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<>>( - large_buckets, large_buckets, s * 2, 0, 0, 0); + large_buckets, large_buckets, s * 2, 0, 0, 0, s); + + CHECK_LAST_CUDA_ERROR(); } // distribute @@ -631,18 +643,18 @@ void bucket_method_msm( if (source_bits_count > 0) { for (unsigned j = 0; j < target_bits_count; j++) { unsigned last_j = target_bits_count - 1; - NUM_THREADS = min(MAX_TH, (source_buckets_count >> (1 + j))); - NUM_BLOCKS = ((source_buckets_count >> (1 + j)) + NUM_THREADS - 1) / NUM_THREADS; + unsigned nof_threads = (source_buckets_count >> (1 + j)); + NUM_THREADS = min(MAX_TH, nof_threads); + NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<>>( j == 0 ? source_buckets : temp_buckets1, j == target_bits_count - 1 ? 
target_buckets : temp_buckets1, - 1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0); + 1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0, nof_threads); - unsigned nof_threads = (source_buckets_count >> (1 + j)); NUM_THREADS = min(MAX_TH, nof_threads); NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<>>( j == 0 ? source_buckets : temp_buckets2, j == target_bits_count - 1 ? target_buckets : temp_buckets2, - 1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0); + 1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0, nof_threads); } } if (target_bits_count == 1) { diff --git a/icicle/appUtils/msm/tests/msm_test.cu b/icicle/appUtils/msm/tests/msm_test.cu index 962c88bbb..b69680f38 100644 --- a/icicle/appUtils/msm/tests/msm_test.cu +++ b/icicle/appUtils/msm/tests/msm_test.cu @@ -224,4 +224,4 @@ int main() // std::cout< + +#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__) +template +void check(T err, const char* const func, const char* const file, const int line) +{ + if (err != cudaSuccess) { + std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(err) << " " << func << std::endl; + } +} + +#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__) +void checkLast(const char* const file, const int line) +{ + cudaError_t err{cudaGetLastError()}; + if (err != cudaSuccess) { + std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(err) << std::endl; + } +} + +#define CHECK_SYNC_DEVICE_ERROR() syncDevice(__FILE__, __LINE__) +void syncDevice(const char* const file, const int line) +{ + cudaError_t err{cudaDeviceSynchronize()}; + if (err != cudaSuccess) { + std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(err) << std::endl; + } +} From 33d15830a3e784102e57e31e38d385f51485f7b8 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Tue, 3 Oct 2023 23:06:11 +0300 Subject: [PATCH 2/8] Added separate device context struct, returned lde --- icicle/CMakeLists.txt | 10 +- icicle/Doxyfile | 4 +- icicle/appUtils/lde/lde.cu | 186 ++++++++++++++++++ icicle/appUtils/lde/lde.cuh | 57 ++++++ icicle/appUtils/msm/msm.cu | 74 +------ icicle/appUtils/msm/msm.cuh | 14 +- icicle/appUtils/ntt/ntt.cu | 140 ++++++------- icicle/appUtils/ntt/ntt.cuh | 19 +- .../vector_manipulation/ve_mod_mult.cuh | 135 ------------- icicle/utils/device_context.cuh | 20 ++ 10 files changed, 354 insertions(+), 305 deletions(-) create mode 100644 icicle/appUtils/lde/lde.cu create mode 100644 icicle/appUtils/lde/lde.cuh delete mode 100644 icicle/appUtils/vector_manipulation/ve_mod_mult.cuh create mode 100644 icicle/utils/device_context.cuh diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 18900cb15..72e8c14a7 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -50,14 +50,12 @@ if (NOT BUILD_TESTS) add_custom_command( TARGET icicle POST_BUILD - COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym msm_cuda=${CURVE}_msm_cuda --redefine-sym msm_internal_cuda=${CURVE}_msm_internal_cuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o - ) - add_custom_command( - TARGET icicle - POST_BUILD - COMMAND ${CMAKE_AR} ARGS -rcs ${LIBRARY_OUTPUT_DIRECTORY}/libingo_${CURVE}.a 
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o + COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym msm_cuda=${CURVE}_msm_cuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o + COMMAND ${CMAKE_OBJCOPY} ARGS --prefix-symbols=${CURVE}_ ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o + COMMAND ${CMAKE_AR} ARGS -rcs ${LIBRARY_OUTPUT_DIRECTORY}/libingo_${CURVE}.a ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o ) file(REMOVE ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o) + file(REMOVE ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o) else() diff --git a/icicle/Doxyfile b/icicle/Doxyfile index 9ba24700e..1d7ba732e 100644 --- a/icicle/Doxyfile +++ b/icicle/Doxyfile @@ -831,7 +831,9 @@ WARN_LOGFILE = # Note: If this tag is empty the current directory is searched. INPUT = appUtils/msm \ - appUtils/ntt + appUtils/ntt \ + appUtils/lde \ + utils/device_context.cuh # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu new file mode 100644 index 000000000..89a9a12b4 --- /dev/null +++ b/icicle/appUtils/lde/lde.cu @@ -0,0 +1,186 @@ +#include +#include +#include "../../utils/device_context.cuh" +#include "../../curves/curve_config.cuh" + +namespace lde { + +namespace { + +#define MAX_THREADS_PER_BLOCK 256 + +template +__global__ void mul_kernel(S* scalar_vec, E* element_vec, size_t n, E* result) +{ + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < n) { result[tid] = scalar_vec[tid] * element_vec[tid]; } +} + +template +__global__ void add_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* result) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { result[tid] = element_vec1[tid] + element_vec2[tid]; } +} + +template +__global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* result) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; } +} + +} // namespace + +template +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +{ + // Set the grid and block dimensions + int num_threads = MAX_THREADS_PER_BLOCK; + int num_blocks = (n + num_threads - 1) / num_threads; + + S* d_vec_a; + E *d_vec_b, *d_result; + if (on_device) { + // Allocate memory on the device for the input vectors and the output vector + cudaMallocAsync(&d_vec_a, n * sizeof(S), ctx.stream); + cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); + cudaMallocAsync(&d_result, n * sizeof(E), ctx.stream); + + // Copy the input vectors and the modulus from the host to the device + cudaMemcpyAsync(d_vec_a, vec_a, n * sizeof(S), cudaMemcpyHostToDevice, ctx.stream); + cudaMemcpyAsync(d_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, ctx.stream); + } + + // Call the kernel to perform element-wise modular multiplication + multiply_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? 
d_result : result, n); + + if (on_device) { + cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); + + cudaFreeAsync(d_vec_a, ctx.stream); + cudaFreeAsync(d_vec_b, ctx.stream); + cudaFreeAsync(d_result, ctx.stream); + } + + cudaStreamSynchronize(ctx.stream); + return cudaSuccess; +} + +template +cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +{ + // Set the grid and block dimensions + int num_threads = MAX_THREADS_PER_BLOCK; + int num_blocks = (n + num_threads - 1) / num_threads; + + E *d_vec_a, *d_vec_b, *d_result; + if (on_device) { + // Allocate memory on the device for the input vectors and the output vector + cudaMallocAsync(&d_vec_a, n * sizeof(E), ctx.stream); + cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); + cudaMallocAsync(&d_result, n * sizeof(E), ctx.stream); + + // Copy the input vectors from the host to the device + cudaMemcpyAsync(d_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, ctx.stream); + cudaMemcpyAsync(d_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, ctx.stream); + } + + // Call the kernel to perform element-wise addition + add_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? d_result : result, n); + + if (on_device) { + cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); + + cudaFreeAsync(d_vec_a, ctx.stream); + cudaFreeAsync(d_vec_b, ctx.stream); + cudaFreeAsync(d_result, ctx.stream); + } + + cudaStreamSynchronize(ctx.stream); + return cudaSuccess; +} + +template +cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +{ + // Set the grid and block dimensions + int num_threads = MAX_THREADS_PER_BLOCK; + int num_blocks = (n + num_threads - 1) / num_threads; + + E *d_vec_a, *d_vec_b, *d_result; + if (on_device) { + // Allocate memory on the device for the input vectors and the output vector + cudaMallocAsync(&d_vec_a, n * sizeof(E), ctx.stream); + cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); + cudaMallocAsync(&d_result, n * sizeof(E), ctx.stream); + + // Copy the input vectors from the host to the device + cudaMemcpyAsync(d_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, ctx.stream); + cudaMemcpyAsync(d_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, ctx.stream); + } + + // Call the kernel to perform element-wise subtraction + sub_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? d_result : result, n); + + if (on_device) { + cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); + + cudaFreeAsync(d_vec_a, ctx.stream); + cudaFreeAsync(d_vec_b, ctx.stream); + cudaFreeAsync(d_result, ctx.stream); + } + + cudaStreamSynchronize(ctx.stream); + return cudaSuccess; +} + +/** + * Extern version of [mul](@ref mul) function with the template parameters + * `S` and `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
+ */ +extern "C" cudaError_t mul_cuda( + curve_config::scalar_t* vec_a, + curve_config::scalar_t* vec_b, + uint32_t n, + bool on_device, + device_context::DeviceContext ctx, + curve_config::scalar_t* result +) { + return mul(vec_a, vec_b, n, on_device, ctx, result); +} + +/** + * Extern version of [add](@ref add) function with the template parameter + * `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + */ +extern "C" cudaError_t add_cuda( + curve_config::scalar_t* vec_a, + curve_config::scalar_t* vec_b, + uint32_t n, + bool on_device, + device_context::DeviceContext ctx, + curve_config::scalar_t* result +) { + return add(vec_a, vec_b, n, on_device, ctx, result); +} + +/** + * Extern version of [sub](@ref sub) function with the template parameter + * `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + */ +extern "C" cudaError_t sub_cuda( + curve_config::scalar_t* vec_a, + curve_config::scalar_t* vec_b, + uint32_t n, + bool on_device, + device_context::DeviceContext ctx, + curve_config::scalar_t* result +) { + return sub(vec_a, vec_b, n, on_device, ctx, result); +} + +} // namespace lde diff --git a/icicle/appUtils/lde/lde.cuh b/icicle/appUtils/lde/lde.cuh new file mode 100644 index 000000000..90df95c4b --- /dev/null +++ b/icicle/appUtils/lde/lde.cuh @@ -0,0 +1,57 @@ +#pragma once +#ifndef LDE_H +#define LDE_H + +/** + * @namespace lde + * LDE (stands for low degree extension) contains [NTT](@ref ntt)-based methods for translating between coefficient and evaluation domains of polynomials. + * It also contains methods for element-wise manipulation of vectors, which is useful for working with polynomials in evaluation domain. + */ +namespace lde { + +/** + * A function that multiplies two vectors element-wise. + * @param vec_a First input vector. + * @param vec_b Second input vector. + * @param n Size of vectors `vec_a` and `vec_b`. + * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. + * @param result Resulting vector - element-wise product of `vec_a` and `vec_b`, can be the same pointer as `vec_b`. + * @tparam S The type of scalars `vec_a`. + * @tparam E The type of elements `vec_b` and `result`. Often (but not always), `E=S`. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + */ +template +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); + +/** + * A function that adds two vectors element-wise. + * @param vec_a First input vector. + * @param vec_b Second input vector. + * @param n Size of vectors `vec_a` and `vec_b`. + * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. + * @param result Resulting vector - element-wise sum of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. + * @tparam E The type of elements `vec_a`, `vec_b` and `result`. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
+ */ +template +cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); + +/** + * A function that subtracts two vectors element-wise. + * @param vec_a First input vector. + * @param vec_b Second input vector. + * @param n Size of vectors `vec_a` and `vec_b`. + * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. + * @param result Resulting vector - element-wise difference of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. + * @tparam E The type of elements `vec_a`, `vec_b` and `result`. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + */ +template +cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); + +} // namespace lde + +#endif diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 885a83b6b..2089abddb 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -885,57 +885,17 @@ void batched_bucket_method_msm( } // namespace template -cudaError_t msm_internal(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results) +cudaError_t msm(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results) { // TODO: DmytroTym/HadarIngonyama - unify the implementation of the bucket method and the batched bucket method in one function // TODO: DmytroTym/HadarIngonyama - parameters to be included into the implementation: on deviceness of points, scalars and results, precompute factor, points size and device id if (config.batch_size == 1) - bucket_method_msm(config.bitsize, config.c, scalars, points, msm_size, results, config.are_scalars_on_device, config.big_triangle, config.large_bucket_factor, config.stream); + bucket_method_msm(config.bitsize, config.c, scalars, points, msm_size, results, config.are_scalars_on_device, config.big_triangle, config.large_bucket_factor, config.ctx.stream); else - batched_bucket_method_msm(config.bitsize, config.c, scalars, points, config.batch_size, msm_size, results, config.are_scalars_on_device, config.stream); + batched_bucket_method_msm(config.bitsize, config.c, scalars, points, config.batch_size, msm_size, results, config.are_scalars_on_device, config.ctx.stream); return cudaSuccess; } -template -cudaError_t msm(S* scalars, A* points, unsigned size, P* result) -{ - MSMConfig config = { - false, // are_scalars_on_device - true, // are_scalars_montgomery_form - size, // points_size - 1, // precompute_factor - false, // are_points_on_device - true, // are_points_montgomery_form - 1, // batch_size - false, // are_result_on_device - 16, // c - S::NBITS, // bitsize - false, // big_triangle - 10, // large_bucket_factor - 0, // device_id - 0 // stream - }; - return msm_internal(scalars, points, size, config, result); -} - -/** - * Extern version of [msm_internal](@ref msm_internal) function with the following values of template parameters - * (where the curve is given by `-DCURVE` env variable during build): - * - `S` is the [scalar field](@ref scalar_t) of the curve; - * - `A` is the [affine representation](@ref affine_t) of curve points; - * - `P` is the [projective representation](@ref projective_t) of curve points. - * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
- */ -extern "C" cudaError_t msm_internal_cuda( - curve_config::scalar_t* scalars, - curve_config::affine_t* points, - size_t msm_size, - MSMConfig config, - curve_config::projective_t* out) -{ - return msm_internal(scalars, points, msm_size, config, out); -} - /** * Extern version of [msm](@ref msm) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): @@ -947,32 +907,15 @@ extern "C" cudaError_t msm_internal_cuda( extern "C" cudaError_t msm_cuda( curve_config::scalar_t* scalars, curve_config::affine_t* points, - size_t size, + size_t msm_size, + MSMConfig config, curve_config::projective_t* out) { - return msm(scalars, points, size, out); + return msm(scalars, points, msm_size, config, out); } #if defined(G2_DEFINED) -/** - * Extern version of [msm_internal](@ref msm_internal) function with the following values of template parameters - * (where the curve is given by `-DCURVE` env variable during build): - * - `S` is the [scalar field](@ref scalar_t) of the curve; - * - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points; - * - `P` is the [projective representation](@ref g2_projective_t) of G2 curve points. - * @return `cudaSuccess` if the execution was successful and an error code otherwise. - */ -extern "C" cudaError_t g2_msm_internal_cuda( - curve_config::scalar_t* scalars, - curve_config::g2_affine_t* points, - size_t msm_size, - MSMConfig config, - curve_config::g2_projective_t* out) -{ - return msm_internal(scalars, points, msm_size, config, out); -} - /** * Extern version of [msm](@ref msm) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): @@ -984,10 +927,11 @@ extern "C" cudaError_t g2_msm_internal_cuda( extern "C" cudaError_t g2_msm_cuda( curve_config::scalar_t* scalars, curve_config::g2_affine_t* points, - size_t size, + size_t msm_size, + MSMConfig config, curve_config::g2_projective_t* out) { - return msm(scalars, points, size, out); + return msm(scalars, points, msm_size, config, out); } #endif diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index ae3adf303..6dba1bcce 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -3,6 +3,7 @@ #define MSM_H #include "cuda_runtime_api.h" +#include "../../utils/device_context.cuh" /** * @namespace msm @@ -24,7 +25,7 @@ namespace msm { /** * @struct MSMConfig - * Struct that encodes MSM parameters to be passed into the [msm_internal](@ref msm_internal) function. + * Struct that encodes MSM parameters to be passed into the [msm](@ref msm) function. */ struct MSMConfig { bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value: false. */ @@ -50,8 +51,7 @@ struct MSMConfig { unsigned large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur very frequently. * Useful for efficient treatment of non-uniform distributions of scalars and "top windows" with few bits. * Can be set to 0 to disable separate treatment of large buckets altogether. Default value: 10. */ - unsigned device_id; /**< Index of the GPU to run the MSM on. Default value: 0. */ - cudaStream_t stream; /**< Stream to use. Default value: 0. */ + device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). 
*/ }; /** @@ -68,13 +68,7 @@ struct MSMConfig { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t msm_internal(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results); - -/** - * A function that computes MSM by calling [msm_internal](@ref msm_internal) function with default [MSMConfig](@ref MSMConfig) values. - */ -template -cudaError_t msm(S* scalars, A* points, unsigned size, P* result); +cudaError_t msm(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results); } // namespace msm diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu index 7ac2b62c3..b2be1e398 100644 --- a/icicle/appUtils/ntt/ntt.cu +++ b/icicle/appUtils/ntt/ntt.cu @@ -1,5 +1,4 @@ #include "../../utils/sharedmem.cuh" -#include "../vector_manipulation/ve_mod_mult.cuh" #include "../../curves/curve_config.cuh" #include "ntt.cuh" @@ -31,8 +30,31 @@ __global__ void twiddle_factors_kernel(S* d_twiddles, uint32_t n_twiddles, S ome } } -template -__global__ void reverse_order_kernel(T* arr, T* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size) +/** + * Multiply the elements of an input array by a scalar in-place. Used for normalization in iNTT. + * @param arr input array. + * @param n size of arr. + * @param n_inv scalar of type S (scalar). + */ +template +__global__ void template_normalize_kernel(E* arr, uint32_t n, S scalar) +{ + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < n) { arr[tid] = scalar * arr[tid]; } +} + +template +__global__ void batchVectorMult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size) +{ + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < n_scalars * batch_size) { + int scalar_id = tid % n_scalars; + element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid]; + } +} + +template +__global__ void reverse_order_kernel(E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size) { int threadId = (blockIdx.x * blockDim.x) + threadIdx.x; if (threadId < n * batch_size) { @@ -52,15 +74,15 @@ __global__ void reverse_order_kernel(T* arr, T* arr_reversed, uint32_t n, uint32 * @param logn log(n). * @param batch_size the size of the batch. */ -template -void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream) +template +void reverse_order_batch(E* arr, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream) { - T* arr_reversed; - cudaMallocAsync(&arr_reversed, n * batch_size * sizeof(T), stream); + E* arr_reversed; + cudaMallocAsync(&arr_reversed, n * batch_size * sizeof(E), stream); int number_of_threads = MAX_THREADS_BATCH; int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads; reverse_order_kernel<<>>(arr, arr_reversed, n, logn, batch_size); - cudaMemcpyAsync(arr, arr_reversed, n * batch_size * sizeof(T), cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync(arr, arr_reversed, n * batch_size * sizeof(E), cudaMemcpyDeviceToDevice, stream); cudaFreeAsync(arr_reversed, stream); } @@ -72,8 +94,8 @@ void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size, * @param n length of `arr`. * @param logn log(n). 
*/ -template -void reverse_order(T* arr, uint32_t n, uint32_t logn, cudaStream_t stream) +template +void reverse_order(E* arr, uint32_t n, uint32_t logn, cudaStream_t stream) { reverse_order_batch(arr, n, logn, 1, stream); } @@ -311,14 +333,14 @@ void ntt_inplace_batch_template( <<>>(d_inout, n, d_twiddles, n, total_tasks, s, false); } - if (is_coset) batch_vector_mult(coset, d_inout, n, batch_size, stream); + if (is_coset) batchVectorMult<<>>(coset, d_inout, n, batch_size); num_threads = max(min(n / 2, MAX_NUM_THREADS), 1); num_blocks = (n * batch_size + num_threads - 1) / num_threads; template_normalize_kernel <<>>(d_inout, n * batch_size, S::inv_log_size(logn)); } else { - if (is_coset) batch_vector_mult(coset, d_inout, n, batch_size, stream); + if (is_coset) batchVectorMult<<>>(coset, d_inout, n, batch_size); for (int s = logn - 1; s >= logn_shmem; s--) // TODO: this loop also can be unrolled { @@ -338,32 +360,33 @@ void ntt_inplace_batch_template( } // namespace template -cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, unsigned device_id, cudaStream_t stream) +cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, device_context::DeviceContext ctx) { - twiddle_factors_kernel<<<1, 1, 0, stream>>>(d_twiddles, n_twiddles, omega); - cudaStreamSynchronize(stream); + twiddle_factors_kernel<<<1, 1, 0, ctx.stream>>>(d_twiddles, n_twiddles, omega); + cudaStreamSynchronize(ctx.stream); return cudaSuccess; } template -cudaError_t ntt_internal(E* input, unsigned size, bool is_inverse, NTTConfig config) +cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) { uint32_t logn = uint32_t(log(size) / log(2)); uint32_t n_twiddles = size; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order. size_t input_size = size * config.batch_size * sizeof(E); bool on_device = config.are_inputs_on_device; bool generate_twiddles = (config.twiddles == nullptr); + cudaStream_t stream = config.ctx.stream; S* d_twiddles; if (generate_twiddles) { - cudaMallocAsync(&d_twiddles, n_twiddles * sizeof(S), config.stream); - generate_twiddle_factors(d_twiddles, n_twiddles, is_inverse ? S::omega_inv(logn) : S::omega(logn), config.device_id, config.stream); + cudaMallocAsync(&d_twiddles, n_twiddles * sizeof(S), stream); + generate_twiddle_factors(d_twiddles, n_twiddles, is_inverse ? S::omega_inv(logn) : S::omega(logn), config.ctx); } E* d_input; if (on_device) { - cudaMallocAsync(&d_input, input_size, config.stream); - cudaMemcpyAsync(d_input, input, input_size, cudaMemcpyHostToDevice, config.stream); + cudaMallocAsync(&d_input, input_size, stream); + cudaMemcpyAsync(d_input, input, input_size, cudaMemcpyHostToDevice, stream); } bool reverse_input; @@ -388,56 +411,37 @@ cudaError_t ntt_internal(E* input, unsigned size, bool is_inverse, NTTConfig } if (reverse_input) - reverse_order_batch(on_device ? d_input : input, size, logn, config.batch_size, config.stream); + reverse_order_batch(on_device ? d_input : input, size, logn, config.batch_size, stream); ntt_inplace_batch_template(on_device ? d_input : input, generate_twiddles ? d_twiddles : config.twiddles, size, - config.batch_size, is_inverse, config.is_coset, config.coset_gen, config.stream, false); + config.batch_size, is_inverse, config.is_coset, config.coset_gen, stream, false); if (reverse_output) - reverse_order_batch(on_device ? d_input : input, size, logn, config.batch_size, config.stream); + reverse_order_batch(on_device ? 
d_input : input, size, logn, config.batch_size, stream); if (on_device) { - cudaMemcpyAsync(input, d_input, input_size, cudaMemcpyDeviceToHost, config.stream); - cudaFreeAsync(d_input, config.stream); + cudaMemcpyAsync(input, d_input, input_size, cudaMemcpyDeviceToHost, stream); + cudaFreeAsync(d_input, stream); } if (generate_twiddles) { - cudaFreeAsync(d_twiddles, config.stream); + cudaFreeAsync(d_twiddles, stream); } - cudaStreamSynchronize(config.stream); + cudaStreamSynchronize(stream); return cudaSuccess; } -template -cudaError_t ntt(E* input, unsigned size, bool is_inverse) -{ - NTTConfig config = { - false, // are_inputs_on_device - Ordering::kNN, // ordering - Decimation::kDIF, // decimation - Butterfly::kCooleyTukey, // butterfly - false, // is_coset - (S *)nullptr, // coset_gen - (S *)nullptr, // twiddles - 1, // batch_size - 0, // device_id - 0 // stream - }; - return ntt_internal(input, size, is_inverse, config); -} - /** - * Extern version of [ntt_internal](@ref ntt_internal) function with the following values of template parameters - * (where the curve is given by `-DCURVE` env variable during build): - * - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve; + * Extern version of [generate_twiddle_factors](@ref generate_twiddle_factors) function with the template parameter + * `S` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t ntt_internal_cuda( - curve_config::scalar_t* input, - unsigned size, - bool is_inverse, - NTTConfig config) -{ - return ntt_internal(input, size, is_inverse, config); +extern "C" cudaError_t generate_twiddle_factors_cuda( + curve_config::scalar_t* d_twiddles, + uint32_t n_twiddles, + curve_config::scalar_t omega, + device_context::DeviceContext ctx +) { + return generate_twiddle_factors(d_twiddles, n_twiddles, omega, ctx); } /** @@ -449,29 +453,14 @@ extern "C" cudaError_t ntt_internal_cuda( extern "C" cudaError_t ntt_cuda( curve_config::scalar_t* input, unsigned size, - bool is_inverse) -{ - return ntt(input, size, is_inverse); -} - -#if defined(ECNTT_DEFINED) - -/** - * Extern version of [ntt_internal](@ref ntt_internal) function with the following values of template parameters - * (where the curve is given by `-DCURVE` env variable during build): - * - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed); - * - `E` is the [scalar field](@ref scalar_t) of the curve; - * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
- */ -extern "C" cudaError_t ecntt_internal_cuda( - curve_config::projective_t* input, - unsigned size, bool is_inverse, NTTConfig config) { - return ntt_internal(input, size, is_inverse, config); + return ntt(input, size, is_inverse, config); } +#if defined(ECNTT_DEFINED) + /** * Extern version of [ntt](@ref ntt) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): @@ -482,9 +471,10 @@ extern "C" cudaError_t ecntt_internal_cuda( extern "C" cudaError_t ecntt_cuda( curve_config::projective_t* input, unsigned size, - bool is_inverse) + bool is_inverse, + NTTConfig config) { - return ntt(input, size, is_inverse); + return ntt(input, size, is_inverse, config); } #endif diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index 3707b34f9..3059cb572 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -3,6 +3,7 @@ #define NTT_H #include "cuda_runtime_api.h" +#include "../../utils/device_context.cuh" /** * @namespace ntt @@ -22,13 +23,12 @@ namespace ntt { * @param d_twiddles Input empty array on device to which twiddles are to be written. * @param n_twiddles Number of twiddle \f$ n \f$ factors to generate. * @param omega Root of unity \f$ \omega \f$. - * @param device_id ID of the device to use. - * @param stream Stream to use. + * @param ctx Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). * @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, unsigned device_id, cudaStream_t stream); +cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, device_context::DeviceContext ctx); /** * @enum Ordering @@ -58,7 +58,7 @@ enum class Butterfly { kCooleyTukey, kGentlemanSande }; /** * @struct NTTConfig - * Struct that encodes NTT parameters to be passed into the [ntt_internal](@ref ntt_internal) function. + * Struct that encodes NTT parameters to be passed into the [ntt](@ref ntt) function. */ template struct NTTConfig { @@ -83,8 +83,7 @@ struct NTTConfig { * are generated online using the default generator (TODO: link to twiddle gen here) and function * [generate_twiddle_factors](@ref generate_twiddle_factors). Default value: `null`. */ unsigned batch_size; /**< The number of NTTs to compute. Default value: 1. */ - unsigned device_id; /**< Index of the GPU to run the NTT on. Default value: 0. */ - cudaStream_t stream; /**< Stream to use. Default value: 0. */ + device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */ }; /** @@ -99,13 +98,7 @@ struct NTTConfig { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t ntt_internal(E* input, unsigned size, bool is_inverse, NTTConfig config); - -/** - * A function that computes NTT by calling [ntt_internal](@ref ntt_internal) function with default [NTTConfig](@ref NTTConfig) values. 
- */ -template -cudaError_t ntt(E* input, unsigned size, bool is_inverse); +cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config); } // namespace ntt diff --git a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh b/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh deleted file mode 100644 index 36859c49f..000000000 --- a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh +++ /dev/null @@ -1,135 +0,0 @@ -#ifndef VEC_MULT -#define VEC_MULT -#pragma once -#include -#include - -#define MAX_THREADS_PER_BLOCK 256 - -/** - * Multiply the elements of an input array by a scalar in-place. - * @param arr input array. - * @param n size of arr. - * @param n_inv scalar of type S (scalar). - */ -template -__global__ void template_normalize_kernel(E* arr, uint32_t n, S scalar) -{ - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < n) { arr[tid] = scalar * arr[tid]; } -} - -// TODO: headers for prototypes and .c .cpp .cu files for implementations -template -__global__ void vectorModMult(S* scalar_vec, E* element_vec, E* result, size_t n_elments) -{ - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < n_elments) { result[tid] = scalar_vec[tid] * element_vec[tid]; } -} - -template -int vector_mod_mult(S* vec_a, E* vec_b, E* result, size_t n_elments, cudaStream_t stream) // TODO: in place so no need - // for third result vector -{ - // Set the grid and block dimensions - int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK); - int threads_per_block = MAX_THREADS_PER_BLOCK; - - // Allocate memory on the device for the input vectors, the output vector, and the modulus - S* d_vec_a; - E *d_vec_b, *d_result; - cudaMallocAsync(&d_vec_a, n_elments * sizeof(S), stream); - cudaMallocAsync(&d_vec_b, n_elments * sizeof(E), stream); - cudaMallocAsync(&d_result, n_elments * sizeof(E), stream); - - // Copy the input vectors and the modulus from the host to the device - cudaMemcpyAsync(d_vec_a, vec_a, n_elments * sizeof(S), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_vec_b, vec_b, n_elments * sizeof(E), cudaMemcpyHostToDevice, stream); - - // Call the kernel to perform element-wise modular multiplication - vectorModMult<<>>(d_vec_a, d_vec_b, d_result, n_elments); - - cudaMemcpyAsync(result, d_result, n_elments * sizeof(E), cudaMemcpyDeviceToHost, stream); - - cudaFreeAsync(d_vec_a, stream); - cudaFreeAsync(d_vec_b, stream); - cudaFreeAsync(d_result, stream); - - cudaStreamSynchronize(stream); - return 0; -} - -template -int vector_mod_mult_device( - S* d_vec_a, E* d_vec_b, E* d_result, size_t n_elments) // TODO: in place so no need for third result vector -{ - // Set the grid and block dimensions - int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK); - int threads_per_block = MAX_THREADS_PER_BLOCK; - - // Call the kernel to perform element-wise modular multiplication - vectorModMult<<>>(d_vec_a, d_vec_b, d_result, n_elments); - return 0; -} - -template -__global__ void batchVectorMult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size) -{ - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < n_scalars * batch_size) { - int scalar_id = tid % n_scalars; - element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid]; - } -} - -template -int batch_vector_mult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size, cudaStream_t stream) -{ - // Set the grid and block dimensions - int NUM_THREADS = MAX_THREADS_PER_BLOCK; - int NUM_BLOCKS = (n_scalars * batch_size + NUM_THREADS - 1) / NUM_THREADS; 
- batchVectorMult<<>>(scalar_vec, element_vec, n_scalars, batch_size); - return 0; -} - -template -__global__ void matrixVectorMult(E* matrix_elements, E* vector_elements, E* result, size_t dim) -{ - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < dim) { - result[tid] = E::zero(); - for (int i = 0; i < dim; i++) - result[tid] = result[tid] + matrix_elements[tid * dim + i] * vector_elements[i]; - } -} - -template -int matrix_mod_mult(E* matrix_elements, E* vector_elements, E* result, size_t dim, cudaStream_t stream) -{ - // Set the grid and block dimensions - int num_blocks = (int)ceil((float)dim / MAX_THREADS_PER_BLOCK); - int threads_per_block = MAX_THREADS_PER_BLOCK; - - // Allocate memory on the device for the input vectors, the output vector, and the modulus - E *d_matrix, *d_vector, *d_result; - cudaMallocAsync(&d_matrix, (dim * dim) * sizeof(E), stream); - cudaMallocAsync(&d_vector, dim * sizeof(E), stream); - cudaMallocAsync(&d_result, dim * sizeof(E), stream); - - // Copy the input vectors and the modulus from the host to the device - cudaMemcpyAsync(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice, stream); - - // Call the kernel to perform element-wise modular multiplication - matrixVectorMult<<>>(d_matrix, d_vector, d_result, dim); - - cudaMemcpyAsync(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost, stream); - - cudaFreeAsync(d_matrix, stream); - cudaFreeAsync(d_vector, stream); - cudaFreeAsync(d_result, stream); - - cudaStreamSynchronize(stream); - return 0; -} -#endif \ No newline at end of file diff --git a/icicle/utils/device_context.cuh b/icicle/utils/device_context.cuh new file mode 100644 index 000000000..e29d6a1b1 --- /dev/null +++ b/icicle/utils/device_context.cuh @@ -0,0 +1,20 @@ +#pragma once +#ifndef DEVICE_CONTEXT_H +#define DEVICE_CONTEXT_H + +#include + +namespace device_context { + +/** + * Properties of the device used in icicle functions. + */ +struct DeviceContext { + unsigned device_id; /**< Index of the currently used GPU. Default value: 0. */ + cudaStream_t stream; /**< Stream to use. Default value: 0. */ + cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. 
*/ +}; + +} // namespace device_context + +#endif From df1fc7ed6068e8eab9d81a8d6002949524597a19 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Wed, 4 Oct 2023 08:50:13 +0300 Subject: [PATCH 3/8] added lde to cmake --- icicle/CMakeLists.txt | 7 ++++++- icicle/appUtils/lde/lde.cu | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 72e8c14a7..7d1d156ce 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -44,6 +44,7 @@ if (NOT BUILD_TESTS) icicle appUtils/msm/msm.cu appUtils/ntt/ntt.cu + appUtils/lde/lde.cu ) target_compile_options(icicle PRIVATE -c) @@ -52,10 +53,14 @@ if (NOT BUILD_TESTS) POST_BUILD COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym msm_cuda=${CURVE}_msm_cuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o COMMAND ${CMAKE_OBJCOPY} ARGS --prefix-symbols=${CURVE}_ ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o - COMMAND ${CMAKE_AR} ARGS -rcs ${LIBRARY_OUTPUT_DIRECTORY}/libingo_${CURVE}.a ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o + COMMAND ${CMAKE_OBJCOPY} ARGS --prefix-symbols=${CURVE}_ ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/lde/lde.cu.o + COMMAND ${CMAKE_AR} ARGS -rcs ${LIBRARY_OUTPUT_DIRECTORY}/libingo_${CURVE}.a ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o + ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o + ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/lde/lde.cu.o ) file(REMOVE ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o) file(REMOVE ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o) + file(REMOVE ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/lde/lde.cu.o) else() diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu index 89a9a12b4..2f33a3c1e 100644 --- a/icicle/appUtils/lde/lde.cu +++ b/icicle/appUtils/lde/lde.cu @@ -53,7 +53,7 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } // Call the kernel to perform element-wise modular multiplication - multiply_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? d_result : result, n); + mul_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result); if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); @@ -87,7 +87,7 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } // Call the kernel to perform element-wise addition - add_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? d_result : result, n); + add_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result); if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); @@ -121,7 +121,7 @@ cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } // Call the kernel to perform element-wise subtraction - sub_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? d_result : result, n); + sub_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? 
d_result : result); if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); From d8e26e8b502d6b8fb54ecad523e48ef652b239ec Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Wed, 4 Oct 2023 14:34:38 +0300 Subject: [PATCH 4/8] Montgomery param added in lde.cu mul function --- icicle/appUtils/lde/lde.cu | 8 +++----- icicle/appUtils/lde/lde.cuh | 4 +++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu index 2f33a3c1e..554b7665e 100644 --- a/icicle/appUtils/lde/lde.cu +++ b/icicle/appUtils/lde/lde.cu @@ -33,7 +33,7 @@ __global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* resu } // namespace template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -57,7 +57,6 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::De if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); - cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); cudaFreeAsync(d_result, ctx.stream); @@ -91,7 +90,6 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); - cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); cudaFreeAsync(d_result, ctx.stream); @@ -125,7 +123,6 @@ cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); - cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); cudaFreeAsync(d_result, ctx.stream); @@ -145,10 +142,11 @@ extern "C" cudaError_t mul_cuda( curve_config::scalar_t* vec_b, uint32_t n, bool on_device, + bool is_montgomery, device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return mul(vec_a, vec_b, n, on_device, ctx, result); + return mul(vec_a, vec_b, n, on_device, is_montgomery, ctx, result); } /** diff --git a/icicle/appUtils/lde/lde.cuh b/icicle/appUtils/lde/lde.cuh index 90df95c4b..d11c984a2 100644 --- a/icicle/appUtils/lde/lde.cuh +++ b/icicle/appUtils/lde/lde.cuh @@ -15,6 +15,8 @@ namespace lde { * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param is_montgomery If true, inputs are expected to be Montgomery form and results are retured in Montgomery form. + * If false - inputs and outputs are non-Montgomery. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise product of `vec_a` and `vec_b`, can be the same pointer as `vec_b`. * @tparam S The type of scalars `vec_a`. @@ -22,7 +24,7 @@ namespace lde { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result); /** * A function that adds two vectors element-wise. 
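For reference between the two patches, here is a minimal host-side sketch of how the reworked vector API looks as of PATCH 4/8: the `lde::mul` signature, the `is_montgomery` flag and the `DeviceContext` field layout are taken from the patches above, while the include paths, the `scalar_t::one()` initialisation and the assumption that the templated entry point is visible to the calling translation unit are illustrative only. PATCH 5/8 directly below renames the flag to `is_on_device` and fixes the internal device checks (so host inputs are then staged to the GPU automatically), and PATCH 6/8 later CamelCases the functions to `Mul`/`Add`/`Sub` without changing this calling convention.

    // Illustrative sketch only; assumes an icicle build for a concrete curve so that
    // curve_config::scalar_t is defined, and that lde::mul is visible to this TU.
    #include <vector>
    #include <cuda_runtime.h>
    #include "appUtils/lde/lde.cuh"          // assumed include path
    #include "utils/device_context.cuh"      // assumed include path

    int run_host_side_mul(int n)
    {
      // Host-side inputs and output buffer; the on-device flag is therefore false.
      std::vector<curve_config::scalar_t> a(n, curve_config::scalar_t::one());
      std::vector<curve_config::scalar_t> b(n, curve_config::scalar_t::one());
      std::vector<curve_config::scalar_t> out(n);

      // Device 0, default stream and default mempool (fields per utils/device_context.cuh).
      device_context::DeviceContext ctx{0, nullptr, nullptr};

      // Calling convention as of PATCH 4/8: (vec_a, vec_b, n, on_device, is_montgomery, ctx, result).
      cudaError_t err = lde::mul(a.data(), b.data(), n,
                                 /*on_device=*/false, /*is_montgomery=*/false, ctx, out.data());
      return err == cudaSuccess ? 0 : 1;
    }

The same pattern carries over to `lde::add`/`lde::sub` (which take no `is_montgomery` flag) and, through the `MSMConfig`/`NTTConfig` structs that now embed a `DeviceContext`, to the `msm` and `ntt` entry points reworked in PATCH 2/8.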
From 7d4e266ed51febe7debeb391bafd7ac32af0e9a8 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Wed, 4 Oct 2023 16:04:01 +0300 Subject: [PATCH 5/8] fixed on_device for ntt and lde --- icicle/appUtils/lde/lde.cu | 36 ++++++++++++++++++------------------ icicle/appUtils/lde/lde.cuh | 12 ++++++------ icicle/appUtils/ntt/ntt.cu | 12 ++++++------ 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu index 554b7665e..56e1ca2a6 100644 --- a/icicle/appUtils/lde/lde.cu +++ b/icicle/appUtils/lde/lde.cu @@ -33,7 +33,7 @@ __global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* resu } // namespace template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result) +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -41,7 +41,7 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery S* d_vec_a; E *d_vec_b, *d_result; - if (on_device) { + if (!is_on_device) { // Allocate memory on the device for the input vectors and the output vector cudaMallocAsync(&d_vec_a, n * sizeof(S), ctx.stream); cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); @@ -53,9 +53,9 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery } // Call the kernel to perform element-wise modular multiplication - mul_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result); + mul_kernel<<>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? result : d_result); - if (on_device) { + if (!is_on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); @@ -67,14 +67,14 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery } template -cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; int num_blocks = (n + num_threads - 1) / num_threads; E *d_vec_a, *d_vec_b, *d_result; - if (on_device) { + if (!is_on_device) { // Allocate memory on the device for the input vectors and the output vector cudaMallocAsync(&d_vec_a, n * sizeof(E), ctx.stream); cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); @@ -86,9 +86,9 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } // Call the kernel to perform element-wise addition - add_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result); + add_kernel<<>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? 
result : d_result); - if (on_device) { + if (!is_on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); @@ -100,14 +100,14 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } template -cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; int num_blocks = (n + num_threads - 1) / num_threads; E *d_vec_a, *d_vec_b, *d_result; - if (on_device) { + if (!is_on_device) { // Allocate memory on the device for the input vectors and the output vector cudaMallocAsync(&d_vec_a, n * sizeof(E), ctx.stream); cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); @@ -119,9 +119,9 @@ cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } // Call the kernel to perform element-wise subtraction - sub_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result); + sub_kernel<<>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? result : d_result); - if (on_device) { + if (!is_on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); @@ -141,12 +141,12 @@ extern "C" cudaError_t mul_cuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, uint32_t n, - bool on_device, + bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return mul(vec_a, vec_b, n, on_device, is_montgomery, ctx, result); + return mul(vec_a, vec_b, n, is_on_device, is_montgomery, ctx, result); } /** @@ -158,11 +158,11 @@ extern "C" cudaError_t add_cuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, uint32_t n, - bool on_device, + bool is_on_device, device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return add(vec_a, vec_b, n, on_device, ctx, result); + return add(vec_a, vec_b, n, is_on_device, ctx, result); } /** @@ -174,11 +174,11 @@ extern "C" cudaError_t sub_cuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, uint32_t n, - bool on_device, + bool is_on_device, device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return sub(vec_a, vec_b, n, on_device, ctx, result); + return sub(vec_a, vec_b, n, is_on_device, ctx, result); } } // namespace lde diff --git a/icicle/appUtils/lde/lde.cuh b/icicle/appUtils/lde/lde.cuh index d11c984a2..129baded1 100644 --- a/icicle/appUtils/lde/lde.cuh +++ b/icicle/appUtils/lde/lde.cuh @@ -14,7 +14,7 @@ namespace lde { * @param vec_a First input vector. * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. - * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param is_montgomery If true, inputs are expected to be Montgomery form and results are retured in Montgomery form. * If false - inputs and outputs are non-Montgomery. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. @@ -24,35 +24,35 @@ namespace lde { * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
*/ template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result); +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result); /** * A function that adds two vectors element-wise. * @param vec_a First input vector. * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. - * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise sum of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. * @tparam E The type of elements `vec_a`, `vec_b` and `result`. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); +cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result); /** * A function that subtracts two vectors element-wise. * @param vec_a First input vector. * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. - * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise difference of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. * @tparam E The type of elements `vec_a`, `vec_b` and `result`. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); +cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result); } // namespace lde diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu index b2be1e398..6837db39c 100644 --- a/icicle/appUtils/ntt/ntt.cu +++ b/icicle/appUtils/ntt/ntt.cu @@ -373,7 +373,7 @@ cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) uint32_t logn = uint32_t(log(size) / log(2)); uint32_t n_twiddles = size; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order. size_t input_size = size * config.batch_size * sizeof(E); - bool on_device = config.are_inputs_on_device; + bool is_on_device = config.are_inputs_on_device; bool generate_twiddles = (config.twiddles == nullptr); cudaStream_t stream = config.ctx.stream; @@ -384,7 +384,7 @@ cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) } E* d_input; - if (on_device) { + if (!is_on_device) { cudaMallocAsync(&d_input, input_size, stream); cudaMemcpyAsync(d_input, input, input_size, cudaMemcpyHostToDevice, stream); } @@ -411,13 +411,13 @@ cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) } if (reverse_input) - reverse_order_batch(on_device ? d_input : input, size, logn, config.batch_size, stream); - ntt_inplace_batch_template(on_device ? d_input : input, generate_twiddles ? d_twiddles : config.twiddles, size, + reverse_order_batch(is_on_device ? 
input : d_input, size, logn, config.batch_size, stream); + ntt_inplace_batch_template(is_on_device ? input : d_input, generate_twiddles ? d_twiddles : config.twiddles, size, config.batch_size, is_inverse, config.is_coset, config.coset_gen, stream, false); if (reverse_output) - reverse_order_batch(on_device ? d_input : input, size, logn, config.batch_size, stream); + reverse_order_batch(is_on_device ? input : d_input, size, logn, config.batch_size, stream); - if (on_device) { + if (!is_on_device) { cudaMemcpyAsync(input, d_input, input_size, cudaMemcpyDeviceToHost, stream); cudaFreeAsync(d_input, stream); } From ec671969677d70f5da70bc77abc9747c87bdf511 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Thu, 5 Oct 2023 18:54:39 +0300 Subject: [PATCH 6/8] CamelCase --- icicle/appUtils/lde/lde.cu | 35 ++++++++++++++++++++--------------- icicle/appUtils/lde/lde.cuh | 6 +++--- icicle/appUtils/msm/msm.cu | 14 +++++++------- icicle/appUtils/msm/msm.cuh | 14 +++++++------- icicle/appUtils/ntt/ntt.cu | 22 +++++++++++----------- icicle/appUtils/ntt/ntt.cuh | 34 +++++++++++++++++----------------- 6 files changed, 65 insertions(+), 60 deletions(-) diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu index 56e1ca2a6..811a1c163 100644 --- a/icicle/appUtils/lde/lde.cu +++ b/icicle/appUtils/lde/lde.cu @@ -1,6 +1,10 @@ +#include "lde.cuh" + #include #include + #include "../../utils/device_context.cuh" +#include "../../utils/mont.cuh" #include "../../curves/curve_config.cuh" namespace lde { @@ -10,21 +14,21 @@ namespace { #define MAX_THREADS_PER_BLOCK 256 template -__global__ void mul_kernel(S* scalar_vec, E* element_vec, size_t n, E* result) +__global__ void mul_kernel(S* scalar_vec, E* element_vec, int n, E* result) { int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n) { result[tid] = scalar_vec[tid] * element_vec[tid]; } } template -__global__ void add_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* result) +__global__ void add_kernel(E* element_vec1, E* element_vec2, int n, E* result) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { result[tid] = element_vec1[tid] + element_vec2[tid]; } } template -__global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* result) +__global__ void sub_kernel(E* element_vec1, E* element_vec2, int n, E* result) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; } @@ -33,7 +37,7 @@ __global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* resu } // namespace template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result) +cudaError_t Mul(S* vec_a, E* vec_b, int n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -54,6 +58,7 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgom // Call the kernel to perform element-wise modular multiplication mul_kernel<<>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? result : d_result); + if (is_montgomery) mont::from_montgomery(is_on_device ? 
result : d_result, n, ctx.stream); if (!is_on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); @@ -67,7 +72,7 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgom } template -cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result) +cudaError_t Add(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -100,7 +105,7 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context: } template -cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result) +cudaError_t Sub(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -133,14 +138,14 @@ cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context: } /** - * Extern version of [mul](@ref mul) function with the template parameters + * Extern version of [Mul](@ref Mul) function with the template parameters * `S` and `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t mul_cuda( +extern "C" cudaError_t MulCuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, - uint32_t n, + int n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, @@ -150,14 +155,14 @@ extern "C" cudaError_t mul_cuda( } /** - * Extern version of [add](@ref add) function with the template parameter + * Extern version of [Add](@ref Add) function with the template parameter * `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t add_cuda( +extern "C" cudaError_t AddCuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, - uint32_t n, + int n, bool is_on_device, device_context::DeviceContext ctx, curve_config::scalar_t* result @@ -166,14 +171,14 @@ extern "C" cudaError_t add_cuda( } /** - * Extern version of [sub](@ref sub) function with the template parameter + * Extern version of [Sub](@ref Sub) function with the template parameter * `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t sub_cuda( +extern "C" cudaError_t SubCuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, - uint32_t n, + int n, bool is_on_device, device_context::DeviceContext ctx, curve_config::scalar_t* result diff --git a/icicle/appUtils/lde/lde.cuh b/icicle/appUtils/lde/lde.cuh index 129baded1..010920499 100644 --- a/icicle/appUtils/lde/lde.cuh +++ b/icicle/appUtils/lde/lde.cuh @@ -24,7 +24,7 @@ namespace lde { * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
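 *
 * For instance, a hypothetical device-resident, Montgomery-form call (the pointers are assumed to be
 * valid device allocations of `n` elements each):
 * @code
 *   lde::Mul(d_scalars, d_elements, n, true, true, ctx, d_result);  // result is returned in Montgomery form
 * @endcode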
*/ template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result); +cudaError_t Mul(S* vec_a, E* vec_b, int n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result); /** * A function that adds two vectors element-wise. @@ -38,7 +38,7 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgom * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result); +cudaError_t Add(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result); /** * A function that subtracts two vectors element-wise. @@ -52,7 +52,7 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context: * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result); +cudaError_t Sub(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result); } // namespace lde diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 246d66dce..f077bb61f 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -897,7 +897,7 @@ void batched_bucket_method_msm( } // namespace template -cudaError_t msm(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results) +cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig config, P* results) { // TODO: DmytroTym/HadarIngonyama - unify the implementation of the bucket method and the batched bucket method in one function // TODO: DmytroTym/HadarIngonyama - parameters to be included into the implementation: on deviceness of points, scalars and results, precompute factor, points size and device id @@ -916,14 +916,14 @@ cudaError_t msm(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* r * - `P` is the [projective representation](@ref projective_t) of curve points. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t msm_cuda( +extern "C" cudaError_t MSMCuda( curve_config::scalar_t* scalars, curve_config::affine_t* points, - size_t msm_size, + int msm_size, MSMConfig config, curve_config::projective_t* out) { - return msm(scalars, points, msm_size, config, out); + return MSM(scalars, points, msm_size, config, out); } #if defined(G2_DEFINED) @@ -936,14 +936,14 @@ extern "C" cudaError_t msm_cuda( * - `P` is the [projective representation](@ref g2_projective_t) of G2 curve points. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t g2_msm_cuda( +extern "C" cudaError_t G2MSMCuda( curve_config::scalar_t* scalars, curve_config::g2_affine_t* points, - size_t msm_size, + int msm_size, MSMConfig config, curve_config::g2_projective_t* out) { - return msm(scalars, points, msm_size, config, out); + return MSM(scalars, points, msm_size, config, out); } #endif diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index 6dba1bcce..043f46683 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -30,25 +30,25 @@ namespace msm { struct MSMConfig { bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value: false. 
*/ bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value: true. */ - unsigned points_size; /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be a number + int points_size; /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be a number * of different points. So, if each MSM re-uses the same set of points, this variable is set equal * to the MSM size. And if every MSM uses a distinct set of points, it should be set to the product of * MSM size and [batch_size](@ref batch_size). Default value: 0 (meaning it's equal to the MSM size). */ - unsigned precompute_factor; /**< The number of extra points to pre-compute for each point. Larger values decrease the number of computations + int precompute_factor; /**< The number of extra points to pre-compute for each point. Larger values decrease the number of computations * to make, on-line memory footprint, but increase the static memory footprint. Default value: 1 (i.e. don't pre-compute). */ bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */ bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise. Default value: true. */ - unsigned batch_size; /**< The number of MSMs to compute. Default value: 1. */ + int batch_size; /**< The number of MSMs to compute. Default value: 1. */ bool are_result_on_device; /**< True if the results should be on device and false if they should be on host. Default value: false. */ - unsigned c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket method" + int c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket method" * that we use to solve the MSM problem. As a rule of thumb, larger value means more on-line memory * footprint but also more parallelism and less computational complexity (up to a certain point). * Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */ - unsigned bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field, but if a different + int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field, but if a different * (better) upper bound is known, it should be reflected in this variable. Default value: 0 (set to the bitsize of scalar field). */ bool big_triangle; /**< Whether to do "bucket accumulation" serially. Decreases computational complexity, but also greatly * decreases parallelism, so only suitable for large batches of MSMs. Default value: false. */ - unsigned large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur very frequently. + int large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur very frequently. * Useful for efficient treatment of non-uniform distributions of scalars and "top windows" with few bits. * Can be set to 0 to disable separate treatment of large buckets altogether. Default value: 10. */ device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */ @@ -68,7 +68,7 @@ struct MSMConfig { * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
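 *
 * A hypothetical single-MSM invocation with host-resident scalars and points; the field order follows
 * the MSMConfig definition above, and the concrete values (window size 16, large bucket factor 10) are
 * assumptions for illustration:
 * @code
 *   device_context::DeviceContext ctx = {0, 0, 0};
 *   MSMConfig config = {false, true, msm_size, 1, false, true, 1, false,
 *                       16, curve_config::scalar_t::NBITS, false, 10, ctx};
 *   curve_config::projective_t result;
 *   MSM<curve_config::scalar_t, curve_config::affine_t, curve_config::projective_t>(
 *     scalars, points, msm_size, config, &result);
 * @endcode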
*/ template -cudaError_t msm(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results); +cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig config, P* results); } // namespace msm diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu index 6837db39c..b48f2dde6 100644 --- a/icicle/appUtils/ntt/ntt.cu +++ b/icicle/appUtils/ntt/ntt.cu @@ -360,7 +360,7 @@ void ntt_inplace_batch_template( } // namespace template -cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, device_context::DeviceContext ctx) +cudaError_t GenerateTwiddleFactors(S* d_twiddles, int n_twiddles, S omega, device_context::DeviceContext ctx) { twiddle_factors_kernel<<<1, 1, 0, ctx.stream>>>(d_twiddles, n_twiddles, omega); cudaStreamSynchronize(ctx.stream); @@ -368,7 +368,7 @@ cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega } template -cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) +cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig config) { uint32_t logn = uint32_t(log(size) / log(2)); uint32_t n_twiddles = size; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order. @@ -435,9 +435,9 @@ cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) * `S` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t generate_twiddle_factors_cuda( +extern "C" cudaError_t GenerateTwiddleFactorsCuda( curve_config::scalar_t* d_twiddles, - uint32_t n_twiddles, + int n_twiddles, curve_config::scalar_t omega, device_context::DeviceContext ctx ) { @@ -450,31 +450,31 @@ extern "C" cudaError_t generate_twiddle_factors_cuda( * - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve; * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t ntt_cuda( +extern "C" cudaError_t NTTCuda( curve_config::scalar_t* input, - unsigned size, + int size, bool is_inverse, NTTConfig config) { - return ntt(input, size, is_inverse, config); + return NTT(input, size, is_inverse, config); } #if defined(ECNTT_DEFINED) /** - * Extern version of [ntt](@ref ntt) function with the following values of template parameters + * Extern version of [NTT](@ref NTT) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): * - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed); * - `E` is the [scalar field](@ref scalar_t) of the curve; * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t ecntt_cuda( +extern "C" cudaError_t ECNTTCuda( curve_config::projective_t* input, - unsigned size, + int size, bool is_inverse, NTTConfig config) { - return ntt(input, size, is_inverse, config); + return NTT(input, size, is_inverse, config); } #endif diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index 3059cb572..ec8ad6695 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -18,18 +18,6 @@ */ namespace ntt { -/** - * Generates twiddles \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$ from root of unity \f$ \omega \f$ and stores them on device. - * @param d_twiddles Input empty array on device to which twiddles are to be written. 
- * @param n_twiddles Number of twiddle \f$ n \f$ factors to generate. - * @param omega Root of unity \f$ \omega \f$. - * @param ctx Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). - * @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. - * @return `cudaSuccess` if the execution was successful and an error code otherwise. - */ -template -cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, device_context::DeviceContext ctx); - /** * @enum Ordering * How to order inputs and outputs of the NTT: @@ -76,13 +64,13 @@ struct NTTConfig { * on a coset of [twiddles](@ref twiddles) given by [the coset generator](@ref coset_gen), so: * \f$ \{coset\_gen\cdot\omega^0, coset\_gen\cdot\omega^1, \dots, coset\_gen\cdot\omega^{n-1}\} \f$. Default value: false. */ S* coset_gen; /**< The field element that generates a coset if [is_coset](@ref is_coset) is true. - * Otherwise should be set to null. Default value: `null`. */ + * Otherwise should be set to `nullptr`. Default value: `nullptr`. */ S* twiddles; /**< "Twiddle factors", (or "domain", or "roots of unity") on which the NTT is evaluated. * This pointer is expected to live on device. The order is as follows: - * \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `null`, twiddle factors + * \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle factors * are generated online using the default generator (TODO: link to twiddle gen here) and function - * [generate_twiddle_factors](@ref generate_twiddle_factors). Default value: `null`. */ - unsigned batch_size; /**< The number of NTTs to compute. Default value: 1. */ + * [generate_twiddle_factors](@ref generate_twiddle_factors). Default value: `nullptr`. */ + int batch_size; /**< The number of NTTs to compute. Default value: 1. */ device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */ }; @@ -98,7 +86,19 @@ struct NTTConfig { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config); +cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig config); + +/** + * Generates twiddles \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$ from root of unity \f$ \omega \f$ and stores them on device. + * @param d_twiddles Input empty array on device to which twiddles are to be written. + * @param n_twiddles Number of twiddle \f$ n \f$ factors to generate. + * @param omega Root of unity \f$ \omega \f$. + * @param ctx Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). + * @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
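 *
 * A hypothetical preparation of twiddles for an NTT of size 2^10; the allocation and the `ctx` object
 * are assumptions made for illustration:
 * @code
 *   int n_twiddles = 1 << 10;
 *   curve_config::scalar_t* d_twiddles;
 *   cudaMallocAsync(&d_twiddles, n_twiddles * sizeof(curve_config::scalar_t), ctx.stream);
 *   GenerateTwiddleFactors(d_twiddles, n_twiddles, curve_config::scalar_t::omega(10), ctx);
 * @endcode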
+ */ +template +cudaError_t GenerateTwiddleFactors(S* d_twiddles, int n_twiddles, S omega, device_context::DeviceContext ctx); } // namespace ntt From d6dfd59e9a0193dfbf4186e262db0769c7c0dc79 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Thu, 5 Oct 2023 23:49:14 +0300 Subject: [PATCH 7/8] fixed msm_test, int unification, google guilde --- icicle/appUtils/lde/lde.cu | 6 +-- icicle/appUtils/lde/lde.cuh | 8 +++- icicle/appUtils/msm/msm.cu | 17 ++++--- icicle/appUtils/msm/msm.cuh | 3 +- icicle/appUtils/msm/tests/msm_test.cu | 67 +++++++++++++++++---------- icicle/appUtils/ntt/ntt.cu | 41 ++++------------ icicle/appUtils/ntt/ntt.cuh | 5 +- icicle/utils/device_context.cuh | 2 +- icicle/utils/host_math.cuh | 4 ++ icicle/utils/mont.cuh | 25 ++++++++-- icicle/utils/utils_kernels.cu | 22 +++++++++ icicle/utils/utils_kernels.cuh | 21 +++++++++ 12 files changed, 145 insertions(+), 76 deletions(-) create mode 100644 icicle/utils/utils_kernels.cu create mode 100644 icicle/utils/utils_kernels.cuh diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu index 811a1c163..f222de148 100644 --- a/icicle/appUtils/lde/lde.cu +++ b/icicle/appUtils/lde/lde.cu @@ -151,7 +151,7 @@ extern "C" cudaError_t MulCuda( device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return mul(vec_a, vec_b, n, is_on_device, is_montgomery, ctx, result); + return Mul(vec_a, vec_b, n, is_on_device, is_montgomery, ctx, result); } /** @@ -167,7 +167,7 @@ extern "C" cudaError_t AddCuda( device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return add(vec_a, vec_b, n, is_on_device, ctx, result); + return Add(vec_a, vec_b, n, is_on_device, ctx, result); } /** @@ -183,7 +183,7 @@ extern "C" cudaError_t SubCuda( device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return sub(vec_a, vec_b, n, is_on_device, ctx, result); + return Sub(vec_a, vec_b, n, is_on_device, ctx, result); } } // namespace lde diff --git a/icicle/appUtils/lde/lde.cuh b/icicle/appUtils/lde/lde.cuh index 010920499..3ccc403a6 100644 --- a/icicle/appUtils/lde/lde.cuh +++ b/icicle/appUtils/lde/lde.cuh @@ -2,6 +2,8 @@ #ifndef LDE_H #define LDE_H +#include "../../utils/device_context.cuh" + /** * @namespace lde * LDE (stands for low degree extension) contains [NTT](@ref ntt)-based methods for translating between coefficient and evaluation domains of polynomials. @@ -15,12 +17,12 @@ namespace lde { * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. * @param is_on_device If true, inputs and outputs are on device, if false - on the host. - * @param is_montgomery If true, inputs are expected to be Montgomery form and results are retured in Montgomery form. + * @param is_montgomery If true, inputs are expected to be in Montgomery form and results are retured in Montgomery form. * If false - inputs and outputs are non-Montgomery. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise product of `vec_a` and `vec_b`, can be the same pointer as `vec_b`. * @tparam S The type of scalars `vec_a`. - * @tparam E The type of elements `vec_b` and `result`. Often (but not always), `E=S`. + * @tparam E The type of elements `vec_b` and `result`. Often (but not always) `E=S`. * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
*/ template @@ -34,6 +36,7 @@ cudaError_t Mul(S* vec_a, E* vec_b, int n, bool is_on_device, bool is_montgomery * @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise sum of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. + * If inputs are in Montgomery form, the result is too, and vice versa: non-Montgomery inputs produce non-Montgomery result. * @tparam E The type of elements `vec_a`, `vec_b` and `result`. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ @@ -48,6 +51,7 @@ cudaError_t Add(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::De * @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise difference of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. + * If inputs are in Montgomery form, the result is too, and vice versa: non-Montgomery inputs produce non-Montgomery result. * @tparam E The type of elements `vec_a`, `vec_b` and `result`. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index f077bb61f..12c389dae 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -1,19 +1,22 @@ -#include "../../primitives/affine.cuh" -#include "../../primitives/field.cuh" -#include "../../primitives/projective.cuh" -#include "../../utils/cuda_utils.cuh" -#include "../../curves/curve_config.cuh" -#include "../../utils/error_handler.cuh" #include "msm.cuh" -#include + #include #include #include #include +#include + #include #include #include +#include "../../primitives/affine.cuh" +#include "../../primitives/field.cuh" +#include "../../primitives/projective.cuh" +#include "../../utils/cuda_utils.cuh" +#include "../../utils/error_handler.cuh" +#include "../../curves/curve_config.cuh" + namespace msm { namespace { diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index 043f46683..49ff198f4 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -2,7 +2,8 @@ #ifndef MSM_H #define MSM_H -#include "cuda_runtime_api.h" +#include + #include "../../utils/device_context.cuh" /** diff --git a/icicle/appUtils/msm/tests/msm_test.cu b/icicle/appUtils/msm/tests/msm_test.cu index b69680f38..e5bc8748b 100644 --- a/icicle/appUtils/msm/tests/msm_test.cu +++ b/icicle/appUtils/msm/tests/msm_test.cu @@ -1,15 +1,16 @@ -#include "../../primitives/field.cuh" -#include "../../primitives/projective.cuh" -#include "../../utils/cuda_utils.cuh" +#define CURVE_ID 1 + #include "msm.cu" + #include #include #include -// #include "../../curves/bls12_377/curve_config.cuh" -#include "../../curves/bn254/curve_config.cuh" -// using namespace BLS12_377; -using namespace BN254; +#include "../../primitives/field.cuh" +#include "../../primitives/projective.cuh" +#include "../../utils/cuda_utils.cuh" +#include "../../utils/device_context.cuh" +#include "../../curves/curve_config.cuh" class Dummy_Scalar { @@ -111,9 +112,9 @@ public: // switch between dummy and real: -typedef scalar_t test_scalar; -typedef projective_t test_projective; -typedef affine_t test_affine; +typedef curve_config::scalar_t test_scalar; +typedef curve_config::projective_t 
test_projective; +typedef curve_config::affine_t test_affine; // typedef Dummy_Scalar test_scalar; // typedef Dummy_Projective test_projective; @@ -121,10 +122,10 @@ typedef affine_t test_affine; int main() { - unsigned batch_size = 1; + int batch_size = 1; // unsigned msm_size = 1<<21; - unsigned msm_size = 12180757; - unsigned N = batch_size * msm_size; + int msm_size = 12180757; + int N = batch_size * msm_size; test_scalar* scalars = new test_scalar[N]; test_affine* points = new test_affine[N]; @@ -165,19 +166,39 @@ int main() // batched_large_msm(scalars, points, batch_size, msm_size, // batched_large_res, false); - cudaStream_t stream1; - cudaStream_t stream2; - cudaStreamCreate(&stream1); - cudaStreamCreate(&stream2); + cudaStream_t stream; + cudaStreamCreate(&stream); + + device_context::DeviceContext ctx = {0, // device_id + stream, // stream + 0, // mempool + }; + msm::MSMConfig config = { + false, // scalars_on_device + true, // scalars_montgomery_form + msm_size, // points_size + 1, // precompute_factor + false, // points_on_device + true, // points_montgomery_form + 1, // batch_size + false, // result_on_device + 16, // c + test_scalar::NBITS, // bitsize + false, // big_triangle + 10, // large_bucket_factor + ctx, // DeviceContext + }; + auto begin1 = std::chrono::high_resolution_clock::now(); - large_msm(scalars, points, msm_size, large_res, false, true, stream1); + msm::MSM(scalars, points, msm_size, config, large_res); auto end1 = std::chrono::high_resolution_clock::now(); auto elapsed1 = std::chrono::duration_cast(end1 - begin1); printf("Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9); + config.big_triangle = true; // std::cout<( - scalars_d, points_d, msm_size, large_res_d, true, false, stream2); + msm::MSM( + scalars_d, points_d, msm_size, config, large_res_d); // test_reduce_triangle(scalars); // test_reduce_rectangle(scalars); // test_reduce_single(scalars); @@ -185,10 +206,8 @@ int main() auto end = std::chrono::high_resolution_clock::now(); auto elapsed = std::chrono::duration_cast(end - begin); printf("On Device No Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9); - cudaStreamSynchronize(stream1); - cudaStreamSynchronize(stream2); - cudaStreamDestroy(stream1); - cudaStreamDestroy(stream2); + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); std::cout << test_projective::to_affine(large_res[0]) << std::endl; diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu index b48f2dde6..990ae6062 100644 --- a/icicle/appUtils/ntt/ntt.cu +++ b/icicle/appUtils/ntt/ntt.cu @@ -1,6 +1,8 @@ +#include "ntt.cuh" + #include "../../utils/sharedmem.cuh" +#include "../../utils/utils_kernels.cuh" #include "../../curves/curve_config.cuh" -#include "ntt.cuh" namespace ntt { @@ -30,29 +32,6 @@ __global__ void twiddle_factors_kernel(S* d_twiddles, uint32_t n_twiddles, S ome } } -/** - * Multiply the elements of an input array by a scalar in-place. Used for normalization in iNTT. - * @param arr input array. - * @param n size of arr. - * @param n_inv scalar of type S (scalar). 
- */ -template -__global__ void template_normalize_kernel(E* arr, uint32_t n, S scalar) -{ - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < n) { arr[tid] = scalar * arr[tid]; } -} - -template -__global__ void batchVectorMult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size) -{ - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < n_scalars * batch_size) { - int scalar_id = tid % n_scalars; - element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid]; - } -} - template __global__ void reverse_order_kernel(E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size) { @@ -333,14 +312,14 @@ void ntt_inplace_batch_template( <<>>(d_inout, n, d_twiddles, n, total_tasks, s, false); } - if (is_coset) batchVectorMult<<>>(coset, d_inout, n, batch_size); + if (is_coset) utils_internal::batchVectorMult<<>>(d_inout, coset, n, batch_size); num_threads = max(min(n / 2, MAX_NUM_THREADS), 1); num_blocks = (n * batch_size + num_threads - 1) / num_threads; - template_normalize_kernel - <<>>(d_inout, n * batch_size, S::inv_log_size(logn)); + utils_internal::template_normalize_kernel + <<>>(d_inout, S::inv_log_size(logn), n * batch_size); } else { - if (is_coset) batchVectorMult<<>>(coset, d_inout, n, batch_size); + if (is_coset) utils_internal::batchVectorMult<<>>(d_inout, coset, n, batch_size); for (int s = logn - 1; s >= logn_shmem; s--) // TODO: this loop also can be unrolled { @@ -380,7 +359,7 @@ cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig config) S* d_twiddles; if (generate_twiddles) { cudaMallocAsync(&d_twiddles, n_twiddles * sizeof(S), stream); - generate_twiddle_factors(d_twiddles, n_twiddles, is_inverse ? S::omega_inv(logn) : S::omega(logn), config.ctx); + GenerateTwiddleFactors(d_twiddles, n_twiddles, is_inverse ? S::omega_inv(logn) : S::omega(logn), config.ctx); } E* d_input; @@ -431,7 +410,7 @@ cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig config) } /** - * Extern version of [generate_twiddle_factors](@ref generate_twiddle_factors) function with the template parameter + * Extern version of [GenerateTwiddleFactors](@ref GenerateTwiddleFactors) function with the template parameter * `S` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ @@ -441,7 +420,7 @@ extern "C" cudaError_t GenerateTwiddleFactorsCuda( curve_config::scalar_t omega, device_context::DeviceContext ctx ) { - return generate_twiddle_factors(d_twiddles, n_twiddles, omega, ctx); + return GenerateTwiddleFactors(d_twiddles, n_twiddles, omega, ctx); } /** diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index ec8ad6695..e1daf2f67 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -2,7 +2,8 @@ #ifndef NTT_H #define NTT_H -#include "cuda_runtime_api.h" +#include + #include "../../utils/device_context.cuh" /** @@ -69,7 +70,7 @@ struct NTTConfig { * This pointer is expected to live on device. The order is as follows: * \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle factors * are generated online using the default generator (TODO: link to twiddle gen here) and function - * [generate_twiddle_factors](@ref generate_twiddle_factors). Default value: `nullptr`. */ + * [GenerateTwiddleFactors](@ref GenerateTwiddleFactors). Default value: `nullptr`. */ int batch_size; /**< The number of NTTs to compute. Default value: 1. 
*/ device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */ }; diff --git a/icicle/utils/device_context.cuh b/icicle/utils/device_context.cuh index e29d6a1b1..a5583991e 100644 --- a/icicle/utils/device_context.cuh +++ b/icicle/utils/device_context.cuh @@ -10,7 +10,7 @@ namespace device_context { * Properties of the device used in icicle functions. */ struct DeviceContext { - unsigned device_id; /**< Index of the currently used GPU. Default value: 0. */ + int device_id; /**< Index of the currently used GPU. Default value: 0. */ cudaStream_t stream; /**< Stream to use. Default value: 0. */ cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */ }; diff --git a/icicle/utils/host_math.cuh b/icicle/utils/host_math.cuh index 73922b925..ecc181844 100644 --- a/icicle/utils/host_math.cuh +++ b/icicle/utils/host_math.cuh @@ -1,4 +1,6 @@ #pragma once +#ifndef HOST_MATH_H +#define HOST_MATH_H #include #include @@ -94,3 +96,5 @@ namespace host_math { } }; } // namespace host_math + +#endif diff --git a/icicle/utils/mont.cuh b/icicle/utils/mont.cuh index d02d130d8..09c7a8732 100644 --- a/icicle/utils/mont.cuh +++ b/icicle/utils/mont.cuh @@ -1,19 +1,30 @@ #pragma once +#ifndef MONT_H +#define MONT_H -#include "../appUtils/vector_manipulation/ve_mod_mult.cuh" +#include "utils_kernels.cuh" +namespace mont { + +namespace { + +#define MAX_THREADS_PER_BLOCK 256 + +// TODO (DmytroTym): do valid conversion for point types too template -int convert_montgomery(E* d_inout, size_t n_elments, bool is_into, cudaStream_t stream) +int convert_montgomery(E* d_inout, size_t n, bool is_into, cudaStream_t stream) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; - int num_blocks = (n_elments + num_threads - 1) / num_threads; + int num_blocks = (n + num_threads - 1) / num_threads; E mont = is_into ? 
E::montgomery_r() : E::montgomery_r_inv(); - template_normalize_kernel<<>>(d_inout, n_elments, mont); + utils_internal::template_normalize_kernel<<>>(d_inout, mont, n); return 0; // TODO: void with propper error handling } +} // namespace + template int to_montgomery(E* d_inout, unsigned n, cudaStream_t stream) { @@ -24,4 +35,8 @@ template int from_montgomery(E* d_inout, unsigned n, cudaStream_t stream) { return convert_montgomery(d_inout, n, false, stream); -} \ No newline at end of file +} + +} // namespace mont + +#endif diff --git a/icicle/utils/utils_kernels.cu b/icicle/utils/utils_kernels.cu new file mode 100644 index 000000000..02b8e687a --- /dev/null +++ b/icicle/utils/utils_kernels.cu @@ -0,0 +1,22 @@ +#include "utils_kernels.cuh" + +namespace utils_internal { + +template +__global__ void template_normalize_kernel(E* arr, S scalar, uint32_t n) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { arr[tid] = scalar * arr[tid]; } +} + +template +__global__ void batchVectorMult(E* element_vec, S* scalar_vec, unsigned n_scalars, unsigned batch_size) +{ + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < n_scalars * batch_size) { + int scalar_id = tid % n_scalars; + element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid]; + } +} + +} // namespace utils_internal diff --git a/icicle/utils/utils_kernels.cuh b/icicle/utils/utils_kernels.cuh new file mode 100644 index 000000000..02f761fa6 --- /dev/null +++ b/icicle/utils/utils_kernels.cuh @@ -0,0 +1,21 @@ +#pragma once +#ifndef UTILS_KERNELS_H +#define UTILS_KERNELS_H + +namespace utils_internal { + +/** + * Multiply the elements of an input array by a scalar in-place. Used for normalization in iNTT. + * @param arr input array. + * @param n size of arr. + * @param n_inv scalar of type S (scalar). 
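 *
 * A hypothetical launch with 256-thread blocks, mirroring how the callers in ntt.cu and mont.cuh
 * size their grids (the `arr`, `scalar`, `n` and `stream` values are assumed):
 * @code
 *   int num_threads = 256;
 *   int num_blocks = (n + num_threads - 1) / num_threads;
 *   utils_internal::template_normalize_kernel<<<num_blocks, num_threads, 0, stream>>>(arr, scalar, n);
 * @endcode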
+ */ +template +__global__ void template_normalize_kernel(E* arr, S scalar, uint32_t n); + +template +__global__ void batchVectorMult(E* element_vec, S* scalar_vec, unsigned n_scalars, unsigned batch_size); + +} // namespace utils_internal + +#endif From ad097a9f4abd7b59cc3ce98d16648c452c1a88d5 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Thu, 12 Oct 2023 20:12:40 +0300 Subject: [PATCH 8/8] Fixed warnings --- icicle/appUtils/msm/msm.cu | 22 +++++++++++----------- icicle/utils/host_math.cuh | 2 +- icicle/utils/mont.cuh | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 12c389dae..ee2d2579a 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -102,14 +102,14 @@ __global__ void split_scalars_kernel( unsigned bm_bitsize, unsigned c) { - constexpr unsigned sign_mask = 0x80000000; + // constexpr unsigned sign_mask = 0x80000000; // constexpr unsigned trash_bucket = 0x80000000; unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; unsigned bucket_index; - unsigned bucket_index2; + // unsigned bucket_index2; unsigned current_index; unsigned msm_index = tid >> msm_log_size; - unsigned borrow = 0; + // unsigned borrow = 0; if (tid < total_size) { S scalar = scalars[tid]; for (unsigned bm = 0; bm < nof_bms; bm++) { @@ -200,7 +200,7 @@ __global__ void accumulate_buckets_kernel( const unsigned msm_idx_shift, const unsigned c) { - constexpr unsigned sign_mask = 0x80000000; + // constexpr unsigned sign_mask = 0x80000000; unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nof_buckets_to_compute) return; if ((single_bucket_indices[tid] & ((1 << c) - 1)) == 0) { @@ -376,8 +376,8 @@ void bucket_method_msm( P* buckets; // compute number of bucket modules and number of buckets in each module unsigned nof_bms = (bitsize + c - 1) / c; - unsigned msm_log_size = ceil(log2(size)); - unsigned bm_bitsize = ceil(log2(nof_bms)); + unsigned msm_log_size = (unsigned)ceil(log2(size)); + unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms)); #ifdef SIGNED_DIG unsigned nof_buckets = nof_bms * ((1 << (c - 1)) + 1); // signed digits #else @@ -629,7 +629,7 @@ void bucket_method_msm( #endif } else { unsigned source_bits_count = c; - bool odd_source_c = source_bits_count % 2; + // bool odd_source_c = source_bits_count % 2; unsigned source_windows_count = nof_bms; unsigned source_buckets_count = nof_buckets; P* source_buckets = buckets; @@ -647,7 +647,7 @@ void bucket_method_msm( if (source_bits_count > 0) { for (unsigned j = 0; j < target_bits_count; j++) { - unsigned last_j = target_bits_count - 1; + // unsigned last_j = target_bits_count - 1; unsigned nof_threads = (source_buckets_count >> (1 + j)); NUM_THREADS = min(MAX_TH, nof_threads); NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; @@ -683,7 +683,7 @@ void bucket_method_msm( temp_buckets1 = nullptr; temp_buckets2 = nullptr; source_bits_count = target_bits_count; - odd_source_c = source_bits_count % 2; + // odd_source_c = source_bits_count % 2; source_windows_count = target_windows_count; source_buckets_count = target_buckets_count; } @@ -753,8 +753,8 @@ void batched_bucket_method_msm( P* buckets; // compute number of bucket modules and number of buckets in each module unsigned nof_bms = (bitsize + c - 1) / c; - unsigned msm_log_size = ceil(log2(msm_size)); - unsigned bm_bitsize = ceil(log2(nof_bms)); + unsigned msm_log_size = (unsigned)ceil(log2(msm_size)); + unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms)); unsigned nof_buckets = 
(nof_bms << c); unsigned total_nof_buckets = nof_buckets * batch_size; cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream); diff --git a/icicle/utils/host_math.cuh b/icicle/utils/host_math.cuh index ecc181844..305c71319 100644 --- a/icicle/utils/host_math.cuh +++ b/icicle/utils/host_math.cuh @@ -58,7 +58,7 @@ namespace host_math { { uint32_t result; uint64_t r = static_cast(x) * y + z + carry; - carry = r >> 32; + carry = (uint32_t)(r >> 32); result = r & 0xffffffff; return result; } diff --git a/icicle/utils/mont.cuh b/icicle/utils/mont.cuh index 09c7a8732..b9d1ddd1e 100644 --- a/icicle/utils/mont.cuh +++ b/icicle/utils/mont.cuh @@ -12,7 +12,7 @@ namespace { // TODO (DmytroTym): do valid conversion for point types too template -int convert_montgomery(E* d_inout, size_t n, bool is_into, cudaStream_t stream) +int convert_montgomery(E* d_inout, int n, bool is_into, cudaStream_t stream) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -26,13 +26,13 @@ int convert_montgomery(E* d_inout, size_t n, bool is_into, cudaStream_t stream) } // namespace template -int to_montgomery(E* d_inout, unsigned n, cudaStream_t stream) +int to_montgomery(E* d_inout, int n, cudaStream_t stream) { return convert_montgomery(d_inout, n, true, stream); } template -int from_montgomery(E* d_inout, unsigned n, cudaStream_t stream) +int from_montgomery(E* d_inout, int n, cudaStream_t stream) { return convert_montgomery(d_inout, n, false, stream); }
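
As a closing illustration of the helpers reworked above, the following is a minimal, hypothetical
caller of the Montgomery conversion routines; the stream handling, element count and device data are
assumptions for the sketch, mirroring the calls in mont.cuh:

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int n = 1 << 12;
    curve_config::scalar_t* d_vals;
    cudaMallocAsync(&d_vals, n * sizeof(curve_config::scalar_t), stream);
    // ... fill d_vals with field elements on the device ...
    mont::to_montgomery(d_vals, n, stream);    // convert in place into Montgomery form
    mont::from_montgomery(d_vals, n, stream);  // and back to the standard representation
    cudaFreeAsync(d_vals, stream);
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);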