From 9114ecb26978a6e29573e5a108b7c708d677749a Mon Sep 17 00:00:00 2001 From: ImmanuelSegol <3ditds@gmail.com> Date: Tue, 3 Oct 2023 15:22:28 +0300 Subject: [PATCH 1/8] fix memory error in single_stage_multi_reduction_kernel (#235) * refactor * refactor * revert * refactor: clang format * Update icicle/appUtils/msm/msm.cu --- icicle/appUtils/msm/msm.cu | 40 +++++++++++++++++---------- icicle/appUtils/msm/tests/msm_test.cu | 2 +- icicle/utils/error_handler.cuh | 32 +++++++++++++++++++++ 3 files changed, 59 insertions(+), 15 deletions(-) create mode 100644 icicle/utils/error_handler.cuh diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index f09b9fe85..c58381cc0 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -5,6 +5,7 @@ #include "../../primitives/field.cuh" #include "../../primitives/projective.cuh" #include "../../utils/cuda_utils.cuh" +#include "../../utils/error_handler.cuh" #include "msm.cuh" #include #include @@ -23,20 +24,29 @@ template __global__ void single_stage_multi_reduction_kernel( - P* v, P* v_r, unsigned block_size, unsigned write_stride, unsigned write_phase, unsigned padding) + P* v, + P* v_r, + unsigned block_size, + unsigned write_stride, + unsigned write_phase, + unsigned padding, + unsigned num_of_threads) { int tid = blockIdx.x * blockDim.x + threadIdx.x; - int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid; + if (tid >= num_of_threads) { return; } + int jump = block_size / 2; + int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid; int block_id = tid_p / jump; int block_tid = tid_p % jump; unsigned read_ind = block_size * block_id + block_tid; unsigned write_ind = tid; - v_r - [write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride - : write_ind] = - padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero() - : v[read_ind] + v[read_ind + jump]; + unsigned v_r_key = + write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride : write_ind; + P v_r_value = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero() + : v[read_ind] + v[read_ind + jump]; + + v_r[v_r_key] = v_r_value; } // this kernel performs single scalar multiplication @@ -388,7 +398,7 @@ void bucket_method_msm( NUM_THREADS = min(MAX_TH, s); NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<>>( - ones_results, ones_results, s * 2, 0, 0, 0); + ones_results, ones_results, s * 2, 0, 0, 0, s); } unsigned* bucket_indices; @@ -554,7 +564,9 @@ void bucket_method_msm( NUM_THREADS = min(MAX_TH, s); NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<>>( - large_buckets, large_buckets, s * 2, 0, 0, 0); + large_buckets, large_buckets, s * 2, 0, 0, 0, s); + + CHECK_LAST_CUDA_ERROR(); } // distribute @@ -631,18 +643,18 @@ void bucket_method_msm( if (source_bits_count > 0) { for (unsigned j = 0; j < target_bits_count; j++) { unsigned last_j = target_bits_count - 1; - NUM_THREADS = min(MAX_TH, (source_buckets_count >> (1 + j))); - NUM_BLOCKS = ((source_buckets_count >> (1 + j)) + NUM_THREADS - 1) / NUM_THREADS; + unsigned nof_threads = (source_buckets_count >> (1 + j)); + NUM_THREADS = min(MAX_TH, nof_threads); + NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<>>( j == 0 ? source_buckets : temp_buckets1, j == target_bits_count - 1 ? 
target_buckets : temp_buckets1, - 1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0); + 1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0, nof_threads); - unsigned nof_threads = (source_buckets_count >> (1 + j)); NUM_THREADS = min(MAX_TH, nof_threads); NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<>>( j == 0 ? source_buckets : temp_buckets2, j == target_bits_count - 1 ? target_buckets : temp_buckets2, - 1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0); + 1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0, nof_threads); } } if (target_bits_count == 1) { diff --git a/icicle/appUtils/msm/tests/msm_test.cu b/icicle/appUtils/msm/tests/msm_test.cu index 962c88bbb..b69680f38 100644 --- a/icicle/appUtils/msm/tests/msm_test.cu +++ b/icicle/appUtils/msm/tests/msm_test.cu @@ -224,4 +224,4 @@ int main() // std::cout< + +#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__) +template +void check(T err, const char* const func, const char* const file, const int line) +{ + if (err != cudaSuccess) { + std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(err) << " " << func << std::endl; + } +} + +#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__) +void checkLast(const char* const file, const int line) +{ + cudaError_t err{cudaGetLastError()}; + if (err != cudaSuccess) { + std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(err) << std::endl; + } +} + +#define CHECK_SYNC_DEVICE_ERROR() syncDevice(__FILE__, __LINE__) +void syncDevice(const char* const file, const int line) +{ + cudaError_t err{cudaDeviceSynchronize()}; + if (err != cudaSuccess) { + std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(err) << std::endl; + } +} From 33d15830a3e784102e57e31e38d385f51485f7b8 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Tue, 3 Oct 2023 23:06:11 +0300 Subject: [PATCH 2/8] Added separate device context struct, returned lde --- icicle/CMakeLists.txt | 10 +- icicle/Doxyfile | 4 +- icicle/appUtils/lde/lde.cu | 186 ++++++++++++++++++ icicle/appUtils/lde/lde.cuh | 57 ++++++ icicle/appUtils/msm/msm.cu | 74 +------ icicle/appUtils/msm/msm.cuh | 14 +- icicle/appUtils/ntt/ntt.cu | 140 ++++++------- icicle/appUtils/ntt/ntt.cuh | 19 +- .../vector_manipulation/ve_mod_mult.cuh | 135 ------------- icicle/utils/device_context.cuh | 20 ++ 10 files changed, 354 insertions(+), 305 deletions(-) create mode 100644 icicle/appUtils/lde/lde.cu create mode 100644 icicle/appUtils/lde/lde.cuh delete mode 100644 icicle/appUtils/vector_manipulation/ve_mod_mult.cuh create mode 100644 icicle/utils/device_context.cuh diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 18900cb15..72e8c14a7 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -50,14 +50,12 @@ if (NOT BUILD_TESTS) add_custom_command( TARGET icicle POST_BUILD - COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym msm_cuda=${CURVE}_msm_cuda --redefine-sym msm_internal_cuda=${CURVE}_msm_internal_cuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o - ) - add_custom_command( - TARGET icicle - POST_BUILD - COMMAND ${CMAKE_AR} ARGS -rcs ${LIBRARY_OUTPUT_DIRECTORY}/libingo_${CURVE}.a 
${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o + COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym msm_cuda=${CURVE}_msm_cuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o + COMMAND ${CMAKE_OBJCOPY} ARGS --prefix-symbols=${CURVE}_ ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o + COMMAND ${CMAKE_AR} ARGS -rcs ${LIBRARY_OUTPUT_DIRECTORY}/libingo_${CURVE}.a ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o ) file(REMOVE ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o) + file(REMOVE ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o) else() diff --git a/icicle/Doxyfile b/icicle/Doxyfile index 9ba24700e..1d7ba732e 100644 --- a/icicle/Doxyfile +++ b/icicle/Doxyfile @@ -831,7 +831,9 @@ WARN_LOGFILE = # Note: If this tag is empty the current directory is searched. INPUT = appUtils/msm \ - appUtils/ntt + appUtils/ntt \ + appUtils/lde \ + utils/device_context.cuh # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu new file mode 100644 index 000000000..89a9a12b4 --- /dev/null +++ b/icicle/appUtils/lde/lde.cu @@ -0,0 +1,186 @@ +#include +#include +#include "../../utils/device_context.cuh" +#include "../../curves/curve_config.cuh" + +namespace lde { + +namespace { + +#define MAX_THREADS_PER_BLOCK 256 + +template +__global__ void mul_kernel(S* scalar_vec, E* element_vec, size_t n, E* result) +{ + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < n) { result[tid] = scalar_vec[tid] * element_vec[tid]; } +} + +template +__global__ void add_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* result) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { result[tid] = element_vec1[tid] + element_vec2[tid]; } +} + +template +__global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* result) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; } +} + +} // namespace + +template +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +{ + // Set the grid and block dimensions + int num_threads = MAX_THREADS_PER_BLOCK; + int num_blocks = (n + num_threads - 1) / num_threads; + + S* d_vec_a; + E *d_vec_b, *d_result; + if (on_device) { + // Allocate memory on the device for the input vectors and the output vector + cudaMallocAsync(&d_vec_a, n * sizeof(S), ctx.stream); + cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); + cudaMallocAsync(&d_result, n * sizeof(E), ctx.stream); + + // Copy the input vectors and the modulus from the host to the device + cudaMemcpyAsync(d_vec_a, vec_a, n * sizeof(S), cudaMemcpyHostToDevice, ctx.stream); + cudaMemcpyAsync(d_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, ctx.stream); + } + + // Call the kernel to perform element-wise modular multiplication + multiply_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? 
d_result : result, n); + + if (on_device) { + cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); + + cudaFreeAsync(d_vec_a, ctx.stream); + cudaFreeAsync(d_vec_b, ctx.stream); + cudaFreeAsync(d_result, ctx.stream); + } + + cudaStreamSynchronize(ctx.stream); + return cudaSuccess; +} + +template +cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +{ + // Set the grid and block dimensions + int num_threads = MAX_THREADS_PER_BLOCK; + int num_blocks = (n + num_threads - 1) / num_threads; + + E *d_vec_a, *d_vec_b, *d_result; + if (on_device) { + // Allocate memory on the device for the input vectors and the output vector + cudaMallocAsync(&d_vec_a, n * sizeof(E), ctx.stream); + cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); + cudaMallocAsync(&d_result, n * sizeof(E), ctx.stream); + + // Copy the input vectors from the host to the device + cudaMemcpyAsync(d_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, ctx.stream); + cudaMemcpyAsync(d_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, ctx.stream); + } + + // Call the kernel to perform element-wise addition + add_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? d_result : result, n); + + if (on_device) { + cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); + + cudaFreeAsync(d_vec_a, ctx.stream); + cudaFreeAsync(d_vec_b, ctx.stream); + cudaFreeAsync(d_result, ctx.stream); + } + + cudaStreamSynchronize(ctx.stream); + return cudaSuccess; +} + +template +cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +{ + // Set the grid and block dimensions + int num_threads = MAX_THREADS_PER_BLOCK; + int num_blocks = (n + num_threads - 1) / num_threads; + + E *d_vec_a, *d_vec_b, *d_result; + if (on_device) { + // Allocate memory on the device for the input vectors and the output vector + cudaMallocAsync(&d_vec_a, n * sizeof(E), ctx.stream); + cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); + cudaMallocAsync(&d_result, n * sizeof(E), ctx.stream); + + // Copy the input vectors from the host to the device + cudaMemcpyAsync(d_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, ctx.stream); + cudaMemcpyAsync(d_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, ctx.stream); + } + + // Call the kernel to perform element-wise subtraction + sub_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? d_result : result, n); + + if (on_device) { + cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); + + cudaFreeAsync(d_vec_a, ctx.stream); + cudaFreeAsync(d_vec_b, ctx.stream); + cudaFreeAsync(d_result, ctx.stream); + } + + cudaStreamSynchronize(ctx.stream); + return cudaSuccess; +} + +/** + * Extern version of [mul](@ref mul) function with the template parameters + * `S` and `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
+ */ +extern "C" cudaError_t mul_cuda( + curve_config::scalar_t* vec_a, + curve_config::scalar_t* vec_b, + uint32_t n, + bool on_device, + device_context::DeviceContext ctx, + curve_config::scalar_t* result +) { + return mul(vec_a, vec_b, n, on_device, ctx, result); +} + +/** + * Extern version of [add](@ref add) function with the template parameter + * `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + */ +extern "C" cudaError_t add_cuda( + curve_config::scalar_t* vec_a, + curve_config::scalar_t* vec_b, + uint32_t n, + bool on_device, + device_context::DeviceContext ctx, + curve_config::scalar_t* result +) { + return add(vec_a, vec_b, n, on_device, ctx, result); +} + +/** + * Extern version of [sub](@ref sub) function with the template parameter + * `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + */ +extern "C" cudaError_t sub_cuda( + curve_config::scalar_t* vec_a, + curve_config::scalar_t* vec_b, + uint32_t n, + bool on_device, + device_context::DeviceContext ctx, + curve_config::scalar_t* result +) { + return sub(vec_a, vec_b, n, on_device, ctx, result); +} + +} // namespace lde diff --git a/icicle/appUtils/lde/lde.cuh b/icicle/appUtils/lde/lde.cuh new file mode 100644 index 000000000..90df95c4b --- /dev/null +++ b/icicle/appUtils/lde/lde.cuh @@ -0,0 +1,57 @@ +#pragma once +#ifndef LDE_H +#define LDE_H + +/** + * @namespace lde + * LDE (stands for low degree extension) contains [NTT](@ref ntt)-based methods for translating between coefficient and evaluation domains of polynomials. + * It also contains methods for element-wise manipulation of vectors, which is useful for working with polynomials in evaluation domain. + */ +namespace lde { + +/** + * A function that multiplies two vectors element-wise. + * @param vec_a First input vector. + * @param vec_b Second input vector. + * @param n Size of vectors `vec_a` and `vec_b`. + * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. + * @param result Resulting vector - element-wise product of `vec_a` and `vec_b`, can be the same pointer as `vec_b`. + * @tparam S The type of scalars `vec_a`. + * @tparam E The type of elements `vec_b` and `result`. Often (but not always), `E=S`. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + */ +template +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); + +/** + * A function that adds two vectors element-wise. + * @param vec_a First input vector. + * @param vec_b Second input vector. + * @param n Size of vectors `vec_a` and `vec_b`. + * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. + * @param result Resulting vector - element-wise sum of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. + * @tparam E The type of elements `vec_a`, `vec_b` and `result`. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
+ */ +template +cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); + +/** + * A function that subtracts two vectors element-wise. + * @param vec_a First input vector. + * @param vec_b Second input vector. + * @param n Size of vectors `vec_a` and `vec_b`. + * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. + * @param result Resulting vector - element-wise difference of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. + * @tparam E The type of elements `vec_a`, `vec_b` and `result`. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + */ +template +cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); + +} // namespace lde + +#endif diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 885a83b6b..2089abddb 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -885,57 +885,17 @@ void batched_bucket_method_msm( } // namespace template -cudaError_t msm_internal(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results) +cudaError_t msm(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results) { // TODO: DmytroTym/HadarIngonyama - unify the implementation of the bucket method and the batched bucket method in one function // TODO: DmytroTym/HadarIngonyama - parameters to be included into the implementation: on deviceness of points, scalars and results, precompute factor, points size and device id if (config.batch_size == 1) - bucket_method_msm(config.bitsize, config.c, scalars, points, msm_size, results, config.are_scalars_on_device, config.big_triangle, config.large_bucket_factor, config.stream); + bucket_method_msm(config.bitsize, config.c, scalars, points, msm_size, results, config.are_scalars_on_device, config.big_triangle, config.large_bucket_factor, config.ctx.stream); else - batched_bucket_method_msm(config.bitsize, config.c, scalars, points, config.batch_size, msm_size, results, config.are_scalars_on_device, config.stream); + batched_bucket_method_msm(config.bitsize, config.c, scalars, points, config.batch_size, msm_size, results, config.are_scalars_on_device, config.ctx.stream); return cudaSuccess; } -template -cudaError_t msm(S* scalars, A* points, unsigned size, P* result) -{ - MSMConfig config = { - false, // are_scalars_on_device - true, // are_scalars_montgomery_form - size, // points_size - 1, // precompute_factor - false, // are_points_on_device - true, // are_points_montgomery_form - 1, // batch_size - false, // are_result_on_device - 16, // c - S::NBITS, // bitsize - false, // big_triangle - 10, // large_bucket_factor - 0, // device_id - 0 // stream - }; - return msm_internal(scalars, points, size, config, result); -} - -/** - * Extern version of [msm_internal](@ref msm_internal) function with the following values of template parameters - * (where the curve is given by `-DCURVE` env variable during build): - * - `S` is the [scalar field](@ref scalar_t) of the curve; - * - `A` is the [affine representation](@ref affine_t) of curve points; - * - `P` is the [projective representation](@ref projective_t) of curve points. - * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
- */ -extern "C" cudaError_t msm_internal_cuda( - curve_config::scalar_t* scalars, - curve_config::affine_t* points, - size_t msm_size, - MSMConfig config, - curve_config::projective_t* out) -{ - return msm_internal(scalars, points, msm_size, config, out); -} - /** * Extern version of [msm](@ref msm) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): @@ -947,32 +907,15 @@ extern "C" cudaError_t msm_internal_cuda( extern "C" cudaError_t msm_cuda( curve_config::scalar_t* scalars, curve_config::affine_t* points, - size_t size, + size_t msm_size, + MSMConfig config, curve_config::projective_t* out) { - return msm(scalars, points, size, out); + return msm(scalars, points, msm_size, config, out); } #if defined(G2_DEFINED) -/** - * Extern version of [msm_internal](@ref msm_internal) function with the following values of template parameters - * (where the curve is given by `-DCURVE` env variable during build): - * - `S` is the [scalar field](@ref scalar_t) of the curve; - * - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points; - * - `P` is the [projective representation](@ref g2_projective_t) of G2 curve points. - * @return `cudaSuccess` if the execution was successful and an error code otherwise. - */ -extern "C" cudaError_t g2_msm_internal_cuda( - curve_config::scalar_t* scalars, - curve_config::g2_affine_t* points, - size_t msm_size, - MSMConfig config, - curve_config::g2_projective_t* out) -{ - return msm_internal(scalars, points, msm_size, config, out); -} - /** * Extern version of [msm](@ref msm) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): @@ -984,10 +927,11 @@ extern "C" cudaError_t g2_msm_internal_cuda( extern "C" cudaError_t g2_msm_cuda( curve_config::scalar_t* scalars, curve_config::g2_affine_t* points, - size_t size, + size_t msm_size, + MSMConfig config, curve_config::g2_projective_t* out) { - return msm(scalars, points, size, out); + return msm(scalars, points, msm_size, config, out); } #endif diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index ae3adf303..6dba1bcce 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -3,6 +3,7 @@ #define MSM_H #include "cuda_runtime_api.h" +#include "../../utils/device_context.cuh" /** * @namespace msm @@ -24,7 +25,7 @@ namespace msm { /** * @struct MSMConfig - * Struct that encodes MSM parameters to be passed into the [msm_internal](@ref msm_internal) function. + * Struct that encodes MSM parameters to be passed into the [msm](@ref msm) function. */ struct MSMConfig { bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value: false. */ @@ -50,8 +51,7 @@ struct MSMConfig { unsigned large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur very frequently. * Useful for efficient treatment of non-uniform distributions of scalars and "top windows" with few bits. * Can be set to 0 to disable separate treatment of large buckets altogether. Default value: 10. */ - unsigned device_id; /**< Index of the GPU to run the MSM on. Default value: 0. */ - cudaStream_t stream; /**< Stream to use. Default value: 0. */ + device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). 
*/ }; /** @@ -68,13 +68,7 @@ struct MSMConfig { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t msm_internal(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results); - -/** - * A function that computes MSM by calling [msm_internal](@ref msm_internal) function with default [MSMConfig](@ref MSMConfig) values. - */ -template -cudaError_t msm(S* scalars, A* points, unsigned size, P* result); +cudaError_t msm(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results); } // namespace msm diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu index 7ac2b62c3..b2be1e398 100644 --- a/icicle/appUtils/ntt/ntt.cu +++ b/icicle/appUtils/ntt/ntt.cu @@ -1,5 +1,4 @@ #include "../../utils/sharedmem.cuh" -#include "../vector_manipulation/ve_mod_mult.cuh" #include "../../curves/curve_config.cuh" #include "ntt.cuh" @@ -31,8 +30,31 @@ __global__ void twiddle_factors_kernel(S* d_twiddles, uint32_t n_twiddles, S ome } } -template -__global__ void reverse_order_kernel(T* arr, T* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size) +/** + * Multiply the elements of an input array by a scalar in-place. Used for normalization in iNTT. + * @param arr input array. + * @param n size of arr. + * @param n_inv scalar of type S (scalar). + */ +template +__global__ void template_normalize_kernel(E* arr, uint32_t n, S scalar) +{ + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < n) { arr[tid] = scalar * arr[tid]; } +} + +template +__global__ void batchVectorMult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size) +{ + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < n_scalars * batch_size) { + int scalar_id = tid % n_scalars; + element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid]; + } +} + +template +__global__ void reverse_order_kernel(E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size) { int threadId = (blockIdx.x * blockDim.x) + threadIdx.x; if (threadId < n * batch_size) { @@ -52,15 +74,15 @@ __global__ void reverse_order_kernel(T* arr, T* arr_reversed, uint32_t n, uint32 * @param logn log(n). * @param batch_size the size of the batch. */ -template -void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream) +template +void reverse_order_batch(E* arr, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream) { - T* arr_reversed; - cudaMallocAsync(&arr_reversed, n * batch_size * sizeof(T), stream); + E* arr_reversed; + cudaMallocAsync(&arr_reversed, n * batch_size * sizeof(E), stream); int number_of_threads = MAX_THREADS_BATCH; int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads; reverse_order_kernel<<>>(arr, arr_reversed, n, logn, batch_size); - cudaMemcpyAsync(arr, arr_reversed, n * batch_size * sizeof(T), cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync(arr, arr_reversed, n * batch_size * sizeof(E), cudaMemcpyDeviceToDevice, stream); cudaFreeAsync(arr_reversed, stream); } @@ -72,8 +94,8 @@ void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size, * @param n length of `arr`. * @param logn log(n). 
*/ -template -void reverse_order(T* arr, uint32_t n, uint32_t logn, cudaStream_t stream) +template +void reverse_order(E* arr, uint32_t n, uint32_t logn, cudaStream_t stream) { reverse_order_batch(arr, n, logn, 1, stream); } @@ -311,14 +333,14 @@ void ntt_inplace_batch_template( <<>>(d_inout, n, d_twiddles, n, total_tasks, s, false); } - if (is_coset) batch_vector_mult(coset, d_inout, n, batch_size, stream); + if (is_coset) batchVectorMult<<>>(coset, d_inout, n, batch_size); num_threads = max(min(n / 2, MAX_NUM_THREADS), 1); num_blocks = (n * batch_size + num_threads - 1) / num_threads; template_normalize_kernel <<>>(d_inout, n * batch_size, S::inv_log_size(logn)); } else { - if (is_coset) batch_vector_mult(coset, d_inout, n, batch_size, stream); + if (is_coset) batchVectorMult<<>>(coset, d_inout, n, batch_size); for (int s = logn - 1; s >= logn_shmem; s--) // TODO: this loop also can be unrolled { @@ -338,32 +360,33 @@ void ntt_inplace_batch_template( } // namespace template -cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, unsigned device_id, cudaStream_t stream) +cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, device_context::DeviceContext ctx) { - twiddle_factors_kernel<<<1, 1, 0, stream>>>(d_twiddles, n_twiddles, omega); - cudaStreamSynchronize(stream); + twiddle_factors_kernel<<<1, 1, 0, ctx.stream>>>(d_twiddles, n_twiddles, omega); + cudaStreamSynchronize(ctx.stream); return cudaSuccess; } template -cudaError_t ntt_internal(E* input, unsigned size, bool is_inverse, NTTConfig config) +cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) { uint32_t logn = uint32_t(log(size) / log(2)); uint32_t n_twiddles = size; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order. size_t input_size = size * config.batch_size * sizeof(E); bool on_device = config.are_inputs_on_device; bool generate_twiddles = (config.twiddles == nullptr); + cudaStream_t stream = config.ctx.stream; S* d_twiddles; if (generate_twiddles) { - cudaMallocAsync(&d_twiddles, n_twiddles * sizeof(S), config.stream); - generate_twiddle_factors(d_twiddles, n_twiddles, is_inverse ? S::omega_inv(logn) : S::omega(logn), config.device_id, config.stream); + cudaMallocAsync(&d_twiddles, n_twiddles * sizeof(S), stream); + generate_twiddle_factors(d_twiddles, n_twiddles, is_inverse ? S::omega_inv(logn) : S::omega(logn), config.ctx); } E* d_input; if (on_device) { - cudaMallocAsync(&d_input, input_size, config.stream); - cudaMemcpyAsync(d_input, input, input_size, cudaMemcpyHostToDevice, config.stream); + cudaMallocAsync(&d_input, input_size, stream); + cudaMemcpyAsync(d_input, input, input_size, cudaMemcpyHostToDevice, stream); } bool reverse_input; @@ -388,56 +411,37 @@ cudaError_t ntt_internal(E* input, unsigned size, bool is_inverse, NTTConfig } if (reverse_input) - reverse_order_batch(on_device ? d_input : input, size, logn, config.batch_size, config.stream); + reverse_order_batch(on_device ? d_input : input, size, logn, config.batch_size, stream); ntt_inplace_batch_template(on_device ? d_input : input, generate_twiddles ? d_twiddles : config.twiddles, size, - config.batch_size, is_inverse, config.is_coset, config.coset_gen, config.stream, false); + config.batch_size, is_inverse, config.is_coset, config.coset_gen, stream, false); if (reverse_output) - reverse_order_batch(on_device ? d_input : input, size, logn, config.batch_size, config.stream); + reverse_order_batch(on_device ? 
d_input : input, size, logn, config.batch_size, stream); if (on_device) { - cudaMemcpyAsync(input, d_input, input_size, cudaMemcpyDeviceToHost, config.stream); - cudaFreeAsync(d_input, config.stream); + cudaMemcpyAsync(input, d_input, input_size, cudaMemcpyDeviceToHost, stream); + cudaFreeAsync(d_input, stream); } if (generate_twiddles) { - cudaFreeAsync(d_twiddles, config.stream); + cudaFreeAsync(d_twiddles, stream); } - cudaStreamSynchronize(config.stream); + cudaStreamSynchronize(stream); return cudaSuccess; } -template -cudaError_t ntt(E* input, unsigned size, bool is_inverse) -{ - NTTConfig config = { - false, // are_inputs_on_device - Ordering::kNN, // ordering - Decimation::kDIF, // decimation - Butterfly::kCooleyTukey, // butterfly - false, // is_coset - (S *)nullptr, // coset_gen - (S *)nullptr, // twiddles - 1, // batch_size - 0, // device_id - 0 // stream - }; - return ntt_internal(input, size, is_inverse, config); -} - /** - * Extern version of [ntt_internal](@ref ntt_internal) function with the following values of template parameters - * (where the curve is given by `-DCURVE` env variable during build): - * - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve; + * Extern version of [generate_twiddle_factors](@ref generate_twiddle_factors) function with the template parameter + * `S` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t ntt_internal_cuda( - curve_config::scalar_t* input, - unsigned size, - bool is_inverse, - NTTConfig config) -{ - return ntt_internal(input, size, is_inverse, config); +extern "C" cudaError_t generate_twiddle_factors_cuda( + curve_config::scalar_t* d_twiddles, + uint32_t n_twiddles, + curve_config::scalar_t omega, + device_context::DeviceContext ctx +) { + return generate_twiddle_factors(d_twiddles, n_twiddles, omega, ctx); } /** @@ -449,29 +453,14 @@ extern "C" cudaError_t ntt_internal_cuda( extern "C" cudaError_t ntt_cuda( curve_config::scalar_t* input, unsigned size, - bool is_inverse) -{ - return ntt(input, size, is_inverse); -} - -#if defined(ECNTT_DEFINED) - -/** - * Extern version of [ntt_internal](@ref ntt_internal) function with the following values of template parameters - * (where the curve is given by `-DCURVE` env variable during build): - * - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed); - * - `E` is the [scalar field](@ref scalar_t) of the curve; - * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
- */ -extern "C" cudaError_t ecntt_internal_cuda( - curve_config::projective_t* input, - unsigned size, bool is_inverse, NTTConfig config) { - return ntt_internal(input, size, is_inverse, config); + return ntt(input, size, is_inverse, config); } +#if defined(ECNTT_DEFINED) + /** * Extern version of [ntt](@ref ntt) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): @@ -482,9 +471,10 @@ extern "C" cudaError_t ecntt_internal_cuda( extern "C" cudaError_t ecntt_cuda( curve_config::projective_t* input, unsigned size, - bool is_inverse) + bool is_inverse, + NTTConfig config) { - return ntt(input, size, is_inverse); + return ntt(input, size, is_inverse, config); } #endif diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index 3707b34f9..3059cb572 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -3,6 +3,7 @@ #define NTT_H #include "cuda_runtime_api.h" +#include "../../utils/device_context.cuh" /** * @namespace ntt @@ -22,13 +23,12 @@ namespace ntt { * @param d_twiddles Input empty array on device to which twiddles are to be written. * @param n_twiddles Number of twiddle \f$ n \f$ factors to generate. * @param omega Root of unity \f$ \omega \f$. - * @param device_id ID of the device to use. - * @param stream Stream to use. + * @param ctx Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). * @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, unsigned device_id, cudaStream_t stream); +cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, device_context::DeviceContext ctx); /** * @enum Ordering @@ -58,7 +58,7 @@ enum class Butterfly { kCooleyTukey, kGentlemanSande }; /** * @struct NTTConfig - * Struct that encodes NTT parameters to be passed into the [ntt_internal](@ref ntt_internal) function. + * Struct that encodes NTT parameters to be passed into the [ntt](@ref ntt) function. */ template struct NTTConfig { @@ -83,8 +83,7 @@ struct NTTConfig { * are generated online using the default generator (TODO: link to twiddle gen here) and function * [generate_twiddle_factors](@ref generate_twiddle_factors). Default value: `null`. */ unsigned batch_size; /**< The number of NTTs to compute. Default value: 1. */ - unsigned device_id; /**< Index of the GPU to run the NTT on. Default value: 0. */ - cudaStream_t stream; /**< Stream to use. Default value: 0. */ + device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */ }; /** @@ -99,13 +98,7 @@ struct NTTConfig { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t ntt_internal(E* input, unsigned size, bool is_inverse, NTTConfig config); - -/** - * A function that computes NTT by calling [ntt_internal](@ref ntt_internal) function with default [NTTConfig](@ref NTTConfig) values. 
- */ -template -cudaError_t ntt(E* input, unsigned size, bool is_inverse); +cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config); } // namespace ntt diff --git a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh b/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh deleted file mode 100644 index 36859c49f..000000000 --- a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh +++ /dev/null @@ -1,135 +0,0 @@ -#ifndef VEC_MULT -#define VEC_MULT -#pragma once -#include -#include - -#define MAX_THREADS_PER_BLOCK 256 - -/** - * Multiply the elements of an input array by a scalar in-place. - * @param arr input array. - * @param n size of arr. - * @param n_inv scalar of type S (scalar). - */ -template -__global__ void template_normalize_kernel(E* arr, uint32_t n, S scalar) -{ - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < n) { arr[tid] = scalar * arr[tid]; } -} - -// TODO: headers for prototypes and .c .cpp .cu files for implementations -template -__global__ void vectorModMult(S* scalar_vec, E* element_vec, E* result, size_t n_elments) -{ - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < n_elments) { result[tid] = scalar_vec[tid] * element_vec[tid]; } -} - -template -int vector_mod_mult(S* vec_a, E* vec_b, E* result, size_t n_elments, cudaStream_t stream) // TODO: in place so no need - // for third result vector -{ - // Set the grid and block dimensions - int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK); - int threads_per_block = MAX_THREADS_PER_BLOCK; - - // Allocate memory on the device for the input vectors, the output vector, and the modulus - S* d_vec_a; - E *d_vec_b, *d_result; - cudaMallocAsync(&d_vec_a, n_elments * sizeof(S), stream); - cudaMallocAsync(&d_vec_b, n_elments * sizeof(E), stream); - cudaMallocAsync(&d_result, n_elments * sizeof(E), stream); - - // Copy the input vectors and the modulus from the host to the device - cudaMemcpyAsync(d_vec_a, vec_a, n_elments * sizeof(S), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_vec_b, vec_b, n_elments * sizeof(E), cudaMemcpyHostToDevice, stream); - - // Call the kernel to perform element-wise modular multiplication - vectorModMult<<>>(d_vec_a, d_vec_b, d_result, n_elments); - - cudaMemcpyAsync(result, d_result, n_elments * sizeof(E), cudaMemcpyDeviceToHost, stream); - - cudaFreeAsync(d_vec_a, stream); - cudaFreeAsync(d_vec_b, stream); - cudaFreeAsync(d_result, stream); - - cudaStreamSynchronize(stream); - return 0; -} - -template -int vector_mod_mult_device( - S* d_vec_a, E* d_vec_b, E* d_result, size_t n_elments) // TODO: in place so no need for third result vector -{ - // Set the grid and block dimensions - int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK); - int threads_per_block = MAX_THREADS_PER_BLOCK; - - // Call the kernel to perform element-wise modular multiplication - vectorModMult<<>>(d_vec_a, d_vec_b, d_result, n_elments); - return 0; -} - -template -__global__ void batchVectorMult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size) -{ - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < n_scalars * batch_size) { - int scalar_id = tid % n_scalars; - element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid]; - } -} - -template -int batch_vector_mult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size, cudaStream_t stream) -{ - // Set the grid and block dimensions - int NUM_THREADS = MAX_THREADS_PER_BLOCK; - int NUM_BLOCKS = (n_scalars * batch_size + NUM_THREADS - 1) / NUM_THREADS; 
- batchVectorMult<<>>(scalar_vec, element_vec, n_scalars, batch_size); - return 0; -} - -template -__global__ void matrixVectorMult(E* matrix_elements, E* vector_elements, E* result, size_t dim) -{ - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < dim) { - result[tid] = E::zero(); - for (int i = 0; i < dim; i++) - result[tid] = result[tid] + matrix_elements[tid * dim + i] * vector_elements[i]; - } -} - -template -int matrix_mod_mult(E* matrix_elements, E* vector_elements, E* result, size_t dim, cudaStream_t stream) -{ - // Set the grid and block dimensions - int num_blocks = (int)ceil((float)dim / MAX_THREADS_PER_BLOCK); - int threads_per_block = MAX_THREADS_PER_BLOCK; - - // Allocate memory on the device for the input vectors, the output vector, and the modulus - E *d_matrix, *d_vector, *d_result; - cudaMallocAsync(&d_matrix, (dim * dim) * sizeof(E), stream); - cudaMallocAsync(&d_vector, dim * sizeof(E), stream); - cudaMallocAsync(&d_result, dim * sizeof(E), stream); - - // Copy the input vectors and the modulus from the host to the device - cudaMemcpyAsync(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice, stream); - - // Call the kernel to perform element-wise modular multiplication - matrixVectorMult<<>>(d_matrix, d_vector, d_result, dim); - - cudaMemcpyAsync(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost, stream); - - cudaFreeAsync(d_matrix, stream); - cudaFreeAsync(d_vector, stream); - cudaFreeAsync(d_result, stream); - - cudaStreamSynchronize(stream); - return 0; -} -#endif \ No newline at end of file diff --git a/icicle/utils/device_context.cuh b/icicle/utils/device_context.cuh new file mode 100644 index 000000000..e29d6a1b1 --- /dev/null +++ b/icicle/utils/device_context.cuh @@ -0,0 +1,20 @@ +#pragma once +#ifndef DEVICE_CONTEXT_H +#define DEVICE_CONTEXT_H + +#include + +namespace device_context { + +/** + * Properties of the device used in icicle functions. + */ +struct DeviceContext { + unsigned device_id; /**< Index of the currently used GPU. Default value: 0. */ + cudaStream_t stream; /**< Stream to use. Default value: 0. */ + cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. 
*/ +}; + +} // namespace device_context + +#endif From df1fc7ed6068e8eab9d81a8d6002949524597a19 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Wed, 4 Oct 2023 08:50:13 +0300 Subject: [PATCH 3/8] added lde to cmake --- icicle/CMakeLists.txt | 7 ++++++- icicle/appUtils/lde/lde.cu | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 72e8c14a7..7d1d156ce 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -44,6 +44,7 @@ if (NOT BUILD_TESTS) icicle appUtils/msm/msm.cu appUtils/ntt/ntt.cu + appUtils/lde/lde.cu ) target_compile_options(icicle PRIVATE -c) @@ -52,10 +53,14 @@ if (NOT BUILD_TESTS) POST_BUILD COMMAND ${CMAKE_OBJCOPY} ARGS --redefine-sym msm_cuda=${CURVE}_msm_cuda ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o COMMAND ${CMAKE_OBJCOPY} ARGS --prefix-symbols=${CURVE}_ ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o - COMMAND ${CMAKE_AR} ARGS -rcs ${LIBRARY_OUTPUT_DIRECTORY}/libingo_${CURVE}.a ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o + COMMAND ${CMAKE_OBJCOPY} ARGS --prefix-symbols=${CURVE}_ ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/lde/lde.cu.o + COMMAND ${CMAKE_AR} ARGS -rcs ${LIBRARY_OUTPUT_DIRECTORY}/libingo_${CURVE}.a ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o + ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o + ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/lde/lde.cu.o ) file(REMOVE ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/msm/msm.cu.o) file(REMOVE ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/ntt/ntt.cu.o) + file(REMOVE ${PROJECT_BINARY_DIR}/CMakeFiles/icicle.dir/appUtils/lde/lde.cu.o) else() diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu index 89a9a12b4..2f33a3c1e 100644 --- a/icicle/appUtils/lde/lde.cu +++ b/icicle/appUtils/lde/lde.cu @@ -53,7 +53,7 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } // Call the kernel to perform element-wise modular multiplication - multiply_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? d_result : result, n); + mul_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result); if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); @@ -87,7 +87,7 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } // Call the kernel to perform element-wise addition - add_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? d_result : result, n); + add_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result); if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); @@ -121,7 +121,7 @@ cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } // Call the kernel to perform element-wise subtraction - sub_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, on_device ? d_result : result, n); + sub_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? 
d_result : result); if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); From d8e26e8b502d6b8fb54ecad523e48ef652b239ec Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Wed, 4 Oct 2023 14:34:38 +0300 Subject: [PATCH 4/8] Montgomery param added in lde.cu mul function --- icicle/appUtils/lde/lde.cu | 8 +++----- icicle/appUtils/lde/lde.cuh | 4 +++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu index 2f33a3c1e..554b7665e 100644 --- a/icicle/appUtils/lde/lde.cu +++ b/icicle/appUtils/lde/lde.cu @@ -33,7 +33,7 @@ __global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* resu } // namespace template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -57,7 +57,6 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::De if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); - cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); cudaFreeAsync(d_result, ctx.stream); @@ -91,7 +90,6 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); - cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); cudaFreeAsync(d_result, ctx.stream); @@ -125,7 +123,6 @@ cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De if (on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); - cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); cudaFreeAsync(d_result, ctx.stream); @@ -145,10 +142,11 @@ extern "C" cudaError_t mul_cuda( curve_config::scalar_t* vec_b, uint32_t n, bool on_device, + bool is_montgomery, device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return mul(vec_a, vec_b, n, on_device, ctx, result); + return mul(vec_a, vec_b, n, on_device, is_montgomery, ctx, result); } /** diff --git a/icicle/appUtils/lde/lde.cuh b/icicle/appUtils/lde/lde.cuh index 90df95c4b..d11c984a2 100644 --- a/icicle/appUtils/lde/lde.cuh +++ b/icicle/appUtils/lde/lde.cuh @@ -15,6 +15,8 @@ namespace lde { * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param is_montgomery If true, inputs are expected to be Montgomery form and results are retured in Montgomery form. + * If false - inputs and outputs are non-Montgomery. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise product of `vec_a` and `vec_b`, can be the same pointer as `vec_b`. * @tparam S The type of scalars `vec_a`. @@ -22,7 +24,7 @@ namespace lde { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result); /** * A function that adds two vectors element-wise. 
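For reference between the two patches, here is a minimal host-side sketch of how the reworked vector API looks as of PATCH 4/8: the `lde::mul` signature, the `is_montgomery` flag and the `DeviceContext` field layout are taken from the patches above, while the include paths, the `scalar_t::one()` initialisation and the assumption that the templated entry point is visible to the calling translation unit are illustrative only. PATCH 5/8 directly below renames the flag to `is_on_device` and fixes the internal device checks (so host inputs are then staged to the GPU automatically), and PATCH 6/8 later CamelCases the functions to `Mul`/`Add`/`Sub` without changing this calling convention.

    // Illustrative sketch only; assumes an icicle build for a concrete curve so that
    // curve_config::scalar_t is defined, and that lde::mul is visible to this TU.
    #include <vector>
    #include <cuda_runtime.h>
    #include "appUtils/lde/lde.cuh"          // assumed include path
    #include "utils/device_context.cuh"      // assumed include path

    int run_host_side_mul(int n)
    {
      // Host-side inputs and output buffer; the on-device flag is therefore false.
      std::vector<curve_config::scalar_t> a(n, curve_config::scalar_t::one());
      std::vector<curve_config::scalar_t> b(n, curve_config::scalar_t::one());
      std::vector<curve_config::scalar_t> out(n);

      // Device 0, default stream and default mempool (fields per utils/device_context.cuh).
      device_context::DeviceContext ctx{0, nullptr, nullptr};

      // Calling convention as of PATCH 4/8: (vec_a, vec_b, n, on_device, is_montgomery, ctx, result).
      cudaError_t err = lde::mul(a.data(), b.data(), n,
                                 /*on_device=*/false, /*is_montgomery=*/false, ctx, out.data());
      return err == cudaSuccess ? 0 : 1;
    }

The same pattern carries over to `lde::add`/`lde::sub` (which take no `is_montgomery` flag) and, through the `MSMConfig`/`NTTConfig` structs that now embed a `DeviceContext`, to the `msm` and `ntt` entry points reworked in PATCH 2/8.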
From 7d4e266ed51febe7debeb391bafd7ac32af0e9a8 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Wed, 4 Oct 2023 16:04:01 +0300 Subject: [PATCH 5/8] fixed on_device for ntt and lde --- icicle/appUtils/lde/lde.cu | 36 ++++++++++++++++++------------------ icicle/appUtils/lde/lde.cuh | 12 ++++++------ icicle/appUtils/ntt/ntt.cu | 12 ++++++------ 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu index 554b7665e..56e1ca2a6 100644 --- a/icicle/appUtils/lde/lde.cu +++ b/icicle/appUtils/lde/lde.cu @@ -33,7 +33,7 @@ __global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* resu } // namespace template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result) +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -41,7 +41,7 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery S* d_vec_a; E *d_vec_b, *d_result; - if (on_device) { + if (!is_on_device) { // Allocate memory on the device for the input vectors and the output vector cudaMallocAsync(&d_vec_a, n * sizeof(S), ctx.stream); cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); @@ -53,9 +53,9 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery } // Call the kernel to perform element-wise modular multiplication - mul_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result); + mul_kernel<<>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? result : d_result); - if (on_device) { + if (!is_on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); @@ -67,14 +67,14 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery } template -cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; int num_blocks = (n + num_threads - 1) / num_threads; E *d_vec_a, *d_vec_b, *d_result; - if (on_device) { + if (!is_on_device) { // Allocate memory on the device for the input vectors and the output vector cudaMallocAsync(&d_vec_a, n * sizeof(E), ctx.stream); cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); @@ -86,9 +86,9 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } // Call the kernel to perform element-wise addition - add_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result); + add_kernel<<>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? 
result : d_result); - if (on_device) { + if (!is_on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); @@ -100,14 +100,14 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } template -cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result) +cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; int num_blocks = (n + num_threads - 1) / num_threads; E *d_vec_a, *d_vec_b, *d_result; - if (on_device) { + if (!is_on_device) { // Allocate memory on the device for the input vectors and the output vector cudaMallocAsync(&d_vec_a, n * sizeof(E), ctx.stream); cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream); @@ -119,9 +119,9 @@ cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De } // Call the kernel to perform element-wise subtraction - sub_kernel<<>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result); + sub_kernel<<>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? result : d_result); - if (on_device) { + if (!is_on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); cudaFreeAsync(d_vec_a, ctx.stream); cudaFreeAsync(d_vec_b, ctx.stream); @@ -141,12 +141,12 @@ extern "C" cudaError_t mul_cuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, uint32_t n, - bool on_device, + bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return mul(vec_a, vec_b, n, on_device, is_montgomery, ctx, result); + return mul(vec_a, vec_b, n, is_on_device, is_montgomery, ctx, result); } /** @@ -158,11 +158,11 @@ extern "C" cudaError_t add_cuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, uint32_t n, - bool on_device, + bool is_on_device, device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return add(vec_a, vec_b, n, on_device, ctx, result); + return add(vec_a, vec_b, n, is_on_device, ctx, result); } /** @@ -174,11 +174,11 @@ extern "C" cudaError_t sub_cuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, uint32_t n, - bool on_device, + bool is_on_device, device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return sub(vec_a, vec_b, n, on_device, ctx, result); + return sub(vec_a, vec_b, n, is_on_device, ctx, result); } } // namespace lde diff --git a/icicle/appUtils/lde/lde.cuh b/icicle/appUtils/lde/lde.cuh index d11c984a2..129baded1 100644 --- a/icicle/appUtils/lde/lde.cuh +++ b/icicle/appUtils/lde/lde.cuh @@ -14,7 +14,7 @@ namespace lde { * @param vec_a First input vector. * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. - * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param is_montgomery If true, inputs are expected to be Montgomery form and results are retured in Montgomery form. * If false - inputs and outputs are non-Montgomery. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. @@ -24,35 +24,35 @@ namespace lde { * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
*/ template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result); +cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result); /** * A function that adds two vectors element-wise. * @param vec_a First input vector. * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. - * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise sum of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. * @tparam E The type of elements `vec_a`, `vec_b` and `result`. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); +cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result); /** * A function that subtracts two vectors element-wise. * @param vec_a First input vector. * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. - * @param on_device If true, inputs and outputs are on device, if false - on the host. + * @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise difference of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. * @tparam E The type of elements `vec_a`, `vec_b` and `result`. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result); +cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result); } // namespace lde diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu index b2be1e398..6837db39c 100644 --- a/icicle/appUtils/ntt/ntt.cu +++ b/icicle/appUtils/ntt/ntt.cu @@ -373,7 +373,7 @@ cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) uint32_t logn = uint32_t(log(size) / log(2)); uint32_t n_twiddles = size; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order. size_t input_size = size * config.batch_size * sizeof(E); - bool on_device = config.are_inputs_on_device; + bool is_on_device = config.are_inputs_on_device; bool generate_twiddles = (config.twiddles == nullptr); cudaStream_t stream = config.ctx.stream; @@ -384,7 +384,7 @@ cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) } E* d_input; - if (on_device) { + if (!is_on_device) { cudaMallocAsync(&d_input, input_size, stream); cudaMemcpyAsync(d_input, input, input_size, cudaMemcpyHostToDevice, stream); } @@ -411,13 +411,13 @@ cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) } if (reverse_input) - reverse_order_batch(on_device ? d_input : input, size, logn, config.batch_size, stream); - ntt_inplace_batch_template(on_device ? d_input : input, generate_twiddles ? d_twiddles : config.twiddles, size, + reverse_order_batch(is_on_device ? 
input : d_input, size, logn, config.batch_size, stream); + ntt_inplace_batch_template(is_on_device ? input : d_input, generate_twiddles ? d_twiddles : config.twiddles, size, config.batch_size, is_inverse, config.is_coset, config.coset_gen, stream, false); if (reverse_output) - reverse_order_batch(on_device ? d_input : input, size, logn, config.batch_size, stream); + reverse_order_batch(is_on_device ? input : d_input, size, logn, config.batch_size, stream); - if (on_device) { + if (!is_on_device) { cudaMemcpyAsync(input, d_input, input_size, cudaMemcpyDeviceToHost, stream); cudaFreeAsync(d_input, stream); } From ec671969677d70f5da70bc77abc9747c87bdf511 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Thu, 5 Oct 2023 18:54:39 +0300 Subject: [PATCH 6/8] CamelCase --- icicle/appUtils/lde/lde.cu | 35 ++++++++++++++++++++--------------- icicle/appUtils/lde/lde.cuh | 6 +++--- icicle/appUtils/msm/msm.cu | 14 +++++++------- icicle/appUtils/msm/msm.cuh | 14 +++++++------- icicle/appUtils/ntt/ntt.cu | 22 +++++++++++----------- icicle/appUtils/ntt/ntt.cuh | 34 +++++++++++++++++----------------- 6 files changed, 65 insertions(+), 60 deletions(-) diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu index 56e1ca2a6..811a1c163 100644 --- a/icicle/appUtils/lde/lde.cu +++ b/icicle/appUtils/lde/lde.cu @@ -1,6 +1,10 @@ +#include "lde.cuh" + #include #include + #include "../../utils/device_context.cuh" +#include "../../utils/mont.cuh" #include "../../curves/curve_config.cuh" namespace lde { @@ -10,21 +14,21 @@ namespace { #define MAX_THREADS_PER_BLOCK 256 template -__global__ void mul_kernel(S* scalar_vec, E* element_vec, size_t n, E* result) +__global__ void mul_kernel(S* scalar_vec, E* element_vec, int n, E* result) { int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n) { result[tid] = scalar_vec[tid] * element_vec[tid]; } } template -__global__ void add_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* result) +__global__ void add_kernel(E* element_vec1, E* element_vec2, int n, E* result) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { result[tid] = element_vec1[tid] + element_vec2[tid]; } } template -__global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* result) +__global__ void sub_kernel(E* element_vec1, E* element_vec2, int n, E* result) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; } @@ -33,7 +37,7 @@ __global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* resu } // namespace template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result) +cudaError_t Mul(S* vec_a, E* vec_b, int n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -54,6 +58,7 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgom // Call the kernel to perform element-wise modular multiplication mul_kernel<<>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? result : d_result); + if (is_montgomery) mont::from_montgomery(is_on_device ? 
result : d_result, n, ctx.stream); if (!is_on_device) { cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream); @@ -67,7 +72,7 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgom } template -cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result) +cudaError_t Add(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -100,7 +105,7 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context: } template -cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result) +cudaError_t Sub(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -133,14 +138,14 @@ cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context: } /** - * Extern version of [mul](@ref mul) function with the template parameters + * Extern version of [Mul](@ref Mul) function with the template parameters * `S` and `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t mul_cuda( +extern "C" cudaError_t MulCuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, - uint32_t n, + int n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, @@ -150,14 +155,14 @@ extern "C" cudaError_t mul_cuda( } /** - * Extern version of [add](@ref add) function with the template parameter + * Extern version of [Add](@ref Add) function with the template parameter * `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t add_cuda( +extern "C" cudaError_t AddCuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, - uint32_t n, + int n, bool is_on_device, device_context::DeviceContext ctx, curve_config::scalar_t* result @@ -166,14 +171,14 @@ extern "C" cudaError_t add_cuda( } /** - * Extern version of [sub](@ref sub) function with the template parameter + * Extern version of [Sub](@ref Sub) function with the template parameter * `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t sub_cuda( +extern "C" cudaError_t SubCuda( curve_config::scalar_t* vec_a, curve_config::scalar_t* vec_b, - uint32_t n, + int n, bool is_on_device, device_context::DeviceContext ctx, curve_config::scalar_t* result diff --git a/icicle/appUtils/lde/lde.cuh b/icicle/appUtils/lde/lde.cuh index 129baded1..010920499 100644 --- a/icicle/appUtils/lde/lde.cuh +++ b/icicle/appUtils/lde/lde.cuh @@ -24,7 +24,7 @@ namespace lde { * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
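 *
 * For instance, a hypothetical device-resident, Montgomery-form call (the pointers are assumed to be
 * valid device allocations of `n` elements each):
 * @code
 *   lde::Mul(d_scalars, d_elements, n, true, true, ctx, d_result);  // result is returned in Montgomery form
 * @endcode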
*/ template -cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result); +cudaError_t Mul(S* vec_a, E* vec_b, int n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result); /** * A function that adds two vectors element-wise. @@ -38,7 +38,7 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool is_on_device, bool is_montgom * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result); +cudaError_t Add(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result); /** * A function that subtracts two vectors element-wise. @@ -52,7 +52,7 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context: * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool is_on_device, device_context::DeviceContext ctx, E* result); +cudaError_t Sub(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result); } // namespace lde diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 246d66dce..f077bb61f 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -897,7 +897,7 @@ void batched_bucket_method_msm( } // namespace template -cudaError_t msm(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results) +cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig config, P* results) { // TODO: DmytroTym/HadarIngonyama - unify the implementation of the bucket method and the batched bucket method in one function // TODO: DmytroTym/HadarIngonyama - parameters to be included into the implementation: on deviceness of points, scalars and results, precompute factor, points size and device id @@ -916,14 +916,14 @@ cudaError_t msm(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* r * - `P` is the [projective representation](@ref projective_t) of curve points. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t msm_cuda( +extern "C" cudaError_t MSMCuda( curve_config::scalar_t* scalars, curve_config::affine_t* points, - size_t msm_size, + int msm_size, MSMConfig config, curve_config::projective_t* out) { - return msm(scalars, points, msm_size, config, out); + return MSM(scalars, points, msm_size, config, out); } #if defined(G2_DEFINED) @@ -936,14 +936,14 @@ extern "C" cudaError_t msm_cuda( * - `P` is the [projective representation](@ref g2_projective_t) of G2 curve points. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t g2_msm_cuda( +extern "C" cudaError_t G2MSMCuda( curve_config::scalar_t* scalars, curve_config::g2_affine_t* points, - size_t msm_size, + int msm_size, MSMConfig config, curve_config::g2_projective_t* out) { - return msm(scalars, points, msm_size, config, out); + return MSM(scalars, points, msm_size, config, out); } #endif diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index 6dba1bcce..043f46683 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -30,25 +30,25 @@ namespace msm { struct MSMConfig { bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value: false. 
*/ bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value: true. */ - unsigned points_size; /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be a number + int points_size; /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be a number * of different points. So, if each MSM re-uses the same set of points, this variable is set equal * to the MSM size. And if every MSM uses a distinct set of points, it should be set to the product of * MSM size and [batch_size](@ref batch_size). Default value: 0 (meaning it's equal to the MSM size). */ - unsigned precompute_factor; /**< The number of extra points to pre-compute for each point. Larger values decrease the number of computations + int precompute_factor; /**< The number of extra points to pre-compute for each point. Larger values decrease the number of computations * to make, on-line memory footprint, but increase the static memory footprint. Default value: 1 (i.e. don't pre-compute). */ bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */ bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise. Default value: true. */ - unsigned batch_size; /**< The number of MSMs to compute. Default value: 1. */ + int batch_size; /**< The number of MSMs to compute. Default value: 1. */ bool are_result_on_device; /**< True if the results should be on device and false if they should be on host. Default value: false. */ - unsigned c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket method" + int c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket method" * that we use to solve the MSM problem. As a rule of thumb, larger value means more on-line memory * footprint but also more parallelism and less computational complexity (up to a certain point). * Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */ - unsigned bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field, but if a different + int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field, but if a different * (better) upper bound is known, it should be reflected in this variable. Default value: 0 (set to the bitsize of scalar field). */ bool big_triangle; /**< Whether to do "bucket accumulation" serially. Decreases computational complexity, but also greatly * decreases parallelism, so only suitable for large batches of MSMs. Default value: false. */ - unsigned large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur very frequently. + int large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur very frequently. * Useful for efficient treatment of non-uniform distributions of scalars and "top windows" with few bits. * Can be set to 0 to disable separate treatment of large buckets altogether. Default value: 10. */ device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */ @@ -68,7 +68,7 @@ struct MSMConfig { * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
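 *
 * A hypothetical single-MSM invocation with host-resident scalars and points; the field order follows
 * the MSMConfig definition above, and the concrete values (window size 16, large bucket factor 10) are
 * assumptions for illustration:
 * @code
 *   device_context::DeviceContext ctx = {0, 0, 0};
 *   MSMConfig config = {false, true, msm_size, 1, false, true, 1, false,
 *                       16, curve_config::scalar_t::NBITS, false, 10, ctx};
 *   curve_config::projective_t result;
 *   MSM<curve_config::scalar_t, curve_config::affine_t, curve_config::projective_t>(
 *     scalars, points, msm_size, config, &result);
 * @endcode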
*/ template -cudaError_t msm(S* scalars, A* points, unsigned msm_size, MSMConfig config, P* results); +cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig config, P* results); } // namespace msm diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu index 6837db39c..b48f2dde6 100644 --- a/icicle/appUtils/ntt/ntt.cu +++ b/icicle/appUtils/ntt/ntt.cu @@ -360,7 +360,7 @@ void ntt_inplace_batch_template( } // namespace template -cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, device_context::DeviceContext ctx) +cudaError_t GenerateTwiddleFactors(S* d_twiddles, int n_twiddles, S omega, device_context::DeviceContext ctx) { twiddle_factors_kernel<<<1, 1, 0, ctx.stream>>>(d_twiddles, n_twiddles, omega); cudaStreamSynchronize(ctx.stream); @@ -368,7 +368,7 @@ cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega } template -cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) +cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig config) { uint32_t logn = uint32_t(log(size) / log(2)); uint32_t n_twiddles = size; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order. @@ -435,9 +435,9 @@ cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config) * `S` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t generate_twiddle_factors_cuda( +extern "C" cudaError_t GenerateTwiddleFactorsCuda( curve_config::scalar_t* d_twiddles, - uint32_t n_twiddles, + int n_twiddles, curve_config::scalar_t omega, device_context::DeviceContext ctx ) { @@ -450,31 +450,31 @@ extern "C" cudaError_t generate_twiddle_factors_cuda( * - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve; * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t ntt_cuda( +extern "C" cudaError_t NTTCuda( curve_config::scalar_t* input, - unsigned size, + int size, bool is_inverse, NTTConfig config) { - return ntt(input, size, is_inverse, config); + return NTT(input, size, is_inverse, config); } #if defined(ECNTT_DEFINED) /** - * Extern version of [ntt](@ref ntt) function with the following values of template parameters + * Extern version of [NTT](@ref NTT) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): * - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed); * - `E` is the [scalar field](@ref scalar_t) of the curve; * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ -extern "C" cudaError_t ecntt_cuda( +extern "C" cudaError_t ECNTTCuda( curve_config::projective_t* input, - unsigned size, + int size, bool is_inverse, NTTConfig config) { - return ntt(input, size, is_inverse, config); + return NTT(input, size, is_inverse, config); } #endif diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index 3059cb572..ec8ad6695 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -18,18 +18,6 @@ */ namespace ntt { -/** - * Generates twiddles \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$ from root of unity \f$ \omega \f$ and stores them on device. - * @param d_twiddles Input empty array on device to which twiddles are to be written. 
- * @param n_twiddles Number of twiddle \f$ n \f$ factors to generate. - * @param omega Root of unity \f$ \omega \f$. - * @param ctx Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). - * @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. - * @return `cudaSuccess` if the execution was successful and an error code otherwise. - */ -template -cudaError_t generate_twiddle_factors(S* d_twiddles, uint32_t n_twiddles, S omega, device_context::DeviceContext ctx); - /** * @enum Ordering * How to order inputs and outputs of the NTT: @@ -76,13 +64,13 @@ struct NTTConfig { * on a coset of [twiddles](@ref twiddles) given by [the coset generator](@ref coset_gen), so: * \f$ \{coset\_gen\cdot\omega^0, coset\_gen\cdot\omega^1, \dots, coset\_gen\cdot\omega^{n-1}\} \f$. Default value: false. */ S* coset_gen; /**< The field element that generates a coset if [is_coset](@ref is_coset) is true. - * Otherwise should be set to null. Default value: `null`. */ + * Otherwise should be set to `nullptr`. Default value: `nullptr`. */ S* twiddles; /**< "Twiddle factors", (or "domain", or "roots of unity") on which the NTT is evaluated. * This pointer is expected to live on device. The order is as follows: - * \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `null`, twiddle factors + * \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle factors * are generated online using the default generator (TODO: link to twiddle gen here) and function - * [generate_twiddle_factors](@ref generate_twiddle_factors). Default value: `null`. */ - unsigned batch_size; /**< The number of NTTs to compute. Default value: 1. */ + * [generate_twiddle_factors](@ref generate_twiddle_factors). Default value: `nullptr`. */ + int batch_size; /**< The number of NTTs to compute. Default value: 1. */ device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */ }; @@ -98,7 +86,19 @@ struct NTTConfig { * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ template -cudaError_t ntt(E* input, unsigned size, bool is_inverse, NTTConfig config); +cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig config); + +/** + * Generates twiddles \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$ from root of unity \f$ \omega \f$ and stores them on device. + * @param d_twiddles Input empty array on device to which twiddles are to be written. + * @param n_twiddles Number of twiddle \f$ n \f$ factors to generate. + * @param omega Root of unity \f$ \omega \f$. + * @param ctx Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). + * @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
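 *
 * A hypothetical preparation of twiddles for an NTT of size 2^10; the allocation and the `ctx` object
 * are assumptions made for illustration:
 * @code
 *   int n_twiddles = 1 << 10;
 *   curve_config::scalar_t* d_twiddles;
 *   cudaMallocAsync(&d_twiddles, n_twiddles * sizeof(curve_config::scalar_t), ctx.stream);
 *   GenerateTwiddleFactors(d_twiddles, n_twiddles, curve_config::scalar_t::omega(10), ctx);
 * @endcode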
+ */ +template +cudaError_t GenerateTwiddleFactors(S* d_twiddles, int n_twiddles, S omega, device_context::DeviceContext ctx); } // namespace ntt From d6dfd59e9a0193dfbf4186e262db0769c7c0dc79 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Thu, 5 Oct 2023 23:49:14 +0300 Subject: [PATCH 7/8] fixed msm_test, int unification, google guilde --- icicle/appUtils/lde/lde.cu | 6 +-- icicle/appUtils/lde/lde.cuh | 8 +++- icicle/appUtils/msm/msm.cu | 17 ++++--- icicle/appUtils/msm/msm.cuh | 3 +- icicle/appUtils/msm/tests/msm_test.cu | 67 +++++++++++++++++---------- icicle/appUtils/ntt/ntt.cu | 41 ++++------------ icicle/appUtils/ntt/ntt.cuh | 5 +- icicle/utils/device_context.cuh | 2 +- icicle/utils/host_math.cuh | 4 ++ icicle/utils/mont.cuh | 25 ++++++++-- icicle/utils/utils_kernels.cu | 22 +++++++++ icicle/utils/utils_kernels.cuh | 21 +++++++++ 12 files changed, 145 insertions(+), 76 deletions(-) create mode 100644 icicle/utils/utils_kernels.cu create mode 100644 icicle/utils/utils_kernels.cuh diff --git a/icicle/appUtils/lde/lde.cu b/icicle/appUtils/lde/lde.cu index 811a1c163..f222de148 100644 --- a/icicle/appUtils/lde/lde.cu +++ b/icicle/appUtils/lde/lde.cu @@ -151,7 +151,7 @@ extern "C" cudaError_t MulCuda( device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return mul(vec_a, vec_b, n, is_on_device, is_montgomery, ctx, result); + return Mul(vec_a, vec_b, n, is_on_device, is_montgomery, ctx, result); } /** @@ -167,7 +167,7 @@ extern "C" cudaError_t AddCuda( device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return add(vec_a, vec_b, n, is_on_device, ctx, result); + return Add(vec_a, vec_b, n, is_on_device, ctx, result); } /** @@ -183,7 +183,7 @@ extern "C" cudaError_t SubCuda( device_context::DeviceContext ctx, curve_config::scalar_t* result ) { - return sub(vec_a, vec_b, n, is_on_device, ctx, result); + return Sub(vec_a, vec_b, n, is_on_device, ctx, result); } } // namespace lde diff --git a/icicle/appUtils/lde/lde.cuh b/icicle/appUtils/lde/lde.cuh index 010920499..3ccc403a6 100644 --- a/icicle/appUtils/lde/lde.cuh +++ b/icicle/appUtils/lde/lde.cuh @@ -2,6 +2,8 @@ #ifndef LDE_H #define LDE_H +#include "../../utils/device_context.cuh" + /** * @namespace lde * LDE (stands for low degree extension) contains [NTT](@ref ntt)-based methods for translating between coefficient and evaluation domains of polynomials. @@ -15,12 +17,12 @@ namespace lde { * @param vec_b Second input vector. * @param n Size of vectors `vec_a` and `vec_b`. * @param is_on_device If true, inputs and outputs are on device, if false - on the host. - * @param is_montgomery If true, inputs are expected to be Montgomery form and results are retured in Montgomery form. + * @param is_montgomery If true, inputs are expected to be in Montgomery form and results are retured in Montgomery form. * If false - inputs and outputs are non-Montgomery. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise product of `vec_a` and `vec_b`, can be the same pointer as `vec_b`. * @tparam S The type of scalars `vec_a`. - * @tparam E The type of elements `vec_b` and `result`. Often (but not always), `E=S`. + * @tparam E The type of elements `vec_b` and `result`. Often (but not always) `E=S`. * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
*/ template @@ -34,6 +36,7 @@ cudaError_t Mul(S* vec_a, E* vec_b, int n, bool is_on_device, bool is_montgomery * @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise sum of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. + * If inputs are in Montgomery form, the result is too, and vice versa: non-Montgomery inputs produce non-Montgomery result. * @tparam E The type of elements `vec_a`, `vec_b` and `result`. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ @@ -48,6 +51,7 @@ cudaError_t Add(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::De * @param is_on_device If true, inputs and outputs are on device, if false - on the host. * @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method. * @param result Resulting vector - element-wise difference of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`. + * If inputs are in Montgomery form, the result is too, and vice versa: non-Montgomery inputs produce non-Montgomery result. * @tparam E The type of elements `vec_a`, `vec_b` and `result`. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index f077bb61f..12c389dae 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -1,19 +1,22 @@ -#include "../../primitives/affine.cuh" -#include "../../primitives/field.cuh" -#include "../../primitives/projective.cuh" -#include "../../utils/cuda_utils.cuh" -#include "../../curves/curve_config.cuh" -#include "../../utils/error_handler.cuh" #include "msm.cuh" -#include + #include #include #include #include +#include + #include #include #include +#include "../../primitives/affine.cuh" +#include "../../primitives/field.cuh" +#include "../../primitives/projective.cuh" +#include "../../utils/cuda_utils.cuh" +#include "../../utils/error_handler.cuh" +#include "../../curves/curve_config.cuh" + namespace msm { namespace { diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index 043f46683..49ff198f4 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -2,7 +2,8 @@ #ifndef MSM_H #define MSM_H -#include "cuda_runtime_api.h" +#include + #include "../../utils/device_context.cuh" /** diff --git a/icicle/appUtils/msm/tests/msm_test.cu b/icicle/appUtils/msm/tests/msm_test.cu index b69680f38..e5bc8748b 100644 --- a/icicle/appUtils/msm/tests/msm_test.cu +++ b/icicle/appUtils/msm/tests/msm_test.cu @@ -1,15 +1,16 @@ -#include "../../primitives/field.cuh" -#include "../../primitives/projective.cuh" -#include "../../utils/cuda_utils.cuh" +#define CURVE_ID 1 + #include "msm.cu" + #include #include #include -// #include "../../curves/bls12_377/curve_config.cuh" -#include "../../curves/bn254/curve_config.cuh" -// using namespace BLS12_377; -using namespace BN254; +#include "../../primitives/field.cuh" +#include "../../primitives/projective.cuh" +#include "../../utils/cuda_utils.cuh" +#include "../../utils/device_context.cuh" +#include "../../curves/curve_config.cuh" class Dummy_Scalar { @@ -111,9 +112,9 @@ public: // switch between dummy and real: -typedef scalar_t test_scalar; -typedef projective_t test_projective; -typedef affine_t test_affine; +typedef curve_config::scalar_t test_scalar; +typedef curve_config::projective_t 
test_projective; +typedef curve_config::affine_t test_affine; // typedef Dummy_Scalar test_scalar; // typedef Dummy_Projective test_projective; @@ -121,10 +122,10 @@ typedef affine_t test_affine; int main() { - unsigned batch_size = 1; + int batch_size = 1; // unsigned msm_size = 1<<21; - unsigned msm_size = 12180757; - unsigned N = batch_size * msm_size; + int msm_size = 12180757; + int N = batch_size * msm_size; test_scalar* scalars = new test_scalar[N]; test_affine* points = new test_affine[N]; @@ -165,19 +166,39 @@ int main() // batched_large_msm(scalars, points, batch_size, msm_size, // batched_large_res, false); - cudaStream_t stream1; - cudaStream_t stream2; - cudaStreamCreate(&stream1); - cudaStreamCreate(&stream2); + cudaStream_t stream; + cudaStreamCreate(&stream); + + device_context::DeviceContext ctx = {0, // device_id + stream, // stream + 0, // mempool + }; + msm::MSMConfig config = { + false, // scalars_on_device + true, // scalars_montgomery_form + msm_size, // points_size + 1, // precompute_factor + false, // points_on_device + true, // points_montgomery_form + 1, // batch_size + false, // result_on_device + 16, // c + test_scalar::NBITS, // bitsize + false, // big_triangle + 10, // large_bucket_factor + ctx, // DeviceContext + }; + auto begin1 = std::chrono::high_resolution_clock::now(); - large_msm(scalars, points, msm_size, large_res, false, true, stream1); + msm::MSM(scalars, points, msm_size, config, large_res); auto end1 = std::chrono::high_resolution_clock::now(); auto elapsed1 = std::chrono::duration_cast(end1 - begin1); printf("Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9); + config.big_triangle = true; // std::cout<( - scalars_d, points_d, msm_size, large_res_d, true, false, stream2); + msm::MSM( + scalars_d, points_d, msm_size, config, large_res_d); // test_reduce_triangle(scalars); // test_reduce_rectangle(scalars); // test_reduce_single(scalars); @@ -185,10 +206,8 @@ int main() auto end = std::chrono::high_resolution_clock::now(); auto elapsed = std::chrono::duration_cast(end - begin); printf("On Device No Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9); - cudaStreamSynchronize(stream1); - cudaStreamSynchronize(stream2); - cudaStreamDestroy(stream1); - cudaStreamDestroy(stream2); + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); std::cout << test_projective::to_affine(large_res[0]) << std::endl; diff --git a/icicle/appUtils/ntt/ntt.cu b/icicle/appUtils/ntt/ntt.cu index b48f2dde6..990ae6062 100644 --- a/icicle/appUtils/ntt/ntt.cu +++ b/icicle/appUtils/ntt/ntt.cu @@ -1,6 +1,8 @@ +#include "ntt.cuh" + #include "../../utils/sharedmem.cuh" +#include "../../utils/utils_kernels.cuh" #include "../../curves/curve_config.cuh" -#include "ntt.cuh" namespace ntt { @@ -30,29 +32,6 @@ __global__ void twiddle_factors_kernel(S* d_twiddles, uint32_t n_twiddles, S ome } } -/** - * Multiply the elements of an input array by a scalar in-place. Used for normalization in iNTT. - * @param arr input array. - * @param n size of arr. - * @param n_inv scalar of type S (scalar). 
- */ -template -__global__ void template_normalize_kernel(E* arr, uint32_t n, S scalar) -{ - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < n) { arr[tid] = scalar * arr[tid]; } -} - -template -__global__ void batchVectorMult(S* scalar_vec, E* element_vec, unsigned n_scalars, unsigned batch_size) -{ - int tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < n_scalars * batch_size) { - int scalar_id = tid % n_scalars; - element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid]; - } -} - template __global__ void reverse_order_kernel(E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size) { @@ -333,14 +312,14 @@ void ntt_inplace_batch_template( <<>>(d_inout, n, d_twiddles, n, total_tasks, s, false); } - if (is_coset) batchVectorMult<<>>(coset, d_inout, n, batch_size); + if (is_coset) utils_internal::batchVectorMult<<>>(d_inout, coset, n, batch_size); num_threads = max(min(n / 2, MAX_NUM_THREADS), 1); num_blocks = (n * batch_size + num_threads - 1) / num_threads; - template_normalize_kernel - <<>>(d_inout, n * batch_size, S::inv_log_size(logn)); + utils_internal::template_normalize_kernel + <<>>(d_inout, S::inv_log_size(logn), n * batch_size); } else { - if (is_coset) batchVectorMult<<>>(coset, d_inout, n, batch_size); + if (is_coset) utils_internal::batchVectorMult<<>>(d_inout, coset, n, batch_size); for (int s = logn - 1; s >= logn_shmem; s--) // TODO: this loop also can be unrolled { @@ -380,7 +359,7 @@ cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig config) S* d_twiddles; if (generate_twiddles) { cudaMallocAsync(&d_twiddles, n_twiddles * sizeof(S), stream); - generate_twiddle_factors(d_twiddles, n_twiddles, is_inverse ? S::omega_inv(logn) : S::omega(logn), config.ctx); + GenerateTwiddleFactors(d_twiddles, n_twiddles, is_inverse ? S::omega_inv(logn) : S::omega(logn), config.ctx); } E* d_input; @@ -431,7 +410,7 @@ cudaError_t NTT(E* input, int size, bool is_inverse, NTTConfig config) } /** - * Extern version of [generate_twiddle_factors](@ref generate_twiddle_factors) function with the template parameter + * Extern version of [GenerateTwiddleFactors](@ref GenerateTwiddleFactors) function with the template parameter * `S` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build. * @return `cudaSuccess` if the execution was successful and an error code otherwise. */ @@ -441,7 +420,7 @@ extern "C" cudaError_t GenerateTwiddleFactorsCuda( curve_config::scalar_t omega, device_context::DeviceContext ctx ) { - return generate_twiddle_factors(d_twiddles, n_twiddles, omega, ctx); + return GenerateTwiddleFactors(d_twiddles, n_twiddles, omega, ctx); } /** diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index ec8ad6695..e1daf2f67 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -2,7 +2,8 @@ #ifndef NTT_H #define NTT_H -#include "cuda_runtime_api.h" +#include + #include "../../utils/device_context.cuh" /** @@ -69,7 +70,7 @@ struct NTTConfig { * This pointer is expected to live on device. The order is as follows: * \f$ \{\omega^0=1, \omega^1, \dots, \omega^{n-1}\} \f$. If this pointer is `nullptr`, twiddle factors * are generated online using the default generator (TODO: link to twiddle gen here) and function - * [generate_twiddle_factors](@ref generate_twiddle_factors). Default value: `nullptr`. */ + * [GenerateTwiddleFactors](@ref GenerateTwiddleFactors). Default value: `nullptr`. */ int batch_size; /**< The number of NTTs to compute. Default value: 1. 
*/ device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext). */ }; diff --git a/icicle/utils/device_context.cuh b/icicle/utils/device_context.cuh index e29d6a1b1..a5583991e 100644 --- a/icicle/utils/device_context.cuh +++ b/icicle/utils/device_context.cuh @@ -10,7 +10,7 @@ namespace device_context { * Properties of the device used in icicle functions. */ struct DeviceContext { - unsigned device_id; /**< Index of the currently used GPU. Default value: 0. */ + int device_id; /**< Index of the currently used GPU. Default value: 0. */ cudaStream_t stream; /**< Stream to use. Default value: 0. */ cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */ }; diff --git a/icicle/utils/host_math.cuh b/icicle/utils/host_math.cuh index 73922b925..ecc181844 100644 --- a/icicle/utils/host_math.cuh +++ b/icicle/utils/host_math.cuh @@ -1,4 +1,6 @@ #pragma once +#ifndef HOST_MATH_H +#define HOST_MATH_H #include #include @@ -94,3 +96,5 @@ namespace host_math { } }; } // namespace host_math + +#endif diff --git a/icicle/utils/mont.cuh b/icicle/utils/mont.cuh index d02d130d8..09c7a8732 100644 --- a/icicle/utils/mont.cuh +++ b/icicle/utils/mont.cuh @@ -1,19 +1,30 @@ #pragma once +#ifndef MONT_H +#define MONT_H -#include "../appUtils/vector_manipulation/ve_mod_mult.cuh" +#include "utils_kernels.cuh" +namespace mont { + +namespace { + +#define MAX_THREADS_PER_BLOCK 256 + +// TODO (DmytroTym): do valid conversion for point types too template -int convert_montgomery(E* d_inout, size_t n_elments, bool is_into, cudaStream_t stream) +int convert_montgomery(E* d_inout, size_t n, bool is_into, cudaStream_t stream) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; - int num_blocks = (n_elments + num_threads - 1) / num_threads; + int num_blocks = (n + num_threads - 1) / num_threads; E mont = is_into ? 
E::montgomery_r() : E::montgomery_r_inv(); - template_normalize_kernel<<>>(d_inout, n_elments, mont); + utils_internal::template_normalize_kernel<<>>(d_inout, mont, n); return 0; // TODO: void with propper error handling } +} // namespace + template int to_montgomery(E* d_inout, unsigned n, cudaStream_t stream) { @@ -24,4 +35,8 @@ template int from_montgomery(E* d_inout, unsigned n, cudaStream_t stream) { return convert_montgomery(d_inout, n, false, stream); -} \ No newline at end of file +} + +} // namespace mont + +#endif diff --git a/icicle/utils/utils_kernels.cu b/icicle/utils/utils_kernels.cu new file mode 100644 index 000000000..02b8e687a --- /dev/null +++ b/icicle/utils/utils_kernels.cu @@ -0,0 +1,22 @@ +#include "utils_kernels.cuh" + +namespace utils_internal { + +template +__global__ void template_normalize_kernel(E* arr, S scalar, uint32_t n) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { arr[tid] = scalar * arr[tid]; } +} + +template +__global__ void batchVectorMult(E* element_vec, S* scalar_vec, unsigned n_scalars, unsigned batch_size) +{ + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < n_scalars * batch_size) { + int scalar_id = tid % n_scalars; + element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid]; + } +} + +} // namespace utils_internal diff --git a/icicle/utils/utils_kernels.cuh b/icicle/utils/utils_kernels.cuh new file mode 100644 index 000000000..02f761fa6 --- /dev/null +++ b/icicle/utils/utils_kernels.cuh @@ -0,0 +1,21 @@ +#pragma once +#ifndef UTILS_KERNELS_H +#define UTILS_KERNELS_H + +namespace utils_internal { + +/** + * Multiply the elements of an input array by a scalar in-place. Used for normalization in iNTT. + * @param arr input array. + * @param n size of arr. + * @param n_inv scalar of type S (scalar). 
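 *
 * A hypothetical launch with 256-thread blocks, mirroring how the callers in ntt.cu and mont.cuh
 * size their grids (the `arr`, `scalar`, `n` and `stream` values are assumed):
 * @code
 *   int num_threads = 256;
 *   int num_blocks = (n + num_threads - 1) / num_threads;
 *   utils_internal::template_normalize_kernel<<<num_blocks, num_threads, 0, stream>>>(arr, scalar, n);
 * @endcode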
+ */ +template +__global__ void template_normalize_kernel(E* arr, S scalar, uint32_t n); + +template +__global__ void batchVectorMult(E* element_vec, S* scalar_vec, unsigned n_scalars, unsigned batch_size); + +} // namespace utils_internal + +#endif From ad097a9f4abd7b59cc3ce98d16648c452c1a88d5 Mon Sep 17 00:00:00 2001 From: DmytroTym Date: Thu, 12 Oct 2023 20:12:40 +0300 Subject: [PATCH 8/8] Fixed warnings --- icicle/appUtils/msm/msm.cu | 22 +++++++++++----------- icicle/utils/host_math.cuh | 2 +- icicle/utils/mont.cuh | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 12c389dae..ee2d2579a 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -102,14 +102,14 @@ __global__ void split_scalars_kernel( unsigned bm_bitsize, unsigned c) { - constexpr unsigned sign_mask = 0x80000000; + // constexpr unsigned sign_mask = 0x80000000; // constexpr unsigned trash_bucket = 0x80000000; unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; unsigned bucket_index; - unsigned bucket_index2; + // unsigned bucket_index2; unsigned current_index; unsigned msm_index = tid >> msm_log_size; - unsigned borrow = 0; + // unsigned borrow = 0; if (tid < total_size) { S scalar = scalars[tid]; for (unsigned bm = 0; bm < nof_bms; bm++) { @@ -200,7 +200,7 @@ __global__ void accumulate_buckets_kernel( const unsigned msm_idx_shift, const unsigned c) { - constexpr unsigned sign_mask = 0x80000000; + // constexpr unsigned sign_mask = 0x80000000; unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nof_buckets_to_compute) return; if ((single_bucket_indices[tid] & ((1 << c) - 1)) == 0) { @@ -376,8 +376,8 @@ void bucket_method_msm( P* buckets; // compute number of bucket modules and number of buckets in each module unsigned nof_bms = (bitsize + c - 1) / c; - unsigned msm_log_size = ceil(log2(size)); - unsigned bm_bitsize = ceil(log2(nof_bms)); + unsigned msm_log_size = (unsigned)ceil(log2(size)); + unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms)); #ifdef SIGNED_DIG unsigned nof_buckets = nof_bms * ((1 << (c - 1)) + 1); // signed digits #else @@ -629,7 +629,7 @@ void bucket_method_msm( #endif } else { unsigned source_bits_count = c; - bool odd_source_c = source_bits_count % 2; + // bool odd_source_c = source_bits_count % 2; unsigned source_windows_count = nof_bms; unsigned source_buckets_count = nof_buckets; P* source_buckets = buckets; @@ -647,7 +647,7 @@ void bucket_method_msm( if (source_bits_count > 0) { for (unsigned j = 0; j < target_bits_count; j++) { - unsigned last_j = target_bits_count - 1; + // unsigned last_j = target_bits_count - 1; unsigned nof_threads = (source_buckets_count >> (1 + j)); NUM_THREADS = min(MAX_TH, nof_threads); NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; @@ -683,7 +683,7 @@ void bucket_method_msm( temp_buckets1 = nullptr; temp_buckets2 = nullptr; source_bits_count = target_bits_count; - odd_source_c = source_bits_count % 2; + // odd_source_c = source_bits_count % 2; source_windows_count = target_windows_count; source_buckets_count = target_buckets_count; } @@ -753,8 +753,8 @@ void batched_bucket_method_msm( P* buckets; // compute number of bucket modules and number of buckets in each module unsigned nof_bms = (bitsize + c - 1) / c; - unsigned msm_log_size = ceil(log2(msm_size)); - unsigned bm_bitsize = ceil(log2(nof_bms)); + unsigned msm_log_size = (unsigned)ceil(log2(msm_size)); + unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms)); unsigned nof_buckets = 
(nof_bms << c); unsigned total_nof_buckets = nof_buckets * batch_size; cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream); diff --git a/icicle/utils/host_math.cuh b/icicle/utils/host_math.cuh index ecc181844..305c71319 100644 --- a/icicle/utils/host_math.cuh +++ b/icicle/utils/host_math.cuh @@ -58,7 +58,7 @@ namespace host_math { { uint32_t result; uint64_t r = static_cast(x) * y + z + carry; - carry = r >> 32; + carry = (uint32_t)(r >> 32); result = r & 0xffffffff; return result; } diff --git a/icicle/utils/mont.cuh b/icicle/utils/mont.cuh index 09c7a8732..b9d1ddd1e 100644 --- a/icicle/utils/mont.cuh +++ b/icicle/utils/mont.cuh @@ -12,7 +12,7 @@ namespace { // TODO (DmytroTym): do valid conversion for point types too template -int convert_montgomery(E* d_inout, size_t n, bool is_into, cudaStream_t stream) +int convert_montgomery(E* d_inout, int n, bool is_into, cudaStream_t stream) { // Set the grid and block dimensions int num_threads = MAX_THREADS_PER_BLOCK; @@ -26,13 +26,13 @@ int convert_montgomery(E* d_inout, size_t n, bool is_into, cudaStream_t stream) } // namespace template -int to_montgomery(E* d_inout, unsigned n, cudaStream_t stream) +int to_montgomery(E* d_inout, int n, cudaStream_t stream) { return convert_montgomery(d_inout, n, true, stream); } template -int from_montgomery(E* d_inout, unsigned n, cudaStream_t stream) +int from_montgomery(E* d_inout, int n, cudaStream_t stream) { return convert_montgomery(d_inout, n, false, stream); }
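
As a closing illustration of the helpers reworked above, the following is a minimal, hypothetical
caller of the Montgomery conversion routines; the stream handling, element count and device data are
assumptions for the sketch, mirroring the calls in mont.cuh:

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int n = 1 << 12;
    curve_config::scalar_t* d_vals;
    cudaMallocAsync(&d_vals, n * sizeof(curve_config::scalar_t), stream);
    // ... fill d_vals with field elements on the device ...
    mont::to_montgomery(d_vals, n, stream);    // convert in place into Montgomery form
    mont::from_montgomery(d_vals, n, stream);  // and back to the standard representation
    cudaFreeAsync(d_vals, stream);
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);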