
Linting as per google guide #241

Merged
10 commits merged on Oct 12, 2023
65 changes: 35 additions & 30 deletions icicle/appUtils/lde/lde.cu
@@ -1,6 +1,10 @@
#include "lde.cuh"

#include <cuda.h>
#include <stdexcept>

#include "../../utils/device_context.cuh"
#include "../../utils/mont.cuh"
#include "../../curves/curve_config.cuh"

namespace lde {
@@ -10,21 +14,21 @@ namespace {
#define MAX_THREADS_PER_BLOCK 256

template <typename E, typename S>
__global__ void mul_kernel(S* scalar_vec, E* element_vec, size_t n, E* result)
__global__ void mul_kernel(S* scalar_vec, E* element_vec, int n, E* result)
Contributor:

Oh - probably one thing truly important here: replacing unsigned types with signed int feels like a potential source of issues, e.g. due to shift operations. Also, distinguishing indexes from sizes with size_t seems more rational than not.
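
For illustration only - a minimal C++ sketch of the hazards being pointed at, not code from this PR: shifting into the sign bit of a signed int was undefined behaviour before C++20, and mixing a signed index with an unsigned size draws sign-compare/conversion warnings.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  uint32_t high_bit = 1u << 31;  // fine: unsigned shift into the top bit
  // int bad = 1 << 31;          // undefined behaviour before C++20: shifts into the sign bit

  std::vector<int> v{1, 2, 3, 4};
  int n = static_cast<int>(v.size());  // explicit, intentional narrowing
  for (int i = 0; i < n; ++i) {
    // writing `i < v.size()` here would compare signed with unsigned
    // and warn under -Wsign-compare
    std::printf("%d ", v[i]);
  }
  std::printf("\n%u\n", high_bit);
  return 0;
}
```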

Contributor:

icicle/icicle/appUtils/msm/msm.cu(958): warning #2361-D: invalid narrowing conversion from "unsigned long" to "int"
        msm_size,
        ^
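
For reference, the usual shape of call site that produces this diagnostic, together with the explicit cast that makes the narrowing deliberate - `Launch`, `Caller` and `msm_size` below are placeholders, not the actual msm.cu code.

```cpp
#include <cstddef>

void Launch(int size) { (void)size; }  // hypothetical callee taking an int

void Caller(std::size_t msm_size) {
  // Launch(msm_size);                 // "unsigned long" -> "int": narrowing warning
  Launch(static_cast<int>(msm_size));  // the cast documents (and silences) the narrowing
}
```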

Contributor Author:

You're right about narrowing conversions, of course - I'll fix it, I just haven't finished re-working the internals. https://google.github.io/styleguide/cppguide.html#Integer_Types says to "Use plain old int for such things (as loop counters)". I also think loop counters/indices and sizes should be the same type: loop counters range from 0 to size, so giving them different types is odd.
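
A small sketch of the two conventions under discussion (nothing repo-specific assumed): the Google-guide style with plain int for both counter and size, versus size_t everywhere with its wrap-around pitfall.

```cpp
#include <cstddef>
#include <vector>

// Google-guide flavour: counter and size share the type int,
// with one explicit cast at the container boundary.
int SumInt(const std::vector<int>& v) {
  int total = 0;
  int n = static_cast<int>(v.size());
  for (int i = 0; i < n; ++i) total += v[i];
  return total;
}

// size_t flavour: no narrowing anywhere, but unsigned arithmetic needs care -
// `for (std::size_t i = 0; i <= v.size() - 1; ++i)` never terminates for an
// empty vector, because v.size() - 1 wraps around to SIZE_MAX.
int SumSizeT(const std::vector<int>& v) {
  int total = 0;
  for (std::size_t i = 0; i < v.size(); ++i) total += v[i];
  return total;
}
```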

Contributor Author:

Also about the narrowing conversions: I can't reproduce the warning even with the -Wconversion flag set for compilation. And there's no line 958 in the msm.cu file - maybe you're looking at an older version?

{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n) { result[tid] = scalar_vec[tid] * element_vec[tid]; }
}

template <typename E>
__global__ void add_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* result)
__global__ void add_kernel(E* element_vec1, E* element_vec2, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] + element_vec2[tid]; }
}

template <typename E>
__global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* result)
__global__ void sub_kernel(E* element_vec1, E* element_vec2, int n, E* result)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) { result[tid] = element_vec1[tid] - element_vec2[tid]; }
@@ -33,15 +37,15 @@ __global__ void sub_kernel(E* element_vec1, E* element_vec2, uint32_t n, E* resu
} // namespace

template <typename E, typename S>
cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result)
cudaError_t Mul(S* vec_a, E* vec_b, int n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result)
{
// Set the grid and block dimensions
int num_threads = MAX_THREADS_PER_BLOCK;
int num_blocks = (n + num_threads - 1) / num_threads;

S* d_vec_a;
E *d_vec_b, *d_result;
if (on_device) {
if (!is_on_device) {
// Allocate memory on the device for the input vectors and the output vector
cudaMallocAsync(&d_vec_a, n * sizeof(S), ctx.stream);
cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream);
@@ -53,9 +57,10 @@ cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery
}

// Call the kernel to perform element-wise modular multiplication
mul_kernel<<<num_blocks, num_threads, 0, ctx.stream>>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result);
mul_kernel<<<num_blocks, num_threads, 0, ctx.stream>>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? result : d_result);
if (is_montgomery) mont::from_montgomery(is_on_device ? result : d_result, n, ctx.stream);

if (on_device) {
if (!is_on_device) {
cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream);
cudaFreeAsync(d_vec_a, ctx.stream);
cudaFreeAsync(d_vec_b, ctx.stream);
@@ -67,14 +72,14 @@
}

template <typename E>
cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result)
cudaError_t Add(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result)
{
// Set the grid and block dimensions
int num_threads = MAX_THREADS_PER_BLOCK;
int num_blocks = (n + num_threads - 1) / num_threads;

E *d_vec_a, *d_vec_b, *d_result;
if (on_device) {
if (!is_on_device) {
// Allocate memory on the device for the input vectors and the output vector
cudaMallocAsync(&d_vec_a, n * sizeof(E), ctx.stream);
cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream);
@@ -86,9 +91,9 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De
}

// Call the kernel to perform element-wise addition
add_kernel<<<num_blocks, num_threads, 0, ctx.stream>>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result);
add_kernel<<<num_blocks, num_threads, 0, ctx.stream>>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? result : d_result);

if (on_device) {
if (!is_on_device) {
cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream);
cudaFreeAsync(d_vec_a, ctx.stream);
cudaFreeAsync(d_vec_b, ctx.stream);
@@ -100,14 +105,14 @@ cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De
}

template <typename E>
cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result)
cudaError_t Sub(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result)
{
// Set the grid and block dimensions
int num_threads = MAX_THREADS_PER_BLOCK;
int num_blocks = (n + num_threads - 1) / num_threads;

E *d_vec_a, *d_vec_b, *d_result;
if (on_device) {
if (!is_on_device) {
// Allocate memory on the device for the input vectors and the output vector
cudaMallocAsync(&d_vec_a, n * sizeof(E), ctx.stream);
cudaMallocAsync(&d_vec_b, n * sizeof(E), ctx.stream);
@@ -119,9 +124,9 @@ cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De
}

// Call the kernel to perform element-wise subtraction
sub_kernel<<<num_blocks, num_threads, 0, ctx.stream>>>(on_device ? d_vec_a : vec_a, on_device ? d_vec_b : vec_b, n, on_device ? d_result : result);
sub_kernel<<<num_blocks, num_threads, 0, ctx.stream>>>(is_on_device ? vec_a : d_vec_a, is_on_device ? vec_b : d_vec_b, n, is_on_device ? result : d_result);

if (on_device) {
if (!is_on_device) {
cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, ctx.stream);
cudaFreeAsync(d_vec_a, ctx.stream);
cudaFreeAsync(d_vec_b, ctx.stream);
@@ -133,52 +138,52 @@ cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::De
}

/**
* Extern version of [mul](@ref mul) function with the template parameters
* Extern version of [Mul](@ref Mul) function with the template parameters
* `S` and `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t mul_cuda(
extern "C" cudaError_t MulCuda(
curve_config::scalar_t* vec_a,
curve_config::scalar_t* vec_b,
uint32_t n,
bool on_device,
int n,
bool is_on_device,
bool is_montgomery,
device_context::DeviceContext ctx,
curve_config::scalar_t* result
) {
return mul<curve_config::scalar_t>(vec_a, vec_b, n, on_device, is_montgomery, ctx, result);
return Mul<curve_config::scalar_t>(vec_a, vec_b, n, is_on_device, is_montgomery, ctx, result);
}

/**
* Extern version of [add](@ref add) function with the template parameter
* Extern version of [Add](@ref Add) function with the template parameter
* `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t add_cuda(
extern "C" cudaError_t AddCuda(
curve_config::scalar_t* vec_a,
curve_config::scalar_t* vec_b,
uint32_t n,
bool on_device,
int n,
bool is_on_device,
device_context::DeviceContext ctx,
curve_config::scalar_t* result
) {
return add<curve_config::scalar_t>(vec_a, vec_b, n, on_device, ctx, result);
return Add<curve_config::scalar_t>(vec_a, vec_b, n, is_on_device, ctx, result);
}

/**
* Extern version of [sub](@ref sub) function with the template parameter
* Extern version of [Sub](@ref Sub) function with the template parameter
* `E` being the [scalar field](@ref scalar_t) of the curve given by `-DCURVE` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t sub_cuda(
extern "C" cudaError_t SubCuda(
curve_config::scalar_t* vec_a,
curve_config::scalar_t* vec_b,
uint32_t n,
bool on_device,
int n,
bool is_on_device,
device_context::DeviceContext ctx,
curve_config::scalar_t* result
) {
return sub<curve_config::scalar_t>(vec_a, vec_b, n, on_device, ctx, result);
return Sub<curve_config::scalar_t>(vec_a, vec_b, n, is_on_device, ctx, result);
}

} // namespace lde
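
A hedged host-side usage sketch of the renamed MulCuda wrapper (not part of this diff). It assumes a device_context::DeviceContext carrying a cudaStream_t member named `stream`, as the `ctx.stream` uses above suggest; the helper name, include paths and synchronization policy are illustrative only.

```cpp
#include <cuda_runtime.h>
#include <vector>

// Include paths are assumptions; they mirror the ones lde.cu itself uses.
#include "curves/curve_config.cuh"
#include "utils/device_context.cuh"

// Declaration copied from the diff above; lde.cuh only declares the templated entry points.
extern "C" cudaError_t MulCuda(
  curve_config::scalar_t* vec_a,
  curve_config::scalar_t* vec_b,
  int n,
  bool is_on_device,
  bool is_montgomery,
  device_context::DeviceContext ctx,
  curve_config::scalar_t* result);

// Illustrative helper: element-wise product of two host-resident vectors.
cudaError_t MulOnHost(std::vector<curve_config::scalar_t>& a,
                      std::vector<curve_config::scalar_t>& b,
                      std::vector<curve_config::scalar_t>& out,
                      device_context::DeviceContext ctx) {
  int n = static_cast<int>(a.size());
  // is_on_device = false: the wrapper stages the inputs on the device, runs
  // mul_kernel on ctx.stream and copies the product back asynchronously.
  cudaError_t err = MulCuda(a.data(), b.data(), n,
                            /*is_on_device=*/false,
                            /*is_montgomery=*/false, ctx, out.data());
  if (err != cudaSuccess) return err;
  // Work is queued on ctx.stream; wait before reading `out` on the host.
  return cudaStreamSynchronize(ctx.stream);
}
```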
20 changes: 12 additions & 8 deletions icicle/appUtils/lde/lde.cuh
@@ -2,6 +2,8 @@
#ifndef LDE_H
#define LDE_H

#include "../../utils/device_context.cuh"

/**
* @namespace lde
* LDE (stands for low degree extension) contains [NTT](@ref ntt)-based methods for translating between coefficient and evaluation domains of polynomials.
@@ -14,45 +16,47 @@ namespace lde {
* @param vec_a First input vector.
* @param vec_b Second input vector.
* @param n Size of vectors `vec_a` and `vec_b`.
* @param on_device If true, inputs and outputs are on device, if false - on the host.
* @param is_montgomery If true, inputs are expected to be Montgomery form and results are retured in Montgomery form.
* @param is_on_device If true, inputs and outputs are on device, if false - on the host.
* @param is_montgomery If true, inputs are expected to be in Montgomery form and results are returned in Montgomery form.
* If false - inputs and outputs are non-Montgomery.
* @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method.
* @param result Resulting vector - element-wise product of `vec_a` and `vec_b`, can be the same pointer as `vec_b`.
* @tparam S The type of scalars `vec_a`.
* @tparam E The type of elements `vec_b` and `result`. Often (but not always), `E=S`.
* @tparam E The type of elements `vec_b` and `result`. Often (but not always) `E=S`.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
template <typename E, typename S>
cudaError_t mul(S* vec_a, E* vec_b, size_t n, bool on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result);
cudaError_t Mul(S* vec_a, E* vec_b, int n, bool is_on_device, bool is_montgomery, device_context::DeviceContext ctx, E* result);

/**
* A function that adds two vectors element-wise.
* @param vec_a First input vector.
* @param vec_b Second input vector.
* @param n Size of vectors `vec_a` and `vec_b`.
* @param on_device If true, inputs and outputs are on device, if false - on the host.
* @param is_on_device If true, inputs and outputs are on device, if false - on the host.
* @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method.
* @param result Resulting vector - element-wise sum of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`.
* If inputs are in Montgomery form, the result is too, and vice versa: non-Montgomery inputs produce non-Montgomery result.
* @tparam E The type of elements `vec_a`, `vec_b` and `result`.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
template <typename E>
cudaError_t add(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result);
cudaError_t Add(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result);

/**
* A function that subtracts two vectors element-wise.
* @param vec_a First input vector.
* @param vec_b Second input vector.
* @param n Size of vectors `vec_a` and `vec_b`.
* @param on_device If true, inputs and outputs are on device, if false - on the host.
* @param is_on_device If true, inputs and outputs are on device, if false - on the host.
* @param ctx [DeviceContext](@ref device_context::DeviceContext) used in this method.
* @param result Resulting vector - element-wise difference of `vec_a` and `vec_b`, can be the same pointer as `vec_a` or `vec_b`.
* If inputs are in Montgomery form, the result is too, and vice versa: non-Montgomery inputs produce non-Montgomery result.
* @tparam E The type of elements `vec_a`, `vec_b` and `result`.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
template <typename E>
cudaError_t sub(E* vec_a, E* vec_b, size_t n, bool on_device, device_context::DeviceContext ctx, E* result);
cudaError_t Sub(E* vec_a, E* vec_b, int n, bool is_on_device, device_context::DeviceContext ctx, E* result);

} // namespace lde
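
And a device-resident sketch of the in-place aliasing the docs allow (`result` may be the same pointer as `vec_a` or `vec_b`). Buffer allocation and the DeviceContext are assumed to come from elsewhere, the scalar_t instantiation of lde::Add is assumed to be visible to the caller, and the helper name is illustrative.

```cpp
// Assumes the same includes as the MulOnHost sketch above; d_acc and d_delta are
// assumed to already hold n field elements in GPU memory (e.g. allocated with
// cudaMallocAsync on ctx.stream).
cudaError_t AccumulateInPlace(curve_config::scalar_t* d_acc,
                              curve_config::scalar_t* d_delta,
                              int n,
                              device_context::DeviceContext ctx) {
  // is_on_device = true: no staging copies; add_kernel writes straight into
  // d_acc, because result aliases one of the inputs.
  return lde::Add(d_delta, d_acc, n, /*is_on_device=*/true, ctx, /*result=*/d_acc);
}
```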
