diff --git a/.gitignore b/.gitignore index c590884d8..244bd7c06 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ **/Cargo.lock **/icicle/build/ **/wrappers/rust/icicle-cuda-runtime/src/bindings.rs +**/build diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 65651fee0..80df8f18a 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -29,15 +29,11 @@ namespace msm { // #define BIG_TRIANGLE // #define SSM_SUM //WIP - template - int get_optimal_c(int bitsize) - { - return max((int)ceil(log2(bitsize)) - 4, 1); - } + int get_optimal_c(int bitsize) { return max((int)ceil(log2(bitsize)) - 4, 1); } template __global__ void single_stage_multi_reduction_kernel( - P* v, + const P* v, P* v_r, unsigned block_size, unsigned write_stride, @@ -45,28 +41,26 @@ namespace msm { unsigned padding, unsigned num_of_threads) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= num_of_threads) { return; } - int jump = block_size / 2; - int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid; - int block_id = tid_p / jump; - int block_tid = tid_p % jump; - unsigned read_ind = block_size * block_id + block_tid; - unsigned write_ind = tid; - unsigned v_r_key = write_stride - ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride - : write_ind; - P v_r_value = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero() - : v[read_ind] + v[read_ind + jump]; - - v_r[v_r_key] = v_r_value; + const int jump = block_size / 2; + const int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid; + const int block_id = tid_p / jump; + const int block_tid = tid_p % jump; + const unsigned read_ind = block_size * block_id + block_tid; + const unsigned write_ind = tid; + const unsigned v_r_key = + write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride + : write_ind; + v_r[v_r_key] = padding ? (tid % (2 * padding) < padding) ? 
v[read_ind] + v[read_ind + jump] : P::zero() + : v[read_ind] + v[read_ind + jump]; } // this kernel performs single scalar multiplication - // each thread multilies a single scalar and point + // each thread multiplies a single scalar and point template - __global__ void ssm_kernel(S* scalars, P* points, P* results, unsigned N) + __global__ void ssm_kernel(const S* scalars, const P* points, P* results, unsigned N) { unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid < N) results[tid] = scalars[tid] * points[tid]; @@ -105,7 +99,7 @@ namespace msm { unsigned* buckets_indices, unsigned* point_indices, S* scalars, - unsigned total_size, + unsigned nof_scalars, unsigned points_size, unsigned msm_size, unsigned nof_bms, @@ -115,36 +109,36 @@ namespace msm { // constexpr unsigned sign_mask = 0x80000000; // constexpr unsigned trash_bucket = 0x80000000; unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid >= nof_scalars) return; + unsigned bucket_index; // unsigned bucket_index2; unsigned current_index; unsigned msm_index = tid / msm_size; // unsigned borrow = 0; - if (tid < total_size) { - S scalar = scalars[tid]; - for (unsigned bm = 0; bm < nof_bms; bm++) { - bucket_index = scalar.get_scalar_digit(bm, c); + S& scalar = scalars[tid]; + for (unsigned bm = 0; bm < nof_bms; bm++) { + bucket_index = scalar.get_scalar_digit(bm, c); #ifdef SIGNED_DIG - bucket_index += borrow; - borrow = 0; - unsigned sign = 0; - if (bucket_index > (1 << (c - 1))) { - bucket_index = (1 << c) - bucket_index; - borrow = 1; - sign = sign_mask; - } + bucket_index += borrow; + borrow = 0; + unsigned sign = 0; + if (bucket_index > (1 << (c - 1))) { + bucket_index = (1 << c) - bucket_index; + borrow = 1; + sign = sign_mask; + } #endif - current_index = bm * total_size + tid; + current_index = bm * nof_scalars + tid; #ifdef SIGNED_DIG - point_indices[current_index] = sign | tid; // the point index is saved for later + point_indices[current_index] = sign | tid; // the point index is saved for later #else - buckets_indices[current_index] = - (msm_index << (c + bm_bitsize)) | (bm << c) | - bucket_index; // the bucket module number and the msm number are appended at the msbs - if (scalar == S::zero() || bucket_index == 0) buckets_indices[current_index] = 0; // will be skipped - point_indices[current_index] = tid % points_size; // the point index is saved for later + buckets_indices[current_index] = + (msm_index << (c + bm_bitsize)) | (bm << c) | + bucket_index; // the bucket module number and the msm number are appended at the msbs + if (scalar == S::zero() || bucket_index == 0) buckets_indices[current_index] = 0; // will be skipped + point_indices[current_index] = tid % points_size; // the point index is saved for later #endif - } } } @@ -277,7 +271,7 @@ namespace msm { // this kernel sums the entire bucket module // each thread deals with a single bucket module template - __global__ void big_triangle_sum_kernel(P* buckets, P* final_sums, unsigned nof_bms, unsigned c) + __global__ void big_triangle_sum_kernel(const P* buckets, P* final_sums, unsigned nof_bms, unsigned c) { unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nof_bms) return; @@ -309,7 +303,7 @@ namespace msm { } template - __global__ void last_pass_kernel(P* final_buckets, P* final_sums, unsigned num_sums) + __global__ void last_pass_kernel(const P* final_buckets, P* final_sums, unsigned num_sums) { unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid > num_sums) return; @@ -320,7 +314,7 @@ namespace msm { // it 
is done by a single thread template __global__ void - final_accumulation_kernel(P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned c) + final_accumulation_kernel(const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned c) { unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid > nof_msms) return; @@ -338,17 +332,19 @@ namespace msm { // this function computes msm using the bucket method template cudaError_t bucket_method_msm( - int bitsize, - int c, + unsigned bitsize, + unsigned c, S* scalars, A* points, - int size, + unsigned batch_size, + unsigned msm_size, + unsigned nof_points, P* final_result, bool are_scalars_on_device, bool are_scalars_montgomery_form, bool are_points_on_device, bool are_points_montgomery_form, - bool is_result_on_device, + bool are_results_on_device, bool is_big_triangle, int large_bucket_factor, bool is_async, @@ -356,12 +352,14 @@ namespace msm { { CHK_INIT_IF_RETURN(); + const unsigned nof_scalars = batch_size * msm_size; // assuming scalars not shared between batch elements + S* d_scalars; A* d_points; if (!are_scalars_on_device) { // copy scalars to gpu - CHK_IF_RETURN(cudaMallocAsync(&d_scalars, sizeof(S) * size, stream)); - CHK_IF_RETURN(cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream)); + CHK_IF_RETURN(cudaMallocAsync(&d_scalars, sizeof(S) * nof_scalars, stream)); + CHK_IF_RETURN(cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * nof_scalars, cudaMemcpyHostToDevice, stream)); } else { d_scalars = scalars; } @@ -369,28 +367,28 @@ namespace msm { if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaStreamCreate(&stream_points)); if (!are_points_on_device) { // copy points to gpu - CHK_IF_RETURN(cudaMallocAsync(&d_points, sizeof(A) * size, stream_points)); - CHK_IF_RETURN(cudaMemcpyAsync(d_points, points, sizeof(A) * size, cudaMemcpyHostToDevice, stream_points)); + CHK_IF_RETURN(cudaMallocAsync(&d_points, sizeof(A) * nof_points, stream_points)); + CHK_IF_RETURN(cudaMemcpyAsync(d_points, points, sizeof(A) * nof_points, cudaMemcpyHostToDevice, stream_points)); } else { d_points = points; } if (are_scalars_montgomery_form) { if (are_scalars_on_device) { S* d_mont_scalars; - CHK_IF_RETURN(cudaMallocAsync(&d_mont_scalars, sizeof(S) * size, stream)); - CHK_IF_RETURN(mont::FromMontgomery(d_scalars, size, stream, d_mont_scalars)); + CHK_IF_RETURN(cudaMallocAsync(&d_mont_scalars, sizeof(S) * nof_scalars, stream)); + CHK_IF_RETURN(mont::FromMontgomery(d_scalars, nof_scalars, stream, d_mont_scalars)); d_scalars = d_mont_scalars; } else - CHK_IF_RETURN(mont::FromMontgomery(d_scalars, size, stream, d_scalars)); + CHK_IF_RETURN(mont::FromMontgomery(d_scalars, nof_scalars, stream, d_scalars)); } if (are_points_montgomery_form) { if (are_points_on_device) { A* d_mont_points; - CHK_IF_RETURN(cudaMallocAsync(&d_mont_points, sizeof(A) * size, stream_points)); - CHK_IF_RETURN(mont::FromMontgomery(d_points, size, stream_points, d_mont_points)); + CHK_IF_RETURN(cudaMallocAsync(&d_mont_points, sizeof(A) * nof_points, stream_points)); + CHK_IF_RETURN(mont::FromMontgomery(d_points, nof_points, stream_points, d_mont_points)); d_points = d_mont_points; } else - CHK_IF_RETURN(mont::FromMontgomery(d_points, size, stream_points, d_points)); + CHK_IF_RETURN(mont::FromMontgomery(d_points, nof_points, stream_points, d_points)); } cudaEvent_t event_points_uploaded; if (!are_points_on_device || are_points_montgomery_form) { @@ -407,24 +405,25 @@ namespace msm { #else 
unsigned nof_buckets = nof_bms << c; #endif - CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(P) * nof_buckets, stream)); + unsigned total_nof_buckets = nof_buckets * batch_size; + CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream)); // launch the bucket initialization kernel with maximum threads unsigned NUM_THREADS = 1 << 10; - unsigned NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, nof_buckets); + unsigned NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS; + initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, total_nof_buckets); unsigned* bucket_indices; unsigned* point_indices; - CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * size * (nof_bms + 1), stream)); - CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * size * (nof_bms + 1), stream)); + CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * nof_scalars * (nof_bms + 1), stream)); + CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * nof_scalars * (nof_bms + 1), stream)); // split scalars into digits NUM_THREADS = 1 << 10; - NUM_BLOCKS = (size + NUM_THREADS - 1) / NUM_THREADS; + NUM_BLOCKS = (nof_scalars + NUM_THREADS - 1) / NUM_THREADS; split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>( - bucket_indices + size, point_indices + size, d_scalars, size, size, size, nof_bms, bm_bitsize, c); - //+size - leaving the first bm free for the out of place sort later + bucket_indices + nof_scalars, point_indices + nof_scalars, d_scalars, nof_scalars, nof_points, msm_size, + nof_bms, bm_bitsize, c); //+nof_scalars - leaving the first bm free for the out of place sort later // sort indices - the indices are sorted from smallest to largest in order to group together the points that // belong to each bucket @@ -434,18 +433,18 @@ namespace msm { // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for // more info CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs( - sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + size, bucket_indices, - point_indices + size, point_indices, size, 0, sizeof(unsigned) * 8, stream)); + sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + nof_scalars, bucket_indices, + point_indices + nof_scalars, point_indices, nof_scalars, 0, sizeof(unsigned) * 8, stream)); CHK_IF_RETURN(cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream)); for (unsigned i = 0; i < nof_bms; i++) { - unsigned offset_out = i * size; - unsigned offset_in = offset_out + size; + unsigned offset_out = i * nof_scalars; + unsigned offset_in = offset_out + nof_scalars; // The second to last parameter is the default value supplied explicitly to allow passing the stream // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for // more info CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs( sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in, - bucket_indices + offset_out, point_indices + offset_in, point_indices + offset_out, size, 0, + bucket_indices + offset_out, point_indices + offset_in, point_indices + offset_out, nof_scalars, 0, sizeof(unsigned) * 8, stream)); } CHK_IF_RETURN(cudaFreeAsync(sort_indices_temp_storage, stream)); @@ -454,30 +453,30 @@ namespace msm { unsigned* single_bucket_indices; unsigned* bucket_sizes; unsigned* nof_buckets_to_compute; - CHK_IF_RETURN(cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) *
nof_buckets, stream)); - CHK_IF_RETURN(cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * nof_buckets, stream)); + CHK_IF_RETURN(cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * total_nof_buckets, stream)); + CHK_IF_RETURN(cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * total_nof_buckets, stream)); CHK_IF_RETURN(cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream)); unsigned* encode_temp_storage{}; size_t encode_temp_storage_bytes = 0; CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode( encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes, - nof_buckets_to_compute, nof_bms * size, stream)); + nof_buckets_to_compute, nof_bms * nof_scalars, stream)); CHK_IF_RETURN(cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream)); CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode( encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes, - nof_buckets_to_compute, nof_bms * size, stream)); + nof_buckets_to_compute, nof_bms * nof_scalars, stream)); CHK_IF_RETURN(cudaFreeAsync(encode_temp_storage, stream)); // get offsets - where does each new bucket begin unsigned* bucket_offsets; - CHK_IF_RETURN(cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * nof_buckets, stream)); + CHK_IF_RETURN(cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * total_nof_buckets, stream)); unsigned* offsets_temp_storage{}; size_t offsets_temp_storage_bytes = 0; CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum( - offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream)); + offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream)); CHK_IF_RETURN(cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream)); CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum( - offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream)); + offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream)); CHK_IF_RETURN(cudaFreeAsync(offsets_temp_storage, stream)); // sort by bucket sizes @@ -487,12 +486,17 @@ namespace msm { // if all points are 0 just return point 0 if (h_nof_buckets_to_compute == 0) { - if (!is_result_on_device) - final_result[0] = P::zero(); - else { - P* h_final_result = (P*)malloc(sizeof(P)); - h_final_result[0] = P::zero(); - CHK_IF_RETURN(cudaMemcpyAsync(final_result, h_final_result, sizeof(P), cudaMemcpyHostToDevice, stream)); + if (!are_results_on_device) { + for (unsigned batch_element = 0; batch_element < batch_size; ++batch_element) { + final_result[batch_element] = P::zero(); + } + } else { + P* h_final_result = (P*)malloc(sizeof(P) * batch_size); + for (unsigned batch_element = 0; batch_element < batch_size; ++batch_element) { + h_final_result[batch_element] = P::zero(); + } + CHK_IF_RETURN( + cudaMemcpyAsync(final_result, h_final_result, sizeof(P) * batch_size, cudaMemcpyHostToDevice, stream)); } return CHK_LAST(); @@ -530,7 +534,7 @@ namespace msm { CHK_IF_RETURN(cudaFreeAsync(sort_single_temp_storage, stream)); // find large buckets - unsigned avarage_size = size / (1 << c); + unsigned avarage_size = msm_size / (1 << c); unsigned bucket_th = large_bucket_factor * avarage_size; unsigned* nof_large_buckets; CHK_IF_RETURN(cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream)); @@ -636,25 +640,27 @@ namespace msm { #endif P* d_final_result; - if (!is_result_on_device) 
CHK_IF_RETURN(cudaMallocAsync(&d_final_result, sizeof(P), stream)); + if (!are_results_on_device) CHK_IF_RETURN(cudaMallocAsync(&d_final_result, sizeof(P) * batch_size, stream)); P* final_results; - if (is_big_triangle) { - CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream)); + if (is_big_triangle || c == 1) { + CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms * batch_size, stream)); // launch the bucket module sum kernel - a thread for each bucket module - NUM_THREADS = nof_bms; - NUM_BLOCKS = 1; + NUM_THREADS = 32; + NUM_BLOCKS = (nof_bms * batch_size + NUM_THREADS - 1) / NUM_THREADS; #ifdef SIGNED_DIG big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>( - buckets, final_results, nof_bms, c - 1); // sighed digits + buckets, final_results, nof_bms * batch_size, c - 1); // signed digits #else - big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms, c); + big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>( + buckets, final_results, nof_bms * batch_size, c); #endif } else { unsigned source_bits_count = c; // bool odd_source_c = source_bits_count % 2; unsigned source_windows_count = nof_bms; unsigned source_buckets_count = nof_buckets; + unsigned target_windows_count = 0; P* source_buckets = buckets; buckets = nullptr; P* target_buckets; @@ -662,40 +668,45 @@ namespace msm { P* temp_buckets2; for (unsigned i = 0;; i++) { const unsigned target_bits_count = (source_bits_count + 1) >> 1; // c/2=8 - const unsigned target_windows_count = source_windows_count << 1; // nof bms*2 = 32 + target_windows_count = source_windows_count << 1; // nof bms*2 = 32 const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8 - CHK_IF_RETURN( - cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count, stream)); // 32*2^8*2^7 buckets - CHK_IF_RETURN( - cudaMallocAsync(&temp_buckets1, sizeof(P) * source_buckets_count / 2, stream)); // 32*2^8*2^7 buckets - CHK_IF_RETURN( - cudaMallocAsync(&temp_buckets2, sizeof(P) * source_buckets_count / 2, stream)); // 32*2^8*2^7 buckets + CHK_IF_RETURN(cudaMallocAsync( + &target_buckets, sizeof(P) * target_buckets_count * batch_size, stream)); // 32*2^8*2^7 buckets + CHK_IF_RETURN(cudaMallocAsync( + &temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets + CHK_IF_RETURN(cudaMallocAsync( + &temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets if (source_bits_count > 0) { for (unsigned j = 0; j < target_bits_count; j++) { - // unsigned last_j = target_bits_count - 1; - unsigned nof_threads = (source_buckets_count >> (1 + j)); + const bool is_first_iter = (j == 0); + const bool is_last_iter = (j == target_bits_count - 1); + unsigned nof_threads = (source_buckets_count >> (1 + j)) * batch_size; NUM_THREADS = min(MAX_TH, nof_threads); NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>( - j == 0 ? source_buckets : temp_buckets1, j == target_bits_count - 1 ? target_buckets : temp_buckets1, - 1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0, - nof_threads); + is_first_iter ? source_buckets : temp_buckets1, is_last_iter ? target_buckets : temp_buckets1, + 1 << (source_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 0 /*=write_phase*/, + 0 /*=padding*/, nof_threads); NUM_THREADS = min(MAX_TH, nof_threads); NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>( - j == 0 ?
source_buckets : temp_buckets2, j == target_bits_count - 1 ? target_buckets : temp_buckets2, - 1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0, - nof_threads); + is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2, + 1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 1 /*=write_phase*/, + 0 /*=padding*/, nof_threads); } } if (target_bits_count == 1) { - nof_bms = bitsize; - CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream)); + // Note: the reduction ends up with 'target_windows_count' windows per batch element. For batch=1 we can + // sum only the first 'bitsize' elements to avoid a few ECADDs that just add zeros + nof_bms = (batch_size > 1) ? target_windows_count : bitsize; + + CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms * batch_size, stream)); NUM_THREADS = 32; - NUM_BLOCKS = (nof_bms + NUM_THREADS - 1) / NUM_THREADS; - last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(target_buckets, final_results, nof_bms); + NUM_BLOCKS = (nof_bms * batch_size + NUM_THREADS - 1) / NUM_THREADS; + last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>( + target_buckets, final_results, nof_bms * batch_size); c = 1; CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream)); CHK_IF_RETURN(cudaFreeAsync(target_buckets, stream)); @@ -717,18 +728,21 @@ namespace msm { } } - // launch the double and add kernel, a single thread - final_accumulation_kernel - <<<1, 1, 0, stream>>>(final_results, is_result_on_device ? final_result : d_final_result, 1, nof_bms, c); + // launch the double and add kernel, a single thread per batch element + NUM_THREADS = 32; + NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS; + final_accumulation_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>( + final_results, are_results_on_device ? final_result : d_final_result, batch_size, nof_bms, c); CHK_IF_RETURN(cudaFreeAsync(final_results, stream)); - if (!is_result_on_device) - CHK_IF_RETURN(cudaMemcpyAsync(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost, stream)); + if (!are_results_on_device) + CHK_IF_RETURN( + cudaMemcpyAsync(final_result, d_final_result, sizeof(P) * batch_size, cudaMemcpyDeviceToHost, stream)); // free memory if (!are_scalars_on_device) CHK_IF_RETURN(cudaFreeAsync(d_scalars, stream)); if (!are_points_on_device) CHK_IF_RETURN(cudaFreeAsync(d_points, stream)); - if (!is_result_on_device) CHK_IF_RETURN(cudaFreeAsync(d_final_result, stream)); + if (!are_results_on_device) CHK_IF_RETURN(cudaFreeAsync(d_final_result, stream)); CHK_IF_RETURN(cudaFreeAsync(buckets, stream)); #ifndef PHASE1_TEST CHK_IF_RETURN(cudaFreeAsync(bucket_indices, stream)); @@ -749,222 +763,6 @@ namespace msm { return CHK_LAST(); } - - // this function computes multiple msms using the bucket method - template - cudaError_t batched_bucket_method_msm( - unsigned bitsize, - unsigned c, - S* scalars, - A* points, - unsigned batch_size, - unsigned msm_size, - unsigned points_size, - P* final_results, - bool are_scalars_on_device, - bool are_scalars_montgomery_form, - bool are_points_on_device, - bool are_points_montgomery_form, - bool are_results_on_device, - bool is_async, - cudaStream_t stream) - { - CHK_INIT_IF_RETURN(); - - unsigned total_size = batch_size * msm_size; - S* d_scalars; - A* d_points; - if (!are_scalars_on_device) { - // copy scalars to gpu - CHK_IF_RETURN(cudaMallocAsync(&d_scalars, sizeof(S) * total_size, stream)); - CHK_IF_RETURN(cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * total_size, cudaMemcpyHostToDevice, stream)); - } else { - d_scalars = scalars; - } -
cudaStream_t stream_points; - if (!are_points_on_device || are_points_montgomery_form) cudaStreamCreate(&stream_points); - if (!are_points_on_device) { - // copy points to gpu - cudaMallocAsync(&d_points, sizeof(A) * points_size, stream_points); - cudaMemcpyAsync(d_points, points, sizeof(A) * points_size, cudaMemcpyHostToDevice, stream_points); - } else { - d_points = points; - } - if (are_scalars_montgomery_form) { - if (are_scalars_on_device) { - S* d_mont_scalars; - cudaMallocAsync(&d_mont_scalars, sizeof(S) * total_size, stream); - mont::FromMontgomery(d_scalars, total_size, stream, d_mont_scalars); - d_scalars = d_mont_scalars; - } else - mont::FromMontgomery(d_scalars, total_size, stream, d_scalars); - } - if (are_points_montgomery_form) { - if (are_points_on_device) { - A* d_mont_points; - cudaMallocAsync(&d_mont_points, sizeof(A) * points_size, stream_points); - mont::FromMontgomery(d_points, points_size, stream_points, d_mont_points); - d_points = d_mont_points; - } else - mont::FromMontgomery(d_points, points_size, stream_points, d_points); - } - cudaEvent_t event_points_uploaded; - if (!are_points_on_device || are_points_montgomery_form) { - cudaEventCreateWithFlags(&event_points_uploaded, cudaEventDisableTiming); - cudaEventRecord(event_points_uploaded, stream_points); - } - - P* buckets; - // compute number of bucket modules and number of buckets in each module - unsigned nof_bms = (bitsize + c - 1) / c; - unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms)); - unsigned nof_buckets = (nof_bms << c); - unsigned total_nof_buckets = nof_buckets * batch_size; - CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream)); - - // lanch the bucket initialization kernel with maximum threads - unsigned NUM_THREADS = 1 << 10; - unsigned NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - initialize_buckets_kernel<<>>(buckets, total_nof_buckets); - - unsigned* bucket_indices; - unsigned* point_indices; - CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * total_size * nof_bms, stream)); - CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * total_size * nof_bms, stream)); - - // split scalars into digits - NUM_THREADS = 1 << 10; - NUM_BLOCKS = (total_size + NUM_THREADS - 1) / NUM_THREADS; - split_scalars_kernel<<>>( - bucket_indices, point_indices, d_scalars, total_size, points_size, msm_size, nof_bms, bm_bitsize, c); - - // sort indices - the indices are sorted from smallest to largest in order to group together the points that - // belong to each bucket - unsigned* sorted_bucket_indices; - unsigned* sorted_point_indices; - CHK_IF_RETURN(cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * (total_size * nof_bms), stream)); - CHK_IF_RETURN(cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * (total_size * nof_bms), stream)); - - unsigned* sort_indices_temp_storage{}; - size_t sort_indices_temp_storage_bytes; - // The second to last parameter is the default value supplied explicitly to allow passing the stream - // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for - // more info - CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs( - sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices, sorted_bucket_indices, - point_indices, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned) * 8, stream)); - CHK_IF_RETURN(cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream)); - // The second to last parameter is the 
default value supplied explicitly to allow passing the stream - // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for - // more info - CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs( - sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices, sorted_bucket_indices, - point_indices, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned) * 8, stream)); - CHK_IF_RETURN(cudaFreeAsync(sort_indices_temp_storage, stream)); - - // find bucket_sizes - unsigned* single_bucket_indices; - unsigned* bucket_sizes; - unsigned* total_nof_buckets_to_compute; - CHK_IF_RETURN(cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * total_nof_buckets, stream)); - CHK_IF_RETURN(cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * total_nof_buckets, stream)); - CHK_IF_RETURN(cudaMallocAsync(&total_nof_buckets_to_compute, sizeof(unsigned), stream)); - unsigned* encode_temp_storage{}; - size_t encode_temp_storage_bytes = 0; - CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode( - encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes, - total_nof_buckets_to_compute, nof_bms * total_size, stream)); - CHK_IF_RETURN(cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream)); - CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode( - encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes, - total_nof_buckets_to_compute, nof_bms * total_size, stream)); - CHK_IF_RETURN(cudaFreeAsync(encode_temp_storage, stream)); - - // get offsets - where does each new bucket begin - unsigned* bucket_offsets; - CHK_IF_RETURN(cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * total_nof_buckets, stream)); - unsigned* offsets_temp_storage{}; - size_t offsets_temp_storage_bytes = 0; - CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum( - offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream)); - CHK_IF_RETURN(cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream)); - CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum( - offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream)); - CHK_IF_RETURN(cudaFreeAsync(offsets_temp_storage, stream)); - - unsigned h_nof_buckets_to_compute; - CHK_IF_RETURN(cudaMemcpyAsync( - &h_nof_buckets_to_compute, total_nof_buckets_to_compute, sizeof(unsigned), cudaMemcpyDeviceToHost, stream)); - - if (!are_points_on_device || are_points_montgomery_form) { - // by this point, points need to be already uploaded and un-Montgomeried - CHK_IF_RETURN(cudaStreamWaitEvent(stream, event_points_uploaded)); - CHK_IF_RETURN(cudaStreamDestroy(stream_points)); - } - - // launch the accumulation kernel with maximum threads - NUM_THREADS = 1 << 8; - NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - accumulate_buckets_kernel<<>>( - buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices, d_points, nof_buckets, - h_nof_buckets_to_compute, c + bm_bitsize, c); - - // #ifdef SSM_SUM - // //sum each bucket - // NUM_THREADS = 1 << 10; - // NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - // ssm_buckets_kernel<<>>(buckets, single_bucket_indices, nof_buckets, c); - - // //sum each bucket module - // P* final_results; - // cudaMalloc(&final_results, sizeof(P) * nof_bms); - // NUM_THREADS = 1<>>(buckets, final_results); - // #endif - - // #ifdef BIG_TRIANGLE - P* bm_sums; - 
CHK_IF_RETURN(cudaMallocAsync(&bm_sums, sizeof(P) * nof_bms * batch_size, stream)); - // launch the bucket module sum kernel - a thread for each bucket module - NUM_THREADS = 1 << 8; - NUM_BLOCKS = (nof_bms * batch_size + NUM_THREADS - 1) / NUM_THREADS; - big_triangle_sum_kernel<<>>(buckets, bm_sums, nof_bms * batch_size, c); - // #endif - - P* d_final_results; - if (!are_results_on_device) CHK_IF_RETURN(cudaMallocAsync(&d_final_results, sizeof(P) * batch_size, stream)); - - // launch the double and add kernel, a single thread for each msm - NUM_THREADS = 1 << 8; - NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS; - final_accumulation_kernel<<>>( - bm_sums, are_results_on_device ? final_results : d_final_results, batch_size, nof_bms, c); - - // copy final result to host - if (!are_results_on_device) - CHK_IF_RETURN( - cudaMemcpyAsync(final_results, d_final_results, sizeof(P) * batch_size, cudaMemcpyDeviceToHost, stream)); - - // free memory - if (!are_scalars_on_device) CHK_IF_RETURN(cudaFreeAsync(d_scalars, stream)); - if (!are_points_on_device) CHK_IF_RETURN(cudaFreeAsync(d_points, stream)); - if (!are_results_on_device) CHK_IF_RETURN(cudaFreeAsync(d_final_results, stream)); - CHK_IF_RETURN(cudaFreeAsync(buckets, stream)); - CHK_IF_RETURN(cudaFreeAsync(bucket_indices, stream)); - CHK_IF_RETURN(cudaFreeAsync(point_indices, stream)); - CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_indices, stream)); - CHK_IF_RETURN(cudaFreeAsync(sorted_point_indices, stream)); - CHK_IF_RETURN(cudaFreeAsync(single_bucket_indices, stream)); - CHK_IF_RETURN(cudaFreeAsync(bucket_sizes, stream)); - CHK_IF_RETURN(cudaFreeAsync(total_nof_buckets_to_compute, stream)); - CHK_IF_RETURN(cudaFreeAsync(bucket_offsets, stream)); - CHK_IF_RETURN(cudaFreeAsync(bm_sums, stream)); - - if (is_async) return CHK_LAST(); - return CHK_STICKY(cudaStreamSynchronize(stream)); - } - } // namespace extern "C" MSMConfig CONCAT_EXPAND(CURVE, DefaultMSMConfig)() @@ -992,21 +790,22 @@ namespace msm { template cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig& config, P* results) { - int bitsize = (config.bitsize == 0) ? S::NBITS : config.bitsize; + const int bitsize = (config.bitsize == 0) ? S::NBITS : config.bitsize; cudaStream_t& stream = config.ctx.stream; - // TODO: DmytroTym/HadarIngonyama - unify the implementation of the bucket method and the batched bucket method in - // one function - if (config.batch_size == 1) - return CHK_STICKY(bucket_method_msm( - bitsize, 16, scalars, points, msm_size, results, config.are_scalars_on_device, - config.are_scalars_montgomery_form, config.are_points_on_device, config.are_points_montgomery_form, - config.are_results_on_device, config.is_big_triangle, config.large_bucket_factor, config.is_async, stream)); - else - return CHK_STICKY(batched_bucket_method_msm( - bitsize, (config.c == 0) ? get_optimal_c(msm_size) : config.c, scalars, points, config.batch_size, msm_size, - (config.points_size == 0) ? msm_size : config.points_size, results, config.are_scalars_on_device, - config.are_scalars_montgomery_form, config.are_points_on_device, config.are_points_montgomery_form, - config.are_results_on_device, config.is_async, stream)); + + unsigned c = config.batch_size > 1 ? ((config.c == 0) ? 
get_optimal_c(msm_size) : config.c) : 16; + // reduce c to the nearest power of two (rounding down) if not using the big_triangle reduction logic + // TODO: support arbitrary values of c + if (!config.is_big_triangle) { + while ((c & (c - 1)) != 0) + c &= (c - 1); + } + + return CHK_STICKY(bucket_method_msm( + bitsize, c, scalars, points, config.batch_size, msm_size, + (config.points_size == 0) ? msm_size : config.points_size, results, config.are_scalars_on_device, + config.are_scalars_montgomery_form, config.are_points_on_device, config.are_points_montgomery_form, + config.are_results_on_device, config.is_big_triangle, config.large_bucket_factor, config.is_async, stream)); } /**
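// Aside (not part of the patch): a minimal standalone sketch of the `c &= (c - 1)` loop used above in MSM().
// Repeatedly clearing the lowest set bit rounds an unsigned value down to the largest power of two that does
// not exceed it. The helper name round_down_to_pow2 is illustrative only and not part of the codebase.
#include <cassert>

unsigned round_down_to_pow2(unsigned x) // assumes x > 0
{
  while ((x & (x - 1)) != 0) // more than one bit set?
    x &= (x - 1);            // clear the lowest set bit
  return x;
}

int main()
{
  assert(round_down_to_pow2(16) == 16); // already a power of two
  assert(round_down_to_pow2(13) == 8);  // 0b1101 -> 0b1000
  assert(round_down_to_pow2(1) == 1);   // smallest valid input
  return 0;
}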