diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h index db4aeb4efc..21ed4e6e00 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h @@ -38,6 +38,7 @@ template struct int_compression { scratch_packing_keyswitch_lwe_list_to_glwe_64( streams[0], gpu_indexes[0], &fp_ks_buffer, + compression_params.small_lwe_dimension, compression_params.glwe_dimension, compression_params.polynomial_size, num_radix_blocks, true); } diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 00e85e5246..0df0d90bc4 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -21,8 +21,8 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void scratch_packing_keyswitch_lwe_list_to_glwe_64( void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes, - bool allocate_gpu_memory); + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t num_lwes, bool allocate_gpu_memory); void cuda_packing_keyswitch_lwe_list_to_glwe_64( void *stream, uint32_t gpu_index, void *glwe_array_out, diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh new file mode 100644 index 0000000000..f3fe542dd9 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh @@ -0,0 +1,358 @@ +#ifndef CNCRT_FAST_KS_CUH +#define CNCRT_FAST_KS_CUH + +#undef NDEBUG +#include + +#include "device.h" +#include "gadget.cuh" +#include "helper_multi_gpu.h" +#include "keyswitch.cuh" +#include "polynomial/functions.cuh" +#include "polynomial/polynomial_math.cuh" +#include "torus.cuh" +#include "utils/helper.cuh" +#include "utils/kernel_dimensions.cuh" +#include +#include + +#define CEIL_DIV(M, N) ((M) + (N)-1) / (N) + +const int BLOCK_SIZE_GEMM = 64; +const int THREADS_GEMM = 8; +const int BLOCK_SIZE_DECOMP = 8; + +template uint64_t get_shared_mem_size_tgemm() { + return BLOCK_SIZE_GEMM * THREADS_GEMM * 2 * sizeof(Torus); +} + +__host__ inline bool can_use_pks_fast_path(uint32_t lwe_dimension, + uint32_t num_lwe, + uint32_t polynomial_size, + uint32_t level_count, + uint32_t glwe_dimension) { + // TODO: Generalize to level_count > 1 by transposing the KSK + return level_count == 1; +} + +// Initialize decomposition by performing rounding +// and decomposing one level of an array of Torus LWEs. Only +// decomposes the mask elements of the incoming LWEs. +template +__global__ void decompose_vectorize_init(Torus const *lwe_in, Torus *lwe_out, + uint32_t lwe_dimension, + uint32_t num_lwe, uint32_t base_log, + uint32_t level_count) { + + // index of this LWE ct in the buffer + auto lwe_idx = blockIdx.x * blockDim.x + threadIdx.x; + // index of the LWE sample in the LWE ct + auto lwe_sample_idx = blockIdx.y * blockDim.y + threadIdx.y; + + if (lwe_idx >= num_lwe || lwe_sample_idx >= lwe_dimension) + return; + + // Input LWE array is [mask_0, .., mask_lwe_dim, message] and + // we only decompose the mask. 
Thus the stride for reading + // is lwe_dimension + 1, while for writing it is lwe_dimension + auto read_val_idx = lwe_idx * (lwe_dimension + 1) + lwe_sample_idx; + auto write_val_idx = lwe_idx * lwe_dimension + lwe_sample_idx; + + Torus a_i = lwe_in[read_val_idx]; + + Torus state = init_decomposer_state(a_i, base_log, level_count); + + Torus mod_b_mask = (1ll << base_log) - 1ll; + lwe_out[write_val_idx] = decompose_one(state, mod_b_mask, base_log); +} + +// Continue decomposiion of an array of Torus elements in place. Supposes +// that the array contains already decomposed elements and +// computes the new decomposed level in place. +template +__global__ void +decompose_vectorize_step_inplace(Torus *buffer_in, uint32_t lwe_dimension, + uint32_t num_lwe, uint32_t base_log, + uint32_t level_count) { + + // index of this LWE ct in the buffer + auto lwe_idx = blockIdx.x * blockDim.x + threadIdx.x; + // index of the LWE sample in the LWE ct + auto lwe_sample_idx = blockIdx.y * blockDim.y + threadIdx.y; + + if (lwe_idx >= num_lwe || lwe_sample_idx >= lwe_dimension) + return; + + auto val_idx = lwe_idx * lwe_dimension + lwe_sample_idx; + + Torus state = buffer_in[val_idx]; + + Torus mod_b_mask = (1ll << base_log) - 1ll; + + buffer_in[val_idx] = decompose_one(state, mod_b_mask, base_log); +} + +// Multiply matrices A, B of size (M, K), (K, N) respectively +// with K as the inner dimension. +// +// A block of threads processeds blocks of size (BLOCK_SIZE_GEMM, +// BLOCK_SIZE_GEMM) splitting them in multiple tiles: (BLOCK_SIZE_GEMM, +// THREADS_GEMM)-shaped tiles of values from A, and a (THREADS_GEMM, +// BLOCK_SIZE_GEMM)-shaped tiles of values from B. +template +__global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B, + int stride_B, Torus *C) { + + const int BM = BLOCK_SIZE_GEMM; + const int BN = BLOCK_SIZE_GEMM; + const int BK = THREADS_GEMM; + const int TM = THREADS_GEMM; + + const uint cRow = blockIdx.y; + const uint cCol = blockIdx.x; + + const uint totalResultsBlocktile = BM * BN; + const int threadCol = threadIdx.x % BN; + const int threadRow = threadIdx.x / BN; + + // Allocate space for the current block tile in shared memory + __shared__ Torus As[BM * BK]; + __shared__ Torus Bs[BK * BN]; + + // Initialize the pointers to the input blocks from A, B + // Tiles from these blocks are loaded to shared memory + A += cRow * BM * K; + B += cCol * BN; + + // Each thread will handle multiple sub-blocks + const uint innerColA = threadIdx.x % BK; + const uint innerRowA = threadIdx.x / BK; + const uint innerColB = threadIdx.x % BN; + const uint innerRowB = threadIdx.x / BN; + + // allocate thread-local cache for results in registerfile + Torus threadResults[TM] = {0}; + + auto row_A = cRow * BM + innerRowA; + auto col_B = cCol * BN + innerColB; + + // For each thread, loop over block tiles + for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) { + auto col_A = bkIdx + innerColA; + auto row_B = bkIdx + innerRowB; + + if (row_A < M && col_A < K) { + As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA]; + } else { + As[innerRowA * BK + innerColA] = 0; + } + + if (col_B < N && row_B < K) { + Bs[innerRowB * BN + innerColB] = B[innerRowB * stride_B + innerColB]; + } else { + Bs[innerRowB * BN + innerColB] = 0; + } + __syncthreads(); + + // Advance blocktile for the next iteration of this loop + A += BK; + B += BK * stride_B; + + // calculate per-thread results + for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) { + // we make the dotproduct loop the outside loop, which facilitates + // reuse 
of the Bs entry, which we can cache in a tmp var. + Torus tmp = Bs[dotIdx * BN + threadCol]; + for (uint resIdx = 0; resIdx < TM; ++resIdx) { + threadResults[resIdx] += + As[(threadRow * TM + resIdx) * BK + dotIdx] * tmp; + } + } + __syncthreads(); + } + + // Initialize the pointer to the output block of size (BLOCK_SIZE_GEMM, + // BLOCK_SIZE_GEMM) + C += cRow * BM * N + cCol * BN; + + // write out the results + for (uint resIdx = 0; resIdx < TM; ++resIdx) { + int outRow = cRow * BM + threadRow * TM + resIdx; + int outCol = cCol * BN + threadCol; + + if (outRow >= M) + continue; + if (outCol >= N) + continue; + + C[(threadRow * TM + resIdx) * N + threadCol] += threadResults[resIdx]; + } +} + +// Finish the keyswitching operation and prepare GLWEs for accumulation. +// 1. Finish the keyswitching computation partially performed with a GEMM: +// - negate the dot product between the GLWE and KSK polynomial +// - add the GLWE message for the N-th polynomial coeff in the message poly +// 2. Rotate each of the GLWE . KSK poly dot products to +// prepare them for accumulation into a single GLWE +template +__global__ void polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C( + Torus *in_glwe_buffer, Torus *out_glwe_buffer, Torus const *lwe_array, + uint32_t lwe_dimension, uint32_t num_glwes, uint32_t polynomial_size, + uint32_t glwe_dimension) { + + uint32_t glwe_id = blockIdx.x * blockDim.x + threadIdx.x; + uint32_t degree = glwe_id; // lwe 0 rotate 0, lwe 1 rotate 1, .. , lwe + // poly_size-1 rotate poly_size-1 + uint32_t coeffIdx = blockIdx.y * blockDim.y + threadIdx.y; + + if (glwe_id >= num_glwes) + return; + if (coeffIdx >= polynomial_size) + return; + + auto in_poly = + in_glwe_buffer + glwe_id * polynomial_size * (glwe_dimension + 1); + auto out_result = + out_glwe_buffer + glwe_id * polynomial_size * (glwe_dimension + 1); + if (coeffIdx == 0) { + // Add the message value of the input LWE (`C`) to the N-th coefficient + // in the GLWE . KSK dot product + + // The C is added to the first position of the last polynomial in the GLWE + // which has (glwe_dimension+1) polynomials + // The C value is extracted as the last value of the LWE ct. 
(of index + // glwe_id) the LWEs have (polynomial_size + 1) values + in_poly[polynomial_size * glwe_dimension] = + lwe_array[glwe_id * (lwe_dimension + 1) + lwe_dimension] - + in_poly[polynomial_size * glwe_dimension]; + + for (int gi = 1; gi < glwe_dimension; ++gi) + in_poly[coeffIdx + gi * polynomial_size] = + -in_poly[coeffIdx + gi * polynomial_size]; + + } else { + // Otherwise simply negate the input coefficient + for (int gi = 1; gi < glwe_dimension + 1; ++gi) + in_poly[coeffIdx + gi * polynomial_size] = + -in_poly[coeffIdx + gi * polynomial_size]; + } + // Negate all the coefficients for rotation for the first poly + in_poly[coeffIdx] = -in_poly[coeffIdx]; + + // rotate the body + polynomial_accumulate_monic_monomial_mul( + out_result, in_poly, degree, coeffIdx, polynomial_size, 1, true); + // rotate the mask too + for (int gi = 1; gi < glwe_dimension + 1; ++gi) + polynomial_accumulate_monic_monomial_mul( + out_result + gi * polynomial_size, in_poly + gi * polynomial_size, + degree, coeffIdx, polynomial_size, 1, true); +} + +template +__host__ void host_fast_packing_keyswitch_lwe_list_to_glwe( + cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out, + Torus const *lwe_array_in, Torus const *fp_ksk_array, int8_t *fp_ks_buffer, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes) { + + // Optimization of packing keyswitch when packing many LWEs + + if (level_count > 1) { + PANIC("Fast path PKS only supports level_count==1"); + } + + cudaSetDevice(gpu_index); + check_cuda_error(cudaGetLastError()); + + int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + + // The fast path of PKS uses the scratch buffer (d_mem) differently than the + // old path: it needs to store the decomposed masks in the first half of this + // buffer and the keyswitched GLWEs in the second half of the buffer. Thus the + // scratch buffer for the fast path must determine the half-size of the + // scratch buffer as the max between the size of the GLWE and the size of the + // LWE-mask + int memory_unit = glwe_accumulator_size > lwe_dimension + ? glwe_accumulator_size + : lwe_dimension; + + // ping pong the buffer between successive calls + // split the buffer in two parts of this size + auto d_mem_0 = (Torus *)fp_ks_buffer; + auto d_mem_1 = d_mem_0 + num_lwes * memory_unit; + + // Set the scratch buffer to 0 as it is used to accumulate + // decomposition temporary results + cuda_memset_async(d_mem_1, 0, num_lwes * memory_unit * sizeof(Torus), stream, + gpu_index); + check_cuda_error(cudaGetLastError()); + + // decompose LWEs + // don't decompose LWE body - the LWE has lwe_size + 1 elements. 
The last + // element, the body is ignored by rounding down the number of blocks assuming + // here that the LWE dimension is a multiple of the block size + dim3 grid_decomp(CEIL_DIV(num_lwes, BLOCK_SIZE_DECOMP), + CEIL_DIV(lwe_dimension, BLOCK_SIZE_DECOMP)); + dim3 threads_decomp(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP); + + // decompose first level + decompose_vectorize_init + <<>>(lwe_array_in, d_mem_0, + lwe_dimension, num_lwes, + base_log, level_count); + check_cuda_error(cudaGetLastError()); + + // gemm to ks the individual LWEs to GLWEs + dim3 grid_gemm(CEIL_DIV(glwe_accumulator_size, BLOCK_SIZE_GEMM), + CEIL_DIV(num_lwes, BLOCK_SIZE_GEMM)); + dim3 threads_gemm(BLOCK_SIZE_GEMM * THREADS_GEMM); + + auto stride_KSK_buffer = glwe_accumulator_size; + + uint32_t shared_mem_size = get_shared_mem_size_tgemm(); + tgemm<<>>( + num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array, + stride_KSK_buffer, d_mem_1); + check_cuda_error(cudaGetLastError()); + + /* + TODO: transpose key to generalize to level_count > 1 + + for (int li = 1; li < level_count; ++li) { + decompose_vectorize_step_inplace + <<>>( + d_mem_0, lwe_dimension, num_lwes, base_log, level_count); + check_cuda_error(cudaGetLastError()); + + tgemm<<>>( num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, + fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1); + check_cuda_error(cudaGetLastError()); + } + */ + + // should we include the mask in the rotation ?? + dim3 grid_rotate(CEIL_DIV(num_lwes, BLOCK_SIZE_DECOMP), + CEIL_DIV(polynomial_size, BLOCK_SIZE_DECOMP)); + dim3 threads_rotate(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP); + // rotate the GLWEs + polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C + <<>>( + d_mem_1, d_mem_0, lwe_array_in, lwe_dimension, num_lwes, + polynomial_size, glwe_dimension); + check_cuda_error(cudaGetLastError()); + + dim3 grid_accumulate( + CEIL_DIV(polynomial_size * (glwe_dimension + 1), BLOCK_SIZE_DECOMP)); + dim3 threads_accum(BLOCK_SIZE_DECOMP); + + // accumulate to a single glwe + accumulate_glwes<<>>( + glwe_out, d_mem_0, glwe_dimension, polynomial_size, num_lwes); + + check_cuda_error(cudaGetLastError()); +} + +#endif diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index ec1f6bf022..b3f63176df 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -1,6 +1,8 @@ +#include "fast_packing_keyswitch.cuh" #include "keyswitch.cuh" #include "keyswitch.h" #include +#include /* Perform keyswitch on a batch of 32 bits input LWE ciphertexts. * Head out to the equivalent operation on 64 bits for more details. @@ -53,15 +55,17 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void scratch_packing_keyswitch_lwe_list_to_glwe_64( void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes, - bool allocate_gpu_memory) { + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t num_lwes, bool allocate_gpu_memory) { scratch_packing_keyswitch_lwe_list_to_glwe( - static_cast(stream), gpu_index, fp_ks_buffer, + static_cast(stream), gpu_index, fp_ks_buffer, lwe_dimension, glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory); } + /* Perform functional packing keyswitch on a batch of 64 bits input LWE * ciphertexts. 
*/ + void cuda_packing_keyswitch_lwe_list_to_glwe_64( void *stream, uint32_t gpu_index, void *glwe_array_out, void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer, @@ -69,13 +73,24 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64( uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_lwes) { - host_packing_keyswitch_lwe_list_to_glwe( - static_cast(stream), gpu_index, - static_cast(glwe_array_out), - static_cast(lwe_array_in), - static_cast(fp_ksk_array), fp_ks_buffer, - input_lwe_dimension, output_glwe_dimension, output_polynomial_size, - base_log, level_count, num_lwes); + if (can_use_pks_fast_path(input_lwe_dimension, num_lwes, + output_polynomial_size, level_count, + output_glwe_dimension)) { + host_fast_packing_keyswitch_lwe_list_to_glwe( + static_cast(stream), gpu_index, + static_cast(glwe_array_out), + static_cast(lwe_array_in), + static_cast(fp_ksk_array), fp_ks_buffer, + input_lwe_dimension, output_glwe_dimension, output_polynomial_size, + base_log, level_count, num_lwes); + } else + host_packing_keyswitch_lwe_list_to_glwe( + static_cast(stream), gpu_index, + static_cast(glwe_array_out), + static_cast(lwe_array_in), + static_cast(fp_ksk_array), fp_ks_buffer, + input_lwe_dimension, output_glwe_dimension, output_polynomial_size, + base_log, level_count, num_lwes); } void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream, diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh index d4c77fb69c..5b79e5b892 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh @@ -158,16 +158,20 @@ void execute_keyswitch_async(cudaStream_t const *streams, template __host__ void scratch_packing_keyswitch_lwe_list_to_glwe( cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes, - bool allocate_gpu_memory) { + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t num_lwes, bool allocate_gpu_memory) { cudaSetDevice(gpu_index); int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; - if (allocate_gpu_memory) + int memory_unit = glwe_accumulator_size > lwe_dimension + ? glwe_accumulator_size + : lwe_dimension; + + if (allocate_gpu_memory) { *fp_ks_buffer = (int8_t *)cuda_malloc_async( - 2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream, - gpu_index); + 2 * num_lwes * memory_unit * sizeof(Torus), stream, gpu_index); + } } // public functional packing keyswitch for a single LWE ciphertext @@ -241,6 +245,7 @@ __global__ void packing_keyswitch_lwe_list_to_glwe( auto lwe_in = lwe_array_in + input_id * lwe_size; auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size; auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size; + // KS LWE to GLWE packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext( ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension, @@ -293,8 +298,18 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe( dim3 grid(num_blocks, num_lwes); dim3 threads(num_threads); + // The fast path of PKS uses the scratch buffer (d_mem) differently: + // it needs to store the decomposed masks in the first half of this buffer + // and the keyswitched GLWEs in the second half of the buffer. 
Thus the + // scratch buffer for the fast path must determine the half-size of the + // scratch buffer as the max between the size of the GLWE and the size of the + // LWE-mask + int memory_unit = glwe_accumulator_size > lwe_dimension_in + ? glwe_accumulator_size + : lwe_dimension_in; + auto d_mem = (Torus *)fp_ks_buffer; - auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size; + auto d_tmp_glwe_array_out = d_mem + num_lwes * memory_unit; // individually keyswitch each lwe packing_keyswitch_lwe_list_to_glwe<<>>( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index 097dc47f32..ea509006f2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -2,6 +2,7 @@ #define CUDA_INTEGER_COMPRESSION_CUH #include "ciphertext.h" +#include "crypto/fast_packing_keyswitch.cuh" #include "crypto/keyswitch.cuh" #include "device.h" #include "integer/compression/compression.h" @@ -116,11 +117,21 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes, while (rem_lwes > 0) { auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe); - host_packing_keyswitch_lwe_list_to_glwe( - streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], - fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension, - compression_params.polynomial_size, compression_params.ks_base_log, - compression_params.ks_level, chunk_size); + if (can_use_pks_fast_path( + input_lwe_dimension, chunk_size, compression_params.polynomial_size, + compression_params.ks_level, compression_params.glwe_dimension)) { + host_fast_packing_keyswitch_lwe_list_to_glwe( + streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], + fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension, + compression_params.polynomial_size, compression_params.ks_base_log, + compression_params.ks_level, chunk_size); + } else { + host_packing_keyswitch_lwe_list_to_glwe( + streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], + fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension, + compression_params.polynomial_size, compression_params.ks_base_log, + compression_params.ks_level, chunk_size); + } rem_lwes -= chunk_size; lwe_subset += chunk_size * lwe_in_size; diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index 1dfa0f2c2f..983a409338 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -1229,6 +1229,7 @@ extern "C" { stream: *mut ffi::c_void, gpu_index: u32, fp_ks_buffer: *mut *mut i8, + lwe_dimension: u32, glwe_dimension: u32, polynomial_size: u32, num_lwes: u32, diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 5964b4b874..c7fada2674 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -290,6 +290,7 @@ pub unsafe fn packing_keyswitch_list_async( streams.ptr[0], streams.gpu_indexes[0].0, std::ptr::addr_of_mut!(fp_ks_buffer), + input_lwe_dimension.0 as u32, output_glwe_dimension.0 as u32, output_polynomial_size.0 as u32, num_lwes.0 as u32, diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs index ef4f88e13f..1f332ddb28 100644 --- a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs +++ 
b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -440,7 +440,6 @@ impl CudaCompressedCiphertextListBuilder { streams: &CudaStreams, ) -> CudaCompressedCiphertextList { let packed_list = comp_key.compress_ciphertexts_into_list(&self.ciphertexts, streams); - CudaCompressedCiphertextList { packed_list, info: self.info.clone(), @@ -488,6 +487,15 @@ mod tests { use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; use rand::Rng; + use crate::core_crypto::prelude::*; + + use crate::core_crypto::prelude::{CiphertextModulusLog, LweCiphertextCount}; + use crate::shortint::parameters::list_compression::CompressionParameters; + use crate::shortint::parameters::{ + DecompositionBaseLog, DecompositionLevelCount, DynamicDistribution, GlweDimension, + PolynomialSize, + }; + const NB_TESTS: usize = 10; const NB_OPERATOR_TESTS: usize = 10; @@ -717,4 +725,131 @@ mod tests { } } } + + #[test] + fn test_gpu_ciphertext_compression_fast_path() { + /// Implement a test only for the storage of ciphertexts + /// using a custom parameter set which is supported by a fast-path + /// packing keyswitch (only for level_count==1) + const COMP_PARAM_CUSTOM_FAST_PATH: CompressionParameters = CompressionParameters { + br_level: DecompositionLevelCount(1), + br_base_log: DecompositionBaseLog(21), + packing_ks_level: DecompositionLevelCount(1), + packing_ks_base_log: DecompositionBaseLog(19), + packing_ks_polynomial_size: PolynomialSize(2048), + packing_ks_glwe_dimension: GlweDimension(1), + lwe_per_glwe: LweCiphertextCount(2048), + storage_log_modulus: CiphertextModulusLog(55), + packing_ks_key_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev( + StandardDev(2.845267479601915e-15), + ), + }; + + const NUM_BLOCKS: usize = 32; + + let streams = CudaStreams::new_multi_gpu(); + + let (radix_cks, sks) = gen_keys_radix_gpu( + PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, + NUM_BLOCKS, + &streams, + ); + let cks = radix_cks.as_ref(); + + let private_compression_key = cks.new_compression_private_key(COMP_PARAM_CUSTOM_FAST_PATH); + + let (cuda_compression_key, cuda_decompression_key) = + radix_cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams); + + const MAX_NB_MESSAGES: usize = 2 * COMP_PARAM_CUSTOM_FAST_PATH.lwe_per_glwe.0 / NUM_BLOCKS; + + let mut rng = rand::thread_rng(); + + let message_modulus: u128 = cks.parameters().message_modulus().0 as u128; + + // Hybrid + enum MessageType { + Unsigned(u128), + Signed(i128), + Boolean(bool), + } + for _ in 0..NB_OPERATOR_TESTS { + let mut builder = CudaCompressedCiphertextListBuilder::new(); + + let nb_messages = rng.gen_range(1..=MAX_NB_MESSAGES as u64); + let mut messages = vec![]; + for _ in 0..nb_messages { + let case_selector = rng.gen_range(0..3); + match case_selector { + 0 => { + // Unsigned + let modulus = message_modulus.pow(NUM_BLOCKS as u32); + let message = rng.gen::() % modulus; + let ct = radix_cks.encrypt(message); + let d_ct = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &streams); + let d_and_ct = sks.bitand(&d_ct, &d_ct, &streams); + builder.push(d_and_ct, &streams); + messages.push(MessageType::Unsigned(message)); + } + 1 => { + // Signed + let modulus = message_modulus.pow((NUM_BLOCKS - 1) as u32) as i128; + let message = rng.gen::() % modulus; + let ct = radix_cks.encrypt_signed(message); + let d_ct = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct, &streams); + let d_and_ct = sks.bitand(&d_ct, &d_ct, &streams); + builder.push(d_and_ct, &streams); + 
messages.push(MessageType::Signed(message)); + } + _ => { + // Boolean + let message = rng.gen::() % 2 != 0; + let ct = radix_cks.encrypt_bool(message); + let d_boolean_ct = CudaBooleanBlock::from_boolean_block(&ct, &streams); + let d_ct = d_boolean_ct.0; + let d_and_boolean_ct = + CudaBooleanBlock::from_cuda_radix_ciphertext(d_ct.ciphertext); + builder.push(d_and_boolean_ct, &streams); + messages.push(MessageType::Boolean(message)); + } + } + } + + let cuda_compressed = builder.build(&cuda_compression_key, &streams); + + for (i, val) in messages.iter().enumerate() { + match val { + MessageType::Unsigned(message) => { + let d_decompressed: CudaUnsignedRadixCiphertext = cuda_compressed + .get(i, &cuda_decompression_key, &streams) + .unwrap() + .unwrap(); + let decompressed = d_decompressed.to_radix_ciphertext(&streams); + let decrypted: u128 = radix_cks.decrypt(&decompressed); + assert_eq!(decrypted, *message); + } + MessageType::Signed(message) => { + let d_decompressed: CudaSignedRadixCiphertext = cuda_compressed + .get(i, &cuda_decompression_key, &streams) + .unwrap() + .unwrap(); + let decompressed = d_decompressed.to_signed_radix_ciphertext(&streams); + let decrypted: i128 = radix_cks.decrypt_signed(&decompressed); + assert_eq!(decrypted, *message); + } + MessageType::Boolean(message) => { + let d_decompressed: CudaBooleanBlock = cuda_compressed + .get(i, &cuda_decompression_key, &streams) + .unwrap() + .unwrap(); + let decompressed = d_decompressed.to_boolean_block(&streams); + let decrypted = radix_cks.decrypt_bool(&decompressed); + assert_eq!(decrypted, *message); + } + } + } + } + } }
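
The kernels above implement the packing-keyswitch fast path in three stages: a single-level decomposition of the LWE masks, a tiled GEMM against the keyswitch key, and a rotate-and-accumulate pass over the per-LWE GLWEs. The CPU-side sketches below illustrate those stages under stated assumptions; they are not the backend implementation.

First, a minimal sketch of the per-element decomposition performed by decompose_vectorize_init, assuming the usual gadget convention for level_count == 1 (round to the closest multiple of 2^(64 - base_log), keep the top base_log bits as the digit). The real kernels go through init_decomposer_state / decompose_one, which also use a balanced signed-digit representation; this unsigned variant is only illustrative.

    #include <cstdint>

    // Decompose one mask element into a single base_log-bit digit (level_count == 1).
    // Assumes 0 < base_log < 64.
    uint64_t decompose_single_level(uint64_t mask_element, uint32_t base_log) {
      uint32_t non_rep_bits = 64 - base_log;                       // bits dropped by rounding
      uint64_t round_bit = (mask_element >> (non_rep_bits - 1)) & 1;
      uint64_t state = (mask_element >> non_rep_bits) + round_bit; // closest representable, scaled down
      return state & ((uint64_t(1) << base_log) - 1);              // the single digit
    }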
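
The tgemm launch treats the batched keyswitch as one matrix product with M = num_lwes, N = glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size and inner dimension K = lwe_dimension, reading the KSK as a row-major matrix with row stride stride_B = glwe_accumulator_size. A naive CPU reference of that product (what the tiled kernel computes, without the shared-memory blocking) could look as follows; the flat vector layouts are assumptions made for the sketch.

    #include <cstdint>
    #include <vector>

    using Torus = uint64_t; // unsigned wrap-around matches torus arithmetic mod 2^64

    // C[i][j] += sum_k A[i][k] * B[k][j], accumulated into per-LWE GLWE buffers.
    void reference_pks_gemm(const std::vector<Torus> &decomposed_masks, // num_lwes x lwe_dim (A)
                            const std::vector<Torus> &fp_ksk,           // lwe_dim x glwe_size (B)
                            std::vector<Torus> &per_lwe_glwes,          // num_lwes x glwe_size (C)
                            size_t num_lwes, size_t lwe_dim, size_t glwe_size) {
      for (size_t i = 0; i < num_lwes; ++i)
        for (size_t k = 0; k < lwe_dim; ++k) {
          Torus a_ik = decomposed_masks[i * lwe_dim + k];
          for (size_t j = 0; j < glwe_size; ++j)
            per_lwe_glwes[i * glwe_size + j] += a_ik * fp_ksk[k * glwe_size + j];
        }
    }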
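
After the GEMM, polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C negates each partial result, adds the LWE body, and multiplies GLWE number i by the monomial X^i so that each packed LWE ends up in its own coefficient slot, before accumulate_glwes sums everything into a single GLWE. The sketch below covers only the rotation and accumulation (the negation and body addition are omitted), assuming flat row-major GLWE buffers and negacyclic wrap-around in X^N + 1.

    #include <cstdint>
    #include <vector>

    using Torus = uint64_t;

    void rotate_and_accumulate(const std::vector<Torus> &glwes, // num_lwes x glwe_size
                               std::vector<Torus> &glwe_out,    // glwe_size, zero-initialised
                               size_t num_lwes, size_t glwe_dimension,
                               size_t polynomial_size) {
      size_t glwe_size = (glwe_dimension + 1) * polynomial_size;
      for (size_t i = 0; i < num_lwes; ++i)            // GLWE i is multiplied by X^i
        for (size_t p = 0; p <= glwe_dimension; ++p)   // every polynomial of the GLWE
          for (size_t c = 0; c < polynomial_size; ++c) {
            Torus v = glwes[i * glwe_size + p * polynomial_size + c];
            size_t shifted = c + i;
            bool wrapped = (shifted / polynomial_size) % 2 == 1; // odd wrap flips the sign
            size_t dst = p * polynomial_size + shifted % polynomial_size;
            glwe_out[dst] += wrapped ? Torus(0) - v : v;
          }
    }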
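
On the host side, the scratch allocation now also depends on the input LWE dimension: each half of the ping-pong buffer must hold either num_lwes decomposed masks or num_lwes keyswitched GLWEs, whichever is larger. The helper below recomputes the size allocated by scratch_packing_keyswitch_lwe_list_to_glwe_64 for 64-bit torus elements; the declaration is copied from keyswitch.h, and the commented call only illustrates the updated argument order (the stream, GPU index, and parameter values are hypothetical placeholders).

    #include <cstddef>
    #include <cstdint>

    extern "C" void scratch_packing_keyswitch_lwe_list_to_glwe_64(
        void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
        uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
        uint32_t num_lwes, bool allocate_gpu_memory);

    // Number of bytes backing fp_ks_buffer: two halves of num_lwes * memory_unit elements.
    static size_t expected_scratch_bytes(uint32_t lwe_dimension, uint32_t glwe_dimension,
                                         uint32_t polynomial_size, uint32_t num_lwes) {
      size_t glwe_accumulator_size = (size_t)(glwe_dimension + 1) * polynomial_size;
      size_t memory_unit =
          glwe_accumulator_size > lwe_dimension ? glwe_accumulator_size : lwe_dimension;
      return 2 * (size_t)num_lwes * memory_unit * sizeof(uint64_t);
    }

    // Example call with placeholder values (stream obtained elsewhere):
    //   int8_t *fp_ks_buffer = nullptr;
    //   scratch_packing_keyswitch_lwe_list_to_glwe_64(stream, /*gpu_index=*/0, &fp_ks_buffer,
    //       /*lwe_dimension=*/2048, /*glwe_dimension=*/1, /*polynomial_size=*/2048,
    //       /*num_lwes=*/2048, /*allocate_gpu_memory=*/true);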