diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h index db4aeb4efc..21ed4e6e00 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h @@ -38,6 +38,7 @@ template struct int_compression { scratch_packing_keyswitch_lwe_list_to_glwe_64( streams[0], gpu_indexes[0], &fp_ks_buffer, + compression_params.small_lwe_dimension, compression_params.glwe_dimension, compression_params.polynomial_size, num_radix_blocks, true); } diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 00e85e5246..0df0d90bc4 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -21,8 +21,8 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void scratch_packing_keyswitch_lwe_list_to_glwe_64( void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes, - bool allocate_gpu_memory); + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t num_lwes, bool allocate_gpu_memory); void cuda_packing_keyswitch_lwe_list_to_glwe_64( void *stream, uint32_t gpu_index, void *glwe_array_out, diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh new file mode 100644 index 0000000000..f3fe542dd9 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh @@ -0,0 +1,358 @@ +#ifndef CNCRT_FAST_KS_CUH +#define CNCRT_FAST_KS_CUH + +#undef NDEBUG +#include + +#include "device.h" +#include "gadget.cuh" +#include "helper_multi_gpu.h" +#include "keyswitch.cuh" +#include "polynomial/functions.cuh" +#include "polynomial/polynomial_math.cuh" +#include "torus.cuh" +#include "utils/helper.cuh" +#include "utils/kernel_dimensions.cuh" +#include +#include + +#define CEIL_DIV(M, N) ((M) + (N)-1) / (N) + +const int BLOCK_SIZE_GEMM = 64; +const int THREADS_GEMM = 8; +const int BLOCK_SIZE_DECOMP = 8; + +template uint64_t get_shared_mem_size_tgemm() { + return BLOCK_SIZE_GEMM * THREADS_GEMM * 2 * sizeof(Torus); +} + +__host__ inline bool can_use_pks_fast_path(uint32_t lwe_dimension, + uint32_t num_lwe, + uint32_t polynomial_size, + uint32_t level_count, + uint32_t glwe_dimension) { + // TODO: Generalize to level_count > 1 by transposing the KSK + return level_count == 1; +} + +// Initialize decomposition by performing rounding +// and decomposing one level of an array of Torus LWEs. Only +// decomposes the mask elements of the incoming LWEs. +template +__global__ void decompose_vectorize_init(Torus const *lwe_in, Torus *lwe_out, + uint32_t lwe_dimension, + uint32_t num_lwe, uint32_t base_log, + uint32_t level_count) { + + // index of this LWE ct in the buffer + auto lwe_idx = blockIdx.x * blockDim.x + threadIdx.x; + // index of the LWE sample in the LWE ct + auto lwe_sample_idx = blockIdx.y * blockDim.y + threadIdx.y; + + if (lwe_idx >= num_lwe || lwe_sample_idx >= lwe_dimension) + return; + + // Input LWE array is [mask_0, .., mask_lwe_dim, message] and + // we only decompose the mask. 
Thus the stride for reading + // is lwe_dimension + 1, while for writing it is lwe_dimension + auto read_val_idx = lwe_idx * (lwe_dimension + 1) + lwe_sample_idx; + auto write_val_idx = lwe_idx * lwe_dimension + lwe_sample_idx; + + Torus a_i = lwe_in[read_val_idx]; + + Torus state = init_decomposer_state(a_i, base_log, level_count); + + Torus mod_b_mask = (1ll << base_log) - 1ll; + lwe_out[write_val_idx] = decompose_one(state, mod_b_mask, base_log); +} + +// Continue decomposiion of an array of Torus elements in place. Supposes +// that the array contains already decomposed elements and +// computes the new decomposed level in place. +template +__global__ void +decompose_vectorize_step_inplace(Torus *buffer_in, uint32_t lwe_dimension, + uint32_t num_lwe, uint32_t base_log, + uint32_t level_count) { + + // index of this LWE ct in the buffer + auto lwe_idx = blockIdx.x * blockDim.x + threadIdx.x; + // index of the LWE sample in the LWE ct + auto lwe_sample_idx = blockIdx.y * blockDim.y + threadIdx.y; + + if (lwe_idx >= num_lwe || lwe_sample_idx >= lwe_dimension) + return; + + auto val_idx = lwe_idx * lwe_dimension + lwe_sample_idx; + + Torus state = buffer_in[val_idx]; + + Torus mod_b_mask = (1ll << base_log) - 1ll; + + buffer_in[val_idx] = decompose_one(state, mod_b_mask, base_log); +} + +// Multiply matrices A, B of size (M, K), (K, N) respectively +// with K as the inner dimension. +// +// A block of threads processeds blocks of size (BLOCK_SIZE_GEMM, +// BLOCK_SIZE_GEMM) splitting them in multiple tiles: (BLOCK_SIZE_GEMM, +// THREADS_GEMM)-shaped tiles of values from A, and a (THREADS_GEMM, +// BLOCK_SIZE_GEMM)-shaped tiles of values from B. +template +__global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B, + int stride_B, Torus *C) { + + const int BM = BLOCK_SIZE_GEMM; + const int BN = BLOCK_SIZE_GEMM; + const int BK = THREADS_GEMM; + const int TM = THREADS_GEMM; + + const uint cRow = blockIdx.y; + const uint cCol = blockIdx.x; + + const uint totalResultsBlocktile = BM * BN; + const int threadCol = threadIdx.x % BN; + const int threadRow = threadIdx.x / BN; + + // Allocate space for the current block tile in shared memory + __shared__ Torus As[BM * BK]; + __shared__ Torus Bs[BK * BN]; + + // Initialize the pointers to the input blocks from A, B + // Tiles from these blocks are loaded to shared memory + A += cRow * BM * K; + B += cCol * BN; + + // Each thread will handle multiple sub-blocks + const uint innerColA = threadIdx.x % BK; + const uint innerRowA = threadIdx.x / BK; + const uint innerColB = threadIdx.x % BN; + const uint innerRowB = threadIdx.x / BN; + + // allocate thread-local cache for results in registerfile + Torus threadResults[TM] = {0}; + + auto row_A = cRow * BM + innerRowA; + auto col_B = cCol * BN + innerColB; + + // For each thread, loop over block tiles + for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) { + auto col_A = bkIdx + innerColA; + auto row_B = bkIdx + innerRowB; + + if (row_A < M && col_A < K) { + As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA]; + } else { + As[innerRowA * BK + innerColA] = 0; + } + + if (col_B < N && row_B < K) { + Bs[innerRowB * BN + innerColB] = B[innerRowB * stride_B + innerColB]; + } else { + Bs[innerRowB * BN + innerColB] = 0; + } + __syncthreads(); + + // Advance blocktile for the next iteration of this loop + A += BK; + B += BK * stride_B; + + // calculate per-thread results + for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) { + // we make the dotproduct loop the outside loop, which facilitates + // reuse 
of the Bs entry, which we can cache in a tmp var. + Torus tmp = Bs[dotIdx * BN + threadCol]; + for (uint resIdx = 0; resIdx < TM; ++resIdx) { + threadResults[resIdx] += + As[(threadRow * TM + resIdx) * BK + dotIdx] * tmp; + } + } + __syncthreads(); + } + + // Initialize the pointer to the output block of size (BLOCK_SIZE_GEMM, + // BLOCK_SIZE_GEMM) + C += cRow * BM * N + cCol * BN; + + // write out the results + for (uint resIdx = 0; resIdx < TM; ++resIdx) { + int outRow = cRow * BM + threadRow * TM + resIdx; + int outCol = cCol * BN + threadCol; + + if (outRow >= M) + continue; + if (outCol >= N) + continue; + + C[(threadRow * TM + resIdx) * N + threadCol] += threadResults[resIdx]; + } +} + +// Finish the keyswitching operation and prepare GLWEs for accumulation. +// 1. Finish the keyswitching computation partially performed with a GEMM: +// - negate the dot product between the GLWE and KSK polynomial +// - add the GLWE message for the N-th polynomial coeff in the message poly +// 2. Rotate each of the GLWE . KSK poly dot products to +// prepare them for accumulation into a single GLWE +template +__global__ void polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C( + Torus *in_glwe_buffer, Torus *out_glwe_buffer, Torus const *lwe_array, + uint32_t lwe_dimension, uint32_t num_glwes, uint32_t polynomial_size, + uint32_t glwe_dimension) { + + uint32_t glwe_id = blockIdx.x * blockDim.x + threadIdx.x; + uint32_t degree = glwe_id; // lwe 0 rotate 0, lwe 1 rotate 1, .. , lwe + // poly_size-1 rotate poly_size-1 + uint32_t coeffIdx = blockIdx.y * blockDim.y + threadIdx.y; + + if (glwe_id >= num_glwes) + return; + if (coeffIdx >= polynomial_size) + return; + + auto in_poly = + in_glwe_buffer + glwe_id * polynomial_size * (glwe_dimension + 1); + auto out_result = + out_glwe_buffer + glwe_id * polynomial_size * (glwe_dimension + 1); + if (coeffIdx == 0) { + // Add the message value of the input LWE (`C`) to the N-th coefficient + // in the GLWE . KSK dot product + + // The C is added to the first position of the last polynomial in the GLWE + // which has (glwe_dimension+1) polynomials + // The C value is extracted as the last value of the LWE ct. 
(of index + // glwe_id) the LWEs have (polynomial_size + 1) values + in_poly[polynomial_size * glwe_dimension] = + lwe_array[glwe_id * (lwe_dimension + 1) + lwe_dimension] - + in_poly[polynomial_size * glwe_dimension]; + + for (int gi = 1; gi < glwe_dimension; ++gi) + in_poly[coeffIdx + gi * polynomial_size] = + -in_poly[coeffIdx + gi * polynomial_size]; + + } else { + // Otherwise simply negate the input coefficient + for (int gi = 1; gi < glwe_dimension + 1; ++gi) + in_poly[coeffIdx + gi * polynomial_size] = + -in_poly[coeffIdx + gi * polynomial_size]; + } + // Negate all the coefficients for rotation for the first poly + in_poly[coeffIdx] = -in_poly[coeffIdx]; + + // rotate the body + polynomial_accumulate_monic_monomial_mul( + out_result, in_poly, degree, coeffIdx, polynomial_size, 1, true); + // rotate the mask too + for (int gi = 1; gi < glwe_dimension + 1; ++gi) + polynomial_accumulate_monic_monomial_mul( + out_result + gi * polynomial_size, in_poly + gi * polynomial_size, + degree, coeffIdx, polynomial_size, 1, true); +} + +template +__host__ void host_fast_packing_keyswitch_lwe_list_to_glwe( + cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out, + Torus const *lwe_array_in, Torus const *fp_ksk_array, int8_t *fp_ks_buffer, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes) { + + // Optimization of packing keyswitch when packing many LWEs + + if (level_count > 1) { + PANIC("Fast path PKS only supports level_count==1"); + } + + cudaSetDevice(gpu_index); + check_cuda_error(cudaGetLastError()); + + int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + + // The fast path of PKS uses the scratch buffer (d_mem) differently than the + // old path: it needs to store the decomposed masks in the first half of this + // buffer and the keyswitched GLWEs in the second half of the buffer. Thus the + // scratch buffer for the fast path must determine the half-size of the + // scratch buffer as the max between the size of the GLWE and the size of the + // LWE-mask + int memory_unit = glwe_accumulator_size > lwe_dimension + ? glwe_accumulator_size + : lwe_dimension; + + // ping pong the buffer between successive calls + // split the buffer in two parts of this size + auto d_mem_0 = (Torus *)fp_ks_buffer; + auto d_mem_1 = d_mem_0 + num_lwes * memory_unit; + + // Set the scratch buffer to 0 as it is used to accumulate + // decomposition temporary results + cuda_memset_async(d_mem_1, 0, num_lwes * memory_unit * sizeof(Torus), stream, + gpu_index); + check_cuda_error(cudaGetLastError()); + + // decompose LWEs + // don't decompose LWE body - the LWE has lwe_size + 1 elements. 
The last + // element, the body is ignored by rounding down the number of blocks assuming + // here that the LWE dimension is a multiple of the block size + dim3 grid_decomp(CEIL_DIV(num_lwes, BLOCK_SIZE_DECOMP), + CEIL_DIV(lwe_dimension, BLOCK_SIZE_DECOMP)); + dim3 threads_decomp(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP); + + // decompose first level + decompose_vectorize_init + <<>>(lwe_array_in, d_mem_0, + lwe_dimension, num_lwes, + base_log, level_count); + check_cuda_error(cudaGetLastError()); + + // gemm to ks the individual LWEs to GLWEs + dim3 grid_gemm(CEIL_DIV(glwe_accumulator_size, BLOCK_SIZE_GEMM), + CEIL_DIV(num_lwes, BLOCK_SIZE_GEMM)); + dim3 threads_gemm(BLOCK_SIZE_GEMM * THREADS_GEMM); + + auto stride_KSK_buffer = glwe_accumulator_size; + + uint32_t shared_mem_size = get_shared_mem_size_tgemm(); + tgemm<<>>( + num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array, + stride_KSK_buffer, d_mem_1); + check_cuda_error(cudaGetLastError()); + + /* + TODO: transpose key to generalize to level_count > 1 + + for (int li = 1; li < level_count; ++li) { + decompose_vectorize_step_inplace + <<>>( + d_mem_0, lwe_dimension, num_lwes, base_log, level_count); + check_cuda_error(cudaGetLastError()); + + tgemm<<>>( num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, + fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1); + check_cuda_error(cudaGetLastError()); + } + */ + + // should we include the mask in the rotation ?? + dim3 grid_rotate(CEIL_DIV(num_lwes, BLOCK_SIZE_DECOMP), + CEIL_DIV(polynomial_size, BLOCK_SIZE_DECOMP)); + dim3 threads_rotate(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP); + // rotate the GLWEs + polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C + <<>>( + d_mem_1, d_mem_0, lwe_array_in, lwe_dimension, num_lwes, + polynomial_size, glwe_dimension); + check_cuda_error(cudaGetLastError()); + + dim3 grid_accumulate( + CEIL_DIV(polynomial_size * (glwe_dimension + 1), BLOCK_SIZE_DECOMP)); + dim3 threads_accum(BLOCK_SIZE_DECOMP); + + // accumulate to a single glwe + accumulate_glwes<<>>( + glwe_out, d_mem_0, glwe_dimension, polynomial_size, num_lwes); + + check_cuda_error(cudaGetLastError()); +} + +#endif diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index ec1f6bf022..b3f63176df 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -1,6 +1,8 @@ +#include "fast_packing_keyswitch.cuh" #include "keyswitch.cuh" #include "keyswitch.h" #include +#include /* Perform keyswitch on a batch of 32 bits input LWE ciphertexts. * Head out to the equivalent operation on 64 bits for more details. @@ -53,15 +55,17 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void scratch_packing_keyswitch_lwe_list_to_glwe_64( void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes, - bool allocate_gpu_memory) { + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t num_lwes, bool allocate_gpu_memory) { scratch_packing_keyswitch_lwe_list_to_glwe( - static_cast(stream), gpu_index, fp_ks_buffer, + static_cast(stream), gpu_index, fp_ks_buffer, lwe_dimension, glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory); } + /* Perform functional packing keyswitch on a batch of 64 bits input LWE * ciphertexts. 
*/ + void cuda_packing_keyswitch_lwe_list_to_glwe_64( void *stream, uint32_t gpu_index, void *glwe_array_out, void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer, @@ -69,13 +73,24 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64( uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_lwes) { - host_packing_keyswitch_lwe_list_to_glwe( - static_cast(stream), gpu_index, - static_cast(glwe_array_out), - static_cast(lwe_array_in), - static_cast(fp_ksk_array), fp_ks_buffer, - input_lwe_dimension, output_glwe_dimension, output_polynomial_size, - base_log, level_count, num_lwes); + if (can_use_pks_fast_path(input_lwe_dimension, num_lwes, + output_polynomial_size, level_count, + output_glwe_dimension)) { + host_fast_packing_keyswitch_lwe_list_to_glwe( + static_cast(stream), gpu_index, + static_cast(glwe_array_out), + static_cast(lwe_array_in), + static_cast(fp_ksk_array), fp_ks_buffer, + input_lwe_dimension, output_glwe_dimension, output_polynomial_size, + base_log, level_count, num_lwes); + } else + host_packing_keyswitch_lwe_list_to_glwe( + static_cast(stream), gpu_index, + static_cast(glwe_array_out), + static_cast(lwe_array_in), + static_cast(fp_ksk_array), fp_ks_buffer, + input_lwe_dimension, output_glwe_dimension, output_polynomial_size, + base_log, level_count, num_lwes); } void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream, diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh index d4c77fb69c..5b79e5b892 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh @@ -158,16 +158,20 @@ void execute_keyswitch_async(cudaStream_t const *streams, template __host__ void scratch_packing_keyswitch_lwe_list_to_glwe( cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes, - bool allocate_gpu_memory) { + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t num_lwes, bool allocate_gpu_memory) { cudaSetDevice(gpu_index); int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; - if (allocate_gpu_memory) + int memory_unit = glwe_accumulator_size > lwe_dimension + ? glwe_accumulator_size + : lwe_dimension; + + if (allocate_gpu_memory) { *fp_ks_buffer = (int8_t *)cuda_malloc_async( - 2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream, - gpu_index); + 2 * num_lwes * memory_unit * sizeof(Torus), stream, gpu_index); + } } // public functional packing keyswitch for a single LWE ciphertext @@ -241,6 +245,7 @@ __global__ void packing_keyswitch_lwe_list_to_glwe( auto lwe_in = lwe_array_in + input_id * lwe_size; auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size; auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size; + // KS LWE to GLWE packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext( ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension, @@ -293,8 +298,18 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe( dim3 grid(num_blocks, num_lwes); dim3 threads(num_threads); + // The fast path of PKS uses the scratch buffer (d_mem) differently: + // it needs to store the decomposed masks in the first half of this buffer + // and the keyswitched GLWEs in the second half of the buffer. 
Thus the + // scratch buffer for the fast path must determine the half-size of the + // scratch buffer as the max between the size of the GLWE and the size of the + // LWE-mask + int memory_unit = glwe_accumulator_size > lwe_dimension_in + ? glwe_accumulator_size + : lwe_dimension_in; + auto d_mem = (Torus *)fp_ks_buffer; - auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size; + auto d_tmp_glwe_array_out = d_mem + num_lwes * memory_unit; // individually keyswitch each lwe packing_keyswitch_lwe_list_to_glwe<<>>( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index 097dc47f32..ea509006f2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -2,6 +2,7 @@ #define CUDA_INTEGER_COMPRESSION_CUH #include "ciphertext.h" +#include "crypto/fast_packing_keyswitch.cuh" #include "crypto/keyswitch.cuh" #include "device.h" #include "integer/compression/compression.h" @@ -116,11 +117,21 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes, while (rem_lwes > 0) { auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe); - host_packing_keyswitch_lwe_list_to_glwe( - streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], - fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension, - compression_params.polynomial_size, compression_params.ks_base_log, - compression_params.ks_level, chunk_size); + if (can_use_pks_fast_path( + input_lwe_dimension, chunk_size, compression_params.polynomial_size, + compression_params.ks_level, compression_params.glwe_dimension)) { + host_fast_packing_keyswitch_lwe_list_to_glwe( + streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], + fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension, + compression_params.polynomial_size, compression_params.ks_base_log, + compression_params.ks_level, chunk_size); + } else { + host_packing_keyswitch_lwe_list_to_glwe( + streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], + fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension, + compression_params.polynomial_size, compression_params.ks_base_log, + compression_params.ks_level, chunk_size); + } rem_lwes -= chunk_size; lwe_subset += chunk_size * lwe_in_size; diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index 1dfa0f2c2f..983a409338 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -1229,6 +1229,7 @@ extern "C" { stream: *mut ffi::c_void, gpu_index: u32, fp_ks_buffer: *mut *mut i8, + lwe_dimension: u32, glwe_dimension: u32, polynomial_size: u32, num_lwes: u32, diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 5964b4b874..c7fada2674 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -290,6 +290,7 @@ pub unsafe fn packing_keyswitch_list_async( streams.ptr[0], streams.gpu_indexes[0].0, std::ptr::addr_of_mut!(fp_ks_buffer), + input_lwe_dimension.0 as u32, output_glwe_dimension.0 as u32, output_polynomial_size.0 as u32, num_lwes.0 as u32, diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs index ef4f88e13f..1f332ddb28 100644 --- a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs +++ 
b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -440,7 +440,6 @@ impl CudaCompressedCiphertextListBuilder { streams: &CudaStreams, ) -> CudaCompressedCiphertextList { let packed_list = comp_key.compress_ciphertexts_into_list(&self.ciphertexts, streams); - CudaCompressedCiphertextList { packed_list, info: self.info.clone(), @@ -488,6 +487,15 @@ mod tests { use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; use rand::Rng; + use crate::core_crypto::prelude::*; + + use crate::core_crypto::prelude::{CiphertextModulusLog, LweCiphertextCount}; + use crate::shortint::parameters::list_compression::CompressionParameters; + use crate::shortint::parameters::{ + DecompositionBaseLog, DecompositionLevelCount, DynamicDistribution, GlweDimension, + PolynomialSize, + }; + const NB_TESTS: usize = 10; const NB_OPERATOR_TESTS: usize = 10; @@ -717,4 +725,131 @@ mod tests { } } } + + #[test] + fn test_gpu_ciphertext_compression_fast_path() { + /// Implement a test only for the storage of ciphertexts + /// using a custom parameter set which is supported by a fast-path + /// packing keyswitch (only for level_count==1) + const COMP_PARAM_CUSTOM_FAST_PATH: CompressionParameters = CompressionParameters { + br_level: DecompositionLevelCount(1), + br_base_log: DecompositionBaseLog(21), + packing_ks_level: DecompositionLevelCount(1), + packing_ks_base_log: DecompositionBaseLog(19), + packing_ks_polynomial_size: PolynomialSize(2048), + packing_ks_glwe_dimension: GlweDimension(1), + lwe_per_glwe: LweCiphertextCount(2048), + storage_log_modulus: CiphertextModulusLog(55), + packing_ks_key_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev( + StandardDev(2.845267479601915e-15), + ), + }; + + const NUM_BLOCKS: usize = 32; + + let streams = CudaStreams::new_multi_gpu(); + + let (radix_cks, sks) = gen_keys_radix_gpu( + PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, + NUM_BLOCKS, + &streams, + ); + let cks = radix_cks.as_ref(); + + let private_compression_key = cks.new_compression_private_key(COMP_PARAM_CUSTOM_FAST_PATH); + + let (cuda_compression_key, cuda_decompression_key) = + radix_cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams); + + const MAX_NB_MESSAGES: usize = 2 * COMP_PARAM_CUSTOM_FAST_PATH.lwe_per_glwe.0 / NUM_BLOCKS; + + let mut rng = rand::thread_rng(); + + let message_modulus: u128 = cks.parameters().message_modulus().0 as u128; + + // Hybrid + enum MessageType { + Unsigned(u128), + Signed(i128), + Boolean(bool), + } + for _ in 0..NB_OPERATOR_TESTS { + let mut builder = CudaCompressedCiphertextListBuilder::new(); + + let nb_messages = rng.gen_range(1..=MAX_NB_MESSAGES as u64); + let mut messages = vec![]; + for _ in 0..nb_messages { + let case_selector = rng.gen_range(0..3); + match case_selector { + 0 => { + // Unsigned + let modulus = message_modulus.pow(NUM_BLOCKS as u32); + let message = rng.gen::() % modulus; + let ct = radix_cks.encrypt(message); + let d_ct = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &streams); + let d_and_ct = sks.bitand(&d_ct, &d_ct, &streams); + builder.push(d_and_ct, &streams); + messages.push(MessageType::Unsigned(message)); + } + 1 => { + // Signed + let modulus = message_modulus.pow((NUM_BLOCKS - 1) as u32) as i128; + let message = rng.gen::() % modulus; + let ct = radix_cks.encrypt_signed(message); + let d_ct = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct, &streams); + let d_and_ct = sks.bitand(&d_ct, &d_ct, &streams); + builder.push(d_and_ct, &streams); + 
messages.push(MessageType::Signed(message)); + } + _ => { + // Boolean + let message = rng.gen::() % 2 != 0; + let ct = radix_cks.encrypt_bool(message); + let d_boolean_ct = CudaBooleanBlock::from_boolean_block(&ct, &streams); + let d_ct = d_boolean_ct.0; + let d_and_boolean_ct = + CudaBooleanBlock::from_cuda_radix_ciphertext(d_ct.ciphertext); + builder.push(d_and_boolean_ct, &streams); + messages.push(MessageType::Boolean(message)); + } + } + } + + let cuda_compressed = builder.build(&cuda_compression_key, &streams); + + for (i, val) in messages.iter().enumerate() { + match val { + MessageType::Unsigned(message) => { + let d_decompressed: CudaUnsignedRadixCiphertext = cuda_compressed + .get(i, &cuda_decompression_key, &streams) + .unwrap() + .unwrap(); + let decompressed = d_decompressed.to_radix_ciphertext(&streams); + let decrypted: u128 = radix_cks.decrypt(&decompressed); + assert_eq!(decrypted, *message); + } + MessageType::Signed(message) => { + let d_decompressed: CudaSignedRadixCiphertext = cuda_compressed + .get(i, &cuda_decompression_key, &streams) + .unwrap() + .unwrap(); + let decompressed = d_decompressed.to_signed_radix_ciphertext(&streams); + let decrypted: i128 = radix_cks.decrypt_signed(&decompressed); + assert_eq!(decrypted, *message); + } + MessageType::Boolean(message) => { + let d_decompressed: CudaBooleanBlock = cuda_compressed + .get(i, &cuda_decompression_key, &streams) + .unwrap() + .unwrap(); + let decompressed = d_decompressed.to_boolean_block(&streams); + let decrypted = radix_cks.decrypt_bool(&decompressed); + assert_eq!(decrypted, *message); + } + } + } + } + } }
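
The kernels above implement the packing-keyswitch fast path in three stages: a single-level decomposition of the LWE masks, a tiled GEMM against the keyswitch key, and a rotate-and-accumulate pass over the per-LWE GLWEs. The CPU-side sketches below illustrate those stages under stated assumptions; they are not the backend implementation.

First, a minimal sketch of the per-element decomposition performed by decompose_vectorize_init, assuming the usual gadget convention for level_count == 1 (round to the closest multiple of 2^(64 - base_log), keep the top base_log bits as the digit). The real kernels go through init_decomposer_state / decompose_one, which also use a balanced signed-digit representation; this unsigned variant is only illustrative.

    #include <cstdint>

    // Decompose one mask element into a single base_log-bit digit (level_count == 1).
    // Assumes 0 < base_log < 64.
    uint64_t decompose_single_level(uint64_t mask_element, uint32_t base_log) {
      uint32_t non_rep_bits = 64 - base_log;                       // bits dropped by rounding
      uint64_t round_bit = (mask_element >> (non_rep_bits - 1)) & 1;
      uint64_t state = (mask_element >> non_rep_bits) + round_bit; // closest representable, scaled down
      return state & ((uint64_t(1) << base_log) - 1);              // the single digit
    }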
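
The tgemm launch treats the batched keyswitch as one matrix product with M = num_lwes, N = glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size and inner dimension K = lwe_dimension, reading the KSK as a row-major matrix with row stride stride_B = glwe_accumulator_size. A naive CPU reference of that product (what the tiled kernel computes, without the shared-memory blocking) could look as follows; the flat vector layouts are assumptions made for the sketch.

    #include <cstdint>
    #include <vector>

    using Torus = uint64_t; // unsigned wrap-around matches torus arithmetic mod 2^64

    // C[i][j] += sum_k A[i][k] * B[k][j], accumulated into per-LWE GLWE buffers.
    void reference_pks_gemm(const std::vector<Torus> &decomposed_masks, // num_lwes x lwe_dim (A)
                            const std::vector<Torus> &fp_ksk,           // lwe_dim x glwe_size (B)
                            std::vector<Torus> &per_lwe_glwes,          // num_lwes x glwe_size (C)
                            size_t num_lwes, size_t lwe_dim, size_t glwe_size) {
      for (size_t i = 0; i < num_lwes; ++i)
        for (size_t k = 0; k < lwe_dim; ++k) {
          Torus a_ik = decomposed_masks[i * lwe_dim + k];
          for (size_t j = 0; j < glwe_size; ++j)
            per_lwe_glwes[i * glwe_size + j] += a_ik * fp_ksk[k * glwe_size + j];
        }
    }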
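
After the GEMM, polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C negates each partial result, adds the LWE body, and multiplies GLWE number i by the monomial X^i so that each packed LWE ends up in its own coefficient slot, before accumulate_glwes sums everything into a single GLWE. The sketch below covers only the rotation and accumulation (the negation and body addition are omitted), assuming flat row-major GLWE buffers and negacyclic wrap-around in X^N + 1.

    #include <cstdint>
    #include <vector>

    using Torus = uint64_t;

    void rotate_and_accumulate(const std::vector<Torus> &glwes, // num_lwes x glwe_size
                               std::vector<Torus> &glwe_out,    // glwe_size, zero-initialised
                               size_t num_lwes, size_t glwe_dimension,
                               size_t polynomial_size) {
      size_t glwe_size = (glwe_dimension + 1) * polynomial_size;
      for (size_t i = 0; i < num_lwes; ++i)            // GLWE i is multiplied by X^i
        for (size_t p = 0; p <= glwe_dimension; ++p)   // every polynomial of the GLWE
          for (size_t c = 0; c < polynomial_size; ++c) {
            Torus v = glwes[i * glwe_size + p * polynomial_size + c];
            size_t shifted = c + i;
            bool wrapped = (shifted / polynomial_size) % 2 == 1; // odd wrap flips the sign
            size_t dst = p * polynomial_size + shifted % polynomial_size;
            glwe_out[dst] += wrapped ? Torus(0) - v : v;
          }
    }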
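
On the host side, the scratch allocation now also depends on the input LWE dimension: each half of the ping-pong buffer must hold either num_lwes decomposed masks or num_lwes keyswitched GLWEs, whichever is larger. The helper below recomputes the size allocated by scratch_packing_keyswitch_lwe_list_to_glwe_64 for 64-bit torus elements; the declaration is copied from keyswitch.h, and the commented call only illustrates the updated argument order (the stream, GPU index, and parameter values are hypothetical placeholders).

    #include <cstddef>
    #include <cstdint>

    extern "C" void scratch_packing_keyswitch_lwe_list_to_glwe_64(
        void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
        uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
        uint32_t num_lwes, bool allocate_gpu_memory);

    // Number of bytes backing fp_ks_buffer: two halves of num_lwes * memory_unit elements.
    static size_t expected_scratch_bytes(uint32_t lwe_dimension, uint32_t glwe_dimension,
                                         uint32_t polynomial_size, uint32_t num_lwes) {
      size_t glwe_accumulator_size = (size_t)(glwe_dimension + 1) * polynomial_size;
      size_t memory_unit =
          glwe_accumulator_size > lwe_dimension ? glwe_accumulator_size : lwe_dimension;
      return 2 * (size_t)num_lwes * memory_unit * sizeof(uint64_t);
    }

    // Example call with placeholder values (stream obtained elsewhere):
    //   int8_t *fp_ks_buffer = nullptr;
    //   scratch_packing_keyswitch_lwe_list_to_glwe_64(stream, /*gpu_index=*/0, &fp_ks_buffer,
    //       /*lwe_dimension=*/2048, /*glwe_dimension=*/1, /*polynomial_size=*/2048,
    //       /*num_lwes=*/2048, /*allocate_gpu_memory=*/true);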