diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml
index 293a3d3e861b..6c547b35f548 100644
--- a/.github/workflows/linux-build.yml
+++ b/.github/workflows/linux-build.yml
@@ -61,6 +61,7 @@ jobs:
       VELOX_DEPENDENCY_SOURCE: SYSTEM
       simdjson_SOURCE: BUNDLED
       xsimd_SOURCE: BUNDLED
+      CUDA_VERSION: "11.8"
     steps:
       - uses: actions/checkout@v4
 
@@ -69,6 +70,15 @@ jobs:
         # it doesn't work
         run: git config --global --add safe.directory /__w/velox/velox
 
+      - name: Install Dependencies
+        run: |
+          # Allows to install arbitrary cuda-version whithout needing to update
+          # docker container before. It simplifies testing new/different versions
+          if ! yum list installed cuda-nvcc-$(echo ${CUDA_VERSION} | tr '.' '-') 1>/dev/null; then
+            source scripts/setup-centos8.sh
+            install_cuda ${CUDA_VERSION}
+          fi
+
       - uses: assignUser/stash/restore@v1
         with:
           path: '${{ env.CCACHE_DIR }}'
@@ -81,6 +91,10 @@ jobs:
       - name: Make Release Build
         env:
           MAKEFLAGS: 'NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4'
+          CUDA_ARCHITECTURES: 60
+          CUDA_COMPILER: /usr/local/cuda-${CUDA_VERSION}/bin/nvcc
+          # Without that, nvcc picks /usr/bin/c++ which is GCC 8
+          CUDA_FLAGS: "-ccbin /opt/rh/gcc-toolset-9/root/usr/bin"
         run: |
           EXTRA_CMAKE_FLAGS=(
             "-DVELOX_ENABLE_BENCHMARKS=ON"
@@ -92,6 +106,7 @@ jobs:
             "-DVELOX_ENABLE_ABFS=ON"
             "-DVELOX_ENABLE_SUBSTRAIT=ON"
             "-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON"
+            "-DVELOX_ENABLE_GPU=ON"
           )
           make release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS[*]}"
     
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 53aaf4391775..23a05ef6a37d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -361,6 +361,10 @@ if(${VELOX_ENABLE_GPU})
   enable_language(CUDA)
   # Determine CUDA_ARCHITECTURES automatically.
   cmake_policy(SET CMP0104 NEW)
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    # it will fail later in the build otherwise
+    message(FATAL_ERROR "-DCMAKE_CUDA_ARCHITECTURES= must be set")
+  endif()
   if(CMAKE_BUILD_TYPE MATCHES Debug)
     add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-G>")
   endif()
diff --git a/Makefile b/Makefile
index 2bab08bce1ab..3e8ff17397bd 100644
--- a/Makefile
+++ b/Makefile
@@ -50,6 +50,18 @@ ifdef AZURESDK_ROOT_DIR
 CMAKE_FLAGS += -DAZURESDK_ROOT_DIR=$(AZURESDK_ROOT_DIR)
 endif
 
+ifdef CUDA_ARCHITECTURES
+CMAKE_FLAGS += -DCMAKE_CUDA_ARCHITECTURES="$(CUDA_ARCHITECTURES)"
+endif
+
+ifdef CUDA_COMPILER
+CMAKE_FLAGS += -DCMAKE_CUDA_COMPILER="$(CUDA_COMPILER)"
+endif
+
+ifdef CUDA_FLAGS
+CMAKE_FLAGS += -DCMAKE_CUDA_FLAGS="$(CUDA_FLAGS)"
+endif
+
 # Use Ninja if available. If Ninja is used, pass through parallelism control flags.
 USE_NINJA ?= 1
 ifeq ($(USE_NINJA), 1)
@@ -108,6 +120,14 @@ minimal:				 #: Minimal build
 	$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_BUILD_MINIMAL=ON"
 	$(MAKE) build BUILD_DIR=release
 
+gpu:						 #: Build with GPU support
+	$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON"
+	$(MAKE) build BUILD_DIR=release
+
+gpu_debug:			 #: Build with debugging symbols and GPU support
+	$(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON"
+	$(MAKE) build BUILD_DIR=debug
+
 dwio:						#: Minimal build with dwio enabled.
 	$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} \
 																										    							  -DVELOX_BUILD_MINIMAL_WITH_DWIO=ON"
diff --git a/scripts/setup-centos8.sh b/scripts/setup-centos8.sh
index 0731dafc76c1..1a1157af5089 100755
--- a/scripts/setup-centos8.sh
+++ b/scripts/setup-centos8.sh
@@ -185,6 +185,12 @@ function install_duckdb {
   fi
 }
 
+function install_cuda {
+  # See https://developer.nvidia.com/cuda-downloads
+  dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
+  yum install -y cuda-nvcc-$(echo $1 | tr '.' '-') cuda-cudart-devel-$(echo $1 | tr '.' '-')
+}
+
 function install_velox_deps {
   run_and_time install_velox_deps_from_dnf
   run_and_time install_conda
diff --git a/scripts/setup-ubuntu.sh b/scripts/setup-ubuntu.sh
index d85c9320ed6f..822027ee34f6 100755
--- a/scripts/setup-ubuntu.sh
+++ b/scripts/setup-ubuntu.sh
@@ -141,6 +141,16 @@ function install_conda {
   bash Miniconda3-latest-Linux-$ARCH.sh -b -p $MINICONDA_PATH
 }
 
+function install_cuda {
+  # See https://developer.nvidia.com/cuda-downloads
+  if ! dpkg -l cuda-keyring 1>/dev/null; then
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+    $SUDO dpkg -i cuda-keyring_1.1-1_all.deb
+    rm cuda-keyring_1.1-1_all.deb
+    $SUDO apt update
+  fi
+  $SUDO apt install -y cuda-nvcc-$(echo $1 | tr '.' '-') cuda-cudart-dev-$(echo $1 | tr '.' '-')
+}
 
 function install_velox_deps {
   run_and_time install_velox_deps_from_apt
diff --git a/velox/common/memory/MemoryArbitrator.h b/velox/common/memory/MemoryArbitrator.h
index 10376f34c675..2fa76317dcd2 100644
--- a/velox/common/memory/MemoryArbitrator.h
+++ b/velox/common/memory/MemoryArbitrator.h
@@ -450,3 +450,15 @@ void testingRunArbitration(
     uint64_t targetBytes = 0,
     bool allowSpill = true);
 } // namespace facebook::velox::memory
+
+#if FMT_VERSION < 100100
+template <>
+struct fmt::formatter<facebook::velox::memory::MemoryArbitrator::Stats>
+    : formatter<std::string> {
+  auto format(
+      facebook::velox::memory::MemoryArbitrator::Stats s,
+      format_context& ctx) {
+    return formatter<std::string>::format(s.toString(), ctx);
+  }
+};
+#endif
diff --git a/velox/experimental/gpu/tests/CMakeLists.txt b/velox/experimental/gpu/tests/CMakeLists.txt
index 8eaca86e7b95..4dc31e422fbf 100644
--- a/velox/experimental/gpu/tests/CMakeLists.txt
+++ b/velox/experimental/gpu/tests/CMakeLists.txt
@@ -14,5 +14,3 @@
 
 add_executable(velox_gpu_hash_table_test HashTableTest.cu)
 target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags)
-set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES
-                                                           native)
diff --git a/velox/experimental/gpu/tests/HashTableTest.cu b/velox/experimental/gpu/tests/HashTableTest.cu
index d1d5ac23ef25..6de1378d93a6 100644
--- a/velox/experimental/gpu/tests/HashTableTest.cu
+++ b/velox/experimental/gpu/tests/HashTableTest.cu
@@ -35,7 +35,7 @@ namespace {
 
 constexpr int kBlockSize = 256;
 
-__device__ uint32_t jenkinsRevMix32(uint32_t key) {
+[[maybe_unused]] __device__ uint32_t jenkinsRevMix32(uint32_t key) {
   key += (key << 12); // key *= (1 + (1 << 12))
   key ^= (key >> 22);
   key += (key << 4); // key *= (1 + (1 << 4))
@@ -298,7 +298,7 @@ __global__ void probe<true>(
       j = (j + sizeof(uint32_t)) & tableSizeMask;
       cmpMask = 0xffffffff;
     }
-  end:
+  end:;
   }
 }
 
@@ -627,7 +627,7 @@ __global__ void probePartitioned<true>(
       j = (j + sizeof(uint32_t)) & tableSizeMask;
       cmpMask = 0xffffffff;
     }
-  end:
+  end:;
   }
 }
 
diff --git a/velox/experimental/wave/common/CMakeLists.txt b/velox/experimental/wave/common/CMakeLists.txt
index 205d945331ae..a33fffffba35 100644
--- a/velox/experimental/wave/common/CMakeLists.txt
+++ b/velox/experimental/wave/common/CMakeLists.txt
@@ -15,8 +15,6 @@
 add_library(velox_wave_common GpuArena.cpp Buffer.cpp Cuda.cu Exception.cpp
                               Type.cpp)
 
-set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES native)
-
 target_link_libraries(velox_wave_common velox_exception velox_common_base
                       velox_type)
 
diff --git a/velox/experimental/wave/common/Cuda.cu b/velox/experimental/wave/common/Cuda.cu
index 48a1e6804029..10e716065a24 100644
--- a/velox/experimental/wave/common/Cuda.cu
+++ b/velox/experimental/wave/common/Cuda.cu
@@ -170,8 +170,8 @@ struct EventImpl {
 
 Event::Event(bool withTime) : hasTiming_(withTime) {
   event_ = std::make_unique<EventImpl>();
-  CUDA_CHECK(
-      cudaEventCreate(&event_->event, withTime ? 0 : cudaEventDisableTiming));
+  CUDA_CHECK(cudaEventCreateWithFlags(
+      &event_->event, withTime ? 0 : cudaEventDisableTiming));
 }
 
 Event::~Event() {}
diff --git a/velox/experimental/wave/common/tests/BlockTest.cpp b/velox/experimental/wave/common/tests/BlockTest.cpp
index 6c6d8b20f2f4..012010233e71 100644
--- a/velox/experimental/wave/common/tests/BlockTest.cpp
+++ b/velox/experimental/wave/common/tests/BlockTest.cpp
@@ -59,7 +59,7 @@ TEST_F(BlockTest, boolToIndices) {
   std::vector<int32_t> referenceIndices(kNumFlags);
   std::vector<int32_t> referenceSizes(kNumBlocks);
   uint8_t* flags = flagsBuffer->as<uint8_t>();
-  for (auto i = 0; i < kNumFlags; ++i) {
+  for (auto i = 0ul; i < kNumFlags; ++i) {
     if ((i >> 8) % 17 == 0) {
       flags[i] = 0;
     } else if ((i >> 8) % 23 == 0) {
diff --git a/velox/experimental/wave/common/tests/BlockTest.cu b/velox/experimental/wave/common/tests/BlockTest.cu
index bfc35e7132f0..6b32b9880f12 100644
--- a/velox/experimental/wave/common/tests/BlockTest.cu
+++ b/velox/experimental/wave/common/tests/BlockTest.cu
@@ -11,7 +11,7 @@ __global__ void boolToIndices(
     int32_t** indices,
     int32_t* sizes,
     int64_t* times) {
-  extern __shared__ __align__(alignof(ScanAlgorithm::TempStorage)) char smem[];
+  extern __shared__ char smem[];
   int32_t idx = blockIdx.x;
   // Start cycle timer
   clock_t start = clock();
@@ -42,8 +42,7 @@ void BlockTestStream::testBoolToIndices(
 }
 
 __global__ void sum64(int64_t* numbers, int64_t* results) {
-  extern __shared__ __align__(
-      alignof(cub::BlockReduce<int64_t, 256>::TempStorage)) char smem[];
+  extern __shared__ char smem[];
   int32_t idx = blockIdx.x;
   blockSum<256>(
       [&]() { return numbers[idx * 256 + threadIdx.x]; }, smem, results);
diff --git a/velox/experimental/wave/common/tests/CMakeLists.txt b/velox/experimental/wave/common/tests/CMakeLists.txt
index 159261e72a80..f9d2a3305eec 100644
--- a/velox/experimental/wave/common/tests/CMakeLists.txt
+++ b/velox/experimental/wave/common/tests/CMakeLists.txt
@@ -15,9 +15,6 @@
 add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu
                                       BlockTest.cpp BlockTest.cu)
 
-set_target_properties(velox_wave_common_test PROPERTIES CUDA_ARCHITECTURES
-                                                        native)
-
 add_test(velox_wave_common_test velox_wave_common_test)
 
 target_link_libraries(
diff --git a/velox/experimental/wave/dwio/CMakeLists.txt b/velox/experimental/wave/dwio/CMakeLists.txt
index ea7356914813..109bc63bb10a 100644
--- a/velox/experimental/wave/dwio/CMakeLists.txt
+++ b/velox/experimental/wave/dwio/CMakeLists.txt
@@ -16,3 +16,5 @@ add_subdirectory(decode)
 
 add_library(velox_wave_dwio ColumnReader.cpp FormatData.cpp ReadStream.cpp
                             StructColumnReader.cpp)
+
+target_link_libraries(velox_wave_dwio Folly::folly fmt::fmt xsimd)
diff --git a/velox/experimental/wave/dwio/decode/CMakeLists.txt b/velox/experimental/wave/dwio/decode/CMakeLists.txt
index cb6bedda4d49..412ba83c8bd3 100644
--- a/velox/experimental/wave/dwio/decode/CMakeLists.txt
+++ b/velox/experimental/wave/dwio/decode/CMakeLists.txt
@@ -16,6 +16,4 @@ add_subdirectory(tests)
 
 add_library(velox_wave_decode GpuDecoder.cu)
 
-set_target_properties(velox_wave_decode PROPERTIES CUDA_ARCHITECTURES native)
-
 target_link_libraries(velox_wave_decode velox_wave_common)
diff --git a/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh b/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh
index 401ceb83e8d1..62c23facac15 100644
--- a/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh
+++ b/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh
@@ -170,8 +170,7 @@ __device__ int scatterIndices(
     int32_t end,
     int32_t* indices) {
   typedef cub::BlockScan<int32_t, kBlockSize> BlockScan;
-  extern __shared__ __align__(
-      alignof(typename BlockScan::TempStorage)) char smem[];
+  extern __shared__ char smem[];
   auto* scanStorage = reinterpret_cast<typename BlockScan::TempStorage*>(smem);
   int numMatch;
   bool match;
@@ -198,8 +197,7 @@ __device__ int scatterIndices(
     int32_t end,
     int32_t* indices) {
   typedef cub::BlockScan<int32_t, kBlockSize> BlockScan;
-  extern __shared__ __align__(
-      alignof(typename BlockScan::TempStorage)) char smem[];
+  extern __shared__ char smem[];
   auto* scanStorage = reinterpret_cast<typename BlockScan::TempStorage*>(smem);
   constexpr int kPerThread = 8;
   int numMatch[kPerThread];
@@ -411,8 +409,7 @@ __device__ void decodeMainlyConstant(GpuDecode& plan) {
 template <int kBlockSize, typename T, typename U>
 __device__ T sum(const U* values, int size) {
   using Reduce = cub::BlockReduce<T, kBlockSize>;
-  extern __shared__ __align__(
-      alignof(typename Reduce::TempStorage)) char smem[];
+  extern __shared__ char smem[];
   auto* reduceStorage = reinterpret_cast<typename Reduce::TempStorage*>(smem);
   T total = 0;
   for (int i = 0; i < size; i += kBlockSize) {
@@ -452,8 +449,7 @@ __device__ int upperBound(const T* data, int size, T target) {
 template <int kBlockSize, typename T>
 __device__ void decodeRle(GpuDecode::Rle& op) {
   using BlockScan = cub::BlockScan<int32_t, kBlockSize>;
-  extern __shared__ __align__(
-      alignof(typename BlockScan::TempStorage)) char smem[];
+  extern __shared__ char smem[];
   auto* scanStorage = reinterpret_cast<typename BlockScan::TempStorage*>(smem);
 
   static_assert(sizeof(*scanStorage) >= sizeof(int32_t) * kBlockSize);
diff --git a/velox/experimental/wave/dwio/decode/tests/CMakeLists.txt b/velox/experimental/wave/dwio/decode/tests/CMakeLists.txt
index aac1f7e4e0ec..566d22c4ce7e 100644
--- a/velox/experimental/wave/dwio/decode/tests/CMakeLists.txt
+++ b/velox/experimental/wave/dwio/decode/tests/CMakeLists.txt
@@ -14,10 +14,7 @@
 
 add_executable(velox_wave_decode_test GpuDecoderTest.cu)
 
-set_target_properties(velox_wave_decode_test PROPERTIES CUDA_ARCHITECTURES
-                                                        native)
-
-add_test(velox_wave_common_test velox_wave_common_test)
+add_test(velox_wave_decode_test velox_wave_decode_test)
 
 target_link_libraries(
   velox_wave_decode_test
diff --git a/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu b/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu
index 4f391df7a10f..46e11cbc2a25 100644
--- a/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu
+++ b/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu
@@ -131,6 +131,10 @@ void makeBitpackDict(
 class GpuDecoderTest : public ::testing::Test {
  protected:
   void SetUp() override {
+    if (int device; cudaGetDevice(&device) != cudaSuccess) {
+      GTEST_SKIP() << "No CUDA detected, skipping all tests";
+    }
+
     CUDA_CHECK_FATAL(cudaEventCreate(&startEvent_));
     CUDA_CHECK_FATAL(cudaEventCreate(&stopEvent_));
   }
@@ -614,6 +618,11 @@ int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   folly::Init init{&argc, &argv};
 
+  if (int device; cudaGetDevice(&device) != cudaSuccess) {
+    std::cerr << "No CUDA detected, skipping all tests" << std::endl;
+    return 0;
+  }
+
   cudaDeviceProp prop;
   CUDA_CHECK_FATAL(cudaGetDeviceProperties(&prop, FLAGS_device_id));
   printf("Running on device: %s\n", prop.name);
diff --git a/velox/experimental/wave/exec/CMakeLists.txt b/velox/experimental/wave/exec/CMakeLists.txt
index 815a80ac3dce..c31e2946b7a4 100644
--- a/velox/experimental/wave/exec/CMakeLists.txt
+++ b/velox/experimental/wave/exec/CMakeLists.txt
@@ -14,6 +14,8 @@
 
 add_library(velox_wave_stream OperandSet.cpp Wave.cpp)
 
+target_link_libraries(velox_wave_stream Folly::folly fmt::fmt xsimd)
+
 add_library(
   velox_wave_exec
   Aggregation.cpp
@@ -29,8 +31,6 @@ add_library(
   WaveHiveDataSource.cpp
   WaveSplitReader.cpp)
 
-set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES native)
-
 target_link_libraries(
   velox_wave_exec
   velox_wave_vector
diff --git a/velox/experimental/wave/exec/tests/CMakeLists.txt b/velox/experimental/wave/exec/tests/CMakeLists.txt
index f06c3b41e8d4..c6e8f81145e8 100644
--- a/velox/experimental/wave/exec/tests/CMakeLists.txt
+++ b/velox/experimental/wave/exec/tests/CMakeLists.txt
@@ -17,8 +17,6 @@ add_subdirectory(utils)
 add_executable(velox_wave_exec_test FilterProjectTest.cpp TableScanTest.cpp
                                     Main.cpp)
 
-set_target_properties(velox_wave_exec_test PROPERTIES CUDA_ARCHITECTURES native)
-
 add_test(velox_wave_exec_test velox_wave_exec_test)
 
 target_link_libraries(
diff --git a/velox/experimental/wave/exec/tests/TableScanTest.cpp b/velox/experimental/wave/exec/tests/TableScanTest.cpp
index cd1d9d081937..0e6bbebdbd41 100644
--- a/velox/experimental/wave/exec/tests/TableScanTest.cpp
+++ b/velox/experimental/wave/exec/tests/TableScanTest.cpp
@@ -34,6 +34,9 @@ using namespace facebook::velox::exec::test;
 class TableScanTest : public virtual HiveConnectorTestBase {
  protected:
   void SetUp() override {
+    if (int device; cudaGetDevice(&device) != cudaSuccess) {
+      GTEST_SKIP() << "No CUDA detected, skipping all tests";
+    }
     HiveConnectorTestBase::SetUp();
     wave::registerWave();
     wave::WaveHiveDataSource::registerConnector();
diff --git a/velox/experimental/wave/vector/tests/CMakeLists.txt b/velox/experimental/wave/vector/tests/CMakeLists.txt
index 2d4201c4c4cc..5423006f52c8 100644
--- a/velox/experimental/wave/vector/tests/CMakeLists.txt
+++ b/velox/experimental/wave/vector/tests/CMakeLists.txt
@@ -14,9 +14,6 @@
 
 add_executable(velox_wave_vector_test VectorTest.cpp)
 
-set_target_properties(velox_wave_vector_test PROPERTIES CUDA_ARCHITECTURES
-                                                        native)
-
 add_test(veloxwave__vector_test velox_wave_vector_test)
 
 target_link_libraries(