diff --git a/.jenkins/cscs/env-clang-cuda.sh b/.jenkins/cscs/env-clang-cuda.sh
index 574551ba91c3..51a4aab1b97b 100644
--- a/.jenkins/cscs/env-clang-cuda.sh
+++ b/.jenkins/cscs/env-clang-cuda.sh
@@ -14,7 +14,6 @@ export HWLOC_ROOT="${APPS_ROOT}/hwloc-2.0.3-gcc-8.3.0"
 module load daint-gpu
 module load cudatoolkit/10.2.89_3.29-7.0.2.1_3.5__g67354b4
 module load Boost/1.75.0-CrayCCE-20.11
-module switch cce cce/10.0.2
 spack load cmake
 spack load ninja
 
diff --git a/libs/full/async_cuda/tests/unit/cuda_future.cpp b/libs/full/async_cuda/tests/unit/cuda_future.cpp
index 31ee39806bda..9e14adfe2a49 100644
--- a/libs/full/async_cuda/tests/unit/cuda_future.cpp
+++ b/libs/full/async_cuda/tests/unit/cuda_future.cpp
@@ -36,7 +36,11 @@
 template <typename T>
 extern void cuda_trivial_kernel(T, cudaStream_t stream);
 
-extern __global__ void saxpy(int n, float a, float* x, float* y);
+// The call to the saxpy device kernel needs to live in the .cu file, as the
+// symbol changes from saxpy to __device_stub__saxpy from Clang 10 to Clang 11
+extern void launch_saxpy_kernel(
+    hpx::cuda::experimental::cuda_executor& cudaexec, unsigned int& blocks,
+    unsigned int& threads, void** args);
 // -------------------------------------------------------------------------
 int test_saxpy(hpx::cuda::experimental::cuda_executor& cudaexec)
 {
@@ -77,13 +81,7 @@ int test_saxpy(hpx::cuda::experimental::cuda_executor& cudaexec)
 
     // now launch a kernel on the stream
     void* args[] = {&N, &ratio, &d_A, &d_B};
-#ifdef HPX_HAVE_HIP
-    hpx::apply(cudaexec, cudaLaunchKernel,
-#else
-    hpx::apply(cudaexec, cudaLaunchKernel<void>,
-#endif
-        reinterpret_cast<const void*>(&saxpy), dim3(blocks), dim3(threads),
-        args, std::size_t(0));
+    launch_saxpy_kernel(cudaexec, blocks, threads, args);
 
     // finally, perform a copy from the gpu back to the cpu all on the same stream
     // grab a future to when this completes
diff --git a/libs/full/async_cuda/tests/unit/saxpy.cu b/libs/full/async_cuda/tests/unit/saxpy.cu
index 6245e8acb3b8..206ec641c674 100644
--- a/libs/full/async_cuda/tests/unit/saxpy.cu
+++ b/libs/full/async_cuda/tests/unit/saxpy.cu
@@ -4,11 +4,30 @@
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#include <hpx/local/future.hpp>
+
+#include <hpx/async_cuda/cuda_executor.hpp>
 #include <hpx/async_cuda/custom_gpu_api.hpp>
 
+#include <cstddef>
+
 __global__ void saxpy(int n, float a, float* x, float* y)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n)
         y[i] = a * x[i] + y[i];
 }
+
+void launch_saxpy_kernel(hpx::cuda::experimental::cuda_executor& cudaexec,
+        unsigned int& blocks, unsigned int& threads, void** args)
+{
+    // Invoking hpx::apply with cudaLaunchKernel<void> directly results in an
+    // error for the NVCC with gcc configuration, so bind it to a local first
+#ifdef HPX_HAVE_HIP
+    auto launch_kernel = cudaLaunchKernel;
+#else
+    auto launch_kernel = cudaLaunchKernel<void>;
+#endif
+    hpx::apply(cudaexec, launch_kernel, reinterpret_cast<const void*>(&saxpy),
+        dim3(blocks), dim3(threads), args, std::size_t(0));
+}