From 2f75e31fbd963d22e49bcaab8521eeea175dffe1 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 17 Jan 2024 15:09:15 +0200
Subject: [PATCH 1/4] Attempt to work around broken complex number operators in
 HIP

---
 include/dlaf/util_cuda.h | 238 ++++++++++++++++++++++++++++++++++++---
 src/lapack/gpu/add.cu    |   4 +-
 src/lapack/gpu/lacpy.cu  |   4 +-
 src/lapack/gpu/laset.cu  |   4 +-
 4 files changed, 226 insertions(+), 24 deletions(-)
diff --git a/include/dlaf/util_cuda.h b/include/dlaf/util_cuda.h
index 84eda86d59..ef3a185b15 100644
--- a/include/dlaf/util_cuda.h
+++ b/include/dlaf/util_cuda.h
@@ -39,6 +39,19 @@
 #include <dlaf/gpu/blas/api.h>
 
 namespace dlaf::util {
+
+#if defined(DLAF_WITH_HIP)
+struct hipComplexWrapper {
+    hipComplex v{};
+    __host__ __device__ hipComplexWrapper(const hipComplex& v) : v(v) {}
+};
+
+struct hipDoubleComplexWrapper {
+    hipDoubleComplex v{};
+    __host__ __device__ hipDoubleComplexWrapper(const hipDoubleComplex& v) : v(v) {}
+};
+#endif
+
 namespace internal {
 
 template <typename T>
@@ -68,6 +81,64 @@ struct CppToCudaType<T*> {
 
 template <typename T>
 using cppToCudaType_t = typename CppToCudaType<T>::type;
+
+#if defined(DLAF_WITH_CUDA)
+template <typename T>
+struct CppToCudaWrapperType {
+  using type = T;
+};
+
+template <>
+struct CppToCudaWrapperType<std::complex<float>> {
+  using type = cuComplex;
+};
+
+template <>
+struct CppToCudaWrapperType<std::complex<double>> {
+  using type = cuDoubleComplex;
+};
+
+template <typename T>
+struct CppToCudaWrapperType<const T> {
+  using type = const typename CppToCudaWrapperType<T>::type;
+};
+
+template <typename T>
+struct CppToCudaWrapperType<T*> {
+  using type = typename CppToCudaWrapperType<T>::type*;
+};
+
+template <typename T>
+using cppToCudaWrapperType_t = typename CppToCudaWrapperType<T>::type;
+#elif defined(DLAF_WITH_HIP)
+template <typename T>
+struct CppToCudaWrapperType {
+  using type = T;
+};
+
+template <>
+struct CppToCudaWrapperType<std::complex<float>> {
+  using type = hipComplexWrapper;
+};
+
+template <>
+struct CppToCudaWrapperType<std::complex<double>> {
+  using type = hipDoubleComplexWrapper;
+};
+
+template <typename T>
+struct CppToCudaWrapperType<const T> {
+  using type = const typename CppToCudaWrapperType<T>::type;
+};
+
+template <typename T>
+struct CppToCudaWrapperType<T*> {
+  using type = typename CppToCudaWrapperType<T>::type*;
+};
+
+template <typename T>
+using cppToCudaWrapperType_t = typename CppToCudaWrapperType<T>::type;
+#endif
 }
 
 template <typename T>
@@ -80,6 +151,16 @@ constexpr typename internal::CppToCudaType<T>::type cppToCudaCast(T v) {
   return *cppToCudaCast(&v);
 }
 
+template <typename T>
+constexpr typename internal::CppToCudaWrapperType<T*>::type cppToCudaWrapperCast(T* p) {
+  return reinterpret_cast<typename internal::cppToCudaWrapperType_t<T*>>(p);
+}
+
+template <typename T>
+constexpr typename internal::CppToCudaWrapperType<T>::type cppToCudaWrapperCast(T v) {
+  return *cppToCudaWrapperCast(&v);
+}
+
 namespace cuda_operators {
 
 // operators for Cuda types:
@@ -115,6 +196,7 @@ __host__ __device__ inline float fma(const float& a, const float& b, const float
 }
 
 // Complex
+#ifdef DLAF_WITH_CUDA
 __host__ __device__ inline cuComplex conj(const cuComplex& a) noexcept {
   return cuConjf(a);
 }
@@ -127,7 +209,11 @@ __host__ __device__ inline float imag(const cuComplex& a) noexcept {
   return a.y;
 }
 
-#ifdef DLAF_WITH_CUDA
+__host__ __device__ inline cuComplex fma(const cuComplex& a, const cuComplex& b,
+                                         const cuComplex& c) noexcept {
+  return cuCfmaf(a, b, c);
+}
+
 __host__ __device__ inline cuComplex operator-(const cuComplex& a) noexcept {
   return make_cuComplex(-a.x, -a.y);
 }
@@ -137,7 +223,7 @@ __host__ __device__ inline bool operator==(const cuComplex& a, const cuComplex&
 }
 
 __host__ __device__ inline bool operator!=(const cuComplex& a, const cuComplex& b) noexcept {
-  return !operator==(a, b);
+  return !(operator==)(a, b);
 }
 
 __host__ __device__ inline cuComplex operator+(const cuComplex& a, const cuComplex& b) noexcept {
@@ -155,24 +241,76 @@ __host__ __device__ inline cuComplex operator*(const cuComplex& a, const cuCompl
 __host__ __device__ inline cuComplex operator/(const cuComplex& a, const cuComplex& b) noexcept {
   return cuCdivf(a, b);
 }
-#endif
-
-__host__ __device__ inline cuComplex fma(const cuComplex& a, const cuComplex& b,
-                                         const cuComplex& c) noexcept {
-  return cuCfmaf(a, b, c);
-}
 
 __host__ __device__ inline cuComplex operator*(const float& a, const cuComplex& b) noexcept {
   return make_cuComplex(a * b.x, a * b.y);
 }
 
 __host__ __device__ inline cuComplex operator*(const cuComplex& a, const float& b) noexcept {
-  return operator*(b, a);
+  return (operator*)(b, a);
 }
 
 __host__ __device__ inline cuComplex operator/(const cuComplex& a, const float& b) noexcept {
   return make_cuComplex(a.x / b, a.y / b);
 }
+#elif defined(DLAF_WITH_HIP)
+__host__ __device__ inline hipComplexWrapper conj(const hipComplexWrapper& a) noexcept {
+  return cuConjf(a.v);
+}
+
+__host__ __device__ inline float real(const hipComplexWrapper& a) noexcept {
+  return a.v.x;
+}
+
+__host__ __device__ inline float imag(const hipComplexWrapper& a) noexcept {
+  return a.v.y;
+}
+
+__host__ __device__ inline hipComplexWrapper fma(const hipComplexWrapper& a, const hipComplexWrapper& b,
+                                         const hipComplexWrapper& c) noexcept {
+  return {cuCfmaf(a.v, b.v, c.v)};
+}
+
+__host__ __device__ inline hipComplexWrapper operator-(const hipComplexWrapper& a) noexcept {
+  return {make_cuComplex(-a.v.x, -a.v.y)};
+}
+
+__host__ __device__ inline bool operator==(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept {
+  return a.v.x == b.v.x && a.v.y == b.v.y;
+}
+
+__host__ __device__ inline bool operator!=(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept {
+  return !(operator==)(a, b);
+}
+
+__host__ __device__ inline hipComplexWrapper operator+(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept {
+  return {cuCaddf(a.v, b.v)};
+}
+
+__host__ __device__ inline hipComplexWrapper operator-(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept {
+  return {cuCsubf(a.v, b.v)};
+}
+
+__host__ __device__ inline hipComplexWrapper operator*(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept {
+  return {cuCmulf(a.v, b.v)};
+}
+
+__host__ __device__ inline hipComplexWrapper operator/(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept {
+  return {cuCdivf(a.v, b.v)};
+}
+
+__host__ __device__ inline hipComplexWrapper operator*(const float& a, const hipComplexWrapper& b) noexcept {
+  return {make_cuComplex(a * b.v.x, a * b.v.y)};
+}
+
+__host__ __device__ inline hipComplexWrapper operator*(const hipComplexWrapper& a, const float& b) noexcept {
+  return (operator*)(b, a);
+}
+
+__host__ __device__ inline hipComplexWrapper operator/(const hipComplexWrapper& a, const float& b) noexcept {
+  return {make_cuComplex(a.v.x / b, a.v.y / b)};
+}
+#endif
 
 // Double
 __host__ __device__ inline double conj(const double& a) noexcept {
@@ -192,6 +330,7 @@ __host__ __device__ inline double fma(const double& a, const double& b, const do
 }
 
 // Double complex
+#ifdef DLAF_WITH_CUDA
 __host__ __device__ inline cuDoubleComplex conj(const cuDoubleComplex& a) noexcept {
   return cuConj(a);
 }
@@ -204,7 +343,11 @@ __host__ __device__ inline double imag(const cuDoubleComplex& a) noexcept {
   return a.y;
 }
 
-#ifdef DLAF_WITH_CUDA
+__host__ __device__ inline cuDoubleComplex fma(const cuDoubleComplex& a, const cuDoubleComplex& b,
+                                               const cuDoubleComplex& c) noexcept {
+  return cuCfma(a, b, c);
+}
+
 __host__ __device__ inline cuDoubleComplex operator-(const cuDoubleComplex& a) noexcept {
   return make_cuDoubleComplex(-a.x, -a.y);
 }
@@ -214,7 +357,7 @@ __host__ __device__ inline bool operator==(const cuDoubleComplex& a, const cuDou
 }
 
 __host__ __device__ inline bool operator!=(const cuDoubleComplex& a, const cuDoubleComplex& b) noexcept {
-  return !operator==(a, b);
+  return !(operator==)(a, b);
 }
 
 __host__ __device__ inline cuDoubleComplex operator+(const cuDoubleComplex& a,
@@ -236,12 +379,6 @@ __host__ __device__ inline cuDoubleComplex operator/(const cuDoubleComplex& a,
                                                      const cuDoubleComplex& b) noexcept {
   return cuCdiv(a, b);
 }
-#endif
-
-__host__ __device__ inline cuDoubleComplex fma(const cuDoubleComplex& a, const cuDoubleComplex& b,
-                                               const cuDoubleComplex& c) noexcept {
-  return cuCfma(a, b, c);
-}
 
 __host__ __device__ inline cuDoubleComplex operator*(const double& a,
                                                      const cuDoubleComplex& b) noexcept {
@@ -250,13 +387,78 @@ __host__ __device__ inline cuDoubleComplex operator*(const double& a,
 
 __host__ __device__ inline cuDoubleComplex operator*(const cuDoubleComplex& a,
                                                      const double& b) noexcept {
-  return operator*(b, a);
+  return (operator*)(b, a);
 }
 
 __host__ __device__ inline cuDoubleComplex operator/(const cuDoubleComplex& a,
                                                      const double& b) noexcept {
   return make_cuDoubleComplex(a.x / b, a.y / b);
 }
+#elif defined(DLAF_WITH_HIP)
+__host__ __device__ inline hipDoubleComplexWrapper conj(const hipDoubleComplexWrapper& a) noexcept {
+  return {cuConj(a.v)};
+}
+
+__host__ __device__ inline double real(const hipDoubleComplexWrapper& a) noexcept {
+  return a.v.x;
+}
+
+__host__ __device__ inline double imag(const hipDoubleComplexWrapper& a) noexcept {
+  return a.v.y;
+}
+
+__host__ __device__ inline hipDoubleComplexWrapper fma(const hipDoubleComplexWrapper& a, const hipDoubleComplexWrapper& b,
+                                               const hipDoubleComplexWrapper& c) noexcept {
+  return {cuCfma(a.v, b.v, c.v)};
+}
+
+__host__ __device__ inline hipDoubleComplexWrapper operator-(const hipDoubleComplexWrapper& a) noexcept {
+  return {make_cuDoubleComplex(-a.v.x, -a.v.y)};
+}
+
+__host__ __device__ inline bool operator==(const hipDoubleComplexWrapper& a, const hipDoubleComplexWrapper& b) noexcept {
+  return a.v.x == b.v.x && a.v.y == b.v.y;
+}
+
+__host__ __device__ inline bool operator!=(const hipDoubleComplexWrapper& a, const hipDoubleComplexWrapper& b) noexcept {
+  return !(operator==)(a, b);
+}
+
+__host__ __device__ inline hipDoubleComplexWrapper operator+(const hipDoubleComplexWrapper& a,
+                                                     const hipDoubleComplexWrapper& b) noexcept {
+  return {cuCadd(a.v, b.v)};
+}
+
+__host__ __device__ inline hipDoubleComplexWrapper operator-(const hipDoubleComplexWrapper& a,
+                                                     const hipDoubleComplexWrapper& b) noexcept {
+  return {cuCsub(a.v, b.v)};
+}
+
+__host__ __device__ inline hipDoubleComplexWrapper operator*(const hipDoubleComplexWrapper& a,
+                                                     const hipDoubleComplexWrapper& b) noexcept {
+  return {cuCmul(a.v, b.v)};
+}
+
+__host__ __device__ inline hipDoubleComplexWrapper operator/(const hipDoubleComplexWrapper& a,
+                                                     const hipDoubleComplexWrapper& b) noexcept {
+  return {cuCdiv(a.v, b.v)};
+}
+
+__host__ __device__ inline hipDoubleComplexWrapper operator*(const double& a,
+                                                     const hipDoubleComplexWrapper& b) noexcept {
+  return {make_cuDoubleComplex(a * b.v.x, a * b.v.y)};
+}
+
+__host__ __device__ inline hipDoubleComplexWrapper operator*(const hipDoubleComplexWrapper& a,
+                                                     const double& b) noexcept {
+  return (operator*)(b, a);
+}
+
+__host__ __device__ inline hipDoubleComplexWrapper operator/(const hipDoubleComplexWrapper& a,
+                                                     const double& b) noexcept {
+  return {make_cuDoubleComplex(a.v.x / b, a.v.y / b)};
+}
+#endif
 
 }
 }
diff --git a/src/lapack/gpu/add.cu b/src/lapack/gpu/add.cu
index 9e9f3e97e7..c9e35c14cb 100644
--- a/src/lapack/gpu/add.cu
+++ b/src/lapack/gpu/add.cu
@@ -159,8 +159,8 @@ void add(const blas::Uplo uplo, const SizeType m, const SizeType n, const T& alp
   const dim3 nr_blocks(util::ceilDiv(um, kernel_tile_size_rows),
                        util::ceilDiv(un, kernel_tile_size_cols));
   kernels::add<<<nr_blocks, nr_threads, 0, stream>>>(util::blasToCublas(uplo), um, un,
-                                                     util::cppToCudaCast(alpha), util::cppToCudaCast(a),
-                                                     to_uint(lda), util::cppToCudaCast(b), to_uint(ldb));
+                                                     util::cppToCudaWrapperCast(alpha), util::cppToCudaWrapperCast(a),
+                                                     to_uint(lda), util::cppToCudaWrapperCast(b), to_uint(ldb));
 }
 
 DLAF_CUBLAS_ADD_ETI(, float);
diff --git a/src/lapack/gpu/lacpy.cu b/src/lapack/gpu/lacpy.cu
index 16baa883e2..d360f6eae4 100644
--- a/src/lapack/gpu/lacpy.cu
+++ b/src/lapack/gpu/lacpy.cu
@@ -130,8 +130,8 @@ void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a
     const dim3 nr_blocks(util::ceilDiv(um, kernel_tile_size_rows),
                          util::ceilDiv(un, kernel_tile_size_cols));
     kernels::lacpy<<<nr_blocks, nr_threads, 0, stream>>>(util::blasToCublas(uplo), um, un,
-                                                         util::cppToCudaCast(a), to_uint(lda),
-                                                         util::cppToCudaCast(b), to_uint(ldb));
+                                                         util::cppToCudaWrapperCast(a), to_uint(lda),
+                                                         util::cppToCudaWrapperCast(b), to_uint(ldb));
   }
 }
 
diff --git a/src/lapack/gpu/laset.cu b/src/lapack/gpu/laset.cu
index dea96572d9..cb186bbb51 100644
--- a/src/lapack/gpu/laset.cu
+++ b/src/lapack/gpu/laset.cu
@@ -112,8 +112,8 @@ void laset(blas::Uplo uplo, SizeType m, SizeType n, T alpha, T beta, T* a, SizeT
   dim3 nr_threads(kernel_tile_size_rows, 1);
   dim3 nr_blocks(util::ceilDiv(um, kernel_tile_size_rows), util::ceilDiv(un, kernel_tile_size_cols));
   kernels::laset<<<nr_blocks, nr_threads, 0, stream>>>(util::blasToCublas(uplo), um, un,
-                                                       util::cppToCudaCast(alpha),
-                                                       util::cppToCudaCast(beta), util::cppToCudaCast(a),
+                                                       util::cppToCudaWrapperCast(alpha),
+                                                       util::cppToCudaWrapperCast(beta), util::cppToCudaWrapperCast(a),
                                                        to_uint(lda));
 }
 

From 899a64effe7d6472c9584818b392bcb361edf45d Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Wed, 17 Jan 2024 15:10:21 +0200
Subject: [PATCH 2/4] Disable non-GPU CI pipelines

---
 ci/.gitlab-ci.yml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml
index ab4d738588..421d2380c3 100644
--- a/ci/.gitlab-ci.yml
+++ b/ci/.gitlab-ci.yml
@@ -1,13 +1,13 @@
 include:
-  - local: 'ci/cpu/clang12_release.yml'
-  - local: 'ci/cpu/clang13_release_cxx20.yml'
-  - local: 'ci/cpu/clang14_release_stdexec.yml'
-  - local: 'ci/cpu/clang15_release.yml'
-  - local: 'ci/cpu/gcc11_codecov.yml'
-  - local: 'ci/cpu/gcc11_release.yml'
-  - local: 'ci/cpu/gcc11_release_stdexec.yml'
-  - local: 'ci/cpu/gcc11_debug_stdexec.yml'
-  - local: 'ci/cpu/gcc12_release_cxx20.yml'
+  #  - local: 'ci/cpu/clang12_release.yml'
+  #  - local: 'ci/cpu/clang13_release_cxx20.yml'
+  #  - local: 'ci/cpu/clang14_release_stdexec.yml'
+  #  - local: 'ci/cpu/clang15_release.yml'
+  #  - local: 'ci/cpu/gcc11_codecov.yml'
+  #  - local: 'ci/cpu/gcc11_release.yml'
+  #  - local: 'ci/cpu/gcc11_release_stdexec.yml'
+  #  - local: 'ci/cpu/gcc11_debug_stdexec.yml'
+  #  - local: 'ci/cpu/gcc12_release_cxx20.yml'
   - local: 'ci/cuda/gcc11_release.yml'
   - local: 'ci/cuda/gcc11_release_scalapack.yml'
   - local: 'ci/cuda/gcc11_codecov.yml'

From 580b6905af53ab683b1fbeb48bcf494369c311d4 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Thu, 18 Jan 2024 13:26:12 +0100
Subject: [PATCH 3/4] Add ROCM 5.7.1 CI pipeline

---
 ci/.gitlab-ci.yml                  |  1 +
 ci/docker/release-rocm571.yaml     | 70 ++++++++++++++++++++++++++++++
 ci/rocm/clang14_rocm57_release.yml | 22 ++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 ci/docker/release-rocm571.yaml
 create mode 100644 ci/rocm/clang14_rocm57_release.yml

diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml
index 421d2380c3..1cd20b98e8 100644
--- a/ci/.gitlab-ci.yml
+++ b/ci/.gitlab-ci.yml
@@ -14,3 +14,4 @@ include:
   - local: 'ci/cuda/gcc11_debug_scalapack.yml'
   - local: 'ci/rocm/clang14_release.yml'
   - local: 'ci/rocm/clang14_release_stdexec.yml'
+  - local: 'ci/rocm/clang14_rocm57_release.yml'
diff --git a/ci/docker/release-rocm571.yaml b/ci/docker/release-rocm571.yaml
new file mode 100644
index 0000000000..93f9d70c20
--- /dev/null
+++ b/ci/docker/release-rocm571.yaml
@@ -0,0 +1,70 @@
+#
+# Distributed Linear Algebra with Future (DLAF)
+#
+# Copyright (c) 2018-2023, ETH Zurich
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+
+spack:
+  include:
+  - /spack_environment/common.yaml
+
+  view: false
+  concretizer:
+    unify:
+      true
+
+  specs:
+  - dla-future@master +rocm amdgpu_target=gfx90a:xnack- +miniapps +ci-test
+
+  packages:
+    all:
+      variants:
+      - build_type=Release
+      - cxxstd=17
+      - amdgpu_target=gfx90a:xnack-
+    blas:
+      require:: openblas
+    lapack:
+      require:: openblas
+    mpich:
+      require:
+        - '~rocm'
+        - 'device=ch3'
+        - 'netmod=tcp'
+    hip:
+      require:
+        - '@5.7.1'
+#    llvm-amdgpu:
+#      externals:
+#      - spec: llvm-amdgpu@5.7.1 ~rocm-device-libs
+#        prefix: /opt/rocm-5.7.1/llvm
+#      buildable: false
+#    rocm-device-libs:
+#      externals:
+#      - spec: rocm-device-libs@5.7.1
+#        prefix: /opt/rocm-5.7.1
+#      buildable: false
+#    hip:
+#      externals:
+#      - spec: hip@5.7.1
+#        prefix: /opt/rocm-5.7.1
+#      buildable: false
+#    rocblas:
+#      externals:
+#      - spec: rocblas@5.7.1
+#        prefix: /opt/rocm-5.7.1
+#      buildable: false
+#    rocsolver:
+#      externals:
+#      - spec: rocsolver@5.7.1
+#        prefix: /opt/rocm-5.7.1
+#      buildable: false
+#    hsa-rocr-dev:
+#      externals:
+#      - spec: hsa-rocr-dev@5.7.1
+#        prefix: /opt/rocm-5.7.1
+#      buildable: false
diff --git a/ci/rocm/clang14_rocm57_release.yml b/ci/rocm/clang14_rocm57_release.yml
new file mode 100644
index 0000000000..3dd281758f
--- /dev/null
+++ b/ci/rocm/clang14_rocm57_release.yml
@@ -0,0 +1,22 @@
+include:
+  - local: 'ci/common-ci.yml'
+
+rocm clang14 rocm57 release deps:
+  extends: .build_deps_common
+  variables:
+    # BASE_IMAGE: docker.io/rocm/dev-ubuntu-22.04:5.7.1
+    EXTRA_APTGET: "clang-14 libomp-14-dev" # rocblas rocblas-dev rocsolver rocsolver-dev llvm-amdgpu rocm-device-libs"
+    COMPILER: clang@14
+    USE_ROCBLAS: "ON"
+    SPACK_ENVIRONMENT: ci/docker/release-rocm571.yaml
+    BUILD_IMAGE: $CSCS_REGISTRY_PATH/rocm57-clang14-release/build
+
+rocm clang14 rocm57 release build:
+  extends:
+    - .build_common
+  after_script: null
+  needs:
+    - rocm clang14 rocm57 release deps
+  variables:
+    # DEPLOY_BASE_IMAGE: docker.io/rocm/dev-ubuntu-22.04:5.7.1
+    DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/rocm57-clang14-release/deploy:$CI_COMMIT_SHA

From a0011e06be90ba1d5422243cae9f07b2be2ccb67 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Thu, 18 Jan 2024 13:27:34 +0100
Subject: [PATCH 4/4] Disable all but ROCM 5.7.1 pipeline

---
 ci/.gitlab-ci.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml
index 1cd20b98e8..191b10f925 100644
--- a/ci/.gitlab-ci.yml
+++ b/ci/.gitlab-ci.yml
@@ -8,10 +8,10 @@ include:
   #  - local: 'ci/cpu/gcc11_release_stdexec.yml'
   #  - local: 'ci/cpu/gcc11_debug_stdexec.yml'
   #  - local: 'ci/cpu/gcc12_release_cxx20.yml'
-  - local: 'ci/cuda/gcc11_release.yml'
-  - local: 'ci/cuda/gcc11_release_scalapack.yml'
-  - local: 'ci/cuda/gcc11_codecov.yml'
-  - local: 'ci/cuda/gcc11_debug_scalapack.yml'
-  - local: 'ci/rocm/clang14_release.yml'
-  - local: 'ci/rocm/clang14_release_stdexec.yml'
+  # - local: 'ci/cuda/gcc11_release.yml'
+  # - local: 'ci/cuda/gcc11_release_scalapack.yml'
+  # - local: 'ci/cuda/gcc11_codecov.yml'
+  # - local: 'ci/cuda/gcc11_debug_scalapack.yml'
+  # - local: 'ci/rocm/clang14_release.yml'
+  # - local: 'ci/rocm/clang14_release_stdexec.yml'
   - local: 'ci/rocm/clang14_rocm57_release.yml'