From 2f75e31fbd963d22e49bcaab8521eeea175dffe1 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 17 Jan 2024 15:09:15 +0200 Subject: [PATCH 1/4] Attempt to work around broken complex number operators in HIP --- include/dlaf/util_cuda.h | 238 ++++++++++++++++++++++++++++++++++++--- src/lapack/gpu/add.cu | 4 +- src/lapack/gpu/lacpy.cu | 4 +- src/lapack/gpu/laset.cu | 4 +- 4 files changed, 226 insertions(+), 24 deletions(-) diff --git a/include/dlaf/util_cuda.h b/include/dlaf/util_cuda.h index 84eda86d59..ef3a185b15 100644 --- a/include/dlaf/util_cuda.h +++ b/include/dlaf/util_cuda.h @@ -39,6 +39,19 @@ #include namespace dlaf::util { + +#if defined(DLAF_WITH_HIP) +struct hipComplexWrapper { + hipComplex v{}; + __host__ __device__ hipComplexWrapper(const hipComplex& v) : v(v) {} +}; + +struct hipDoubleComplexWrapper { + hipDoubleComplex v{}; + __host__ __device__ hipDoubleComplexWrapper(const hipDoubleComplex& v) : v(v) {} +}; +#endif + namespace internal { template @@ -68,6 +81,64 @@ struct CppToCudaType { template using cppToCudaType_t = typename CppToCudaType::type; + +#if defined(DLAF_WITH_CUDA) +template +struct CppToCudaWrapperType { + using type = T; +}; + +template <> +struct CppToCudaWrapperType> { + using type = cuComplex; +}; + +template <> +struct CppToCudaWrapperType> { + using type = cuDoubleComplex; +}; + +template +struct CppToCudaWrapperType { + using type = const typename CppToCudaWrapperType::type; +}; + +template +struct CppToCudaWrapperType { + using type = typename CppToCudaWrapperType::type*; +}; + +template +using cppToCudaWrapperType_t = typename CppToCudaWrapperType::type; +#elif defined(DLAF_WITH_HIP) +template +struct CppToCudaWrapperType { + using type = T; +}; + +template <> +struct CppToCudaWrapperType> { + using type = hipComplexWrapper; +}; + +template <> +struct CppToCudaWrapperType> { + using type = hipDoubleComplexWrapper; +}; + +template +struct CppToCudaWrapperType { + using type = const typename CppToCudaWrapperType::type; +}; + +template +struct CppToCudaWrapperType { + using type = typename CppToCudaWrapperType::type*; +}; + +template +using cppToCudaWrapperType_t = typename CppToCudaWrapperType::type; +#endif } template @@ -80,6 +151,16 @@ constexpr typename internal::CppToCudaType::type cppToCudaCast(T v) { return *cppToCudaCast(&v); } +template +constexpr typename internal::CppToCudaWrapperType::type cppToCudaWrapperCast(T* p) { + return reinterpret_cast>(p); +} + +template +constexpr typename internal::CppToCudaWrapperType::type cppToCudaWrapperCast(T v) { + return *cppToCudaWrapperCast(&v); +} + namespace cuda_operators { // operators for Cuda types: @@ -115,6 +196,7 @@ __host__ __device__ inline float fma(const float& a, const float& b, const float } // Complex +#ifdef DLAF_WITH_CUDA __host__ __device__ inline cuComplex conj(const cuComplex& a) noexcept { return cuConjf(a); } @@ -127,7 +209,11 @@ __host__ __device__ inline float imag(const cuComplex& a) noexcept { return a.y; } -#ifdef DLAF_WITH_CUDA +__host__ __device__ inline cuComplex fma(const cuComplex& a, const cuComplex& b, + const cuComplex& c) noexcept { + return cuCfmaf(a, b, c); +} + __host__ __device__ inline cuComplex operator-(const cuComplex& a) noexcept { return make_cuComplex(-a.x, -a.y); } @@ -137,7 +223,7 @@ __host__ __device__ inline bool operator==(const cuComplex& a, const cuComplex& } __host__ __device__ inline bool operator!=(const cuComplex& a, const cuComplex& b) noexcept { - return !operator==(a, b); + return !(operator==)(a, b); } __host__ __device__ inline cuComplex operator+(const cuComplex& a, const cuComplex& b) noexcept { @@ -155,24 +241,76 @@ __host__ __device__ inline cuComplex operator*(const cuComplex& a, const cuCompl __host__ __device__ inline cuComplex operator/(const cuComplex& a, const cuComplex& b) noexcept { return cuCdivf(a, b); } -#endif - -__host__ __device__ inline cuComplex fma(const cuComplex& a, const cuComplex& b, - const cuComplex& c) noexcept { - return cuCfmaf(a, b, c); -} __host__ __device__ inline cuComplex operator*(const float& a, const cuComplex& b) noexcept { return make_cuComplex(a * b.x, a * b.y); } __host__ __device__ inline cuComplex operator*(const cuComplex& a, const float& b) noexcept { - return operator*(b, a); + return (operator*)(b, a); } __host__ __device__ inline cuComplex operator/(const cuComplex& a, const float& b) noexcept { return make_cuComplex(a.x / b, a.y / b); } +#elif defined(DLAF_WITH_HIP) +__host__ __device__ inline hipComplexWrapper conj(const hipComplexWrapper& a) noexcept { + return cuConjf(a.v); +} + +__host__ __device__ inline float real(const hipComplexWrapper& a) noexcept { + return a.v.x; +} + +__host__ __device__ inline float imag(const hipComplexWrapper& a) noexcept { + return a.v.y; +} + +__host__ __device__ inline hipComplexWrapper fma(const hipComplexWrapper& a, const hipComplexWrapper& b, + const hipComplexWrapper& c) noexcept { + return {cuCfmaf(a.v, b.v, c.v)}; +} + +__host__ __device__ inline hipComplexWrapper operator-(const hipComplexWrapper& a) noexcept { + return {make_cuComplex(-a.v.x, -a.v.y)}; +} + +__host__ __device__ inline bool operator==(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept { + return a.v.x == b.v.x && a.v.y == b.v.y; +} + +__host__ __device__ inline bool operator!=(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept { + return !(operator==)(a, b); +} + +__host__ __device__ inline hipComplexWrapper operator+(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept { + return {cuCaddf(a.v, b.v)}; +} + +__host__ __device__ inline hipComplexWrapper operator-(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept { + return {cuCsubf(a.v, b.v)}; +} + +__host__ __device__ inline hipComplexWrapper operator*(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept { + return {cuCmulf(a.v, b.v)}; +} + +__host__ __device__ inline hipComplexWrapper operator/(const hipComplexWrapper& a, const hipComplexWrapper& b) noexcept { + return {cuCdivf(a.v, b.v)}; +} + +__host__ __device__ inline hipComplexWrapper operator*(const float& a, const hipComplexWrapper& b) noexcept { + return {make_cuComplex(a * b.v.x, a * b.v.y)}; +} + +__host__ __device__ inline hipComplexWrapper operator*(const hipComplexWrapper& a, const float& b) noexcept { + return (operator*)(b, a); +} + +__host__ __device__ inline hipComplexWrapper operator/(const hipComplexWrapper& a, const float& b) noexcept { + return {make_cuComplex(a.v.x / b, a.v.y / b)}; +} +#endif // Double __host__ __device__ inline double conj(const double& a) noexcept { @@ -192,6 +330,7 @@ __host__ __device__ inline double fma(const double& a, const double& b, const do } // Double complex +#ifdef DLAF_WITH_CUDA __host__ __device__ inline cuDoubleComplex conj(const cuDoubleComplex& a) noexcept { return cuConj(a); } @@ -204,7 +343,11 @@ __host__ __device__ inline double imag(const cuDoubleComplex& a) noexcept { return a.y; } -#ifdef DLAF_WITH_CUDA +__host__ __device__ inline cuDoubleComplex fma(const cuDoubleComplex& a, const cuDoubleComplex& b, + const cuDoubleComplex& c) noexcept { + return cuCfma(a, b, c); +} + __host__ __device__ inline cuDoubleComplex operator-(const cuDoubleComplex& a) noexcept { return make_cuDoubleComplex(-a.x, -a.y); } @@ -214,7 +357,7 @@ __host__ __device__ inline bool operator==(const cuDoubleComplex& a, const cuDou } __host__ __device__ inline bool operator!=(const cuDoubleComplex& a, const cuDoubleComplex& b) noexcept { - return !operator==(a, b); + return !(operator==)(a, b); } __host__ __device__ inline cuDoubleComplex operator+(const cuDoubleComplex& a, @@ -236,12 +379,6 @@ __host__ __device__ inline cuDoubleComplex operator/(const cuDoubleComplex& a, const cuDoubleComplex& b) noexcept { return cuCdiv(a, b); } -#endif - -__host__ __device__ inline cuDoubleComplex fma(const cuDoubleComplex& a, const cuDoubleComplex& b, - const cuDoubleComplex& c) noexcept { - return cuCfma(a, b, c); -} __host__ __device__ inline cuDoubleComplex operator*(const double& a, const cuDoubleComplex& b) noexcept { @@ -250,13 +387,78 @@ __host__ __device__ inline cuDoubleComplex operator*(const double& a, __host__ __device__ inline cuDoubleComplex operator*(const cuDoubleComplex& a, const double& b) noexcept { - return operator*(b, a); + return (operator*)(b, a); } __host__ __device__ inline cuDoubleComplex operator/(const cuDoubleComplex& a, const double& b) noexcept { return make_cuDoubleComplex(a.x / b, a.y / b); } +#elif defined(DLAF_WITH_HIP) +__host__ __device__ inline hipDoubleComplexWrapper conj(const hipDoubleComplexWrapper& a) noexcept { + return {cuConj(a.v)}; +} + +__host__ __device__ inline double real(const hipDoubleComplexWrapper& a) noexcept { + return a.v.x; +} + +__host__ __device__ inline double imag(const hipDoubleComplexWrapper& a) noexcept { + return a.v.y; +} + +__host__ __device__ inline hipDoubleComplexWrapper fma(const hipDoubleComplexWrapper& a, const hipDoubleComplexWrapper& b, + const hipDoubleComplexWrapper& c) noexcept { + return {cuCfma(a.v, b.v, c.v)}; +} + +__host__ __device__ inline hipDoubleComplexWrapper operator-(const hipDoubleComplexWrapper& a) noexcept { + return {make_cuDoubleComplex(-a.v.x, -a.v.y)}; +} + +__host__ __device__ inline bool operator==(const hipDoubleComplexWrapper& a, const hipDoubleComplexWrapper& b) noexcept { + return a.v.x == b.v.x && a.v.y == b.v.y; +} + +__host__ __device__ inline bool operator!=(const hipDoubleComplexWrapper& a, const hipDoubleComplexWrapper& b) noexcept { + return !(operator==)(a, b); +} + +__host__ __device__ inline hipDoubleComplexWrapper operator+(const hipDoubleComplexWrapper& a, + const hipDoubleComplexWrapper& b) noexcept { + return {cuCadd(a.v, b.v)}; +} + +__host__ __device__ inline hipDoubleComplexWrapper operator-(const hipDoubleComplexWrapper& a, + const hipDoubleComplexWrapper& b) noexcept { + return {cuCsub(a.v, b.v)}; +} + +__host__ __device__ inline hipDoubleComplexWrapper operator*(const hipDoubleComplexWrapper& a, + const hipDoubleComplexWrapper& b) noexcept { + return {cuCmul(a.v, b.v)}; +} + +__host__ __device__ inline hipDoubleComplexWrapper operator/(const hipDoubleComplexWrapper& a, + const hipDoubleComplexWrapper& b) noexcept { + return {cuCdiv(a.v, b.v)}; +} + +__host__ __device__ inline hipDoubleComplexWrapper operator*(const double& a, + const hipDoubleComplexWrapper& b) noexcept { + return {make_cuDoubleComplex(a * b.v.x, a * b.v.y)}; +} + +__host__ __device__ inline hipDoubleComplexWrapper operator*(const hipDoubleComplexWrapper& a, + const double& b) noexcept { + return (operator*)(b, a); +} + +__host__ __device__ inline hipDoubleComplexWrapper operator/(const hipDoubleComplexWrapper& a, + const double& b) noexcept { + return {make_cuDoubleComplex(a.v.x / b, a.v.y / b)}; +} +#endif } } diff --git a/src/lapack/gpu/add.cu b/src/lapack/gpu/add.cu index 9e9f3e97e7..c9e35c14cb 100644 --- a/src/lapack/gpu/add.cu +++ b/src/lapack/gpu/add.cu @@ -159,8 +159,8 @@ void add(const blas::Uplo uplo, const SizeType m, const SizeType n, const T& alp const dim3 nr_blocks(util::ceilDiv(um, kernel_tile_size_rows), util::ceilDiv(un, kernel_tile_size_cols)); kernels::add<<>>(util::blasToCublas(uplo), um, un, - util::cppToCudaCast(alpha), util::cppToCudaCast(a), - to_uint(lda), util::cppToCudaCast(b), to_uint(ldb)); + util::cppToCudaWrapperCast(alpha), util::cppToCudaWrapperCast(a), + to_uint(lda), util::cppToCudaWrapperCast(b), to_uint(ldb)); } DLAF_CUBLAS_ADD_ETI(, float); diff --git a/src/lapack/gpu/lacpy.cu b/src/lapack/gpu/lacpy.cu index 16baa883e2..d360f6eae4 100644 --- a/src/lapack/gpu/lacpy.cu +++ b/src/lapack/gpu/lacpy.cu @@ -130,8 +130,8 @@ void lacpy(const blas::Uplo uplo, const SizeType m, const SizeType n, const T* a const dim3 nr_blocks(util::ceilDiv(um, kernel_tile_size_rows), util::ceilDiv(un, kernel_tile_size_cols)); kernels::lacpy<<>>(util::blasToCublas(uplo), um, un, - util::cppToCudaCast(a), to_uint(lda), - util::cppToCudaCast(b), to_uint(ldb)); + util::cppToCudaWrapperCast(a), to_uint(lda), + util::cppToCudaWrapperCast(b), to_uint(ldb)); } } diff --git a/src/lapack/gpu/laset.cu b/src/lapack/gpu/laset.cu index dea96572d9..cb186bbb51 100644 --- a/src/lapack/gpu/laset.cu +++ b/src/lapack/gpu/laset.cu @@ -112,8 +112,8 @@ void laset(blas::Uplo uplo, SizeType m, SizeType n, T alpha, T beta, T* a, SizeT dim3 nr_threads(kernel_tile_size_rows, 1); dim3 nr_blocks(util::ceilDiv(um, kernel_tile_size_rows), util::ceilDiv(un, kernel_tile_size_cols)); kernels::laset<<>>(util::blasToCublas(uplo), um, un, - util::cppToCudaCast(alpha), - util::cppToCudaCast(beta), util::cppToCudaCast(a), + util::cppToCudaWrapperCast(alpha), + util::cppToCudaWrapperCast(beta), util::cppToCudaWrapperCast(a), to_uint(lda)); } From 899a64effe7d6472c9584818b392bcb361edf45d Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 17 Jan 2024 15:10:21 +0200 Subject: [PATCH 2/4] Disable non-GPU CI pipelines --- ci/.gitlab-ci.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml index ab4d738588..421d2380c3 100644 --- a/ci/.gitlab-ci.yml +++ b/ci/.gitlab-ci.yml @@ -1,13 +1,13 @@ include: - - local: 'ci/cpu/clang12_release.yml' - - local: 'ci/cpu/clang13_release_cxx20.yml' - - local: 'ci/cpu/clang14_release_stdexec.yml' - - local: 'ci/cpu/clang15_release.yml' - - local: 'ci/cpu/gcc11_codecov.yml' - - local: 'ci/cpu/gcc11_release.yml' - - local: 'ci/cpu/gcc11_release_stdexec.yml' - - local: 'ci/cpu/gcc11_debug_stdexec.yml' - - local: 'ci/cpu/gcc12_release_cxx20.yml' + # - local: 'ci/cpu/clang12_release.yml' + # - local: 'ci/cpu/clang13_release_cxx20.yml' + # - local: 'ci/cpu/clang14_release_stdexec.yml' + # - local: 'ci/cpu/clang15_release.yml' + # - local: 'ci/cpu/gcc11_codecov.yml' + # - local: 'ci/cpu/gcc11_release.yml' + # - local: 'ci/cpu/gcc11_release_stdexec.yml' + # - local: 'ci/cpu/gcc11_debug_stdexec.yml' + # - local: 'ci/cpu/gcc12_release_cxx20.yml' - local: 'ci/cuda/gcc11_release.yml' - local: 'ci/cuda/gcc11_release_scalapack.yml' - local: 'ci/cuda/gcc11_codecov.yml' From 580b6905af53ab683b1fbeb48bcf494369c311d4 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 18 Jan 2024 13:26:12 +0100 Subject: [PATCH 3/4] Add ROCM 5.7.1 CI pipeline --- ci/.gitlab-ci.yml | 1 + ci/docker/release-rocm571.yaml | 70 ++++++++++++++++++++++++++++++ ci/rocm/clang14_rocm57_release.yml | 22 ++++++++++ 3 files changed, 93 insertions(+) create mode 100644 ci/docker/release-rocm571.yaml create mode 100644 ci/rocm/clang14_rocm57_release.yml diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml index 421d2380c3..1cd20b98e8 100644 --- a/ci/.gitlab-ci.yml +++ b/ci/.gitlab-ci.yml @@ -14,3 +14,4 @@ include: - local: 'ci/cuda/gcc11_debug_scalapack.yml' - local: 'ci/rocm/clang14_release.yml' - local: 'ci/rocm/clang14_release_stdexec.yml' + - local: 'ci/rocm/clang14_rocm57_release.yml' diff --git a/ci/docker/release-rocm571.yaml b/ci/docker/release-rocm571.yaml new file mode 100644 index 0000000000..93f9d70c20 --- /dev/null +++ b/ci/docker/release-rocm571.yaml @@ -0,0 +1,70 @@ +# +# Distributed Linear Algebra with Future (DLAF) +# +# Copyright (c) 2018-2023, ETH Zurich +# All rights reserved. +# +# Please, refer to the LICENSE file in the root directory. +# SPDX-License-Identifier: BSD-3-Clause +# + +spack: + include: + - /spack_environment/common.yaml + + view: false + concretizer: + unify: + true + + specs: + - dla-future@master +rocm amdgpu_target=gfx90a:xnack- +miniapps +ci-test + + packages: + all: + variants: + - build_type=Release + - cxxstd=17 + - amdgpu_target=gfx90a:xnack- + blas: + require:: openblas + lapack: + require:: openblas + mpich: + require: + - '~rocm' + - 'device=ch3' + - 'netmod=tcp' + hip: + require: + - '@5.7.1' +# llvm-amdgpu: +# externals: +# - spec: llvm-amdgpu@5.7.1 ~rocm-device-libs +# prefix: /opt/rocm-5.7.1/llvm +# buildable: false +# rocm-device-libs: +# externals: +# - spec: rocm-device-libs@5.7.1 +# prefix: /opt/rocm-5.7.1 +# buildable: false +# hip: +# externals: +# - spec: hip@5.7.1 +# prefix: /opt/rocm-5.7.1 +# buildable: false +# rocblas: +# externals: +# - spec: rocblas@5.7.1 +# prefix: /opt/rocm-5.7.1 +# buildable: false +# rocsolver: +# externals: +# - spec: rocsolver@5.7.1 +# prefix: /opt/rocm-5.7.1 +# buildable: false +# hsa-rocr-dev: +# externals: +# - spec: hsa-rocr-dev@5.7.1 +# prefix: /opt/rocm-5.7.1 +# buildable: false diff --git a/ci/rocm/clang14_rocm57_release.yml b/ci/rocm/clang14_rocm57_release.yml new file mode 100644 index 0000000000..3dd281758f --- /dev/null +++ b/ci/rocm/clang14_rocm57_release.yml @@ -0,0 +1,22 @@ +include: + - local: 'ci/common-ci.yml' + +rocm clang14 rocm57 release deps: + extends: .build_deps_common + variables: + # BASE_IMAGE: docker.io/rocm/dev-ubuntu-22.04:5.7.1 + EXTRA_APTGET: "clang-14 libomp-14-dev" # rocblas rocblas-dev rocsolver rocsolver-dev llvm-amdgpu rocm-device-libs" + COMPILER: clang@14 + USE_ROCBLAS: "ON" + SPACK_ENVIRONMENT: ci/docker/release-rocm571.yaml + BUILD_IMAGE: $CSCS_REGISTRY_PATH/rocm57-clang14-release/build + +rocm clang14 rocm57 release build: + extends: + - .build_common + after_script: null + needs: + - rocm clang14 rocm57 release deps + variables: + # DEPLOY_BASE_IMAGE: docker.io/rocm/dev-ubuntu-22.04:5.7.1 + DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/rocm57-clang14-release/deploy:$CI_COMMIT_SHA From a0011e06be90ba1d5422243cae9f07b2be2ccb67 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 18 Jan 2024 13:27:34 +0100 Subject: [PATCH 4/4] Disable all but ROCM 5.7.1 pipeline --- ci/.gitlab-ci.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/.gitlab-ci.yml b/ci/.gitlab-ci.yml index 1cd20b98e8..191b10f925 100644 --- a/ci/.gitlab-ci.yml +++ b/ci/.gitlab-ci.yml @@ -8,10 +8,10 @@ include: # - local: 'ci/cpu/gcc11_release_stdexec.yml' # - local: 'ci/cpu/gcc11_debug_stdexec.yml' # - local: 'ci/cpu/gcc12_release_cxx20.yml' - - local: 'ci/cuda/gcc11_release.yml' - - local: 'ci/cuda/gcc11_release_scalapack.yml' - - local: 'ci/cuda/gcc11_codecov.yml' - - local: 'ci/cuda/gcc11_debug_scalapack.yml' - - local: 'ci/rocm/clang14_release.yml' - - local: 'ci/rocm/clang14_release_stdexec.yml' + # - local: 'ci/cuda/gcc11_release.yml' + # - local: 'ci/cuda/gcc11_release_scalapack.yml' + # - local: 'ci/cuda/gcc11_codecov.yml' + # - local: 'ci/cuda/gcc11_debug_scalapack.yml' + # - local: 'ci/rocm/clang14_release.yml' + # - local: 'ci/rocm/clang14_release_stdexec.yml' - local: 'ci/rocm/clang14_rocm57_release.yml'