Skip to content

Commit

Permalink
TFactor: Separate larft_gemv kernel and add a faster option (#1219)
Browse files Browse the repository at this point in the history
Co-authored-by: Alberto Invernizzi <invernizzi@cscs.ch>
  • Loading branch information
rasolca and albestro authored Nov 25, 2024
1 parent c2e6501 commit 0be8cc9
Show file tree
Hide file tree
Showing 11 changed files with 568 additions and 140 deletions.
123 changes: 1 addition & 122 deletions include/dlaf/blas/tile.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,130 +28,9 @@
#include <dlaf/util_blas.h>

#ifdef DLAF_WITH_GPU
#include <whip.hpp>

#include <dlaf/gpu/blas/api.h>
#include <dlaf/gpu/blas/error.h>
#include <dlaf/gpu/blas/gpublas.h>
#include <dlaf/util_cublas.h>

#ifdef DLAF_WITH_HIP

#define DLAF_GET_ROCBLAS_WORKSPACE(f) \
[&]() { \
std::size_t workspace_size; \
DLAF_GPUBLAS_CHECK_ERROR( \
rocblas_start_device_memory_size_query(static_cast<rocblas_handle>(handle))); \
DLAF_ROCBLAS_WORKSPACE_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_stop_device_memory_size_query(static_cast<rocblas_handle>(handle), \
&workspace_size)); \
return ::dlaf::memory::MemoryView<std::byte, Device::GPU>(to_int(workspace_size)); \
}();

namespace dlaf::tile::internal {
inline void extendROCBlasWorkspace(cublasHandle_t handle,
::dlaf::memory::MemoryView<std::byte, Device::GPU>&& workspace) {
whip::stream_t stream;
DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream));
auto f = [workspace = std::move(workspace)](whip::error_t status) { whip::check_error(status); };
pika::cuda::experimental::detail::add_event_callback(std::move(f), stream);
}
}

#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
template <> \
struct Name<Type> { \
template <typename... Args> \
static void call(cublasHandle_t handle, Args&&... args) { \
auto workspace = DLAF_GET_ROCBLAS_WORKSPACE(f); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), workspace(), \
to_sizet(workspace.size()))); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), nullptr, 0)); \
::dlaf::tile::internal::extendROCBlasWorkspace(handle, std::move(workspace)); \
} \
}

#elif defined(DLAF_WITH_CUDA)

#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
template <> \
struct Name<Type> { \
template <typename... Args> \
static void call(Args&&... args) { \
DLAF_GPUBLAS_CHECK_ERROR(cublas##f##_v2(std::forward<Args>(args)...)); \
} \
}

#endif

#define DLAF_DECLARE_GPUBLAS_OP(Name) \
template <typename T> \
struct Name

#ifdef DLAF_WITH_HIP
#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, s##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, d##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, c##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, z##f)

#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, ssy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, dsy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, che##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, zhe##f)

#elif defined(DLAF_WITH_CUDA)
#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, S##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, D##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, C##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Z##f)

#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, Ssy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, Dsy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, Che##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Zhe##f)
#endif

namespace dlaf::gpublas::internal {

// Level 1
DLAF_MAKE_GPUBLAS_OP(Axpy, axpy);

// Level 2
DLAF_MAKE_GPUBLAS_OP(Gemv, gemv);

DLAF_MAKE_GPUBLAS_OP(Trmv, trmv);

// Level 3
DLAF_MAKE_GPUBLAS_OP(Gemm, gemm);

DLAF_MAKE_GPUBLAS_SYHE_OP(Hemm, mm);

DLAF_MAKE_GPUBLAS_SYHE_OP(Her2k, r2k);

DLAF_MAKE_GPUBLAS_SYHE_OP(Herk, rk);

#if defined(DLAF_WITH_CUDA)
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
#elif defined(DLAF_WITH_HIP)

#if ROCBLAS_VERSION_MAJOR >= 3 && defined(ROCBLAS_V3)
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
#else
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm_outofplace);
#endif

#endif

DLAF_MAKE_GPUBLAS_OP(Trsm, trsm);
}
#endif

namespace dlaf {
Expand Down
22 changes: 5 additions & 17 deletions include/dlaf/factorization/qr/t_factor_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <dlaf/communication/communicator_pipeline.h>
#include <dlaf/communication/kernels/all_reduce.h>
#include <dlaf/factorization/qr/api.h>
#include <dlaf/lapack/gpu/larft.h>
#include <dlaf/lapack/tile.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/matrix/tile.h>
Expand Down Expand Up @@ -172,28 +173,15 @@ struct Helpers<Backend::GPU, Device::GPU, T> {
auto gemv_func = [](cublasHandle_t handle, const matrix::Tile<const T, Device::GPU>& tile_v,
const matrix::Tile<const T, Device::CPU>& taus,
matrix::Tile<T, Device::GPU>& tile_t) noexcept {
const SizeType m = tile_v.size().rows();
const SizeType k = tile_t.size().cols();
DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k);
DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k);
DLAF_ASSERT(taus.size().cols() == 1, taus.size().cols());

for (SizeType j = 0; j < k; ++j) {
// T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j)
// [j x 1] = [(n-j) x j]* . [(n-j) x 1]
const TileElementIndex va_start{0, 0};
const TileElementIndex vb_start{0, j};
const TileElementSize va_size{tile_v.size().rows(), j};
const TileElementIndex t_start{0, j};
const auto neg_tau = util::blasToCublasCast(-taus({j, 0}));
const auto one = util::blasToCublasCast(T{1});

gpublas::internal::Gemv<T>::call(handle, CUBLAS_OP_C, to_int(va_size.rows()),
to_int(va_size.cols()), &neg_tau,
util::blasToCublasCast(tile_v.ptr(va_start)),
to_int(tile_v.ld()),
util::blasToCublasCast(tile_v.ptr(vb_start)), 1, &one,
util::blasToCublasCast(tile_t.ptr(t_start)), 1);
}
gpulapack::larft_gemv0(handle, m, k, tile_v.ptr(), tile_v.ld(), taus.ptr(), tile_t.ptr(),
tile_t.ld());

return std::move(tile_t);
};

Expand Down
147 changes: 147 additions & 0 deletions include/dlaf/gpu/blas/gpublas.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2024, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//
#pragma once

/// @file
/// Provides gpublas wrappers for BLAS operations.

#ifdef DLAF_WITH_GPU
#include <cstddef>
#include <utility>

#include <whip.hpp>

#include <dlaf/gpu/blas/api.h>
#include <dlaf/gpu/blas/error.h>
#include <dlaf/util_cublas.h>

#ifdef DLAF_WITH_HIP

#include <pika/async_cuda/detail/cuda_event_callback.hpp>

#include <dlaf/memory/memory_view.h>

#define DLAF_GET_ROCBLAS_WORKSPACE(f) \
[&]() { \
std::size_t workspace_size; \
DLAF_GPUBLAS_CHECK_ERROR( \
rocblas_start_device_memory_size_query(static_cast<rocblas_handle>(handle))); \
DLAF_ROCBLAS_WORKSPACE_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_stop_device_memory_size_query(static_cast<rocblas_handle>(handle), \
&workspace_size)); \
return ::dlaf::memory::MemoryView<std::byte, Device::GPU>(to_int(workspace_size)); \
}();

namespace dlaf::tile::internal {
inline void extendROCBlasWorkspace(cublasHandle_t handle,
::dlaf::memory::MemoryView<std::byte, Device::GPU>&& workspace) {
whip::stream_t stream;
DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream));
auto f = [workspace = std::move(workspace)](whip::error_t status) { whip::check_error(status); };
pika::cuda::experimental::detail::add_event_callback(std::move(f), stream);
}
}

#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
template <> \
struct Name<Type> { \
template <typename... Args> \
static void call(cublasHandle_t handle, Args&&... args) { \
auto workspace = DLAF_GET_ROCBLAS_WORKSPACE(f); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), workspace(), \
to_sizet(workspace.size()))); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_##f(handle, std::forward<Args>(args)...)); \
DLAF_GPUBLAS_CHECK_ERROR(rocblas_set_workspace(static_cast<rocblas_handle>(handle), nullptr, 0)); \
::dlaf::tile::internal::extendROCBlasWorkspace(handle, std::move(workspace)); \
} \
}

#elif defined(DLAF_WITH_CUDA)

#define DLAF_DEFINE_GPUBLAS_OP(Name, Type, f) \
template <> \
struct Name<Type> { \
template <typename... Args> \
static void call(Args&&... args) { \
DLAF_GPUBLAS_CHECK_ERROR(cublas##f##_v2(std::forward<Args>(args)...)); \
} \
}

#endif

#define DLAF_DECLARE_GPUBLAS_OP(Name) \
template <typename T> \
struct Name

#ifdef DLAF_WITH_HIP
#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, s##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, d##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, c##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, z##f)

#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, ssy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, dsy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, che##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, zhe##f)

#elif defined(DLAF_WITH_CUDA)
#define DLAF_MAKE_GPUBLAS_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, S##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, D##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, C##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Z##f)

#define DLAF_MAKE_GPUBLAS_SYHE_OP(Name, f) \
DLAF_DECLARE_GPUBLAS_OP(Name); \
DLAF_DEFINE_GPUBLAS_OP(Name, float, Ssy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, double, Dsy##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<float>, Che##f); \
DLAF_DEFINE_GPUBLAS_OP(Name, std::complex<double>, Zhe##f)
#endif

namespace dlaf::gpublas::internal {

// Level 1
DLAF_MAKE_GPUBLAS_OP(Axpy, axpy);

// Level 2
DLAF_MAKE_GPUBLAS_OP(Gemv, gemv);

DLAF_MAKE_GPUBLAS_OP(Trmv, trmv);

// Level 3
DLAF_MAKE_GPUBLAS_OP(Gemm, gemm);

DLAF_MAKE_GPUBLAS_SYHE_OP(Hemm, mm);

DLAF_MAKE_GPUBLAS_SYHE_OP(Her2k, r2k);

DLAF_MAKE_GPUBLAS_SYHE_OP(Herk, rk);

#if defined(DLAF_WITH_CUDA)
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
#elif defined(DLAF_WITH_HIP)

#if ROCBLAS_VERSION_MAJOR >= 3 && defined(ROCBLAS_V3)
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm);
#else
DLAF_MAKE_GPUBLAS_OP(Trmm, trmm_outofplace);
#endif

#endif

DLAF_MAKE_GPUBLAS_OP(Trsm, trsm);
}
#endif
51 changes: 51 additions & 0 deletions include/dlaf/lapack/gpu/larft.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2024, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//

#pragma once

#ifdef DLAF_WITH_GPU

#include <blas.hh>
#include <whip.hpp>

#include <dlaf/gpu/blas/api.h>
#include <dlaf/types.h>

namespace dlaf::gpulapack {

template <class T>
void larft_gemv0(cublasHandle_t handle, const SizeType m, SizeType k, const T* v, const SizeType ldv,
const T* tau, T* t, const SizeType ldt);

template <class T>
void larft_gemv1_notau(cublasHandle_t handle, const SizeType m, const SizeType k, const T* v,
const SizeType ldv, T* t, const SizeType ldt);

template <class T>
void larft_gemv1_fixtau(const SizeType k, const T* tau, const SizeType inctau, T* t, const SizeType ldt,
whip::stream_t stream);

#define DLAF_CUBLAS_LARFT_GEMV_ETI(kword, Type) \
kword template void larft_gemv0(cublasHandle_t handle, const SizeType n, SizeType k, const Type* v, \
const SizeType ldv, const Type* tau, Type* t, const SizeType ldt); \
kword template void larft_gemv1_notau(cublasHandle_t handle, const SizeType m, const SizeType k, \
const Type* v, const SizeType ldv, Type* t, \
const SizeType ldt); \
kword template void larft_gemv1_fixtau(const SizeType k, const Type* tau, const SizeType inctau, \
Type* t, const SizeType ldt, whip::stream_t stream)

DLAF_CUBLAS_LARFT_GEMV_ETI(extern, float);
DLAF_CUBLAS_LARFT_GEMV_ETI(extern, double);
DLAF_CUBLAS_LARFT_GEMV_ETI(extern, std::complex<float>);
DLAF_CUBLAS_LARFT_GEMV_ETI(extern, std::complex<double>);

}

#endif
1 change: 1 addition & 0 deletions miniapp/include/dlaf/miniapp/kernel_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#pragma once

#include <future>
#include <utility>

#ifdef DLAF_WITH_GPU
Expand Down
4 changes: 4 additions & 0 deletions miniapp/kernel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,9 @@

if(DLAF_BUILD_TESTING)
# TODO they depends on DLAF_TEST exclusively for the createTile method.
DLAF_addMiniapp(
miniapp_larft_gemv SOURCES miniapp_larft_gemv.cpp LIBRARIES dlaf.core DLAF_test DLAF_miniapp
)

DLAF_addMiniapp(miniapp_laset SOURCES miniapp_laset.cpp LIBRARIES dlaf.core DLAF_test)
endif()
Loading

0 comments on commit 0be8cc9

Please sign in to comment.