From 994891a23207e5ebbbb81ddbb5f02d90343a2606 Mon Sep 17 00:00:00 2001 From: yasahi-hpc <57478230+yasahi-hpc@users.noreply.github.com> Date: Wed, 10 Jul 2024 18:38:55 +0200 Subject: [PATCH] Implement batched serial pttrf (#2256) * Batched serial pttrf implementation * fix: use GEMM to add matrices * fix: initialization order * fformat * fix: temporary variable in a test code * fix: docstring of pttrf * check_positive_definitiveness only if KOKKOSKERNELS_DEBUG_LEVEL > 0 * Improve the test for pttrf * fix: int type * fix: cleanup tests for SerialPttrf * cleanup: remove unused deep_copies * fix: docstrings and comments for pttrf * ConjTranspose with conj and Transpose * quick return in pttrf for size 1 or 0 matrix * Add tests for invalid input * fix: info computation --------- Co-authored-by: Yuuichi Asahi --- .../impl/KokkosBatched_Pttrf_Serial_Impl.hpp | 73 +++ .../KokkosBatched_Pttrf_Serial_Internal.hpp | 211 ++++++++ batched/dense/src/KokkosBatched_Pttrf.hpp | 52 ++ .../dense/unit_test/Test_Batched_Dense.hpp | 3 + .../unit_test/Test_Batched_DenseUtils.hpp | 40 ++ .../unit_test/Test_Batched_SerialPttrf.hpp | 467 ++++++++++++++++++ .../Test_Batched_SerialPttrf_Complex.hpp | 31 ++ .../Test_Batched_SerialPttrf_Real.hpp | 31 ++ blas/impl/KokkosBlas_util.hpp | 1 + 9 files changed, 909 insertions(+) create mode 100644 batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp create mode 100644 batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp create mode 100644 batched/dense/src/KokkosBatched_Pttrf.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialPttrf.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp new file mode 100644 index 0000000000..b0ea39fa3f --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp @@ -0,0 +1,73 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ + +#include +#include "KokkosBatched_Pttrf_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkPttrfInput( + [[maybe_unused]] const DViewType &d, [[maybe_unused]] const EViewType &e) { + static_assert(Kokkos::is_view::value, + "KokkosBatched::pttrf: DViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::pttrf: EViewType is not a Kokkos::View."); + + static_assert(DViewType::rank == 1, + "KokkosBatched::pttrf: DViewType must have rank 1."); + static_assert(EViewType::rank == 1, + "KokkosBatched::pttrf: EViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int nd = d.extent(0); + const int ne = e.extent(0); + + if (ne + 1 != nd) { + Kokkos::printf( + "KokkosBatched::pttrf: Dimensions of d and e do not match: d: %d, e: " + "%d \n" + "e.extent(0) must be equal to d.extent(0) - 1\n", + nd, ne); + return 1; + } +#endif + return 0; +} + +template <> +struct SerialPttrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, + const EViewType &e) { + // Quick return if possible + if (d.extent(0) == 0) return 0; + if (d.extent(0) == 1) return (d(0) < 0 ? 1 : 0); + + auto info = checkPttrfInput(d, e); + if (info) return info; + + return SerialPttrfInternal::invoke( + d.extent(0), d.data(), d.stride(0), e.data(), e.stride(0)); + } +}; +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp new file mode 100644 index 0000000000..5b4d3fb182 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp @@ -0,0 +1,211 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +struct SerialPttrfInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, + ValueType *KOKKOS_RESTRICT d, + const int ds0, + ValueType *KOKKOS_RESTRICT e, + const int es0); + + template + KOKKOS_INLINE_FUNCTION static int invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + Kokkos::complex *KOKKOS_RESTRICT e, const int es0); +}; + +/// +/// Real matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + ValueType *KOKKOS_RESTRICT e, const int es0) { + int info = 0; + + auto update = [&](const int i) { + auto ei_tmp = e[i * es0]; + e[i * es0] = ei_tmp / d[i * ds0]; + d[(i + 1) * ds0] -= e[i * es0] * ei_tmp; + }; + + auto check_positive_definitiveness = [&](const int i) { + return (d[i] <= 0.0) ? (i + 1) : 0; + }; + + // Compute the L*D*L' (or U'*D*U) factorization of A. + const int i4 = (n - 1) % 4; + for (int i = 0; i < i4; i++) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + } // for (int i = 0; i < i4; i++) + + for (int i = i4; i < n - 4; i += 4) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 1); + if (info) { + return info; + } +#endif + + update(i + 1); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 2); + if (info) { + return info; + } +#endif + + update(i + 2); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 3); + if (info) { + return info; + } +#endif + + update(i + 3); + + } // for (int i = i4; i < n-4; 4) + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(n - 1); + if (info) { + return info; + } +#endif + + return 0; +} + +/// +/// Complex matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + Kokkos::complex *KOKKOS_RESTRICT e, const int es0) { + int info = 0; + + auto update = [&](const int i) { + auto eir_tmp = e[i * es0].real(); + auto eii_tmp = e[i * es0].imag(); + auto f_tmp = eir_tmp / d[i * ds0]; + auto g_tmp = eii_tmp / d[i * ds0]; + e[i * es0] = Kokkos::complex(f_tmp, g_tmp); + d[(i + 1) * ds0] = d[(i + 1) * ds0] - f_tmp * eir_tmp - g_tmp * eii_tmp; + }; + + auto check_positive_definitiveness = [&](const int i) { + return (d[i] <= 0.0) ? (i + 1) : 0; + }; + + // Compute the L*D*L' (or U'*D*U) factorization of A. + const int i4 = (n - 1) % 4; + for (int i = 0; i < i4; i++) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + } // for (int i = 0; i < i4; i++) + + for (int i = i4; i < n - 4; i += 4) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 1); + if (info) { + return info; + } +#endif + + update(i + 1); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 2); + if (info) { + return info; + } +#endif + + update(i + 2); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 3); + if (info) { + return info; + } +#endif + + update(i + 3); + + } // for (int i = i4; i < n-4; 4) + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(n - 1); + if (info) { + return info; + } +#endif + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/src/KokkosBatched_Pttrf.hpp b/batched/dense/src/KokkosBatched_Pttrf.hpp new file mode 100644 index 0000000000..4fcc944dc8 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Pttrf.hpp @@ -0,0 +1,52 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_HPP_ +#define KOKKOSBATCHED_PTTRF_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pttrf: +/// Compute the Cholesky factorization L*D*L**T (or L*D*L**H) of a real +/// symmetric (or complex Hermitian) positive definite tridiagonal matrix A_l +/// for all l = 0, ..., N +/// +/// \tparam DViewType: Input type for the a diagonal matrix, needs to be a 1D +/// view +/// \tparam EViewType: Input type for the a upper/lower diagonal matrix, +/// needs to be a 1D view +/// +/// \param d [inout]: n diagonal elements of the diagonal matrix D +/// \param e [inout]: n-1 upper/lower diagonal elements of the diagonal matrix E +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialPttrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, + const EViewType &e); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pttrf_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PTTRF_HPP_ diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index 7b0ee58312..76215b58f8 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -49,6 +49,9 @@ #include "Test_Batched_SerialTrtri_Real.hpp" #include "Test_Batched_SerialTrtri_Complex.hpp" #include "Test_Batched_SerialSVD.hpp" +#include "Test_Batched_SerialPttrf.hpp" +#include "Test_Batched_SerialPttrf_Real.hpp" +#include "Test_Batched_SerialPttrf_Complex.hpp" // Team Kernels #include "Test_Batched_TeamAxpy.hpp" diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index 689ff4f7a5..c1328291fb 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -111,6 +111,46 @@ void create_banded_triangular_matrix(InViewType& in, OutViewType& out, } Kokkos::deep_copy(out, h_out); } + +/// \brief Create a diagonal matrix from an input vector: +/// Copies the input vector into the diagonal of the output matrix specified +/// by the parameter k. k > 0 means that the matrix is upper-diagonal and +/// k < 0 means the lower-diagonal. k = 0 means the diagonal. +/// +/// \tparam InViewType: Input type for the vector, needs to be a 2D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// +/// \param in [in]: Input batched vector, a rank 2 view +/// \param out [out]: Output batched matrix, where the diagonal compnent +/// specified by k is filled with the input vector, a rank 3 view +/// \param k [in]: The diagonal offset to be filled (default is 0). +/// +template +void create_diagonal_matrix(InViewType& in, OutViewType& out, int k = 0) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(1); + + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == in.extent(1) + abs(k)); + + int i1_start = k >= 0 ? 0 : -k; + int i2_start = k >= 0 ? k : 0; + + // Zero clear the output matrix + using ScalarType = typename OutViewType::non_const_value_type; + Kokkos::deep_copy(h_out, ScalarType(0.0)); + + Kokkos::deep_copy(h_in, in); + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + h_out(i0, i1 + i1_start, i1 + i2_start) = h_in(i0, i1); + } + } + + Kokkos::deep_copy(out, h_out); +} + } // namespace KokkosBatched #endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp new file mode 100644 index 0000000000..6ee7818ddc --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp @@ -0,0 +1,467 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pttrf.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pttrf { + +template +struct Functor_BatchedSerialPttrf { + using execution_space = typename DeviceType::execution_space; + DViewType _d; + EViewType _e; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPttrf(const DViewType &d, const EViewType &e) + : _d(d), _e(e) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k, int &info) const { + auto dd = Kokkos::subview(_d, k, Kokkos::ALL()); + auto ee = Kokkos::subview(_e, k, Kokkos::ALL()); + + info += KokkosBatched::SerialPttrf::invoke(dd, ee); + } + + inline int run() { + using value_type = typename DViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _d.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + CViewType _c; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, + const BViewType &b, const ScalarType beta, + const CViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, + _beta, cc); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pttrf test for random matrix +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), + A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), + D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), + L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + RealView2DType d("d", N, BlkSize), // Diagonal components + ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); + View2DType e_upper("e_upper", N, BlkSize - 1), + e_lower("e_lower", N, + BlkSize - 1); // upper and lower diagonal components + + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + RealType realRandStart, realRandEnd; + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, realRandStart, realRandEnd); + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + + // Add BlkSize to ensure positive definiteness + Kokkos::fill_random(d, rand_pool, realRandStart + BlkSize, + realRandEnd + BlkSize); + Kokkos::fill_random(e_upper, rand_pool, randStart, randEnd); + + auto h_e_upper = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), e_upper); + auto h_e_lower = Kokkos::create_mirror_view(e_lower); + + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize - 1; i++) { + // Fill the lower diagonal with conjugate of the upper diagonal + h_e_lower(ib, i) = + Kokkos::ArithTraits::conj(h_e_upper(ib, i)); + } + } + + Kokkos::deep_copy(e_lower, h_e_lower); + Kokkos::deep_copy(ones, RealType(1.0)); + + // Reconstruct Tridiagonal matrix A + // A = D + EL + EU + create_diagonal_matrix(e_lower, EL, -1); + create_diagonal_matrix(e_upper, EU, 1); + create_diagonal_matrix(d, D); + create_diagonal_matrix(ones, I); + + // Matrix matrix addition by Gemm + // D + EU by D * I + EU (result stored in EU) + Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + .run(); + + // Copy EL to A + Kokkos::deep_copy(A, EL); + + // EU + EL by EU * I + A (result stored in A) + Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + .run(); + + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + auto info = Functor_BatchedSerialPttrf(d, e_lower) + .run(); + + Kokkos::fence(); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + EXPECT_EQ(info, 0); +#endif + + // Reconstruct L and D from factorized matrix + create_diagonal_matrix(e_lower, EL, -1); + create_diagonal_matrix(d, D); + + // Copy I to L + Kokkos::deep_copy(L, I); + + // EL + I by EL * I + L (result stored in L) + Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + .run(); + + // Reconstruct A by L*D*L**H + // Gemm to compute L*D -> LD + Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + .run(); + + // FIXME: We should use SerialGemm Trans::ConjTranspose. + // For the moment, we compute the complex conjugate of L and + // then use Trans::Transpose. + // Gemm to compute (L*D)*L**H -> A_reconst + // Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + // A_reconst) + // .run(); + + // Compute the complex conjugate of L + // L -> conj(L) + auto h_L = Kokkos::create_mirror_view(L); + Kokkos::deep_copy(h_L, L); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_L(ib, i, j) = Kokkos::ArithTraits::conj(h_L(ib, i, j)); + } + } + } + Kokkos::deep_copy(L, h_L); + + // Gemm to compute (L*D)*(conj(L))**T -> A_reconst + Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + A_reconst) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check A = L*D*L**H + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +template +/// \brief Implementation details of batched pttrf test for early return +/// BlkSize must be 0 or 1 +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf_quick_return(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + + if (BlkSize > 1) return; + + const int BlkSize_minus_1 = BlkSize > 0 ? BlkSize - 1 : 0; + + RealView2DType d("d", N, BlkSize), + d2("d2", N, BlkSize); // Diagonal components + View2DType e("e", N, + BlkSize_minus_1); // lower diagonal components + + const RealType reference_value = 4.0; + + Kokkos::deep_copy(d, reference_value); + Kokkos::deep_copy(d2, -reference_value); + Kokkos::deep_copy(e, ScalarType(1.0)); + + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + // Early return if BlkSize is 0 or 1 + auto info = Functor_BatchedSerialPttrf(d, e) + .run(); + + // For negative values, info should be 1 for BlkSize = 1 + auto info2 = Functor_BatchedSerialPttrf(d2, e) + .run(); + + Kokkos::fence(); + + int expected_info2 = BlkSize == 0 ? 0 : N; + EXPECT_EQ(info, 0); + EXPECT_EQ(info2, expected_info2); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_d = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d); + auto h_d2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d2); + + // Check if d is unchanged + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_d(ib, i), reference_value, eps); + EXPECT_NEAR_KK(h_d2(ib, i), -reference_value, eps); + } + } +} + +template +/// \brief Implementation details of batched pttrf test +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), + A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), + D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), + L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, + BlkSize), // Diagonal components + ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); + View2DType e(Kokkos::view_alloc("e", Kokkos::WithoutInitializing), N, + BlkSize - 1); // Upper and lower diagonal components (identical) + + Kokkos::deep_copy(d, RealType(4.0)); + Kokkos::deep_copy(e, ScalarType(1.0)); + Kokkos::deep_copy(ones, RealType(1.0)); + + // Reconstruct Tridiaonal matrix A + // A = D + EL + EU + create_diagonal_matrix(e, EL, -1); + create_diagonal_matrix(e, EU, 1); + create_diagonal_matrix(d, D); + create_diagonal_matrix(ones, I); + + // Matrix matrix addition by Gemm + // D + EU by D * I + EU (result stored in EU) + Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + .run(); + + // Copy EL to A + Kokkos::deep_copy(A, EL); + + // EU + EL by EU * I + A (result stored in A) + Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + .run(); + + // Factorize matrix A -> L * D * L**T + // d and e are updated by pttrf + auto info = Functor_BatchedSerialPttrf(d, e) + .run(); + + Kokkos::fence(); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + EXPECT_EQ(info, 0); +#endif + + // Reconstruct L and D from factorized matrix + create_diagonal_matrix(e, EL, -1); + create_diagonal_matrix(d, D); + + // Copy I to L + Kokkos::deep_copy(L, I); + + // EL + I by EL * I + L (result stored in L) + Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + .run(); + + // Reconstruct A by L*D*L**T + // Gemm to compute L*D -> LD + Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + .run(); + + // Gemm to compute (L*D)*L**T -> A_reconst + Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + A_reconst) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check A = L*D*L.T + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +} // namespace Pttrf +} // namespace Test + +template +int test_batched_pttrf() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + for (int i = 0; i < 2; i++) { + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + for (int i = 0; i < 2; i++) { + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 2, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp new file mode 100644 index 0000000000..febccc5cb3 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp @@ -0,0 +1,31 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pttrf_fcomplex) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pttrf_dcomplex) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp new file mode 100644 index 0000000000..8b0fb658fe --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp @@ -0,0 +1,31 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pttrf_float) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pttrf_double) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index ecb72e7c9a..1fc6b7d480 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -85,6 +85,7 @@ struct Algo { using SolveLU = Level3; using QR = Level3; using UTV = Level3; + using Pttrf = Level3; struct Level2 { struct Unblocked {};