Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make Norm communication asynchronous #1221

Merged
merged 7 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions include/dlaf/auxiliary/norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ namespace dlaf::auxiliary {
/// @pre if uplo == blas::uplo::Upper or uplo == blas::uplo::Lower A should be square, i.e. M == N and MB == NB.
/// @return the max norm of the Matrix @p A or 0 if `A.size().isEmpty()`
template <Backend backend, Device device, class T>
dlaf::BaseType<T> max_norm(comm::CommunicatorGrid& grid, comm::Index2D rank, blas::Uplo uplo,
Matrix<const T, device>& A) {
pika::execution::experimental::unique_any_sender<dlaf::BaseType<T>> max_norm(
comm::CommunicatorGrid& grid, comm::Index2D rank, blas::Uplo uplo, Matrix<const T, device>& A) {
using dlaf::matrix::equal_process_grid;
using dlaf::matrix::single_tile_per_block;

Expand All @@ -43,7 +43,7 @@ dlaf::BaseType<T> max_norm(comm::CommunicatorGrid& grid, comm::Index2D rank, bla

// LAPACK documentation specify that if any dimension is 0, the result is 0
if (A.size().isEmpty())
return {0};
return {pika::execution::experimental::just(0)};

switch (uplo) {
case blas::Uplo::Lower:
Expand Down
10 changes: 6 additions & 4 deletions include/dlaf/auxiliary/norm/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
//
#pragma once

#include <pika/execution.hpp>

#include <dlaf/communication/communicator_grid.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/types.h>
Expand All @@ -20,11 +22,11 @@ struct Norm {};

template <class T>
struct Norm<Backend::MC, Device::CPU, T> {
static dlaf::BaseType<T> max_L(comm::CommunicatorGrid& comm_grid, comm::Index2D rank,
Matrix<const T, Device::CPU>& matrix);
static pika::execution::experimental::unique_any_sender<dlaf::BaseType<T>> max_L(
comm::CommunicatorGrid& comm_grid, comm::Index2D rank, Matrix<const T, Device::CPU>& matrix);

static dlaf::BaseType<T> max_G(comm::CommunicatorGrid& comm_grid, comm::Index2D rank,
Matrix<const T, Device::CPU>& matrix);
static pika::execution::experimental::unique_any_sender<dlaf::BaseType<T>> max_G(
comm::CommunicatorGrid& comm_grid, comm::Index2D rank, Matrix<const T, Device::CPU>& matrix);
};

// ETI
Expand Down
99 changes: 58 additions & 41 deletions include/dlaf/auxiliary/norm/mc.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,44 @@
#include <dlaf/auxiliary/norm/api.h>
#include <dlaf/common/range2d.h>
#include <dlaf/common/vector.h>
#include <dlaf/communication/kernels/reduce.h>
#include <dlaf/communication/sync/reduce.h>
#include <dlaf/lapack/tile.h>
#include <dlaf/matrix/distribution.h>
#include <dlaf/sender/transform_mpi.h>
#include <dlaf/types.h>
#include <dlaf/util_matrix.h>

namespace dlaf::auxiliary::internal {

template <class T>
T max_element(std::vector<T>&& values) {
DLAF_ASSERT(!values.empty(), "");
return *std::max_element(values.begin(), values.end());
}

template <class T>
pika::execution::experimental::unique_any_sender<T> reduce_in_place(
pika::execution::experimental::unique_any_sender<dlaf::comm::CommunicatorPipelineExclusiveWrapper>
albestro marked this conversation as resolved.
Show resolved Hide resolved
pcomm,
comm::IndexT_MPI rank, MPI_Op reduce_op, pika::execution::experimental::unique_any_sender<T> value) {
namespace ex = pika::execution::experimental;

return std::move(value) | ex::let_value([pcomm = std::move(pcomm), rank, reduce_op](T& local) mutable {
using dlaf::comm::internal::transformMPI;
return std::move(pcomm) |
transformMPI([rank, reduce_op, &local](const dlaf::comm::Communicator& comm,
MPI_Request* req) mutable {
const bool is_root_rank = comm.rank() == rank;
void* in = is_root_rank ? MPI_IN_PLACE : &local;
void* out = is_root_rank ? &local : nullptr;
DLAF_MPI_CHECK_ERROR(MPI_Ireduce(in, out, 1, dlaf::comm::mpi_datatype<T>::type,
reduce_op, rank, comm, req));
}) |
ex::then([&local]() -> T { return local; });
});
}

// Compute max norm of the lower triangular part of the distributed matrix
// https://en.wikipedia.org/wiki/Matrix_norm#Max_norm
//
Expand All @@ -32,15 +62,15 @@ namespace dlaf::auxiliary::internal {
// - sy/he lower
// - tr lower non-unit
template <class T>
dlaf::BaseType<T> Norm<Backend::MC, Device::CPU, T>::max_L(comm::CommunicatorGrid& comm_grid,
comm::Index2D rank,
Matrix<const T, Device::CPU>& matrix) {
pika::execution::experimental::unique_any_sender<dlaf::BaseType<T>> Norm<
Backend::MC, Device::CPU, T>::max_L(comm::CommunicatorGrid& comm_grid, comm::Index2D rank,
Matrix<const T, Device::CPU>& matrix) {
using namespace dlaf::matrix;
namespace ex = pika::execution::experimental;
using pika::this_thread::experimental::sync_wait;

using dlaf::common::make_data;
using dlaf::common::internal::vector;
using pika::execution::thread_stacksize;

using dlaf::tile::internal::lange;
using dlaf::tile::internal::lantr;
Expand Down Expand Up @@ -78,32 +108,27 @@ dlaf::BaseType<T> Norm<Backend::MC, Device::CPU, T>::max_L(comm::CommunicatorGri

// then it is necessary to reduce max values from all ranks into a single max value for the matrix

auto max_element = [](std::vector<NormT>&& values) {
DLAF_ASSERT(!values.empty(), "");
return *std::max_element(values.begin(), values.end());
};
NormT local_max_value =
tiles_max.empty() ? NormT{0}
: sync_wait(ex::when_all_vector(std::move(tiles_max)) |
dlaf::internal::transform(dlaf::internal::Policy<Backend::MC>(),
std::move(max_element)));
NormT max_value;
dlaf::comm::sync::reduce(comm_grid.rankFullCommunicator(rank), comm_grid.fullCommunicator(), MPI_MAX,
make_data(&local_max_value, 1), make_data(&max_value, 1));

return max_value;
ex::unique_any_sender<NormT> local_max_value = ex::just(NormT{0});
albestro marked this conversation as resolved.
Show resolved Hide resolved
if (!tiles_max.empty())
local_max_value =
ex::when_all_vector(std::move(tiles_max)) |
dlaf::internal::transform(dlaf::internal::Policy<Backend::MC>(thread_stacksize::nostack),
max_element<NormT>);

return reduce_in_place(comm_grid.full_communicator_pipeline().exclusive(),
comm_grid.rankFullCommunicator(rank), MPI_MAX, std::move(local_max_value));
}

template <class T>
dlaf::BaseType<T> Norm<Backend::MC, Device::CPU, T>::max_G(comm::CommunicatorGrid& comm_grid,
comm::Index2D rank,
Matrix<const T, Device::CPU>& matrix) {
pika::execution::experimental::unique_any_sender<dlaf::BaseType<T>> Norm<
Backend::MC, Device::CPU, T>::max_G(comm::CommunicatorGrid& comm_grid, comm::Index2D rank,
Matrix<const T, Device::CPU>& matrix) {
using namespace dlaf::matrix;
namespace ex = pika::execution::experimental;
using pika::this_thread::experimental::sync_wait;

using dlaf::common::make_data;
using dlaf::common::internal::vector;
using pika::execution::thread_stacksize;

using dlaf::tile::internal::lange;
using dlaf::tile::internal::lantr;
Expand All @@ -116,31 +141,23 @@ dlaf::BaseType<T> Norm<Backend::MC, Device::CPU, T>::max_G(comm::CommunicatorGri
tiles_max.reserve(distribution.localNrTiles().rows() * distribution.localNrTiles().cols());

for (auto tile_wrt_local : iterate_range2d(distribution.localNrTiles())) {
auto norm_max_f = [](const matrix::Tile<const T, Device::CPU>& tile) noexcept -> NormT {
return lange(lapack::Norm::Max, tile);
};
auto current_tile_max =
matrix.read(tile_wrt_local) |
dlaf::internal::transform(dlaf::internal::Policy<Backend::MC>(), std::move(norm_max_f));
dlaf::internal::whenAllLift(lapack::Norm::Max, matrix.read(tile_wrt_local)) |
dlaf::tile::lange(dlaf::internal::Policy<Backend::MC>(thread_stacksize::nostack));

tiles_max.push_back(std::move(current_tile_max));
}

// then it is necessary to reduce max values from all ranks into a single max value for the matrix

auto max_element = [](std::vector<NormT>&& values) {
DLAF_ASSERT(!values.empty(), "");
return *std::max_element(values.begin(), values.end());
};
NormT local_max_value =
tiles_max.empty() ? NormT{0}
: sync_wait(ex::when_all_vector(std::move(tiles_max)) |
dlaf::internal::transform(dlaf::internal::Policy<Backend::MC>(),
std::move(max_element)));
NormT max_value;
dlaf::comm::sync::reduce(comm_grid.rankFullCommunicator(rank), comm_grid.fullCommunicator(), MPI_MAX,
make_data(&local_max_value, 1), make_data(&max_value, 1));

return max_value;
ex::unique_any_sender<NormT> local_max_value = ex::just(NormT{0});
if (!tiles_max.empty())
local_max_value =
ex::when_all_vector(std::move(tiles_max)) |
dlaf::internal::transform(dlaf::internal::Policy<Backend::MC>(thread_stacksize::nostack),
max_element<NormT>);

return reduce_in_place(comm_grid.full_communicator_pipeline().exclusive(),
comm_grid.rankFullCommunicator(rank), MPI_MAX, std::move(local_max_value));
}
}
6 changes: 4 additions & 2 deletions miniapp/miniapp_cholesky.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,8 @@ void check_cholesky(Matrix<T, Device::CPU>& A, Matrix<T, Device::CPU>& L, Commun
const Index2D rank_result{0, 0};

// 1. Compute the max norm of the original matrix in A
const auto norm_A = dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, uplo, A);
const auto norm_A =
sync_wait(dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, uplo, A));

// 2.
// L is a lower triangular, reset values in the upper part (diagonal excluded)
Expand All @@ -421,7 +422,8 @@ void check_cholesky(Matrix<T, Device::CPU>& A, Matrix<T, Device::CPU>& L, Commun
cholesky_diff(A, L, comm_grid);

// 3. Compute the max norm of the difference (it has been compute in-place in A)
const auto norm_diff = dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, uplo, A);
const auto norm_diff =
sync_wait(dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, uplo, A));

// 4.
// Evaluation of correctness is done just by the master rank
Expand Down
7 changes: 4 additions & 3 deletions miniapp/miniapp_eigensolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,8 @@ void checkEigensolver(CommunicatorGrid& comm_grid, blas::Uplo uplo, Matrix<const
const Index2D rank_result{0, 0};

// 1. Compute the max norm of A
const auto norm_A = dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, uplo, A);
const auto norm_A =
sync_wait(dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, uplo, A));

// 2.
// Compute C = E D - A E
Expand All @@ -326,8 +327,8 @@ void checkEigensolver(CommunicatorGrid& comm_grid, blas::Uplo uplo, Matrix<const
E_ref, T{1}, C);

// 3. Compute the max norm of the difference
const auto norm_diff =
dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, blas::Uplo::General, C);
const auto norm_diff = sync_wait(dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result,
blas::Uplo::General, C));

// 4.
// Evaluation of correctness is done just by the master rank
Expand Down
10 changes: 6 additions & 4 deletions miniapp/miniapp_gen_eigensolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -344,8 +344,10 @@ void checkGenEigensolver(CommunicatorGrid& comm_grid, blas::Uplo uplo, Matrix<co
Matrix<const T, Device::CPU>& E, const SizeType eval_idx_end) {
const Index2D rank_result{0, 0};

const auto norm_A = dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, uplo, A);
const auto norm_B = dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, uplo, B);
const auto norm_A =
sync_wait(dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, uplo, A));
const auto norm_B =
sync_wait(dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, uplo, B));

// 2.
// Compute C = E D - A E
Expand All @@ -362,8 +364,8 @@ void checkGenEigensolver(CommunicatorGrid& comm_grid, blas::Uplo uplo, Matrix<co
E_ref, T{1}, C);

// 3. Compute the max norm of the difference
const auto norm_diff =
dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result, blas::Uplo::General, C);
const auto norm_diff = sync_wait(dlaf::auxiliary::max_norm<dlaf::Backend::MC>(comm_grid, rank_result,
blas::Uplo::General, C));

// 4.
// Evaluation of correctness is done just by the master rank
Expand Down
7 changes: 4 additions & 3 deletions test/unit/auxiliary/mc/test_norm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ using namespace dlaf::matrix;
using namespace dlaf::matrix::test;
using namespace dlaf::test;
using namespace testing;
using pika::this_thread::experimental::sync_wait;

template <class T>
using NormT = dlaf::BaseType<T>;
Expand Down Expand Up @@ -60,7 +61,7 @@ TYPED_TEST(NormDistributedTest, MaxNorm_EmptyMatrix) {

for (const auto& uplo : blas_uplos) {
const NormT<TypeParam> norm =
auxiliary::max_norm<Backend::MC>(comm_grid, {0, 0}, uplo, matrix);
sync_wait(auxiliary::max_norm<Backend::MC>(comm_grid, {0, 0}, uplo, matrix));

if (Index2D{0, 0} == comm_grid.rank()) {
EXPECT_NEAR(0, norm, std::numeric_limits<NormT<TypeParam>>::epsilon());
Expand Down Expand Up @@ -99,7 +100,7 @@ void set_and_test(CommunicatorGrid& comm_grid, comm::Index2D rank, Matrix<T, Dev
modify_element(matrix, index, new_value);

ASSERT_EQ(lapack::Norm::Max, norm_type);
const NormT<T> norm = auxiliary::max_norm<Backend::MC>(comm_grid, rank, uplo, matrix);
const NormT<T> norm = sync_wait(auxiliary::max_norm<Backend::MC>(comm_grid, rank, uplo, matrix));

SCOPED_TRACE(::testing::Message() << "norm=" << norm_type << " uplo=" << uplo << " changed element="
<< index << " in matrix size=" << matrix.size()
Expand Down Expand Up @@ -132,7 +133,7 @@ TYPED_TEST(NormDistributedTest, MaxNorm_Correctness) {

const Index2D rank_result{comm_grid.size().rows() - 1, comm_grid.size().cols() - 1};
const NormT<TypeParam> norm =
auxiliary::max_norm<Backend::MC>(comm_grid, rank_result, uplo, matrix);
sync_wait(auxiliary::max_norm<Backend::MC>(comm_grid, rank_result, uplo, matrix));

if (rank_result == comm_grid.rank()) {
EXPECT_GE(norm, 0);
Expand Down
1 change: 0 additions & 1 deletion test/unit/eigensolver/test_gen_eigensolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
//

#include <fstream>
#include <numbers>
#include <optional>
#include <set>
#include <tuple>
Expand Down
Loading