Add more missing fences, don't do atomics on registers
- Add a couple more missing fences to Team GMRES, and add all
  the same new fences to TeamVector GMRES
- Don't try to do atomics on local/register values in batched Jacobi
  (those are private to each thread/vector lane). Instead use parallel_reduce
  to sum results over all threads.
brian-kelley committed Jan 24, 2022
1 parent 2063b21 commit cb22be4
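For readers unfamiliar with the pattern in the second bullet: a reduction argument in a nested Kokkos lambda is private to each thread (it lives in a register or on the stack), so atomics on it cannot combine results across threads; a team-level parallel_reduce does that combination. Below is a minimal, self-contained sketch of that pattern, not the committed code; the view name `diag`, the tolerance, and the single-team driver are illustrative only.

```cpp
// Sketch only: counting "too small" entries with a team-level parallel_reduce
// instead of atomics on a thread-private counter. The view name "diag", the
// tolerance 1e-12, and the single-team driver are illustrative, not the
// KokkosBatched code.
#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using team_policy = Kokkos::TeamPolicy<>;
    using member_type = team_policy::member_type;

    const int n = 1000;
    Kokkos::View<double*> diag("diag", n);
    Kokkos::deep_copy(diag, 1.0e-16);  // every entry "too small" for the demo

    int tooSmall = 0;
    Kokkos::parallel_reduce(
        "count_small", team_policy(1, Kokkos::AUTO),
        KOKKOS_LAMBDA(const member_type& member, int& teamCount) {
          int localCount = 0;
          // ltooSmall is private to the calling thread: the nested reduce,
          // not an atomic, is what combines the per-thread partial counts.
          Kokkos::parallel_reduce(
              Kokkos::TeamThreadRange(member, n),
              [&](const int i, int& ltooSmall) {
                const double v   = diag(i);
                const double mag = v < 0 ? -v : v;
                if (mag <= 1.0e-12) ltooSmall++;
              },
              localCount);
          // Contribute once per team to the outer reduction.
          if (member.team_rank() == 0) teamCount += localCount;
        },
        tooSmall);

    std::printf("tooSmall = %d\n", tooSmall);  // expected: 1000
  }
  Kokkos::finalize();
  return 0;
}
```

In the JacobiPrec change below, the same idea appears as the extra `int& ltooSmall` lambda argument (each thread's private partial count) and the trailing `tooSmall` argument that receives the combined team-wide total, replacing the old `Kokkos::atomic_fetch_add(&tooSmall, 1)`.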
Showing 3 changed files with 26 additions and 11 deletions.
src/batched/sparse/KokkosBatched_JacobiPrec.hpp (18 changes: 9 additions & 9 deletions)
@@ -60,7 +60,7 @@ class JacobiPrec {
typename Kokkos::Details::ArithTraits<ScalarType>::mag_type;

private:
-mutable ValuesViewType diag_values;
+ValuesViewType diag_values;
int n_operators;
int n_rows;
int n_colums;
@@ -96,39 +96,39 @@
auto vs0 = diag_values.stride_0();
auto vs1 = diag_values.stride_1();

-Kokkos::parallel_for(
+Kokkos::parallel_reduce(
Kokkos::TeamThreadRange(member, 0, n_operators * n_rows),
-[&](const int &iTemp) {
+[&](const int &iTemp, int& ltooSmall) {
int i, j;
getIndices<int, typename ValuesViewType::array_layout>(
iTemp, n_rows, n_operators, j, i);
if (Kokkos::abs<ScalarType>(diag_values_array[i * vs0 + j * vs1]) <=
epsilon) {
-Kokkos::atomic_fetch_add(&tooSmall, 1);
+ltooSmall++;
diag_values_array[i * vs0 + j * vs1] = one;
} else
diag_values_array[i * vs0 + j * vs1] =
one / diag_values_array[i * vs0 + j * vs1];
-});
+}, tooSmall);
} else if (std::is_same<ArgMode, Mode::TeamVector>::value) {
auto diag_values_array = diag_values.data();
auto vs0 = diag_values.stride_0();
auto vs1 = diag_values.stride_1();

-Kokkos::parallel_for(
+Kokkos::parallel_reduce(
Kokkos::TeamVectorRange(member, 0, n_operators * n_rows),
-[&](const int &iTemp) {
+[&](const int &iTemp, int& ltooSmall) {
int i, j;
getIndices<int, typename ValuesViewType::array_layout>(
iTemp, n_rows, n_operators, j, i);
if (Kokkos::abs<ScalarType>(diag_values_array[i * vs0 + j * vs1]) <=
epsilon) {
-Kokkos::atomic_fetch_add(&tooSmall, 1);
+ltooSmall++;
diag_values_array[i * vs0 + j * vs1] = one;
} else
diag_values_array[i * vs0 + j * vs1] =
one / diag_values_array[i * vs0 + j * vs1];
-});
+}, tooSmall);
}

if (tooSmall > 0)
src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp (13 changes: 13 additions & 0 deletions)
@@ -146,6 +146,8 @@ struct TeamVectorGMRES {
tmp(i) = beta(i) > max_tolerance ? 1. / beta(i) : 0.;
});

+member.team_barrier(); //Finish writing to tmp
+
Kokkos::parallel_for(
Kokkos::TeamVectorRange(member, 0, numMatrices * numRows),
[&](const OrdinalType& iTemp) {
@@ -159,12 +161,14 @@
// int number_not_converged = 0;

for (size_t j = 0; j < maximum_iteration; ++j) {
+member.team_barrier(); //Finish writing to V
// q := A p_j
auto V_j = Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL);

A.template apply<MemberType, ScratchPadVectorViewType,
ScratchPadVectorViewType, Trans::NoTranspose,
Mode::TeamVector>(member, V_j, W);
+member.team_barrier();
P.template apply<MemberType, ScratchPadVectorViewType,
ScratchPadVectorViewType, Trans::NoTranspose,
Mode::TeamVector, 1>(member, W, W);
@@ -181,9 +185,12 @@
Kokkos::TeamVectorRange(member, 0, numMatrices),
[&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); });

+member.team_barrier(); //Finish writing to tmp
+
TeamVectorAxpy<MemberType>::invoke(member, tmp, V_i, W);
}

+member.team_barrier(); //Finish writing to W
TeamVectorDot<MemberType>::invoke(member, W, W, tmp);
member.team_barrier();
Kokkos::parallel_for(
@@ -250,6 +257,8 @@ struct TeamVectorGMRES {
});
}

+member.team_barrier(); //Finish writing to G
+
Kokkos::parallel_for(
Kokkos::TeamVectorRange(member, 0, numMatrices),
[&](const OrdinalType& l) {
@@ -264,11 +273,15 @@
Kokkos::ALL));
});

+member.team_barrier(); //Finish writing to G
+
for (size_t j = 0; j < maximum_iteration; ++j)
TeamVectorAxpy<MemberType>::invoke(
member, Kokkos::subview(G, Kokkos::ALL, j),
Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL), X);

+member.team_barrier(); //Finish writing to X
+
TeamVectorCopy<MemberType>::invoke(member, X, _X);
return status;
}
src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp (6 changes: 4 additions & 2 deletions)
@@ -131,8 +131,7 @@ struct TeamGMRES {
member.team_barrier();

P.template apply<MemberType, ScratchPadVectorViewType,
-ScratchPadVectorViewType, Trans::NoTranspose, Mode::Team,
-1>(member, R, R);
+ScratchPadVectorViewType, Trans::NoTranspose, Mode::Team, 1>(member, R, R);
member.team_barrier();

TeamDot<MemberType>::invoke(member, R, R, beta);
@@ -188,6 +187,7 @@
TeamAxpy<MemberType>::invoke(member, tmp, V_i, W);
}

+member.team_barrier(); //Finish writing to W
TeamDot<MemberType>::invoke(member, W, W, tmp);
member.team_barrier();
Kokkos::parallel_for(
@@ -270,6 +270,8 @@
Kokkos::ALL));
});

+member.team_barrier(); //Finish writing to G
+
for (size_t j = 0; j < maximum_iteration; ++j)
TeamAxpy<MemberType>::invoke(
member, Kokkos::subview(G, Kokkos::ALL, j),
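All of the added `member.team_barrier()` calls in the two GMRES files follow one pattern: a team-parallel loop writes a view (`tmp`, `V`, `W`, `G`, `X`), and the next team-level operation reads it, so every thread has to reach a barrier in between or it may read entries another thread has not written yet. Below is a minimal sketch of that write/barrier/read ordering; the names `tmp` and `write_then_sum` and the plain (non-scratch) view are illustrative, not the GMRES implementation.

```cpp
// Sketch of the fence pattern the commit adds (illustrative names, not the
// GMRES implementation). One team-parallel loop writes tmp, a later team-wide
// operation reads it; member.team_barrier() in between keeps a thread from
// reading entries another thread has not finished writing. The real code uses
// team scratch views, but the ordering requirement is the same.
#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using team_policy = Kokkos::TeamPolicy<>;
    using member_type = team_policy::member_type;

    const int n = 256;
    Kokkos::View<double*> tmp("tmp", n);
    double result = 0.0;

    Kokkos::parallel_reduce(
        "write_then_sum", team_policy(1, Kokkos::AUTO),
        KOKKOS_LAMBDA(const member_type& member, double& update) {
          // Phase 1: each thread writes some entries of tmp.
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n),
                               [&](const int i) { tmp(i) = 2.0 * i; });

          // Finish writing to tmp -- without this barrier, phase 2 may read
          // entries that another thread has not written yet.
          member.team_barrier();

          // Phase 2: every thread reads tmp (here, a team reduction over it).
          double localSum = 0.0;
          Kokkos::parallel_reduce(
              Kokkos::TeamThreadRange(member, n),
              [&](const int i, double& lsum) { lsum += tmp(i); },
              localSum);
          if (member.team_rank() == 0) update += localSum;
        },
        result);

    std::printf("sum = %f\n", result);  // expected: 2 * (0 + ... + 255) = 65280
  }
  Kokkos::finalize();
  return 0;
}
```

The same reasoning explains the barrier placed right before `TeamVectorDot`/`TeamDot` reads `W`, and the ones added before the final Axpy and copy steps read `G` and `X`.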
