From 6995d119e36a994493658ccba91ef0735ff5d655 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 22 Jun 2024 18:00:22 +0200 Subject: [PATCH 01/99] wrap 3D flat loop abstractions --- src/kokkos_abstraction.hpp | 54 ++++++++++++++++++++++++++------- tst/unit/kokkos_abstraction.cpp | 35 +++++++++++++++++++++ 2 files changed, 78 insertions(+), 11 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 2d74ce00932a..c44ea6c39a77 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -21,12 +21,14 @@ #define KOKKOS_ABSTRACTION_HPP_ #include +#include #include #include #include #include +#include "Kokkos_Macros.hpp" #include "basic_types.hpp" #include "config.hpp" #include "parthenon_array_generic.hpp" @@ -35,6 +37,7 @@ #include "utils/multi_pointer.hpp" #include "utils/object_pool.hpp" + namespace parthenon { #ifdef KOKKOS_ENABLE_CUDA_UVM @@ -258,6 +261,37 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac function, std::forward(args)...); } +template class Functor; + +template +class Functor +{ + using F = std::function; + F m_f; +public: + Functor( F function, + int _NjNi, int _Ni, int _kl, int _jl, int _il ) + : m_f(function), NjNi(_NjNi), Ni(_Ni), kl(_kl), jl(_jl), il(_il) {} + KOKKOS_INLINE_FUNCTION + void operator()(const int &idx, Args... 
args) const { + int k = idx / NjNi; + int j = (idx - k * NjNi) / Ni; + int i = idx - k * NjNi - j * Ni; + k += kl; + j += jl; + i += il; + m_f(k, j, i, std::forward(args)...); + } + int NjNi, Ni, kl, jl, il; +}; + +template +auto MakeFunctor(F &function, const int &NjNi, const int &Ni, + const int &kl, const int &jl, const int &il) { + return Functor(function, NjNi, Ni, kl, jl, il); +} + + // 3D loop using Kokkos 1D Range template inline typename std::enable_if::type @@ -270,17 +304,10 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int Ni = iu - il + 1; const int NkNjNi = Nk * Nj * Ni; const int NjNi = Nj * Ni; + kokkos_dispatch( tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - KOKKOS_LAMBDA(const int &idx) { - int k = idx / NjNi; - int j = (idx - k * NjNi) / Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - function(k, j, i); - }, + MakeFunctor(function, NjNi, Ni, kl, jl, il), std::forward(args)...); } @@ -649,8 +676,13 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, template inline void par_dispatch(const std::string &name, Args &&...args) { - par_dispatch(DEFAULT_LOOP_PATTERN, name, DevExecSpace(), - std::forward(args)...); + if constexpr (std::is_same::value) { + par_dispatch(DEFAULT_LOOP_PATTERN, name, DevExecSpace(), + std::forward(args)...); + } else { + par_dispatch(loop_pattern_mdrange_tag, name, DevExecSpace(), + std::forward(args)...); + } } template diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 0adc61c441b0..82a2e174f494 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -500,6 +500,33 @@ bool test_wrapper_reduce_1d(T loop_pattern, DevExecSpace exec_space) { return total == test_tot; } +template +bool test_wrapper_reduce_3d(T loop_pattern, DevExecSpace exec_space) { + constexpr int N = 10; + parthenon::IndexRange r{0, N - 1}; + parthenon::ParArray3D buffer("Testing buffer", N, N, 
N); + // Initialize data + parthenon::par_for( + loop_pattern, "Initialize parallel reduce array", exec_space, 0, N-1, 0, N-1, 0, N-1, + KOKKOS_LAMBDA(const int k, const int j, const int i) { buffer(k,j,i) = i+j+k; }); + int max = 0; + for (int k = 0; k < N; ++k) { + for (int j = 0; j < N; ++j) { + for (int i = 0; i < N; ++i) { + max = std::max(max, i+j+k); + } + } + } + int test_max = 0; + parthenon::par_reduce( + loop_pattern, "Max via par reduce", exec_space, + 0, N-1, 0, N-1, 0, N-1, + KOKKOS_LAMBDA(const int k, const int j, const int i, int &t) { + t = i+j+k; + }, Kokkos::Max(test_max)); + return max == test_max; +} + TEST_CASE("Parallel reduce", "[par_reduce]") { auto default_exec_space = DevExecSpace(); REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag, @@ -508,4 +535,12 @@ TEST_CASE("Parallel reduce", "[par_reduce]") { REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag, default_exec_space) == true); } + REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_flatrange_tag, + default_exec_space) == true); + /* REQUIRE(test_wrapper_reduce_3d(parthenon::LoopPatternMDRange(), */ + /* default_exec_space) == true); */ + /* if constexpr (std::is_same::value) { */ + /* REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_simdfor_tag, */ + /* default_exec_space) == true); */ + /* } */ } From 2e61847f65828eedc35ec3cdb836826c7bed7f0f Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 22 Jun 2024 20:49:14 +0200 Subject: [PATCH 02/99] add 4D loop and test --- src/kokkos_abstraction.hpp | 128 ++++++++++++++++++-------------- tst/unit/kokkos_abstraction.cpp | 59 +++++++++++---- 2 files changed, 116 insertions(+), 71 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index c44ea6c39a77..b4971f2080cd 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -261,36 +261,42 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac function, std::forward(args)...); 
} -template class Functor; - -template -class Functor -{ - using F = std::function; - F m_f; -public: - Functor( F function, - int _NjNi, int _Ni, int _kl, int _jl, int _il ) - : m_f(function), NjNi(_NjNi), Ni(_Ni), kl(_kl), jl(_jl), il(_il) {} - KOKKOS_INLINE_FUNCTION - void operator()(const int &idx, Args... args) const { - int k = idx / NjNi; - int j = (idx - k * NjNi) / Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - m_f(k, j, i, std::forward(args)...); - } - int NjNi, Ni, kl, jl, il; -}; +template class FlatFunctor; template -auto MakeFunctor(F &function, const int &NjNi, const int &Ni, - const int &kl, const int &jl, const int &il) { - return Functor(function, NjNi, Ni, kl, jl, il); +auto MakeFlatFunctor(F &function) { + return FlatFunctor(); } +template +class FlatFunctor +{ + public: + FlatFunctor(){}; + template + inline void operator()(Tag tag, const std::string &name, + DevExecSpace exec_space, const int kl, const int ku, + const int jl, const int ju, const int il, const int iu, + const F &function, Args ...args) const { + const int Nk = ku - kl + 1; + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NkNjNi = Nk * Nj * Ni; + const int NjNi = Nj * Ni; + kokkos_dispatch( + tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), + KOKKOS_LAMBDA(const int &idx, FArgs ...fargs) { + int k = idx / NjNi; + int j = (idx - k * NjNi) / Ni; + int i = idx - k * NjNi - j * Ni; + k += kl; + j += jl; + i += il; + function(k, j, i, std::forward(fargs)...); + }, + std::forward(args)...); + } +}; // 3D loop using Kokkos 1D Range template @@ -299,16 +305,8 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function, Args &&...args) { Tag tag; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - - 
kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - MakeFunctor(function, NjNi, Ni, kl, jl, il), - std::forward(args)...); + const auto func = MakeFlatFunctor(function); + func(tag, name, exec_space, kl, ku, jl, ju, il, iu, function, std::forward(args)...); } // 3D loop using MDRange loops @@ -399,6 +397,40 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(k, j, i); } +template +class FlatFunctor +{ + public: + FlatFunctor(){}; + template + inline void operator()(Tag tag, const std::string &name, + DevExecSpace exec_space, const int nl, const int nu, + const int kl, const int ku, const int jl, const int ju, + const int il, const int iu, const F &function, Args ...args) const { + const int Nn = nu - nl + 1; + const int Nk = ku - kl + 1; + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NnNkNjNi = Nn * Nk * Nj * Ni; + const int NkNjNi = Nk * Nj * Ni; + const int NjNi = Nj * Ni; + kokkos_dispatch( + tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), + KOKKOS_LAMBDA(const int &idx, FArgs ...fargs) { + int n = idx / NkNjNi; + int k = (idx - n * NkNjNi) / NjNi; + int j = (idx - n * NkNjNi - k * NjNi) / Ni; + int i = idx - n * NkNjNi - k * NjNi - j * Ni; + n += nl; + k += kl; + j += jl; + i += il; + function(n, k, j, i, std::forward(fargs)...); + }, + std::forward(args)...); + } +}; + // 4D loop using Kokkos 1D Range template inline typename std::enable_if::type @@ -407,27 +439,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int ju, const int il, const int iu, const Function &function, Args &&...args) { Tag tag; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NnNkNjNi = Nn * Nk * Nj * Ni; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), - KOKKOS_LAMBDA(const int &idx) 
{ - int n = idx / NkNjNi; - int k = (idx - n * NkNjNi) / NjNi; - int j = (idx - n * NkNjNi - k * NjNi) / Ni; - int i = idx - n * NkNjNi - k * NjNi - j * Ni; - n += nl; - k += kl; - j += jl; - i += il; - function(n, k, j, i); - }, - std::forward(args)...); + const auto func = MakeFlatFunctor(function); + func(tag, name, exec_space, nl, nu, kl, ku, jl, ju, il, iu, + function, std::forward(args)...); } // 4D loop using MDRange loops diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 82a2e174f494..539951a29e13 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -503,44 +503,75 @@ bool test_wrapper_reduce_1d(T loop_pattern, DevExecSpace exec_space) { template bool test_wrapper_reduce_3d(T loop_pattern, DevExecSpace exec_space) { constexpr int N = 10; - parthenon::IndexRange r{0, N - 1}; parthenon::ParArray3D buffer("Testing buffer", N, N, N); // Initialize data parthenon::par_for( loop_pattern, "Initialize parallel reduce array", exec_space, 0, N-1, 0, N-1, 0, N-1, KOKKOS_LAMBDA(const int k, const int j, const int i) { buffer(k,j,i) = i+j+k; }); - int max = 0; + int tot = 0; for (int k = 0; k < N; ++k) { for (int j = 0; j < N; ++j) { for (int i = 0; i < N; ++i) { - max = std::max(max, i+j+k); + tot += i+j+k; } } } - int test_max = 0; + int test_tot = 0; parthenon::par_reduce( - loop_pattern, "Max via par reduce", exec_space, + loop_pattern, "Sum via par reduce", exec_space, 0, N-1, 0, N-1, 0, N-1, KOKKOS_LAMBDA(const int k, const int j, const int i, int &t) { - t = i+j+k; - }, Kokkos::Max(test_max)); - return max == test_max; + t += i+j+k; + }, Kokkos::Sum(test_tot)); + return tot == test_tot; +} + +template +bool test_wrapper_reduce_4d(T loop_pattern, DevExecSpace exec_space) { + constexpr int N = 10; + parthenon::ParArray4D buffer("Testing buffer", N, N, N, N); + // Initialize data + parthenon::par_for( + loop_pattern, "Initialize parallel reduce array", exec_space, 0, N-1, 0, N-1, 0, N-1, 0, N-1, + 
KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { buffer(n,k,j,i) = i+j+k+n; }); + int tot = 0; + for (int n = 0; n < N; ++n) { + for (int k = 0; k < N; ++k) { + for (int j = 0; j < N; ++j) { + for (int i = 0; i < N; ++i) { + tot += i+j+k+n; + } + } + } + } + int test_tot = 0; + parthenon::par_reduce( + loop_pattern, "Sum via par reduce", exec_space, + 0, N-1, 0, N-1, 0, N-1, 0, N-1, + KOKKOS_LAMBDA(const int n, const int k, const int j, const int i, int &t) { + t += i+j+k+n; + }, Kokkos::Sum(test_tot)); + return tot == test_tot; } TEST_CASE("Parallel reduce", "[par_reduce]") { auto default_exec_space = DevExecSpace(); + SECTION("1D loops") { REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag, default_exec_space) == true); if constexpr (std::is_same::value) { REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag, default_exec_space) == true); } + } + + SECTION("3D loops") { REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_flatrange_tag, default_exec_space) == true); - /* REQUIRE(test_wrapper_reduce_3d(parthenon::LoopPatternMDRange(), */ - /* default_exec_space) == true); */ - /* if constexpr (std::is_same::value) { */ - /* REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_simdfor_tag, */ - /* default_exec_space) == true); */ - /* } */ + } + + SECTION("4D loops") { + REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_flatrange_tag, + default_exec_space) == true); + } } From 054ca0ef9e8519fa9fec9540f4511a1feca81ab9 Mon Sep 17 00:00:00 2001 From: Cloud User Date: Sat, 22 Jun 2024 20:12:22 +0000 Subject: [PATCH 03/99] add specialization for const int & --- tst/unit/kokkos_abstraction.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 539951a29e13..015f1bcb8242 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -568,10 +568,14 @@ TEST_CASE("Parallel reduce", "[par_reduce]") { SECTION("3D loops") 
{ REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_flatrange_tag, default_exec_space) == true); + REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_mdrange_tag, + default_exec_space) == true); } SECTION("4D loops") { REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_flatrange_tag, default_exec_space) == true); + REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_mdrange_tag, + default_exec_space) == true); } } From 4c83c4cc3a392bfae7fb99551a210c00a1356ef4 Mon Sep 17 00:00:00 2001 From: Cloud User Date: Sat, 22 Jun 2024 20:12:46 +0000 Subject: [PATCH 04/99] added mdrange loops to par_reduce tests --- src/kokkos_abstraction.hpp | 65 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index b4971f2080cd..30b84dc9564f 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -268,6 +268,36 @@ auto MakeFlatFunctor(F &function) { return FlatFunctor(); } +template +class FlatFunctor +{ + public: + FlatFunctor(){}; + template + inline void operator()(Tag tag, const std::string &name, + DevExecSpace exec_space, const int kl, const int ku, + const int jl, const int ju, const int il, const int iu, + const F &function, Args ...args) const { + const int Nk = ku - kl + 1; + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NkNjNi = Nk * Nj * Ni; + const int NjNi = Nj * Ni; + kokkos_dispatch( + tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), + KOKKOS_LAMBDA(const int &idx, FArgs ...fargs) { + int k = idx / NjNi; + int j = (idx - k * NjNi) / Ni; + int i = idx - k * NjNi - j * Ni; + k += kl; + j += jl; + i += il; + function(k, j, i, std::forward(fargs)...); + }, + std::forward(args)...); + } +}; + template class FlatFunctor { @@ -397,8 +427,43 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(k, j, i); } + template class FlatFunctor +{ + public: + FlatFunctor(){}; + template + inline void 
operator()(Tag tag, const std::string &name, + DevExecSpace exec_space, const int nl, const int nu, + const int kl, const int ku, const int jl, const int ju, + const int il, const int iu, const F &function, Args ...args) const { + const int Nn = nu - nl + 1; + const int Nk = ku - kl + 1; + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NnNkNjNi = Nn * Nk * Nj * Ni; + const int NkNjNi = Nk * Nj * Ni; + const int NjNi = Nj * Ni; + kokkos_dispatch( + tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), + KOKKOS_LAMBDA(const int &idx, FArgs ...fargs) { + int n = idx / NkNjNi; + int k = (idx - n * NkNjNi) / NjNi; + int j = (idx - n * NkNjNi - k * NjNi) / Ni; + int i = idx - n * NkNjNi - k * NjNi - j * Ni; + n += nl; + k += kl; + j += jl; + i += il; + function(n, k, j, i, std::forward(fargs)...); + }, + std::forward(args)...); + } +}; + +template +class FlatFunctor { public: FlatFunctor(){}; From 0730429877b72c75ff10ad8853604eee12010961 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 22 Jun 2024 23:31:32 +0200 Subject: [PATCH 05/99] refactor flatloop specialization --- src/kokkos_abstraction.hpp | 142 ++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 81 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 30b84dc9564f..fc183baec7f5 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -268,33 +268,44 @@ auto MakeFlatFunctor(F &function) { return FlatFunctor(); } +template +struct FlatLoop3D { + FlatLoop3D(){}; + template + inline void operator()(Tag tag, const F &function, const std::string &name, + DevExecSpace exec_space, const int kl, const int ku, + const int jl, const int ju, const int il, const int iu, + Args ...args) const { + const int Nk = ku - kl + 1; + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NkNjNi = Nk * Nj * Ni; + const int NjNi = Nj * Ni; + kokkos_dispatch( + tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), + 
KOKKOS_LAMBDA(const int &idx, FArgs ...fargs) { + int k = idx / NjNi; + int j = (idx - k * NjNi) / Ni; + int i = idx - k * NjNi - j * Ni; + k += kl; + j += jl; + i += il; + function(k, j, i, std::forward(fargs)...); + }, + std::forward(args)...); + } +}; + template class FlatFunctor { public: FlatFunctor(){}; template - inline void operator()(Tag tag, const std::string &name, - DevExecSpace exec_space, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - const F &function, Args ...args) const { - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - KOKKOS_LAMBDA(const int &idx, FArgs ...fargs) { - int k = idx / NjNi; - int j = (idx - k * NjNi) / Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - function(k, j, i, std::forward(fargs)...); - }, - std::forward(args)...); + inline void operator()(Tag tag, const F &function, Args ...args) const { + + const FlatLoop3D flat3D; + flat3D(tag, function, std::forward(args)...); } }; @@ -304,27 +315,9 @@ class FlatFunctor public: FlatFunctor(){}; template - inline void operator()(Tag tag, const std::string &name, - DevExecSpace exec_space, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - const F &function, Args ...args) const { - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - KOKKOS_LAMBDA(const int &idx, FArgs ...fargs) { - int k = idx / NjNi; - int j = (idx - k * NjNi) / Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - function(k, j, i, std::forward(fargs)...); - }, - std::forward(args)...); + inline void operator()(Tag tag, const F &function, Args 
...args) const { + const FlatLoop3D flat3D; + flat3D(tag, function, std::forward(args)...); } }; @@ -336,7 +329,7 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int iu, const Function &function, Args &&...args) { Tag tag; const auto func = MakeFlatFunctor(function); - func(tag, name, exec_space, kl, ku, jl, ju, il, iu, function, std::forward(args)...); + func(tag, function, name, exec_space, kl, ku, jl, ju, il, iu, std::forward(args)...); } // 3D loop using MDRange loops @@ -427,17 +420,14 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(k, j, i); } - -template -class FlatFunctor -{ - public: - FlatFunctor(){}; - template - inline void operator()(Tag tag, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const F &function, Args ...args) const { +template +struct FlatLoop4D { + FlatLoop4D(){}; + template + inline void operator()(Tag tag, const F &function, const std::string &name, + DevExecSpace exec_space, const int nl, const int nu, + const int kl, const int ku, const int jl, const int ju, + const int il, const int iu, Args ...args) const { const int Nn = nu - nl + 1; const int Nk = ku - kl + 1; const int Nj = ju - jl + 1; @@ -459,6 +449,18 @@ class FlatFunctor(fargs)...); }, std::forward(args)...); + } +}; + +template +class FlatFunctor +{ + public: + FlatFunctor(){}; + template + inline void operator()(Tag tag, const F &function, Args ...args) const { + const FlatLoop4D flat4D; + flat4D(tag, function, std::forward(args)...); } }; @@ -468,31 +470,9 @@ class FlatFunctor - inline void operator()(Tag tag, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const F &function, Args ...args) const { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - 
const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NnNkNjNi = Nn * Nk * Nj * Ni; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), - KOKKOS_LAMBDA(const int &idx, FArgs ...fargs) { - int n = idx / NkNjNi; - int k = (idx - n * NkNjNi) / NjNi; - int j = (idx - n * NkNjNi - k * NjNi) / Ni; - int i = idx - n * NkNjNi - k * NjNi - j * Ni; - n += nl; - k += kl; - j += jl; - i += il; - function(n, k, j, i, std::forward(fargs)...); - }, - std::forward(args)...); + inline void operator()(Tag tag, const F &function, Args ...args) const { + const FlatLoop4D flat4D; + flat4D(tag, function, std::forward(args)...); } }; @@ -505,8 +485,8 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp Args &&...args) { Tag tag; const auto func = MakeFlatFunctor(function); - func(tag, name, exec_space, nl, nu, kl, ku, jl, ju, il, iu, - function, std::forward(args)...); + func(tag, function, name, exec_space, nl, nu, kl, ku, jl, ju, il, iu, + std::forward(args)...); } // 4D loop using MDRange loops From adb15dd5d71d9e88a712fb375eec905e2909a4c4 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 23 Jun 2024 14:05:30 +0200 Subject: [PATCH 06/99] clean up --- src/kokkos_abstraction.hpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index fc183baec7f5..7338e43d4567 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -21,14 +21,12 @@ #define KOKKOS_ABSTRACTION_HPP_ #include -#include #include #include #include #include -#include "Kokkos_Macros.hpp" #include "basic_types.hpp" #include "config.hpp" #include "parthenon_array_generic.hpp" @@ -37,7 +35,6 @@ #include "utils/multi_pointer.hpp" #include "utils/object_pool.hpp" - namespace parthenon { #ifdef KOKKOS_ENABLE_CUDA_UVM @@ -735,13 +732,8 @@ inline void par_dispatch(LoopPatternSimdFor, const 
std::string &name, template inline void par_dispatch(const std::string &name, Args &&...args) { - if constexpr (std::is_same::value) { - par_dispatch(DEFAULT_LOOP_PATTERN, name, DevExecSpace(), - std::forward(args)...); - } else { - par_dispatch(loop_pattern_mdrange_tag, name, DevExecSpace(), - std::forward(args)...); - } + par_dispatch(DEFAULT_LOOP_PATTERN, name, DevExecSpace(), + std::forward(args)...); } template From 5db0d34954901ec5193d6a538aeddbd941473f08 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 23 Jun 2024 14:10:54 +0200 Subject: [PATCH 07/99] formatting --- src/kokkos_abstraction.hpp | 199 ++++++++++++++++---------------- tst/unit/kokkos_abstraction.cpp | 91 ++++++++------- 2 files changed, 146 insertions(+), 144 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 7338e43d4567..6032d3cc817b 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -258,64 +258,63 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac function, std::forward(args)...); } -template class FlatFunctor; +template +class FlatFunctor; -template +template auto MakeFlatFunctor(F &function) { - return FlatFunctor(); + return FlatFunctor(); } -template +template struct FlatLoop3D { - FlatLoop3D(){}; - template - inline void operator()(Tag tag, const F &function, const std::string &name, - DevExecSpace exec_space, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - Args ...args) const { - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - KOKKOS_LAMBDA(const int &idx, FArgs ...fargs) { - int k = idx / NjNi; - int j = (idx - k * NjNi) / Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - function(k, j, i, std::forward(fargs)...); - }, - std::forward(args)...); 
- } + FlatLoop3D() {}; + template + inline void operator()(Tag tag, const F &function, const std::string &name, + DevExecSpace exec_space, const int kl, const int ku, + const int jl, const int ju, const int il, const int iu, + Args... args) const { + const int Nk = ku - kl + 1; + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NkNjNi = Nk * Nj * Ni; + const int NjNi = Nj * Ni; + kokkos_dispatch( + tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), + KOKKOS_LAMBDA(const int &idx, FArgs... fargs) { + int k = idx / NjNi; + int j = (idx - k * NjNi) / Ni; + int i = idx - k * NjNi - j * Ni; + k += kl; + j += jl; + i += il; + function(k, j, i, std::forward(fargs)...); + }, + std::forward(args)...); + } }; -template -class FlatFunctor -{ - public: - FlatFunctor(){}; - template - inline void operator()(Tag tag, const F &function, Args ...args) const { - - const FlatLoop3D flat3D; - flat3D(tag, function, std::forward(args)...); - } +template +class FlatFunctor { + public: + FlatFunctor() {}; + template + inline void operator()(Tag tag, const F &function, Args... args) const { + + const FlatLoop3D flat3D; + flat3D(tag, function, std::forward(args)...); + } }; -template -class FlatFunctor -{ - public: - FlatFunctor(){}; - template - inline void operator()(Tag tag, const F &function, Args ...args) const { - const FlatLoop3D flat3D; - flat3D(tag, function, std::forward(args)...); - } +template +class FlatFunctor { + public: + FlatFunctor() {}; + template + inline void operator()(Tag tag, const F &function, Args... 
args) const { + const FlatLoop3D flat3D; + flat3D(tag, function, std::forward(args)...); + } }; // 3D loop using Kokkos 1D Range @@ -326,7 +325,8 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int iu, const Function &function, Args &&...args) { Tag tag; const auto func = MakeFlatFunctor(function); - func(tag, function, name, exec_space, kl, ku, jl, ju, il, iu, std::forward(args)...); + func(tag, function, name, exec_space, kl, ku, jl, ju, il, iu, + std::forward(args)...); } // 3D loop using MDRange loops @@ -417,60 +417,59 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(k, j, i); } -template +template struct FlatLoop4D { - FlatLoop4D(){}; - template - inline void operator()(Tag tag, const F &function, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, Args ...args) const { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NnNkNjNi = Nn * Nk * Nj * Ni; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), - KOKKOS_LAMBDA(const int &idx, FArgs ...fargs) { - int n = idx / NkNjNi; - int k = (idx - n * NkNjNi) / NjNi; - int j = (idx - n * NkNjNi - k * NjNi) / Ni; - int i = idx - n * NkNjNi - k * NjNi - j * Ni; - n += nl; - k += kl; - j += jl; - i += il; - function(n, k, j, i, std::forward(fargs)...); - }, - std::forward(args)...); - } + FlatLoop4D() {}; + template + inline void operator()(Tag tag, const F &function, const std::string &name, + DevExecSpace exec_space, const int nl, const int nu, + const int kl, const int ku, const int jl, const int ju, + const int il, const int iu, Args... 
args) const { + const int Nn = nu - nl + 1; + const int Nk = ku - kl + 1; + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NnNkNjNi = Nn * Nk * Nj * Ni; + const int NkNjNi = Nk * Nj * Ni; + const int NjNi = Nj * Ni; + kokkos_dispatch( + tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), + KOKKOS_LAMBDA(const int &idx, FArgs... fargs) { + int n = idx / NkNjNi; + int k = (idx - n * NkNjNi) / NjNi; + int j = (idx - n * NkNjNi - k * NjNi) / Ni; + int i = idx - n * NkNjNi - k * NjNi - j * Ni; + n += nl; + k += kl; + j += jl; + i += il; + function(n, k, j, i, std::forward(fargs)...); + }, + std::forward(args)...); + } }; -template -class FlatFunctor -{ - public: - FlatFunctor(){}; - template - inline void operator()(Tag tag, const F &function, Args ...args) const { - const FlatLoop4D flat4D; - flat4D(tag, function, std::forward(args)...); - } +template +class FlatFunctor { + public: + FlatFunctor() {}; + template + inline void operator()(Tag tag, const F &function, Args... args) const { + const FlatLoop4D flat4D; + flat4D(tag, function, std::forward(args)...); + } }; -template -class FlatFunctor -{ - public: - FlatFunctor(){}; - template - inline void operator()(Tag tag, const F &function, Args ...args) const { - const FlatLoop4D flat4D; - flat4D(tag, function, std::forward(args)...); - } +template +class FlatFunctor { + public: + FlatFunctor() {}; + template + inline void operator()(Tag tag, const F &function, Args... 
args) const { + const FlatLoop4D flat4D; + flat4D(tag, function, std::forward(args)...); + } }; // 4D loop using Kokkos 1D Range @@ -483,7 +482,7 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp Tag tag; const auto func = MakeFlatFunctor(function); func(tag, function, name, exec_space, nl, nu, kl, ku, jl, ju, il, iu, - std::forward(args)...); + std::forward(args)...); } // 4D loop using MDRange loops diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 015f1bcb8242..ae0e3fcb79e8 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -506,76 +506,79 @@ bool test_wrapper_reduce_3d(T loop_pattern, DevExecSpace exec_space) { parthenon::ParArray3D buffer("Testing buffer", N, N, N); // Initialize data parthenon::par_for( - loop_pattern, "Initialize parallel reduce array", exec_space, 0, N-1, 0, N-1, 0, N-1, - KOKKOS_LAMBDA(const int k, const int j, const int i) { buffer(k,j,i) = i+j+k; }); + loop_pattern, "Initialize parallel reduce array", exec_space, 0, N - 1, 0, N - 1, 0, + N - 1, KOKKOS_LAMBDA(const int k, const int j, const int i) { + buffer(k, j, i) = i + j + k; + }); int tot = 0; for (int k = 0; k < N; ++k) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < N; ++i) { - tot += i+j+k; - } - } + for (int j = 0; j < N; ++j) { + for (int i = 0; i < N; ++i) { + tot += i + j + k; + } + } } int test_tot = 0; parthenon::par_reduce( - loop_pattern, "Sum via par reduce", exec_space, - 0, N-1, 0, N-1, 0, N-1, - KOKKOS_LAMBDA(const int k, const int j, const int i, int &t) { - t += i+j+k; - }, Kokkos::Sum(test_tot)); + loop_pattern, "Sum via par reduce", exec_space, 0, N - 1, 0, N - 1, 0, N - 1, + KOKKOS_LAMBDA(const int k, const int j, const int i, int &t) { t += i + j + k; }, + Kokkos::Sum(test_tot)); return tot == test_tot; } template bool test_wrapper_reduce_4d(T loop_pattern, DevExecSpace exec_space) { - constexpr int N = 10; - parthenon::ParArray4D buffer("Testing 
buffer", N, N, N, N); - // Initialize data - parthenon::par_for( - loop_pattern, "Initialize parallel reduce array", exec_space, 0, N-1, 0, N-1, 0, N-1, 0, N-1, - KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { buffer(n,k,j,i) = i+j+k+n; }); - int tot = 0; - for (int n = 0; n < N; ++n) { - for (int k = 0; k < N; ++k) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < N; ++i) { - tot += i+j+k+n; - } - } + constexpr int N = 10; + parthenon::ParArray4D buffer("Testing buffer", N, N, N, N); + // Initialize data + parthenon::par_for( + loop_pattern, "Initialize parallel reduce array", exec_space, 0, N - 1, 0, N - 1, 0, + N - 1, 0, N - 1, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { + buffer(n, k, j, i) = i + j + k + n; + }); + int tot = 0; + for (int n = 0; n < N; ++n) { + for (int k = 0; k < N; ++k) { + for (int j = 0; j < N; ++j) { + for (int i = 0; i < N; ++i) { + tot += i + j + k + n; + } } - } + } + } int test_tot = 0; parthenon::par_reduce( - loop_pattern, "Sum via par reduce", exec_space, - 0, N-1, 0, N-1, 0, N-1, 0, N-1, + loop_pattern, "Sum via par reduce", exec_space, 0, N - 1, 0, N - 1, 0, N - 1, 0, + N - 1, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i, int &t) { - t += i+j+k+n; - }, Kokkos::Sum(test_tot)); + t += i + j + k + n; + }, + Kokkos::Sum(test_tot)); return tot == test_tot; } TEST_CASE("Parallel reduce", "[par_reduce]") { auto default_exec_space = DevExecSpace(); SECTION("1D loops") { - REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag, - default_exec_space) == true); - if constexpr (std::is_same::value) { - REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag, + REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag, default_exec_space) == true); - } + if constexpr (std::is_same::value) { + REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag, + default_exec_space) == true); + } } SECTION("3D loops") { - 
REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_flatrange_tag, - default_exec_space) == true); - REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_mdrange_tag, - default_exec_space) == true); + REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_flatrange_tag, + default_exec_space) == true); + REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_mdrange_tag, + default_exec_space) == true); } SECTION("4D loops") { - REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_flatrange_tag, - default_exec_space) == true); - REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_mdrange_tag, - default_exec_space) == true); + REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_flatrange_tag, + default_exec_space) == true); + REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_mdrange_tag, + default_exec_space) == true); } } From ba335d796c80e1e66a7253d23957d37abab0daad Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 23 Jun 2024 17:29:28 +0200 Subject: [PATCH 08/99] linting --- src/kokkos_abstraction.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 6032d3cc817b..6b0b7cc770e2 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -268,7 +268,7 @@ auto MakeFlatFunctor(F &function) { template struct FlatLoop3D { - FlatLoop3D() {}; + FlatLoop3D() {} template inline void operator()(Tag tag, const F &function, const std::string &name, DevExecSpace exec_space, const int kl, const int ku, @@ -297,10 +297,9 @@ struct FlatLoop3D { template class FlatFunctor { public: - FlatFunctor() {}; + FlatFunctor() {} template inline void operator()(Tag tag, const F &function, Args... args) const { - const FlatLoop3D flat3D; flat3D(tag, function, std::forward(args)...); } @@ -309,7 +308,7 @@ class FlatFunctor class FlatFunctor { public: - FlatFunctor() {}; + FlatFunctor() {} template inline void operator()(Tag tag, const F &function, Args... 
args) const { const FlatLoop3D flat3D; @@ -419,7 +418,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, template struct FlatLoop4D { - FlatLoop4D() {}; + FlatLoop4D() {} template inline void operator()(Tag tag, const F &function, const std::string &name, DevExecSpace exec_space, const int nl, const int nu, @@ -452,7 +451,7 @@ struct FlatLoop4D { template class FlatFunctor { public: - FlatFunctor() {}; + FlatFunctor() {} template inline void operator()(Tag tag, const F &function, Args... args) const { const FlatLoop4D flat4D; @@ -464,7 +463,7 @@ template class FlatFunctor { public: - FlatFunctor() {}; + FlatFunctor() {} template inline void operator()(Tag tag, const F &function, Args... args) const { const FlatLoop4D flat4D; From 2fa8ad150c8b0c4cec94ef47aa33d722aac0d67b Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 24 Jun 2024 00:11:52 +0200 Subject: [PATCH 09/99] templating functor index types --- src/kokkos_abstraction.hpp | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 6b0b7cc770e2..58e0db791378 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -294,19 +294,8 @@ struct FlatLoop3D { } }; -template -class FlatFunctor { - public: - FlatFunctor() {} - template - inline void operator()(Tag tag, const F &function, Args... args) const { - const FlatLoop3D flat3D; - flat3D(tag, function, std::forward(args)...); - } -}; - -template -class FlatFunctor { +template +class FlatFunctor { public: FlatFunctor() {} template @@ -448,20 +437,8 @@ struct FlatLoop4D { } }; -template -class FlatFunctor { - public: - FlatFunctor() {} - template - inline void operator()(Tag tag, const F &function, Args... 
args) const { - const FlatLoop4D flat4D; - flat4D(tag, function, std::forward(args)...); - } -}; - -template -class FlatFunctor { +template +class FlatFunctor { public: FlatFunctor() {} template From b9a95a155f205066f0cffeb01d57029a8865878b Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 24 Jun 2024 11:24:56 +0200 Subject: [PATCH 10/99] moved to a single functor --- src/kokkos_abstraction.hpp | 139 +++++++++++++++---------------------- 1 file changed, 57 insertions(+), 82 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 58e0db791378..dae75a1de463 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -258,50 +258,32 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac function, std::forward(args)...); } -template +template class FlatFunctor; -template -auto MakeFlatFunctor(F &function) { - return FlatFunctor(); +template +auto MakeFlatFunctor(F &function, Args... args) { + return FlatFunctor(function, std::forward(args)...); } -template -struct FlatLoop3D { - FlatLoop3D() {} - template - inline void operator()(Tag tag, const F &function, const std::string &name, - DevExecSpace exec_space, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - Args... args) const { - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - KOKKOS_LAMBDA(const int &idx, FArgs... 
fargs) { - int k = idx / NjNi; - int j = (idx - k * NjNi) / Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - function(k, j, i, std::forward(fargs)...); - }, - std::forward(args)...); - } -}; +template +class FlatFunctor { + int NjNi, Nj, Ni, kl, jl, il; + Function function; -template -class FlatFunctor { public: - FlatFunctor() {} - template - inline void operator()(Tag tag, const F &function, Args... args) const { - const FlatLoop3D flat3D; - flat3D(tag, function, std::forward(args)...); + FlatFunctor(const Function _function, const int _NjNi, const int _Nj, const int _Ni, + const int _kl, const int _jl, const int _il) + : function(_function), NjNi(_NjNi), Nj(_Nj), Ni(_Ni), kl(_kl), jl(_jl), il(_il) {} + KOKKOS_INLINE_FUNCTION + void operator()(const int &idx, FArgs &&...fargs) const { + int k = idx / NjNi; + int j = (idx - k * NjNi) / Ni; + int i = idx - k * NjNi - j * Ni; + k += kl; + j += jl; + i += il; + function(k, j, i, std::forward(fargs)...); } }; @@ -312,9 +294,14 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function, Args &&...args) { Tag tag; - const auto func = MakeFlatFunctor(function); - func(tag, function, name, exec_space, kl, ku, jl, ju, il, iu, - std::forward(args)...); + const int Nk = ku - kl + 1; + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NkNjNi = Nk * Nj * Ni; + const int NjNi = Nj * Ni; + kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), + MakeFlatFunctor(function, NjNi, Nj, Ni, kl, jl, il), + std::forward(args)...); } // 3D loop using MDRange loops @@ -405,46 +392,27 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(k, j, i); } -template -struct FlatLoop4D { - FlatLoop4D() {} - template - inline void operator()(Tag tag, const F &function, const std::string &name, - DevExecSpace exec_space, const 
int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, Args... args) const { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NnNkNjNi = Nn * Nk * Nj * Ni; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - kokkos_dispatch( - tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), - KOKKOS_LAMBDA(const int &idx, FArgs... fargs) { - int n = idx / NkNjNi; - int k = (idx - n * NkNjNi) / NjNi; - int j = (idx - n * NkNjNi - k * NjNi) / Ni; - int i = idx - n * NkNjNi - k * NjNi - j * Ni; - n += nl; - k += kl; - j += jl; - i += il; - function(n, k, j, i, std::forward(fargs)...); - }, - std::forward(args)...); - } -}; +template +class FlatFunctor { + int NkNjNi, NjNi, Nj, Ni, nl, kl, jl, il; + Function function; -template -class FlatFunctor { public: - FlatFunctor() {} - template - inline void operator()(Tag tag, const F &function, Args... args) const { - const FlatLoop4D flat4D; - flat4D(tag, function, std::forward(args)...); + FlatFunctor(const Function _function, const int _NkNjNi, const int _NjNi, const int _Nj, + const int _Ni, const int _nl, const int _kl, const int _jl, const int _il) + : function(_function), NkNjNi(_NkNjNi), NjNi(_NjNi), Nj(_Nj), Ni(_Ni), nl(_nl), + kl(_kl), jl(_jl), il(_il) {} + KOKKOS_INLINE_FUNCTION + void operator()(const int &idx, FArgs &&...fargs) const { + int n = idx / NkNjNi; + int k = (idx - n * NkNjNi) / NjNi; + int j = (idx - n * NkNjNi - k * NjNi) / Ni; + int i = idx - n * NkNjNi - k * NjNi - j * Ni; + n += nl; + k += kl; + j += jl; + i += il; + function(n, k, j, i, std::forward(fargs)...); } }; @@ -456,9 +424,16 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int ju, const int il, const int iu, const Function &function, Args &&...args) { Tag tag; - const auto func = MakeFlatFunctor(function); - func(tag, function, name, exec_space, nl, nu, 
kl, ku, jl, ju, il, iu, - std::forward(args)...); + const int Nn = nu - nl + 1; + const int Nk = ku - kl + 1; + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NnNkNjNi = Nn * Nk * Nj * Ni; + const int NkNjNi = Nk * Nj * Ni; + const int NjNi = Nj * Ni; + kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), + MakeFlatFunctor(function, NkNjNi, NjNi, Nj, Ni, nl, kl, jl, il), + std::forward(args)...); } // 4D loop using MDRange loops From 9033aa3331f0bfaabf6e35dd8cde8877394f176f Mon Sep 17 00:00:00 2001 From: Adam C Reyes Date: Mon, 24 Jun 2024 12:50:27 +0200 Subject: [PATCH 11/99] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f80ef5dd8df..967c985a0b02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Current develop ### Added (new features/APIs/variables/...) +- [[PR 1130]](https://github.com/parthenon-hpc-lab/parthenon/pull/1130) Enable `parthenon::par_reduce` for MD loops with Kokkos 1D Range - [[PR 1099]](https://github.com/parthenon-hpc-lab/parthenon/pull/1099) Functionality for outputting task graphs in GraphViz format. - [[PR 1091]](https://github.com/parthenon-hpc-lab/parthenon/pull/1091) Add vector wave equation example. - [[PR 991]](https://github.com/parthenon-hpc-lab/parthenon/pull/991) Add fine fields. 
From 503ae0c4848ef2a05303cddfb0511345dfa94140 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Tue, 25 Jun 2024 18:14:20 +0200 Subject: [PATCH 12/99] first pass, doesn't like 4D loops --- src/kokkos_abstraction.hpp | 168 +++++++++++++++++++++++++------------ 1 file changed, 116 insertions(+), 52 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index dae75a1de463..96dca3caf08f 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -27,6 +27,7 @@ #include +#include "Kokkos_Macros.hpp" #include "basic_types.hpp" #include "config.hpp" #include "parthenon_array_generic.hpp" @@ -214,6 +215,71 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { } } // namespace dispatch_impl +namespace meta { +template +struct if_else { + using type = T; +}; + +template +struct if_else { + using type = F; +}; + +template +struct PackList {}; + +template +constexpr int PackLength(PackList) {return sizeof...(Ts);} + +} // namespace meta + +namespace meta { + +template +struct GetIndexND {}; + +template +struct GetIndexND, PackList<>> { + using value = PackList; +}; + +template +struct GetIndexND, PackList> { + using value = typename if_else::value, + typename GetIndexND, PackList>::value, + PackList>::type ; +}; + +template +struct GetFArgs {}; + +template +struct GetFArgs> { + using value = PackList<>; +}; + +template +struct GetFArgs> { + using value + = typename if_else::value, + PackList, + typename GetFArgs>::value>::type; +}; + +template +struct FunctionSignature {}; + +template +struct FunctionSignature { + using IndexND = GetIndexND, PackList>; + using FArgs = GetFArgs, PackList>; +}; + +} // namespace meta + + // 1D loop using RangePolicy loops template inline typename std::enable_if::type @@ -258,35 +324,57 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac function, std::forward(args)...); } -template -class FlatFunctor; + +template +class FlatFunctor {}; + +template +class 
FlatFunctor, meta::PackList> { + Kokkos::Array ranges; + Kokkos::Array strides; + Function function; + + public: + template + FlatFunctor(const Function _function, Args ...args) + : function(_function), ranges({{args...}}) { + for (int ri=1; ri(fargs)...); + } +}; template auto MakeFlatFunctor(F &function, Args... args) { - return FlatFunctor(function, std::forward(args)...); + using signature = meta::FunctionSignature; + using IndexND = typename signature::IndexND::value; + return FlatFunctor, + typename signature::FArgs::value>(function, std::forward(args)...); } -template -class FlatFunctor { - int NjNi, Nj, Ni, kl, jl, il; - Function function; - - public: - FlatFunctor(const Function _function, const int _NjNi, const int _Nj, const int _Ni, - const int _kl, const int _jl, const int _il) - : function(_function), NjNi(_NjNi), Nj(_Nj), Ni(_Ni), kl(_kl), jl(_jl), il(_il) {} - KOKKOS_INLINE_FUNCTION - void operator()(const int &idx, FArgs &&...fargs) const { - int k = idx / NjNi; - int j = (idx - k * NjNi) / Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - function(k, j, i, std::forward(fargs)...); - } -}; - // 3D loop using Kokkos 1D Range template inline typename std::enable_if::type @@ -298,9 +386,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int Nj = ju - jl + 1; const int Ni = iu - il + 1; const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; + const IndexRange k{kl, ku}, j{jl, ju}, i{il, iu}; kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - MakeFlatFunctor(function, NjNi, Nj, Ni, kl, jl, il), + MakeFlatFunctor(function, k, j, i), std::forward(args)...); } @@ -392,29 +480,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(k, j, i); } -template -class FlatFunctor { - int NkNjNi, NjNi, Nj, Ni, nl, kl, jl, il; - Function function; - - public: - FlatFunctor(const Function _function, const int _NkNjNi, const int _NjNi, const int 
_Nj, - const int _Ni, const int _nl, const int _kl, const int _jl, const int _il) - : function(_function), NkNjNi(_NkNjNi), NjNi(_NjNi), Nj(_Nj), Ni(_Ni), nl(_nl), - kl(_kl), jl(_jl), il(_il) {} - KOKKOS_INLINE_FUNCTION - void operator()(const int &idx, FArgs &&...fargs) const { - int n = idx / NkNjNi; - int k = (idx - n * NkNjNi) / NjNi; - int j = (idx - n * NkNjNi - k * NjNi) / Ni; - int i = idx - n * NkNjNi - k * NjNi - j * Ni; - n += nl; - k += kl; - j += jl; - i += il; - function(n, k, j, i, std::forward(fargs)...); - } -}; // 4D loop using Kokkos 1D Range template @@ -429,10 +494,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int Nj = ju - jl + 1; const int Ni = iu - il + 1; const int NnNkNjNi = Nn * Nk * Nj * Ni; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; + const IndexRange n{nl, nu}, k{kl, ku}, j{jl, ju}, i{il, iu}; kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), - MakeFlatFunctor(function, NkNjNi, NjNi, Nj, Ni, nl, kl, jl, il), + MakeFlatFunctor(function, n, k, j, i), std::forward(args)...); } From e01dabfd0da654f5fb88dd3f97931354a1679335 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Tue, 25 Jun 2024 19:29:20 +0200 Subject: [PATCH 13/99] formatting --- src/kokkos_abstraction.hpp | 120 ++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 61 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 96dca3caf08f..84d3e8314eaf 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -218,19 +218,21 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { namespace meta { template struct if_else { - using type = T; + using type = T; }; template struct if_else { - using type = F; + using type = F; }; template struct PackList {}; -template -constexpr int PackLength(PackList) {return sizeof...(Ts);} +template +constexpr int PackLength(PackList) { + return sizeof...(Ts); +} } // namespace 
meta @@ -241,14 +243,15 @@ struct GetIndexND {}; template struct GetIndexND, PackList<>> { - using value = PackList; + using value = PackList; }; template struct GetIndexND, PackList> { - using value = typename if_else::value, - typename GetIndexND, PackList>::value, - PackList>::type ; + using value = typename if_else< + std::is_same::value, + typename GetIndexND, PackList>::value, + PackList>::type; }; template @@ -256,30 +259,27 @@ struct GetFArgs {}; template struct GetFArgs> { - using value = PackList<>; + using value = PackList<>; }; template struct GetFArgs> { - using value - = typename if_else::value, - PackList, - typename GetFArgs>::value>::type; + using value = typename if_else::value, + typename GetFArgs>::value, + PackList>::type; }; template struct FunctionSignature {}; template -struct FunctionSignature { - using IndexND = GetIndexND, PackList>; - using FArgs = GetFArgs, PackList>; +struct FunctionSignature { + using IndexND = GetIndexND, PackList>; + using FArgs = GetFArgs>; }; } // namespace meta - // 1D loop using RangePolicy loops template inline typename std::enable_if::type @@ -324,55 +324,56 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac function, std::forward(args)...); } - -template +template class FlatFunctor {}; template -class FlatFunctor, meta::PackList> { - Kokkos::Array ranges; - Kokkos::Array strides; - Function function; - - public: - template - FlatFunctor(const Function _function, Args ...args) +class FlatFunctor, + meta::PackList> { + Kokkos::Array ranges; + Kokkos::Array strides; + Function function; + + public: + template + FlatFunctor(const Function _function, Args... args) : function(_function), ranges({{args...}}) { - for (int ri=1; ri(fargs)...); + KOKKOS_INLINE_FUNCTION + void operator()(const int &idx, FArgs... 
fargs) const { + constexpr int ND = sizeof...(Is); + int inds[ND]; + inds[0] = idx; + for (int i = 1; i < ND; i++) { + inds[i] = idx; + inds[i - 1] /= strides[i - 1]; + for (int j = 0; j < i; j++) { + inds[i] -= inds[j] * strides[j]; } + } + for (int i = 0; i < ND; i++) { + inds[i] += ranges[i].s; + } + + function(inds[Is]..., std::forward(fargs)...); + } }; template auto MakeFlatFunctor(F &function, Args... args) { - using signature = meta::FunctionSignature; - using IndexND = typename signature::IndexND::value; - return FlatFunctor, - typename signature::FArgs::value>(function, std::forward(args)...); + using signature = meta::FunctionSignature; + using IndexND = typename signature::IndexND::value; + return FlatFunctor, + typename signature::FArgs::value>(function, + std::forward(args)...); } // 3D loop using Kokkos 1D Range @@ -388,8 +389,7 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int NkNjNi = Nk * Nj * Ni; const IndexRange k{kl, ku}, j{jl, ju}, i{il, iu}; kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - MakeFlatFunctor(function, k, j, i), - std::forward(args)...); + MakeFlatFunctor(function, k, j, i), std::forward(args)...); } // 3D loop using MDRange loops @@ -480,7 +480,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(k, j, i); } - // 4D loop using Kokkos 1D Range template inline typename std::enable_if::type @@ -496,8 +495,7 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int NnNkNjNi = Nn * Nk * Nj * Ni; const IndexRange n{nl, nu}, k{kl, ku}, j{jl, ju}, i{il, iu}; kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), - MakeFlatFunctor(function, n, k, j, i), - std::forward(args)...); + MakeFlatFunctor(function, n, k, j, i), std::forward(args)...); } // 4D loop using MDRange loops From 259115d01b060b322842caaabaac5a5830b6e2b4 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Wed, 26 Jun 2024 
00:25:38 +0200 Subject: [PATCH 14/99] added overload for index ranges --- src/kokkos_abstraction.hpp | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 84d3e8314eaf..fd6cdc5d6c4c 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -336,8 +336,21 @@ class FlatFunctor, public: template - FlatFunctor(const Function _function, Args... args) - : function(_function), ranges({{args...}}) { + FlatFunctor(const Function _function, IndexRange idr, Args... args) + : function(_function), ranges({{idr, args...}}) { + Initialize(); + } + + template + FlatFunctor(const Function _function, Args... args) : function(_function) { + std::array indices{{args...}}; + for (int i = 0; i < sizeof...(Is); i++) { + ranges[i] = {indices[2 * i], indices[2 * i + 1]}; + } + Initialize(); + } + + inline void Initialize() { for (int ri = 1; ri < sizeof...(Is); ri++) { const int N = ranges[ri].e - ranges[ri].s + 1; strides[ri - 1] = N; @@ -387,9 +400,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int Nj = ju - jl + 1; const int Ni = iu - il + 1; const int NkNjNi = Nk * Nj * Ni; - const IndexRange k{kl, ku}, j{jl, ju}, i{il, iu}; kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - MakeFlatFunctor(function, k, j, i), std::forward(args)...); + MakeFlatFunctor(function, kl, ku, jl, ju, il, iu), + std::forward(args)...); } // 3D loop using MDRange loops @@ -493,9 +506,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp const int Nj = ju - jl + 1; const int Ni = iu - il + 1; const int NnNkNjNi = Nn * Nk * Nj * Ni; - const IndexRange n{nl, nu}, k{kl, ku}, j{jl, ju}, i{il, iu}; kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), - MakeFlatFunctor(function, n, k, j, i), std::forward(args)...); + MakeFlatFunctor(function, nl, nu, kl, ku, jl, ju, il, iu), + 
std::forward(args)...); } // 4D loop using MDRange loops From ad53f401e90c048c8bc9dfaa3985034f32e2266f Mon Sep 17 00:00:00 2001 From: adam reyes Date: Wed, 26 Jun 2024 16:09:40 +0200 Subject: [PATCH 15/99] generic par_dispatch for all flatrange loops --- src/kokkos_abstraction.hpp | 326 ++++++++++++++++++------------------- 1 file changed, 154 insertions(+), 172 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index fd6cdc5d6c4c..927ef2e1c6f9 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -216,15 +216,9 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { } // namespace dispatch_impl namespace meta { -template -struct if_else { - using type = T; -}; -template -struct if_else { - using type = F; -}; +template +using base_type = typename std::remove_cv::type>::type; template struct PackList {}; @@ -234,39 +228,74 @@ constexpr int PackLength(PackList) { return sizeof...(Ts); } -} // namespace meta +template +struct PopList {}; -namespace meta { +template +struct PopList<1, PackList> { + using type = T; + using value = PackList; +}; + +template +struct PopList> { + static_assert(N > 1, "PopList requires N>=1"); + + private: + using pop = PopList>; + + public: + using type = typename pop::type; + using value = typename pop::value; +}; + +template +struct MergeLists {}; + +template +struct MergeLists, PackList<>> { + using value = PackList; +}; + +template +struct MergeLists, PackList> { + using value = typename MergeLists, PackList>::value; +}; template -struct GetIndexND {}; +struct PackSameType {}; template -struct GetIndexND, PackList<>> { +struct PackSameType, PackList<>> { using value = PackList; }; template -struct GetIndexND, PackList> { - using value = typename if_else< - std::is_same::value, - typename GetIndexND, PackList>::value, +struct PackSameType, PackList> { + using value = typename std::conditional< + std::is_convertible::value, + typename PackSameType, PackList>::value, 
PackList>::type; }; -template -struct GetFArgs {}; +} // namespace meta + +namespace meta { + +template +struct PackIntegralType {}; -template -struct GetFArgs> { - using value = PackList<>; +template +struct PackIntegralType, PackList<>> { + using value = PackList; }; -template -struct GetFArgs> { - using value = typename if_else::value, - typename GetFArgs>::value, - PackList>::type; +template +struct PackIntegralType, PackList> { + using value = std::conditional< + std::is_integral::value, + typename PackIntegralType, PackList>::value, + PackList>; }; template @@ -274,55 +303,23 @@ struct FunctionSignature {}; template struct FunctionSignature { - using IndexND = GetIndexND, PackList>; - using FArgs = GetFArgs>; + using IndexND = typename PackSameType, PackList>::value; + using FArgs = PopList>; }; -} // namespace meta - -// 1D loop using RangePolicy loops -template -inline typename std::enable_if::type -par_dispatch(Pattern, const std::string &name, DevExecSpace exec_space, const int &il, - const int &iu, const Function &function, Args &&...args) { - PARTHENON_INSTRUMENT_REGION(name) - if constexpr (std::is_same::value && - std::is_same::value) { -#pragma omp simd - for (auto i = il; i <= iu; i++) { - function(i); - } - } else { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::RangePolicy<>(exec_space, il, iu + 1), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); - } -} - -template -inline typename std::enable_if::type -par_dispatch(Pattern p, const std::string &name, DevExecSpace exec_space, - const IndexRange &r, const Function &function, Args &&...args) { - par_dispatch(p, name, exec_space, r.s, r.e, function, std::forward(args)...); -} +template +struct DispatchSignature {}; + +template +struct DispatchSignature> { + using IndexND = typename PackSameType, PackList>::value; + using Function = + typename PopList>::type; + using Args = + typename PopList>::value; +}; -// 2D loop 
using MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int jl, const int ju, const int il, const int iu, - const Function &function, Args &&...args) { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>( - exec_space, {jl, il}, {ju + 1, iu + 1}, {1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); -} +} // namespace meta template class FlatFunctor {}; @@ -383,26 +380,99 @@ class FlatFunctor, template auto MakeFlatFunctor(F &function, Args... args) { using signature = meta::FunctionSignature; - using IndexND = typename signature::IndexND::value; + using IndexND = typename signature::IndexND; return FlatFunctor, typename signature::FArgs::value>(function, std::forward(args)...); } -// 3D loop using Kokkos 1D Range +template +struct par_dispatch_impl {}; + +template +struct par_dispatch_impl, + meta::PackList> { + template + inline void dispatch(Pattern, std::string name, DevExecSpace exec_space, Index... ids, + Function function, Args... 
args) { + using index_type = typename meta::PopList<1, meta::PackList>::type; + constexpr bool is_FlatRange = std::is_same::value; + Tag tag; + + if constexpr (is_FlatRange) { + int rangeNx = 1; + if constexpr (std::is_same::value) { + for (auto &irange : {ids...}) { + rangeNx *= irange.e - irange.s + 1; + } + } else { + int indices[sizeof...(Index)] = {ids...}; + for (int i = 0; i < sizeof...(Index); i += 2) { + rangeNx *= indices[i + 1] - indices[i] + 1; + } + } + + kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, rangeNx), + MakeFlatFunctor(function, std::forward(ids)...), + std::forward(args)...); + } + }; +}; + +template +inline typename std::enable_if::value, + void>::type +par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { + using dispatchsig = meta::DispatchSignature>; + using Function = typename dispatchsig::Function; // functor type + using IndexND = typename dispatchsig::IndexND; // list of index types + using Args = typename dispatchsig::Args; // + par_dispatch_impl().dispatch( + Pattern(), name, exec_space, std::forward(args)...); +} + +// 1D loop using RangePolicy loops +template +inline typename std::enable_if::type +par_dispatch(Pattern, const std::string &name, DevExecSpace exec_space, const int &il, + const int &iu, const Function &function, Args &&...args) { + PARTHENON_INSTRUMENT_REGION(name) + if constexpr (std::is_same::value && + std::is_same::value) { +#pragma omp simd + for (auto i = il; i <= iu; i++) { + function(i); + } + } else { + Tag tag; + kokkos_dispatch(tag, name, + Kokkos::Experimental::require( + Kokkos::RangePolicy<>(exec_space, il, iu + 1), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + function, std::forward(args)...); + } +} + +template +inline typename std::enable_if::type +par_dispatch(Pattern p, const std::string &name, DevExecSpace exec_space, + const IndexRange &r, const Function &function, Args &&...args) { + par_dispatch(p, name, exec_space, r.s, r.e, 
function, std::forward(args)...); +} + +// 2D loop using MDRange loops template inline typename std::enable_if::type -par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_space, - const int kl, const int ku, const int jl, const int ju, const int il, - const int iu, const Function &function, Args &&...args) { +par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, + const int jl, const int ju, const int il, const int iu, + const Function &function, Args &&...args) { Tag tag; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NkNjNi = Nk * Nj * Ni; - kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi), - MakeFlatFunctor(function, kl, ku, jl, ju, il, iu), - std::forward(args)...); + kokkos_dispatch(tag, name, + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + exec_space, {jl, il}, {ju + 1, iu + 1}, {1, iu + 1 - il}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + function, std::forward(args)...); } // 3D loop using MDRange loops @@ -493,24 +563,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(k, j, i); } -// 4D loop using Kokkos 1D Range -template -inline typename std::enable_if::type -par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_space, - const int nl, const int nu, const int kl, const int ku, const int jl, - const int ju, const int il, const int iu, const Function &function, - Args &&...args) { - Tag tag; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NnNkNjNi = Nn * Nk * Nj * Ni; - kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi), - MakeFlatFunctor(function, nl, nu, kl, ku, jl, ju, il, iu), - std::forward(args)...); -} - // 4D loop using MDRange loops template inline typename std::enable_if::type @@ -632,39 +684,6 @@ 
par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac function, std::forward(args)...); } -// 5D loop using Kokkos 1D Range -template -inline void par_dispatch(LoopPatternFlatRange, const std::string &name, - DevExecSpace exec_space, const int bl, const int bu, - const int nl, const int nu, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - const Function &function) { - const int Nb = bu - bl + 1; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NbNnNkNjNi = Nb * Nn * Nk * Nj * Ni; - const int NnNkNjNi = Nn * Nk * Nj * Ni; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - Kokkos::parallel_for( - name, Kokkos::RangePolicy<>(exec_space, 0, NbNnNkNjNi), - KOKKOS_LAMBDA(const int &idx) { - int b = idx / NnNkNjNi; - int n = (idx - b * NnNkNjNi) / NkNjNi; - int k = (idx - b * NnNkNjNi - n * NkNjNi) / NjNi; - int j = (idx - b * NnNkNjNi - n * NkNjNi - k * NjNi) / Ni; - int i = idx - b * NnNkNjNi - n * NkNjNi - k * NjNi - j * Ni; - b += bl; - n += nl; - k += kl; - j += jl; - i += il; - function(b, n, k, j, i); - }); -} - // 5D loop using SIMD FOR loops template inline void par_dispatch(LoopPatternSimdFor, const std::string &name, @@ -700,43 +719,6 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac function, std::forward(args)...); } -// 6D loop using Kokkos 1D Range -template -inline void par_dispatch(LoopPatternFlatRange, const std::string &name, - DevExecSpace exec_space, const int ll, const int lu, - const int ml, const int mu, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - const int Nl = lu - ll + 1; - const int Nm = mu - ml + 1; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NjNi = Nj * Ni; - 
const int NkNjNi = Nk * NjNi; - const int NnNkNjNi = Nn * NkNjNi; - const int NmNnNkNjNi = Nm * NnNkNjNi; - const int NlNmNnNkNjNi = Nl * NmNnNkNjNi; - Kokkos::parallel_for( - name, Kokkos::RangePolicy<>(exec_space, 0, NlNmNnNkNjNi), - KOKKOS_LAMBDA(const int &idx) { - int l = idx / NmNnNkNjNi; - int m = (idx - l * NmNnNkNjNi) / NnNkNjNi; - int n = (idx - l * NmNnNkNjNi - m * NnNkNjNi) / NkNjNi; - int k = (idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi) / NjNi; - int j = (idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi - k * NjNi) / Ni; - int i = idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi - k * NjNi - j * Ni; - l += ll; - m += ml; - n += nl; - k += kl; - j += jl; - i += il; - function(l, m, n, k, j, i); - }); -} - // 6D loop using SIMD FOR loops template inline void par_dispatch(LoopPatternSimdFor, const std::string &name, From 442c04f414ca88dcc445e2e8a7908bc0bbc3cc5b Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 27 Jun 2024 01:12:04 +0200 Subject: [PATCH 16/99] wrapped 2D MDRange loop --- src/kokkos_abstraction.hpp | 194 +++++++++++++++++++++++++++++++------ 1 file changed, 163 insertions(+), 31 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 927ef2e1c6f9..26c47aaf1fc3 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -26,6 +26,7 @@ #include #include +#include #include "Kokkos_Macros.hpp" #include "basic_types.hpp" @@ -249,6 +250,14 @@ struct PopList> { using value = typename pop::value; }; +template +struct AppendList {}; + +template +struct AppendList> { + using value = PackList; +}; + template struct MergeLists {}; @@ -278,6 +287,25 @@ struct PackSameType, PackList> { PackList>::type; }; +template +struct SequenceOfOnes {}; + +template +struct SequenceOfOnes<0, std::integer_sequence>{ + using value = typename std::integer_sequence; +}; + +template +struct SequenceOfOnes> { + using value = typename SequenceOfOnes>::value; +}; + +template +struct SequenceOfOnes { + static_assert(N > 
0, "N must be positive"); + using value = typename SequenceOfOnes>::value; +}; + } // namespace meta namespace meta { @@ -307,16 +335,40 @@ struct FunctionSignature { using FArgs = PopList>; }; +template +using function_signature = FunctionSignature; + +template +struct GetLaunchBounds {}; + +template +struct GetLaunchBounds> { + + template + struct is_one_of : std::false_type {}; + + template + struct is_one_of> + : std::bool_constant<(std::is_same_v || ...)> {}; + + using bound_variants = std::variant; + using bound = T; + using LaunchBounds = GetLaunchBounds>; + using value = typename std::conditional < is_one_of::type, + typename AppendList::value, PackList<>>::type; +}; + template struct DispatchSignature {}; template struct DispatchSignature> { - using IndexND = typename PackSameType, PackList>::value; + using LaunchBounds = typename PackSameType, PackList>::value; using Function = - typename PopList>::type; + typename PopList>::type; using Args = - typename PopList>::value; + typename PopList>::value; + }; } // namespace meta @@ -378,7 +430,7 @@ class FlatFunctor, }; template -auto MakeFlatFunctor(F &function, Args... args) { +auto MakeFlatFunctor(F &function, Args &&...args) { using signature = meta::FunctionSignature; using IndexND = typename signature::IndexND; return FlatFunctor, @@ -386,37 +438,112 @@ auto MakeFlatFunctor(F &function, Args... args) { std::forward(args)...); } +template +class MDRange {}; + +template +class MDRange, std::integer_sequence> { + Kokkos::Array lower, upper; + + public: + template + MDRange(IndexRange idr, Args... args) { + std::array ranges{{idr, args...}}; + for (int i = 0; i < sizeof...(Is); i++) { + lower[i] = ranges[i].s; + upper[i] = ranges[i].e; + } + } + + template + MDRange(Args... 
args) { + std::array indices{{args...}}; + for (int i = 0; i < sizeof...(Is); i++) { + lower[i] = indices[2*i]; + upper[i] = indices[2*i+1]; + } + } + + auto policy(DevExecSpace exec_space) { + constexpr int ND = sizeof...(Is); + return Kokkos::MDRangePolicy>(exec_space, + {lower[Is]...}, {1+upper[Is]...}, {ones..., upper[ND-1] + 1 - lower[ND-1]}); + } +}; + +template +auto MakeMDRangePolicy(DevExecSpace exec_space, Args &&...args) { + using signature = meta::FunctionSignature; + using IndexND = typename signature::IndexND; + static_assert(sizeof...(Args) % meta::PackLength(IndexND()) == 0, "Index doesn't match functor signature"); + return MDRange, + typename meta::SequenceOfOnes::value >(std::forward(args)...).policy(exec_space); +} + + + + template struct par_dispatch_impl {}; -template -struct par_dispatch_impl, +template +struct par_dispatch_impl, meta::PackList> { + + using BoundType = typename meta::PopList<1, meta::PackList>::type; + + // Index ...ids probably shouldn't be that template - inline void dispatch(Pattern, std::string name, DevExecSpace exec_space, Index... ids, - Function function, Args... 
args) { - using index_type = typename meta::PopList<1, meta::PackList>::type; - constexpr bool is_FlatRange = std::is_same::value; + inline void dispatch(Pattern, std::string name, DevExecSpace exec_space, Bounds &&...ids, + Function function, Args &&...args) { + Tag tag; - if constexpr (is_FlatRange) { + kokkos_dispatch(tag, name, + policy(Pattern(), exec_space, std::forward(ids)...), + functor(Pattern(), function, std::forward(ids)...), + std::forward(args)...); + }; + + template + KOKKOS_INLINE_FUNCTION + auto policy(Pattern, DevExecSpace exec_space, Bounds &&...ids) const { + constexpr bool is_FlatRange = std::is_same::value; + constexpr bool is_MDRange = std::is_same::value; + + if constexpr (is_FlatRange) { int rangeNx = 1; - if constexpr (std::is_same::value) { - for (auto &irange : {ids...}) { - rangeNx *= irange.e - irange.s + 1; - } + if constexpr (std::is_same::value) { + for (auto &irange : {ids...}) { + rangeNx *= irange.e - irange.s + 1; + } } else { - int indices[sizeof...(Index)] = {ids...}; - for (int i = 0; i < sizeof...(Index); i += 2) { - rangeNx *= indices[i + 1] - indices[i] + 1; - } + int indices[sizeof...(Bounds)] = {ids...}; + for (int i = 0; i < sizeof...(Bounds); i += 2) { + rangeNx *= indices[i + 1] - indices[i] + 1; + } } + return Kokkos::RangePolicy<>(exec_space, 0, rangeNx); - kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, rangeNx), - MakeFlatFunctor(function, std::forward(ids)...), - std::forward(args)...); - } + } else if constexpr (is_MDRange) { + return MakeMDRangePolicy(exec_space, std::forward(ids)...); + } else { + } }; + + template + KOKKOS_INLINE_FUNCTION + auto functor(Pattern, Function function, Bounds &&...ids) const { + constexpr bool is_FlatRange = std::is_same::value; + constexpr bool is_MDRange = std::is_same::value; + if constexpr (is_FlatRange) { + return MakeFlatFunctor(function, std::forward(ids)...); + } else if constexpr(is_MDRange) { + return function; + } else { + } + + } }; template @@ -425,9 
+552,9 @@ inline typename std::enable_if::valu par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { using dispatchsig = meta::DispatchSignature>; using Function = typename dispatchsig::Function; // functor type - using IndexND = typename dispatchsig::IndexND; // list of index types + using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types using Args = typename dispatchsig::Args; // - par_dispatch_impl().dispatch( + par_dispatch_impl().dispatch( Pattern(), name, exec_space, std::forward(args)...); } @@ -467,12 +594,17 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac const int jl, const int ju, const int il, const int iu, const Function &function, Args &&...args) { Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>( - exec_space, {jl, il}, {ju + 1, iu + 1}, {1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); + using Bounds = meta::PackList; + using dispArgs = meta::PackList; + par_dispatch_impl().dispatch(LoopPatternMDRange(), name, exec_space, + jl, ju, il, iu, function, std::forward(args)...); + /* kokkos_dispatch(tag, name, */ + /* Kokkos::Experimental::require( */ + /* Kokkos::MDRangePolicy>( */ + /* exec_space, {jl, il}, {ju + 1, iu + 1}, {1, iu + 1 - il}), */ + /* Kokkos::Experimental::WorkItemProperty::HintLightWeight), */ + /* function, std::forward(args)...); */ } // 3D loop using MDRange loops From a865f002f4baadafdfb9f88b7f6a6034e1367b7f Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 27 Jun 2024 13:23:58 +0200 Subject: [PATCH 17/99] wrapped rest of MDRange loops --- src/kokkos_abstraction.hpp | 130 ++++++++----------------------------- 1 file changed, 26 insertions(+), 104 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 26c47aaf1fc3..c13e9830c637 100644 --- a/src/kokkos_abstraction.hpp +++ 
b/src/kokkos_abstraction.hpp @@ -258,6 +258,14 @@ struct AppendList> { using value = PackList; }; +template +struct PrependList {}; + +template +struct PrependList> { + using value = PackList; +}; + template struct MergeLists {}; @@ -341,21 +349,24 @@ using function_signature = FunctionSignature; template struct GetLaunchBounds {}; +template <> +struct GetLaunchBounds> { + using value = PackList<>; +}; + template struct GetLaunchBounds> { - template - struct is_one_of : std::false_type {}; - - template - struct is_one_of> - : std::bool_constant<(std::is_same_v || ...)> {}; + template + static constexpr bool is_BoundType() { + return std::numeric_limits::is_integer || std::is_same_v ; + } - using bound_variants = std::variant; - using bound = T; + using bound_variants = std::variant; + using bound =std::remove_cv_t>; using LaunchBounds = GetLaunchBounds>; - using value = typename std::conditional < is_one_of::type, - typename AppendList::value, PackList<>>::type; + using value = typename std::conditional < is_BoundType(), + typename PrependList>::value>::value, PackList<>>::type; }; template @@ -363,7 +374,7 @@ struct DispatchSignature {}; template struct DispatchSignature> { - using LaunchBounds = typename PackSameType, PackList>::value; + using LaunchBounds = typename GetLaunchBounds>::value; using Function = typename PopList>::type; using Args = @@ -475,14 +486,12 @@ template auto MakeMDRangePolicy(DevExecSpace exec_space, Args &&...args) { using signature = meta::FunctionSignature; using IndexND = typename signature::IndexND; - static_assert(sizeof...(Args) % meta::PackLength(IndexND()) == 0, "Index doesn't match functor signature"); + static_assert(sizeof...(Args) % meta::PackLength(IndexND()) == 0, + "Launch Bounds don't match functor signature"); return MDRange, typename meta::SequenceOfOnes::value >(std::forward(args)...).policy(exec_space); } - - - template struct par_dispatch_impl {}; @@ -492,7 +501,6 @@ struct par_dispatch_impl, using BoundType = typename 
meta::PopList<1, meta::PackList>::type; - // Index ...ids probably shouldn't be that template inline void dispatch(Pattern, std::string name, DevExecSpace exec_space, Bounds &&...ids, Function function, Args &&...args) { @@ -547,7 +555,8 @@ struct par_dispatch_impl, }; template -inline typename std::enable_if::value, +inline typename std::enable_if::value || + std::is_same::value, void>::type par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { using dispatchsig = meta::DispatchSignature>; @@ -587,42 +596,6 @@ par_dispatch(Pattern p, const std::string &name, DevExecSpace exec_space, par_dispatch(p, name, exec_space, r.s, r.e, function, std::forward(args)...); } -// 2D loop using MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int jl, const int ju, const int il, const int iu, - const Function &function, Args &&...args) { - Tag tag; - using Bounds = meta::PackList; - using dispArgs = meta::PackList; - par_dispatch_impl().dispatch(LoopPatternMDRange(), name, exec_space, - jl, ju, il, iu, function, std::forward(args)...); - /* kokkos_dispatch(tag, name, */ - /* Kokkos::Experimental::require( */ - /* Kokkos::MDRangePolicy>( */ - /* exec_space, {jl, il}, {ju + 1, iu + 1}, {1, iu + 1 - il}), */ - /* Kokkos::Experimental::WorkItemProperty::HintLightWeight), */ - /* function, std::forward(args)...); */ -} - -// 3D loop using MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int &kl, const int &ku, const int &jl, const int &ju, const int &il, - const int &iu, const Function &function, Args &&...args) { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(exec_space, {kl, jl, il}, - {ku + 1, ju + 1, iu + 1}, - {1, 1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - 
function, std::forward(args)...); -} - // 3D loop using TeamPolicy with single inner TeamThreadRange template inline void par_dispatch(LoopPatternTPTTR, const std::string &name, @@ -695,23 +668,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(k, j, i); } -// 4D loop using MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int nl, const int nu, const int kl, const int ku, const int jl, - const int ju, const int il, const int iu, const Function &function, - Args &&...args) { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>( - exec_space, {nl, kl, jl, il}, {nu + 1, ku + 1, ju + 1, iu + 1}, - {1, 1, 1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); -} - // 4D loop using TeamPolicy loop with inner TeamThreadRange template inline void par_dispatch(LoopPatternTPTTR, const std::string &name, @@ -798,23 +754,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(n, k, j, i); } -// 5D loop using MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int ml, const int mu, const int nl, const int nu, const int kl, - const int ku, const int jl, const int ju, const int il, const int iu, - const Function &function, Args &&...args) { - Tag tag; - kokkos_dispatch( - tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>(exec_space, {ml, nl, kl, jl, il}, - {mu + 1, nu + 1, ku + 1, ju + 1, iu + 1}, - {1, 1, 1, 1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); -} // 5D loop using SIMD FOR loops template @@ -833,23 +772,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, function(b, n, k, j, i); } -// 6D loop using 
MDRange loops -template -inline typename std::enable_if::type -par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_space, - const int ll, const int lu, const int ml, const int mu, const int nl, - const int nu, const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function, Args &&...args) { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::MDRangePolicy>( - exec_space, {ll, ml, nl, kl, jl, il}, - {lu + 1, mu + 1, nu + 1, ku + 1, ju + 1, iu + 1}, - {1, 1, 1, 1, 1, iu + 1 - il}), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); -} // 6D loop using SIMD FOR loops template From dd46b7e5511fc5e9f80c5e275e8aa4e582a599a5 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 27 Jun 2024 22:11:33 +0200 Subject: [PATCH 18/99] enabled all simd loops --- src/kokkos_abstraction.hpp | 354 +++++++++++++++++-------------------- 1 file changed, 163 insertions(+), 191 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index c13e9830c637..ebfb45d4a523 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -20,6 +20,7 @@ #ifndef KOKKOS_ABSTRACTION_HPP_ #define KOKKOS_ABSTRACTION_HPP_ +#include #include #include #include @@ -32,6 +33,7 @@ #include "basic_types.hpp" #include "config.hpp" #include "parthenon_array_generic.hpp" +#include "utils/concepts_lite.hpp" #include "utils/error_checking.hpp" #include "utils/instrument.hpp" #include "utils/multi_pointer.hpp" @@ -255,7 +257,7 @@ struct AppendList {}; template struct AppendList> { - using value = PackList; + using value = PackList; }; template @@ -263,7 +265,7 @@ struct PrependList {}; template struct PrependList> { - using value = PackList; + using value = PackList; }; template @@ -295,23 +297,23 @@ struct PackSameType, PackList> { PackList>::type; }; -template +template struct SequenceOfOnes {}; -template -struct SequenceOfOnes<0, 
std::integer_sequence>{ - using value = typename std::integer_sequence; +template +struct SequenceOfOnes<0, std::integer_sequence> { + using value = typename std::integer_sequence; }; -template +template struct SequenceOfOnes> { - using value = typename SequenceOfOnes>::value; + using value = typename SequenceOfOnes>::value; }; -template +template struct SequenceOfOnes { - static_assert(N > 0, "N must be positive"); - using value = typename SequenceOfOnes>::value; + static_assert(N > 0, "N must be positive"); + using value = typename SequenceOfOnes>::value; }; } // namespace meta @@ -343,30 +345,32 @@ struct FunctionSignature { using FArgs = PopList>; }; -template +template using function_signature = FunctionSignature; -template +template struct GetLaunchBounds {}; template <> struct GetLaunchBounds> { - using value = PackList<>; + using value = PackList<>; }; template -struct GetLaunchBounds> { - - template - static constexpr bool is_BoundType() { - return std::numeric_limits::is_integer || std::is_same_v ; - } - - using bound_variants = std::variant; - using bound =std::remove_cv_t>; - using LaunchBounds = GetLaunchBounds>; - using value = typename std::conditional < is_BoundType(), - typename PrependList>::value>::value, PackList<>>::type; +struct GetLaunchBounds> { + + template + static constexpr bool is_BoundType() { + return std::numeric_limits::is_integer || std::is_same_v; + } + + using bound_variants = std::variant; + using bound = std::remove_cv_t>; + using LaunchBounds = GetLaunchBounds>; + using value = typename std::conditional< + is_BoundType(), + typename PrependList>::value>::value, + PackList<>>::type; }; template @@ -375,11 +379,9 @@ struct DispatchSignature {}; template struct DispatchSignature> { using LaunchBounds = typename GetLaunchBounds>::value; - using Function = - typename PopList>::type; - using Args = - typename PopList>::value; - + using pop = PopList>; + using Function = typename pop::type; + using Args = typename pop::value; }; } // 
namespace meta @@ -395,6 +397,7 @@ class FlatFunctor, Function function; public: + static constexpr int LoopDims = sizeof...(Is); template FlatFunctor(const Function _function, IndexRange idr, Args... args) : function(_function), ranges({{idr, args...}}) { @@ -422,17 +425,16 @@ class FlatFunctor, KOKKOS_INLINE_FUNCTION void operator()(const int &idx, FArgs... fargs) const { - constexpr int ND = sizeof...(Is); - int inds[ND]; + int inds[LoopDims]; inds[0] = idx; - for (int i = 1; i < ND; i++) { + for (int i = 1; i < LoopDims; i++) { inds[i] = idx; inds[i - 1] /= strides[i - 1]; for (int j = 0; j < i; j++) { inds[i] -= inds[j] * strides[j]; } } - for (int i = 0; i < ND; i++) { + for (int i = 0; i < LoopDims; i++) { inds[i] += ranges[i].s; } @@ -449,144 +451,180 @@ auto MakeFlatFunctor(F &function, Args &&...args) { std::forward(args)...); } -template +template class MDRange {}; -template -class MDRange, std::integer_sequence> { - Kokkos::Array lower, upper; +template +class MDRange> { + public: + static constexpr size_t Rank = sizeof...(Is); + Kokkos::Array lower, upper; - public: template MDRange(IndexRange idr, Args... args) { - std::array ranges{{idr, args...}}; - for (int i = 0; i < sizeof...(Is); i++) { - lower[i] = ranges[i].s; - upper[i] = ranges[i].e; + std::array ranges{{idr, args...}}; + for (int i = 0; i < Rank; i++) { + lower[i] = ranges[i].s; + upper[i] = ranges[i].e; } - } + } template MDRange(Args... 
args) { - std::array indices{{args...}}; - for (int i = 0; i < sizeof...(Is); i++) { - lower[i] = indices[2*i]; - upper[i] = indices[2*i+1]; + std::array indices{{static_cast(args)...}}; + for (int i = 0; i < Rank; i++) { + lower[i] = indices[2 * i]; + upper[i] = indices[2 * i + 1]; } } - auto policy(DevExecSpace exec_space) { - constexpr int ND = sizeof...(Is); - return Kokkos::MDRangePolicy>(exec_space, - {lower[Is]...}, {1+upper[Is]...}, {ones..., upper[ND-1] + 1 - lower[ND-1]}); + template + auto policy(std::integer_sequence, DevExecSpace exec_space) { + return Kokkos::MDRangePolicy>( + exec_space, {lower[Is]...}, {1 + upper[Is]...}, + {ones..., upper[Rank - 1] + 1 - lower[Rank - 1]}); } }; -template +template +inline auto MakeMDRange(Args &&...args) { + return MDRange>(std::forward(args)...); +} + +template auto MakeMDRangePolicy(DevExecSpace exec_space, Args &&...args) { - using signature = meta::FunctionSignature; - using IndexND = typename signature::IndexND; - static_assert(sizeof...(Args) % meta::PackLength(IndexND()) == 0, - "Launch Bounds don't match functor signature"); - return MDRange, - typename meta::SequenceOfOnes::value >(std::forward(args)...).policy(exec_space); + using Ones = typename meta::SequenceOfOnes::value; + return MakeMDRange(std::forward(args)...).policy(Ones(), exec_space); } -template +template +struct SimdFor { + template + using Sequence = std::make_index_sequence; + + std::array indices; + MDRange> mdrange; + + template + SimdFor(Args &&...args) : mdrange(std::forward(args)...) 
{} + + template + inline void dispatch(Function &function) { + dispatch_impl<1>(function); + } + + private: + template + inline void dispatch_simd(std::integer_sequence, Function &function) { + for (int i = mdrange.lower[Rank - 1]; i <= mdrange.upper[Rank - 1]; i++) { +#pragma omp simd + function(indices[Is]..., i); + } + } + + template + inline void dispatch_impl(Function &function) { + if constexpr (LoopCount < Rank) { + for (int i = mdrange.lower[LoopCount - 1]; i <= mdrange.upper[LoopCount - 1]; i++) { + indices[LoopCount - 1] = i; + dispatch_impl(function); + } + } else { + dispatch_simd(Sequence(), function); + } + } +}; + +template struct par_dispatch_impl {}; -template -struct par_dispatch_impl, +template +struct par_dispatch_impl, meta::PackList> { + using signature = meta::function_signature>; + static constexpr size_t Rank = meta::PackLength(typename signature::IndexND()); + using BoundType = typename meta::PopList<1, meta::PackList>::type; + static constexpr bool is_IndexRangeBounds = + std::is_same>>::value; + + static constexpr bool is_ParFor = + std::is_same::value; - template - inline void dispatch(Pattern, std::string name, DevExecSpace exec_space, Bounds &&...ids, + using IsFlatRange = std::is_same; + using IsMDRange = std::is_same; + using IsSimdFor = std::is_same; + + // fallback simd par_reduce to flat range + static constexpr bool is_FlatRange = + (IsFlatRange::value || (IsSimdFor::value && !is_ParFor)); + static constexpr bool is_SimdFor = (IsSimdFor::value && is_ParFor); + static constexpr bool is_MDRange = IsMDRange::value; + + inline void dispatch(std::string name, DevExecSpace exec_space, Bounds &&...ids, Function function, Args &&...args) { Tag tag; - - kokkos_dispatch(tag, name, - policy(Pattern(), exec_space, std::forward(ids)...), - functor(Pattern(), function, std::forward(ids)...), + static_assert(!(is_MDRange && Rank < 2), "Can not launch MDRange with Rank < 2"); + if constexpr (is_SimdFor) { + 
SimdFor(std::forward(ids)...).dispatch(function); + } else { + kokkos_dispatch(tag, name, policy(exec_space, std::forward(ids)...), + functor(function, std::forward(ids)...), std::forward(args)...); + } }; - template KOKKOS_INLINE_FUNCTION - auto policy(Pattern, DevExecSpace exec_space, Bounds &&...ids) const { - constexpr bool is_FlatRange = std::is_same::value; - constexpr bool is_MDRange = std::is_same::value; + auto policy(DevExecSpace exec_space, Bounds &&...ids) const { - if constexpr (is_FlatRange) { + if constexpr (is_FlatRange) { int rangeNx = 1; - if constexpr (std::is_same::value) { - for (auto &irange : {ids...}) { - rangeNx *= irange.e - irange.s + 1; - } + /* if constexpr (std::is_same::value) { */ + if constexpr (is_IndexRangeBounds) { + for (auto &irange : {ids...}) { + rangeNx *= irange.e - irange.s + 1; + } } else { - int indices[sizeof...(Bounds)] = {ids...}; - for (int i = 0; i < sizeof...(Bounds); i += 2) { - rangeNx *= indices[i + 1] - indices[i] + 1; - } + int indices[sizeof...(Bounds)] = {ids...}; + for (int i = 0; i < sizeof...(Bounds); i += 2) { + rangeNx *= indices[i + 1] - indices[i] + 1; + } } return Kokkos::RangePolicy<>(exec_space, 0, rangeNx); - } else if constexpr (is_MDRange) { - return MakeMDRangePolicy(exec_space, std::forward(ids)...); - } else { - } + } else if constexpr (is_MDRange) { + return MakeMDRangePolicy(exec_space, std::forward(ids)...); + } else if constexpr (is_SimdFor) { + return loop_pattern_simdfor_tag; + } }; - template KOKKOS_INLINE_FUNCTION - auto functor(Pattern, Function function, Bounds &&...ids) const { - constexpr bool is_FlatRange = std::is_same::value; - constexpr bool is_MDRange = std::is_same::value; - if constexpr (is_FlatRange) { - return MakeFlatFunctor(function, std::forward(ids)...); - } else if constexpr(is_MDRange) { - return function; - } else { - } - + auto functor(Function function, Bounds &&...ids) const { + if constexpr (is_FlatRange) { + return MakeFlatFunctor(function, 
std::forward(ids)...); + } else if constexpr (is_MDRange || is_SimdFor) { + return function; + } } }; template inline typename std::enable_if::value || - std::is_same::value, + std::is_same::value || + std::is_same::value, void>::type par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { using dispatchsig = meta::DispatchSignature>; - using Function = typename dispatchsig::Function; // functor type - using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types - using Args = typename dispatchsig::Args; // - par_dispatch_impl().dispatch( - Pattern(), name, exec_space, std::forward(args)...); -} - -// 1D loop using RangePolicy loops -template -inline typename std::enable_if::type -par_dispatch(Pattern, const std::string &name, DevExecSpace exec_space, const int &il, - const int &iu, const Function &function, Args &&...args) { - PARTHENON_INSTRUMENT_REGION(name) - if constexpr (std::is_same::value && - std::is_same::value) { -#pragma omp simd - for (auto i = il; i <= iu; i++) { - function(i); - } - } else { - Tag tag; - kokkos_dispatch(tag, name, - Kokkos::Experimental::require( - Kokkos::RangePolicy<>(exec_space, il, iu + 1), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - function, std::forward(args)...); - } + using Function = typename dispatchsig::Function; // functor type + using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types + using Args = typename dispatchsig::Args; // + par_dispatch_impl().dispatch( + name, exec_space, std::forward(args)...); } template @@ -654,20 +692,6 @@ inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name, }); } -// 3D loop using SIMD FOR loops -template -inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, - const int &jl, const int &ju, const int &il, const int &iu, - const Function &function) { - PARTHENON_INSTRUMENT_REGION(name) - for (auto k = kl; k <= 
ku; k++) - for (auto j = jl; j <= ju; j++) -#pragma omp simd - for (auto i = il; i <= iu; i++) - function(k, j, i); -} - // 4D loop using TeamPolicy loop with inner TeamThreadRange template inline void par_dispatch(LoopPatternTPTTR, const std::string &name, @@ -739,58 +763,6 @@ inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name, }); } -// 4D loop using SIMD FOR loops -template -inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - PARTHENON_INSTRUMENT_REGION(name) - for (auto n = nl; n <= nu; n++) - for (auto k = kl; k <= ku; k++) - for (auto j = jl; j <= ju; j++) -#pragma omp simd - for (auto i = il; i <= iu; i++) - function(n, k, j, i); -} - - -// 5D loop using SIMD FOR loops -template -inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int bl, const int bu, - const int nl, const int nu, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - const Function &function) { - PARTHENON_INSTRUMENT_REGION(name) - for (auto b = bl; b <= bu; b++) - for (auto n = nl; n <= nu; n++) - for (auto k = kl; k <= ku; k++) - for (auto j = jl; j <= ju; j++) -#pragma omp simd - for (auto i = il; i <= iu; i++) - function(b, n, k, j, i); -} - - -// 6D loop using SIMD FOR loops -template -inline void par_dispatch(LoopPatternSimdFor, const std::string &name, - DevExecSpace exec_space, const int ll, const int lu, - const int ml, const int mu, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - PARTHENON_INSTRUMENT_REGION(name) - for (auto l = ll; l <= lu; l++) - for (auto m = ml; m <= mu; m++) - for (auto n = nl; n <= nu; n++) - for (auto k = kl; k <= ku; k++) - for (auto j = jl; j <= ju; j++) -#pragma omp 
simd - for (auto i = il; i <= iu; i++) - function(l, m, n, k, j, i); -} - template inline void par_dispatch(const std::string &name, Args &&...args) { par_dispatch(DEFAULT_LOOP_PATTERN, name, DevExecSpace(), From 7c7ecc01be0e5d4ba34c517b3d647febabff0cc9 Mon Sep 17 00:00:00 2001 From: Cloud User Date: Thu, 27 Jun 2024 21:04:29 +0000 Subject: [PATCH 19/99] cleaning up some warnings --- src/kokkos_abstraction.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index ebfb45d4a523..6bfb67c2b1fb 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -406,7 +406,7 @@ class FlatFunctor, template FlatFunctor(const Function _function, Args... args) : function(_function) { - std::array indices{{args...}}; + std::array indices{{static_cast(args)...}}; for (int i = 0; i < sizeof...(Is); i++) { ranges[i] = {indices[2 * i], indices[2 * i + 1]}; } @@ -443,7 +443,7 @@ class FlatFunctor, }; template -auto MakeFlatFunctor(F &function, Args &&...args) { +inline auto MakeFlatFunctor(F &function, Args &&...args) { using signature = meta::FunctionSignature; using IndexND = typename signature::IndexND; return FlatFunctor, @@ -492,7 +492,7 @@ inline auto MakeMDRange(Args &&...args) { } template -auto MakeMDRangePolicy(DevExecSpace exec_space, Args &&...args) { +inline auto MakeMDRangePolicy(DevExecSpace exec_space, Args &&...args) { using Ones = typename meta::SequenceOfOnes::value; return MakeMDRange(std::forward(args)...).policy(Ones(), exec_space); } @@ -578,18 +578,17 @@ struct par_dispatch_impl, } }; - KOKKOS_INLINE_FUNCTION + inline auto policy(DevExecSpace exec_space, Bounds &&...ids) const { if constexpr (is_FlatRange) { int rangeNx = 1; - /* if constexpr (std::is_same::value) { */ if constexpr (is_IndexRangeBounds) { for (auto &irange : {ids...}) { rangeNx *= irange.e - irange.s + 1; } } else { - int indices[sizeof...(Bounds)] = {ids...}; + int indices[sizeof...(Bounds)] = 
{static_cast(ids)...}; for (int i = 0; i < sizeof...(Bounds); i += 2) { rangeNx *= indices[i + 1] - indices[i] + 1; } @@ -603,7 +602,7 @@ struct par_dispatch_impl, } }; - KOKKOS_INLINE_FUNCTION + inline auto functor(Function function, Bounds &&...ids) const { if constexpr (is_FlatRange) { return MakeFlatFunctor(function, std::forward(ids)...); From e9b440d7706252899490102ca70d1be4557922aa Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 28 Jun 2024 00:32:27 +0200 Subject: [PATCH 20/99] cleaning up templates & traits --- src/kokkos_abstraction.hpp | 50 ++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 6bfb67c2b1fb..fec59f49a2f5 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -392,12 +392,13 @@ class FlatFunctor {}; template class FlatFunctor, meta::PackList> { - Kokkos::Array ranges; - Kokkos::Array strides; + + static constexpr size_t Rank = sizeof...(Is); + Kokkos::Array ranges; + Kokkos::Array strides; Function function; public: - static constexpr int LoopDims = sizeof...(Is); template FlatFunctor(const Function _function, IndexRange idr, Args... args) : function(_function), ranges({{idr, args...}}) { @@ -406,15 +407,15 @@ class FlatFunctor, template FlatFunctor(const Function _function, Args... args) : function(_function) { - std::array indices{{static_cast(args)...}}; - for (int i = 0; i < sizeof...(Is); i++) { + std::array indices{{static_cast(args)...}}; + for (int i = 0; i < Rank; i++) { ranges[i] = {indices[2 * i], indices[2 * i + 1]}; } Initialize(); } inline void Initialize() { - for (int ri = 1; ri < sizeof...(Is); ri++) { + for (int ri = 1; ri < Rank; ri++) { const int N = ranges[ri].e - ranges[ri].s + 1; strides[ri - 1] = N; for (int rj = 0; rj < ri - 1; rj++) { @@ -425,16 +426,16 @@ class FlatFunctor, KOKKOS_INLINE_FUNCTION void operator()(const int &idx, FArgs... 
fargs) const { - int inds[LoopDims]; + int inds[Rank]; inds[0] = idx; - for (int i = 1; i < LoopDims; i++) { + for (int i = 1; i < Rank; i++) { inds[i] = idx; inds[i - 1] /= strides[i - 1]; for (int j = 0; j < i; j++) { inds[i] -= inds[j] * strides[j]; } } - for (int i = 0; i < LoopDims; i++) { + for (int i = 0; i < Rank; i++) { inds[i] += ranges[i].s; } @@ -451,13 +452,9 @@ inline auto MakeFlatFunctor(F &function, Args &&...args) { std::forward(args)...); } -template -class MDRange {}; - -template -class MDRange> { +template +class MDRange { public: - static constexpr size_t Rank = sizeof...(Is); Kokkos::Array lower, upper; template @@ -478,8 +475,9 @@ class MDRange> { } } - template - auto policy(std::integer_sequence, DevExecSpace exec_space) { + template + auto policy(std::integer_sequence, + std::integer_sequence, DevExecSpace exec_space) { return Kokkos::MDRangePolicy>( exec_space, {lower[Is]...}, {1 + upper[Is]...}, {ones..., upper[Rank - 1] + 1 - lower[Rank - 1]}); @@ -488,13 +486,15 @@ class MDRange> { template inline auto MakeMDRange(Args &&...args) { - return MDRange>(std::forward(args)...); + return MDRange(std::forward(args)...); } template inline auto MakeMDRangePolicy(DevExecSpace exec_space, Args &&...args) { + using Indices = typename std::make_index_sequence; using Ones = typename meta::SequenceOfOnes::value; - return MakeMDRange(std::forward(args)...).policy(Ones(), exec_space); + return MakeMDRange(std::forward(args)...) + .policy(Indices(), Ones(), exec_space); } template @@ -503,7 +503,7 @@ struct SimdFor { using Sequence = std::make_index_sequence; std::array indices; - MDRange> mdrange; + MDRange mdrange; template SimdFor(Args &&...args) : mdrange(std::forward(args)...) 
{} @@ -557,6 +557,10 @@ struct par_dispatch_impl, using IsFlatRange = std::is_same; using IsMDRange = std::is_same; using IsSimdFor = std::is_same; + using IsTPTTR = std::is_same; + using IsTPTVR = std::is_same; + using IsTPTTRTVR = std::is_same; +//TODO: TPTTR, TPTVR, TPTTRTVR // fallback simd par_reduce to flat range static constexpr bool is_FlatRange = @@ -578,8 +582,7 @@ struct par_dispatch_impl, } }; - inline - auto policy(DevExecSpace exec_space, Bounds &&...ids) const { + inline auto policy(DevExecSpace exec_space, Bounds &&...ids) const { if constexpr (is_FlatRange) { int rangeNx = 1; @@ -602,8 +605,7 @@ struct par_dispatch_impl, } }; - inline - auto functor(Function function, Bounds &&...ids) const { + inline auto functor(Function function, Bounds &&...ids) const { if constexpr (is_FlatRange) { return MakeFlatFunctor(function, std::forward(ids)...); } else if constexpr (is_MDRange || is_SimdFor) { From 2452b48798fae727cda889f923e4d9824b274508 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 29 Jun 2024 15:15:06 -0400 Subject: [PATCH 21/99] adding loop collapse patterns --- src/kokkos_abstraction.hpp | 266 ++++++++++++++++++++++++++++++++----- 1 file changed, 234 insertions(+), 32 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index fec59f49a2f5..009b2f91b854 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -160,6 +160,7 @@ static struct LoopPatternFlatRange { // a 1:1 indices matching static struct LoopPatternMDRange { } loop_pattern_mdrange_tag; + // Translates to a Kokkos::TeamPolicy with a single inner // Kokkos::TeamThreadRange static struct LoopPatternTPTTR { @@ -175,6 +176,37 @@ static struct LoopPatternTPTTRTVR { // Used to catch undefined behavior as it results in throwing an error static struct LoopPatternUndefined { } loop_pattern_undefined_tag; +// Translates to a Kokkos::TeamPolicy that collapse Nteams outer loops +// with Nthread & Nvector inner loop collapses +template +struct 
LoopPatternCollapse {}; + +template +struct LoopPatternTeam : std::false_type {}; + +template +struct LoopPatternTeam, team + thread + vector, void> + : std::true_type { + using Nvector = std::integral_constant; + using Nthread = std::integral_constant; + using Nteam = std::integral_constant; + using LoopPattern = LoopPatternCollapse; +}; + +template< typename Pattern, size_t Rank> +struct LoopPatternTeam::value || + std::is_same::value || + std::is_same::value>::type> { + + static constexpr bool IsTPTTR = std::is_same::value; // inner TeamThreadRange + static constexpr bool IsTPTVR = std::is_same::value; // inner ThreadVectorRange + static constexpr bool IsTPTTRTVR = std::is_same::value; + + using Nvector = std::integral_constant; + using Nthread = std::integral_constant; + using Nteam = std::integral_constant; + using LoopPattern = LoopPatternCollapse; +}; // Tags for Nested parallelism where the outermost layer supports 1, 2, or 3 // indices @@ -221,7 +253,7 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { namespace meta { template -using base_type = typename std::remove_cv::type>::type; +using base_type = typename std::remove_cv_t>; template struct PackList {}; @@ -268,6 +300,21 @@ struct PrependList> { using value = PackList; }; +template +struct PopListBack {}; + +template +struct PopListBack<0, PackList> { + using value = PackList; +}; + +template +struct PopListBack> { + static constexpr bool NotFinished = N > 0; + using value = typename std::conditional< NotFinished , + typename PrependList>::value>::value, PackList>; +}; + template struct MergeLists {}; @@ -310,11 +357,8 @@ struct SequenceOfOnes> { using value = typename SequenceOfOnes>::value; }; -template -struct SequenceOfOnes { - static_assert(N > 0, "N must be positive"); - using value = typename SequenceOfOnes>::value; -}; +template +using sequence_of_ones = SequenceOfOnes>; } // namespace meta @@ -365,7 +409,7 @@ struct GetLaunchBounds> { } using bound_variants = std::variant; 
- using bound = std::remove_cv_t>; + using bound = base_type; using LaunchBounds = GetLaunchBounds>; using value = typename std::conditional< is_BoundType(), @@ -400,12 +444,14 @@ class FlatFunctor, public: template + KOKKOS_INLINE_FUNCTION FlatFunctor(const Function _function, IndexRange idr, Args... args) : function(_function), ranges({{idr, args...}}) { Initialize(); } template + KOKKOS_INLINE_FUNCTION FlatFunctor(const Function _function, Args... args) : function(_function) { std::array indices{{static_cast(args)...}}; for (int i = 0; i < Rank; i++) { @@ -414,7 +460,8 @@ class FlatFunctor, Initialize(); } - inline void Initialize() { + KOKKOS_INLINE_FUNCTION + void Initialize() { for (int ri = 1; ri < Rank; ri++) { const int N = ranges[ri].e - ranges[ri].s + 1; strides[ri - 1] = N; @@ -443,13 +490,155 @@ class FlatFunctor, } }; -template -inline auto MakeFlatFunctor(F &function, Args &&...args) { +template +KOKKOS_INLINE_FUNCTION +auto MakeFlatFunctor(F &function, Bounds &&...bounds) { using signature = meta::FunctionSignature; using IndexND = typename signature::IndexND; return FlatFunctor, typename signature::FArgs::value>(function, - std::forward(args)...); + std::forward(bounds)...); +} + +template +class CollapseFunctor {}; + +template +class CollapseFunctor< std::integer_sequence, + std::integer_sequence, + std::integer_sequence, Function> { + + static constexpr size_t Nteam = sizeof...(Iteam); + static constexpr size_t Nthread = sizeof...(Ithread); + static constexpr size_t Nvector = sizeof...(Ivector); + static constexpr size_t Rank = Nteam + Nthread + Nvector; + + Kokkos::Array ranges; + Kokkos::Array stridesTeam; + Kokkos::Array stridesThread; + Kokkos::Array stridesVector; + Function function; + public: + + template + KOKKOS_INLINE_FUNCTION + CollapseFunctor(const Function _function, IndexRange idr, Args... 
args) + : function(_function), ranges({{idr, args...}}) { + Initialize(); + } + + template + KOKKOS_INLINE_FUNCTION + CollapseFunctor(const Function _function, Args... args) : function(_function) { + std::array indices{{static_cast(args)...}}; + for (int i = 0; i < Rank; i++) { + ranges[i] = {indices[2 * i], indices[2 * i + 1]}; + } + Initialize(); + } + + KOKKOS_INLINE_FUNCTION + void Initialize() { + for (int ri = 1; ri < Nteam; ri++) { + const int N = ranges[ri].e - ranges[ri].s + 1; + stridesTeam[ri - 1] = N; + for (int rj = 0; rj < ri - 1; rj++) { + stridesTeam[rj] *= N; + } + } + for (int ri = 1+Nteam; ri < Nteam+Nthread; ri++) { + const int N = ranges[ri].e - ranges[ri].s + 1; + stridesThread[ri - 1] = N; + for (int rj = 0; rj < ri - 1; rj++) { + stridesThread[rj] *= N; + } + } + for (int ri = 1+Nteam+Nthread; ri < Rank; ri++) { + const int N = ranges[ri].e - ranges[ri].s + 1; + stridesVector[ri - 1] = N; + for (int rj = 0; rj < ri - 1; rj++) { + stridesVector[rj] *= N; + } + } + } + + KOKKOS_INLINE_FUNCTION + void recoverID(int *inds, int *strides, int &idx, int size) { + inds[0] = idx; + for (int i = 1; i < size; i++) { + inds[i] = idx; + inds[i-1] /= strides[i-1]; + for (int j = 0; j < 1; j++) { + inds[i] -= inds[j]*strides[j]; + } + } + } + + KOKKOS_INLINE_FUNCTION + int FlattenLaunchBound(int start, int end) { + int rangeNx = 1; + for (int i = start; i < end; i++) { + rangeNx *= ranges[i].e - ranges[i].s + 1; + } + return rangeNx; + } + + KOKKOS_INLINE_FUNCTION + void operator()(team_mbr_t team_member) { + int inds_team[Nteam]; + recoverID(inds_team, stridesTeam, team_member.league_rank(), Nteam); + + // recover the indices for the collapsed outer loops + if constexpr(Nthread > 0) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, 0, FlattenLaunchBound(Nteam, Nteam+Nthread)), + [&](const int idThread) { + int inds_thread[Nthread]; + recoverID(inds_thread, stridesThread, idThread, Nthread); + if constexpr (Nvector > 0 ) { + 
Kokkos::parallel_for( + Kokkos::TeamVectorRange(team_member, 0, FlattenLaunchBound(Nteam+Nthread, Rank)), + [&](const int idVector) { + int inds_vector[Nvector]; + recoverID(inds_vector, stridesVector, idVector, Nvector); + function(inds_team[Iteam]..., inds_thread[Ithread]..., inds_vector[Ivector]...); + }); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, 0, FlattenLaunchBound(Nteam, Nteam+Nthread)), + [&](const int idThread) { + int inds_thread[Nthread]; + recoverID(inds_thread, stridesThread, idThread, Nthread); + function(inds_team[Iteam]..., inds_thread[Ithread]...); + }); + } + }); + } else { + Kokkos::parallel_for( + Kokkos::TeamVectorRange(team_member, 0, FlattenLaunchBound(Nteam+Nthread, Rank)), + [&](const int idVector) { + int inds_vector[Nvector]; + recoverID(inds_vector, stridesVector, idVector, Nvector); + function(inds_team[Iteam]..., inds_vector[Ivector]...); + }); + } + } +}; + + +template +KOKKOS_INLINE_FUNCTION +auto MakeCollapseFunctor(LoopPatternCollapse, F &function, Bounds &&...bounds) { + using signature = meta::FunctionSignature; + using IndexND = typename signature::IndexND; + constexpr size_t Rank = meta::PackLength(IndexND()); + static_assert(Rank == Nteam + Nthread + Nvector, + "Rank of functor/lambda in par_for must much total number of loops to collapse"); + + return CollapseFunctor, + std::make_index_sequence, + std::make_index_sequence, F>(function, std::forward(bounds)...); + } template @@ -492,7 +681,7 @@ inline auto MakeMDRange(Args &&...args) { template inline auto MakeMDRangePolicy(DevExecSpace exec_space, Args &&...args) { using Indices = typename std::make_index_sequence; - using Ones = typename meta::SequenceOfOnes::value; + using Ones = typename meta::sequence_of_ones::value; return MakeMDRange(std::forward(args)...) 
.policy(Indices(), Ones(), exec_space); } @@ -538,18 +727,18 @@ struct SimdFor { template struct par_dispatch_impl {}; -template +template struct par_dispatch_impl, meta::PackList> { - using signature = meta::function_signature>; + using signature = meta::function_signature>; static constexpr size_t Rank = meta::PackLength(typename signature::IndexND()); using BoundType = typename meta::PopList<1, meta::PackList>::type; static constexpr bool is_IndexRangeBounds = std::is_same>>::value; + meta::base_type>::value; static constexpr bool is_ParFor = std::is_same::value; @@ -557,22 +746,21 @@ struct par_dispatch_impl, using IsFlatRange = std::is_same; using IsMDRange = std::is_same; using IsSimdFor = std::is_same; - using IsTPTTR = std::is_same; - using IsTPTVR = std::is_same; - using IsTPTTRTVR = std::is_same; -//TODO: TPTTR, TPTVR, TPTTRTVR + using TeamPattern = LoopPatternTeam; //false_type unless we use an outer team policy // fallback simd par_reduce to flat range static constexpr bool is_FlatRange = (IsFlatRange::value || (IsSimdFor::value && !is_ParFor)); static constexpr bool is_SimdFor = (IsSimdFor::value && is_ParFor); static constexpr bool is_MDRange = IsMDRange::value; + static constexpr bool is_Collapse = TeamPattern::type; inline void dispatch(std::string name, DevExecSpace exec_space, Bounds &&...ids, Function function, Args &&...args) { - Tag tag; static_assert(!(is_MDRange && Rank < 2), "Can not launch MDRange with Rank < 2"); + Tag tag; + PARTHENON_INSTRUMENT_REGION(name) if constexpr (is_SimdFor) { SimdFor(std::forward(ids)...).dispatch(function); } else { @@ -585,23 +773,16 @@ struct par_dispatch_impl, inline auto policy(DevExecSpace exec_space, Bounds &&...ids) const { if constexpr (is_FlatRange) { - int rangeNx = 1; - if constexpr (is_IndexRangeBounds) { - for (auto &irange : {ids...}) { - rangeNx *= irange.e - irange.s + 1; - } - } else { - int indices[sizeof...(Bounds)] = {static_cast(ids)...}; - for (int i = 0; i < sizeof...(Bounds); i += 2) { - 
rangeNx *= indices[i + 1] - indices[i] + 1; - } - } + int rangeNx = FlattenLaunchBound(std::forward(ids)...); return Kokkos::RangePolicy<>(exec_space, 0, rangeNx); } else if constexpr (is_MDRange) { return MakeMDRangePolicy(exec_space, std::forward(ids)...); } else if constexpr (is_SimdFor) { return loop_pattern_simdfor_tag; + } else if constexpr (is_Collapse) { + int rangeNx = FlattenLaunchBound(std::forward(ids)...); + return Kokkos::RangePolicy<>(exec_space, 0, rangeNx); } }; @@ -610,8 +791,29 @@ struct par_dispatch_impl, return MakeFlatFunctor(function, std::forward(ids)...); } else if constexpr (is_MDRange || is_SimdFor) { return function; + } else if constexpr (is_Collapse) { + return MakeCollapseFunctor(TeamPattern::LoopPattern(), function, std::forward(ids)...); } } + + private: + template + inline int FlattenLaunchBound(Bounds &&...ids) const { + static_assert(NCollapse <= Rank, "Can't flatten more loops than rank"); + int rangeNx = 1; + if constexpr (is_IndexRangeBounds) { + std::array ranges{{ids...}}; + for (int i = 0; i < NCollapse; i++) { + rangeNx *= ranges[i].e - ranges[i].s + 1; + } + } else { + int indices[sizeof...(Bounds)] = {static_cast(ids)...}; + for (int i = 0; i < 2*NCollapse; i += 2) { + rangeNx *= indices[i + 1] - indices[i] + 1; + } + } + return rangeNx; + } }; template From 188d4138d0c0ca63344d1fe20c9af5007912834a Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 1 Jul 2024 05:15:48 -0400 Subject: [PATCH 22/99] wrapped team policy loops --- src/kokkos_abstraction.hpp | 218 ++++++++----------------------------- 1 file changed, 45 insertions(+), 173 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 009b2f91b854..f8aa7a4d51ea 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -196,7 +196,8 @@ struct LoopPatternTeam, team + thread template< typename Pattern, size_t Rank> struct LoopPatternTeam::value || std::is_same::value || - std::is_same::value>::type> { + 
std::is_same::value>::type> +: std::true_type { static constexpr bool IsTPTTR = std::is_same::value; // inner TeamThreadRange static constexpr bool IsTPTVR = std::is_same::value; // inner ThreadVectorRange @@ -514,9 +515,7 @@ class CollapseFunctor< std::integer_sequence, static constexpr size_t Rank = Nteam + Nthread + Nvector; Kokkos::Array ranges; - Kokkos::Array stridesTeam; - Kokkos::Array stridesThread; - Kokkos::Array stridesVector; + Kokkos::Array strides; Function function; public: @@ -539,43 +538,47 @@ class CollapseFunctor< std::integer_sequence, KOKKOS_INLINE_FUNCTION void Initialize() { - for (int ri = 1; ri < Nteam; ri++) { - const int N = ranges[ri].e - ranges[ri].s + 1; - stridesTeam[ri - 1] = N; - for (int rj = 0; rj < ri - 1; rj++) { - stridesTeam[rj] *= N; + for (int ri = 0; ri < Nteam-1; ri++) { + const int N = ranges[ri+1].e - ranges[ri+1].s + 1; + strides[ri] = N; + for (int rj = 0; rj < ri; rj++) { + strides[rj] *= N; } } - for (int ri = 1+Nteam; ri < Nteam+Nthread; ri++) { - const int N = ranges[ri].e - ranges[ri].s + 1; - stridesThread[ri - 1] = N; - for (int rj = 0; rj < ri - 1; rj++) { - stridesThread[rj] *= N; + for (int ri = Nteam; ri < Nteam+Nthread-1; ri++) { + const int N = ranges[ri+1].e - ranges[ri+1].s + 1; + strides[ri] = N; + for (int rj = Nteam; rj < ri; rj++) { + strides[rj ] *= N; } } - for (int ri = 1+Nteam+Nthread; ri < Rank; ri++) { - const int N = ranges[ri].e - ranges[ri].s + 1; - stridesVector[ri - 1] = N; - for (int rj = 0; rj < ri - 1; rj++) { - stridesVector[rj] *= N; + for (int ri = Nteam+Nthread; ri < Rank-1; ri++) { + const int N = ranges[ri+1].e - ranges[ri+1].s + 1; + strides[ri] = N; + for (int rj = Nteam+Nthread; rj < ri; rj++) { + strides[rj ] *= N; } } } + template KOKKOS_INLINE_FUNCTION - void recoverID(int *inds, int *strides, int &idx, int size) { + void recoverID(Kokkos::Array &inds, int idx) const { inds[0] = idx; - for (int i = 1; i < size; i++) { + for (int i = 1; i < N; i++) { inds[i] = idx; - 
inds[i-1] /= strides[i-1]; - for (int j = 0; j < 1; j++) { - inds[i] -= inds[j]*strides[j]; + inds[i-1] /= strides[i-1 + start]; + for (int j = 0; j < i; j++) { + inds[i] -= inds[j]*strides[j + start]; } } + for (int i=0; i< N; i++) { + inds[i] += ranges[i+start].s; + } } KOKKOS_INLINE_FUNCTION - int FlattenLaunchBound(int start, int end) { + int FlattenLaunchBound(int start, int end) const { int rangeNx = 1; for (int i = start; i < end; i++) { rangeNx *= ranges[i].e - ranges[i].s + 1; @@ -584,41 +587,35 @@ class CollapseFunctor< std::integer_sequence, } KOKKOS_INLINE_FUNCTION - void operator()(team_mbr_t team_member) { - int inds_team[Nteam]; - recoverID(inds_team, stridesTeam, team_member.league_rank(), Nteam); + void operator()(team_mbr_t team_member) const { + Kokkos::Array inds_team; + recoverID(inds_team, team_member.league_rank()); // recover the indices for the collapsed outer loops if constexpr(Nthread > 0) { Kokkos::parallel_for( Kokkos::TeamThreadRange<>(team_member, 0, FlattenLaunchBound(Nteam, Nteam+Nthread)), [&](const int idThread) { - int inds_thread[Nthread]; - recoverID(inds_thread, stridesThread, idThread, Nthread); + Kokkos::Array inds_thread; + recoverID(inds_thread, idThread); if constexpr (Nvector > 0 ) { Kokkos::parallel_for( Kokkos::TeamVectorRange(team_member, 0, FlattenLaunchBound(Nteam+Nthread, Rank)), [&](const int idVector) { - int inds_vector[Nvector]; - recoverID(inds_vector, stridesVector, idVector, Nvector); + Kokkos::Array inds_vector; + recoverID(inds_vector, idVector); function(inds_team[Iteam]..., inds_thread[Ithread]..., inds_vector[Ivector]...); }); } else { - Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, 0, FlattenLaunchBound(Nteam, Nteam+Nthread)), - [&](const int idThread) { - int inds_thread[Nthread]; - recoverID(inds_thread, stridesThread, idThread, Nthread); - function(inds_team[Iteam]..., inds_thread[Ithread]...); - }); + function(inds_team[Iteam]..., inds_thread[Ithread]...); } }); } else { 
Kokkos::parallel_for( Kokkos::TeamVectorRange(team_member, 0, FlattenLaunchBound(Nteam+Nthread, Rank)), [&](const int idVector) { - int inds_vector[Nvector]; - recoverID(inds_vector, stridesVector, idVector, Nvector); + Kokkos::Array inds_vector; + recoverID(inds_vector, idVector); function(inds_team[Iteam]..., inds_vector[Ivector]...); }); } @@ -753,7 +750,7 @@ struct par_dispatch_impl, (IsFlatRange::value || (IsSimdFor::value && !is_ParFor)); static constexpr bool is_SimdFor = (IsSimdFor::value && is_ParFor); static constexpr bool is_MDRange = IsMDRange::value; - static constexpr bool is_Collapse = TeamPattern::type; + static constexpr bool is_Collapse = TeamPattern::value; inline void dispatch(std::string name, DevExecSpace exec_space, Bounds &&...ids, Function function, Args &&...args) { @@ -782,7 +779,8 @@ struct par_dispatch_impl, return loop_pattern_simdfor_tag; } else if constexpr (is_Collapse) { int rangeNx = FlattenLaunchBound(std::forward(ids)...); - return Kokkos::RangePolicy<>(exec_space, 0, rangeNx); + return team_policy(exec_space, rangeNx, Kokkos::AUTO); + /* return Kokkos::RangePolicy<>(exec_space, 0, rangeNx); */ } }; @@ -792,7 +790,7 @@ struct par_dispatch_impl, } else if constexpr (is_MDRange || is_SimdFor) { return function; } else if constexpr (is_Collapse) { - return MakeCollapseFunctor(TeamPattern::LoopPattern(), function, std::forward(ids)...); + return MakeCollapseFunctor(typename TeamPattern::LoopPattern(), function, std::forward(ids)...); } } @@ -819,6 +817,9 @@ struct par_dispatch_impl, template inline typename std::enable_if::value || std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || std::is_same::value, void>::type par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { @@ -837,135 +838,6 @@ par_dispatch(Pattern p, const std::string &name, DevExecSpace exec_space, par_dispatch(p, name, exec_space, r.s, r.e, function, std::forward(args)...); } -// 3D loop 
using TeamPolicy with single inner TeamThreadRange -template -inline void par_dispatch(LoopPatternTPTTR, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, - const int &jl, const int &ju, const int &il, const int &iu, - const Function &function) { - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int NkNj = Nk * Nj; - Kokkos::parallel_for( - name, team_policy(exec_space, NkNj, Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member) { - const int k = team_member.league_rank() / Nj + kl; - const int j = team_member.league_rank() % Nj + jl; - Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, il, iu + 1), - [&](const int i) { function(k, j, i); }); - }); -} - -// 3D loop using TeamPolicy with single inner ThreadVectorRange -template -inline void par_dispatch(LoopPatternTPTVR, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, - const int &jl, const int &ju, const int &il, const int &iu, - const Function &function) { - // TODO(pgrete) if exec space is Cuda,throw error - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int NkNj = Nk * Nj; - Kokkos::parallel_for( - name, team_policy(exec_space, NkNj, Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member) { - const int k = team_member.league_rank() / Nj + kl; - const int j = team_member.league_rank() % Nj + jl; - Kokkos::parallel_for(Kokkos::TeamVectorRange<>(team_member, il, iu + 1), - [&](const int i) { function(k, j, i); }); - }); -} - -// 3D loop using TeamPolicy with nested TeamThreadRange and ThreadVectorRange -template -inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name, - DevExecSpace exec_space, const int &kl, const int &ku, - const int &jl, const int &ju, const int &il, const int &iu, - const Function &function) { - const int Nk = ku - kl + 1; - Kokkos::parallel_for( - name, team_policy(exec_space, Nk, Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member) { - const int k = 
team_member.league_rank() + kl; - Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, jl, ju + 1), [&](const int j) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, il, iu + 1), - [&](const int i) { function(k, j, i); }); - }); - }); -} - -// 4D loop using TeamPolicy loop with inner TeamThreadRange -template -inline void par_dispatch(LoopPatternTPTTR, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int NkNj = Nk * Nj; - const int NnNkNj = Nn * Nk * Nj; - Kokkos::parallel_for( - name, team_policy(exec_space, NnNkNj, Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member) { - int n = team_member.league_rank() / NkNj; - int k = (team_member.league_rank() - n * NkNj) / Nj; - int j = team_member.league_rank() - n * NkNj - k * Nj + jl; - n += nl; - k += kl; - Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, il, iu + 1), - [&](const int i) { function(n, k, j, i); }); - }); -} - -// 4D loop using TeamPolicy loop with inner ThreadVectorRange -template -inline void par_dispatch(LoopPatternTPTVR, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - // TODO(pgrete) if exec space is Cuda,throw error - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int NkNj = Nk * Nj; - const int NnNkNj = Nn * Nk * Nj; - Kokkos::parallel_for( - name, team_policy(exec_space, NnNkNj, Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member) { - int n = team_member.league_rank() / NkNj; - int k = (team_member.league_rank() - n * NkNj) / Nj; - int j = team_member.league_rank() - n * NkNj - k * Nj + jl; - n += nl; - k += kl; - 
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, il, iu + 1), - [&](const int i) { function(n, k, j, i); }); - }); -} - -// 4D loop using TeamPolicy with nested TeamThreadRange and ThreadVectorRange -template -inline void par_dispatch(LoopPatternTPTTRTVR, const std::string &name, - DevExecSpace exec_space, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const int il, const int iu, const Function &function) { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int NnNk = Nn * Nk; - Kokkos::parallel_for( - name, team_policy(exec_space, NnNk, Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member) { - int n = team_member.league_rank() / Nk + nl; - int k = team_member.league_rank() % Nk + kl; - Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, jl, ju + 1), [&](const int j) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, il, iu + 1), - [&](const int i) { function(n, k, j, i); }); - }); - }); -} - template inline void par_dispatch(const std::string &name, Args &&...args) { par_dispatch(DEFAULT_LOOP_PATTERN, name, DevExecSpace(), From 5bb77648783366ccb7005c1dc9da97814897c7e2 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 1 Jul 2024 08:39:12 -0400 Subject: [PATCH 23/99] separate inner loop collapses --- src/kokkos_abstraction.hpp | 49 +++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index f8aa7a4d51ea..f1d79c1558d3 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -501,6 +501,31 @@ auto MakeFlatFunctor(F &function, Bounds &&...bounds) { std::forward(bounds)...); } + +template +struct InnerFunctor {}; + +template +struct InnerFunctor, std::integer_sequence> { + static constexpr size_t Nteam = sizeof...(Iteam); + Function function; + Kokkos::Array inds_team; + + InnerFunctor(Kokkos::Array _inds_team, Function _function) + : 
inds_team(_inds_team), function(_function){} + + KOKKOS_INLINE_FUNCTION + void operator()(Index... inds) const { + function(inds_team[Iteam]..., std::forward(inds)...); + } +}; + +template +KOKKOS_INLINE_FUNCTION +auto MakeInnerFunctor(F &function) { + using signature = meta::function_signature; +} + template class CollapseFunctor {}; @@ -590,8 +615,17 @@ class CollapseFunctor< std::integer_sequence, void operator()(team_mbr_t team_member) const { Kokkos::Array inds_team; recoverID(inds_team, team_member.league_rank()); + using signature = meta::function_signature; + using ThreadVectorInds = typename meta::PopList::value; - // recover the indices for the collapsed outer loops + collapse_inner(team_member, + InnerFunctor> + (inds_team, function)); + } + + template + KOKKOS_INLINE_FUNCTION + void collapse_inner(team_mbr_t team_member, InnerFunction inner_function) const { if constexpr(Nthread > 0) { Kokkos::parallel_for( Kokkos::TeamThreadRange<>(team_member, 0, FlattenLaunchBound(Nteam, Nteam+Nthread)), @@ -604,10 +638,10 @@ class CollapseFunctor< std::integer_sequence, [&](const int idVector) { Kokkos::Array inds_vector; recoverID(inds_vector, idVector); - function(inds_team[Iteam]..., inds_thread[Ithread]..., inds_vector[Ivector]...); + inner_function(inds_thread[Ithread]..., inds_vector[Ivector]...); }); } else { - function(inds_team[Iteam]..., inds_thread[Ithread]...); + inner_function(inds_thread[Ithread]...); } }); } else { @@ -616,7 +650,7 @@ class CollapseFunctor< std::integer_sequence, [&](const int idVector) { Kokkos::Array inds_vector; recoverID(inds_vector, idVector); - function(inds_team[Iteam]..., inds_vector[Ivector]...); + inner_function(inds_vector[Ivector]...); }); } } @@ -779,8 +813,8 @@ struct par_dispatch_impl, return loop_pattern_simdfor_tag; } else if constexpr (is_Collapse) { int rangeNx = FlattenLaunchBound(std::forward(ids)...); - return team_policy(exec_space, rangeNx, Kokkos::AUTO); - /* return Kokkos::RangePolicy<>(exec_space, 0, 
rangeNx); */ + return team_policy(exec_space, rangeNx, Kokkos::AUTO) + .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)); } }; @@ -812,6 +846,9 @@ struct par_dispatch_impl, } return rangeNx; } + + size_t scratch_size_in_bytes = 0; + int scratch_level = 1; }; template From 110247093b90c37080fdd0a8b2d59bf8078e4a8a Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 1 Jul 2024 16:38:19 -0400 Subject: [PATCH 24/99] Wrapping inner par_for loops --- src/kokkos_abstraction.hpp | 225 ++++++++++++------------------------- 1 file changed, 73 insertions(+), 152 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index f1d79c1558d3..3f58e80c2ff9 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -229,6 +229,19 @@ constexpr InnerLoopPatternTTR inner_loop_pattern_ttr_tag; struct InnerLoopPatternSimdFor {}; constexpr InnerLoopPatternSimdFor inner_loop_pattern_simdfor_tag; +template< typename Pattern, size_t Rank> +struct LoopPatternTeam::value || + std::is_same::value>::type> +: std::true_type { + + static constexpr bool IsTTR = std::is_same::value; + static constexpr bool IsTVR = std::is_same::value; + + static constexpr size_t Nvector = IsTVR ? Rank : 0; + static constexpr size_t Nthread = IsTTR ? 
Rank : 0; + using LoopPattern = LoopPatternCollapse<0, Nthread, Nvector>; +}; + namespace dispatch_impl { static struct ParallelForDispatch { } parallel_for_dispatch_tag; @@ -520,11 +533,6 @@ struct InnerFunctor, std::integer_sequence -KOKKOS_INLINE_FUNCTION -auto MakeInnerFunctor(F &function) { - using signature = meta::function_signature; -} template class CollapseFunctor {}; @@ -563,26 +571,28 @@ class CollapseFunctor< std::integer_sequence, KOKKOS_INLINE_FUNCTION void Initialize() { - for (int ri = 0; ri < Nteam-1; ri++) { - const int N = ranges[ri+1].e - ranges[ri+1].s + 1; - strides[ri] = N; - for (int rj = 0; rj < ri; rj++) { - strides[rj] *= N; - } - } - for (int ri = Nteam; ri < Nteam+Nthread-1; ri++) { - const int N = ranges[ri+1].e - ranges[ri+1].s + 1; - strides[ri] = N; - for (int rj = Nteam; rj < ri; rj++) { - strides[rj ] *= N; - } - } - for (int ri = Nteam+Nthread; ri < Rank-1; ri++) { - const int N = ranges[ri+1].e - ranges[ri+1].s + 1; - strides[ri] = N; - for (int rj = Nteam+Nthread; rj < ri; rj++) { - strides[rj ] *= N; - } + if constexpr (Rank > 1) { + for (int ri = 0; ri < Nteam-1; ri++) { + const int N = ranges[ri+1].e - ranges[ri+1].s + 1; + strides[ri] = N; + for (int rj = 0; rj < ri; rj++) { + strides[rj] *= N; + } + } + for (int ri = Nteam; ri < Nteam+Nthread-1; ri++) { + const int N = ranges[ri+1].e - ranges[ri+1].s + 1; + strides[ri] = N; + for (int rj = Nteam; rj < ri; rj++) { + strides[rj ] *= N; + } + } + for (int ri = Nteam+Nthread; ri < Rank-1; ri++) { + const int N = ranges[ri+1].e - ranges[ri+1].s + 1; + strides[ri] = N; + for (int rj = Nteam+Nthread; rj < ri; rj++) { + strides[rj ] *= N; + } + } } } @@ -672,6 +682,22 @@ auto MakeCollapseFunctor(LoopPatternCollapse, F &functi } +template +struct par_dispatch_inner {}; + +template +struct par_dispatch_inner> { + using signature = meta::function_signature; + static constexpr size_t Rank = meta::PackLength(typename signature::IndexND()); + using LoopPattern = typename 
LoopPatternTeam::LoopPattern; + + KOKKOS_FORCEINLINE_FUNCTION + void dispatch(team_mbr_t team_member, Bounds &&... bounds, Function function) const { + MakeCollapseFunctor(LoopPattern(), function, std::forward(bounds)...) + .collapse_inner(team_member, function); + } +}; + template class MDRange { public: @@ -971,134 +997,29 @@ inline void par_for_outer(const std::string &name, Args &&...args) { std::forward(args)...); } -// Inner parallel loop using TeamThreadRange -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int ll, const int lu, - const int ml, const int mu, const int nl, const int nu, const int kl, - const int ku, const int jl, const int ju, const int il, const int iu, - const Function &function) { - const int Nl = lu - ll + 1; - const int Nm = mu - ml + 1; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NjNi = Nj * Ni; - const int NkNjNi = Nk * NjNi; - const int NnNkNjNi = Nn * NkNjNi; - const int NmNnNkNjNi = Nm * NnNkNjNi; - const int NlNmNnNkNjNi = Nl * NmNnNkNjNi; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team_member, NlNmNnNkNjNi), [&](const int &idx) { - int l = idx / NmNnNkNjNi; - int m = (idx - l * NmNnNkNjNi) / NnNkNjNi; - int n = (idx - l * NmNnNkNjNi - m * NnNkNjNi) / NkNjNi; - int k = (idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi) / NjNi; - int j = (idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi - k * NjNi) / Ni; - int i = idx - l * NmNnNkNjNi - m * NnNkNjNi - n * NkNjNi - k * NjNi - j * Ni; - l += nl; - m += ml; - n += nl; - k += kl; - j += jl; - i += il; - function(l, m, n, k, j, i); - }); -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int ml, const int mu, - const int nl, const int nu, const int kl, const int ku, const int jl, - const int ju, const int il, const int iu, const Function &function) { - const int Nm = mu - 
ml + 1; - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NjNi = Nj * Ni; - const int NkNjNi = Nk * NjNi; - const int NnNkNjNi = Nn * NkNjNi; - const int NmNnNkNjNi = Nm * NnNkNjNi; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, NmNnNkNjNi), - [&](const int &idx) { - int m = idx / NnNkNjNi; - int n = (idx - m * NnNkNjNi) / NkNjNi; - int k = (idx - m * NnNkNjNi - n * NkNjNi) / NjNi; - int j = (idx - m * NnNkNjNi - n * NkNjNi - k * NjNi) / Ni; - int i = idx - m * NnNkNjNi - n * NkNjNi - k * NjNi - j * Ni; - m += ml; - n += nl; - k += kl; - j += jl; - i += il; - function(m, n, k, j, i); - }); -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, const int il, - const int iu, const Function &function) { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NjNi = Nj * Ni; - const int NkNjNi = Nk * NjNi; - const int NnNkNjNi = Nn * NkNjNi; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, NnNkNjNi), - [&](const int &idx) { - int n = idx / NkNjNi; - int k = (idx - n * NkNjNi) / NjNi; - int j = (idx - n * NkNjNi - k * NjNi) / Ni; - int i = idx - n * NkNjNi - k * NjNi - j * Ni; - n += nl; - k += kl; - j += jl; - i += il; - function(n, k, j, i); - }); -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - const Function &function) { - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NkNjNi = Nk * Nj * Ni; - const int NjNi = Nj * Ni; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, NkNjNi), [&](const int &idx) { - int k = idx / NjNi; - int j = (idx - k * NjNi) 
/ Ni; - int i = idx - k * NjNi - j * Ni; - k += kl; - j += jl; - i += il; - function(k, j, i); - }); -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int jl, const int ju, - const int il, const int iu, const Function &function) { - const int Nj = ju - jl + 1; - const int Ni = iu - il + 1; - const int NjNi = Nj * Ni; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, NjNi), [&](const int &idx) { - int j = idx / Ni + jl; - int i = idx % Ni + il; - function(j, i); - }); -} -template -KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(InnerLoopPatternTTR, - team_mbr_t team_member, const int il, - const int iu, const Function &function) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, il, iu + 1), function); + +template +KOKKOS_FORCEINLINE_FUNCTION +typename std::enable_if || + std::is_same_v, void>::type +par_for_inner(Pattern, team_mbr_t team_member, Args &&...args) { + if constexpr (std::is_same_v) { + + } else { + using dispatchsig = meta::DispatchSignature>; + using Function = typename dispatchsig::Function; + using LaunchBounds = typename dispatchsig::LaunchBounds; + par_dispatch_inner() + .dispatch(team_member, std::forward(args)...); + /* if constexpr (std::is_same_v) { */ + /* /1* LaunchBounds f = 1.; *1/ */ + /* par_dispatch_inner(LaunchBounds(), team_member, std::forward(args)...); */ + /* } else { */ + /* par_dispatch_inner(LaunchBounds(), team_member, std::forward(args)...); */ + /* } */ + } } + // Inner parallel loop using TeamVectorRange template KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(InnerLoopPatternTVR, From 049bf52832ecbc197a0dc8988ae490085677eee9 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 1 Jul 2024 20:09:01 -0400 Subject: [PATCH 25/99] simdfor inner loops --- src/kokkos_abstraction.hpp | 86 ++++++-------------------------------- 1 file changed, 12 insertions(+), 74 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 
3f58e80c2ff9..073c4dd93cba 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -998,85 +998,23 @@ inline void par_for_outer(const std::string &name, Args &&...args) { } -template +template KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if || - std::is_same_v, void>::type -par_for_inner(Pattern, team_mbr_t team_member, Args &&...args) { + std::is_same_v || + std::is_same_v, void>::type +par_for_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { + using dispatchsig = meta::DispatchSignature>; + using Function = typename dispatchsig::Function; + using LaunchBounds = typename dispatchsig::LaunchBounds; if constexpr (std::is_same_v) { - + using Args = typename dispatchsig::Args; + par_dispatch_impl() + .dispatch("simd", HostExecSpace(), std::forward(args)...); } else { - using dispatchsig = meta::DispatchSignature>; - using Function = typename dispatchsig::Function; - using LaunchBounds = typename dispatchsig::LaunchBounds; par_dispatch_inner() - .dispatch(team_member, std::forward(args)...); - /* if constexpr (std::is_same_v) { */ - /* /1* LaunchBounds f = 1.; *1/ */ - /* par_dispatch_inner(LaunchBounds(), team_member, std::forward(args)...); */ - /* } else { */ - /* par_dispatch_inner(LaunchBounds(), team_member, std::forward(args)...); */ - /* } */ - } -} - -// Inner parallel loop using TeamVectorRange -template -KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(InnerLoopPatternTVR, - team_mbr_t team_member, const int il, - const int iu, const Function &function) { - Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, il, iu + 1), function); -} - -// Inner parallel loop using FOR SIMD -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternSimdFor, team_mbr_t team_member, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, const int il, - const int iu, const Function &function) { - for (int n = nl; n <= nu; ++n) { - for (int k = kl; k <= ku; ++k) { - for (int j = jl; j <= ju; 
++j) { -#pragma omp simd - for (int i = il; i <= iu; i++) { - function(k, j, i); - } - } - } - } -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternSimdFor, team_mbr_t team_member, const int kl, const int ku, - const int jl, const int ju, const int il, const int iu, - const Function &function) { - for (int k = kl; k <= ku; ++k) { - for (int j = jl; j <= ju; ++j) { -#pragma omp simd - for (int i = il; i <= iu; i++) { - function(k, j, i); - } - } - } -} -template -KOKKOS_FORCEINLINE_FUNCTION void -par_for_inner(InnerLoopPatternSimdFor, team_mbr_t team_member, const int jl, const int ju, - const int il, const int iu, const Function &function) { - for (int j = jl; j <= ju; ++j) { -#pragma omp simd - for (int i = il; i <= iu; i++) { - function(j, i); - } - } -} -template -KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(InnerLoopPatternSimdFor, - team_mbr_t team_member, const int il, - const int iu, const Function &function) { -#pragma omp simd - for (int i = il; i <= iu; i++) { - function(i); + .dispatch(team_member, std::forward(args)...); } } From 9a39c029f762cdadfcd54d47386dae6461f547c2 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 27 Jul 2024 16:59:15 +0200 Subject: [PATCH 26/99] formatting --- src/kokkos_abstraction.hpp | 484 +++++++++++++++++++------------------ 1 file changed, 248 insertions(+), 236 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 073c4dd93cba..1ad45d432c41 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -178,34 +178,38 @@ static struct LoopPatternUndefined { } loop_pattern_undefined_tag; // Translates to a Kokkos::TeamPolicy that collapse Nteams outer loops // with Nthread & Nvector inner loop collapses -template +template struct LoopPatternCollapse {}; -template +template struct LoopPatternTeam : std::false_type {}; -template -struct LoopPatternTeam, team + thread + vector, void> - : std::true_type { - using Nvector = std::integral_constant; - 
using Nthread = std::integral_constant; - using Nteam = std::integral_constant; - using LoopPattern = LoopPatternCollapse; +template +struct LoopPatternTeam, team + thread + vector, + void> : std::true_type { + using Nvector = std::integral_constant; + using Nthread = std::integral_constant; + using Nteam = std::integral_constant; + using LoopPattern = LoopPatternCollapse; }; -template< typename Pattern, size_t Rank> -struct LoopPatternTeam::value || - std::is_same::value || - std::is_same::value>::type> -: std::true_type { - - static constexpr bool IsTPTTR = std::is_same::value; // inner TeamThreadRange - static constexpr bool IsTPTVR = std::is_same::value; // inner ThreadVectorRange +template +struct LoopPatternTeam< + Pattern, Rank, + typename std::enable_if::value || + std::is_same::value || + std::is_same::value>::type> + : std::true_type { + + static constexpr bool IsTPTTR = + std::is_same::value; // inner TeamThreadRange + static constexpr bool IsTPTVR = + std::is_same::value; // inner ThreadVectorRange static constexpr bool IsTPTTRTVR = std::is_same::value; using Nvector = std::integral_constant; using Nthread = std::integral_constant; - using Nteam = std::integral_constant; + using Nteam = std::integral_constant; using LoopPattern = LoopPatternCollapse; }; @@ -229,13 +233,15 @@ constexpr InnerLoopPatternTTR inner_loop_pattern_ttr_tag; struct InnerLoopPatternSimdFor {}; constexpr InnerLoopPatternSimdFor inner_loop_pattern_simdfor_tag; -template< typename Pattern, size_t Rank> -struct LoopPatternTeam::value || - std::is_same::value>::type> -: std::true_type { +template +struct LoopPatternTeam< + Pattern, Rank, + typename std::enable_if::value || + std::is_same::value>::type> + : std::true_type { - static constexpr bool IsTTR = std::is_same::value; - static constexpr bool IsTVR = std::is_same::value; + static constexpr bool IsTTR = std::is_same::value; + static constexpr bool IsTVR = std::is_same::value; static constexpr size_t Nvector = IsTVR ? 
Rank : 0; static constexpr size_t Nthread = IsTTR ? Rank : 0; @@ -270,10 +276,10 @@ template using base_type = typename std::remove_cv_t>; template -struct PackList {}; +struct TypeList {}; template -constexpr int PackLength(PackList) { +constexpr int SizeOfList(TypeList) { return sizeof...(Ts); } @@ -281,17 +287,17 @@ template struct PopList {}; template -struct PopList<1, PackList> { +struct PopList<1, TypeList> { using type = T; - using value = PackList; + using value = TypeList; }; template -struct PopList> { - static_assert(N > 1, "PopList requires N>=1"); +struct PopList> { + static_assert(N >= 1, "PopList requires N>=1"); private: - using pop = PopList>; + using pop = PopList>; public: using type = typename pop::type; @@ -302,60 +308,62 @@ template struct AppendList {}; template -struct AppendList> { - using value = PackList; +struct AppendList> { + using value = TypeList; }; template struct PrependList {}; template -struct PrependList> { - using value = PackList; +struct PrependList> { + using value = TypeList; }; template struct PopListBack {}; -template -struct PopListBack<0, PackList> { - using value = PackList; +template +struct PopListBack<0, TypeList> { + using value = TypeList; }; template -struct PopListBack> { - static constexpr bool NotFinished = N > 0; - using value = typename std::conditional< NotFinished , - typename PrependList>::value>::value, PackList>; +struct PopListBack> { + static constexpr bool NotFinished = N > 0; + using value = typename std::conditional< + NotFinished, + typename PrependList>::value>::value, + TypeList>; }; template struct MergeLists {}; template -struct MergeLists, PackList<>> { - using value = PackList; +struct MergeLists, TypeList<>> { + using value = TypeList; }; template -struct MergeLists, PackList> { - using value = typename MergeLists, PackList>::value; +struct MergeLists, TypeList> { + using value = typename MergeLists, TypeList>::value; }; template struct PackSameType {}; template -struct PackSameType, 
PackList<>> { - using value = PackList; +struct PackSameType, TypeList<>> { + using value = TypeList; }; template -struct PackSameType, PackList> { +struct PackSameType, TypeList> { using value = typename std::conditional< std::is_convertible::value, - typename PackSameType, PackList>::value, - PackList>::type; + typename PackSameType, TypeList>::value, + TypeList>::type; }; template @@ -371,8 +379,8 @@ struct SequenceOfOnes> { using value = typename SequenceOfOnes>::value; }; -template -using sequence_of_ones = SequenceOfOnes>; +template +using sequence_of_ones = SequenceOfOnes>; } // namespace meta @@ -382,16 +390,16 @@ template struct PackIntegralType {}; template -struct PackIntegralType, PackList<>> { - using value = PackList; +struct PackIntegralType, TypeList<>> { + using value = TypeList; }; template -struct PackIntegralType, PackList> { +struct PackIntegralType, TypeList> { using value = std::conditional< std::is_integral::value, - typename PackIntegralType, PackList>::value, - PackList>; + typename PackIntegralType, TypeList>::value, + TypeList>; }; template @@ -399,8 +407,8 @@ struct FunctionSignature {}; template struct FunctionSignature { - using IndexND = typename PackSameType, PackList>::value; - using FArgs = PopList>; + using IndexND = typename PackSameType, TypeList>::value; + using FArgs = PopList>; }; template @@ -410,12 +418,12 @@ template struct GetLaunchBounds {}; template <> -struct GetLaunchBounds> { - using value = PackList<>; +struct GetLaunchBounds> { + using value = TypeList<>; }; template -struct GetLaunchBounds> { +struct GetLaunchBounds> { template static constexpr bool is_BoundType() { @@ -424,20 +432,20 @@ struct GetLaunchBounds> { using bound_variants = std::variant; using bound = base_type; - using LaunchBounds = GetLaunchBounds>; + using LaunchBounds = GetLaunchBounds>; using value = typename std::conditional< is_BoundType(), - typename PrependList>::value>::value, - PackList<>>::type; + typename PrependList>::value>::value, + 
TypeList<>>::type; }; template struct DispatchSignature {}; template -struct DispatchSignature> { - using LaunchBounds = typename GetLaunchBounds>::value; - using pop = PopList>; +struct DispatchSignature> { + using LaunchBounds = typename GetLaunchBounds>::value; + using pop = PopList>; using Function = typename pop::type; using Args = typename pop::value; }; @@ -449,7 +457,7 @@ class FlatFunctor {}; template class FlatFunctor, - meta::PackList> { + meta::TypeList> { static constexpr size_t Rank = sizeof...(Is); Kokkos::Array ranges; @@ -458,15 +466,15 @@ class FlatFunctor, public: template - KOKKOS_INLINE_FUNCTION - FlatFunctor(const Function _function, IndexRange idr, Args... args) + KOKKOS_INLINE_FUNCTION FlatFunctor(const Function _function, IndexRange idr, + Args... args) : function(_function), ranges({{idr, args...}}) { Initialize(); } template - KOKKOS_INLINE_FUNCTION - FlatFunctor(const Function _function, Args... args) : function(_function) { + KOKKOS_INLINE_FUNCTION FlatFunctor(const Function _function, Args... args) + : function(_function) { std::array indices{{static_cast(args)...}}; for (int i = 0; i < Rank; i++) { ranges[i] = {indices[2 * i], indices[2 * i + 1]}; @@ -505,63 +513,61 @@ class FlatFunctor, }; template -KOKKOS_INLINE_FUNCTION -auto MakeFlatFunctor(F &function, Bounds &&...bounds) { +KOKKOS_INLINE_FUNCTION auto MakeFlatFunctor(F &function, Bounds &&...bounds) { using signature = meta::FunctionSignature; using IndexND = typename signature::IndexND; - return FlatFunctor, + return FlatFunctor, typename signature::FArgs::value>(function, std::forward(bounds)...); } - template struct InnerFunctor {}; template -struct InnerFunctor, std::integer_sequence> { - static constexpr size_t Nteam = sizeof...(Iteam); - Function function; - Kokkos::Array inds_team; - - InnerFunctor(Kokkos::Array _inds_team, Function _function) - : inds_team(_inds_team), function(_function){} - - KOKKOS_INLINE_FUNCTION - void operator()(Index... 
inds) const { - function(inds_team[Iteam]..., std::forward(inds)...); - } -}; +struct InnerFunctor, + std::integer_sequence> { + static constexpr size_t Nteam = sizeof...(Iteam); + Function function; + Kokkos::Array inds_team; + + InnerFunctor(Kokkos::Array _inds_team, Function _function) + : inds_team(_inds_team), function(_function) {} + KOKKOS_INLINE_FUNCTION + void operator()(Index... inds) const { + function(inds_team[Iteam]..., std::forward(inds)...); + } +}; template class CollapseFunctor {}; template -class CollapseFunctor< std::integer_sequence, - std::integer_sequence, - std::integer_sequence, Function> { +class CollapseFunctor, + std::integer_sequence, + std::integer_sequence, Function> { - static constexpr size_t Nteam = sizeof...(Iteam); - static constexpr size_t Nthread = sizeof...(Ithread); - static constexpr size_t Nvector = sizeof...(Ivector); - static constexpr size_t Rank = Nteam + Nthread + Nvector; + static constexpr size_t Nteam = sizeof...(Iteam); + static constexpr size_t Nthread = sizeof...(Ithread); + static constexpr size_t Nvector = sizeof...(Ivector); + static constexpr size_t Rank = Nteam + Nthread + Nvector; Kokkos::Array ranges; Kokkos::Array strides; - Function function; - public: + Function function; + public: template - KOKKOS_INLINE_FUNCTION - CollapseFunctor(const Function _function, IndexRange idr, Args... args) + KOKKOS_INLINE_FUNCTION CollapseFunctor(const Function _function, IndexRange idr, + Args... args) : function(_function), ranges({{idr, args...}}) { Initialize(); } template - KOKKOS_INLINE_FUNCTION - CollapseFunctor(const Function _function, Args... args) : function(_function) { + KOKKOS_INLINE_FUNCTION CollapseFunctor(const Function _function, Args... 
args) + : function(_function) { std::array indices{{static_cast(args)...}}; for (int i = 0; i < Rank; i++) { ranges[i] = {indices[2 * i], indices[2 * i + 1]}; @@ -572,128 +578,132 @@ class CollapseFunctor< std::integer_sequence, KOKKOS_INLINE_FUNCTION void Initialize() { if constexpr (Rank > 1) { - for (int ri = 0; ri < Nteam-1; ri++) { - const int N = ranges[ri+1].e - ranges[ri+1].s + 1; - strides[ri] = N; - for (int rj = 0; rj < ri; rj++) { - strides[rj] *= N; - } - } - for (int ri = Nteam; ri < Nteam+Nthread-1; ri++) { - const int N = ranges[ri+1].e - ranges[ri+1].s + 1; - strides[ri] = N; - for (int rj = Nteam; rj < ri; rj++) { - strides[rj ] *= N; - } - } - for (int ri = Nteam+Nthread; ri < Rank-1; ri++) { - const int N = ranges[ri+1].e - ranges[ri+1].s + 1; - strides[ri] = N; - for (int rj = Nteam+Nthread; rj < ri; rj++) { - strides[rj ] *= N; - } - } + for (int ri = 0; ri < Nteam - 1; ri++) { + const int N = ranges[ri + 1].e - ranges[ri + 1].s + 1; + strides[ri] = N; + for (int rj = 0; rj < ri; rj++) { + strides[rj] *= N; + } + } + for (int ri = Nteam; ri < Nteam + Nthread - 1; ri++) { + const int N = ranges[ri + 1].e - ranges[ri + 1].s + 1; + strides[ri] = N; + for (int rj = Nteam; rj < ri; rj++) { + strides[rj] *= N; + } + } + for (int ri = Nteam + Nthread; ri < Rank - 1; ri++) { + const int N = ranges[ri + 1].e - ranges[ri + 1].s + 1; + strides[ri] = N; + for (int rj = Nteam + Nthread; rj < ri; rj++) { + strides[rj] *= N; + } + } } } - template - KOKKOS_INLINE_FUNCTION - void recoverID(Kokkos::Array &inds, int idx) const { - inds[0] = idx; - for (int i = 1; i < N; i++) { + template + KOKKOS_INLINE_FUNCTION void recoverID(Kokkos::Array &inds, int idx) const { + inds[0] = idx; + for (int i = 1; i < N; i++) { inds[i] = idx; - inds[i-1] /= strides[i-1 + start]; + inds[i - 1] /= strides[i - 1 + start]; for (int j = 0; j < i; j++) { - inds[i] -= inds[j]*strides[j + start]; + inds[i] -= inds[j] * strides[j + start]; } - } - for (int i=0; i< N; i++) { - inds[i] 
+= ranges[i+start].s; - } + } + for (int i = 0; i < N; i++) { + inds[i] += ranges[i + start].s; + } } KOKKOS_INLINE_FUNCTION int FlattenLaunchBound(int start, int end) const { - int rangeNx = 1; - for (int i = start; i < end; i++) { - rangeNx *= ranges[i].e - ranges[i].s + 1; - } - return rangeNx; + int rangeNx = 1; + for (int i = start; i < end; i++) { + rangeNx *= ranges[i].e - ranges[i].s + 1; + } + return rangeNx; + } + + KOKKOS_INLINE_FUNCTION + void operator()(team_mbr_t team_member) const { + Kokkos::Array inds_team; + recoverID(inds_team, team_member.league_rank()); + using signature = meta::function_signature; + using ThreadVectorInds = + typename meta::PopList::value; + + collapse_inner( + team_member, + InnerFunctor>( + inds_team, function)); } - KOKKOS_INLINE_FUNCTION - void operator()(team_mbr_t team_member) const { - Kokkos::Array inds_team; - recoverID(inds_team, team_member.league_rank()); - using signature = meta::function_signature; - using ThreadVectorInds = typename meta::PopList::value; - - collapse_inner(team_member, - InnerFunctor> - (inds_team, function)); - } - - template - KOKKOS_INLINE_FUNCTION - void collapse_inner(team_mbr_t team_member, InnerFunction inner_function) const { - if constexpr(Nthread > 0) { + template + KOKKOS_INLINE_FUNCTION void collapse_inner(team_mbr_t team_member, + InnerFunction inner_function) const { + if constexpr (Nthread > 0) { Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, 0, FlattenLaunchBound(Nteam, Nteam+Nthread)), - [&](const int idThread) { + Kokkos::TeamThreadRange<>(team_member, 0, + FlattenLaunchBound(Nteam, Nteam + Nthread)), + [&](const int idThread) { Kokkos::Array inds_thread; - recoverID(inds_thread, idThread); - if constexpr (Nvector > 0 ) { - Kokkos::parallel_for( - Kokkos::TeamVectorRange(team_member, 0, FlattenLaunchBound(Nteam+Nthread, Rank)), - [&](const int idVector) { - Kokkos::Array inds_vector; - recoverID(inds_vector, idVector); - inner_function(inds_thread[Ithread]..., 
inds_vector[Ivector]...); - }); + recoverID(inds_thread, idThread); + if constexpr (Nvector > 0) { + Kokkos::parallel_for( + Kokkos::TeamVectorRange(team_member, 0, + FlattenLaunchBound(Nteam + Nthread, Rank)), + [&](const int idVector) { + Kokkos::Array inds_vector; + recoverID(inds_vector, idVector); + inner_function(inds_thread[Ithread]..., inds_vector[Ivector]...); + }); } else { - inner_function(inds_thread[Ithread]...); + inner_function(inds_thread[Ithread]...); } - }); - } else { - Kokkos::parallel_for( - Kokkos::TeamVectorRange(team_member, 0, FlattenLaunchBound(Nteam+Nthread, Rank)), - [&](const int idVector) { - Kokkos::Array inds_vector; - recoverID(inds_vector, idVector); - inner_function(inds_vector[Ivector]...); - }); - } - } + }); + } else { + Kokkos::parallel_for(Kokkos::TeamVectorRange( + team_member, 0, FlattenLaunchBound(Nteam + Nthread, Rank)), + [&](const int idVector) { + Kokkos::Array inds_vector; + recoverID(inds_vector, idVector); + inner_function(inds_vector[Ivector]...); + }); + } + } }; - template -KOKKOS_INLINE_FUNCTION -auto MakeCollapseFunctor(LoopPatternCollapse, F &function, Bounds &&...bounds) { +KOKKOS_INLINE_FUNCTION auto +MakeCollapseFunctor(LoopPatternCollapse, F &function, + Bounds &&...bounds) { using signature = meta::FunctionSignature; using IndexND = typename signature::IndexND; - constexpr size_t Rank = meta::PackLength(IndexND()); - static_assert(Rank == Nteam + Nthread + Nvector, - "Rank of functor/lambda in par_for must much total number of loops to collapse"); + constexpr size_t Rank = meta::SizeOfList(IndexND()); + static_assert( + Rank == Nteam + Nthread + Nvector, + "Rank of functor/lambda in par_for must much total number of loops to collapse"); return CollapseFunctor, - std::make_index_sequence, - std::make_index_sequence, F>(function, std::forward(bounds)...); - + std::make_index_sequence, + std::make_index_sequence, F>( + function, std::forward(bounds)...); } -template +template struct par_dispatch_inner {}; 
-template -struct par_dispatch_inner> { +template +struct par_dispatch_inner> { using signature = meta::function_signature; - static constexpr size_t Rank = meta::PackLength(typename signature::IndexND()); - using LoopPattern = typename LoopPatternTeam::LoopPattern; + static constexpr size_t Rank = meta::SizeOfList(typename signature::IndexND()); + using LoopPattern = typename LoopPatternTeam::LoopPattern; KOKKOS_FORCEINLINE_FUNCTION - void dispatch(team_mbr_t team_member, Bounds &&... bounds, Function function) const { - MakeCollapseFunctor(LoopPattern(), function, std::forward(bounds)...) + void dispatch(team_mbr_t team_member, Bounds &&...bounds, Function function) const { + MakeCollapseFunctor(LoopPattern(), function, std::forward(bounds)...) .collapse_inner(team_member, function); } }; @@ -784,18 +794,17 @@ struct SimdFor { template struct par_dispatch_impl {}; -template -struct par_dispatch_impl, - meta::PackList> { +template +struct par_dispatch_impl, + meta::TypeList> { using signature = meta::function_signature>; - static constexpr size_t Rank = meta::PackLength(typename signature::IndexND()); + static constexpr size_t Rank = meta::SizeOfList(typename signature::IndexND()); - using BoundType = typename meta::PopList<1, meta::PackList>::type; + using BoundType = typename meta::PopList<1, meta::TypeList>::type; static constexpr bool is_IndexRangeBounds = - std::is_same>::value; + std::is_same>::value; static constexpr bool is_ParFor = std::is_same::value; @@ -803,7 +812,8 @@ struct par_dispatch_impl, using IsFlatRange = std::is_same; using IsMDRange = std::is_same; using IsSimdFor = std::is_same; - using TeamPattern = LoopPatternTeam; //false_type unless we use an outer team policy + using TeamPattern = + LoopPatternTeam; // false_type unless we use an outer team policy // fallback simd par_reduce to flat range static constexpr bool is_FlatRange = @@ -838,8 +848,9 @@ struct par_dispatch_impl, } else if constexpr (is_SimdFor) { return 
loop_pattern_simdfor_tag; } else if constexpr (is_Collapse) { - int rangeNx = FlattenLaunchBound(std::forward(ids)...); - return team_policy(exec_space, rangeNx, Kokkos::AUTO) + int rangeNx = + FlattenLaunchBound(std::forward(ids)...); + return team_policy(exec_space, rangeNx, Kokkos::AUTO) .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)); } }; @@ -850,27 +861,28 @@ struct par_dispatch_impl, } else if constexpr (is_MDRange || is_SimdFor) { return function; } else if constexpr (is_Collapse) { - return MakeCollapseFunctor(typename TeamPattern::LoopPattern(), function, std::forward(ids)...); + return MakeCollapseFunctor(typename TeamPattern::LoopPattern(), function, + std::forward(ids)...); } } - private: - template + private: + template inline int FlattenLaunchBound(Bounds &&...ids) const { - static_assert(NCollapse <= Rank, "Can't flatten more loops than rank"); - int rangeNx = 1; - if constexpr (is_IndexRangeBounds) { - std::array ranges{{ids...}}; - for (int i = 0; i < NCollapse; i++) { - rangeNx *= ranges[i].e - ranges[i].s + 1; - } - } else { - int indices[sizeof...(Bounds)] = {static_cast(ids)...}; - for (int i = 0; i < 2*NCollapse; i += 2) { - rangeNx *= indices[i + 1] - indices[i] + 1; - } + static_assert(NCollapse <= Rank, "Can't flatten more loops than rank"); + int rangeNx = 1; + if constexpr (is_IndexRangeBounds) { + std::array ranges{{ids...}}; + for (int i = 0; i < NCollapse; i++) { + rangeNx *= ranges[i].e - ranges[i].s + 1; + } + } else { + int indices[sizeof...(Bounds)] = {static_cast(ids)...}; + for (int i = 0; i < 2 * NCollapse; i += 2) { + rangeNx *= indices[i + 1] - indices[i] + 1; } - return rangeNx; + } + return rangeNx; } size_t scratch_size_in_bytes = 0; @@ -886,7 +898,7 @@ inline typename std::enable_if::valu std::is_same::value, void>::type par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { - using dispatchsig = meta::DispatchSignature>; + using dispatchsig = 
meta::DispatchSignature>; using Function = typename dispatchsig::Function; // functor type using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types using Args = typename dispatchsig::Args; // @@ -997,24 +1009,24 @@ inline void par_for_outer(const std::string &name, Args &&...args) { std::forward(args)...); } - -template +template KOKKOS_FORCEINLINE_FUNCTION -typename std::enable_if || - std::is_same_v || - std::is_same_v, void>::type -par_for_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { - using dispatchsig = meta::DispatchSignature>; + typename std::enable_if || + std::is_same_v || + std::is_same_v, + void>::type + par_for_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { + using dispatchsig = meta::DispatchSignature>; using Function = typename dispatchsig::Function; using LaunchBounds = typename dispatchsig::LaunchBounds; if constexpr (std::is_same_v) { - using Args = typename dispatchsig::Args; - par_dispatch_impl() + using Args = typename dispatchsig::Args; + par_dispatch_impl() .dispatch("simd", HostExecSpace(), std::forward(args)...); } else { - par_dispatch_inner() - .dispatch(team_member, std::forward(args)...); + par_dispatch_inner().dispatch( + team_member, std::forward(args)...); } } From 08e788fb72c84a47eab51ba204084a668dc52318 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 27 Jul 2024 20:31:27 +0200 Subject: [PATCH 27/99] cleaning up --- src/kokkos_abstraction.hpp | 72 +++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 1ad45d432c41..bdae3079e817 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -21,6 +21,7 @@ #define KOKKOS_ABSTRACTION_HPP_ #include +#include #include #include #include @@ -272,6 +273,7 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { namespace meta { +// c++-20 has std:remove_cvref_t that does this same thing template using 
base_type = typename std::remove_cv_t>; @@ -387,18 +389,18 @@ using sequence_of_ones = SequenceOfOnes> namespace meta { template -struct PackIntegralType {}; +struct PackIntegerType {}; template -struct PackIntegralType, TypeList<>> { +struct PackIntegerType, TypeList<>> { using value = TypeList; }; template -struct PackIntegralType, TypeList> { +struct PackIntegerType, TypeList> { using value = std::conditional< - std::is_integral::value, - typename PackIntegralType, TypeList>::value, + std::numeric_limits::is_integer, + typename PackIntegerType, TypeList>::value, TypeList>; }; @@ -408,11 +410,12 @@ struct FunctionSignature {}; template struct FunctionSignature { using IndexND = typename PackSameType, TypeList>::value; + using Rank = std::integral_constant; using FArgs = PopList>; }; template -using function_signature = FunctionSignature; +using function_signature = FunctionSignature::operator())>; template struct GetLaunchBounds {}; @@ -420,23 +423,40 @@ struct GetLaunchBounds {}; template <> struct GetLaunchBounds> { using value = TypeList<>; + using NumInds = std::integral_constant; }; template struct GetLaunchBounds> { - + private: template static constexpr bool is_BoundType() { return std::numeric_limits::is_integer || std::is_same_v; } + template + static constexpr size_t NumBnds() { + if constexpr (!is_BoundType()) { + return 0; + } + return std::is_same_v ? 
2 : 1; + } + + template + using Rank_t = std::integral_constant; + using bound_variants = std::variant; using bound = base_type; using LaunchBounds = GetLaunchBounds>; + + public: using value = typename std::conditional< - is_BoundType(), - typename PrependList>::value>::value, + is_BoundType(), typename PrependList::value, TypeList<>>::type; + using NumInds = + std::conditional_t(), + Rank_t() + LaunchBounds::NumInds::value>, + Rank_t()>>; }; template @@ -444,8 +464,13 @@ struct DispatchSignature {}; template struct DispatchSignature> { - using LaunchBounds = typename GetLaunchBounds>::value; - using pop = PopList>; + private: + using LB = GetLaunchBounds>; + using pop = PopList>; + + public: + using LaunchBounds = typename LB::value; + using Rank = std::integral_constant; using Function = typename pop::type; using Args = typename pop::value; }; @@ -514,7 +539,7 @@ class FlatFunctor, template KOKKOS_INLINE_FUNCTION auto MakeFlatFunctor(F &function, Bounds &&...bounds) { - using signature = meta::FunctionSignature; + using signature = meta::function_signature; using IndexND = typename signature::IndexND; return FlatFunctor, typename signature::FArgs::value>(function, @@ -679,7 +704,7 @@ template , F &function, Bounds &&...bounds) { - using signature = meta::FunctionSignature; + using signature = meta::function_signature; using IndexND = typename signature::IndexND; constexpr size_t Rank = meta::SizeOfList(IndexND()); static_assert( @@ -799,7 +824,7 @@ template , meta::TypeList> { - using signature = meta::function_signature>; + using signature = meta::function_signature; static constexpr size_t Rank = meta::SizeOfList(typename signature::IndexND()); using BoundType = typename meta::PopList<1, meta::TypeList>::type; @@ -902,6 +927,9 @@ par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...ar using Function = typename dispatchsig::Function; // functor type using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types using 
Args = typename dispatchsig::Args; // + + using sig = meta::function_signature; + static_assert(sig::Rank::value == dispatchsig::Rank::value); par_dispatch_impl().dispatch( name, exec_space, std::forward(args)...); } @@ -1011,11 +1039,12 @@ inline void par_for_outer(const std::string &name, Args &&...args) { template KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if || - std::is_same_v || - std::is_same_v, - void>::type + std::enable_if_t || + std::is_same_v || + std::is_same_v, + void> par_for_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { + using dispatchsig = meta::DispatchSignature>; using Function = typename dispatchsig::Function; using LaunchBounds = typename dispatchsig::LaunchBounds; @@ -1030,13 +1059,6 @@ KOKKOS_FORCEINLINE_FUNCTION } } -template -KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(const Tag &t, team_mbr_t member, - const IndexRange r, - const Function &function) { - par_for_inner(t, member, r.s, r.e, function); -} - template KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(team_mbr_t team_member, Args &&...args) { par_for_inner(DEFAULT_INNER_LOOP_PATTERN, team_member, std::forward(args)...); From 8d1a5ca3a2ff83dde9cd1b194b774506ecd2a618 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 27 Jul 2024 21:29:45 +0200 Subject: [PATCH 28/99] infer loop rank from launch bounds rather than functor signature --- src/kokkos_abstraction.hpp | 94 +++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 37 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index bdae3079e817..f3bbd9b16bfe 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -352,6 +352,27 @@ struct MergeLists, TypeList> { using value = typename MergeLists, TypeList>::value; }; +template +struct SplitList {}; + +template +struct SplitList<1, TypeList> { + using Left = TypeList; + using Right = TypeList; +}; + +template +struct SplitList> { + static_assert(sizeof...(Ts) + 1 >= N, "size of list must be > 
N"); + + private: + using split = SplitList>; + + public: + using Left = typename PrependList::value; + using Right = typename split::Right; +}; + template struct PackSameType {}; @@ -404,18 +425,23 @@ struct PackIntegerType, TypeList> { TypeList>; }; -template +template struct FunctionSignature {}; -template -struct FunctionSignature { - using IndexND = typename PackSameType, TypeList>::value; - using Rank = std::integral_constant; - using FArgs = PopList>; +template +struct FunctionSignature { + /* using IndexND = typename PackSameType, TypeList>::value; */ + /* using FArgs = PopList>; */ + private: + using split = SplitList>; + + public: + using IndexND = typename split::Left; + using FArgs = typename split::Right; }; -template -using function_signature = FunctionSignature::operator())>; +template +using function_signature = FunctionSignature::operator())>; template struct GetLaunchBounds {}; @@ -537,13 +563,12 @@ class FlatFunctor, } }; -template +template KOKKOS_INLINE_FUNCTION auto MakeFlatFunctor(F &function, Bounds &&...bounds) { - using signature = meta::function_signature; + using signature = meta::function_signature; using IndexND = typename signature::IndexND; - return FlatFunctor, - typename signature::FArgs::value>(function, - std::forward(bounds)...); + return FlatFunctor, typename signature::FArgs>( + function, std::forward(bounds)...); } template @@ -655,7 +680,7 @@ class CollapseFunctor, void operator()(team_mbr_t team_member) const { Kokkos::Array inds_team; recoverID(inds_team, team_member.league_rank()); - using signature = meta::function_signature; + using signature = meta::function_signature; using ThreadVectorInds = typename meta::PopList::value; @@ -704,12 +729,9 @@ template , F &function, Bounds &&...bounds) { - using signature = meta::function_signature; + constexpr size_t Rank = Nteam + Nthread + Nvector; + using signature = meta::function_signature; using IndexND = typename signature::IndexND; - constexpr size_t Rank = 
meta::SizeOfList(IndexND()); - static_assert( - Rank == Nteam + Nthread + Nvector, - "Rank of functor/lambda in par_for must much total number of loops to collapse"); return CollapseFunctor, std::make_index_sequence, @@ -717,13 +739,12 @@ MakeCollapseFunctor(LoopPatternCollapse, F &function, function, std::forward(bounds)...); } -template +template struct par_dispatch_inner {}; -template -struct par_dispatch_inner> { - using signature = meta::function_signature; - static constexpr size_t Rank = meta::SizeOfList(typename signature::IndexND()); +template +struct par_dispatch_inner> { + using signature = meta::function_signature; using LoopPattern = typename LoopPatternTeam::LoopPattern; KOKKOS_FORCEINLINE_FUNCTION @@ -816,16 +837,15 @@ struct SimdFor { } }; -template +template struct par_dispatch_impl {}; -template -struct par_dispatch_impl, +template +struct par_dispatch_impl, meta::TypeList> { - using signature = meta::function_signature; - static constexpr size_t Rank = meta::SizeOfList(typename signature::IndexND()); + using signature = meta::function_signature; using BoundType = typename meta::PopList<1, meta::TypeList>::type; static constexpr bool is_IndexRangeBounds = @@ -882,7 +902,7 @@ struct par_dispatch_impl, inline auto functor(Function function, Bounds &&...ids) const { if constexpr (is_FlatRange) { - return MakeFlatFunctor(function, std::forward(ids)...); + return MakeFlatFunctor(function, std::forward(ids)...); } else if constexpr (is_MDRange || is_SimdFor) { return function; } else if constexpr (is_Collapse) { @@ -924,13 +944,12 @@ inline typename std::enable_if::valu void>::type par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { using dispatchsig = meta::DispatchSignature>; + static constexpr size_t Rank = dispatchsig::Rank::value; using Function = typename dispatchsig::Function; // functor type using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types using Args = typename dispatchsig::Args; 
// - using sig = meta::function_signature; - static_assert(sig::Rank::value == dispatchsig::Rank::value); - par_dispatch_impl().dispatch( + par_dispatch_impl().dispatch( name, exec_space, std::forward(args)...); } @@ -1046,15 +1065,16 @@ KOKKOS_FORCEINLINE_FUNCTION par_for_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { using dispatchsig = meta::DispatchSignature>; + constexpr size_t Rank = dispatchsig::Rank::value; using Function = typename dispatchsig::Function; using LaunchBounds = typename dispatchsig::LaunchBounds; if constexpr (std::is_same_v) { using Args = typename dispatchsig::Args; - par_dispatch_impl() + par_dispatch_impl() .dispatch("simd", HostExecSpace(), std::forward(args)...); } else { - par_dispatch_inner().dispatch( + par_dispatch_inner().dispatch( team_member, std::forward(args)...); } } From 3a32e8429e5ed990dc808be8c62c943ee3578903 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 28 Jul 2024 00:24:40 +0200 Subject: [PATCH 29/99] helper type DispatchType to hold useuful parameters --- src/kokkos_abstraction.hpp | 134 +++++++++++++++++++++++-------------- 1 file changed, 83 insertions(+), 51 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index f3bbd9b16bfe..bf5bd75d688d 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -221,6 +221,13 @@ struct LoopPatternTeam< // Currently the only available option. 
static struct OuterLoopPatternTeams { } outer_loop_pattern_teams_tag; +template +struct LoopPatternTeam : std::true_type { + using Nvector = std::integral_constant; + using Nthread = std::integral_constant; + using Nteam = std::integral_constant; + using LoopPattern = LoopPatternCollapse; +}; // Inner loop pattern tags must be constexpr so they're available on device // Translate to a Kokkos::TeamVectorRange as innermost loop (single index) struct InnerLoopPatternTVR {}; @@ -430,8 +437,6 @@ struct FunctionSignature {}; template struct FunctionSignature { - /* using IndexND = typename PackSameType, TypeList>::value; */ - /* using FArgs = PopList>; */ private: using split = SplitList>; @@ -501,6 +506,27 @@ struct DispatchSignature> { using Args = typename pop::value; }; +template +struct DispatchType { + using BoundType = typename PopList<1, TypeList>::type; + static constexpr bool is_IndexRangeBounds = + std::is_same>::value; + static constexpr bool is_ParFor = + std::is_same::value; + + static constexpr bool IsFlatRange = std::is_same::value; + static constexpr bool IsMDRange = std::is_same::value; + static constexpr bool IsSimdFor = std::is_same::value; + using TeamPattern = + LoopPatternTeam; // false_type unless we use an outer team policy + + // fallback simd par_reduce to flat range + static constexpr bool is_FlatRange = (IsFlatRange || (IsSimdFor && !is_ParFor)); + static constexpr bool is_SimdFor = (IsSimdFor && is_ParFor); + static constexpr bool is_MDRange = IsMDRange; + static constexpr bool is_Collapse = TeamPattern::value; +}; + } // namespace meta template @@ -653,7 +679,7 @@ class CollapseFunctor, } template - KOKKOS_INLINE_FUNCTION void recoverID(Kokkos::Array &inds, int idx) const { + KOKKOS_INLINE_FUNCTION void recoverIndex(Kokkos::Array &inds, int idx) const { inds[0] = idx; for (int i = 1; i < N; i++) { inds[i] = idx; @@ -679,7 +705,7 @@ class CollapseFunctor, KOKKOS_INLINE_FUNCTION void operator()(team_mbr_t team_member) const { Kokkos::Array 
inds_team; - recoverID(inds_team, team_member.league_rank()); + recoverIndex(inds_team, team_member.league_rank()); using signature = meta::function_signature; using ThreadVectorInds = typename meta::PopList::value; @@ -699,14 +725,14 @@ class CollapseFunctor, FlattenLaunchBound(Nteam, Nteam + Nthread)), [&](const int idThread) { Kokkos::Array inds_thread; - recoverID(inds_thread, idThread); + recoverIndex(inds_thread, idThread); if constexpr (Nvector > 0) { Kokkos::parallel_for( Kokkos::TeamVectorRange(team_member, 0, FlattenLaunchBound(Nteam + Nthread, Rank)), [&](const int idVector) { Kokkos::Array inds_vector; - recoverID(inds_vector, idVector); + recoverIndex(inds_vector, idVector); inner_function(inds_thread[Ithread]..., inds_vector[Ivector]...); }); } else { @@ -718,7 +744,8 @@ class CollapseFunctor, team_member, 0, FlattenLaunchBound(Nteam + Nthread, Rank)), [&](const int idVector) { Kokkos::Array inds_vector; - recoverID(inds_vector, idVector); + recoverIndex(inds_vector, + idVector); inner_function(inds_vector[Ivector]...); }); } @@ -845,78 +872,67 @@ template , meta::TypeList> { - using signature = meta::function_signature; + using DType = meta::DispatchType; - using BoundType = typename meta::PopList<1, meta::TypeList>::type; - static constexpr bool is_IndexRangeBounds = - std::is_same>::value; - - static constexpr bool is_ParFor = - std::is_same::value; + static inline void dispatch(std::string name, DevExecSpace exec_space, Bounds &&...ids, + Function function, Args &&...args, + const int scratch_level = 0, + const size_t scratch_size_in_bytes = 0) { - using IsFlatRange = std::is_same; - using IsMDRange = std::is_same; - using IsSimdFor = std::is_same; - using TeamPattern = - LoopPatternTeam; // false_type unless we use an outer team policy - - // fallback simd par_reduce to flat range - static constexpr bool is_FlatRange = - (IsFlatRange::value || (IsSimdFor::value && !is_ParFor)); - static constexpr bool is_SimdFor = (IsSimdFor::value && is_ParFor); 
- static constexpr bool is_MDRange = IsMDRange::value; - static constexpr bool is_Collapse = TeamPattern::value; - - inline void dispatch(std::string name, DevExecSpace exec_space, Bounds &&...ids, - Function function, Args &&...args) { - - static_assert(!(is_MDRange && Rank < 2), "Can not launch MDRange with Rank < 2"); + static_assert(!(DType::is_MDRange && Rank < 2), + "Can not launch MDRange with Rank < 2"); Tag tag; PARTHENON_INSTRUMENT_REGION(name) - if constexpr (is_SimdFor) { + if constexpr (DType::is_SimdFor) { SimdFor(std::forward(ids)...).dispatch(function); } else { - kokkos_dispatch(tag, name, policy(exec_space, std::forward(ids)...), + kokkos_dispatch(tag, name, + policy(exec_space, std::forward(ids)..., scratch_level, + scratch_size_in_bytes), functor(function, std::forward(ids)...), std::forward(args)...); } }; - inline auto policy(DevExecSpace exec_space, Bounds &&...ids) const { + static inline auto policy(DevExecSpace exec_space, Bounds &&...ids, + const int scratch_level = 0, + const size_t scratch_size_in_bytes = 0) { - if constexpr (is_FlatRange) { + if constexpr (DType::is_FlatRange) { int rangeNx = FlattenLaunchBound(std::forward(ids)...); return Kokkos::RangePolicy<>(exec_space, 0, rangeNx); - } else if constexpr (is_MDRange) { + } else if constexpr (DType::is_MDRange) { return MakeMDRangePolicy(exec_space, std::forward(ids)...); - } else if constexpr (is_SimdFor) { + + } else if constexpr (DType::is_SimdFor) { return loop_pattern_simdfor_tag; - } else if constexpr (is_Collapse) { - int rangeNx = - FlattenLaunchBound(std::forward(ids)...); + + } else if constexpr (DType::is_Collapse) { + int rangeNx = FlattenLaunchBound( + std::forward(ids)...); return team_policy(exec_space, rangeNx, Kokkos::AUTO) .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)); } }; - inline auto functor(Function function, Bounds &&...ids) const { - if constexpr (is_FlatRange) { + static inline auto functor(Function function, Bounds &&...ids) { + 
if constexpr (DType::is_FlatRange) { return MakeFlatFunctor(function, std::forward(ids)...); - } else if constexpr (is_MDRange || is_SimdFor) { + } else if constexpr (DType::is_MDRange || DType::is_SimdFor) { return function; - } else if constexpr (is_Collapse) { - return MakeCollapseFunctor(typename TeamPattern::LoopPattern(), function, + } else if constexpr (DType::is_Collapse) { + return MakeCollapseFunctor(typename DType::TeamPattern::LoopPattern(), function, std::forward(ids)...); } } private: template - inline int FlattenLaunchBound(Bounds &&...ids) const { + static inline int FlattenLaunchBound(Bounds &&...ids) { static_assert(NCollapse <= Rank, "Can't flatten more loops than rank"); int rangeNx = 1; - if constexpr (is_IndexRangeBounds) { + if constexpr (DType::is_IndexRangeBounds) { std::array ranges{{ids...}}; for (int i = 0; i < NCollapse; i++) { rangeNx *= ranges[i].e - ranges[i].s + 1; @@ -929,9 +945,6 @@ struct par_dispatch_impl } return rangeNx; } - - size_t scratch_size_in_bytes = 0; - int scratch_level = 1; }; template @@ -949,7 +962,7 @@ par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...ar using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types using Args = typename dispatchsig::Args; // - par_dispatch_impl().dispatch( + par_dispatch_impl::dispatch( name, exec_space, std::forward(args)...); } @@ -981,6 +994,25 @@ inline void par_scan(Args &&...args) { par_dispatch(std::forward(args)...); } +/* template */ +/* inline std::enable_if_t::value, void> */ +/* par_for_outer(OuterLoopPatternTeams, const std::string &name, DevExecSpace exec_space, + */ +/* size_t scratch_size_in_bytes, const int scratch_level, AllArgs &&...args) + * { */ +/* using dispatchsig = meta::DispatchSignature>; */ +/* static constexpr size_t Rank = dispatchsig::Rank::value; */ +/* using Function = typename dispatchsig::Function; // functor type */ +/* using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types 
*/ +/* using Args = typename dispatchsig::Args; // */ +/* using LoopPattern = LoopPatternTeam; */ +/* using Tag = dispatch_impl::ParallelForDispatch; */ + +/* par_dispatch_impl::dispatch( */ +/* name, exec_space, std::forward(args)..., scratch_level, */ +/* scratch_size_in_bytes); */ +/* } */ + // 1D outer parallel loop using Kokkos Teams template inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, From c2ac94f323349b4a9adfb7255505a4ecddca49bc Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 28 Jul 2024 01:27:57 +0200 Subject: [PATCH 30/99] done with par_for_outer --- src/kokkos_abstraction.hpp | 133 ++++++++++--------------------------- 1 file changed, 36 insertions(+), 97 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index bf5bd75d688d..b5d5321e784c 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -212,6 +212,7 @@ struct LoopPatternTeam< using Nthread = std::integral_constant; using Nteam = std::integral_constant; using LoopPattern = LoopPatternCollapse; + using OuterPattern = Pattern; }; // Tags for Nested parallelism where the outermost layer supports 1, 2, or 3 @@ -227,6 +228,7 @@ struct LoopPatternTeam : std::true_type { using Nthread = std::integral_constant; using Nteam = std::integral_constant; using LoopPattern = LoopPatternCollapse; + using OuterPattern = OuterLoopPatternTeams; }; // Inner loop pattern tags must be constexpr so they're available on device // Translate to a Kokkos::TeamVectorRange as innermost loop (single index) @@ -438,7 +440,8 @@ struct FunctionSignature {}; template struct FunctionSignature { private: - using split = SplitList>; + static constexpr bool team_mbr = std::is_same_v>; + using split = SplitList>; public: using IndexND = typename split::Left; @@ -616,13 +619,14 @@ struct InnerFunctor, } }; -template +template class CollapseFunctor {}; -template +template class CollapseFunctor, std::integer_sequence, - std::integer_sequence, Function> { + 
std::integer_sequence, Function, ParForOuter> { static constexpr size_t Nteam = sizeof...(Iteam); static constexpr size_t Nthread = sizeof...(Ithread); @@ -710,10 +714,14 @@ class CollapseFunctor, using ThreadVectorInds = typename meta::PopList::value; - collapse_inner( - team_member, - InnerFunctor>( - inds_team, function)); + if constexpr (ParForOuter) { + function(team_member, inds_team[Iteam]...); + } else { + collapse_inner( + team_member, + InnerFunctor>( + inds_team, function)); + } } template @@ -752,7 +760,8 @@ class CollapseFunctor, } }; -template +template KOKKOS_INLINE_FUNCTION auto MakeCollapseFunctor(LoopPatternCollapse, F &function, Bounds &&...bounds) { @@ -762,7 +771,7 @@ MakeCollapseFunctor(LoopPatternCollapse, F &function, return CollapseFunctor, std::make_index_sequence, - std::make_index_sequence, F>( + std::make_index_sequence, F, ParForOuter>( function, std::forward(bounds)...); } @@ -922,8 +931,9 @@ struct par_dispatch_impl } else if constexpr (DType::is_MDRange || DType::is_SimdFor) { return function; } else if constexpr (DType::is_Collapse) { - return MakeCollapseFunctor(typename DType::TeamPattern::LoopPattern(), function, - std::forward(ids)...); + constexpr bool ParForOuter = std::is_same_v; + return MakeCollapseFunctor(typename DType::TeamPattern::LoopPattern(), + function, std::forward(ids)...); } } @@ -994,92 +1004,21 @@ inline void par_scan(Args &&...args) { par_dispatch(std::forward(args)...); } -/* template */ -/* inline std::enable_if_t::value, void> */ -/* par_for_outer(OuterLoopPatternTeams, const std::string &name, DevExecSpace exec_space, - */ -/* size_t scratch_size_in_bytes, const int scratch_level, AllArgs &&...args) - * { */ -/* using dispatchsig = meta::DispatchSignature>; */ -/* static constexpr size_t Rank = dispatchsig::Rank::value; */ -/* using Function = typename dispatchsig::Function; // functor type */ -/* using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types */ -/* using Args = typename 
dispatchsig::Args; // */ -/* using LoopPattern = LoopPatternTeam; */ -/* using Tag = dispatch_impl::ParallelForDispatch; */ - -/* par_dispatch_impl::dispatch( */ -/* name, exec_space, std::forward(args)..., scratch_level, */ -/* scratch_size_in_bytes); */ -/* } */ - -// 1D outer parallel loop using Kokkos Teams -template -inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, - DevExecSpace exec_space, size_t scratch_size_in_bytes, - const int scratch_level, const int kl, const int ku, - const Function &function) { - const int Nk = ku + 1 - kl; - - team_policy policy(exec_space, Nk, Kokkos::AUTO); - - Kokkos::parallel_for( - name, - policy.set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), - KOKKOS_LAMBDA(team_mbr_t team_member) { - const int k = team_member.league_rank() + kl; - function(team_member, k); - }); -} - -// 2D outer parallel loop using Kokkos Teams -template -inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, - DevExecSpace exec_space, size_t scratch_size_in_bytes, - const int scratch_level, const int kl, const int ku, - const int jl, const int ju, const Function &function) { - const int Nk = ku + 1 - kl; - const int Nj = ju + 1 - jl; - const int NkNj = Nk * Nj; - - team_policy policy(exec_space, NkNj, Kokkos::AUTO); - - Kokkos::parallel_for( - name, - policy.set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), - KOKKOS_LAMBDA(team_mbr_t team_member) { - const int k = team_member.league_rank() / Nj + kl; - const int j = team_member.league_rank() % Nj + jl; - function(team_member, k, j); - }); -} +template +inline std::enable_if_t::value, void> +par_for_outer(Pattern, const std::string &name, DevExecSpace exec_space, + size_t scratch_size_in_bytes, const int scratch_level, AllArgs &&...args) { + using dispatchsig = meta::DispatchSignature>; + static constexpr size_t Rank = dispatchsig::Rank::value; + using Function = typename dispatchsig::Function; // functor type + 
using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types + using Args = typename dispatchsig::Args; // + using LoopPattern = LoopPatternTeam; + using Tag = dispatch_impl::ParallelForDispatch; -// 3D outer parallel loop using Kokkos Teams -template -inline void par_for_outer(OuterLoopPatternTeams, const std::string &name, - DevExecSpace exec_space, size_t scratch_size_in_bytes, - const int scratch_level, const int nl, const int nu, - const int kl, const int ku, const int jl, const int ju, - const Function &function) { - const int Nn = nu - nl + 1; - const int Nk = ku - kl + 1; - const int Nj = ju - jl + 1; - const int NkNj = Nk * Nj; - const int NnNkNj = Nn * Nk * Nj; - - team_policy policy(exec_space, NnNkNj, Kokkos::AUTO); - - Kokkos::parallel_for( - name, - policy.set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), - KOKKOS_LAMBDA(team_mbr_t team_member) { - int n = team_member.league_rank() / NkNj; - int k = (team_member.league_rank() - n * NkNj) / Nj; - const int j = team_member.league_rank() - n * NkNj - k * Nj + jl; - n += nl; - k += kl; - function(team_member, n, k, j); - }); + par_dispatch_impl::dispatch( + name, exec_space, std::forward(args)..., scratch_level, + scratch_size_in_bytes); } template From d7477c7619e15744554857cb25c0ff1839a7b5e7 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 28 Jul 2024 01:44:32 +0200 Subject: [PATCH 31/99] cleanup --- src/kokkos_abstraction.hpp | 73 +++++++------------------------------- 1 file changed, 12 insertions(+), 61 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index b5d5321e784c..50c61670f296 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -188,9 +188,9 @@ struct LoopPatternTeam : std::false_type {}; template struct LoopPatternTeam, team + thread + vector, void> : std::true_type { - using Nvector = std::integral_constant; - using Nthread = std::integral_constant; - using Nteam = std::integral_constant; + 
static constexpr size_t Nvector = vector; + static constexpr size_t Nthread = thread; + static constexpr size_t Nteam = team; using LoopPattern = LoopPatternCollapse; }; @@ -208,10 +208,10 @@ struct LoopPatternTeam< std::is_same::value; // inner ThreadVectorRange static constexpr bool IsTPTTRTVR = std::is_same::value; - using Nvector = std::integral_constant; - using Nthread = std::integral_constant; - using Nteam = std::integral_constant; - using LoopPattern = LoopPatternCollapse; + static constexpr size_t Nvector = IsTPTVR || IsTPTTRTVR; + static constexpr size_t Nthread = IsTPTTR || IsTPTTRTVR; + static constexpr size_t Nteam = Rank - Nthread - Nvector; + using LoopPattern = LoopPatternCollapse; using OuterPattern = Pattern; }; @@ -224,9 +224,9 @@ static struct OuterLoopPatternTeams { } outer_loop_pattern_teams_tag; template struct LoopPatternTeam : std::true_type { - using Nvector = std::integral_constant; - using Nthread = std::integral_constant; - using Nteam = std::integral_constant; + static constexpr size_t Nvector = 0; + static constexpr size_t Nthread = 0; + static constexpr size_t Nteam = Rank; using LoopPattern = LoopPatternCollapse; using OuterPattern = OuterLoopPatternTeams; }; @@ -331,23 +331,6 @@ struct PrependList> { using value = TypeList; }; -template -struct PopListBack {}; - -template -struct PopListBack<0, TypeList> { - using value = TypeList; -}; - -template -struct PopListBack> { - static constexpr bool NotFinished = N > 0; - using value = typename std::conditional< - NotFinished, - typename PrependList>::value>::value, - TypeList>; -}; - template struct MergeLists {}; @@ -382,22 +365,6 @@ struct SplitList> { using Right = typename split::Right; }; -template -struct PackSameType {}; - -template -struct PackSameType, TypeList<>> { - using value = TypeList; -}; - -template -struct PackSameType, TypeList> { - using value = typename std::conditional< - std::is_convertible::value, - typename PackSameType, TypeList>::value, - TypeList>::type; -}; 
- template struct SequenceOfOnes {}; @@ -418,22 +385,6 @@ using sequence_of_ones = SequenceOfOnes> namespace meta { -template -struct PackIntegerType {}; - -template -struct PackIntegerType, TypeList<>> { - using value = TypeList; -}; - -template -struct PackIntegerType, TypeList> { - using value = std::conditional< - std::numeric_limits::is_integer, - typename PackIntegerType, TypeList>::value, - TypeList>; -}; - template struct FunctionSignature {}; @@ -918,8 +869,8 @@ struct par_dispatch_impl return loop_pattern_simdfor_tag; } else if constexpr (DType::is_Collapse) { - int rangeNx = FlattenLaunchBound( - std::forward(ids)...); + int rangeNx = + FlattenLaunchBound(std::forward(ids)...); return team_policy(exec_space, rangeNx, Kokkos::AUTO) .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)); } From 69cda3884302af6b5f31eacde6cca060ba4a2e17 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 28 Jul 2024 16:32:58 +0200 Subject: [PATCH 32/99] adding tests --- src/kokkos_abstraction.hpp | 10 ++ tst/unit/kokkos_abstraction.cpp | 174 +++++++++++++++++++++++++++++++- 2 files changed, 183 insertions(+), 1 deletion(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 50c61670f296..63a75297de8d 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -365,6 +365,16 @@ struct SplitList> { using Right = typename split::Right; }; +template +struct ListOfType { + using value = typename PrependList::value>::value; +}; + +template +struct ListOfType<1, T> { + using value = TypeList; +}; + template struct SequenceOfOnes {}; diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index ae0e3fcb79e8..4e61d43f6b47 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -19,12 +19,16 @@ #include #include +#include #include #include +#include "Kokkos_Core.hpp" +#include "Kokkos_Macros.hpp" #include "basic_types.hpp" #include "kokkos_abstraction.hpp" +#include 
"parthenon_array_generic.hpp" using parthenon::DevExecSpace; using parthenon::ParArray1D; @@ -158,6 +162,172 @@ bool test_wrapper_3d(T loop_pattern, DevExecSpace exec_space) { return all_same; } +template +auto ParArrayND(Args &&...args) { + static_assert(ND <= 8, "ParArrayND supoorted up to ND=8"); + if constexpr (ND == 0) { + return parthenon::ParArray0D(std::forward(args)...); + } else if constexpr (ND == 1) { + return parthenon::ParArray1D(std::forward(args)...); + } else if constexpr (ND == 2) { + return parthenon::ParArray2D(std::forward(args)...); + } else if constexpr (ND == 3) { + return parthenon::ParArray3D(std::forward(args)...); + } else if constexpr (ND == 4) { + return parthenon::ParArray4D(std::forward(args)...); + } else if constexpr (ND == 5) { + return parthenon::ParArray5D(std::forward(args)...); + } else if constexpr (ND == 6) { + return parthenon::ParArray6D(std::forward(args)...); + } else if constexpr (ND == 7) { + return parthenon::ParArray7D(std::forward(args)...); + } else if constexpr (ND == 8) { + return parthenon::ParArray8D(std::forward(args)...); + } +} +template +auto HostArrayND(Args &&...args) { + static_assert(ND <= 7, "HostArrayND supoorted up to ND=7"); + if constexpr (ND == 0) { + return parthenon::HostArray0D(std::forward(args)...); + } else if constexpr (ND == 1) { + return parthenon::HostArray1D(std::forward(args)...); + } else if constexpr (ND == 2) { + return parthenon::HostArray2D(std::forward(args)...); + } else if constexpr (ND == 3) { + return parthenon::HostArray3D(std::forward(args)...); + } else if constexpr (ND == 4) { + return parthenon::HostArray4D(std::forward(args)...); + } else if constexpr (ND == 5) { + return parthenon::HostArray5D(std::forward(args)...); + } else if constexpr (ND == 6) { + return parthenon::HostArray6D(std::forward(args)...); + } else if constexpr (ND == 7) { + return parthenon::HostArray7D(std::forward(args)...); + } +} + +enum class lbounds { integer, indexrange }; + +template +struct 
test_wrapper_nd_impl { + template + using Sequence = std::make_index_sequence; + int N, indices[Rank - 1], int_bounds[2 * Rank]; + parthenon::IndexRange bounds[Rank]; + decltype(ParArrayND()) arr_dev; + decltype(HostArrayND()) arr_host_orig, arr_host_mod; + + test_wrapper_nd_impl(const int _N = 32) : N(_N) { + arr_dev = GetArray(typename parthenon::meta::sequence_of_ones::value()); + arr_host_orig = Kokkos::create_mirror(arr_dev); + arr_host_mod = Kokkos::create_mirror(arr_dev); + std::random_device rd; // Will be used to obtain a seed for the random number engine + std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() + std::uniform_real_distribution dis(-1.0, 1.0); + par_for_init(std::make_index_sequence(), gen, dis); + } + + template + auto GetArray(std::index_sequence) { + static_assert(sizeof...(Is) == Rank); + return ParArrayND("device", N * Is...); + } + + template + void par_for_init(std::index_sequence, std::mt19937 &gen, + std::uniform_real_distribution &dis) { + constexpr size_t id = Rank - LoopsLeft; + bounds[id].s = 0; + bounds[id].e = N - 1; + int_bounds[2 * id] = 0; + int_bounds[2 * id + 1] = N - 1; + if constexpr (LoopsLeft == 1) { + for (int i = 0; i < N; i++) { + arr_host_orig(indices[Is]..., i) = 0.; // dis(gen); + } + } else { + for (int j = 0; j < N; j++) { + indices[Rank - LoopsLeft] = j; + par_for_init(Sequence(), gen, dis); + } + } + } + + template + KOKKOS_INLINE_FUNCTION Real increment_data(KJI... 
kji) { + static_assert(Rank == sizeof...(KJI), "number of indices matches Rank"); + int inc = 0; + int inds[sizeof...(KJI)]{kji...}; + for (int i = 0; i < Rank; i++) { + inc += N * inds[i]; + } + return static_cast(inc); + } + + template + bool par_for_comp(std::index_sequence) { + bool all_same = true; + if constexpr (LoopsLeft == 1) { + for (int i = 0; i < N; i++) { + if (arr_host_orig(indices[Is]..., i) + increment_data(indices[Is]..., i) != + arr_host_mod(indices[Is]..., i)) { + all_same = false; + } + } + } else { + for (int j = 0; j < N; j++) { + indices[Rank - LoopsLeft] = j; + all_same = par_for_comp(Sequence()); + } + } + return all_same; + } + + template + bool dispatch(parthenon::meta::TypeList, std::index_sequence, + T loop_pattern, DevExecSpace exec_space) { + Kokkos::deep_copy(arr_dev, arr_host_orig); + if constexpr (bound_type == lbounds::integer) { + parthenon::par_for( + loop_pattern, "unit test ND integer bounds", exec_space, int_bounds[Ids]..., + KOKKOS_LAMBDA(Ts... args) { + arr_dev(std::forward(args)...) += + increment_data(std::forward(args)...); + }); + } else { + parthenon::par_for( + loop_pattern, "unit test ND IndexRange bounds", exec_space, bounds[Ids]..., + KOKKOS_LAMBDA(Ts... args) { + arr_dev(std::forward(args)...) 
+= + increment_data(std::forward(args)...); + }); + } + Kokkos::deep_copy(arr_host_mod, arr_dev); + return par_for_comp(Sequence()); + } + template + void test(T loop_pattern, DevExecSpace exec_space) { + /* REQUIRE(dispatch( */ + /* typename parthenon::meta::ListOfType::value(), */ + /* Sequence<2 * Rank>(), loop_pattern, exec_space) == true); */ + REQUIRE(dispatch( + typename parthenon::meta::ListOfType::value(), + Sequence(), loop_pattern, exec_space) == true); + } + + template + bool par_for_dev(T loop_pattern, DevExecSpace exec_space, lbounds bound_type) { + return dispatch(Sequence(), loop_pattern, exec_space, bound_type); + } +}; + +template +void test_wrapper_nd(DevExecSpace exec_space) { + auto wrappernd = test_wrapper_nd_impl(2); + wrappernd.test(parthenon::loop_pattern_flatrange_tag, exec_space); +} + template bool test_wrapper_4d(T loop_pattern, DevExecSpace exec_space) { // https://en.cppreference.com/w/cpp/numeric/random/uniform_real_distribution @@ -242,6 +412,7 @@ TEST_CASE("par_for loops", "[wrapper]") { } SECTION("4D loops") { + test_wrapper_nd<4>(default_exec_space); REQUIRE(test_wrapper_4d(parthenon::loop_pattern_flatrange_tag, default_exec_space) == true); @@ -358,6 +529,7 @@ bool test_wrapper_nested_4d(OuterLoopPattern outer_loop_pattern, // Compute the scratch memory needs const int scratch_level = 0; size_t scratch_size_in_bytes = parthenon::ScratchPad1D::shmem_size(N); + parthenon::IndexRange rng{0, N - 1}; // Compute the 2nd order centered derivative in x parthenon::par_for_outer( @@ -369,7 +541,7 @@ bool test_wrapper_nested_4d(OuterLoopPattern outer_loop_pattern, // Load a pencil in x to minimize DRAM accesses (and test scratch pad) parthenon::ScratchPad1D scratch_u(team_member.team_scratch(scratch_level), N); - parthenon::par_for_inner(inner_loop_pattern, team_member, 0, N - 1, + parthenon::par_for_inner(inner_loop_pattern, team_member, rng, [&](const int i) { scratch_u(i) = dev_u(n, k, j, i); }); // Sync all threads in the team so that 
scratch memory is consistent team_member.team_barrier(); From 12297d2a4c8db1851bf3e4126ed3e15075735b66 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 28 Jul 2024 21:51:12 +0200 Subject: [PATCH 33/99] completeing tests for par_for & par_reduce. Testing up to 7D loops --- tst/unit/kokkos_abstraction.cpp | 467 ++++++++++---------------------- 1 file changed, 143 insertions(+), 324 deletions(-) diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 4e61d43f6b47..e56cfb0d54fe 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -37,131 +37,6 @@ using parthenon::ParArray3D; using parthenon::ParArray4D; using Real = double; -template -bool test_wrapper_1d(T loop_pattern, DevExecSpace exec_space) { - // https://en.cppreference.com/w/cpp/numeric/random/uniform_real_distribution - std::random_device rd; // Will be used to obtain a seed for the random number engine - std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() - std::uniform_real_distribution dis(-1.0, 1.0); - - const int N = 32; - ParArray1D arr_dev("device", N); - auto arr_host_orig = Kokkos::create_mirror(arr_dev); - auto arr_host_mod = Kokkos::create_mirror(arr_dev); - - // initialize random data on the host not using any wrapper - for (int i = 0; i < N; i++) - arr_host_orig(i) = dis(gen); - - // Copy host array content to device - Kokkos::deep_copy(arr_dev, arr_host_orig); - - // increment data on the device using prescribed wrapper - parthenon::par_for( - loop_pattern, "unit test 1D", exec_space, 0, N - 1, - KOKKOS_LAMBDA(const int i) { arr_dev(i) += static_cast(i); }); - - // Copy array back from device to host - Kokkos::deep_copy(arr_host_mod, arr_dev); - - bool all_same = true; - - // compare data on the host - for (int i = 0; i < N; i++) - if (arr_host_orig(i) + static_cast(i) != arr_host_mod(i)) { - all_same = false; - } - - return all_same; -} - -template -bool test_wrapper_2d(T loop_pattern, DevExecSpace 
exec_space) { - // https://en.cppreference.com/w/cpp/numeric/random/uniform_real_distribution - std::random_device rd; // Will be used to obtain a seed for the random number engine - std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() - std::uniform_real_distribution dis(-1.0, 1.0); - - const int N = 32; - ParArray2D arr_dev("device", N, N); - auto arr_host_orig = Kokkos::create_mirror(arr_dev); - auto arr_host_mod = Kokkos::create_mirror(arr_dev); - - // initialize random data on the host not using any wrapper - for (int j = 0; j < N; j++) - for (int i = 0; i < N; i++) - arr_host_orig(j, i) = dis(gen); - - // Copy host array content to device - Kokkos::deep_copy(arr_dev, arr_host_orig); - - // increment data on the device using prescribed wrapper - parthenon::par_for( - loop_pattern, "unit test 2D", exec_space, 0, N - 1, 0, N - 1, - KOKKOS_LAMBDA(const int j, const int i) { - arr_dev(j, i) += static_cast(i + N * j); - }); - - // Copy array back from device to host - Kokkos::deep_copy(arr_host_mod, arr_dev); - - bool all_same = true; - - // compare data on the host - for (int j = 0; j < N; j++) - for (int i = 0; i < N; i++) - if (arr_host_orig(j, i) + static_cast(i + N * j) != arr_host_mod(j, i)) { - all_same = false; - } - - return all_same; -} - -template -bool test_wrapper_3d(T loop_pattern, DevExecSpace exec_space) { - // https://en.cppreference.com/w/cpp/numeric/random/uniform_real_distribution - std::random_device rd; // Will be used to obtain a seed for the random number engine - std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() - std::uniform_real_distribution dis(-1.0, 1.0); - - const int N = 32; - ParArray3D arr_dev("device", N, N, N); - auto arr_host_orig = Kokkos::create_mirror(arr_dev); - auto arr_host_mod = Kokkos::create_mirror(arr_dev); - - // initialize random data on the host not using any wrapper - for (int k = 0; k < N; k++) - for (int j = 0; j < N; j++) - for (int i = 0; i < N; i++) - 
arr_host_orig(k, j, i) = dis(gen); - - // Copy host array content to device - Kokkos::deep_copy(arr_dev, arr_host_orig); - - // increment data on the device using prescribed wrapper - parthenon::par_for( - loop_pattern, "unit test 3D", exec_space, 0, N - 1, 0, N - 1, 0, N - 1, - KOKKOS_LAMBDA(const int k, const int j, const int i) { - arr_dev(k, j, i) += static_cast(i + N * (j + N * k)); - }); - - // Copy array back from device to host - Kokkos::deep_copy(arr_host_mod, arr_dev); - - bool all_same = true; - - // compare data on the host - for (int k = 0; k < N; k++) - for (int j = 0; j < N; j++) - for (int i = 0; i < N; i++) - if (arr_host_orig(k, j, i) + static_cast(i + N * (j + N * k)) != - arr_host_mod(k, j, i)) { - all_same = false; - } - - return all_same; -} - template auto ParArrayND(Args &&...args) { static_assert(ND <= 8, "ParArrayND supoorted up to ND=8"); @@ -244,7 +119,7 @@ struct test_wrapper_nd_impl { int_bounds[2 * id + 1] = N - 1; if constexpr (LoopsLeft == 1) { for (int i = 0; i < N; i++) { - arr_host_orig(indices[Is]..., i) = 0.; // dis(gen); + arr_host_orig(indices[Is]..., i) = dis(gen); } } else { for (int j = 0; j < N; j++) { @@ -308,14 +183,21 @@ struct test_wrapper_nd_impl { } template void test(T loop_pattern, DevExecSpace exec_space) { - /* REQUIRE(dispatch( */ - /* typename parthenon::meta::ListOfType::value(), */ - /* Sequence<2 * Rank>(), loop_pattern, exec_space) == true); */ - REQUIRE(dispatch( - typename parthenon::meta::ListOfType::value(), - Sequence(), loop_pattern, exec_space) == true); + SECTION("integer launch bounds") { + REQUIRE(dispatch( + typename parthenon::meta::ListOfType::value(), + Sequence<2 * Rank>(), loop_pattern, exec_space) == true); + } + SECTION("IndexRange launch bounds") { + REQUIRE(dispatch( + typename parthenon::meta::ListOfType::value(), + Sequence(), loop_pattern, exec_space) == true); + } } + template + void test_nest(OuterPattern outer_patter, InnerPattern inner_pattern) {} + template bool par_for_dev(T 
loop_pattern, DevExecSpace exec_space, lbounds bound_type) { return dispatch(Sequence(), loop_pattern, exec_space, bound_type); @@ -323,119 +205,49 @@ struct test_wrapper_nd_impl { }; template -void test_wrapper_nd(DevExecSpace exec_space) { - auto wrappernd = test_wrapper_nd_impl(2); - wrappernd.test(parthenon::loop_pattern_flatrange_tag, exec_space); -} - -template -bool test_wrapper_4d(T loop_pattern, DevExecSpace exec_space) { - // https://en.cppreference.com/w/cpp/numeric/random/uniform_real_distribution - std::random_device rd; // Will be used to obtain a seed for the random number engine - std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() - std::uniform_real_distribution dis(-1.0, 1.0); - - const int N = 32; - ParArray4D arr_dev("device", N, N, N, N); - auto arr_host_orig = Kokkos::create_mirror(arr_dev); - auto arr_host_mod = Kokkos::create_mirror(arr_dev); - - // initialize random data on the host not using any wrapper - for (int n = 0; n < N; n++) - for (int k = 0; k < N; k++) - for (int j = 0; j < N; j++) - for (int i = 0; i < N; i++) - arr_host_orig(n, k, j, i) = dis(gen); - - // Copy host array content to device - Kokkos::deep_copy(arr_dev, arr_host_orig); - - // increment data on the device using prescribed wrapper - parthenon::par_for( - loop_pattern, "unit test 4D", exec_space, 0, N - 1, 0, N - 1, 0, N - 1, 0, N - 1, - KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { - arr_dev(n, k, j, i) += static_cast(i + N * (j + N * (k + n))); - }); - - // Copy array back from device to host - Kokkos::deep_copy(arr_host_mod, arr_dev); - - bool all_same = true; - - // compare data on the host - for (int n = 0; n < N; n++) - for (int k = 0; k < N; k++) - for (int j = 0; j < N; j++) - for (int i = 0; i < N; i++) - if (arr_host_orig(n, k, j, i) + static_cast(i + N * (j + N * (k + n))) != - arr_host_mod(n, k, j, i)) { - all_same = false; - } - - return all_same; -} - -TEST_CASE("par_for loops", "[wrapper]") { - auto 
default_exec_space = DevExecSpace(); - - SECTION("1D loops") { - REQUIRE(test_wrapper_1d(parthenon::loop_pattern_flatrange_tag, default_exec_space) == - true); +void test_wrapper_nd(DevExecSpace exec_space, int N = 32) { + auto wrappernd = test_wrapper_nd_impl(N); + SECTION("LoopPatternFlatRange") { + wrappernd.test(parthenon::loop_pattern_flatrange_tag, exec_space); } - - SECTION("2D loops") { - REQUIRE(test_wrapper_2d(parthenon::loop_pattern_mdrange_tag, default_exec_space) == - true); + if constexpr (Rank > 1 && Rank < 7) { + SECTION("LoopPatternMDRange") { + wrappernd.test(parthenon::loop_pattern_mdrange_tag, exec_space); + } } - - SECTION("3D loops") { - REQUIRE(test_wrapper_3d(parthenon::loop_pattern_flatrange_tag, default_exec_space) == - true); - - REQUIRE(test_wrapper_3d(parthenon::loop_pattern_mdrange_tag, default_exec_space) == - true); - - REQUIRE(test_wrapper_3d(parthenon::loop_pattern_tpttrtvr_tag, default_exec_space) == - true); - - REQUIRE(test_wrapper_3d(parthenon::loop_pattern_tpttr_tag, default_exec_space) == - true); - - if constexpr (std::is_same::value) { - REQUIRE(test_wrapper_3d(parthenon::loop_pattern_tptvr_tag, default_exec_space) == - true); - - REQUIRE(test_wrapper_3d(parthenon::loop_pattern_simdfor_tag, default_exec_space) == - true); + if constexpr (Rank > 2) { + SECTION("LoopPatternTPTTRTVR") { + wrappernd.test(parthenon::loop_pattern_tpttrtvr_tag, exec_space); + } + SECTION("LoopPatternTPTTR") { + wrappernd.test(parthenon::loop_pattern_tpttr_tag, exec_space); } } - - SECTION("4D loops") { - test_wrapper_nd<4>(default_exec_space); - REQUIRE(test_wrapper_4d(parthenon::loop_pattern_flatrange_tag, default_exec_space) == - true); - - REQUIRE(test_wrapper_4d(parthenon::loop_pattern_mdrange_tag, default_exec_space) == - true); - - REQUIRE(test_wrapper_4d(parthenon::loop_pattern_tpttrtvr_tag, default_exec_space) == - true); - - REQUIRE(test_wrapper_4d(parthenon::loop_pattern_tpttr_tag, default_exec_space) == - true); - - if constexpr 
(std::is_same::value) { - REQUIRE(test_wrapper_4d(parthenon::loop_pattern_tptvr_tag, default_exec_space) == - true); - - REQUIRE(test_wrapper_4d(parthenon::loop_pattern_simdfor_tag, default_exec_space) == - true); + if constexpr (std::is_same::value) { + if constexpr (Rank > 2) { + SECTION("LoopPatternTPTVR") { + wrappernd.test(parthenon::loop_pattern_tptvr_tag, exec_space); + } + } + SECTION("LoopPatternSimdFor") { + wrappernd.test(parthenon::loop_pattern_simdfor_tag, exec_space); } } } +TEST_CASE("par_for loops", "[wrapper]") { + auto default_exec_space = DevExecSpace(); + + SECTION("1D loops") { test_wrapper_nd<1>(default_exec_space, 32); } + SECTION("2D loops") { test_wrapper_nd<2>(default_exec_space, 32); } + SECTION("3D loops") { test_wrapper_nd<3>(default_exec_space, 32); } + SECTION("4D loops") { test_wrapper_nd<4>(default_exec_space, 32); } + SECTION("5D loops") { test_wrapper_nd<5>(default_exec_space, 10); } + SECTION("6D loops") { test_wrapper_nd<6>(default_exec_space, 10); } + SECTION("7D loops") { test_wrapper_nd<7>(default_exec_space, 10); } +} + template bool test_wrapper_nested_3d(OuterLoopPattern outer_loop_pattern, InnerLoopPattern inner_loop_pattern, @@ -652,105 +464,112 @@ TEST_CASE("Parallel scan", "[par_scan]") { } } -template -bool test_wrapper_reduce_1d(T loop_pattern, DevExecSpace exec_space) { - constexpr int N = 10; - parthenon::IndexRange r{0, N - 1}; - parthenon::ParArray1D buffer("Testing buffer", N); - // Initialize data - parthenon::par_for( - loop_pattern, "Initialize parallel reduce array", exec_space, r, - KOKKOS_LAMBDA(const int i) { buffer(i) = i; }); - int total = 0; - for (int i = 0; i < N; ++i) { - total += i; +template +struct test_wrapper_reduce_nd_impl { + template + using Sequence = std::make_index_sequence; + int N, indices[Rank - 1], int_bounds[2 * Rank]; + parthenon::IndexRange bounds[Rank]; + int h_sum; + + test_wrapper_reduce_nd_impl(const int _N = 10) : N(_N) { + h_sum = 0; + par_red_init(std::make_index_sequence(), 
h_sum); } - int test_tot = 0; - parthenon::par_reduce( - loop_pattern, "Sum via par reduce", exec_space, r, - KOKKOS_LAMBDA(const int i, int &t) { t += i; }, Kokkos::Sum(test_tot)); - return total == test_tot; -} -template -bool test_wrapper_reduce_3d(T loop_pattern, DevExecSpace exec_space) { - constexpr int N = 10; - parthenon::ParArray3D buffer("Testing buffer", N, N, N); - // Initialize data - parthenon::par_for( - loop_pattern, "Initialize parallel reduce array", exec_space, 0, N - 1, 0, N - 1, 0, - N - 1, KOKKOS_LAMBDA(const int k, const int j, const int i) { - buffer(k, j, i) = i + j + k; - }); - int tot = 0; - for (int k = 0; k < N; ++k) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < N; ++i) { - tot += i + j + k; - } - } + template + auto GetArray(std::index_sequence) { + static_assert(sizeof...(Is) == Rank); + return ParArrayND("device", N * Is...); } - int test_tot = 0; - parthenon::par_reduce( - loop_pattern, "Sum via par reduce", exec_space, 0, N - 1, 0, N - 1, 0, N - 1, - KOKKOS_LAMBDA(const int k, const int j, const int i, int &t) { t += i + j + k; }, - Kokkos::Sum(test_tot)); - return tot == test_tot; -} -template -bool test_wrapper_reduce_4d(T loop_pattern, DevExecSpace exec_space) { - constexpr int N = 10; - parthenon::ParArray4D buffer("Testing buffer", N, N, N, N); - // Initialize data - parthenon::par_for( - loop_pattern, "Initialize parallel reduce array", exec_space, 0, N - 1, 0, N - 1, 0, - N - 1, 0, N - 1, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { - buffer(n, k, j, i) = i + j + k + n; - }); - int tot = 0; - for (int n = 0; n < N; ++n) { - for (int k = 0; k < N; ++k) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < N; ++i) { - tot += i + j + k + n; - } + template + void par_red_init(std::index_sequence, int &sum) { + constexpr size_t id = Rank - LoopsLeft; + bounds[id].s = 0; + bounds[id].e = N - 1; + int_bounds[2 * id] = 0; + int_bounds[2 * id + 1] = N - 1; + if constexpr (LoopsLeft == 1) { + for 
(int i = 0; i < N; i++) { + sum += (i + ... + indices[Is]); + } + } else { + for (int j = 0; j < N; j++) { + indices[Rank - LoopsLeft] = j; + par_red_init(Sequence(), sum); } } } - int test_tot = 0; - parthenon::par_reduce( - loop_pattern, "Sum via par reduce", exec_space, 0, N - 1, 0, N - 1, 0, N - 1, 0, - N - 1, - KOKKOS_LAMBDA(const int n, const int k, const int j, const int i, int &t) { - t += i + j + k + n; - }, - Kokkos::Sum(test_tot)); - return tot == test_tot; -} -TEST_CASE("Parallel reduce", "[par_reduce]") { - auto default_exec_space = DevExecSpace(); - SECTION("1D loops") { - REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag, - default_exec_space) == true); - if constexpr (std::is_same::value) { - REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag, - default_exec_space) == true); + template + bool dispatch(parthenon::meta::TypeList, std::index_sequence, + T loop_pattern, DevExecSpace exec_space) { + int test_sum = 0; + if constexpr (bound_type == lbounds::integer) { + parthenon::par_reduce( + loop_pattern, "sum via par_reduce integer bounds", exec_space, + int_bounds[Ids]..., + KOKKOS_LAMBDA(Ts... args, int &sum) { sum += (args + ...); }, + Kokkos::Sum(test_sum)); + } else { + parthenon::par_reduce( + loop_pattern, "sum via par_reduce IndexRange bounds", exec_space, + bounds[Ids]..., KOKKOS_LAMBDA(Ts... 
args, int &sum) { sum += (args + ...); }, + Kokkos::Sum(test_sum)); + } + return test_sum == h_sum; + } + template + void test(T loop_pattern, DevExecSpace exec_space) { + SECTION("integer launch bounds") { + REQUIRE(dispatch( + typename parthenon::meta::ListOfType::value(), + Sequence<2 * Rank>(), loop_pattern, exec_space) == true); + } + SECTION("IndexRange launch bounds") { + REQUIRE(dispatch( + typename parthenon::meta::ListOfType::value(), + Sequence(), loop_pattern, exec_space) == true); } } - SECTION("3D loops") { - REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_flatrange_tag, - default_exec_space) == true); - REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_mdrange_tag, - default_exec_space) == true); + template + void test_nest(OuterPattern outer_patter, InnerPattern inner_pattern) {} + + template + bool par_for_dev(T loop_pattern, DevExecSpace exec_space, lbounds bound_type) { + return dispatch(Sequence(), loop_pattern, exec_space, bound_type); } +}; - SECTION("4D loops") { - REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_flatrange_tag, - default_exec_space) == true); - REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_mdrange_tag, - default_exec_space) == true); +template +void test_wrapper_reduce_nd(DevExecSpace exec_space, int N = 10) { + auto wrappernd = test_wrapper_reduce_nd_impl(N); + SECTION("LoopPatternFlatRange") { + wrappernd.test(parthenon::loop_pattern_flatrange_tag, exec_space); + } + if constexpr (Rank > 1 && Rank < 7) { + SECTION("LoopPatternMDRange") { + wrappernd.test(parthenon::loop_pattern_mdrange_tag, exec_space); + } } + if constexpr (std::is_same::value) { + // this should fall-back to LoopPatternFlatRange + SECTION("LoopPatternSimdFor") { + wrappernd.test(parthenon::loop_pattern_simdfor_tag, exec_space); + } + } +} + +TEST_CASE("Parallel reduce", "[par_reduce]") { + auto default_exec_space = DevExecSpace(); + SECTION("1D loops") { test_wrapper_reduce_nd<1>(default_exec_space, 10); } + SECTION("2D loops") 
{ test_wrapper_reduce_nd<2>(default_exec_space, 10); } + SECTION("3D loops") { test_wrapper_reduce_nd<3>(default_exec_space, 10); } + SECTION("4D loops") { test_wrapper_reduce_nd<4>(default_exec_space, 10); } + SECTION("5D loops") { test_wrapper_reduce_nd<5>(default_exec_space, 10); } + SECTION("6D loops") { test_wrapper_reduce_nd<6>(default_exec_space, 10); } + SECTION("7D loops") { test_wrapper_reduce_nd<7>(default_exec_space, 10); } } From 0905832d3ed3143b0aa7c5e1d126886cab2b1f53 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 28 Jul 2024 21:52:40 +0200 Subject: [PATCH 34/99] fixing sequence_of_ones --- src/kokkos_abstraction.hpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 63a75297de8d..abac6dd18c31 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -375,21 +375,23 @@ struct ListOfType<1, T> { using value = TypeList; }; -template +template struct SequenceOfOnes {}; -template -struct SequenceOfOnes<0, std::integer_sequence> { +template +struct SequenceOfOnes<0, VAL, std::integer_sequence> { using value = typename std::integer_sequence; }; -template -struct SequenceOfOnes> { - using value = typename SequenceOfOnes>::value; +template +struct SequenceOfOnes> { + using value = + typename SequenceOfOnes>::value; }; -template -using sequence_of_ones = SequenceOfOnes>; +template +using sequence_of_ones = SequenceOfOnes>; } // namespace meta From afc86c16f3879f8f65dd3e42d6cb3cebeddd3b4a Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 28 Jul 2024 22:18:43 +0200 Subject: [PATCH 35/99] static_assert for par_scan --- src/kokkos_abstraction.hpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index abac6dd18c31..36e3f5ec3217 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -479,6 +479,8 @@ struct DispatchType { 
std::is_same>::value; static constexpr bool is_ParFor = std::is_same::value; + static constexpr bool is_ParScan = + std::is_same::value; static constexpr bool IsFlatRange = std::is_same::value; static constexpr bool IsMDRange = std::is_same::value; @@ -486,10 +488,11 @@ struct DispatchType { using TeamPattern = LoopPatternTeam; // false_type unless we use an outer team policy - // fallback simd par_reduce to flat range - static constexpr bool is_FlatRange = (IsFlatRange || (IsSimdFor && !is_ParFor)); + // fallback simd par_reduce to flat range and force par_scan to flat range + static constexpr bool is_FlatRange = + (IsFlatRange || (IsSimdFor && !is_ParFor)) || is_ParScan; static constexpr bool is_SimdFor = (IsSimdFor && is_ParFor); - static constexpr bool is_MDRange = IsMDRange; + static constexpr bool is_MDRange = (IsMDRange && !is_ParScan); static constexpr bool is_Collapse = TeamPattern::value; }; @@ -935,6 +938,9 @@ par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...ar using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types using Args = typename dispatchsig::Args; // + if constexpr (Rank > 1 && std::is_same_v) { + static_assert(always_false, "par_scan only for 1D loops"); + } par_dispatch_impl::dispatch( name, exec_space, std::forward(args)...); } From ceaac8f74d10b45ed0525cf24053b3fc440cfcc2 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 29 Jul 2024 10:51:05 +0200 Subject: [PATCH 36/99] modifying test to work on cuda machine --- src/kokkos_abstraction.hpp | 3 +- tst/unit/kokkos_abstraction.cpp | 195 +++++++++++++++++--------------- 2 files changed, 106 insertions(+), 92 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 36e3f5ec3217..d2f638ea1178 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -576,10 +576,11 @@ struct InnerFunctor, Function function; Kokkos::Array inds_team; + KOKKOS_INLINE_FUNCTION InnerFunctor(Kokkos::Array _inds_team, 
Function _function) : inds_team(_inds_team), function(_function) {} - KOKKOS_INLINE_FUNCTION + KOKKOS_FORCEINLINE_FUNCTION void operator()(Index... inds) const { function(inds_team[Iteam]..., std::forward(inds)...); } diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index e56cfb0d54fe..872be3686401 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -84,16 +84,16 @@ auto HostArrayND(Args &&...args) { enum class lbounds { integer, indexrange }; -template +template struct test_wrapper_nd_impl { - template - using Sequence = std::make_index_sequence; - int N, indices[Rank - 1], int_bounds[2 * Rank]; + template + using Sequence = std::make_index_sequence; + int indices[Rank - 1], int_bounds[2 * Rank]; parthenon::IndexRange bounds[Rank]; decltype(ParArrayND()) arr_dev; decltype(HostArrayND()) arr_host_orig, arr_host_mod; - test_wrapper_nd_impl(const int _N = 32) : N(_N) { + test_wrapper_nd_impl() { arr_dev = GetArray(typename parthenon::meta::sequence_of_ones::value()); arr_host_orig = Kokkos::create_mirror(arr_dev); arr_host_mod = Kokkos::create_mirror(arr_dev); @@ -130,7 +130,7 @@ struct test_wrapper_nd_impl { } template - KOKKOS_INLINE_FUNCTION Real increment_data(KJI... kji) { + KOKKOS_INLINE_FUNCTION static Real increment_data(KJI... kji) { static_assert(Rank == sizeof...(KJI), "number of indices matches Rank"); int inc = 0; int inds[sizeof...(KJI)]{kji...}; @@ -159,54 +159,62 @@ struct test_wrapper_nd_impl { return all_same; } - template - bool dispatch(parthenon::meta::TypeList, std::index_sequence, - T loop_pattern, DevExecSpace exec_space) { - Kokkos::deep_copy(arr_dev, arr_host_orig); - if constexpr (bound_type == lbounds::integer) { - parthenon::par_for( - loop_pattern, "unit test ND integer bounds", exec_space, int_bounds[Ids]..., - KOKKOS_LAMBDA(Ts... args) { - arr_dev(std::forward(args)...) 
+= - increment_data(std::forward(args)...); - }); - } else { - parthenon::par_for( - loop_pattern, "unit test ND IndexRange bounds", exec_space, bounds[Ids]..., - KOKKOS_LAMBDA(Ts... args) { - arr_dev(std::forward(args)...) += - increment_data(std::forward(args)...); - }); + template + struct dispatch {}; + + template + struct dispatch, + parthenon::meta::TypeList> { + + template + void execute(DevExecSpace exec_space, view_t &dev, int *int_bounds, + parthenon::IndexRange *bounds) { + if constexpr (bound_type == lbounds::integer) { + parthenon::par_for( + Pattern(), "unit test ND integer bounds", exec_space, int_bounds[Ids]..., + + KOKKOS_CLASS_LAMBDA(Ts... args) { + dev(std::forward(args)...) += + increment_data(std::forward(args)...); + }); + } else { + parthenon::par_for( + Pattern(), "unit test ND IndexRange bounds", exec_space, bounds[Ids]..., + + KOKKOS_CLASS_LAMBDA(Ts... args) { + dev(std::forward(args)...) += + increment_data(std::forward(args)...); + }); + } } - Kokkos::deep_copy(arr_host_mod, arr_dev); - return par_for_comp(Sequence()); - } + }; + template void test(T loop_pattern, DevExecSpace exec_space) { + Kokkos::deep_copy(arr_dev, arr_host_orig); SECTION("integer launch bounds") { - REQUIRE(dispatch( - typename parthenon::meta::ListOfType::value(), - Sequence<2 * Rank>(), loop_pattern, exec_space) == true); + dispatch, + typename parthenon::meta::ListOfType::value>() + .execute(exec_space, arr_dev, int_bounds, bounds); + Kokkos::deep_copy(arr_host_mod, arr_dev); + REQUIRE(par_for_comp(Sequence()) == true); } SECTION("IndexRange launch bounds") { - REQUIRE(dispatch( - typename parthenon::meta::ListOfType::value(), - Sequence(), loop_pattern, exec_space) == true); + dispatch, + typename parthenon::meta::ListOfType::value>() + .execute(exec_space, arr_dev, int_bounds, bounds); + Kokkos::deep_copy(arr_host_mod, arr_dev); + REQUIRE(par_for_comp(Sequence()) == true); } } template void test_nest(OuterPattern outer_patter, InnerPattern inner_pattern) {} - - 
template - bool par_for_dev(T loop_pattern, DevExecSpace exec_space, lbounds bound_type) { - return dispatch(Sequence(), loop_pattern, exec_space, bound_type); - } }; -template -void test_wrapper_nd(DevExecSpace exec_space, int N = 32) { - auto wrappernd = test_wrapper_nd_impl(N); +template +void test_wrapper_nd(DevExecSpace exec_space) { + auto wrappernd = test_wrapper_nd_impl(); SECTION("LoopPatternFlatRange") { wrappernd.test(parthenon::loop_pattern_flatrange_tag, exec_space); } @@ -239,13 +247,13 @@ void test_wrapper_nd(DevExecSpace exec_space, int N = 32) { TEST_CASE("par_for loops", "[wrapper]") { auto default_exec_space = DevExecSpace(); - SECTION("1D loops") { test_wrapper_nd<1>(default_exec_space, 32); } - SECTION("2D loops") { test_wrapper_nd<2>(default_exec_space, 32); } - SECTION("3D loops") { test_wrapper_nd<3>(default_exec_space, 32); } - SECTION("4D loops") { test_wrapper_nd<4>(default_exec_space, 32); } - SECTION("5D loops") { test_wrapper_nd<5>(default_exec_space, 10); } - SECTION("6D loops") { test_wrapper_nd<6>(default_exec_space, 10); } - SECTION("7D loops") { test_wrapper_nd<7>(default_exec_space, 10); } + SECTION("1D loops") { test_wrapper_nd<1, 32>(default_exec_space); } + SECTION("2D loops") { test_wrapper_nd<2, 32>(default_exec_space); } + SECTION("3D loops") { test_wrapper_nd<3, 32>(default_exec_space); } + SECTION("4D loops") { test_wrapper_nd<4, 32>(default_exec_space); } + SECTION("5D loops") { test_wrapper_nd<5, 10>(default_exec_space); } + SECTION("6D loops") { test_wrapper_nd<6, 10>(default_exec_space); } + SECTION("7D loops") { test_wrapper_nd<7, 10>(default_exec_space); } } template @@ -464,15 +472,15 @@ TEST_CASE("Parallel scan", "[par_scan]") { } } -template +template struct test_wrapper_reduce_nd_impl { - template - using Sequence = std::make_index_sequence; - int N, indices[Rank - 1], int_bounds[2 * Rank]; + template + using Sequence = std::make_index_sequence; + int indices[Rank - 1], int_bounds[2 * Rank]; 
parthenon::IndexRange bounds[Rank]; int h_sum; - test_wrapper_reduce_nd_impl(const int _N = 10) : N(_N) { + test_wrapper_reduce_nd_impl() { h_sum = 0; par_red_init(std::make_index_sequence(), h_sum); } @@ -502,50 +510,55 @@ struct test_wrapper_reduce_nd_impl { } } - template - bool dispatch(parthenon::meta::TypeList, std::index_sequence, - T loop_pattern, DevExecSpace exec_space) { - int test_sum = 0; - if constexpr (bound_type == lbounds::integer) { - parthenon::par_reduce( - loop_pattern, "sum via par_reduce integer bounds", exec_space, - int_bounds[Ids]..., - KOKKOS_LAMBDA(Ts... args, int &sum) { sum += (args + ...); }, - Kokkos::Sum(test_sum)); - } else { - parthenon::par_reduce( - loop_pattern, "sum via par_reduce IndexRange bounds", exec_space, - bounds[Ids]..., KOKKOS_LAMBDA(Ts... args, int &sum) { sum += (args + ...); }, - Kokkos::Sum(test_sum)); + template + struct dispatch {}; + + template + struct dispatch, + parthenon::meta::TypeList> { + + bool execute(DevExecSpace exec_space, const int h_sum, int *int_bounds, + parthenon::IndexRange *bounds) { + int test_sum = 0; + if constexpr (bound_type == lbounds::integer) { + parthenon::par_reduce( + Pattern(), "sum via par_reduce integer bounds", exec_space, + int_bounds[Ids]..., + + KOKKOS_CLASS_LAMBDA(Ts... args, int &sum) { sum += (args + ...); }, + Kokkos::Sum(test_sum)); + } else { + parthenon::par_reduce( + Pattern(), "sum via par_reduce IndexRange bounds", exec_space, bounds[Ids]..., + + KOKKOS_CLASS_LAMBDA(Ts... 
args, int &sum) { sum += (args + ...); }, + Kokkos::Sum(test_sum)); + } + return h_sum == test_sum; } - return test_sum == h_sum; - } + }; + template void test(T loop_pattern, DevExecSpace exec_space) { SECTION("integer launch bounds") { - REQUIRE(dispatch( - typename parthenon::meta::ListOfType::value(), - Sequence<2 * Rank>(), loop_pattern, exec_space) == true); + REQUIRE(dispatch, + typename parthenon::meta::ListOfType::value>() + .execute(exec_space, h_sum, int_bounds, bounds) == true); } SECTION("IndexRange launch bounds") { - REQUIRE(dispatch( - typename parthenon::meta::ListOfType::value(), - Sequence(), loop_pattern, exec_space) == true); + REQUIRE(dispatch, + typename parthenon::meta::ListOfType::value>() + .execute(exec_space, h_sum, int_bounds, bounds) == true); } } template void test_nest(OuterPattern outer_patter, InnerPattern inner_pattern) {} - - template - bool par_for_dev(T loop_pattern, DevExecSpace exec_space, lbounds bound_type) { - return dispatch(Sequence(), loop_pattern, exec_space, bound_type); - } }; -template -void test_wrapper_reduce_nd(DevExecSpace exec_space, int N = 10) { - auto wrappernd = test_wrapper_reduce_nd_impl(N); +template +void test_wrapper_reduce_nd(DevExecSpace exec_space) { + auto wrappernd = test_wrapper_reduce_nd_impl(); SECTION("LoopPatternFlatRange") { wrappernd.test(parthenon::loop_pattern_flatrange_tag, exec_space); } @@ -565,11 +578,11 @@ void test_wrapper_reduce_nd(DevExecSpace exec_space, int N = 10) { TEST_CASE("Parallel reduce", "[par_reduce]") { auto default_exec_space = DevExecSpace(); - SECTION("1D loops") { test_wrapper_reduce_nd<1>(default_exec_space, 10); } - SECTION("2D loops") { test_wrapper_reduce_nd<2>(default_exec_space, 10); } - SECTION("3D loops") { test_wrapper_reduce_nd<3>(default_exec_space, 10); } - SECTION("4D loops") { test_wrapper_reduce_nd<4>(default_exec_space, 10); } - SECTION("5D loops") { test_wrapper_reduce_nd<5>(default_exec_space, 10); } - SECTION("6D loops") { 
test_wrapper_reduce_nd<6>(default_exec_space, 10); } - SECTION("7D loops") { test_wrapper_reduce_nd<7>(default_exec_space, 10); } + SECTION("1D loops") { test_wrapper_reduce_nd<1, 10>(default_exec_space); } + SECTION("2D loops") { test_wrapper_reduce_nd<2, 10>(default_exec_space); } + SECTION("3D loops") { test_wrapper_reduce_nd<3, 10>(default_exec_space); } + SECTION("4D loops") { test_wrapper_reduce_nd<4, 10>(default_exec_space); } + SECTION("5D loops") { test_wrapper_reduce_nd<5, 10>(default_exec_space); } + SECTION("6D loops") { test_wrapper_reduce_nd<6, 10>(default_exec_space); } + SECTION("7D loops") { test_wrapper_reduce_nd<7, 10>(default_exec_space); } } From de6df61d121b31a31462350c7a548aa91c7c9003 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Tue, 30 Jul 2024 23:44:24 +0200 Subject: [PATCH 37/99] workaround non-type template parameter usage --- src/kokkos_abstraction.hpp | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index d2f638ea1178..a6b0afce4ed0 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -182,19 +182,20 @@ static struct LoopPatternUndefined { template struct LoopPatternCollapse {}; -template +template struct LoopPatternTeam : std::false_type {}; template -struct LoopPatternTeam, team + thread + vector, - void> : std::true_type { +struct LoopPatternTeam, + std::integral_constant, void> + : std::true_type { static constexpr size_t Nvector = vector; static constexpr size_t Nthread = thread; static constexpr size_t Nteam = team; using LoopPattern = LoopPatternCollapse; }; -template +template struct LoopPatternTeam< Pattern, Rank, typename std::enable_if::value || @@ -210,7 +211,7 @@ struct LoopPatternTeam< static constexpr size_t Nvector = IsTPTVR || IsTPTTRTVR; static constexpr size_t Nthread = IsTPTTR || IsTPTTRTVR; - static constexpr size_t Nteam = Rank - Nthread - Nvector; + static constexpr size_t Nteam = 
Rank::value - Nthread - Nvector; using LoopPattern = LoopPatternCollapse; using OuterPattern = Pattern; }; @@ -223,7 +224,8 @@ struct LoopPatternTeam< static struct OuterLoopPatternTeams { } outer_loop_pattern_teams_tag; template -struct LoopPatternTeam : std::true_type { +struct LoopPatternTeam, void> + : std::true_type { static constexpr size_t Nvector = 0; static constexpr size_t Nthread = 0; static constexpr size_t Nteam = Rank; @@ -245,7 +247,7 @@ constexpr InnerLoopPatternSimdFor inner_loop_pattern_simdfor_tag; template struct LoopPatternTeam< - Pattern, Rank, + Pattern, std::integral_constant, typename std::enable_if::value || std::is_same::value>::type> : std::true_type { @@ -486,7 +488,9 @@ struct DispatchType { static constexpr bool IsMDRange = std::is_same::value; static constexpr bool IsSimdFor = std::is_same::value; using TeamPattern = - LoopPatternTeam; // false_type unless we use an outer team policy + LoopPatternTeam>; // false_type unless we use + // an outer team policy // fallback simd par_reduce to flat range and force par_scan to flat range static constexpr bool is_FlatRange = @@ -748,7 +752,9 @@ struct par_dispatch_inner {}; template struct par_dispatch_inner> { using signature = meta::function_signature; - using LoopPattern = typename LoopPatternTeam::LoopPattern; + using LoopPattern = + typename LoopPatternTeam>::LoopPattern; KOKKOS_FORCEINLINE_FUNCTION void dispatch(team_mbr_t team_member, Bounds &&...bounds, Function function) const { @@ -983,7 +989,7 @@ par_for_outer(Pattern, const std::string &name, DevExecSpace exec_space, using Function = typename dispatchsig::Function; // functor type using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types using Args = typename dispatchsig::Args; // - using LoopPattern = LoopPatternTeam; + using LoopPattern = LoopPatternTeam>; using Tag = dispatch_impl::ParallelForDispatch; par_dispatch_impl::dispatch( From 1d7719cdfba0c3546c27a875e7a55c656ec0927e Mon Sep 17 00:00:00 2001 From: 
adam reyes Date: Thu, 8 Aug 2024 16:04:40 +0200 Subject: [PATCH 38/99] moved TypeList to those in type_list.hpp --- src/kokkos_abstraction.hpp | 156 +++++--------------------------- src/utils/type_list.hpp | 55 ++++++++++- tst/unit/kokkos_abstraction.cpp | 15 +-- 3 files changed, 85 insertions(+), 141 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index a6b0afce4ed0..d70fa6c24873 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -39,6 +39,7 @@ #include "utils/instrument.hpp" #include "utils/multi_pointer.hpp" #include "utils/object_pool.hpp" +#include "utils/type_list.hpp" namespace parthenon { @@ -283,122 +284,10 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { } // namespace dispatch_impl namespace meta { - // c++-20 has std:remove_cvref_t that does this same thing template using base_type = typename std::remove_cv_t>; -template -struct TypeList {}; - -template -constexpr int SizeOfList(TypeList) { - return sizeof...(Ts); -} - -template -struct PopList {}; - -template -struct PopList<1, TypeList> { - using type = T; - using value = TypeList; -}; - -template -struct PopList> { - static_assert(N >= 1, "PopList requires N>=1"); - - private: - using pop = PopList>; - - public: - using type = typename pop::type; - using value = typename pop::value; -}; - -template -struct AppendList {}; - -template -struct AppendList> { - using value = TypeList; -}; - -template -struct PrependList {}; - -template -struct PrependList> { - using value = TypeList; -}; - -template -struct MergeLists {}; - -template -struct MergeLists, TypeList<>> { - using value = TypeList; -}; - -template -struct MergeLists, TypeList> { - using value = typename MergeLists, TypeList>::value; -}; - -template -struct SplitList {}; - -template -struct SplitList<1, TypeList> { - using Left = TypeList; - using Right = TypeList; -}; - -template -struct SplitList> { - static_assert(sizeof...(Ts) + 1 >= N, "size of list must be > 
N"); - - private: - using split = SplitList>; - - public: - using Left = typename PrependList::value; - using Right = typename split::Right; -}; - -template -struct ListOfType { - using value = typename PrependList::value>::value; -}; - -template -struct ListOfType<1, T> { - using value = TypeList; -}; - -template -struct SequenceOfOnes {}; - -template -struct SequenceOfOnes<0, VAL, std::integer_sequence> { - using value = typename std::integer_sequence; -}; - -template -struct SequenceOfOnes> { - using value = - typename SequenceOfOnes>::value; -}; - -template -using sequence_of_ones = SequenceOfOnes>; - -} // namespace meta - -namespace meta { - template struct FunctionSignature {}; @@ -406,11 +295,11 @@ template struct FunctionSignature { private: static constexpr bool team_mbr = std::is_same_v>; - using split = SplitList>; + using TL = TypeList; public: - using IndexND = typename split::Left; - using FArgs = typename split::Right; + using IndexND = typename TL::template continuous_sublist<0, Rank + team_mbr - 1>; + using FArgs = typename TL::template continuous_sublist; }; template @@ -449,9 +338,10 @@ struct GetLaunchBounds> { using LaunchBounds = GetLaunchBounds>; public: - using value = typename std::conditional< - is_BoundType(), typename PrependList::value, - TypeList<>>::type; + using value = + typename std::conditional(), + insert_type_list_t, + TypeList<>>::type; using NumInds = std::conditional_t(), Rank_t() + LaunchBounds::NumInds::value>, @@ -465,18 +355,19 @@ template struct DispatchSignature> { private: using LB = GetLaunchBounds>; - using pop = PopList>; + using TL = TypeList; public: using LaunchBounds = typename LB::value; using Rank = std::integral_constant; - using Function = typename pop::type; - using Args = typename pop::value; + using Function = typename TL::template type; + using Args = typename TL::template continuous_sublist; }; template struct DispatchType { - using BoundType = typename PopList<1, TypeList>::type; + using BoundType = 
typename TypeList::template type<0>; static constexpr bool is_IndexRangeBounds = std::is_same>::value; static constexpr bool is_ParFor = @@ -506,8 +397,7 @@ template class FlatFunctor {}; template -class FlatFunctor, - meta::TypeList> { +class FlatFunctor, TypeList> { static constexpr size_t Rank = sizeof...(Is); Kokkos::Array ranges; @@ -574,7 +464,7 @@ template struct InnerFunctor {}; template -struct InnerFunctor, +struct InnerFunctor, std::integer_sequence> { static constexpr size_t Nteam = sizeof...(Iteam); Function function; @@ -683,7 +573,7 @@ class CollapseFunctor, recoverIndex(inds_team, team_member.league_rank()); using signature = meta::function_signature; using ThreadVectorInds = - typename meta::PopList::value; + typename signature::IndexND::template continuous_sublist; if constexpr (ParForOuter) { function(team_member, inds_team[Iteam]...); @@ -750,7 +640,7 @@ template struct par_dispatch_inner {}; template -struct par_dispatch_inner> { +struct par_dispatch_inner> { using signature = meta::function_signature; using LoopPattern = typename LoopPatternTeam inline auto MakeMDRangePolicy(DevExecSpace exec_space, Args &&...args) { using Indices = typename std::make_index_sequence; - using Ones = typename meta::sequence_of_ones::value; + using Ones = sequence_of_int_v; return MakeMDRange(std::forward(args)...) 
.policy(Indices(), Ones(), exec_space); } @@ -851,8 +741,8 @@ struct par_dispatch_impl {}; template -struct par_dispatch_impl, - meta::TypeList> { +struct par_dispatch_impl, + TypeList> { using DType = meta::DispatchType; @@ -939,7 +829,7 @@ inline typename std::enable_if::valu std::is_same::value, void>::type par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { - using dispatchsig = meta::DispatchSignature>; + using dispatchsig = meta::DispatchSignature>; static constexpr size_t Rank = dispatchsig::Rank::value; using Function = typename dispatchsig::Function; // functor type using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types @@ -984,7 +874,7 @@ template inline std::enable_if_t::value, void> par_for_outer(Pattern, const std::string &name, DevExecSpace exec_space, size_t scratch_size_in_bytes, const int scratch_level, AllArgs &&...args) { - using dispatchsig = meta::DispatchSignature>; + using dispatchsig = meta::DispatchSignature>; static constexpr size_t Rank = dispatchsig::Rank::value; using Function = typename dispatchsig::Function; // functor type using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types @@ -1011,7 +901,7 @@ KOKKOS_FORCEINLINE_FUNCTION void> par_for_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { - using dispatchsig = meta::DispatchSignature>; + using dispatchsig = meta::DispatchSignature>; constexpr size_t Rank = dispatchsig::Rank::value; using Function = typename dispatchsig::Function; using LaunchBounds = typename dispatchsig::LaunchBounds; diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp index a17b7b8f7063..33af276e34cc 100644 --- a/src/utils/type_list.hpp +++ b/src/utils/type_list.hpp @@ -65,7 +65,7 @@ struct TypeList { } public: - template + template using continuous_sublist = decltype(ContinuousSublist()); }; @@ -79,12 +79,29 @@ template auto ConcatenateTypeLists(TypeList, TypeList, Args...) 
{ return ConcatenateTypeLists(TypeList(), Args()...); } + +template +static auto InsertTypeImpl(TypeList) { + if constexpr (I == 0) { + return TypeList(); + } else if constexpr (I == sizeof...(Ts)) { + return TypeList(); + } else { + using TL = TypeList; + return ConcatenateTypeLists(typename TL::template continuous_sublist<0, I>(), + TypeList(), + typename TL::template continuous_sublist()); + } +} } // namespace impl template using concatenate_type_lists_t = decltype(impl::ConcatenateTypeLists(std::declval()...)); +template +using insert_type_list_t = decltype(impl::InsertTypeImpl(TL())); + // Relevant only for lists of variable types template auto GetNames() { @@ -92,6 +109,42 @@ auto GetNames() { TL::IterateTypes([&names](auto t) { names.push_back(decltype(t)::name()); }); return names; } + +namespace impl { + +template +auto ListOfType() { + if constexpr (N == 1) { + return TypeList(); + } else { + return concatenate_type_lists_t, decltype(ListOfType())>(); + } +} + +template +struct SequenceOfInt {}; + +template +struct SequenceOfInt<0, VAL, std::integer_sequence> { + using value = typename std::integer_sequence; +}; + +template +struct SequenceOfInt> { + using value = + typename SequenceOfInt>::value; +}; + +} // namespace impl + +template +using list_of_type_t = decltype(impl::ListOfType()); + +template +using sequence_of_int_v = + typename impl::SequenceOfInt>::value; + } // namespace parthenon #endif // UTILS_TYPE_LIST_HPP_ diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 872be3686401..0af10c484da5 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -29,6 +29,7 @@ #include "basic_types.hpp" #include "kokkos_abstraction.hpp" #include "parthenon_array_generic.hpp" +#include "utils/type_list.hpp" using parthenon::DevExecSpace; using parthenon::ParArray1D; @@ -94,7 +95,7 @@ struct test_wrapper_nd_impl { decltype(HostArrayND()) arr_host_orig, arr_host_mod; test_wrapper_nd_impl() { - arr_dev 
= GetArray(typename parthenon::meta::sequence_of_ones::value()); + arr_dev = GetArray(parthenon::sequence_of_int_v()); arr_host_orig = Kokkos::create_mirror(arr_dev); arr_host_mod = Kokkos::create_mirror(arr_dev); std::random_device rd; // Will be used to obtain a seed for the random number engine @@ -164,7 +165,7 @@ struct test_wrapper_nd_impl { template struct dispatch, - parthenon::meta::TypeList> { + parthenon::TypeList> { template void execute(DevExecSpace exec_space, view_t &dev, int *int_bounds, @@ -194,14 +195,14 @@ struct test_wrapper_nd_impl { Kokkos::deep_copy(arr_dev, arr_host_orig); SECTION("integer launch bounds") { dispatch, - typename parthenon::meta::ListOfType::value>() + parthenon::list_of_type_t>() .execute(exec_space, arr_dev, int_bounds, bounds); Kokkos::deep_copy(arr_host_mod, arr_dev); REQUIRE(par_for_comp(Sequence()) == true); } SECTION("IndexRange launch bounds") { dispatch, - typename parthenon::meta::ListOfType::value>() + parthenon::list_of_type_t>() .execute(exec_space, arr_dev, int_bounds, bounds); Kokkos::deep_copy(arr_host_mod, arr_dev); REQUIRE(par_for_comp(Sequence()) == true); @@ -515,7 +516,7 @@ struct test_wrapper_reduce_nd_impl { template struct dispatch, - parthenon::meta::TypeList> { + parthenon::TypeList> { bool execute(DevExecSpace exec_space, const int h_sum, int *int_bounds, parthenon::IndexRange *bounds) { @@ -542,12 +543,12 @@ struct test_wrapper_reduce_nd_impl { void test(T loop_pattern, DevExecSpace exec_space) { SECTION("integer launch bounds") { REQUIRE(dispatch, - typename parthenon::meta::ListOfType::value>() + parthenon::list_of_type_t>() .execute(exec_space, h_sum, int_bounds, bounds) == true); } SECTION("IndexRange launch bounds") { REQUIRE(dispatch, - typename parthenon::meta::ListOfType::value>() + parthenon::list_of_type_t>() .execute(exec_space, h_sum, int_bounds, bounds) == true); } } From 7b25d89c0f91d5241eab5e1519e4a43d0785134a Mon Sep 17 00:00:00 2001 From: Adam Date: Thu, 15 Aug 2024 13:40:17 +0000 
Subject: [PATCH 39/99] use correct ThreadVectorRange policy --- src/kokkos_abstraction.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index d70fa6c24873..78b8bb2752ec 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -597,8 +597,8 @@ class CollapseFunctor, recoverIndex(inds_thread, idThread); if constexpr (Nvector > 0) { Kokkos::parallel_for( - Kokkos::TeamVectorRange(team_member, 0, - FlattenLaunchBound(Nteam + Nthread, Rank)), + Kokkos::ThreadVectorRange(team_member, 0, + FlattenLaunchBound(Nteam + Nthread, Rank)), [&](const int idVector) { Kokkos::Array inds_vector; recoverIndex(inds_vector, idVector); From c16fc6e8c97ca9fb164099e0d130189cb6b224d6 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 15 Aug 2024 21:20:53 +0200 Subject: [PATCH 40/99] cleanup --- src/kokkos_abstraction.hpp | 194 ++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 101 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 78b8bb2752ec..07a405cc32d5 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -26,15 +26,13 @@ #include #include #include +#include #include -#include -#include "Kokkos_Macros.hpp" #include "basic_types.hpp" #include "config.hpp" #include "parthenon_array_generic.hpp" -#include "utils/concepts_lite.hpp" #include "utils/error_checking.hpp" #include "utils/instrument.hpp" #include "utils/multi_pointer.hpp" @@ -162,7 +160,6 @@ static struct LoopPatternFlatRange { // a 1:1 indices matching static struct LoopPatternMDRange { } loop_pattern_mdrange_tag; - // Translates to a Kokkos::TeamPolicy with a single inner // Kokkos::TeamThreadRange static struct LoopPatternTPTTR { @@ -180,22 +177,28 @@ static struct LoopPatternUndefined { } loop_pattern_undefined_tag; // Translates to a Kokkos::TeamPolicy that collapse Nteams outer loops // with Nthread & Nvector inner loop collapses -template 
+template struct LoopPatternCollapse {}; -template +// trait to track if pattern requests any type of hierarchial parallelism +template struct LoopPatternTeam : std::false_type {}; -template +// This pattern needs to determine the team and thread/vector count at compile time +// By contrast the others specify the thread/vector count at compile time and the +// outer team policy collapses all remaining loops +template struct LoopPatternTeam, std::integral_constant, void> : std::true_type { - static constexpr size_t Nvector = vector; - static constexpr size_t Nthread = thread; - static constexpr size_t Nteam = team; + static constexpr std::size_t Nvector = vector; + static constexpr std::size_t Nthread = thread; + static constexpr std::size_t Nteam = team; using LoopPattern = LoopPatternCollapse; }; +// Patterns with an outer team pattern that collapses all +// remaining loops template struct LoopPatternTeam< Pattern, Rank, @@ -210,29 +213,28 @@ struct LoopPatternTeam< std::is_same::value; // inner ThreadVectorRange static constexpr bool IsTPTTRTVR = std::is_same::value; - static constexpr size_t Nvector = IsTPTVR || IsTPTTRTVR; - static constexpr size_t Nthread = IsTPTTR || IsTPTTRTVR; - static constexpr size_t Nteam = Rank::value - Nthread - Nvector; + static constexpr std::size_t Nvector = IsTPTVR || IsTPTTRTVR; + static constexpr std::size_t Nthread = IsTPTTR || IsTPTTRTVR; + static constexpr std::size_t Nteam = Rank::value - Nthread - Nvector; using LoopPattern = LoopPatternCollapse; using OuterPattern = Pattern; }; -// Tags for Nested parallelism where the outermost layer supports 1, 2, or 3 -// indices +// Tags for Nested parallelism -// Translates to outermost loop being a Kokkos::TeamPolicy -// Currently the only available option. 
+// Translates to outermost loop being a Kokkos::TeamPolicy for par_for_outer like loops static struct OuterLoopPatternTeams { } outer_loop_pattern_teams_tag; template struct LoopPatternTeam, void> : std::true_type { - static constexpr size_t Nvector = 0; - static constexpr size_t Nthread = 0; - static constexpr size_t Nteam = Rank; + static constexpr std::size_t Nvector = 0; + static constexpr std::size_t Nthread = 0; + static constexpr std::size_t Nteam = Rank; using LoopPattern = LoopPatternCollapse; using OuterPattern = OuterLoopPatternTeams; }; + // Inner loop pattern tags must be constexpr so they're available on device // Translate to a Kokkos::TeamVectorRange as innermost loop (single index) struct InnerLoopPatternTVR {}; @@ -246,7 +248,8 @@ constexpr InnerLoopPatternTTR inner_loop_pattern_ttr_tag; struct InnerLoopPatternSimdFor {}; constexpr InnerLoopPatternSimdFor inner_loop_pattern_simdfor_tag; -template +// Patterns for par_for_inner +template struct LoopPatternTeam< Pattern, std::integral_constant, typename std::enable_if::value || @@ -256,8 +259,8 @@ struct LoopPatternTeam< static constexpr bool IsTTR = std::is_same::value; static constexpr bool IsTVR = std::is_same::value; - static constexpr size_t Nvector = IsTVR ? Rank : 0; - static constexpr size_t Nthread = IsTTR ? Rank : 0; + static constexpr std::size_t Nvector = IsTVR ? Rank : 0; + static constexpr std::size_t Nthread = IsTTR ? 
Rank : 0; using LoopPattern = LoopPatternCollapse<0, Nthread, Nvector>; }; @@ -283,7 +286,7 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { } } // namespace dispatch_impl -namespace meta { +namespace impl { // c++-20 has std:remove_cvref_t that does this same thing template using base_type = typename std::remove_cv_t>; @@ -323,7 +326,7 @@ struct GetLaunchBounds> { } template - static constexpr size_t NumBnds() { + static constexpr std::size_t NumBnds() { if constexpr (!is_BoundType()) { return 0; } @@ -365,11 +368,11 @@ struct DispatchSignature> { TL::n_types - 1>; }; -template +template struct DispatchType { using BoundType = typename TypeList::template type<0>; static constexpr bool is_IndexRangeBounds = - std::is_same>::value; + std::is_same>::value; static constexpr bool is_ParFor = std::is_same::value; static constexpr bool is_ParScan = @@ -391,15 +394,15 @@ struct DispatchType { static constexpr bool is_Collapse = TeamPattern::value; }; -} // namespace meta +} // namespace impl template class FlatFunctor {}; -template +template class FlatFunctor, TypeList> { - static constexpr size_t Rank = sizeof...(Is); + static constexpr std::size_t Rank = sizeof...(Is); Kokkos::Array ranges; Kokkos::Array strides; Function function; @@ -454,7 +457,7 @@ class FlatFunctor, TypeList KOKKOS_INLINE_FUNCTION auto MakeFlatFunctor(F &function, Bounds &&...bounds) { - using signature = meta::function_signature; + using signature = impl::function_signature; using IndexND = typename signature::IndexND; return FlatFunctor, typename signature::FArgs>( function, std::forward(bounds)...); @@ -463,10 +466,10 @@ KOKKOS_INLINE_FUNCTION auto MakeFlatFunctor(F &function, Bounds &&...bounds) { template struct InnerFunctor {}; -template +template struct InnerFunctor, std::integer_sequence> { - static constexpr size_t Nteam = sizeof...(Iteam); + static constexpr std::size_t Nteam = sizeof...(Iteam); Function function; Kokkos::Array inds_team; @@ -483,16 +486,16 @@ struct 
InnerFunctor, template class CollapseFunctor {}; -template +template class CollapseFunctor, std::integer_sequence, std::integer_sequence, Function, ParForOuter> { - static constexpr size_t Nteam = sizeof...(Iteam); - static constexpr size_t Nthread = sizeof...(Ithread); - static constexpr size_t Nvector = sizeof...(Ivector); - static constexpr size_t Rank = Nteam + Nthread + Nvector; + static constexpr std::size_t Nteam = sizeof...(Iteam); + static constexpr std::size_t Nthread = sizeof...(Ithread); + static constexpr std::size_t Nvector = sizeof...(Ivector); + static constexpr std::size_t Rank = Nteam + Nthread + Nvector; Kokkos::Array ranges; Kokkos::Array strides; @@ -543,7 +546,7 @@ class CollapseFunctor, } } - template + template KOKKOS_INLINE_FUNCTION void recoverIndex(Kokkos::Array &inds, int idx) const { inds[0] = idx; for (int i = 1; i < N; i++) { @@ -571,7 +574,7 @@ class CollapseFunctor, void operator()(team_mbr_t team_member) const { Kokkos::Array inds_team; recoverIndex(inds_team, team_member.league_rank()); - using signature = meta::function_signature; + using signature = impl::function_signature; using ThreadVectorInds = typename signature::IndexND::template continuous_sublist; @@ -621,13 +624,13 @@ class CollapseFunctor, } }; -template +template KOKKOS_INLINE_FUNCTION auto MakeCollapseFunctor(LoopPatternCollapse, F &function, Bounds &&...bounds) { - constexpr size_t Rank = Nteam + Nthread + Nvector; - using signature = meta::function_signature; + constexpr std::size_t Rank = Nteam + Nthread + Nvector; + using signature = impl::function_signature; using IndexND = typename signature::IndexND; return CollapseFunctor, @@ -636,19 +639,19 @@ MakeCollapseFunctor(LoopPatternCollapse, F &function, function, std::forward(bounds)...); } -template +template struct par_dispatch_inner {}; -template +template struct par_dispatch_inner> { - using signature = meta::function_signature; - using LoopPattern = - typename LoopPatternTeam>::LoopPattern; + using signature 
= impl::function_signature; + using LPT = LoopPatternTeam>; + static_assert(LPT::value, "unsupported inner loop pattern"); KOKKOS_FORCEINLINE_FUNCTION void dispatch(team_mbr_t team_member, Bounds &&...bounds, Function function) const { - MakeCollapseFunctor(LoopPattern(), function, std::forward(bounds)...) + MakeCollapseFunctor(typename LPT::LoopPattern(), function, + std::forward(bounds)...) .collapse_inner(team_member, function); } }; @@ -676,7 +679,7 @@ class MDRange { } } - template + template auto policy(std::integer_sequence, std::integer_sequence, DevExecSpace exec_space) { return Kokkos::MDRangePolicy>( @@ -711,11 +714,11 @@ struct SimdFor { template inline void dispatch(Function &function) { - dispatch_impl<1>(function); + dispatch_simd<1>(function); } private: - template + template inline void dispatch_simd(std::integer_sequence, Function &function) { for (int i = mdrange.lower[Rank - 1]; i <= mdrange.upper[Rank - 1]; i++) { #pragma omp simd @@ -724,11 +727,11 @@ struct SimdFor { } template - inline void dispatch_impl(Function &function) { + inline void dispatch_simd(Function &function) { if constexpr (LoopCount < Rank) { for (int i = mdrange.lower[LoopCount - 1]; i <= mdrange.upper[LoopCount - 1]; i++) { indices[LoopCount - 1] = i; - dispatch_impl(function); + dispatch_simd(function); } } else { dispatch_simd(Sequence(), function); @@ -736,20 +739,20 @@ struct SimdFor { } }; -template +template struct par_dispatch_impl {}; -template struct par_dispatch_impl, TypeList> { - using DType = meta::DispatchType; + using DType = impl::DispatchType; static inline void dispatch(std::string name, DevExecSpace exec_space, Bounds &&...ids, Function function, Args &&...args, const int scratch_level = 0, - const size_t scratch_size_in_bytes = 0) { + const std::size_t scratch_size_in_bytes = 0) { static_assert(!(DType::is_MDRange && Rank < 2), "Can not launch MDRange with Rank < 2"); @@ -768,7 +771,7 @@ struct par_dispatch_impl, static inline auto policy(DevExecSpace 
exec_space, Bounds &&...ids, const int scratch_level = 0, - const size_t scratch_size_in_bytes = 0) { + const std::size_t scratch_size_in_bytes = 0) { if constexpr (DType::is_FlatRange) { int rangeNx = FlattenLaunchBound(std::forward(ids)...); @@ -785,6 +788,8 @@ struct par_dispatch_impl, FlattenLaunchBound(std::forward(ids)...); return team_policy(exec_space, rangeNx, Kokkos::AUTO) .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)); + } else { + static_assert(always_false, "can't make policy for pattern"); } }; @@ -797,6 +802,8 @@ struct par_dispatch_impl, constexpr bool ParForOuter = std::is_same_v; return MakeCollapseFunctor(typename DType::TeamPattern::LoopPattern(), function, std::forward(ids)...); + } else { + static_assert(always_false, "can't make functor for pattern"); } } @@ -821,19 +828,13 @@ struct par_dispatch_impl, }; template -inline typename std::enable_if::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value, - void>::type -par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { - using dispatchsig = meta::DispatchSignature>; - static constexpr size_t Rank = dispatchsig::Rank::value; - using Function = typename dispatchsig::Function; // functor type - using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types - using Args = typename dispatchsig::Args; // +inline void par_dispatch(Pattern, std::string name, DevExecSpace exec_space, + AllArgs &&...args) { + using dispatchsig = impl::DispatchSignature>; + static constexpr std::size_t Rank = dispatchsig::Rank::value; + using Function = typename dispatchsig::Function; + using LaunchBounds = typename dispatchsig::LaunchBounds; + using Args = typename dispatchsig::Args; if constexpr (Rank > 1 && std::is_same_v) { static_assert(always_false, "par_scan only for 1D loops"); @@ -842,13 +843,6 @@ par_dispatch(Pattern, std::string name, DevExecSpace exec_space, 
AllArgs &&...ar name, exec_space, std::forward(args)...); } -template -inline typename std::enable_if::type -par_dispatch(Pattern p, const std::string &name, DevExecSpace exec_space, - const IndexRange &r, const Function &function, Args &&...args) { - par_dispatch(p, name, exec_space, r.s, r.e, function, std::forward(args)...); -} - template inline void par_dispatch(const std::string &name, Args &&...args) { par_dispatch(DEFAULT_LOOP_PATTERN, name, DevExecSpace(), @@ -873,13 +867,14 @@ inline void par_scan(Args &&...args) { template inline std::enable_if_t::value, void> par_for_outer(Pattern, const std::string &name, DevExecSpace exec_space, - size_t scratch_size_in_bytes, const int scratch_level, AllArgs &&...args) { - using dispatchsig = meta::DispatchSignature>; - static constexpr size_t Rank = dispatchsig::Rank::value; - using Function = typename dispatchsig::Function; // functor type - using LaunchBounds = typename dispatchsig::LaunchBounds; // list of index types - using Args = typename dispatchsig::Args; // - using LoopPattern = LoopPatternTeam>; + std::size_t scratch_size_in_bytes, const int scratch_level, + AllArgs &&...args) { + + using dispatchsig = impl::DispatchSignature>; + static constexpr std::size_t Rank = dispatchsig::Rank::value; + using Function = typename dispatchsig::Function; + using LaunchBounds = typename dispatchsig::LaunchBounds; + using Args = typename dispatchsig::Args; using Tag = dispatch_impl::ParallelForDispatch; par_dispatch_impl::dispatch( @@ -894,19 +889,16 @@ inline void par_for_outer(const std::string &name, Args &&...args) { } template -KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t || - std::is_same_v || - std::is_same_v, - void> - par_for_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { - - using dispatchsig = meta::DispatchSignature>; - constexpr size_t Rank = dispatchsig::Rank::value; - using Function = typename dispatchsig::Function; - using LaunchBounds = typename dispatchsig::LaunchBounds; 
+KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(Pattern, team_mbr_t team_member, + AllArgs &&...args) { + + using DispatchSig = impl::DispatchSignature>; + constexpr std::size_t Rank = DispatchSig::Rank::value; + using Function = typename DispatchSig::Function; + using LaunchBounds = typename DispatchSig::LaunchBounds; + if constexpr (std::is_same_v) { - using Args = typename dispatchsig::Args; + using Args = typename DispatchSig::Args; par_dispatch_impl() .dispatch("simd", HostExecSpace(), std::forward(args)...); From a25ffef3b59a05cf4bd2445a4c29ba3d90f7699c Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 16 Aug 2024 00:37:52 +0200 Subject: [PATCH 41/99] template execution space --- src/kokkos_abstraction.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 07a405cc32d5..60ef568291cb 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -749,7 +749,8 @@ struct par_dispatch_impl, using DType = impl::DispatchType; - static inline void dispatch(std::string name, DevExecSpace exec_space, Bounds &&...ids, + template + static inline void dispatch(std::string name, ExecSpace exec_space, Bounds &&...ids, Function function, Args &&...args, const int scratch_level = 0, const std::size_t scratch_size_in_bytes = 0) { @@ -769,7 +770,8 @@ struct par_dispatch_impl, } }; - static inline auto policy(DevExecSpace exec_space, Bounds &&...ids, + template + static inline auto policy(ExecSpace exec_space, Bounds &&...ids, const int scratch_level = 0, const std::size_t scratch_size_in_bytes = 0) { From 7d49d4ee8e81822aa791600dbf539a8094891376 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 16 Aug 2024 00:46:47 +0200 Subject: [PATCH 42/99] linting --- src/kokkos_abstraction.hpp | 15 ++++----------- tst/unit/kokkos_abstraction.cpp | 3 +-- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 
60ef568291cb..00ff279165a7 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -401,7 +401,6 @@ class FlatFunctor {}; template class FlatFunctor, TypeList> { - static constexpr std::size_t Rank = sizeof...(Is); Kokkos::Array ranges; Kokkos::Array strides; @@ -491,7 +490,6 @@ template , std::integer_sequence, std::integer_sequence, Function, ParForOuter> { - static constexpr std::size_t Nteam = sizeof...(Iteam); static constexpr std::size_t Nthread = sizeof...(Ithread); static constexpr std::size_t Nvector = sizeof...(Ivector); @@ -671,7 +669,7 @@ class MDRange { } template - MDRange(Args... args) { + explicit MDRange(Args... args) { std::array indices{{static_cast(args)...}}; for (int i = 0; i < Rank; i++) { lower[i] = indices[2 * i]; @@ -710,7 +708,7 @@ struct SimdFor { MDRange mdrange; template - SimdFor(Args &&...args) : mdrange(std::forward(args)...) {} + explicit SimdFor(Args &&...args) : mdrange(std::forward(args)...) {} template inline void dispatch(Function &function) { @@ -746,7 +744,6 @@ template struct par_dispatch_impl, TypeList> { - using DType = impl::DispatchType; template @@ -754,7 +751,6 @@ struct par_dispatch_impl, Function function, Args &&...args, const int scratch_level = 0, const std::size_t scratch_size_in_bytes = 0) { - static_assert(!(DType::is_MDRange && Rank < 2), "Can not launch MDRange with Rank < 2"); Tag tag; @@ -768,13 +764,12 @@ struct par_dispatch_impl, functor(function, std::forward(ids)...), std::forward(args)...); } - }; + } template static inline auto policy(ExecSpace exec_space, Bounds &&...ids, const int scratch_level = 0, const std::size_t scratch_size_in_bytes = 0) { - if constexpr (DType::is_FlatRange) { int rangeNx = FlattenLaunchBound(std::forward(ids)...); return Kokkos::RangePolicy<>(exec_space, 0, rangeNx); @@ -793,7 +788,7 @@ struct par_dispatch_impl, } else { static_assert(always_false, "can't make policy for pattern"); } - }; + } static inline auto functor(Function function, Bounds &&...ids) 
{ if constexpr (DType::is_FlatRange) { @@ -871,7 +866,6 @@ inline std::enable_if_t::value, voi par_for_outer(Pattern, const std::string &name, DevExecSpace exec_space, std::size_t scratch_size_in_bytes, const int scratch_level, AllArgs &&...args) { - using dispatchsig = impl::DispatchSignature>; static constexpr std::size_t Rank = dispatchsig::Rank::value; using Function = typename dispatchsig::Function; @@ -893,7 +887,6 @@ inline void par_for_outer(const std::string &name, Args &&...args) { template KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { - using DispatchSig = impl::DispatchSignature>; constexpr std::size_t Rank = DispatchSig::Rank::value; using Function = typename DispatchSig::Function; diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 0af10c484da5..55d7caf031d2 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -26,6 +26,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Macros.hpp" + #include "basic_types.hpp" #include "kokkos_abstraction.hpp" #include "parthenon_array_generic.hpp" @@ -166,7 +167,6 @@ struct test_wrapper_nd_impl { template struct dispatch, parthenon::TypeList> { - template void execute(DevExecSpace exec_space, view_t &dev, int *int_bounds, parthenon::IndexRange *bounds) { @@ -517,7 +517,6 @@ struct test_wrapper_reduce_nd_impl { template struct dispatch, parthenon::TypeList> { - bool execute(DevExecSpace exec_space, const int h_sum, int *int_bounds, parthenon::IndexRange *bounds) { int test_sum = 0; From 72aa43789b6b9703ff32fbea503cfd4e140ab328 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 16 Aug 2024 00:58:59 +0200 Subject: [PATCH 43/99] put simd pragma in correct place --- src/kokkos_abstraction.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 00ff279165a7..bccf597b6b64 100644 --- a/src/kokkos_abstraction.hpp +++ 
b/src/kokkos_abstraction.hpp @@ -718,8 +718,8 @@ struct SimdFor { private: template inline void dispatch_simd(std::integer_sequence, Function &function) { - for (int i = mdrange.lower[Rank - 1]; i <= mdrange.upper[Rank - 1]; i++) { #pragma omp simd + for (int i = mdrange.lower[Rank - 1]; i <= mdrange.upper[Rank - 1]; i++) { function(indices[Is]..., i); } } From ed0be07688d9c85c04ecd5155f5de34cc0762ca5 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 16 Aug 2024 01:14:14 +0200 Subject: [PATCH 44/99] ContinuousSubListImpl made public --- src/utils/type_list.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp index 33af276e34cc..b9a33a068108 100644 --- a/src/utils/type_list.hpp +++ b/src/utils/type_list.hpp @@ -54,9 +54,8 @@ struct TypeList { (func(Args()), ...); } - private: template - static auto ContinuousSublist() { + static auto ContinuousSublistImpl() { return ContinuousSublistImpl(std::make_index_sequence()); } template @@ -64,9 +63,8 @@ struct TypeList { return sublist<(Start + Is)...>(); } - public: template - using continuous_sublist = decltype(ContinuousSublist()); + using continuous_sublist = decltype(ContinuousSublistImpl()); }; namespace impl { From fb5ccb2b86e1e1e8a88314ef0d32fba98c1612e6 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Wed, 21 Aug 2024 12:13:11 +0200 Subject: [PATCH 45/99] Borrowing features from upstream/lroberts36/generalize-par-dispatch --- src/kokkos_abstraction.hpp | 25 +++++++++++++ src/utils/indexer.hpp | 20 +++++++++++ src/utils/type_list.hpp | 72 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index bccf597b6b64..1b1a757589f6 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -396,6 +396,31 @@ struct DispatchType { } // namespace impl +// Struct for translating between loop bounds given in terms of IndexRanges and loop +// bounds 
given in terms of raw integers +template +struct BoundTranslator { + using Bound_tl = TypeList; + static constexpr bool are_integers = std::is_integral_v< + typename std::remove_reference>::type>; + static constexpr uint rank = sizeof...(Bound_ts) / (1 + are_integers); + static std::array GetIndexRanges(Bound_ts... bounds) { + if constexpr (are_integers) { + std::array bounds_arr{static_cast(bounds)...}; + std::array out; + for (int r = 0; r < rank; ++r) { + out[r].s = static_cast(bounds_arr[2 * r]); + out[r].e = static_cast(bounds_arr[2 * r + 1]); + } + return out; + } else { + return std::array{bounds...}; + } + } +}; + +template +struct BoundTranslator> : public BoundTranslator {}; template class FlatFunctor {}; diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index 43e4b613087c..ba0453431431 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -189,5 +189,25 @@ using Indexer8D = Indexer; using SpatiallyMaskedIndexer6D = SpatiallyMaskedIndexer; +template +auto MakeIndexer(const std::pair &...ranges) { + return Indexer(ranges...); +} + +namespace impl { +template +auto MakeIndexerIntImpl(std::array args, std::index_sequence) { + return MakeIndexer(std::pair(args[2 * Is], args[2 * Is + 1])...); +} +} // namespace impl + +template +auto MakeIndexerInt(Ts &&...args) { + static_assert(sizeof...(Ts) % 2 == 0, + "Must have an upper and lower end to each index range."); + return impl::MakeIndexerIntImpl(std::array{args...}, + std::make_index_sequence()); +} + } // namespace parthenon #endif // UTILS_INDEXER_HPP_ diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp index b9a33a068108..8db9684ce210 100644 --- a/src/utils/type_list.hpp +++ b/src/utils/type_list.hpp @@ -109,6 +109,17 @@ auto GetNames() { } namespace impl { +template +static constexpr int FirstNonIntegralImpl() { + if constexpr (cidx == TL::n_types) { + return TL::n_types; + } else { + if constexpr (std::is_integral_v>::type>) + return FirstNonIntegralImpl(); + return cidx; + 
} +} template auto ListOfType() { @@ -143,6 +154,67 @@ template using sequence_of_int_v = typename impl::SequenceOfInt>::value; +template +struct FuncSignature; + +template +struct FuncSignature : public FuncSignature {}; + +template +struct FuncSignature { + using type = R(Args...); + using arg_types_tl = TypeList; + using ret_type = R; +}; + +template +struct FuncSignature { + using type = R (T::*)(Args...); + using arg_types_tl = TypeList; + using ret_type = R; +}; + +template +static constexpr int FirstNonIntegralIdx() { + return impl::FirstNonIntegralImpl(); +} +template +struct is_functor : std::false_type {}; + +template +struct is_functor> : std::true_type {}; + +template +constexpr int FirstFuncIdx() { + if constexpr (idx == TL::n_types) { + return TL::n_types; + } else { + using cur_type = typename TL::template type; + if constexpr (is_functor::value) return idx; + if constexpr (std::is_function>::value) return idx; + return FirstFuncIdx(); + } +} + +template +struct FuncSignature; + +template +struct FuncSignature : public FuncSignature {}; + +template +struct FuncSignature { + using type = R(Args...); + using arg_types_tl = TypeList; + using ret_type = R; +}; + +template +struct FuncSignature { + using type = R (T::*)(Args...); + using arg_types_tl = TypeList; + using ret_type = R; +}; } // namespace parthenon #endif // UTILS_TYPE_LIST_HPP_ From aae696cf2288c851a842d5e69ee3f7c716e203b5 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Wed, 21 Aug 2024 15:16:27 +0200 Subject: [PATCH 46/99] rebuilding dispatch signature --- src/kokkos_abstraction.hpp | 147 +++++++++++++------------------------ src/utils/type_list.hpp | 98 +++++++++++++++++++++---- 2 files changed, 132 insertions(+), 113 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 1b1a757589f6..2baab823f9ef 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -33,7 +33,9 @@ #include "basic_types.hpp" #include "config.hpp" #include 
"parthenon_array_generic.hpp" +#include "utils/concepts_lite.hpp" #include "utils/error_checking.hpp" +#include "utils/indexer.hpp" #include "utils/instrument.hpp" #include "utils/multi_pointer.hpp" #include "utils/object_pool.hpp" @@ -286,93 +288,32 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { } } // namespace dispatch_impl -namespace impl { -// c++-20 has std:remove_cvref_t that does this same thing -template -using base_type = typename std::remove_cv_t>; - -template -struct FunctionSignature {}; - -template -struct FunctionSignature { - private: - static constexpr bool team_mbr = std::is_same_v>; - using TL = TypeList; - - public: - using IndexND = typename TL::template continuous_sublist<0, Rank + team_mbr - 1>; - using FArgs = typename TL::template continuous_sublist; -}; - -template -using function_signature = FunctionSignature::operator())>; - -template -struct GetLaunchBounds {}; - -template <> -struct GetLaunchBounds> { - using value = TypeList<>; - using NumInds = std::integral_constant; -}; - -template -struct GetLaunchBounds> { - private: - template - static constexpr bool is_BoundType() { - return std::numeric_limits::is_integer || std::is_same_v; - } - - template - static constexpr std::size_t NumBnds() { - if constexpr (!is_BoundType()) { - return 0; - } - return std::is_same_v ? 
2 : 1; - } - - template - using Rank_t = std::integral_constant; - - using bound_variants = std::variant; - using bound = base_type; - using LaunchBounds = GetLaunchBounds>; - - public: - using value = - typename std::conditional(), - insert_type_list_t, - TypeList<>>::type; - using NumInds = - std::conditional_t(), - Rank_t() + LaunchBounds::NumInds::value>, - Rank_t()>>; -}; - template struct DispatchSignature {}; template struct DispatchSignature> { private: - using LB = GetLaunchBounds>; using TL = TypeList; + static constexpr std::size_t func_idx = FirstFuncIdx(); public: - using LaunchBounds = typename LB::value; - using Rank = std::integral_constant; - using Function = typename TL::template type; - using Args = typename TL::template continuous_sublist; + using LaunchBounds = typename TL::template continuous_sublist<0, func_idx - 1>; + static constexpr std::size_t rank = GetNumBounds(LaunchBounds()) / 2; + using Rank = std::integral_constant; + using Function = typename TL::template type; + using Args = typename TL::template continuous_sublist; }; template struct DispatchType { + // c++-20 has std:remove_cvref_t that does this same thing + template + using base_type = typename std::remove_cv_t>; + using BoundType = typename TypeList::template type<0>; static constexpr bool is_IndexRangeBounds = - std::is_same>::value; + std::is_same>::value; static constexpr bool is_ParFor = std::is_same::value; static constexpr bool is_ParScan = @@ -394,28 +335,32 @@ struct DispatchType { static constexpr bool is_Collapse = TeamPattern::value; }; -} // namespace impl - // Struct for translating between loop bounds given in terms of IndexRanges and loop // bounds given in terms of raw integers template struct BoundTranslator { + private: + template + static void GetIndexRanges_impl(const int idx, std::array &out, + const int s, const int e, Bounds &&...bounds) { + GetIndexRanges_impl(idx + 1, out, std::forward(bounds)...); + out[idx].s = s; + out[idx].e = e; + } + template + 
static void GetIndexRanges_impl(const int idx, std::array &out, + const IndexRange ir, Bounds &&...bounds) { + GetIndexRanges_impl(idx + 1, out, std::forward(bounds)...); + out[idx] = ir; + } + + public: using Bound_tl = TypeList; - static constexpr bool are_integers = std::is_integral_v< - typename std::remove_reference>::type>; - static constexpr uint rank = sizeof...(Bound_ts) / (1 + are_integers); - static std::array GetIndexRanges(Bound_ts... bounds) { - if constexpr (are_integers) { - std::array bounds_arr{static_cast(bounds)...}; - std::array out; - for (int r = 0; r < rank; ++r) { - out[r].s = static_cast(bounds_arr[2 * r]); - out[r].e = static_cast(bounds_arr[2 * r + 1]); - } - return out; - } else { - return std::array{bounds...}; - } + static constexpr std::size_t rank = GetNumBounds(Bound_tl()) / 2; + static std::array GetIndexRanges(Bound_ts &&...bounds) { + std::array out; + GetIndexRanges_impl(0, out, std::forward(bounds)...); + return out; } }; @@ -479,9 +424,15 @@ class FlatFunctor, TypeList +using base_type = typename std::remove_cv_t>; +template +using function_signature = FunctionSignature::operator())>; + template KOKKOS_INLINE_FUNCTION auto MakeFlatFunctor(F &function, Bounds &&...bounds) { - using signature = impl::function_signature; + using signature = function_signature; using IndexND = typename signature::IndexND; return FlatFunctor, typename signature::FArgs>( function, std::forward(bounds)...); @@ -597,7 +548,7 @@ class CollapseFunctor, void operator()(team_mbr_t team_member) const { Kokkos::Array inds_team; recoverIndex(inds_team, team_member.league_rank()); - using signature = impl::function_signature; + using signature = function_signature; using ThreadVectorInds = typename signature::IndexND::template continuous_sublist; @@ -653,7 +604,7 @@ KOKKOS_INLINE_FUNCTION auto MakeCollapseFunctor(LoopPatternCollapse, F &function, Bounds &&...bounds) { constexpr std::size_t Rank = Nteam + Nthread + Nvector; - using signature = 
impl::function_signature; + using signature = function_signature; using IndexND = typename signature::IndexND; return CollapseFunctor, @@ -667,7 +618,7 @@ struct par_dispatch_inner {}; template struct par_dispatch_inner> { - using signature = impl::function_signature; + using signature = function_signature; using LPT = LoopPatternTeam>; static_assert(LPT::value, "unsupported inner loop pattern"); @@ -769,7 +720,7 @@ template struct par_dispatch_impl, TypeList> { - using DType = impl::DispatchType; + using DType = DispatchType; template static inline void dispatch(std::string name, ExecSpace exec_space, Bounds &&...ids, @@ -852,8 +803,8 @@ struct par_dispatch_impl, template inline void par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { - using dispatchsig = impl::DispatchSignature>; - static constexpr std::size_t Rank = dispatchsig::Rank::value; + using dispatchsig = DispatchSignature>; + constexpr std::size_t Rank = dispatchsig::Rank::value; using Function = typename dispatchsig::Function; using LaunchBounds = typename dispatchsig::LaunchBounds; using Args = typename dispatchsig::Args; @@ -891,7 +842,7 @@ inline std::enable_if_t::value, voi par_for_outer(Pattern, const std::string &name, DevExecSpace exec_space, std::size_t scratch_size_in_bytes, const int scratch_level, AllArgs &&...args) { - using dispatchsig = impl::DispatchSignature>; + using dispatchsig = DispatchSignature>; static constexpr std::size_t Rank = dispatchsig::Rank::value; using Function = typename dispatchsig::Function; using LaunchBounds = typename dispatchsig::LaunchBounds; @@ -912,7 +863,7 @@ inline void par_for_outer(const std::string &name, Args &&...args) { template KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { - using DispatchSig = impl::DispatchSignature>; + using DispatchSig = DispatchSignature>; constexpr std::size_t Rank = DispatchSig::Rank::value; using Function = typename DispatchSig::Function; 
using LaunchBounds = typename DispatchSig::LaunchBounds; diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp index 8db9684ce210..4d53810f1277 100644 --- a/src/utils/type_list.hpp +++ b/src/utils/type_list.hpp @@ -18,10 +18,18 @@ #include #include #include +#include #include +#include "basic_types.hpp" +#include "concepts_lite.hpp" + namespace parthenon { +// c++-20 has std:remove_cvref_t that does this same thing +template +using base_type = typename std::remove_cv_t>; + // Convenience struct for holding a variadic pack of types // and providing compile time indexing into that pack as // well as the ability to get the index of a given type within @@ -49,6 +57,19 @@ struct TypeList { } } + template + static constexpr bool IsIn() { + if constexpr (I == n_types) { + return false; + } else { + if constexpr (std::is_same_v>) { + return true; + } else { + return IsIn(); + } + } + } + template static void IterateTypes(F func) { (func(Args()), ...); @@ -184,6 +205,15 @@ struct is_functor : std::false_type {}; template struct is_functor> : std::true_type {}; +// non-integral BoundTypes to consider +using BoundTypes = TypeList; + +template +constexpr bool isBoundType() { + using btype = base_type; + return BoundTypes::template IsIn() || std::is_integral_v; +} + template constexpr int FirstFuncIdx() { if constexpr (idx == TL::n_types) { @@ -196,25 +226,63 @@ constexpr int FirstFuncIdx() { } } -template -struct FuncSignature; +template +constexpr std::size_t GetNumBounds(TypeList) { + using TL = TypeList; + if constexpr (sizeof...(Bnds) == 0) { + return 0; + } else { + using Bnd0 = typename TL::template type<0>; + if constexpr (std::is_same_v, IndexRange>) { + return 2 + GetNumBounds(typename TL::template continuous_sublist<1>()); + } else if constexpr (std::is_integral_v>) { + using Bnd1 = typename TL::template type<1>; + static_assert(std::is_integral_v>, + "integer launch bounds need to come in (start, end) pairs"); + return 2 + GetNumBounds(typename TL::template 
continuous_sublist<2>()); + } else { + static_assert(always_false, "launch bound type not supported"); + } + } +} -template -struct FuncSignature : public FuncSignature {}; +template +struct FunctionSignature {}; -template -struct FuncSignature { - using type = R(Args...); - using arg_types_tl = TypeList; - using ret_type = R; -}; +template +struct FunctionSignature { + private: + using team_mbr_t = Kokkos::TeamPolicy<>::member_type; + static constexpr bool team_mbr = std::is_same_v>; + using TL = TypeList; -template -struct FuncSignature { - using type = R (T::*)(Args...); - using arg_types_tl = TypeList; - using ret_type = R; + public: + using IndexND = typename TL::template continuous_sublist<0, Rank + team_mbr - 1>; + using FArgs = typename TL::template continuous_sublist; }; + +template +using function_signature = FunctionSignature::operator())>; + +/* template */ +/* struct FuncSignature; */ + +/* template */ +/* struct FuncSignature : public FuncSignature {}; */ + +/* template */ +/* struct FuncSignature { */ +/* using type = R(Args...); */ +/* using arg_types_tl = TypeList; */ +/* using ret_type = R; */ +/* }; */ + +/* template */ +/* struct FuncSignature { */ +/* using type = R (T::*)(Args...); */ +/* using arg_types_tl = TypeList; */ +/* using ret_type = R; */ +/* }; */ } // namespace parthenon #endif // UTILS_TYPE_LIST_HPP_ From 8fb8f5cef37bf17f5c86b2fc5318b53c04cae44f Mon Sep 17 00:00:00 2001 From: adam reyes Date: Wed, 21 Aug 2024 17:00:26 +0200 Subject: [PATCH 47/99] simdfor using indexer --- src/kokkos_abstraction.hpp | 134 ++++++++++++++++++++----------------- 1 file changed, 72 insertions(+), 62 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 2baab823f9ef..febe5ffab7b1 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -305,12 +305,13 @@ struct DispatchSignature> { using Args = typename TL::template continuous_sublist; }; -template +template struct DispatchType { // c++-20 has 
std:remove_cvref_t that does this same thing template using base_type = typename std::remove_cv_t>; + static constexpr std::size_t Rank = GetNumBounds(TypeList()) / 2; using BoundType = typename TypeList::template type<0>; static constexpr bool is_IndexRangeBounds = std::is_same>::value; @@ -343,15 +344,19 @@ struct BoundTranslator { template static void GetIndexRanges_impl(const int idx, std::array &out, const int s, const int e, Bounds &&...bounds) { - GetIndexRanges_impl(idx + 1, out, std::forward(bounds)...); out[idx].s = s; out[idx].e = e; + if constexpr (sizeof...(Bounds) > 0) { + GetIndexRanges_impl(idx + 1, out, std::forward(bounds)...); + } } template static void GetIndexRanges_impl(const int idx, std::array &out, const IndexRange ir, Bounds &&...bounds) { - GetIndexRanges_impl(idx + 1, out, std::forward(bounds)...); out[idx] = ir; + if constexpr (sizeof...(Bounds) > 0) { + GetIndexRanges_impl(idx + 1, out, std::forward(bounds)...); + } } public: @@ -675,69 +680,73 @@ inline auto MakeMDRangePolicy(DevExecSpace exec_space, Args &&...args) { .policy(Indices(), Ones(), exec_space); } -template -struct SimdFor { - template - using Sequence = std::make_index_sequence; - - std::array indices; - MDRange mdrange; - - template - explicit SimdFor(Args &&...args) : mdrange(std::forward(args)...) 
{} - - template - inline void dispatch(Function &function) { - dispatch_simd<1>(function); - } - - private: - template - inline void dispatch_simd(std::integer_sequence, Function &function) { +template +void SimdFor(std::index_sequence, Function function, + std::array bounds) { + if constexpr (Rank == 1) { #pragma omp simd - for (int i = mdrange.lower[Rank - 1]; i <= mdrange.upper[Rank - 1]; i++) { - function(indices[Is]..., i); + for (int i = bounds[0].s; i <= bounds[0].e; i++) { + function(i); } - } - - template - inline void dispatch_simd(Function &function) { - if constexpr (LoopCount < Rank) { - for (int i = mdrange.lower[LoopCount - 1]; i <= mdrange.upper[LoopCount - 1]; i++) { - indices[LoopCount - 1] = i; - dispatch_simd(function); + } else { + auto idxer = + MakeIndexer(std::pair(bounds[OuterIs].s, bounds[OuterIs].e)...); + for (int idx = 0; idx < idxer.size(); idx++) { + auto indices = std::tuple_cat(idxer(idx), std::tuple({0})); +#pragma omp simd + for (int i = bounds[0].s; i <= bounds[0].e; i++) { + int &j = std::get(indices); + j = i; + std::apply(function, indices); } - } else { - dispatch_simd(Sequence(), function); } } +} + +template +struct par_dispatch_functor {}; + +template +struct par_dispatch_functor, Function, + TypeList, TypeList> { + using dispatch_type = DispatchType; + void operator()(Pattern pattern, Bounds &&...bounds, Function function, + Args &&...args) { + constexpr std::size_t Rank = dispatch_type::Rank; + static_assert(!(dispatch_type::is_MDRange && Rank < 2), + "Can not launch MDRange with Rank < 2"); + } }; -template +template struct par_dispatch_impl {}; -template -struct par_dispatch_impl, - TypeList> { - using DType = DispatchType; +template +struct par_dispatch_impl, TypeList> { + using dispatch_type = DispatchType; + using bound_translator = BoundTranslator; + static constexpr std::size_t Rank = bound_translator::rank; template - static inline void dispatch(std::string name, ExecSpace exec_space, Bounds &&...ids, + static 
inline void dispatch(std::string name, ExecSpace exec_space, Bounds &&...bounds, Function function, Args &&...args, const int scratch_level = 0, const std::size_t scratch_size_in_bytes = 0) { - static_assert(!(DType::is_MDRange && Rank < 2), + static_assert(!(dispatch_type::is_MDRange && Rank < 2), "Can not launch MDRange with Rank < 2"); Tag tag; PARTHENON_INSTRUMENT_REGION(name) - if constexpr (DType::is_SimdFor) { - SimdFor(std::forward(ids)...).dispatch(function); + auto bound_arr = bound_translator::GetIndexRanges(std::forward(bounds)...); + if constexpr (dispatch_type::is_SimdFor) { + SimdFor(std::make_index_sequence(), function, bound_arr); } else { kokkos_dispatch(tag, name, - policy(exec_space, std::forward(ids)..., scratch_level, + policy(exec_space, std::forward(bounds)..., scratch_level, scratch_size_in_bytes), - functor(function, std::forward(ids)...), + functor(function, std::forward(bounds)...), std::forward(args)...); } } @@ -746,19 +755,19 @@ struct par_dispatch_impl, static inline auto policy(ExecSpace exec_space, Bounds &&...ids, const int scratch_level = 0, const std::size_t scratch_size_in_bytes = 0) { - if constexpr (DType::is_FlatRange) { + if constexpr (dispatch_type::is_FlatRange) { int rangeNx = FlattenLaunchBound(std::forward(ids)...); return Kokkos::RangePolicy<>(exec_space, 0, rangeNx); - } else if constexpr (DType::is_MDRange) { + } else if constexpr (dispatch_type::is_MDRange) { return MakeMDRangePolicy(exec_space, std::forward(ids)...); - } else if constexpr (DType::is_SimdFor) { + } else if constexpr (dispatch_type::is_SimdFor) { return loop_pattern_simdfor_tag; - } else if constexpr (DType::is_Collapse) { - int rangeNx = - FlattenLaunchBound(std::forward(ids)...); + } else if constexpr (dispatch_type::is_Collapse) { + int rangeNx = FlattenLaunchBound( + std::forward(ids)...); return team_policy(exec_space, rangeNx, Kokkos::AUTO) .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)); } else { @@ -767,14 +776,15 @@ 
struct par_dispatch_impl, } static inline auto functor(Function function, Bounds &&...ids) { - if constexpr (DType::is_FlatRange) { + if constexpr (dispatch_type::is_FlatRange) { return MakeFlatFunctor(function, std::forward(ids)...); - } else if constexpr (DType::is_MDRange || DType::is_SimdFor) { + } else if constexpr (dispatch_type::is_MDRange || dispatch_type::is_SimdFor) { return function; - } else if constexpr (DType::is_Collapse) { + } else if constexpr (dispatch_type::is_Collapse) { constexpr bool ParForOuter = std::is_same_v; - return MakeCollapseFunctor(typename DType::TeamPattern::LoopPattern(), - function, std::forward(ids)...); + return MakeCollapseFunctor( + typename dispatch_type::TeamPattern::LoopPattern(), function, + std::forward(ids)...); } else { static_assert(always_false, "can't make functor for pattern"); } @@ -785,7 +795,7 @@ struct par_dispatch_impl, static inline int FlattenLaunchBound(Bounds &&...ids) { static_assert(NCollapse <= Rank, "Can't flatten more loops than rank"); int rangeNx = 1; - if constexpr (DType::is_IndexRangeBounds) { + if constexpr (dispatch_type::is_IndexRangeBounds) { std::array ranges{{ids...}}; for (int i = 0; i < NCollapse; i++) { rangeNx *= ranges[i].e - ranges[i].s + 1; @@ -812,7 +822,7 @@ inline void par_dispatch(Pattern, std::string name, DevExecSpace exec_space, if constexpr (Rank > 1 && std::is_same_v) { static_assert(always_false, "par_scan only for 1D loops"); } - par_dispatch_impl::dispatch( + par_dispatch_impl::dispatch( name, exec_space, std::forward(args)...); } @@ -849,7 +859,7 @@ par_for_outer(Pattern, const std::string &name, DevExecSpace exec_space, using Args = typename dispatchsig::Args; using Tag = dispatch_impl::ParallelForDispatch; - par_dispatch_impl::dispatch( + par_dispatch_impl::dispatch( name, exec_space, std::forward(args)..., scratch_level, scratch_size_in_bytes); } @@ -870,8 +880,8 @@ KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(Pattern, team_mbr_t team_member, if constexpr 
(std::is_same_v) { using Args = typename DispatchSig::Args; - par_dispatch_impl() + par_dispatch_impl() .dispatch("simd", HostExecSpace(), std::forward(args)...); } else { par_dispatch_inner().dispatch( From 066b3d7219cfd68d410197c6ef26e3efd44e8cb6 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Wed, 21 Aug 2024 22:18:40 +0200 Subject: [PATCH 48/99] generalized tag disptach --- src/kokkos_abstraction.hpp | 277 ++++++++++++------------------------- src/utils/indexer.hpp | 11 ++ 2 files changed, 100 insertions(+), 188 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index febe5ffab7b1..0e4739ed36b8 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -174,6 +174,9 @@ static struct LoopPatternTPTVR { // inner Kokkos::ThreadVectorRange static struct LoopPatternTPTTRTVR { } loop_pattern_tpttrtvr_tag; +// Used as generic catch all for LoopPatternTeam<> +static struct LoopPatternTeamGeneric { +} loop_pattern_team_generic_tag; // Used to catch undefined behavior as it results in throwing an error static struct LoopPatternUndefined { } loop_pattern_undefined_tag; @@ -184,7 +187,10 @@ struct LoopPatternCollapse {}; // trait to track if pattern requests any type of hierarchial parallelism template -struct LoopPatternTeam : std::false_type {}; +struct LoopPatternTeam : std::false_type { + static constexpr std::size_t Nvector = 0; + static constexpr std::size_t Nthread = 0; +}; // This pattern needs to determine the team and thread/vector count at compile time // By contrast the others specify the thread/vector count at compile time and the @@ -307,14 +313,9 @@ struct DispatchSignature> { template struct DispatchType { - // c++-20 has std:remove_cvref_t that does this same thing - template - using base_type = typename std::remove_cv_t>; static constexpr std::size_t Rank = GetNumBounds(TypeList()) / 2; - using BoundType = typename TypeList::template type<0>; - static constexpr bool is_IndexRangeBounds = - 
std::is_same>::value; + static constexpr bool is_ParFor = std::is_same::value; static constexpr bool is_ParScan = @@ -328,12 +329,14 @@ struct DispatchType { std::integral_constant>; // false_type unless we use // an outer team policy - // fallback simd par_reduce to flat range and force par_scan to flat range - static constexpr bool is_FlatRange = - (IsFlatRange || (IsSimdFor && !is_ParFor)) || is_ParScan; - static constexpr bool is_SimdFor = (IsSimdFor && is_ParFor); - static constexpr bool is_MDRange = (IsMDRange && !is_ParScan); - static constexpr bool is_Collapse = TeamPattern::value; + static constexpr auto GetTag() { + // fallback simd par_reduce to flat range and force par_scan to flat range + if constexpr (IsFlatRange || (IsSimdFor && !is_ParFor)) + return loop_pattern_flatrange_tag; + if constexpr (IsSimdFor && is_ParFor) return loop_pattern_simdfor_tag; + if constexpr (IsMDRange && !is_ParScan) return loop_pattern_mdrange_tag; + if constexpr (TeamPattern::value) return loop_pattern_team_generic_tag; + } }; // Struct for translating between loop bounds given in terms of IndexRanges and loop @@ -371,63 +374,6 @@ struct BoundTranslator { template struct BoundTranslator> : public BoundTranslator {}; -template -class FlatFunctor {}; - -template -class FlatFunctor, TypeList> { - static constexpr std::size_t Rank = sizeof...(Is); - Kokkos::Array ranges; - Kokkos::Array strides; - Function function; - - public: - template - KOKKOS_INLINE_FUNCTION FlatFunctor(const Function _function, IndexRange idr, - Args... args) - : function(_function), ranges({{idr, args...}}) { - Initialize(); - } - - template - KOKKOS_INLINE_FUNCTION FlatFunctor(const Function _function, Args... 
args) - : function(_function) { - std::array indices{{static_cast(args)...}}; - for (int i = 0; i < Rank; i++) { - ranges[i] = {indices[2 * i], indices[2 * i + 1]}; - } - Initialize(); - } - - KOKKOS_INLINE_FUNCTION - void Initialize() { - for (int ri = 1; ri < Rank; ri++) { - const int N = ranges[ri].e - ranges[ri].s + 1; - strides[ri - 1] = N; - for (int rj = 0; rj < ri - 1; rj++) { - strides[rj] *= N; - } - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const int &idx, FArgs... fargs) const { - int inds[Rank]; - inds[0] = idx; - for (int i = 1; i < Rank; i++) { - inds[i] = idx; - inds[i - 1] /= strides[i - 1]; - for (int j = 0; j < i; j++) { - inds[i] -= inds[j] * strides[j]; - } - } - for (int i = 0; i < Rank; i++) { - inds[i] += ranges[i].s; - } - - function(inds[Is]..., std::forward(fargs)...); - } -}; // don't know why my LSP can't find this in type_list.hpp template @@ -435,14 +381,6 @@ using base_type = typename std::remove_cv_t> template using function_signature = FunctionSignature::operator())>; -template -KOKKOS_INLINE_FUNCTION auto MakeFlatFunctor(F &function, Bounds &&...bounds) { - using signature = function_signature; - using IndexND = typename signature::IndexND; - return FlatFunctor, typename signature::FArgs>( - function, std::forward(bounds)...); -} - template struct InnerFunctor {}; @@ -635,51 +573,6 @@ struct par_dispatch_inner> { } }; -template -class MDRange { - public: - Kokkos::Array lower, upper; - - template - MDRange(IndexRange idr, Args... args) { - std::array ranges{{idr, args...}}; - for (int i = 0; i < Rank; i++) { - lower[i] = ranges[i].s; - upper[i] = ranges[i].e; - } - } - - template - explicit MDRange(Args... 
args) { - std::array indices{{static_cast(args)...}}; - for (int i = 0; i < Rank; i++) { - lower[i] = indices[2 * i]; - upper[i] = indices[2 * i + 1]; - } - } - - template - auto policy(std::integer_sequence, - std::integer_sequence, DevExecSpace exec_space) { - return Kokkos::MDRangePolicy>( - exec_space, {lower[Is]...}, {1 + upper[Is]...}, - {ones..., upper[Rank - 1] + 1 - lower[Rank - 1]}); - } -}; - -template -inline auto MakeMDRange(Args &&...args) { - return MDRange(std::forward(args)...); -} - -template -inline auto MakeMDRangePolicy(DevExecSpace exec_space, Args &&...args) { - using Indices = typename std::make_index_sequence; - using Ones = sequence_of_int_v; - return MakeMDRange(std::forward(args)...) - .policy(Indices(), Ones(), exec_space); -} - template void SimdFor(std::index_sequence, Function function, std::array bounds) { @@ -720,12 +613,13 @@ struct par_dispatch_functor, Function, } }; -template +template struct par_dispatch_impl {}; template -struct par_dispatch_impl, TypeList> { + typename... Args, typename... 
ExtraFuncArgs> +struct par_dispatch_impl, TypeList, + TypeList> { using dispatch_type = DispatchType; using bound_translator = BoundTranslator; static constexpr std::size_t Rank = bound_translator::rank; @@ -735,78 +629,82 @@ struct par_dispatch_impl, TypeList(bounds)...); - if constexpr (dispatch_type::is_SimdFor) { + constexpr auto tag = dispatch_type::GetTag(); + if constexpr (std::is_same_v>) { SimdFor(std::make_index_sequence(), function, bound_arr); } else { - kokkos_dispatch(tag, name, - policy(exec_space, std::forward(bounds)..., scratch_level, - scratch_size_in_bytes), - functor(function, std::forward(bounds)...), - std::forward(args)...); + dispatch(tag, std::make_index_sequence(), + std::make_index_sequence(), name, exec_space, bound_arr, function, + std::forward(args)...); } } - template - static inline auto policy(ExecSpace exec_space, Bounds &&...ids, - const int scratch_level = 0, - const std::size_t scratch_size_in_bytes = 0) { - if constexpr (dispatch_type::is_FlatRange) { - int rangeNx = FlattenLaunchBound(std::forward(ids)...); - return Kokkos::RangePolicy<>(exec_space, 0, rangeNx); - - } else if constexpr (dispatch_type::is_MDRange) { - return MakeMDRangePolicy(exec_space, std::forward(ids)...); - - } else if constexpr (dispatch_type::is_SimdFor) { - return loop_pattern_simdfor_tag; - - } else if constexpr (dispatch_type::is_Collapse) { - int rangeNx = FlattenLaunchBound( - std::forward(ids)...); - return team_policy(exec_space, rangeNx, Kokkos::AUTO) - .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)); - } else { - static_assert(always_false, "can't make policy for pattern"); - } + private: + template + using sequence = std::integer_sequence; + + template + static inline void + dispatch(LoopPatternFlatRange, sequence, sequence, + std::string name, ExecSpace exec_space, std::array bound_arr, + Function function, Args &&...args) { + auto idxer = MakeIndexer(bound_arr); + kokkos_dispatch( + Tag(), name, 
Kokkos::RangePolicy<>(exec_space, 0, idxer.size()), + KOKKOS_LAMBDA(const int idx, ExtraFuncArgs... fargs) { + auto idx_tuple = idxer(idx); + function(std::get(idx_tuple)..., + std::forward(fargs)...); + }, + std::forward(args)...); } - static inline auto functor(Function function, Bounds &&...ids) { - if constexpr (dispatch_type::is_FlatRange) { - return MakeFlatFunctor(function, std::forward(ids)...); - } else if constexpr (dispatch_type::is_MDRange || dispatch_type::is_SimdFor) { - return function; - } else if constexpr (dispatch_type::is_Collapse) { - constexpr bool ParForOuter = std::is_same_v; - return MakeCollapseFunctor( - typename dispatch_type::TeamPattern::LoopPattern(), function, - std::forward(ids)...); - } else { - static_assert(always_false, "can't make functor for pattern"); - } + template + static inline void + dispatch(LoopPatternMDRange, sequence, sequence, + std::string name, ExecSpace exec_space, std::array bound_arr, + Function function, Args &&...args) { + auto idxer = MakeIndexer(bound_arr); + kokkos_dispatch( + Tag(), name, Kokkos::RangePolicy<>(exec_space, 0, idxer.size()), + KOKKOS_LAMBDA(const int idx, ExtraFuncArgs... 
fargs) { + auto idx_tuple = idxer(idx); + function(std::get(idx_tuple)..., + std::forward(fargs)...); + }, + std::forward(args)...); } - private: - template - static inline int FlattenLaunchBound(Bounds &&...ids) { - static_assert(NCollapse <= Rank, "Can't flatten more loops than rank"); - int rangeNx = 1; - if constexpr (dispatch_type::is_IndexRangeBounds) { - std::array ranges{{ids...}}; - for (int i = 0; i < NCollapse; i++) { - rangeNx *= ranges[i].e - ranges[i].s + 1; - } + template + static inline void + dispatch(LoopPatternTeamGeneric, sequence, sequence, + std::string name, ExecSpace exec_space, std::array bound_arr, + Function function, Args &&...args) { + auto idxer = + MakeIndexer(std::array{bound_arr[OuterIs]...}); + constexpr bool ParForOuter = std::is_same_v; + if constexpr (ParForOuter) { + kokkos_dispatch( + Tag(), name, team_policy(exec_space, idxer.size(), Kokkos::AUTO), + KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... fargs) { + auto idx_tuple = idxer(team_member.league_rank()); + function(team_member, std::get(idx_tuple)..., + std::forward(fargs)...); + }, + std::forward(args)...); } else { - int indices[sizeof...(Bounds)] = {static_cast(ids)...}; - for (int i = 0; i < 2 * NCollapse; i += 2) { - rangeNx *= indices[i + 1] - indices[i] + 1; - } + constexpr std::size_t Nouter = Rank - dispatch_type::TeamPattern::Nvector - + dispatch_type::TeamPattern::Nthread; + kokkos_dispatch(Tag(), name, team_policy(exec_space, idxer.size(), Kokkos::AUTO), + MakeCollapseFunctor( + typename dispatch_type::TeamPattern::LoopPattern(), function, + bound_arr[OuterIs]..., bound_arr[Nouter + InnerIs]...), + std::forward(args)...); } - return rangeNx; } }; @@ -818,11 +716,12 @@ inline void par_dispatch(Pattern, std::string name, DevExecSpace exec_space, using Function = typename dispatchsig::Function; using LaunchBounds = typename dispatchsig::LaunchBounds; using Args = typename dispatchsig::Args; + using ExtraFuncArgs = typename function_signature::FArgs; if 
constexpr (Rank > 1 && std::is_same_v) { static_assert(always_false, "par_scan only for 1D loops"); } - par_dispatch_impl::dispatch( + par_dispatch_impl::dispatch( name, exec_space, std::forward(args)...); } @@ -858,8 +757,9 @@ par_for_outer(Pattern, const std::string &name, DevExecSpace exec_space, using LaunchBounds = typename dispatchsig::LaunchBounds; using Args = typename dispatchsig::Args; using Tag = dispatch_impl::ParallelForDispatch; + using ExtraFuncArgs = typename function_signature::FArgs; - par_dispatch_impl::dispatch( + par_dispatch_impl::dispatch( name, exec_space, std::forward(args)..., scratch_level, scratch_size_in_bytes); } @@ -880,8 +780,9 @@ KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(Pattern, team_mbr_t team_member, if constexpr (std::is_same_v) { using Args = typename DispatchSig::Args; + using ExtraFuncArgs = typename function_signature::FArgs; par_dispatch_impl() + LaunchBounds, Args, ExtraFuncArgs>() .dispatch("simd", HostExecSpace(), std::forward(args)...); } else { par_dispatch_inner().dispatch( diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index ba0453431431..6f39c5097ae6 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -194,6 +194,17 @@ auto MakeIndexer(const std::pair &...ranges) { return Indexer(ranges...); } +template +auto MakeIndexer(std::array bounds_arr, + std::integer_sequence) { + return MakeIndexer(std::pair(bounds_arr[Is].s, bounds_arr[Is].e)...); +} + +template +auto MakeIndexer(std::array bounds_arr) { + return MakeIndexer(bounds_arr, std::make_index_sequence()); +} + namespace impl { template auto MakeIndexerIntImpl(std::array args, std::index_sequence) { From 54a3c01cec1296f411fc11c462b518cbd219033c Mon Sep 17 00:00:00 2001 From: adam reyes Date: Wed, 21 Aug 2024 23:31:56 +0200 Subject: [PATCH 49/99] dispatch_collapse added for team patterns --- src/kokkos_abstraction.hpp | 101 ++++++++++++++++++++++++++----------- src/utils/type_list.hpp | 18 +++---- 2 files changed, 81 insertions(+), 38 
deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 0e4739ed36b8..e414f74dd965 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -30,6 +30,7 @@ #include +#include "Kokkos_Macros.hpp" #include "basic_types.hpp" #include "config.hpp" #include "parthenon_array_generic.hpp" @@ -316,20 +317,22 @@ struct DispatchType { static constexpr std::size_t Rank = GetNumBounds(TypeList()) / 2; - static constexpr bool is_ParFor = - std::is_same::value; - static constexpr bool is_ParScan = - std::is_same::value; - - static constexpr bool IsFlatRange = std::is_same::value; - static constexpr bool IsMDRange = std::is_same::value; - static constexpr bool IsSimdFor = std::is_same::value; using TeamPattern = LoopPatternTeam>; // false_type unless we use // an outer team policy + // check any confilcts with the requested pattern + // and return the actual one we use static constexpr auto GetTag() { + constexpr bool is_ParFor = + std::is_same::value; + constexpr bool is_ParScan = + std::is_same::value; + + constexpr bool IsFlatRange = std::is_same::value; + constexpr bool IsMDRange = std::is_same::value; + constexpr bool IsSimdFor = std::is_same::value; // fallback simd par_reduce to flat range and force par_scan to flat range if constexpr (IsFlatRange || (IsSimdFor && !is_ParFor)) return loop_pattern_flatrange_tag; @@ -344,6 +347,8 @@ struct DispatchType { template struct BoundTranslator { private: + // overloads for different launch bound types. 
+ // should also be counted by isBoundType & GetNumBounds in type_list.hpp template static void GetIndexRanges_impl(const int idx, std::array &out, const int s, const int e, Bounds &&...bounds) { @@ -375,12 +380,6 @@ struct BoundTranslator { template struct BoundTranslator> : public BoundTranslator {}; -// don't know why my LSP can't find this in type_list.hpp -template -using base_type = typename std::remove_cv_t>; -template -using function_signature = FunctionSignature::operator())>; - template struct InnerFunctor {}; @@ -573,6 +572,50 @@ struct par_dispatch_inner> { } }; +template +KOKKOS_FORCEINLINE_FUNCTION void +dispatch_collapse(std::integer_sequence, + std::integer_sequence, team_mbr_t team_member, + IdxTeam idxer_team, std::array bound_arr, + Function function) { + constexpr std::size_t Nthread = sizeof...(ThreadIs); + constexpr std::size_t Nvector = sizeof...(VectorIs); + auto inds_team = idxer_team(team_member.league_rank()); + if constexpr (Nthread > 0) { + auto idxer_thread = + MakeIndexer(std::pair(bound_arr[ThreadIs].s, bound_arr[ThreadIs].e)...); + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, 0, idxer_thread.size()), + [&](const int idThread) { + auto inds_thread = idxer_thread(idThread); + if constexpr (Nvector > 0) { + auto idxer_vector = MakeIndexer(std::pair( + bound_arr[Nthread + VectorIs].s, bound_arr[Nthread + VectorIs].e)...); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team_member, 0, idxer_vector.size()), + [&](const int idVector) { + auto inds_all = + std::tuple_cat(inds_team, inds_thread, idxer_vector(idVector)); + std::apply(function, inds_all); + }); + } else { + auto inds_all = std::tuple_cat(inds_team, inds_thread); + std::apply(function, inds_all); + } + }); + } else { + auto idxer_vector = MakeIndexer(std::pair( + bound_arr[Nthread + VectorIs].s, bound_arr[Nthread + VectorIs].e)...); + Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), + [&](const int idVector) { + auto 
inds_all = + std::tuple_cat(inds_team, idxer_vector(idVector)); + std::apply(function, inds_all); + }); + } +} + template void SimdFor(std::index_sequence, Function function, std::array bounds) { @@ -687,24 +730,24 @@ struct par_dispatch_impl, TypeList{bound_arr[OuterIs]...}); constexpr bool ParForOuter = std::is_same_v; - if constexpr (ParForOuter) { - kokkos_dispatch( - Tag(), name, team_policy(exec_space, idxer.size(), Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... fargs) { + kokkos_dispatch( + Tag(), name, team_policy(exec_space, idxer.size(), Kokkos::AUTO), + KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... fargs) { + if constexpr (ParForOuter) { auto idx_tuple = idxer(team_member.league_rank()); function(team_member, std::get(idx_tuple)..., std::forward(fargs)...); - }, - std::forward(args)...); - } else { - constexpr std::size_t Nouter = Rank - dispatch_type::TeamPattern::Nvector - - dispatch_type::TeamPattern::Nthread; - kokkos_dispatch(Tag(), name, team_policy(exec_space, idxer.size(), Kokkos::AUTO), - MakeCollapseFunctor( - typename dispatch_type::TeamPattern::LoopPattern(), function, - bound_arr[OuterIs]..., bound_arr[Nouter + InnerIs]...), - std::forward(args)...); - } + } else { + using TeamPattern = typename dispatch_type::TeamPattern; + constexpr std::size_t Nvector = TeamPattern::Nvector; + constexpr std::size_t Nthread = TeamPattern::Nthread; + constexpr std::size_t Nouter = Rank - Nvector - Nthread; + dispatch_collapse( + std::make_index_sequence(), std::make_index_sequence(), + team_member, idxer, {bound_arr[Nouter + InnerIs]...}, function); + } + }, + std::forward(args)...); } }; diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp index 4d53810f1277..5756390f93eb 100644 --- a/src/utils/type_list.hpp +++ b/src/utils/type_list.hpp @@ -205,15 +205,6 @@ struct is_functor : std::false_type {}; template struct is_functor> : std::true_type {}; -// non-integral BoundTypes to consider -using BoundTypes = 
TypeList; - -template -constexpr bool isBoundType() { - using btype = base_type; - return BoundTypes::template IsIn() || std::is_integral_v; -} - template constexpr int FirstFuncIdx() { if constexpr (idx == TL::n_types) { @@ -226,6 +217,15 @@ constexpr int FirstFuncIdx() { } } +// Recognized bound types +// additional types should be translated in BoundTranslator (kokkos_abstraction.hpp) +template +constexpr bool isBoundType() { + using BoundTypes = TypeList; + using btype = base_type; + return BoundTypes::template IsIn() || std::is_integral_v; +} + template constexpr std::size_t GetNumBounds(TypeList) { using TL = TypeList; From b2a6e493160527f7a7ae58124affb3ea2eefe728 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 22 Aug 2024 00:18:52 +0200 Subject: [PATCH 50/99] par_for_inner with new dispatch --- src/kokkos_abstraction.hpp | 244 +++++++------------------------------ src/utils/indexer.hpp | 6 + 2 files changed, 47 insertions(+), 203 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index e414f74dd965..87b76fd5f58d 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -298,10 +298,10 @@ inline void kokkos_dispatch(ParallelScanDispatch, Args &&...args) { template struct DispatchSignature {}; -template -struct DispatchSignature> { +template +struct DispatchSignature> { private: - using TL = TypeList; + using TL = TypeList; static constexpr std::size_t func_idx = FirstFuncIdx(); public: @@ -400,178 +400,6 @@ struct InnerFunctor, } }; -template -class CollapseFunctor {}; - -template -class CollapseFunctor, - std::integer_sequence, - std::integer_sequence, Function, ParForOuter> { - static constexpr std::size_t Nteam = sizeof...(Iteam); - static constexpr std::size_t Nthread = sizeof...(Ithread); - static constexpr std::size_t Nvector = sizeof...(Ivector); - static constexpr std::size_t Rank = Nteam + Nthread + Nvector; - - Kokkos::Array ranges; - Kokkos::Array strides; - Function function; - - public: - 
template - KOKKOS_INLINE_FUNCTION CollapseFunctor(const Function _function, IndexRange idr, - Args... args) - : function(_function), ranges({{idr, args...}}) { - Initialize(); - } - - template - KOKKOS_INLINE_FUNCTION CollapseFunctor(const Function _function, Args... args) - : function(_function) { - std::array indices{{static_cast(args)...}}; - for (int i = 0; i < Rank; i++) { - ranges[i] = {indices[2 * i], indices[2 * i + 1]}; - } - Initialize(); - } - - KOKKOS_INLINE_FUNCTION - void Initialize() { - if constexpr (Rank > 1) { - for (int ri = 0; ri < Nteam - 1; ri++) { - const int N = ranges[ri + 1].e - ranges[ri + 1].s + 1; - strides[ri] = N; - for (int rj = 0; rj < ri; rj++) { - strides[rj] *= N; - } - } - for (int ri = Nteam; ri < Nteam + Nthread - 1; ri++) { - const int N = ranges[ri + 1].e - ranges[ri + 1].s + 1; - strides[ri] = N; - for (int rj = Nteam; rj < ri; rj++) { - strides[rj] *= N; - } - } - for (int ri = Nteam + Nthread; ri < Rank - 1; ri++) { - const int N = ranges[ri + 1].e - ranges[ri + 1].s + 1; - strides[ri] = N; - for (int rj = Nteam + Nthread; rj < ri; rj++) { - strides[rj] *= N; - } - } - } - } - - template - KOKKOS_INLINE_FUNCTION void recoverIndex(Kokkos::Array &inds, int idx) const { - inds[0] = idx; - for (int i = 1; i < N; i++) { - inds[i] = idx; - inds[i - 1] /= strides[i - 1 + start]; - for (int j = 0; j < i; j++) { - inds[i] -= inds[j] * strides[j + start]; - } - } - for (int i = 0; i < N; i++) { - inds[i] += ranges[i + start].s; - } - } - - KOKKOS_INLINE_FUNCTION - int FlattenLaunchBound(int start, int end) const { - int rangeNx = 1; - for (int i = start; i < end; i++) { - rangeNx *= ranges[i].e - ranges[i].s + 1; - } - return rangeNx; - } - - KOKKOS_INLINE_FUNCTION - void operator()(team_mbr_t team_member) const { - Kokkos::Array inds_team; - recoverIndex(inds_team, team_member.league_rank()); - using signature = function_signature; - using ThreadVectorInds = - typename signature::IndexND::template continuous_sublist; - - if 
constexpr (ParForOuter) { - function(team_member, inds_team[Iteam]...); - } else { - collapse_inner( - team_member, - InnerFunctor>( - inds_team, function)); - } - } - - template - KOKKOS_INLINE_FUNCTION void collapse_inner(team_mbr_t team_member, - InnerFunction inner_function) const { - if constexpr (Nthread > 0) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, 0, - FlattenLaunchBound(Nteam, Nteam + Nthread)), - [&](const int idThread) { - Kokkos::Array inds_thread; - recoverIndex(inds_thread, idThread); - if constexpr (Nvector > 0) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team_member, 0, - FlattenLaunchBound(Nteam + Nthread, Rank)), - [&](const int idVector) { - Kokkos::Array inds_vector; - recoverIndex(inds_vector, idVector); - inner_function(inds_thread[Ithread]..., inds_vector[Ivector]...); - }); - } else { - inner_function(inds_thread[Ithread]...); - } - }); - } else { - Kokkos::parallel_for(Kokkos::TeamVectorRange( - team_member, 0, FlattenLaunchBound(Nteam + Nthread, Rank)), - [&](const int idVector) { - Kokkos::Array inds_vector; - recoverIndex(inds_vector, - idVector); - inner_function(inds_vector[Ivector]...); - }); - } - } -}; - -template -KOKKOS_INLINE_FUNCTION auto -MakeCollapseFunctor(LoopPatternCollapse, F &function, - Bounds &&...bounds) { - constexpr std::size_t Rank = Nteam + Nthread + Nvector; - using signature = function_signature; - using IndexND = typename signature::IndexND; - - return CollapseFunctor, - std::make_index_sequence, - std::make_index_sequence, F, ParForOuter>( - function, std::forward(bounds)...); -} - -template -struct par_dispatch_inner {}; - -template -struct par_dispatch_inner> { - using signature = function_signature; - using LPT = LoopPatternTeam>; - - static_assert(LPT::value, "unsupported inner loop pattern"); - KOKKOS_FORCEINLINE_FUNCTION - void dispatch(team_mbr_t team_member, Bounds &&...bounds, Function function) const { - MakeCollapseFunctor(typename LPT::LoopPattern(), function, - 
std::forward(bounds)...) - .collapse_inner(team_member, function); - } -}; - template KOKKOS_FORCEINLINE_FUNCTION void @@ -639,23 +467,47 @@ void SimdFor(std::index_sequence, Function function, } } -template -struct par_dispatch_functor {}; +template +struct par_disp_inner_impl {}; -template -struct par_dispatch_functor, Function, - TypeList, TypeList> { - using dispatch_type = DispatchType; - void operator()(Pattern pattern, Bounds &&...bounds, Function function, - Args &&...args) { - constexpr std::size_t Rank = dispatch_type::Rank; - static_assert(!(dispatch_type::is_MDRange && Rank < 2), - "Can not launch MDRange with Rank < 2"); +template +struct par_disp_inner_impl, TypeList, + TypeList> { + using bound_translator = BoundTranslator; + static constexpr std::size_t Rank = bound_translator::rank; + using TeamPattern = LoopPatternTeam>; + + KOKKOS_FORCEINLINE_FUNCTION void dispatch(team_mbr_t team_member, Bounds &&...bounds, + Function function, Args &&...args) { + // TODO(acreyes): I don't think this static method will wokr on device... 
+ auto bound_arr = bound_translator::GetIndexRanges(std::forward(bounds)...); + if constexpr (std::is_same_v) { + SimdFor(std::make_index_sequence(), function, bound_arr); + } else { + auto idxer = Indexer<>(); + constexpr std::size_t Nthread = TeamPattern::Nthread; + constexpr std::size_t Nvector = TeamPattern::Nvector; + dispatch_collapse(std::make_index_sequence(), + std::make_index_sequence(), team_member, idxer, + bound_arr, function); + } } }; +template +KOKKOS_FORCEINLINE_FUNCTION void par_disp_inner(Pattern, team_mbr_t team_member, + AllArgs &&...args) { + using dispatchsig = DispatchSignature>; + constexpr std::size_t Rank = dispatchsig::Rank::value; + using Function = typename dispatchsig::Function; + using LaunchBounds = typename dispatchsig::LaunchBounds; + using Args = typename dispatchsig::Args; + using ExtraFuncArgs = typename function_signature::FArgs; + par_disp_inner_impl().dispatch( + team_member, std::forward(args)...); +} + template struct par_dispatch_impl {}; @@ -816,21 +668,7 @@ inline void par_for_outer(const std::string &name, Args &&...args) { template KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { - using DispatchSig = DispatchSignature>; - constexpr std::size_t Rank = DispatchSig::Rank::value; - using Function = typename DispatchSig::Function; - using LaunchBounds = typename DispatchSig::LaunchBounds; - - if constexpr (std::is_same_v) { - using Args = typename DispatchSig::Args; - using ExtraFuncArgs = typename function_signature::FArgs; - par_dispatch_impl() - .dispatch("simd", HostExecSpace(), std::forward(args)...); - } else { - par_dispatch_inner().dispatch( - team_member, std::forward(args)...); - } + par_disp_inner(Pattern(), team_member, std::forward(args)...); } template diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index 6f39c5097ae6..21b83b6b492f 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -149,6 +149,12 @@ struct Indexer { std::size_t 
_size; }; +template <> +struct Indexer<> { + KOKKOS_FORCEINLINE_FUNCTION + std::tuple<> operator()(int idx) const { return std::tuple<>(); } +}; + template class SpatiallyMaskedIndexer : public Indexer { public: From 5e3e8b2980d6724f5689d77350ff50105e5c5748 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 22 Aug 2024 00:34:50 +0200 Subject: [PATCH 51/99] mdrange dispatch --- src/kokkos_abstraction.hpp | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 87b76fd5f58d..11f9e8e348d8 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -33,6 +33,7 @@ #include "Kokkos_Macros.hpp" #include "basic_types.hpp" #include "config.hpp" +#include "impl/Kokkos_Tools_Generic.hpp" #include "parthenon_array_generic.hpp" #include "utils/concepts_lite.hpp" #include "utils/error_checking.hpp" @@ -350,8 +351,9 @@ struct BoundTranslator { // overloads for different launch bound types. 
// should also be counted by isBoundType & GetNumBounds in type_list.hpp template - static void GetIndexRanges_impl(const int idx, std::array &out, - const int s, const int e, Bounds &&...bounds) { + KOKKOS_INLINE_FUNCTION void + GetIndexRanges_impl(const int idx, std::array &out, const int s, + const int e, Bounds &&...bounds) { out[idx].s = s; out[idx].e = e; if constexpr (sizeof...(Bounds) > 0) { @@ -359,8 +361,9 @@ struct BoundTranslator { } } template - static void GetIndexRanges_impl(const int idx, std::array &out, - const IndexRange ir, Bounds &&...bounds) { + KOKKOS_INLINE_FUNCTION void + GetIndexRanges_impl(const int idx, std::array &out, const IndexRange ir, + Bounds &&...bounds) { out[idx] = ir; if constexpr (sizeof...(Bounds) > 0) { GetIndexRanges_impl(idx + 1, out, std::forward(bounds)...); @@ -370,7 +373,9 @@ struct BoundTranslator { public: using Bound_tl = TypeList; static constexpr std::size_t rank = GetNumBounds(Bound_tl()) / 2; - static std::array GetIndexRanges(Bound_ts &&...bounds) { + + KOKKOS_INLINE_FUNCTION + std::array GetIndexRanges(Bound_ts &&...bounds) { std::array out; GetIndexRanges_impl(0, out, std::forward(bounds)...); return out; @@ -481,7 +486,7 @@ struct par_disp_inner_impl, TypeList(bounds)...); + auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); if constexpr (std::is_same_v) { SimdFor(std::make_index_sequence(), function, bound_arr); } else { @@ -527,7 +532,7 @@ struct par_dispatch_impl, TypeList(bounds)...); + auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); constexpr auto tag = dispatch_type::GetTag(); if constexpr (std::is_same_v>) { SimdFor(std::make_index_sequence(), function, bound_arr); @@ -547,6 +552,7 @@ struct par_dispatch_impl, TypeList, sequence, std::string name, ExecSpace exec_space, std::array bound_arr, Function function, Args &&...args) { + static_assert(sizeof...(InnerIs) == 0); auto idxer = MakeIndexer(bound_arr); kokkos_dispatch( Tag(), name, 
Kokkos::RangePolicy<>(exec_space, 0, idxer.size()), @@ -563,15 +569,12 @@ struct par_dispatch_impl, TypeList, sequence, std::string name, ExecSpace exec_space, std::array bound_arr, Function function, Args &&...args) { - auto idxer = MakeIndexer(bound_arr); + static_assert(sizeof...(InnerIs) == 0); kokkos_dispatch( - Tag(), name, Kokkos::RangePolicy<>(exec_space, 0, idxer.size()), - KOKKOS_LAMBDA(const int idx, ExtraFuncArgs... fargs) { - auto idx_tuple = idxer(idx); - function(std::get(idx_tuple)..., - std::forward(fargs)...); - }, - std::forward(args)...); + Tag(), name, + Kokkos::MDRangePolicy>(exec_space, {bound_arr[OuterIs].s...}, + {(1 + bound_arr[OuterIs].e)...}), + function, std::forward(args)...); } template From d626fed8affccbb69d60ecda5cdc585c31f5fb63 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 22 Aug 2024 10:55:14 +0200 Subject: [PATCH 52/99] array indexer indices --- src/kokkos_abstraction.hpp | 73 ++++++++++++++++++++------------------ src/utils/indexer.hpp | 46 ++++++++++++++++++++---- 2 files changed, 78 insertions(+), 41 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 11f9e8e348d8..116ba36f384b 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -405,36 +405,37 @@ struct InnerFunctor, } }; -template +template KOKKOS_FORCEINLINE_FUNCTION void -dispatch_collapse(std::integer_sequence, +dispatch_collapse(std::integer_sequence, + std::integer_sequence, std::integer_sequence, team_mbr_t team_member, IdxTeam idxer_team, std::array bound_arr, Function function) { + constexpr std::size_t Nteam = sizeof...(TeamIs); constexpr std::size_t Nthread = sizeof...(ThreadIs); constexpr std::size_t Nvector = sizeof...(VectorIs); - auto inds_team = idxer_team(team_member.league_rank()); + auto inds_team = idxer_team.GetIdxArray(team_member.league_rank()); if constexpr (Nthread > 0) { auto idxer_thread = MakeIndexer(std::pair(bound_arr[ThreadIs].s, bound_arr[ThreadIs].e)...); Kokkos::parallel_for( 
Kokkos::TeamThreadRange<>(team_member, 0, idxer_thread.size()), [&](const int idThread) { - auto inds_thread = idxer_thread(idThread); + const auto inds_thread = idxer_thread.GetIdxArray(idThread); if constexpr (Nvector > 0) { auto idxer_vector = MakeIndexer(std::pair( bound_arr[Nthread + VectorIs].s, bound_arr[Nthread + VectorIs].e)...); Kokkos::parallel_for( Kokkos::ThreadVectorRange(team_member, 0, idxer_vector.size()), [&](const int idVector) { - auto inds_all = - std::tuple_cat(inds_team, inds_thread, idxer_vector(idVector)); - std::apply(function, inds_all); + const auto inds_vector = idxer_vector.GetIdxArray(idVector); + function(inds_team[TeamIs]..., inds_thread[ThreadIs]..., + inds_vector[VectorIs]...); }); } else { - auto inds_all = std::tuple_cat(inds_team, inds_thread); - std::apply(function, inds_all); + function(inds_team[TeamIs]..., inds_thread[ThreadIs]...); } }); } else { @@ -442,9 +443,8 @@ dispatch_collapse(std::integer_sequence, bound_arr[Nthread + VectorIs].s, bound_arr[Nthread + VectorIs].e)...); Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), [&](const int idVector) { - auto inds_all = - std::tuple_cat(inds_team, idxer_vector(idVector)); - std::apply(function, inds_all); + const auto inds_vector = idxer_vector.GetIdxArray(idVector); + function(inds_team[TeamIs]..., inds_vector[VectorIs]...); }); } } @@ -461,12 +461,10 @@ void SimdFor(std::index_sequence, Function function, auto idxer = MakeIndexer(std::pair(bounds[OuterIs].s, bounds[OuterIs].e)...); for (int idx = 0; idx < idxer.size(); idx++) { - auto indices = std::tuple_cat(idxer(idx), std::tuple({0})); + const auto indices = idxer.GetIdxArray(idx); #pragma omp simd for (int i = bounds[0].s; i <= bounds[0].e; i++) { - int &j = std::get(indices); - j = i; - std::apply(function, indices); + function(indices[OuterIs]..., i); } } } @@ -493,9 +491,9 @@ struct par_disp_inner_impl, TypeList(); constexpr std::size_t Nthread = TeamPattern::Nthread; constexpr 
std::size_t Nvector = TeamPattern::Nvector; - dispatch_collapse(std::make_index_sequence(), - std::make_index_sequence(), team_member, idxer, - bound_arr, function); + dispatch_collapse( + std::make_index_sequence<0>(), std::make_index_sequence(), + std::make_index_sequence(), team_member, idxer, bound_arr, function); } } }; @@ -557,9 +555,8 @@ struct par_dispatch_impl, TypeList(exec_space, 0, idxer.size()), KOKKOS_LAMBDA(const int idx, ExtraFuncArgs... fargs) { - auto idx_tuple = idxer(idx); - function(std::get(idx_tuple)..., - std::forward(fargs)...); + const auto idx_arr = idxer.GetIdxArray(idx); + function(idx_arr[OuterIs]..., std::forward(fargs)...); }, std::forward(args)...); } @@ -585,24 +582,30 @@ struct par_dispatch_impl, TypeList{bound_arr[OuterIs]...}); constexpr bool ParForOuter = std::is_same_v; - kokkos_dispatch( - Tag(), name, team_policy(exec_space, idxer.size(), Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... fargs) { - if constexpr (ParForOuter) { - auto idx_tuple = idxer(team_member.league_rank()); - function(team_member, std::get(idx_tuple)..., + if constexpr (ParForOuter) { + kokkos_dispatch( + Tag(), name, team_policy(exec_space, idxer.size(), Kokkos::AUTO), + KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... fargs) { + const auto idx_arr = idxer.GetIdxArray(team_member.league_rank()); + function(team_member, idx_arr[OuterIs]..., std::forward(fargs)...); - } else { + }, + std::forward(args)...); + } else { + kokkos_dispatch( + Tag(), name, team_policy(exec_space, idxer.size(), Kokkos::AUTO), + KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... 
fargs) { using TeamPattern = typename dispatch_type::TeamPattern; constexpr std::size_t Nvector = TeamPattern::Nvector; constexpr std::size_t Nthread = TeamPattern::Nthread; constexpr std::size_t Nouter = Rank - Nvector - Nthread; dispatch_collapse( - std::make_index_sequence(), std::make_index_sequence(), - team_member, idxer, {bound_arr[Nouter + InnerIs]...}, function); - } - }, - std::forward(args)...); + std::make_index_sequence(), std::make_index_sequence(), + std::make_index_sequence(), team_member, idxer, + {bound_arr[Nouter + InnerIs]...}, function); + }, + std::forward(args)...); + } } }; diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index 21b83b6b492f..7122153f6d50 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -21,6 +21,7 @@ #include "utils/concepts_lite.hpp" #include "utils/utils.hpp" +#include namespace parthenon { @@ -97,8 +98,12 @@ struct Indexer { KOKKOS_FORCEINLINE_FUNCTION auto GetIdxArray(int idx) const { - return get_array_from_tuple( - GetIndicesImpl(idx, std::make_index_sequence())); + return GetIndicesArrayImpl(idx, std::make_index_sequence()); + } + + KOKKOS_FORCEINLINE_FUNCTION + void GetIdxCArray(int idx, int *indices) const { + GetIndicesCArrayImpl(idx, indices, std::make_index_sequence()); } template @@ -120,14 +125,40 @@ struct Indexer { std::tuple idxs; ( [&] { - std::get(idxs) = idx / std::get(N); - idx -= std::get(idxs) * std::get(N); - std::get(idxs) += std::get(start); + std::get(idxs) = idx / N[Is]; + idx -= std::get(idxs) * N[Is]; + std::get(idxs) += start[Is]; }(), ...); return idxs; } + template + KOKKOS_FORCEINLINE_FUNCTION void + GetIndicesCArrayImpl(int idx, int *indices, std::index_sequence) const { + ( + [&] { + indices[Is] = idx / N[Is]; + idx -= indices[Is] * N[Is]; + indices[Is] += start[Is]; + }(), + ...); + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::array + GetIndicesArrayImpl(int idx, std::index_sequence) const { + std::array indices; + ( + [&] { + indices[Is] = idx / N[Is]; + 
idx -= indices[Is] * N[Is]; + indices[Is] += start[Is]; + }(), + ...); + return indices; + } + template KOKKOS_FORCEINLINE_FUNCTION static std::array GetFactors(std::tuple Nt, std::index_sequence) { @@ -136,7 +167,7 @@ struct Indexer { ( [&] { constexpr std::size_t idx = sizeof...(Ts) - (Is + 1); - std::get(N) = cur; + N[idx] = cur; cur *= std::get(Nt); }(), ...); @@ -153,6 +184,9 @@ template <> struct Indexer<> { KOKKOS_FORCEINLINE_FUNCTION std::tuple<> operator()(int idx) const { return std::tuple<>(); } + // this is a dummy and shouldn't ever actually get used to index an array + KOKKOS_FORCEINLINE_FUNCTION + std::array GetIdxArray(int idx) { return {-1}; } }; template From 565717fc298327fc4af7b1bce9067c6f816ae3e3 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 22 Aug 2024 22:36:41 +0200 Subject: [PATCH 53/99] fix team policy scratch size --- src/kokkos_abstraction.hpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 116ba36f384b..c930b8f4e6c9 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -304,6 +304,8 @@ struct DispatchSignature> { private: using TL = TypeList; static constexpr std::size_t func_idx = FirstFuncIdx(); + static_assert(sizeof...(AllArgs) > func_idx, + "Couldn't determine functor index in dispatc args"); public: using LaunchBounds = typename TL::template continuous_sublist<0, func_idx - 1>; @@ -537,7 +539,7 @@ struct par_dispatch_impl, TypeList(), std::make_index_sequence(), name, exec_space, bound_arr, function, - std::forward(args)...); + std::forward(args)..., scratch_level, scratch_size_in_bytes); } } @@ -549,7 +551,8 @@ struct par_dispatch_impl, TypeList, sequence, std::string name, ExecSpace exec_space, std::array bound_arr, - Function function, Args &&...args) { + Function function, Args &&...args, const int scratch_level, + const std::size_t scratch_size_in_bytes) { static_assert(sizeof...(InnerIs) == 0); auto 
idxer = MakeIndexer(bound_arr); kokkos_dispatch( @@ -565,7 +568,8 @@ struct par_dispatch_impl, TypeList, sequence, std::string name, ExecSpace exec_space, std::array bound_arr, - Function function, Args &&...args) { + Function function, Args &&...args, const int scratch_level, + const std::size_t scratch_size_in_bytes) { static_assert(sizeof...(InnerIs) == 0); kokkos_dispatch( Tag(), name, @@ -578,13 +582,16 @@ struct par_dispatch_impl, TypeList, sequence, std::string name, ExecSpace exec_space, std::array bound_arr, - Function function, Args &&...args) { + Function function, Args &&...args, const int scratch_level, + const std::size_t scratch_size_in_bytes) { auto idxer = MakeIndexer(std::array{bound_arr[OuterIs]...}); constexpr bool ParForOuter = std::is_same_v; if constexpr (ParForOuter) { kokkos_dispatch( - Tag(), name, team_policy(exec_space, idxer.size(), Kokkos::AUTO), + Tag(), name, + team_policy(exec_space, idxer.size(), Kokkos::AUTO) + .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... 
fargs) { const auto idx_arr = idxer.GetIdxArray(team_member.league_rank()); function(team_member, idx_arr[OuterIs]..., From c53ec4fc0fecd148e0389cb981cd6a21d7bc3a85 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 22 Aug 2024 22:36:58 +0200 Subject: [PATCH 54/99] base_type in functor check --- src/utils/type_list.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp index 5756390f93eb..346b5c74d956 100644 --- a/src/utils/type_list.hpp +++ b/src/utils/type_list.hpp @@ -210,7 +210,7 @@ constexpr int FirstFuncIdx() { if constexpr (idx == TL::n_types) { return TL::n_types; } else { - using cur_type = typename TL::template type; + using cur_type = base_type>; if constexpr (is_functor::value) return idx; if constexpr (std::is_function>::value) return idx; return FirstFuncIdx(); @@ -223,7 +223,7 @@ template constexpr bool isBoundType() { using BoundTypes = TypeList; using btype = base_type; - return BoundTypes::template IsIn() || std::is_integral_v; + return std::is_same_v || std::is_integral_v; } template @@ -233,6 +233,7 @@ constexpr std::size_t GetNumBounds(TypeList) { return 0; } else { using Bnd0 = typename TL::template type<0>; + static_assert(isBoundType(), "unrecognized launch bound in par_dispatch"); if constexpr (std::is_same_v, IndexRange>) { return 2 + GetNumBounds(typename TL::template continuous_sublist<1>()); } else if constexpr (std::is_integral_v>) { @@ -240,8 +241,6 @@ constexpr std::size_t GetNumBounds(TypeList) { static_assert(std::is_integral_v>, "integer launch bounds need to come in (start, end) pairs"); return 2 + GetNumBounds(typename TL::template continuous_sublist<2>()); - } else { - static_assert(always_false, "launch bound type not supported"); } } } From 14c098ab5f0e29240ecfc7bc50650653ba383e8c Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 22 Aug 2024 23:07:57 +0200 Subject: [PATCH 55/99] simdfor inner loop bounds fix --- src/kokkos_abstraction.hpp | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index c930b8f4e6c9..4df262b65321 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -465,7 +465,7 @@ void SimdFor(std::index_sequence, Function function, for (int idx = 0; idx < idxer.size(); idx++) { const auto indices = idxer.GetIdxArray(idx); #pragma omp simd - for (int i = bounds[0].s; i <= bounds[0].e; i++) { + for (int i = bounds[Rank - 1].s; i <= bounds[Rank - 1].e; i++) { function(indices[OuterIs]..., i); } } From f6d9c21548860f401f9b6d8e1fe6242476fd5fa8 Mon Sep 17 00:00:00 2001 From: Adam Date: Fri, 23 Aug 2024 23:38:11 +0000 Subject: [PATCH 56/99] get things working on cuda --- src/kokkos_abstraction.hpp | 289 +++++++++++++++++++++---------------- src/utils/indexer.hpp | 68 +++++---- 2 files changed, 206 insertions(+), 151 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 4df262b65321..05a9b9b621f3 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -176,9 +176,13 @@ static struct LoopPatternTPTVR { // inner Kokkos::ThreadVectorRange static struct LoopPatternTPTTRTVR { } loop_pattern_tpttrtvr_tag; -// Used as generic catch all for LoopPatternTeam<> -static struct LoopPatternTeamGeneric { -} loop_pattern_team_generic_tag; +// Translates to an outer team policy +static struct LoopPatternTeamOuter { +} loop_pattern_team_outer_tag; +// Translates to an outter team policy with an inner collapse over a combination of +// thread/vecctor +static struct LoopPatternTeamCollapse { +} loop_pattern_team_collapse; // Used to catch undefined behavior as it results in throwing an error static struct LoopPatternUndefined { } loop_pattern_undefined_tag; @@ -327,7 +331,7 @@ struct DispatchType { // check any confilcts with the requested pattern // and return the actual one we use - static constexpr auto GetTag() { + static constexpr auto GetPatternTag() { constexpr bool 
is_ParFor = std::is_same::value; constexpr bool is_ParScan = @@ -337,11 +341,16 @@ struct DispatchType { constexpr bool IsMDRange = std::is_same::value; constexpr bool IsSimdFor = std::is_same::value; // fallback simd par_reduce to flat range and force par_scan to flat range - if constexpr (IsFlatRange || (IsSimdFor && !is_ParFor)) - return loop_pattern_flatrange_tag; - if constexpr (IsSimdFor && is_ParFor) return loop_pattern_simdfor_tag; - if constexpr (IsMDRange && !is_ParScan) return loop_pattern_mdrange_tag; - if constexpr (TeamPattern::value) return loop_pattern_team_generic_tag; + if constexpr (IsFlatRange || (IsSimdFor && !is_ParFor)) return LoopPatternFlatRange(); + if constexpr (IsSimdFor && is_ParFor) return LoopPatternSimdFor(); + if constexpr (IsMDRange && !is_ParScan) return LoopPatternMDRange(); + if constexpr (TeamPattern::value) { + if constexpr (std::is_same_v) { + return OuterLoopPatternTeams(); + } else { + return LoopPatternTeamCollapse(); + } + } } }; @@ -352,35 +361,33 @@ struct BoundTranslator { private: // overloads for different launch bound types. 
// should also be counted by isBoundType & GetNumBounds in type_list.hpp - template - KOKKOS_INLINE_FUNCTION void - GetIndexRanges_impl(const int idx, std::array &out, const int s, - const int e, Bounds &&...bounds) { - out[idx].s = s; - out[idx].e = e; + template + KOKKOS_INLINE_FUNCTION void GetIndexRanges_impl(const int idx, const int s, const int e, + Bounds &&...bounds) { + bound_arr[idx].s = s; + bound_arr[idx].e = e; if constexpr (sizeof...(Bounds) > 0) { - GetIndexRanges_impl(idx + 1, out, std::forward(bounds)...); + GetIndexRanges_impl(idx + 1, std::forward(bounds)...); } } - template - KOKKOS_INLINE_FUNCTION void - GetIndexRanges_impl(const int idx, std::array &out, const IndexRange ir, - Bounds &&...bounds) { - out[idx] = ir; + template + KOKKOS_INLINE_FUNCTION void GetIndexRanges_impl(const int idx, const IndexRange ir, + Bounds &&...bounds) { + bound_arr[idx] = ir; if constexpr (sizeof...(Bounds) > 0) { - GetIndexRanges_impl(idx + 1, out, std::forward(bounds)...); + GetIndexRanges_impl(idx + 1, std::forward(bounds)...); } } public: using Bound_tl = TypeList; static constexpr std::size_t rank = GetNumBounds(Bound_tl()) / 2; + Kokkos::Array bound_arr; KOKKOS_INLINE_FUNCTION - std::array GetIndexRanges(Bound_ts &&...bounds) { - std::array out; - GetIndexRanges_impl(0, out, std::forward(bounds)...); - return out; + Kokkos::Array GetIndexRanges(Bound_ts &&...bounds) { + GetIndexRanges_impl(0, std::forward(bounds)...); + return bound_arr; } }; @@ -407,53 +414,80 @@ struct InnerFunctor, } }; -template -KOKKOS_FORCEINLINE_FUNCTION void -dispatch_collapse(std::integer_sequence, - std::integer_sequence, - std::integer_sequence, team_mbr_t team_member, - IdxTeam idxer_team, std::array bound_arr, - Function function) { - constexpr std::size_t Nteam = sizeof...(TeamIs); - constexpr std::size_t Nthread = sizeof...(ThreadIs); - constexpr std::size_t Nvector = sizeof...(VectorIs); - auto inds_team = idxer_team.GetIdxArray(team_member.league_rank()); - if constexpr 
(Nthread > 0) { - auto idxer_thread = - MakeIndexer(std::pair(bound_arr[ThreadIs].s, bound_arr[ThreadIs].e)...); - Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, 0, idxer_thread.size()), - [&](const int idThread) { - const auto inds_thread = idxer_thread.GetIdxArray(idThread); - if constexpr (Nvector > 0) { - auto idxer_vector = MakeIndexer(std::pair( - bound_arr[Nthread + VectorIs].s, bound_arr[Nthread + VectorIs].e)...); - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(team_member, 0, idxer_vector.size()), - [&](const int idVector) { - const auto inds_vector = idxer_vector.GetIdxArray(idVector); - function(inds_team[TeamIs]..., inds_thread[ThreadIs]..., - inds_vector[VectorIs]...); - }); - } else { - function(inds_team[TeamIs]..., inds_thread[ThreadIs]...); - } - }); - } else { - auto idxer_vector = MakeIndexer(std::pair( - bound_arr[Nthread + VectorIs].s, bound_arr[Nthread + VectorIs].e)...); - Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), - [&](const int idVector) { - const auto inds_vector = idxer_vector.GetIdxArray(idVector); - function(inds_team[TeamIs]..., inds_vector[VectorIs]...); - }); +template +struct dispatch_collapse { + IdxTeam idxer_team; + Kokkos::Array bound_arr; + Function function; + + KOKKOS_FORCEINLINE_FUNCTION + dispatch_collapse(IdxTeam idxer, Kokkos::Array bounds, Function func) + : idxer_team(idxer), bound_arr(bounds), function(func) {} + + template + KOKKOS_FORCEINLINE_FUNCTION void + dispatch(std::integer_sequence, + std::integer_sequence, + std::integer_sequence, team_mbr_t team_member, + ExtraFuncArgs &&...args) const { + auto inds_team = idxer_team.GetIdxArray(team_member.league_rank()); + if constexpr (Nthread > 0) { + auto idxer_thread = + MakeIndexer(Kokkos::Array{bound_arr[ThreadIs]...}); + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, 0, idxer_thread.size()), + [&](const int idThread) { + const auto inds_thread = idxer_thread.GetIdxArray(idThread); + if 
constexpr (Nvector > 0) { + auto idxer_vector = MakeIndexer( + Kokkos::Array{bound_arr[Nthread + VectorIs]...}); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team_member, 0, idxer_vector.size()), + [&](const int idVector) { + const auto inds_vector = idxer_vector.GetIdxArray(idVector); + function(inds_team[TeamIs]..., inds_thread[ThreadIs]..., + inds_vector[VectorIs]..., + std::forward(args)...); + }); + } else { + function(inds_team[TeamIs]..., inds_thread[ThreadIs]..., + std::forward(args)...); + } + }); + } else { + auto idxer_vector = MakeIndexer( + Kokkos::Array{bound_arr[Nthread + VectorIs]...}); + Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), + [&](const int idVector) { + const auto inds_vector = idxer_vector.GetIdxArray(idVector); + function(inds_team[TeamIs]..., inds_vector[VectorIs]..., + std::forward(args)...); + }); + } } + + template + using sequence = std::make_index_sequence; + KOKKOS_FORCEINLINE_FUNCTION + void operator()(team_mbr_t team_member, ExtraFuncArgs &&...args) const { + dispatch(sequence(), sequence(), sequence(), team_member, + std::forward(args)...); + } +}; + +template +KOKKOS_FORCEINLINE_FUNCTION auto +MakeCollapse(IdxTeam idxer, Kokkos::Array bounds, Function func) { + return dispatch_collapse(idxer, bounds, func); } template void SimdFor(std::index_sequence, Function function, - std::array bounds) { + Kokkos::Array bounds) { if constexpr (Rank == 1) { #pragma omp simd for (int i = bounds[0].s; i <= bounds[0].e; i++) { @@ -493,9 +527,8 @@ struct par_disp_inner_impl, TypeList(); constexpr std::size_t Nthread = TeamPattern::Nthread; constexpr std::size_t Nvector = TeamPattern::Nvector; - dispatch_collapse( - std::make_index_sequence<0>(), std::make_index_sequence(), - std::make_index_sequence(), team_member, idxer, bound_arr, function); + MakeCollapse(idxer, bound_arr, + function)(team_member); } } }; @@ -525,34 +558,33 @@ struct par_dispatch_impl, TypeList - static inline void 
dispatch(std::string name, ExecSpace exec_space, Bounds &&...bounds, - Function function, Args &&...args, - const int scratch_level = 0, - const std::size_t scratch_size_in_bytes = 0) { + inline void dispatch(std::string name, ExecSpace exec_space, Bounds &&...bounds, + Function function, Args &&...args, const int scratch_level = 0, + const std::size_t scratch_size_in_bytes = 0) { PARTHENON_INSTRUMENT_REGION(name) constexpr std::size_t Ninner = dispatch_type::TeamPattern::Nvector + dispatch_type::TeamPattern::Nthread; auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); - constexpr auto tag = dispatch_type::GetTag(); - if constexpr (std::is_same_v>) { + constexpr auto pattern_tag = dispatch_type::GetPatternTag(); + if constexpr (std::is_same_v>) { SimdFor(std::make_index_sequence(), function, bound_arr); } else { - dispatch(tag, std::make_index_sequence(), - std::make_index_sequence(), name, exec_space, bound_arr, function, - std::forward(args)..., scratch_level, scratch_size_in_bytes); + dispatch_impl(pattern_tag, std::make_index_sequence(), + std::make_index_sequence(), name, exec_space, bound_arr, + function, std::forward(args)..., scratch_level, + scratch_size_in_bytes); } } - private: template using sequence = std::integer_sequence; template - static inline void - dispatch(LoopPatternFlatRange, sequence, sequence, - std::string name, ExecSpace exec_space, std::array bound_arr, - Function function, Args &&...args, const int scratch_level, - const std::size_t scratch_size_in_bytes) { + inline void dispatch_impl(LoopPatternFlatRange, sequence, + sequence, std::string name, ExecSpace exec_space, + Kokkos::Array bound_arr, Function function, + Args &&...args, const int scratch_level, + const std::size_t scratch_size_in_bytes) { static_assert(sizeof...(InnerIs) == 0); auto idxer = MakeIndexer(bound_arr); kokkos_dispatch( @@ -565,11 +597,11 @@ struct par_dispatch_impl, TypeList - static inline void - dispatch(LoopPatternMDRange, sequence, 
sequence, - std::string name, ExecSpace exec_space, std::array bound_arr, - Function function, Args &&...args, const int scratch_level, - const std::size_t scratch_size_in_bytes) { + inline void dispatch_impl(LoopPatternMDRange, sequence, + sequence, std::string name, ExecSpace exec_space, + Kokkos::Array bound_arr, Function function, + Args &&...args, const int scratch_level, + const std::size_t scratch_size_in_bytes) { static_assert(sizeof...(InnerIs) == 0); kokkos_dispatch( Tag(), name, @@ -579,40 +611,45 @@ struct par_dispatch_impl, TypeList - static inline void - dispatch(LoopPatternTeamGeneric, sequence, sequence, - std::string name, ExecSpace exec_space, std::array bound_arr, - Function function, Args &&...args, const int scratch_level, - const std::size_t scratch_size_in_bytes) { + inline void dispatch_impl(OuterLoopPatternTeams, sequence, + sequence, std::string name, ExecSpace exec_space, + Kokkos::Array bound_arr, Function function, + Args &&...args, const int scratch_level, + const std::size_t scratch_size_in_bytes) { auto idxer = - MakeIndexer(std::array{bound_arr[OuterIs]...}); - constexpr bool ParForOuter = std::is_same_v; - if constexpr (ParForOuter) { - kokkos_dispatch( - Tag(), name, - team_policy(exec_space, idxer.size(), Kokkos::AUTO) - .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), - KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... fargs) { - const auto idx_arr = idxer.GetIdxArray(team_member.league_rank()); - function(team_member, idx_arr[OuterIs]..., - std::forward(fargs)...); - }, - std::forward(args)...); - } else { - kokkos_dispatch( - Tag(), name, team_policy(exec_space, idxer.size(), Kokkos::AUTO), - KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... 
fargs) { - using TeamPattern = typename dispatch_type::TeamPattern; - constexpr std::size_t Nvector = TeamPattern::Nvector; - constexpr std::size_t Nthread = TeamPattern::Nthread; - constexpr std::size_t Nouter = Rank - Nvector - Nthread; - dispatch_collapse( - std::make_index_sequence(), std::make_index_sequence(), - std::make_index_sequence(), team_member, idxer, - {bound_arr[Nouter + InnerIs]...}, function); - }, - std::forward(args)...); - } + MakeIndexer(Kokkos::Array{bound_arr[OuterIs]...}); + kokkos_dispatch( + Tag(), name, + team_policy(exec_space, idxer.size(), Kokkos::AUTO) + .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), + KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... fargs) { + const auto idx_arr = idxer.GetIdxArray(team_member.league_rank()); + function(team_member, idx_arr[OuterIs]..., + std::forward(fargs)...); + }, + std::forward(args)...); + } + + template + inline void dispatch_impl(LoopPatternTeamCollapse, sequence, + sequence, std::string name, ExecSpace exec_space, + Kokkos::Array bound_arr, Function function, + Args &&...args, const int scratch_level, + const std::size_t scratch_size_in_bytes) { + auto idxer = + MakeIndexer(Kokkos::Array{bound_arr[OuterIs]...}); + using TeamPattern = typename dispatch_type::TeamPattern; + constexpr std::size_t Nvector = TeamPattern::Nvector; + constexpr std::size_t Nthread = TeamPattern::Nthread; + constexpr std::size_t Nouter = Rank - Nvector - Nthread; + kokkos_dispatch( + Tag(), name, + team_policy(exec_space, idxer.size(), Kokkos::AUTO) + .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), + + MakeCollapse(idxer, bound_arr, + function), + std::forward(args)...); } }; @@ -629,7 +666,7 @@ inline void par_dispatch(Pattern, std::string name, DevExecSpace exec_space, if constexpr (Rank > 1 && std::is_same_v) { static_assert(always_false, "par_scan only for 1D loops"); } - par_dispatch_impl::dispatch( + par_dispatch_impl().dispatch( name, exec_space, 
std::forward(args)...); } @@ -667,7 +704,7 @@ par_for_outer(Pattern, const std::string &name, DevExecSpace exec_space, using Tag = dispatch_impl::ParallelForDispatch; using ExtraFuncArgs = typename function_signature::FArgs; - par_dispatch_impl::dispatch( + par_dispatch_impl().dispatch( name, exec_space, std::forward(args)..., scratch_level, scratch_size_in_bytes); } diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index 7122153f6d50..c37f5ff8e4a5 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -20,6 +20,7 @@ #include #include "utils/concepts_lite.hpp" +#include "utils/type_list.hpp" #include "utils/utils.hpp" #include @@ -67,7 +68,6 @@ struct block_ownership_t { private: bool ownership[3][3][3]; }; - template struct Indexer { KOKKOS_INLINE_FUNCTION @@ -83,13 +83,17 @@ struct Indexer { KOKKOS_INLINE_FUNCTION explicit Indexer(std::pair... Ns) - : N{GetFactors(std::make_tuple((Ns.second - Ns.first + 1)...), + : N{GetFactors({(Ns.second - Ns.first + 1)...}, std::make_index_sequence())}, start{Ns.first...}, end{Ns.second...}, _size(((Ns.second - Ns.first + 1) * ...)) { } - KOKKOS_FORCEINLINE_FUNCTION - std::size_t size() const { return _size; } + template + KOKKOS_INLINE_FUNCTION explicit Indexer(IndRngs... 
Ns) + : N{GetFactors({(Ns.e - Ns.s + 1)...}, std::make_index_sequence())}, + start{Ns.s...}, end{Ns.e...}, _size(((Ns.e - Ns.s + 1) * ...)) {} + + KOKKOS_FORCEINLINE_FUNCTION std::size_t size() const { return _size; } KOKKOS_FORCEINLINE_FUNCTION std::tuple operator()(int idx) const { @@ -146,9 +150,9 @@ struct Indexer { } template - KOKKOS_FORCEINLINE_FUNCTION std::array + KOKKOS_FORCEINLINE_FUNCTION Kokkos::Array GetIndicesArrayImpl(int idx, std::index_sequence) const { - std::array indices; + Kokkos::Array indices; ( [&] { indices[Is] = idx / N[Is]; @@ -160,33 +164,43 @@ struct Indexer { } template - KOKKOS_FORCEINLINE_FUNCTION static std::array - GetFactors(std::tuple Nt, std::index_sequence) { - std::array N; + KOKKOS_FORCEINLINE_FUNCTION static Kokkos::Array + GetFactors(Kokkos::Array Nt, std::index_sequence) { + Kokkos::Array N; int cur = 1; ( [&] { constexpr std::size_t idx = sizeof...(Ts) - (Is + 1); N[idx] = cur; - cur *= std::get(Nt); + cur *= Nt[idx]; }(), ...); return N; } - std::array N; - std::array start; - std::array end; + Kokkos::Array N; + Kokkos::Array start; + Kokkos::Array end; + std::size_t _size; +}; + +template +struct IndexRanger { + KOKKOS_INLINE_FUNCTION + IndexRanger() : N{}, _size{} {}; + + KOKKOS_INLINE_FUNCTION + explicit IndexRanger(Ts... 
IdrsA){}; + + Kokkos::Array N; std::size_t _size; }; template <> struct Indexer<> { - KOKKOS_FORCEINLINE_FUNCTION - std::tuple<> operator()(int idx) const { return std::tuple<>(); } // this is a dummy and shouldn't ever actually get used to index an array KOKKOS_FORCEINLINE_FUNCTION - std::array GetIdxArray(int idx) { return {-1}; } + Kokkos::Array GetIdxArray(int idx) { return {-1}; } }; template @@ -230,30 +244,34 @@ using Indexer8D = Indexer; using SpatiallyMaskedIndexer6D = SpatiallyMaskedIndexer; template -auto MakeIndexer(const std::pair &...ranges) { +KOKKOS_FORCEINLINE_FUNCTION auto MakeIndexer(const std::pair &...ranges) { return Indexer(ranges...); } -template -auto MakeIndexer(std::array bounds_arr, - std::integer_sequence) { - return MakeIndexer(std::pair(bounds_arr[Is].s, bounds_arr[Is].e)...); +template +KOKKOS_FORCEINLINE_FUNCTION auto MakeIndexer(TypeList, + Kokkos::Array bounds_arr, + std::integer_sequence) { + return Indexer(bounds_arr[Is]...); + /* return MakeIndexer(std::pair(bounds_arr[Is].s, bounds_arr[Is].e)...); */ } template -auto MakeIndexer(std::array bounds_arr) { - return MakeIndexer(bounds_arr, std::make_index_sequence()); +KOKKOS_FORCEINLINE_FUNCTION auto MakeIndexer(Kokkos::Array bounds_arr) { + return MakeIndexer(list_of_type_t(), bounds_arr, + std::make_index_sequence()); } namespace impl { template -auto MakeIndexerIntImpl(std::array args, std::index_sequence) { +KOKKOS_FORCEINLINE_FUNCTION auto MakeIndexerIntImpl(std::array args, + std::index_sequence) { return MakeIndexer(std::pair(args[2 * Is], args[2 * Is + 1])...); } } // namespace impl template -auto MakeIndexerInt(Ts &&...args) { +KOKKOS_FORCEINLINE_FUNCTION auto MakeIndexerInt(Ts &&...args) { static_assert(sizeof...(Ts) % 2 == 0, "Must have an upper and lower end to each index range."); return impl::MakeIndexerIntImpl(std::array{args...}, From be6ed047eecab7c64b5c9e0471fdd57f56ff81be Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 24 Aug 2024 15:06:09 +0200 Subject: 
[PATCH 57/99] extra args for hierarchial loops --- src/kokkos_abstraction.hpp | 40 ++++++++++++++++++++------------------ src/utils/indexer.hpp | 2 +- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 05a9b9b621f3..797b5267eb28 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -425,19 +425,20 @@ struct dispatch_collapse { dispatch_collapse(IdxTeam idxer, Kokkos::Array bounds, Function func) : idxer_team(idxer), bound_arr(bounds), function(func) {} - template + template KOKKOS_FORCEINLINE_FUNCTION void dispatch(std::integer_sequence, std::integer_sequence, std::integer_sequence, team_mbr_t team_member, - ExtraFuncArgs &&...args) const { + Args &&...args) const { auto inds_team = idxer_team.GetIdxArray(team_member.league_rank()); if constexpr (Nthread > 0) { auto idxer_thread = MakeIndexer(Kokkos::Array{bound_arr[ThreadIs]...}); Kokkos::parallel_for( Kokkos::TeamThreadRange<>(team_member, 0, idxer_thread.size()), - [&](const int idThread) { + [&](const int idThread, ExtraFuncArgs... 
fargs) { const auto inds_thread = idxer_thread.GetIdxArray(idThread); if constexpr (Nvector > 0) { auto idxer_vector = MakeIndexer( @@ -448,37 +449,38 @@ struct dispatch_collapse { const auto inds_vector = idxer_vector.GetIdxArray(idVector); function(inds_team[TeamIs]..., inds_thread[ThreadIs]..., inds_vector[VectorIs]..., - std::forward(args)...); + std::forward(fargs)...); }); } else { function(inds_team[TeamIs]..., inds_thread[ThreadIs]..., - std::forward(args)...); + std::forward(fargs)...); } - }); + }, + std::forward(args)...); } else { auto idxer_vector = MakeIndexer( Kokkos::Array{bound_arr[Nthread + VectorIs]...}); - Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), - [&](const int idVector) { - const auto inds_vector = idxer_vector.GetIdxArray(idVector); - function(inds_team[TeamIs]..., inds_vector[VectorIs]..., - std::forward(args)...); - }); + Kokkos::parallel_for( + Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), + [&](const int idVector, ExtraFuncArgs... fargs) { + const auto inds_vector = idxer_vector.GetIdxArray(idVector); + function(inds_team[TeamIs]..., inds_vector[VectorIs]..., + std::forward(fargs)...); + }, + std::forward(args)...); } } template using sequence = std::make_index_sequence; KOKKOS_FORCEINLINE_FUNCTION - void operator()(team_mbr_t team_member, ExtraFuncArgs &&...args) const { - dispatch(sequence(), sequence(), sequence(), team_member, - std::forward(args)...); + void operator()(team_mbr_t team_member) const { + dispatch(sequence(), sequence(), sequence(), team_member); } }; template + typename IdxTeam, typename Function, typename... 
ExtraFuncArgs> KOKKOS_FORCEINLINE_FUNCTION auto MakeCollapse(IdxTeam idxer, Kokkos::Array bounds, Function func) { return dispatch_collapse, TypeList(idxer, bound_arr, - function), + MakeCollapse(idxer, bound_arr, + function), std::forward(args)...); } }; diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index c37f5ff8e4a5..9cd544c09b84 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -200,7 +200,7 @@ template <> struct Indexer<> { // this is a dummy and shouldn't ever actually get used to index an array KOKKOS_FORCEINLINE_FUNCTION - Kokkos::Array GetIdxArray(int idx) { return {-1}; } + Kokkos::Array GetIdxArray(int idx) const { return {-1}; } }; template From b0918601d29200da01d80c12af38f9c0f9e4a7c6 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 24 Aug 2024 15:20:07 +0200 Subject: [PATCH 58/99] extra args in team/collapse loops --- src/kokkos_abstraction.hpp | 40 ++++++++++++++++++++------------------ src/utils/indexer.hpp | 2 +- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 05a9b9b621f3..797b5267eb28 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -425,19 +425,20 @@ struct dispatch_collapse { dispatch_collapse(IdxTeam idxer, Kokkos::Array bounds, Function func) : idxer_team(idxer), bound_arr(bounds), function(func) {} - template + template KOKKOS_FORCEINLINE_FUNCTION void dispatch(std::integer_sequence, std::integer_sequence, std::integer_sequence, team_mbr_t team_member, - ExtraFuncArgs &&...args) const { + Args &&...args) const { auto inds_team = idxer_team.GetIdxArray(team_member.league_rank()); if constexpr (Nthread > 0) { auto idxer_thread = MakeIndexer(Kokkos::Array{bound_arr[ThreadIs]...}); Kokkos::parallel_for( Kokkos::TeamThreadRange<>(team_member, 0, idxer_thread.size()), - [&](const int idThread) { + [&](const int idThread, ExtraFuncArgs... 
fargs) { const auto inds_thread = idxer_thread.GetIdxArray(idThread); if constexpr (Nvector > 0) { auto idxer_vector = MakeIndexer( @@ -448,37 +449,38 @@ struct dispatch_collapse { const auto inds_vector = idxer_vector.GetIdxArray(idVector); function(inds_team[TeamIs]..., inds_thread[ThreadIs]..., inds_vector[VectorIs]..., - std::forward(args)...); + std::forward(fargs)...); }); } else { function(inds_team[TeamIs]..., inds_thread[ThreadIs]..., - std::forward(args)...); + std::forward(fargs)...); } - }); + }, + std::forward(args)...); } else { auto idxer_vector = MakeIndexer( Kokkos::Array{bound_arr[Nthread + VectorIs]...}); - Kokkos::parallel_for(Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), - [&](const int idVector) { - const auto inds_vector = idxer_vector.GetIdxArray(idVector); - function(inds_team[TeamIs]..., inds_vector[VectorIs]..., - std::forward(args)...); - }); + Kokkos::parallel_for( + Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), + [&](const int idVector, ExtraFuncArgs... fargs) { + const auto inds_vector = idxer_vector.GetIdxArray(idVector); + function(inds_team[TeamIs]..., inds_vector[VectorIs]..., + std::forward(fargs)...); + }, + std::forward(args)...); } } template using sequence = std::make_index_sequence; KOKKOS_FORCEINLINE_FUNCTION - void operator()(team_mbr_t team_member, ExtraFuncArgs &&...args) const { - dispatch(sequence(), sequence(), sequence(), team_member, - std::forward(args)...); + void operator()(team_mbr_t team_member) const { + dispatch(sequence(), sequence(), sequence(), team_member); } }; template + typename IdxTeam, typename Function, typename... 
ExtraFuncArgs> KOKKOS_FORCEINLINE_FUNCTION auto MakeCollapse(IdxTeam idxer, Kokkos::Array bounds, Function func) { return dispatch_collapse, TypeList(idxer, bound_arr, - function), + MakeCollapse(idxer, bound_arr, + function), std::forward(args)...); } }; diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index c37f5ff8e4a5..9cd544c09b84 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -200,7 +200,7 @@ template <> struct Indexer<> { // this is a dummy and shouldn't ever actually get used to index an array KOKKOS_FORCEINLINE_FUNCTION - Kokkos::Array GetIdxArray(int idx) { return {-1}; } + Kokkos::Array GetIdxArray(int idx) const { return {-1}; } }; template From b532cc7ebf8b72c592f8871b23046c9228117111 Mon Sep 17 00:00:00 2001 From: "adam.c.reyes" Date: Sat, 24 Aug 2024 10:18:45 -0400 Subject: [PATCH 59/99] kokkos array in indexer --- src/kokkos_abstraction.hpp | 39 +++++++++++++++++++++----------------- src/utils/indexer.hpp | 27 ++++++++++++++++++++++---- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 797b5267eb28..40a336685682 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -428,25 +428,27 @@ struct dispatch_collapse { template KOKKOS_FORCEINLINE_FUNCTION void - dispatch(std::integer_sequence, - std::integer_sequence, - std::integer_sequence, team_mbr_t team_member, - Args &&...args) const { - auto inds_team = idxer_team.GetIdxArray(team_member.league_rank()); + execute(std::integer_sequence, + std::integer_sequence, + std::integer_sequence, team_mbr_t team_member, + Args &&...args) const { + auto inds_team = idxer_team.GetIdxKArray(team_member.league_rank()); if constexpr (Nthread > 0) { auto idxer_thread = MakeIndexer(Kokkos::Array{bound_arr[ThreadIs]...}); Kokkos::parallel_for( Kokkos::TeamThreadRange<>(team_member, 0, idxer_thread.size()), [&](const int idThread, ExtraFuncArgs... 
fargs) { - const auto inds_thread = idxer_thread.GetIdxArray(idThread); + const auto inds_thread = idxer_thread.GetIdxKArray(idThread); if constexpr (Nvector > 0) { + static_assert(Nvector * Nthread == 0 || sizeof...(Args) == 0, + "thread + vector range pattern only supported for par_for "); auto idxer_vector = MakeIndexer( Kokkos::Array{bound_arr[Nthread + VectorIs]...}); Kokkos::parallel_for( Kokkos::ThreadVectorRange(team_member, 0, idxer_vector.size()), [&](const int idVector) { - const auto inds_vector = idxer_vector.GetIdxArray(idVector); + const auto inds_vector = idxer_vector.GetIdxKArray(idVector); function(inds_team[TeamIs]..., inds_thread[ThreadIs]..., inds_vector[VectorIs]..., std::forward(fargs)...); @@ -463,7 +465,7 @@ struct dispatch_collapse { Kokkos::parallel_for( Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), [&](const int idVector, ExtraFuncArgs... fargs) { - const auto inds_vector = idxer_vector.GetIdxArray(idVector); + const auto inds_vector = idxer_vector.GetIdxKArray(idVector); function(inds_team[TeamIs]..., inds_vector[VectorIs]..., std::forward(fargs)...); }, @@ -475,7 +477,7 @@ struct dispatch_collapse { using sequence = std::make_index_sequence; KOKKOS_FORCEINLINE_FUNCTION void operator()(team_mbr_t team_member) const { - dispatch(sequence(), sequence(), sequence(), team_member); + execute(sequence(), sequence(), sequence(), team_member); } }; @@ -499,7 +501,7 @@ void SimdFor(std::index_sequence, Function function, auto idxer = MakeIndexer(std::pair(bounds[OuterIs].s, bounds[OuterIs].e)...); for (int idx = 0; idx < idxer.size(); idx++) { - const auto indices = idxer.GetIdxArray(idx); + const auto indices = idxer.GetIdxKArray(idx); #pragma omp simd for (int i = bounds[Rank - 1].s; i <= bounds[Rank - 1].e; i++) { function(indices[OuterIs]..., i); @@ -518,10 +520,11 @@ struct par_disp_inner_impl, TypeList; static constexpr std::size_t Rank = bound_translator::rank; using TeamPattern = LoopPatternTeam>; + template + using 
sequence = std::make_index_sequence; - KOKKOS_FORCEINLINE_FUNCTION void dispatch(team_mbr_t team_member, Bounds &&...bounds, - Function function, Args &&...args) { - // TODO(acreyes): I don't think this static method will wokr on device... + KOKKOS_FORCEINLINE_FUNCTION void execute(team_mbr_t team_member, Bounds &&...bounds, + Function function, Args &&...args) { auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); if constexpr (std::is_same_v) { SimdFor(std::make_index_sequence(), function, bound_arr); @@ -530,7 +533,9 @@ struct par_disp_inner_impl, TypeList(idxer, bound_arr, - function)(team_member); + function) + .execute(sequence<0>(), sequence(), sequence(), team_member, + std::forward(args)...); } } }; @@ -544,7 +549,7 @@ KOKKOS_FORCEINLINE_FUNCTION void par_disp_inner(Pattern, team_mbr_t team_member, using LaunchBounds = typename dispatchsig::LaunchBounds; using Args = typename dispatchsig::Args; using ExtraFuncArgs = typename function_signature::FArgs; - par_disp_inner_impl().dispatch( + par_disp_inner_impl().execute( team_member, std::forward(args)...); } @@ -592,7 +597,7 @@ struct par_dispatch_impl, TypeList(exec_space, 0, idxer.size()), KOKKOS_LAMBDA(const int idx, ExtraFuncArgs... 
fargs) { - const auto idx_arr = idxer.GetIdxArray(idx); + const auto idx_arr = idxer.GetIdxKArray(idx); function(idx_arr[OuterIs]..., std::forward(fargs)...); }, std::forward(args)...); @@ -625,7 +630,7 @@ struct par_dispatch_impl, TypeList(fargs)...); }, diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index 9cd544c09b84..b1f8dae8a521 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -105,6 +105,11 @@ struct Indexer { return GetIndicesArrayImpl(idx, std::make_index_sequence()); } + KOKKOS_FORCEINLINE_FUNCTION + auto GetIdxKArray(int idx) const { + return GetIndicesKArrayImpl(idx, std::make_index_sequence()); + } + KOKKOS_FORCEINLINE_FUNCTION void GetIdxCArray(int idx, int *indices) const { GetIndicesCArrayImpl(idx, indices, std::make_index_sequence()); @@ -112,12 +117,12 @@ struct Indexer { template KOKKOS_FORCEINLINE_FUNCTION auto StartIdx() const { - return std::get(start); + return start[I]; } template KOKKOS_FORCEINLINE_FUNCTION auto EndIdx() const { - return std::get(end); + return end[I]; } static const constexpr std::size_t rank = sizeof...(Ts); @@ -151,7 +156,7 @@ struct Indexer { template KOKKOS_FORCEINLINE_FUNCTION Kokkos::Array - GetIndicesArrayImpl(int idx, std::index_sequence) const { + GetIndicesKArrayImpl(int idx, std::index_sequence) const { Kokkos::Array indices; ( [&] { @@ -163,6 +168,20 @@ struct Indexer { return indices; } + template + KOKKOS_FORCEINLINE_FUNCTION std::array + GetIndicesArrayImpl(int idx, std::index_sequence) const { + std::array indices; + ( + [&] { + indices[Is] = idx / N[Is]; + idx -= indices[Is] * N[Is]; + indices[Is] += start[Is]; + }(), + ...); + return indices; + } + template KOKKOS_FORCEINLINE_FUNCTION static Kokkos::Array GetFactors(Kokkos::Array Nt, std::index_sequence) { @@ -200,7 +219,7 @@ template <> struct Indexer<> { // this is a dummy and shouldn't ever actually get used to index an array KOKKOS_FORCEINLINE_FUNCTION - Kokkos::Array GetIdxArray(int idx) const { return {-1}; } + 
Kokkos::Array GetIdxKArray(int idx) const { return {-1}; } }; template From 8a74cbad489462e43587ef934e4df85eaf79e0f3 Mon Sep 17 00:00:00 2001 From: "adam.c.reyes" Date: Sat, 24 Aug 2024 10:48:20 -0400 Subject: [PATCH 60/99] cleanup --- src/kokkos_abstraction.hpp | 21 -------------------- src/utils/type_list.hpp | 40 -------------------------------------- 2 files changed, 61 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 40a336685682..bc08d8435239 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -394,26 +394,6 @@ struct BoundTranslator { template struct BoundTranslator> : public BoundTranslator {}; -template -struct InnerFunctor {}; - -template -struct InnerFunctor, - std::integer_sequence> { - static constexpr std::size_t Nteam = sizeof...(Iteam); - Function function; - Kokkos::Array inds_team; - - KOKKOS_INLINE_FUNCTION - InnerFunctor(Kokkos::Array _inds_team, Function _function) - : inds_team(_inds_team), function(_function) {} - - KOKKOS_FORCEINLINE_FUNCTION - void operator()(Index... 
inds) const { - function(inds_team[Iteam]..., std::forward(inds)...); - } -}; - template struct dispatch_collapse { @@ -568,7 +548,6 @@ struct par_dispatch_impl, TypeList(bounds)...); diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp index 346b5c74d956..4f856baf037e 100644 --- a/src/utils/type_list.hpp +++ b/src/utils/type_list.hpp @@ -150,31 +150,11 @@ auto ListOfType() { return concatenate_type_lists_t, decltype(ListOfType())>(); } } - -template -struct SequenceOfInt {}; - -template -struct SequenceOfInt<0, VAL, std::integer_sequence> { - using value = typename std::integer_sequence; -}; - -template -struct SequenceOfInt> { - using value = - typename SequenceOfInt>::value; -}; - } // namespace impl template using list_of_type_t = decltype(impl::ListOfType()); -template -using sequence_of_int_v = - typename impl::SequenceOfInt>::value; - template struct FuncSignature; @@ -262,26 +242,6 @@ struct FunctionSignature { template using function_signature = FunctionSignature::operator())>; - -/* template */ -/* struct FuncSignature; */ - -/* template */ -/* struct FuncSignature : public FuncSignature {}; */ - -/* template */ -/* struct FuncSignature { */ -/* using type = R(Args...); */ -/* using arg_types_tl = TypeList; */ -/* using ret_type = R; */ -/* }; */ - -/* template */ -/* struct FuncSignature { */ -/* using type = R (T::*)(Args...); */ -/* using arg_types_tl = TypeList; */ -/* using ret_type = R; */ -/* }; */ } // namespace parthenon #endif // UTILS_TYPE_LIST_HPP_ From 64cf17987357e78887281ba115a78df177b04fa4 Mon Sep 17 00:00:00 2001 From: Adam Date: Sat, 24 Aug 2024 15:18:11 +0000 Subject: [PATCH 61/99] fix unit test --- tst/unit/kokkos_abstraction.cpp | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 55d7caf031d2..a2aab35868e0 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -84,6 +84,25 @@ auto 
HostArrayND(Args &&...args) { } } +template +struct SequenceOfInt {}; + +template +struct SequenceOfInt<0, VAL, std::integer_sequence> { + using value = typename std::integer_sequence; +}; + +template +struct SequenceOfInt> { + using value = + typename SequenceOfInt>::value; +}; + +template +using sequence_of_int_v = + typename SequenceOfInt>::value; + enum class lbounds { integer, indexrange }; template @@ -96,7 +115,7 @@ struct test_wrapper_nd_impl { decltype(HostArrayND()) arr_host_orig, arr_host_mod; test_wrapper_nd_impl() { - arr_dev = GetArray(parthenon::sequence_of_int_v()); + arr_dev = GetArray(sequence_of_int_v()); arr_host_orig = Kokkos::create_mirror(arr_dev); arr_host_mod = Kokkos::create_mirror(arr_dev); std::random_device rd; // Will be used to obtain a seed for the random number engine From 001108ff2bec12689a8690731044a8833545116c Mon Sep 17 00:00:00 2001 From: Adam Date: Mon, 26 Aug 2024 17:55:57 +0000 Subject: [PATCH 62/99] static_asserts for simdfor loop patterns + seq_for --- src/kokkos_abstraction.hpp | 67 +++++++++++++++++++++++++++--------- src/solvers/solver_utils.hpp | 5 ++- src/utils/indexer.hpp | 17 --------- 3 files changed, 53 insertions(+), 36 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index bc08d8435239..997c1ee7273c 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -309,7 +309,7 @@ struct DispatchSignature> { using TL = TypeList; static constexpr std::size_t func_idx = FirstFuncIdx(); static_assert(sizeof...(AllArgs) > func_idx, - "Couldn't determine functor index in dispatc args"); + "Couldn't determine functor index from dispatch args"); public: using LaunchBounds = typename TL::template continuous_sublist<0, func_idx - 1>; @@ -340,10 +340,13 @@ struct DispatchType { constexpr bool IsFlatRange = std::is_same::value; constexpr bool IsMDRange = std::is_same::value; constexpr bool IsSimdFor = std::is_same::value; - // fallback simd par_reduce to flat range and force 
par_scan to flat range - if constexpr (IsFlatRange || (IsSimdFor && !is_ParFor)) return LoopPatternFlatRange(); - if constexpr (IsSimdFor && is_ParFor) return LoopPatternSimdFor(); - if constexpr (IsMDRange && !is_ParScan) return LoopPatternMDRange(); + + if constexpr (is_ParScan) return LoopPatternFlatRange(); + if constexpr (IsFlatRange) return LoopPatternFlatRange(); + if constexpr (IsSimdFor) { + return std::conditional_t(); + } + if constexpr (IsMDRange) return LoopPatternMDRange(); if constexpr (TeamPattern::value) { if constexpr (std::is_same_v) { return OuterLoopPatternTeams(); @@ -414,7 +417,7 @@ struct dispatch_collapse { Args &&...args) const { auto inds_team = idxer_team.GetIdxKArray(team_member.league_rank()); if constexpr (Nthread > 0) { - auto idxer_thread = + const auto idxer_thread = MakeIndexer(Kokkos::Array{bound_arr[ThreadIs]...}); Kokkos::parallel_for( Kokkos::TeamThreadRange<>(team_member, 0, idxer_thread.size()), @@ -423,7 +426,7 @@ struct dispatch_collapse { if constexpr (Nvector > 0) { static_assert(Nvector * Nthread == 0 || sizeof...(Args) == 0, "thread + vector range pattern only supported for par_for "); - auto idxer_vector = MakeIndexer( + const auto idxer_vector = MakeIndexer( Kokkos::Array{bound_arr[Nthread + VectorIs]...}); Kokkos::parallel_for( Kokkos::ThreadVectorRange(team_member, 0, idxer_vector.size()), @@ -440,7 +443,7 @@ struct dispatch_collapse { }, std::forward(args)...); } else { - auto idxer_vector = MakeIndexer( + const auto idxer_vector = MakeIndexer( Kokkos::Array{bound_arr[Nthread + VectorIs]...}); Kokkos::parallel_for( Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), @@ -470,15 +473,15 @@ MakeCollapse(IdxTeam idxer, Kokkos::Array bounds, Function fun } template -void SimdFor(std::index_sequence, Function function, - Kokkos::Array bounds) { +KOKKOS_INLINE_FUNCTION void SimdFor(std::index_sequence, Function function, + Kokkos::Array bounds) { if constexpr (Rank == 1) { #pragma omp simd for (int i = 
bounds[0].s; i <= bounds[0].e; i++) { function(i); } } else { - auto idxer = + const auto idxer = MakeIndexer(std::pair(bounds[OuterIs].s, bounds[OuterIs].e)...); for (int idx = 0; idx < idxer.size(); idx++) { const auto indices = idxer.GetIdxKArray(idx); @@ -506,7 +509,11 @@ struct par_disp_inner_impl, TypeList(bounds)...); - if constexpr (std::is_same_v) { + constexpr bool isSimdFor = std::is_same_v; + if constexpr (isSimdFor) { + static_assert(!isSimdFor || + (isSimdFor && std::is_same_v), + "par_inner simd for pattern only supported on HostExecSpace"); SimdFor(std::make_index_sequence(), function, bound_arr); } else { auto idxer = Indexer<>(); @@ -552,7 +559,11 @@ struct par_dispatch_impl, TypeList(bounds)...); constexpr auto pattern_tag = dispatch_type::GetPatternTag(); - if constexpr (std::is_same_v>) { + constexpr bool isSimdFor = + std::is_same_v>; + if constexpr (isSimdFor) { + static_assert(!isSimdFor || (isSimdFor && std::is_same_v), + "SimdFor pattern only supported in HostExecSpace"); SimdFor(std::make_index_sequence(), function, bound_arr); } else { dispatch_impl(pattern_tag, std::make_index_sequence(), @@ -572,7 +583,7 @@ struct par_dispatch_impl, TypeList(exec_space, 0, idxer.size()), KOKKOS_LAMBDA(const int idx, ExtraFuncArgs... 
fargs) { @@ -602,7 +613,7 @@ struct par_dispatch_impl, TypeList bound_arr, Function function, Args &&...args, const int scratch_level, const std::size_t scratch_size_in_bytes) { - auto idxer = + const auto idxer = MakeIndexer(Kokkos::Array{bound_arr[OuterIs]...}); kokkos_dispatch( Tag(), name, @@ -622,7 +633,7 @@ struct par_dispatch_impl, TypeList bound_arr, Function function, Args &&...args, const int scratch_level, const std::size_t scratch_size_in_bytes) { - auto idxer = + const auto idxer = MakeIndexer(Kokkos::Array{bound_arr[OuterIs]...}); using TeamPattern = typename dispatch_type::TeamPattern; constexpr std::size_t Nvector = TeamPattern::Nvector; @@ -662,6 +673,30 @@ inline void par_dispatch(const std::string &name, Args &&...args) { std::forward(args)...); } +template +struct seq_for_impl {}; + +template +struct seq_for_impl> { + + KOKKOS_INLINE_FUNCTION void execute(Bounds &&...bounds, Function function) { + using bound_translator = BoundTranslator; + constexpr std::size_t Rank = bound_translator::rank; + const auto bound_arr = + bound_translator().GetIndexRanges(std::forward(bounds)...); + SimdFor(std::make_index_sequence(), function, bound_arr); + } +}; + +template +KOKKOS_INLINE_FUNCTION void seq_for(Args &&...args) { + using dispatchsig = DispatchSignature>; + using Function = typename dispatchsig::Function; + using LaunchBounds = typename dispatchsig::LaunchBounds; + + seq_for_impl().execute(std::forward(args)...); +} + template inline void par_for(Args &&...args) { par_dispatch(std::forward(args)...); diff --git a/src/solvers/solver_utils.hpp b/src/solvers/solver_utils.hpp index 871462d11f31..daea31177396 100644 --- a/src/solvers/solver_utils.hpp +++ b/src/solvers/solver_utils.hpp @@ -247,9 +247,8 @@ TaskStatus SetToZero(const std::shared_ptr> &md) { IndexRange kb = cb.GetBoundsK(IndexDomain::interior, te); const int nvars = pack.GetUpperBound(b, var()) - pack.GetLowerBound(b, var()) + 1; for (int c = 0; c < nvars; ++c) { - parthenon::par_for_inner( 
- parthenon::inner_loop_pattern_simdfor_tag, member, kb.s, kb.e, jb.s, jb.e, - ib.s, ib.e, + parthenon::seq_for( + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, [&](int k, int j, int i) { pack(b, te, var(c), k, j, i) = 0.0; }); } }); diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index b1f8dae8a521..f9b45c845bb2 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -272,7 +272,6 @@ KOKKOS_FORCEINLINE_FUNCTION auto MakeIndexer(TypeList, Kokkos::Array bounds_arr, std::integer_sequence) { return Indexer(bounds_arr[Is]...); - /* return MakeIndexer(std::pair(bounds_arr[Is].s, bounds_arr[Is].e)...); */ } template @@ -281,21 +280,5 @@ KOKKOS_FORCEINLINE_FUNCTION auto MakeIndexer(Kokkos::Array bou std::make_index_sequence()); } -namespace impl { -template -KOKKOS_FORCEINLINE_FUNCTION auto MakeIndexerIntImpl(std::array args, - std::index_sequence) { - return MakeIndexer(std::pair(args[2 * Is], args[2 * Is + 1])...); -} -} // namespace impl - -template -KOKKOS_FORCEINLINE_FUNCTION auto MakeIndexerInt(Ts &&...args) { - static_assert(sizeof...(Ts) % 2 == 0, - "Must have an upper and lower end to each index range."); - return impl::MakeIndexerIntImpl(std::array{args...}, - std::make_index_sequence()); -} - } // namespace parthenon #endif // UTILS_INDEXER_HPP_ From bd9df9c6e4bb0e62965ea755b0dfb26ace3ee64a Mon Sep 17 00:00:00 2001 From: adam reyes Date: Tue, 27 Aug 2024 10:47:12 +0200 Subject: [PATCH 63/99] linting --- src/kokkos_abstraction.hpp | 4 +--- src/utils/indexer.hpp | 5 +++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index e18221c9153d..376a66b904ca 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -29,8 +29,8 @@ #include #include +#include -#include "Kokkos_Macros.hpp" #include "basic_types.hpp" #include "config.hpp" #include "impl/Kokkos_Tools_Generic.hpp" @@ -321,7 +321,6 @@ struct DispatchSignature> { template struct DispatchType { - static constexpr 
std::size_t Rank = GetNumBounds(TypeList()) / 2; using TeamPattern = @@ -678,7 +677,6 @@ struct seq_for_impl {}; template struct seq_for_impl> { - KOKKOS_INLINE_FUNCTION void execute(Bounds &&...bounds, Function function) { using bound_translator = BoundTranslator; constexpr std::size_t Rank = bound_translator::rank; diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index f9b45c845bb2..b49e7df0fcb6 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -19,10 +19,11 @@ #include #include +#include + #include "utils/concepts_lite.hpp" #include "utils/type_list.hpp" #include "utils/utils.hpp" -#include namespace parthenon { @@ -209,7 +210,7 @@ struct IndexRanger { IndexRanger() : N{}, _size{} {}; KOKKOS_INLINE_FUNCTION - explicit IndexRanger(Ts... IdrsA){}; + explicit IndexRanger(Ts... IdrsA) {} Kokkos::Array N; std::size_t _size; From ebae3cd8531fbb7792d99489fd69b1ac360587a6 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Tue, 27 Aug 2024 13:31:16 +0200 Subject: [PATCH 64/99] cleaning up & moving loop bounds to their own header --- src/kokkos_abstraction.hpp | 112 +++++++++++-------------------- src/loop_bounds.hpp | 131 +++++++++++++++++++++++++++++++++++++ src/utils/type_list.hpp | 104 ----------------------------- src/utils/utils.hpp | 1 - 4 files changed, 170 insertions(+), 178 deletions(-) create mode 100644 src/loop_bounds.hpp diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 376a66b904ca..51aa8beb03a3 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -20,13 +20,9 @@ #ifndef KOKKOS_ABSTRACTION_HPP_ #define KOKKOS_ABSTRACTION_HPP_ -#include -#include -#include #include #include #include -#include #include #include @@ -34,9 +30,9 @@ #include "basic_types.hpp" #include "config.hpp" #include "impl/Kokkos_Tools_Generic.hpp" +#include "loop_bounds.hpp" #include "parthenon_array_generic.hpp" #include "utils/concepts_lite.hpp" -#include "utils/error_checking.hpp" #include "utils/indexer.hpp" 
#include "utils/instrument.hpp" #include "utils/multi_pointer.hpp" @@ -312,16 +308,17 @@ struct DispatchSignature> { "Couldn't determine functor index from dispatch args"); public: - using LaunchBounds = typename TL::template continuous_sublist<0, func_idx - 1>; - static constexpr std::size_t rank = GetNumBounds(LaunchBounds()) / 2; - using Rank = std::integral_constant; + using LoopBounds = typename TL::template continuous_sublist<0, func_idx - 1>; + using Translator = LoopBoundTranslator; + static constexpr std::size_t Rank = Translator::Rank; using Function = typename TL::template type; using Args = typename TL::template continuous_sublist; }; template struct DispatchType { - static constexpr std::size_t Rank = GetNumBounds(TypeList()) / 2; + using Translator = LoopBoundTranslator; + static constexpr std::size_t Rank = Translator::Rank; using TeamPattern = LoopPatternTeam::value; constexpr bool IsSimdFor = std::is_same::value; - if constexpr (is_ParScan) return LoopPatternFlatRange(); - if constexpr (IsFlatRange) return LoopPatternFlatRange(); - if constexpr (IsSimdFor) { + if constexpr (is_ParScan) { + return LoopPatternFlatRange(); + } else if constexpr (IsFlatRange) { + return LoopPatternFlatRange(); + } else if constexpr (IsSimdFor) { return std::conditional_t(); - } - if constexpr (IsMDRange) return LoopPatternMDRange(); - if constexpr (TeamPattern::value) { + } else if constexpr (IsMDRange) { + return LoopPatternMDRange(); + } else if constexpr (TeamPattern::value) { if constexpr (std::is_same_v) { return OuterLoopPatternTeams(); } else { return LoopPatternTeamCollapse(); } + } else { + return LoopPatternUndefined(); } } }; -// Struct for translating between loop bounds given in terms of IndexRanges and loop -// bounds given in terms of raw integers -template -struct BoundTranslator { - private: - // overloads for different launch bound types. 
- // should also be counted by isBoundType & GetNumBounds in type_list.hpp - template - KOKKOS_INLINE_FUNCTION void GetIndexRanges_impl(const int idx, const int s, const int e, - Bounds &&...bounds) { - bound_arr[idx].s = s; - bound_arr[idx].e = e; - if constexpr (sizeof...(Bounds) > 0) { - GetIndexRanges_impl(idx + 1, std::forward(bounds)...); - } - } - template - KOKKOS_INLINE_FUNCTION void GetIndexRanges_impl(const int idx, const IndexRange ir, - Bounds &&...bounds) { - bound_arr[idx] = ir; - if constexpr (sizeof...(Bounds) > 0) { - GetIndexRanges_impl(idx + 1, std::forward(bounds)...); - } - } - - public: - using Bound_tl = TypeList; - static constexpr std::size_t rank = GetNumBounds(Bound_tl()) / 2; - Kokkos::Array bound_arr; - - KOKKOS_INLINE_FUNCTION - Kokkos::Array GetIndexRanges(Bound_ts &&...bounds) { - GetIndexRanges_impl(0, std::forward(bounds)...); - return bound_arr; - } -}; - -template -struct BoundTranslator> : public BoundTranslator {}; - template struct dispatch_collapse { @@ -499,8 +460,8 @@ template struct par_disp_inner_impl, TypeList, TypeList> { - using bound_translator = BoundTranslator; - static constexpr std::size_t Rank = bound_translator::rank; + using bound_translator = LoopBoundTranslator; + static constexpr std::size_t Rank = bound_translator::Rank; using TeamPattern = LoopPatternTeam>; template using sequence = std::make_index_sequence; @@ -530,12 +491,12 @@ template KOKKOS_FORCEINLINE_FUNCTION void par_disp_inner(Pattern, team_mbr_t team_member, AllArgs &&...args) { using dispatchsig = DispatchSignature>; - constexpr std::size_t Rank = dispatchsig::Rank::value; + constexpr std::size_t Rank = dispatchsig::Rank; using Function = typename dispatchsig::Function; - using LaunchBounds = typename dispatchsig::LaunchBounds; + using LoopBounds = typename dispatchsig::LoopBounds; using Args = typename dispatchsig::Args; using ExtraFuncArgs = typename function_signature::FArgs; - par_disp_inner_impl().execute( + par_disp_inner_impl().execute( 
team_member, std::forward(args)...); } @@ -547,8 +508,8 @@ template , TypeList, TypeList> { using dispatch_type = DispatchType; - using bound_translator = BoundTranslator; - static constexpr std::size_t Rank = bound_translator::rank; + using bound_translator = LoopBoundTranslator; + static constexpr std::size_t Rank = bound_translator::Rank; template inline void dispatch(std::string name, ExecSpace exec_space, Bounds &&...bounds, @@ -556,10 +517,15 @@ struct par_dispatch_impl, TypeList(bounds)...); constexpr auto pattern_tag = dispatch_type::GetPatternTag(); + static_assert( + !std::is_same_v && + !always_false, + "Loop pattern & tag combination not recognized in DispatchType::GetPatternTag"); + constexpr bool isSimdFor = std::is_same_v>; + auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); if constexpr (isSimdFor) { static_assert(!isSimdFor || (isSimdFor && std::is_same_v), "SimdFor pattern only supported in HostExecSpace"); @@ -653,16 +619,16 @@ template inline void par_dispatch(Pattern, std::string name, DevExecSpace exec_space, AllArgs &&...args) { using dispatchsig = DispatchSignature>; - constexpr std::size_t Rank = dispatchsig::Rank::value; + constexpr std::size_t Rank = dispatchsig::Rank; using Function = typename dispatchsig::Function; - using LaunchBounds = typename dispatchsig::LaunchBounds; + using LoopBounds = typename dispatchsig::LoopBounds; using Args = typename dispatchsig::Args; using ExtraFuncArgs = typename function_signature::FArgs; if constexpr (Rank > 1 && std::is_same_v) { static_assert(always_false, "par_scan only for 1D loops"); } - par_dispatch_impl().dispatch( + par_dispatch_impl().dispatch( name, exec_space, std::forward(args)...); } @@ -678,8 +644,8 @@ struct seq_for_impl {}; template struct seq_for_impl> { KOKKOS_INLINE_FUNCTION void execute(Bounds &&...bounds, Function function) { - using bound_translator = BoundTranslator; - constexpr std::size_t Rank = bound_translator::rank; + using bound_translator = 
LoopBoundTranslator; + constexpr std::size_t Rank = bound_translator::Rank; const auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); SimdFor(std::make_index_sequence(), function, bound_arr); @@ -690,9 +656,9 @@ template KOKKOS_INLINE_FUNCTION void seq_for(Args &&...args) { using dispatchsig = DispatchSignature>; using Function = typename dispatchsig::Function; - using LaunchBounds = typename dispatchsig::LaunchBounds; + using LoopBounds = typename dispatchsig::LoopBounds; - seq_for_impl().execute(std::forward(args)...); + seq_for_impl().execute(std::forward(args)...); } template @@ -716,14 +682,14 @@ par_for_outer(Pattern, const std::string &name, DevExecSpace exec_space, std::size_t scratch_size_in_bytes, const int scratch_level, AllArgs &&...args) { using dispatchsig = DispatchSignature>; - static constexpr std::size_t Rank = dispatchsig::Rank::value; + static constexpr std::size_t Rank = dispatchsig::Rank; using Function = typename dispatchsig::Function; - using LaunchBounds = typename dispatchsig::LaunchBounds; + using LoopBounds = typename dispatchsig::LoopBounds; using Args = typename dispatchsig::Args; using Tag = dispatch_impl::ParallelForDispatch; using ExtraFuncArgs = typename function_signature::FArgs; - par_dispatch_impl().dispatch( + par_dispatch_impl().dispatch( name, exec_space, std::forward(args)..., scratch_level, scratch_size_in_bytes); } diff --git a/src/loop_bounds.hpp b/src/loop_bounds.hpp new file mode 100644 index 000000000000..db03ab20cde7 --- /dev/null +++ b/src/loop_bounds.hpp @@ -0,0 +1,131 @@ +//======================================================================================== +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 for Los +// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +// for the U.S. 
Department of Energy/National Nuclear Security Administration. All rights +// in the program are reserved by Triad National Security, LLC, and the U.S. Department +// of Energy/National Nuclear Security Administration. The Government is granted for +// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +// license in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do so. +//======================================================================================== + +#ifndef LOOP_BOUNDS_HPP_ +#define LOOP_BOUNDS_HPP_ + +#include + +#include + +#include "basic_types.hpp" +#include "utils/concepts_lite.hpp" +#include "utils/type_list.hpp" + +namespace parthenon { + +// Struct for translating between loop bounds given to par_dispatch into an array of +// IndexRanges +// +template +struct LoopBoundTranslator { + private: + using BoundTypes = TypeList; + // overloads for different launch bound types. 
+ template + KOKKOS_INLINE_FUNCTION void GetIndexRanges_impl(const int idx, const int s, const int e, + Bounds &&...bounds) { + bound_arr[idx].s = s; + bound_arr[idx].e = e; + if constexpr (sizeof...(Bounds) > 0) { + GetIndexRanges_impl(idx + 1, std::forward(bounds)...); + } + } + template + KOKKOS_INLINE_FUNCTION void GetIndexRanges_impl(const int idx, const IndexRange ir, + Bounds &&...bounds) { + bound_arr[idx] = ir; + if constexpr (sizeof...(Bounds) > 0) { + GetIndexRanges_impl(idx + 1, std::forward(bounds)...); + } + } + + using Bound_tl = TypeList; + + public: + template + static constexpr bool isBoundType() { + using btype = base_type; + return std::is_same_v || std::is_integral_v; + } + + template + static constexpr std::size_t GetNumBounds(TypeList) { + using TL = TypeList; + if constexpr (sizeof...(Bnds) == 0) { + return 0; + } else { + using Bnd0 = typename TL::template type<0>; + static_assert(isBoundType(), "unrecognized launch bound in par_dispatch"); + if constexpr (std::is_same_v, IndexRange>) { + return 2 + GetNumBounds(typename TL::template continuous_sublist<1>()); + } else if constexpr (std::is_integral_v>) { + using Bnd1 = typename TL::template type<1>; + static_assert(std::is_integral_v>, + "integer launch bounds need to come in (start, end) pairs"); + return 2 + GetNumBounds(typename TL::template continuous_sublist<2>()); + } + } + } + static constexpr std::size_t Rank = GetNumBounds(Bound_tl()) / 2; + Kokkos::Array bound_arr; + + KOKKOS_INLINE_FUNCTION + Kokkos::Array GetIndexRanges(Bound_ts &&...bounds) { + GetIndexRanges_impl(0, std::forward(bounds)...); + return bound_arr; + } +}; + +template +struct LoopBoundTranslator> + : public LoopBoundTranslator {}; + +template +struct is_functor : std::false_type {}; + +template +struct is_functor> : std::true_type {}; + +template +constexpr int FirstFuncIdx() { + if constexpr (idx == TL::n_types) { + return TL::n_types; + } else { + using cur_type = base_type>; + if constexpr (is_functor::value) 
return idx; + if constexpr (std::is_function>::value) return idx; + return FirstFuncIdx(); + } +} + +template +struct FunctionSignature {}; + +template +struct FunctionSignature { + private: + using team_mbr_t = Kokkos::TeamPolicy<>::member_type; + static constexpr bool team_mbr = std::is_same_v>; + using TL = TypeList; + + public: + using FArgs = typename TL::template continuous_sublist; +}; + +template +using function_signature = FunctionSignature::operator())>; + +} // namespace parthenon + +#endif // LOOP_BOUNDS_HPP_ diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp index 4f856baf037e..57d6a5b13482 100644 --- a/src/utils/type_list.hpp +++ b/src/utils/type_list.hpp @@ -18,12 +18,8 @@ #include #include #include -#include #include -#include "basic_types.hpp" -#include "concepts_lite.hpp" - namespace parthenon { // c++-20 has std:remove_cvref_t that does this same thing @@ -130,18 +126,6 @@ auto GetNames() { } namespace impl { -template -static constexpr int FirstNonIntegralImpl() { - if constexpr (cidx == TL::n_types) { - return TL::n_types; - } else { - if constexpr (std::is_integral_v>::type>) - return FirstNonIntegralImpl(); - return cidx; - } -} - template auto ListOfType() { if constexpr (N == 1) { @@ -154,94 +138,6 @@ auto ListOfType() { template using list_of_type_t = decltype(impl::ListOfType()); - -template -struct FuncSignature; - -template -struct FuncSignature : public FuncSignature {}; - -template -struct FuncSignature { - using type = R(Args...); - using arg_types_tl = TypeList; - using ret_type = R; -}; - -template -struct FuncSignature { - using type = R (T::*)(Args...); - using arg_types_tl = TypeList; - using ret_type = R; -}; - -template -static constexpr int FirstNonIntegralIdx() { - return impl::FirstNonIntegralImpl(); -} -template -struct is_functor : std::false_type {}; - -template -struct is_functor> : std::true_type {}; - -template -constexpr int FirstFuncIdx() { - if constexpr (idx == TL::n_types) { - return TL::n_types; - } 
else { - using cur_type = base_type>; - if constexpr (is_functor::value) return idx; - if constexpr (std::is_function>::value) return idx; - return FirstFuncIdx(); - } -} - -// Recognized bound types -// additional types should be translated in BoundTranslator (kokkos_abstraction.hpp) -template -constexpr bool isBoundType() { - using BoundTypes = TypeList; - using btype = base_type; - return std::is_same_v || std::is_integral_v; -} - -template -constexpr std::size_t GetNumBounds(TypeList) { - using TL = TypeList; - if constexpr (sizeof...(Bnds) == 0) { - return 0; - } else { - using Bnd0 = typename TL::template type<0>; - static_assert(isBoundType(), "unrecognized launch bound in par_dispatch"); - if constexpr (std::is_same_v, IndexRange>) { - return 2 + GetNumBounds(typename TL::template continuous_sublist<1>()); - } else if constexpr (std::is_integral_v>) { - using Bnd1 = typename TL::template type<1>; - static_assert(std::is_integral_v>, - "integer launch bounds need to come in (start, end) pairs"); - return 2 + GetNumBounds(typename TL::template continuous_sublist<2>()); - } - } -} - -template -struct FunctionSignature {}; - -template -struct FunctionSignature { - private: - using team_mbr_t = Kokkos::TeamPolicy<>::member_type; - static constexpr bool team_mbr = std::is_same_v>; - using TL = TypeList; - - public: - using IndexND = typename TL::template continuous_sublist<0, Rank + team_mbr - 1>; - using FArgs = typename TL::template continuous_sublist; -}; - -template -using function_signature = FunctionSignature::operator())>; } // namespace parthenon #endif // UTILS_TYPE_LIST_HPP_ diff --git a/src/utils/utils.hpp b/src/utils/utils.hpp index 33c1aac276a4..c519f112acad 100644 --- a/src/utils/utils.hpp +++ b/src/utils/utils.hpp @@ -31,7 +31,6 @@ #include "constants.hpp" #include "error_checking.hpp" -#include "kokkos_abstraction.hpp" namespace parthenon { From 48664ac63ba2aaa41559793c35b461e5b05cae56 Mon Sep 17 00:00:00 2001 From: acreyes Date: Wed, 28 Aug 2024 
11:57:48 +0000 Subject: [PATCH 65/99] fix unit tests for cuda 11.4 --- tst/unit/kokkos_abstraction.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index a2aab35868e0..d222adae174d 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -189,22 +189,16 @@ struct test_wrapper_nd_impl { template void execute(DevExecSpace exec_space, view_t &dev, int *int_bounds, parthenon::IndexRange *bounds) { + const auto functor = KOKKOS_CLASS_LAMBDA(Ts... args) { + dev(std::forward(args)...) += + increment_data(std::forward(args)...); + }; if constexpr (bound_type == lbounds::integer) { - parthenon::par_for( - Pattern(), "unit test ND integer bounds", exec_space, int_bounds[Ids]..., - - KOKKOS_CLASS_LAMBDA(Ts... args) { - dev(std::forward(args)...) += - increment_data(std::forward(args)...); - }); + parthenon::par_for(Pattern(), "unit test ND integer bounds", exec_space, + int_bounds[Ids]..., functor); } else { - parthenon::par_for( - Pattern(), "unit test ND IndexRange bounds", exec_space, bounds[Ids]..., - - KOKKOS_CLASS_LAMBDA(Ts... args) { - dev(std::forward(args)...) 
+= - increment_data(std::forward(args)...); - }); + parthenon::par_for(Pattern(), "unit test ND IndexRange bounds", exec_space, + bounds[Ids]..., functor); } } }; From 4887cfc6769c9cb862cd2b5c030b7d8eb823500a Mon Sep 17 00:00:00 2001 From: acreyes Date: Wed, 28 Aug 2024 12:09:06 +0000 Subject: [PATCH 66/99] fixing missing return warnings from if constexpr blocks --- src/kokkos_abstraction.hpp | 57 +++++++------- src/loop_bounds.hpp | 4 + src/utils/type_list.hpp | 21 +++--- tst/unit/kokkos_abstraction.cpp | 128 +++++++++++++++++++++++--------- 4 files changed, 139 insertions(+), 71 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 51aa8beb03a3..54deed040633 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -315,6 +315,11 @@ struct DispatchSignature> { using Args = typename TL::template continuous_sublist; }; +enum class LoopPattern { flat, md, simd, outer, collapse, undef }; + +template +struct LoopPatternTag {}; + template struct DispatchType { using Translator = LoopBoundTranslator; @@ -324,36 +329,35 @@ struct DispatchType { LoopPatternTeam>; // false_type unless we use // an outer team policy + static constexpr bool is_ParFor = + std::is_same::value; + static constexpr bool is_ParScan = + std::is_same::value; + + static constexpr bool IsFlatRange = std::is_same::value; + static constexpr bool IsMDRange = std::is_same::value; + static constexpr bool IsSimdFor = std::is_same::value; // check any confilcts with the requested pattern // and return the actual one we use - static constexpr auto GetPatternTag() { - constexpr bool is_ParFor = - std::is_same::value; - constexpr bool is_ParScan = - std::is_same::value; - - constexpr bool IsFlatRange = std::is_same::value; - constexpr bool IsMDRange = std::is_same::value; - constexpr bool IsSimdFor = std::is_same::value; + static constexpr LoopPattern GetPatternTag() { + using LP = LoopPattern; if constexpr (is_ParScan) { - return LoopPatternFlatRange(); + 
return LP::flat; } else if constexpr (IsFlatRange) { - return LoopPatternFlatRange(); + return LP::flat; } else if constexpr (IsSimdFor) { - return std::conditional_t(); + return is_ParFor ? LP::simd : LP::flat; } else if constexpr (IsMDRange) { - return LoopPatternMDRange(); + return LP::md; + } else if constexpr (std::is_same_v) { + return LP::outer; } else if constexpr (TeamPattern::value) { - if constexpr (std::is_same_v) { - return OuterLoopPatternTeams(); - } else { - return LoopPatternTeamCollapse(); - } - } else { - return LoopPatternUndefined(); + return LP::collapse; } + + return LP::undef; } }; @@ -507,6 +511,7 @@ template struct par_dispatch_impl, TypeList, TypeList> { + using LP = LoopPattern; using dispatch_type = DispatchType; using bound_translator = LoopBoundTranslator; static constexpr std::size_t Rank = bound_translator::Rank; @@ -517,9 +522,9 @@ struct par_dispatch_impl, TypeList(); static_assert( - !std::is_same_v && + !std::is_same_v> && !always_false, "Loop pattern & tag combination not recognized in DispatchType::GetPatternTag"); @@ -542,7 +547,7 @@ struct par_dispatch_impl, TypeList; template - inline void dispatch_impl(LoopPatternFlatRange, sequence, + inline void dispatch_impl(LoopPatternTag, sequence, sequence, std::string name, ExecSpace exec_space, Kokkos::Array bound_arr, Function function, Args &&...args, const int scratch_level, @@ -559,7 +564,7 @@ struct par_dispatch_impl, TypeList - inline void dispatch_impl(LoopPatternMDRange, sequence, + inline void dispatch_impl(LoopPatternTag, sequence, sequence, std::string name, ExecSpace exec_space, Kokkos::Array bound_arr, Function function, Args &&...args, const int scratch_level, @@ -573,7 +578,7 @@ struct par_dispatch_impl, TypeList - inline void dispatch_impl(OuterLoopPatternTeams, sequence, + inline void dispatch_impl(LoopPatternTag, sequence, sequence, std::string name, ExecSpace exec_space, Kokkos::Array bound_arr, Function function, Args &&...args, const int scratch_level, @@ 
-593,7 +598,7 @@ struct par_dispatch_impl, TypeList - inline void dispatch_impl(LoopPatternTeamCollapse, sequence, + inline void dispatch_impl(LoopPatternTag, sequence, sequence, std::string name, ExecSpace exec_space, Kokkos::Array bound_arr, Function function, Args &&...args, const int scratch_level, diff --git a/src/loop_bounds.hpp b/src/loop_bounds.hpp index db03ab20cde7..e3bcbb24034e 100644 --- a/src/loop_bounds.hpp +++ b/src/loop_bounds.hpp @@ -76,6 +76,8 @@ struct LoopBoundTranslator { return 2 + GetNumBounds(typename TL::template continuous_sublist<2>()); } } + // should never get here but makes older cuda compilers happy + return 0; } static constexpr std::size_t Rank = GetNumBounds(Bound_tl()) / 2; Kokkos::Array bound_arr; @@ -107,6 +109,8 @@ constexpr int FirstFuncIdx() { if constexpr (std::is_function>::value) return idx; return FirstFuncIdx(); } + // should never get here, but makes older cuda versions happy + return TL::n_types; } template diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp index 57d6a5b13482..0b404b4208ab 100644 --- a/src/utils/type_list.hpp +++ b/src/utils/type_list.hpp @@ -126,18 +126,21 @@ auto GetNames() { } namespace impl { -template -auto ListOfType() { - if constexpr (N == 1) { - return TypeList(); - } else { - return concatenate_type_lists_t, decltype(ListOfType())>(); - } -} +template +struct ListOfType { + using Nm1 = std::integral_constant; + using type = concatenate_type_lists_t, typename ListOfType::type>; +}; + +template +struct ListOfType, T> { + using type = TypeList; +}; } // namespace impl template -using list_of_type_t = decltype(impl::ListOfType()); +using list_of_type_t = + typename impl::ListOfType, T>::type; } // namespace parthenon #endif // UTILS_TYPE_LIST_HPP_ diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index d222adae174d..2a84057a85b8 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -39,49 +39,105 @@ using 
parthenon::ParArray3D; using parthenon::ParArray4D; using Real = double; +template +struct ParArrayND_impl {}; +template <> +struct ParArrayND_impl<0> { + template + using type = parthenon::ParArray0D; +}; +template <> +struct ParArrayND_impl<1> { + template + using type = parthenon::ParArray1D; +}; +template <> +struct ParArrayND_impl<2> { + template + using type = parthenon::ParArray2D; +}; +template <> +struct ParArrayND_impl<3> { + template + using type = parthenon::ParArray3D; +}; +template <> +struct ParArrayND_impl<4> { + template + using type = parthenon::ParArray4D; +}; +template <> +struct ParArrayND_impl<5> { + template + using type = parthenon::ParArray5D; +}; +template <> +struct ParArrayND_impl<6> { + template + using type = parthenon::ParArray6D; +}; +template <> +struct ParArrayND_impl<7> { + template + using type = parthenon::ParArray7D; +}; +template <> +struct ParArrayND_impl<8> { + template + using type = parthenon::ParArray8D; +}; +template +struct HostArrayND_impl {}; +template <> +struct HostArrayND_impl<0> { + template + using type = parthenon::HostArray0D; +}; +template <> +struct HostArrayND_impl<1> { + template + using type = parthenon::HostArray1D; +}; +template <> +struct HostArrayND_impl<2> { + template + using type = parthenon::HostArray2D; +}; +template <> +struct HostArrayND_impl<3> { + template + using type = parthenon::HostArray3D; +}; +template <> +struct HostArrayND_impl<4> { + template + using type = parthenon::HostArray4D; +}; +template <> +struct HostArrayND_impl<5> { + template + using type = parthenon::HostArray5D; +}; +template <> +struct HostArrayND_impl<6> { + template + using type = parthenon::HostArray6D; +}; +template <> +struct HostArrayND_impl<7> { + template + using type = parthenon::HostArray7D; +}; + template auto ParArrayND(Args &&...args) { static_assert(ND <= 8, "ParArrayND supoorted up to ND=8"); - if constexpr (ND == 0) { - return parthenon::ParArray0D(std::forward(args)...); - } else if constexpr (ND == 1) 
{ - return parthenon::ParArray1D(std::forward(args)...); - } else if constexpr (ND == 2) { - return parthenon::ParArray2D(std::forward(args)...); - } else if constexpr (ND == 3) { - return parthenon::ParArray3D(std::forward(args)...); - } else if constexpr (ND == 4) { - return parthenon::ParArray4D(std::forward(args)...); - } else if constexpr (ND == 5) { - return parthenon::ParArray5D(std::forward(args)...); - } else if constexpr (ND == 6) { - return parthenon::ParArray6D(std::forward(args)...); - } else if constexpr (ND == 7) { - return parthenon::ParArray7D(std::forward(args)...); - } else if constexpr (ND == 8) { - return parthenon::ParArray8D(std::forward(args)...); - } + return typename ParArrayND_impl::template type(std::forward(args)...); } template auto HostArrayND(Args &&...args) { static_assert(ND <= 7, "HostArrayND supoorted up to ND=7"); - if constexpr (ND == 0) { - return parthenon::HostArray0D(std::forward(args)...); - } else if constexpr (ND == 1) { - return parthenon::HostArray1D(std::forward(args)...); - } else if constexpr (ND == 2) { - return parthenon::HostArray2D(std::forward(args)...); - } else if constexpr (ND == 3) { - return parthenon::HostArray3D(std::forward(args)...); - } else if constexpr (ND == 4) { - return parthenon::HostArray4D(std::forward(args)...); - } else if constexpr (ND == 5) { - return parthenon::HostArray5D(std::forward(args)...); - } else if constexpr (ND == 6) { - return parthenon::HostArray6D(std::forward(args)...); - } else if constexpr (ND == 7) { - return parthenon::HostArray7D(std::forward(args)...); - } + return typename HostArrayND_impl::template type(std::forward(args)...); } template From 842b94d67e7387e42dac87e0f19ef3b3617c9759 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 5 Sep 2024 21:50:10 +0200 Subject: [PATCH 67/99] fix simdfor pattern check --- src/kokkos_abstraction.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp 
index 54deed040633..24ac30adece7 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -528,8 +528,8 @@ struct par_dispatch_impl, TypeList, "Loop pattern & tag combination not recognized in DispatchType::GetPatternTag"); - constexpr bool isSimdFor = - std::is_same_v>; + constexpr bool isSimdFor = std::is_same_v, + base_type>; auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); if constexpr (isSimdFor) { static_assert(!isSimdFor || (isSimdFor && std::is_same_v), From beb68472aab74ed4d4708a3904c1b8c148dbb465 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 5 Sep 2024 23:01:36 +0200 Subject: [PATCH 68/99] have Indexer only return Kokkos::Array --- src/bvals/neighbor_block.cpp | 2 +- src/bvals/neighbor_block.hpp | 4 +-- src/kokkos_abstraction.hpp | 14 +++++----- src/mesh/forest/logical_location.cpp | 4 +-- src/mesh/forest/logical_location.hpp | 2 +- src/mesh/mesh-gmg.cpp | 12 ++++----- src/utils/cell_center_offsets.hpp | 4 +-- src/utils/indexer.hpp | 38 +--------------------------- 8 files changed, 22 insertions(+), 58 deletions(-) diff --git a/src/bvals/neighbor_block.cpp b/src/bvals/neighbor_block.cpp index da9a730ea078..2ecfe3bd51e1 100644 --- a/src/bvals/neighbor_block.cpp +++ b/src/bvals/neighbor_block.cpp @@ -42,7 +42,7 @@ NeighborBlock::NeighborBlock() offsets(0, 0, 0), ownership(true) {} NeighborBlock::NeighborBlock(Mesh *mesh, LogicalLocation loc, LogicalLocation origin_loc, - int rank, int gid, std::array offsets_in, int bid, + int rank, int gid, Kokkos::Array offsets_in, int bid, int target_id, int fi1, int fi2) : rank{rank}, gid{gid}, bufid{bid}, targetid{target_id}, loc{loc}, origin_loc{origin_loc}, fi1{fi1}, fi2{fi2}, block_size(mesh->GetBlockSize(loc)), diff --git a/src/bvals/neighbor_block.hpp b/src/bvals/neighbor_block.hpp index 78f954b04386..5dfc235ff3a5 100644 --- a/src/bvals/neighbor_block.hpp +++ b/src/bvals/neighbor_block.hpp @@ -69,8 +69,8 @@ struct NeighborBlock { NeighborBlock(); 
NeighborBlock(Mesh *mesh, LogicalLocation loc, LogicalLocation origin_loc, int rank, - int gid, std::array offsets, int bid, int target_id, int ifi1, - int ifi2); + int gid, Kokkos::Array offsets_in, int bid, int target_id, + int fi1, int fi2); }; //---------------------------------------------------------------------------------------- diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 24ac30adece7..15cffacc3df4 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -379,14 +379,14 @@ struct dispatch_collapse { std::integer_sequence, std::integer_sequence, team_mbr_t team_member, Args &&...args) const { - auto inds_team = idxer_team.GetIdxKArray(team_member.league_rank()); + auto inds_team = idxer_team.GetIdxArray(team_member.league_rank()); if constexpr (Nthread > 0) { const auto idxer_thread = MakeIndexer(Kokkos::Array{bound_arr[ThreadIs]...}); Kokkos::parallel_for( Kokkos::TeamThreadRange<>(team_member, 0, idxer_thread.size()), [&](const int idThread, ExtraFuncArgs... fargs) { - const auto inds_thread = idxer_thread.GetIdxKArray(idThread); + const auto inds_thread = idxer_thread.GetIdxArray(idThread); if constexpr (Nvector > 0) { static_assert(Nvector * Nthread == 0 || sizeof...(Args) == 0, "thread + vector range pattern only supported for par_for "); @@ -395,7 +395,7 @@ struct dispatch_collapse { Kokkos::parallel_for( Kokkos::ThreadVectorRange(team_member, 0, idxer_vector.size()), [&](const int idVector) { - const auto inds_vector = idxer_vector.GetIdxKArray(idVector); + const auto inds_vector = idxer_vector.GetIdxArray(idVector); function(inds_team[TeamIs]..., inds_thread[ThreadIs]..., inds_vector[VectorIs]..., std::forward(fargs)...); @@ -412,7 +412,7 @@ struct dispatch_collapse { Kokkos::parallel_for( Kokkos::TeamVectorRange(team_member, 0, idxer_vector.size()), [&](const int idVector, ExtraFuncArgs... 
fargs) { - const auto inds_vector = idxer_vector.GetIdxKArray(idVector); + const auto inds_vector = idxer_vector.GetIdxArray(idVector); function(inds_team[TeamIs]..., inds_vector[VectorIs]..., std::forward(fargs)...); }, @@ -448,7 +448,7 @@ KOKKOS_INLINE_FUNCTION void SimdFor(std::index_sequence, Function fu const auto idxer = MakeIndexer(std::pair(bounds[OuterIs].s, bounds[OuterIs].e)...); for (int idx = 0; idx < idxer.size(); idx++) { - const auto indices = idxer.GetIdxKArray(idx); + const auto indices = idxer.GetIdxArray(idx); #pragma omp simd for (int i = bounds[Rank - 1].s; i <= bounds[Rank - 1].e; i++) { function(indices[OuterIs]..., i); @@ -557,7 +557,7 @@ struct par_dispatch_impl, TypeList(exec_space, 0, idxer.size()), KOKKOS_LAMBDA(const int idx, ExtraFuncArgs... fargs) { - const auto idx_arr = idxer.GetIdxKArray(idx); + const auto idx_arr = idxer.GetIdxArray(idx); function(idx_arr[OuterIs]..., std::forward(fargs)...); }, std::forward(args)...); @@ -590,7 +590,7 @@ struct par_dispatch_impl, TypeList(fargs)...); }, diff --git a/src/mesh/forest/logical_location.cpp b/src/mesh/forest/logical_location.cpp index 3d9c4e126e59..04d761bec61a 100644 --- a/src/mesh/forest/logical_location.cpp +++ b/src/mesh/forest/logical_location.cpp @@ -95,9 +95,9 @@ bool LogicalLocation::Contains(const LogicalLocation &containee) const { return (shifted_lx1 == lx1()) && (shifted_lx2 == lx2()) && (shifted_lx3 == lx3()); } -std::array +Kokkos::Array LogicalLocation::GetSameLevelOffsets(const LogicalLocation &neighbor) const { - std::array offsets; + Kokkos::Array offsets; const int level_shift_neigh = std::max(neighbor.level() - level(), 0); const int level_shift_me = std::max(level() - neighbor.level(), 0); for (int dir = 0; dir < 3; ++dir) { diff --git a/src/mesh/forest/logical_location.hpp b/src/mesh/forest/logical_location.hpp index 1c7a5d7e073b..a0dacf573a92 100644 --- a/src/mesh/forest/logical_location.hpp +++ b/src/mesh/forest/logical_location.hpp @@ -90,7 +90,7 @@ class 
LogicalLocation { // aggregate and POD type bool Contains(const LogicalLocation &containee) const; - std::array GetSameLevelOffsets(const LogicalLocation &neighbor) const; + Kokkos::Array GetSameLevelOffsets(const LogicalLocation &neighbor) const; // Being a neighbor implies that you share a face, edge, or node and don't share a // volume diff --git a/src/mesh/mesh-gmg.cpp b/src/mesh/mesh-gmg.cpp index 71784b7d5a01..0547a4204315 100644 --- a/src/mesh/mesh-gmg.cpp +++ b/src/mesh/mesh-gmg.cpp @@ -166,7 +166,7 @@ void Mesh::SetGMGNeighbors() { int leaf_gid = forest.GetLeafGid(ploc); pmb->gmg_coarser_neighbors.emplace_back( pmb->pmy_mesh, ploc, ploc, ranklist[leaf_gid], gid, - std::array{0, 0, 0}, 0, 0, 0, 0); + Kokkos::Array{0, 0, 0}, 0, 0, 0, 0); } } @@ -180,15 +180,15 @@ void Mesh::SetGMGNeighbors() { if (gid >= 0) { int leaf_gid = forest.GetLeafGid(d); pmb->gmg_finer_neighbors.emplace_back(pmb->pmy_mesh, d, d, ranklist[leaf_gid], - gid, std::array{0, 0, 0}, 0, 0, - 0, 0); + gid, Kokkos::Array{0, 0, 0}, 0, + 0, 0, 0); } } if (pmb->gmg_finer_neighbors.size() == 0) { // This is a leaf block, so add itself as a finer neighbor - pmb->gmg_leaf_neighbors.emplace_back(pmb->pmy_mesh, pmb->loc, pmb->loc, - Globals::my_rank, pmb->gid, - std::array{0, 0, 0}, 0, 0, 0, 0); + pmb->gmg_leaf_neighbors.emplace_back( + pmb->pmy_mesh, pmb->loc, pmb->loc, Globals::my_rank, pmb->gid, + Kokkos::Array{0, 0, 0}, 0, 0, 0, 0); } } diff --git a/src/utils/cell_center_offsets.hpp b/src/utils/cell_center_offsets.hpp index 1ef49d74855e..4bdbeafacd69 100644 --- a/src/utils/cell_center_offsets.hpp +++ b/src/utils/cell_center_offsets.hpp @@ -45,11 +45,11 @@ inline int operator+(int b, Offset a) { return static_cast(a) + b; } inline Offset operator-(Offset in) { return static_cast(-static_cast(in)); } struct CellCentOffsets { - std::array u; + Kokkos::Array u; CellCentOffsets() = default; - explicit CellCentOffsets(const std::array &in) + explicit CellCentOffsets(const Kokkos::Array &in) : 
u{static_cast(in[0]), static_cast(in[1]), static_cast(in[2])} {} diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index b49e7df0fcb6..8b2b03762286 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -103,19 +103,9 @@ struct Indexer { KOKKOS_FORCEINLINE_FUNCTION auto GetIdxArray(int idx) const { - return GetIndicesArrayImpl(idx, std::make_index_sequence()); - } - - KOKKOS_FORCEINLINE_FUNCTION - auto GetIdxKArray(int idx) const { return GetIndicesKArrayImpl(idx, std::make_index_sequence()); } - KOKKOS_FORCEINLINE_FUNCTION - void GetIdxCArray(int idx, int *indices) const { - GetIndicesCArrayImpl(idx, indices, std::make_index_sequence()); - } - template KOKKOS_FORCEINLINE_FUNCTION auto StartIdx() const { return start[I]; @@ -143,18 +133,6 @@ struct Indexer { return idxs; } - template - KOKKOS_FORCEINLINE_FUNCTION void - GetIndicesCArrayImpl(int idx, int *indices, std::index_sequence) const { - ( - [&] { - indices[Is] = idx / N[Is]; - idx -= indices[Is] * N[Is]; - indices[Is] += start[Is]; - }(), - ...); - } - template KOKKOS_FORCEINLINE_FUNCTION Kokkos::Array GetIndicesKArrayImpl(int idx, std::index_sequence) const { @@ -169,20 +147,6 @@ struct Indexer { return indices; } - template - KOKKOS_FORCEINLINE_FUNCTION std::array - GetIndicesArrayImpl(int idx, std::index_sequence) const { - std::array indices; - ( - [&] { - indices[Is] = idx / N[Is]; - idx -= indices[Is] * N[Is]; - indices[Is] += start[Is]; - }(), - ...); - return indices; - } - template KOKKOS_FORCEINLINE_FUNCTION static Kokkos::Array GetFactors(Kokkos::Array Nt, std::index_sequence) { @@ -220,7 +184,7 @@ template <> struct Indexer<> { // this is a dummy and shouldn't ever actually get used to index an array KOKKOS_FORCEINLINE_FUNCTION - Kokkos::Array GetIdxKArray(int idx) const { return {-1}; } + Kokkos::Array GetIdxArray(int idx) const { return {-1}; } }; template From f68ec09db065ef30086cf3d6aa4fa2d85a9e0167 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 5 Sep 2024 23:08:33 
+0200 Subject: [PATCH 69/99] move kokkos typedefs to their own header --- src/kokkos_abstraction.hpp | 109 +---------------------------- src/kokkos_types.hpp | 136 +++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 108 deletions(-) create mode 100644 src/kokkos_types.hpp diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 15cffacc3df4..416552cc7f10 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -30,122 +30,15 @@ #include "basic_types.hpp" #include "config.hpp" #include "impl/Kokkos_Tools_Generic.hpp" +#include "kokkos_types.hpp" #include "loop_bounds.hpp" -#include "parthenon_array_generic.hpp" #include "utils/concepts_lite.hpp" #include "utils/indexer.hpp" #include "utils/instrument.hpp" -#include "utils/multi_pointer.hpp" -#include "utils/object_pool.hpp" #include "utils/type_list.hpp" namespace parthenon { -#ifdef KOKKOS_ENABLE_CUDA_UVM -using DevMemSpace = Kokkos::CudaUVMSpace; -using HostMemSpace = Kokkos::CudaUVMSpace; -using DevExecSpace = Kokkos::Cuda; -#else -using DevMemSpace = Kokkos::DefaultExecutionSpace::memory_space; -using HostMemSpace = Kokkos::HostSpace; -using DevExecSpace = Kokkos::DefaultExecutionSpace; -#endif -using ScratchMemSpace = DevExecSpace::scratch_memory_space; - -using HostExecSpace = Kokkos::DefaultHostExecutionSpace; -using LayoutWrapper = Kokkos::LayoutRight; -using MemUnmanaged = Kokkos::MemoryTraits; - -#if defined(PARTHENON_ENABLE_HOST_COMM_BUFFERS) -#if defined(KOKKOS_ENABLE_CUDA) -using BufMemSpace = Kokkos::CudaHostPinnedSpace::memory_space; -#elif defined(KOKKOS_ENABLE_HIP) -using BufMemSpace = Kokkos::Experimental::HipHostPinnedSpace::memory_space; -#else -#error "Unknow comm buffer space for chose execution space." 
-#endif -#else -using BufMemSpace = Kokkos::DefaultExecutionSpace::memory_space; -#endif - -// MPI communication buffers -template -using BufArray1D = Kokkos::View; - -// Structures for reusable memory pools and communication -template -using buf_pool_t = ObjectPool>; - -template -using ParArray0D = ParArrayGeneric, State>; -template -using ParArray1D = ParArrayGeneric, State>; -template -using ParArray2D = ParArrayGeneric, State>; -template -using ParArray3D = - ParArrayGeneric, State>; -template -using ParArray4D = - ParArrayGeneric, State>; -template -using ParArray5D = - ParArrayGeneric, State>; -template -using ParArray6D = - ParArrayGeneric, State>; -template -using ParArray7D = - ParArrayGeneric, State>; -template -using ParArray8D = - ParArrayGeneric, State>; - -// Host mirrors -template -using HostArray0D = typename ParArray0D::HostMirror; -template -using HostArray1D = typename ParArray1D::HostMirror; -template -using HostArray2D = typename ParArray2D::HostMirror; -template -using HostArray3D = typename ParArray3D::HostMirror; -template -using HostArray4D = typename ParArray4D::HostMirror; -template -using HostArray5D = typename ParArray5D::HostMirror; -template -using HostArray6D = typename ParArray6D::HostMirror; -template -using HostArray7D = typename ParArray7D::HostMirror; - -using team_policy = Kokkos::TeamPolicy<>; -using team_mbr_t = Kokkos::TeamPolicy<>::member_type; - -template -using ScratchPad1D = Kokkos::View; -template -using ScratchPad2D = Kokkos::View; -template -using ScratchPad3D = Kokkos::View; -template -using ScratchPad4D = Kokkos::View; -template -using ScratchPad5D = Kokkos::View; -template -using ScratchPad6D = Kokkos::View; - -// Used for ParArrayND -// TODO(JMM): Should all of parthenon_arrays.hpp -// be moved here? Or should all of the above stuff be moved to -// parthenon_arrays.hpp? 
-inline constexpr std::size_t MAX_VARIABLE_DIMENSION = 7; -template -using device_view_t = - Kokkos::View, Layout, DevMemSpace>; -template -using host_view_t = typename device_view_t::HostMirror; - // Defining tags to determine loop_patterns using a tag dispatch design pattern // Translates a non-Kokkos standard C++ nested `for` loop where the innermost diff --git a/src/kokkos_types.hpp b/src/kokkos_types.hpp new file mode 100644 index 000000000000..2fbd285a0861 --- /dev/null +++ b/src/kokkos_types.hpp @@ -0,0 +1,136 @@ +//======================================================================================== +// Parthenon performance portable AMR framework +// Copyright(C) 2020-2023 The Parthenon collaboration +// Licensed under the 3-clause BSD License, see LICENSE file for details +//======================================================================================== +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 +// for Los Alamos National Laboratory (LANL), which is operated by Triad +// National Security, LLC for the U.S. Department of Energy/National Nuclear +// Security Administration. All rights in the program are reserved by Triad +// National Security, LLC, and the U.S. Department of Energy/National Nuclear +// Security Administration. The Government is granted for itself and others +// acting on its behalf a nonexclusive, paid-up, irrevocable worldwide license +// in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do +// so. 
+//======================================================================================== + +#ifndef KOKKOS_TYPES_HPP_ +#define KOKKOS_TYPES_HPP_ + +#include + +#include "parthenon_array_generic.hpp" +#include "utils/multi_pointer.hpp" +#include "utils/object_pool.hpp" + +namespace parthenon { +#ifdef KOKKOS_ENABLE_CUDA_UVM +using DevMemSpace = Kokkos::CudaUVMSpace; +using HostMemSpace = Kokkos::CudaUVMSpace; +using DevExecSpace = Kokkos::Cuda; +#else +using DevMemSpace = Kokkos::DefaultExecutionSpace::memory_space; +using HostMemSpace = Kokkos::HostSpace; +using DevExecSpace = Kokkos::DefaultExecutionSpace; +#endif +using ScratchMemSpace = DevExecSpace::scratch_memory_space; + +using HostExecSpace = Kokkos::DefaultHostExecutionSpace; +using LayoutWrapper = Kokkos::LayoutRight; +using MemUnmanaged = Kokkos::MemoryTraits; + +#if defined(PARTHENON_ENABLE_HOST_COMM_BUFFERS) +#if defined(KOKKOS_ENABLE_CUDA) +using BufMemSpace = Kokkos::CudaHostPinnedSpace::memory_space; +#elif defined(KOKKOS_ENABLE_HIP) +using BufMemSpace = Kokkos::Experimental::HipHostPinnedSpace::memory_space; +#else +#error "Unknow comm buffer space for chose execution space." 
+#endif +#else +using BufMemSpace = Kokkos::DefaultExecutionSpace::memory_space; +#endif + +// MPI communication buffers +template +using BufArray1D = Kokkos::View; + +// Structures for reusable memory pools and communication +template +using buf_pool_t = ObjectPool>; + +template +using ParArray0D = ParArrayGeneric, State>; +template +using ParArray1D = ParArrayGeneric, State>; +template +using ParArray2D = ParArrayGeneric, State>; +template +using ParArray3D = + ParArrayGeneric, State>; +template +using ParArray4D = + ParArrayGeneric, State>; +template +using ParArray5D = + ParArrayGeneric, State>; +template +using ParArray6D = + ParArrayGeneric, State>; +template +using ParArray7D = + ParArrayGeneric, State>; +template +using ParArray8D = + ParArrayGeneric, State>; + +// Host mirrors +template +using HostArray0D = typename ParArray0D::HostMirror; +template +using HostArray1D = typename ParArray1D::HostMirror; +template +using HostArray2D = typename ParArray2D::HostMirror; +template +using HostArray3D = typename ParArray3D::HostMirror; +template +using HostArray4D = typename ParArray4D::HostMirror; +template +using HostArray5D = typename ParArray5D::HostMirror; +template +using HostArray6D = typename ParArray6D::HostMirror; +template +using HostArray7D = typename ParArray7D::HostMirror; + +using team_policy = Kokkos::TeamPolicy<>; +using team_mbr_t = Kokkos::TeamPolicy<>::member_type; + +template +using ScratchPad1D = Kokkos::View; +template +using ScratchPad2D = Kokkos::View; +template +using ScratchPad3D = Kokkos::View; +template +using ScratchPad4D = Kokkos::View; +template +using ScratchPad5D = Kokkos::View; +template +using ScratchPad6D = Kokkos::View; + +// Used for ParArrayND +// TODO(JMM): Should all of parthenon_arrays.hpp +// be moved here? Or should all of the above stuff be moved to +// parthenon_arrays.hpp? 
+inline constexpr std::size_t MAX_VARIABLE_DIMENSION = 7; +template +using device_view_t = + Kokkos::View, Layout, DevMemSpace>; +template +using host_view_t = typename device_view_t::HostMirror; +} // namespace parthenon + +#endif // KOKKOS_TYPES_HPP_ From 9aa1560ab4631ad52c824768fc5a21714b491ae9 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 5 Sep 2024 23:23:52 +0200 Subject: [PATCH 70/99] collapse pattern infers Nteam from Rank --- src/kokkos_abstraction.hpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 416552cc7f10..7b1dc450cc1b 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -77,7 +77,7 @@ static struct LoopPatternUndefined { } loop_pattern_undefined_tag; // Translates to a Kokkos::TeamPolicy that collapse Nteams outer loops // with Nthread & Nvector inner loop collapses -template +template struct LoopPatternCollapse {}; // trait to track if pattern requests any type of hierarchial parallelism @@ -90,14 +90,13 @@ struct LoopPatternTeam : std::false_type { // This pattern needs to determine the team and thread/vector count at compile time // By contrast the others specify the thread/vector count at compile time and the // outer team policy collapses all remaining loops -template -struct LoopPatternTeam, - std::integral_constant, void> +template +struct LoopPatternTeam, + std::integral_constant, void> : std::true_type { static constexpr std::size_t Nvector = vector; static constexpr std::size_t Nthread = thread; - static constexpr std::size_t Nteam = team; - using LoopPattern = LoopPatternCollapse; + using LoopPattern = LoopPatternCollapse; }; // Patterns with an outer team pattern that collapses all @@ -118,8 +117,7 @@ struct LoopPatternTeam< static constexpr std::size_t Nvector = IsTPTVR || IsTPTTRTVR; static constexpr std::size_t Nthread = IsTPTTR || IsTPTTRTVR; - static constexpr std::size_t Nteam = Rank::value - Nthread - 
Nvector; - using LoopPattern = LoopPatternCollapse; + using LoopPattern = LoopPatternCollapse; using OuterPattern = Pattern; }; @@ -134,7 +132,7 @@ struct LoopPatternTeam; + using LoopPattern = LoopPatternCollapse<0, 0>; using OuterPattern = OuterLoopPatternTeams; }; @@ -164,7 +162,7 @@ struct LoopPatternTeam< static constexpr std::size_t Nvector = IsTVR ? Rank : 0; static constexpr std::size_t Nthread = IsTTR ? Rank : 0; - using LoopPattern = LoopPatternCollapse<0, Nthread, Nvector>; + using LoopPattern = LoopPatternCollapse; }; namespace dispatch_impl { From f2cfd9150e1528482623805ddaaf3e0c452e8e8c Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 6 Sep 2024 00:00:48 +0200 Subject: [PATCH 71/99] clean up team/collapse patterns --- src/kokkos_abstraction.hpp | 134 ++++++++++++------------------------- 1 file changed, 41 insertions(+), 93 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 7b1dc450cc1b..192735b3be4d 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -53,116 +53,65 @@ static struct LoopPatternFlatRange { // a 1:1 indices matching static struct LoopPatternMDRange { } loop_pattern_mdrange_tag; +// Translates to a Kokkos::TeamPolicy that collapse Nthread & Nvector inner loop collapses +template +struct LoopPatternCollapse : std::true_type { + static constexpr std::size_t Nthread = num_thread; + static constexpr std::size_t Nvector = num_vector; +}; // Translates to a Kokkos::TeamPolicy with a single inner // Kokkos::TeamThreadRange -static struct LoopPatternTPTTR { -} loop_pattern_tpttr_tag; +using LoopPatternTPTTR = LoopPatternCollapse<1, 0>; +constexpr auto loop_pattern_tpttr_tag = LoopPatternTPTTR(); // Translates to a Kokkos::TeamPolicy with a single inner // Kokkos::ThreadVectorRange -static struct LoopPatternTPTVR { -} loop_pattern_tptvr_tag; +using LoopPatternTPTVR = LoopPatternCollapse<0, 1>; +constexpr auto loop_pattern_tptvr_tag = LoopPatternTPTVR(); // Translates to a 
Kokkos::TeamPolicy with a middle Kokkos::TeamThreadRange and // inner Kokkos::ThreadVectorRange -static struct LoopPatternTPTTRTVR { -} loop_pattern_tpttrtvr_tag; +using LoopPatternTPTTRTVR = LoopPatternCollapse<1, 1>; +constexpr auto loop_pattern_tpttrtvr_tag = LoopPatternTPTTRTVR(); // Translates to an outer team policy -static struct LoopPatternTeamOuter { -} loop_pattern_team_outer_tag; -// Translates to an outter team policy with an inner collapse over a combination of -// thread/vecctor -static struct LoopPatternTeamCollapse { -} loop_pattern_team_collapse; +using LoopPatternTeamOuter = LoopPatternCollapse<0, 0>; +constexpr auto loop_pattern_team_outer_tag = LoopPatternTeamOuter(); // Used to catch undefined behavior as it results in throwing an error static struct LoopPatternUndefined { } loop_pattern_undefined_tag; -// Translates to a Kokkos::TeamPolicy that collapse Nteams outer loops -// with Nthread & Nvector inner loop collapses -template -struct LoopPatternCollapse {}; - -// trait to track if pattern requests any type of hierarchial parallelism -template -struct LoopPatternTeam : std::false_type { - static constexpr std::size_t Nvector = 0; - static constexpr std::size_t Nthread = 0; -}; - -// This pattern needs to determine the team and thread/vector count at compile time -// By contrast the others specify the thread/vector count at compile time and the -// outer team policy collapses all remaining loops -template -struct LoopPatternTeam, - std::integral_constant, void> - : std::true_type { - static constexpr std::size_t Nvector = vector; - static constexpr std::size_t Nthread = thread; - using LoopPattern = LoopPatternCollapse; -}; - -// Patterns with an outer team pattern that collapses all -// remaining loops -template -struct LoopPatternTeam< - Pattern, Rank, - typename std::enable_if::value || - std::is_same::value || - std::is_same::value>::type> - : std::true_type { - - static constexpr bool IsTPTTR = - std::is_same::value; // inner 
TeamThreadRange - static constexpr bool IsTPTVR = - std::is_same::value; // inner ThreadVectorRange - static constexpr bool IsTPTTRTVR = std::is_same::value; - - static constexpr std::size_t Nvector = IsTPTVR || IsTPTTRTVR; - static constexpr std::size_t Nthread = IsTPTTR || IsTPTTRTVR; - using LoopPattern = LoopPatternCollapse; - using OuterPattern = Pattern; -}; // Tags for Nested parallelism // Translates to outermost loop being a Kokkos::TeamPolicy for par_for_outer like loops static struct OuterLoopPatternTeams { } outer_loop_pattern_teams_tag; -template -struct LoopPatternTeam, void> - : std::true_type { - static constexpr std::size_t Nvector = 0; - static constexpr std::size_t Nthread = 0; - static constexpr std::size_t Nteam = Rank; - using LoopPattern = LoopPatternCollapse<0, 0>; - using OuterPattern = OuterLoopPatternTeams; -}; - // Inner loop pattern tags must be constexpr so they're available on device // Translate to a Kokkos::TeamVectorRange as innermost loop (single index) -struct InnerLoopPatternTVR {}; -constexpr InnerLoopPatternTVR inner_loop_pattern_tvr_tag; +using InnerLoopPatternTVR = LoopPatternCollapse<0, 0>; +constexpr auto inner_loop_pattern_tvr_tag = InnerLoopPatternTVR(); // Translates to a Kokkos::TeamThreadRange as innermost loop -struct InnerLoopPatternTTR {}; -constexpr InnerLoopPatternTTR inner_loop_pattern_ttr_tag; +using InnerLoopPatternTTR = LoopPatternCollapse<0, 0>; +constexpr auto inner_loop_pattern_ttr_tag = InnerLoopPatternTTR(); // Translate to a non-Kokkos plain C++ innermost loop (single index) // decorated with #pragma omp simd // IMPORTANT: currently only supported on CPUs -struct InnerLoopPatternSimdFor {}; -constexpr InnerLoopPatternSimdFor inner_loop_pattern_simdfor_tag; - -// Patterns for par_for_inner -template -struct LoopPatternTeam< - Pattern, std::integral_constant, - typename std::enable_if::value || - std::is_same::value>::type> - : std::true_type { - - static constexpr bool IsTTR = std::is_same::value; - 
static constexpr bool IsTVR = std::is_same::value; - - static constexpr std::size_t Nvector = IsTVR ? Rank : 0; - static constexpr std::size_t Nthread = IsTTR ? Rank : 0; - using LoopPattern = LoopPatternCollapse; +using InnerLoopPatternSimdFor = LoopPatternCollapse<0, 0>; +constexpr auto inner_loop_pattern_simdfor_tag = InnerLoopPatternSimdFor(); + +// trait to track if pattern requests any type of hierarchial parallelism +template +struct UsesHierarchialPar : std::false_type { + static constexpr std::size_t Nvector = 0; + static constexpr std::size_t Nthread = 0; +}; + +template +struct UsesHierarchialPar> : std::true_type { + static constexpr std::size_t Nthread = num_thread; + static constexpr std::size_t Nvector = num_vector; +}; +template <> +struct UsesHierarchialPar : std::true_type { + static constexpr std::size_t Nvector = 0; + static constexpr std::size_t Nthread = 0; }; namespace dispatch_impl { @@ -216,10 +165,8 @@ struct DispatchType { using Translator = LoopBoundTranslator; static constexpr std::size_t Rank = Translator::Rank; - using TeamPattern = - LoopPatternTeam>; // false_type unless we use - // an outer team policy + using TeamPattern = UsesHierarchialPar; // false_type unless we use + // an outer team policy static constexpr bool is_ParFor = std::is_same::value; static constexpr bool is_ParScan = @@ -357,7 +304,8 @@ struct par_disp_inner_impl, TypeList> { using bound_translator = LoopBoundTranslator; static constexpr std::size_t Rank = bound_translator::Rank; - using TeamPattern = LoopPatternTeam>; + using TeamPattern = + UsesHierarchialPar>; template using sequence = std::make_index_sequence; From f255f71507d74e79c03f0ac1e291c540e3820097 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 6 Sep 2024 02:49:09 +0200 Subject: [PATCH 72/99] making simdfor pattern uniform with others --- src/kokkos_abstraction.hpp | 201 ++++++++++++++++++++++---------- tst/unit/kokkos_abstraction.cpp | 24 ++-- 2 files changed, 151 insertions(+), 74 deletions(-) 
diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 192735b3be4d..eeae56a36c65 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -59,6 +59,7 @@ struct LoopPatternCollapse : std::true_type { static constexpr std::size_t Nthread = num_thread; static constexpr std::size_t Nvector = num_vector; }; + // Translates to a Kokkos::TeamPolicy with a single inner // Kokkos::TeamThreadRange using LoopPatternTPTTR = LoopPatternCollapse<1, 0>; @@ -83,37 +84,48 @@ static struct LoopPatternUndefined { // Translates to outermost loop being a Kokkos::TeamPolicy for par_for_outer like loops static struct OuterLoopPatternTeams { } outer_loop_pattern_teams_tag; + +template +struct InnerLoopCollapse {}; + // Inner loop pattern tags must be constexpr so they're available on device // Translate to a Kokkos::TeamVectorRange as innermost loop (single index) -using InnerLoopPatternTVR = LoopPatternCollapse<0, 0>; -constexpr auto inner_loop_pattern_tvr_tag = InnerLoopPatternTVR(); +using InnerLoopPatternTVR = InnerLoopCollapse<1>; +constexpr InnerLoopPatternTVR inner_loop_pattern_tvr_tag; // Translates to a Kokkos::TeamThreadRange as innermost loop -using InnerLoopPatternTTR = LoopPatternCollapse<0, 0>; -constexpr auto inner_loop_pattern_ttr_tag = InnerLoopPatternTTR(); +using InnerLoopPatternTTR = InnerLoopCollapse<0>; +constexpr InnerLoopPatternTTR inner_loop_pattern_ttr_tag; // Translate to a non-Kokkos plain C++ innermost loop (single index) // decorated with #pragma omp simd // IMPORTANT: currently only supported on CPUs -using InnerLoopPatternSimdFor = LoopPatternCollapse<0, 0>; -constexpr auto inner_loop_pattern_simdfor_tag = InnerLoopPatternSimdFor(); +struct InnerLoopPatternSimdFor {}; +constexpr InnerLoopPatternSimdFor inner_loop_pattern_simdfor_tag; // trait to track if pattern requests any type of hierarchial parallelism -template +template struct UsesHierarchialPar : std::false_type { static constexpr std::size_t Nvector = 0; 
static constexpr std::size_t Nthread = 0; }; -template -struct UsesHierarchialPar> : std::true_type { +template +struct UsesHierarchialPar, Rank> + : std::true_type { static constexpr std::size_t Nthread = num_thread; static constexpr std::size_t Nvector = num_vector; }; -template <> -struct UsesHierarchialPar : std::true_type { + +template +struct UsesHierarchialPar : std::true_type { static constexpr std::size_t Nvector = 0; static constexpr std::size_t Nthread = 0; }; +template +struct UsesHierarchialPar, Rank> : std::true_type { + static constexpr std::size_t Nvector = num_vector; +}; + namespace dispatch_impl { static struct ParallelForDispatch { } parallel_for_dispatch_tag; @@ -165,8 +177,9 @@ struct DispatchType { using Translator = LoopBoundTranslator; static constexpr std::size_t Rank = Translator::Rank; - using TeamPattern = UsesHierarchialPar; // false_type unless we use - // an outer team policy + using TeamPattern = UsesHierarchialPar< + Pattern, std::integral_constant>; // false_type unless we use + // an outer team policy static constexpr bool is_ParFor = std::is_same::value; static constexpr bool is_ParScan = @@ -175,6 +188,8 @@ struct DispatchType { static constexpr bool IsFlatRange = std::is_same::value; static constexpr bool IsMDRange = std::is_same::value; static constexpr bool IsSimdFor = std::is_same::value; + static constexpr bool IsSimdForInner = + std::is_same::value; // check any confilcts with the requested pattern // and return the actual one we use @@ -187,6 +202,10 @@ struct DispatchType { return LP::flat; } else if constexpr (IsSimdFor) { return is_ParFor ? 
LP::simd : LP::flat; + } else if constexpr (IsSimdForInner) { + // for now this is guaranteed to be par_for_inner, when par_reduce_inner is + // supported need to check + return LP::simd; } else if constexpr (IsMDRange) { return LP::md; } else if constexpr (std::is_same_v) { @@ -205,6 +224,7 @@ struct dispatch_collapse { IdxTeam idxer_team; Kokkos::Array bound_arr; Function function; + using LP = LoopPattern; KOKKOS_FORCEINLINE_FUNCTION dispatch_collapse(IdxTeam idxer, Kokkos::Array bounds, Function func) @@ -213,7 +233,7 @@ struct dispatch_collapse { template KOKKOS_FORCEINLINE_FUNCTION void - execute(std::integer_sequence, + execute(LoopPatternTag, std::integer_sequence, std::integer_sequence, std::integer_sequence, team_mbr_t team_member, Args &&...args) const { @@ -258,11 +278,40 @@ struct dispatch_collapse { } } + template + KOKKOS_FORCEINLINE_FUNCTION void + execute(LoopPatternTag, std::integer_sequence, + std::integer_sequence, + std::integer_sequence, team_mbr_t team_member, + Args &&...args) const { + static_assert(sizeof...(ThreadIs) == Rank - 1); + static_assert(sizeof...(OuterIs) == 0 && sizeof...(InnerIs) == 0, + "simd inner pattern should only provide outer indices"); + if constexpr (Rank == 1) { +#pragma omp simd + for (int i = bound_arr[0].s; i <= bound_arr[0].e; i++) { + function(i); + } + } else { + const auto idxer = MakeIndexer( + std::pair(bound_arr[ThreadIs].s, bound_arr[ThreadIs].e)...); + for (int idx = 0; idx < idxer.size(); idx++) { + const auto indices = idxer.GetIdxArray(idx); +#pragma omp simd + for (int i = bound_arr[Rank - 1].s; i <= bound_arr[Rank - 1].e; i++) { + function(indices[ThreadIs]..., i); + } + } + } + } + template using sequence = std::make_index_sequence; KOKKOS_FORCEINLINE_FUNCTION void operator()(team_mbr_t team_member) const { - execute(sequence(), sequence(), sequence(), team_member); + execute(LoopPatternTag(), sequence(), sequence(), + sequence(), team_member); } }; @@ -274,27 +323,6 @@ MakeCollapse(IdxTeam idxer, 
Kokkos::Array bounds, Function fun ExtraFuncArgs...>(idxer, bounds, func); } -template -KOKKOS_INLINE_FUNCTION void SimdFor(std::index_sequence, Function function, - Kokkos::Array bounds) { - if constexpr (Rank == 1) { -#pragma omp simd - for (int i = bounds[0].s; i <= bounds[0].e; i++) { - function(i); - } - } else { - const auto idxer = - MakeIndexer(std::pair(bounds[OuterIs].s, bounds[OuterIs].e)...); - for (int idx = 0; idx < idxer.size(); idx++) { - const auto indices = idxer.GetIdxArray(idx); -#pragma omp simd - for (int i = bounds[Rank - 1].s; i <= bounds[Rank - 1].e; i++) { - function(indices[OuterIs]..., i); - } - } - } -} - template struct par_disp_inner_impl {}; @@ -303,9 +331,10 @@ template , TypeList, TypeList> { using bound_translator = LoopBoundTranslator; + using dispatch_type = + DispatchType; static constexpr std::size_t Rank = bound_translator::Rank; - using TeamPattern = - UsesHierarchialPar>; + template using sequence = std::make_index_sequence; @@ -313,20 +342,22 @@ struct par_disp_inner_impl, TypeList(bounds)...); constexpr bool isSimdFor = std::is_same_v; - if constexpr (isSimdFor) { - static_assert(!isSimdFor || - (isSimdFor && std::is_same_v), - "par_inner simd for pattern only supported on HostExecSpace"); - SimdFor(std::make_index_sequence(), function, bound_arr); - } else { - auto idxer = Indexer<>(); - constexpr std::size_t Nthread = TeamPattern::Nthread; - constexpr std::size_t Nvector = TeamPattern::Nvector; - MakeCollapse(idxer, bound_arr, - function) - .execute(sequence<0>(), sequence(), sequence(), team_member, - std::forward(args)...); - } + constexpr std::size_t Nvector = dispatch_type::TeamPattern::Nvector; + constexpr std::size_t Nthread = Rank - Nvector; + constexpr auto pattern_tag = LoopPatternTag(); + + static_assert(!isSimdFor || + (isSimdFor && std::is_same_v), + "par_inner simd for pattern only supported on HostExecSpace"); + static_assert( + !std::is_same_v> && + !always_false, + "Inner Loop pattern not recognized in 
DispatchType::GetPatternTag"); + + auto idxer = Indexer<>(); + MakeCollapse(idxer, bound_arr, function) + .execute(pattern_tag, sequence<0>(), sequence(), + sequence(), team_member, std::forward(args)...); } }; @@ -370,21 +401,43 @@ struct par_dispatch_impl, TypeList, base_type>; auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); - if constexpr (isSimdFor) { - static_assert(!isSimdFor || (isSimdFor && std::is_same_v), - "SimdFor pattern only supported in HostExecSpace"); - SimdFor(std::make_index_sequence(), function, bound_arr); - } else { - dispatch_impl(pattern_tag, std::make_index_sequence(), - std::make_index_sequence(), name, exec_space, bound_arr, - function, std::forward(args)..., scratch_level, - scratch_size_in_bytes); - } + static_assert(!isSimdFor || (isSimdFor && std::is_same_v), + "SimdFor pattern only supported in HostExecSpace"); + dispatch_impl(pattern_tag, std::make_index_sequence(), + std::make_index_sequence(), name, exec_space, bound_arr, + function, std::forward(args)..., scratch_level, + scratch_size_in_bytes); } template using sequence = std::integer_sequence; + template + inline void dispatch_impl(LoopPatternTag, sequence, + sequence, std::string name, ExecSpace exec_space, + Kokkos::Array bound_arr, Function function, + Args &&...args, const int scratch_level, + const std::size_t scratch_size_in_bytes) { + static_assert(sizeof...(InnerIs) == 0); + static_assert(sizeof...(OuterIs) == Rank - 1); + if constexpr (Rank == 1) { +#pragma omp simd + for (int i = bound_arr[0].s; i <= bound_arr[0].e; i++) { + function(i); + } + } else { + const auto idxer = + MakeIndexer(std::pair(bound_arr[OuterIs].s, bound_arr[OuterIs].e)...); + for (int idx = 0; idx < idxer.size(); idx++) { + const auto indices = idxer.GetIdxArray(idx); +#pragma omp simd + for (int i = bound_arr[Rank - 1].s; i <= bound_arr[Rank - 1].e; i++) { + function(indices[OuterIs]..., i); + } + } + } + } + template inline void dispatch_impl(LoopPatternTag, 
sequence, sequence, std::string name, ExecSpace exec_space, @@ -482,6 +535,28 @@ inline void par_dispatch(const std::string &name, Args &&...args) { std::forward(args)...); } +template +KOKKOS_INLINE_FUNCTION void SequentialFor(std::index_sequence, + Function function, + Kokkos::Array bounds) { + if constexpr (Rank == 1) { +#pragma omp simd + for (int i = bounds[0].s; i <= bounds[0].e; i++) { + function(i); + } + } else { + const auto idxer = + MakeIndexer(std::pair(bounds[OuterIs].s, bounds[OuterIs].e)...); + for (int idx = 0; idx < idxer.size(); idx++) { + const auto indices = idxer.GetIdxArray(idx); +#pragma omp simd + for (int i = bounds[Rank - 1].s; i <= bounds[Rank - 1].e; i++) { + function(indices[OuterIs]..., i); + } + } + } +} + template struct seq_for_impl {}; @@ -492,7 +567,7 @@ struct seq_for_impl> { constexpr std::size_t Rank = bound_translator::Rank; const auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); - SimdFor(std::make_index_sequence(), function, bound_arr); + SequentialFor(std::make_index_sequence(), function, bound_arr); } }; diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 2a84057a85b8..ce40d963bf82 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -350,27 +350,29 @@ bool test_wrapper_nested_3d(OuterLoopPattern outer_loop_pattern, // Compute the scratch memory needs const int scratch_level = 0; - size_t scratch_size_in_bytes = parthenon::ScratchPad1D::shmem_size(N); + size_t scratch_size_in_bytes = parthenon::ScratchPad1D::shmem_size(N * N); // Compute the 2nd order centered derivative in x parthenon::par_for_outer( outer_loop_pattern, "unit test Nested 3D", exec_space, scratch_size_in_bytes, - scratch_level, 0, N - 1, 0, N - 1, + scratch_level, 0, N - 1, - KOKKOS_LAMBDA(parthenon::team_mbr_t team_member, const int k, const int j) { + KOKKOS_LAMBDA(parthenon::team_mbr_t team_member, const int k) { // Load a pencil in x to minimize DRAM accesses 
(and test scratch pad) - parthenon::ScratchPad1D scratch_u(team_member.team_scratch(scratch_level), - N); - parthenon::par_for_inner(inner_loop_pattern, team_member, 0, N - 1, - [&](const int i) { scratch_u(i) = dev_u(k, j, i); }); + parthenon::ScratchPad2D scratch_u(team_member.team_scratch(scratch_level), + N, N); + parthenon::par_for_inner( + inner_loop_pattern, team_member, 0, N - 1, 0, N - 1, + [&](const int j, const int i) { scratch_u(j, i) = dev_u(k, j, i); }); // Sync all threads in the team so that scratch memory is consistent team_member.team_barrier(); // Compute the derivative from scratch memory - parthenon::par_for_inner( - inner_loop_pattern, team_member, 1, N - 2, [&](const int i) { - dev_du(k, j, i - 1) = (scratch_u(i + 1) - scratch_u(i - 1)) / 2.; - }); + parthenon::par_for_inner(inner_loop_pattern, team_member, 0, N - 1, 1, N - 2, + [&](const int j, const int i) { + dev_du(k, j, i - 1) = + (scratch_u(j, i + 1) - scratch_u(j, i - 1)) / 2.; + }); }); // Copy array back from device to host From c6f9de14cb13a0c572d4b242bd2bfc3cbc0de725 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 6 Sep 2024 10:56:15 +0200 Subject: [PATCH 73/99] adding comments and cleaning up LoopPattern* type name use --- src/kokkos_abstraction.hpp | 106 +++++++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 40 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index eeae56a36c65..a069acac895e 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -167,9 +167,21 @@ struct DispatchSignature> { using Args = typename TL::template continuous_sublist; }; -enum class LoopPattern { flat, md, simd, outer, collapse, undef }; - -template +// tags to resolve requested [Outer|Inner]LoopPattern* + dispatch_impl::Parallel*Dispatch +// combinations into the final par_dispatch_impl::dispatch_impl call using +// DispatchType::GetPatternTag() to prevent any incompatible combinations (e.g., simdfor + +// par_reduce). 
+// * flat -- results in a single Kokkos::RangePolicy flattening all loop bounds. +// * md -- results in a Kokkos::MDRangePolicy +// * simd -- innermost loop gets a #pragma omp simd, outer loops flattened to a single +// raw for +// * outer -- only explicit parallelism is an outer team_policy +// * collapse -- Any specialization of [Inner]LoopPatternCollapse patterns. Explicitly +// uses hierarchial parrallelism +// * undef -- combination not handled, will raise a compilation error. +enum class PatternTag { flat, md, simd, outer, collapse, undef }; + +template struct LoopPatternTag {}; template @@ -177,13 +189,13 @@ struct DispatchType { using Translator = LoopBoundTranslator; static constexpr std::size_t Rank = Translator::Rank; - using TeamPattern = UsesHierarchialPar< - Pattern, std::integral_constant>; // false_type unless we use - // an outer team policy + using HierarchialPar = + UsesHierarchialPar>; + static constexpr bool is_ParFor = std::is_same::value; static constexpr bool is_ParScan = - std::is_same::value; + std::is_same::value; static constexpr bool IsFlatRange = std::is_same::value; static constexpr bool IsMDRange = std::is_same::value; @@ -193,28 +205,28 @@ struct DispatchType { // check any confilcts with the requested pattern // and return the actual one we use - static constexpr LoopPattern GetPatternTag() { - using LP = LoopPattern; + static constexpr PatternTag GetPatternTag() { + using PT = PatternTag; if constexpr (is_ParScan) { - return LP::flat; + return PT::flat; } else if constexpr (IsFlatRange) { - return LP::flat; + return PT::flat; } else if constexpr (IsSimdFor) { - return is_ParFor ? LP::simd : LP::flat; + return is_ParFor ? 
PT::simd : PT::flat; } else if constexpr (IsSimdForInner) { // for now this is guaranteed to be par_for_inner, when par_reduce_inner is // supported need to check - return LP::simd; + return PT::simd; } else if constexpr (IsMDRange) { - return LP::md; + return PT::md; } else if constexpr (std::is_same_v) { - return LP::outer; - } else if constexpr (TeamPattern::value) { - return LP::collapse; + return PT::outer; + } else if constexpr (HierarchialPar::value) { + return PT::collapse; } - return LP::undef; + return PT::undef; } }; @@ -224,16 +236,18 @@ struct dispatch_collapse { IdxTeam idxer_team; Kokkos::Array bound_arr; Function function; - using LP = LoopPattern; + using PT = PatternTag; KOKKOS_FORCEINLINE_FUNCTION dispatch_collapse(IdxTeam idxer, Kokkos::Array bounds, Function func) : idxer_team(idxer), bound_arr(bounds), function(func) {} + // collapse inner parallel regions using a combination of Team/Thread/Vector range + // policies template KOKKOS_FORCEINLINE_FUNCTION void - execute(LoopPatternTag, std::integer_sequence, + execute(LoopPatternTag, std::integer_sequence, std::integer_sequence, std::integer_sequence, team_mbr_t team_member, Args &&...args) const { @@ -278,16 +292,17 @@ struct dispatch_collapse { } } + // simdfor loop collapse inside an outer team policy loop. 
Only valid on HostExecSpace template KOKKOS_FORCEINLINE_FUNCTION void - execute(LoopPatternTag, std::integer_sequence, + execute(LoopPatternTag, std::integer_sequence, std::integer_sequence, std::integer_sequence, team_mbr_t team_member, Args &&...args) const { static_assert(sizeof...(ThreadIs) == Rank - 1); static_assert(sizeof...(OuterIs) == 0 && sizeof...(InnerIs) == 0, - "simd inner pattern should only provide outer indices"); + "simd inner pattern should only provide thread indices"); if constexpr (Rank == 1) { #pragma omp simd for (int i = bound_arr[0].s; i <= bound_arr[0].e; i++) { @@ -310,11 +325,13 @@ struct dispatch_collapse { using sequence = std::make_index_sequence; KOKKOS_FORCEINLINE_FUNCTION void operator()(team_mbr_t team_member) const { - execute(LoopPatternTag(), sequence(), sequence(), + execute(LoopPatternTag(), sequence(), sequence(), sequence(), team_member); } }; +// builds a functor that uses inner hierarchial parrallelism used by both par_disp_inner & +// par_dipsatch for LoopPatternCollapse template KOKKOS_FORCEINLINE_FUNCTION auto @@ -342,7 +359,7 @@ struct par_disp_inner_impl, TypeList(bounds)...); constexpr bool isSimdFor = std::is_same_v; - constexpr std::size_t Nvector = dispatch_type::TeamPattern::Nvector; + constexpr std::size_t Nvector = dispatch_type::HierarchialPar::Nvector; constexpr std::size_t Nthread = Rank - Nvector; constexpr auto pattern_tag = LoopPatternTag(); @@ -350,8 +367,8 @@ struct par_disp_inner_impl, TypeList), "par_inner simd for pattern only supported on HostExecSpace"); static_assert( - !std::is_same_v> && - !always_false, + !std::is_same_v> || + always_false, "Inner Loop pattern not recognized in DispatchType::GetPatternTag"); auto idxer = Indexer<>(); @@ -381,7 +398,7 @@ template struct par_dispatch_impl, TypeList, TypeList> { - using LP = LoopPattern; + using PT = PatternTag; using dispatch_type = DispatchType; using bound_translator = LoopBoundTranslator; static constexpr std::size_t Rank = 
bound_translator::Rank; @@ -391,18 +408,20 @@ struct par_dispatch_impl, TypeList(); static_assert( - !std::is_same_v> && + !std::is_same_v> && !always_false, - "Loop pattern & tag combination not recognized in DispatchType::GetPatternTag"); + "LoopPattern & Tag combination not recognized in DispatchType::GetPatternTag"); - constexpr bool isSimdFor = std::is_same_v, + constexpr bool isSimdFor = std::is_same_v, base_type>; - auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); static_assert(!isSimdFor || (isSimdFor && std::is_same_v), "SimdFor pattern only supported in HostExecSpace"); + + auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); dispatch_impl(pattern_tag, std::make_index_sequence(), std::make_index_sequence(), name, exec_space, bound_arr, function, std::forward(args)..., scratch_level, @@ -412,8 +431,10 @@ struct par_dispatch_impl, TypeList using sequence = std::integer_sequence; + // #pragma omp simd for innermost loop, flatten remaining outer loops into a single raw + // for template - inline void dispatch_impl(LoopPatternTag, sequence, + inline void dispatch_impl(LoopPatternTag, sequence, sequence, std::string name, ExecSpace exec_space, Kokkos::Array bound_arr, Function function, Args &&...args, const int scratch_level, @@ -438,8 +459,9 @@ struct par_dispatch_impl, TypeList - inline void dispatch_impl(LoopPatternTag, sequence, + inline void dispatch_impl(LoopPatternTag, sequence, sequence, std::string name, ExecSpace exec_space, Kokkos::Array bound_arr, Function function, Args &&...args, const int scratch_level, @@ -455,8 +477,9 @@ struct par_dispatch_impl, TypeList(args)...); } + // Kokkos::MDRangePolicy for all loop bounds template - inline void dispatch_impl(LoopPatternTag, sequence, + inline void dispatch_impl(LoopPatternTag, sequence, sequence, std::string name, ExecSpace exec_space, Kokkos::Array bound_arr, Function function, Args &&...args, const int scratch_level, @@ -469,8 +492,9 @@ struct 
par_dispatch_impl, TypeList(args)...); } + // Flatten loop bounds into a single outer team_policy template - inline void dispatch_impl(LoopPatternTag, sequence, + inline void dispatch_impl(LoopPatternTag, sequence, sequence, std::string name, ExecSpace exec_space, Kokkos::Array bound_arr, Function function, Args &&...args, const int scratch_level, @@ -489,17 +513,19 @@ struct par_dispatch_impl, TypeList(args)...); } + // Collapse inner Nvector + Nthread loops to thread/vector range policies and remaining + // outer loops to a team_policy. template - inline void dispatch_impl(LoopPatternTag, sequence, + inline void dispatch_impl(LoopPatternTag, sequence, sequence, std::string name, ExecSpace exec_space, Kokkos::Array bound_arr, Function function, Args &&...args, const int scratch_level, const std::size_t scratch_size_in_bytes) { const auto idxer = MakeIndexer(Kokkos::Array{bound_arr[OuterIs]...}); - using TeamPattern = typename dispatch_type::TeamPattern; - constexpr std::size_t Nvector = TeamPattern::Nvector; - constexpr std::size_t Nthread = TeamPattern::Nthread; + using HierarchialPar = typename dispatch_type::HierarchialPar; + constexpr std::size_t Nvector = HierarchialPar::Nvector; + constexpr std::size_t Nthread = HierarchialPar::Nthread; constexpr std::size_t Nouter = Rank - Nvector - Nthread; kokkos_dispatch( Tag(), name, From f1403b1ad4022aefad7d8179ddca9b3eb361f526 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 6 Sep 2024 11:08:01 +0200 Subject: [PATCH 74/99] SequentialFor added --- src/kokkos_abstraction.hpp | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index a069acac895e..13b4b20e71b5 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -565,21 +565,11 @@ template KOKKOS_INLINE_FUNCTION void SequentialFor(std::index_sequence, Function function, Kokkos::Array bounds) { - if constexpr (Rank == 1) { -#pragma omp simd - for (int i 
= bounds[0].s; i <= bounds[0].e; i++) { - function(i); - } - } else { - const auto idxer = - MakeIndexer(std::pair(bounds[OuterIs].s, bounds[OuterIs].e)...); - for (int idx = 0; idx < idxer.size(); idx++) { - const auto indices = idxer.GetIdxArray(idx); -#pragma omp simd - for (int i = bounds[Rank - 1].s; i <= bounds[Rank - 1].e; i++) { - function(indices[OuterIs]..., i); - } - } + const auto idxer = + MakeIndexer(std::pair(bounds[OuterIs].s, bounds[OuterIs].e)...); + for (int idx = 0; idx < idxer.size(); idx++) { + const auto indices = idxer.GetIdxArray(idx); + function(indices[OuterIs]...); } } @@ -593,7 +583,7 @@ struct seq_for_impl> { constexpr std::size_t Rank = bound_translator::Rank; const auto bound_arr = bound_translator().GetIndexRanges(std::forward(bounds)...); - SequentialFor(std::make_index_sequence(), function, bound_arr); + SequentialFor(std::make_index_sequence(), function, bound_arr); } }; From b31b1dcd056a00a1f3f98140cb59af15bdfc7baf Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 6 Sep 2024 11:42:24 +0200 Subject: [PATCH 75/99] use Indexer in unit tests --- tst/unit/kokkos_abstraction.cpp | 131 +++++++++++++++----------------- 1 file changed, 61 insertions(+), 70 deletions(-) diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index ce40d963bf82..74806073a639 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -30,6 +30,7 @@ #include "basic_types.hpp" #include "kokkos_abstraction.hpp" #include "parthenon_array_generic.hpp" +#include "utils/indexer.hpp" #include "utils/type_list.hpp" using parthenon::DevExecSpace; @@ -129,43 +130,43 @@ struct HostArrayND_impl<7> { using type = parthenon::HostArray7D; }; -template +template auto ParArrayND(Args &&...args) { static_assert(ND <= 8, "ParArrayND supoorted up to ND=8"); return typename ParArrayND_impl::template type(std::forward(args)...); } -template +template auto HostArrayND(Args &&...args) { static_assert(ND <= 7, "HostArrayND 
supoorted up to ND=7"); return typename HostArrayND_impl::template type(std::forward(args)...); } -template +template struct SequenceOfInt {}; -template -struct SequenceOfInt<0, VAL, std::integer_sequence> { - using value = typename std::integer_sequence; +template +struct SequenceOfInt<0, VAL, std::integer_sequence> { + using value = typename std::integer_sequence; }; -template -struct SequenceOfInt> { +template +struct SequenceOfInt> { using value = typename SequenceOfInt>::value; + std::integer_sequence>::value; }; -template +template using sequence_of_int_v = - typename SequenceOfInt>::value; + typename SequenceOfInt>::value; enum class lbounds { integer, indexrange }; -template +template struct test_wrapper_nd_impl { - template + template using Sequence = std::make_index_sequence; - int indices[Rank - 1], int_bounds[2 * Rank]; + int int_bounds[2 * Rank]; parthenon::IndexRange bounds[Rank]; decltype(ParArrayND()) arr_dev; decltype(HostArrayND()) arr_host_orig, arr_host_mod; @@ -177,32 +178,29 @@ struct test_wrapper_nd_impl { std::random_device rd; // Will be used to obtain a seed for the random number engine std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() std::uniform_real_distribution dis(-1.0, 1.0); - par_for_init(std::make_index_sequence(), gen, dis); + par_for_init(std::make_index_sequence(), gen, dis); } - template + template auto GetArray(std::index_sequence) { static_assert(sizeof...(Is) == Rank); return ParArrayND("device", N * Is...); } - template + template void par_for_init(std::index_sequence, std::mt19937 &gen, std::uniform_real_distribution &dis) { - constexpr size_t id = Rank - LoopsLeft; - bounds[id].s = 0; - bounds[id].e = N - 1; - int_bounds[2 * id] = 0; - int_bounds[2 * id + 1] = N - 1; - if constexpr (LoopsLeft == 1) { - for (int i = 0; i < N; i++) { - arr_host_orig(indices[Is]..., i) = dis(gen); - } - } else { - for (int j = 0; j < N; j++) { - indices[Rank - LoopsLeft] = j; - par_for_init(Sequence(), gen, dis); 
- } + for (int id = 0; id < Rank; id++) { + bounds[id].s = 0; + bounds[id].e = N - 1; + int_bounds[2 * id] = 0; + int_bounds[2 * id + 1] = N - 1; + } + const auto idxer = + parthenon::MakeIndexer(std::pair(bounds[Is].s, bounds[Is].e)...); + for (int idx = 0; idx < idxer.size(); idx++) { + const auto indices = idxer.GetIdxArray(idx); + arr_host_orig(indices[Is]...) = dis(gen); } } @@ -217,20 +215,16 @@ struct test_wrapper_nd_impl { return static_cast(inc); } - template + template bool par_for_comp(std::index_sequence) { bool all_same = true; - if constexpr (LoopsLeft == 1) { - for (int i = 0; i < N; i++) { - if (arr_host_orig(indices[Is]..., i) + increment_data(indices[Is]..., i) != - arr_host_mod(indices[Is]..., i)) { - all_same = false; - } - } - } else { - for (int j = 0; j < N; j++) { - indices[Rank - LoopsLeft] = j; - all_same = par_for_comp(Sequence()); + const auto idxer = + parthenon::MakeIndexer(std::pair(bounds[Is].s, bounds[Is].e)...); + for (int idx = 0; idx < idxer.size(); idx++) { + const auto indices = idxer.GetIdxArray(idx); + if (arr_host_orig(indices[Is]...) + increment_data(indices[Is]...) 
!= + arr_host_mod(indices[Is]...)) { + all_same = false; } } return all_same; @@ -239,7 +233,7 @@ struct test_wrapper_nd_impl { template struct dispatch {}; - template + template struct dispatch, parthenon::TypeList> { template @@ -267,14 +261,14 @@ struct test_wrapper_nd_impl { parthenon::list_of_type_t>() .execute(exec_space, arr_dev, int_bounds, bounds); Kokkos::deep_copy(arr_host_mod, arr_dev); - REQUIRE(par_for_comp(Sequence()) == true); + REQUIRE(par_for_comp(Sequence()) == true); } SECTION("IndexRange launch bounds") { dispatch, parthenon::list_of_type_t>() .execute(exec_space, arr_dev, int_bounds, bounds); Kokkos::deep_copy(arr_host_mod, arr_dev); - REQUIRE(par_for_comp(Sequence()) == true); + REQUIRE(par_for_comp(Sequence()) == true); } } @@ -282,7 +276,7 @@ struct test_wrapper_nd_impl { void test_nest(OuterPattern outer_patter, InnerPattern inner_pattern) {} }; -template +template void test_wrapper_nd(DevExecSpace exec_space) { auto wrappernd = test_wrapper_nd_impl(); SECTION("LoopPatternFlatRange") { @@ -350,7 +344,7 @@ bool test_wrapper_nested_3d(OuterLoopPattern outer_loop_pattern, // Compute the scratch memory needs const int scratch_level = 0; - size_t scratch_size_in_bytes = parthenon::ScratchPad1D::shmem_size(N * N); + std::size_t scratch_size_in_bytes = parthenon::ScratchPad1D::shmem_size(N * N); // Compute the 2nd order centered derivative in x parthenon::par_for_outer( @@ -420,7 +414,7 @@ bool test_wrapper_nested_4d(OuterLoopPattern outer_loop_pattern, // Compute the scratch memory needs const int scratch_level = 0; - size_t scratch_size_in_bytes = parthenon::ScratchPad1D::shmem_size(N); + std::size_t scratch_size_in_bytes = parthenon::ScratchPad1D::shmem_size(N); parthenon::IndexRange rng{0, N - 1}; // Compute the 2nd order centered derivative in x @@ -544,9 +538,9 @@ TEST_CASE("Parallel scan", "[par_scan]") { } } -template +template struct test_wrapper_reduce_nd_impl { - template + template using Sequence = std::make_index_sequence; int 
indices[Rank - 1], int_bounds[2 * Rank]; parthenon::IndexRange bounds[Rank]; @@ -554,38 +548,35 @@ struct test_wrapper_reduce_nd_impl { test_wrapper_reduce_nd_impl() { h_sum = 0; - par_red_init(std::make_index_sequence(), h_sum); + par_red_init(std::make_index_sequence(), h_sum); } - template + template auto GetArray(std::index_sequence) { static_assert(sizeof...(Is) == Rank); return ParArrayND("device", N * Is...); } - template + template void par_red_init(std::index_sequence, int &sum) { - constexpr size_t id = Rank - LoopsLeft; - bounds[id].s = 0; - bounds[id].e = N - 1; - int_bounds[2 * id] = 0; - int_bounds[2 * id + 1] = N - 1; - if constexpr (LoopsLeft == 1) { - for (int i = 0; i < N; i++) { - sum += (i + ... + indices[Is]); - } - } else { - for (int j = 0; j < N; j++) { - indices[Rank - LoopsLeft] = j; - par_red_init(Sequence(), sum); - } + for (int id = 0; id < Rank; id++) { + bounds[id].s = 0; + bounds[id].e = N - 1; + int_bounds[2 * id] = 0; + int_bounds[2 * id + 1] = N - 1; + } + const auto idxer = + parthenon::MakeIndexer(std::pair(bounds[Is].s, bounds[Is].e)...); + for (int idx = 0; idx < idxer.size(); idx++) { + const auto indices = idxer.GetIdxArray(idx); + sum += (0 + ... 
+ indices[Is]); } } template struct dispatch {}; - template + template struct dispatch, parthenon::TypeList> { bool execute(DevExecSpace exec_space, const int h_sum, int *int_bounds, @@ -627,7 +618,7 @@ struct test_wrapper_reduce_nd_impl { void test_nest(OuterPattern outer_patter, InnerPattern inner_pattern) {} }; -template +template void test_wrapper_reduce_nd(DevExecSpace exec_space) { auto wrappernd = test_wrapper_reduce_nd_impl(); SECTION("LoopPatternFlatRange") { From 4fbef8f61f0c495a120e38b771c1d9249e402db1 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 6 Sep 2024 16:24:08 +0200 Subject: [PATCH 76/99] adding test coverage for outer loop patterns --- src/utils/type_list.hpp | 5 + tst/unit/kokkos_abstraction.cpp | 341 +++++++++++++++++--------------- 2 files changed, 192 insertions(+), 154 deletions(-) diff --git a/src/utils/type_list.hpp b/src/utils/type_list.hpp index 0b404b4208ab..9550a659f056 100644 --- a/src/utils/type_list.hpp +++ b/src/utils/type_list.hpp @@ -132,6 +132,11 @@ struct ListOfType { using type = concatenate_type_lists_t, typename ListOfType::type>; }; +template +struct ListOfType, T> { + using type = TypeList<>; +}; + template struct ListOfType, T> { using type = TypeList; diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 74806073a639..790d68003843 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -40,6 +41,12 @@ using parthenon::ParArray3D; using parthenon::ParArray4D; using Real = double; +template +using Sequence = std::make_index_sequence; + +template +void capture(Args... 
args) {} + template struct ParArrayND_impl {}; template <> @@ -130,6 +137,19 @@ struct HostArrayND_impl<7> { using type = parthenon::HostArray7D; }; +template +struct ScratchPadND_impl {}; +template <> +struct ScratchPadND_impl<1> { + template + using type = parthenon::ScratchPad1D; +}; +template <> +struct ScratchPadND_impl<2> { + template + using type = parthenon::ScratchPad2D; +}; + template auto ParArrayND(Args &&...args) { static_assert(ND <= 8, "ParArrayND supoorted up to ND=8"); @@ -141,6 +161,12 @@ auto HostArrayND(Args &&...args) { return typename HostArrayND_impl::template type(std::forward(args)...); } +template +auto ScratchPadND(Args &&...args) { + static_assert(ND <= 2, "ScratchPadND supported up to ND=2"); + return typename ScratchPadND_impl::template type(std::forward(args)...); +} + template struct SequenceOfInt {}; @@ -160,12 +186,16 @@ template using sequence_of_int_v = typename SequenceOfInt>::value; +template +auto GetArray_impl(Args... Ns) { + static_assert(sizeof...(Args) == Rank); + return ParArrayND("device", Ns...); +} + enum class lbounds { integer, indexrange }; template struct test_wrapper_nd_impl { - template - using Sequence = std::make_index_sequence; int int_bounds[2 * Rank]; parthenon::IndexRange bounds[Rank]; decltype(ParArrayND()) arr_dev; @@ -184,7 +214,7 @@ struct test_wrapper_nd_impl { template auto GetArray(std::index_sequence) { static_assert(sizeof...(Is) == Rank); - return ParArrayND("device", N * Is...); + return GetArray_impl(N * Is...); } template @@ -320,178 +350,183 @@ TEST_CASE("par_for loops", "[wrapper]") { SECTION("7D loops") { test_wrapper_nd<7, 10>(default_exec_space); } } -template -bool test_wrapper_nested_3d(OuterLoopPattern outer_loop_pattern, - InnerLoopPattern inner_loop_pattern, - DevExecSpace exec_space) { - // Compute the 2nd order centered derivative in x of i+1^2 * j+1^2 * k+1^2 - - const int N = 32; - ParArray3D dev_u("device u", N, N, N); - ParArray3D dev_du("device du", N, N, N - 2); - auto host_u 
= Kokkos::create_mirror(dev_u); - auto host_du = Kokkos::create_mirror(dev_du); - - // initialize with i^2 * j^2 * k^2 - for (int n = 0; n < N; n++) - for (int k = 0; k < N; k++) - for (int j = 0; j < N; j++) - for (int i = 0; i < N; i++) - host_u(k, j, i) = pow((i + 1) * (j + 2) * (k + 3), 2.0); - - // Copy host array content to device - Kokkos::deep_copy(dev_u, host_u); - - // Compute the scratch memory needs - const int scratch_level = 0; - std::size_t scratch_size_in_bytes = parthenon::ScratchPad1D::shmem_size(N * N); - - // Compute the 2nd order centered derivative in x - parthenon::par_for_outer( - outer_loop_pattern, "unit test Nested 3D", exec_space, scratch_size_in_bytes, - scratch_level, 0, N - 1, - - KOKKOS_LAMBDA(parthenon::team_mbr_t team_member, const int k) { - // Load a pencil in x to minimize DRAM accesses (and test scratch pad) - parthenon::ScratchPad2D scratch_u(team_member.team_scratch(scratch_level), - N, N); - parthenon::par_for_inner( - inner_loop_pattern, team_member, 0, N - 1, 0, N - 1, - [&](const int j, const int i) { scratch_u(j, i) = dev_u(k, j, i); }); - // Sync all threads in the team so that scratch memory is consistent - team_member.team_barrier(); - - // Compute the derivative from scratch memory - parthenon::par_for_inner(inner_loop_pattern, team_member, 0, N - 1, 1, N - 2, - [&](const int j, const int i) { - dev_du(k, j, i - 1) = - (scratch_u(j, i + 1) - scratch_u(j, i - 1)) / 2.; - }); - }); - - // Copy array back from device to host - Kokkos::deep_copy(host_du, dev_du); - - Real max_rel_err = -1; - const Real rel_tol = std::numeric_limits::epsilon(); +template +struct test_wrapper_nested_nd_impl { + Kokkos::Array bounds; + decltype(ParArrayND()) dev_u, dev_du; + decltype(HostArrayND()) host_u, host_du; + + test_wrapper_nested_nd_impl() { + dev_u = GetArray(sequence_of_int_v()); + dev_du = GetArray(sequence_of_int_v(), N - 2); + host_u = Kokkos::create_mirror(dev_u); + host_du = Kokkos::create_mirror(dev_du); + 
init(std::make_index_sequence()); + } - // compare data on the host - for (int k = 0; k < N; k++) { - for (int j = 0; j < N; j++) { - for (int i = 1; i < N - 1; i++) { - const Real analytic = 2.0 * (i + 1) * pow((j + 2) * (k + 3), 2.0); - const Real err = host_du(k, j, i - 1) - analytic; + template + auto GetArray(std::index_sequence, Args... Ns) { + return GetArray_impl(Is * N..., Ns...); + } - max_rel_err = fmax(fabs(err / analytic), max_rel_err); - } + template + void init(std::index_sequence) { + for (int id = 0; id < Rank; id++) { + bounds[id].s = 0; + bounds[id].e = N - 1; + } + const auto idxer = + parthenon::MakeIndexer(std::pair(bounds[Is].s, bounds[Is].e)...); + for (int idx = 0; idx < idxer.size(); idx++) { + const auto indices = idxer.GetIdxArray(idx); + // initialize with i^2 * j^2 * k^2 + host_u(indices[Is]...) = pow((1 * ... * (indices[Is] + 1 + Is)), 2.0); } + // Copy host array content to device + Kokkos::deep_copy(dev_u, host_u); } - return max_rel_err < rel_tol; -} + template + struct dispatch {}; -template -bool test_wrapper_nested_4d(OuterLoopPattern outer_loop_pattern, - InnerLoopPattern inner_loop_pattern, - DevExecSpace exec_space) { - // Compute the 2nd order centered derivative in x of i+1^2 * j+1^2 * k+1^2 * n+1^2 - - const int N = 32; - ParArray4D dev_u("device u", N, N, N, N); - ParArray4D dev_du("device du", N, N, N, N - 2); - auto host_u = Kokkos::create_mirror(dev_u); - auto host_du = Kokkos::create_mirror(dev_du); - - // initialize with i^2 * j^2 * k^2 - for (int n = 0; n < N; n++) - for (int k = 0; k < N; k++) - for (int j = 0; j < N; j++) - for (int i = 0; i < N; i++) - host_u(n, k, j, i) = pow((i + 1) * (j + 2) * (k + 3) * (n + 4), 2.0); - - // Copy host array content to device - Kokkos::deep_copy(dev_u, host_u); - - // Compute the scratch memory needs - const int scratch_level = 0; - std::size_t scratch_size_in_bytes = parthenon::ScratchPad1D::shmem_size(N); - parthenon::IndexRange rng{0, N - 1}; - - // Compute the 2nd order 
centered derivative in x - parthenon::par_for_outer( - outer_loop_pattern, "unit test Nested 4D", exec_space, scratch_size_in_bytes, - scratch_level, 0, N - 1, 0, N - 1, 0, N - 1, - - KOKKOS_LAMBDA(parthenon::team_mbr_t team_member, const int n, const int k, - const int j) { - // Load a pencil in x to minimize DRAM accesses (and test scratch pad) - parthenon::ScratchPad1D scratch_u(team_member.team_scratch(scratch_level), - N); - parthenon::par_for_inner(inner_loop_pattern, team_member, rng, - [&](const int i) { scratch_u(i) = dev_u(n, k, j, i); }); - // Sync all threads in the team so that scratch memory is consistent - team_member.team_barrier(); - - // Compute the derivative from scratch memory - parthenon::par_for_inner( - inner_loop_pattern, team_member, 1, N - 2, [&](const int i) { - dev_du(n, k, j, i - 1) = (scratch_u(i + 1) - scratch_u(i - 1)) / 2.; - }); - }); - - // Copy array back from device to host - Kokkos::deep_copy(host_du, dev_du); - - Real max_rel_err = -1; - const Real rel_tol = std::numeric_limits::epsilon(); + template + struct dispatch, + parthenon::TypeList, InnerPattern, + std::index_sequence, parthenon::TypeList> { + using team_mbr_t = parthenon::team_mbr_t; + static constexpr std::size_t Nouter = sizeof...(OuterIs); + static constexpr std::size_t Ninner = Rank - Nouter - 1; - // compare data on the host - for (int n = 0; n < N; n++) { - for (int k = 0; k < N; k++) { - for (int j = 0; j < N; j++) { - for (int i = 1; i < N - 1; i++) { - const Real analytic = 2.0 * (i + 1) * pow((j + 2) * (k + 3) * (n + 4), 2.0); - const Real err = host_du(n, k, j, i - 1) - analytic; - - max_rel_err = fmax(fabs(err / analytic), max_rel_err); - } + template + void execute(DevExecSpace exec_space, view_t &dev_u, view_t &dev_du, + Kokkos::Array bounds) { + // Compute the scratch memory needs + const int scratch_level = 0; + std::size_t scratch_size_in_bytes = + parthenon::ScratchPad1D::shmem_size(pow(N, Ninner)); + + parthenon::par_for( + OuterPattern(), "unit 
test ND nested", exec_space, bounds[OuterIs]..., + KOKKOS_CLASS_LAMBDA(team_mbr_t team_member, OuterArgs... outer_args) { + auto scratch_u = GetScratchPad(std::make_index_sequence(), + team_member, scratch_level); + + parthenon::par_for_inner( + InnerPattern(), team_member, bounds[Nouter + InnerIs]..., + bounds[Rank - 1], [&](InnerArgs... inner_args, const int i) { + scratch_u(inner_args..., i) = dev_u(outer_args..., inner_args..., i); + }); + // Sync all threads in the team so that scratch memory is consistent + team_member.team_barrier(); + + // Compute the derivative from scratch memory + parthenon::par_for_inner(InnerPattern(), team_member, + bounds[Nouter + InnerIs]..., 1, N - 2, + [&](InnerArgs... inner_args, const int i) { + dev_du(outer_args..., inner_args..., i) = + (scratch_u(inner_args..., i + 1) - + scratch_u(inner_args..., i - 1)) / + 2.; + }); + }); + } + + template + KOKKOS_INLINE_FUNCTION auto GetScratchPad(std::index_sequence, + team_mbr_t team_member, + const int &scratch_level) const { + return ScratchPadND(team_member.team_scratch(scratch_level), + N * Is...); + } + }; + + template + bool test_comp(std::index_sequence) { + // Copy array back from device to host + Kokkos::deep_copy(host_du, dev_du); + + Real max_rel_err = -1; + const Real rel_tol = std::numeric_limits::epsilon(); + + auto idxer = + parthenon::MakeIndexer(std::pair(bounds[Is].s, bounds[Is].e)...); + for (int idx = 0; idx < idxer.size(); idx++) { + auto indices = idxer.GetIdxArray(idx); + for (int i = 1 + bounds[Rank - 1].s; i < bounds[Rank - 1].e - 1; i++) { + const Real analytic = + 2.0 * (i + Rank) * pow((1 * ... 
* (indices[Is] + 1 + Is)), 2.0); + const Real err = host_du(indices[Is]..., i) - analytic; + max_rel_err = fmax(fabs(err / analytic), max_rel_err); } } + + return max_rel_err < rel_tol; } - return max_rel_err < rel_tol; -} + template + bool test(OuterPattern, InnerPattern, DevExecSpace exec_space) { + Kokkos::deep_copy(dev_du, -11111.0); + constexpr std::size_t Nouter = Rank - Ninner; + dispatch, + parthenon::list_of_type_t, InnerPattern, + std::make_index_sequence, + parthenon::list_of_type_t>() + .execute(exec_space, dev_u, dev_du, bounds); + + Kokkos::fence(); + return test_comp(std::make_index_sequence()); + } +}; -TEST_CASE("nested par_for loops", "[wrapper]") { +template +void test_nested_nd() { auto default_exec_space = DevExecSpace(); - - SECTION("3D nested loops") { - REQUIRE(test_wrapper_nested_3d(parthenon::outer_loop_pattern_teams_tag, - parthenon::inner_loop_pattern_tvr_tag, - default_exec_space) == true); - + auto test_nested_ND = test_wrapper_nested_nd_impl(); + SECTION("Inner collaspe 1") { + REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, + parthenon::inner_loop_pattern_tvr_tag, + default_exec_space) == true); + REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, + parthenon::inner_loop_pattern_ttr_tag, + default_exec_space) == true); if constexpr (std::is_same::value) { - REQUIRE(test_wrapper_nested_3d(parthenon::outer_loop_pattern_teams_tag, - parthenon::inner_loop_pattern_simdfor_tag, - default_exec_space) == true); - } - } + REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, - SECTION("4D nested loops") { - REQUIRE(test_wrapper_nested_4d(parthenon::outer_loop_pattern_teams_tag, - parthenon::inner_loop_pattern_tvr_tag, - default_exec_space) == true); + parthenon::inner_loop_pattern_simdfor_tag, + default_exec_space) == true); + } + } + SECTION("Inner collaspe 2") { + REQUIRE(test_nested_ND.template test<2>(parthenon::outer_loop_pattern_teams_tag, + 
parthenon::inner_loop_pattern_tvr_tag, + default_exec_space) == true); + REQUIRE(test_nested_ND.template test<2>(parthenon::outer_loop_pattern_teams_tag, + parthenon::inner_loop_pattern_ttr_tag, + default_exec_space) == true); if constexpr (std::is_same::value) { - REQUIRE(test_wrapper_nested_4d(parthenon::outer_loop_pattern_teams_tag, - parthenon::inner_loop_pattern_simdfor_tag, - default_exec_space) == true); + REQUIRE(test_nested_ND.template test<2>(parthenon::outer_loop_pattern_teams_tag, + + parthenon::inner_loop_pattern_simdfor_tag, + + default_exec_space) == true); } } } +TEST_CASE("nested par_for loops", "[wrapper]") { + auto default_exec_space = DevExecSpace(); + + SECTION("3D nested loops") { test_nested_nd<3, 32>(); } + SECTION("4D nested loops") { test_nested_nd<4, 32>(); } + SECTION("5D nested loops") { test_nested_nd<5, 10>(); } + SECTION("6D nested loops") { test_nested_nd<6, 10>(); } + SECTION("7D nested loops") { test_nested_nd<7, 10>(); } +} + template bool test_wrapper_scan_1d(T loop_pattern, DevExecSpace exec_space) { const int N = 10; @@ -540,8 +575,6 @@ TEST_CASE("Parallel scan", "[par_scan]") { template struct test_wrapper_reduce_nd_impl { - template - using Sequence = std::make_index_sequence; int indices[Rank - 1], int_bounds[2 * Rank]; parthenon::IndexRange bounds[Rank]; int h_sum; From 0ee6005a46ae7693d0f32e170118b6bd1708a0f1 Mon Sep 17 00:00:00 2001 From: Adam Date: Fri, 6 Sep 2024 16:43:12 +0000 Subject: [PATCH 77/99] fix par_for_outer tests on gpu --- tst/unit/kokkos_abstraction.cpp | 40 ++++++++++++--------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 790d68003843..8a091c433009 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -44,9 +44,6 @@ using Real = double; template using Sequence = std::make_index_sequence; -template -void capture(Args... 
args) {} - template struct ParArrayND_impl {}; template <> @@ -162,7 +159,7 @@ auto HostArrayND(Args &&...args) { } template -auto ScratchPadND(Args &&...args) { +KOKKOS_INLINE_FUNCTION auto ScratchPadND(Args &&...args) { static_assert(ND <= 2, "ScratchPadND supported up to ND=2"); return typename ScratchPadND_impl::template type(std::forward(args)...); } @@ -404,10 +401,12 @@ struct test_wrapper_nested_nd_impl { // Compute the scratch memory needs const int scratch_level = 0; std::size_t scratch_size_in_bytes = - parthenon::ScratchPad1D::shmem_size(pow(N, Ninner)); + ScratchPadND_impl::template type::shmem_size( + pow(N, Ninner + 1)); - parthenon::par_for( - OuterPattern(), "unit test ND nested", exec_space, bounds[OuterIs]..., + parthenon::par_for_outer( + OuterPattern(), "unit test ND nested", exec_space, scratch_size_in_bytes, + scratch_level, bounds[OuterIs]..., KOKKOS_CLASS_LAMBDA(team_mbr_t team_member, OuterArgs... outer_args) { auto scratch_u = GetScratchPad(std::make_index_sequence(), team_member, scratch_level); @@ -483,32 +482,21 @@ template void test_nested_nd() { auto default_exec_space = DevExecSpace(); auto test_nested_ND = test_wrapper_nested_nd_impl(); - SECTION("Inner collaspe 1") { + SECTION("TVR") { REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, parthenon::inner_loop_pattern_tvr_tag, default_exec_space) == true); + } + SECTION("TTR") { REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, - parthenon::inner_loop_pattern_ttr_tag, - default_exec_space) == true); - if constexpr (std::is_same::value) { - REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, - - parthenon::inner_loop_pattern_simdfor_tag, - default_exec_space) == true); - } - } - SECTION("Inner collaspe 2") { - REQUIRE(test_nested_ND.template test<2>(parthenon::outer_loop_pattern_teams_tag, - parthenon::inner_loop_pattern_tvr_tag, - default_exec_space) == true); - REQUIRE(test_nested_ND.template 
test<2>(parthenon::outer_loop_pattern_teams_tag, parthenon::inner_loop_pattern_ttr_tag, default_exec_space) == true); - if constexpr (std::is_same::value) { - REQUIRE(test_nested_ND.template test<2>(parthenon::outer_loop_pattern_teams_tag, + } + if constexpr (std::is_same::value) { + SECTION("SimdFor") { + REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, parthenon::inner_loop_pattern_simdfor_tag, From 6e3775397a4358bc381283cfe22af9acae6b41df Mon Sep 17 00:00:00 2001 From: Adam Date: Fri, 6 Sep 2024 18:46:03 +0000 Subject: [PATCH 78/99] cleanup scratch memory for par_for_outer test --- tst/unit/kokkos_abstraction.cpp | 65 +++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 8a091c433009..6ef5c8a2b333 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -399,10 +399,14 @@ struct test_wrapper_nested_nd_impl { void execute(DevExecSpace exec_space, view_t &dev_u, view_t &dev_du, Kokkos::Array bounds) { // Compute the scratch memory needs + Kokkos::Array shape; + for (int i = 0; i < Ninner + 1; i++) { + shape[i] = N; + } const int scratch_level = 0; std::size_t scratch_size_in_bytes = ScratchPadND_impl::template type::shmem_size( - pow(N, Ninner + 1)); + shape[InnerIs]..., N); parthenon::par_for_outer( OuterPattern(), "unit test ND nested", exec_space, scratch_size_in_bytes, @@ -482,26 +486,57 @@ template void test_nested_nd() { auto default_exec_space = DevExecSpace(); auto test_nested_ND = test_wrapper_nested_nd_impl(); - SECTION("TVR") { - REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, - parthenon::inner_loop_pattern_tvr_tag, - default_exec_space) == true); - } - SECTION("TTR") { - REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, + SECTION("collapse inner 1") { + SECTION("TVR") { + REQUIRE(test_nested_ND.template 
test<1>(parthenon::outer_loop_pattern_teams_tag, + parthenon::inner_loop_pattern_tvr_tag, + default_exec_space) == true); + } + SECTION("TTR") { + REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, - parthenon::inner_loop_pattern_ttr_tag, - default_exec_space) == true); + parthenon::inner_loop_pattern_ttr_tag, + default_exec_space) == true); + } + if constexpr (std::is_same::value) { + SECTION("SimdFor") { + REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, + + parthenon::inner_loop_pattern_simdfor_tag, + + default_exec_space) == true); + } + } } - if constexpr (std::is_same::value) { - SECTION("SimdFor") { - REQUIRE(test_nested_ND.template test<1>(parthenon::outer_loop_pattern_teams_tag, + SECTION("collapse inner 2") { + SECTION("TVR") { + REQUIRE(test_nested_ND.template test<2>(parthenon::outer_loop_pattern_teams_tag, + parthenon::inner_loop_pattern_tvr_tag, + default_exec_space) == true); + } + SECTION("TTR") { + REQUIRE(test_nested_ND.template test<2>(parthenon::outer_loop_pattern_teams_tag, - parthenon::inner_loop_pattern_simdfor_tag, + parthenon::inner_loop_pattern_ttr_tag, + default_exec_space) == true); + } + SECTION("collapse<1>") { + REQUIRE(test_nested_ND.template test<2>(parthenon::outer_loop_pattern_teams_tag, + parthenon::InnerLoopCollapse<1>(), default_exec_space) == true); } + if constexpr (std::is_same::value) { + SECTION("SimdFor") { + REQUIRE(test_nested_ND.template test<2>(parthenon::outer_loop_pattern_teams_tag, + + parthenon::inner_loop_pattern_simdfor_tag, + + default_exec_space) == true); + } + } } } From 1976164986c16850cd4e0bd4ec1da7a18531734f Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 7 Sep 2024 00:21:11 +0200 Subject: [PATCH 79/99] adding example par_for* test cases --- tst/unit/kokkos_abstraction.cpp | 90 +++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp 
index 6ef5c8a2b333..5d6aa5c82768 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -29,9 +29,12 @@ #include "Kokkos_Macros.hpp" #include "basic_types.hpp" +#include "config.hpp" #include "kokkos_abstraction.hpp" +#include "kokkos_types.hpp" #include "parthenon_array_generic.hpp" #include "utils/indexer.hpp" +#include "utils/instrument.hpp" #include "utils/type_list.hpp" using parthenon::DevExecSpace; @@ -298,14 +301,12 @@ struct test_wrapper_nd_impl { REQUIRE(par_for_comp(Sequence()) == true); } } - - template - void test_nest(OuterPattern outer_patter, InnerPattern inner_pattern) {} }; template void test_wrapper_nd(DevExecSpace exec_space) { auto wrappernd = test_wrapper_nd_impl(); + SECTION("LoopPatternFlatRange") { wrappernd.test(parthenon::loop_pattern_flatrange_tag, exec_space); } @@ -477,7 +478,6 @@ struct test_wrapper_nested_nd_impl { parthenon::list_of_type_t>() .execute(exec_space, dev_u, dev_du, bounds); - Kokkos::fence(); return test_comp(std::make_index_sequence()); } }; @@ -704,3 +704,85 @@ TEST_CASE("Parallel reduce", "[par_reduce]") { SECTION("6D loops") { test_wrapper_reduce_nd<6, 10>(default_exec_space); } SECTION("7D loops") { test_wrapper_reduce_nd<7, 10>(default_exec_space); } } + +TEST_CASE("DEFAULT loop patterns", "[default]") { + constexpr std::size_t N = 32; + SECTION("par_for") { + auto wrapper_par_for = test_wrapper_nd_impl<3, N>(); + auto arr_host_orig = wrapper_par_for.arr_host_orig; + auto dev_view = wrapper_par_for.arr_dev; + Kokkos::deep_copy(dev_view, arr_host_orig); + + // loop bounds with integer pairs + parthenon::par_for( + PARTHENON_AUTO_LABEL, 0, N - 1, 0, N - 1, 0, N - 1, + KOKKOS_LAMBDA(const int k, const int j, const int i) { + dev_view(k, j, i) += wrapper_par_for.increment_data(k, j, i); + }); + + auto arr_host_mod = wrapper_par_for.arr_host_mod; + Kokkos::deep_copy(arr_host_mod, dev_view); + bool all_same = true; + // seq_for can be used just like par_for but will translate to raw for 
loops + parthenon::seq_for( + 0, N - 1, 0, N - 1, 0, N - 1, [&](const int k, const int j, const int i) { + if (arr_host_orig(k, j, i) + wrapper_par_for.increment_data(k, j, i) != + dev_view(k, j, i)) { + all_same = false; + } + }); + REQUIRE(all_same); + } + + SECTION("par_reduce") { + auto wrapper_par_reduce = test_wrapper_reduce_nd_impl<3, N>(); + int test_sum = 0; + // parthenon::IndexRange as loop bounds + using Idr = parthenon::IndexRange; + parthenon::par_reduce( + PARTHENON_AUTO_LABEL, Idr{0, N - 1}, Idr{0, N - 1}, Idr{0, N - 1}, + KOKKOS_LAMBDA(const int k, const int j, const int i, int &sum) { + sum += k + j + i; + }, + Kokkos::Sum(test_sum)); + auto h_sum = wrapper_par_reduce.h_sum; + REQUIRE(h_sum == test_sum); + } + + SECTION("par_for_outer") { + auto wrapper_par_for_outer = test_wrapper_nested_nd_impl<3, N>(); + const int scratch_level = 0; + std::size_t scratch_size_in_bytes = parthenon::ScratchPad1D::shmem_size(N); + + auto dev_du = wrapper_par_for_outer.dev_du; + auto dev_u = wrapper_par_for_outer.dev_u; + + parthenon::IndexRange interior_bnds{1, N - 2}; + parthenon::par_for_outer( + PARTHENON_AUTO_LABEL, scratch_size_in_bytes, scratch_level, 0, N - 1, 0, N - 1, + KOKKOS_LAMBDA(parthenon::team_mbr_t team_member, const int k, const int j) { + auto scratch_u = + parthenon::ScratchPad1D(team_member.team_scratch(scratch_level), N); + + parthenon::par_for_inner(team_member, 0, N - 1, + [&](const int i) { scratch_u(i) = dev_u(k, j, i); }); + team_member.team_barrier(); + parthenon::par_for_inner(team_member, interior_bnds, [&](const int i) { + dev_du(k, j, i - 1) = (scratch_u(i + 1) - scratch_u(i - 1)) / 2.; + }); + }); + + Real max_rel_err = -1; + const Real rel_tol = std::numeric_limits::epsilon(); + auto host_du = wrapper_par_for_outer.host_du; + Kokkos::deep_copy(host_du, dev_du); + // mixing IndexRange & integer loop bounds + parthenon::seq_for(0, N - 1, 0, N - 1, interior_bnds, + [&](const int k, const int j, const int i) { + const Real analytic = 
2. * (i + 3) * pow((k + 1) * (j + 2), 2.0); + const Real err = fabs(analytic - host_du(k, j, i - 1)); + max_rel_err = fmax(fabs(err / analytic), max_rel_err); + }); + REQUIRE(max_rel_err < rel_tol); + } +} From 7c20e6d15983094358922a1408fc97b3e579b4ff Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 7 Sep 2024 13:22:23 +0200 Subject: [PATCH 80/99] LoopPatternCollapse -> LoopPatternTeamThreadVec --- src/kokkos_abstraction.hpp | 39 +++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 13b4b20e71b5..3bb5048d8f41 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -53,27 +53,27 @@ static struct LoopPatternFlatRange { // a 1:1 indices matching static struct LoopPatternMDRange { } loop_pattern_mdrange_tag; -// Translates to a Kokkos::TeamPolicy that collapse Nthread & Nvector inner loop collapses +// Translates to a Kokkos::TeamPolicy that collapse Nthread & Nvector inner loops template -struct LoopPatternCollapse : std::true_type { +struct LoopPatternTeamThreadVec { static constexpr std::size_t Nthread = num_thread; static constexpr std::size_t Nvector = num_vector; }; // Translates to a Kokkos::TeamPolicy with a single inner // Kokkos::TeamThreadRange -using LoopPatternTPTTR = LoopPatternCollapse<1, 0>; +using LoopPatternTPTTR = LoopPatternTeamThreadVec<1, 0>; constexpr auto loop_pattern_tpttr_tag = LoopPatternTPTTR(); // Translates to a Kokkos::TeamPolicy with a single inner // Kokkos::ThreadVectorRange -using LoopPatternTPTVR = LoopPatternCollapse<0, 1>; +using LoopPatternTPTVR = LoopPatternTeamThreadVec<0, 1>; constexpr auto loop_pattern_tptvr_tag = LoopPatternTPTVR(); // Translates to a Kokkos::TeamPolicy with a middle Kokkos::TeamThreadRange and // inner Kokkos::ThreadVectorRange -using LoopPatternTPTTRTVR = LoopPatternCollapse<1, 1>; +using LoopPatternTPTTRTVR = LoopPatternTeamThreadVec<1, 1>; constexpr auto 
loop_pattern_tpttrtvr_tag = LoopPatternTPTTRTVR(); // Translates to an outer team policy -using LoopPatternTeamOuter = LoopPatternCollapse<0, 0>; +using LoopPatternTeamOuter = LoopPatternTeamThreadVec<0, 0>; constexpr auto loop_pattern_team_outer_tag = LoopPatternTeamOuter(); // Used to catch undefined behavior as it results in throwing an error static struct LoopPatternUndefined { @@ -85,15 +85,17 @@ static struct LoopPatternUndefined { static struct OuterLoopPatternTeams { } outer_loop_pattern_teams_tag; -template -struct InnerLoopCollapse {}; +// collapses Nvector inner loops over a VectorRange policy and remaining over a +// ThreadRange +template +struct InnerLoopThreadVec {}; // Inner loop pattern tags must be constexpr so they're available on device // Translate to a Kokkos::TeamVectorRange as innermost loop (single index) -using InnerLoopPatternTVR = InnerLoopCollapse<1>; +using InnerLoopPatternTVR = InnerLoopThreadVec<1>; constexpr InnerLoopPatternTVR inner_loop_pattern_tvr_tag; // Translates to a Kokkos::TeamThreadRange as innermost loop -using InnerLoopPatternTTR = InnerLoopCollapse<0>; +using InnerLoopPatternTTR = InnerLoopThreadVec<0>; constexpr InnerLoopPatternTTR inner_loop_pattern_ttr_tag; // Translate to a non-Kokkos plain C++ innermost loop (single index) // decorated with #pragma omp simd @@ -102,27 +104,27 @@ struct InnerLoopPatternSimdFor {}; constexpr InnerLoopPatternSimdFor inner_loop_pattern_simdfor_tag; // trait to track if pattern requests any type of hierarchial parallelism -template +template struct UsesHierarchialPar : std::false_type { static constexpr std::size_t Nvector = 0; static constexpr std::size_t Nthread = 0; }; -template -struct UsesHierarchialPar, Rank> +template +struct UsesHierarchialPar> : std::true_type { static constexpr std::size_t Nthread = num_thread; static constexpr std::size_t Nvector = num_vector; }; -template -struct UsesHierarchialPar : std::true_type { +template <> +struct UsesHierarchialPar : std::true_type { 
static constexpr std::size_t Nvector = 0; static constexpr std::size_t Nthread = 0; }; -template -struct UsesHierarchialPar, Rank> : std::true_type { +template +struct UsesHierarchialPar> : std::true_type { static constexpr std::size_t Nvector = num_vector; }; @@ -189,8 +191,7 @@ struct DispatchType { using Translator = LoopBoundTranslator; static constexpr std::size_t Rank = Translator::Rank; - using HierarchialPar = - UsesHierarchialPar>; + using HierarchialPar = UsesHierarchialPar; static constexpr bool is_ParFor = std::is_same::value; From 8758aef9736059fae222193def872167c07ffd95 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 7 Sep 2024 13:22:52 +0200 Subject: [PATCH 81/99] cleaning up LoopBounds Translator --- src/loop_bounds.hpp | 143 +++++++++++++++++++++----------- tst/unit/kokkos_abstraction.cpp | 46 +++++++++- 2 files changed, 136 insertions(+), 53 deletions(-) diff --git a/src/loop_bounds.hpp b/src/loop_bounds.hpp index e3bcbb24034e..54929bdc52f6 100644 --- a/src/loop_bounds.hpp +++ b/src/loop_bounds.hpp @@ -14,6 +14,7 @@ #ifndef LOOP_BOUNDS_HPP_ #define LOOP_BOUNDS_HPP_ +#include #include #include @@ -24,67 +25,109 @@ namespace parthenon { -// Struct for translating between loop bounds given to par_dispatch into an array of -// IndexRanges -// -template -struct LoopBoundTranslator { - private: - using BoundTypes = TypeList; - // overloads for different launch bound types. 
- template - KOKKOS_INLINE_FUNCTION void GetIndexRanges_impl(const int idx, const int s, const int e, - Bounds &&...bounds) { +// struct that can be specialized to register new types that can be processed to obtain +// loop bounds in a par_for* loop +template +struct ProcessLoopBound : std::false_type { + template + static constexpr std::size_t GetNumBounds(TypeList) { + static_assert(always_false, "Invalid loop bound type"); + return 0; + } + + template + KOKKOS_INLINE_FUNCTION static void + GetIndexRanges(const int &idx, Kokkos::Array &bound_arr, Bound &bound, + Bnds &&...bounds) { + static_assert(always_false, "Invalid loop bound type"); + } +}; + +namespace LoopBounds { +template +constexpr std::size_t GetNumBounds(TypeList) { + if constexpr (sizeof...(Bnds) > 0) { + using TL = TypeList; + using NextBound = ProcessLoopBound>>; + static_assert(NextBound::value, "unrecognized loop bound"); + return NextBound::GetNumBounds(TL()); + } + return 0; +} + +template +KOKKOS_INLINE_FUNCTION void GetIndexRanges(const int &idx, + Kokkos::Array &bound_arr, + Bnds &&...bounds) { + if constexpr (sizeof...(Bnds) > 0) { + using TL = TypeList; + using NextBound = typename TypeList::template type<0>; + ProcessLoopBound>::GetIndexRanges(idx, bound_arr, + std::forward(bounds)...); + } +} +} // namespace LoopBounds + +template +struct ProcessLoopBound>> + : std::true_type { + + template + static constexpr std::size_t GetNumBounds(TypeList) { + static_assert(std::is_integral_v> && + std::is_integral_v>, + "Integer bounds must come in pairs"); + + return 2 + LoopBounds::GetNumBounds(TypeList()); + } + + template + KOKKOS_INLINE_FUNCTION static void + GetIndexRanges(const int &idx, Kokkos::Array &bound_arr, const int &s, + const int &e, Bnds &&...bounds) { bound_arr[idx].s = s; bound_arr[idx].e = e; - if constexpr (sizeof...(Bounds) > 0) { - GetIndexRanges_impl(idx + 1, std::forward(bounds)...); - } - } - template - KOKKOS_INLINE_FUNCTION void GetIndexRanges_impl(const int idx, const 
IndexRange ir, - Bounds &&...bounds) { - bound_arr[idx] = ir; - if constexpr (sizeof...(Bounds) > 0) { - GetIndexRanges_impl(idx + 1, std::forward(bounds)...); - } + LoopBounds::GetIndexRanges(idx + 1, bound_arr, std::forward(bounds)...); } +}; - using Bound_tl = TypeList; +template <> +struct ProcessLoopBound : std::true_type { + template + using isIdRng = std::is_same; - public: - template - static constexpr bool isBoundType() { - using btype = base_type; - return std::is_same_v || std::is_integral_v; + template + static constexpr std::size_t GetNumBounds(TypeList) { + static_assert(std::is_same_v, IndexRange>, + "expected IndexRange loop bound"); + + return 2 + LoopBounds::GetNumBounds(TypeList()); } - template - static constexpr std::size_t GetNumBounds(TypeList) { - using TL = TypeList; - if constexpr (sizeof...(Bnds) == 0) { - return 0; - } else { - using Bnd0 = typename TL::template type<0>; - static_assert(isBoundType(), "unrecognized launch bound in par_dispatch"); - if constexpr (std::is_same_v, IndexRange>) { - return 2 + GetNumBounds(typename TL::template continuous_sublist<1>()); - } else if constexpr (std::is_integral_v>) { - using Bnd1 = typename TL::template type<1>; - static_assert(std::is_integral_v>, - "integer launch bounds need to come in (start, end) pairs"); - return 2 + GetNumBounds(typename TL::template continuous_sublist<2>()); - } - } - // should never get here but makes older cuda compilers happy - return 0; + template + KOKKOS_INLINE_FUNCTION static void + GetIndexRanges(const int &idx, Kokkos::Array &bound_arr, + const IndexRange &idr, Bnds &&...bounds) { + bound_arr[idx] = idr; + LoopBounds::GetIndexRanges(idx + 1, bound_arr, std::forward(bounds)...); } - static constexpr std::size_t Rank = GetNumBounds(Bound_tl()) / 2; - Kokkos::Array bound_arr; +}; + +// Struct for translating between loop bounds given to par_dispatch into an array of +// IndexRanges +template +struct LoopBoundTranslator { + public: + // make sure all the 
Bound_ts... types are valid loop bounds and count the number of + // bounds contained in each type + static constexpr std::size_t Rank = + LoopBounds::GetNumBounds(TypeList()) / 2; + // process all of the loop bounds into an array of IndexRanges KOKKOS_INLINE_FUNCTION Kokkos::Array GetIndexRanges(Bound_ts &&...bounds) { - GetIndexRanges_impl(0, std::forward(bounds)...); + Kokkos::Array bound_arr; + LoopBounds::GetIndexRanges(0, bound_arr, std::forward(bounds)...); return bound_arr; } }; diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 5d6aa5c82768..8eef2de6cfc9 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -32,6 +32,7 @@ #include "config.hpp" #include "kokkos_abstraction.hpp" #include "kokkos_types.hpp" +#include "loop_bounds.hpp" #include "parthenon_array_generic.hpp" #include "utils/indexer.hpp" #include "utils/instrument.hpp" @@ -524,7 +525,7 @@ void test_nested_nd() { SECTION("collapse<1>") { REQUIRE(test_nested_ND.template test<2>(parthenon::outer_loop_pattern_teams_tag, - parthenon::InnerLoopCollapse<1>(), + parthenon::InnerLoopThreadVec<1>(), default_exec_space) == true); } if constexpr (std::is_same(default_exec_space); } } +// add our own loop bound type +struct IndexRange3D { + parthenon::IndexRange i1{0, 0}; + parthenon::IndexRange i2{0, 0}; + parthenon::IndexRange i3{0, 0}; +}; +namespace parthenon { +template <> +struct ProcessLoopBound : std::true_type { + template + static constexpr std::size_t GetNumBounds(TypeList) { + static_assert(std::is_same_v, IndexRange3D>); + return 6 + LoopBounds::GetNumBounds(TypeList()); + } + + template + KOKKOS_INLINE_FUNCTION static void + GetIndexRanges(const int &idx, Kokkos::Array &bound_arr, + const IndexRange3D &idr3d, Bnds &&...bounds) { + bound_arr[idx] = idr3d.i3; + bound_arr[idx + 1] = idr3d.i2; + bound_arr[idx + 2] = idr3d.i1; + LoopBounds::GetIndexRanges(idx + 3, bound_arr, std::forward(bounds)...); + } +}; +} // namespace parthenon 
+ TEST_CASE("DEFAULT loop patterns", "[default]") { constexpr std::size_t N = 32; SECTION("par_for") { @@ -738,15 +766,27 @@ TEST_CASE("DEFAULT loop patterns", "[default]") { auto wrapper_par_reduce = test_wrapper_reduce_nd_impl<3, N>(); int test_sum = 0; // parthenon::IndexRange as loop bounds - using Idr = parthenon::IndexRange; + parthenon::IndexRange Idr{0, N - 1}; parthenon::par_reduce( - PARTHENON_AUTO_LABEL, Idr{0, N - 1}, Idr{0, N - 1}, Idr{0, N - 1}, + PARTHENON_AUTO_LABEL, Idr, Idr, Idr, KOKKOS_LAMBDA(const int k, const int j, const int i, int &sum) { sum += k + j + i; }, Kokkos::Sum(test_sum)); auto h_sum = wrapper_par_reduce.h_sum; REQUIRE(h_sum == test_sum); + + // our custom IndexRange3D as loop bounds + IndexRange3D idr3d{Idr, Idr, Idr}; + int test_sum2 = 0; + parthenon::par_reduce( + PARTHENON_AUTO_LABEL, idr3d, + KOKKOS_LAMBDA(const int k, const int j, const int i, int &sum) { + sum += k + j + i; + }, + Kokkos::Sum(test_sum2)); + + REQUIRE(h_sum == test_sum2); } SECTION("par_for_outer") { From 4fa3932f846711fcd7961265968e011bcef762cd Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 7 Sep 2024 15:23:04 +0200 Subject: [PATCH 82/99] workaround for rtd theme table box line breaks --- doc/sphinx/_static/css/custom.css | 5 +++++ doc/sphinx/conf.py | 1 + 2 files changed, 6 insertions(+) create mode 100644 doc/sphinx/_static/css/custom.css diff --git a/doc/sphinx/_static/css/custom.css b/doc/sphinx/_static/css/custom.css new file mode 100644 index 000000000000..9d1d7fe8d309 --- /dev/null +++ b/doc/sphinx/_static/css/custom.css @@ -0,0 +1,5 @@ + /* The rtd theme doesn't break lines on long text in table boxes. 
This is a workaround taken from */ + /* https://github.com/Korne127/docs.appimage.org/commit/6040a810ff32d2b22c21115caa3f9396de0eda70 */ +.wy-table-responsive table td, .wy-table-responsive table th { + white-space: normal; +} diff --git a/doc/sphinx/conf.py b/doc/sphinx/conf.py index dc548e4f3378..aa7edd2b2152 100644 --- a/doc/sphinx/conf.py +++ b/doc/sphinx/conf.py @@ -49,6 +49,7 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] +html_css_files = ["css/custom.css"] # configuration for sphinx_multiversion smv_remote_whitelist = r"^(origin)$" From ce39cccfee77802ebb955e37658effd3e0a473bb Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 7 Sep 2024 16:17:01 +0200 Subject: [PATCH 83/99] par_for docs --- doc/sphinx/src/par_for.rst | 132 +++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 doc/sphinx/src/par_for.rst diff --git a/doc/sphinx/src/par_for.rst b/doc/sphinx/src/par_for.rst new file mode 100644 index 000000000000..7b2ba89be2b2 --- /dev/null +++ b/doc/sphinx/src/par_for.rst @@ -0,0 +1,132 @@ +.. par_for: + +Parallelism +=========== + +The loop wrappers documented here abstract the ``Kokkos::parallel_*`` parallel launches. The wrappers +simplify the use of Kokkos `execution policies `_ +for multidimensional loops through a common interface using loop pattern tags. + +Additionally there is a provided ``parthenon::seq_for`` wrapper that uses a similar interface to perform +multidimensional sequential loops. + +An example of usage can be found in `the unit +test `__ + +.. 
list-table:: parallel launches + :widths: 25 25 + :header-rows: 1 + + * - Parthenon + - Kokkos + * - ``par_for`` + - ``parallel_for`` + * - ``par_reduce`` + - ``parallel_reduce`` + * - ``par_scan`` + - ``parallel_scan`` + +Parallel launches are passed a string label, a set of inclusive loop bounds, a functor, and any extra arguments needed +for parallel reductions/scans. Optionally a loop pattern tag and an execution space may be provided. +When omitted, the ``DEFAULT_LOOP_PATTERN`` is used. + +.. code:: cpp + + parthenon::par_for( + loop_pattern_tag, exec_space, PARTHENON_AUTO_LABEL, ks, ke, js, je, is, ie, + KOKKOS_LAMBDA(const int k, const int j, const int i) { + data(k, j, i) += 1.; + }); + +.. list-table:: parallel launch parameters + :widths: 25 75 + :header-rows: 1 + + * - Parameter + - + * - loop_pattern_tag + - Determines the execution policy. See table below. + * - exec_space + - Kokkos execution space + * - loop bounds + - inclusive start/end pairs for the multidimensional loop. Supported types are ``integral`` and ``parthenon::IndexRange``. + Can be extended to accept other types (see below). + * - functor + - Defines the body of the parallel loop. + See `Kokkos programming guide `_ + for more. + +.. list-table:: Loop Pattern tags + :widths: 40 60 + :header-rows: 1 + + * - Tag + - Execution Policy + * - ``loop_pattern_flatrange_tag`` + - Flattens all of the loops into a single ``Kokkos::RangePolicy`` + * - ``loop_pattern_simdfor_tag`` + - Maps to two C-style loops. The innermost gets decorated with a ``#pragma omp simd`` and the remaining + loops are flattened into a single C-style for loop. Only supported on CPU. + * - ``loop_pattern_mdrange_tag`` + - Maps all the loop bounds onto a ``Kokkos::MDRangePolicy`` + * - ``LoopPatternTeamThreadVec<Nt, Nv>()`` + - Maps onto a hierarchical parallel loop. 
The ``Nv`` inner loops are flattened onto a ``VectorRange`` policy, + the next ``Nt`` onto a ``ThreadRange`` policy, and the remaining loops are + flattened into an outer ``TeamThreadRange``. The specializations ``loop_pattern_[tpttr|tptvr|tpttrtvr]_tag`` correspond + to ``<1,0>``, ``<0,1>``, ``<1,1>`` respectively. + + +CMake Options +------------- + +``PAR_LOOP_LAYOUT`` controls the ``DEFAULT_LOOP_PATTERN`` macro. + +.. list-table:: ``PAR_LOOP_LAYOUT`` options. + :widths: 25 25 + :header-rows: 1 + + * - ``PAR_LOOP_LAYOUT`` + - Pattern Tag + * - "MANUAL1D_LOOP" + - loop_pattern_flatrange_tag + * - "SIMDFOR_LOOP" + - loop_pattern_simdfor_tag + * - "MDRANGE_LOOP" + - loop_pattern_mdrange_tag + * - "TP_TTR_LOOP" + - loop_pattern_tpttr_tag + * - "TP_TVR_LOOP" + - loop_pattern_tptvr_tag + * - "TPTTRTVR_LOOP" + - loop_pattern_tpttrtvr_tag + +Adding New Loop Patterns +------------------------ + +All of the ``par_for*`` overloads get processed into the ``par_dispatch_impl`` struct that +determines the types of the loop pattern, functor, functor arguments, loop bounds, and any +extra arguments needed for scans/reductions. The struct implements overloads of the +``par_dispatch_impl::dispatch_impl`` method that are tagged using the ``PatternTag`` ``enum`` +to specialize the ``LoopPatternTag`` struct. New loop patterns need to extend this enum and +provide an additional overload. + +There is a chance that the requested loop pattern is incompatible with the parallel wrapper it is passed to, for +example a ``loop_pattern_simdfor_tag`` ``DEFAULT_LOOP_PATTERN`` being used in a ``par_reduce``, +resulting in a conflict. For this reason the ``DispatchType`` type trait provides the +``DispatchType::GetPatternTag()`` method that processes the requested loop pattern and returns +a ``PatternTag`` and provides sensible fallbacks for the loop pattern if there are any conflicts. +In this way ``DEFAULT_LOOP_PATTERN`` can be reliably used. 
+ +Adding New Loop Bound Types +--------------------------- + +All of the loop bounds provided to any parallel wrapper get processed by the ``LoopBoundTranslator`` +to determine the rank of the multidimensional loop and translate the start/end pairs into an array +of ``IndexRange`` objects. Each bound type gets processed individually and allows the flexibility to mix +loop bound types as long as they are supported. + +New types can be provided by specializing the ``ProcessLoopBound`` struct in the ``parthenon`` namespace. +These structs need to provide a ``GetNumBounds`` method to count the number of start/end bounds contained +in the type, as well as a ``GetIndexRanges`` method to fill the ``IndexRange`` bounds used in the +parallel dispatch. + From a450c7c22eb947b3ce42236a13d60315db5d275e Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sat, 7 Sep 2024 16:24:13 +0200 Subject: [PATCH 84/99] count loop bounds more sensibly --- src/loop_bounds.hpp | 7 +++---- tst/unit/kokkos_abstraction.cpp | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/loop_bounds.hpp b/src/loop_bounds.hpp index 54929bdc52f6..c2d500f0541c 100644 --- a/src/loop_bounds.hpp +++ b/src/loop_bounds.hpp @@ -78,7 +78,7 @@ struct ProcessLoopBound>> std::is_integral_v>, "Integer bounds must come in pairs"); - return 2 + LoopBounds::GetNumBounds(TypeList()); + return 1 + LoopBounds::GetNumBounds(TypeList()); } template @@ -101,7 +101,7 @@ struct ProcessLoopBound : std::true_type { static_assert(std::is_same_v, IndexRange>, "expected IndexRange loop bound"); - return 2 + LoopBounds::GetNumBounds(TypeList()); + return 1 + LoopBounds::GetNumBounds(TypeList()); } template @@ -120,8 +120,7 @@ struct LoopBoundTranslator { public: // make sure all the Bound_ts... 
types are valid loop bounds and count the number of // bounds contained in each type - static constexpr std::size_t Rank = - LoopBounds::GetNumBounds(TypeList()) / 2; + static constexpr std::size_t Rank = LoopBounds::GetNumBounds(TypeList()); // process all of the loop bounds into an array of IndexRanges KOKKOS_INLINE_FUNCTION diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 8eef2de6cfc9..4677e54b606f 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -718,7 +718,7 @@ struct ProcessLoopBound : std::true_type { template static constexpr std::size_t GetNumBounds(TypeList) { static_assert(std::is_same_v, IndexRange3D>); - return 6 + LoopBounds::GetNumBounds(TypeList()); + return 3 + LoopBounds::GetNumBounds(TypeList()); } template From 85568d68a95f0d724a44280a598322faf5ae52be Mon Sep 17 00:00:00 2001 From: adam reyes Date: Sun, 8 Sep 2024 16:57:28 +0200 Subject: [PATCH 85/99] Revert "workaround for rtd theme table box line breaks" This reverts commit 4fa3932f846711fcd7961265968e011bcef762cd. --- doc/sphinx/_static/css/custom.css | 5 ----- doc/sphinx/conf.py | 1 - 2 files changed, 6 deletions(-) delete mode 100644 doc/sphinx/_static/css/custom.css diff --git a/doc/sphinx/_static/css/custom.css b/doc/sphinx/_static/css/custom.css deleted file mode 100644 index 9d1d7fe8d309..000000000000 --- a/doc/sphinx/_static/css/custom.css +++ /dev/null @@ -1,5 +0,0 @@ - /* The rtd theme doesn't break lines on long text in table boxes. This is a workaround taken from */ - /* https://github.com/Korne127/docs.appimage.org/commit/6040a810ff32d2b22c21115caa3f9396de0eda70 */ -.wy-table-responsive table td, .wy-table-responsive table th { - white-space: normal; -} diff --git a/doc/sphinx/conf.py b/doc/sphinx/conf.py index aa7edd2b2152..dc548e4f3378 100644 --- a/doc/sphinx/conf.py +++ b/doc/sphinx/conf.py @@ -49,7 +49,6 @@ # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] -html_css_files = ["css/custom.css"] # configuration for sphinx_multiversion smv_remote_whitelist = r"^(origin)$" From 052e39fed13b6b1ef6c43c03d82365374884d7e2 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 9 Sep 2024 09:17:11 +0200 Subject: [PATCH 86/99] ParArray moved to kokkos_types.hpp --- src/kokkos_types.hpp | 101 +++++++++++++++++++++ tst/unit/kokkos_abstraction.cpp | 151 +++----------------------------- 2 files changed, 111 insertions(+), 141 deletions(-) diff --git a/src/kokkos_types.hpp b/src/kokkos_types.hpp index 2fbd285a0861..495ace2d75ad 100644 --- a/src/kokkos_types.hpp +++ b/src/kokkos_types.hpp @@ -20,6 +20,8 @@ #ifndef KOKKOS_TYPES_HPP_ #define KOKKOS_TYPES_HPP_ +#include + #include #include "parthenon_array_generic.hpp" @@ -131,6 +133,105 @@ using device_view_t = Kokkos::View, Layout, DevMemSpace>; template using host_view_t = typename device_view_t::HostMirror; + +template +struct ParArrayND_impl { + static_assert(ND::value <= 8, "ParArray only supported up to ND=8"); +}; + +template +struct ScratchPadND_impl { + static_assert(ND::value <= 6, "ScratchPad only supported up to ND=6"); +}; + +template +using ParArray = typename ParArrayND_impl, + State>::template type; + +template +using HostArray = typename ParArrayND_impl< + std::integral_constant>::template type::HostMirror; + +template +using ScratchPad = + typename ScratchPadND_impl>::template type; + +template +struct ParArrayND_impl, State> { + template + using type = parthenon::ParArray0D; +}; +template +struct ParArrayND_impl, State> { + template + using type = parthenon::ParArray1D; +}; +template +struct ParArrayND_impl, State> { + template + using type = parthenon::ParArray2D; +}; +template +struct ParArrayND_impl, State> { + template + using type = parthenon::ParArray3D; +}; +template +struct ParArrayND_impl, State> { + template + 
using type = parthenon::ParArray4D; +}; +template +struct ParArrayND_impl, State> { + template + using type = parthenon::ParArray5D; +}; +template +struct ParArrayND_impl, State> { + template + using type = parthenon::ParArray6D; +}; +template +struct ParArrayND_impl, State> { + template + using type = parthenon::ParArray7D; +}; +template +struct ParArrayND_impl, State> { + template + using type = parthenon::ParArray8D; +}; + +template <> +struct ScratchPadND_impl> { + template + using type = parthenon::ScratchPad1D; +}; +template <> +struct ScratchPadND_impl> { + template + using type = parthenon::ScratchPad2D; +}; +template <> +struct ScratchPadND_impl> { + template + using type = parthenon::ScratchPad3D; +}; +template <> +struct ScratchPadND_impl> { + template + using type = parthenon::ScratchPad4D; +}; +template <> +struct ScratchPadND_impl> { + template + using type = parthenon::ScratchPad5D; +}; +template <> +struct ScratchPadND_impl> { + template + using type = parthenon::ScratchPad6D; +}; } // namespace parthenon #endif // KOKKOS_TYPES_HPP_ diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index 4677e54b606f..e1a27a3a2305 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -39,135 +39,11 @@ #include "utils/type_list.hpp" using parthenon::DevExecSpace; -using parthenon::ParArray1D; -using parthenon::ParArray2D; -using parthenon::ParArray3D; -using parthenon::ParArray4D; using Real = double; template using Sequence = std::make_index_sequence; -template -struct ParArrayND_impl {}; -template <> -struct ParArrayND_impl<0> { - template - using type = parthenon::ParArray0D; -}; -template <> -struct ParArrayND_impl<1> { - template - using type = parthenon::ParArray1D; -}; -template <> -struct ParArrayND_impl<2> { - template - using type = parthenon::ParArray2D; -}; -template <> -struct ParArrayND_impl<3> { - template - using type = parthenon::ParArray3D; -}; -template <> -struct ParArrayND_impl<4> { - 
template - using type = parthenon::ParArray4D; -}; -template <> -struct ParArrayND_impl<5> { - template - using type = parthenon::ParArray5D; -}; -template <> -struct ParArrayND_impl<6> { - template - using type = parthenon::ParArray6D; -}; -template <> -struct ParArrayND_impl<7> { - template - using type = parthenon::ParArray7D; -}; -template <> -struct ParArrayND_impl<8> { - template - using type = parthenon::ParArray8D; -}; -template -struct HostArrayND_impl {}; -template <> -struct HostArrayND_impl<0> { - template - using type = parthenon::HostArray0D; -}; -template <> -struct HostArrayND_impl<1> { - template - using type = parthenon::HostArray1D; -}; -template <> -struct HostArrayND_impl<2> { - template - using type = parthenon::HostArray2D; -}; -template <> -struct HostArrayND_impl<3> { - template - using type = parthenon::HostArray3D; -}; -template <> -struct HostArrayND_impl<4> { - template - using type = parthenon::HostArray4D; -}; -template <> -struct HostArrayND_impl<5> { - template - using type = parthenon::HostArray5D; -}; -template <> -struct HostArrayND_impl<6> { - template - using type = parthenon::HostArray6D; -}; -template <> -struct HostArrayND_impl<7> { - template - using type = parthenon::HostArray7D; -}; - -template -struct ScratchPadND_impl {}; -template <> -struct ScratchPadND_impl<1> { - template - using type = parthenon::ScratchPad1D; -}; -template <> -struct ScratchPadND_impl<2> { - template - using type = parthenon::ScratchPad2D; -}; - -template -auto ParArrayND(Args &&...args) { - static_assert(ND <= 8, "ParArrayND supoorted up to ND=8"); - return typename ParArrayND_impl::template type(std::forward(args)...); -} -template -auto HostArrayND(Args &&...args) { - static_assert(ND <= 7, "HostArrayND supoorted up to ND=7"); - return typename HostArrayND_impl::template type(std::forward(args)...); -} - -template -KOKKOS_INLINE_FUNCTION auto ScratchPadND(Args &&...args) { - static_assert(ND <= 2, "ScratchPadND supported up to ND=2"); - return 
typename ScratchPadND_impl::template type(std::forward(args)...); -} - template struct SequenceOfInt {}; @@ -187,20 +63,14 @@ template using sequence_of_int_v = typename SequenceOfInt>::value; -template -auto GetArray_impl(Args... Ns) { - static_assert(sizeof...(Args) == Rank); - return ParArrayND("device", Ns...); -} - enum class lbounds { integer, indexrange }; template struct test_wrapper_nd_impl { int int_bounds[2 * Rank]; parthenon::IndexRange bounds[Rank]; - decltype(ParArrayND()) arr_dev; - decltype(HostArrayND()) arr_host_orig, arr_host_mod; + parthenon::ParArray arr_dev; + parthenon::HostArray arr_host_orig, arr_host_mod; test_wrapper_nd_impl() { arr_dev = GetArray(sequence_of_int_v()); @@ -215,7 +85,7 @@ struct test_wrapper_nd_impl { template auto GetArray(std::index_sequence) { static_assert(sizeof...(Is) == Rank); - return GetArray_impl(N * Is...); + return parthenon::ParArray("device", N * Is...); } template @@ -352,8 +222,8 @@ TEST_CASE("par_for loops", "[wrapper]") { template struct test_wrapper_nested_nd_impl { Kokkos::Array bounds; - decltype(ParArrayND()) dev_u, dev_du; - decltype(HostArrayND()) host_u, host_du; + parthenon::ParArray dev_u, dev_du; + parthenon::HostArray host_u, host_du; test_wrapper_nested_nd_impl() { dev_u = GetArray(sequence_of_int_v()); @@ -365,7 +235,7 @@ struct test_wrapper_nested_nd_impl { template auto GetArray(std::index_sequence, Args... 
Ns) { - return GetArray_impl(Is * N..., Ns...); + return parthenon::ParArray("device", Is * N..., Ns...); } template @@ -407,8 +277,7 @@ struct test_wrapper_nested_nd_impl { } const int scratch_level = 0; std::size_t scratch_size_in_bytes = - ScratchPadND_impl::template type::shmem_size( - shape[InnerIs]..., N); + parthenon::ScratchPad::shmem_size(shape[InnerIs]..., N); parthenon::par_for_outer( OuterPattern(), "unit test ND nested", exec_space, scratch_size_in_bytes, @@ -441,8 +310,8 @@ struct test_wrapper_nested_nd_impl { KOKKOS_INLINE_FUNCTION auto GetScratchPad(std::index_sequence, team_mbr_t team_member, const int &scratch_level) const { - return ScratchPadND(team_member.team_scratch(scratch_level), - N * Is...); + return parthenon::ScratchPad( + team_member.team_scratch(scratch_level), N * Is...); } }; @@ -611,7 +480,7 @@ struct test_wrapper_reduce_nd_impl { template auto GetArray(std::index_sequence) { static_assert(sizeof...(Is) == Rank); - return ParArrayND("device", N * Is...); + return parthenon::ParArray("device", N * Is...); } template From ae7a3b96c3bfde020d6a473a0eb11755e6cf4a08 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 9 Sep 2024 15:17:57 +0200 Subject: [PATCH 87/99] move is_functor -> concepts_lite.hpp --- src/loop_bounds.hpp | 6 ------ src/utils/concepts_lite.hpp | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/loop_bounds.hpp b/src/loop_bounds.hpp index c2d500f0541c..aca3b1c9e5fc 100644 --- a/src/loop_bounds.hpp +++ b/src/loop_bounds.hpp @@ -135,12 +135,6 @@ template struct LoopBoundTranslator> : public LoopBoundTranslator {}; -template -struct is_functor : std::false_type {}; - -template -struct is_functor> : std::true_type {}; - template constexpr int FirstFuncIdx() { if constexpr (idx == TL::n_types) { diff --git a/src/utils/concepts_lite.hpp b/src/utils/concepts_lite.hpp index b3b49d756198..ba055e8603d3 100644 --- a/src/utils/concepts_lite.hpp +++ b/src/utils/concepts_lite.hpp @@ -49,6 +49,12 @@ struct 
is_specialization_of, TEMPL> : public std::true_type {} template using void_t = void; +template +struct is_functor : std::false_type {}; + +template +struct is_functor> : std::true_type {}; + // implements is a template struct for checking if type T implements a particular // concept, which here simply means that it conforms to some interface. // (I think people call this concepts lite, since there are more From 562b3968559f4868f455e0609a02a532cd417fb8 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 9 Sep 2024 17:23:49 +0200 Subject: [PATCH 88/99] fix a typo --- doc/sphinx/src/par_for.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx/src/par_for.rst b/doc/sphinx/src/par_for.rst index 7b2ba89be2b2..6d2951cbb1ff 100644 --- a/doc/sphinx/src/par_for.rst +++ b/doc/sphinx/src/par_for.rst @@ -122,7 +122,7 @@ Adding New Loop Bound Types All of the loop bounds provided to any parallel wrapper gets processed by the ``LoopBoundTranslator`` to determine the rank of the multidimensional loop and translate the start/end pairs into an array -of ``IndexRange``s. Each bound type gets processed individually and allows the flexibility to mix +of ``IndexRange`` s. Each bound type gets processed individually and allows the flexibility to mix loop bound types as long as they are supported. New types can be provided by specializing the ``ProcessLoopBound`` struct in the ``parthenon`` namespace. From 034efeddc7a864ffa8e629c1a53bc34ab510ee2b Mon Sep 17 00:00:00 2001 From: adam reyes Date: Mon, 9 Sep 2024 17:49:10 +0200 Subject: [PATCH 89/99] update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62b1ea9c5c78..3cdc860a2ac9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ## Current develop +### Added (new features/APIs/variables/...) 
+- [[PR 1142]](https://github.com/parthenon-hpc-lab/parthenon/pull/1142) Unify par_dispatch, par_for_outer & par_for_inner overloads + ### Changed (changing behavior/API/variables/...) From ccf3e2d26bad23cb0754c8226a0f529d0d3f3fba Mon Sep 17 00:00:00 2001 From: acreyes Date: Tue, 10 Sep 2024 05:52:29 -0400 Subject: [PATCH 90/99] fix scratch pad initialization for gcc 9.4 --- tst/unit/kokkos_abstraction.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tst/unit/kokkos_abstraction.cpp b/tst/unit/kokkos_abstraction.cpp index e1a27a3a2305..5c7630a28737 100644 --- a/tst/unit/kokkos_abstraction.cpp +++ b/tst/unit/kokkos_abstraction.cpp @@ -227,7 +227,7 @@ struct test_wrapper_nested_nd_impl { test_wrapper_nested_nd_impl() { dev_u = GetArray(sequence_of_int_v()); - dev_du = GetArray(sequence_of_int_v(), N - 2); + dev_du = GetArray(sequence_of_int_v()); host_u = Kokkos::create_mirror(dev_u); host_du = Kokkos::create_mirror(dev_du); init(std::make_index_sequence()); @@ -283,8 +283,7 @@ struct test_wrapper_nested_nd_impl { OuterPattern(), "unit test ND nested", exec_space, scratch_size_in_bytes, scratch_level, bounds[OuterIs]..., KOKKOS_CLASS_LAMBDA(team_mbr_t team_member, OuterArgs... 
outer_args) { - auto scratch_u = GetScratchPad(std::make_index_sequence(), - team_member, scratch_level); + auto scratch_u = GetScratchPad(team_member, scratch_level); parthenon::par_for_inner( InnerPattern(), team_member, bounds[Nouter + InnerIs]..., @@ -306,12 +305,17 @@ struct test_wrapper_nested_nd_impl { }); } - template - KOKKOS_INLINE_FUNCTION auto GetScratchPad(std::index_sequence, - team_mbr_t team_member, + KOKKOS_INLINE_FUNCTION auto GetScratchPad(team_mbr_t team_member, const int &scratch_level) const { - return parthenon::ScratchPad( - team_member.team_scratch(scratch_level), N * Is...); + parthenon::ScratchPad scrchPad; + if constexpr (Ninner == 0) { + scrchPad = + parthenon::ScratchPad<1, Real>(team_member.team_scratch(scratch_level), N); + } else if constexpr (Ninner == 1) { + scrchPad = + parthenon::ScratchPad<2, Real>(team_member.team_scratch(scratch_level), N, N); + } + return scrchPad; } }; From 11616d84707336875ab5072226cebc749eadbced Mon Sep 17 00:00:00 2001 From: ADAM REYES Date: Tue, 12 Nov 2024 19:27:10 -0500 Subject: [PATCH 91/99] fix merged test for device --- tst/unit/test_kokkos_abstraction.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tst/unit/test_kokkos_abstraction.cpp b/tst/unit/test_kokkos_abstraction.cpp index 52d9f4adb023..dde08e25e107 100644 --- a/tst/unit/test_kokkos_abstraction.cpp +++ b/tst/unit/test_kokkos_abstraction.cpp @@ -333,7 +333,6 @@ struct test_wrapper_nested_nd_impl { // Copy array back from device to host Kokkos::deep_copy(host_du, dev_du); - Real max_rel_err = -1; const Real rel_tol = std::numeric_limits::epsilon(); auto idxer = @@ -343,7 +342,6 @@ struct test_wrapper_nested_nd_impl { for (int i = 1 + bounds[Rank - 1].s; i < bounds[Rank - 1].e - 1; i++) { const Real analytic = 2.0 * (i + Rank) * pow((1 * ... 
* (indices[Is] + 1 + Is)), 2.0); - const Real rel_tol = std::numeric_limits::epsilon(); if (!SoftEquiv(host_du(indices[Is]..., i), analytic, rel_tol)) { return false; @@ -640,7 +638,7 @@ TEST_CASE("DEFAULT loop patterns", "[default]") { parthenon::seq_for( 0, N - 1, 0, N - 1, 0, N - 1, [&](const int k, const int j, const int i) { if (arr_host_orig(k, j, i) + wrapper_par_for.increment_data(k, j, i) != - dev_view(k, j, i)) { + arr_host_mod(k, j, i)) { all_same = false; } }); From 28c3beed2fded5c2d0b0b0f8747a5b19838e556c Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 6 Dec 2024 14:06:51 -0800 Subject: [PATCH 92/99] fix for HtoD copies in par_for_outer --- src/kokkos_abstraction.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index dd3e473b0915..ca7ae863070b 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -500,13 +500,14 @@ struct par_dispatch_impl, TypeList bound_arr, Function function, Args &&...args, const int scratch_level, const std::size_t scratch_size_in_bytes) { - const auto idxer = - MakeIndexer(Kokkos::Array{bound_arr[OuterIs]...}); + const std::size_t size = ((bound_arr[OuterIs].e - bound_arr[OuterIs].s + 1) * ...); kokkos_dispatch( Tag(), name, - team_policy(exec_space, idxer.size(), Kokkos::AUTO) + team_policy(exec_space, size, Kokkos::AUTO) .set_scratch_size(scratch_level, Kokkos::PerTeam(scratch_size_in_bytes)), KOKKOS_LAMBDA(team_mbr_t team_member, ExtraFuncArgs... 
fargs) { + const auto idxer = MakeIndexer( + Kokkos::Array{bound_arr[OuterIs]...}); const auto idx_arr = idxer.GetIdxArray(team_member.league_rank()); function(team_member, idx_arr[OuterIs]..., std::forward(fargs)...); From 3dfc5cf8f3beab4862edff0347cfa5de89855360 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 6 Dec 2024 14:47:11 -0800 Subject: [PATCH 93/99] add mdrange tiling --- src/kokkos_abstraction.hpp | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index ca7ae863070b..a2f2525ece9e 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -105,26 +105,26 @@ constexpr InnerLoopPatternSimdFor inner_loop_pattern_simdfor_tag; // trait to track if pattern requests any type of hierarchial parallelism template -struct UsesHierarchialPar : std::false_type { +struct UsesHierarchicalPar : std::false_type { static constexpr std::size_t Nvector = 0; static constexpr std::size_t Nthread = 0; }; template -struct UsesHierarchialPar> +struct UsesHierarchicalPar> : std::true_type { static constexpr std::size_t Nthread = num_thread; static constexpr std::size_t Nvector = num_vector; }; template <> -struct UsesHierarchialPar : std::true_type { +struct UsesHierarchicalPar : std::true_type { static constexpr std::size_t Nvector = 0; static constexpr std::size_t Nthread = 0; }; template -struct UsesHierarchialPar> : std::true_type { +struct UsesHierarchicalPar> : std::true_type { static constexpr std::size_t Nvector = num_vector; }; @@ -191,7 +191,7 @@ struct DispatchType { using Translator = LoopBoundTranslator; static constexpr std::size_t Rank = Translator::Rank; - using HierarchialPar = UsesHierarchialPar; + using HierarchicalPar = UsesHierarchicalPar; static constexpr bool is_ParFor = std::is_same::value; @@ -223,7 +223,7 @@ struct DispatchType { return PT::md; } else if constexpr (std::is_same_v) { return PT::outer; - } else if constexpr 
(HierarchialPar::value) { + } else if constexpr (HierarchicalPar::value) { return PT::collapse; } @@ -360,7 +360,7 @@ struct par_disp_inner_impl, TypeList(bounds)...); constexpr bool isSimdFor = std::is_same_v; - constexpr std::size_t Nvector = dispatch_type::HierarchialPar::Nvector; + constexpr std::size_t Nvector = dispatch_type::HierarchicalPar::Nvector; constexpr std::size_t Nthread = Rank - Nvector; constexpr auto pattern_tag = LoopPatternTag(); @@ -409,7 +409,7 @@ struct par_dispatch_impl, TypeList(); static_assert( @@ -486,11 +486,16 @@ struct par_dispatch_impl, TypeList>(exec_space, {bound_arr[OuterIs].s...}, - {(1 + bound_arr[OuterIs].e)...}), - function, std::forward(args)...); + constexpr std::size_t Nouter = sizeof...(OuterIs); + Kokkos::Array tiling{(OuterIs, 1)...}; + tiling[Nouter - 1] = bound_arr[Nouter - 1].e + 1 - bound_arr[Nouter - 1].s; + kokkos_dispatch(Tag(), name, + Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + exec_space, {bound_arr[OuterIs].s...}, + {(1 + bound_arr[OuterIs].e)...}, tiling), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + function, std::forward(args)...); } // Flatten loop bounds into a single outer team_policy @@ -525,9 +530,9 @@ struct par_dispatch_impl, TypeList{bound_arr[OuterIs]...}); - using HierarchialPar = typename dispatch_type::HierarchialPar; - constexpr std::size_t Nvector = HierarchialPar::Nvector; - constexpr std::size_t Nthread = HierarchialPar::Nthread; + using HierarchicalPar = typename dispatch_type::HierarchicalPar; + constexpr std::size_t Nvector = HierarchicalPar::Nvector; + constexpr std::size_t Nthread = HierarchicalPar::Nthread; constexpr std::size_t Nouter = Rank - Nvector - Nthread; kokkos_dispatch( Tag(), name, From 2de2eabe88044850d25de84f07ade86e24ad7f58 Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 6 Dec 2024 14:59:05 -0800 Subject: [PATCH 94/99] docs for ParArray --- doc/sphinx/src/parthenon_arrays.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/doc/sphinx/src/parthenon_arrays.rst b/doc/sphinx/src/parthenon_arrays.rst index 07a9cd9af3c4..94ab31cb22e1 100644 --- a/doc/sphinx/src/parthenon_arrays.rst +++ b/doc/sphinx/src/parthenon_arrays.rst @@ -25,6 +25,12 @@ where ``LayoutWrapper`` is currently hardcoded to with the default execution space. If UVM is enabled, it is ``Kokkos::CudaUVMSpace``. +The above ``ParArray#D``s are also aliased with an integral template parameter + +.. code:: c++ + ParArray3D myArray("a 3d array", 6, 5, 4); + ParArray<3, Real> myArray2("same type as myArray", 6, 5, 4); + Parthenon Arbitrary-Dimensional Arrays ====================================== From 206cd311b913ba9794ce0a39e4ff566141e29c9d Mon Sep 17 00:00:00 2001 From: Adam C Reyes Date: Sat, 7 Dec 2024 00:05:49 +0100 Subject: [PATCH 95/99] Update src/kokkos_abstraction.hpp Co-authored-by: Philipp Grete --- src/kokkos_abstraction.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index a2f2525ece9e..3ad0cf15a820 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -332,7 +332,7 @@ struct dispatch_collapse { }; // builds a functor that uses inner hierarchial parrallelism used by both par_disp_inner & -// par_dipsatch for LoopPatternCollapse +// par_dispatch for LoopPatternCollapse template KOKKOS_FORCEINLINE_FUNCTION auto From 7bc731ead82eed7781eed7389e484f4750a06180 Mon Sep 17 00:00:00 2001 From: Adam C Reyes Date: Sat, 7 Dec 2024 00:29:39 +0100 Subject: [PATCH 96/99] Update doc/sphinx/src/par_for.rst Co-authored-by: Philipp Grete --- doc/sphinx/src/par_for.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx/src/par_for.rst b/doc/sphinx/src/par_for.rst index 6d2951cbb1ff..f1a13758bcb2 100644 --- a/doc/sphinx/src/par_for.rst +++ b/doc/sphinx/src/par_for.rst @@ -70,7 +70,7 @@ When ommitted the ``DEFAULT_LOOP_PATTERN`` is used. 
* - ``loop_pattern_mdrange_tag`` - Maps all the loop bounds onto a ``Kokkos::MDRangePolicy`` * - ``LoopPatternTeamThreadVec()`` - - Maps onto a hierarchial parrallel loop. The ``Nv`` inner loops are flattened onto a ``VectorRange`` policy, + - Maps onto a hierarchical parallel loop. The ``Nv`` inner loops are flattened onto a ``VectorRange`` policy, the next ``Nt`` onto a ``ThreadRange`` policy, and the remaining loops are flattened into an outer ``TeamThreadRange``. The specializations ``loop_pattern_[tpttr|tptvr|tpttrtvr]_tag`` correspond to ``<1,0>``, ``<0,1>``, ``<1,1>`` respectively. From c448372ed18986f64011f2f481346267da97b87e Mon Sep 17 00:00:00 2001 From: Adam C Reyes Date: Sat, 7 Dec 2024 00:30:30 +0100 Subject: [PATCH 97/99] Update doc/sphinx/src/par_for.rst Co-authored-by: Philipp Grete --- doc/sphinx/src/par_for.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx/src/par_for.rst b/doc/sphinx/src/par_for.rst index f1a13758bcb2..0dcbada62403 100644 --- a/doc/sphinx/src/par_for.rst +++ b/doc/sphinx/src/par_for.rst @@ -66,7 +66,7 @@ When ommitted the ``DEFAULT_LOOP_PATTERN`` is used. - Flattens all of the loops into a single ``Kokkos::RangePolicy`` * - ``loop_pattern_simdfor_tag`` - Maps to two C-style loops. The innermost gets decorated with a ``#pragma omp simd`` and the remaining - loops are flattened into a single C-style for looop. Only supported on CPU. + loops are flattened into a single C-style for loop. Only supported on CPU. 
* - ``loop_pattern_mdrange_tag`` - Maps all the loop bounds onto a ``Kokkos::MDRangePolicy`` * - ``LoopPatternTeamThreadVec()`` From 10f3bc2ba6979ff54adc2fb3515d64cae51fc2ea Mon Sep 17 00:00:00 2001 From: adam reyes Date: Fri, 6 Dec 2024 17:31:08 -0800 Subject: [PATCH 98/99] check par_reduce for hierarchical patterns --- src/kokkos_abstraction.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 3ad0cf15a820..ecb4ff1b6a74 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -195,6 +195,8 @@ struct DispatchType { static constexpr bool is_ParFor = std::is_same::value; + static constexpr bool is_ParRed = + std::is_same::value; static constexpr bool is_ParScan = std::is_same::value; @@ -219,7 +221,8 @@ struct DispatchType { // for now this is guaranteed to be par_for_inner, when par_reduce_inner is // supported need to check return PT::simd; - } else if constexpr (IsMDRange) { + } else if constexpr (IsMDRange || is_ParRed) { + // par_reduce does not currently work with either team-based patterns return PT::md; } else if constexpr (std::is_same_v) { return PT::outer; From 279b12691a1e4242510f043b1da37c6df5a8765f Mon Sep 17 00:00:00 2001 From: adam reyes Date: Thu, 12 Dec 2024 09:50:54 -0500 Subject: [PATCH 99/99] fill tiling array --- src/kokkos_abstraction.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index ecb4ff1b6a74..702f442123f7 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -490,7 +490,9 @@ struct par_dispatch_impl, TypeList tiling{(OuterIs, 1)...}; + Kokkos::Array tiling; + for (int i = 0; i < Nouter - 1; i++) + tiling[i] = 1; tiling[Nouter - 1] = bound_arr[Nouter - 1].e + 1 - bound_arr[Nouter - 1].s; kokkos_dispatch(Tag(), name, Kokkos::Experimental::require(