Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor Algo::Level{2,3}::Blocked::mb() #1265

Merged
merged 7 commits into from
Jan 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 74 additions & 90 deletions src/batched/KokkosBatched_Util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,60 @@ struct Mode {
};
};

#if !defined(KOKKOS_IF_HOST)

// Compile-time table of the Level3 blocked-algorithm `mb` value, keyed on a
// memory space type. It is read by Algo::Level3::Blocked::mb() in builds where
// KOKKOS_IF_HOST is not defined.
// The primary template is only declared, never defined: requesting ::value for
// a memory space without a specialization below is a compile error.
template <class>
struct algo_level3_blocked_mb_impl;
// Host memory space: mb = 4.
template <>
struct algo_level3_blocked_mb_impl<Kokkos::HostSpace> {
static constexpr int value = 4;
};
#if defined(KOKKOS_ENABLE_CUDA)
// CUDA memory space (only when the CUDA backend is enabled): mb = 2.
template <>
struct algo_level3_blocked_mb_impl<Kokkos::CudaSpace> {
static constexpr int value = 2;
};
#endif
#if defined(KOKKOS_ENABLE_HIP)
// HIP memory space (only when the HIP backend is enabled): mb = 2.
template <>
struct algo_level3_blocked_mb_impl<Kokkos::Experimental::HIPSpace> {
static constexpr int value = 2;
};
#endif
#if defined(KOKKOS_ENABLE_SYCL)
// SYCL device USM memory space (only when the SYCL backend is enabled): mb = 2.
template <>
struct algo_level3_blocked_mb_impl<Kokkos::Experimental::SYCLDeviceUSMSpace> {
static constexpr int value = 2;
};
#endif

// Compile-time table of the Level2 blocked-algorithm `mb` value, keyed on a
// memory space type. It is read by Algo::Level2::Blocked::mb() in builds where
// KOKKOS_IF_HOST is not defined.
// The primary template is only declared, never defined: requesting ::value for
// a memory space without a specialization below is a compile error.
template <class>
struct algo_level2_blocked_mb_impl;
// Host memory space: mb = 4.
template <>
struct algo_level2_blocked_mb_impl<Kokkos::HostSpace> {
static constexpr int value = 4;
};
#if defined(KOKKOS_ENABLE_CUDA)
// CUDA memory space (only when the CUDA backend is enabled): mb = 1.
template <>
struct algo_level2_blocked_mb_impl<Kokkos::CudaSpace> {
static constexpr int value = 1;
};
#endif
#if defined(KOKKOS_ENABLE_HIP)
// HIP memory space (only when the HIP backend is enabled): mb = 1.
template <>
struct algo_level2_blocked_mb_impl<Kokkos::Experimental::HIPSpace> {
static constexpr int value = 1;
};
#endif
#if defined(KOKKOS_ENABLE_SYCL)
// SYCL device USM memory space (only when the SYCL backend is enabled): mb = 1.
template <>
struct algo_level2_blocked_mb_impl<Kokkos::Experimental::SYCLDeviceUSMSpace> {
static constexpr int value = 1;
};
#endif

#endif

struct Algo {
struct Level3 {
struct Unblocked {
Expand All @@ -316,42 +370,19 @@ struct Algo {
// - team policy (smaller) or range policy (bigger)
// - space (gpu vs host)
// - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc.
#if defined(KOKKOS_ENABLE_CUDA)
template <typename ActiveMemorySpaceType>
KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if<
std::is_same<ActiveMemorySpaceType, Kokkos::CudaSpace>::value,
int>::type
mb() {
return 2;
#if defined(KOKKOS_IF_HOST)
static constexpr KOKKKOS_FUNCTION int mb() {
KOKKOS_IF_HOST((return 4;))
KOKKOS_IF_DEVICE((return 2;))
}
#endif
#if defined(KOKKOS_ENABLE_HIP)
template <typename ActiveMemorySpaceType>
KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if<
std::is_same<ActiveMemorySpaceType,
Kokkos::Experimental::HIPSpace>::value,
int>::type
mb() {
return 2;
}
#endif
#if defined(KOKKOS_ENABLE_SYCL)
template <typename ActiveMemorySpaceType>
KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if<
std::is_same<ActiveMemorySpaceType,
Kokkos::Experimental::SYCLDeviceUSMSpace>::value,
int>::type
mb() {
return 2;

#else // FIXME remove when requiring minimum version of Kokkos 3.6
// Fallback used when KOKKOS_IF_HOST is not defined: looks up the Level3 `mb`
// value registered for the active execution memory space.
static constexpr KOKKOS_FUNCTION int mb() {
  using active_memory_space = Kokkos::Impl::ActiveExecutionMemorySpace;
  return algo_level3_blocked_mb_impl<active_memory_space>::value;
}

#endif
template <typename ActiveMemorySpaceType>
KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if<
std::is_same<ActiveMemorySpaceType, Kokkos::HostSpace>::value,
int>::type
mb() {
return 4;
}
};
struct MKL {
static const char *name() { return "MKL"; }
Expand Down Expand Up @@ -389,42 +420,19 @@ struct Algo {
// - team policy (smaller) or range policy (bigger)
// - space (cuda vs host)
// - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc.
#if defined(KOKKOS_ENABLE_CUDA)
template <typename ActiveMemorySpaceType>
KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if<
std::is_same<ActiveMemorySpaceType, Kokkos::CudaSpace>::value,
int>::type
mb() {
return 1;
}
#endif
#if defined(KOKKOS_ENABLE_HIP)
template <typename ActiveMemorySpaceType>
KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if<
std::is_same<ActiveMemorySpaceType,
Kokkos::Experimental::HIPSpace>::value,
int>::type
mb() {
return 1;
#if defined(KOKKOS_IF_HOST)
static constexpr KOKKKOS_FUNCTION int mb() {
KOKKOS_IF_HOST((return 4;))
KOKKOS_IF_DEVICE((return 1;))
}
#endif
#if defined(KOKKOS_ENABLE_SYCL)
template <typename ActiveMemorySpaceType>
KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if<
std::is_same<ActiveMemorySpaceType,
Kokkos::Experimental::SYCLDeviceUSMSpace>::value,
int>::type
mb() {
return 1;

#else // FIXME remove when requiring minimum version of Kokkos 3.6
// Fallback used when KOKKOS_IF_HOST is not defined: looks up the Level2 `mb`
// value registered for the active execution memory space.
static constexpr KOKKOS_FUNCTION int mb() {
  using active_memory_space = Kokkos::Impl::ActiveExecutionMemorySpace;
  return algo_level2_blocked_mb_impl<active_memory_space>::value;
}

#endif
template <typename ActiveMemorySpaceType>
KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if<
std::is_same<ActiveMemorySpaceType, Kokkos::HostSpace>::value,
int>::type
mb() {
return 4;
}
};
struct MKL {};
struct CompactMKL {};
Expand All @@ -442,30 +450,6 @@ struct Algo {
using Gemv = Level2;
using Trsv = Level2;
using ApplyQ = Level2;

// struct Level1 {
// struct Unblocked {};
// struct Blocked {
// // TODO:: for now hardwire the blocksizes; this should reflect
// // register blocking (not about team parallelism).
// // this mb should vary according to
// // - team policy (smaller) or range policy (bigger)
// // - space (cuda vs host)
// // - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc.
// #if defined(KOKKOS_ENABLE_CUDA)
// template<typename ActiveMemorySpaceType> KOKKOS_INLINE_FUNCTION
// static constexpr typename
// std::enable_if<std::is_same<ActiveMemorySpaceType,Kokkos::CudaSpace>::value,int>
// ::type mb() { return 4; }
// #endif
// template<typename ActiveMemorySpaceType> KOKKOS_INLINE_FUNCTION
// static constexpr typename
// std::enable_if<std::is_same<ActiveMemorySpaceType,Kokkos::HostSpace>::value,int>
// ::type mb() { return 4; }
// };
// //struct MKL {};
// //struct CompactMKL {};
// };
};

struct Util {
Expand Down
3 changes: 1 addition & 2 deletions src/batched/dense/KokkosBatched_Vector_SIMD.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ class Vector<SIMD<T>, l> {

public:
KOKKOS_INLINE_FUNCTION Vector() {
// static_assert(std::is_same<Kokkos::Impl::ActiveExecutionMemorySpace,Kokkos::HostSpace>::value,
// "Vector SIMD should not be instanciated in CudaSpace");
// NOTE Not meant to be instantiated for CUDA
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
Expand Down
7 changes: 2 additions & 5 deletions src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,8 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal<Algo::Gemm::Blocked>::invoke(
// C = beta C + alpha A B
// C (m x n), A(m x k), B(k x n)

enum : int {
mbAlgo =
Algo::Gemm::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>(),
nbAlgo = Algo::Gemm::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Gemm::Blocked::mb();
constexpr int nbAlgo = Algo::Gemm::Blocked::mb();

const ScalarType one(1.0), zero(0.0);

Expand Down
7 changes: 2 additions & 5 deletions src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,8 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal<Algo::Gemm::Blocked>::invoke(
// C = beta C + alpha A B
// C (m x n), A(m x k), B(k x n)

enum : int {
mbAlgo =
Algo::Gemm::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>(),
nbAlgo = Algo::Gemm::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Gemm::Blocked::mb();
constexpr int nbAlgo = Algo::Gemm::Blocked::mb();

const ScalarType one(1.0), zero(0.0);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal<Algo::Gemv::Blocked>::invoke(
// y = beta y + alpha A x
// y (m), A(m x n), B(n)

enum : int {
mbAlgo = Algo::Gemv::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Gemv::Blocked::mb();

if (beta == zero)
SerialSetInternal ::invoke(m, zero, y, ys0);
Expand Down
4 changes: 1 addition & 3 deletions src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal<Algo::Gemv::Blocked>::invoke(
// y = beta y + alpha A x
// y (m), A(m x n), B(n)

enum : int {
mbAlgo = Algo::Gemv::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Gemv::Blocked::mb();

if (beta == zero)
TeamSetInternal ::invoke(member, m, zero, y, ys0);
Expand Down
4 changes: 1 addition & 3 deletions src/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,7 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal<Algo::LU::Blocked>::invoke(
const int m, const int n, ValueType *KOKKOS_RESTRICT A, const int as0,
const int as1,
const typename MagnitudeScalarType<ValueType>::type /*tiny*/) {
enum : int {
mbAlgo = Algo::LU::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::LU::Blocked::mb();
const typename MagnitudeScalarType<ValueType>::type one(1.0), minus_one(-1.0);

const int k = (m < n ? m : n);
Expand Down
4 changes: 1 addition & 3 deletions src/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,7 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal<Algo::LU::Blocked>::invoke(
const MemberType &member, const int m, const int n,
ValueType *KOKKOS_RESTRICT A, const int as0, const int as1,
const typename MagnitudeScalarType<ValueType>::type /*tiny*/) {
enum : int {
mbAlgo = Algo::LU::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::LU::Blocked::mb();

const int k = (m < n ? m : n);
if (k <= 0) return 0;
Expand Down
8 changes: 2 additions & 6 deletions src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,7 @@ SerialTrsmInternalLeftLower<Algo::Trsm::Blocked>::invoke(
const bool use_unit_diag, const int m, const int n, const ScalarType alpha,
const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1,
/**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) {
enum : int {
mbAlgo = Algo::Trsm::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Trsm::Blocked::mb();

const ScalarType one(1.0), zero(0.0), minus_one(-1.0);

Expand Down Expand Up @@ -201,9 +199,7 @@ SerialTrsmInternalLeftUpper<Algo::Trsm::Blocked>::invoke(
/**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) {
const ScalarType one(1.0), zero(0.0), minus_one(-1.0);

enum : int {
mbAlgo = Algo::Trsm::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Trsm::Blocked::mb();

if (alpha == zero)
SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1);
Expand Down
8 changes: 2 additions & 6 deletions src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,7 @@ TeamTrsmInternalLeftLower<Algo::Trsm::Blocked>::invoke(
const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A,
const int as0, const int as1,
/**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) {
enum : int {
mbAlgo = Algo::Trsm::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Trsm::Blocked::mb();

const ScalarType one(1.0), zero(0.0), minus_one(-1.0);

Expand Down Expand Up @@ -225,9 +223,7 @@ TeamTrsmInternalLeftUpper<Algo::Trsm::Blocked>::invoke(
const int n, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A,
const int as0, const int as1,
/**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) {
enum : int {
mbAlgo = Algo::Trsm::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Trsm::Blocked::mb();

const ScalarType one(1.0), zero(0.0), minus_one(-1.0);

Expand Down
8 changes: 2 additions & 6 deletions src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,7 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower<Algo::Trsv::Blocked>::invoke(
/**/ ValueType *KOKKOS_RESTRICT b, const int bs0) {
const ScalarType one(1.0), zero(0.0), minus_one(-1.0);

enum : int {
mbAlgo = Algo::Trsv::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Trsv::Blocked::mb();

if (alpha == zero)
SerialSetInternal::invoke(m, zero, b, bs0);
Expand Down Expand Up @@ -168,9 +166,7 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper<Algo::Trsv::Blocked>::invoke(
/**/ ValueType *KOKKOS_RESTRICT b, const int bs0) {
const ScalarType one(1.0), zero(0.0), minus_one(-1.0);

enum : int {
mbAlgo = Algo::Trsm::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Trsm::Blocked::mb();
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, was this a typo? Was it meant to be {Trsm -> Trsv}?


// note that parallel range is different ( m*n vs m-1*n);
if (alpha == zero)
Expand Down
8 changes: 2 additions & 6 deletions src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower<Algo::Trsv::Blocked>::invoke(
/**/ ValueType *KOKKOS_RESTRICT b, const int bs0) {
const ScalarType one(1.0), zero(0.0), minus_one(-1.0);

enum : int {
mbAlgo = Algo::Trsv::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Trsv::Blocked::mb();

if (alpha == zero)
TeamSetInternal::invoke(member, m, zero, b, bs0);
Expand Down Expand Up @@ -195,9 +193,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper<Algo::Trsv::Blocked>::invoke(
/**/ ValueType *KOKKOS_RESTRICT b, const int bs0) {
const ScalarType one(1.0), zero(0.0), minus_one(-1.0);

enum : int {
mbAlgo = Algo::Trsm::Blocked::mb<Kokkos::Impl::ActiveExecutionMemorySpace>()
};
constexpr int mbAlgo = Algo::Trsm::Blocked::mb();
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question about a possible typo here: Trs{m -> v}?


// note that parallel range is different ( m*n vs m-1*n);
if (alpha == zero)
Expand Down
3 changes: 0 additions & 3 deletions src/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@

namespace KokkosBatched {

//#define KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(A) typename
// std::enable_if<std::is_same<Kokkos::Impl::ActiveExecutionMemorySpace,Kokkos::HostSpace>::value,A
//>::type
// Expands to Vector<SIMD<T>, l>. Going by the name, this is the by-value
// return type for the SIMD arithmetic functions (their uses are not visible
// here -- confirm).
#define KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) Vector<SIMD<T>, l>
// Expands to an lvalue reference to the same type: Vector<SIMD<T>, l> &.
#define KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE(T, l) \
Vector<SIMD<T>, l> &
Expand Down
3 changes: 0 additions & 3 deletions src/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@

namespace KokkosBatched {

//#define KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE typename
// std::enable_if<std::is_same<Kokkos::Impl::ActiveExecutionMemorySpace,Kokkos::HostSpace>::value,Vector<SIMD<T>,l>
//>::type
// Expands to Vector<SIMD<T>, l>. Going by the name, this is the return type
// for the SIMD math functions (their uses are not visible here -- confirm).
#define KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) Vector<SIMD<T>, l>
#define KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) \
typename std::enable_if<!std::is_integral<T>::value, \
Expand Down