Skip to content

Commit

Permalink
[SYCL][ESIMD][EMU] Add implementation of atomic operations in ESIMD emulator (#6661)
Browse files Browse the repository at this point in the history


Co-authored-by: kbobrovs <Konstantin.S.Bobrovsky@intel.com>
  • Loading branch information
dongkyunahn-intel and kbobrovs authored Oct 14, 2022
1 parent dd70c33 commit a6a0dea
Show file tree
Hide file tree
Showing 4 changed files with 288 additions and 209 deletions.
135 changes: 107 additions & 28 deletions sycl/include/sycl/ext/intel/esimd/detail/atomic_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,126 +16,205 @@ namespace __ESIMD_DNS {
// This function implements atomic update of pre-existing variable in the
// absense of C++ 20's atomic_ref.

template <typename Ty> Ty atomic_load(Ty *ptr) {
// __atomic_* functions support only integral types. In order to
// support floating types for certain operations like min/max,
// 'cmpxchg' operation is applied for result values using
// 'bridging' variables in integral type.
template <typename Ty> using CmpxchgTy = __ESIMD_DNS::uint_type_t<sizeof(Ty)>;

template <typename Ty> inline Ty atomic_load(Ty *ptr) {
#ifdef _WIN32
// TODO: Windows will be supported soon
__ESIMD_UNSUPPORTED_ON_HOST;
#else
return __atomic_load(ptr, __ATOMIC_SEQ_CST);
__ESIMD_UNSUPPORTED_ON_HOST;
// TODO : Enable with unit test
/* return sycl::bit_cast<Ty>(__atomic_load_n((CmpxchgTy<Ty> *)ptr,
__ATOMIC_SEQ_CST)); */
#endif
}

template <typename Ty> Ty atomic_store(Ty *ptr, Ty val) {
template <typename Ty> inline Ty atomic_store(Ty *ptr, Ty val) {
#ifdef _WIN32
// TODO: Windows will be supported soon
__ESIMD_UNSUPPORTED_ON_HOST;
#else
__atomic_store(ptr, val, __ATOMIC_SEQ_CST);
Ty ret = atomic_load<Ty>((CmpxchgTy<Ty> *)ptr);
__atomic_store_n((CmpxchgTy<Ty> *)ptr, val, __ATOMIC_SEQ_CST);
return ret;
#endif
}

template <typename Ty> Ty atomic_add_fetch(Ty *ptr, Ty val) {
template <typename Ty> inline Ty atomic_add(Ty *ptr, Ty val) {
#ifdef _WIN32
// TODO: Windows will be supported soon
__ESIMD_UNSUPPORTED_ON_HOST;
#else
return __atomic_add_fetch(ptr, val, __ATOMIC_SEQ_CST);
if constexpr (std::is_integral_v<Ty>) {
return __atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST);
} else {
// For Floating type
Ty _old, _new;
CmpxchgTy<Ty> _old_bits, _new_bits;
do {
_old = *ptr;
_new = _old + val;
_old_bits = *(CmpxchgTy<Ty> *)&_old;
_new_bits = *(CmpxchgTy<Ty> *)&_new;
} while (!__atomic_compare_exchange_n((CmpxchgTy<Ty> *)ptr, &_old_bits,
_new_bits, false, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST));
return _old;
}
#endif
}

template <typename Ty> Ty atomic_sub_fetch(Ty *ptr, Ty val) {
template <typename Ty> inline Ty atomic_sub(Ty *ptr, Ty val) {
#ifdef _WIN32
// TODO: Windows will be supported soon
__ESIMD_UNSUPPORTED_ON_HOST;
#else
return __atomic_sub_fetch(ptr, val, __ATOMIC_SEQ_CST);
if constexpr (std::is_integral_v<Ty>) {
return __atomic_fetch_sub(ptr, val, __ATOMIC_SEQ_CST);
} else {
// For Floating type
Ty _old, _new;
CmpxchgTy<Ty> _old_bits, _new_bits;
do {
_old = *ptr;
_new = _old - val;
_old_bits = *(CmpxchgTy<Ty> *)&_old;
_new_bits = *(CmpxchgTy<Ty> *)&_new;
} while (!__atomic_compare_exchange_n((CmpxchgTy<Ty> *)ptr, &_old_bits,
_new_bits, false, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST));
return _old;
}
#endif
}

// Atomically performs '*ptr &= val' and returns the pre-update value.
// Bitwise updates are only meaningful for integral types.
template <typename Ty> inline Ty atomic_and(Ty *ptr, Ty val) {
#ifdef _WIN32
  // TODO: Windows will be supported soon
  __ESIMD_UNSUPPORTED_ON_HOST;
#else
  static_assert(std::is_integral<Ty>::value);
  return __atomic_fetch_and(ptr, val, __ATOMIC_SEQ_CST);
#endif
}

// Atomically performs '*ptr |= val' and returns the pre-update value.
// Bitwise updates are only meaningful for integral types.
template <typename Ty> inline Ty atomic_or(Ty *ptr, Ty val) {
#ifdef _WIN32
  // TODO: Windows will be supported soon
  __ESIMD_UNSUPPORTED_ON_HOST;
#else
  static_assert(std::is_integral<Ty>::value);
  return __atomic_fetch_or(ptr, val, __ATOMIC_SEQ_CST);
#endif
}

// Atomically performs '*ptr ^= val' and returns the pre-update value.
// Bitwise updates are only meaningful for integral types.
template <typename Ty> inline Ty atomic_xor(Ty *ptr, Ty val) {
#ifdef _WIN32
  // TODO: Windows will be supported soon
  __ESIMD_UNSUPPORTED_ON_HOST;
#else
  static_assert(std::is_integral<Ty>::value);
  return __atomic_fetch_xor(ptr, val, __ATOMIC_SEQ_CST);
#endif
}

template <typename Ty> Ty atomic_min(Ty *ptr, Ty val) {
template <typename Ty> inline Ty atomic_min(Ty *ptr, Ty val) {
#ifdef _WIN32
// TODO: Windows will be supported soon
__ESIMD_UNSUPPORTED_ON_HOST;
#else
// TODO FIXME: fix implementation for FP types.
if constexpr (std::is_integral_v<Ty>) {
Ty _old, _new;
do {
_old = *ptr;
_new = std::min<Ty>(_old, val);
} while (!__atomic_compare_exchange_n(ptr, &_old, _new, false,
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
return _new;
return _old;
} else {
__ESIMD_UNSUPPORTED_ON_HOST;
Ty _old, _new;
CmpxchgTy<Ty> _old_bits, _new_bits;
do {
_old = *ptr;
_new = std::min(_old, val);
_old_bits = *(CmpxchgTy<Ty> *)&_old;
_new_bits = *(CmpxchgTy<Ty> *)&_new;
} while (!__atomic_compare_exchange_n((CmpxchgTy<Ty> *)ptr, &_old_bits,
_new_bits, false, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST));
return _old;
}
#endif
}

template <typename Ty> Ty atomic_max(Ty *ptr, Ty val) {
template <typename Ty> inline Ty atomic_max(Ty *ptr, Ty val) {
#ifdef _WIN32
// TODO: Windows will be supported soon
__ESIMD_UNSUPPORTED_ON_HOST;
#else
// TODO FIXME: fix implementation for FP types.
if constexpr (std::is_integral_v<Ty>) {
Ty _old, _new;
do {
_old = *ptr;
_new = std::max<Ty>(_old, val);
} while (!__atomic_compare_exchange_n(ptr, &_old, _new, false,
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
return _new;
return _old;
} else {
__ESIMD_UNSUPPORTED_ON_HOST;
Ty _old, _new;
CmpxchgTy<Ty> _old_bits, _new_bits;
do {
_old = *ptr;
_new = std::max(_old, val);
_old_bits = *(CmpxchgTy<Ty> *)&_old;
_new_bits = *(CmpxchgTy<Ty> *)&_new;
} while (!__atomic_compare_exchange_n((CmpxchgTy<Ty> *)(CmpxchgTy<Ty> *)ptr,
&_old_bits, _new_bits, false,
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
return _old;
}
#endif
}

template <typename Ty> Ty atomic_cmpxchg(Ty *ptr, Ty expected, Ty desired) {
template <typename Ty>
inline Ty atomic_cmpxchg(Ty *ptr, Ty expected, Ty desired) {
#ifdef _WIN32
// TODO: Windows will be supported soon
__ESIMD_UNSUPPORTED_ON_HOST;
#else
// TODO FIXME: fix implementation for FP types.
if constexpr (std::is_integral_v<Ty>) {
Ty _old = expected;
__atomic_compare_exchange_n(ptr, &_old, desired, false, __ATOMIC_SEQ_CST,
Ty local = expected;
__atomic_compare_exchange_n(ptr, &local, desired, false, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST);
return *ptr;
// if exchange occured, this means 'local=expected=*ptr'. So local
// is returned as old val
// if exchange did not occur, *ptr value compared against 'local'
// is stored in 'local'. So local is returned as old val
return local;
} else {
__ESIMD_UNSUPPORTED_ON_HOST;
CmpxchgTy<Ty> desired_bits = *(CmpxchgTy<Ty> *)&desired;
CmpxchgTy<Ty> local_bits = *(CmpxchgTy<Ty> *)&expected;
__atomic_compare_exchange_n((CmpxchgTy<Ty> *)ptr, &local_bits, desired_bits,
false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
return *((Ty *)&local_bits);
}
#endif
}

// Issues a full sequentially-consistent memory fence on the host.
inline void atomic_fence() {
#ifdef _WIN32
  // TODO: Windows will be supported soon
  __ESIMD_UNSUPPORTED_ON_HOST;
#else
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
}

} // namespace __ESIMD_DNS

/// @endcond ESIMD_DETAIL
91 changes: 75 additions & 16 deletions sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,25 @@ __esimd_svm_atomic0(__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
;
#else
{
__ESIMD_UNSUPPORTED_ON_HOST;
__ESIMD_DNS::vector_type_t<Ty, N> Oldval = 0;

for (int AddrIdx = 0; AddrIdx < N; AddrIdx += 1) {
if (pred[AddrIdx] == 0) {
// Skip Oldval vector elements corresponding to
// predicates whose value is zero
continue;
}
if constexpr (Op == __ESIMD_NS::atomic_op::load) {
Oldval[AddrIdx] = __ESIMD_DNS::atomic_load<Ty>((Ty *)addrs[AddrIdx]);
} else if constexpr (Op == __ESIMD_NS::atomic_op::inc) {
Oldval[AddrIdx] =
__ESIMD_DNS::atomic_add<Ty>((Ty *)addrs[AddrIdx], static_cast<Ty>(1));
} else if constexpr (Op == __ESIMD_NS::atomic_op::dec) {
Oldval[AddrIdx] =
__ESIMD_DNS::atomic_sub<Ty>((Ty *)addrs[AddrIdx], static_cast<Ty>(1));
}
}
return Oldval;
}
#endif // __SYCL_DEVICE_ONLY__

Expand All @@ -498,23 +516,49 @@ __esimd_svm_atomic1(__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
;
#else
{
__ESIMD_DNS::vector_type_t<Ty, N> retv;
__ESIMD_DNS::vector_type_t<Ty, N> Oldval;

for (int i = 0; i < N; i++) {
if (pred[i]) {
Ty *p = reinterpret_cast<Ty *>(addrs[i]);
for (int AddrIdx = 0; AddrIdx < N; AddrIdx++) {
if (pred[AddrIdx] == 0) {
// Skip Output vector elements corresponding to
// predicates whose value is zero
continue;
}

switch (Op) {
case __ESIMD_NS::atomic_op::add:
retv[i] = __ESIMD_DNS::atomic_add_fetch<Ty>(p, src0[i]);
break;
default:
__ESIMD_UNSUPPORTED_ON_HOST;
}
if constexpr (Op == __ESIMD_NS::atomic_op::store) {
Oldval[AddrIdx] =
__ESIMD_DNS::atomic_store<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
} else if constexpr ((Op == __ESIMD_NS::atomic_op::add) ||
(Op == __ESIMD_NS::atomic_op::fadd)) {
Oldval[AddrIdx] =
__ESIMD_DNS::atomic_add<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
} else if constexpr ((Op == __ESIMD_NS::atomic_op::sub) ||
(Op == __ESIMD_NS::atomic_op::fsub)) {
Oldval[AddrIdx] =
__ESIMD_DNS::atomic_sub<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
} else if constexpr ((Op == __ESIMD_NS::atomic_op::minsint) ||
(Op == __ESIMD_NS::atomic_op::min) ||
(Op == __ESIMD_NS::atomic_op::fmin)) {
Oldval[AddrIdx] =
__ESIMD_DNS::atomic_min<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
} else if constexpr ((Op == __ESIMD_NS::atomic_op::maxsint) ||
(Op == __ESIMD_NS::atomic_op::max) ||
(Op == __ESIMD_NS::atomic_op::fmax)) {
Oldval[AddrIdx] =
__ESIMD_DNS::atomic_max<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
} else if constexpr (Op == __ESIMD_NS::atomic_op::bit_and) {
Oldval[AddrIdx] =
__ESIMD_DNS::atomic_and<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
} else if constexpr (Op == __ESIMD_NS::atomic_op::bit_or) {
Oldval[AddrIdx] =
__ESIMD_DNS::atomic_or<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
} else if constexpr (Op == __ESIMD_NS::atomic_op::bit_xor) {
Oldval[AddrIdx] =
__ESIMD_DNS::atomic_xor<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
}
}

return retv;
return Oldval;
}
#endif // __SYCL_DEVICE_ONLY__

Expand All @@ -528,7 +572,20 @@ __esimd_svm_atomic2(__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
;
#else
{
__ESIMD_UNSUPPORTED_ON_HOST;
__ESIMD_DNS::vector_type_t<Ty, N> Oldval;

for (int AddrIdx = 0; AddrIdx < N; AddrIdx++) {
if (pred[AddrIdx] == 0) {
// Skip Output vector elements corresponding to
// predicates whose value is zero
continue;
}
static_assert((Op == __ESIMD_NS::atomic_op::cmpxchg) ||
(Op == __ESIMD_NS::atomic_op::fcmpxchg));
Oldval[AddrIdx] = __ESIMD_DNS::atomic_cmpxchg((Ty *)addrs[AddrIdx],
src0[AddrIdx], src1[AddrIdx]);
}
return Oldval;
}
#endif // __SYCL_DEVICE_ONLY__

Expand Down Expand Up @@ -557,7 +614,9 @@ __ESIMD_INTRIN void __esimd_fence(uint8_t cntl)
;
#else
{
sycl::detail::getESIMDDeviceInterface()->cm_fence_ptr();
// CM_EMU's 'cm_fence' is NOP. Disabled.
// sycl::detail::getESIMDDeviceInterface()->cm_fence_ptr();
__ESIMD_DNS::atomic_fence();
}
#endif // __SYCL_DEVICE_ONLY__

Expand Down Expand Up @@ -849,7 +908,7 @@ __esimd_dword_atomic0(__ESIMD_DNS::simd_mask_storage_t<N> pred,

switch (Op) {
case __ESIMD_NS::atomic_op::inc:
retv[i] = __ESIMD_DNS::atomic_add_fetch<Ty>(p, 1);
retv[i] = __ESIMD_DNS::atomic_add<Ty>(p, 1);
break;
default:
__ESIMD_UNSUPPORTED_ON_HOST;
Expand Down
Loading

0 comments on commit a6a0dea

Please sign in to comment.