Added HWY_DASSERTs asserting buffer alignment to all Load and Store operations that require alignment. #1847

Open · wants to merge 1 commit into base: master

7 changes: 6 additions & 1 deletion hwy/base.h
@@ -54,7 +54,7 @@

#endif // !HWY_IDE

#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \
#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \
__cpp_impl_three_way_comparison >= 201907L && defined(__has_include) && \
!defined(HWY_DISABLE_CXX20_THREE_WAY_COMPARE)
#if __has_include(<compare>)
@@ -293,6 +293,11 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
} while (0)
#endif

#define HWY_DASSERT_ALIGNED(d, addr) \
HWY_DASSERT(reinterpret_cast<uintptr_t>(addr) % \
(Lanes(d) * sizeof(TFromD<decltype(d)>)) == \
0)

#if __cpp_constexpr >= 201304L
#define HWY_CXX14_CONSTEXPR constexpr
#else
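
For reference, the following is an illustrative sketch (not part of the patch) of how the new check is exercised from user code. ScalableTag, Lanes, AllocateAligned and Load/LoadU/Store/StoreU are existing Highway API; the function name AlignmentDemo is made up for the example, and the assertion compiles away unless HWY_IS_DEBUG_BUILD.

```cpp
#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"  // static dispatch

namespace hn = hwy::HWY_NAMESPACE;

void AlignmentDemo() {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  const auto buf = hwy::AllocateAligned<float>(2 * N);  // HWY_ALIGNMENT-aligned

  const auto v = hn::LoadU(d, buf.get() + 1);  // LoadU: never asserts
  hn::StoreU(v, d, buf.get() + 1);             // StoreU: never asserts

  // Load/Store now assert the address is a multiple of Lanes(d) * sizeof(float).
  // buf.get() satisfies this as long as the vector is no larger than
  // HWY_ALIGNMENT bytes.
  hn::Store(hn::Load(d, buf.get()), d, buf.get() + N);
}
```
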
2 changes: 2 additions & 0 deletions hwy/ops/arm_neon-inl.h
@@ -3538,6 +3538,7 @@ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
// On Arm, Load is the same as LoadU.
template <class D>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
return LoadU(d, p);
}

@@ -3742,6 +3743,7 @@ HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
// On Arm, Store is the same as StoreU.
template <class D>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
StoreU(v, d, aligned);
}

27 changes: 15 additions & 12 deletions hwy/ops/arm_sve-inl.h
@@ -18,6 +18,7 @@

#include <arm_sve.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

// Arm C215 declares that SVE vector lengths will always be a power of two.
@@ -1635,10 +1636,10 @@ HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
#if !HWY_SVE_HAVE_BFLOAT16

template <size_t N, int kPow2>
HWY_API VBF16 Load(Simd<bfloat16_t, N, kPow2> d,
const bfloat16_t* HWY_RESTRICT p) {
return BitCast(d, Load(RebindToUnsigned<decltype(d)>(),
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p)));
HWY_API VBF16 LoadU(Simd<bfloat16_t, N, kPow2> d,
const bfloat16_t* HWY_RESTRICT p) {
return BitCast(d, LoadU(RebindToUnsigned<decltype(d)>(),
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p)));
}

template <size_t N, int kPow2>
@@ -1688,10 +1689,10 @@ HWY_API VBF16 LoadDup128(D d, const bfloat16_t* HWY_RESTRICT p) {
#if !HWY_SVE_HAVE_BFLOAT16

template <size_t N, int kPow2>
HWY_API void Store(VBF16 v, Simd<bfloat16_t, N, kPow2> d,
bfloat16_t* HWY_RESTRICT p) {
HWY_API void StoreU(VBF16 v, Simd<bfloat16_t, N, kPow2> d,
bfloat16_t* HWY_RESTRICT p) {
const RebindToUnsigned<decltype(d)> du;
Store(BitCast(du, v), du, reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
@@ -1711,18 +1712,20 @@ HWY_API void BlendedStore(VBF16 v, svbool_t m, Simd<bfloat16_t, N, kPow2> d,

#endif

// ------------------------------ Load/StoreU
// ------------------------------ Load/Store

// SVE only requires lane alignment, not natural alignment of the entire
// vector.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
return Load(d, p);
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
return LoadU(d, p);
}

template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
Store(v, d, p);
HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
StoreU(v, d, p);
}

// ------------------------------ MaskedLoadOr
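
As the comment above notes, SVE (like RVV below) only requires lane alignment in hardware; with this change, Load and Store nonetheless assert alignment to the full runtime vector, i.e. Lanes(d) * sizeof(T) bytes. A standalone restatement of the condition the macro tests, using a hypothetical helper name purely for illustration:

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical restatement of HWY_DASSERT_ALIGNED's condition: the address
// must be a multiple of the whole vector's size in bytes, which on scalable
// targets is only known at runtime.
inline bool IsVectorAligned(const void* addr, size_t lanes, size_t lane_bytes) {
  return reinterpret_cast<uintptr_t>(addr) % (lanes * lane_bytes) == 0;
}

// Example: 8 float lanes -> the address must be a multiple of 32 bytes, even
// though ld1/st1 themselves only need 4-byte (lane) alignment.
```
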
18 changes: 10 additions & 8 deletions hwy/ops/emu128-inl.h
@@ -1364,9 +1364,9 @@ HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
// ------------------------------ Load

template <class D>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
VFromD<D> v;
CopyBytes<d.MaxBytes()>(aligned, v.raw); // copy from array
CopyBytes<d.MaxBytes()>(p, v.raw); // copy from array
return v;
}

@@ -1383,8 +1383,9 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
}

template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
return Load(d, p);
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
return LoadU(d, aligned);
}

// In some use cases, "load single lane" is sufficient; otherwise avoid this.
@@ -1422,13 +1423,14 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
// ------------------------------ Store

template <class D>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
CopyBytes<d.MaxBytes()>(v.raw, aligned); // copy to array
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
CopyBytes<d.MaxBytes()>(v.raw, p); // copy to array
}

template <class D>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
Store(v, d, p);
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
StoreU(v, d, aligned);
}

template <class D>
22 changes: 14 additions & 8 deletions hwy/ops/ppc_vsx-inl.h
@@ -614,7 +614,9 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
// ------------------------------ Load

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
HWY_API Vec128<T> Load(D d, const T* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
(void)d;
using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
using ResultRaw = typename detail::Raw128<T>::type;
Expand All @@ -623,7 +625,7 @@ HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {

// Any <= 64 bit
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
using BitsT = UnsignedFromSize<d.MaxBytes()>;

BitsT bits;
@@ -1072,8 +1074,9 @@ HWY_API Vec128<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) {

// For < 128 bit, Load is LoadU plus an alignment check.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
return Load(d, p);
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
return LoadU(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
Expand Down Expand Up @@ -1212,7 +1215,9 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
// ------------------------------ Store

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
HWY_API void Store(Vec128<T> v, D d, T* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
(void)d;
using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
*HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
}
@@ -1224,7 +1229,7 @@ HWY_API void StoreU(Vec128<T> v, D /* tag */, T* HWY_RESTRICT p) {
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
using BitsT = UnsignedFromSize<d.MaxBytes()>;

const Repartition<BitsT, decltype(d)> d_bits;
@@ -1234,8 +1239,9 @@ HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {

// For < 128 bit, Store is StoreU plus an alignment check.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
Store(v, d, p);
HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
StoreU(v, d, p);
}

#if HWY_PPC_HAVE_9
24 changes: 13 additions & 11 deletions hwy/ops/rvv-inl.h
@@ -1627,15 +1627,16 @@ HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)

// There is no native BF16, treat as int16_t.
template <size_t N, int kPow2>
HWY_API VFromD<Simd<int16_t, N, kPow2>> Load(Simd<bfloat16_t, N, kPow2> d,
HWY_API VFromD<Simd<int16_t, N, kPow2>> LoadU(Simd<bfloat16_t, N, kPow2> d,
const bfloat16_t* HWY_RESTRICT p) {
return Load(RebindToSigned<decltype(d)>(),
return LoadU(RebindToSigned<decltype(d)>(),
reinterpret_cast<const int16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(VFromD<Simd<int16_t, N, kPow2>> v,
Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
Store(v, RebindToSigned<decltype(d)>(),
reinterpret_cast<int16_t * HWY_RESTRICT>(p));
}
@@ -1644,26 +1645,26 @@ HWY_API void Store(VFromD<Simd<int16_t, N, kPow2>> v,

// NOTE: different type for float16_t than bfloat16_t, see Set().
template <size_t N, int kPow2>
HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(Simd<float16_t, N, kPow2> d,
HWY_API VFromD<Simd<uint16_t, N, kPow2>> LoadU(Simd<float16_t, N, kPow2> d,
const float16_t* HWY_RESTRICT p) {
return Load(RebindToUnsigned<decltype(d)>(),
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
HWY_API void StoreU(VFromD<Simd<uint16_t, N, kPow2>> v,
Simd<float16_t, N, kPow2> d, float16_t* HWY_RESTRICT p) {
Store(v, RebindToUnsigned<decltype(d)>(),
StoreU(v, RebindToUnsigned<decltype(d)>(),
reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

#endif // !HWY_HAVE_FLOAT16

// ------------------------------ Load
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
// RVV only requires element alignment, not vector alignment.
return Load(d, p);
return LoadU(d, p);
}

// ------------------------------ MaskedLoad
@@ -1858,11 +1859,12 @@ HWY_API void StoreN(VFromD<D> v, D /*d*/, T* HWY_RESTRICT p,
reinterpret_cast<TStore * HWY_RESTRICT>(p));
}

// ------------------------------ StoreU
// ------------------------------ Store
template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
// RVV only requires element alignment, not vector alignment.
Store(v, d, p);
StoreU(v, d, p);
}

// ------------------------------ Stream
17 changes: 11 additions & 6 deletions hwy/ops/scalar-inl.h
@@ -1068,7 +1068,7 @@ HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
// ------------------------------ Load

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
HWY_API Vec1<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
HWY_API Vec1<T> LoadU(D /* tag */, const T* HWY_RESTRICT aligned) {
T t;
CopySameSize(aligned, &t);
return Vec1<T>(t);
@@ -1086,8 +1086,9 @@ HWY_API Vec1<T> MaskedLoadOr(Vec1<T> v, Mask1<T> m, D d,
}

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT p) {
return Load(d, p);
HWY_API Vec1<T> Load(D d, const T* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
return LoadU(d, p);
}

// In some use cases, "load single lane" is sufficient; otherwise avoid this.
@@ -1117,13 +1118,17 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
// ------------------------------ Store

template <class D, typename T = TFromD<D>>
HWY_API void Store(const Vec1<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT aligned) {
(void)d;
CopySameSize(&v.raw, aligned);
}

template <class D, typename T = TFromD<D>>
HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT p) {
return Store(v, d, p);
HWY_API void Store(const Vec1<T> v, D d, T* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
StoreU(v, d, p);
}

template <class D, typename T = TFromD<D>>
15 changes: 10 additions & 5 deletions hwy/ops/wasm_128-inl.h
@@ -1860,13 +1860,15 @@ HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
// ------------------------------ Load

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
HWY_API Vec128<T> Load(D d, const T* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
(void)d;
return Vec128<T>{wasm_v128_load(aligned)};
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
VFromD<D> v;
CopyBytes<d.MaxBytes()>(p, &v);
return v;
@@ -1939,24 +1941,27 @@ HWY_INLINE double ExtractLane(const Vec128<double, N> v) {
} // namespace detail

template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
(void)d;
wasm_v128_store(aligned, v.raw);
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
CopyBytes<d.MaxBytes()>(&v, p);
}

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
*p = detail::ExtractLane<0>(v);
}

// Store is StoreU plus an alignment check.
template <class D>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
StoreU(v, d, p);
}
