Skip to content

Commit

Permalink
Added HWY_DASSERTs asserting buffer alignment to all Load and Store o…
Browse files Browse the repository at this point in the history
…perations that require alignment.

Fixed some bugs where Load/StoreU called Load/Store.

PiperOrigin-RevId: 578459068
  • Loading branch information
Martin Bruse authored and copybara-github committed Nov 1, 2023
1 parent 46c9056 commit b02293a
Show file tree
Hide file tree
Showing 12 changed files with 172 additions and 94 deletions.
7 changes: 6 additions & 1 deletion hwy/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@

#endif // !HWY_IDE

#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \
#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \
__cpp_impl_three_way_comparison >= 201907L && defined(__has_include) && \
!defined(HWY_DISABLE_CXX20_THREE_WAY_COMPARE)
#if __has_include(<compare>)
Expand Down Expand Up @@ -293,6 +293,11 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
} while (0)
#endif

#define HWY_DASSERT_ALIGNED(d, addr) \
HWY_DASSERT(reinterpret_cast<uintptr_t>(addr) % \
(Lanes(d) * sizeof(TFromD<decltype(d)>)) == \
0)

#if __cpp_constexpr >= 201304L
#define HWY_CXX14_CONSTEXPR constexpr
#else
Expand Down
2 changes: 2 additions & 0 deletions hwy/ops/arm_neon-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -3538,6 +3538,7 @@ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
// On Arm, Load is the same as LoadU.
template <class D>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
return LoadU(d, p);
}

Expand Down Expand Up @@ -3742,6 +3743,7 @@ HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
// On Arm, Store is the same as StoreU.
template <class D>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
StoreU(v, d, aligned);
}

Expand Down
27 changes: 15 additions & 12 deletions hwy/ops/arm_sve-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <arm_sve.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

// Arm C215 declares that SVE vector lengths will always be a power of two.
Expand Down Expand Up @@ -1635,10 +1636,10 @@ HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
#if !HWY_SVE_HAVE_BFLOAT16

template <size_t N, int kPow2>
HWY_API VBF16 Load(Simd<bfloat16_t, N, kPow2> d,
const bfloat16_t* HWY_RESTRICT p) {
return BitCast(d, Load(RebindToUnsigned<decltype(d)>(),
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p)));
HWY_API VBF16 LoadU(Simd<bfloat16_t, N, kPow2> d,
const bfloat16_t* HWY_RESTRICT p) {
return BitCast(d, LoadU(RebindToUnsigned<decltype(d)>(),
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p)));
}

template <size_t N, int kPow2>
Expand Down Expand Up @@ -1688,10 +1689,10 @@ HWY_API VBF16 LoadDup128(D d, const bfloat16_t* HWY_RESTRICT p) {
#if !HWY_SVE_HAVE_BFLOAT16

template <size_t N, int kPow2>
HWY_API void Store(VBF16 v, Simd<bfloat16_t, N, kPow2> d,
bfloat16_t* HWY_RESTRICT p) {
HWY_API void StoreU(VBF16 v, Simd<bfloat16_t, N, kPow2> d,
bfloat16_t* HWY_RESTRICT p) {
const RebindToUnsigned<decltype(d)> du;
Store(BitCast(du, v), du, reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
Expand All @@ -1711,18 +1712,20 @@ HWY_API void BlendedStore(VBF16 v, svbool_t m, Simd<bfloat16_t, N, kPow2> d,

#endif

// ------------------------------ Load/StoreU
// ------------------------------ Load/Store

// SVE only requires lane alignment, not natural alignment of the entire
// vector.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
return Load(d, p);
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
return LoadU(d, p);
}

template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
Store(v, d, p);
HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
StoreU(v, d, p);
}

// ------------------------------ MaskedLoadOr
Expand Down
18 changes: 10 additions & 8 deletions hwy/ops/emu128-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1364,9 +1364,9 @@ HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
// ------------------------------ Load

template <class D>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
VFromD<D> v;
CopyBytes<d.MaxBytes()>(aligned, v.raw); // copy from array
CopyBytes<d.MaxBytes()>(p, v.raw); // copy from array
return v;
}

Expand All @@ -1383,8 +1383,9 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
}

template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
return Load(d, p);
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
return LoadU(d, aligned);
}

// In some use cases, "load single lane" is sufficient; otherwise avoid this.
Expand Down Expand Up @@ -1422,13 +1423,14 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
// ------------------------------ Store

template <class D>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
CopyBytes<d.MaxBytes()>(v.raw, aligned); // copy to array
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
CopyBytes<d.MaxBytes()>(v.raw, p); // copy to array
}

template <class D>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
Store(v, d, p);
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
StoreU(v, d, aligned);
}

template <class D>
Expand Down
22 changes: 14 additions & 8 deletions hwy/ops/ppc_vsx-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,9 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
// ------------------------------ Load

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
HWY_API Vec128<T> Load(D d, const T* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
(void)d;
using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
using ResultRaw = typename detail::Raw128<T>::type;
Expand All @@ -623,7 +625,7 @@ HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {

// Any <= 64 bit
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
using BitsT = UnsignedFromSize<d.MaxBytes()>;

BitsT bits;
Expand Down Expand Up @@ -1072,8 +1074,9 @@ HWY_API Vec128<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) {

// For < 128 bit, LoadU == Load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
return Load(d, p);
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
return LoadU(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
Expand Down Expand Up @@ -1212,7 +1215,9 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
// ------------------------------ Store

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
HWY_API void Store(Vec128<T> v, D d, T* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
(void)d;
using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
*HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
}
Expand All @@ -1224,7 +1229,7 @@ HWY_API void StoreU(Vec128<T> v, D /* tag */, T* HWY_RESTRICT p) {
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
using BitsT = UnsignedFromSize<d.MaxBytes()>;

const Repartition<BitsT, decltype(d)> d_bits;
Expand All @@ -1234,8 +1239,9 @@ HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {

// For < 128 bit, StoreU == Store.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
Store(v, d, p);
HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
StoreU(v, d, p);
}

#if HWY_PPC_HAVE_9
Expand Down
24 changes: 13 additions & 11 deletions hwy/ops/rvv-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1627,15 +1627,16 @@ HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)

// There is no native BF16, treat as int16_t.
template <size_t N, int kPow2>
HWY_API VFromD<Simd<int16_t, N, kPow2>> Load(Simd<bfloat16_t, N, kPow2> d,
HWY_API VFromD<Simd<int16_t, N, kPow2>> LoadU(Simd<bfloat16_t, N, kPow2> d,
const bfloat16_t* HWY_RESTRICT p) {
return Load(RebindToSigned<decltype(d)>(),
return LoadU(RebindToSigned<decltype(d)>(),
reinterpret_cast<const int16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(VFromD<Simd<int16_t, N, kPow2>> v,
Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
Store(v, RebindToSigned<decltype(d)>(),
reinterpret_cast<int16_t * HWY_RESTRICT>(p));
}
Expand All @@ -1644,26 +1645,26 @@ HWY_API void Store(VFromD<Simd<int16_t, N, kPow2>> v,

// NOTE: different type for float16_t than bfloat16_t, see Set().
template <size_t N, int kPow2>
HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(Simd<float16_t, N, kPow2> d,
HWY_API VFromD<Simd<uint16_t, N, kPow2>> LoadU(Simd<float16_t, N, kPow2> d,
const float16_t* HWY_RESTRICT p) {
return Load(RebindToUnsigned<decltype(d)>(),
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
HWY_API void StoreU(VFromD<Simd<uint16_t, N, kPow2>> v,
Simd<float16_t, N, kPow2> d, float16_t* HWY_RESTRICT p) {
Store(v, RebindToUnsigned<decltype(d)>(),
StoreU(v, RebindToUnsigned<decltype(d)>(),
reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

#endif // !HWY_HAVE_FLOAT16

// ------------------------------ LoadU
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
// RVV only requires element alignment, not vector alignment.
return Load(d, p);
return LoadU(d, p);
}

// ------------------------------ MaskedLoad
Expand Down Expand Up @@ -1858,11 +1859,12 @@ HWY_API void StoreN(VFromD<D> v, D /*d*/, T* HWY_RESTRICT p,
reinterpret_cast<TStore * HWY_RESTRICT>(p));
}

// ------------------------------ StoreU
// ------------------------------ Store
template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
// RVV only requires element alignment, not vector alignment.
Store(v, d, p);
StoreU(v, d, p);
}

// ------------------------------ Stream
Expand Down
17 changes: 11 additions & 6 deletions hwy/ops/scalar-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1068,7 +1068,7 @@ HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
// ------------------------------ Load

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
HWY_API Vec1<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT aligned) {
T t;
CopySameSize(aligned, &t);
return Vec1<T>(t);
Expand All @@ -1086,8 +1086,9 @@ HWY_API Vec1<T> MaskedLoadOr(Vec1<T> v, Mask1<T> m, D d,
}

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT p) {
return Load(d, p);
HWY_API Vec1<T> Load(D d, const T* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
return LoadU(d, p);
}

// In some use cases, "load single lane" is sufficient; otherwise avoid this.
Expand Down Expand Up @@ -1117,13 +1118,17 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
// ------------------------------ Store

template <class D, typename T = TFromD<D>>
HWY_API void Store(const Vec1<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT aligned) {
(void)d;
CopySameSize(&v.raw, aligned);
}

template <class D, typename T = TFromD<D>>
HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT p) {
return Store(v, d, p);
HWY_API void Store(const Vec1<T> v, D d, T* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
(void)d;
CopySameSize(&v.raw, p);
return StoreU(v, d, p);
}

template <class D, typename T = TFromD<D>>
Expand Down
15 changes: 10 additions & 5 deletions hwy/ops/wasm_128-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1860,13 +1860,15 @@ HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
// ------------------------------ Load

template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
HWY_API Vec128<T> Load(D d, const T* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
return Vec128<T>{wasm_v128_load(aligned)};
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
VFromD<D> v;
CopyBytes<d.MaxBytes()>(p, &v);
return v;
Expand Down Expand Up @@ -1939,24 +1941,27 @@ HWY_INLINE double ExtractLane(const Vec128<double, N> v) {
} // namespace detail

template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
HWY_DASSERT_ALIGNED(d, aligned);
(void)d;
wasm_v128_store(aligned, v.raw);
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
CopyBytes<d.MaxBytes()>(&v, p);
}

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
*p = detail::ExtractLane<0>(v);
}

// StoreU == Store.
template <class D>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
HWY_DASSERT_ALIGNED(d, p);
Store(v, d, p);
}

Expand Down
Loading

0 comments on commit b02293a

Please sign in to comment.