Skip to content

Commit

Permalink
Fix overflow in add_dpbusd_epi32x2 (#268)
Browse files Browse the repository at this point in the history
* Fix overflow in add_dpbusd_epi32x2
This patch fixes 16bit overflow in *_add_dpbusd_epi32x2 functions,
that can be triggered in rare cases depending on the NNUE weights.

While the code leads to some slowdown on affected architectures
(most notably avx2), the fix is simpler than some of the other
options discussed in
#4394

Code suggested by Sopel97.

Result of "bench 4096 1 30 default depth nnue":

| Architecture        | master    | patch (gcc) | patch (clang) |
|---------------------|-----------|-------------|---------------|
| x86-64-vnni512      | 762122798 | 762122798   | 762122798     |
| x86-64-avx512       | 769723503 | 762122798   | 762122798     |
| x86-64-bmi2         | 769723503 | 762122798   | 762122798     |
| x86-64-ssse3        | 769723503 | 762122798   | 762122798     |
| x86-64              | 762122798 | 762122798   | 762122798     |

Following architectures will experience ~4% slowdown due to an
additional instruction in the middle of hot path:

* x86-64-avx512
* x86-64-bmi2
* x86-64-avx2
* x86-64-sse41-popcnt (x86-64-modern)
* x86-64-ssse3
* x86-32-sse41-popcnt
official-stockfish/Stockfish@2c36d1e

* Unify type alias declarations
The commit unifies the declaration of type aliases by replacing all
typedefs with corresponding using statements.

closing #4412

No functional change
official-stockfish/Stockfish@564456a

# Conflicts:
#	source/misc.cpp
#	source/misc.h
#	source/movepick.h
#	source/position.h
#	source/usi.h
  • Loading branch information
nodchip authored Oct 26, 2023
1 parent e55beaa commit 144b812
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 14 deletions.
18 changes: 9 additions & 9 deletions source/eval/nnue/layers/affine_transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -212,9 +212,9 @@ class AffineTransform {
#else
__m512i product0 = _mm512_maddubs_epi16(a0, b0);
__m512i product1 = _mm512_maddubs_epi16(a1, b1);
product0 = _mm512_adds_epi16(product0, product1);
product0 = _mm512_madd_epi16(product0, kOnes512);
acc = _mm512_add_epi32(acc, product0);
product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1));
product1 = _mm512_madd_epi16(product1, _mm512_set1_epi16(1));
acc = _mm512_add_epi32(acc, _mm512_add_epi32(product0, product1));
#endif
};

Expand Down Expand Up @@ -261,9 +261,9 @@ class AffineTransform {
#else
__m256i product0 = _mm256_maddubs_epi16(a0, b0);
__m256i product1 = _mm256_maddubs_epi16(a1, b1);
product0 = _mm256_adds_epi16(product0, product1);
product0 = _mm256_madd_epi16(product0, kOnes256);
acc = _mm256_add_epi32(acc, product0);
product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1));
product1 = _mm256_madd_epi16(product1, _mm256_set1_epi16(1));
acc = _mm256_add_epi32(acc, _mm256_add_epi32(product0, product1));
#endif
};

Expand Down Expand Up @@ -299,9 +299,9 @@ class AffineTransform {
__m128i b1) {
__m128i product0 = _mm_maddubs_epi16(a0, b0);
__m128i product1 = _mm_maddubs_epi16(a1, b1);
product0 = _mm_adds_epi16(product0, product1);
product0 = _mm_madd_epi16(product0, kOnes128);
acc = _mm_add_epi32(acc, product0);
product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1));
product1 = _mm_madd_epi16(product1, _mm_set1_epi16(1));
acc = _mm_add_epi32(acc, _mm_add_epi32(product0, product1));
};

#endif
Expand Down
10 changes: 5 additions & 5 deletions source/eval/nnue/nnue_feature_transformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace Eval::NNUE {
#define VECTOR

#if defined(USE_AVX512)
typedef __m512i vec_t;
using vec_t = __m512i;
#define vec_load(a) _mm512_load_si512(a)
#define vec_store(a, b) _mm512_store_si512(a, b)
#define vec_add_16(a, b) _mm512_add_epi16(a, b)
Expand All @@ -33,7 +33,7 @@ typedef __m512i vec_t;
static constexpr IndexType kNumRegs = 8; // only 8 are needed

#elif defined(USE_AVX2)
typedef __m256i vec_t;
using vec_t = __m256i;
#define vec_load(a) _mm256_load_si256(a)
#define vec_store(a, b) _mm256_store_si256(a, b)
#define vec_add_16(a, b) _mm256_add_epi16(a, b)
Expand All @@ -42,7 +42,7 @@ typedef __m256i vec_t;
static constexpr IndexType kNumRegs = 16;

#elif defined(USE_SSE2)
typedef __m128i vec_t;
using vec_t = __m128i;
#define vec_load(a) (*(a))
#define vec_store(a, b) *(a) = (b)
#define vec_add_16(a, b) _mm_add_epi16(a, b)
Expand All @@ -51,7 +51,7 @@ typedef __m128i vec_t;
static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;

#elif defined(USE_MMX)
typedef __m64 vec_t;
using vec_t = __m64;
#define vec_load(a) (*(a))
#define vec_store(a, b) *(a) = (b)
#define vec_add_16(a, b) _mm_add_pi16(a, b)
Expand All @@ -60,7 +60,7 @@ typedef __m64 vec_t;
static constexpr IndexType kNumRegs = 8;

#elif defined(USE_NEON)
typedef int16x8_t vec_t;
using vec_t = int16x8_t;
#define vec_load(a) (*(a))
#define vec_store(a, b) *(a) = (b)
#define vec_add_16(a, b) vaddq_s16(a, b)
Expand Down

0 comments on commit 144b812

Please sign in to comment.