Fix overflow in add_dpbusd_epi32x2 (#268)

* Fix overflow in add_dpbusd_epi32x2 This patch fixes 16bit overflow in *_add_dpbusd_epi32x2 functions, that can be triggered in rare cases depending on the NNUE weights. While the code leads to some slowdown on affected architectures (most notably avx2), the fix is simpler than some of the other options discussed in #4394 Code suggested by Sopel97. Result of "bench 4096 1 30 default depth nnue": | Architecture | master | patch (gcc) | patch (clang) | |---------------------|-----------|-------------|---------------| | x86-64-vnni512 | 762122798 | 762122798 | 762122798 | | x86-64-avx512 | 769723503 | 762122798 | 762122798 | | x86-64-bmi2 | 769723503 | 762122798 | 762122798 | | x86-64-ssse3 | 769723503 | 762122798 | 762122798 | | x86-64 | 762122798 | 762122798 | 762122798 | Following architectures will experience ~4% slowdown due to an additional instruction in the middle of hot path: * x86-64-avx512 * x86-64-bmi2 * x86-64-avx2 * x86-64-sse41-popcnt (x86-64-modern) * x86-64-ssse3 * x86-32-sse41-popcnt official-stockfish/Stockfish@2c36d1e * Unify type alias declarations The commit unifies the declaration of type aliases by replacing all typedefs with corresponding using statements. closing #4412 No functional change official-stockfish/Stockfish@564456a # Conflicts: # source/misc.cpp # source/misc.h # source/movepick.h # source/position.h # source/usi.h
yaneurao · Oct 26, 2023 · 144b812 · 144b812
1 parent e55beaa
commit 144b812
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 14 deletions.
diff --git a/source/eval/nnue/layers/affine_transform.h b/source/eval/nnue/layers/affine_transform.h
@@ -212,9 +212,9 @@ class AffineTransform {
 #else
 			__m512i product0 = _mm512_maddubs_epi16(a0, b0);
 			__m512i product1 = _mm512_maddubs_epi16(a1, b1);
-			product0         = _mm512_adds_epi16(product0, product1);
-			product0         = _mm512_madd_epi16(product0, kOnes512);
-			acc              = _mm512_add_epi32(acc, product0);
+			product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1));
+			product1 = _mm512_madd_epi16(product1, _mm512_set1_epi16(1));
+			acc = _mm512_add_epi32(acc, _mm512_add_epi32(product0, product1));
 #endif
 		};
 
@@ -261,9 +261,9 @@ class AffineTransform {
 #else
 			__m256i product0 = _mm256_maddubs_epi16(a0, b0);
 			__m256i product1 = _mm256_maddubs_epi16(a1, b1);
-			product0         = _mm256_adds_epi16(product0, product1);
-			product0         = _mm256_madd_epi16(product0, kOnes256);
-			acc              = _mm256_add_epi32(acc, product0);
+			product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1));
+			product1 = _mm256_madd_epi16(product1, _mm256_set1_epi16(1));
+			acc = _mm256_add_epi32(acc, _mm256_add_epi32(product0, product1));
 #endif
 		};
 
@@ -299,9 +299,9 @@ class AffineTransform {
 		                                                    __m128i b1) {
 			__m128i product0 = _mm_maddubs_epi16(a0, b0);
 			__m128i product1 = _mm_maddubs_epi16(a1, b1);
-			product0         = _mm_adds_epi16(product0, product1);
-			product0         = _mm_madd_epi16(product0, kOnes128);
-			acc              = _mm_add_epi32(acc, product0);
+			product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1));
+			product1 = _mm_madd_epi16(product1, _mm_set1_epi16(1));
+			acc = _mm_add_epi32(acc, _mm_add_epi32(product0, product1));
 		};
 
 #endif

diff --git a/source/eval/nnue/nnue_feature_transformer.h b/source/eval/nnue/nnue_feature_transformer.h
@@ -24,7 +24,7 @@ namespace Eval::NNUE {
 #define VECTOR
 
 #if defined(USE_AVX512)
-typedef __m512i vec_t;
+using vec_t = __m512i;
 #define vec_load(a) _mm512_load_si512(a)
 #define vec_store(a, b) _mm512_store_si512(a, b)
 #define vec_add_16(a, b) _mm512_add_epi16(a, b)
@@ -33,7 +33,7 @@ typedef __m512i vec_t;
 static constexpr IndexType kNumRegs = 8;  // only 8 are needed
 
 #elif defined(USE_AVX2)
-typedef __m256i vec_t;
+using vec_t = __m256i;
 #define vec_load(a) _mm256_load_si256(a)
 #define vec_store(a, b) _mm256_store_si256(a, b)
 #define vec_add_16(a, b) _mm256_add_epi16(a, b)
@@ -42,7 +42,7 @@ typedef __m256i vec_t;
 static constexpr IndexType kNumRegs = 16;
 
 #elif defined(USE_SSE2)
-typedef __m128i vec_t;
+using vec_t = __m128i;
 #define vec_load(a) (*(a))
 #define vec_store(a, b) *(a) = (b)
 #define vec_add_16(a, b) _mm_add_epi16(a, b)
@@ -51,7 +51,7 @@ typedef __m128i vec_t;
 static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
 
 #elif defined(USE_MMX)
-typedef __m64 vec_t;
+using vec_t = __m64;
 #define vec_load(a) (*(a))
 #define vec_store(a, b) *(a) = (b)
 #define vec_add_16(a, b) _mm_add_pi16(a, b)
@@ -60,7 +60,7 @@ typedef __m64 vec_t;
 static constexpr IndexType kNumRegs = 8;
 
 #elif defined(USE_NEON)
-typedef int16x8_t vec_t;
+using vec_t = int16x8_t;
 #define vec_load(a) (*(a))
 #define vec_store(a, b) *(a) = (b)
 #define vec_add_16(a, b) vaddq_s16(a, b)