From 14de627bc3e66da56a42279e1994b3dde03e0a72 Mon Sep 17 00:00:00 2001
From: nihui <nihuini@tencent.com>
Date: Sun, 8 Oct 2023 16:13:43 +0800
Subject: [PATCH] x86 winograd int8: enable AVX2 path in gemm_transB_packed_tile_int8

---
 src/layer/x86/convolution_3x3_winograd_int8.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/layer/x86/convolution_3x3_winograd_int8.h b/src/layer/x86/convolution_3x3_winograd_int8.h
index 061a98fedd63..675416e36a7a 100644
--- a/src/layer/x86/convolution_3x3_winograd_int8.h
+++ b/src/layer/x86/convolution_3x3_winograd_int8.h
@@ -2756,7 +2756,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
             {
                 const short* pA = pAT;
 
-#if 0//__AVX2__
+#if __AVX2__
                 __m256i _sum0;
                 __m256i _sum1;
 #else
@@ -2768,7 +2768,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
 
                 if (k == 0)
                 {
-#if 0//__AVX2__
+#if __AVX2__
                     _sum0 = _mm256_setzero_si256();
                     _sum1 = _mm256_setzero_si256();
 #else
@@ -2780,7 +2780,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
                 }
                 else
                 {
-#if 0//__AVX2__
+#if __AVX2__
                     _sum0 = _mm256_loadu_si256((const __m256i*)outptr);
                     _sum1 = _mm256_loadu_si256((const __m256i*)(outptr + 8));
 #else
@@ -2794,9 +2794,9 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
                 int kk = 0;
                 for (; kk + 1 < max_kk; kk += 2)
                 {
-#if 0//__AVX2__
-                    __m256i _pA0 = _mm256_castps_si256(_mm256_broadcast_ss((const float*)pA));
-                    __m256i _pA1 = _mm256_castps_si256(_mm256_broadcast_ss((const float*)(pA + 2)));
+#if __AVX2__
+                    __m256i _pA0 = _mm256_set1_epi32(((const int*)pA)[0]);
+                    __m256i _pA1 = _mm256_set1_epi32(((const int*)pA)[1]);
                     __m256i _pB0 = _mm256_loadu_si256((const __m256i*)pB);
 #if __AVX512VNNI__ || __AVXVNNI__
                     _sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
@@ -2822,7 +2822,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
                 for (; kk < max_kk; kk++)
                 {
                     __m128i _pB = _mm_load_si128((const __m128i*)pB);
-#if 0//__AVX2__
+#if __AVX2__
                     __m256i _pA0 = _mm256_set1_epi32(pA[0]);
                     __m256i _pA1 = _mm256_set1_epi32(pA[1]);
                     __m256i _pB0 = _mm256_cvtepi16_epi32(_pB);
@@ -2852,7 +2852,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
                     pB += 8;
                 }
 
-#if 0//__AVX2__
+#if __AVX2__
                 if (k_end)
                 {
                     __m256i _tmp0 = _mm256_unpacklo_epi32(_sum0, _sum1);