From 14de627bc3e66da56a42279e1994b3dde03e0a72 Mon Sep 17 00:00:00 2001 From: nihui Date: Sun, 8 Oct 2023 16:13:43 +0800 Subject: [PATCH] fine --- src/layer/x86/convolution_3x3_winograd_int8.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/layer/x86/convolution_3x3_winograd_int8.h b/src/layer/x86/convolution_3x3_winograd_int8.h index 061a98fedd63..675416e36a7a 100644 --- a/src/layer/x86/convolution_3x3_winograd_int8.h +++ b/src/layer/x86/convolution_3x3_winograd_int8.h @@ -2756,7 +2756,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, { const short* pA = pAT; -#if 0//__AVX2__ +#if __AVX2__ __m256i _sum0; __m256i _sum1; #else @@ -2768,7 +2768,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, if (k == 0) { -#if 0//__AVX2__ +#if __AVX2__ _sum0 = _mm256_setzero_si256(); _sum1 = _mm256_setzero_si256(); #else @@ -2780,7 +2780,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, } else { -#if 0//__AVX2__ +#if __AVX2__ _sum0 = _mm256_loadu_si256((const __m256i*)outptr); _sum1 = _mm256_loadu_si256((const __m256i*)(outptr + 8)); #else @@ -2794,9 +2794,9 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, int kk = 0; for (; kk + 1 < max_kk; kk += 2) { -#if 0//__AVX2__ - __m256i _pA0 = _mm256_castps_si256(_mm256_broadcast_ss((const float*)pA)); - __m256i _pA1 = _mm256_castps_si256(_mm256_broadcast_ss((const float*)(pA + 2))); +#if __AVX2__ + __m256i _pA0 = _mm256_set1_epi32(((const int*)pA)[0]); + __m256i _pA1 = _mm256_set1_epi32(((const int*)pA)[1]); __m256i _pB0 = _mm256_loadu_si256((const __m256i*)pB); #if __AVX512VNNI__ || __AVXVNNI__ _sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0); @@ -2822,7 +2822,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, for (; kk < max_kk; kk++) { __m128i _pB = _mm_load_si128((const __m128i*)pB); -#if 0//__AVX2__ +#if __AVX2__ __m256i _pA0 = _mm256_set1_epi32(pA[0]); __m256i _pA1 = _mm256_set1_epi32(pA[1]); __m256i _pB0 = _mm256_cvtepi16_epi32(_pB); @@ -2852,7 +2852,7 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, pB += 8; } -#if 0//__AVX2__ +#if __AVX2__ if (k_end) { __m256i _tmp0 = _mm256_unpacklo_epi32(_sum0, _sum1);