Skip to content

Commit

Permalink
comp
Browse files — browse the repository at this point in the history
  • Loading branch information
nihui committed Nov 29, 2024
1 parent 1069d2f commit f4e41e1
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 175 deletions.
36 changes: 18 additions & 18 deletions src/layer/x86/convolution_3x3_winograd_int8.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -1620,14 +1620,14 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
__m256i _pB3 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(2, 1, 0, 3));

#if __AVXVNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_dpwssd_epi32(_sum2, _pA0, _pB2);
_sum3 = _mm256_dpwssd_epi32(_sum3, _pA0, _pB3);
_sum4 = _mm256_dpwssd_epi32(_sum4, _pA1, _pB0);
_sum5 = _mm256_dpwssd_epi32(_sum5, _pA1, _pB1);
_sum6 = _mm256_dpwssd_epi32(_sum6, _pA1, _pB2);
_sum7 = _mm256_dpwssd_epi32(_sum7, _pA1, _pB3);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA0, _pB2);
_sum3 = _mm256_comp_dpwssd_epi32(_sum3, _pA0, _pB3);
_sum4 = _mm256_comp_dpwssd_epi32(_sum4, _pA1, _pB0);
_sum5 = _mm256_comp_dpwssd_epi32(_sum5, _pA1, _pB1);
_sum6 = _mm256_comp_dpwssd_epi32(_sum6, _pA1, _pB2);
_sum7 = _mm256_comp_dpwssd_epi32(_sum7, _pA1, _pB3);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0));
_sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1));
Expand Down Expand Up @@ -1855,10 +1855,10 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
__m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1));

#if __AVXVNNI__ || __AVX512VNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_dpwssd_epi32(_sum2, _pA1, _pB0);
_sum3 = _mm256_dpwssd_epi32(_sum3, _pA1, _pB1);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA1, _pB0);
_sum3 = _mm256_comp_dpwssd_epi32(_sum3, _pA1, _pB1);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0));
_sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1));
Expand Down Expand Up @@ -1949,8 +1949,8 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
__m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 1, 0, 1));

#if __AVXVNNI__ || __AVX512VNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA, _pB0);
_sum1 = _mm256_dpwssd_epi32(_sum1, _pA, _pB1);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA, _pB0);
_sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA, _pB1);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA, _pB0));
_sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA, _pB1));
Expand Down Expand Up @@ -2232,10 +2232,10 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
__m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1));

#if __AVXVNNI__ || __AVX512VNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_dpwssd_epi32(_sum2, _pA1, _pB0);
_sum3 = _mm256_dpwssd_epi32(_sum3, _pA1, _pB1);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA1, _pB0);
_sum3 = _mm256_comp_dpwssd_epi32(_sum3, _pA1, _pB1);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0));
_sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1));
Expand Down
44 changes: 22 additions & 22 deletions src/layer/x86/convolution_im2col_gemm_int8.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -2828,14 +2828,14 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
__m256i _pB3 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(2, 1, 0, 3));

#if __AVXVNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_dpwssd_epi32(_sum2, _pA0, _pB2);
_sum3 = _mm256_dpwssd_epi32(_sum3, _pA0, _pB3);
_sum4 = _mm256_dpwssd_epi32(_sum4, _pA1, _pB0);
_sum5 = _mm256_dpwssd_epi32(_sum5, _pA1, _pB1);
_sum6 = _mm256_dpwssd_epi32(_sum6, _pA1, _pB2);
_sum7 = _mm256_dpwssd_epi32(_sum7, _pA1, _pB3);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA0, _pB2);
_sum3 = _mm256_comp_dpwssd_epi32(_sum3, _pA0, _pB3);
_sum4 = _mm256_comp_dpwssd_epi32(_sum4, _pA1, _pB0);
_sum5 = _mm256_comp_dpwssd_epi32(_sum5, _pA1, _pB1);
_sum6 = _mm256_comp_dpwssd_epi32(_sum6, _pA1, _pB2);
_sum7 = _mm256_comp_dpwssd_epi32(_sum7, _pA1, _pB3);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0));
_sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1));
Expand Down Expand Up @@ -3316,10 +3316,10 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
__m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1));

#if __AVXVNNI__ || __AVX512VNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_dpwssd_epi32(_sum2, _pA1, _pB0);
_sum3 = _mm256_dpwssd_epi32(_sum3, _pA1, _pB1);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA1, _pB0);
_sum3 = _mm256_comp_dpwssd_epi32(_sum3, _pA1, _pB1);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0));
_sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1));
Expand Down Expand Up @@ -3518,8 +3518,8 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
__m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 1, 0, 1));

#if __AVXVNNI__ || __AVX512VNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0));
_sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1));
Expand Down Expand Up @@ -3654,7 +3654,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
__m256i _pB0 = _mm256_shuffle_epi32(_pBB, _MM_SHUFFLE(0, 0, 0, 0));

#if __AVXVNNI__ || __AVX512VNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0));
#endif
Expand Down Expand Up @@ -3984,10 +3984,10 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
__m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1));

#if __AVXVNNI__ || __AVX512VNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_dpwssd_epi32(_sum2, _pA1, _pB0);
_sum3 = _mm256_dpwssd_epi32(_sum3, _pA1, _pB1);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum2 = _mm256_comp_dpwssd_epi32(_sum2, _pA1, _pB0);
_sum3 = _mm256_comp_dpwssd_epi32(_sum3, _pA1, _pB1);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0));
_sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1));
Expand Down Expand Up @@ -4943,8 +4943,8 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
__m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1));

#if __AVX512VNNI__ || __AVXVNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum1 = _mm256_comp_dpwssd_epi32(_sum1, _pA0, _pB1);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0));
_sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1));
Expand Down Expand Up @@ -5467,7 +5467,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
__m256i _pB0 = _mm256_cvtepi8_epi16(_pB);

#if __AVX512VNNI__ || __AVXVNNI__
_sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0);
_sum0 = _mm256_comp_dpwssd_epi32(_sum0, _pA0, _pB0);
#else
_sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0));
#endif // __AVX512VNNI__ || __AVXVNNI__
Expand Down
Loading

0 comments on commit f4e41e1

Please sign in to comment.