diff --git a/src/layer/x86/gemm_int8.h b/src/layer/x86/gemm_int8.h
index c59cd7e789d..0ff42303ffb 100644
--- a/src/layer/x86/gemm_int8.h
+++ b/src/layer/x86/gemm_int8.h
@@ -18836,7 +18836,16 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
             {
                 __m256i _pA = _mm256_loadu_si256((const __m256i*)pA);
                 __m256i _pB = _mm256_castps_si256(_mm256_broadcast_ss((const float*)pB));
+#if __AVX512VNNI__ && defined(_MSC_VER) && _MSC_VER < 1932
+                // old msvc (_MSC_VER < 1932) crash here, widen to 16-bit and madd instead --- nihui
+                __m512i _pA0 = _mm512_cvtepi8_epi16(_pA);
+                __m512i _pB0 = _mm512_cvtepu8_epi16(_pB);
+                __m512i _s0 = _mm512_madd_epi16(_pA0, _pB0);
+                __m256i _s1 = _mm256_hadd_epi32(_mm512_extracti32x8_epi32(_s0, 0), _mm512_extracti32x8_epi32(_s0, 1));
+                _sum0 = _mm256_add_epi32(_sum0, _mm256_permute4x64_epi64(_s1, _MM_SHUFFLE(3, 1, 2, 0)));
+#else
                 _sum0 = _mm256_comp_dpbusd_epi32(_sum0, _pB, _pA);
+#endif
                 pA += 32;
                 pB += 4;
             }
@@ -19511,7 +19520,16 @@ static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile,
             {
                 __m128i _pA = _mm_loadu_si128((const __m128i*)pA);
                 __m128i _pB = _mm_castps_si128(_mm_load1_ps((const float*)pB));
+#if __AVX512VNNI__ && defined(_MSC_VER) && _MSC_VER < 1932
+                // old msvc (_MSC_VER < 1932) crash here, widen to 16-bit and madd instead --- nihui
+                __m256i _pA0 = _mm256_cvtepi8_epi16(_pA);
+                __m256i _pB0 = _mm256_cvtepu8_epi16(_pB);
+                __m256i _s0 = _mm256_madd_epi16(_pA0, _pB0);
+                __m128i _s1 = _mm_hadd_epi32(_mm256_extracti128_si256(_s0, 0), _mm256_extracti128_si256(_s0, 1));
+                _sum0 = _mm_add_epi32(_sum0, _s1);
+#else
                 _sum0 = _mm_comp_dpbusd_epi32(_sum0, _pB, _pA);
+#endif
                 pA += 16;
                 pB += 4;
             }