diff --git a/src/layer/arm/convolution1d_packed.h b/src/layer/arm/convolution1d_packed.h index 5e090e21c75..7b487413527 100644 --- a/src/layer/arm/convolution1d_packed.h +++ b/src/layer/arm/convolution1d_packed.h @@ -60,10 +60,11 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel #endif // __aarch64__ if (inh >= 4) kernel_tm.create(2 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2); - else if (inh >= 2) - kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2); else #endif // __ARM_NEON + if (inh >= 2) + kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2); + else kernel_tm.create(2 * kernel_w, inh, outh / 2 + outh % 2); } else @@ -76,10 +77,11 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel #endif // __aarch64__ if (inh >= 4) kernel_tm.create(4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh); - else if (inh >= 2) - kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh); else #endif // __ARM_NEON + if (inh >= 2) + kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh); + else kernel_tm.create(kernel_w, inh, outh); } // *INDENT-ON* diff --git a/src/layer/arm/convolution1d_packed_bf16s.h b/src/layer/arm/convolution1d_packed_bf16s.h index 224f0335b71..8f626b578d8 100644 --- a/src/layer/arm/convolution1d_packed_bf16s.h +++ b/src/layer/arm/convolution1d_packed_bf16s.h @@ -60,10 +60,11 @@ static void convolution1d_transform_kernel_packed_bf16s(const Mat& kernel, Mat& #endif // __aarch64__ if (inh >= 4) kernel_tm.create(2 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u); - else if (inh >= 2) - kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u); else #endif // __ARM_NEON + if (inh >= 2) + kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u); + else kernel_tm.create(2 * kernel_w, inh, outh / 2 + outh % 2, (size_t)2u); } else @@ -76,10 +77,11 @@ static void convolution1d_transform_kernel_packed_bf16s(const Mat& kernel, Mat& #endif // __aarch64__ if (inh >= 4) kernel_tm.create(4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh, (size_t)2u); - else if (inh >= 2) - kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh, (size_t)2u); else #endif // __ARM_NEON + if (inh >= 2) + kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh, (size_t)2u); + else kernel_tm.create(kernel_w, inh, outh, (size_t)2u); } // *INDENT-ON* diff --git a/src/layer/arm/convolution_packed.h b/src/layer/arm/convolution_packed.h index ab3be578a7a..0be5c1d4245 100644 --- a/src/layer/arm/convolution_packed.h +++ b/src/layer/arm/convolution_packed.h @@ -62,10 +62,11 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t #endif // __aarch64__ if (inch >= 4) kernel_tm.create(2 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2); - else if (inch >= 2) - kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2); else #endif // __ARM_NEON + if (inch >= 2) + kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2); + else kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2); } else @@ -78,10 +79,11 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t #endif // __aarch64__ if (inch >= 4) kernel_tm.create(4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch); - else if (inch >= 2) - kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch); else #endif // __ARM_NEON + if (inch >= 2) 
+ kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch); + else kernel_tm.create(maxk, inch, outch); } // *INDENT-ON* diff --git a/src/layer/arm/convolution_packed_bf16s.h b/src/layer/arm/convolution_packed_bf16s.h index c29c9225b51..b4000da9530 100644 --- a/src/layer/arm/convolution_packed_bf16s.h +++ b/src/layer/arm/convolution_packed_bf16s.h @@ -62,10 +62,11 @@ static void convolution_transform_kernel_packed_bf16s(const Mat& kernel, Mat& ke #endif // __aarch64__ if (inch >= 4) kernel_tm.create(2 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u); - else if (inch >= 2) - kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u); else #endif // __ARM_NEON + if (inch >= 2) + kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u); + else kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)2u); } else @@ -78,10 +79,11 @@ static void convolution_transform_kernel_packed_bf16s(const Mat& kernel, Mat& ke #endif // __aarch64__ if (inch >= 4) kernel_tm.create(4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch, (size_t)2u); - else if (inch >= 2) - kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch, (size_t)2u); else #endif // __ARM_NEON + if (inch >= 2) + kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch, (size_t)2u); + else kernel_tm.create(maxk, inch, outch, (size_t)2u); } // *INDENT-ON* diff --git a/src/layer/x86/convolution1d_packed.h b/src/layer/x86/convolution1d_packed.h index be429da88da..5d20b83fc00 100644 --- a/src/layer/x86/convolution1d_packed.h +++ b/src/layer/x86/convolution1d_packed.h @@ -91,10 +91,11 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel #endif // __AVX__ if (inh >= 4) kernel_tm.create(2 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2); - else if (inh >= 2) - kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2); else #endif // __SSE2__ + if (inh >= 2) + kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2); + else kernel_tm.create(2 * kernel_w, inh, outh / 2 + outh % 2); } else @@ -112,10 +113,11 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel #endif // __AVX__ if (inh >= 4) kernel_tm.create(4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh); - else if (inh >= 2) - kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh); else #endif // __SSE2__ + if (inh >= 2) + kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh); + else kernel_tm.create(kernel_w, inh, outh); } // *INDENT-ON* @@ -146,284 +148,149 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel float* g00 = kernel_tm.channel(q / 16); + __m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(kernel_w)); + int p = 0; -#if __AVX__ -#if __AVX512F__ for (; p + 15 < inh; p += 16) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; - const float* k4 = kptr4 + p * kernel_w; - const float* k5 = kptr5 + p * kernel_w; - const float* k6 = kptr6 + p * kernel_w; - const float* k7 = kptr7 + p * kernel_w; - const float* k8 = kptr8 + p * kernel_w; - const float* k9 = kptr9 + p * kernel_w; - const float* ka = kptra + p * kernel_w; - const float* kb = kptrb + p * kernel_w; - const float* kc = kptrc + p * 
kernel_w; - const float* kd = kptrd + p * kernel_w; - const float* ke = kptre + p * kernel_w; - const float* kf = kptrf + p * kernel_w; - - for (int i = 0; i < 16; i++) - { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - g00[8] = k8[k]; - g00[9] = k9[k]; - g00[10] = ka[k]; - g00[11] = kb[k]; - g00[12] = kc[k]; - g00[13] = kd[k]; - g00[14] = ke[k]; - g00[15] = kf[k]; - k0 += kernel_w; - k1 += kernel_w; - k2 += kernel_w; - k3 += kernel_w; - k4 += kernel_w; - k5 += kernel_w; - k6 += kernel_w; - k7 += kernel_w; - k8 += kernel_w; - k9 += kernel_w; - ka += kernel_w; - kb += kernel_w; - kc += kernel_w; - kd += kernel_w; - ke += kernel_w; - kf += kernel_w; - g00 += 16; - } - } + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; + const float* k4 = kptr4 + k; + const float* k5 = kptr5 + k; + const float* k6 = kptr6 + k; + const float* k7 = kptr7 + k; + const float* k8 = kptr8 + k; + const float* k9 = kptr9 + k; + const float* ka = kptra + k; + const float* kb = kptrb + k; + const float* kc = kptrc + k; + const float* kd = kptrd + k; + const float* ke = kptre + k; + const float* kf = kptrf + k; + + __m512 _k0 = _mm512_i32gather_ps(_vindex, k0, sizeof(float)); + __m512 _k1 = _mm512_i32gather_ps(_vindex, k1, sizeof(float)); + __m512 _k2 = _mm512_i32gather_ps(_vindex, k2, sizeof(float)); + __m512 _k3 = _mm512_i32gather_ps(_vindex, k3, sizeof(float)); + __m512 _k4 = _mm512_i32gather_ps(_vindex, k4, sizeof(float)); + __m512 _k5 = _mm512_i32gather_ps(_vindex, k5, sizeof(float)); + __m512 _k6 = _mm512_i32gather_ps(_vindex, k6, sizeof(float)); + __m512 _k7 = _mm512_i32gather_ps(_vindex, k7, sizeof(float)); + __m512 _k8 = _mm512_i32gather_ps(_vindex, k8, sizeof(float)); + __m512 _k9 = _mm512_i32gather_ps(_vindex, k9, sizeof(float)); + __m512 _ka = _mm512_i32gather_ps(_vindex, ka, sizeof(float)); + __m512 _kb = _mm512_i32gather_ps(_vindex, kb, sizeof(float)); + __m512 _kc = _mm512_i32gather_ps(_vindex, kc, sizeof(float)); + __m512 _kd = _mm512_i32gather_ps(_vindex, kd, sizeof(float)); + __m512 _ke = _mm512_i32gather_ps(_vindex, ke, sizeof(float)); + __m512 _kf = _mm512_i32gather_ps(_vindex, kf, sizeof(float)); + + transpose16x16_ps(_k0, _k1, _k2, _k3, _k4, _k5, _k6, _k7, _k8, _k9, _ka, _kb, _kc, _kd, _ke, _kf); + + _mm512_store_ps(g00, _k0); + _mm512_store_ps(g00 + 16, _k1); + _mm512_store_ps(g00 + 16 * 2, _k2); + _mm512_store_ps(g00 + 16 * 3, _k3); + _mm512_store_ps(g00 + 16 * 4, _k4); + _mm512_store_ps(g00 + 16 * 5, _k5); + _mm512_store_ps(g00 + 16 * 6, _k6); + _mm512_store_ps(g00 + 16 * 7, _k7); + _mm512_store_ps(g00 + 16 * 8, _k8); + _mm512_store_ps(g00 + 16 * 9, _k9); + _mm512_store_ps(g00 + 16 * 10, _ka); + _mm512_store_ps(g00 + 16 * 11, _kb); + _mm512_store_ps(g00 + 16 * 12, _kc); + _mm512_store_ps(g00 + 16 * 13, _kd); + _mm512_store_ps(g00 + 16 * 14, _ke); + _mm512_store_ps(g00 + 16 * 15, _kf); + + g00 += 256; + } + + kptr0 += kernel_w * 16; + kptr1 += kernel_w * 16; + kptr2 += kernel_w * 16; + kptr3 += kernel_w * 16; + kptr4 += kernel_w * 16; + kptr5 += kernel_w * 16; + kptr6 += kernel_w * 16; + kptr7 += kernel_w * 16; + kptr8 += kernel_w * 16; + kptr9 += kernel_w * 16; + kptra += kernel_w * 16; + kptrb += kernel_w * 16; + kptrc += kernel_w * 16; + kptrd += kernel_w * 16; + kptre += kernel_w * 16; + kptrf += kernel_w * 16; } -#endif // __AVX512F__ + + _vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(inh)); + for (; p + 7 < 
inh; p += 8) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; - const float* k4 = kptr4 + p * kernel_w; - const float* k5 = kptr5 + p * kernel_w; - const float* k6 = kptr6 + p * kernel_w; - const float* k7 = kptr7 + p * kernel_w; - const float* k8 = kptr8 + p * kernel_w; - const float* k9 = kptr9 + p * kernel_w; - const float* ka = kptra + p * kernel_w; - const float* kb = kptrb + p * kernel_w; - const float* kc = kptrc + p * kernel_w; - const float* kd = kptrd + p * kernel_w; - const float* ke = kptre + p * kernel_w; - const float* kf = kptrf + p * kernel_w; + const float* k0 = kptr0 + k; for (int i = 0; i < 8; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - g00[8] = k8[k]; - g00[9] = k9[k]; - g00[10] = ka[k]; - g00[11] = kb[k]; - g00[12] = kc[k]; - g00[13] = kd[k]; - g00[14] = ke[k]; - g00[15] = kf[k]; + __m512 _k0 = _mm512_i32gather_ps(_vindex, k0, sizeof(float)); + _mm512_store_ps(g00, _k0); k0 += kernel_w; - k1 += kernel_w; - k2 += kernel_w; - k3 += kernel_w; - k4 += kernel_w; - k5 += kernel_w; - k6 += kernel_w; - k7 += kernel_w; - k8 += kernel_w; - k9 += kernel_w; - ka += kernel_w; - kb += kernel_w; - kc += kernel_w; - kd += kernel_w; - ke += kernel_w; - kf += kernel_w; g00 += 16; } } + + kptr0 += kernel_w * 8; } -#endif // __AVX__ for (; p + 3 < inh; p += 4) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; - const float* k4 = kptr4 + p * kernel_w; - const float* k5 = kptr5 + p * kernel_w; - const float* k6 = kptr6 + p * kernel_w; - const float* k7 = kptr7 + p * kernel_w; - const float* k8 = kptr8 + p * kernel_w; - const float* k9 = kptr9 + p * kernel_w; - const float* ka = kptra + p * kernel_w; - const float* kb = kptrb + p * kernel_w; - const float* kc = kptrc + p * kernel_w; - const float* kd = kptrd + p * kernel_w; - const float* ke = kptre + p * kernel_w; - const float* kf = kptrf + p * kernel_w; + const float* k0 = kptr0 + k; for (int i = 0; i < 4; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - g00[8] = k8[k]; - g00[9] = k9[k]; - g00[10] = ka[k]; - g00[11] = kb[k]; - g00[12] = kc[k]; - g00[13] = kd[k]; - g00[14] = ke[k]; - g00[15] = kf[k]; + __m512 _k0 = _mm512_i32gather_ps(_vindex, k0, sizeof(float)); + _mm512_store_ps(g00, _k0); k0 += kernel_w; - k1 += kernel_w; - k2 += kernel_w; - k3 += kernel_w; - k4 += kernel_w; - k5 += kernel_w; - k6 += kernel_w; - k7 += kernel_w; - k8 += kernel_w; - k9 += kernel_w; - ka += kernel_w; - kb += kernel_w; - kc += kernel_w; - kd += kernel_w; - ke += kernel_w; - kf += kernel_w; g00 += 16; } } + + kptr0 += kernel_w * 4; } for (; p + 1 < inh; p += 2) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; - const float* k4 = kptr4 + p * kernel_w; - const float* k5 = kptr5 + p * kernel_w; - const float* k6 = kptr6 + p * kernel_w; - const float* k7 = kptr7 + p * kernel_w; - const float* k8 = kptr8 + p * kernel_w; - const float* k9 = kptr9 + p * kernel_w; - const float* ka = kptra + p 
* kernel_w; - const float* kb = kptrb + p * kernel_w; - const float* kc = kptrc + p * kernel_w; - const float* kd = kptrd + p * kernel_w; - const float* ke = kptre + p * kernel_w; - const float* kf = kptrf + p * kernel_w; + const float* k0 = kptr0 + k; for (int i = 0; i < 2; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - g00[8] = k8[k]; - g00[9] = k9[k]; - g00[10] = ka[k]; - g00[11] = kb[k]; - g00[12] = kc[k]; - g00[13] = kd[k]; - g00[14] = ke[k]; - g00[15] = kf[k]; + __m512 _k0 = _mm512_i32gather_ps(_vindex, k0, sizeof(float)); + _mm512_store_ps(g00, _k0); k0 += kernel_w; - k1 += kernel_w; - k2 += kernel_w; - k3 += kernel_w; - k4 += kernel_w; - k5 += kernel_w; - k6 += kernel_w; - k7 += kernel_w; - k8 += kernel_w; - k9 += kernel_w; - ka += kernel_w; - kb += kernel_w; - kc += kernel_w; - kd += kernel_w; - ke += kernel_w; - kf += kernel_w; g00 += 16; } } + + kptr0 += kernel_w * 2; } for (; p < inh; p++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; - const float* k4 = kptr4 + p * kernel_w; - const float* k5 = kptr5 + p * kernel_w; - const float* k6 = kptr6 + p * kernel_w; - const float* k7 = kptr7 + p * kernel_w; - const float* k8 = kptr8 + p * kernel_w; - const float* k9 = kptr9 + p * kernel_w; - const float* ka = kptra + p * kernel_w; - const float* kb = kptrb + p * kernel_w; - const float* kc = kptrc + p * kernel_w; - const float* kd = kptrd + p * kernel_w; - const float* ke = kptre + p * kernel_w; - const float* kf = kptrf + p * kernel_w; - for (int k = 0; k < kernel_w; k++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - g00[8] = k8[k]; - g00[9] = k9[k]; - g00[10] = ka[k]; - g00[11] = kb[k]; - g00[12] = kc[k]; - g00[13] = kd[k]; - g00[14] = ke[k]; - g00[15] = kf[k]; + const float* k0 = kptr0 + k; + + __m512 _k0 = _mm512_i32gather_ps(_vindex, k0, sizeof(float)); + _mm512_store_ps(g00, _k0); g00 += 16; } } @@ -446,67 +313,109 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel float* g00 = kernel_tm.channel(q / 8); #endif +#if __AVX2__ + __m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + _vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(kernel_w)); +#if __AVX512F__ + __m512i _vindex_512 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex_512 = _mm512_mullo_epi32(_vindex_512, _mm512_set1_epi32(kernel_w)); +#endif // __AVX512F__ +#endif // __AVX2__ + int p = 0; #if __AVX512F__ for (; p + 15 < inh; p += 16) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; - const float* k4 = kptr4 + p * kernel_w; - const float* k5 = kptr5 + p * kernel_w; - const float* k6 = kptr6 + p * kernel_w; - const float* k7 = kptr7 + p * kernel_w; - - for (int i = 0; i < 16; i++) - { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - k0 += kernel_w; - k1 += kernel_w; - k2 += kernel_w; - k3 += kernel_w; - k4 += kernel_w; - k5 += kernel_w; - k6 += kernel_w; - k7 += kernel_w; - g00 += 8; - } - } + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; + const 
float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; + const float* k4 = kptr4 + k; + const float* k5 = kptr5 + k; + const float* k6 = kptr6 + k; + const float* k7 = kptr7 + k; + + __m512 _k0 = _mm512_i32gather_ps(_vindex_512, k0, sizeof(float)); + __m512 _k1 = _mm512_i32gather_ps(_vindex_512, k1, sizeof(float)); + __m512 _k2 = _mm512_i32gather_ps(_vindex_512, k2, sizeof(float)); + __m512 _k3 = _mm512_i32gather_ps(_vindex_512, k3, sizeof(float)); + __m512 _k4 = _mm512_i32gather_ps(_vindex_512, k4, sizeof(float)); + __m512 _k5 = _mm512_i32gather_ps(_vindex_512, k5, sizeof(float)); + __m512 _k6 = _mm512_i32gather_ps(_vindex_512, k6, sizeof(float)); + __m512 _k7 = _mm512_i32gather_ps(_vindex_512, k7, sizeof(float)); + + transpose16x8_ps(_k0, _k1, _k2, _k3, _k4, _k5, _k6, _k7); + + _mm512_storeu_ps(g00, _k0); + _mm512_storeu_ps(g00 + 16, _k1); + _mm512_storeu_ps(g00 + 16 * 2, _k2); + _mm512_storeu_ps(g00 + 16 * 3, _k3); + _mm512_storeu_ps(g00 + 16 * 4, _k4); + _mm512_storeu_ps(g00 + 16 * 5, _k5); + _mm512_storeu_ps(g00 + 16 * 6, _k6); + _mm512_storeu_ps(g00 + 16 * 7, _k7); + + g00 += 128; + } + + kptr0 += kernel_w * 16; + kptr1 += kernel_w * 16; + kptr2 += kernel_w * 16; + kptr3 += kernel_w * 16; + kptr4 += kernel_w * 16; + kptr5 += kernel_w * 16; + kptr6 += kernel_w * 16; + kptr7 += kernel_w * 16; } #endif // __AVX512F__ for (; p + 7 < inh; p += 8) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; - const float* k4 = kptr4 + p * kernel_w; - const float* k5 = kptr5 + p * kernel_w; - const float* k6 = kptr6 + p * kernel_w; - const float* k7 = kptr7 + p * kernel_w; - + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; + const float* k4 = kptr4 + k; + const float* k5 = kptr5 + k; + const float* k6 = kptr6 + k; + const float* k7 = kptr7 + k; + +#if __AVX2__ + __m256 _k0 = _mm256_i32gather_ps(k0, _vindex, sizeof(float)); + __m256 _k1 = _mm256_i32gather_ps(k1, _vindex, sizeof(float)); + __m256 _k2 = _mm256_i32gather_ps(k2, _vindex, sizeof(float)); + __m256 _k3 = _mm256_i32gather_ps(k3, _vindex, sizeof(float)); + __m256 _k4 = _mm256_i32gather_ps(k4, _vindex, sizeof(float)); + __m256 _k5 = _mm256_i32gather_ps(k5, _vindex, sizeof(float)); + __m256 _k6 = _mm256_i32gather_ps(k6, _vindex, sizeof(float)); + __m256 _k7 = _mm256_i32gather_ps(k7, _vindex, sizeof(float)); + + transpose8x8_ps(_k0, _k1, _k2, _k3, _k4, _k5, _k6, _k7); + + _mm256_store_ps(g00, _k0); + _mm256_store_ps(g00 + 8, _k1); + _mm256_store_ps(g00 + 8 * 2, _k2); + _mm256_store_ps(g00 + 8 * 3, _k3); + _mm256_store_ps(g00 + 8 * 4, _k4); + _mm256_store_ps(g00 + 8 * 5, _k5); + _mm256_store_ps(g00 + 8 * 6, _k6); + _mm256_store_ps(g00 + 8 * 7, _k7); + + g00 += 64; +#else // __AVX2__ for (int i = 0; i < 8; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; + g00[4] = k4[0]; + g00[5] = k5[0]; + g00[6] = k6[0]; + g00[7] = k7[0]; k0 += kernel_w; k1 += kernel_w; k2 += kernel_w; @@ -517,31 +426,54 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel k7 += kernel_w; g00 += 8; } +#endif // __AVX2__ } + + kptr0 += kernel_w * 8; + kptr1 += kernel_w * 8; + kptr2 += kernel_w * 8; + kptr3 += kernel_w * 8; + kptr4 += kernel_w * 8; + 
kptr5 += kernel_w * 8; + kptr6 += kernel_w * 8; + kptr7 += kernel_w * 8; } + +#if __AVX2__ + _vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(inh)); +#endif // __AVX2__ + for (; p + 3 < inh; p += 4) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; - const float* k4 = kptr4 + p * kernel_w; - const float* k5 = kptr5 + p * kernel_w; - const float* k6 = kptr6 + p * kernel_w; - const float* k7 = kptr7 + p * kernel_w; + const float* k0 = kptr0 + k; +#if !__AVX2__ + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; + const float* k4 = kptr4 + k; + const float* k5 = kptr5 + k; + const float* k6 = kptr6 + k; + const float* k7 = kptr7 + k; +#endif // !__AVX2__ for (int i = 0; i < 4; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; +#if __AVX2__ + __m256 _k0 = _mm256_i32gather_ps(k0, _vindex, sizeof(float)); + _mm256_store_ps(g00, _k0); + k0 += kernel_w; + g00 += 8; +#else // __AVX2__ + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; + g00[4] = k4[0]; + g00[5] = k5[0]; + g00[6] = k6[0]; + g00[7] = k7[0]; k0 += kernel_w; k1 += kernel_w; k2 += kernel_w; @@ -551,32 +483,52 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel k6 += kernel_w; k7 += kernel_w; g00 += 8; +#endif // __AVX2__ } } + + kptr0 += kernel_w * 4; +#if !__AVX2__ + kptr1 += kernel_w * 4; + kptr2 += kernel_w * 4; + kptr3 += kernel_w * 4; + kptr4 += kernel_w * 4; + kptr5 += kernel_w * 4; + kptr6 += kernel_w * 4; + kptr7 += kernel_w * 4; +#endif // !__AVX2__ } for (; p + 1 < inh; p += 2) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; - const float* k4 = kptr4 + p * kernel_w; - const float* k5 = kptr5 + p * kernel_w; - const float* k6 = kptr6 + p * kernel_w; - const float* k7 = kptr7 + p * kernel_w; + const float* k0 = kptr0 + k; +#if !__AVX2__ + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; + const float* k4 = kptr4 + k; + const float* k5 = kptr5 + k; + const float* k6 = kptr6 + k; + const float* k7 = kptr7 + k; +#endif // !__AVX2__ for (int i = 0; i < 2; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; +#if __AVX2__ + __m256 _k0 = _mm256_i32gather_ps(k0, _vindex, sizeof(float)); + _mm256_store_ps(g00, _k0); + k0 += kernel_w; + g00 += 8; +#else // __AVX2__ + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; + g00[4] = k4[0]; + g00[5] = k5[0]; + g00[6] = k6[0]; + g00[7] = k7[0]; k0 += kernel_w; k1 += kernel_w; k2 += kernel_w; @@ -586,31 +538,49 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel k6 += kernel_w; k7 += kernel_w; g00 += 8; +#endif // __AVX2__ } } + + kptr0 += kernel_w * 2; +#if !__AVX2__ + kptr1 += kernel_w * 2; + kptr2 += kernel_w * 2; + kptr3 += kernel_w * 2; + kptr4 += kernel_w * 2; + kptr5 += kernel_w * 2; + kptr6 += kernel_w * 2; + kptr7 += kernel_w * 2; +#endif // !__AVX2__ } for (; p < inh; p++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * 
kernel_w; - const float* k3 = kptr3 + p * kernel_w; - const float* k4 = kptr4 + p * kernel_w; - const float* k5 = kptr5 + p * kernel_w; - const float* k6 = kptr6 + p * kernel_w; - const float* k7 = kptr7 + p * kernel_w; - for (int k = 0; k < kernel_w; k++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; + const float* k0 = kptr0 + k; +#if __AVX2__ + __m256 _k0 = _mm256_i32gather_ps(k0, _vindex, sizeof(float)); + _mm256_store_ps(g00, _k0); + g00 += 8; +#else // __AVX2__ + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; + const float* k4 = kptr4 + k; + const float* k5 = kptr5 + k; + const float* k6 = kptr6 + k; + const float* k7 = kptr7 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; + g00[4] = k4[0]; + g00[5] = k5[0]; + g00[6] = k6[0]; + g00[7] = k7[0]; g00 += 8; +#endif // __AVX2__ } } } @@ -630,6 +600,17 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel float* g00 = kernel_tm.channel(q / 4); #endif +#if __AVX2__ + __m128i _vindex = _mm_setr_epi32(0, 1, 2, 3); + _vindex = _mm_mullo_epi32(_vindex, _mm_set1_epi32(kernel_w)); + __m256i _vindex_256 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + _vindex_256 = _mm256_mullo_epi32(_vindex_256, _mm256_set1_epi32(kernel_w)); +#if __AVX512F__ + __m512i _vindex_512 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex_512 = _mm512_mullo_epi32(_vindex_512, _mm512_set1_epi32(kernel_w)); +#endif // __AVX512F__ +#endif // __AVX2__ + int p = 0; #if __AVX__ #if __AVX512F__ @@ -637,110 +618,185 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; - for (int i = 0; i < 16; i++) - { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - k0 += kernel_w; - k1 += kernel_w; - k2 += kernel_w; - k3 += kernel_w; - g00 += 4; - } + __m512 _k0 = _mm512_i32gather_ps(_vindex_512, k0, sizeof(float)); + __m512 _k1 = _mm512_i32gather_ps(_vindex_512, k1, sizeof(float)); + __m512 _k2 = _mm512_i32gather_ps(_vindex_512, k2, sizeof(float)); + __m512 _k3 = _mm512_i32gather_ps(_vindex_512, k3, sizeof(float)); + + transpose16x4_ps(_k0, _k1, _k2, _k3); + + _mm512_storeu_ps(g00, _k0); + _mm512_storeu_ps(g00 + 16, _k1); + _mm512_storeu_ps(g00 + 16 * 2, _k2); + _mm512_storeu_ps(g00 + 16 * 3, _k3); + + g00 += 64; } + + kptr0 += kernel_w * 16; + kptr1 += kernel_w * 16; + kptr2 += kernel_w * 16; + kptr3 += kernel_w * 16; } #endif // __AVX512F__ for (; p + 7 < inh; p += 8) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; +#if __AVX2__ + __m256 _k0 = _mm256_i32gather_ps(k0, _vindex_256, sizeof(float)); + __m256 _k1 = _mm256_i32gather_ps(k1, _vindex_256, sizeof(float)); + __m256 _k2 = _mm256_i32gather_ps(k2, _vindex_256, sizeof(float)); + __m256 _k3 = _mm256_i32gather_ps(k3, _vindex_256, sizeof(float)); + + 
transpose8x4_ps(_k0, _k1, _k2, _k3); + + _mm256_storeu_ps(g00, _k0); + _mm256_storeu_ps(g00 + 8, _k1); + _mm256_storeu_ps(g00 + 8 * 2, _k2); + _mm256_storeu_ps(g00 + 8 * 3, _k3); + + g00 += 32; +#else // __AVX2__ for (int i = 0; i < 8; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; k0 += kernel_w; k1 += kernel_w; k2 += kernel_w; k3 += kernel_w; g00 += 4; } +#endif // __AVX2__ } + + kptr0 += kernel_w * 8; + kptr1 += kernel_w * 8; + kptr2 += kernel_w * 8; + kptr3 += kernel_w * 8; } #endif // __AVX__ for (; p + 3 < inh; p += 4) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; +#if __AVX2__ + __m128 _k0 = _mm_i32gather_ps(k0, _vindex, sizeof(float)); + __m128 _k1 = _mm_i32gather_ps(k1, _vindex, sizeof(float)); + __m128 _k2 = _mm_i32gather_ps(k2, _vindex, sizeof(float)); + __m128 _k3 = _mm_i32gather_ps(k3, _vindex, sizeof(float)); + + _MM_TRANSPOSE4_PS(_k0, _k1, _k2, _k3); + + _mm_store_ps(g00, _k0); + _mm_store_ps(g00 + 4, _k1); + _mm_store_ps(g00 + 4 * 2, _k2); + _mm_store_ps(g00 + 4 * 3, _k3); + + g00 += 16; +#else // __AVX2__ for (int i = 0; i < 4; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; k0 += kernel_w; k1 += kernel_w; k2 += kernel_w; k3 += kernel_w; g00 += 4; } +#endif // __AVX2__ } + + kptr0 += kernel_w * 4; + kptr1 += kernel_w * 4; + kptr2 += kernel_w * 4; + kptr3 += kernel_w * 4; } + +#if __AVX2__ + _vindex = _mm_mullo_epi32(_vindex, _mm_set1_epi32(inh)); +#endif // __AVX2__ + for (; p + 1 < inh; p += 2) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; + const float* k0 = kptr0 + k; +#if !__AVX2__ + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; +#endif // !__AVX2__ for (int i = 0; i < 2; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; +#if __AVX2__ + __m128 _k0 = _mm_i32gather_ps(k0, _vindex, sizeof(float)); + _mm_store_ps(g00, _k0); + k0 += kernel_w; + g00 += 4; +#else // __AVX2__ + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; k0 += kernel_w; k1 += kernel_w; k2 += kernel_w; k3 += kernel_w; g00 += 4; +#endif // __AVX2__ } } + + kptr0 += kernel_w * 2; +#if !__AVX2__ + kptr1 += kernel_w * 2; + kptr2 += kernel_w * 2; + kptr3 += kernel_w * 2; +#endif // !__AVX2__ } for (; p < inh; p++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - const float* k2 = kptr2 + p * kernel_w; - const float* k3 = kptr3 + p * kernel_w; - for (int k = 0; k < kernel_w; k++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; + const float* k0 = kptr0 + k; +#if __AVX2__ + __m128 _k0 = _mm_i32gather_ps(k0, _vindex, sizeof(float)); + _mm_store_ps(g00, _k0); + g00 += 4; +#else // __AVX2__ + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; g00 += 4; +#endif // __AVX2__ } } } @@ -760,6 +816,17 @@ static 
void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel float* g00 = kernel_tm.channel(q / 2); #endif +#if __AVX2__ + __m128i _vindex = _mm_setr_epi32(0, 1, 2, 3); + _vindex = _mm_mullo_epi32(_vindex, _mm_set1_epi32(kernel_w)); + __m256i _vindex_256 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + _vindex_256 = _mm256_mullo_epi32(_vindex_256, _mm256_set1_epi32(kernel_w)); +#if __AVX512F__ + __m512i _vindex_512 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex_512 = _mm512_mullo_epi32(_vindex_512, _mm512_set1_epi32(kernel_w)); +#endif // __AVX512F__ +#endif // __AVX2__ + int p = 0; #if __SSE2__ #if __AVX__ @@ -768,52 +835,34 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w + k; - const float* k1 = kptr1 + p * kernel_w + k; + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; - g00[0] = k0[0]; - g00[1] = k0[kernel_w]; - g00[2] = k0[kernel_w * 2]; - g00[3] = k0[kernel_w * 3]; - g00[4] = k0[kernel_w * 4]; - g00[5] = k0[kernel_w * 5]; - g00[6] = k0[kernel_w * 6]; - g00[7] = k0[kernel_w * 7]; - g00[8] = k0[kernel_w * 8]; - g00[9] = k0[kernel_w * 9]; - g00[10] = k0[kernel_w * 10]; - g00[11] = k0[kernel_w * 11]; - g00[12] = k0[kernel_w * 12]; - g00[13] = k0[kernel_w * 13]; - g00[14] = k0[kernel_w * 14]; - g00[15] = k0[kernel_w * 15]; - g00[16] = k1[0]; - g00[17] = k1[kernel_w]; - g00[18] = k1[kernel_w * 2]; - g00[19] = k1[kernel_w * 3]; - g00[20] = k1[kernel_w * 4]; - g00[21] = k1[kernel_w * 5]; - g00[22] = k1[kernel_w * 6]; - g00[23] = k1[kernel_w * 7]; - g00[24] = k1[kernel_w * 8]; - g00[25] = k1[kernel_w * 9]; - g00[26] = k1[kernel_w * 10]; - g00[27] = k1[kernel_w * 11]; - g00[28] = k1[kernel_w * 12]; - g00[29] = k1[kernel_w * 13]; - g00[30] = k1[kernel_w * 14]; - g00[31] = k1[kernel_w * 15]; + __m512 _k0 = _mm512_i32gather_ps(_vindex_512, k0, sizeof(float)); + __m512 _k1 = _mm512_i32gather_ps(_vindex_512, k1, sizeof(float)); + _mm512_storeu_ps(g00, _k0); + _mm512_storeu_ps(g00 + 16, _k1); g00 += 32; } + + kptr0 += kernel_w * 16; + kptr1 += kernel_w * 16; } #endif // __AVX512F__ for (; p + 7 < inh; p += 8) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w + k; - const float* k1 = kptr1 + p * kernel_w + k; + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; +#if __AVX2__ + __m256 _k0 = _mm256_i32gather_ps(k0, _vindex_256, sizeof(float)); + __m256 _k1 = _mm256_i32gather_ps(k1, _vindex_256, sizeof(float)); + _mm256_storeu_ps(g00, _k0); + _mm256_storeu_ps(g00 + 8, _k1); + g00 += 16; +#else // __AVX2__ g00[0] = k0[0]; g00[1] = k0[kernel_w]; g00[2] = k0[kernel_w * 2]; @@ -831,16 +880,27 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel g00[14] = k1[kernel_w * 6]; g00[15] = k1[kernel_w * 7]; g00 += 16; +#endif // __AVX2__ } + + kptr0 += kernel_w * 8; + kptr1 += kernel_w * 8; } #endif // __AVX__ for (; p + 3 < inh; p += 4) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w + k; - const float* k1 = kptr1 + p * kernel_w + k; + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; +#if __AVX2__ + __m128 _k0 = _mm_i32gather_ps(k0, _vindex, sizeof(float)); + __m128 _k1 = _mm_i32gather_ps(k1, _vindex, sizeof(float)); + _mm_storeu_ps(g00, _k0); + _mm_storeu_ps(g00 + 4, _k1); + g00 += 8; +#else // __AVX2__ g00[0] = k0[0]; g00[1] = k0[kernel_w]; g00[2] = k0[kernel_w * 2]; @@ -850,35 +910,42 @@ static void 
convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel g00[6] = k1[kernel_w * 2]; g00[7] = k1[kernel_w * 3]; g00 += 8; +#endif // __AVX2__ } + + kptr0 += kernel_w * 4; + kptr1 += kernel_w * 4; } #endif // __SSE2__ for (; p + 1 < inh; p += 2) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; for (int i = 0; i < 2; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; + g00[0] = k0[0]; + g00[1] = k1[0]; k0 += kernel_w; k1 += kernel_w; g00 += 2; } } + + kptr0 += kernel_w * 2; + kptr1 += kernel_w * 2; } for (; p < inh; p++) { - const float* k0 = kptr0 + p * kernel_w; - const float* k1 = kptr1 + p * kernel_w; - for (int k = 0; k < kernel_w; k++) { - g00[0] = k0[k]; - g00[1] = k1[k]; + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; g00 += 2; } } @@ -897,6 +964,17 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel float* g00 = kernel_tm.channel(q / 2 + q % 2); #endif +#if __AVX2__ + __m128i _vindex = _mm_setr_epi32(0, 1, 2, 3); + _vindex = _mm_mullo_epi32(_vindex, _mm_set1_epi32(kernel_w)); + __m256i _vindex_256 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + _vindex_256 = _mm256_mullo_epi32(_vindex_256, _mm256_set1_epi32(kernel_w)); +#if __AVX512F__ + __m512i _vindex_512 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex_512 = _mm512_mullo_epi32(_vindex_512, _mm512_set1_epi32(kernel_w)); +#endif // __AVX512F__ +#endif // __AVX2__ + int p = 0; #if __SSE2__ #if __AVX__ @@ -905,68 +983,84 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr + p * kernel_w; + const float* k0 = kptr + k; - for (int i = 0; i < 16; i++) - { - g00[0] = k0[k]; - k0 += kernel_w; - g00 += 1; - } + __m512 _k0 = _mm512_i32gather_ps(_vindex_512, k0, sizeof(float)); + _mm512_storeu_ps(g00, _k0); + g00 += 16; } + + kptr += kernel_w * 16; } #endif // __AVX512F__ for (; p + 7 < inh; p += 8) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr + p * kernel_w; + const float* k0 = kptr + k; +#if __AVX2__ + __m256 _k0 = _mm256_i32gather_ps(k0, _vindex_256, sizeof(float)); + _mm256_storeu_ps(g00, _k0); + g00 += 8; +#else // __AVX2__ for (int i = 0; i < 8; i++) { - g00[0] = k0[k]; + g00[0] = k0[0]; k0 += kernel_w; g00 += 1; } +#endif // __AVX2__ } + + kptr += kernel_w * 8; } #endif // __AVX__ for (; p + 3 < inh; p += 4) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr + p * kernel_w; + const float* k0 = kptr + k; +#if __AVX2__ + __m128 _k0 = _mm_i32gather_ps(k0, _vindex, sizeof(float)); + _mm_storeu_ps(g00, _k0); + g00 += 4; +#else // __AVX2__ for (int i = 0; i < 4; i++) { - g00[0] = k0[k]; + g00[0] = k0[0]; k0 += kernel_w; g00 += 1; } +#endif // __AVX2__ } + + kptr += kernel_w * 4; } #endif // __SSE2__ for (; p + 1 < inh; p += 2) { for (int k = 0; k < kernel_w; k++) { - const float* k0 = kptr + p * kernel_w; + const float* k0 = kptr + k; for (int i = 0; i < 2; i++) { - g00[0] = k0[k]; + g00[0] = k0[0]; k0 += kernel_w; g00 += 1; } } + + kptr += kernel_w * 2; } for (; p < inh; p++) { - const float* k0 = kptr + p * kernel_w; - for (int k = 0; k < kernel_w; k++) { - g00[0] = k0[k]; + const float* k0 = kptr + k; + g00[0] = k0[0]; g00++; } } diff --git a/src/layer/x86/convolution_packed.h b/src/layer/x86/convolution_packed.h index 6557f77f204..97ba6583443 100644 --- 
a/src/layer/x86/convolution_packed.h +++ b/src/layer/x86/convolution_packed.h @@ -93,10 +93,11 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t #endif // __AVX__ if (inch >= 4) kernel_tm.create(2 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2); - else if (inch >= 2) - kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2); else #endif // __SSE2__ + if (inch >= 2) + kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2); + else kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2); } else @@ -114,10 +115,11 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t #endif // __AVX__ if (inch >= 4) kernel_tm.create(4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch); - else if (inch >= 2) - kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch); else #endif // __SSE2__ + if (inch >= 2) + kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch); + else kernel_tm.create(maxk, inch, outch); } // *INDENT-ON* @@ -148,284 +150,149 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t float* g00 = kernel_tm.channel(q / 16); + __m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(maxk)); + int p = 0; -#if __AVX__ -#if __AVX512F__ for (; p + 15 < inch; p += 16) { for (int k = 0; k < maxk; k++) { - const float* k0 = kptr0 + p * maxk; - const float* k1 = kptr1 + p * maxk; - const float* k2 = kptr2 + p * maxk; - const float* k3 = kptr3 + p * maxk; - const float* k4 = kptr4 + p * maxk; - const float* k5 = kptr5 + p * maxk; - const float* k6 = kptr6 + p * maxk; - const float* k7 = kptr7 + p * maxk; - const float* k8 = kptr8 + p * maxk; - const float* k9 = kptr9 + p * maxk; - const float* ka = kptra + p * maxk; - const float* kb = kptrb + p * maxk; - const float* kc = kptrc + p * maxk; - const float* kd = kptrd + p * maxk; - const float* ke = kptre + p * maxk; - const float* kf = kptrf + p * maxk; - - for (int i = 0; i < 16; i++) - { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - g00[8] = k8[k]; - g00[9] = k9[k]; - g00[10] = ka[k]; - g00[11] = kb[k]; - g00[12] = kc[k]; - g00[13] = kd[k]; - g00[14] = ke[k]; - g00[15] = kf[k]; - k0 += maxk; - k1 += maxk; - k2 += maxk; - k3 += maxk; - k4 += maxk; - k5 += maxk; - k6 += maxk; - k7 += maxk; - k8 += maxk; - k9 += maxk; - ka += maxk; - kb += maxk; - kc += maxk; - kd += maxk; - ke += maxk; - kf += maxk; - g00 += 16; - } + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; + const float* k4 = kptr4 + k; + const float* k5 = kptr5 + k; + const float* k6 = kptr6 + k; + const float* k7 = kptr7 + k; + const float* k8 = kptr8 + k; + const float* k9 = kptr9 + k; + const float* ka = kptra + k; + const float* kb = kptrb + k; + const float* kc = kptrc + k; + const float* kd = kptrd + k; + const float* ke = kptre + k; + const float* kf = kptrf + k; + + __m512 _k0 = _mm512_i32gather_ps(_vindex, k0, sizeof(float)); + __m512 _k1 = _mm512_i32gather_ps(_vindex, k1, sizeof(float)); + __m512 _k2 = _mm512_i32gather_ps(_vindex, k2, sizeof(float)); + __m512 _k3 = _mm512_i32gather_ps(_vindex, k3, sizeof(float)); + __m512 _k4 = _mm512_i32gather_ps(_vindex, k4, sizeof(float)); + __m512 _k5 = _mm512_i32gather_ps(_vindex, k5, sizeof(float)); + __m512 _k6 = 
_mm512_i32gather_ps(_vindex, k6, sizeof(float)); + __m512 _k7 = _mm512_i32gather_ps(_vindex, k7, sizeof(float)); + __m512 _k8 = _mm512_i32gather_ps(_vindex, k8, sizeof(float)); + __m512 _k9 = _mm512_i32gather_ps(_vindex, k9, sizeof(float)); + __m512 _ka = _mm512_i32gather_ps(_vindex, ka, sizeof(float)); + __m512 _kb = _mm512_i32gather_ps(_vindex, kb, sizeof(float)); + __m512 _kc = _mm512_i32gather_ps(_vindex, kc, sizeof(float)); + __m512 _kd = _mm512_i32gather_ps(_vindex, kd, sizeof(float)); + __m512 _ke = _mm512_i32gather_ps(_vindex, ke, sizeof(float)); + __m512 _kf = _mm512_i32gather_ps(_vindex, kf, sizeof(float)); + + transpose16x16_ps(_k0, _k1, _k2, _k3, _k4, _k5, _k6, _k7, _k8, _k9, _ka, _kb, _kc, _kd, _ke, _kf); + + _mm512_store_ps(g00, _k0); + _mm512_store_ps(g00 + 16, _k1); + _mm512_store_ps(g00 + 16 * 2, _k2); + _mm512_store_ps(g00 + 16 * 3, _k3); + _mm512_store_ps(g00 + 16 * 4, _k4); + _mm512_store_ps(g00 + 16 * 5, _k5); + _mm512_store_ps(g00 + 16 * 6, _k6); + _mm512_store_ps(g00 + 16 * 7, _k7); + _mm512_store_ps(g00 + 16 * 8, _k8); + _mm512_store_ps(g00 + 16 * 9, _k9); + _mm512_store_ps(g00 + 16 * 10, _ka); + _mm512_store_ps(g00 + 16 * 11, _kb); + _mm512_store_ps(g00 + 16 * 12, _kc); + _mm512_store_ps(g00 + 16 * 13, _kd); + _mm512_store_ps(g00 + 16 * 14, _ke); + _mm512_store_ps(g00 + 16 * 15, _kf); + + g00 += 256; } + + kptr0 += maxk * 16; + kptr1 += maxk * 16; + kptr2 += maxk * 16; + kptr3 += maxk * 16; + kptr4 += maxk * 16; + kptr5 += maxk * 16; + kptr6 += maxk * 16; + kptr7 += maxk * 16; + kptr8 += maxk * 16; + kptr9 += maxk * 16; + kptra += maxk * 16; + kptrb += maxk * 16; + kptrc += maxk * 16; + kptrd += maxk * 16; + kptre += maxk * 16; + kptrf += maxk * 16; } -#endif // __AVX512F__ + + _vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(inch)); + for (; p + 7 < inch; p += 8) { for (int k = 0; k < maxk; k++) { - const float* k0 = kptr0 + p * maxk; - const float* k1 = kptr1 + p * maxk; - const float* k2 = kptr2 + p * maxk; - const float* k3 = kptr3 + p * maxk; - const float* k4 = kptr4 + p * maxk; - const float* k5 = kptr5 + p * maxk; - const float* k6 = kptr6 + p * maxk; - const float* k7 = kptr7 + p * maxk; - const float* k8 = kptr8 + p * maxk; - const float* k9 = kptr9 + p * maxk; - const float* ka = kptra + p * maxk; - const float* kb = kptrb + p * maxk; - const float* kc = kptrc + p * maxk; - const float* kd = kptrd + p * maxk; - const float* ke = kptre + p * maxk; - const float* kf = kptrf + p * maxk; + const float* k0 = kptr0 + k; for (int i = 0; i < 8; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - g00[8] = k8[k]; - g00[9] = k9[k]; - g00[10] = ka[k]; - g00[11] = kb[k]; - g00[12] = kc[k]; - g00[13] = kd[k]; - g00[14] = ke[k]; - g00[15] = kf[k]; + __m512 _k0 = _mm512_i32gather_ps(_vindex, k0, sizeof(float)); + _mm512_store_ps(g00, _k0); k0 += maxk; - k1 += maxk; - k2 += maxk; - k3 += maxk; - k4 += maxk; - k5 += maxk; - k6 += maxk; - k7 += maxk; - k8 += maxk; - k9 += maxk; - ka += maxk; - kb += maxk; - kc += maxk; - kd += maxk; - ke += maxk; - kf += maxk; g00 += 16; } } + + kptr0 += maxk * 8; } -#endif // __AVX__ for (; p + 3 < inch; p += 4) { for (int k = 0; k < maxk; k++) { - const float* k0 = kptr0 + p * maxk; - const float* k1 = kptr1 + p * maxk; - const float* k2 = kptr2 + p * maxk; - const float* k3 = kptr3 + p * maxk; - const float* k4 = kptr4 + p * maxk; - const float* k5 = kptr5 + p * maxk; - const float* k6 = kptr6 + p * maxk; - const float* k7 = kptr7 
+ p * maxk; - const float* k8 = kptr8 + p * maxk; - const float* k9 = kptr9 + p * maxk; - const float* ka = kptra + p * maxk; - const float* kb = kptrb + p * maxk; - const float* kc = kptrc + p * maxk; - const float* kd = kptrd + p * maxk; - const float* ke = kptre + p * maxk; - const float* kf = kptrf + p * maxk; + const float* k0 = kptr0 + k; for (int i = 0; i < 4; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - g00[8] = k8[k]; - g00[9] = k9[k]; - g00[10] = ka[k]; - g00[11] = kb[k]; - g00[12] = kc[k]; - g00[13] = kd[k]; - g00[14] = ke[k]; - g00[15] = kf[k]; + __m512 _k0 = _mm512_i32gather_ps(_vindex, k0, sizeof(float)); + _mm512_store_ps(g00, _k0); k0 += maxk; - k1 += maxk; - k2 += maxk; - k3 += maxk; - k4 += maxk; - k5 += maxk; - k6 += maxk; - k7 += maxk; - k8 += maxk; - k9 += maxk; - ka += maxk; - kb += maxk; - kc += maxk; - kd += maxk; - ke += maxk; - kf += maxk; g00 += 16; } } + + kptr0 += maxk * 4; } for (; p + 1 < inch; p += 2) { for (int k = 0; k < maxk; k++) { - const float* k0 = kptr0 + p * maxk; - const float* k1 = kptr1 + p * maxk; - const float* k2 = kptr2 + p * maxk; - const float* k3 = kptr3 + p * maxk; - const float* k4 = kptr4 + p * maxk; - const float* k5 = kptr5 + p * maxk; - const float* k6 = kptr6 + p * maxk; - const float* k7 = kptr7 + p * maxk; - const float* k8 = kptr8 + p * maxk; - const float* k9 = kptr9 + p * maxk; - const float* ka = kptra + p * maxk; - const float* kb = kptrb + p * maxk; - const float* kc = kptrc + p * maxk; - const float* kd = kptrd + p * maxk; - const float* ke = kptre + p * maxk; - const float* kf = kptrf + p * maxk; + const float* k0 = kptr0 + k; for (int i = 0; i < 2; i++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - g00[8] = k8[k]; - g00[9] = k9[k]; - g00[10] = ka[k]; - g00[11] = kb[k]; - g00[12] = kc[k]; - g00[13] = kd[k]; - g00[14] = ke[k]; - g00[15] = kf[k]; + __m512 _k0 = _mm512_i32gather_ps(_vindex, k0, sizeof(float)); + _mm512_store_ps(g00, _k0); k0 += maxk; - k1 += maxk; - k2 += maxk; - k3 += maxk; - k4 += maxk; - k5 += maxk; - k6 += maxk; - k7 += maxk; - k8 += maxk; - k9 += maxk; - ka += maxk; - kb += maxk; - kc += maxk; - kd += maxk; - ke += maxk; - kf += maxk; g00 += 16; } } + + kptr0 += maxk * 2; } for (; p < inch; p++) { - const float* k0 = kptr0 + p * maxk; - const float* k1 = kptr1 + p * maxk; - const float* k2 = kptr2 + p * maxk; - const float* k3 = kptr3 + p * maxk; - const float* k4 = kptr4 + p * maxk; - const float* k5 = kptr5 + p * maxk; - const float* k6 = kptr6 + p * maxk; - const float* k7 = kptr7 + p * maxk; - const float* k8 = kptr8 + p * maxk; - const float* k9 = kptr9 + p * maxk; - const float* ka = kptra + p * maxk; - const float* kb = kptrb + p * maxk; - const float* kc = kptrc + p * maxk; - const float* kd = kptrd + p * maxk; - const float* ke = kptre + p * maxk; - const float* kf = kptrf + p * maxk; - for (int k = 0; k < maxk; k++) { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - g00[8] = k8[k]; - g00[9] = k9[k]; - g00[10] = ka[k]; - g00[11] = kb[k]; - g00[12] = kc[k]; - g00[13] = kd[k]; - g00[14] = ke[k]; - g00[15] = kf[k]; + const float* k0 = kptr0 + k; + + __m512 _k0 = _mm512_i32gather_ps(_vindex, k0, sizeof(float)); + _mm512_store_ps(g00, _k0); g00 += 16; } } @@ -448,67 +315,109 @@ static void 
convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t float* g00 = kernel_tm.channel(q / 8); #endif +#if __AVX2__ + __m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + _vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(maxk)); +#if __AVX512F__ + __m512i _vindex_512 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex_512 = _mm512_mullo_epi32(_vindex_512, _mm512_set1_epi32(maxk)); +#endif // __AVX512F__ +#endif // __AVX2__ + int p = 0; #if __AVX512F__ for (; p + 15 < inch; p += 16) { for (int k = 0; k < maxk; k++) { - const float* k0 = kptr0 + p * maxk; - const float* k1 = kptr1 + p * maxk; - const float* k2 = kptr2 + p * maxk; - const float* k3 = kptr3 + p * maxk; - const float* k4 = kptr4 + p * maxk; - const float* k5 = kptr5 + p * maxk; - const float* k6 = kptr6 + p * maxk; - const float* k7 = kptr7 + p * maxk; - - for (int i = 0; i < 16; i++) - { - g00[0] = k0[k]; - g00[1] = k1[k]; - g00[2] = k2[k]; - g00[3] = k3[k]; - g00[4] = k4[k]; - g00[5] = k5[k]; - g00[6] = k6[k]; - g00[7] = k7[k]; - k0 += maxk; - k1 += maxk; - k2 += maxk; - k3 += maxk; - k4 += maxk; - k5 += maxk; - k6 += maxk; - k7 += maxk; - g00 += 8; - } + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; + const float* k4 = kptr4 + k; + const float* k5 = kptr5 + k; + const float* k6 = kptr6 + k; + const float* k7 = kptr7 + k; + + __m512 _k0 = _mm512_i32gather_ps(_vindex_512, k0, sizeof(float)); + __m512 _k1 = _mm512_i32gather_ps(_vindex_512, k1, sizeof(float)); + __m512 _k2 = _mm512_i32gather_ps(_vindex_512, k2, sizeof(float)); + __m512 _k3 = _mm512_i32gather_ps(_vindex_512, k3, sizeof(float)); + __m512 _k4 = _mm512_i32gather_ps(_vindex_512, k4, sizeof(float)); + __m512 _k5 = _mm512_i32gather_ps(_vindex_512, k5, sizeof(float)); + __m512 _k6 = _mm512_i32gather_ps(_vindex_512, k6, sizeof(float)); + __m512 _k7 = _mm512_i32gather_ps(_vindex_512, k7, sizeof(float)); + + transpose16x8_ps(_k0, _k1, _k2, _k3, _k4, _k5, _k6, _k7); + + _mm512_storeu_ps(g00, _k0); + _mm512_storeu_ps(g00 + 16, _k1); + _mm512_storeu_ps(g00 + 16 * 2, _k2); + _mm512_storeu_ps(g00 + 16 * 3, _k3); + _mm512_storeu_ps(g00 + 16 * 4, _k4); + _mm512_storeu_ps(g00 + 16 * 5, _k5); + _mm512_storeu_ps(g00 + 16 * 6, _k6); + _mm512_storeu_ps(g00 + 16 * 7, _k7); + + g00 += 128; } + + kptr0 += maxk * 16; + kptr1 += maxk * 16; + kptr2 += maxk * 16; + kptr3 += maxk * 16; + kptr4 += maxk * 16; + kptr5 += maxk * 16; + kptr6 += maxk * 16; + kptr7 += maxk * 16; } #endif // __AVX512F__ for (; p + 7 < inch; p += 8) { for (int k = 0; k < maxk; k++) { - const float* k0 = kptr0 + p * maxk; - const float* k1 = kptr1 + p * maxk; - const float* k2 = kptr2 + p * maxk; - const float* k3 = kptr3 + p * maxk; - const float* k4 = kptr4 + p * maxk; - const float* k5 = kptr5 + p * maxk; - const float* k6 = kptr6 + p * maxk; - const float* k7 = kptr7 + p * maxk; - + const float* k0 = kptr0 + k; + const float* k1 = kptr1 + k; + const float* k2 = kptr2 + k; + const float* k3 = kptr3 + k; + const float* k4 = kptr4 + k; + const float* k5 = kptr5 + k; + const float* k6 = kptr6 + k; + const float* k7 = kptr7 + k; + +#if __AVX2__ + __m256 _k0 = _mm256_i32gather_ps(k0, _vindex, sizeof(float)); + __m256 _k1 = _mm256_i32gather_ps(k1, _vindex, sizeof(float)); + __m256 _k2 = _mm256_i32gather_ps(k2, _vindex, sizeof(float)); + __m256 _k3 = _mm256_i32gather_ps(k3, _vindex, sizeof(float)); + __m256 _k4 = _mm256_i32gather_ps(k4, _vindex, sizeof(float)); + __m256 _k5 = 
_mm256_i32gather_ps(k5, _vindex, sizeof(float));
+                __m256 _k6 = _mm256_i32gather_ps(k6, _vindex, sizeof(float));
+                __m256 _k7 = _mm256_i32gather_ps(k7, _vindex, sizeof(float));
+
+                transpose8x8_ps(_k0, _k1, _k2, _k3, _k4, _k5, _k6, _k7);
+
+                _mm256_store_ps(g00, _k0);
+                _mm256_store_ps(g00 + 8, _k1);
+                _mm256_store_ps(g00 + 8 * 2, _k2);
+                _mm256_store_ps(g00 + 8 * 3, _k3);
+                _mm256_store_ps(g00 + 8 * 4, _k4);
+                _mm256_store_ps(g00 + 8 * 5, _k5);
+                _mm256_store_ps(g00 + 8 * 6, _k6);
+                _mm256_store_ps(g00 + 8 * 7, _k7);
+
+                g00 += 64;
+#else // __AVX2__
                 for (int i = 0; i < 8; i++)
                 {
-                    g00[0] = k0[k];
-                    g00[1] = k1[k];
-                    g00[2] = k2[k];
-                    g00[3] = k3[k];
-                    g00[4] = k4[k];
-                    g00[5] = k5[k];
-                    g00[6] = k6[k];
-                    g00[7] = k7[k];
+                    g00[0] = k0[0];
+                    g00[1] = k1[0];
+                    g00[2] = k2[0];
+                    g00[3] = k3[0];
+                    g00[4] = k4[0];
+                    g00[5] = k5[0];
+                    g00[6] = k6[0];
+                    g00[7] = k7[0];
                     k0 += maxk;
                     k1 += maxk;
                     k2 += maxk;
@@ -519,31 +428,54 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
                     k7 += maxk;
                     g00 += 8;
                 }
+#endif // __AVX2__
             }
+
+            kptr0 += maxk * 8;
+            kptr1 += maxk * 8;
+            kptr2 += maxk * 8;
+            kptr3 += maxk * 8;
+            kptr4 += maxk * 8;
+            kptr5 += maxk * 8;
+            kptr6 += maxk * 8;
+            kptr7 += maxk * 8;
         }
+
+#if __AVX2__
+        _vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(inch));
+#endif // __AVX2__
+
         for (; p + 3 < inch; p += 4)
         {
             for (int k = 0; k < maxk; k++)
             {
-                const float* k0 = kptr0 + p * maxk;
-                const float* k1 = kptr1 + p * maxk;
-                const float* k2 = kptr2 + p * maxk;
-                const float* k3 = kptr3 + p * maxk;
-                const float* k4 = kptr4 + p * maxk;
-                const float* k5 = kptr5 + p * maxk;
-                const float* k6 = kptr6 + p * maxk;
-                const float* k7 = kptr7 + p * maxk;
+                const float* k0 = kptr0 + k;
+#if !__AVX2__
+                const float* k1 = kptr1 + k;
+                const float* k2 = kptr2 + k;
+                const float* k3 = kptr3 + k;
+                const float* k4 = kptr4 + k;
+                const float* k5 = kptr5 + k;
+                const float* k6 = kptr6 + k;
+                const float* k7 = kptr7 + k;
+#endif // !__AVX2__
 
                 for (int i = 0; i < 4; i++)
                 {
-                    g00[0] = k0[k];
-                    g00[1] = k1[k];
-                    g00[2] = k2[k];
-                    g00[3] = k3[k];
-                    g00[4] = k4[k];
-                    g00[5] = k5[k];
-                    g00[6] = k6[k];
-                    g00[7] = k7[k];
+#if __AVX2__
+                    __m256 _k0 = _mm256_i32gather_ps(k0, _vindex, sizeof(float));
+                    _mm256_store_ps(g00, _k0);
+                    k0 += maxk;
+                    g00 += 8;
+#else // __AVX2__
+                    g00[0] = k0[0];
+                    g00[1] = k1[0];
+                    g00[2] = k2[0];
+                    g00[3] = k3[0];
+                    g00[4] = k4[0];
+                    g00[5] = k5[0];
+                    g00[6] = k6[0];
+                    g00[7] = k7[0];
                     k0 += maxk;
                     k1 += maxk;
                     k2 += maxk;
@@ -553,32 +485,52 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
                     k6 += maxk;
                    k7 += maxk;
                     g00 += 8;
+#endif // __AVX2__
                 }
             }
+
+            kptr0 += maxk * 4;
+#if !__AVX2__
+            kptr1 += maxk * 4;
+            kptr2 += maxk * 4;
+            kptr3 += maxk * 4;
+            kptr4 += maxk * 4;
+            kptr5 += maxk * 4;
+            kptr6 += maxk * 4;
+            kptr7 += maxk * 4;
+#endif // !__AVX2__
         }
         for (; p + 1 < inch; p += 2)
         {
             for (int k = 0; k < maxk; k++)
             {
-                const float* k0 = kptr0 + p * maxk;
-                const float* k1 = kptr1 + p * maxk;
-                const float* k2 = kptr2 + p * maxk;
-                const float* k3 = kptr3 + p * maxk;
-                const float* k4 = kptr4 + p * maxk;
-                const float* k5 = kptr5 + p * maxk;
-                const float* k6 = kptr6 + p * maxk;
-                const float* k7 = kptr7 + p * maxk;
+                const float* k0 = kptr0 + k;
+#if !__AVX2__
+                const float* k1 = kptr1 + k;
+                const float* k2 = kptr2 + k;
+                const float* k3 = kptr3 + k;
+                const float* k4 = kptr4 + k;
+                const float* k5 = kptr5 + k;
+                const float* k6 = kptr6 + k;
+                const float* k7 = kptr7 + k;
+#endif // !__AVX2__
 
                 for (int i = 0; i < 2; i++)
                 {
-                    g00[0] = k0[k];
-                    g00[1] = k1[k];
-                    g00[2] = k2[k];
-                    g00[3] = k3[k];
-                    g00[4] = k4[k];
-                    g00[5] = k5[k];
-                    g00[6] = k6[k];
-                    g00[7] = k7[k];
+#if __AVX2__
+                    __m256 _k0 = _mm256_i32gather_ps(k0, _vindex, sizeof(float));
+                    _mm256_store_ps(g00, _k0);
+                    k0 += maxk;
+                    g00 += 8;
+#else // __AVX2__
+                    g00[0] = k0[0];
+                    g00[1] = k1[0];
+                    g00[2] = k2[0];
+                    g00[3] = k3[0];
+                    g00[4] = k4[0];
+                    g00[5] = k5[0];
+                    g00[6] = k6[0];
+                    g00[7] = k7[0];
                     k0 += maxk;
                     k1 += maxk;
                     k2 += maxk;
@@ -588,31 +540,49 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
                     k6 += maxk;
                     k7 += maxk;
                     g00 += 8;
+#endif // __AVX2__
                 }
             }
+
+            kptr0 += maxk * 2;
+#if !__AVX2__
+            kptr1 += maxk * 2;
+            kptr2 += maxk * 2;
+            kptr3 += maxk * 2;
+            kptr4 += maxk * 2;
+            kptr5 += maxk * 2;
+            kptr6 += maxk * 2;
+            kptr7 += maxk * 2;
+#endif // !__AVX2__
         }
         for (; p < inch; p++)
        {
-            const float* k0 = kptr0 + p * maxk;
-            const float* k1 = kptr1 + p * maxk;
-            const float* k2 = kptr2 + p * maxk;
-            const float* k3 = kptr3 + p * maxk;
-            const float* k4 = kptr4 + p * maxk;
-            const float* k5 = kptr5 + p * maxk;
-            const float* k6 = kptr6 + p * maxk;
-            const float* k7 = kptr7 + p * maxk;
-
             for (int k = 0; k < maxk; k++)
             {
-                g00[0] = k0[k];
-                g00[1] = k1[k];
-                g00[2] = k2[k];
-                g00[3] = k3[k];
-                g00[4] = k4[k];
-                g00[5] = k5[k];
-                g00[6] = k6[k];
-                g00[7] = k7[k];
+                const float* k0 = kptr0 + k;
+#if __AVX2__
+                __m256 _k0 = _mm256_i32gather_ps(k0, _vindex, sizeof(float));
+                _mm256_store_ps(g00, _k0);
+                g00 += 8;
+#else // __AVX2__
+                const float* k1 = kptr1 + k;
+                const float* k2 = kptr2 + k;
+                const float* k3 = kptr3 + k;
+                const float* k4 = kptr4 + k;
+                const float* k5 = kptr5 + k;
+                const float* k6 = kptr6 + k;
+                const float* k7 = kptr7 + k;
+
+                g00[0] = k0[0];
+                g00[1] = k1[0];
+                g00[2] = k2[0];
+                g00[3] = k3[0];
+                g00[4] = k4[0];
+                g00[5] = k5[0];
+                g00[6] = k6[0];
+                g00[7] = k7[0];
                 g00 += 8;
+#endif // __AVX2__
             }
         }
     }
@@ -632,6 +602,17 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
         float* g00 = kernel_tm.channel(q / 4);
 #endif
 
+#if __AVX2__
+        __m128i _vindex = _mm_setr_epi32(0, 1, 2, 3);
+        _vindex = _mm_mullo_epi32(_vindex, _mm_set1_epi32(maxk));
+        __m256i _vindex_256 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+        _vindex_256 = _mm256_mullo_epi32(_vindex_256, _mm256_set1_epi32(maxk));
+#if __AVX512F__
+        __m512i _vindex_512 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        _vindex_512 = _mm512_mullo_epi32(_vindex_512, _mm512_set1_epi32(maxk));
+#endif // __AVX512F__
+#endif // __AVX2__
+
         int p = 0;
 #if __AVX__
 #if __AVX512F__
@@ -639,110 +620,185 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
         {
             for (int k = 0; k < maxk; k++)
             {
-                const float* k0 = kptr0 + p * maxk;
-                const float* k1 = kptr1 + p * maxk;
-                const float* k2 = kptr2 + p * maxk;
-                const float* k3 = kptr3 + p * maxk;
+                const float* k0 = kptr0 + k;
+                const float* k1 = kptr1 + k;
+                const float* k2 = kptr2 + k;
+                const float* k3 = kptr3 + k;
 
-                for (int i = 0; i < 16; i++)
-                {
-                    g00[0] = k0[k];
-                    g00[1] = k1[k];
-                    g00[2] = k2[k];
-                    g00[3] = k3[k];
-                    k0 += maxk;
-                    k1 += maxk;
-                    k2 += maxk;
-                    k3 += maxk;
-                    g00 += 4;
-                }
+                __m512 _k0 = _mm512_i32gather_ps(_vindex_512, k0, sizeof(float));
+                __m512 _k1 = _mm512_i32gather_ps(_vindex_512, k1, sizeof(float));
+                __m512 _k2 = _mm512_i32gather_ps(_vindex_512, k2, sizeof(float));
+                __m512 _k3 = _mm512_i32gather_ps(_vindex_512, k3, sizeof(float));
+
+                transpose16x4_ps(_k0, _k1, _k2, _k3);
+
+                _mm512_storeu_ps(g00, _k0);
+                _mm512_storeu_ps(g00 + 16, _k1);
+                _mm512_storeu_ps(g00 + 16 * 2, _k2);
+                _mm512_storeu_ps(g00 + 16 * 3, _k3);
+
+                g00 += 64;
             }
+
+            kptr0 += maxk * 16;
+            kptr1 += maxk * 16;
+            kptr2 += maxk * 16;
+            kptr3 += maxk * 16;
         }
 #endif // __AVX512F__
         for (; p + 7 < inch; p += 8)
         {
             for (int k = 0; k < maxk; k++)
             {
-                const float* k0 = kptr0 + p * maxk;
-                const float* k1 = kptr1 + p * maxk;
-                const float* k2 = kptr2 + p * maxk;
-                const float* k3 = kptr3 + p * maxk;
+                const float* k0 = kptr0 + k;
+                const float* k1 = kptr1 + k;
+                const float* k2 = kptr2 + k;
+                const float* k3 = kptr3 + k;
+
+#if __AVX2__
+                __m256 _k0 = _mm256_i32gather_ps(k0, _vindex_256, sizeof(float));
+                __m256 _k1 = _mm256_i32gather_ps(k1, _vindex_256, sizeof(float));
+                __m256 _k2 = _mm256_i32gather_ps(k2, _vindex_256, sizeof(float));
+                __m256 _k3 = _mm256_i32gather_ps(k3, _vindex_256, sizeof(float));
+                transpose8x4_ps(_k0, _k1, _k2, _k3);
+
+                _mm256_storeu_ps(g00, _k0);
+                _mm256_storeu_ps(g00 + 8, _k1);
+                _mm256_storeu_ps(g00 + 8 * 2, _k2);
+                _mm256_storeu_ps(g00 + 8 * 3, _k3);
+
+                g00 += 32;
+#else // __AVX2__
                 for (int i = 0; i < 8; i++)
                 {
-                    g00[0] = k0[k];
-                    g00[1] = k1[k];
-                    g00[2] = k2[k];
-                    g00[3] = k3[k];
+                    g00[0] = k0[0];
+                    g00[1] = k1[0];
+                    g00[2] = k2[0];
+                    g00[3] = k3[0];
                     k0 += maxk;
                     k1 += maxk;
                     k2 += maxk;
                     k3 += maxk;
                     g00 += 4;
                 }
+#endif // __AVX2__
             }
+
+            kptr0 += maxk * 8;
+            kptr1 += maxk * 8;
+            kptr2 += maxk * 8;
+            kptr3 += maxk * 8;
         }
 #endif // __AVX__
         for (; p + 3 < inch; p += 4)
         {
             for (int k = 0; k < maxk; k++)
             {
-                const float* k0 = kptr0 + p * maxk;
-                const float* k1 = kptr1 + p * maxk;
-                const float* k2 = kptr2 + p * maxk;
-                const float* k3 = kptr3 + p * maxk;
+                const float* k0 = kptr0 + k;
+                const float* k1 = kptr1 + k;
+                const float* k2 = kptr2 + k;
+                const float* k3 = kptr3 + k;
+
+#if __AVX2__
+                __m128 _k0 = _mm_i32gather_ps(k0, _vindex, sizeof(float));
+                __m128 _k1 = _mm_i32gather_ps(k1, _vindex, sizeof(float));
+                __m128 _k2 = _mm_i32gather_ps(k2, _vindex, sizeof(float));
+                __m128 _k3 = _mm_i32gather_ps(k3, _vindex, sizeof(float));
+                _MM_TRANSPOSE4_PS(_k0, _k1, _k2, _k3);
+
+                _mm_store_ps(g00, _k0);
+                _mm_store_ps(g00 + 4, _k1);
+                _mm_store_ps(g00 + 4 * 2, _k2);
+                _mm_store_ps(g00 + 4 * 3, _k3);
+
+                g00 += 16;
+#else // __AVX2__
                 for (int i = 0; i < 4; i++)
                 {
-                    g00[0] = k0[k];
-                    g00[1] = k1[k];
-                    g00[2] = k2[k];
-                    g00[3] = k3[k];
+                    g00[0] = k0[0];
+                    g00[1] = k1[0];
+                    g00[2] = k2[0];
+                    g00[3] = k3[0];
                     k0 += maxk;
                     k1 += maxk;
                     k2 += maxk;
                     k3 += maxk;
                     g00 += 4;
                 }
+#endif // __AVX2__
             }
+
+            kptr0 += maxk * 4;
+            kptr1 += maxk * 4;
+            kptr2 += maxk * 4;
+            kptr3 += maxk * 4;
         }
+
+#if __AVX2__
+        _vindex = _mm_mullo_epi32(_vindex, _mm_set1_epi32(inch));
+#endif // __AVX2__
+
         for (; p + 1 < inch; p += 2)
         {
             for (int k = 0; k < maxk; k++)
             {
-                const float* k0 = kptr0 + p * maxk;
-                const float* k1 = kptr1 + p * maxk;
-                const float* k2 = kptr2 + p * maxk;
-                const float* k3 = kptr3 + p * maxk;
+                const float* k0 = kptr0 + k;
+#if !__AVX2__
+                const float* k1 = kptr1 + k;
+                const float* k2 = kptr2 + k;
+                const float* k3 = kptr3 + k;
+#endif // !__AVX2__
 
                 for (int i = 0; i < 2; i++)
                 {
-                    g00[0] = k0[k];
-                    g00[1] = k1[k];
-                    g00[2] = k2[k];
-                    g00[3] = k3[k];
+#if __AVX2__
+                    __m128 _k0 = _mm_i32gather_ps(k0, _vindex, sizeof(float));
+                    _mm_store_ps(g00, _k0);
+                    k0 += maxk;
+                    g00 += 4;
+#else // __AVX2__
+                    g00[0] = k0[0];
+                    g00[1] = k1[0];
+                    g00[2] = k2[0];
+                    g00[3] = k3[0];
                     k0 += maxk;
                     k1 += maxk;
                     k2 += maxk;
                     k3 += maxk;
                     g00 += 4;
+#endif // __AVX2__
                 }
             }
+
+            kptr0 += maxk * 2;
+#if !__AVX2__
+            kptr1 += maxk * 2;
+            kptr2 += maxk * 2;
+            kptr3 += maxk * 2;
+#endif // !__AVX2__
         }
         for (; p < inch; p++)
         {
-            const float* k0 = kptr0 + p * maxk;
-            const float* k1 = kptr1 + p * maxk;
-            const float* k2 = kptr2 + p * maxk;
-            const float* k3 = kptr3 + p * maxk;
-
             for (int k = 0; k < maxk; k++)
             {
-                g00[0] = k0[k];
-                g00[1] = k1[k];
-                g00[2] = k2[k];
-                g00[3] = k3[k];
+                const float* k0 = kptr0 + k;
+#if __AVX2__
+                __m128 _k0 = _mm_i32gather_ps(k0, _vindex, sizeof(float));
+                _mm_store_ps(g00, _k0);
                 g00 += 4;
+#else // __AVX2__
+                const float* k1 = kptr1 + k;
+                const float* k2 = kptr2 + k;
+                const float* k3 = kptr3 + k;
+
+                g00[0] = k0[0];
+                g00[1] = k1[0];
+                g00[2] = k2[0];
+                g00[3] = k3[0];
+                g00 += 4;
+#endif // __AVX2__
             }
         }
     }
@@ -762,6 +818,17 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
         float* g00 = kernel_tm.channel(q / 2);
 #endif
 
+#if __AVX2__
+        __m128i _vindex = _mm_setr_epi32(0, 1, 2, 3);
+        _vindex = _mm_mullo_epi32(_vindex, _mm_set1_epi32(maxk));
+        __m256i _vindex_256 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+        _vindex_256 = _mm256_mullo_epi32(_vindex_256, _mm256_set1_epi32(maxk));
+#if __AVX512F__
+        __m512i _vindex_512 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        _vindex_512 = _mm512_mullo_epi32(_vindex_512, _mm512_set1_epi32(maxk));
+#endif // __AVX512F__
+#endif // __AVX2__
+
         int p = 0;
 #if __SSE2__
 #if __AVX__
@@ -770,52 +837,34 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
         {
             for (int k = 0; k < maxk; k++)
            {
-                const float* k0 = kptr0 + p * maxk + k;
-                const float* k1 = kptr1 + p * maxk + k;
+                const float* k0 = kptr0 + k;
+                const float* k1 = kptr1 + k;
 
-                g00[0] = k0[0];
-                g00[1] = k0[maxk];
-                g00[2] = k0[maxk * 2];
-                g00[3] = k0[maxk * 3];
-                g00[4] = k0[maxk * 4];
-                g00[5] = k0[maxk * 5];
-                g00[6] = k0[maxk * 6];
-                g00[7] = k0[maxk * 7];
-                g00[8] = k0[maxk * 8];
-                g00[9] = k0[maxk * 9];
-                g00[10] = k0[maxk * 10];
-                g00[11] = k0[maxk * 11];
-                g00[12] = k0[maxk * 12];
-                g00[13] = k0[maxk * 13];
-                g00[14] = k0[maxk * 14];
-                g00[15] = k0[maxk * 15];
-                g00[16] = k1[0];
-                g00[17] = k1[maxk];
-                g00[18] = k1[maxk * 2];
-                g00[19] = k1[maxk * 3];
-                g00[20] = k1[maxk * 4];
-                g00[21] = k1[maxk * 5];
-                g00[22] = k1[maxk * 6];
-                g00[23] = k1[maxk * 7];
-                g00[24] = k1[maxk * 8];
-                g00[25] = k1[maxk * 9];
-                g00[26] = k1[maxk * 10];
-                g00[27] = k1[maxk * 11];
-                g00[28] = k1[maxk * 12];
-                g00[29] = k1[maxk * 13];
-                g00[30] = k1[maxk * 14];
-                g00[31] = k1[maxk * 15];
+                __m512 _k0 = _mm512_i32gather_ps(_vindex_512, k0, sizeof(float));
+                __m512 _k1 = _mm512_i32gather_ps(_vindex_512, k1, sizeof(float));
+                _mm512_storeu_ps(g00, _k0);
+                _mm512_storeu_ps(g00 + 16, _k1);
                 g00 += 32;
             }
+
+            kptr0 += maxk * 16;
+            kptr1 += maxk * 16;
         }
 #endif // __AVX512F__
         for (; p + 7 < inch; p += 8)
         {
             for (int k = 0; k < maxk; k++)
            {
-                const float* k0 = kptr0 + p * maxk + k;
-                const float* k1 = kptr1 + p * maxk + k;
-
+                const float* k0 = kptr0 + k;
+                const float* k1 = kptr1 + k;
+
+#if __AVX2__
+                __m256 _k0 = _mm256_i32gather_ps(k0, _vindex_256, sizeof(float));
+                __m256 _k1 = _mm256_i32gather_ps(k1, _vindex_256, sizeof(float));
+                _mm256_storeu_ps(g00, _k0);
+                _mm256_storeu_ps(g00 + 8, _k1);
+                g00 += 16;
+#else // __AVX2__
                 g00[0] = k0[0];
                 g00[1] = k0[maxk];
                 g00[2] = k0[maxk * 2];
@@ -833,16 +882,27 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
                 g00[14] = k1[maxk * 6];
                 g00[15] = k1[maxk * 7];
                 g00 += 16;
+#endif // __AVX2__
             }
+
+            kptr0 += maxk * 8;
+            kptr1 += maxk * 8;
         }
 #endif // __AVX__
         for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
-                const float* k0 = kptr0 + p * maxk + k;
-                const float* k1 = kptr1 + p * maxk + k;
-
+                const float* k0 = kptr0 + k;
+                const float* k1 = kptr1 + k;
+
+#if __AVX2__
+                __m128 _k0 = _mm_i32gather_ps(k0, _vindex, sizeof(float));
+                __m128 _k1 = _mm_i32gather_ps(k1, _vindex, sizeof(float));
+                _mm_storeu_ps(g00, _k0);
+                _mm_storeu_ps(g00 + 4, _k1);
+                g00 += 8;
+#else // __AVX2__
                 g00[0] = k0[0];
                 g00[1] = k0[maxk];
                 g00[2] = k0[maxk * 2];
@@ -852,35 +912,42 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
                 g00[6] = k1[maxk * 2];
                 g00[7] = k1[maxk * 3];
                 g00 += 8;
+#endif // __AVX2__
             }
+
+            kptr0 += maxk * 4;
+            kptr1 += maxk * 4;
         }
 #endif // __SSE2__
         for (; p + 1 < inch; p += 2)
         {
            for (int k = 0; k < maxk; k++)
            {
-                const float* k0 = kptr0 + p * maxk;
-                const float* k1 = kptr1 + p * maxk;
+                const float* k0 = kptr0 + k;
+                const float* k1 = kptr1 + k;
 
                 for (int i = 0; i < 2; i++)
                 {
-                    g00[0] = k0[k];
-                    g00[1] = k1[k];
+                    g00[0] = k0[0];
+                    g00[1] = k1[0];
                     k0 += maxk;
                     k1 += maxk;
                     g00 += 2;
                 }
             }
+
+            kptr0 += maxk * 2;
+            kptr1 += maxk * 2;
         }
         for (; p < inch; p++)
        {
-            const float* k0 = kptr0 + p * maxk;
-            const float* k1 = kptr1 + p * maxk;
-
             for (int k = 0; k < maxk; k++)
            {
-                g00[0] = k0[k];
-                g00[1] = k1[k];
+                const float* k0 = kptr0 + k;
+                const float* k1 = kptr1 + k;
+
+                g00[0] = k0[0];
+                g00[1] = k1[0];
                 g00 += 2;
             }
         }
@@ -899,6 +966,17 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
         float* g00 = kernel_tm.channel(q / 2 + q % 2);
 #endif
 
+#if __AVX2__
+        __m128i _vindex = _mm_setr_epi32(0, 1, 2, 3);
+        _vindex = _mm_mullo_epi32(_vindex, _mm_set1_epi32(maxk));
+        __m256i _vindex_256 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+        _vindex_256 = _mm256_mullo_epi32(_vindex_256, _mm256_set1_epi32(maxk));
+#if __AVX512F__
+        __m512i _vindex_512 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        _vindex_512 = _mm512_mullo_epi32(_vindex_512, _mm512_set1_epi32(maxk));
+#endif // __AVX512F__
+#endif // __AVX2__
+
         int p = 0;
 #if __SSE2__
 #if __AVX__
@@ -907,68 +985,85 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
         {
            for (int k = 0; k < maxk; k++)
            {
-                const float* k0 = kptr + p * maxk;
+                const float* k0 = kptr + k;
 
-                for (int i = 0; i < 16; i++)
-                {
-                    g00[0] = k0[k];
-                    k0 += maxk;
-                    g00 += 1;
-                }
+                __m512 _k0 = _mm512_i32gather_ps(_vindex_512, k0, sizeof(float));
+                _mm512_storeu_ps(g00, _k0);
+                g00 += 16;
             }
+
+            kptr += maxk * 16;
         }
 #endif // __AVX512F__
         for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
-                const float* k0 = kptr + p * maxk;
+                const float* k0 = kptr + k;
 
+#if __AVX2__
+                __m256 _k0 = _mm256_i32gather_ps(k0, _vindex_256, sizeof(float));
+                _mm256_storeu_ps(g00, _k0);
+                g00 += 8;
+#else // __AVX2__
                 for (int i = 0; i < 8; i++)
                 {
-                    g00[0] = k0[k];
+                    g00[0] = k0[0];
                     k0 += maxk;
                     g00 += 1;
                 }
+#endif // __AVX2__
             }
+
+            kptr += maxk * 8;
         }
 #endif // __AVX__
         for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
-                const float* k0 = kptr + p * maxk;
+                const float* k0 = kptr + k;
 
+#if __AVX2__
+                __m128 _k0 = _mm_i32gather_ps(k0, _vindex, sizeof(float));
+                _mm_storeu_ps(g00, _k0);
+                g00 += 4;
+#else // __AVX2__
                 for (int i = 0; i < 4; i++)
                 {
-                    g00[0] = k0[k];
+                    g00[0] = k0[0];
                     k0 += maxk;
                     g00 += 1;
                 }
+#endif // __AVX2__
             }
+
+            kptr += maxk * 4;
         }
 #endif // __SSE2__
         for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
-                const float* k0 = kptr + p * maxk;
+                const float* k0 = kptr + k;
 
                 for (int i = 0; i < 2; i++)
                 {
-                    g00[0] = k0[k];
+                    g00[0] = k0[0];
                     k0 += maxk;
                     g00 += 1;
                 }
            }
+
+            kptr += maxk * 2;
        }
        for (; p < inch; p++)
        {
-            const float* k0 = kptr + p * maxk;
-
            for (int k = 0; k < maxk; k++)
            {
-                g00[0] = k0[k];
+                const float* k0 = kptr + k;
+
+                g00[0] = k0[0];
                g00++;
            }
        }
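
Note on the transform above: with AVX2, eight kernel taps that sit maxk floats apart can be fetched with a single gather instead of an eight-iteration scalar loop, and rescaling the index vector by inch redirects the same gather across output channels (whose stride is inch * maxk), which is what the `_vindex = _mm*_mullo_epi32(_vindex, _mm*_set1_epi32(inch))` lines before the tail loops do. The standalone sketch below illustrates both index patterns; only the [outch][inch][maxk] weight layout and the _vindex arithmetic mirror the diff, while the sizes, names, and driver are invented for the demo (build with, e.g., g++ -mavx2).

#include <immintrin.h>
#include <cstdio>

int main()
{
    const int outch = 8, inch = 8, maxk = 3;
    static float kernel[outch * inch * maxk];
    for (int i = 0; i < outch * inch * maxk; i++)
        kernel[i] = (float)i;

    // Pattern 1: within one output channel, tap k of eight consecutive input
    // channels lies at stride maxk, so one gather replaces eight scalar loads.
    __m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    _vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(maxk));

    const float* kptr0 = kernel; // weights of output channel 0
    alignas(32) float packed[8 * maxk];
    float* g00 = packed;
    for (int k = 0; k < maxk; k++)
    {
        // loads kptr0[k + 0*maxk], kptr0[k + 1*maxk], ..., kptr0[k + 7*maxk]
        __m256 _k0 = _mm256_i32gather_ps(kptr0 + k, _vindex, sizeof(float));
        _mm256_store_ps(g00, _k0);
        g00 += 8;
    }

    // Pattern 2: rescaling _vindex by inch makes the same gather walk across
    // eight consecutive output channels instead, at stride inch * maxk.
    _vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(inch));
    __m256 _w = _mm256_i32gather_ps(kernel, _vindex, sizeof(float));

    alignas(32) float across[8];
    _mm256_store_ps(across, _w);
    for (int i = 0; i < 8; i++)
        printf("%g ", across[i]); // expect 0 24 48 ... (stride inch * maxk = 24)
    printf("\n");
    return 0;
}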
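As the diff itself shows, the scalar element-by-element packing is retained behind the #else // __AVX2__ branches for builds without AVX2, and the new kptr* += maxk * N increments at the end of each block replace the old p * maxk indexing, so each narrower tail loop resumes from where the previous one stopped.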