x86 handle allocation failures (#5489)
nihui authored Jun 5, 2024
1 parent 75ba2a0 commit b437963
Showing 24 changed files with 357 additions and 135 deletions.
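Every x86 change in this commit follows the same pattern: a workspace Mat allocated from opt.workspace_allocator is checked with empty() right after construction, and the kernel bails out with -100 (the error code already used throughout these files) instead of touching a null buffer, while the affected kernels switch from void to int so the failure can propagate. The sketch below shows that pattern in isolation; the Mat here is a simplified stand-in, not ncnn's real class, and all names are illustrative only.

#include <cstdio>
#include <cstdlib>

// Simplified stand-in for ncnn::Mat: allocation can fail and leave data null.
struct Mat
{
    float* data;
    size_t count;

    explicit Mat(size_t n) : data((float*)malloc(n * sizeof(float))), count(n) {}
    ~Mat() { free(data); }
    bool empty() const { return data == 0 || count == 0; }
};

// Pattern applied throughout the commit: check each workspace allocation,
// return -100 on failure, return 0 on success.
static int winograd_like_kernel(size_t tile_elems)
{
    Mat BT(tile_elems);
    if (BT.empty())
        return -100;

    Mat top_tileX(tile_elems);
    if (top_tileX.empty())
        return -100;

    // ... input transform, tiled GEMM, output transform ...
    return 0;
}

int main()
{
    printf("ret = %d\n", winograd_like_kernel(1024));
    return 0;
}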
1 change: 0 additions & 1 deletion src/layer/arm/convolutiondepthwise_arm.cpp
@@ -1547,7 +1547,6 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_
return -100;
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int g = 0; g < group; g++)
{
const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
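The single non-x86 change drops the OpenMP parallel for from the per-group loop. The diff does not say why, but a plausible reason is error propagation: returning out of an OpenMP structured block is non-conforming, so a loop body that may now fail either loses the pragma or records the failure and checks it after the loop. Below is a hedged sketch of the record-and-check alternative; none of these names are ncnn's.

#include <cstdio>

// Hypothetical per-group routine that can fail, e.g. on an allocation.
static int process_group(int g)
{
    return (g == 3) ? -100 : 0; // pretend group 3 hits an allocation failure
}

int main()
{
    const int group = 8;
    int ret = 0;

    // A 'return -100;' inside the loop body would be illegal under OpenMP,
    // so the failure is recorded and inspected after the parallel region.
    #pragma omp parallel for num_threads(4)
    for (int g = 0; g < group; g++)
    {
        int r = process_group(g);
        if (r != 0)
        {
            #pragma omp critical
            ret = r;
        }
    }

    printf("ret = %d\n", ret);
    return ret == 0 ? 0 : 1;
}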
4 changes: 4 additions & 0 deletions src/layer/x86/concat_x86.cpp
@@ -350,6 +350,8 @@ int Concat_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
if (elempack < out_elempack)
{
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
if (top_blob.empty())
return -100;
}
}

@@ -685,6 +687,8 @@ int Concat_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
if (elempack < out_elempack)
{
convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
if (top_blob.empty())
return -100;
}
}

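convert_packing is a void function that allocates its destination blob internally, so the only visible sign of an allocation failure is that top_blob comes back empty; the two added checks turn that silent state into the usual -100. A minimal sketch of the same idea with a stand-in conversion routine, not ncnn's actual API:

#include <cstdio>
#include <vector>

typedef std::vector<float> Blob; // stand-in blob; empty models a failed allocation

// Stand-in for a void conversion: on failure it simply leaves dst untouched.
static void convert_packing_like(const Blob& src, Blob& dst, int out_elempack)
{
    size_t n = src.size() * (size_t)out_elempack;
    if (n > (1u << 20))
        return; // pretend very large requests fail
    dst.assign(n, 0.f);
}

static int forward_like(const Blob& unpacked, Blob& top_blob, int out_elempack)
{
    convert_packing_like(unpacked, top_blob, out_elempack);
    if (top_blob.empty())
        return -100; // the conversion could not allocate its destination
    return 0;
}

int main()
{
    Blob unpacked(64, 1.f), top_blob;
    printf("ret = %d\n", forward_like(unpacked, top_blob, 4));
    return 0;
}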
36 changes: 33 additions & 3 deletions src/layer/x86/convolution_3x3_winograd.h
@@ -3135,7 +3135,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile(const Mat& top_til
}
}

static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -3162,12 +3162,16 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -3191,6 +3195,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -3216,6 +3222,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -3245,6 +3253,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
conv3x3s1_winograd23_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

static inline void conv3x3s1_winograd43_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
@@ -4977,7 +4987,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile(const Mat& top_til
}
}

static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -5004,12 +5014,16 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -5033,6 +5047,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
Expand All @@ -5058,6 +5074,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -5087,6 +5105,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
conv3x3s1_winograd43_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}

static inline void conv3x3s1_winograd63_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
@@ -7267,7 +7287,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile(const Mat& top_til
}
}

static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
static int conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
@@ -7294,12 +7314,16 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -7323,6 +7347,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
Expand All @@ -7348,6 +7374,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -7377,4 +7405,6 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
conv3x3s1_winograd63_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
}
}

return 0;
}
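With the three fp32 winograd kernels now returning int (0 on success, -100 on a failed workspace allocation), their call sites elsewhere in this commit presumably forward that code rather than ignore it. A hedged sketch of the caller-side pattern, with stand-in names rather than the real ncnn functions:

#include <cstdio>

// Stand-in for the kernel, which now reports success (0) or failure (-100).
static int conv3x3s1_winograd23_like(bool simulate_oom)
{
    return simulate_oom ? -100 : 0;
}

// Caller-side pattern: propagate the code instead of discarding a void call.
static int forward_like(bool simulate_oom)
{
    int ret = conv3x3s1_winograd23_like(simulate_oom);
    if (ret != 0)
        return ret;

    // ... bias, activation, and other post-processing would follow ...
    return 0;
}

int main()
{
    printf("%d %d\n", forward_like(false), forward_like(true));
    return 0;
}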
64 changes: 38 additions & 26 deletions src/layer/x86/convolution_3x3_winograd_int8.h
@@ -13,25 +13,25 @@
// specific language governing permissions and limitations under the License.

#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
void conv3x3s1_winograd23_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
void conv3x3s1_winograd43_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
int conv3x3s1_winograd23_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
int conv3x3s1_winograd43_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
void conv3x3s1_winograd23_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
void conv3x3s1_winograd43_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
int conv3x3s1_winograd23_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
int conv3x3s1_winograd43_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
void conv3x3s1_winograd23_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt);
void conv3x3s1_winograd23_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
int conv3x3s1_winograd23_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
void conv3x3s1_winograd43_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt);
void conv3x3s1_winograd43_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
int conv3x3s1_winograd43_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
void conv3x3s1_winograd23_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
void conv3x3s1_winograd43_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
int conv3x3s1_winograd23_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
int conv3x3s1_winograd43_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
#endif

static void pack_A_tile_int8(const Mat& A, Mat& AT, int batch, int max_ii, int max_kk)
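The runtime-dispatch declarations above change from void to int in lockstep, so the dispatcher bodies further down can simply return the selected variant's status (return conv3x3s1_winograd23_int8_avx512vnni(...);) instead of calling it and then returning nothing. A small sketch of that dispatch shape, with hypothetical feature probes and kernels standing in for ncnn's:

#include <cstdio>

// Hypothetical CPU-feature probe standing in for ncnn::cpu_support_x86_*().
static bool cpu_has_avx2() { return false; }

// Two stand-in kernel variants; each reports 0 on success or -100 on failure.
static int kernel_avx2() { return 0; }
static int kernel_generic() { return 0; }

// Dispatcher shape after the commit: forward the chosen variant's return value.
static int kernel_dispatch()
{
    if (cpu_has_avx2())
        return kernel_avx2();
    return kernel_generic();
}

int main()
{
    printf("ret = %d\n", kernel_dispatch());
    return 0;
}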
@@ -4424,37 +4424,33 @@ static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& to
}
}

static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
static int conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_avx512_vnni())
{
conv3x3s1_winograd23_int8_avx512vnni(bottom_blob, top_blob, AT, nT, opt);
return;
return conv3x3s1_winograd23_int8_avx512vnni(bottom_blob, top_blob, AT, nT, opt);
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_avx_vnni())
{
conv3x3s1_winograd23_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt);
return;
return conv3x3s1_winograd23_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt);
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_avx2())
{
conv3x3s1_winograd23_int8_avx2(bottom_blob, top_blob, AT, nT, opt);
return;
return conv3x3s1_winograd23_int8_avx2(bottom_blob, top_blob, AT, nT, opt);
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_xop())
{
conv3x3s1_winograd23_int8_xop(bottom_blob, top_blob, AT, nT, opt);
return;
return conv3x3s1_winograd23_int8_xop(bottom_blob, top_blob, AT, nT, opt);
}
#endif

@@ -4483,12 +4479,16 @@ static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, con
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -4512,6 +4512,8 @@ static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, con
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -4537,6 +4539,8 @@ static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, con
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -4568,6 +4572,8 @@ static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, con
conv3x3s1_winograd23_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj);
}
}

return 0;
}
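Note the element sizes in this int8 path: BT is created with 2u elements while top_tileX uses 4u, which is consistent with 16-bit transformed activations being accumulated into 32-bit output tiles (an inference from the sizes in the diff, not something the commit states). The toy example below shows why the accumulator needs the wider type:

#include <cstdint>
#include <cstdio>

int main()
{
    // Products of two int16 values need a 32-bit accumulator: a single product
    // can already approach INT16_MAX, and summing over K only grows it.
    const int K = 512;
    int16_t a = 181, b = 181;   // 181 * 181 = 32761
    int32_t acc = 0;
    for (int k = 0; k < K; k++)
        acc += (int32_t)a * (int32_t)b; // widen before accumulating
    printf("acc = %d\n", acc);          // 16773632, far outside int16 range
    return 0;
}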

static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
@@ -6250,37 +6256,33 @@ static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& to
}
}

static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
static int conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_avx512_vnni())
{
conv3x3s1_winograd43_int8_avx512vnni(bottom_blob, top_blob, AT, nT, opt);
return;
return conv3x3s1_winograd43_int8_avx512vnni(bottom_blob, top_blob, AT, nT, opt);
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_avx_vnni())
{
conv3x3s1_winograd43_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt);
return;
return conv3x3s1_winograd43_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt);
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_avx2())
{
conv3x3s1_winograd43_int8_avx2(bottom_blob, top_blob, AT, nT, opt);
return;
return conv3x3s1_winograd43_int8_avx2(bottom_blob, top_blob, AT, nT, opt);
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_xop())
{
conv3x3s1_winograd43_int8_xop(bottom_blob, top_blob, AT, nT, opt);
return;
return conv3x3s1_winograd43_int8_xop(bottom_blob, top_blob, AT, nT, opt);
}
#endif

@@ -6309,12 +6311,16 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
if (BT.empty())
return -100;

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
if (B_tile.empty())
return -100;

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
@@ -6338,6 +6344,8 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con
else
{
Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
if (B_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
Expand All @@ -6363,6 +6371,8 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con
}

Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
if (top_tileX.empty())
return -100;

#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_M; ppj++)
@@ -6394,4 +6404,6 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con
conv3x3s1_winograd43_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj);
}
}

return 0;
}
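One way to exercise branches like the ones added above is to inject an allocator that fails on demand and assert that the kernel reports -100 instead of crashing. This is only a sketch of the idea, not ncnn's actual Allocator interface or test harness:

#include <cassert>
#include <cstdio>
#include <cstdlib>

// Hypothetical allocator hook: succeeds a fixed number of times, then returns
// null, mimicking a workspace allocator that runs out of memory mid-kernel.
struct FailingAllocator
{
    int successes_left;
    void* alloc(size_t size) { return successes_left-- > 0 ? malloc(size) : nullptr; }
    void release(void* p) { free(p); }
};

// Stand-in kernel making two workspace allocations, checking each as the commit does.
static int kernel_like(FailingAllocator& a)
{
    void* BT = a.alloc(4096);
    if (!BT)
        return -100;
    void* top_tileX = a.alloc(4096);
    if (!top_tileX)
    {
        a.release(BT);
        return -100;
    }
    // ... real work would happen here ...
    a.release(top_tileX);
    a.release(BT);
    return 0;
}

int main()
{
    FailingAllocator plenty = {2};
    FailingAllocator scarce = {1};
    assert(kernel_like(plenty) == 0);
    assert(kernel_like(scarce) == -100);
    printf("allocation-failure paths return -100 as expected\n");
    return 0;
}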