Skip to content

Commit

Permalink
reduce memory usage
Browse files (browse the repository at this point in the history)
  • Loading branch information
nihui committed Oct 21, 2023
1 parent adede93 commit dedf148
Showing 1 changed file with 11 additions and 7 deletions.
18 changes: 11 additions & 7 deletions src/layer/arm/convolution_3x3_winograd_int8.h
Original file line number Diff line number Diff line change
Expand Up @@ -4257,7 +4257,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& to
}
}

-static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
+static void conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
Expand Down Expand Up @@ -4337,6 +4337,8 @@ static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, con
}
}

+bottom_blob.release();
+
Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);

#pragma omp parallel for num_threads(nT)
Expand Down Expand Up @@ -4444,9 +4446,9 @@ static void conv3x3s1_winograd43_transform_kernel_int8(const Mat& kernel, Mat& A

const int nn_M = (M + TILE_M - 1) / TILE_M;

-Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, 4u, (Allocator*)0);
+Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, 2u, (Allocator*)0);

-AT.create(TILE_K * TILE_M, B, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 4u, (Allocator*)0);
+AT.create(TILE_K * TILE_M, B, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 2u, (Allocator*)0);

#pragma omp parallel for num_threads(opt.num_threads)
for (int ppj = 0; ppj < nn_M; ppj++)
Expand Down Expand Up @@ -5602,7 +5604,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& to
}
}

-static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
+static void conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
{
int outw = top_blob.w;
int outh = top_blob.h;
Expand All @@ -5628,13 +5630,13 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con

// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

-Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
+Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);

const int nn_NK = nn_N * nn_K;

if (nT > 1 && nn_NK < nT)
{
-Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
+Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);

for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
Expand All @@ -5657,7 +5659,7 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con
}
else
{
-Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
+Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
Expand All @@ -5682,6 +5684,8 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con
}
}

+bottom_blob.release();
+
Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);

#pragma omp parallel for num_threads(nT)
Expand Down

0 comments on commit dedf148

Please sign in to comment.