Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix im2col on X86, fix opt and interpolate on ARM. #7365

Merged
merged 7 commits into from
Oct 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions lite/backends/arm/math/fp16/interpolate_fp16.cc
Original file line number Diff line number Diff line change
Expand Up @@ -411,13 +411,39 @@ void interpolate(lite::Tensor* X,
float scale,
bool with_align,
int align_mode,
std::string interpolate_type) {
std::string interpolate_type,
std::vector<float> scale_data) {
int in_h = X->dims()[2];
int in_w = X->dims()[3];
float height_scale = 0.f;
float width_scale = 0.f;

if (SizeTensor.size() > 0) {
auto new_size = get_new_shape(SizeTensor);
out_height = new_size[0];
out_width = new_size[1];
} else if (scale_data.size() > 0) {
if (scale_data.size() == 1) {
if (scale_data[0] > 0) {
out_height = static_cast<int>(in_h * scale_data[0]);
out_width = static_cast<int>(in_w * scale_data[0]);
} else {
LOG(FATAL) << "scale data <= 0";
}
} else if (scale_data.size() == 2) {
if (scale_data[0] > 0 && scale_data[1] > 0) {
out_height = static_cast<int>(in_h * scale_data[0]);
out_width = static_cast<int>(in_w * scale_data[1]);
} else {
LOG(FATAL) << "scale data <= 0";
}
}
auto out_size = OutSize;
if (out_size != nullptr) {
auto out_size_data = get_new_data_from_tensor<int>(out_size);
out_height = out_size_data[0];
out_width = out_size_data[1];
}
} else {
auto scale_tensor = Scale;
if (scale_tensor != nullptr) {
Expand All @@ -435,11 +461,14 @@ void interpolate(lite::Tensor* X,
out_width = out_size_data[1];
}
}
float height_scale = scale;
float width_scale = scale;
height_scale = scale;
width_scale = scale;
if (out_width > 0 && out_height > 0) {
height_scale = static_cast<float>(out_height / X->dims()[2]);
width_scale = static_cast<float>(out_width / X->dims()[3]);
} else {
out_height = static_cast<int>(X->dims()[2] * height_scale + 0.5f);
out_width = static_cast<int>(X->dims()[3] * width_scale + 0.5f);
}
int num_cout = X->dims()[0];
int c_cout = X->dims()[1];
Expand Down
3 changes: 2 additions & 1 deletion lite/backends/arm/math/fp16/interpolate_fp16.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ void interpolate(lite::Tensor* X,
float scale,
bool with_align,
int align_mode,
std::string interpolate_type);
std::string interpolate_type,
std::vector<float> scale_data);

} // namespace fp16
} // namespace math
Expand Down
35 changes: 32 additions & 3 deletions lite/backends/arm/math/interpolate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -509,13 +509,39 @@ void interpolate(lite::Tensor* X,
float scale,
bool with_align,
int align_mode,
std::string interpolate_type) {
std::string interpolate_type,
std::vector<float> scale_data) {
int in_h = X->dims()[2];
int in_w = X->dims()[3];
float height_scale = 0.f;
float width_scale = 0.f;

if (SizeTensor.size() > 0) {
auto new_size = get_new_shape(SizeTensor);
out_height = new_size[0];
out_width = new_size[1];
} else if (scale_data.size() > 0) {
if (scale_data.size() == 1) {
if (scale_data[0] > 0) {
out_height = static_cast<int>(in_h * scale_data[0]);
out_width = static_cast<int>(in_w * scale_data[0]);
} else {
LOG(FATAL) << "scale data <= 0";
}
} else if (scale_data.size() == 2) {
if (scale_data[0] > 0 && scale_data[1] > 0) {
out_height = static_cast<int>(in_h * scale_data[0]);
out_width = static_cast<int>(in_w * scale_data[1]);
} else {
LOG(FATAL) << "scale data <= 0";
}
}
auto out_size = OutSize;
if (out_size != nullptr) {
auto out_size_data = get_new_data_from_tensor<int>(out_size);
out_height = out_size_data[0];
out_width = out_size_data[1];
}
} else {
auto scale_tensor = Scale;
if (scale_tensor != nullptr) {
Expand All @@ -533,11 +559,14 @@ void interpolate(lite::Tensor* X,
out_width = out_size_data[1];
}
}
float height_scale = scale;
float width_scale = scale;
height_scale = scale;
width_scale = scale;
if (out_width > 0 && out_height > 0) {
height_scale = static_cast<float>(out_height / X->dims()[2]);
width_scale = static_cast<float>(out_width / X->dims()[3]);
} else {
out_height = static_cast<int>(X->dims()[2] * height_scale + 0.5f);
out_width = static_cast<int>(X->dims()[3] * width_scale + 0.5f);
}
int num_cout = X->dims()[0];
int c_cout = X->dims()[1];
Expand Down
3 changes: 2 additions & 1 deletion lite/backends/arm/math/interpolate.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ void interpolate(lite::Tensor* X,
float scale,
bool with_align,
int align_mode,
std::string interpolate_type);
std::string interpolate_type,
std::vector<float> scale_data);

} /* namespace math */
} /* namespace arm */
Expand Down
72 changes: 40 additions & 32 deletions lite/backends/x86/math/avx/conv_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -861,31 +861,36 @@ void im2col_s1<float>(const float* data_im,
(width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) + 1;
const int in_channel_size = height * width;
const int out_channel_size = output_h * output_w;
const int output_plane_size = output_h * output_w * kernel_h * kernel_w;
memset(data_col, 0, output_plane_size * channels * sizeof(float));
const unsigned int output_plane_size =
output_h * output_w * kernel_h * kernel_w;
size_t tmp_size = static_cast<size_t>(output_plane_size);
size_t mem_size = tmp_size * channels * sizeof(float);
memset(data_col, 0, mem_size);
#pragma omp parallel for
for (int c = 0; c < channels; c++) {
int data_im_z = c * in_channel_size;
int data_col_z1 = c * output_plane_size;
unsigned int data_im_z = c * in_channel_size;
unsigned int data_col_z1 = c * output_plane_size;
for (int ky = 0, h_offset = 0; ky < kernel_h;
ky++, h_offset += dilation_h) {
int data_col_z2 = ky * out_channel_size * kernel_w;
unsigned int data_col_z2 = ky * out_channel_size * kernel_w;
for (int kx = 0, w_offset = 0; kx < kernel_w;
kx++, w_offset += dilation_w) {
int data_col_z3 = kx * out_channel_size;
int data_col_z = data_col_z1 + data_col_z2 + data_col_z3;
int oh_begin = std::max(((pad_top - h_offset)), 0);
int oh_end = std::min(((height + pad_bottom - h_offset)), output_h);
unsigned int data_col_z3 = kx * out_channel_size;
unsigned int data_col_z = data_col_z1 + data_col_z2 + data_col_z3;
unsigned int oh_begin = std::max(((pad_top - h_offset)), 0);
unsigned int oh_end =
std::min(((height + pad_bottom - h_offset)), output_h);
oh_end = std::max(oh_begin, oh_end);
int ow_begin = std::max(((pad_left - w_offset)), 0);
int ow_end = std::min(((width + pad_right - w_offset)), output_w);
unsigned int ow_begin = std::max(((pad_left - w_offset)), 0);
unsigned int ow_end =
std::min(((width + pad_right - w_offset)), output_w);
ow_end = std::max(ow_begin, ow_end);
int ih = oh_begin - pad_top + h_offset;
unsigned int ih = oh_begin - pad_top + h_offset;
for (int oh = oh_begin; oh < oh_end; ++oh, ++ih) {
int iw = ow_begin - pad_left + w_offset;
int ow = ow_begin;
int data_im_offset = data_im_z + ih * width;
int data_col_offset = data_col_z + oh * output_w;
unsigned int iw = ow_begin - pad_left + w_offset;
unsigned int ow = ow_begin;
unsigned int data_im_offset = data_im_z + ih * width;
unsigned int data_col_offset = data_col_z + oh * output_w;
const float* data_im_ptr = data_im + data_im_offset;
float* data_col_ptr = data_col + data_col_offset;
#ifdef __AVX__
Expand Down Expand Up @@ -929,33 +934,36 @@ void im2col_s2<float>(const float* data_im,
(width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) / 2 +
1;
const int in_channel_size = height * width;
const int output_plane_size = output_h * output_w * kernel_h * kernel_w;
memset(data_col, 0, output_plane_size * channels * sizeof(float));
const unsigned int output_plane_size =
output_h * output_w * kernel_h * kernel_w;
size_t tmp_size = static_cast<size_t>(output_plane_size);
size_t mem_size = tmp_size * channels * sizeof(float);
memset(data_col, 0, mem_size);
#pragma omp parallel for
for (int c = 0; c < channels; c++) {
int data_im_z = c * in_channel_size;
int data_col_z1 = c * output_plane_size;
unsigned int data_im_z = c * in_channel_size;
unsigned int data_col_z1 = c * output_plane_size;
for (int ky = 0, h_offset = 0; ky < kernel_h;
ky++, h_offset += dilation_h) {
int data_col_z2 = ky * output_h * output_w * kernel_w;
unsigned int data_col_z2 = ky * output_h * output_w * kernel_w;
for (int kx = 0, w_offset = 0; kx < kernel_w;
kx++, w_offset += dilation_w) {
int data_col_z3 = kx * output_h * output_w;
int data_col_z = data_col_z1 + data_col_z2 + data_col_z3;
int oh_begin = std::max(((pad_top - h_offset + 1) / 2), 0);
int oh_end =
unsigned int data_col_z3 = kx * output_h * output_w;
unsigned int data_col_z = data_col_z1 + data_col_z2 + data_col_z3;
unsigned int oh_begin = std::max(((pad_top - h_offset + 1) / 2), 0);
unsigned int oh_end =
std::min(((height + pad_bottom - h_offset + 1) / 2), output_h);
oh_end = std::max(oh_begin, oh_end);
int ow_begin = std::max(((pad_left - w_offset + 1) / 2), 0);
int ow_end =
unsigned int ow_begin = std::max(((pad_left - w_offset + 1) / 2), 0);
unsigned int ow_end =
std::min(((width + pad_right - w_offset + 1) / 2), output_w);
ow_end = std::max(ow_begin, ow_end);
int ih = oh_begin * 2 - pad_top + h_offset;
unsigned int ih = oh_begin * 2 - pad_top + h_offset;
for (int oh = oh_begin; oh < oh_end; ++oh, ih += 2) {
int iw = ow_begin * 2 - pad_left + w_offset;
int ow = ow_begin;
int data_im_offset = data_im_z + ih * width;
int data_col_offset = data_col_z + oh * output_w;
unsigned int iw = ow_begin * 2 - pad_left + w_offset;
unsigned int ow = ow_begin;
unsigned int data_im_offset = data_im_z + ih * width;
unsigned int data_col_offset = data_col_z + oh * output_w;
const float* data_im_ptr = data_im + data_im_offset;
float* data_col_ptr = data_col + data_col_offset;
for (; ow + 3 < ow_end; ow += 4, iw += 8) {
Expand Down
1 change: 1 addition & 0 deletions lite/kernels/arm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ add_kernel(transpose_compute_arm ARM basic SRCS transpose_compute.cc)
add_kernel(shuffle_channel_compute_arm ARM basic SRCS shuffle_channel_compute.cc)
add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc)
add_kernel(conv_transpose_compute_arm ARM basic SRCS conv_transpose_compute.cc)
add_kernel(depthwise_conv_transpose_compute_arm ARM extra SRCS depthwise_conv_transpose_compute.cc)
add_kernel(interpolate_compute_arm ARM basic SRCS interpolate_compute.cc)
add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc)
add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc)
Expand Down
92 changes: 92 additions & 0 deletions lite/kernels/arm/depthwise_conv_transpose_compute.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/arm/depthwise_conv_transpose_compute.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle

typedef paddle::lite::kernels::arm::
DepthwiseConv2DTransposeCompute<PRECISION(kFloat), PRECISION(kFloat)>
DepConvTransFp32;
typedef paddle::lite::kernels::arm::
DepthwiseConv2DTransposeCompute<PRECISION(kInt8), PRECISION(kFloat)>
DepConvTranInt8_Fp32;
typedef paddle::lite::kernels::arm::
DepthwiseConv2DTransposeCompute<PRECISION(kInt8), PRECISION(kInt8)>
DepConvTranInt8_Int8;

#ifdef ENABLE_ARM_FP16
typedef paddle::lite::kernels::arm::
DepthwiseConv2DTransposeCompute<PRECISION(kFP16), PRECISION(kFP16)>
DepConvTranFp16;

REGISTER_LITE_KERNEL(
depthwise_conv2d_transpose, kARM, kFP16, kNCHW, DepConvTranFp16, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFP16))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFP16))})
.BindInput("Filter",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFP16))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFP16))})
.BindPaddleOpVersion("conv2d_transpose", 1)
.Finalize();

#endif // ENABLE_ARM_FP16

REGISTER_LITE_KERNEL(
depthwise_conv2d_transpose, kARM, kFloat, kNCHW, DepConvTransFp32, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
.BindPaddleOpVersion("conv2d_transpose", 1)
.Finalize();

REGISTER_LITE_KERNEL(depthwise_conv2d_transpose,
kARM,
kInt8,
kNCHW,
DepConvTranInt8_Fp32,
fp32_out)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
.BindInput("Filter",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
.BindPaddleOpVersion("conv2d_transpose", 1)
.Finalize();

REGISTER_LITE_KERNEL(depthwise_conv2d_transpose,
kARM,
kInt8,
kNCHW,
DepConvTranInt8_Int8,
int8_out)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
.BindInput("Filter",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
.BindPaddleOpVersion("conv2d_transpose", 1)
.Finalize();
32 changes: 32 additions & 0 deletions lite/kernels/arm/depthwise_conv_transpose_compute.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/kernel.h"
#include "lite/operators/conv_transpose_op.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
template <PrecisionType Ptype, PrecisionType Otype>
class DepthwiseConv2DTransposeCompute : public KernelLite<TARGET(kARM), Ptype> {
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
3 changes: 2 additions & 1 deletion lite/kernels/arm/interpolate_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ namespace arm {
int out_h = param.out_h; \
bool align_corners = param.align_corners; \
int align_mode = param.align_mode; \
auto scale_v = param.scale_v; \
std::string interp_method = method_name;

#define INTERP_PARAM \
X, OutSize, SizeTensor, Scale, Out, out_h, out_w, scale, align_corners, \
align_mode, interp_method
align_mode, interp_method, scale_v

template <>
void BilinearInterpCompute<PRECISION(kFloat)>::Run() {
Expand Down
Loading