diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_broadcast_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_broadcast_kernel.cl
new file mode 100644
index 00000000000..d2e20fda8c7
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/elementwise_broadcast_kernel.cl
@@ -0,0 +1,328 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+// Broadcast elementwise OP for the general case where x and/or y live in a
+// "folded" image layout (ImageFolder) that differs from the output's
+// ImageDefault layout.
+//
+// Layout flags (image_folder_flag_x / _y):
+//   0: same layout as output (ImageDefault), read directly
+//   1: w -> n remap     2: w -> h remap
+//   3: hw -> ch remap   4: chw -> nch remap
+//
+// *_nhwc4 packs (n, h, w, ceil(c / 4)); one work-item computes one output
+// pixel at (idwc4, idbh).
+__kernel void broadcast_elementwise_common(
+    __read_only image2d_t input_x,
+    __read_only image2d_t input_y,
+    __write_only image2d_t output_image,
+    __private const int4 input_nhwc4,
+    __private const int4 bias_nhwc4,
+    __private const int4 output_nhwc4,
+    __private const int inputx_broadcast_c_flag,
+    __private const int inputy_broadcast_c_flag,
+    __private const int image_folder_flag_x,
+    __private const int image_folder_flag_y,
+    __private const int bias_width) {
+  int idwc4 = get_global_id(0);
+  int idbh = get_global_id(1);
+
+  // Guard against the padded global work size.
+  if (idwc4 >= output_nhwc4.w * output_nhwc4.z ||
+      idbh >= output_nhwc4.x * output_nhwc4.y) {
+    return;
+  }
+
+  // Decompose the 2D image coordinate into logical (n, h, w, c4) indices.
+  int4 id_shape;
+  id_shape.w = idwc4 / output_nhwc4.z;  // c4
+  id_shape.z = idwc4 % output_nhwc4.z;  // w
+  id_shape.y = idbh % output_nhwc4.y;   // h
+  id_shape.x = idbh / output_nhwc4.y;   // n
+
+  // Broadcasting: any output index that exceeds the input's extent in that
+  // dimension is clamped to 0 (vector isless returns -1 lanes for "true",
+  // which select() interprets via the MSB).
+  int4 v_zero = (int4)(0);
+  int4 flag_v = (int4)(0);
+  flag_v = isless(convert_float4(id_shape), convert_float4(input_nhwc4));
+  int4 idx_shape = select(v_zero, id_shape, flag_v);
+
+  int2 cur_index = (int2)(idx_shape.w * input_nhwc4.z + idx_shape.z,
+                          idx_shape.x * input_nhwc4.y + idx_shape.y);
+  CL_DTYPE4 in_x = (CL_DTYPE4)(0.f);
+
+  if (image_folder_flag_x == 0) {
+    in_x = READ_IMG_TYPE(CL_DTYPE_CHAR, input_x, SAMPLER, cur_index);
+  }
+
+  // w -> n: gather 4 consecutive folded pixels and pack their .x lanes.
+  if (image_folder_flag_x == 1) {
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_x, SAMPLER, (int2)(cur_index.x * 4, cur_index.y));
+
+    if (cur_index.x * 4 + 1 < input_nhwc4.w * 4) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 1, cur_index.y));
+    }
+    if (cur_index.x * 4 + 2 < input_nhwc4.w * 4) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 2, cur_index.y));
+    }
+    if (cur_index.x * 4 + 3 < input_nhwc4.w * 4) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 3, cur_index.y));
+    }
+    in_x = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+  }
+
+  // w -> h: folded image stores the vector transposed; swap coordinates.
+  if (image_folder_flag_x == 2) {
+    in_x = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_x, SAMPLER, (int2)(cur_index.y, cur_index.x));
+  }
+
+  // hw -> ch: gather along the folded row and pack the .x lanes.
+  if (image_folder_flag_x == 3) {
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_x, SAMPLER, (int2)(cur_index.y, cur_index.x * 4));
+
+    if (cur_index.x * 4 + 1 < input_nhwc4.z * input_nhwc4.w * 4) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 1));
+    }
+    if (cur_index.x * 4 + 2 < input_nhwc4.z * input_nhwc4.w * 4) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 2));
+    }
+    if (cur_index.x * 4 + 3 < input_nhwc4.z * input_nhwc4.w * 4) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 3));
+    }
+    in_x = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+  }
+
+  // chw -> nch: read 4 rows of the folded image and pick the lane selected
+  // by n % 4.
+  if (image_folder_flag_x == 4) {
+    int tmp_c4 = idx_shape.x / 4;  // n / 4
+    int tmp_h = idx_shape.w * 4;   // c4 * 4
+    int tmp_w = idx_shape.y;
+
+    // NOTE(review): `tmp_c4 * 1 + tmp_w` looks inconsistent with the
+    // `tmp_c4 * input_nhwc4.y + tmp_w` column used in the reads below —
+    // confirm the intended bound check against the ImageFolder converter.
+    cur_index =
+        (int2)(tmp_c4 * 1 + tmp_w, idx_shape.x * input_nhwc4.y + idx_shape.y);
+
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                        input_x,
+                        SAMPLER,
+                        (int2)(tmp_c4 * input_nhwc4.y + tmp_w, tmp_h));
+
+    if (cur_index.x + 1 < input_nhwc4.x * input_nhwc4.y) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(tmp_c4 * input_nhwc4.y + tmp_w, tmp_h + 1));
+    }
+    if (cur_index.x + 2 < input_nhwc4.x * input_nhwc4.y) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(tmp_c4 * input_nhwc4.y + tmp_w, tmp_h + 2));
+    }
+    if (cur_index.x + 3 < input_nhwc4.x * input_nhwc4.y) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(tmp_c4 * input_nhwc4.y + tmp_w, tmp_h + 3));
+    }
+
+    if (idx_shape.x % 4 == 0) {
+      in_x = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+    }
+
+    if (idx_shape.x % 4 == 1) {
+      in_x = (CL_DTYPE4)(in0.y, in1.y, in2.y, in3.y);
+    }
+
+    if (idx_shape.x % 4 == 2) {
+      in_x = (CL_DTYPE4)(in0.z, in1.z, in2.z, in3.z);
+    }
+
+    if (idx_shape.x % 4 == 3) {
+      in_x = (CL_DTYPE4)(in0.w, in1.w, in2.w, in3.w);
+    }
+  }
+
+  /***************************get y data*******************************/
+  // Same broadcasting clamp, now against y's (bias) extents.
+  flag_v = isless(convert_float4(id_shape), convert_float4(bias_nhwc4));
+  int4 idy_shape = select(v_zero, id_shape, flag_v);
+
+  cur_index = (int2)(idy_shape.w * bias_nhwc4.z + idy_shape.z,
+                     idy_shape.x * bias_nhwc4.y + idy_shape.y);
+  CL_DTYPE4 in_y = (CL_DTYPE4)(0.f);
+
+  if (image_folder_flag_y == 0) {
+    in_y = READ_IMG_TYPE(CL_DTYPE_CHAR, input_y, SAMPLER, cur_index);
+  }
+
+  // w -> n (ImageDefault->ImageFolder for elementwise)
+  if (image_folder_flag_y == 1) {
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_y, SAMPLER, (int2)(cur_index.x * 4, cur_index.y));
+
+    if (cur_index.x * 4 + 1 < bias_width) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 1, cur_index.y));
+    }
+    if (cur_index.x * 4 + 2 < bias_width) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 2, cur_index.y));
+    }
+    if (cur_index.x * 4 + 3 < bias_width) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 3, cur_index.y));
+    }
+    in_y = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+  }
+
+  // w -> h
+  if (image_folder_flag_y == 2) {
+    in_y = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_y, SAMPLER, (int2)(cur_index.y, cur_index.x));
+  }
+
+  // hw -> ch
+  if (image_folder_flag_y == 3) {
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_y, SAMPLER, (int2)(cur_index.y, cur_index.x * 4));
+
+    if (cur_index.x * 4 + 1 < bias_nhwc4.z * bias_nhwc4.w * 4) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 1));
+    }
+    if (cur_index.x * 4 + 2 < bias_nhwc4.z * bias_nhwc4.w * 4) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 2));
+    }
+    if (cur_index.x * 4 + 3 < bias_nhwc4.z * bias_nhwc4.w * 4) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 3));
+    }
+    in_y = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+  }
+
+  // chw -> nch
+  if (image_folder_flag_y == 4) {
+    int tmp_c4 = idy_shape.x / 4;  // n / 4
+    int tmp_h = idy_shape.w * 4;   // c4 * 4
+    int tmp_w = idy_shape.y;
+
+    // NOTE(review): same `tmp_c4 * 1 + tmp_w` question as the x branch above.
+    cur_index =
+        (int2)(tmp_c4 * 1 + tmp_w, idy_shape.x * bias_nhwc4.y + idy_shape.y);
+
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                        input_y,
+                        SAMPLER,
+                        (int2)(tmp_c4 * bias_nhwc4.y + tmp_w, tmp_h));
+
+    if (cur_index.x + 1 < bias_nhwc4.x * bias_nhwc4.y) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(tmp_c4 * bias_nhwc4.y + tmp_w, tmp_h + 1));
+    }
+    if (cur_index.x + 2 < bias_nhwc4.x * bias_nhwc4.y) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(tmp_c4 * bias_nhwc4.y + tmp_w, tmp_h + 2));
+    }
+    if (cur_index.x + 3 < bias_nhwc4.x * bias_nhwc4.y) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(tmp_c4 * bias_nhwc4.y + tmp_w, tmp_h + 3));
+    }
+
+    if (idy_shape.x % 4 == 0) {
+      in_y = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+    }
+
+    if (idy_shape.x % 4 == 1) {
+      in_y = (CL_DTYPE4)(in0.y, in1.y, in2.y, in3.y);
+    }
+
+    if (idy_shape.x % 4 == 2) {
+      in_y = (CL_DTYPE4)(in0.z, in1.z, in2.z, in3.z);
+    }
+
+    if (idy_shape.x % 4 == 3) {
+      in_y = (CL_DTYPE4)(in0.w, in1.w, in2.w, in3.w);
+    }
+  }
+
+  // Channel-dim broadcast: replicate lane .x across the whole vector.
+  in_x = SELECT(in_x, (CL_DTYPE4)(in_x.x), inputx_broadcast_c_flag);
+  in_y = SELECT(in_y, (CL_DTYPE4)(in_y.x), inputy_broadcast_c_flag);
+
+  CL_DTYPE4 output = OPERATOR(in_x, in_y);
+#ifdef FUSE_SCALE
+  output = fuse_scale(output, SCALE_SLOPE, SCALE_BIAS, SCALE_ALPHA);
+#endif
+
+#ifdef RELU
+  CL_DTYPE4 alpha;
+  output = activation_type4(output, alpha);
+#endif
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(idwc4, idbh), output);
+}
\ No newline at end of file
diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_kernel.cl
new file mode 100644
index 00000000000..797b23d7b37
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/elementwise_kernel.cl
@@ -0,0 +1,100 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+// Elementwise OP for the fast path: x and y have identical shapes and
+// layouts, so one work-item reads one pixel from each image, applies
+// OPERATOR (and optional fused scale / activation), and writes the result.
+__kernel void elementwise_compute(__read_only image2d_t input,
+                                  __read_only image2d_t bias,
+                                  __write_only image2d_t outputImage) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+
+  CL_DTYPE4 in_x = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, coords);
+  CL_DTYPE4 in_y = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, coords);
+
+  CL_DTYPE4 output = OPERATOR(in_x, in_y);
+#ifdef FUSE_SCALE
+  output = fuse_scale(output, SCALE_SLOPE, SCALE_BIAS, SCALE_ALPHA);
+#endif
+
+#ifdef RELU
+  CL_DTYPE4 alpha;
+  output = activation_type4(output, alpha);
+#endif
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
+}
+
+// Broadcast elementwise OP for the basic case: x and y differ in shape but
+// share the ImageDefault layout, so each side only needs its out-of-range
+// indices clamped to 0 (numpy-style broadcasting) before a direct read.
+__kernel void broadcast_elementwise_basic(
+    __read_only image2d_t input_x,
+    __read_only image2d_t input_y,
+    __write_only image2d_t output_image,
+    __private const int4 input_nhwc4,
+    __private const int4 bias_nhwc4,
+    __private const int4 output_nhwc4,
+    __private const int inputx_broadcast_c_flag,
+    __private const int inputy_broadcast_c_flag,
+    __private const int image_folder_flag_x,
+    __private const int image_folder_flag_y,
+    __private const int bias_width) {
+  int idwc4 = get_global_id(0);
+  int idbh = get_global_id(1);
+
+  // Guard against the padded global work size.
+  if (idwc4 >= output_nhwc4.w * output_nhwc4.z ||
+      idbh >= output_nhwc4.x * output_nhwc4.y) {
+    return;
+  }
+
+  // Decompose the 2D image coordinate into logical (n, h, w, c4) indices.
+  int4 id_shape;
+  id_shape.w = idwc4 / output_nhwc4.z;  // c4
+  id_shape.z = idwc4 % output_nhwc4.z;  // w
+  id_shape.y = idbh % output_nhwc4.y;   // h
+  id_shape.x = idbh / output_nhwc4.y;   // n
+
+  int4 v_zero = (int4)(0);
+  int4 flag_v = (int4)(0);
+  // Clamp indices that exceed x's extents to 0 (broadcast).
+  flag_v = isless(convert_float4(id_shape), convert_float4(input_nhwc4));
+  int4 idx_shape = select(v_zero, id_shape, flag_v);
+
+  int2 cur_index = (int2)(idx_shape.w * input_nhwc4.z + idx_shape.z,
+                          idx_shape.x * input_nhwc4.y + idx_shape.y);
+  CL_DTYPE4 in_x = READ_IMG_TYPE(CL_DTYPE_CHAR, input_x, SAMPLER, cur_index);
+
+  /***************************get y data*******************************/
+  flag_v = isless(convert_float4(id_shape), convert_float4(bias_nhwc4));
+  int4 idy_shape = select(v_zero, id_shape, flag_v);
+
+  cur_index = (int2)(idy_shape.w * bias_nhwc4.z + idy_shape.z,
+                     idy_shape.x * bias_nhwc4.y + idy_shape.y);
+  CL_DTYPE4 in_y = READ_IMG_TYPE(CL_DTYPE_CHAR, input_y, SAMPLER, cur_index);
+
+  // Channel-dim broadcast: replicate lane .x across the whole vector.
+  in_x = SELECT(in_x, (CL_DTYPE4)(in_x.x), inputx_broadcast_c_flag);
+  in_y = SELECT(in_y, (CL_DTYPE4)(in_y.x), inputy_broadcast_c_flag);
+
+  CL_DTYPE4 output = OPERATOR(in_x, in_y);
+#ifdef FUSE_SCALE
+  output = fuse_scale(output, SCALE_SLOPE, SCALE_BIAS, SCALE_ALPHA);
+#endif
+
+#ifdef RELU
+  CL_DTYPE4 alpha;
+  output = activation_type4(output, alpha);
+#endif
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(idwc4, idbh), output);
+}
\ No newline at end of file
diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt
index 157daedfe97..0a797e02ce5 100644
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -1,3 +1,4 @@
+
 if(LITE_WITH_OPENCL)
     set(IS_FAKED_KERNEL false CACHE INTERNAL "")
     set(cl_kernel_deps ops cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter)
@@ -13,11 +14,13 @@ endif()
 # image kernel
 # #####################
 # basic
-add_kernel(elementwise_add_opencl_image OPENCL basic SRCS elementwise_add_image_compute.cc)
-add_kernel(elementwise_sub_opencl_image OPENCL basic SRCS elementwise_sub_image_compute.cc)
-add_kernel(elementwise_mul_opencl_image OPENCL basic SRCS elementwise_mul_image_compute.cc)
-add_kernel(fusion_elementwise_sub_activation_opencl_image
-    OPENCL basic
SRCS fusion_elementwise_sub_activation_image_compute.cc) + +#add_kernel(elementwise_add_opencl_image OPENCL basic SRCS elementwise_add_image_compute.cc) +#add_kernel(elementwise_sub_opencl_image OPENCL basic SRCS elementwise_sub_image_compute.cc) +#add_kernel(elementwise_mul_opencl_image OPENCL basic SRCS elementwise_mul_image_compute.cc) +#add_kernel(fusion_elementwise_sub_activation_opencl_image +# OPENCL basic SRCS fusion_elementwise_sub_activation_image_compute.cc) +add_kernel(elementwise_opencl_image OPENCL basic SRCS elementwise_image_compute.cc) add_kernel(pool_opencl_image OPENCL basic SRCS pool_image_compute.cc) add_kernel(activation_opencl_image OPENCL basic SRCS activation_image_compute.cc) @@ -101,9 +104,12 @@ lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_te lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc DEPS kernels core) -lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc - DEPS kernels core) -lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc +#lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc +# DEPS kernels core) +#lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc +# DEPS kernels core) + +lite_cc_test(test_elementwise_image_opencl SRCS elementwise_image_compute_test.cc DEPS kernels core) lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc diff --git a/lite/kernels/opencl/elementwise_add_image_compute.cc b/lite/kernels/opencl/elementwise_add_image_compute.cc index 5c7b27d8ba6..86d87c73fda 100644 --- a/lite/kernels/opencl/elementwise_add_image_compute.cc +++ b/lite/kernels/opencl/elementwise_add_image_compute.cc @@ -275,43 +275,43 @@ namespace ocl = paddle::lite::kernels::opencl; // may from anther branch like "X" (kOpenCL, nothing to do). 
// Consider 2 situations have different actions when pass running(pick kernel), // set target of "Y" as kOpenCL temporarily. -REGISTER_LITE_KERNEL(elementwise_add, - kOpenCL, - kFP16, - kImageDefault, - ocl::ElementwiseAddImageCompute, - def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .Finalize(); +// REGISTER_LITE_KERNEL(elementwise_add, +// kOpenCL, +// kFP16, +// kImageDefault, +// ocl::ElementwiseAddImageCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindInput("Y", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .Finalize(); -REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, - kOpenCL, - kFP16, - kImageDefault, - ocl::ElementwiseAddImageCompute, - def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .Finalize(); -#define LITE_WITH_LOG +// REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, +// kOpenCL, +// kFP16, +// kImageDefault, +// ocl::ElementwiseAddImageCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindInput("Y", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) 
+// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .Finalize(); +// #define LITE_WITH_LOG diff --git a/lite/kernels/opencl/elementwise_image_compute.cc b/lite/kernels/opencl/elementwise_image_compute.cc new file mode 100644 index 00000000000..6e92ba515f9 --- /dev/null +++ b/lite/kernels/opencl/elementwise_image_compute.cc @@ -0,0 +1,589 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/host/elementwise_op_func.h" +#include "lite/kernels/opencl/image_helper.h" +#include "lite/operators/op_params.h" +#include "lite/utils/log/logging.h" +#include "lite/utils/replace_stl/stream.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/profiler.h" +#endif +#include "lite/backends/opencl/cl_utility.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +class ElementwiseImageCompute : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + std::string doc() const override { + return "Elementwise using cl::Image2D(ImageDefault/RGBA), kFP32"; + } + + void PrepareForRun() override { + auto& context = ctx_->As(); + if (param_.is_type()) { + ele_param_ = param_.get_mutable(); + } else { + ele_param_ = + param_.get_mutable(); + auto act_t = + static_cast(ele_param_) + ->act_type; + VLOG(4) << "act: " << act_t; + if (act_t == "relu") { + build_options_ += " -DRELU"; + } else { + LOG(FATAL) << "Unsupported Activation type: " << act_t; + } + } + + auto* x = ele_param_->X; + auto* y = ele_param_->Y; + x_dims_ = ele_param_->X->dims(); + y_dims_ = ele_param_->Y->dims(); + auto& out_dims = ele_param_->Out->dims(); + axis_ = ele_param_->axis; + out_nchw_ = out_dims.Vectorize(); + + host::fix_x_y_dims( + x, y, ele_param_->Out, axis_, &x_nchw_, &y_nchw_); + + while (x_nchw_.size() < 4) { + x_nchw_.insert(x_nchw_.cbegin(), 1); + } + + while (y_nchw_.size() < 4) { + y_nchw_.insert(y_nchw_.cbegin(), 1); + } + + while (out_nchw_.size() < 4) { + out_nchw_.insert(out_nchw_.cbegin(), 1); + } + + image_folder_flag_y_ = 0; + image_folder_flag_x_ = 0; + int broadcast_elementwise_common_flag = 0; + if (y->persistable()) { + LOG(INFO) << "with y->persistable"; + 
y_weights_image_ = std::unique_ptr(new Tensor); + std::unique_ptr tensor_hold_y_image_ = + std::unique_ptr(new Tensor); + CLImageConverterDefault default_converter; + const DDim& y_image_dims = + default_converter.InitImageDimInfoWith(DDim(y_nchw_)); + tensor_hold_y_image_->Resize({1, y_image_dims[0], y_image_dims[1], 4}); + + auto* y_cpu_image = MUTABLE_DATA_CPU(tensor_hold_y_image_); + auto* y_cpu_nchw = static_cast(const_cast(y->raw_data())); + default_converter.NCHWToImage(y_cpu_nchw, y_cpu_image, DDim(y_nchw_)); + MUTABLE_DATA_GPU( + y_weights_image_, y_image_dims[0], y_image_dims[1], y_cpu_image); + } else { + if ((y_dims_.size() == 1) && + (axis_ != -1) && // x{n,c,h,w} && y{c} || x{c,h,w} && y{c} + (axis_ == x_dims_.size() - 3)) { + image_folder_flag_y_ = 1; + } + if ((y_dims_.size() == 1) && + (axis_ != -1) && // x{n,c,h,w} && y{h} || x{c,h,w} && y{h} + (axis_ == x_dims_.size() - 2)) { + image_folder_flag_y_ = 2; + } + if ((y_dims_.size() == 2) && + (axis_ != -1) && // x{n,c,h,w} && y{c,h} || x{c,h,w} && y{c,h} + (axis_ == x_dims_.size() - 3)) { + image_folder_flag_y_ = 3; + } + if ((y_dims_.size() == 2) && (x_dims_.size() == 4) && + (axis_ == 0)) { // x{n,c,h,w} && y{n,c} + image_folder_flag_y_ = 1; + } + if ((y_dims_.size() == 3) && (x_dims_.size() == 4) && + (axis_ == 0)) { // x{n,c,h,w} && y{n,c,h} + image_folder_flag_y_ = 4; + } + + if ((x_dims_.size() == 1) && (axis_ != -1) && + (axis_ == y_dims_.size() - 3)) { + image_folder_flag_x_ = 1; + } + if ((x_dims_.size() == 1) && (axis_ != -1) && + (axis_ == y_dims_.size() - 2)) { + image_folder_flag_x_ = 2; + } + if ((x_dims_.size() == 2) && (axis_ != -1) && + (axis_ == y_dims_.size() - 3)) { + image_folder_flag_x_ = 3; + } + if ((x_dims_.size() == 2) && (y_dims_.size() == 4) && (axis_ == 0)) { + image_folder_flag_x_ = 1; + } + if ((x_dims_.size() == 3) && (y_dims_.size() == 4) && (axis_ == 0)) { + image_folder_flag_x_ = 4; + } + } + + if (image_folder_flag_x_ != 0 || image_folder_flag_y_ != 0) { + 
broadcast_elementwise_common_flag = 1; + } + + if (y_dims_ == x_dims_) { + kernel_func_name_ = "elementwise_compute"; + } else if (broadcast_elementwise_common_flag == 0) { + kernel_func_name_ = "broadcast_elementwise_basic"; + } else { + kernel_func_name_ = "broadcast_elementwise_common"; + } + + if (broadcast_elementwise_common_flag == 1) { + kernel_func_paths_ = "image/elementwise_broadcast_kernel.cl"; + } else { + kernel_func_paths_ = "image/elementwise_kernel.cl"; + } + + // op_type + auto elementwise_compute_type = op_type(); + if (elementwise_compute_type == "elementwise_div") { + build_options_ += " -DOPERATOR(in,bias)=(in/bias) "; + } else if (elementwise_compute_type == "elementwise_add") { + build_options_ += " -DOPERATOR(in,bias)=(in+bias) "; + } else if (elementwise_compute_type == "elementwise_sub") { + build_options_ += " -DOPERATOR(in,bias)=(in-bias) "; + } else if (elementwise_compute_type == "elementwise_mul") { + build_options_ += " -DOPERATOR(in,bias)=(in*bias) "; + } else if (elementwise_compute_type == "elementwise_max") { + build_options_ += " -DOPERATOR(in,bias)=fmax(in,bias) "; + } else if (elementwise_compute_type == "elementwise_min") { + build_options_ += " -DOPERATOR(in,bias)=fmin(in,bias) "; + } else if (elementwise_compute_type == "elementwise_pow") { + build_options_ += " -DOPERATOR(in,bias)=pow(in,bias) "; + } else if (elementwise_compute_type == "elementwise_mod") { + build_options_ += " -DOPERATOR(in,bias)=fmod(in,bias) "; + } + + if (ele_param_->fuse_scale) { + build_options_ += + "-DFUSE_SCALE -DSCALE_SLOPE=" + std::to_string(ele_param_->scale) + + "f " + " -DSCALE_BIAS=" + std::to_string(ele_param_->bias) + "f " + + " -DSCALE_ALPHA=" + std::to_string(ele_param_->alpha) + "f "; + } + context.cl_context()->AddKernel( + kernel_func_name_, kernel_func_paths_, build_options_, time_stamp_); + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = 
context.cl_context()->GetKernel(kernel_key.str()); + } + +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + std::string fuse_scale_str = ele_param_->fuse_scale ? "/fuse_scale" : ""; + ch->kernel_func_name = kernel_func_name_ + fuse_scale_str; + ch->global_work_size = ch->NDRangeToStr(gws_); + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + + void ReInitWhenNeeded() override { + if ((!first_epoch_for_reinit_ && x_dims_ != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims_; + first_epoch_for_reinit_ = false; + + // compute global work size + int hb = out_nchw_[0] * out_nchw_[2]; + int cw = + out_nchw_[3] * + maptofactor(out_nchw_[1], 4); // return (i + factor - 1) / factor; + + gws_ = cl::NDRange{static_cast(cw), + static_cast(hb), + static_cast(1)}; + } + } + + void Run() override { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + const auto* x_img = GET_DATA_GPU(ele_param_->X); + const auto* y_img = GET_DATA_GPU(ele_param_->Y); + if (ele_param_->Y->persistable()) { + y_img = GET_DATA_GPU(y_weights_image_); + } + auto out_image_shape = InitImageDimInfoWith(DDim(out_nchw_)); // w, h + auto* out_img = MUTABLE_DATA_GPU(ele_param_->Out, + out_image_shape["width"], + out_image_shape["height"], + nullptr); + + // nchw --> nhwc4 + cl_int4 inx_dim = {static_cast(x_nchw_[0]), + static_cast(x_nchw_[2]), + static_cast(x_nchw_[3]), + static_cast((x_nchw_[1] + 3) / 4)}; + + cl_int4 iny_dim = {static_cast(y_nchw_[0]), + static_cast(y_nchw_[2]), + static_cast(y_nchw_[3]), + static_cast((y_nchw_[1] + 3) / 4)}; + + cl_int4 out_dim = {static_cast(out_nchw_[0]), + static_cast(out_nchw_[2]), + static_cast(out_nchw_[3]), + static_cast((out_nchw_[1] + 3) / 4)}; + + int inputx_broadcast_c_flag = (x_nchw_[1] == 1) ? 1 : 0; + int inputy_broadcast_c_flag = (y_nchw_[1] == 1) ? 
1 : 0; + int bias_width = y_nchw_[1]; + + if (y_dims_ == x_dims_) { + cl_int status = kernel_.setArg(0, *x_img); + CL_CHECK_FATAL(status); + status = kernel_.setArg(1, *y_img); + CL_CHECK_FATAL(status); + status = kernel_.setArg(2, *out_img); + CL_CHECK_FATAL(status); + } else { + cl_int status = kernel_.setArg(0, *x_img); + CL_CHECK_FATAL(status); + status = kernel_.setArg(1, *y_img); + CL_CHECK_FATAL(status); + status = kernel_.setArg(2, *out_img); + CL_CHECK_FATAL(status); + status = kernel_.setArg(3, inx_dim); + CL_CHECK_FATAL(status); + status = kernel_.setArg(4, iny_dim); + CL_CHECK_FATAL(status); + status = kernel_.setArg(5, out_dim); + CL_CHECK_FATAL(status); + status = kernel_.setArg(6, inputx_broadcast_c_flag); + CL_CHECK_FATAL(status); + status = kernel_.setArg(7, inputy_broadcast_c_flag); + CL_CHECK_FATAL(status); + status = kernel_.setArg(8, image_folder_flag_x_); + CL_CHECK_FATAL(status); + status = kernel_.setArg(9, image_folder_flag_y_); + CL_CHECK_FATAL(status); + status = kernel_.setArg(10, bias_width); + CL_CHECK_FATAL(status); + } + + auto status = EnqueueNDRangeKernel( + context, kernel_, cl::NullRange, gws_, cl::NullRange, nullptr, event_); + CL_CHECK_FATAL(status); + +#ifdef LITE_WITH_PROFILE + event_.wait(); + auto queue_start_nanos = + event_.getProfilingInfo(); + auto submit_start_nanos = + event_.getProfilingInfo(); + auto run_start_nanos = + event_.getProfilingInfo(); + auto run_stop_nanos = event_.getProfilingInfo(); + + double time_ms = (submit_start_nanos - queue_start_nanos) / 1000000.0; + VLOG(4) << "GetQueuedToSubmitTime: " << time_ms << std::endl; + + time_ms = (run_start_nanos - submit_start_nanos) / 1000000.0; + VLOG(4) << "GetSubmitToStartTime: " << time_ms << std::endl; + + time_ms = (run_stop_nanos - run_start_nanos) / 1000000.0; + VLOG(4) << "GetStartToEndTime: " << time_ms << std::endl; +#endif + } + + private: + param_t* ele_param_{nullptr}; + bool first_epoch_for_reinit_{true}; + DDim last_x_dims_; + std::vector 
x_nchw_{}; + std::vector y_nchw_{}; + std::vector out_nchw_{}; + std::string kernel_func_name_{}; + std::string build_options_{}; + std::string kernel_func_paths_{}; + std::string time_stamp_{GetTimeStamp()}; + cl::Kernel kernel_; + cl::NDRange gws_; + // y is persistable + std::unique_ptr y_weights_image_{nullptr}; + int image_folder_flag_x_{0}; + int image_folder_flag_y_{0}; + int axis_{-1}; + DDimLite x_dims_{}; + DDimLite y_dims_{}; +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +namespace ocl = paddle::lite::kernels::opencl; +REGISTER_LITE_KERNEL(elementwise_div, + kOpenCL, + kFP16, + kImageDefault, + ocl::ElementwiseImageCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_add, + kOpenCL, + kFP16, + kImageDefault, + ocl::ElementwiseImageCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_sub, + kOpenCL, + kFP16, + kImageDefault, + ocl::ElementwiseImageCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); + 
+// Remaining binary elementwise ops and their fused-activation variants all
+// route to the same ElementwiseImageCompute; the op name selects the
+// OPERATOR build macro in PrepareForRun.
+REGISTER_LITE_KERNEL(elementwise_mul,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_max,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_min,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_pow,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_mod,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(fusion_elementwise_sub_activation,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(fusion_elementwise_div_activation,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
diff --git a/lite/kernels/opencl/elementwise_image_compute_test.cc b/lite/kernels/opencl/elementwise_image_compute_test.cc
new file mode 100644
index 00000000000..111c65c3067
--- /dev/null
+++ b/lite/kernels/opencl/elementwise_image_compute_test.cc
@@ -0,0 +1,751 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include +#include +#include +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/opencl/test_helper.h" +#include "lite/tests/utils/fill_data.h" + +#define FP16_MAX_DIFF (5e-1) +#define FP32_ABS_DIFF (1e-7) +#define FP32_RELATIVE_DIFF (1e-4) +#define FP16_ABS_DIFF (1e-1) +#define FP16_RELATIVE_DIFF (1e-1) +namespace paddle { +namespace lite { + +template +void fill_data(dtype* x, const int length, int set_value = -1) { + if (set_value == -1) { + for (size_t idx = 0; idx < length; ++idx) { + x[idx] = idx; + } + } else if (set_value != -1) { + for (size_t idx = 0; idx < length; ++idx) { + x[idx] = set_value; + } + } +} + +int randint(int beg, int end) { + int res = 0; + fill_data_rand(&res, beg, end, 1); + return res; +} + +bool randbool() { return randint(0, 1000000) < 500000; } + +template +T* AtLogicInd(T* data, + const std::vector& dim, + const std::vector& logic_index) { + assert(dim.size() == logic_index.size()); + + int offset = 0; + int stride = 1; + for (int i = dim.size() - 1; i >= 0; --i) { + int ind = logic_index[i]; + if (dim[i] == 1) { + ind = 0; + } + assert(ind < dim[i]); + offset += ind * stride; + stride *= dim[i]; + } + return data + offset; +} + +std::vector GenLogicIndex(int logic_offset, const std::vector& dim) { + std::vector strides(dim.size(), 1); + for (int i = dim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dim[i + 1]; + } + std::vector ret(dim.size(), 0); + for (int i = 0; i < dim.size(); ++i) { + ret[i] = logic_offset / strides[i]; + logic_offset %= strides[i]; + } + return ret; +} + +template +void BroadcastCPURef(const T* x, + const T* y, + T* z, + const std::vector& x_dim, + const std::vector& y_dim, + const std::vector& z_dim, + bool use_relu, + const std::function op) { + int N = 1; + for (int i = 0; i < z_dim.size(); ++i) { + N *= z_dim[i]; + } + for (int i = 0; i < N; ++i) { + auto logic_index = GenLogicIndex(i, z_dim); + const 
T* x_d = AtLogicInd(x, x_dim, logic_index); + const T* y_d = AtLogicInd(y, y_dim, logic_index); + T* z_d = AtLogicInd(z, z_dim, logic_index); + *z_d = op(*x_d, *y_d); + if (use_relu) { + *z_d = std::max(*z_d, static_cast(0)); + } + } +} + +template +void RunElementwiseBroadcast(const Place& place, + const int dim_size, + bool fuse_act, + const lite_api::CLPrecisionType p, + const std::string& alias, + const std::string& elt_type, + const std::string& act_type, + const std::function op, + double abs_error = 1e-3) { + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + CLRuntime::Global()->set_precision(p); + const bool fp16_flag = (p == lite_api::CLPrecisionType::CL_PRECISION_FP16); + LOG(INFO) << "\n\t[ START ] Test Precision=" + << lite_api::CLPrecisionTypeToStr(p); + // set kernel + auto kernels = KernelRegistry::Global().Create( + elt_type, TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + + auto elemul_img_kernel = std::move(kernels.front()); + VLOG(4) << "get elemul kernel: " << elemul_img_kernel->doc(); + const int MAX_SHAPE_VALUE = 10; + // gen out_dim + std::vector out_shape(dim_size, 0); + for (int i = 0; i < dim_size; ++i) { + out_shape[i] = randint(2, MAX_SHAPE_VALUE); + } + std::vector x_shape_full = out_shape; + std::vector y_shape_full = out_shape; + + std::vector x_shape_cut; + std::vector y_shape_cut; + + int axis = -1; + static bool cut_dimension = true; + cut_dimension = !cut_dimension; + if (cut_dimension) { + // generate x_shape_cut and y_shape_cut by remove dimension + static bool use_axis = true; + use_axis = !use_axis; + if (use_axis) { + x_shape_cut = x_shape_full; + // we will cut y only, and set tail of y to be 1 + axis = randint(0, dim_size - 1); + + int tail1_num = randint(0, dim_size - axis); + for (int i = 0; i < axis; ++i) { + y_shape_full[i] = 1; + } + for (int i = axis; i < (dim_size - tail1_num); ++i) { + y_shape_cut.push_back(y_shape_full[i]); + } + for (int i = 
0; i < tail1_num; ++i) { + y_shape_cut.push_back(1); + } + for (int i = dim_size - tail1_num; i < dim_size; ++i) { + y_shape_full[i] = 1; + } + static bool swap_x_and_y = true; + swap_x_and_y = !swap_x_and_y; + if (swap_x_and_y) { + std::swap(x_shape_cut, y_shape_cut); + std::swap(x_shape_full, y_shape_full); + } + } else { + // we will cut x or y + if (randbool()) { + y_shape_cut = y_shape_full; + int cut_x_num = randint(0, dim_size) * randbool(); + for (int i = 0; i < cut_x_num; ++i) { + x_shape_full[i] = 1; + } + for (int i = cut_x_num; i < dim_size; ++i) { + x_shape_cut.push_back(x_shape_full[i]); + } + } else { + x_shape_cut = x_shape_full; + int cut_y_num = randint(0, dim_size) * randbool(); + for (int i = 0; i < cut_y_num; ++i) { + y_shape_full[i] = 1; + } + for (int i = cut_y_num; i < dim_size; ++i) { + y_shape_cut.push_back(y_shape_full[i]); + } + } + } + } else { + // generate x_shape_cut and y_shape_cut by random + // random assign 1 to some dim + for (int i = 0; i < dim_size; ++i) { + if (randbool() && y_shape_full[i] != 1) { + x_shape_full[i] = 1; + } + if (randbool() && x_shape_full[i] != 1) { + y_shape_full[i] = 1; + } + } + // just remove 1 at high dimesion + int ind = 0; + while (x_shape_full[ind] == 1) { + ++ind; + } + for (int i = ind; i < dim_size; ++i) { + x_shape_cut.push_back(x_shape_full[i]); + } + ind = 0; + while (y_shape_full[ind] == 1) { + ++ind; + } + for (int i = ind; i < dim_size; ++i) { + y_shape_cut.push_back(y_shape_full[i]); + } + } + + DDim x_dim = + DDim(std::vector(x_shape_cut.begin(), x_shape_cut.end())); + DDim y_dim = + DDim(std::vector(y_shape_cut.begin(), y_shape_cut.end())); + DDim out_dim = DDim(std::vector(out_shape.begin(), out_shape.end())); + + LOG(INFO) << "==================" << elt_type << "==================="; + LOG(INFO) << "x_dim:" << x_dim << "\ty_dim:" << y_dim + << "\tout_dim:" << out_dim; + LOG(INFO) << "fuse_act:" << fuse_act << "; axis:" << axis; + + // tensor + lite::Tensor ele_x, ele_y, ele_out; + 
ele_x.Resize(x_dim); + ele_y.Resize(y_dim); + ele_out.Resize(out_dim); + + // initialize tensors + VLOG(4) << "initialize tensors"; + paddle::lite::CLImageConverterDefault* default_convertor = + new CLImageConverterDefault(); + + // operator param + operators::FusionElementwiseActivationParam + fuseEleParam; // enabled if fuse_act is true + fuseEleParam.X = &ele_x; + fuseEleParam.Y = &ele_y; + fuseEleParam.Out = &ele_out; + fuseEleParam.axis = axis; + fuseEleParam.act_type = fuse_act ? "relu" : ""; + + operators::ElementwiseParam eleParam; + eleParam.X = &ele_x; + eleParam.Y = &ele_y; + eleParam.Out = &ele_out; + eleParam.axis = axis; + + if (fuse_act) { + elemul_img_kernel->SetParam(fuseEleParam); + } else { + elemul_img_kernel->SetParam(eleParam); + } + + elemul_img_kernel->SetContext(std::move(context)); + + // x + std::vector x_v(x_dim.production()); + // fill_data(x_v.data(), x_v.size()); + fill_data_rand(x_v.data(), -10.f, 10.f, x_dim.production()); + auto x_img_shape = default_convertor->InitImageDimInfoWith(x_dim); // w, h + const size_t dtype_size = fp16_flag ? 
sizeof(half_t) : sizeof(float); + std::vector x_image_data(x_img_shape.production() * 4 * + dtype_size); // 4: RGBA + default_convertor->NCHWToImage(x_v.data(), x_image_data.data(), x_dim); + MUTABLE_DATA_GPU(&ele_x, x_img_shape[0], x_img_shape[1], x_image_data.data()); + + // y + std::vector y_v(y_dim.production()); + // fill_data(y_v.data(), y_v.size()); + fill_data_rand(y_v.data(), -10.f, 10.f, y_dim.production()); + auto y_img_shape = default_convertor->InitImageDimInfoWith(y_dim); // w, h + std::vector y_image_data(y_img_shape.production() * 4 * + dtype_size); // 4: RGBA + default_convertor->NCHWToImage(y_v.data(), y_image_data.data(), y_dim); + MUTABLE_DATA_GPU(&ele_y, y_img_shape[0], y_img_shape[1], y_image_data.data()); + + // out + std::vector out_from_gpu(out_dim.production()); + auto out_img_shape = + default_convertor->InitImageDimInfoWith(out_dim); // w, h + auto* out_image = + MUTABLE_DATA_GPU(&ele_out, out_img_shape[0], out_img_shape[1], nullptr); + + // run kernel + elemul_img_kernel->Launch(); + CLRuntime::Global()->command_queue().finish(); + // download gpu result to cpu + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + std::vector out_image_data(out_img_shape.production() * 4 * + dtype_size); // 4 : RGBA + TargetWrapperCL::ImgcpySync(out_image_data.data(), + out_image, + out_img_shape[0], + out_img_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + default_convertor->ImageToNCHW( + out_image_data.data(), out_from_gpu.data(), out_img_shape, out_dim); + + // compute cpu reference + std::unique_ptr out_from_cpu(new float[out_dim.production()]); + BroadcastCPURef(x_v.data(), + y_v.data(), + out_from_cpu.get(), + x_shape_full, + y_shape_full, + out_shape, + fuse_act, + op); + + VLOG(4) << "output_data vs output_ref_data"; + auto relative_diff_thres = + fp16_flag ? FP16_RELATIVE_DIFF : FP32_RELATIVE_DIFF; + auto abs_diff_thres = fp16_flag ? 
FP16_ABS_DIFF : FP32_ABS_DIFF; + uint32_t diff_cnt = 0; + for (int i = 0; i < out_dim.production(); i++) { + auto relative_diff = + COMPUTE_RELATIVE_DIFF(out_from_gpu[i], out_from_cpu[i]); + auto abs_diff = COMPUTE_ABS_DIFF(out_from_gpu[i], out_from_cpu[i]); + EXPECT_FALSE(relative_diff > relative_diff_thres && + abs_diff > abs_diff_thres); + if (relative_diff > relative_diff_thres && abs_diff > abs_diff_thres) { + LOG(WARNING) << lite_api::CLPrecisionTypeToStr(p) << " err idx: " << i + << " abs_diff: " << abs_diff + << "\t relative_diff: " << relative_diff + << "\t out_ins: " << out_from_gpu[i] + << "\t out_ref: " << out_from_cpu[i]; + diff_cnt++; + } + } + if (diff_cnt != 0) { + LOG(FATAL) << "Err num " << diff_cnt << "/" << out_dim.production(); + } + + LOG(INFO) << "\n\t[ PASSED ] " + << " Test Precision=" << lite_api::CLPrecisionTypeToStr(p); +} + +template +void RunElementwiseCommonSize(std::vector x_shape_full, + std::vector y_shape_full, + std::vector x_shape_cut, + std::vector y_shape_cut, + std::vector out_shape, + int axis, + bool xy_swap_flag, + double abs_error = 1e-3) { + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + const lite_api::CLPrecisionType p = + paddle::lite_api::CLPrecisionType::CL_PRECISION_FP32; + CLRuntime::Global()->set_precision(p); + const bool fp16_flag = (p == lite_api::CLPrecisionType::CL_PRECISION_FP16); + LOG(INFO) << "\n\t[ START ] Test Precision=" + << lite_api::CLPrecisionTypeToStr(p); + // set kernel + std::string elt_type = "elementwise_add"; + const std::function op = [](float l, float r) { return l + r; }; + bool fuse_act = false; + auto kernels = KernelRegistry::Global().Create( + elt_type, TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + + auto elemul_img_kernel = std::move(kernels.front()); + VLOG(4) << "get elemul kernel: " << elemul_img_kernel->doc(); + + if (xy_swap_flag) { + std::swap(x_shape_cut, y_shape_cut); + std::swap(x_shape_full, 
y_shape_full); + } + + DDim x_dim = + DDim(std::vector(x_shape_cut.begin(), x_shape_cut.end())); + DDim y_dim = + DDim(std::vector(y_shape_cut.begin(), y_shape_cut.end())); + DDim out_dim = DDim(std::vector(out_shape.begin(), out_shape.end())); + + LOG(INFO) << "==================" << elt_type << "==================="; + LOG(INFO) << "x_dim:" << x_dim << "\ty_dim:" << y_dim + << "\tout_dim:" << out_dim; + LOG(INFO) << "fuse_act:" << fuse_act << "; axis:" << axis; + + // tensor + lite::Tensor ele_x, ele_y, ele_out; + ele_x.Resize(x_dim); + ele_y.Resize(y_dim); + ele_out.Resize(out_dim); + + // initialize tensors + VLOG(4) << "initialize tensors"; + paddle::lite::CLImageConverterDefault* default_convertor = + new CLImageConverterDefault(); + + // operator param + operators::FusionElementwiseActivationParam + fuseEleParam; // enabled if fuse_act is true + fuseEleParam.X = &ele_x; + fuseEleParam.Y = &ele_y; + fuseEleParam.Out = &ele_out; + fuseEleParam.axis = axis; + fuseEleParam.act_type = fuse_act ? "relu" : ""; + + operators::ElementwiseParam eleParam; + eleParam.X = &ele_x; + eleParam.Y = &ele_y; + eleParam.Out = &ele_out; + eleParam.axis = axis; + + if (fuse_act) { + elemul_img_kernel->SetParam(fuseEleParam); + } else { + elemul_img_kernel->SetParam(eleParam); + } + + elemul_img_kernel->SetContext(std::move(context)); + + // x + std::vector x_v(x_dim.production()); + // fill_data(x_v.data(), x_v.size()); + fill_data_rand(x_v.data(), -10.f, 10.f, x_dim.production()); + auto x_img_shape = default_convertor->InitImageDimInfoWith(x_dim); // w, h + const size_t dtype_size = fp16_flag ? 
sizeof(half_t) : sizeof(float); + std::vector x_image_data(x_img_shape.production() * 4 * + dtype_size); // 4: RGBA + default_convertor->NCHWToImage(x_v.data(), x_image_data.data(), x_dim); + MUTABLE_DATA_GPU(&ele_x, x_img_shape[0], x_img_shape[1], x_image_data.data()); + + // y + std::vector y_v(y_dim.production()); + // fill_data(y_v.data(), y_v.size()); + fill_data_rand(y_v.data(), -10.f, 10.f, y_dim.production()); + auto y_img_shape = default_convertor->InitImageDimInfoWith(y_dim); // w, h + std::vector y_image_data(y_img_shape.production() * 4 * + dtype_size); // 4: RGBA + default_convertor->NCHWToImage(y_v.data(), y_image_data.data(), y_dim); + MUTABLE_DATA_GPU(&ele_y, y_img_shape[0], y_img_shape[1], y_image_data.data()); + + // out + std::vector out_from_gpu(out_dim.production()); + auto out_img_shape = + default_convertor->InitImageDimInfoWith(out_dim); // w, h + auto* out_image = + MUTABLE_DATA_GPU(&ele_out, out_img_shape[0], out_img_shape[1], nullptr); + + // run kernel + elemul_img_kernel->Launch(); + CLRuntime::Global()->command_queue().finish(); + // download gpu result to cpu + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + std::vector out_image_data(out_img_shape.production() * 4 * + dtype_size); // 4 : RGBA + TargetWrapperCL::ImgcpySync(out_image_data.data(), + out_image, + out_img_shape[0], + out_img_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + default_convertor->ImageToNCHW( + out_image_data.data(), out_from_gpu.data(), out_img_shape, out_dim); + + // compute cpu reference + std::unique_ptr out_from_cpu(new float[out_dim.production()]); + BroadcastCPURef(x_v.data(), + y_v.data(), + out_from_cpu.get(), + x_shape_full, + y_shape_full, + out_shape, + fuse_act, + op); + + VLOG(4) << "output_data vs output_ref_data"; + auto relative_diff_thres = + fp16_flag ? FP16_RELATIVE_DIFF : FP32_RELATIVE_DIFF; + auto abs_diff_thres = fp16_flag ? 
FP16_ABS_DIFF : FP32_ABS_DIFF; + uint32_t diff_cnt = 0; + for (int i = 0; i < out_dim.production(); i++) { + auto relative_diff = + COMPUTE_RELATIVE_DIFF(out_from_gpu[i], out_from_cpu[i]); + auto abs_diff = COMPUTE_ABS_DIFF(out_from_gpu[i], out_from_cpu[i]); + EXPECT_FALSE(relative_diff > relative_diff_thres && + abs_diff > abs_diff_thres); + if (relative_diff > relative_diff_thres && abs_diff > abs_diff_thres) { + LOG(WARNING) << lite_api::CLPrecisionTypeToStr(p) << " err idx: " << i + << " abs_diff: " << abs_diff + << "\t relative_diff: " << relative_diff + << "\t out_ins: " << out_from_gpu[i] + << "\t out_ref: " << out_from_cpu[i]; + diff_cnt++; + break; + } + } + if (diff_cnt != 0) { + LOG(FATAL) << "Err num " << diff_cnt << "/" << out_dim.production(); + } + + LOG(INFO) << "\n\t[ PASSED ] " + << " Test Precision=" << lite_api::CLPrecisionTypeToStr(p); +} + +void test_elementwise_all_dim_data_gpu() { + // test elementwise common size, only add compute, data in gpu + int n = 40; + int c = 40; + int h = 40; + int w = 40; + n = randint(1, 40); + c = randint(1, 40); + h = randint(1, 40); + w = randint(1, 40); + std::vector xy_swap_flags{false, true}; + for (auto xy_swap_flag : xy_swap_flags) { + RunElementwiseCommonSize({n, c, h, w}, + {n, c, h, w}, + {n, c, h, w}, + {n, c, h, w}, + {n, c, h, w}, + 0, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, c, 1, 1}, + {n, c, h, w}, + {1, c, 1, 1}, + {n, c, h, w}, + -1, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, c, 1, 1}, + {n, c, h, w}, + {c}, + {n, c, h, w}, + 1, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, 1, h, 1}, + {n, c, h, w}, + {h}, + {n, c, h, w}, + 2, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, 1, 1, w}, + {n, c, h, w}, + {w}, + {n, c, h, w}, + -1, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, c, h, w}, + {n, c, h, w}, + {c, h, w}, + {n, c, h, w}, + -1, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, 
w}, + {n, c, h, 1}, + {n, c, h, w}, + {n, c, h}, + {n, c, h, w}, + 0, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {n, c, 1, 1}, + {n, c, h, w}, + {n, c}, + {n, c, h, w}, + 0, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, c, h, 1}, + {n, c, h, w}, + {c, h}, + {n, c, h, w}, + 1, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, 1, h, w}, + {n, c, h, w}, + {h, w}, + {n, c, h, w}, + -1, + xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {n, c, h}, {n, c, h}, {n, c, h}, {n, c, h}, 0, xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {1, 1, h}, {n, c, h}, {h}, {n, c, h}, -1, xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {1, c, 1}, {n, c, h}, {c}, {n, c, h}, 1, xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {n, 1, 1}, {n, c, h}, {n}, {n, c, h}, 0, xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {n, c, 1}, {n, c, h}, {n, c}, {n, c, h}, 0, xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {1, c, h}, {n, c, h}, {c, h}, {n, c, h}, -1, xy_swap_flag); + + RunElementwiseCommonSize( + {h, w}, {h, 1}, {h, w}, {h}, {h, w}, 0, xy_swap_flag); + + RunElementwiseCommonSize( + {h, w}, {1, w}, {h, w}, {w}, {h, w}, -1, xy_swap_flag); + } +} + +void test_elementwise_broadcast_all_op() { + const int TEST_RETEAT_NUM = 1; + std::vector relu_flag_v{false, true}; + for (int repeat_count = 0; repeat_count < TEST_RETEAT_NUM; ++repeat_count) { + for (int dim_size = 4; dim_size <= 4; dim_size++) { + for (auto fuse_act : relu_flag_v) { + for (const auto precision_type : + {paddle::lite_api::CLPrecisionType::CL_PRECISION_FP32}) { + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_add", + "", + [](float l, float r) { return l + r; }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_sub", + "", + [](float l, float r) { return l - r; }); + RunElementwiseBroadcast( + 
TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_mul", + "", + [](float l, float r) { return l * r; }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_div", + "", + [](float l, float r) { return l / r; }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_max", + "", + [](float l, float r) { return fmax(l, r); }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_min", + "", + [](float l, float r) { return fmin(l, r); }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_pow", + "", + [](float l, float r) { return pow(l, r); }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_mod", + "", + [](float l, float r) { return fmod(l, r); }); + } + } + } + } +} + +TEST(elementwise_broadcast, compute_basic) { + // test elementwise broadcast + test_elementwise_broadcast_all_op(); + + // test elementwise all dims, only add compute, data in gpu + test_elementwise_all_dim_data_gpu(); +} + +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(elementwise_div, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_add, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_sub, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_max, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_min, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_pow, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_mod, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL( + fusion_elementwise_add_activation, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL( + fusion_elementwise_sub_activation, 
kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL( + fusion_elementwise_mul_activation, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL( + fusion_elementwise_div_activation, kOpenCL, kFP16, kImageDefault, def); diff --git a/lite/kernels/opencl/elementwise_mul_compute.cc b/lite/kernels/opencl/elementwise_mul_compute.cc index d0e8bc92d56..12343a509f0 100644 --- a/lite/kernels/opencl/elementwise_mul_compute.cc +++ b/lite/kernels/opencl/elementwise_mul_compute.cc @@ -171,23 +171,23 @@ void ElementwiseMulFloatImageCompute::Run() { } // namespace lite } // namespace paddle -namespace ocl = paddle::lite::kernels::opencl; -REGISTER_LITE_KERNEL(elementwise_mul, - kOpenCL, - kFloat, - kImageDefault, - ocl::ElementwiseMulFloatImageCompute, - def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .Finalize(); +// namespace ocl = paddle::lite::kernels::opencl; +// REGISTER_LITE_KERNEL(elementwise_mul, +// kOpenCL, +// kFloat, +// kImageDefault, +// ocl::ElementwiseMulFloatImageCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFloat), +// DATALAYOUT(kImageDefault))}) +// .BindInput("Y", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFloat), +// DATALAYOUT(kImageDefault))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFloat), +// DATALAYOUT(kImageDefault))}) +// .Finalize(); diff --git a/lite/kernels/opencl/elementwise_mul_image_compute.cc b/lite/kernels/opencl/elementwise_mul_image_compute.cc index 70a27cd3e17..fbb6c2e8331 100644 --- a/lite/kernels/opencl/elementwise_mul_image_compute.cc +++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc @@ -252,23 +252,23 @@ class 
ElementwiseMulImageCompute } // namespace lite } // namespace paddle -namespace ocl = paddle::lite::kernels::opencl; -REGISTER_LITE_KERNEL(elementwise_mul, - kOpenCL, - kFP16, - kImageDefault, - ocl::ElementwiseMulImageCompute, - def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .Finalize(); +// namespace ocl = paddle::lite::kernels::opencl; +// REGISTER_LITE_KERNEL(elementwise_mul, +// kOpenCL, +// kFP16, +// kImageDefault, +// ocl::ElementwiseMulImageCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindInput("Y", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .Finalize(); diff --git a/lite/kernels/opencl/elementwise_sub_image_compute.cc b/lite/kernels/opencl/elementwise_sub_image_compute.cc index 45e3ed8134b..195a4ce85cb 100644 --- a/lite/kernels/opencl/elementwise_sub_image_compute.cc +++ b/lite/kernels/opencl/elementwise_sub_image_compute.cc @@ -161,22 +161,22 @@ namespace ocl = paddle::lite::kernels::opencl; // may from anther branch like "X" (kOpenCL, nothing to do). // Consider 2 situations have different actions when pass running(pick kernel), // set target of "Y" as kOpenCL temporarily. 
-REGISTER_LITE_KERNEL(elementwise_sub, - kOpenCL, - kFP16, - kImageDefault, - ocl::ElementwiseSubImageCompute, - def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .Finalize(); +// REGISTER_LITE_KERNEL(elementwise_sub, +// kOpenCL, +// kFP16, +// kImageDefault, +// ocl::ElementwiseSubImageCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindInput("Y", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .Finalize();