FC & Softmax #6560

Merged (12 commits) on Jul 29, 2021
docs/demo_guides/opencl.md (8 changes: 4 additions & 4 deletions)

@@ -100,7 +100,7 @@ rm ./lite/api/paddle_use_ops.h
build_opencl
```

- Note: To debug a cl kernel (assuming the build script above has already run and the cmake files exist), just edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`. After saving, run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` from the project root; the command automatically picks up the modifications, then switch to the build directory and run `make publish_inference` or the executable name of the unit test you want to build; the cl kernel contents are packaged into the build artifact (e.g. the .so) or the corresponding unit-test executable automatically during compilation.
+ Note: To debug a cl kernel (assuming the build script above has already run and the cmake files exist), just edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`. After saving, run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` from the project root; the command automatically updates `opencl_kernels_source.cc`. Then enter the build directory (e.g. `build.lite.android.armv8.gcc`) and run `make publish_inference` or the executable name of the unit test to build (e.g. `make test_fc_image_opencl`); the cl kernel contents are packaged into the build artifact (e.g. the .so) or the corresponding unit-test executable automatically during compilation.

### 1.3 Build Artifacts

@@ -185,7 +185,7 @@ adb shell "export LD_LIBRARY_PATH=/data/local/tmp/opencl/; \
/data/local/tmp/opencl/mobilenetv1_light_api \
/data/local/tmp/opencl/mobilenetv1_opencl_fp32_opt_releasev2.6_b8234efb_20200423.nb \
1,3,224,224 \
100 10 0 1 1 0"
# repeats=100, warmup=10
# power_mode=0 binds to big cores, thread_num=1
# accelerate_opencl=1 enables the OpenCL kernel cache & tuning; only takes effect when the model runs on the OpenCL backend
@@ -263,9 +263,9 @@ Profiling on macOS x86:
Profiling on Windows x86:
```
# Enable performance profiling: prints per-op timing and summary information
.\lite\tools\build_windows.bat with_opencl with_extra with_profile
# Enable precision profiling: prints the mean and standard deviation of each op's output data
.\lite\tools\build_windows.bat with_opencl with_extra with_precision_profile
```
For a detailed explanation of the output, see [Debugging Tools](../user_guides/debug).

lite/backends/opencl/cl_context.h (2 changes: 1 addition & 1 deletion)

@@ -40,7 +40,7 @@ class CLContext {
p.second.reset();
}
CLRuntime::Global()->program_map().clear();
- LOG(INFO) << "release cl::Program, cl::Kernel finished.";
+ VLOG(4) << "release cl::Program, cl::Kernel finished.";
}

cl::CommandQueue &GetCommandQueue();
lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl (16 changes: 11 additions & 5 deletions)

@@ -2468,15 +2468,19 @@ __kernel void conv2d_1x1_fc(__read_only image2d_t input,
#ifdef ELT_FUSE
__read_only image2d_t second_input_image,
#endif // ELT_FUSE
int batch,
int in_c_blks,
int out_c_blks) {
int out_n = get_global_id(2);
int out_c = get_global_id(0);
int2 tid = (int2)(get_local_id(0), get_local_id(1));
CL_DTYPE4 s = (CL_DTYPE4)(0.0f);
if (out_n >= batch) return;

if (out_c < out_c_blks) {
for (int c = tid.y; c < in_c_blks; c += 4) {
- CL_DTYPE4 v = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(c, 0));
+ CL_DTYPE4 v =
+     READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(c, out_n));
CL_DTYPE16 w = weights[c * out_c_blks + out_c];
CL_DTYPE4 partial = v.x * w.s0123;
partial += v.y * w.s4567;
@@ -2496,20 +2500,22 @@ __kernel void conv2d_1x1_fc(__read_only image2d_t input,
s += temp[tid.x][1];
s += temp[tid.x][2];
s += temp[tid.x][3];
- int2 output_pos0 = (int2)(out_c, 0);
+ int2 output_pos0 = (int2)(out_c, out_n);

#ifdef BIASE_CH
CL_DTYPE4 output0 =
-     s + READ_IMG_TYPE(CL_DTYPE_CHAR, biases, SAMPLER, output_pos0);
+     s + READ_IMG_TYPE(CL_DTYPE_CHAR, biases, SAMPLER, (int2)(out_c, 0));
#else
CL_DTYPE4 output0 = s;
#endif

CL_DTYPE4 alpha0;
#ifdef PRELU_CH
- alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
+ alpha0 =
+     READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
#elif defined(PRELU_ELE)
- alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
+ alpha0 =
+     READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
#elif defined(PRELU_ALL)
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
alpha0.y = alpha0.x;
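Context for the kernel change above: `conv2d_1x1_fc` now takes a `batch` argument and derives the batch row from `get_global_id(2)`, so activations are read and written at row `out_n`, while per-channel parameters (bias, PReLU alpha) stay at row 0. That is why `output_pos0` became `(out_c, out_n)` while the bias and alpha reads switched to an explicit `(int2)(out_c, 0)`. A minimal C++ sketch of this addressing convention (names are illustrative, not PaddleLite API):

```cpp
// Illustrative sketch of the image2d addressing used by conv2d_1x1_fc after
// this change: one pixel holds a 4-channel block.
struct Coord { int x, y; };

// Input/output activations: x = channel block, y = batch row (out_n).
inline Coord ActivationCoord(int channel_blk, int out_n) {
  return {channel_blk, out_n};
}

// Bias and per-channel PReLU alpha are batch-invariant, so y stays 0.
inline Coord PerChannelCoord(int channel_blk) { return {channel_blk, 0}; }
```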
lite/backends/opencl/cl_kernel/image/softmax_kernel.cl (76 changes: 76 additions & 0 deletions)

@@ -188,3 +188,79 @@ __kernel void softmax_channel(__read_only image2d_t input,
WRITE_IMG_TYPE(
CL_DTYPE_CHAR, output, (int2)(cur_out_width_pos, bh_idx), input_data);
}

__kernel void softmax_1x1(__read_only image2d_t input,
__write_only image2d_t output,
__private const float4 mask,
__private const int c_blks) {
const int c_blk_idx = get_global_id(0);
const int b_idx = get_global_id(1);
const int tid = get_local_id(0);

// Compute Max
float4 maxx4 = read_imagef(input, SAMPLER, (int2)(0, b_idx));
for (int s = tid; s < c_blks; s += 32) { // as workgroup size is 32
float4 mask_a = s == c_blks - 1 ? mask : (float4)(1.0f);
float4 src = read_imagef(input, SAMPLER, (int2)(s, b_idx));
src = src * mask_a;
maxx4 = max(maxx4, src);
}
float maximum = max(maxx4.x, maxx4.y);
maximum = max(maximum, maxx4.z);
maximum = max(maximum, maxx4.w);

// We need to find the final max value across the workgroup.
// The workgroup size is 32, so 8 float4 values hold the 32 per-thread maxima.
__local float4 tmp[8];
__local float* tmpx1 = (__local float*)tmp;
tmpx1[tid] = maximum;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0) {
maxx4 = max(tmp[0], tmp[1]);
maxx4 = max(maxx4, tmp[2]);
maxx4 = max(maxx4, tmp[3]);
maxx4 = max(maxx4, tmp[4]);
maxx4 = max(maxx4, tmp[5]);
maxx4 = max(maxx4, tmp[6]);
maxx4 = max(maxx4, tmp[7]);
maximum = max(maxx4.x, maxx4.y);
maximum = max(maximum, maxx4.z);
maximum = max(maximum, maxx4.w);
tmpx1[0] = maximum;
}
barrier(CLK_LOCAL_MEM_FENCE);
maximum = tmpx1[0];

// Compute Exp Sum
float sum = 0.0f;
for (int s = tid; s < c_blks; s += 32) {
float4 mask_temp = s == c_blks - 1 ? mask : (float4)(1.0f);
float4 src =
read_imagef(input, SAMPLER, (int2)(s, b_idx)) - (float4)(maximum);
sum += dot(mask_temp, exp(src));
}
barrier(CLK_LOCAL_MEM_FENCE);
tmpx1[tid] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0) {
sum = dot((float4)(1.0f), tmp[0]);
sum += dot((float4)(1.0f), tmp[1]);
sum += dot((float4)(1.0f), tmp[2]);
sum += dot((float4)(1.0f), tmp[3]);
sum += dot((float4)(1.0f), tmp[4]);
sum += dot((float4)(1.0f), tmp[5]);
sum += dot((float4)(1.0f), tmp[6]);
sum += dot((float4)(1.0f), tmp[7]);
tmpx1[0] = 1.0f / sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = tmpx1[0];

// Compute Result
if (c_blk_idx < c_blks) {
float4 src = read_imagef(input, SAMPLER, (int2)(c_blk_idx, b_idx)) -
(float4)(maximum);
CL_DTYPE4 res = CONVERT_TYPE_TO(exp(src) * sum, CL_DTYPE4);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(c_blk_idx, b_idx), res);
}
}
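For reference, `softmax_1x1` computes a numerically stable softmax over one batch row: each of the 32 work-items strides across the `c_blks` float4 blocks to find a masked per-thread max, the work-group reduces those maxima through local memory, the same pattern produces the exponent sum, and every block is finally written as `exp(x - max) * (1 / sum)`. A host-side C++ equivalent for a single row (an illustrative sketch, not part of the PR):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Host-side reference for one batch row of softmax_1x1. `row` holds
// c_blks * 4 floats; lanes beyond `valid_channels` are padding, which the
// kernel suppresses with its `mask` argument.
std::vector<float> SoftmaxRowReference(const std::vector<float>& row,
                                       int valid_channels) {
  float max_v = row[0];
  for (int i = 1; i < valid_channels; ++i) max_v = std::max(max_v, row[i]);

  float sum = 0.0f;
  for (int i = 0; i < valid_channels; ++i) sum += std::exp(row[i] - max_v);

  std::vector<float> out(row.size(), 0.0f);
  const float inv_sum = 1.0f / sum;  // the kernel caches 1/sum in local memory
  for (int i = 0; i < valid_channels; ++i)
    out[i] = std::exp(row[i] - max_v) * inv_sum;
  return out;
}
```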
lite/core/optimizer.cc (2 changes: 1 addition & 1 deletion)

@@ -224,7 +224,7 @@ std::unique_ptr<RuntimeProgram> RunDefaultOptimizer(
"variable_place_inference_pass", // inference arg/var's
"control_flow_op_shared_inputs_and_outputs_place_sync_pass",
"__fpga_kernel_place_correct_pass",
- "opencl_kernel_place_correct_pass",
+ // "opencl_kernel_place_correct_pass", // uncommit this pass
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
Expand Down
lite/core/profile/precision_profiler.h (65 changes: 48 additions & 17 deletions)

@@ -136,9 +136,8 @@ class PrecisionProfiler {
MkDirRecur(log_dir_);
const char* write_to_file_raw =
std::getenv("PADDLELITE_PRECISION_WRITE_TO_FILE");
- write_result_to_file_ = (write_to_file_raw && atoi(write_to_file_raw) > 0)
-                             ? atoi(write_to_file_raw) > 0
-                             : false;
+ write_result_to_file_ =
+     (write_to_file_raw && atoi(write_to_file_raw) > 0) ? true : false;
}

std::string GetSummaryHeader() {
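The constructor cleanup above replaces the tautological `(p && atoi(p) > 0) ? atoi(p) > 0 : false` with a plain boolean. The flag itself comes from an environment variable read at construction time, so it must be set before the profiler exists; a hedged usage sketch (the driver code is illustrative, and `setenv` is POSIX):

```cpp
#include <cstdlib>

int main() {
  // Enable per-tensor dumps; any positive integer turns the switch on.
  // Must be exported before the predictor (and thus the profiler) is built.
  setenv("PADDLELITE_PRECISION_WRITE_TO_FILE", "1", /*overwrite=*/1);
  // ... construct the PaddleLite predictor and run inference here ...
  return 0;
}
```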
@@ -235,6 +234,7 @@
}

void compute_tensor_precision_info(const Tensor* in,
const std::string op_name,
DataLayoutType layout_type,
double* mean,
double* std_dev,
@@ -258,7 +258,9 @@
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
- write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<float>(in, name, log_dir_);
+ }
return;
}
#ifdef ENABLE_ARM_FP16
@@ -268,15 +270,19 @@
*std_dev =
compute_standard_deviation<__fp16>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<__fp16>(ptr, in->numel());
- write_result_to_file&& write_tensorfile<__fp16>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<__fp16>(in, name, log_dir_);
+ }
return;
}
#endif
case PRECISION(kBool): {
*mean = -333333333333;
*std_dev = -33333333333;
*ave_grow_rate = -33333333333;
- write_result_to_file&& write_tensorfile<bool>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<bool>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kInt8): {
@@ -285,7 +291,9 @@
*std_dev =
compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
- write_result_to_file&& write_tensorfile<int8_t>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<int8_t>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kInt32): {
@@ -294,15 +302,19 @@
*std_dev = compute_standard_deviation<int32_t>(
ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
- write_result_to_file&& write_tensorfile<int32_t>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<int32_t>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kInt64): {
auto ptr = in->data<int64_t>();
*mean = compute_mean<int64_t>(ptr, in->numel());
*std_dev = compute_standard_deviation<int64_t>(
ptr, in->numel(), true, *mean);
- write_result_to_file&& write_tensorfile<int64_t>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<int64_t>(in, name, log_dir_);
+ }
return;
}
default:
@@ -323,6 +335,10 @@
case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor;
auto image_shape = default_convertor.InitImageDimInfoWith(in->dims());
if (op_name == "fc" || op_name == "softmax") {
image_shape = DDim(std::vector<DDim::value_type>(
{in->dims()[1] / 4, in->dims()[0]}));
}
size_t im_w = image_shape[0];
size_t im_h = image_shape[1];
VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
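The special case above encodes the (W, H) = (C/4, N) image layout used by the new fc and softmax kernels instead of the default converter layout. Note that `in->dims()[1] / 4` truncates; if C were not a multiple of 4, an `UP_DIV`-style round-up (the idiom used elsewhere in this PR) would be the defensive choice. A small sketch under that assumption:

```cpp
#include <cstdint>

// Sketch: image extents for fc/softmax outputs laid out one float4 channel
// block per pixel with batch along the image height (assumption inferred
// from the special case above). Rounding up covers C % 4 != 0, which the
// plain dims[1] / 4 would truncate.
inline int64_t UpDiv(int64_t a, int64_t b) { return (a + b - 1) / b; }

struct ImageExtent { int64_t w, h; };

inline ImageExtent FcSoftmaxImageExtent(int64_t n, int64_t c) {
  return {UpDiv(c, 4), n};  // W = channel blocks, H = batch rows
}
```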
@@ -345,6 +361,9 @@
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
// TODO(zhaoyang-star): Tensor shape padding mode will change from
// high-dim padding to low-dim padding to fit image2d.
// ImageConverter will be changed.
default_convertor.ImageToNCHW(
in_data_v, real_out_v.data(), image_shape, in->dims());
CHECK(real_out_v.size() == in->numel());
@@ -359,8 +378,9 @@
memcpy(real_out_data,
real_out_v.data(),
real_out_v.size() * sizeof(float));
- write_result_to_file&& write_tensorfile<float>(
-     real_out_t.get(), name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<float>(real_out_t.get(), name, log_dir_);
+ }
return;
}
case DATALAYOUT(kNCHW): {
@@ -381,8 +401,9 @@
memcpy(real_out_data,
in_data_v.data(),
in_data_v.size() * sizeof(float));
- write_result_to_file&& write_tensorfile<float>(
-     real_out_t.get(), name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<float>(real_out_t.get(), name, log_dir_);
+ }
return;
}
default:
@@ -409,7 +430,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
- write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<float>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kInt32): {
@@ -424,7 +447,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<int>(in_data_v.data(), in->numel());
- write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<int>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kInt64): {
@@ -439,7 +464,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<int64_t>(in_data_v.data(), in->numel());
- write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<int64_t>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kFP16): {
@@ -460,7 +487,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
- write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<float>(in, name, log_dir_);
+ }
return;
}
default:
@@ -521,6 +550,7 @@

if (tout->IsInitialized()) {
compute_tensor_precision_info(tout,
op_name,
type->layout(),
&mean,
&std_dev,
@@ -559,6 +589,7 @@

if (tout->IsInitialized()) {
compute_tensor_precision_info(tout,
op_name,
type->layout(),
&mean,
&std_dev,
lite/kernels/opencl/CMakeLists.txt (7 changes: 7 additions & 0 deletions)

@@ -42,6 +42,7 @@ add_kernel(batch_norm_opencl_image OPENCL basic SRCS batch_norm_image_compute.cc
add_kernel(reduce_mean_opencl_image OPENCL basic SRCS reduce_mean_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(clip_opencl_image OPENCL basic SRCS clip_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(softmax_opencl_image OPENCL basic SRCS softmax_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl_image OPENCL basic SRCS fc_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...

@@ -121,6 +122,12 @@ lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc
lite_cc_test(test_trigonometric_image_opencl SRCS trigonometric_image_compute_test.cc
DEPS trigonometric_opencl_image op_registry program context)

lite_cc_test(test_fc_image_opencl SRCS fc_image_compute_test.cc
DEPS fc_opencl_image op_registry program context)

lite_cc_test(test_softmax_image_opencl SRCS softmax_image_compute_test.cc
DEPS softmax_opencl_image op_registry program context)

######################
# buffer kernel #
######################
lite/kernels/opencl/conv_image_compute.cc (2 changes: 2 additions & 0 deletions)

@@ -1506,6 +1506,8 @@ void ConvImageCompute::Conv2d1x1FC() {
status_ = kernel_.setArg(cnt++, *second_input_image_p_);
CL_CHECK_FATAL(status_);
}
status_ = kernel_.setArg(cnt++, output_tensor_n_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(cnt++, UP_DIV(input_tensor_c_, 4));
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(cnt++, UP_DIV(output_tensor_c_, 4));
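The two `setArg` calls above feed the new `batch` parameter added to `conv2d_1x1_fc`; host-side argument order must mirror the kernel signature exactly (`batch`, then `in_c_blks`, then `out_c_blks`), since OpenCL binds arguments purely by index. A condensed sketch of that contract (the helper and header path are illustrative, not the PaddleLite API):

```cpp
#include <CL/cl2.hpp>

// Round-up division, mirroring the repo's UP_DIV macro.
inline int UpDiv(int a, int b) { return (a + b - 1) / b; }

// Sets the tail arguments of conv2d_1x1_fc in the order the kernel
// declares them; a mismatch would silently bind values to the wrong
// parameters.
cl_int SetConv1x1FcTailArgs(cl::Kernel& kernel, int arg_index,
                            int batch, int input_c, int output_c) {
  cl_int status = kernel.setArg(arg_index++, batch);  // new in this PR
  if (status != CL_SUCCESS) return status;
  status = kernel.setArg(arg_index++, UpDiv(input_c, 4));   // in_c_blks
  if (status != CL_SUCCESS) return status;
  return kernel.setArg(arg_index++, UpDiv(output_c, 4));    // out_c_blks
}
```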