FC & Softmax #6560

Merged (12 commits) on Jul 29, 2021
docs/demo_guides/opencl.md (8 changes: 4 additions & 4 deletions)

@@ -100,7 +100,7 @@ rm ./lite/api/paddle_use_ops.h
build_opencl
```

- Note: To debug a cl kernel (assuming the build script above has already run and the cmake files exist), just edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`. After saving, run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` from the project root; the command automatically picks up the modifications, then switch to the build directory and run `make publish_inference` or the executable name of the unit test you want to build; the cl kernel contents are packaged into the build artifact (e.g. the .so) or the corresponding unit-test executable automatically during compilation.
+ Note: To debug a cl kernel (assuming the build script above has already run and the cmake files exist), just edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`. After saving, run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` from the project root; the command automatically updates `opencl_kernels_source.cc`. Then enter the build directory (e.g. `build.lite.android.armv8.gcc`) and run `make publish_inference` or the executable name of the unit test to build (e.g. `make test_fc_image_opencl`); the cl kernel contents are packaged into the build artifact (e.g. the .so) or the corresponding unit-test executable automatically during compilation.

### 1.3 Build Artifacts

@@ -185,7 +185,7 @@ adb shell "export LD_LIBRARY_PATH=/data/local/tmp/opencl/; \
/data/local/tmp/opencl/mobilenetv1_light_api \
/data/local/tmp/opencl/mobilenetv1_opencl_fp32_opt_releasev2.6_b8234efb_20200423.nb \
1,3,224,224 \
100 10 0 1 1 0"
# repeats=100, warmup=10
# power_mode=0 binds to big cores, thread_num=1
# accelerate_opencl=1 enables the OpenCL kernel cache & tuning; only takes effect when the model runs on the OpenCL backend
@@ -263,9 +263,9 @@ Profiling on macOS x86:
Profiling on Windows x86:
```
# Enable performance profiling: prints per-op timing and summary information
.\lite\tools\build_windows.bat with_opencl with_extra with_profile
# Enable precision profiling: prints the mean and standard deviation of each op's output data
.\lite\tools\build_windows.bat with_opencl with_extra with_precision_profile
```
For a detailed explanation of the output, see [Debugging Tools](../user_guides/debug).

lite/backends/opencl/cl_context.h (2 changes: 1 addition & 1 deletion)

@@ -40,7 +40,7 @@ class CLContext {
p.second.reset();
}
CLRuntime::Global()->program_map().clear();
- LOG(INFO) << "release cl::Program, cl::Kernel finished.";
+ VLOG(4) << "release cl::Program, cl::Kernel finished.";
}

cl::CommandQueue &GetCommandQueue();
lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl (16 changes: 11 additions & 5 deletions)

@@ -2468,15 +2468,19 @@ __kernel void conv2d_1x1_fc(__read_only image2d_t input,
#ifdef ELT_FUSE
__read_only image2d_t second_input_image,
#endif // ELT_FUSE
int batch,
int in_c_blks,
int out_c_blks) {
int out_n = get_global_id(2);
int out_c = get_global_id(0);
int2 tid = (int2)(get_local_id(0), get_local_id(1));
CL_DTYPE4 s = (CL_DTYPE4)(0.0f);
if (out_n >= batch) return;

if (out_c < out_c_blks) {
for (int c = tid.y; c < in_c_blks; c += 4) {
- CL_DTYPE4 v = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(c, 0));
+ CL_DTYPE4 v =
+     READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(c, out_n));
CL_DTYPE16 w = weights[c * out_c_blks + out_c];
CL_DTYPE4 partial = v.x * w.s0123;
partial += v.y * w.s4567;
@@ -2496,20 +2500,22 @@ __kernel void conv2d_1x1_fc(__read_only image2d_t input,
s += temp[tid.x][1];
s += temp[tid.x][2];
s += temp[tid.x][3];
- int2 output_pos0 = (int2)(out_c, 0);
+ int2 output_pos0 = (int2)(out_c, out_n);

#ifdef BIASE_CH
CL_DTYPE4 output0 =
-     s + READ_IMG_TYPE(CL_DTYPE_CHAR, biases, SAMPLER, output_pos0);
+     s + READ_IMG_TYPE(CL_DTYPE_CHAR, biases, SAMPLER, (int2)(out_c, 0));
#else
CL_DTYPE4 output0 = s;
#endif

CL_DTYPE4 alpha0;
#ifdef PRELU_CH
- alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
+ alpha0 =
+     READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
#elif defined(PRELU_ELE)
- alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
+ alpha0 =
+     READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
#elif defined(PRELU_ALL)
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
alpha0.y = alpha0.x;
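Context for the kernel change above: `conv2d_1x1_fc` now takes a `batch` argument and derives the batch row from `get_global_id(2)`, so activations are read and written at row `out_n`, while per-channel parameters (bias, PReLU alpha) stay at row 0. That is why `output_pos0` became `(out_c, out_n)` while the bias and alpha reads switched to an explicit `(int2)(out_c, 0)`. A minimal C++ sketch of this addressing convention (names are illustrative, not PaddleLite API):

```cpp
// Illustrative sketch of the image2d addressing used by conv2d_1x1_fc after
// this change: one pixel holds a 4-channel block.
struct Coord { int x, y; };

// Input/output activations: x = channel block, y = batch row (out_n).
inline Coord ActivationCoord(int channel_blk, int out_n) {
  return {channel_blk, out_n};
}

// Bias and per-channel PReLU alpha are batch-invariant, so y stays 0.
inline Coord PerChannelCoord(int channel_blk) { return {channel_blk, 0}; }
```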
lite/backends/opencl/cl_kernel/image/softmax_kernel.cl (76 changes: 76 additions & 0 deletions)

@@ -188,3 +188,79 @@ __kernel void softmax_channel(__read_only image2d_t input,
WRITE_IMG_TYPE(
CL_DTYPE_CHAR, output, (int2)(cur_out_width_pos, bh_idx), input_data);
}

__kernel void softmax_1x1(__read_only image2d_t input,
__write_only image2d_t output,
__private const float4 mask,
__private const int c_blks) {
const int c_blk_idx = get_global_id(0);
const int b_idx = get_global_id(1);
const int tid = get_local_id(0);

// Compute Max
float4 maxx4 = read_imagef(input, SAMPLER, (int2)(0, b_idx));
for (int s = tid; s < c_blks; s += 32) { // as workgroup size is 32
float4 mask_a = s == c_blks - 1 ? mask : (float4)(1.0f);
float4 src = read_imagef(input, SAMPLER, (int2)(s, b_idx));
src = src * mask_a;
maxx4 = max(maxx4, src);
}
float maximum = max(maxx4.x, maxx4.y);
maximum = max(maximum, maxx4.z);
maximum = max(maximum, maxx4.w);

// We need to find the final max value across the workgroup.
// The workgroup size is 32, so 8 float4 values hold the 32 per-thread maxima.
__local float4 tmp[8];
__local float* tmpx1 = (__local float*)tmp;
tmpx1[tid] = maximum;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0) {
maxx4 = max(tmp[0], tmp[1]);
maxx4 = max(maxx4, tmp[2]);
maxx4 = max(maxx4, tmp[3]);
maxx4 = max(maxx4, tmp[4]);
maxx4 = max(maxx4, tmp[5]);
maxx4 = max(maxx4, tmp[6]);
maxx4 = max(maxx4, tmp[7]);
maximum = max(maxx4.x, maxx4.y);
maximum = max(maximum, maxx4.z);
maximum = max(maximum, maxx4.w);
tmpx1[0] = maximum;
}
barrier(CLK_LOCAL_MEM_FENCE);
maximum = tmpx1[0];

// Compute Exp Sum
float sum = 0.0f;
for (int s = tid; s < c_blks; s += 32) {
float4 mask_temp = s == c_blks - 1 ? mask : (float4)(1.0f);
float4 src =
read_imagef(input, SAMPLER, (int2)(s, b_idx)) - (float4)(maximum);
sum += dot(mask_temp, exp(src));
}
barrier(CLK_LOCAL_MEM_FENCE);
tmpx1[tid] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0) {
sum = dot((float4)(1.0f), tmp[0]);
sum += dot((float4)(1.0f), tmp[1]);
sum += dot((float4)(1.0f), tmp[2]);
sum += dot((float4)(1.0f), tmp[3]);
sum += dot((float4)(1.0f), tmp[4]);
sum += dot((float4)(1.0f), tmp[5]);
sum += dot((float4)(1.0f), tmp[6]);
sum += dot((float4)(1.0f), tmp[7]);
tmpx1[0] = 1.0f / sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = tmpx1[0];

// Compute Result
if (c_blk_idx < c_blks) {
float4 src = read_imagef(input, SAMPLER, (int2)(c_blk_idx, b_idx)) -
(float4)(maximum);
CL_DTYPE4 res = CONVERT_TYPE_TO(exp(src) * sum, CL_DTYPE4);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(c_blk_idx, b_idx), res);
}
}
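For reference, `softmax_1x1` computes a numerically stable softmax over one batch row: each of the 32 work-items strides across the `c_blks` float4 blocks to find a masked per-thread max, the work-group reduces those maxima through local memory, the same pattern produces the exponent sum, and every block is finally written as `exp(x - max) * (1 / sum)`. A host-side C++ equivalent for a single row (an illustrative sketch, not part of the PR):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Host-side reference for one batch row of softmax_1x1. `row` holds
// c_blks * 4 floats; lanes beyond `valid_channels` are padding, which the
// kernel suppresses with its `mask` argument.
std::vector<float> SoftmaxRowReference(const std::vector<float>& row,
                                       int valid_channels) {
  float max_v = row[0];
  for (int i = 1; i < valid_channels; ++i) max_v = std::max(max_v, row[i]);

  float sum = 0.0f;
  for (int i = 0; i < valid_channels; ++i) sum += std::exp(row[i] - max_v);

  std::vector<float> out(row.size(), 0.0f);
  const float inv_sum = 1.0f / sum;  // the kernel caches 1/sum in local memory
  for (int i = 0; i < valid_channels; ++i)
    out[i] = std::exp(row[i] - max_v) * inv_sum;
  return out;
}
```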
lite/core/optimizer.cc (2 changes: 1 addition & 1 deletion)

@@ -224,7 +224,7 @@ std::unique_ptr<RuntimeProgram> RunDefaultOptimizer(
"variable_place_inference_pass", // inference arg/var's
"control_flow_op_shared_inputs_and_outputs_place_sync_pass",
"__fpga_kernel_place_correct_pass",
- "opencl_kernel_place_correct_pass",
+ // "opencl_kernel_place_correct_pass", // uncommit this pass
"mlu_postprocess_pass",
// info(target/precision/layout/device)
// using kernel info
Expand Down
lite/core/profile/precision_profiler.h (65 changes: 48 additions & 17 deletions)

@@ -136,9 +136,8 @@ class PrecisionProfiler {
MkDirRecur(log_dir_);
const char* write_to_file_raw =
std::getenv("PADDLELITE_PRECISION_WRITE_TO_FILE");
- write_result_to_file_ = (write_to_file_raw && atoi(write_to_file_raw) > 0)
-                             ? atoi(write_to_file_raw) > 0
-                             : false;
+ write_result_to_file_ =
+     (write_to_file_raw && atoi(write_to_file_raw) > 0) ? true : false;
}

std::string GetSummaryHeader() {
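The constructor cleanup above replaces the tautological `(p && atoi(p) > 0) ? atoi(p) > 0 : false` with a plain boolean. The flag itself comes from an environment variable read at construction time, so it must be set before the profiler exists; a hedged usage sketch (the driver code is illustrative, and `setenv` is POSIX):

```cpp
#include <cstdlib>

int main() {
  // Enable per-tensor dumps; any positive integer turns the switch on.
  // Must be exported before the predictor (and thus the profiler) is built.
  setenv("PADDLELITE_PRECISION_WRITE_TO_FILE", "1", /*overwrite=*/1);
  // ... construct the PaddleLite predictor and run inference here ...
  return 0;
}
```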
@@ -235,6 +234,7 @@
}

void compute_tensor_precision_info(const Tensor* in,
const std::string op_name,
DataLayoutType layout_type,
double* mean,
double* std_dev,
@@ -258,7 +258,9 @@
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
- write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<float>(in, name, log_dir_);
+ }
return;
}
#ifdef ENABLE_ARM_FP16
@@ -268,15 +270,19 @@
*std_dev =
compute_standard_deviation<__fp16>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<__fp16>(ptr, in->numel());
- write_result_to_file&& write_tensorfile<__fp16>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<__fp16>(in, name, log_dir_);
+ }
return;
}
#endif
case PRECISION(kBool): {
*mean = -333333333333;
*std_dev = -33333333333;
*ave_grow_rate = -33333333333;
- write_result_to_file&& write_tensorfile<bool>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<bool>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kInt8): {
@@ -285,7 +291,9 @@
*std_dev =
compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
- write_result_to_file&& write_tensorfile<int8_t>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<int8_t>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kInt32): {
@@ -294,15 +302,19 @@
*std_dev = compute_standard_deviation<int32_t>(
ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
- write_result_to_file&& write_tensorfile<int32_t>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<int32_t>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kInt64): {
auto ptr = in->data<int64_t>();
*mean = compute_mean<int64_t>(ptr, in->numel());
*std_dev = compute_standard_deviation<int64_t>(
ptr, in->numel(), true, *mean);
- write_result_to_file&& write_tensorfile<int64_t>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<int64_t>(in, name, log_dir_);
+ }
return;
}
default:
@@ -323,6 +335,10 @@
case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor;
auto image_shape = default_convertor.InitImageDimInfoWith(in->dims());
if (op_name == "fc" || op_name == "softmax") {
image_shape = DDim(std::vector<DDim::value_type>(
{in->dims()[1] / 4, in->dims()[0]}));
}
size_t im_w = image_shape[0];
size_t im_h = image_shape[1];
VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
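The special case above encodes the (W, H) = (C/4, N) image layout used by the new fc and softmax kernels instead of the default converter layout. Note that `in->dims()[1] / 4` truncates; if C were not a multiple of 4, an `UP_DIV`-style round-up (the idiom used elsewhere in this PR) would be the defensive choice. A small sketch under that assumption:

```cpp
#include <cstdint>

// Sketch: image extents for fc/softmax outputs laid out one float4 channel
// block per pixel with batch along the image height (assumption inferred
// from the special case above). Rounding up covers C % 4 != 0, which the
// plain dims[1] / 4 would truncate.
inline int64_t UpDiv(int64_t a, int64_t b) { return (a + b - 1) / b; }

struct ImageExtent { int64_t w, h; };

inline ImageExtent FcSoftmaxImageExtent(int64_t n, int64_t c) {
  return {UpDiv(c, 4), n};  // W = channel blocks, H = batch rows
}
```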
@@ -345,6 +361,9 @@
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
// TODO(zhaoyang-star): Tensor shape padding mode will change from
// high-dim padding to low-dim padding to fit image2d.
// ImageConverter will be changed.
default_convertor.ImageToNCHW(
in_data_v, real_out_v.data(), image_shape, in->dims());
CHECK(real_out_v.size() == in->numel());
@@ -359,8 +378,9 @@
memcpy(real_out_data,
real_out_v.data(),
real_out_v.size() * sizeof(float));
- write_result_to_file&& write_tensorfile<float>(
-     real_out_t.get(), name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<float>(real_out_t.get(), name, log_dir_);
+ }
return;
}
case DATALAYOUT(kNCHW): {
@@ -381,8 +401,9 @@
memcpy(real_out_data,
in_data_v.data(),
in_data_v.size() * sizeof(float));
- write_result_to_file&& write_tensorfile<float>(
-     real_out_t.get(), name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<float>(real_out_t.get(), name, log_dir_);
+ }
return;
}
default:
@@ -409,7 +430,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
- write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<float>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kInt32): {
@@ -424,7 +447,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<int>(in_data_v.data(), in->numel());
- write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<int>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kInt64): {
@@ -439,7 +464,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<int64_t>(in_data_v.data(), in->numel());
- write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<int64_t>(in, name, log_dir_);
+ }
return;
}
case PRECISION(kFP16): {
@@ -460,7 +487,9 @@
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
- write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
+ if (write_result_to_file) {
+   write_tensorfile<float>(in, name, log_dir_);
+ }
return;
}
default:
@@ -521,6 +550,7 @@

if (tout->IsInitialized()) {
compute_tensor_precision_info(tout,
op_name,
type->layout(),
&mean,
&std_dev,
@@ -559,6 +589,7 @@

if (tout->IsInitialized()) {
compute_tensor_precision_info(tout,
op_name,
type->layout(),
&mean,
&std_dev,
lite/kernels/opencl/CMakeLists.txt (7 changes: 7 additions & 0 deletions)

@@ -42,6 +42,7 @@ add_kernel(batch_norm_opencl_image OPENCL basic SRCS batch_norm_image_compute.cc
add_kernel(reduce_mean_opencl_image OPENCL basic SRCS reduce_mean_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(clip_opencl_image OPENCL basic SRCS clip_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(softmax_opencl_image OPENCL basic SRCS softmax_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl_image OPENCL basic SRCS fc_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...

@@ -121,6 +122,12 @@ lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc
lite_cc_test(test_trigonometric_image_opencl SRCS trigonometric_image_compute_test.cc
DEPS trigonometric_opencl_image op_registry program context)

lite_cc_test(test_fc_image_opencl SRCS fc_image_compute_test.cc
DEPS fc_opencl_image op_registry program context)

lite_cc_test(test_softmax_image_opencl SRCS softmax_image_compute_test.cc
DEPS softmax_opencl_image op_registry program context)

######################
# buffer kernel #
######################
lite/kernels/opencl/conv_image_compute.cc (2 changes: 2 additions & 0 deletions)

@@ -1506,6 +1506,8 @@ void ConvImageCompute::Conv2d1x1FC() {
status_ = kernel_.setArg(cnt++, *second_input_image_p_);
CL_CHECK_FATAL(status_);
}
status_ = kernel_.setArg(cnt++, output_tensor_n_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(cnt++, UP_DIV(input_tensor_c_, 4));
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(cnt++, UP_DIV(output_tensor_c_, 4));
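The two `setArg` calls above feed the new `batch` parameter added to `conv2d_1x1_fc`; host-side argument order must mirror the kernel signature exactly (`batch`, then `in_c_blks`, then `out_c_blks`), since OpenCL binds arguments purely by index. A condensed sketch of that contract (the helper and header path are illustrative, not the PaddleLite API):

```cpp
#include <CL/cl2.hpp>

// Round-up division, mirroring the repo's UP_DIV macro.
inline int UpDiv(int a, int b) { return (a + b - 1) / b; }

// Sets the tail arguments of conv2d_1x1_fc in the order the kernel
// declares them; a mismatch would silently bind values to the wrong
// parameters.
cl_int SetConv1x1FcTailArgs(cl::Kernel& kernel, int arg_index,
                            int batch, int input_c, int output_c) {
  cl_int status = kernel.setArg(arg_index++, batch);  // new in this PR
  if (status != CL_SUCCESS) return status;
  status = kernel.setArg(arg_index++, UpDiv(input_c, 4));   // in_c_blks
  if (status != CL_SUCCESS) return status;
  return kernel.setArg(arg_index++, UpDiv(output_c, 4));    // out_c_blks
}
```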