diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_broadcast_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_broadcast_kernel.cl
new file mode 100644
index 00000000000..d2e20fda8c7
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/elementwise_broadcast_kernel.cl
@@ -0,0 +1,328 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+// Broadcast elementwise OP for the general case where x and/or y live in a
+// "folded" image layout (ImageFolder) that differs from the output's
+// ImageDefault layout.
+//
+// Layout flags (image_folder_flag_x / _y):
+//   0: same layout as output (ImageDefault), read directly
+//   1: w -> n remap     2: w -> h remap
+//   3: hw -> ch remap   4: chw -> nch remap
+//
+// *_nhwc4 packs (n, h, w, ceil(c / 4)); one work-item computes one output
+// pixel at (idwc4, idbh).
+__kernel void broadcast_elementwise_common(
+    __read_only image2d_t input_x,
+    __read_only image2d_t input_y,
+    __write_only image2d_t output_image,
+    __private const int4 input_nhwc4,
+    __private const int4 bias_nhwc4,
+    __private const int4 output_nhwc4,
+    __private const int inputx_broadcast_c_flag,
+    __private const int inputy_broadcast_c_flag,
+    __private const int image_folder_flag_x,
+    __private const int image_folder_flag_y,
+    __private const int bias_width) {
+  int idwc4 = get_global_id(0);
+  int idbh = get_global_id(1);
+
+  // Guard against the padded global work size.
+  if (idwc4 >= output_nhwc4.w * output_nhwc4.z ||
+      idbh >= output_nhwc4.x * output_nhwc4.y) {
+    return;
+  }
+
+  // Decompose the 2D image coordinate into logical (n, h, w, c4) indices.
+  int4 id_shape;
+  id_shape.w = idwc4 / output_nhwc4.z;  // c4
+  id_shape.z = idwc4 % output_nhwc4.z;  // w
+  id_shape.y = idbh % output_nhwc4.y;   // h
+  id_shape.x = idbh / output_nhwc4.y;   // n
+
+  // Broadcasting: any output index that exceeds the input's extent in that
+  // dimension is clamped to 0 (vector isless returns -1 lanes for "true",
+  // which select() interprets via the MSB).
+  int4 v_zero = (int4)(0);
+  int4 flag_v = (int4)(0);
+  flag_v = isless(convert_float4(id_shape), convert_float4(input_nhwc4));
+  int4 idx_shape = select(v_zero, id_shape, flag_v);
+
+  int2 cur_index = (int2)(idx_shape.w * input_nhwc4.z + idx_shape.z,
+                          idx_shape.x * input_nhwc4.y + idx_shape.y);
+  CL_DTYPE4 in_x = (CL_DTYPE4)(0.f);
+
+  if (image_folder_flag_x == 0) {
+    in_x = READ_IMG_TYPE(CL_DTYPE_CHAR, input_x, SAMPLER, cur_index);
+  }
+
+  // w -> n: gather 4 consecutive folded pixels and pack their .x lanes.
+  if (image_folder_flag_x == 1) {
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_x, SAMPLER, (int2)(cur_index.x * 4, cur_index.y));
+
+    if (cur_index.x * 4 + 1 < input_nhwc4.w * 4) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 1, cur_index.y));
+    }
+    if (cur_index.x * 4 + 2 < input_nhwc4.w * 4) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 2, cur_index.y));
+    }
+    if (cur_index.x * 4 + 3 < input_nhwc4.w * 4) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 3, cur_index.y));
+    }
+    in_x = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+  }
+
+  // w -> h: folded image stores the vector transposed; swap coordinates.
+  if (image_folder_flag_x == 2) {
+    in_x = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_x, SAMPLER, (int2)(cur_index.y, cur_index.x));
+  }
+
+  // hw -> ch: gather along the folded row and pack the .x lanes.
+  if (image_folder_flag_x == 3) {
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_x, SAMPLER, (int2)(cur_index.y, cur_index.x * 4));
+
+    if (cur_index.x * 4 + 1 < input_nhwc4.z * input_nhwc4.w * 4) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 1));
+    }
+    if (cur_index.x * 4 + 2 < input_nhwc4.z * input_nhwc4.w * 4) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 2));
+    }
+    if (cur_index.x * 4 + 3 < input_nhwc4.z * input_nhwc4.w * 4) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 3));
+    }
+    in_x = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+  }
+
+  // chw -> nch: read 4 rows of the folded image and pick the lane selected
+  // by n % 4.
+  if (image_folder_flag_x == 4) {
+    int tmp_c4 = idx_shape.x / 4;  // n / 4
+    int tmp_h = idx_shape.w * 4;   // c4 * 4
+    int tmp_w = idx_shape.y;
+
+    // NOTE(review): `tmp_c4 * 1 + tmp_w` looks inconsistent with the
+    // `tmp_c4 * input_nhwc4.y + tmp_w` column used in the reads below —
+    // confirm the intended bound check against the ImageFolder converter.
+    cur_index =
+        (int2)(tmp_c4 * 1 + tmp_w, idx_shape.x * input_nhwc4.y + idx_shape.y);
+
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                        input_x,
+                        SAMPLER,
+                        (int2)(tmp_c4 * input_nhwc4.y + tmp_w, tmp_h));
+
+    if (cur_index.x + 1 < input_nhwc4.x * input_nhwc4.y) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(tmp_c4 * input_nhwc4.y + tmp_w, tmp_h + 1));
+    }
+    if (cur_index.x + 2 < input_nhwc4.x * input_nhwc4.y) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(tmp_c4 * input_nhwc4.y + tmp_w, tmp_h + 2));
+    }
+    if (cur_index.x + 3 < input_nhwc4.x * input_nhwc4.y) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_x,
+                          SAMPLER,
+                          (int2)(tmp_c4 * input_nhwc4.y + tmp_w, tmp_h + 3));
+    }
+
+    if (idx_shape.x % 4 == 0) {
+      in_x = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+    }
+
+    if (idx_shape.x % 4 == 1) {
+      in_x = (CL_DTYPE4)(in0.y, in1.y, in2.y, in3.y);
+    }
+
+    if (idx_shape.x % 4 == 2) {
+      in_x = (CL_DTYPE4)(in0.z, in1.z, in2.z, in3.z);
+    }
+
+    if (idx_shape.x % 4 == 3) {
+      in_x = (CL_DTYPE4)(in0.w, in1.w, in2.w, in3.w);
+    }
+  }
+
+  /***************************get y data*******************************/
+  // Same broadcasting clamp, now against y's (bias) extents.
+  flag_v = isless(convert_float4(id_shape), convert_float4(bias_nhwc4));
+  int4 idy_shape = select(v_zero, id_shape, flag_v);
+
+  cur_index = (int2)(idy_shape.w * bias_nhwc4.z + idy_shape.z,
+                     idy_shape.x * bias_nhwc4.y + idy_shape.y);
+  CL_DTYPE4 in_y = (CL_DTYPE4)(0.f);
+
+  if (image_folder_flag_y == 0) {
+    in_y = READ_IMG_TYPE(CL_DTYPE_CHAR, input_y, SAMPLER, cur_index);
+  }
+
+  // w -> n (ImageDefault->ImageFolder for elementwise)
+  if (image_folder_flag_y == 1) {
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_y, SAMPLER, (int2)(cur_index.x * 4, cur_index.y));
+
+    if (cur_index.x * 4 + 1 < bias_width) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 1, cur_index.y));
+    }
+    if (cur_index.x * 4 + 2 < bias_width) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 2, cur_index.y));
+    }
+    if (cur_index.x * 4 + 3 < bias_width) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.x * 4 + 3, cur_index.y));
+    }
+    in_y = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+  }
+
+  // w -> h
+  if (image_folder_flag_y == 2) {
+    in_y = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_y, SAMPLER, (int2)(cur_index.y, cur_index.x));
+  }
+
+  // hw -> ch
+  if (image_folder_flag_y == 3) {
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(
+        CL_DTYPE_CHAR, input_y, SAMPLER, (int2)(cur_index.y, cur_index.x * 4));
+
+    if (cur_index.x * 4 + 1 < bias_nhwc4.z * bias_nhwc4.w * 4) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 1));
+    }
+    if (cur_index.x * 4 + 2 < bias_nhwc4.z * bias_nhwc4.w * 4) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 2));
+    }
+    if (cur_index.x * 4 + 3 < bias_nhwc4.z * bias_nhwc4.w * 4) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(cur_index.y, cur_index.x * 4 + 3));
+    }
+    in_y = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+  }
+
+  // chw -> nch
+  if (image_folder_flag_y == 4) {
+    int tmp_c4 = idy_shape.x / 4;  // n / 4
+    int tmp_h = idy_shape.w * 4;   // c4 * 4
+    int tmp_w = idy_shape.y;
+
+    // NOTE(review): same `tmp_c4 * 1 + tmp_w` question as the x branch above.
+    cur_index =
+        (int2)(tmp_c4 * 1 + tmp_w, idy_shape.x * bias_nhwc4.y + idy_shape.y);
+
+    CL_DTYPE4 in0 = 0.f;
+    CL_DTYPE4 in1 = 0.f;
+    CL_DTYPE4 in2 = 0.f;
+    CL_DTYPE4 in3 = 0.f;
+
+    in0 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                        input_y,
+                        SAMPLER,
+                        (int2)(tmp_c4 * bias_nhwc4.y + tmp_w, tmp_h));
+
+    if (cur_index.x + 1 < bias_nhwc4.x * bias_nhwc4.y) {
+      in1 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(tmp_c4 * bias_nhwc4.y + tmp_w, tmp_h + 1));
+    }
+    if (cur_index.x + 2 < bias_nhwc4.x * bias_nhwc4.y) {
+      in2 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(tmp_c4 * bias_nhwc4.y + tmp_w, tmp_h + 2));
+    }
+    if (cur_index.x + 3 < bias_nhwc4.x * bias_nhwc4.y) {
+      in3 = READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_y,
+                          SAMPLER,
+                          (int2)(tmp_c4 * bias_nhwc4.y + tmp_w, tmp_h + 3));
+    }
+
+    if (idy_shape.x % 4 == 0) {
+      in_y = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
+    }
+
+    if (idy_shape.x % 4 == 1) {
+      in_y = (CL_DTYPE4)(in0.y, in1.y, in2.y, in3.y);
+    }
+
+    if (idy_shape.x % 4 == 2) {
+      in_y = (CL_DTYPE4)(in0.z, in1.z, in2.z, in3.z);
+    }
+
+    if (idy_shape.x % 4 == 3) {
+      in_y = (CL_DTYPE4)(in0.w, in1.w, in2.w, in3.w);
+    }
+  }
+
+  // Channel-dim broadcast: replicate lane .x across the whole vector.
+  in_x = SELECT(in_x, (CL_DTYPE4)(in_x.x), inputx_broadcast_c_flag);
+  in_y = SELECT(in_y, (CL_DTYPE4)(in_y.x), inputy_broadcast_c_flag);
+
+  CL_DTYPE4 output = OPERATOR(in_x, in_y);
+#ifdef FUSE_SCALE
+  output = fuse_scale(output, SCALE_SLOPE, SCALE_BIAS, SCALE_ALPHA);
+#endif
+
+#ifdef RELU
+  CL_DTYPE4 alpha;
+  output = activation_type4(output, alpha);
+#endif
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(idwc4, idbh), output);
+}
\ No newline at end of file
diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_kernel.cl
new file mode 100644
index 00000000000..797b23d7b37
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/elementwise_kernel.cl
@@ -0,0 +1,100 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+// Elementwise OP for the fast path: x and y have identical shapes and
+// layouts, so one work-item reads one pixel from each image, applies
+// OPERATOR (and optional fused scale / activation), and writes the result.
+__kernel void elementwise_compute(__read_only image2d_t input,
+                                  __read_only image2d_t bias,
+                                  __write_only image2d_t outputImage) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+
+  CL_DTYPE4 in_x = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, coords);
+  CL_DTYPE4 in_y = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, coords);
+
+  CL_DTYPE4 output = OPERATOR(in_x, in_y);
+#ifdef FUSE_SCALE
+  output = fuse_scale(output, SCALE_SLOPE, SCALE_BIAS, SCALE_ALPHA);
+#endif
+
+#ifdef RELU
+  CL_DTYPE4 alpha;
+  output = activation_type4(output, alpha);
+#endif
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
+}
+
+// Broadcast elementwise OP for the basic case: x and y differ in shape but
+// share the ImageDefault layout, so each side only needs its out-of-range
+// indices clamped to 0 (numpy-style broadcasting) before a direct read.
+__kernel void broadcast_elementwise_basic(
+    __read_only image2d_t input_x,
+    __read_only image2d_t input_y,
+    __write_only image2d_t output_image,
+    __private const int4 input_nhwc4,
+    __private const int4 bias_nhwc4,
+    __private const int4 output_nhwc4,
+    __private const int inputx_broadcast_c_flag,
+    __private const int inputy_broadcast_c_flag,
+    __private const int image_folder_flag_x,
+    __private const int image_folder_flag_y,
+    __private const int bias_width) {
+  int idwc4 = get_global_id(0);
+  int idbh = get_global_id(1);
+
+  // Guard against the padded global work size.
+  if (idwc4 >= output_nhwc4.w * output_nhwc4.z ||
+      idbh >= output_nhwc4.x * output_nhwc4.y) {
+    return;
+  }
+
+  // Decompose the 2D image coordinate into logical (n, h, w, c4) indices.
+  int4 id_shape;
+  id_shape.w = idwc4 / output_nhwc4.z;  // c4
+  id_shape.z = idwc4 % output_nhwc4.z;  // w
+  id_shape.y = idbh % output_nhwc4.y;   // h
+  id_shape.x = idbh / output_nhwc4.y;   // n
+
+  int4 v_zero = (int4)(0);
+  int4 flag_v = (int4)(0);
+  // Clamp indices that exceed x's extents to 0 (broadcast).
+  flag_v = isless(convert_float4(id_shape), convert_float4(input_nhwc4));
+  int4 idx_shape = select(v_zero, id_shape, flag_v);
+
+  int2 cur_index = (int2)(idx_shape.w * input_nhwc4.z + idx_shape.z,
+                          idx_shape.x * input_nhwc4.y + idx_shape.y);
+  CL_DTYPE4 in_x = READ_IMG_TYPE(CL_DTYPE_CHAR, input_x, SAMPLER, cur_index);
+
+  /***************************get y data*******************************/
+  flag_v = isless(convert_float4(id_shape), convert_float4(bias_nhwc4));
+  int4 idy_shape = select(v_zero, id_shape, flag_v);
+
+  cur_index = (int2)(idy_shape.w * bias_nhwc4.z + idy_shape.z,
+                     idy_shape.x * bias_nhwc4.y + idy_shape.y);
+  CL_DTYPE4 in_y = READ_IMG_TYPE(CL_DTYPE_CHAR, input_y, SAMPLER, cur_index);
+
+  // Channel-dim broadcast: replicate lane .x across the whole vector.
+  in_x = SELECT(in_x, (CL_DTYPE4)(in_x.x), inputx_broadcast_c_flag);
+  in_y = SELECT(in_y, (CL_DTYPE4)(in_y.x), inputy_broadcast_c_flag);
+
+  CL_DTYPE4 output = OPERATOR(in_x, in_y);
+#ifdef FUSE_SCALE
+  output = fuse_scale(output, SCALE_SLOPE, SCALE_BIAS, SCALE_ALPHA);
+#endif
+
+#ifdef RELU
+  CL_DTYPE4 alpha;
+  output = activation_type4(output, alpha);
+#endif
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(idwc4, idbh), output);
+}
\ No newline at end of file
diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt
index 157daedfe97..0a797e02ce5 100644
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -1,3 +1,4 @@
+
 if(LITE_WITH_OPENCL)
     set(IS_FAKED_KERNEL false CACHE INTERNAL "")
     set(cl_kernel_deps ops cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter)
@@ -13,11 +14,13 @@ endif()
 # image kernel
 # #####################
 # basic
-add_kernel(elementwise_add_opencl_image OPENCL basic SRCS elementwise_add_image_compute.cc)
-add_kernel(elementwise_sub_opencl_image OPENCL basic SRCS elementwise_sub_image_compute.cc)
-add_kernel(elementwise_mul_opencl_image OPENCL basic SRCS elementwise_mul_image_compute.cc)
-add_kernel(fusion_elementwise_sub_activation_opencl_image
-    OPENCL basic
SRCS fusion_elementwise_sub_activation_image_compute.cc) + +#add_kernel(elementwise_add_opencl_image OPENCL basic SRCS elementwise_add_image_compute.cc) +#add_kernel(elementwise_sub_opencl_image OPENCL basic SRCS elementwise_sub_image_compute.cc) +#add_kernel(elementwise_mul_opencl_image OPENCL basic SRCS elementwise_mul_image_compute.cc) +#add_kernel(fusion_elementwise_sub_activation_opencl_image +# OPENCL basic SRCS fusion_elementwise_sub_activation_image_compute.cc) +add_kernel(elementwise_opencl_image OPENCL basic SRCS elementwise_image_compute.cc) add_kernel(pool_opencl_image OPENCL basic SRCS pool_image_compute.cc) add_kernel(activation_opencl_image OPENCL basic SRCS activation_image_compute.cc) @@ -101,9 +104,12 @@ lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_te lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc DEPS kernels core) -lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc - DEPS kernels core) -lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc +#lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc +# DEPS kernels core) +#lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc +# DEPS kernels core) + +lite_cc_test(test_elementwise_image_opencl SRCS elementwise_image_compute_test.cc DEPS kernels core) lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc diff --git a/lite/kernels/opencl/elementwise_add_image_compute.cc b/lite/kernels/opencl/elementwise_add_image_compute.cc index 5c7b27d8ba6..86d87c73fda 100644 --- a/lite/kernels/opencl/elementwise_add_image_compute.cc +++ b/lite/kernels/opencl/elementwise_add_image_compute.cc @@ -275,43 +275,43 @@ namespace ocl = paddle::lite::kernels::opencl; // may from anther branch like "X" (kOpenCL, nothing to do). 
// Consider 2 situations have different actions when pass running(pick kernel), // set target of "Y" as kOpenCL temporarily. -REGISTER_LITE_KERNEL(elementwise_add, - kOpenCL, - kFP16, - kImageDefault, - ocl::ElementwiseAddImageCompute, - def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .Finalize(); +// REGISTER_LITE_KERNEL(elementwise_add, +// kOpenCL, +// kFP16, +// kImageDefault, +// ocl::ElementwiseAddImageCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindInput("Y", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .Finalize(); -REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, - kOpenCL, - kFP16, - kImageDefault, - ocl::ElementwiseAddImageCompute, - def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .Finalize(); -#define LITE_WITH_LOG +// REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, +// kOpenCL, +// kFP16, +// kImageDefault, +// ocl::ElementwiseAddImageCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindInput("Y", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) 
+// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .Finalize(); +// #define LITE_WITH_LOG diff --git a/lite/kernels/opencl/elementwise_image_compute.cc b/lite/kernels/opencl/elementwise_image_compute.cc new file mode 100644 index 00000000000..6e92ba515f9 --- /dev/null +++ b/lite/kernels/opencl/elementwise_image_compute.cc @@ -0,0 +1,589 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/host/elementwise_op_func.h" +#include "lite/kernels/opencl/image_helper.h" +#include "lite/operators/op_params.h" +#include "lite/utils/log/logging.h" +#include "lite/utils/replace_stl/stream.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/profiler.h" +#endif +#include "lite/backends/opencl/cl_utility.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +class ElementwiseImageCompute : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + std::string doc() const override { + return "Elementwise using cl::Image2D(ImageDefault/RGBA), kFP32"; + } + + void PrepareForRun() override { + auto& context = ctx_->As(); + if (param_.is_type()) { + ele_param_ = param_.get_mutable(); + } else { + ele_param_ = + param_.get_mutable(); + auto act_t = + static_cast(ele_param_) + ->act_type; + VLOG(4) << "act: " << act_t; + if (act_t == "relu") { + build_options_ += " -DRELU"; + } else { + LOG(FATAL) << "Unsupported Activation type: " << act_t; + } + } + + auto* x = ele_param_->X; + auto* y = ele_param_->Y; + x_dims_ = ele_param_->X->dims(); + y_dims_ = ele_param_->Y->dims(); + auto& out_dims = ele_param_->Out->dims(); + axis_ = ele_param_->axis; + out_nchw_ = out_dims.Vectorize(); + + host::fix_x_y_dims( + x, y, ele_param_->Out, axis_, &x_nchw_, &y_nchw_); + + while (x_nchw_.size() < 4) { + x_nchw_.insert(x_nchw_.cbegin(), 1); + } + + while (y_nchw_.size() < 4) { + y_nchw_.insert(y_nchw_.cbegin(), 1); + } + + while (out_nchw_.size() < 4) { + out_nchw_.insert(out_nchw_.cbegin(), 1); + } + + image_folder_flag_y_ = 0; + image_folder_flag_x_ = 0; + int broadcast_elementwise_common_flag = 0; + if (y->persistable()) { + LOG(INFO) << "with y->persistable"; + 
y_weights_image_ = std::unique_ptr(new Tensor); + std::unique_ptr tensor_hold_y_image_ = + std::unique_ptr(new Tensor); + CLImageConverterDefault default_converter; + const DDim& y_image_dims = + default_converter.InitImageDimInfoWith(DDim(y_nchw_)); + tensor_hold_y_image_->Resize({1, y_image_dims[0], y_image_dims[1], 4}); + + auto* y_cpu_image = MUTABLE_DATA_CPU(tensor_hold_y_image_); + auto* y_cpu_nchw = static_cast(const_cast(y->raw_data())); + default_converter.NCHWToImage(y_cpu_nchw, y_cpu_image, DDim(y_nchw_)); + MUTABLE_DATA_GPU( + y_weights_image_, y_image_dims[0], y_image_dims[1], y_cpu_image); + } else { + if ((y_dims_.size() == 1) && + (axis_ != -1) && // x{n,c,h,w} && y{c} || x{c,h,w} && y{c} + (axis_ == x_dims_.size() - 3)) { + image_folder_flag_y_ = 1; + } + if ((y_dims_.size() == 1) && + (axis_ != -1) && // x{n,c,h,w} && y{h} || x{c,h,w} && y{h} + (axis_ == x_dims_.size() - 2)) { + image_folder_flag_y_ = 2; + } + if ((y_dims_.size() == 2) && + (axis_ != -1) && // x{n,c,h,w} && y{c,h} || x{c,h,w} && y{c,h} + (axis_ == x_dims_.size() - 3)) { + image_folder_flag_y_ = 3; + } + if ((y_dims_.size() == 2) && (x_dims_.size() == 4) && + (axis_ == 0)) { // x{n,c,h,w} && y{n,c} + image_folder_flag_y_ = 1; + } + if ((y_dims_.size() == 3) && (x_dims_.size() == 4) && + (axis_ == 0)) { // x{n,c,h,w} && y{n,c,h} + image_folder_flag_y_ = 4; + } + + if ((x_dims_.size() == 1) && (axis_ != -1) && + (axis_ == y_dims_.size() - 3)) { + image_folder_flag_x_ = 1; + } + if ((x_dims_.size() == 1) && (axis_ != -1) && + (axis_ == y_dims_.size() - 2)) { + image_folder_flag_x_ = 2; + } + if ((x_dims_.size() == 2) && (axis_ != -1) && + (axis_ == y_dims_.size() - 3)) { + image_folder_flag_x_ = 3; + } + if ((x_dims_.size() == 2) && (y_dims_.size() == 4) && (axis_ == 0)) { + image_folder_flag_x_ = 1; + } + if ((x_dims_.size() == 3) && (y_dims_.size() == 4) && (axis_ == 0)) { + image_folder_flag_x_ = 4; + } + } + + if (image_folder_flag_x_ != 0 || image_folder_flag_y_ != 0) { + 
broadcast_elementwise_common_flag = 1; + } + + if (y_dims_ == x_dims_) { + kernel_func_name_ = "elementwise_compute"; + } else if (broadcast_elementwise_common_flag == 0) { + kernel_func_name_ = "broadcast_elementwise_basic"; + } else { + kernel_func_name_ = "broadcast_elementwise_common"; + } + + if (broadcast_elementwise_common_flag == 1) { + kernel_func_paths_ = "image/elementwise_broadcast_kernel.cl"; + } else { + kernel_func_paths_ = "image/elementwise_kernel.cl"; + } + + // op_type + auto elementwise_compute_type = op_type(); + if (elementwise_compute_type == "elementwise_div") { + build_options_ += " -DOPERATOR(in,bias)=(in/bias) "; + } else if (elementwise_compute_type == "elementwise_add") { + build_options_ += " -DOPERATOR(in,bias)=(in+bias) "; + } else if (elementwise_compute_type == "elementwise_sub") { + build_options_ += " -DOPERATOR(in,bias)=(in-bias) "; + } else if (elementwise_compute_type == "elementwise_mul") { + build_options_ += " -DOPERATOR(in,bias)=(in*bias) "; + } else if (elementwise_compute_type == "elementwise_max") { + build_options_ += " -DOPERATOR(in,bias)=fmax(in,bias) "; + } else if (elementwise_compute_type == "elementwise_min") { + build_options_ += " -DOPERATOR(in,bias)=fmin(in,bias) "; + } else if (elementwise_compute_type == "elementwise_pow") { + build_options_ += " -DOPERATOR(in,bias)=pow(in,bias) "; + } else if (elementwise_compute_type == "elementwise_mod") { + build_options_ += " -DOPERATOR(in,bias)=fmod(in,bias) "; + } + + if (ele_param_->fuse_scale) { + build_options_ += + "-DFUSE_SCALE -DSCALE_SLOPE=" + std::to_string(ele_param_->scale) + + "f " + " -DSCALE_BIAS=" + std::to_string(ele_param_->bias) + "f " + + " -DSCALE_ALPHA=" + std::to_string(ele_param_->alpha) + "f "; + } + context.cl_context()->AddKernel( + kernel_func_name_, kernel_func_paths_, build_options_, time_stamp_); + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = 
context.cl_context()->GetKernel(kernel_key.str()); + } + +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + std::string fuse_scale_str = ele_param_->fuse_scale ? "/fuse_scale" : ""; + ch->kernel_func_name = kernel_func_name_ + fuse_scale_str; + ch->global_work_size = ch->NDRangeToStr(gws_); + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + + void ReInitWhenNeeded() override { + if ((!first_epoch_for_reinit_ && x_dims_ != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims_; + first_epoch_for_reinit_ = false; + + // compute global work size + int hb = out_nchw_[0] * out_nchw_[2]; + int cw = + out_nchw_[3] * + maptofactor(out_nchw_[1], 4); // return (i + factor - 1) / factor; + + gws_ = cl::NDRange{static_cast(cw), + static_cast(hb), + static_cast(1)}; + } + } + + void Run() override { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + const auto* x_img = GET_DATA_GPU(ele_param_->X); + const auto* y_img = GET_DATA_GPU(ele_param_->Y); + if (ele_param_->Y->persistable()) { + y_img = GET_DATA_GPU(y_weights_image_); + } + auto out_image_shape = InitImageDimInfoWith(DDim(out_nchw_)); // w, h + auto* out_img = MUTABLE_DATA_GPU(ele_param_->Out, + out_image_shape["width"], + out_image_shape["height"], + nullptr); + + // nchw --> nhwc4 + cl_int4 inx_dim = {static_cast(x_nchw_[0]), + static_cast(x_nchw_[2]), + static_cast(x_nchw_[3]), + static_cast((x_nchw_[1] + 3) / 4)}; + + cl_int4 iny_dim = {static_cast(y_nchw_[0]), + static_cast(y_nchw_[2]), + static_cast(y_nchw_[3]), + static_cast((y_nchw_[1] + 3) / 4)}; + + cl_int4 out_dim = {static_cast(out_nchw_[0]), + static_cast(out_nchw_[2]), + static_cast(out_nchw_[3]), + static_cast((out_nchw_[1] + 3) / 4)}; + + int inputx_broadcast_c_flag = (x_nchw_[1] == 1) ? 1 : 0; + int inputy_broadcast_c_flag = (y_nchw_[1] == 1) ? 
1 : 0; + int bias_width = y_nchw_[1]; + + if (y_dims_ == x_dims_) { + cl_int status = kernel_.setArg(0, *x_img); + CL_CHECK_FATAL(status); + status = kernel_.setArg(1, *y_img); + CL_CHECK_FATAL(status); + status = kernel_.setArg(2, *out_img); + CL_CHECK_FATAL(status); + } else { + cl_int status = kernel_.setArg(0, *x_img); + CL_CHECK_FATAL(status); + status = kernel_.setArg(1, *y_img); + CL_CHECK_FATAL(status); + status = kernel_.setArg(2, *out_img); + CL_CHECK_FATAL(status); + status = kernel_.setArg(3, inx_dim); + CL_CHECK_FATAL(status); + status = kernel_.setArg(4, iny_dim); + CL_CHECK_FATAL(status); + status = kernel_.setArg(5, out_dim); + CL_CHECK_FATAL(status); + status = kernel_.setArg(6, inputx_broadcast_c_flag); + CL_CHECK_FATAL(status); + status = kernel_.setArg(7, inputy_broadcast_c_flag); + CL_CHECK_FATAL(status); + status = kernel_.setArg(8, image_folder_flag_x_); + CL_CHECK_FATAL(status); + status = kernel_.setArg(9, image_folder_flag_y_); + CL_CHECK_FATAL(status); + status = kernel_.setArg(10, bias_width); + CL_CHECK_FATAL(status); + } + + auto status = EnqueueNDRangeKernel( + context, kernel_, cl::NullRange, gws_, cl::NullRange, nullptr, event_); + CL_CHECK_FATAL(status); + +#ifdef LITE_WITH_PROFILE + event_.wait(); + auto queue_start_nanos = + event_.getProfilingInfo(); + auto submit_start_nanos = + event_.getProfilingInfo(); + auto run_start_nanos = + event_.getProfilingInfo(); + auto run_stop_nanos = event_.getProfilingInfo(); + + double time_ms = (submit_start_nanos - queue_start_nanos) / 1000000.0; + VLOG(4) << "GetQueuedToSubmitTime: " << time_ms << std::endl; + + time_ms = (run_start_nanos - submit_start_nanos) / 1000000.0; + VLOG(4) << "GetSubmitToStartTime: " << time_ms << std::endl; + + time_ms = (run_stop_nanos - run_start_nanos) / 1000000.0; + VLOG(4) << "GetStartToEndTime: " << time_ms << std::endl; +#endif + } + + private: + param_t* ele_param_{nullptr}; + bool first_epoch_for_reinit_{true}; + DDim last_x_dims_; + std::vector 
x_nchw_{}; + std::vector y_nchw_{}; + std::vector out_nchw_{}; + std::string kernel_func_name_{}; + std::string build_options_{}; + std::string kernel_func_paths_{}; + std::string time_stamp_{GetTimeStamp()}; + cl::Kernel kernel_; + cl::NDRange gws_; + // y is persistable + std::unique_ptr y_weights_image_{nullptr}; + int image_folder_flag_x_{0}; + int image_folder_flag_y_{0}; + int axis_{-1}; + DDimLite x_dims_{}; + DDimLite y_dims_{}; +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +namespace ocl = paddle::lite::kernels::opencl; +REGISTER_LITE_KERNEL(elementwise_div, + kOpenCL, + kFP16, + kImageDefault, + ocl::ElementwiseImageCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_add, + kOpenCL, + kFP16, + kImageDefault, + ocl::ElementwiseImageCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_sub, + kOpenCL, + kFP16, + kImageDefault, + ocl::ElementwiseImageCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); + 
+// Remaining binary elementwise ops and their fused-activation variants all
+// route to the same ElementwiseImageCompute; the op name selects the
+// OPERATOR build macro in PrepareForRun.
+REGISTER_LITE_KERNEL(elementwise_mul,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_max,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_min,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_pow,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_mod,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(fusion_elementwise_sub_activation,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(fusion_elementwise_div_activation,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     ocl::ElementwiseImageCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
diff --git a/lite/kernels/opencl/elementwise_image_compute_test.cc b/lite/kernels/opencl/elementwise_image_compute_test.cc
new file mode 100644
index 00000000000..111c65c3067
--- /dev/null
+++ b/lite/kernels/opencl/elementwise_image_compute_test.cc
@@ -0,0 +1,751 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include +#include +#include +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/opencl/test_helper.h" +#include "lite/tests/utils/fill_data.h" + +#define FP16_MAX_DIFF (5e-1) +#define FP32_ABS_DIFF (1e-7) +#define FP32_RELATIVE_DIFF (1e-4) +#define FP16_ABS_DIFF (1e-1) +#define FP16_RELATIVE_DIFF (1e-1) +namespace paddle { +namespace lite { + +template +void fill_data(dtype* x, const int length, int set_value = -1) { + if (set_value == -1) { + for (size_t idx = 0; idx < length; ++idx) { + x[idx] = idx; + } + } else if (set_value != -1) { + for (size_t idx = 0; idx < length; ++idx) { + x[idx] = set_value; + } + } +} + +int randint(int beg, int end) { + int res = 0; + fill_data_rand(&res, beg, end, 1); + return res; +} + +bool randbool() { return randint(0, 1000000) < 500000; } + +template +T* AtLogicInd(T* data, + const std::vector& dim, + const std::vector& logic_index) { + assert(dim.size() == logic_index.size()); + + int offset = 0; + int stride = 1; + for (int i = dim.size() - 1; i >= 0; --i) { + int ind = logic_index[i]; + if (dim[i] == 1) { + ind = 0; + } + assert(ind < dim[i]); + offset += ind * stride; + stride *= dim[i]; + } + return data + offset; +} + +std::vector GenLogicIndex(int logic_offset, const std::vector& dim) { + std::vector strides(dim.size(), 1); + for (int i = dim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dim[i + 1]; + } + std::vector ret(dim.size(), 0); + for (int i = 0; i < dim.size(); ++i) { + ret[i] = logic_offset / strides[i]; + logic_offset %= strides[i]; + } + return ret; +} + +template +void BroadcastCPURef(const T* x, + const T* y, + T* z, + const std::vector& x_dim, + const std::vector& y_dim, + const std::vector& z_dim, + bool use_relu, + const std::function op) { + int N = 1; + for (int i = 0; i < z_dim.size(); ++i) { + N *= z_dim[i]; + } + for (int i = 0; i < N; ++i) { + auto logic_index = GenLogicIndex(i, z_dim); + const 
T* x_d = AtLogicInd(x, x_dim, logic_index); + const T* y_d = AtLogicInd(y, y_dim, logic_index); + T* z_d = AtLogicInd(z, z_dim, logic_index); + *z_d = op(*x_d, *y_d); + if (use_relu) { + *z_d = std::max(*z_d, static_cast(0)); + } + } +} + +template +void RunElementwiseBroadcast(const Place& place, + const int dim_size, + bool fuse_act, + const lite_api::CLPrecisionType p, + const std::string& alias, + const std::string& elt_type, + const std::string& act_type, + const std::function op, + double abs_error = 1e-3) { + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + CLRuntime::Global()->set_precision(p); + const bool fp16_flag = (p == lite_api::CLPrecisionType::CL_PRECISION_FP16); + LOG(INFO) << "\n\t[ START ] Test Precision=" + << lite_api::CLPrecisionTypeToStr(p); + // set kernel + auto kernels = KernelRegistry::Global().Create( + elt_type, TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + + auto elemul_img_kernel = std::move(kernels.front()); + VLOG(4) << "get elemul kernel: " << elemul_img_kernel->doc(); + const int MAX_SHAPE_VALUE = 10; + // gen out_dim + std::vector out_shape(dim_size, 0); + for (int i = 0; i < dim_size; ++i) { + out_shape[i] = randint(2, MAX_SHAPE_VALUE); + } + std::vector x_shape_full = out_shape; + std::vector y_shape_full = out_shape; + + std::vector x_shape_cut; + std::vector y_shape_cut; + + int axis = -1; + static bool cut_dimension = true; + cut_dimension = !cut_dimension; + if (cut_dimension) { + // generate x_shape_cut and y_shape_cut by remove dimension + static bool use_axis = true; + use_axis = !use_axis; + if (use_axis) { + x_shape_cut = x_shape_full; + // we will cut y only, and set tail of y to be 1 + axis = randint(0, dim_size - 1); + + int tail1_num = randint(0, dim_size - axis); + for (int i = 0; i < axis; ++i) { + y_shape_full[i] = 1; + } + for (int i = axis; i < (dim_size - tail1_num); ++i) { + y_shape_cut.push_back(y_shape_full[i]); + } + for (int i = 
0; i < tail1_num; ++i) { + y_shape_cut.push_back(1); + } + for (int i = dim_size - tail1_num; i < dim_size; ++i) { + y_shape_full[i] = 1; + } + static bool swap_x_and_y = true; + swap_x_and_y = !swap_x_and_y; + if (swap_x_and_y) { + std::swap(x_shape_cut, y_shape_cut); + std::swap(x_shape_full, y_shape_full); + } + } else { + // we will cut x or y + if (randbool()) { + y_shape_cut = y_shape_full; + int cut_x_num = randint(0, dim_size) * randbool(); + for (int i = 0; i < cut_x_num; ++i) { + x_shape_full[i] = 1; + } + for (int i = cut_x_num; i < dim_size; ++i) { + x_shape_cut.push_back(x_shape_full[i]); + } + } else { + x_shape_cut = x_shape_full; + int cut_y_num = randint(0, dim_size) * randbool(); + for (int i = 0; i < cut_y_num; ++i) { + y_shape_full[i] = 1; + } + for (int i = cut_y_num; i < dim_size; ++i) { + y_shape_cut.push_back(y_shape_full[i]); + } + } + } + } else { + // generate x_shape_cut and y_shape_cut by random + // random assign 1 to some dim + for (int i = 0; i < dim_size; ++i) { + if (randbool() && y_shape_full[i] != 1) { + x_shape_full[i] = 1; + } + if (randbool() && x_shape_full[i] != 1) { + y_shape_full[i] = 1; + } + } + // just remove 1 at high dimesion + int ind = 0; + while (x_shape_full[ind] == 1) { + ++ind; + } + for (int i = ind; i < dim_size; ++i) { + x_shape_cut.push_back(x_shape_full[i]); + } + ind = 0; + while (y_shape_full[ind] == 1) { + ++ind; + } + for (int i = ind; i < dim_size; ++i) { + y_shape_cut.push_back(y_shape_full[i]); + } + } + + DDim x_dim = + DDim(std::vector(x_shape_cut.begin(), x_shape_cut.end())); + DDim y_dim = + DDim(std::vector(y_shape_cut.begin(), y_shape_cut.end())); + DDim out_dim = DDim(std::vector(out_shape.begin(), out_shape.end())); + + LOG(INFO) << "==================" << elt_type << "==================="; + LOG(INFO) << "x_dim:" << x_dim << "\ty_dim:" << y_dim + << "\tout_dim:" << out_dim; + LOG(INFO) << "fuse_act:" << fuse_act << "; axis:" << axis; + + // tensor + lite::Tensor ele_x, ele_y, ele_out; + 
ele_x.Resize(x_dim); + ele_y.Resize(y_dim); + ele_out.Resize(out_dim); + + // initialize tensors + VLOG(4) << "initialize tensors"; + paddle::lite::CLImageConverterDefault* default_convertor = + new CLImageConverterDefault(); + + // operator param + operators::FusionElementwiseActivationParam + fuseEleParam; // enabled if fuse_act is true + fuseEleParam.X = &ele_x; + fuseEleParam.Y = &ele_y; + fuseEleParam.Out = &ele_out; + fuseEleParam.axis = axis; + fuseEleParam.act_type = fuse_act ? "relu" : ""; + + operators::ElementwiseParam eleParam; + eleParam.X = &ele_x; + eleParam.Y = &ele_y; + eleParam.Out = &ele_out; + eleParam.axis = axis; + + if (fuse_act) { + elemul_img_kernel->SetParam(fuseEleParam); + } else { + elemul_img_kernel->SetParam(eleParam); + } + + elemul_img_kernel->SetContext(std::move(context)); + + // x + std::vector x_v(x_dim.production()); + // fill_data(x_v.data(), x_v.size()); + fill_data_rand(x_v.data(), -10.f, 10.f, x_dim.production()); + auto x_img_shape = default_convertor->InitImageDimInfoWith(x_dim); // w, h + const size_t dtype_size = fp16_flag ? 
sizeof(half_t) : sizeof(float); + std::vector x_image_data(x_img_shape.production() * 4 * + dtype_size); // 4: RGBA + default_convertor->NCHWToImage(x_v.data(), x_image_data.data(), x_dim); + MUTABLE_DATA_GPU(&ele_x, x_img_shape[0], x_img_shape[1], x_image_data.data()); + + // y + std::vector y_v(y_dim.production()); + // fill_data(y_v.data(), y_v.size()); + fill_data_rand(y_v.data(), -10.f, 10.f, y_dim.production()); + auto y_img_shape = default_convertor->InitImageDimInfoWith(y_dim); // w, h + std::vector y_image_data(y_img_shape.production() * 4 * + dtype_size); // 4: RGBA + default_convertor->NCHWToImage(y_v.data(), y_image_data.data(), y_dim); + MUTABLE_DATA_GPU(&ele_y, y_img_shape[0], y_img_shape[1], y_image_data.data()); + + // out + std::vector out_from_gpu(out_dim.production()); + auto out_img_shape = + default_convertor->InitImageDimInfoWith(out_dim); // w, h + auto* out_image = + MUTABLE_DATA_GPU(&ele_out, out_img_shape[0], out_img_shape[1], nullptr); + + // run kernel + elemul_img_kernel->Launch(); + CLRuntime::Global()->command_queue().finish(); + // download gpu result to cpu + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + std::vector out_image_data(out_img_shape.production() * 4 * + dtype_size); // 4 : RGBA + TargetWrapperCL::ImgcpySync(out_image_data.data(), + out_image, + out_img_shape[0], + out_img_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + default_convertor->ImageToNCHW( + out_image_data.data(), out_from_gpu.data(), out_img_shape, out_dim); + + // compute cpu reference + std::unique_ptr out_from_cpu(new float[out_dim.production()]); + BroadcastCPURef(x_v.data(), + y_v.data(), + out_from_cpu.get(), + x_shape_full, + y_shape_full, + out_shape, + fuse_act, + op); + + VLOG(4) << "output_data vs output_ref_data"; + auto relative_diff_thres = + fp16_flag ? FP16_RELATIVE_DIFF : FP32_RELATIVE_DIFF; + auto abs_diff_thres = fp16_flag ? 
FP16_ABS_DIFF : FP32_ABS_DIFF; + uint32_t diff_cnt = 0; + for (int i = 0; i < out_dim.production(); i++) { + auto relative_diff = + COMPUTE_RELATIVE_DIFF(out_from_gpu[i], out_from_cpu[i]); + auto abs_diff = COMPUTE_ABS_DIFF(out_from_gpu[i], out_from_cpu[i]); + EXPECT_FALSE(relative_diff > relative_diff_thres && + abs_diff > abs_diff_thres); + if (relative_diff > relative_diff_thres && abs_diff > abs_diff_thres) { + LOG(WARNING) << lite_api::CLPrecisionTypeToStr(p) << " err idx: " << i + << " abs_diff: " << abs_diff + << "\t relative_diff: " << relative_diff + << "\t out_ins: " << out_from_gpu[i] + << "\t out_ref: " << out_from_cpu[i]; + diff_cnt++; + } + } + if (diff_cnt != 0) { + LOG(FATAL) << "Err num " << diff_cnt << "/" << out_dim.production(); + } + + LOG(INFO) << "\n\t[ PASSED ] " + << " Test Precision=" << lite_api::CLPrecisionTypeToStr(p); +} + +template +void RunElementwiseCommonSize(std::vector x_shape_full, + std::vector y_shape_full, + std::vector x_shape_cut, + std::vector y_shape_cut, + std::vector out_shape, + int axis, + bool xy_swap_flag, + double abs_error = 1e-3) { + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + const lite_api::CLPrecisionType p = + paddle::lite_api::CLPrecisionType::CL_PRECISION_FP32; + CLRuntime::Global()->set_precision(p); + const bool fp16_flag = (p == lite_api::CLPrecisionType::CL_PRECISION_FP16); + LOG(INFO) << "\n\t[ START ] Test Precision=" + << lite_api::CLPrecisionTypeToStr(p); + // set kernel + std::string elt_type = "elementwise_add"; + const std::function op = [](float l, float r) { return l + r; }; + bool fuse_act = false; + auto kernels = KernelRegistry::Global().Create( + elt_type, TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + + auto elemul_img_kernel = std::move(kernels.front()); + VLOG(4) << "get elemul kernel: " << elemul_img_kernel->doc(); + + if (xy_swap_flag) { + std::swap(x_shape_cut, y_shape_cut); + std::swap(x_shape_full, 
y_shape_full); + } + + DDim x_dim = + DDim(std::vector(x_shape_cut.begin(), x_shape_cut.end())); + DDim y_dim = + DDim(std::vector(y_shape_cut.begin(), y_shape_cut.end())); + DDim out_dim = DDim(std::vector(out_shape.begin(), out_shape.end())); + + LOG(INFO) << "==================" << elt_type << "==================="; + LOG(INFO) << "x_dim:" << x_dim << "\ty_dim:" << y_dim + << "\tout_dim:" << out_dim; + LOG(INFO) << "fuse_act:" << fuse_act << "; axis:" << axis; + + // tensor + lite::Tensor ele_x, ele_y, ele_out; + ele_x.Resize(x_dim); + ele_y.Resize(y_dim); + ele_out.Resize(out_dim); + + // initialize tensors + VLOG(4) << "initialize tensors"; + paddle::lite::CLImageConverterDefault* default_convertor = + new CLImageConverterDefault(); + + // operator param + operators::FusionElementwiseActivationParam + fuseEleParam; // enabled if fuse_act is true + fuseEleParam.X = &ele_x; + fuseEleParam.Y = &ele_y; + fuseEleParam.Out = &ele_out; + fuseEleParam.axis = axis; + fuseEleParam.act_type = fuse_act ? "relu" : ""; + + operators::ElementwiseParam eleParam; + eleParam.X = &ele_x; + eleParam.Y = &ele_y; + eleParam.Out = &ele_out; + eleParam.axis = axis; + + if (fuse_act) { + elemul_img_kernel->SetParam(fuseEleParam); + } else { + elemul_img_kernel->SetParam(eleParam); + } + + elemul_img_kernel->SetContext(std::move(context)); + + // x + std::vector x_v(x_dim.production()); + // fill_data(x_v.data(), x_v.size()); + fill_data_rand(x_v.data(), -10.f, 10.f, x_dim.production()); + auto x_img_shape = default_convertor->InitImageDimInfoWith(x_dim); // w, h + const size_t dtype_size = fp16_flag ? 
sizeof(half_t) : sizeof(float); + std::vector x_image_data(x_img_shape.production() * 4 * + dtype_size); // 4: RGBA + default_convertor->NCHWToImage(x_v.data(), x_image_data.data(), x_dim); + MUTABLE_DATA_GPU(&ele_x, x_img_shape[0], x_img_shape[1], x_image_data.data()); + + // y + std::vector y_v(y_dim.production()); + // fill_data(y_v.data(), y_v.size()); + fill_data_rand(y_v.data(), -10.f, 10.f, y_dim.production()); + auto y_img_shape = default_convertor->InitImageDimInfoWith(y_dim); // w, h + std::vector y_image_data(y_img_shape.production() * 4 * + dtype_size); // 4: RGBA + default_convertor->NCHWToImage(y_v.data(), y_image_data.data(), y_dim); + MUTABLE_DATA_GPU(&ele_y, y_img_shape[0], y_img_shape[1], y_image_data.data()); + + // out + std::vector out_from_gpu(out_dim.production()); + auto out_img_shape = + default_convertor->InitImageDimInfoWith(out_dim); // w, h + auto* out_image = + MUTABLE_DATA_GPU(&ele_out, out_img_shape[0], out_img_shape[1], nullptr); + + // run kernel + elemul_img_kernel->Launch(); + CLRuntime::Global()->command_queue().finish(); + // download gpu result to cpu + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + std::vector out_image_data(out_img_shape.production() * 4 * + dtype_size); // 4 : RGBA + TargetWrapperCL::ImgcpySync(out_image_data.data(), + out_image, + out_img_shape[0], + out_img_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + default_convertor->ImageToNCHW( + out_image_data.data(), out_from_gpu.data(), out_img_shape, out_dim); + + // compute cpu reference + std::unique_ptr out_from_cpu(new float[out_dim.production()]); + BroadcastCPURef(x_v.data(), + y_v.data(), + out_from_cpu.get(), + x_shape_full, + y_shape_full, + out_shape, + fuse_act, + op); + + VLOG(4) << "output_data vs output_ref_data"; + auto relative_diff_thres = + fp16_flag ? FP16_RELATIVE_DIFF : FP32_RELATIVE_DIFF; + auto abs_diff_thres = fp16_flag ? 
FP16_ABS_DIFF : FP32_ABS_DIFF; + uint32_t diff_cnt = 0; + for (int i = 0; i < out_dim.production(); i++) { + auto relative_diff = + COMPUTE_RELATIVE_DIFF(out_from_gpu[i], out_from_cpu[i]); + auto abs_diff = COMPUTE_ABS_DIFF(out_from_gpu[i], out_from_cpu[i]); + EXPECT_FALSE(relative_diff > relative_diff_thres && + abs_diff > abs_diff_thres); + if (relative_diff > relative_diff_thres && abs_diff > abs_diff_thres) { + LOG(WARNING) << lite_api::CLPrecisionTypeToStr(p) << " err idx: " << i + << " abs_diff: " << abs_diff + << "\t relative_diff: " << relative_diff + << "\t out_ins: " << out_from_gpu[i] + << "\t out_ref: " << out_from_cpu[i]; + diff_cnt++; + break; + } + } + if (diff_cnt != 0) { + LOG(FATAL) << "Err num " << diff_cnt << "/" << out_dim.production(); + } + + LOG(INFO) << "\n\t[ PASSED ] " + << " Test Precision=" << lite_api::CLPrecisionTypeToStr(p); +} + +void test_elementwise_all_dim_data_gpu() { + // test elementwise common size, only add compute, data in gpu + int n = 40; + int c = 40; + int h = 40; + int w = 40; + n = randint(1, 40); + c = randint(1, 40); + h = randint(1, 40); + w = randint(1, 40); + std::vector xy_swap_flags{false, true}; + for (auto xy_swap_flag : xy_swap_flags) { + RunElementwiseCommonSize({n, c, h, w}, + {n, c, h, w}, + {n, c, h, w}, + {n, c, h, w}, + {n, c, h, w}, + 0, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, c, 1, 1}, + {n, c, h, w}, + {1, c, 1, 1}, + {n, c, h, w}, + -1, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, c, 1, 1}, + {n, c, h, w}, + {c}, + {n, c, h, w}, + 1, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, 1, h, 1}, + {n, c, h, w}, + {h}, + {n, c, h, w}, + 2, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, 1, 1, w}, + {n, c, h, w}, + {w}, + {n, c, h, w}, + -1, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, c, h, w}, + {n, c, h, w}, + {c, h, w}, + {n, c, h, w}, + -1, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, 
w}, + {n, c, h, 1}, + {n, c, h, w}, + {n, c, h}, + {n, c, h, w}, + 0, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {n, c, 1, 1}, + {n, c, h, w}, + {n, c}, + {n, c, h, w}, + 0, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, c, h, 1}, + {n, c, h, w}, + {c, h}, + {n, c, h, w}, + 1, + xy_swap_flag); + + RunElementwiseCommonSize({n, c, h, w}, + {1, 1, h, w}, + {n, c, h, w}, + {h, w}, + {n, c, h, w}, + -1, + xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {n, c, h}, {n, c, h}, {n, c, h}, {n, c, h}, 0, xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {1, 1, h}, {n, c, h}, {h}, {n, c, h}, -1, xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {1, c, 1}, {n, c, h}, {c}, {n, c, h}, 1, xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {n, 1, 1}, {n, c, h}, {n}, {n, c, h}, 0, xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {n, c, 1}, {n, c, h}, {n, c}, {n, c, h}, 0, xy_swap_flag); + + RunElementwiseCommonSize( + {n, c, h}, {1, c, h}, {n, c, h}, {c, h}, {n, c, h}, -1, xy_swap_flag); + + RunElementwiseCommonSize( + {h, w}, {h, 1}, {h, w}, {h}, {h, w}, 0, xy_swap_flag); + + RunElementwiseCommonSize( + {h, w}, {1, w}, {h, w}, {w}, {h, w}, -1, xy_swap_flag); + } +} + +void test_elementwise_broadcast_all_op() { + const int TEST_RETEAT_NUM = 1; + std::vector relu_flag_v{false, true}; + for (int repeat_count = 0; repeat_count < TEST_RETEAT_NUM; ++repeat_count) { + for (int dim_size = 4; dim_size <= 4; dim_size++) { + for (auto fuse_act : relu_flag_v) { + for (const auto precision_type : + {paddle::lite_api::CLPrecisionType::CL_PRECISION_FP32}) { + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_add", + "", + [](float l, float r) { return l + r; }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_sub", + "", + [](float l, float r) { return l - r; }); + RunElementwiseBroadcast( + 
TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_mul", + "", + [](float l, float r) { return l * r; }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_div", + "", + [](float l, float r) { return l / r; }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_max", + "", + [](float l, float r) { return fmax(l, r); }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_min", + "", + [](float l, float r) { return fmin(l, r); }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_pow", + "", + [](float l, float r) { return pow(l, r); }); + RunElementwiseBroadcast( + TARGET(kOpenCL), + dim_size, + fuse_act, + precision_type, + "def", + "elementwise_mod", + "", + [](float l, float r) { return fmod(l, r); }); + } + } + } + } +} + +TEST(elementwise_broadcast, compute_basic) { + // test elementwise broadcast + test_elementwise_broadcast_all_op(); + + // test elementwise all dims, only add compute, data in gpu + test_elementwise_all_dim_data_gpu(); +} + +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(elementwise_div, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_add, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_sub, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_max, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_min, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_pow, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL(elementwise_mod, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL( + fusion_elementwise_add_activation, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL( + fusion_elementwise_sub_activation, 
kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL( + fusion_elementwise_mul_activation, kOpenCL, kFP16, kImageDefault, def); +USE_LITE_KERNEL( + fusion_elementwise_div_activation, kOpenCL, kFP16, kImageDefault, def); diff --git a/lite/kernels/opencl/elementwise_mul_compute.cc b/lite/kernels/opencl/elementwise_mul_compute.cc index d0e8bc92d56..12343a509f0 100644 --- a/lite/kernels/opencl/elementwise_mul_compute.cc +++ b/lite/kernels/opencl/elementwise_mul_compute.cc @@ -171,23 +171,23 @@ void ElementwiseMulFloatImageCompute::Run() { } // namespace lite } // namespace paddle -namespace ocl = paddle::lite::kernels::opencl; -REGISTER_LITE_KERNEL(elementwise_mul, - kOpenCL, - kFloat, - kImageDefault, - ocl::ElementwiseMulFloatImageCompute, - def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .Finalize(); +// namespace ocl = paddle::lite::kernels::opencl; +// REGISTER_LITE_KERNEL(elementwise_mul, +// kOpenCL, +// kFloat, +// kImageDefault, +// ocl::ElementwiseMulFloatImageCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFloat), +// DATALAYOUT(kImageDefault))}) +// .BindInput("Y", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFloat), +// DATALAYOUT(kImageDefault))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFloat), +// DATALAYOUT(kImageDefault))}) +// .Finalize(); diff --git a/lite/kernels/opencl/elementwise_mul_image_compute.cc b/lite/kernels/opencl/elementwise_mul_image_compute.cc index 70a27cd3e17..fbb6c2e8331 100644 --- a/lite/kernels/opencl/elementwise_mul_image_compute.cc +++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc @@ -252,23 +252,23 @@ class 
ElementwiseMulImageCompute } // namespace lite } // namespace paddle -namespace ocl = paddle::lite::kernels::opencl; -REGISTER_LITE_KERNEL(elementwise_mul, - kOpenCL, - kFP16, - kImageDefault, - ocl::ElementwiseMulImageCompute, - def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .Finalize(); +// namespace ocl = paddle::lite::kernels::opencl; +// REGISTER_LITE_KERNEL(elementwise_mul, +// kOpenCL, +// kFP16, +// kImageDefault, +// ocl::ElementwiseMulImageCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindInput("Y", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .Finalize(); diff --git a/lite/kernels/opencl/elementwise_sub_image_compute.cc b/lite/kernels/opencl/elementwise_sub_image_compute.cc index 45e3ed8134b..195a4ce85cb 100644 --- a/lite/kernels/opencl/elementwise_sub_image_compute.cc +++ b/lite/kernels/opencl/elementwise_sub_image_compute.cc @@ -161,22 +161,22 @@ namespace ocl = paddle::lite::kernels::opencl; // may from anther branch like "X" (kOpenCL, nothing to do). // Consider 2 situations have different actions when pass running(pick kernel), // set target of "Y" as kOpenCL temporarily. 
-REGISTER_LITE_KERNEL(elementwise_sub, - kOpenCL, - kFP16, - kImageDefault, - ocl::ElementwiseSubImageCompute, - def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .Finalize(); +// REGISTER_LITE_KERNEL(elementwise_sub, +// kOpenCL, +// kFP16, +// kImageDefault, +// ocl::ElementwiseSubImageCompute, +// def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindInput("Y", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFP16), +// DATALAYOUT(kImageDefault))}) +// .Finalize();