diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index 1968ace61c9..64e9ba6d396 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -178,6 +178,62 @@ class XavierFiller : public Filler<Dtype> {
   }
 };
 
+
+
+/*!
+@brief Fills a Blob with coefficients of bilinear interpolation for upsampling.
+This is intended to be used in DeconvolutionLayer acting as UpsamplingLayer.
+You can upsample a feature map with shape of (B, C, H, W) by any integer factor
+using the following proto.
+\code
+layer {
+  name: "upsample", type: "Deconvolution"
+  bottom: "{{bottom_name}}" top: "{{top_name}}"
+  convolution_param {
+    kernel_size: {{2 * factor - factor % 2}} stride: {{factor}}
+    num_output: {{C}} group: {{C}}
+    pad: {{ceil((factor - 1) / 2.)}}
+    weight_filler: { type: "bilinear_upsampling" } bias_term: false
+  }
+  param { lr_mult: 0 decay_mult: 0 }
+}
+\endcode
+Please use this by replacing `{{}}` with your values. By specifying
+`num_output: {{C}} group: {{C}}`, it behaves as
+channel-wise convolution. The filter shape of this deconvolution layer will be
+(C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K)
+interpolation kernel for every channel of the filter identically. The resulting
+shape of the top feature map will be (B, C, factor * H, factor * W).
+Note that the learning rate and the
+weight decay are set to 0 in order to keep coefficient values of bilinear
+interpolation unchanged during training. If you apply this to an image, this
+operation is equivalent to the following call in Python with Scikit.Image.
+\code{.py}
+out = skimage.transform.rescale(img, factor, mode='constant', cval=0)
+\endcode
+ */
+template <typename Dtype>
+class BilinearFiller : public Filler<Dtype> {
+ public:
+  explicit BilinearFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim.";
+    CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";
+    Dtype* data = blob->mutable_cpu_data();
+    int f = ceil(blob->width() / 2.);
+    float c = (2 * f - 1 - f % 2) / (2. * f);
+    for (int i = 0; i < blob->count(); ++i) {
+      float x = i % blob->width();
+      float y = (i / blob->width()) % blob->height();
+      data[i] = ((1.0 - fabs(x / f - c)) * (1.0 - fabs(y / f - c)));
+    }
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+
 /**
  * @brief Fills a Blob with values @f$ x \sim N(0, \sigma^2) @f$ where
  *        @f$ \sigma^2 @f$ is set inversely proportional to number of incoming
@@ -240,6 +296,8 @@ Filler<Dtype>* GetFiller(const FillerParameter& param) {
     return new UniformFiller<Dtype>(param);
   } else if (type == "xavier") {
     return new XavierFiller<Dtype>(param);
+  } else if (type == "bilinear") {
+    return new BilinearFiller<Dtype>(param);
   } else if (type == "msra") {
     return new MSRAFiller<Dtype>(param);
   } else {
diff --git a/src/caffe/greentea/cl_kernels.cpp b/src/caffe/greentea/cl_kernels.cpp
index 5af538a716b..1d1f46d03a9 100644
--- a/src/caffe/greentea/cl_kernels.cpp
+++ b/src/caffe/greentea/cl_kernels.cpp
@@ -21,7 +21,7 @@ std::string lrn_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#en
 std::string math_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(mul,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa,\n                                  __global Dtype* b,\n                                  const int offb, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[index + offy] = a[index + offa] * b[index + offb];\n  }\n}\n\n__kernel void TEMPLATE(div,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa,\n                                  __global Dtype* b,\n                                  const int offb, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[index + offy] = a[index + offa] / b[index + offb];\n  }\n}\n\n__kernel void TEMPLATE(add_scalar,Dtype)(const int N, const Dtype alpha,\n__global Dtype* Y,\n                                         const int offY) {\n  for (int index = get_global_id(0); index < N; index += get_global_size(0)) {\n    Y[offY + index] += alpha;\n  }\n}\n\n__kernel void TEMPLATE(add,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa, __global const Dtype* b,\n                                  const int offb, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[offy + index] = a[offa + index] + b[offb + index];\n  }\n}\n\n__kernel void TEMPLATE(sub,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa, __global const Dtype* b,\n                                  const int offb, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[offy + index] = a[offa + index] - b[offb + index];\n  }\n}\n\n__kernel void TEMPLATE(abs,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[offy + index] = fabs((Dtype)(a[offa + index]));\n  }\n}\n\n__kernel void TEMPLATE(exp,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[offy + index] = exp(a[offa + index]);\n  }\n}\n\n__kernel void TEMPLATE(log,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[offy + index] = log(a[offa + index]);\n  }\n}\n\n__kernel void TEMPLATE(powx,Dtype)(const int n, __global const Dtype* a,\n                                   const int offa, Dtype alpha,\n                                   __global Dtype* y,\n                                   const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n      if(a[offa + index] < 0 && alpha < 1 && alpha > -1) {\n        y[offy + index] = NAN;\n      } else {\n        y[offy + index] = pow(a[offa + index], alpha);\n      }\n  }\n}\n\n__kernel void TEMPLATE(sign,Dtype)(const int n, __global const Dtype* x,\n                                   const int offx, __global Dtype* y,\n                                   const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[index + offy] = (0.0 < x[index + offx])\n        - (x[index + offx] < 0.0);\n  }\n}\n\n__kernel void TEMPLATE(sgnbit,Dtype)(const int n, __global const Dtype* x,\n                                     const int offx, __global Dtype* y,\n                                     const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[index + offy] = signbit(x[index + offx]);\n  }\n}";  // NOLINT
 std::string mergecrop_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(merge_copy_forward, Dtype)(\n    const int nthreads, __global const Dtype* bottom_a,\n    __global const Dtype* bottom_b,\n    __global Dtype* top,\n    int num, int channels_a, int channels_b, int height_a, int width_a,\n    int height_b, int width_b) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n\n    int pad_h = (height_b - height_a) / 2;\n    int pad_w = (width_b - width_a) / 2;\n\n    int batch_id = index / ((channels_a + channels_b) * height_a * width_a);\n\n    int bottom_id = ((index\n        - batch_id * (channels_a + channels_b) * height_a * width_a)\n        / (channels_a * height_a * width_a)) % 2;\n\n    int h = ((index / width_a) % height_a);\n    int w = (index % width_a);\n\n    if (bottom_id == 0) {\n      int channel_id = (index / ((width_a * height_a)) % channels_a);\n      int aidx = ((((batch_id) * channels_a + channel_id) * height_a + h)\n          * width_a + w);\n      top[index] = bottom_a[aidx];\n    } else {\n      int channel_id = (index / ((width_a * height_a)) % channels_b);\n      int bidx =\n          (((batch_id) * channels_b + channel_id) * height_b\n              * width_b) + width_b * (h + pad_h) + pad_w + w;\n      top[index] = bottom_b[bidx];\n    }\n  }\n\n}\n\n__kernel void TEMPLATE(merge_copy_backward,Dtype)(const int nthreads,\n__global Dtype* bottom_a,\n                                                  __global const Dtype* top,\n                                                  int num, int channels_a,\n                                                  int channels_b, int height_a,\n                                                  int width_a, int height_b,\n                                                  int width_b) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int batch_id = index / ((channels_a + channels_b) * height_a * width_a);\n\n    int bottom_id = ((index\n        - batch_id * (channels_a + channels_b) * height_a * width_a)\n        / (channels_a * height_a * width_a)) % 2;\n\n    int h = ((index / width_a) % height_a);\n    int w = (index % width_a);\n\n    if (bottom_id == 0) {\n      int channel_id = (index / ((width_a * height_a)) % channels_a);\n      int aidx = ((((batch_id) * channels_a + channel_id) * height_a + h)\n          * width_a + w);\n      bottom_a[aidx] = top[index];\n    }\n  }\n}";  // NOLINT
 std::string pooling_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,\n    const int pad_w,\n    __global Dtype* top_data,\n    const int use_mask, __global int* mask, __global Dtype* top_mask) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    const int pw = index % pooled_width;\n    const int ph = (index / pooled_width) % pooled_height;\n    const int c = (index / pooled_width / pooled_height) % channels;\n    const int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    const int hend = min(hstart + kernel_h, height);\n    const int wend = min(wstart + kernel_w, width);\n    hstart = max(hstart, 0);\n    wstart = max(wstart, 0);\n    Dtype maxval = -FLT_MAX;\n    int maxidx = -1;\n    __global const Dtype* bottom_slice = bottom_data\n        + (n * channels + c) * height * width;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        if (bottom_slice[h * width + w] > maxval) {\n          maxidx = h * width + w;\n          maxval = bottom_slice[maxidx];\n        }\n      }\n    }\n    top_data[index] = maxval;\n    if (use_mask == 1) {\n      mask[index] = maxidx;\n    } else {\n      top_mask[index] = maxidx;\n    }\n  }\n}\n\n__kernel void TEMPLATE(ave_pool_forward,Dtype)(\n    const int nthreads, __global const Dtype* const bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,\n    const int pad_w, __global Dtype* top_data) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    {\n      const int pw = index % pooled_width;\n      const int ph = (index / pooled_width) % pooled_height;\n      const int c = (index / pooled_width / pooled_height) % channels;\n      const int n = index / pooled_width / pooled_height / channels;\n      int hstart = ph * stride_h - pad_h;\n      int wstart = pw * stride_w - pad_w;\n      int hend = min(hstart + kernel_h, height + pad_h);\n      int wend = min(wstart + kernel_w, width + pad_w);\n      const int pool_size = (hend - hstart) * (wend - wstart);\n      hstart = max(hstart, 0);\n      wstart = max(wstart, 0);\n      hend = min(hend, height);\n      wend = min(wend, width);\n      Dtype aveval = 0;\n      __global const Dtype* bottom_slice = bottom_data\n          + (n * channels + c) * height * width;\n      for (int h = hstart; h < hend; ++h) {\n        for (int w = wstart; w < wend; ++w) {\n          aveval += bottom_slice[h * width + w];\n        }\n      }\n      top_data[index] = aveval / pool_size;\n    }\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int stride_h, const int stride_w,\n    __global Dtype* rand_idx,\n    __global Dtype* top_data) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    const int pw = index % pooled_width;\n    const int ph = (index / pooled_width) % pooled_height;\n    const int c = (index / pooled_width / pooled_height) % channels;\n    const int n = index / pooled_width / pooled_height / channels;\n    const int hstart = ph * stride_h;\n    const int hend = min(hstart + kernel_h, height);\n    const int wstart = pw * stride_w;\n    const int wend = min(wstart + kernel_w, width);\n    Dtype cumsum = 0.;\n    __global const Dtype* bottom_slice = bottom_data\n        + (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        cumsum += bottom_slice[h * width + w];\n      }\n    }\n    const float thres = rand_idx[index] * cumsum;\n    // Second pass: get value, and set index.\n    cumsum = 0;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        cumsum += bottom_slice[h * width + w];\n        if (cumsum >= thres) {\n          rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n          top_data[index] = bottom_slice[h * width + w];\n          h = hend;\n          w = wend;\n        }\n      }\n    }\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(\n    const int nthreads, __global const Dtype* const bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int stride_h, const int stride_w,\n    __global Dtype* top_data) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    const int pw = index % pooled_width;\n    const int ph = (index / pooled_width) % pooled_height;\n    const int c = (index / pooled_width / pooled_height) % channels;\n    const int n = index / pooled_width / pooled_height / channels;\n    const int hstart = ph * stride_h;\n    const int hend = min(hstart + kernel_h, height);\n    const int wstart = pw * stride_w;\n    const int wend = min(wstart + kernel_w, width);\n    // We set cumsum to be 0 to avoid divide-by-zero problems\n    Dtype cumsum = FLT_MIN;\n    Dtype cumvalues = 0.;\n    __global const Dtype* bottom_slice = bottom_data\n        + (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        cumsum += bottom_slice[h * width + w];\n        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];\n      }\n    }\n    top_data[index] = cumvalues / cumsum;\n  }\n}\n\n__kernel void TEMPLATE(max_pool_backward,Dtype)(const int nthreads,\n                                                __global const Dtype* top_diff,\n                                                const int use_mask,\n                                                __global const int* mask,\n                                                __global const Dtype* top_mask,\n                                                const int num,\n                                                const int channels,\n                                                const int height,\n                                                const int width,\n                                                const int pooled_height,\n                                                const int pooled_width,\n                                                const int kernel_h,\n                                                const int kernel_w,\n                                                const int stride_h,\n                                                const int stride_w,\n                                                const int pad_h,\n                                                const int pad_w,\n                                                __global Dtype* bottom_diff) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    // find out the local index\n    // find out the local offset\n    const int w = index % width;\n    const int h = (index / width) % height;\n    const int c = (index / width / height) % channels;\n    const int n = index / width / height / channels;\n    const int phstart =\n        (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;\n    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);\n    const int pwstart =\n        (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;\n    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);\n    Dtype gradient = 0;\n    const int offset = (n * channels + c) * pooled_height * pooled_width;\n    __global const Dtype* top_diff_slice = top_diff + offset;\n    if (use_mask == 1) {\n      __global const int* mask_slice = mask + offset;\n      for (int ph = phstart; ph < phend; ++ph) {\n        for (int pw = pwstart; pw < pwend; ++pw) {\n          if (mask_slice[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_slice[ph * pooled_width + pw];\n          }\n        }\n      }\n    } else {\n      __global const Dtype* top_mask_slice = top_mask + offset;\n      for (int ph = phstart; ph < phend; ++ph) {\n        for (int pw = pwstart; pw < pwend; ++pw) {\n          if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_slice[ph * pooled_width + pw];\n          }\n        }\n      }\n    }\n    bottom_diff[index] = gradient;\n  }\n}\n\n__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int nthreads,\n                                                __global const Dtype* top_diff,\n                                                const int num,\n                                                const int channels,\n                                                const int height,\n                                                const int width,\n                                                const int pooled_height,\n                                                const int pooled_width,\n                                                const int kernel_h,\n                                                const int kernel_w,\n                                                const int stride_h,\n                                                const int stride_w,\n                                                const int pad_h,\n                                                const int pad_w,\n                                                __global Dtype* bottom_diff) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    // find out the local index\n    // find out the local offset\n    const int w = index % width + pad_w;\n    const int h = (index / width) % height + pad_h;\n    const int c = (index / width / height) % channels;\n    const int n = index / width / height / channels;\n    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n    const int phend = min(h / stride_h + 1, pooled_height);\n    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n    const int pwend = min(w / stride_w + 1, pooled_width);\n    Dtype gradient = 0;\n    __global const Dtype* const top_diff_slice = top_diff\n        + (n * channels + c) * pooled_height * pooled_width;\n    for (int ph = phstart; ph < phend; ++ph) {\n      for (int pw = pwstart; pw < pwend; ++pw) {\n        // figure out the pooling size\n        int hstart = ph * stride_h - pad_h;\n        int wstart = pw * stride_w - pad_w;\n        int hend = min(hstart + kernel_h, height + pad_h);\n        int wend = min(wstart + kernel_w, width + pad_w);\n        int pool_size = (hend - hstart) * (wend - wstart);\n        gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;\n      }\n    }\n    bottom_diff[index] = gradient;\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_backward,Dtype)(\n    const int nthreads, __global const Dtype* rand_idx,\n    __global const Dtype* const top_diff, const int num, const int channels,\n    const int height, const int width, const int pooled_height,\n    const int pooled_width, const int kernel_h, const int kernel_w,\n    const int stride_h, const int stride_w, __global Dtype* bottom_diff) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    // find out the local index\n    // find out the local offset\n    const int w = index % width;\n    const int h = (index / width) % height;\n    const int c = (index / width / height) % channels;\n    const int n = index / width / height / channels;\n    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n    const int phend = min(h / stride_h + 1, pooled_height);\n    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n    const int pwend = min(w / stride_w + 1, pooled_width);\n    Dtype gradient = 0;\n    __global const Dtype* rand_idx_slice = rand_idx\n        + (n * channels + c) * pooled_height * pooled_width;\n    __global const Dtype* top_diff_slice = top_diff\n        + (n * channels + c) * pooled_height * pooled_width;\n    for (int ph = phstart; ph < phend; ++ph) {\n      for (int pw = pwstart; pw < pwend; ++pw) {\n        gradient += top_diff_slice[ph * pooled_width + pw]\n            * (index == (int) (rand_idx_slice[ph * pooled_width + pw]));\n      }\n    }\n    bottom_diff[index] = gradient;\n  }\n}";  // NOLINT
-std::string pooling_sk_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int nthreads,\n__global Dtype* bottom_data,\n                                                  const int num,\n                                                  const int channels,\n                                                  const int height,\n                                                  const int width,\n                                                  const int pooled_height,\n                                                  const int pooled_width,\n                                                  const int kernel_h,\n                                                  const int kernel_w,\n                                                  const int ext_kernel_h,\n                                                  const int ext_kernel_w,\n                                                  const int stride_h,\n                                                  const int stride_w,\n                                                  const int kstride_h,\n                                                  const int kstride_w,\n                                                  const int pad_h,\n                                                  const int pad_w,\n                                                  __global Dtype* top_data,\n                                                  const int use_mask,\n                                                  __global int* mask,\n                                                  __global Dtype* top_mask) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wend = min(wstart + ext_kernel_w, width);\n    hstart = max(hstart, (int) 0);\n    wstart = max(wstart, (int) 0);\n    Dtype maxval = -FLT_MAX;\n    int maxidx = -1;\n    __global Dtype* bottom_data_ptr = bottom_data\n        + (n * channels + c) * height * width;\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        if (bottom_data_ptr[h * width + w] > maxval) {\n          maxidx = h * width + w;\n          maxval = bottom_data_ptr[maxidx];\n        }\n      }\n    }\n    top_data[index] = maxval;\n    if (use_mask == 1) {\n      mask[index] = maxidx;\n    } else {\n      top_mask[index] = maxidx;\n    }\n  }\n}\n\n__kernel void TEMPLATE(max_pool_backward_sk,Dtype)(\n    const int nthreads, __global const Dtype* top_diff, const int use_mask,\n    __global const int* mask, __global const Dtype* top_mask, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, const int pad_h, const int pad_w,\n    __global Dtype* bottom_diff) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n\n    __global const int* mask_ptr = mask;\n    __global const Dtype* top_diff_ptr = top_diff;\n\n// find out the local index\n// find out the local offset\n    int w = index % width;\n    int h = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    int pooled_height_1 = pooled_height - 1;\n    int pooled_width_1 = pooled_width - 1;\n    int phstart = (h < ext_kernel_h) ? h % kstride_h : (h - ext_kernel_h) + 1;\n    int phend =\n        (h >= pooled_height) ?\n            pooled_height_1 - (pooled_height_1 - phstart) % kstride_h : h;\n    int pwstart = (w < ext_kernel_w) ? w % kstride_w : (w - ext_kernel_w) + 1;\n    int pwend =\n        (w >= pooled_width) ?\n            pooled_width_1 - (pooled_width_1 - pwstart) % kstride_w : w;\n\n    Dtype gradient = 0;\n    int offset = (n * channels + c) * pooled_height * pooled_width;\n    top_diff_ptr += offset;\n    if (use_mask == 1) {\n      mask_ptr += offset;\n      for (int ph = phstart; ph <= phend; ph += kstride_h) {\n        for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n          if (mask_ptr[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_ptr[ph * pooled_width + pw];\n          }\n        }\n      }\n    } else {\n      for (int ph = phstart; ph <= phend; ph += kstride_h) {\n        for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n          if (top_mask[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_ptr[ph * pooled_width + pw];\n          }\n        }\n      }\n    }\n    bottom_diff[index] = gradient;\n  }\n}\n\n__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, const int pad_h, const int pad_w,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    int hend = min(hstart + ext_kernel_h, height + pad_h);\n    int wend = min(wstart + ext_kernel_w, width + pad_w);\n    hstart = max(hstart, 0);\n    wstart = max(wstart, 0);\n    hend = min(hend, height);\n    wend = min(wend, width);\n    Dtype aveval = 0;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    int pool_size = 0;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        aveval += bottom_data_ptr[h * width + w];\n        ++pool_size;\n      }\n    }\n    top_data[index] = aveval / pool_size;\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, __global Dtype* rand_idx,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wstart = pw * stride_w;\n    int wend = min(wstart + ext_kernel_w, width);\n    Dtype cumsum = 0.;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n      }\n    }\n    float thres = rand_idx[index] * cumsum;\n    // Second pass: get value, and set index.\n    cumsum = 0;\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n        if (cumsum >= thres) {\n          rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n          top_data[index] = bottom_data_ptr[h * width + w];\n          return;\n        }\n      }\n    }\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wstart = pw * stride_w;\n    int wend = min(wstart + ext_kernel_w, width);\n    // We set cumsum to be 0 to avoid divide-by-zero problems\n    Dtype cumsum = FLT_MIN;\n    Dtype cumvalues = 0.;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n        cumvalues += bottom_data_ptr[h * width + w]\n            * bottom_data_ptr[h * width + w];\n      }\n    }\n    top_data[index] = cumvalues / cumsum;\n  }\n\n}";  // NOLINT
+std::string pooling_sk_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int nthreads,\n__global Dtype* bottom_data,\n                                                  const int num,\n                                                  const int channels,\n                                                  const int height,\n                                                  const int width,\n                                                  const int pooled_height,\n                                                  const int pooled_width,\n                                                  const int kernel_h,\n                                                  const int kernel_w,\n                                                  const int ext_kernel_h,\n                                                  const int ext_kernel_w,\n                                                  const int stride_h,\n                                                  const int stride_w,\n                                                  const int kstride_h,\n                                                  const int kstride_w,\n                                                  const int pad_h,\n                                                  const int pad_w,\n                                                  __global Dtype* top_data,\n                                                  const int use_mask,\n                                                  __global int* mask,\n                                                  __global Dtype* top_mask) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wend = min(wstart + ext_kernel_w, width);\n    hstart = max(hstart, (int) 0);\n    wstart = max(wstart, (int) 0);\n    Dtype maxval = -FLT_MAX;\n    int maxidx = -1;\n    __global Dtype* bottom_data_ptr = bottom_data\n        + (n * channels + c) * height * width;\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        if (bottom_data_ptr[h * width + w] > maxval) {\n          maxidx = h * width + w;\n          maxval = bottom_data_ptr[maxidx];\n        }\n      }\n    }\n    top_data[index] = maxval;\n    if (use_mask == 1) {\n      mask[index] = maxidx;\n    } else {\n      top_mask[index] = maxidx;\n    }\n  }\n}\n\n__kernel void TEMPLATE(max_pool_backward_sk,Dtype)(\n    const int nthreads, __global const Dtype* top_diff, const int use_mask,\n    __global const int* mask, __global const Dtype* top_mask, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, const int pad_h, const int pad_w,\n    __global Dtype* bottom_diff) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n\n    __global const int* mask_ptr = mask;\n    __global const Dtype* top_diff_ptr = top_diff;\n\n// find out the local index\n// find out the local offset\n    int w = index % width;\n    int h = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    int pooled_height_1 = pooled_height - 1;\n    int pooled_width_1 = pooled_width - 1;\n    int phstart = (h < ext_kernel_h) ? h % kstride_h : (h - ext_kernel_h) + 1;\n    int phend =\n        (h >= pooled_height) ?\n            pooled_height_1 - (pooled_height_1 - phstart) % kstride_h : h;\n    int pwstart = (w < ext_kernel_w) ? w % kstride_w : (w - ext_kernel_w) + 1;\n    int pwend =\n        (w >= pooled_width) ?\n            pooled_width_1 - (pooled_width_1 - pwstart) % kstride_w : w;\n\n    Dtype gradient = 0;\n    int offset = (n * channels + c) * pooled_height * pooled_width;\n    top_diff_ptr += offset;\n    if (use_mask == 1) {\n      mask_ptr += offset;\n      for (int ph = phstart; ph <= phend; ph += kstride_h) {\n        for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n          if (mask_ptr[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_ptr[ph * pooled_width + pw];\n          }\n        }\n      }\n    } else {\n      for (int ph = phstart; ph <= phend; ph += kstride_h) {\n        for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n          if (top_mask[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_ptr[ph * pooled_width + pw];\n          }\n        }\n      }\n    }\n    bottom_diff[index] = gradient;\n  }\n}\n\n__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, const int pad_h, const int pad_w,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    int hend = min(hstart + ext_kernel_h, height + pad_h);\n    int wend = min(wstart + ext_kernel_w, width + pad_w);\n    hstart = max(hstart, 0);\n    wstart = max(wstart, 0);\n    hend = min(hend, height);\n    wend = min(wend, width);\n    Dtype aveval = 0;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    int pool_size = 0;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        aveval += bottom_data_ptr[h * width + w];\n        ++pool_size;\n      }\n    }\n    top_data[index] = aveval / pool_size;\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, __global Dtype* rand_idx,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wstart = pw * stride_w;\n    int wend = min(wstart + ext_kernel_w, width);\n    Dtype cumsum = 0.;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n      }\n    }\n    float thres = rand_idx[index] * cumsum;\n    // Second pass: get value, and set index.\n    cumsum = 0;\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n        if (cumsum >= thres) {\n          rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n          top_data[index] = bottom_data_ptr[h * width + w];\n          h = hend;\n          w = wend;\n        }\n      }\n    }\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wstart = pw * stride_w;\n    int wend = min(wstart + ext_kernel_w, width);\n    // We set cumsum to be 0 to avoid divide-by-zero problems\n    Dtype cumsum = FLT_MIN;\n    Dtype cumvalues = 0.;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n        cumvalues += bottom_data_ptr[h * width + w]\n            * bottom_data_ptr[h * width + w];\n      }\n    }\n    top_data[index] = cumvalues / cumsum;\n  }\n\n}";  // NOLINT
 std::string slice_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(slice,Dtype)(const int nthreads,\n                                    __global const Dtype* in_data,\n                                    const int forward, const int num_slices,\n                                    const int slice_size,\n                                    const int bottom_slice_axis,\n                                    const int top_slice_axis,\n                                    const int offset_slice_axis,\n                                    __global Dtype* out_data) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    const int total_slice_size = slice_size * top_slice_axis;\n    const int slice_num = index / total_slice_size;\n    const int slice_index = index % total_slice_size;\n    const int bottom_index = slice_index\n        + (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;\n    if (forward == 1) {\n      out_data[index] = in_data[bottom_index];\n    } else {\n      out_data[bottom_index] = in_data[index];\n    }\n  }\n}";  // NOLINT
 std::string softmax_loss_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(softmax_loss_forward,Dtype)(\n    int n, __global const Dtype* prob_data, __global const Dtype* label,\n    __global Dtype* loss,\n    const int num, const int dim, const int spatial_dim,\n    const int has_ignore_label_, const int ignore_label_,\n    __global Dtype* counts) {\n\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    const int n = index / spatial_dim;\n    const int s = index % spatial_dim;\n    const int label_value = (int) (label[n * spatial_dim + s]);\n    if (has_ignore_label_ == 1 && label_value == ignore_label_) {\n      loss[index] = 0;\n      counts[index] = 0;\n    } else {\n      loss[index] = -log(\n          max((Dtype) (prob_data[n * dim + label_value * spatial_dim + s]),\n              (Dtype) FLT_MIN));\n      counts[index] = 1;\n    }\n  }\n}\n\n__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int nthreads,\n                                                    __global const Dtype* top,\n                                                    __global const Dtype* label,\n                                                    __global Dtype* bottom_diff,\n                                                    const int num,\n                                                    const int dim,\n                                                    const int spatial_dim,\n                                                    const int has_ignore_label_,\n                                                    const int ignore_label_,\n                                                    __global Dtype* counts) {\n\n  const int channels = dim / spatial_dim;\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    {\n      const int n = index / spatial_dim;\n      const int s = index % spatial_dim;\n      const int label_value = (int) (label[n * spatial_dim + s]);\n\n      if (has_ignore_label_ == 1 && label_value == ignore_label_) {\n        for (int c = 0; c < channels; ++c) {\n          bottom_diff[n * dim + c * spatial_dim + s] = 0;\n        }\n        counts[index] = 0;\n      } else {\n        bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;\n        counts[index] = 1;\n      }\n    }\n  }\n}";  // NOLINT
 std::string activation_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(relu_forward,Dtype)(const int n,\n                                           __global const Dtype* in,\n                                           __global Dtype* out,\n                                           Dtype negative_slope) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;\n  }\n}\n\n__kernel void TEMPLATE(relu_backward,Dtype)(const int n,\n                                            __global const Dtype* in_diff,\n                                            __global const Dtype* in_data,\n                                            __global Dtype* out_diff,\n                                            Dtype negative_slope) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    out_diff[index] = in_diff[index]\n        * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope);\n  }\n}\n\n__kernel void TEMPLATE(tanh_forward,Dtype)(const int n,\n                                           __global const Dtype* in,\n                                           __global Dtype* out) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    out[index] = tanh(in[index]);\n  }\n}\n\n__kernel void TEMPLATE(tanh_backward,Dtype)(const int n,\n                                            __global const Dtype* in_diff,\n                                            __global const Dtype* out_data,\n                                            __global Dtype* out_diff) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    Dtype tanhx = out_data[index];\n    out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);\n  }\n}\n\n__kernel void TEMPLATE(sigmoid_forward,Dtype)(const int n,\n                                              __global const Dtype* in,\n                                              __global Dtype* out) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    out[index] = 1. / (1. + exp(-in[index]));\n  }\n}\n\n__kernel void TEMPLATE(sigmoid_backward,Dtype)(const int n,\n                                               __global const Dtype* in_diff,\n                                               __global const Dtype* out_data,\n                                               __global Dtype* out_diff) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    const Dtype sigmoid_x = out_data[index];\n    out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);\n  }\n}\n\n__kernel void TEMPLATE(threshold,Dtype)(const int n, const Dtype threshold,\n                                        __global const Dtype* in,\n                                        __global Dtype* out) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    out[index] = in[index] > threshold ? 1 : 0;\n  }\n}\n\n__kernel void TEMPLATE(prelu_forward,Dtype)(const int n, const int channels,\n                                            const int dim,\n                                            __global const Dtype* in,\n                                            __global Dtype* out,\n                                            __global const Dtype* slope_data,\n                                            const int div_factor) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    int c = (index / dim) % channels / div_factor;\n    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];\n  }\n}\n\n__kernel void TEMPLATE(prelu_backward,Dtype)(const int n, const int channels,\n                                             const int dim,\n                                             __global const Dtype* in_diff,\n                                             __global const Dtype* in_data,\n                                             __global Dtype* out_diff,\n                                             __global const Dtype* slope_data,\n                                             const int div_factor) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    int c = (index / dim) % channels / div_factor;\n    out_diff[index] = in_diff[index]\n        * ((in_data[index] > 0) + (in_data[index] <= 0) * slope_data[c]);\n  }\n}\n\n__kernel void TEMPLATE(prelu_param_backward,Dtype)(const int n,\n                                                   __global const Dtype* in_diff, const int in_diff_off,\n                                                   __global const Dtype* in_data, const int in_data_off,\n                                                   __global Dtype* out_diff) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    out_diff[index] = in_diff[index + in_diff_off] * in_data[index + in_data_off] * (in_data[index + in_data_off] <= 0);\n  }\n}";  // NOLINT
@@ -39,7 +39,7 @@ std::string lrn_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#e
 std::string math_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(mul,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa,\n                                  __global Dtype* b,\n                                  const int offb, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[index + offy] = a[index + offa] * b[index + offb];\n  }\n}\n\n__kernel void TEMPLATE(div,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa,\n                                  __global Dtype* b,\n                                  const int offb, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[index + offy] = a[index + offa] / b[index + offb];\n  }\n}\n\n__kernel void TEMPLATE(add_scalar,Dtype)(const int N, const Dtype alpha,\n__global Dtype* Y,\n                                         const int offY) {\n  for (int index = get_global_id(0); index < N; index += get_global_size(0)) {\n    Y[offY + index] += alpha;\n  }\n}\n\n__kernel void TEMPLATE(add,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa, __global const Dtype* b,\n                                  const int offb, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[offy + index] = a[offa + index] + b[offb + index];\n  }\n}\n\n__kernel void TEMPLATE(sub,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa, __global const Dtype* b,\n                                  const int offb, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[offy + index] = a[offa + index] - b[offb + index];\n  }\n}\n\n__kernel void TEMPLATE(abs,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[offy + index] = fabs((Dtype)(a[offa + index]));\n  }\n}\n\n__kernel void TEMPLATE(exp,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[offy + index] = exp(a[offa + index]);\n  }\n}\n\n__kernel void TEMPLATE(log,Dtype)(const int n, __global const Dtype* a,\n                                  const int offa, __global Dtype* y,\n                                  const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[offy + index] = log(a[offa + index]);\n  }\n}\n\n__kernel void TEMPLATE(powx,Dtype)(const int n, __global const Dtype* a,\n                                   const int offa, Dtype alpha,\n                                   __global Dtype* y,\n                                   const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n      if(a[offa + index] < 0 && alpha < 1 && alpha > -1) {\n        y[offy + index] = NAN;\n      } else {\n        y[offy + index] = pow(a[offa + index], alpha);\n      }\n  }\n}\n\n__kernel void TEMPLATE(sign,Dtype)(const int n, __global const Dtype* x,\n                                   const int offx, __global Dtype* y,\n                                   const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[index + offy] = (0.0 < x[index + offx])\n        - (x[index + offx] < 0.0);\n  }\n}\n\n__kernel void TEMPLATE(sgnbit,Dtype)(const int n, __global const Dtype* x,\n                                     const int offx, __global Dtype* y,\n                                     const int offy) {\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    y[index + offy] = signbit(x[index + offx]);\n  }\n}";  // NOLINT
 std::string mergecrop_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(merge_copy_forward, Dtype)(\n    const int nthreads, __global const Dtype* bottom_a,\n    __global const Dtype* bottom_b,\n    __global Dtype* top,\n    int num, int channels_a, int channels_b, int height_a, int width_a,\n    int height_b, int width_b) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n\n    int pad_h = (height_b - height_a) / 2;\n    int pad_w = (width_b - width_a) / 2;\n\n    int batch_id = index / ((channels_a + channels_b) * height_a * width_a);\n\n    int bottom_id = ((index\n        - batch_id * (channels_a + channels_b) * height_a * width_a)\n        / (channels_a * height_a * width_a)) % 2;\n\n    int h = ((index / width_a) % height_a);\n    int w = (index % width_a);\n\n    if (bottom_id == 0) {\n      int channel_id = (index / ((width_a * height_a)) % channels_a);\n      int aidx = ((((batch_id) * channels_a + channel_id) * height_a + h)\n          * width_a + w);\n      top[index] = bottom_a[aidx];\n    } else {\n      int channel_id = (index / ((width_a * height_a)) % channels_b);\n      int bidx =\n          (((batch_id) * channels_b + channel_id) * height_b\n              * width_b) + width_b * (h + pad_h) + pad_w + w;\n      top[index] = bottom_b[bidx];\n    }\n  }\n\n}\n\n__kernel void TEMPLATE(merge_copy_backward,Dtype)(const int nthreads,\n__global Dtype* bottom_a,\n                                                  __global const Dtype* top,\n                                                  int num, int channels_a,\n                                                  int channels_b, int height_a,\n                                                  int width_a, int height_b,\n                                                  int width_b) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int batch_id = index / ((channels_a + channels_b) * height_a * width_a);\n\n    int bottom_id = ((index\n        - batch_id * (channels_a + channels_b) * height_a * width_a)\n        / (channels_a * height_a * width_a)) % 2;\n\n    int h = ((index / width_a) % height_a);\n    int w = (index % width_a);\n\n    if (bottom_id == 0) {\n      int channel_id = (index / ((width_a * height_a)) % channels_a);\n      int aidx = ((((batch_id) * channels_a + channel_id) * height_a + h)\n          * width_a + w);\n      bottom_a[aidx] = top[index];\n    }\n  }\n}";  // NOLINT
 std::string pooling_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,\n    const int pad_w,\n    __global Dtype* top_data,\n    const int use_mask, __global int* mask, __global Dtype* top_mask) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    const int pw = index % pooled_width;\n    const int ph = (index / pooled_width) % pooled_height;\n    const int c = (index / pooled_width / pooled_height) % channels;\n    const int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    const int hend = min(hstart + kernel_h, height);\n    const int wend = min(wstart + kernel_w, width);\n    hstart = max(hstart, 0);\n    wstart = max(wstart, 0);\n    Dtype maxval = -FLT_MAX;\n    int maxidx = -1;\n    __global const Dtype* bottom_slice = bottom_data\n        + (n * channels + c) * height * width;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        if (bottom_slice[h * width + w] > maxval) {\n          maxidx = h * width + w;\n          maxval = bottom_slice[maxidx];\n        }\n      }\n    }\n    top_data[index] = maxval;\n    if (use_mask == 1) {\n      mask[index] = maxidx;\n    } else {\n      top_mask[index] = maxidx;\n    }\n  }\n}\n\n__kernel void TEMPLATE(ave_pool_forward,Dtype)(\n    const int nthreads, __global const Dtype* const bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,\n    const int pad_w, __global Dtype* top_data) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    {\n      const int pw = index % pooled_width;\n      const int ph = (index / pooled_width) % pooled_height;\n      const int c = (index / pooled_width / pooled_height) % channels;\n      const int n = index / pooled_width / pooled_height / channels;\n      int hstart = ph * stride_h - pad_h;\n      int wstart = pw * stride_w - pad_w;\n      int hend = min(hstart + kernel_h, height + pad_h);\n      int wend = min(wstart + kernel_w, width + pad_w);\n      const int pool_size = (hend - hstart) * (wend - wstart);\n      hstart = max(hstart, 0);\n      wstart = max(wstart, 0);\n      hend = min(hend, height);\n      wend = min(wend, width);\n      Dtype aveval = 0;\n      __global const Dtype* bottom_slice = bottom_data\n          + (n * channels + c) * height * width;\n      for (int h = hstart; h < hend; ++h) {\n        for (int w = wstart; w < wend; ++w) {\n          aveval += bottom_slice[h * width + w];\n        }\n      }\n      top_data[index] = aveval / pool_size;\n    }\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int stride_h, const int stride_w,\n    __global Dtype* rand_idx,\n    __global Dtype* top_data) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    const int pw = index % pooled_width;\n    const int ph = (index / pooled_width) % pooled_height;\n    const int c = (index / pooled_width / pooled_height) % channels;\n    const int n = index / pooled_width / pooled_height / channels;\n    const int hstart = ph * stride_h;\n    const int hend = min(hstart + kernel_h, height);\n    const int wstart = pw * stride_w;\n    const int wend = min(wstart + kernel_w, width);\n    Dtype cumsum = 0.;\n    __global const Dtype* bottom_slice = bottom_data\n        + (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        cumsum += bottom_slice[h * width + w];\n      }\n    }\n    const float thres = rand_idx[index] * cumsum;\n    // Second pass: get value, and set index.\n    cumsum = 0;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        cumsum += bottom_slice[h * width + w];\n        if (cumsum >= thres) {\n          rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n          top_data[index] = bottom_slice[h * width + w];\n          h = hend;\n          w = wend;\n        }\n      }\n    }\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(\n    const int nthreads, __global const Dtype* const bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int stride_h, const int stride_w,\n    __global Dtype* top_data) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    const int pw = index % pooled_width;\n    const int ph = (index / pooled_width) % pooled_height;\n    const int c = (index / pooled_width / pooled_height) % channels;\n    const int n = index / pooled_width / pooled_height / channels;\n    const int hstart = ph * stride_h;\n    const int hend = min(hstart + kernel_h, height);\n    const int wstart = pw * stride_w;\n    const int wend = min(wstart + kernel_w, width);\n    // We set cumsum to be 0 to avoid divide-by-zero problems\n    Dtype cumsum = FLT_MIN;\n    Dtype cumvalues = 0.;\n    __global const Dtype* bottom_slice = bottom_data\n        + (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        cumsum += bottom_slice[h * width + w];\n        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];\n      }\n    }\n    top_data[index] = cumvalues / cumsum;\n  }\n}\n\n__kernel void TEMPLATE(max_pool_backward,Dtype)(const int nthreads,\n                                                __global const Dtype* top_diff,\n                                                const int use_mask,\n                                                __global const int* mask,\n                                                __global const Dtype* top_mask,\n                                                const int num,\n                                                const int channels,\n                                                const int height,\n                                                const int width,\n                                                const int pooled_height,\n                                                const int pooled_width,\n                                                const int kernel_h,\n                                                const int kernel_w,\n                                                const int stride_h,\n                                                const int stride_w,\n                                                const int pad_h,\n                                                const int pad_w,\n                                                __global Dtype* bottom_diff) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    // find out the local index\n    // find out the local offset\n    const int w = index % width;\n    const int h = (index / width) % height;\n    const int c = (index / width / height) % channels;\n    const int n = index / width / height / channels;\n    const int phstart =\n        (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;\n    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);\n    const int pwstart =\n        (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;\n    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);\n    Dtype gradient = 0;\n    const int offset = (n * channels + c) * pooled_height * pooled_width;\n    __global const Dtype* top_diff_slice = top_diff + offset;\n    if (use_mask == 1) {\n      __global const int* mask_slice = mask + offset;\n      for (int ph = phstart; ph < phend; ++ph) {\n        for (int pw = pwstart; pw < pwend; ++pw) {\n          if (mask_slice[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_slice[ph * pooled_width + pw];\n          }\n        }\n      }\n    } else {\n      __global const Dtype* top_mask_slice = top_mask + offset;\n      for (int ph = phstart; ph < phend; ++ph) {\n        for (int pw = pwstart; pw < pwend; ++pw) {\n          if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_slice[ph * pooled_width + pw];\n          }\n        }\n      }\n    }\n    bottom_diff[index] = gradient;\n  }\n}\n\n__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int nthreads,\n                                                __global const Dtype* top_diff,\n                                                const int num,\n                                                const int channels,\n                                                const int height,\n                                                const int width,\n                                                const int pooled_height,\n                                                const int pooled_width,\n                                                const int kernel_h,\n                                                const int kernel_w,\n                                                const int stride_h,\n                                                const int stride_w,\n                                                const int pad_h,\n                                                const int pad_w,\n                                                __global Dtype* bottom_diff) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    // find out the local index\n    // find out the local offset\n    const int w = index % width + pad_w;\n    const int h = (index / width) % height + pad_h;\n    const int c = (index / width / height) % channels;\n    const int n = index / width / height / channels;\n    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n    const int phend = min(h / stride_h + 1, pooled_height);\n    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n    const int pwend = min(w / stride_w + 1, pooled_width);\n    Dtype gradient = 0;\n    __global const Dtype* const top_diff_slice = top_diff\n        + (n * channels + c) * pooled_height * pooled_width;\n    for (int ph = phstart; ph < phend; ++ph) {\n      for (int pw = pwstart; pw < pwend; ++pw) {\n        // figure out the pooling size\n        int hstart = ph * stride_h - pad_h;\n        int wstart = pw * stride_w - pad_w;\n        int hend = min(hstart + kernel_h, height + pad_h);\n        int wend = min(wstart + kernel_w, width + pad_w);\n        int pool_size = (hend - hstart) * (wend - wstart);\n        gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;\n      }\n    }\n    bottom_diff[index] = gradient;\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_backward,Dtype)(\n    const int nthreads, __global const Dtype* rand_idx,\n    __global const Dtype* const top_diff, const int num, const int channels,\n    const int height, const int width, const int pooled_height,\n    const int pooled_width, const int kernel_h, const int kernel_w,\n    const int stride_h, const int stride_w, __global Dtype* bottom_diff) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    // find out the local index\n    // find out the local offset\n    const int w = index % width;\n    const int h = (index / width) % height;\n    const int c = (index / width / height) % channels;\n    const int n = index / width / height / channels;\n    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n    const int phend = min(h / stride_h + 1, pooled_height);\n    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n    const int pwend = min(w / stride_w + 1, pooled_width);\n    Dtype gradient = 0;\n    __global const Dtype* rand_idx_slice = rand_idx\n        + (n * channels + c) * pooled_height * pooled_width;\n    __global const Dtype* top_diff_slice = top_diff\n        + (n * channels + c) * pooled_height * pooled_width;\n    for (int ph = phstart; ph < phend; ++ph) {\n      for (int pw = pwstart; pw < pwend; ++pw) {\n        gradient += top_diff_slice[ph * pooled_width + pw]\n            * (index == (int) (rand_idx_slice[ph * pooled_width + pw]));\n      }\n    }\n    bottom_diff[index] = gradient;\n  }\n}";  // NOLINT
-std::string pooling_sk_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int nthreads,\n__global Dtype* bottom_data,\n                                                  const int num,\n                                                  const int channels,\n                                                  const int height,\n                                                  const int width,\n                                                  const int pooled_height,\n                                                  const int pooled_width,\n                                                  const int kernel_h,\n                                                  const int kernel_w,\n                                                  const int ext_kernel_h,\n                                                  const int ext_kernel_w,\n                                                  const int stride_h,\n                                                  const int stride_w,\n                                                  const int kstride_h,\n                                                  const int kstride_w,\n                                                  const int pad_h,\n                                                  const int pad_w,\n                                                  __global Dtype* top_data,\n                                                  const int use_mask,\n                                                  __global int* mask,\n                                                  __global Dtype* top_mask) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wend = min(wstart + ext_kernel_w, width);\n    hstart = max(hstart, (int) 0);\n    wstart = max(wstart, (int) 0);\n    Dtype maxval = -FLT_MAX;\n    int maxidx = -1;\n    __global Dtype* bottom_data_ptr = bottom_data\n        + (n * channels + c) * height * width;\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        if (bottom_data_ptr[h * width + w] > maxval) {\n          maxidx = h * width + w;\n          maxval = bottom_data_ptr[maxidx];\n        }\n      }\n    }\n    top_data[index] = maxval;\n    if (use_mask == 1) {\n      mask[index] = maxidx;\n    } else {\n      top_mask[index] = maxidx;\n    }\n  }\n}\n\n__kernel void TEMPLATE(max_pool_backward_sk,Dtype)(\n    const int nthreads, __global const Dtype* top_diff, const int use_mask,\n    __global const int* mask, __global const Dtype* top_mask, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, const int pad_h, const int pad_w,\n    __global Dtype* bottom_diff) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n\n    __global const int* mask_ptr = mask;\n    __global const Dtype* top_diff_ptr = top_diff;\n\n// find out the local index\n// find out the local offset\n    int w = index % width;\n    int h = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    int pooled_height_1 = pooled_height - 1;\n    int pooled_width_1 = pooled_width - 1;\n    int phstart = (h < ext_kernel_h) ? h % kstride_h : (h - ext_kernel_h) + 1;\n    int phend =\n        (h >= pooled_height) ?\n            pooled_height_1 - (pooled_height_1 - phstart) % kstride_h : h;\n    int pwstart = (w < ext_kernel_w) ? w % kstride_w : (w - ext_kernel_w) + 1;\n    int pwend =\n        (w >= pooled_width) ?\n            pooled_width_1 - (pooled_width_1 - pwstart) % kstride_w : w;\n\n    Dtype gradient = 0;\n    int offset = (n * channels + c) * pooled_height * pooled_width;\n    top_diff_ptr += offset;\n    if (use_mask == 1) {\n      mask_ptr += offset;\n      for (int ph = phstart; ph <= phend; ph += kstride_h) {\n        for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n          if (mask_ptr[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_ptr[ph * pooled_width + pw];\n          }\n        }\n      }\n    } else {\n      for (int ph = phstart; ph <= phend; ph += kstride_h) {\n        for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n          if (top_mask[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_ptr[ph * pooled_width + pw];\n          }\n        }\n      }\n    }\n    bottom_diff[index] = gradient;\n  }\n}\n\n__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, const int pad_h, const int pad_w,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    int hend = min(hstart + ext_kernel_h, height + pad_h);\n    int wend = min(wstart + ext_kernel_w, width + pad_w);\n    hstart = max(hstart, 0);\n    wstart = max(wstart, 0);\n    hend = min(hend, height);\n    wend = min(wend, width);\n    Dtype aveval = 0;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    int pool_size = 0;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        aveval += bottom_data_ptr[h * width + w];\n        ++pool_size;\n      }\n    }\n    top_data[index] = aveval / pool_size;\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, __global Dtype* rand_idx,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wstart = pw * stride_w;\n    int wend = min(wstart + ext_kernel_w, width);\n    Dtype cumsum = 0.;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n      }\n    }\n    float thres = rand_idx[index] * cumsum;\n    // Second pass: get value, and set index.\n    cumsum = 0;\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n        if (cumsum >= thres) {\n          rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n          top_data[index] = bottom_data_ptr[h * width + w];\n          return;\n        }\n      }\n    }\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wstart = pw * stride_w;\n    int wend = min(wstart + ext_kernel_w, width);\n    // We set cumsum to be 0 to avoid divide-by-zero problems\n    Dtype cumsum = FLT_MIN;\n    Dtype cumvalues = 0.;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n        cumvalues += bottom_data_ptr[h * width + w]\n            * bottom_data_ptr[h * width + w];\n      }\n    }\n    top_data[index] = cumvalues / cumsum;\n  }\n\n}";  // NOLINT
+std::string pooling_sk_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int nthreads,\n__global Dtype* bottom_data,\n                                                  const int num,\n                                                  const int channels,\n                                                  const int height,\n                                                  const int width,\n                                                  const int pooled_height,\n                                                  const int pooled_width,\n                                                  const int kernel_h,\n                                                  const int kernel_w,\n                                                  const int ext_kernel_h,\n                                                  const int ext_kernel_w,\n                                                  const int stride_h,\n                                                  const int stride_w,\n                                                  const int kstride_h,\n                                                  const int kstride_w,\n                                                  const int pad_h,\n                                                  const int pad_w,\n                                                  __global Dtype* top_data,\n                                                  const int use_mask,\n                                                  __global int* mask,\n                                                  __global Dtype* top_mask) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wend = min(wstart + ext_kernel_w, width);\n    hstart = max(hstart, (int) 0);\n    wstart = max(wstart, (int) 0);\n    Dtype maxval = -FLT_MAX;\n    int maxidx = -1;\n    __global Dtype* bottom_data_ptr = bottom_data\n        + (n * channels + c) * height * width;\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        if (bottom_data_ptr[h * width + w] > maxval) {\n          maxidx = h * width + w;\n          maxval = bottom_data_ptr[maxidx];\n        }\n      }\n    }\n    top_data[index] = maxval;\n    if (use_mask == 1) {\n      mask[index] = maxidx;\n    } else {\n      top_mask[index] = maxidx;\n    }\n  }\n}\n\n__kernel void TEMPLATE(max_pool_backward_sk,Dtype)(\n    const int nthreads, __global const Dtype* top_diff, const int use_mask,\n    __global const int* mask, __global const Dtype* top_mask, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, const int pad_h, const int pad_w,\n    __global Dtype* bottom_diff) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n\n    __global const int* mask_ptr = mask;\n    __global const Dtype* top_diff_ptr = top_diff;\n\n// find out the local index\n// find out the local offset\n    int w = index % width;\n    int h = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    int pooled_height_1 = pooled_height - 1;\n    int pooled_width_1 = pooled_width - 1;\n    int phstart = (h < ext_kernel_h) ? h % kstride_h : (h - ext_kernel_h) + 1;\n    int phend =\n        (h >= pooled_height) ?\n            pooled_height_1 - (pooled_height_1 - phstart) % kstride_h : h;\n    int pwstart = (w < ext_kernel_w) ? w % kstride_w : (w - ext_kernel_w) + 1;\n    int pwend =\n        (w >= pooled_width) ?\n            pooled_width_1 - (pooled_width_1 - pwstart) % kstride_w : w;\n\n    Dtype gradient = 0;\n    int offset = (n * channels + c) * pooled_height * pooled_width;\n    top_diff_ptr += offset;\n    if (use_mask == 1) {\n      mask_ptr += offset;\n      for (int ph = phstart; ph <= phend; ph += kstride_h) {\n        for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n          if (mask_ptr[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_ptr[ph * pooled_width + pw];\n          }\n        }\n      }\n    } else {\n      for (int ph = phstart; ph <= phend; ph += kstride_h) {\n        for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n          if (top_mask[ph * pooled_width + pw] == h * width + w) {\n            gradient += top_diff_ptr[ph * pooled_width + pw];\n          }\n        }\n      }\n    }\n    bottom_diff[index] = gradient;\n  }\n}\n\n__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, const int pad_h, const int pad_w,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    int hend = min(hstart + ext_kernel_h, height + pad_h);\n    int wend = min(wstart + ext_kernel_w, width + pad_w);\n    hstart = max(hstart, 0);\n    wstart = max(wstart, 0);\n    hend = min(hend, height);\n    wend = min(wend, width);\n    Dtype aveval = 0;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    int pool_size = 0;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) {\n        aveval += bottom_data_ptr[h * width + w];\n        ++pool_size;\n      }\n    }\n    top_data[index] = aveval / pool_size;\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w, __global Dtype* rand_idx,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wstart = pw * stride_w;\n    int wend = min(wstart + ext_kernel_w, width);\n    Dtype cumsum = 0.;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n      }\n    }\n    float thres = rand_idx[index] * cumsum;\n    // Second pass: get value, and set index.\n    cumsum = 0;\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n        if (cumsum >= thres) {\n          rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n          top_data[index] = bottom_data_ptr[h * width + w];\n          h = hend;\n          w = wend;\n        }\n      }\n    }\n  }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(\n    const int nthreads, __global const Dtype* bottom_data, const int num,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int kernel_h,\n    const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n    const int stride_h, const int stride_w, const int kstride_h,\n    const int kstride_w,\n    __global Dtype* top_data) {\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    int hstart = ph * stride_h;\n    int hend = min(hstart + ext_kernel_h, height);\n    int wstart = pw * stride_w;\n    int wend = min(wstart + ext_kernel_w, width);\n    // We set cumsum to be 0 to avoid divide-by-zero problems\n    Dtype cumsum = FLT_MIN;\n    Dtype cumvalues = 0.;\n    __global const Dtype* bottom_data_ptr = bottom_data;\n    bottom_data_ptr += (n * channels + c) * height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; h += kstride_h) {\n      for (int w = wstart; w < wend; w += kstride_w) {\n        cumsum += bottom_data_ptr[h * width + w];\n        cumvalues += bottom_data_ptr[h * width + w]\n            * bottom_data_ptr[h * width + w];\n      }\n    }\n    top_data[index] = cumvalues / cumsum;\n  }\n\n}";  // NOLINT
 std::string slice_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(slice,Dtype)(const int nthreads,\n                                    __global const Dtype* in_data,\n                                    const int forward, const int num_slices,\n                                    const int slice_size,\n                                    const int bottom_slice_axis,\n                                    const int top_slice_axis,\n                                    const int offset_slice_axis,\n                                    __global Dtype* out_data) {\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    const int total_slice_size = slice_size * top_slice_axis;\n    const int slice_num = index / total_slice_size;\n    const int slice_index = index % total_slice_size;\n    const int bottom_index = slice_index\n        + (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;\n    if (forward == 1) {\n      out_data[index] = in_data[bottom_index];\n    } else {\n      out_data[bottom_index] = in_data[index];\n    }\n  }\n}";  // NOLINT
 std::string softmax_loss_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(softmax_loss_forward,Dtype)(\n    int n, __global const Dtype* prob_data, __global const Dtype* label,\n    __global Dtype* loss,\n    const int num, const int dim, const int spatial_dim,\n    const int has_ignore_label_, const int ignore_label_,\n    __global Dtype* counts) {\n\n  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    const int n = index / spatial_dim;\n    const int s = index % spatial_dim;\n    const int label_value = (int) (label[n * spatial_dim + s]);\n    if (has_ignore_label_ == 1 && label_value == ignore_label_) {\n      loss[index] = 0;\n      counts[index] = 0;\n    } else {\n      loss[index] = -log(\n          max((Dtype) (prob_data[n * dim + label_value * spatial_dim + s]),\n              (Dtype) FLT_MIN));\n      counts[index] = 1;\n    }\n  }\n}\n\n__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int nthreads,\n                                                    __global const Dtype* top,\n                                                    __global const Dtype* label,\n                                                    __global Dtype* bottom_diff,\n                                                    const int num,\n                                                    const int dim,\n                                                    const int spatial_dim,\n                                                    const int has_ignore_label_,\n                                                    const int ignore_label_,\n                                                    __global Dtype* counts) {\n\n  const int channels = dim / spatial_dim;\n\n  for (int index = get_global_id(0); index < nthreads;\n      index += get_global_size(0)) {\n    {\n      const int n = index / spatial_dim;\n      const int s = index % spatial_dim;\n      const int label_value = (int) (label[n * spatial_dim + s]);\n\n      if (has_ignore_label_ == 1 && label_value == ignore_label_) {\n        for (int c = 0; c < channels; ++c) {\n          bottom_diff[n * dim + c * spatial_dim + s] = 0;\n        }\n        counts[index] = 0;\n      } else {\n        bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;\n        counts[index] = 1;\n      }\n    }\n  }\n}";  // NOLINT
 viennacl::ocl::program & RegisterKernels(viennacl::ocl::context *ctx) {
diff --git a/src/caffe/greentea/cl_kernels/pooling_sk.cl b/src/caffe/greentea/cl_kernels/pooling_sk.cl
index 3b2108b870e..da98af58206 100644
--- a/src/caffe/greentea/cl_kernels/pooling_sk.cl
+++ b/src/caffe/greentea/cl_kernels/pooling_sk.cl
@@ -191,7 +191,8 @@ __kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(
         if (cumsum >= thres) {
           rand_idx[index] = ((n * channels + c) * height + h) * width + w;
           top_data[index] = bottom_data_ptr[h * width + w];
-          return;
+          h = hend;
+          w = wend;
         }
       }
     }