diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 1968ace61c9..64e9ba6d396 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -178,6 +178,62 @@ class XavierFiller : public Filler { } }; + + +/*! +@brief Fills a Blob with coefficients of bilinear interpolation for upsampling. +This is intended to be used in DeconvolutionLayer acting as UpsamplingLayer. +You can upsample a feature map with shape of (B, C, H, W) by any integer factor +using the following proto. +\code +layer { + name: "upsample", type: "Deconvolution" + bottom: "{{bottom_name}}" top: "{{top_name}}" + convolution_param { + kernel_size: {{2 * factor - factor % 2}} stride: {{factor}} + num_output: {{C}} group: {{C}} + pad: {{ceil((factor - 1) / 2.)}} + weight_filler: { type: "bilinear_upsampling" } bias_term: false + } + param { lr_mult: 0 decay_mult: 0 } +} +\endcode +Please use this by replacing `{{}}` with your values. By specifying +`num_output: {{C}} group: {{C}}`, it behaves as +channel-wise convolution. The filter shape of this deconvolution layer will be +(C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K) +interpolation kernel for every channel of the filter identically. The resulting +shape of the top feature map will be (B, C, factor * H, factor * W). +Note that the learning rate and the +weight decay are set to 0 in order to keep coefficient values of bilinear +interpolation unchanged during training. If you apply this to an image, this +operation is equivalent to the following call in Python with Scikit.Image. +\code{.py} +out = skimage.transform.rescale(img, factor, mode='constant', cval=0) +\endcode + */ +template +class BilinearFiller : public Filler { + public: + explicit BilinearFiller(const FillerParameter& param) + : Filler(param) {} + virtual void Fill(Blob* blob) { + CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim."; + CHECK_EQ(blob->width(), blob->height()) << "Filter must be square"; + Dtype* data = blob->mutable_cpu_data(); + int f = ceil(blob->width() / 2.); + float c = (2 * f - 1 - f % 2) / (2. * f); + for (int i = 0; i < blob->count(); ++i) { + float x = i % blob->width(); + float y = (i / blob->width()) % blob->height(); + data[i] = ((1.0 - fabs(x / f - c)) * (1.0 - fabs(y / f - c))); + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } +}; + + /** * @brief Fills a Blob with values @f$ x \sim N(0, \sigma^2) @f$ where * @f$ \sigma^2 @f$ is set inversely proportional to number of incoming @@ -240,6 +296,8 @@ Filler* GetFiller(const FillerParameter& param) { return new UniformFiller(param); } else if (type == "xavier") { return new XavierFiller(param); + } else if (type == "bilinear") { + return new BilinearFiller(param); } else if (type == "msra") { return new MSRAFiller(param); } else { diff --git a/src/caffe/greentea/cl_kernels.cpp b/src/caffe/greentea/cl_kernels.cpp index 5af538a716b..1d1f46d03a9 100644 --- a/src/caffe/greentea/cl_kernels.cpp +++ b/src/caffe/greentea/cl_kernels.cpp @@ -21,7 +21,7 @@ std::string lrn_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#en std::string math_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(mul,Dtype)(const int n, __global const Dtype* a,\n const int offa,\n __global Dtype* b,\n const int offb, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[index + offy] = a[index + offa] * b[index + offb];\n }\n}\n\n__kernel void TEMPLATE(div,Dtype)(const int n, __global const Dtype* a,\n const int offa,\n __global Dtype* b,\n const int offb, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[index + offy] = a[index + offa] / b[index + offb];\n }\n}\n\n__kernel void TEMPLATE(add_scalar,Dtype)(const int N, const Dtype alpha,\n__global Dtype* Y,\n const int offY) {\n for (int index = get_global_id(0); index < N; index += get_global_size(0)) {\n Y[offY + index] += alpha;\n }\n}\n\n__kernel void TEMPLATE(add,Dtype)(const int n, __global const Dtype* a,\n const int offa, __global const Dtype* b,\n const int offb, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[offy + index] = a[offa + index] + b[offb + index];\n }\n}\n\n__kernel void TEMPLATE(sub,Dtype)(const int n, __global const Dtype* a,\n const int offa, __global const Dtype* b,\n const int offb, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[offy + index] = a[offa + index] - b[offb + index];\n }\n}\n\n__kernel void TEMPLATE(abs,Dtype)(const int n, __global const Dtype* a,\n const int offa, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[offy + index] = fabs((Dtype)(a[offa + index]));\n }\n}\n\n__kernel void TEMPLATE(exp,Dtype)(const int n, __global const Dtype* a,\n const int offa, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[offy + index] = exp(a[offa + index]);\n }\n}\n\n__kernel void TEMPLATE(log,Dtype)(const int n, __global const Dtype* a,\n const int offa, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[offy + index] = log(a[offa + index]);\n }\n}\n\n__kernel void TEMPLATE(powx,Dtype)(const int n, __global const Dtype* a,\n const int offa, Dtype alpha,\n __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n if(a[offa + index] < 0 && alpha < 1 && alpha > -1) {\n y[offy + index] = NAN;\n } else {\n y[offy + index] = pow(a[offa + index], alpha);\n }\n }\n}\n\n__kernel void TEMPLATE(sign,Dtype)(const int n, __global const Dtype* x,\n const int offx, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[index + offy] = (0.0 < x[index + offx])\n - (x[index + offx] < 0.0);\n }\n}\n\n__kernel void TEMPLATE(sgnbit,Dtype)(const int n, __global const Dtype* x,\n const int offx, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[index + offy] = signbit(x[index + offx]);\n }\n}"; // NOLINT std::string mergecrop_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(merge_copy_forward, Dtype)(\n const int nthreads, __global const Dtype* bottom_a,\n __global const Dtype* bottom_b,\n __global Dtype* top,\n int num, int channels_a, int channels_b, int height_a, int width_a,\n int height_b, int width_b) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n\n int pad_h = (height_b - height_a) / 2;\n int pad_w = (width_b - width_a) / 2;\n\n int batch_id = index / ((channels_a + channels_b) * height_a * width_a);\n\n int bottom_id = ((index\n - batch_id * (channels_a + channels_b) * height_a * width_a)\n / (channels_a * height_a * width_a)) % 2;\n\n int h = ((index / width_a) % height_a);\n int w = (index % width_a);\n\n if (bottom_id == 0) {\n int channel_id = (index / ((width_a * height_a)) % channels_a);\n int aidx = ((((batch_id) * channels_a + channel_id) * height_a + h)\n * width_a + w);\n top[index] = bottom_a[aidx];\n } else {\n int channel_id = (index / ((width_a * height_a)) % channels_b);\n int bidx =\n (((batch_id) * channels_b + channel_id) * height_b\n * width_b) + width_b * (h + pad_h) + pad_w + w;\n top[index] = bottom_b[bidx];\n }\n }\n\n}\n\n__kernel void TEMPLATE(merge_copy_backward,Dtype)(const int nthreads,\n__global Dtype* bottom_a,\n __global const Dtype* top,\n int num, int channels_a,\n int channels_b, int height_a,\n int width_a, int height_b,\n int width_b) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int batch_id = index / ((channels_a + channels_b) * height_a * width_a);\n\n int bottom_id = ((index\n - batch_id * (channels_a + channels_b) * height_a * width_a)\n / (channels_a * height_a * width_a)) % 2;\n\n int h = ((index / width_a) % height_a);\n int w = (index % width_a);\n\n if (bottom_id == 0) {\n int channel_id = (index / ((width_a * height_a)) % channels_a);\n int aidx = ((((batch_id) * channels_a + channel_id) * height_a + h)\n * width_a + w);\n bottom_a[aidx] = top[index];\n }\n }\n}"; // NOLINT std::string pooling_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int stride_h, const int stride_w, const int pad_h,\n const int pad_w,\n __global Dtype* top_data,\n const int use_mask, __global int* mask, __global Dtype* top_mask) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n const int pw = index % pooled_width;\n const int ph = (index / pooled_width) % pooled_height;\n const int c = (index / pooled_width / pooled_height) % channels;\n const int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n const int hend = min(hstart + kernel_h, height);\n const int wend = min(wstart + kernel_w, width);\n hstart = max(hstart, 0);\n wstart = max(wstart, 0);\n Dtype maxval = -FLT_MAX;\n int maxidx = -1;\n __global const Dtype* bottom_slice = bottom_data\n + (n * channels + c) * height * width;\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n if (bottom_slice[h * width + w] > maxval) {\n maxidx = h * width + w;\n maxval = bottom_slice[maxidx];\n }\n }\n }\n top_data[index] = maxval;\n if (use_mask == 1) {\n mask[index] = maxidx;\n } else {\n top_mask[index] = maxidx;\n }\n }\n}\n\n__kernel void TEMPLATE(ave_pool_forward,Dtype)(\n const int nthreads, __global const Dtype* const bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int stride_h, const int stride_w, const int pad_h,\n const int pad_w, __global Dtype* top_data) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n {\n const int pw = index % pooled_width;\n const int ph = (index / pooled_width) % pooled_height;\n const int c = (index / pooled_width / pooled_height) % channels;\n const int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + kernel_h, height + pad_h);\n int wend = min(wstart + kernel_w, width + pad_w);\n const int pool_size = (hend - hstart) * (wend - wstart);\n hstart = max(hstart, 0);\n wstart = max(wstart, 0);\n hend = min(hend, height);\n wend = min(wend, width);\n Dtype aveval = 0;\n __global const Dtype* bottom_slice = bottom_data\n + (n * channels + c) * height * width;\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n aveval += bottom_slice[h * width + w];\n }\n }\n top_data[index] = aveval / pool_size;\n }\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int stride_h, const int stride_w,\n __global Dtype* rand_idx,\n __global Dtype* top_data) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n const int pw = index % pooled_width;\n const int ph = (index / pooled_width) % pooled_height;\n const int c = (index / pooled_width / pooled_height) % channels;\n const int n = index / pooled_width / pooled_height / channels;\n const int hstart = ph * stride_h;\n const int hend = min(hstart + kernel_h, height);\n const int wstart = pw * stride_w;\n const int wend = min(wstart + kernel_w, width);\n Dtype cumsum = 0.;\n __global const Dtype* bottom_slice = bottom_data\n + (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n cumsum += bottom_slice[h * width + w];\n }\n }\n const float thres = rand_idx[index] * cumsum;\n // Second pass: get value, and set index.\n cumsum = 0;\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n cumsum += bottom_slice[h * width + w];\n if (cumsum >= thres) {\n rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n top_data[index] = bottom_slice[h * width + w];\n h = hend;\n w = wend;\n }\n }\n }\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(\n const int nthreads, __global const Dtype* const bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int stride_h, const int stride_w,\n __global Dtype* top_data) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n const int pw = index % pooled_width;\n const int ph = (index / pooled_width) % pooled_height;\n const int c = (index / pooled_width / pooled_height) % channels;\n const int n = index / pooled_width / pooled_height / channels;\n const int hstart = ph * stride_h;\n const int hend = min(hstart + kernel_h, height);\n const int wstart = pw * stride_w;\n const int wend = min(wstart + kernel_w, width);\n // We set cumsum to be 0 to avoid divide-by-zero problems\n Dtype cumsum = FLT_MIN;\n Dtype cumvalues = 0.;\n __global const Dtype* bottom_slice = bottom_data\n + (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n cumsum += bottom_slice[h * width + w];\n cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];\n }\n }\n top_data[index] = cumvalues / cumsum;\n }\n}\n\n__kernel void TEMPLATE(max_pool_backward,Dtype)(const int nthreads,\n __global const Dtype* top_diff,\n const int use_mask,\n __global const int* mask,\n __global const Dtype* top_mask,\n const int num,\n const int channels,\n const int height,\n const int width,\n const int pooled_height,\n const int pooled_width,\n const int kernel_h,\n const int kernel_w,\n const int stride_h,\n const int stride_w,\n const int pad_h,\n const int pad_w,\n __global Dtype* bottom_diff) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n // find out the local index\n // find out the local offset\n const int w = index % width;\n const int h = (index / width) % height;\n const int c = (index / width / height) % channels;\n const int n = index / width / height / channels;\n const int phstart =\n (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;\n const int phend = min((h + pad_h) / stride_h + 1, pooled_height);\n const int pwstart =\n (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;\n const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);\n Dtype gradient = 0;\n const int offset = (n * channels + c) * pooled_height * pooled_width;\n __global const Dtype* top_diff_slice = top_diff + offset;\n if (use_mask == 1) {\n __global const int* mask_slice = mask + offset;\n for (int ph = phstart; ph < phend; ++ph) {\n for (int pw = pwstart; pw < pwend; ++pw) {\n if (mask_slice[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_slice[ph * pooled_width + pw];\n }\n }\n }\n } else {\n __global const Dtype* top_mask_slice = top_mask + offset;\n for (int ph = phstart; ph < phend; ++ph) {\n for (int pw = pwstart; pw < pwend; ++pw) {\n if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_slice[ph * pooled_width + pw];\n }\n }\n }\n }\n bottom_diff[index] = gradient;\n }\n}\n\n__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int nthreads,\n __global const Dtype* top_diff,\n const int num,\n const int channels,\n const int height,\n const int width,\n const int pooled_height,\n const int pooled_width,\n const int kernel_h,\n const int kernel_w,\n const int stride_h,\n const int stride_w,\n const int pad_h,\n const int pad_w,\n __global Dtype* bottom_diff) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n // find out the local index\n // find out the local offset\n const int w = index % width + pad_w;\n const int h = (index / width) % height + pad_h;\n const int c = (index / width / height) % channels;\n const int n = index / width / height / channels;\n const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n const int phend = min(h / stride_h + 1, pooled_height);\n const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n const int pwend = min(w / stride_w + 1, pooled_width);\n Dtype gradient = 0;\n __global const Dtype* const top_diff_slice = top_diff\n + (n * channels + c) * pooled_height * pooled_width;\n for (int ph = phstart; ph < phend; ++ph) {\n for (int pw = pwstart; pw < pwend; ++pw) {\n // figure out the pooling size\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + kernel_h, height + pad_h);\n int wend = min(wstart + kernel_w, width + pad_w);\n int pool_size = (hend - hstart) * (wend - wstart);\n gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;\n }\n }\n bottom_diff[index] = gradient;\n }\n}\n\n__kernel void TEMPLATE(sto_pool_backward,Dtype)(\n const int nthreads, __global const Dtype* rand_idx,\n __global const Dtype* const top_diff, const int num, const int channels,\n const int height, const int width, const int pooled_height,\n const int pooled_width, const int kernel_h, const int kernel_w,\n const int stride_h, const int stride_w, __global Dtype* bottom_diff) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n // find out the local index\n // find out the local offset\n const int w = index % width;\n const int h = (index / width) % height;\n const int c = (index / width / height) % channels;\n const int n = index / width / height / channels;\n const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n const int phend = min(h / stride_h + 1, pooled_height);\n const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n const int pwend = min(w / stride_w + 1, pooled_width);\n Dtype gradient = 0;\n __global const Dtype* rand_idx_slice = rand_idx\n + (n * channels + c) * pooled_height * pooled_width;\n __global const Dtype* top_diff_slice = top_diff\n + (n * channels + c) * pooled_height * pooled_width;\n for (int ph = phstart; ph < phend; ++ph) {\n for (int pw = pwstart; pw < pwend; ++pw) {\n gradient += top_diff_slice[ph * pooled_width + pw]\n * (index == (int) (rand_idx_slice[ph * pooled_width + pw]));\n }\n }\n bottom_diff[index] = gradient;\n }\n}"; // NOLINT -std::string pooling_sk_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int nthreads,\n__global Dtype* bottom_data,\n const int num,\n const int channels,\n const int height,\n const int width,\n const int pooled_height,\n const int pooled_width,\n const int kernel_h,\n const int kernel_w,\n const int ext_kernel_h,\n const int ext_kernel_w,\n const int stride_h,\n const int stride_w,\n const int kstride_h,\n const int kstride_w,\n const int pad_h,\n const int pad_w,\n __global Dtype* top_data,\n const int use_mask,\n __global int* mask,\n __global Dtype* top_mask) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + ext_kernel_h, height);\n int wend = min(wstart + ext_kernel_w, width);\n hstart = max(hstart, (int) 0);\n wstart = max(wstart, (int) 0);\n Dtype maxval = -FLT_MAX;\n int maxidx = -1;\n __global Dtype* bottom_data_ptr = bottom_data\n + (n * channels + c) * height * width;\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n if (bottom_data_ptr[h * width + w] > maxval) {\n maxidx = h * width + w;\n maxval = bottom_data_ptr[maxidx];\n }\n }\n }\n top_data[index] = maxval;\n if (use_mask == 1) {\n mask[index] = maxidx;\n } else {\n top_mask[index] = maxidx;\n }\n }\n}\n\n__kernel void TEMPLATE(max_pool_backward_sk,Dtype)(\n const int nthreads, __global const Dtype* top_diff, const int use_mask,\n __global const int* mask, __global const Dtype* top_mask, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, const int pad_h, const int pad_w,\n __global Dtype* bottom_diff) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n\n __global const int* mask_ptr = mask;\n __global const Dtype* top_diff_ptr = top_diff;\n\n// find out the local index\n// find out the local offset\n int w = index % width;\n int h = (index / width) % height;\n int c = (index / width / height) % channels;\n int n = index / width / height / channels;\n\n int pooled_height_1 = pooled_height - 1;\n int pooled_width_1 = pooled_width - 1;\n int phstart = (h < ext_kernel_h) ? h % kstride_h : (h - ext_kernel_h) + 1;\n int phend =\n (h >= pooled_height) ?\n pooled_height_1 - (pooled_height_1 - phstart) % kstride_h : h;\n int pwstart = (w < ext_kernel_w) ? w % kstride_w : (w - ext_kernel_w) + 1;\n int pwend =\n (w >= pooled_width) ?\n pooled_width_1 - (pooled_width_1 - pwstart) % kstride_w : w;\n\n Dtype gradient = 0;\n int offset = (n * channels + c) * pooled_height * pooled_width;\n top_diff_ptr += offset;\n if (use_mask == 1) {\n mask_ptr += offset;\n for (int ph = phstart; ph <= phend; ph += kstride_h) {\n for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n if (mask_ptr[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_ptr[ph * pooled_width + pw];\n }\n }\n }\n } else {\n for (int ph = phstart; ph <= phend; ph += kstride_h) {\n for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n if (top_mask[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_ptr[ph * pooled_width + pw];\n }\n }\n }\n }\n bottom_diff[index] = gradient;\n }\n}\n\n__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, const int pad_h, const int pad_w,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + ext_kernel_h, height + pad_h);\n int wend = min(wstart + ext_kernel_w, width + pad_w);\n hstart = max(hstart, 0);\n wstart = max(wstart, 0);\n hend = min(hend, height);\n wend = min(wend, width);\n Dtype aveval = 0;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n int pool_size = 0;\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n aveval += bottom_data_ptr[h * width + w];\n ++pool_size;\n }\n }\n top_data[index] = aveval / pool_size;\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, __global Dtype* rand_idx,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h;\n int hend = min(hstart + ext_kernel_h, height);\n int wstart = pw * stride_w;\n int wend = min(wstart + ext_kernel_w, width);\n Dtype cumsum = 0.;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n }\n }\n float thres = rand_idx[index] * cumsum;\n // Second pass: get value, and set index.\n cumsum = 0;\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n if (cumsum >= thres) {\n rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n top_data[index] = bottom_data_ptr[h * width + w];\n return;\n }\n }\n }\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h;\n int hend = min(hstart + ext_kernel_h, height);\n int wstart = pw * stride_w;\n int wend = min(wstart + ext_kernel_w, width);\n // We set cumsum to be 0 to avoid divide-by-zero problems\n Dtype cumsum = FLT_MIN;\n Dtype cumvalues = 0.;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n cumvalues += bottom_data_ptr[h * width + w]\n * bottom_data_ptr[h * width + w];\n }\n }\n top_data[index] = cumvalues / cumsum;\n }\n\n}"; // NOLINT +std::string pooling_sk_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int nthreads,\n__global Dtype* bottom_data,\n const int num,\n const int channels,\n const int height,\n const int width,\n const int pooled_height,\n const int pooled_width,\n const int kernel_h,\n const int kernel_w,\n const int ext_kernel_h,\n const int ext_kernel_w,\n const int stride_h,\n const int stride_w,\n const int kstride_h,\n const int kstride_w,\n const int pad_h,\n const int pad_w,\n __global Dtype* top_data,\n const int use_mask,\n __global int* mask,\n __global Dtype* top_mask) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + ext_kernel_h, height);\n int wend = min(wstart + ext_kernel_w, width);\n hstart = max(hstart, (int) 0);\n wstart = max(wstart, (int) 0);\n Dtype maxval = -FLT_MAX;\n int maxidx = -1;\n __global Dtype* bottom_data_ptr = bottom_data\n + (n * channels + c) * height * width;\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n if (bottom_data_ptr[h * width + w] > maxval) {\n maxidx = h * width + w;\n maxval = bottom_data_ptr[maxidx];\n }\n }\n }\n top_data[index] = maxval;\n if (use_mask == 1) {\n mask[index] = maxidx;\n } else {\n top_mask[index] = maxidx;\n }\n }\n}\n\n__kernel void TEMPLATE(max_pool_backward_sk,Dtype)(\n const int nthreads, __global const Dtype* top_diff, const int use_mask,\n __global const int* mask, __global const Dtype* top_mask, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, const int pad_h, const int pad_w,\n __global Dtype* bottom_diff) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n\n __global const int* mask_ptr = mask;\n __global const Dtype* top_diff_ptr = top_diff;\n\n// find out the local index\n// find out the local offset\n int w = index % width;\n int h = (index / width) % height;\n int c = (index / width / height) % channels;\n int n = index / width / height / channels;\n\n int pooled_height_1 = pooled_height - 1;\n int pooled_width_1 = pooled_width - 1;\n int phstart = (h < ext_kernel_h) ? h % kstride_h : (h - ext_kernel_h) + 1;\n int phend =\n (h >= pooled_height) ?\n pooled_height_1 - (pooled_height_1 - phstart) % kstride_h : h;\n int pwstart = (w < ext_kernel_w) ? w % kstride_w : (w - ext_kernel_w) + 1;\n int pwend =\n (w >= pooled_width) ?\n pooled_width_1 - (pooled_width_1 - pwstart) % kstride_w : w;\n\n Dtype gradient = 0;\n int offset = (n * channels + c) * pooled_height * pooled_width;\n top_diff_ptr += offset;\n if (use_mask == 1) {\n mask_ptr += offset;\n for (int ph = phstart; ph <= phend; ph += kstride_h) {\n for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n if (mask_ptr[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_ptr[ph * pooled_width + pw];\n }\n }\n }\n } else {\n for (int ph = phstart; ph <= phend; ph += kstride_h) {\n for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n if (top_mask[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_ptr[ph * pooled_width + pw];\n }\n }\n }\n }\n bottom_diff[index] = gradient;\n }\n}\n\n__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, const int pad_h, const int pad_w,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + ext_kernel_h, height + pad_h);\n int wend = min(wstart + ext_kernel_w, width + pad_w);\n hstart = max(hstart, 0);\n wstart = max(wstart, 0);\n hend = min(hend, height);\n wend = min(wend, width);\n Dtype aveval = 0;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n int pool_size = 0;\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n aveval += bottom_data_ptr[h * width + w];\n ++pool_size;\n }\n }\n top_data[index] = aveval / pool_size;\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, __global Dtype* rand_idx,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h;\n int hend = min(hstart + ext_kernel_h, height);\n int wstart = pw * stride_w;\n int wend = min(wstart + ext_kernel_w, width);\n Dtype cumsum = 0.;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n }\n }\n float thres = rand_idx[index] * cumsum;\n // Second pass: get value, and set index.\n cumsum = 0;\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n if (cumsum >= thres) {\n rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n top_data[index] = bottom_data_ptr[h * width + w];\n h = hend;\n w = wend;\n }\n }\n }\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h;\n int hend = min(hstart + ext_kernel_h, height);\n int wstart = pw * stride_w;\n int wend = min(wstart + ext_kernel_w, width);\n // We set cumsum to be 0 to avoid divide-by-zero problems\n Dtype cumsum = FLT_MIN;\n Dtype cumvalues = 0.;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n cumvalues += bottom_data_ptr[h * width + w]\n * bottom_data_ptr[h * width + w];\n }\n }\n top_data[index] = cumvalues / cumsum;\n }\n\n}"; // NOLINT std::string slice_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(slice,Dtype)(const int nthreads,\n __global const Dtype* in_data,\n const int forward, const int num_slices,\n const int slice_size,\n const int bottom_slice_axis,\n const int top_slice_axis,\n const int offset_slice_axis,\n __global Dtype* out_data) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n const int total_slice_size = slice_size * top_slice_axis;\n const int slice_num = index / total_slice_size;\n const int slice_index = index % total_slice_size;\n const int bottom_index = slice_index\n + (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;\n if (forward == 1) {\n out_data[index] = in_data[bottom_index];\n } else {\n out_data[bottom_index] = in_data[index];\n }\n }\n}"; // NOLINT std::string softmax_loss_float = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(softmax_loss_forward,Dtype)(\n int n, __global const Dtype* prob_data, __global const Dtype* label,\n __global Dtype* loss,\n const int num, const int dim, const int spatial_dim,\n const int has_ignore_label_, const int ignore_label_,\n __global Dtype* counts) {\n\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n const int n = index / spatial_dim;\n const int s = index % spatial_dim;\n const int label_value = (int) (label[n * spatial_dim + s]);\n if (has_ignore_label_ == 1 && label_value == ignore_label_) {\n loss[index] = 0;\n counts[index] = 0;\n } else {\n loss[index] = -log(\n max((Dtype) (prob_data[n * dim + label_value * spatial_dim + s]),\n (Dtype) FLT_MIN));\n counts[index] = 1;\n }\n }\n}\n\n__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int nthreads,\n __global const Dtype* top,\n __global const Dtype* label,\n __global Dtype* bottom_diff,\n const int num,\n const int dim,\n const int spatial_dim,\n const int has_ignore_label_,\n const int ignore_label_,\n __global Dtype* counts) {\n\n const int channels = dim / spatial_dim;\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n {\n const int n = index / spatial_dim;\n const int s = index % spatial_dim;\n const int label_value = (int) (label[n * spatial_dim + s]);\n\n if (has_ignore_label_ == 1 && label_value == ignore_label_) {\n for (int c = 0; c < channels; ++c) {\n bottom_diff[n * dim + c * spatial_dim + s] = 0;\n }\n counts[index] = 0;\n } else {\n bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;\n counts[index] = 1;\n }\n }\n }\n}"; // NOLINT std::string activation_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(relu_forward,Dtype)(const int n,\n __global const Dtype* in,\n __global Dtype* out,\n Dtype negative_slope) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;\n }\n}\n\n__kernel void TEMPLATE(relu_backward,Dtype)(const int n,\n __global const Dtype* in_diff,\n __global const Dtype* in_data,\n __global Dtype* out_diff,\n Dtype negative_slope) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n out_diff[index] = in_diff[index]\n * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope);\n }\n}\n\n__kernel void TEMPLATE(tanh_forward,Dtype)(const int n,\n __global const Dtype* in,\n __global Dtype* out) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n out[index] = tanh(in[index]);\n }\n}\n\n__kernel void TEMPLATE(tanh_backward,Dtype)(const int n,\n __global const Dtype* in_diff,\n __global const Dtype* out_data,\n __global Dtype* out_diff) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n Dtype tanhx = out_data[index];\n out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);\n }\n}\n\n__kernel void TEMPLATE(sigmoid_forward,Dtype)(const int n,\n __global const Dtype* in,\n __global Dtype* out) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n out[index] = 1. / (1. + exp(-in[index]));\n }\n}\n\n__kernel void TEMPLATE(sigmoid_backward,Dtype)(const int n,\n __global const Dtype* in_diff,\n __global const Dtype* out_data,\n __global Dtype* out_diff) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n const Dtype sigmoid_x = out_data[index];\n out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);\n }\n}\n\n__kernel void TEMPLATE(threshold,Dtype)(const int n, const Dtype threshold,\n __global const Dtype* in,\n __global Dtype* out) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n out[index] = in[index] > threshold ? 1 : 0;\n }\n}\n\n__kernel void TEMPLATE(prelu_forward,Dtype)(const int n, const int channels,\n const int dim,\n __global const Dtype* in,\n __global Dtype* out,\n __global const Dtype* slope_data,\n const int div_factor) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n int c = (index / dim) % channels / div_factor;\n out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];\n }\n}\n\n__kernel void TEMPLATE(prelu_backward,Dtype)(const int n, const int channels,\n const int dim,\n __global const Dtype* in_diff,\n __global const Dtype* in_data,\n __global Dtype* out_diff,\n __global const Dtype* slope_data,\n const int div_factor) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n int c = (index / dim) % channels / div_factor;\n out_diff[index] = in_diff[index]\n * ((in_data[index] > 0) + (in_data[index] <= 0) * slope_data[c]);\n }\n}\n\n__kernel void TEMPLATE(prelu_param_backward,Dtype)(const int n,\n __global const Dtype* in_diff, const int in_diff_off,\n __global const Dtype* in_data, const int in_data_off,\n __global Dtype* out_diff) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n out_diff[index] = in_diff[index + in_diff_off] * in_data[index + in_data_off] * (in_data[index + in_data_off] <= 0);\n }\n}"; // NOLINT @@ -39,7 +39,7 @@ std::string lrn_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#e std::string math_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(mul,Dtype)(const int n, __global const Dtype* a,\n const int offa,\n __global Dtype* b,\n const int offb, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[index + offy] = a[index + offa] * b[index + offb];\n }\n}\n\n__kernel void TEMPLATE(div,Dtype)(const int n, __global const Dtype* a,\n const int offa,\n __global Dtype* b,\n const int offb, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[index + offy] = a[index + offa] / b[index + offb];\n }\n}\n\n__kernel void TEMPLATE(add_scalar,Dtype)(const int N, const Dtype alpha,\n__global Dtype* Y,\n const int offY) {\n for (int index = get_global_id(0); index < N; index += get_global_size(0)) {\n Y[offY + index] += alpha;\n }\n}\n\n__kernel void TEMPLATE(add,Dtype)(const int n, __global const Dtype* a,\n const int offa, __global const Dtype* b,\n const int offb, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[offy + index] = a[offa + index] + b[offb + index];\n }\n}\n\n__kernel void TEMPLATE(sub,Dtype)(const int n, __global const Dtype* a,\n const int offa, __global const Dtype* b,\n const int offb, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[offy + index] = a[offa + index] - b[offb + index];\n }\n}\n\n__kernel void TEMPLATE(abs,Dtype)(const int n, __global const Dtype* a,\n const int offa, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[offy + index] = fabs((Dtype)(a[offa + index]));\n }\n}\n\n__kernel void TEMPLATE(exp,Dtype)(const int n, __global const Dtype* a,\n const int offa, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[offy + index] = exp(a[offa + index]);\n }\n}\n\n__kernel void TEMPLATE(log,Dtype)(const int n, __global const Dtype* a,\n const int offa, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[offy + index] = log(a[offa + index]);\n }\n}\n\n__kernel void TEMPLATE(powx,Dtype)(const int n, __global const Dtype* a,\n const int offa, Dtype alpha,\n __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n if(a[offa + index] < 0 && alpha < 1 && alpha > -1) {\n y[offy + index] = NAN;\n } else {\n y[offy + index] = pow(a[offa + index], alpha);\n }\n }\n}\n\n__kernel void TEMPLATE(sign,Dtype)(const int n, __global const Dtype* x,\n const int offx, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[index + offy] = (0.0 < x[index + offx])\n - (x[index + offx] < 0.0);\n }\n}\n\n__kernel void TEMPLATE(sgnbit,Dtype)(const int n, __global const Dtype* x,\n const int offx, __global Dtype* y,\n const int offy) {\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n y[index + offy] = signbit(x[index + offx]);\n }\n}"; // NOLINT std::string mergecrop_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(merge_copy_forward, Dtype)(\n const int nthreads, __global const Dtype* bottom_a,\n __global const Dtype* bottom_b,\n __global Dtype* top,\n int num, int channels_a, int channels_b, int height_a, int width_a,\n int height_b, int width_b) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n\n int pad_h = (height_b - height_a) / 2;\n int pad_w = (width_b - width_a) / 2;\n\n int batch_id = index / ((channels_a + channels_b) * height_a * width_a);\n\n int bottom_id = ((index\n - batch_id * (channels_a + channels_b) * height_a * width_a)\n / (channels_a * height_a * width_a)) % 2;\n\n int h = ((index / width_a) % height_a);\n int w = (index % width_a);\n\n if (bottom_id == 0) {\n int channel_id = (index / ((width_a * height_a)) % channels_a);\n int aidx = ((((batch_id) * channels_a + channel_id) * height_a + h)\n * width_a + w);\n top[index] = bottom_a[aidx];\n } else {\n int channel_id = (index / ((width_a * height_a)) % channels_b);\n int bidx =\n (((batch_id) * channels_b + channel_id) * height_b\n * width_b) + width_b * (h + pad_h) + pad_w + w;\n top[index] = bottom_b[bidx];\n }\n }\n\n}\n\n__kernel void TEMPLATE(merge_copy_backward,Dtype)(const int nthreads,\n__global Dtype* bottom_a,\n __global const Dtype* top,\n int num, int channels_a,\n int channels_b, int height_a,\n int width_a, int height_b,\n int width_b) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int batch_id = index / ((channels_a + channels_b) * height_a * width_a);\n\n int bottom_id = ((index\n - batch_id * (channels_a + channels_b) * height_a * width_a)\n / (channels_a * height_a * width_a)) % 2;\n\n int h = ((index / width_a) % height_a);\n int w = (index % width_a);\n\n if (bottom_id == 0) {\n int channel_id = (index / ((width_a * height_a)) % channels_a);\n int aidx = ((((batch_id) * channels_a + channel_id) * height_a + h)\n * width_a + w);\n bottom_a[aidx] = top[index];\n }\n }\n}"; // NOLINT std::string pooling_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int stride_h, const int stride_w, const int pad_h,\n const int pad_w,\n __global Dtype* top_data,\n const int use_mask, __global int* mask, __global Dtype* top_mask) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n const int pw = index % pooled_width;\n const int ph = (index / pooled_width) % pooled_height;\n const int c = (index / pooled_width / pooled_height) % channels;\n const int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n const int hend = min(hstart + kernel_h, height);\n const int wend = min(wstart + kernel_w, width);\n hstart = max(hstart, 0);\n wstart = max(wstart, 0);\n Dtype maxval = -FLT_MAX;\n int maxidx = -1;\n __global const Dtype* bottom_slice = bottom_data\n + (n * channels + c) * height * width;\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n if (bottom_slice[h * width + w] > maxval) {\n maxidx = h * width + w;\n maxval = bottom_slice[maxidx];\n }\n }\n }\n top_data[index] = maxval;\n if (use_mask == 1) {\n mask[index] = maxidx;\n } else {\n top_mask[index] = maxidx;\n }\n }\n}\n\n__kernel void TEMPLATE(ave_pool_forward,Dtype)(\n const int nthreads, __global const Dtype* const bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int stride_h, const int stride_w, const int pad_h,\n const int pad_w, __global Dtype* top_data) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n {\n const int pw = index % pooled_width;\n const int ph = (index / pooled_width) % pooled_height;\n const int c = (index / pooled_width / pooled_height) % channels;\n const int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + kernel_h, height + pad_h);\n int wend = min(wstart + kernel_w, width + pad_w);\n const int pool_size = (hend - hstart) * (wend - wstart);\n hstart = max(hstart, 0);\n wstart = max(wstart, 0);\n hend = min(hend, height);\n wend = min(wend, width);\n Dtype aveval = 0;\n __global const Dtype* bottom_slice = bottom_data\n + (n * channels + c) * height * width;\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n aveval += bottom_slice[h * width + w];\n }\n }\n top_data[index] = aveval / pool_size;\n }\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int stride_h, const int stride_w,\n __global Dtype* rand_idx,\n __global Dtype* top_data) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n const int pw = index % pooled_width;\n const int ph = (index / pooled_width) % pooled_height;\n const int c = (index / pooled_width / pooled_height) % channels;\n const int n = index / pooled_width / pooled_height / channels;\n const int hstart = ph * stride_h;\n const int hend = min(hstart + kernel_h, height);\n const int wstart = pw * stride_w;\n const int wend = min(wstart + kernel_w, width);\n Dtype cumsum = 0.;\n __global const Dtype* bottom_slice = bottom_data\n + (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n cumsum += bottom_slice[h * width + w];\n }\n }\n const float thres = rand_idx[index] * cumsum;\n // Second pass: get value, and set index.\n cumsum = 0;\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n cumsum += bottom_slice[h * width + w];\n if (cumsum >= thres) {\n rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n top_data[index] = bottom_slice[h * width + w];\n h = hend;\n w = wend;\n }\n }\n }\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(\n const int nthreads, __global const Dtype* const bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int stride_h, const int stride_w,\n __global Dtype* top_data) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n const int pw = index % pooled_width;\n const int ph = (index / pooled_width) % pooled_height;\n const int c = (index / pooled_width / pooled_height) % channels;\n const int n = index / pooled_width / pooled_height / channels;\n const int hstart = ph * stride_h;\n const int hend = min(hstart + kernel_h, height);\n const int wstart = pw * stride_w;\n const int wend = min(wstart + kernel_w, width);\n // We set cumsum to be 0 to avoid divide-by-zero problems\n Dtype cumsum = FLT_MIN;\n Dtype cumvalues = 0.;\n __global const Dtype* bottom_slice = bottom_data\n + (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n cumsum += bottom_slice[h * width + w];\n cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];\n }\n }\n top_data[index] = cumvalues / cumsum;\n }\n}\n\n__kernel void TEMPLATE(max_pool_backward,Dtype)(const int nthreads,\n __global const Dtype* top_diff,\n const int use_mask,\n __global const int* mask,\n __global const Dtype* top_mask,\n const int num,\n const int channels,\n const int height,\n const int width,\n const int pooled_height,\n const int pooled_width,\n const int kernel_h,\n const int kernel_w,\n const int stride_h,\n const int stride_w,\n const int pad_h,\n const int pad_w,\n __global Dtype* bottom_diff) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n // find out the local index\n // find out the local offset\n const int w = index % width;\n const int h = (index / width) % height;\n const int c = (index / width / height) % channels;\n const int n = index / width / height / channels;\n const int phstart =\n (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;\n const int phend = min((h + pad_h) / stride_h + 1, pooled_height);\n const int pwstart =\n (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;\n const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);\n Dtype gradient = 0;\n const int offset = (n * channels + c) * pooled_height * pooled_width;\n __global const Dtype* top_diff_slice = top_diff + offset;\n if (use_mask == 1) {\n __global const int* mask_slice = mask + offset;\n for (int ph = phstart; ph < phend; ++ph) {\n for (int pw = pwstart; pw < pwend; ++pw) {\n if (mask_slice[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_slice[ph * pooled_width + pw];\n }\n }\n }\n } else {\n __global const Dtype* top_mask_slice = top_mask + offset;\n for (int ph = phstart; ph < phend; ++ph) {\n for (int pw = pwstart; pw < pwend; ++pw) {\n if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_slice[ph * pooled_width + pw];\n }\n }\n }\n }\n bottom_diff[index] = gradient;\n }\n}\n\n__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int nthreads,\n __global const Dtype* top_diff,\n const int num,\n const int channels,\n const int height,\n const int width,\n const int pooled_height,\n const int pooled_width,\n const int kernel_h,\n const int kernel_w,\n const int stride_h,\n const int stride_w,\n const int pad_h,\n const int pad_w,\n __global Dtype* bottom_diff) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n // find out the local index\n // find out the local offset\n const int w = index % width + pad_w;\n const int h = (index / width) % height + pad_h;\n const int c = (index / width / height) % channels;\n const int n = index / width / height / channels;\n const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n const int phend = min(h / stride_h + 1, pooled_height);\n const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n const int pwend = min(w / stride_w + 1, pooled_width);\n Dtype gradient = 0;\n __global const Dtype* const top_diff_slice = top_diff\n + (n * channels + c) * pooled_height * pooled_width;\n for (int ph = phstart; ph < phend; ++ph) {\n for (int pw = pwstart; pw < pwend; ++pw) {\n // figure out the pooling size\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + kernel_h, height + pad_h);\n int wend = min(wstart + kernel_w, width + pad_w);\n int pool_size = (hend - hstart) * (wend - wstart);\n gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;\n }\n }\n bottom_diff[index] = gradient;\n }\n}\n\n__kernel void TEMPLATE(sto_pool_backward,Dtype)(\n const int nthreads, __global const Dtype* rand_idx,\n __global const Dtype* const top_diff, const int num, const int channels,\n const int height, const int width, const int pooled_height,\n const int pooled_width, const int kernel_h, const int kernel_w,\n const int stride_h, const int stride_w, __global Dtype* bottom_diff) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n // find out the local index\n // find out the local offset\n const int w = index % width;\n const int h = (index / width) % height;\n const int c = (index / width / height) % channels;\n const int n = index / width / height / channels;\n const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n const int phend = min(h / stride_h + 1, pooled_height);\n const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;\n const int pwend = min(w / stride_w + 1, pooled_width);\n Dtype gradient = 0;\n __global const Dtype* rand_idx_slice = rand_idx\n + (n * channels + c) * pooled_height * pooled_width;\n __global const Dtype* top_diff_slice = top_diff\n + (n * channels + c) * pooled_height * pooled_width;\n for (int ph = phstart; ph < phend; ++ph) {\n for (int pw = pwstart; pw < pwend; ++pw) {\n gradient += top_diff_slice[ph * pooled_width + pw]\n * (index == (int) (rand_idx_slice[ph * pooled_width + pw]));\n }\n }\n bottom_diff[index] = gradient;\n }\n}"; // NOLINT -std::string pooling_sk_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int nthreads,\n__global Dtype* bottom_data,\n const int num,\n const int channels,\n const int height,\n const int width,\n const int pooled_height,\n const int pooled_width,\n const int kernel_h,\n const int kernel_w,\n const int ext_kernel_h,\n const int ext_kernel_w,\n const int stride_h,\n const int stride_w,\n const int kstride_h,\n const int kstride_w,\n const int pad_h,\n const int pad_w,\n __global Dtype* top_data,\n const int use_mask,\n __global int* mask,\n __global Dtype* top_mask) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + ext_kernel_h, height);\n int wend = min(wstart + ext_kernel_w, width);\n hstart = max(hstart, (int) 0);\n wstart = max(wstart, (int) 0);\n Dtype maxval = -FLT_MAX;\n int maxidx = -1;\n __global Dtype* bottom_data_ptr = bottom_data\n + (n * channels + c) * height * width;\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n if (bottom_data_ptr[h * width + w] > maxval) {\n maxidx = h * width + w;\n maxval = bottom_data_ptr[maxidx];\n }\n }\n }\n top_data[index] = maxval;\n if (use_mask == 1) {\n mask[index] = maxidx;\n } else {\n top_mask[index] = maxidx;\n }\n }\n}\n\n__kernel void TEMPLATE(max_pool_backward_sk,Dtype)(\n const int nthreads, __global const Dtype* top_diff, const int use_mask,\n __global const int* mask, __global const Dtype* top_mask, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, const int pad_h, const int pad_w,\n __global Dtype* bottom_diff) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n\n __global const int* mask_ptr = mask;\n __global const Dtype* top_diff_ptr = top_diff;\n\n// find out the local index\n// find out the local offset\n int w = index % width;\n int h = (index / width) % height;\n int c = (index / width / height) % channels;\n int n = index / width / height / channels;\n\n int pooled_height_1 = pooled_height - 1;\n int pooled_width_1 = pooled_width - 1;\n int phstart = (h < ext_kernel_h) ? h % kstride_h : (h - ext_kernel_h) + 1;\n int phend =\n (h >= pooled_height) ?\n pooled_height_1 - (pooled_height_1 - phstart) % kstride_h : h;\n int pwstart = (w < ext_kernel_w) ? w % kstride_w : (w - ext_kernel_w) + 1;\n int pwend =\n (w >= pooled_width) ?\n pooled_width_1 - (pooled_width_1 - pwstart) % kstride_w : w;\n\n Dtype gradient = 0;\n int offset = (n * channels + c) * pooled_height * pooled_width;\n top_diff_ptr += offset;\n if (use_mask == 1) {\n mask_ptr += offset;\n for (int ph = phstart; ph <= phend; ph += kstride_h) {\n for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n if (mask_ptr[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_ptr[ph * pooled_width + pw];\n }\n }\n }\n } else {\n for (int ph = phstart; ph <= phend; ph += kstride_h) {\n for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n if (top_mask[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_ptr[ph * pooled_width + pw];\n }\n }\n }\n }\n bottom_diff[index] = gradient;\n }\n}\n\n__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, const int pad_h, const int pad_w,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + ext_kernel_h, height + pad_h);\n int wend = min(wstart + ext_kernel_w, width + pad_w);\n hstart = max(hstart, 0);\n wstart = max(wstart, 0);\n hend = min(hend, height);\n wend = min(wend, width);\n Dtype aveval = 0;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n int pool_size = 0;\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n aveval += bottom_data_ptr[h * width + w];\n ++pool_size;\n }\n }\n top_data[index] = aveval / pool_size;\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, __global Dtype* rand_idx,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h;\n int hend = min(hstart + ext_kernel_h, height);\n int wstart = pw * stride_w;\n int wend = min(wstart + ext_kernel_w, width);\n Dtype cumsum = 0.;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n }\n }\n float thres = rand_idx[index] * cumsum;\n // Second pass: get value, and set index.\n cumsum = 0;\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n if (cumsum >= thres) {\n rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n top_data[index] = bottom_data_ptr[h * width + w];\n return;\n }\n }\n }\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h;\n int hend = min(hstart + ext_kernel_h, height);\n int wstart = pw * stride_w;\n int wend = min(wstart + ext_kernel_w, width);\n // We set cumsum to be 0 to avoid divide-by-zero problems\n Dtype cumsum = FLT_MIN;\n Dtype cumvalues = 0.;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n cumvalues += bottom_data_ptr[h * width + w]\n * bottom_data_ptr[h * width + w];\n }\n }\n top_data[index] = cumvalues / cumsum;\n }\n\n}"; // NOLINT +std::string pooling_sk_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int nthreads,\n__global Dtype* bottom_data,\n const int num,\n const int channels,\n const int height,\n const int width,\n const int pooled_height,\n const int pooled_width,\n const int kernel_h,\n const int kernel_w,\n const int ext_kernel_h,\n const int ext_kernel_w,\n const int stride_h,\n const int stride_w,\n const int kstride_h,\n const int kstride_w,\n const int pad_h,\n const int pad_w,\n __global Dtype* top_data,\n const int use_mask,\n __global int* mask,\n __global Dtype* top_mask) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + ext_kernel_h, height);\n int wend = min(wstart + ext_kernel_w, width);\n hstart = max(hstart, (int) 0);\n wstart = max(wstart, (int) 0);\n Dtype maxval = -FLT_MAX;\n int maxidx = -1;\n __global Dtype* bottom_data_ptr = bottom_data\n + (n * channels + c) * height * width;\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n if (bottom_data_ptr[h * width + w] > maxval) {\n maxidx = h * width + w;\n maxval = bottom_data_ptr[maxidx];\n }\n }\n }\n top_data[index] = maxval;\n if (use_mask == 1) {\n mask[index] = maxidx;\n } else {\n top_mask[index] = maxidx;\n }\n }\n}\n\n__kernel void TEMPLATE(max_pool_backward_sk,Dtype)(\n const int nthreads, __global const Dtype* top_diff, const int use_mask,\n __global const int* mask, __global const Dtype* top_mask, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, const int pad_h, const int pad_w,\n __global Dtype* bottom_diff) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n\n __global const int* mask_ptr = mask;\n __global const Dtype* top_diff_ptr = top_diff;\n\n// find out the local index\n// find out the local offset\n int w = index % width;\n int h = (index / width) % height;\n int c = (index / width / height) % channels;\n int n = index / width / height / channels;\n\n int pooled_height_1 = pooled_height - 1;\n int pooled_width_1 = pooled_width - 1;\n int phstart = (h < ext_kernel_h) ? h % kstride_h : (h - ext_kernel_h) + 1;\n int phend =\n (h >= pooled_height) ?\n pooled_height_1 - (pooled_height_1 - phstart) % kstride_h : h;\n int pwstart = (w < ext_kernel_w) ? w % kstride_w : (w - ext_kernel_w) + 1;\n int pwend =\n (w >= pooled_width) ?\n pooled_width_1 - (pooled_width_1 - pwstart) % kstride_w : w;\n\n Dtype gradient = 0;\n int offset = (n * channels + c) * pooled_height * pooled_width;\n top_diff_ptr += offset;\n if (use_mask == 1) {\n mask_ptr += offset;\n for (int ph = phstart; ph <= phend; ph += kstride_h) {\n for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n if (mask_ptr[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_ptr[ph * pooled_width + pw];\n }\n }\n }\n } else {\n for (int ph = phstart; ph <= phend; ph += kstride_h) {\n for (int pw = pwstart; pw <= pwend; pw += kstride_w) {\n if (top_mask[ph * pooled_width + pw] == h * width + w) {\n gradient += top_diff_ptr[ph * pooled_width + pw];\n }\n }\n }\n }\n bottom_diff[index] = gradient;\n }\n}\n\n__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, const int pad_h, const int pad_w,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h - pad_h;\n int wstart = pw * stride_w - pad_w;\n int hend = min(hstart + ext_kernel_h, height + pad_h);\n int wend = min(wstart + ext_kernel_w, width + pad_w);\n hstart = max(hstart, 0);\n wstart = max(wstart, 0);\n hend = min(hend, height);\n wend = min(wend, width);\n Dtype aveval = 0;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n int pool_size = 0;\n for (int h = hstart; h < hend; ++h) {\n for (int w = wstart; w < wend; ++w) {\n aveval += bottom_data_ptr[h * width + w];\n ++pool_size;\n }\n }\n top_data[index] = aveval / pool_size;\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w, __global Dtype* rand_idx,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h;\n int hend = min(hstart + ext_kernel_h, height);\n int wstart = pw * stride_w;\n int wend = min(wstart + ext_kernel_w, width);\n Dtype cumsum = 0.;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n }\n }\n float thres = rand_idx[index] * cumsum;\n // Second pass: get value, and set index.\n cumsum = 0;\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n if (cumsum >= thres) {\n rand_idx[index] = ((n * channels + c) * height + h) * width + w;\n top_data[index] = bottom_data_ptr[h * width + w];\n h = hend;\n w = wend;\n }\n }\n }\n }\n}\n\n__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(\n const int nthreads, __global const Dtype* bottom_data, const int num,\n const int channels, const int height, const int width,\n const int pooled_height, const int pooled_width, const int kernel_h,\n const int kernel_w, const int ext_kernel_h, const int ext_kernel_w,\n const int stride_h, const int stride_w, const int kstride_h,\n const int kstride_w,\n __global Dtype* top_data) {\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n int pw = index % pooled_width;\n int ph = (index / pooled_width) % pooled_height;\n int c = (index / pooled_width / pooled_height) % channels;\n int n = index / pooled_width / pooled_height / channels;\n int hstart = ph * stride_h;\n int hend = min(hstart + ext_kernel_h, height);\n int wstart = pw * stride_w;\n int wend = min(wstart + ext_kernel_w, width);\n // We set cumsum to be 0 to avoid divide-by-zero problems\n Dtype cumsum = FLT_MIN;\n Dtype cumvalues = 0.;\n __global const Dtype* bottom_data_ptr = bottom_data;\n bottom_data_ptr += (n * channels + c) * height * width;\n // First pass: get sum\n for (int h = hstart; h < hend; h += kstride_h) {\n for (int w = wstart; w < wend; w += kstride_w) {\n cumsum += bottom_data_ptr[h * width + w];\n cumvalues += bottom_data_ptr[h * width + w]\n * bottom_data_ptr[h * width + w];\n }\n }\n top_data[index] = cumvalues / cumsum;\n }\n\n}"; // NOLINT std::string slice_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(slice,Dtype)(const int nthreads,\n __global const Dtype* in_data,\n const int forward, const int num_slices,\n const int slice_size,\n const int bottom_slice_axis,\n const int top_slice_axis,\n const int offset_slice_axis,\n __global Dtype* out_data) {\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n const int total_slice_size = slice_size * top_slice_axis;\n const int slice_num = index / total_slice_size;\n const int slice_index = index % total_slice_size;\n const int bottom_index = slice_index\n + (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;\n if (forward == 1) {\n out_data[index] = in_data[bottom_index];\n } else {\n out_data[bottom_index] = in_data[index];\n }\n }\n}"; // NOLINT std::string softmax_loss_double = "#ifndef __OPENCL_VERSION__\n#include \"header.cl\"\n#endif\n\n__kernel void TEMPLATE(softmax_loss_forward,Dtype)(\n int n, __global const Dtype* prob_data, __global const Dtype* label,\n __global Dtype* loss,\n const int num, const int dim, const int spatial_dim,\n const int has_ignore_label_, const int ignore_label_,\n __global Dtype* counts) {\n\n for (int index = get_global_id(0); index < n; index += get_global_size(0)) {\n const int n = index / spatial_dim;\n const int s = index % spatial_dim;\n const int label_value = (int) (label[n * spatial_dim + s]);\n if (has_ignore_label_ == 1 && label_value == ignore_label_) {\n loss[index] = 0;\n counts[index] = 0;\n } else {\n loss[index] = -log(\n max((Dtype) (prob_data[n * dim + label_value * spatial_dim + s]),\n (Dtype) FLT_MIN));\n counts[index] = 1;\n }\n }\n}\n\n__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int nthreads,\n __global const Dtype* top,\n __global const Dtype* label,\n __global Dtype* bottom_diff,\n const int num,\n const int dim,\n const int spatial_dim,\n const int has_ignore_label_,\n const int ignore_label_,\n __global Dtype* counts) {\n\n const int channels = dim / spatial_dim;\n\n for (int index = get_global_id(0); index < nthreads;\n index += get_global_size(0)) {\n {\n const int n = index / spatial_dim;\n const int s = index % spatial_dim;\n const int label_value = (int) (label[n * spatial_dim + s]);\n\n if (has_ignore_label_ == 1 && label_value == ignore_label_) {\n for (int c = 0; c < channels; ++c) {\n bottom_diff[n * dim + c * spatial_dim + s] = 0;\n }\n counts[index] = 0;\n } else {\n bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;\n counts[index] = 1;\n }\n }\n }\n}"; // NOLINT viennacl::ocl::program & RegisterKernels(viennacl::ocl::context *ctx) { diff --git a/src/caffe/greentea/cl_kernels/pooling_sk.cl b/src/caffe/greentea/cl_kernels/pooling_sk.cl index 3b2108b870e..da98af58206 100644 --- a/src/caffe/greentea/cl_kernels/pooling_sk.cl +++ b/src/caffe/greentea/cl_kernels/pooling_sk.cl @@ -191,7 +191,8 @@ __kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)( if (cumsum >= thres) { rand_idx[index] = ((n * channels + c) * height + h) * width + w; top_data[index] = bottom_data_ptr[h * width + w]; - return; + h = hend; + w = wend; } } }