TileLayer: add CUDA kernels

BVLC · Aug 19, 2015 · 22751c5 · 22751c5
1 parent fa81a0b
commit 22751c5
Showing 1 changed file with 39 additions and 14 deletions.
diff --git a/src/caffe/layers/tile_layer.cu b/src/caffe/layers/tile_layer.cu
@@ -6,17 +6,45 @@
 
 namespace caffe {
 
+template <typename Dtype>
+__global__ void Tile(const int nthreads, const Dtype* bottom_data,
+    const int tile_size, const int num_tiles, const int bottom_tile_axis,
+    Dtype* top_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int d = index % tile_size;
+    const int b = (index / tile_size / num_tiles) % bottom_tile_axis;
+    const int n = index / tile_size / num_tiles / bottom_tile_axis;
+    const int bottom_index = (n * bottom_tile_axis + b) * tile_size + d;
+    top_data[index] = bottom_data[bottom_index];
+  }
+}
+
 template <typename Dtype>
 void TileLayer<Dtype>::Forward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
-  for (int i = 0; i < outer_dim_; ++i) {
-    for (int t = 0; t < tiles_; ++t) {
-      caffe_copy(inner_dim_, bottom_data, top_data);
-      top_data += inner_dim_;
+  const int bottom_tile_axis = bottom[0]->shape(axis_);
+  const int nthreads = top[0]->count();
+  Tile<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
+      nthreads, bottom_data, inner_dim_, tiles_, bottom_tile_axis, top_data);
+}
+
+template <typename Dtype>
+__global__ void TileBackward(const int nthreads, const Dtype* top_diff,
+    const int tile_size, const int num_tiles, const int bottom_tile_axis,
+    Dtype* bottom_diff) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int d = index % tile_size;
+    const int b = (index / tile_size) % bottom_tile_axis;
+    const int n = index / tile_size / bottom_tile_axis;
+    bottom_diff[index] = 0;
+    int top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d;
+    for (int t = 0; t < num_tiles; ++t) {
+      bottom_diff[index] += top_diff[top_index];
+      top_index += bottom_tile_axis * tile_size;
     }
-    bottom_data += inner_dim_;
   }
 }
 
@@ -26,15 +54,12 @@ void TileLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (!propagate_down[0]) { return; }
   const Dtype* top_diff = top[0]->gpu_diff();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  for (int i = 0; i < outer_dim_; ++i) {
-    caffe_copy(inner_dim_, top_diff, bottom_diff);
-    top_diff += inner_dim_;
-    for (int t = 1; t < tiles_; ++t) {
-      caffe_gpu_axpy(inner_dim_, Dtype(1), top_diff, bottom_diff);
-      top_diff += inner_dim_;
-    }
-    bottom_diff += inner_dim_;
-  }
+  const int bottom_tile_axis = bottom[0]->shape(axis_);
+  const int tile_size = inner_dim_ / bottom_tile_axis;
+  const int nthreads = bottom[0]->count();
+  TileBackward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
+      nthreads, top_diff, tile_size, tiles_, bottom_tile_axis, bottom_diff);
 }
 
 INSTANTIATE_LAYER_GPU_FUNCS(TileLayer);