diff --git a/src/caffe/layers/tile_layer.cu b/src/caffe/layers/tile_layer.cu index 3af8e2eb72f..7fd3bc47d0f 100644 --- a/src/caffe/layers/tile_layer.cu +++ b/src/caffe/layers/tile_layer.cu @@ -6,17 +6,45 @@ namespace caffe { +template +__global__ void Tile(const int nthreads, const Dtype* bottom_data, + const int tile_size, const int num_tiles, const int bottom_tile_axis, + Dtype* top_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int d = index % tile_size; + const int b = (index / tile_size / num_tiles) % bottom_tile_axis; + const int n = index / tile_size / num_tiles / bottom_tile_axis; + const int bottom_index = (n * bottom_tile_axis + b) * tile_size + d; + top_data[index] = bottom_data[bottom_index]; + } +} + template void TileLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - for (int i = 0; i < outer_dim_; ++i) { - for (int t = 0; t < tiles_; ++t) { - caffe_copy(inner_dim_, bottom_data, top_data); - top_data += inner_dim_; + const int bottom_tile_axis = bottom[0]->shape(axis_); + const int nthreads = top[0]->count(); + Tile // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + nthreads, bottom_data, inner_dim_, tiles_, bottom_tile_axis, top_data); +} + +template +__global__ void TileBackward(const int nthreads, const Dtype* top_diff, + const int tile_size, const int num_tiles, const int bottom_tile_axis, + Dtype* bottom_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int d = index % tile_size; + const int b = (index / tile_size) % bottom_tile_axis; + const int n = index / tile_size / bottom_tile_axis; + bottom_diff[index] = 0; + int top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d; + for (int t = 0; t < num_tiles; ++t) { + bottom_diff[index] += top_diff[top_index]; + top_index += bottom_tile_axis * tile_size; } - bottom_data += inner_dim_; } } @@ -26,15 +54,12 @@ void TileLayer::Backward_gpu(const vector*>& top, if (!propagate_down[0]) { return; } const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int i = 0; i < outer_dim_; ++i) { - caffe_copy(inner_dim_, top_diff, bottom_diff); - top_diff += inner_dim_; - for (int t = 1; t < tiles_; ++t) { - caffe_gpu_axpy(inner_dim_, Dtype(1), top_diff, bottom_diff); - top_diff += inner_dim_; - } - bottom_diff += inner_dim_; - } + const int bottom_tile_axis = bottom[0]->shape(axis_); + const int tile_size = inner_dim_ / bottom_tile_axis; + const int nthreads = bottom[0]->count(); + TileBackward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + nthreads, top_diff, tile_size, tiles_, bottom_tile_axis, bottom_diff); } INSTANTIATE_LAYER_GPU_FUNCS(TileLayer);