diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu index 8dbbb6aa606..91f5d85ddbf 100644 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ b/src/caffe/layers/cudnn_conv_layer.cu @@ -28,11 +28,19 @@ namespace caffe { const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* top_data = top[i]->mutable_gpu_data(); + // Test free space and force reshape if allocations have changed + size_t workspace_limit_bytes, total_memory; + gpu_memory::getInfo(&workspace_limit_bytes, &total_memory); + if (workspace_fwd_sizes_[i] > workspace_limit_bytes) { + this->Reshape(bottom, top); + } + + // !!!! Not safe if group_ > 1 !!!! + workspace.reserve(workspace_fwd_sizes_[i]); // Forward through cuDNN in parallel over groups. for (int g = 0; g < this->group_; g++) { - workspace.reserve(workspace_fwd_sizes_[i]); - // Filters. + // Filters. CUDNN_CHECK(cudnnConvFwd(Caffe::cudnn_handle(), cudnn::dataType::one, bottom_descs_[i], @@ -47,8 +55,6 @@ namespace caffe { top_descs_[i], top_data + top_offset_ * g)); - workspace.release(); - // Bias. if (this->bias_term_) { const Dtype* bias_data = this->blobs_[1]->gpu_data(); @@ -62,6 +68,7 @@ namespace caffe { } } + workspace.release(); // Synchronize the work across groups, each of which went into its own // stream, by launching an empty kernel into the default (null) stream. // NOLINT_NEXT_LINE(whitespace/operators) @@ -78,6 +85,7 @@ namespace caffe { const Dtype* weight = NULL; Dtype* weight_diff = NULL; + if (this->param_propagate_down_[0]) { weight = this->blobs_[0]->gpu_data(); weight_diff = this->blobs_[0]->mutable_gpu_diff(); @@ -89,69 +97,83 @@ namespace caffe { } for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - - // Backward through cuDNN in parallel over groups and gradients. - for (int g = 0; g < this->group_; g++) { - // Gradient w.r.t. bias. - if (this->bias_term_ && this->param_propagate_down_[1]) { - CUDNN_CHECK(cudnnConvBwdBias(Caffe::cudnn_handle(), - cudnn::dataType::one, - top_descs_[i], - top_diff + top_offset_ * g, - cudnn::dataType::one, - bias_desc_, - bias_diff + bias_offset_ * g)); + const Dtype* top_diff = top[i]->gpu_diff(); + + // Test free space and force reshape if allocations have changed + size_t workspace_limit_bytes, total_memory; + gpu_memory::getInfo(&workspace_limit_bytes, &total_memory); + if (workspace_bwd_filter_sizes_[i] > workspace_limit_bytes || + workspace_bwd_data_sizes_[i] > workspace_limit_bytes) { + this->Reshape(bottom, top); } - // Gradient w.r.t. weights. - if (this->param_propagate_down_[0]) { - workspace.reserve(workspace_bwd_filter_sizes_[i]); - const Dtype* bottom_data = bottom[i]->gpu_data(); - CUDNN_CHECK(cudnnConvBwdFilter(Caffe::cudnn_handle(), - cudnn::dataType::one, - bottom_descs_[i], - bottom_data + bottom_offset_ * g, - top_descs_[i], - top_diff + top_offset_ * g, - conv_descs_[i], - bwd_filter_algo_[i], - workspace.data(), - workspace.size(), - cudnn::dataType::one, - filter_desc_, - weight_diff + weight_offset_ * g)); - workspace.release(); + // To remove pressure on allocator, allocate the larger of the + // workspaces needed for the following steps + size_t workspace_reserve = workspace_bwd_filter_sizes_[i] > + workspace_bwd_data_sizes_[i] ? + workspace_bwd_filter_sizes_[i] : workspace_bwd_data_sizes_[i]; + + // !!!! Not safe if group_ > 1 !!!! + workspace.reserve(workspace_reserve); + + // Backward through cuDNN in parallel over groups and gradients. + for (int g = 0; g < this->group_; g++) { + // Gradient w.r.t. bias. + if (this->bias_term_ && this->param_propagate_down_[1]) { + CUDNN_CHECK(cudnnConvBwdBias(Caffe::cudnn_handle(), + cudnn::dataType::one, + top_descs_[i], + top_diff + top_offset_ * g, + cudnn::dataType::one, + bias_desc_, + bias_diff + bias_offset_ * g)); + } + + // Gradient w.r.t. weights. + if (this->param_propagate_down_[0]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + CUDNN_CHECK(cudnnConvBwdFilter(Caffe::cudnn_handle(), + cudnn::dataType::one, + bottom_descs_[i], + bottom_data + bottom_offset_ * g, + top_descs_[i], + top_diff + top_offset_ * g, + conv_descs_[i], + bwd_filter_algo_[i], + workspace.data(), + workspace.size(), + cudnn::dataType::one, + filter_desc_, + weight_diff + weight_offset_ * g)); + } + + // Gradient w.r.t. bottom data. + if (propagate_down[i]) { + if (weight == NULL) { + weight = this->blobs_[0]->gpu_data(); + } + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + CUDNN_CHECK(cudnnConvBwdData(Caffe::cudnn_handle(), + cudnn::dataType::one, + filter_desc_, + weight + this->weight_offset_ * g, + top_descs_[i], + top_diff + top_offset_ * g, + conv_descs_[i], + bwd_data_algo_[i], + workspace.data(), + workspace.size(), + cudnn::dataType::zero, + bottom_descs_[i], + bottom_diff + bottom_offset_ * g)); + } } - // Gradient w.r.t. bottom data. - if (propagate_down[i]) { - if (weight == NULL) { - weight = this->blobs_[0]->gpu_data(); - } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - workspace.reserve(workspace_bwd_data_sizes_[i]); - CUDNN_CHECK(cudnnConvBwdData(Caffe::cudnn_handle(), - cudnn::dataType::one, - filter_desc_, - weight + this->weight_offset_ * g, - top_descs_[i], - top_diff + top_offset_ * g, - conv_descs_[i], - bwd_data_algo_[i], - workspace.data(), - workspace.size(), - cudnn::dataType::zero, - bottom_descs_[i], - bottom_diff + bottom_offset_ * g)); - workspace.release(); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy)); + workspace.release(); + // Synchronize the work across groups, each of which went into its own + // stream, by launching an empty kernel into the default (null) stream. + // NOLINT_NEXT_LINE(whitespace/operators) + CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy)); } } diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index 5dd312153cc..65e9b1cc19c 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -131,7 +131,11 @@ namespace caffe { cubAlloc = new cub::CachingDeviceAllocator( 2, // defaults 6, - 16, + 32, // largest + // cached + // allocation + // becomes + // 2^32 here poolsize_, false, debug_);