From add73fbea5767cc1f8a75d9c0715e334b9651335 Mon Sep 17 00:00:00 2001
From: Nuno Subtil
Date: Tue, 24 Mar 2015 15:48:16 -0700
Subject: [PATCH] Fall back to a different cuDNN algorithm when under memory
 pressure

CUDNN_CONVOLUTION_FWD_PREFER_FASTEST requires a lot of GPU memory, which
may not always be available. Add a fallback path that uses
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM when the allocation fails.
---
 src/caffe/layers/cudnn_conv_layer.cpp |  2 ++
 src/caffe/layers/cudnn_conv_layer.cu  | 19 +++++++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index 524caf1320f..104d2b9d669 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -24,6 +24,8 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
   // Initialize CUDA streams and cuDNN.
   stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
   handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
+  workspaceSizeInBytes = 0;
+  workspace = NULL;
 
   for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
     CUDA_CHECK(cudaStreamCreate(&stream_[g]));
diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu
index 08f5201bc22..4a1a4c4f4f2 100644
--- a/src/caffe/layers/cudnn_conv_layer.cu
+++ b/src/caffe/layers/cudnn_conv_layer.cu
@@ -19,6 +19,11 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     Dtype* top_data = top[i]->mutable_gpu_data();
     const Dtype* weight = this->blobs_[0]->gpu_data();
 
+    size_t workspace_limit_bytes = this->kernel_h_ *
+                                   this->kernel_w_ *
+                                   this->channels_ *
+                                   sizeof(int) + 1;
+
     // Forward through cuDNN in parallel over groups.
     for (int g = 0; g < this->group_; g++) {
       cudnnConvolutionFwdAlgo_t algo;
@@ -32,8 +37,8 @@
         filter_desc_,
         conv_descs_[i],
         top_descs_[i],
-        CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-        0,  // memoryLimitInBytes,
+        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_limit_bytes,  // memoryLimitInBytes,
         &algo));
 
       // get minimum size of the workspace needed for the desired algorithm
@@ -45,13 +50,19 @@
         conv_descs_[i],
         top_descs_[i],
         algo,
-        &workspaceSizeInBytes));
+        &workspaceSizeInBytes_temp));
 
       if (workspaceSizeInBytes_temp > workspaceSizeInBytes) {
         workspaceSizeInBytes = workspaceSizeInBytes_temp;
         // free the existing workspace and allocate a new (larger) one
         cudaFree(this->workspace);
-        cudaMalloc(&(this->workspace), workspaceSizeInBytes);
+        cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes);
+        if (err != cudaSuccess) {
+          // force zero memory path
+          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+          workspace = NULL;
+          workspaceSizeInBytes = 0;
+        }
       }
 
       // Filters.
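
For illustration, here is the allocation-fallback pattern the patch applies,
shown in isolation. This is a minimal sketch assuming cuDNN v2-era APIs;
ensure_workspace() is a hypothetical helper, not Caffe code. It grows the
shared workspace on demand and, when cudaMalloc fails, switches to
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, which cuDNN can run with a
zero-byte workspace.

    // Sketch only: the fallback pattern out of Caffe context.
    #include <cstddef>
    #include <cuda_runtime.h>
    #include <cudnn.h>

    // Grow *workspace to at least `required` bytes. On allocation failure,
    // drop to the zero-workspace implicit-GEMM algorithm instead of failing.
    static void ensure_workspace(void** workspace, size_t* allocated,
                                 size_t required,
                                 cudnnConvolutionFwdAlgo_t* algo) {
      if (required <= *allocated)
        return;  // the current buffer is already large enough

      cudaFree(*workspace);  // release the old (smaller) buffer first
      if (cudaMalloc(workspace, required) != cudaSuccess) {
        // Not enough free device memory: force the zero-memory path.
        *algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        *workspace = NULL;   // clear the now-dangling pointer
        *allocated = 0;
        cudaGetLastError();  // consume the allocation error
        return;
      }
      *allocated = required;
    }

Such a helper would be called between cudnnGetConvolutionForwardWorkspaceSize()
and cudnnConvolutionForward(), so the algorithm handed to the forward call
always matches the workspace that actually exists. Note that the ordering
matters: cudaFree() runs before the replacement allocation, so on failure the
stale pointer must be cleared; NULL-ing it, as the patch does, keeps any later
cudaFree(workspace) a safe no-op.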