From add73fbea5767cc1f8a75d9c0715e334b9651335 Mon Sep 17 00:00:00 2001
From: Nuno Subtil
Date: Tue, 24 Mar 2015 15:48:16 -0700
Subject: [PATCH] Fall back to a different cuDNN algorithm when under memory
 pressure

CUDNN_CONVOLUTION_FWD_PREFER_FASTEST requires a lot of GPU memory, which
may not always be available. Add a fallback path that uses
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM when the allocation fails.
---
 src/caffe/layers/cudnn_conv_layer.cpp |  2 ++
 src/caffe/layers/cudnn_conv_layer.cu  | 19 +++++++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index 524caf1320f..104d2b9d669 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -24,6 +24,8 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
   // Initialize CUDA streams and cuDNN.
   stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
   handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
+  workspaceSizeInBytes = 0;
+  workspace = NULL;
 
   for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
     CUDA_CHECK(cudaStreamCreate(&stream_[g]));
diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu
index 08f5201bc22..4a1a4c4f4f2 100644
--- a/src/caffe/layers/cudnn_conv_layer.cu
+++ b/src/caffe/layers/cudnn_conv_layer.cu
@@ -19,6 +19,11 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     Dtype* top_data = top[i]->mutable_gpu_data();
     const Dtype* weight = this->blobs_[0]->gpu_data();
 
+    size_t workspace_limit_bytes = this->kernel_h_ *
+                                   this->kernel_w_ *
+                                   this->channels_ *
+                                   sizeof(int) + 1;
+
     // Forward through cuDNN in parallel over groups.
     for (int g = 0; g < this->group_; g++) {
       cudnnConvolutionFwdAlgo_t algo;
@@ -32,8 +37,8 @@
         filter_desc_,
         conv_descs_[i],
         top_descs_[i],
-        CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-        0,  // memoryLimitInBytes,
+        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_limit_bytes,  // memoryLimitInBytes,
         &algo));
 
       // get minimum size of the workspace needed for the desired algorithm
@@ -45,13 +50,19 @@
         conv_descs_[i],
         top_descs_[i],
         algo,
-        &workspaceSizeInBytes));
+        &workspaceSizeInBytes_temp));
 
       if (workspaceSizeInBytes_temp > workspaceSizeInBytes) {
         workspaceSizeInBytes = workspaceSizeInBytes_temp;
         // free the existing workspace and allocate a new (larger) one
         cudaFree(this->workspace);
-        cudaMalloc(&(this->workspace), workspaceSizeInBytes);
+        cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes);
+        if (err != cudaSuccess) {
+          // force zero memory path
+          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+          workspace = NULL;
+          workspaceSizeInBytes = 0;
+        }
       }
 
       // Filters.
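
For illustration, here is the allocation-fallback pattern the patch applies,
shown in isolation. This is a minimal sketch assuming cuDNN v2-era APIs;
ensure_workspace() is a hypothetical helper, not Caffe code. It grows the
shared workspace on demand and, when cudaMalloc fails, switches to
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, which cuDNN can run with a
zero-byte workspace.

    // Sketch only: the fallback pattern out of Caffe context.
    #include <cstddef>
    #include <cuda_runtime.h>
    #include <cudnn.h>

    // Grow *workspace to at least `required` bytes. On allocation failure,
    // drop to the zero-workspace implicit-GEMM algorithm instead of failing.
    static void ensure_workspace(void** workspace, size_t* allocated,
                                 size_t required,
                                 cudnnConvolutionFwdAlgo_t* algo) {
      if (required <= *allocated)
        return;  // the current buffer is already large enough

      cudaFree(*workspace);  // release the old (smaller) buffer first
      if (cudaMalloc(workspace, required) != cudaSuccess) {
        // Not enough free device memory: force the zero-memory path.
        *algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        *workspace = NULL;   // clear the now-dangling pointer
        *allocated = 0;
        cudaGetLastError();  // consume the allocation error
        return;
      }
      *allocated = required;
    }

Such a helper would be called between cudnnGetConvolutionForwardWorkspaceSize()
and cudnnConvolutionForward(), so the algorithm handed to the forward call
always matches the workspace that actually exists. Note that the ordering
matters: cudaFree() runs before the replacement allocation, so on failure the
stale pointer must be cleared; NULL-ing it, as the patch does, keeps any later
cudaFree(workspace) a safe no-op.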