Merge pull request BVLC#178 from drnikolaev/caffe-0.15-multigpu-ws
Multi-GPU support for the GPUMemory::Workspace
drnikolaev authored Jun 24, 2016
2 parents f1b9845 + d0ed093 commit dce8bbf
Showing 4 changed files with 115 additions and 47 deletions.
include/caffe/layers/cudnn_conv_layer.hpp (4 changes: 2 additions & 2 deletions)
@@ -46,7 +46,7 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   // This is the workspace used by all Convolution layers one after another.
   // We carry it global to prevent unnecessary allocations/deallocations
   // because they hurt performance.
-  static GPUMemory::Workspace WORKSPACE;
+  static GPUMemory::MultiWorkspace WORKSPACE;
 
  public:
   explicit CuDNNConvolutionLayer(const LayerParameter& param)
@@ -114,7 +114,7 @@ const size_t CuDNNConvolutionLayer<Dtype>::INITIAL_WORKSPACE_SIZE =
     4*1024*1024;
 
 template<typename Dtype>
-GPUMemory::Workspace CuDNNConvolutionLayer<Dtype>::WORKSPACE;
+GPUMemory::MultiWorkspace CuDNNConvolutionLayer<Dtype>::WORKSPACE;
 
 template<typename Dtype>
 const float CuDNNConvolutionLayer<Dtype>::MAX_WORKSPACE_RATIO = 0.95F;
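For context, a minimal sketch of why this workspace is kept static and shared across layers, as the comment in the hunk above explains. This is illustrative only, not part of the commit; the sizes are hypothetical and the memory manager is assumed to be initialized beforehand (the CHECK in try_allocate below points at GPUMemory::Scope for that):

```cpp
// Illustrative sketch, not from this commit: one persistent workspace
// reused by successive convolution layers avoids per-iteration
// cudaMalloc/cudaFree churn, which is the rationale quoted above.
#include "caffe/util/gpu_memory.hpp"

void forward_all_layers() {
  static caffe::GPUMemory::MultiWorkspace ws;  // shared across layers
  ws.reserve(4 * 1024 * 1024);    // layer 1: allocates 4 MiB once
  // ... layer 1 runs cuDNN with ws.data() as scratch space ...
  ws.reserve(16 * 1024 * 1024);   // layer 2: grows the buffer once to 16 MiB
  // ... layer 2 runs ...
  ws.reserve(4 * 1024 * 1024);    // layer 3: no-op, capacity already suffices
}
```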
include/caffe/util/gpu_memory.hpp (95 changes: 69 additions & 26 deletions)
@@ -18,20 +18,22 @@ struct GPUMemory {
 
   template <class Any>
   static void allocate(Any** ptr, size_t size,
+                       int device = INVALID_DEVICE,
                        cudaStream_t stream = cudaStreamDefault) {
-    if (!try_allocate(reinterpret_cast<void**>(ptr), size, stream)) {
-      LOG(FATAL) << "Out of memory: failed to allocate " << size << " bytes";
+    if (!try_allocate(reinterpret_cast<void**>(ptr), size, device, stream)) {
+      LOG(FATAL) << "Out of memory: failed to allocate " << size
+                 << " bytes on device " << device;
     }
   }
 
-  static void deallocate(void* ptr,
+  static void deallocate(void* ptr, int device = INVALID_DEVICE,
                          cudaStream_t stream = cudaStreamDefault) {
-    mgr_.deallocate(ptr, stream);
+    mgr_.deallocate(ptr, device, stream);
   }
 
-  static bool try_allocate(void** ptr, size_t size,
+  static bool try_allocate(void** ptr, size_t size, int device = INVALID_DEVICE,
                            cudaStream_t stream = cudaStreamDefault) {
-    return mgr_.try_allocate(ptr, size, stream);
+    return mgr_.try_allocate(ptr, size, device, stream);
   }
 
   enum Mode {
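A minimal usage sketch of the extended allocation API above (hypothetical, not part of the diff; assumes the manager is initialized):

```cpp
#include "caffe/util/gpu_memory.hpp"

void allocation_example() {
  float* buf = NULL;
  // Omitting the device argument keeps INVALID_DEVICE, i.e. the manager
  // resolves the caller's current device.
  caffe::GPUMemory::allocate(&buf, 1024 * sizeof(float));
  // ... launch kernels using buf on the default stream ...
  caffe::GPUMemory::deallocate(buf);

  // Or pin the request to a specific device:
  float* buf1 = NULL;
  caffe::GPUMemory::allocate(&buf1, 2048 * sizeof(float), /*device=*/1);
  caffe::GPUMemory::deallocate(buf1, /*device=*/1);
}
```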
@@ -52,39 +54,49 @@
   // Workspace's release() functionality depends on global pool availability
   // If pool is available, it returns memory to the pool and sets ptr to NULL
   // If pool is not available, it retains memory.
+  // This is single GPU workspace. See MultiWorkspace for multi-GPU support.
   struct Workspace {
-    Workspace() : ptr_(NULL), stream_(), size_(0) {}
-    Workspace(size_t size, cudaStream_t s = cudaStreamDefault) : stream_(s) {
-      reserve(size);
+    Workspace()
+      : ptr_(NULL), size_(0), device_(INVALID_DEVICE),
+        stream_(cudaStreamDefault) {}
+    Workspace(size_t size, int device = INVALID_DEVICE,
+              cudaStream_t s = cudaStreamDefault)
+      : ptr_(NULL), size_(0), device_(device), stream_(s) {
+      reserve(size, device);
     }
-    ~Workspace() { mgr_.deallocate(ptr_, stream_); }
+    ~Workspace() { mgr_.deallocate(ptr_, device_, stream_); }
 
     void* data() const { return ptr_; }
     size_t size() const { return size_; }
+    int device() const { return device_; }
 
-    bool try_reserve(size_t size) {
+    bool try_reserve(size_t size, int device = INVALID_DEVICE) {
       bool status = true;
       if (size > size_) {
-        if (ptr_) {
-          mgr_.deallocate(ptr_, stream_);
+        if (ptr_ != NULL) {
+          mgr_.deallocate(ptr_, device_, stream_);
         }
-        status = mgr_.try_allocate(&ptr_, size, stream_);
+        if (device != INVALID_DEVICE) {
+          device_ = device;  // switch from default to specific one
+        }
+        status = mgr_.try_allocate(&ptr_, size, device_, stream_);
         if (status) {
          size_ = size;
        }
      }
      return status;
    }
 
-    void reserve(size_t size) {
-      if (!try_reserve(size)) {
-        LOG(FATAL) << "Out of memory: failed to allocate " << size << " bytes";
+    void reserve(size_t size, int device = INVALID_DEVICE) {
+      if (!try_reserve(size, device)) {
+        LOG(FATAL) << "Out of memory: failed to allocate " << size
+                   << " bytes on device " << device;
       }
     }
 
     void release() {
-      if (mgr_.using_pool()) {
-        mgr_.deallocate(ptr_, stream_);
+      if (mgr_.using_pool() && ptr_ != NULL) {
+        mgr_.deallocate(ptr_, device_, stream_);
         ptr_ = NULL;
         size_ = 0;
       }
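The release() semantics documented above, sketched (hypothetical usage, not from this commit):

```cpp
void workspace_example() {
  caffe::GPUMemory::Workspace ws;
  ws.reserve(8 * 1024 * 1024, /*device=*/0);  // 8 MiB on GPU 0; LOG(FATAL) on OOM
  void* scratch = ws.data();                  // scratch buffer for cuDNN calls
  // ... use scratch ...
  ws.release();
  // With a pooling allocator (CUB): the memory went back to the pool, so
  // ws.data() is now NULL and ws.size() is 0. Without a pool: the buffer
  // is retained and only freed by the destructor.
}
```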
@@ -93,17 +105,46 @@
 
    private:
     void* ptr_;
-    cudaStream_t stream_;
     size_t size_;
+    int device_;
+    cudaStream_t stream_;
 
     DISABLE_COPY_AND_ASSIGN(Workspace);
   };
 
+  // This implementation maintains workspaces on per-GPU basis.
+  struct MultiWorkspace {
+    bool try_reserve(size_t size) {
+      return current_workspace()->try_reserve(size);
+    }
+    void reserve(size_t size) {
+      current_workspace()->reserve(size);
+    }
+    void release() {
+      current_workspace()->release();
+    }
+    void* data() const {
+      return current_workspace()->data();
+    }
+    size_t size() const {
+      return current_workspace()->size();
+    }
+    int device() const {
+      return current_workspace()->device();
+    }
+
+   private:
+    shared_ptr<Workspace> current_workspace() const;
+    mutable vector<shared_ptr<Workspace> > ws_;
+  };
+
  private:
   struct Manager {
     Manager();
     ~Manager();
     void GetInfo(size_t* free_mem, size_t* used_mem);
-    void deallocate(void* ptr, cudaStream_t stream);
-    bool try_allocate(void** ptr, size_t size, cudaStream_t);
+    void deallocate(void* ptr, int device, cudaStream_t stream);
+    bool try_allocate(void** ptr, size_t size, int device, cudaStream_t);
     const char* pool_name() const;
     bool using_pool() const { return mode_ != CUDA_MALLOC; }
     void init(const std::vector<int>&, Mode, bool);
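What MultiWorkspace buys, sketched (hypothetical usage, not from this commit; CUDA_CHECK and CHECK_EQ as used elsewhere in Caffe):

```cpp
// One MultiWorkspace instance transparently serves two GPUs: every call
// operates on the workspace that belongs to the *current* CUDA device.
void multi_gpu_example() {
  caffe::GPUMemory::MultiWorkspace mws;

  CUDA_CHECK(cudaSetDevice(0));
  mws.reserve(4 * 1024 * 1024);    // creates the GPU 0 workspace (4 MiB)

  CUDA_CHECK(cudaSetDevice(1));
  mws.reserve(16 * 1024 * 1024);   // independent 16 MiB workspace on GPU 1

  CUDA_CHECK(cudaSetDevice(0));
  CHECK_EQ(mws.size(), 4 * 1024 * 1024);  // GPU 0's buffer was untouched
}
```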
@@ -125,12 +166,14 @@ struct GPUMemory {
     bool initialized_;
     cub::CachingDeviceAllocator* cub_allocator_;
 
-    static unsigned int BIN_GROWTH;  ///< Geometric growth factor for bin-sizes
-    static unsigned int MIN_BIN;  ///< Minimum bin
-    static unsigned int MAX_BIN;  ///< Maximum bin
-    static size_t MAX_CACHED_BYTES;  ///< Maximum aggregate cached bytes
+    static const unsigned int BIN_GROWTH;  ///< Geometric growth factor
+    static const unsigned int MIN_BIN;  ///< Minimum bin
+    static const unsigned int MAX_BIN;  ///< Maximum bin
+    static const size_t MAX_CACHED_BYTES;  ///< Maximum aggregate cached bytes
   };
 
+  static const int INVALID_DEVICE;  ///< Default is invalid: CUB takes care
+
   static Manager mgr_;
 };

src/caffe/layers/cudnn_conv_layer.cpp (2 changes: 0 additions & 2 deletions)
@@ -90,8 +90,6 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
   use_reshape_ = true;
   // When true, cached bottom and conv descriptors need to be set.
   initialized_cached_descs_ = false;
-  // In case of reusing it
-  WORKSPACE.release();
 }
 
 template <typename Dtype>
src/caffe/util/gpu_memory.cpp (61 changes: 44 additions & 17 deletions)
@@ -9,10 +9,12 @@
 namespace caffe {
 using std::vector;
 
-unsigned int GPUMemory::Manager::BIN_GROWTH = 2;
-unsigned int GPUMemory::Manager::MIN_BIN = 6;
-unsigned int GPUMemory::Manager::MAX_BIN = 22;
-size_t GPUMemory::Manager::MAX_CACHED_BYTES = (size_t) -1;
+const int GPUMemory::INVALID_DEVICE =
+    cub::CachingDeviceAllocator::INVALID_DEVICE_ORDINAL;
+const unsigned int GPUMemory::Manager::BIN_GROWTH = 2;
+const unsigned int GPUMemory::Manager::MIN_BIN = 6;
+const unsigned int GPUMemory::Manager::MAX_BIN = 22;
+const size_t GPUMemory::Manager::MAX_CACHED_BYTES = (size_t) -1;
 
 GPUMemory::Manager GPUMemory::mgr_;
 
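For reference, cub::CachingDeviceAllocator treats these constants as a bin geometry: cached block sizes are BIN_GROWTH^bin bytes, so with the defaults above bins run from 2^6 = 64 bytes (MIN_BIN) up to 2^22 = 4 MiB (MAX_BIN), and a MAX_CACHED_BYTES of (size_t)-1 effectively removes the aggregate cache cap. This reading is based on CUB's documented behavior, not on anything stated in this diff.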
@@ -61,30 +63,40 @@ GPUMemory::Manager::~Manager() {
   }
 }
 
-bool GPUMemory::Manager::try_allocate(void** ptr, size_t size,
+bool GPUMemory::Manager::try_allocate(void** ptr, size_t size, int device,
                                       cudaStream_t stream) {
   CHECK(initialized_) << "Create GPUMemory::Scope to initialize Memory Manager";
   CHECK_NOTNULL(ptr);
   cudaError_t status = cudaSuccess, last_err = cudaSuccess;
   switch (mode_) {
   case CUB_ALLOCATOR:
     // Clean Cache & Retry logic is inside now
-    status = cub_allocator_->DeviceAllocate(ptr, size, stream);
+    status = cub_allocator_->DeviceAllocate(device, ptr, size, stream);
     // If there was a retry and it succeeded we get good status here but
     // we need to clean up last error...
     last_err = cudaGetLastError();
     // ...and update the dev info if something was wrong
     if (status != cudaSuccess || last_err != cudaSuccess) {
-      int cur_device;
-      CUDA_CHECK(cudaGetDevice(&cur_device));
-      // Refresh per-device saved values.
-      for (int i = 0; i < dev_info_.size(); ++i) {
-        // only query devices that were initialized
-        if (dev_info_[i].total_) {
-          update_dev_info(i);
-          // record which device caused cache flush
-          if (i == cur_device) {
-            dev_info_[i].flush_count_++;
+      // If we know what particular device failed we update its info only
+      if (device > INVALID_DEVICE && device < dev_info_.size()) {
+        // only query devices that were initialized
+        if (dev_info_[device].total_) {
+          update_dev_info(device);
+          dev_info_[device].flush_count_++;
+        }
+      } else {
+        // Update them all otherwise
+        int cur_device;
+        CUDA_CHECK(cudaGetDevice(&cur_device));
+        // Refresh per-device saved values.
+        for (int i = 0; i < dev_info_.size(); ++i) {
+          // only query devices that were initialized
+          if (dev_info_[i].total_) {
+            update_dev_info(i);
+            // record which device caused cache flush
+            if (i == cur_device) {
+              dev_info_[i].flush_count_++;
+            }
           }
         }
       }
@@ -97,14 +109,15 @@ bool GPUMemory::Manager::try_allocate(void** ptr, size_t size,
   return status == cudaSuccess;
 }
 
-void GPUMemory::Manager::deallocate(void* ptr, cudaStream_t stream) {
+void GPUMemory::Manager::deallocate(void* ptr, int device,
+                                    cudaStream_t stream) {
   // allow for null pointer deallocation
   if (!ptr) {
     return;
   }
   switch (mode_) {
   case CUB_ALLOCATOR:
-    CUDA_CHECK(cub_allocator_->DeviceFree(ptr));
+    CUDA_CHECK(cub_allocator_->DeviceFree(device, ptr));
     break;
   default:
     CUDA_CHECK(cudaFree(ptr));
@@ -165,6 +178,20 @@ void GPUMemory::Manager::GetInfo(size_t* free_mem, size_t* total_mem) {
   }
 }
 
+shared_ptr<GPUMemory::Workspace>
+GPUMemory::MultiWorkspace::current_workspace() const {
+  int current_device;
+  CUDA_CHECK(cudaGetDevice(&current_device));
+  if (current_device + 1 > ws_.size()) {
+    ws_.resize(current_device + 1);
+  }
+  if (!ws_[current_device]) {  // e.g. when devices run out of order: --gpu=1,0
+    ws_[current_device].reset(
+        new GPUMemory::Workspace(0, current_device));
+  }
+  return ws_[current_device];
+}
+
 }  // namespace caffe
 
 #endif  // CPU_ONLY
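A worked trace of current_workspace() for the out-of-order device case its comment mentions (hypothetical, not from this commit):

```cpp
// With --gpu=1,0, device 1 runs first, so ws_ grows to size 2 with
// slot 0 still empty; the slot is then filled lazily when device 0
// becomes current for the first time.
void out_of_order_devices() {
  caffe::GPUMemory::MultiWorkspace mws;

  CUDA_CHECK(cudaSetDevice(1));   // device 1 runs first
  mws.reserve(1 << 20);           // ws_.resize(2): slot 1 created, slot 0 null

  CUDA_CHECK(cudaSetDevice(0));   // later, device 0 becomes current
  mws.reserve(1 << 20);           // ws_[0] was null and is created only now
}
```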
