Merge pull request BVLC#178 from drnikolaev/caffe-0.15-multigpu-ws
Multi-GPU support for the GPUMemory::Workspace
drnikolaev authored Jun 24, 2016
2 parents f1b9845 + d0ed093 commit dce8bbf
Showing 4 changed files with 115 additions and 47 deletions.
include/caffe/layers/cudnn_conv_layer.hpp (4 changes: 2 additions & 2 deletions)
@@ -46,7 +46,7 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   // This is the workspace used by all Convolution layers one after another.
   // We carry it global to prevent unnecessary allocations/deallocations
   // because they hurt performance.
-  static GPUMemory::Workspace WORKSPACE;
+  static GPUMemory::MultiWorkspace WORKSPACE;
 
  public:
   explicit CuDNNConvolutionLayer(const LayerParameter& param)
@@ -114,7 +114,7 @@ const size_t CuDNNConvolutionLayer<Dtype>::INITIAL_WORKSPACE_SIZE =
     4*1024*1024;
 
 template<typename Dtype>
-GPUMemory::Workspace CuDNNConvolutionLayer<Dtype>::WORKSPACE;
+GPUMemory::MultiWorkspace CuDNNConvolutionLayer<Dtype>::WORKSPACE;
 
 template<typename Dtype>
 const float CuDNNConvolutionLayer<Dtype>::MAX_WORKSPACE_RATIO = 0.95F;
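For context, a minimal sketch of why this workspace is kept static and shared across layers, as the comment in the hunk above explains. This is illustrative only, not part of the commit; the sizes are hypothetical and the memory manager is assumed to be initialized beforehand (the CHECK in try_allocate below points at GPUMemory::Scope for that):

```cpp
// Illustrative sketch, not from this commit: one persistent workspace
// reused by successive convolution layers avoids per-iteration
// cudaMalloc/cudaFree churn, which is the rationale quoted above.
#include "caffe/util/gpu_memory.hpp"

void forward_all_layers() {
  static caffe::GPUMemory::MultiWorkspace ws;  // shared across layers
  ws.reserve(4 * 1024 * 1024);    // layer 1: allocates 4 MiB once
  // ... layer 1 runs cuDNN with ws.data() as scratch space ...
  ws.reserve(16 * 1024 * 1024);   // layer 2: grows the buffer once to 16 MiB
  // ... layer 2 runs ...
  ws.reserve(4 * 1024 * 1024);    // layer 3: no-op, capacity already suffices
}
```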
include/caffe/util/gpu_memory.hpp (95 changes: 69 additions & 26 deletions)
@@ -18,20 +18,22 @@ struct GPUMemory {
 
   template <class Any>
   static void allocate(Any** ptr, size_t size,
+                       int device = INVALID_DEVICE,
                        cudaStream_t stream = cudaStreamDefault) {
-    if (!try_allocate(reinterpret_cast<void**>(ptr), size, stream)) {
-      LOG(FATAL) << "Out of memory: failed to allocate " << size << " bytes";
+    if (!try_allocate(reinterpret_cast<void**>(ptr), size, device, stream)) {
+      LOG(FATAL) << "Out of memory: failed to allocate " << size
+                 << " bytes on device " << device;
     }
   }
 
-  static void deallocate(void* ptr,
+  static void deallocate(void* ptr, int device = INVALID_DEVICE,
                          cudaStream_t stream = cudaStreamDefault) {
-    mgr_.deallocate(ptr, stream);
+    mgr_.deallocate(ptr, device, stream);
   }
 
-  static bool try_allocate(void** ptr, size_t size,
+  static bool try_allocate(void** ptr, size_t size, int device = INVALID_DEVICE,
                            cudaStream_t stream = cudaStreamDefault) {
-    return mgr_.try_allocate(ptr, size, stream);
+    return mgr_.try_allocate(ptr, size, device, stream);
   }
 
   enum Mode {
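A minimal usage sketch of the extended allocation API above (hypothetical, not part of the diff; assumes the manager is initialized):

```cpp
#include "caffe/util/gpu_memory.hpp"

void allocation_example() {
  float* buf = NULL;
  // Omitting the device argument keeps INVALID_DEVICE, i.e. the manager
  // resolves the caller's current device.
  caffe::GPUMemory::allocate(&buf, 1024 * sizeof(float));
  // ... launch kernels using buf on the default stream ...
  caffe::GPUMemory::deallocate(buf);

  // Or pin the request to a specific device:
  float* buf1 = NULL;
  caffe::GPUMemory::allocate(&buf1, 2048 * sizeof(float), /*device=*/1);
  caffe::GPUMemory::deallocate(buf1, /*device=*/1);
}
```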
@@ -52,39 +54,49 @@
   // Workspace's release() functionality depends on global pool availability
   // If pool is available, it returns memory to the pool and sets ptr to NULL
   // If pool is not available, it retains memory.
+  // This is single GPU workspace. See MultiWorkspace for multi-GPU support.
   struct Workspace {
-    Workspace() : ptr_(NULL), stream_(), size_(0) {}
-    Workspace(size_t size, cudaStream_t s = cudaStreamDefault) : stream_(s) {
-      reserve(size);
+    Workspace()
+      : ptr_(NULL), size_(0), device_(INVALID_DEVICE),
+        stream_(cudaStreamDefault) {}
+    Workspace(size_t size, int device = INVALID_DEVICE,
+              cudaStream_t s = cudaStreamDefault)
+      : ptr_(NULL), size_(0), device_(device), stream_(s) {
+      reserve(size, device);
     }
-    ~Workspace() { mgr_.deallocate(ptr_, stream_); }
+    ~Workspace() { mgr_.deallocate(ptr_, device_, stream_); }
 
     void* data() const { return ptr_; }
     size_t size() const { return size_; }
+    int device() const { return device_; }
 
-    bool try_reserve(size_t size) {
+    bool try_reserve(size_t size, int device = INVALID_DEVICE) {
       bool status = true;
       if (size > size_) {
-        if (ptr_) {
-          mgr_.deallocate(ptr_, stream_);
+        if (ptr_ != NULL) {
+          mgr_.deallocate(ptr_, device_, stream_);
         }
-        status = mgr_.try_allocate(&ptr_, size, stream_);
+        if (device != INVALID_DEVICE) {
+          device_ = device;  // switch from default to specific one
+        }
+        status = mgr_.try_allocate(&ptr_, size, device_, stream_);
         if (status) {
          size_ = size;
        }
      }
      return status;
    }
 
-    void reserve(size_t size) {
-      if (!try_reserve(size)) {
-        LOG(FATAL) << "Out of memory: failed to allocate " << size << " bytes";
+    void reserve(size_t size, int device = INVALID_DEVICE) {
+      if (!try_reserve(size, device)) {
+        LOG(FATAL) << "Out of memory: failed to allocate " << size
+                   << " bytes on device " << device;
       }
     }
 
     void release() {
-      if (mgr_.using_pool()) {
-        mgr_.deallocate(ptr_, stream_);
+      if (mgr_.using_pool() && ptr_ != NULL) {
+        mgr_.deallocate(ptr_, device_, stream_);
         ptr_ = NULL;
         size_ = 0;
       }
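The release() semantics documented above, sketched (hypothetical usage, not from this commit):

```cpp
void workspace_example() {
  caffe::GPUMemory::Workspace ws;
  ws.reserve(8 * 1024 * 1024, /*device=*/0);  // 8 MiB on GPU 0; LOG(FATAL) on OOM
  void* scratch = ws.data();                  // scratch buffer for cuDNN calls
  // ... use scratch ...
  ws.release();
  // With a pooling allocator (CUB): the memory went back to the pool, so
  // ws.data() is now NULL and ws.size() is 0. Without a pool: the buffer
  // is retained and only freed by the destructor.
}
```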
@@ -93,17 +105,46 @@
 
    private:
     void* ptr_;
-    cudaStream_t stream_;
     size_t size_;
+    int device_;
+    cudaStream_t stream_;
 
     DISABLE_COPY_AND_ASSIGN(Workspace);
   };
 
+  // This implementation maintains workspaces on per-GPU basis.
+  struct MultiWorkspace {
+    bool try_reserve(size_t size) {
+      return current_workspace()->try_reserve(size);
+    }
+    void reserve(size_t size) {
+      current_workspace()->reserve(size);
+    }
+    void release() {
+      current_workspace()->release();
+    }
+    void* data() const {
+      return current_workspace()->data();
+    }
+    size_t size() const {
+      return current_workspace()->size();
+    }
+    int device() const {
+      return current_workspace()->device();
+    }
+
+   private:
+    shared_ptr<Workspace> current_workspace() const;
+    mutable vector<shared_ptr<Workspace> > ws_;
+  };
+
  private:
   struct Manager {
     Manager();
     ~Manager();
     void GetInfo(size_t* free_mem, size_t* used_mem);
-    void deallocate(void* ptr, cudaStream_t stream);
-    bool try_allocate(void** ptr, size_t size, cudaStream_t);
+    void deallocate(void* ptr, int device, cudaStream_t stream);
+    bool try_allocate(void** ptr, size_t size, int device, cudaStream_t);
     const char* pool_name() const;
     bool using_pool() const { return mode_ != CUDA_MALLOC; }
     void init(const std::vector<int>&, Mode, bool);
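What MultiWorkspace buys, sketched (hypothetical usage, not from this commit; CUDA_CHECK and CHECK_EQ as used elsewhere in Caffe):

```cpp
// One MultiWorkspace instance transparently serves two GPUs: every call
// operates on the workspace that belongs to the *current* CUDA device.
void multi_gpu_example() {
  caffe::GPUMemory::MultiWorkspace mws;

  CUDA_CHECK(cudaSetDevice(0));
  mws.reserve(4 * 1024 * 1024);    // creates the GPU 0 workspace (4 MiB)

  CUDA_CHECK(cudaSetDevice(1));
  mws.reserve(16 * 1024 * 1024);   // independent 16 MiB workspace on GPU 1

  CUDA_CHECK(cudaSetDevice(0));
  CHECK_EQ(mws.size(), 4 * 1024 * 1024);  // GPU 0's buffer was untouched
}
```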
@@ -125,12 +166,14 @@ struct GPUMemory {
     bool initialized_;
     cub::CachingDeviceAllocator* cub_allocator_;
 
-    static unsigned int BIN_GROWTH;  ///< Geometric growth factor for bin-sizes
-    static unsigned int MIN_BIN;  ///< Minimum bin
-    static unsigned int MAX_BIN;  ///< Maximum bin
-    static size_t MAX_CACHED_BYTES;  ///< Maximum aggregate cached bytes
+    static const unsigned int BIN_GROWTH;  ///< Geometric growth factor
+    static const unsigned int MIN_BIN;  ///< Minimum bin
+    static const unsigned int MAX_BIN;  ///< Maximum bin
+    static const size_t MAX_CACHED_BYTES;  ///< Maximum aggregate cached bytes
   };
 
+  static const int INVALID_DEVICE;  ///< Default is invalid: CUB takes care
+
   static Manager mgr_;
 };

src/caffe/layers/cudnn_conv_layer.cpp (2 changes: 0 additions & 2 deletions)
@@ -90,8 +90,6 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
   use_reshape_ = true;
   // When true, cached bottom and conv descriptors need to be set.
   initialized_cached_descs_ = false;
-  // In case of reusing it
-  WORKSPACE.release();
 }
 
 template <typename Dtype>
src/caffe/util/gpu_memory.cpp (61 changes: 44 additions & 17 deletions)
@@ -9,10 +9,12 @@
 namespace caffe {
 using std::vector;
 
-unsigned int GPUMemory::Manager::BIN_GROWTH = 2;
-unsigned int GPUMemory::Manager::MIN_BIN = 6;
-unsigned int GPUMemory::Manager::MAX_BIN = 22;
-size_t GPUMemory::Manager::MAX_CACHED_BYTES = (size_t) -1;
+const int GPUMemory::INVALID_DEVICE =
+    cub::CachingDeviceAllocator::INVALID_DEVICE_ORDINAL;
+const unsigned int GPUMemory::Manager::BIN_GROWTH = 2;
+const unsigned int GPUMemory::Manager::MIN_BIN = 6;
+const unsigned int GPUMemory::Manager::MAX_BIN = 22;
+const size_t GPUMemory::Manager::MAX_CACHED_BYTES = (size_t) -1;
 
 GPUMemory::Manager GPUMemory::mgr_;
 
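For reference, cub::CachingDeviceAllocator treats these constants as a bin geometry: cached block sizes are BIN_GROWTH^bin bytes, so with the defaults above bins run from 2^6 = 64 bytes (MIN_BIN) up to 2^22 = 4 MiB (MAX_BIN), and a MAX_CACHED_BYTES of (size_t)-1 effectively removes the aggregate cache cap. This reading is based on CUB's documented behavior, not on anything stated in this diff.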
@@ -61,30 +63,40 @@ GPUMemory::Manager::~Manager() {
   }
 }
 
-bool GPUMemory::Manager::try_allocate(void** ptr, size_t size,
+bool GPUMemory::Manager::try_allocate(void** ptr, size_t size, int device,
                                       cudaStream_t stream) {
   CHECK(initialized_) << "Create GPUMemory::Scope to initialize Memory Manager";
   CHECK_NOTNULL(ptr);
   cudaError_t status = cudaSuccess, last_err = cudaSuccess;
   switch (mode_) {
   case CUB_ALLOCATOR:
     // Clean Cache & Retry logic is inside now
-    status = cub_allocator_->DeviceAllocate(ptr, size, stream);
+    status = cub_allocator_->DeviceAllocate(device, ptr, size, stream);
     // If there was a retry and it succeeded we get good status here but
     // we need to clean up last error...
     last_err = cudaGetLastError();
     // ...and update the dev info if something was wrong
     if (status != cudaSuccess || last_err != cudaSuccess) {
-      int cur_device;
-      CUDA_CHECK(cudaGetDevice(&cur_device));
-      // Refresh per-device saved values.
-      for (int i = 0; i < dev_info_.size(); ++i) {
-        // only query devices that were initialized
-        if (dev_info_[i].total_) {
-          update_dev_info(i);
-          // record which device caused cache flush
-          if (i == cur_device) {
-            dev_info_[i].flush_count_++;
+      // If we know what particular device failed we update its info only
+      if (device > INVALID_DEVICE && device < dev_info_.size()) {
+        // only query devices that were initialized
+        if (dev_info_[device].total_) {
+          update_dev_info(device);
+          dev_info_[device].flush_count_++;
+        }
+      } else {
+        // Update them all otherwise
+        int cur_device;
+        CUDA_CHECK(cudaGetDevice(&cur_device));
+        // Refresh per-device saved values.
+        for (int i = 0; i < dev_info_.size(); ++i) {
+          // only query devices that were initialized
+          if (dev_info_[i].total_) {
+            update_dev_info(i);
+            // record which device caused cache flush
+            if (i == cur_device) {
+              dev_info_[i].flush_count_++;
+            }
           }
         }
       }
@@ -97,14 +109,15 @@ bool GPUMemory::Manager::try_allocate(void** ptr, size_t size,
   return status == cudaSuccess;
 }
 
-void GPUMemory::Manager::deallocate(void* ptr, cudaStream_t stream) {
+void GPUMemory::Manager::deallocate(void* ptr, int device,
+                                    cudaStream_t stream) {
   // allow for null pointer deallocation
   if (!ptr) {
     return;
   }
   switch (mode_) {
   case CUB_ALLOCATOR:
-    CUDA_CHECK(cub_allocator_->DeviceFree(ptr));
+    CUDA_CHECK(cub_allocator_->DeviceFree(device, ptr));
     break;
   default:
     CUDA_CHECK(cudaFree(ptr));
@@ -165,6 +178,20 @@ void GPUMemory::Manager::GetInfo(size_t* free_mem, size_t* total_mem) {
   }
 }
 
+shared_ptr<GPUMemory::Workspace>
+GPUMemory::MultiWorkspace::current_workspace() const {
+  int current_device;
+  CUDA_CHECK(cudaGetDevice(&current_device));
+  if (current_device + 1 > ws_.size()) {
+    ws_.resize(current_device + 1);
+  }
+  if (!ws_[current_device]) {  // e.g. when devices run out of order: --gpu=1,0
+    ws_[current_device].reset(
+        new GPUMemory::Workspace(0, current_device));
+  }
+  return ws_[current_device];
+}
+
 }  // namespace caffe
 
 #endif  // CPU_ONLY
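A worked trace of current_workspace() for the out-of-order device case its comment mentions (hypothetical, not from this commit):

```cpp
// With --gpu=1,0, device 1 runs first, so ws_ grows to size 2 with
// slot 0 still empty; the slot is then filled lazily when device 0
// becomes current for the first time.
void out_of_order_devices() {
  caffe::GPUMemory::MultiWorkspace mws;

  CUDA_CHECK(cudaSetDevice(1));   // device 1 runs first
  mws.reserve(1 << 20);           // ws_.resize(2): slot 1 created, slot 0 null

  CUDA_CHECK(cudaSetDevice(0));   // later, device 0 becomes current
  mws.reserve(1 << 20);           // ws_[0] was null and is created only now
}
```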
