Optimisations for gpu_hist.
* Use streams to overlap operations (see the sketch after this list).

* Reduce redundant calls to cudaSetDevice().

* ColumnSampler now uses HostDeviceVector to prevent repeatedly copying feature vectors to the device.
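As background for the first bullet: the classic pattern streams enable is overlapping a data transfer with compute by issuing them on different streams (host buffers must be pinned for the copy to be truly asynchronous). The sketch below is illustrative only, with made-up kernel and buffer names; it is not code from this commit.

#include <cuda_runtime.h>

__global__ void Scale(float *data, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= 2.0f;  // arbitrary stand-in for real work
}

int main() {
  const size_t n = 1 << 20;
  float *h_in, *d_a, *d_b;
  cudaMallocHost(&h_in, n * sizeof(float));  // pinned => async-capable copy
  cudaMalloc(&d_a, n * sizeof(float));
  cudaMalloc(&d_b, n * sizeof(float));

  cudaStream_t copy_stream, compute_stream;
  cudaStreamCreate(&copy_stream);
  cudaStreamCreate(&compute_stream);

  // The H2D copy on copy_stream can overlap with the kernel on
  // compute_stream, since the two streams carry independent work.
  cudaMemcpyAsync(d_a, h_in, n * sizeof(float), cudaMemcpyHostToDevice,
                  copy_stream);
  Scale<<<static_cast<int>((n + 255) / 256), 256, 0, compute_stream>>>(d_b, n);

  cudaDeviceSynchronize();
  cudaStreamDestroy(copy_stream);
  cudaStreamDestroy(compute_stream);
  cudaFreeHost(h_in);
  cudaFree(d_a);
  cudaFree(d_b);
  return 0;
}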
RAMitchell committed Mar 12, 2019
1 parent 6fb4c5e commit 55a1f37
Showing 8 changed files with 269 additions and 128 deletions.
45 changes: 36 additions & 9 deletions src/common/device_helpers.cuh
@@ -212,13 +212,23 @@ inline void LaunchN(int device_idx, size_t n, L lambda) {
    return;
  }

  safe_cuda(cudaSetDevice(device_idx));
  const int GRID_SIZE =
      static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
                                              lambda);
}

template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void LaunchN(int device_idx, size_t n, cudaStream_t stream, L lambda) {
  if (n == 0) {
    return;
  }

  const int GRID_SIZE =
      static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>(static_cast<size_t>(0),
                                                         n, lambda);
}
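For context, a minimal usage sketch of the new stream-taking overload: launching two independent element-wise updates on separate streams lets the device overlap them. This is hypothetical caller code, not part of this diff; it assumes d_a and d_b are device pointers of length n and that the caller has already selected device_idx.

// Hypothetical usage sketch (not part of this diff).
void OverlapExample(int device_idx, float *d_a, float *d_b, size_t n) {
  cudaStream_t stream_a, stream_b;
  dh::safe_cuda(cudaStreamCreate(&stream_a));
  dh::safe_cuda(cudaStreamCreate(&stream_b));

  // Independent launches on different streams may execute concurrently.
  dh::LaunchN(device_idx, n, stream_a,
              [=] __device__(size_t i) { d_a[i] *= 2.0f; });
  dh::LaunchN(device_idx, n, stream_b,
              [=] __device__(size_t i) { d_b[i] += 1.0f; });

  dh::safe_cuda(cudaStreamSynchronize(stream_a));
  dh::safe_cuda(cudaStreamSynchronize(stream_b));
  dh::safe_cuda(cudaStreamDestroy(stream_a));
  dh::safe_cuda(cudaStreamDestroy(stream_b));
}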
/*
 * Memory
 */
@@ -247,7 +257,6 @@ class DVec {
    ptr_ = static_cast<T *>(ptr);
    size_ = size;
    device_idx_ = device_idx;
    safe_cuda(cudaSetDevice(device_idx_));
  }

  DVec() : ptr_(NULL), size_(0), device_idx_(-1) {}
@@ -269,7 +278,6 @@ class DVec {

  std::vector<T> AsVector() const {
    std::vector<T> h_vector(Size());
    safe_cuda(cudaSetDevice(device_idx_));
    safe_cuda(cudaMemcpy(h_vector.data(), ptr_, Size() * sizeof(T),
                         cudaMemcpyDeviceToHost));
    return h_vector;
@@ -306,7 +314,6 @@ class DVec {
      throw std::runtime_error(
          "Cannot copy assign DVec to DVec, sizes are different");
    }
    safe_cuda(cudaSetDevice(this->DeviceIdx()));
    if (other.DeviceIdx() == this->DeviceIdx()) {
      dh::safe_cuda(cudaMemcpyAsync(this->Data(), other.Data(),
                                    other.Size() * sizeof(T),
@@ -324,7 +331,6 @@ class DVec {

  template <typename IterT>
  void copy(IterT begin, IterT end) {
    safe_cuda(cudaSetDevice(this->DeviceIdx()));
    if (end - begin != Size()) {
      LOG(FATAL) << "Cannot copy assign vector to DVec, sizes are different" <<
          " vector::Size(): " << end - begin << " DVec::Size(): " << Size();
@@ -333,7 +339,6 @@ class DVec {
  }

  void copy(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
    safe_cuda(cudaSetDevice(this->DeviceIdx()));
    if (end - begin != Size()) {
      throw std::runtime_error(
          "Cannot copy assign vector to dvec, sizes are different");
@@ -431,7 +436,6 @@ class BulkAllocator {

  char *AllocateDevice(int device_idx, size_t bytes, MemoryType t) {
    char *ptr;
    safe_cuda(cudaSetDevice(device_idx));
    safe_cuda(cudaMalloc(&ptr, bytes));
    return ptr;
  }
@@ -473,7 +477,6 @@ class BulkAllocator {
  ~BulkAllocator() {
    for (size_t i = 0; i < d_ptr_.size(); i++) {
      if (!(d_ptr_[i] == nullptr)) {
        safe_cuda(cudaSetDevice(device_idx_[i]));
        safe_cuda(cudaFree(d_ptr_[i]));
        d_ptr_[i] = nullptr;
      }
@@ -499,6 +502,31 @@ class BulkAllocator {
  }
};

// Keep track of pinned memory allocation
struct PinnedMemory {
  void *temp_storage{nullptr};
  size_t temp_storage_bytes{0};

  ~PinnedMemory() { Free(); }

  template <typename T>
  xgboost::common::Span<T> GetSpan(size_t size) {
    size_t num_bytes = size * sizeof(T);
    if (num_bytes > temp_storage_bytes) {
      Free();
      safe_cuda(cudaMallocHost(&temp_storage, num_bytes));
      temp_storage_bytes = num_bytes;
    }
    return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
  }

  void Free() {
    if (temp_storage != nullptr) {
      safe_cuda(cudaFreeHost(temp_storage));
    }
  }
};
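A hedged sketch of how this helper might be used: pinned (page-locked) host memory is what makes cudaMemcpyAsync genuinely asynchronous, so a reusable PinnedMemory scratch buffer pairs naturally with the stream overloads above. The names below (d_results, n_nodes, stream) are illustrative assumptions, not code from this commit.

// Illustrative only: read device results back through a reusable pinned
// buffer; the async copy can overlap with work on other streams.
void ReadBackExample(dh::PinnedMemory *pinned, const float *d_results,
                     size_t n_nodes, cudaStream_t stream) {
  auto h_results = pinned->GetSpan<float>(n_nodes);
  dh::safe_cuda(cudaMemcpyAsync(h_results.data(), d_results,
                                n_nodes * sizeof(float),
                                cudaMemcpyDeviceToHost, stream));
  dh::safe_cuda(cudaStreamSynchronize(stream));  // wait before reading host side
}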

// Keep track of cub library device allocation
struct CubMemory {
  void *d_temp_storage;
@@ -653,7 +681,6 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
                        OffsetT count, SegmentIterT segments,
                        OffsetT num_segments, FunctionT f) {
  typedef typename cub::CubVector<OffsetT, 2>::Type CoordinateT;
  dh::safe_cuda(cudaSetDevice(device_idx));
  const int BLOCK_THREADS = 256;
  const int ITEMS_PER_THREAD = 1;
  const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
39 changes: 22 additions & 17 deletions src/common/random.h
@@ -18,6 +18,7 @@
#include <random>

#include "io.h"
#include "host_device_vector.h"

namespace xgboost {
namespace common {
@@ -84,28 +85,31 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
 */

class ColumnSampler {
  std::shared_ptr<std::vector<int>> feature_set_tree_;
  std::map<int, std::shared_ptr<std::vector<int>>> feature_set_level_;
  std::shared_ptr<HostDeviceVector<int>> feature_set_tree_;
  std::map<int, std::shared_ptr<HostDeviceVector<int>>> feature_set_level_;
  float colsample_bylevel_{1.0f};
  float colsample_bytree_{1.0f};
  float colsample_bynode_{1.0f};

  std::shared_ptr<std::vector<int>> ColSample
  (std::shared_ptr<std::vector<int>> p_features, float colsample) const {
  std::shared_ptr<HostDeviceVector<int>> ColSample
  (std::shared_ptr<HostDeviceVector<int>> p_features, float colsample) const {
    if (colsample == 1.0f) return p_features;
    const auto& features = *p_features;
    const auto& features = p_features->HostVector();
    CHECK_GT(features.size(), 0);
    int n = std::max(1, static_cast<int>(colsample * features.size()));
    auto p_new_features = std::make_shared<std::vector<int>>();
    auto p_new_features = std::make_shared<HostDeviceVector<int>>();
    auto& new_features = *p_new_features;
    new_features.resize(features.size());
    std::copy(features.begin(), features.end(), new_features.begin());
    std::shuffle(new_features.begin(), new_features.end(), common::GlobalRandom());
    new_features.resize(n);
    std::sort(new_features.begin(), new_features.end());
    new_features.Resize(features.size());
    std::copy(features.begin(), features.end(),
              new_features.HostVector().begin());
    std::shuffle(new_features.HostVector().begin(),
                 new_features.HostVector().end(), common::GlobalRandom());
    new_features.Resize(n);
    std::sort(new_features.HostVector().begin(),
              new_features.HostVector().end());

    // ensure that new_features are the same across ranks
    rabit::Broadcast(&new_features, 0);
    rabit::Broadcast(&new_features.HostVector(), 0);

    return p_new_features;
  }
@@ -127,13 +131,14 @@ class ColumnSampler {
    colsample_bynode_ = colsample_bynode;

    if (feature_set_tree_ == nullptr) {
      feature_set_tree_ = std::make_shared<std::vector<int>>();
      feature_set_tree_ = std::make_shared<HostDeviceVector<int>>();
    }
    Reset();

    int begin_idx = skip_index_0 ? 1 : 0;
    feature_set_tree_->resize(num_col - begin_idx);
    std::iota(feature_set_tree_->begin(), feature_set_tree_->end(), begin_idx);
    feature_set_tree_->Resize(num_col - begin_idx);
    std::iota(feature_set_tree_->HostVector().begin(),
              feature_set_tree_->HostVector().end(), begin_idx);

    feature_set_tree_ = ColSample(feature_set_tree_, colsample_bytree_);
  }
@@ -142,7 +147,7 @@ class ColumnSampler {
  /*
   * \brief Resets this object.
   */
  void Reset() {
    feature_set_tree_->clear();
    feature_set_tree_->Resize(0);
    feature_set_level_.clear();
  }
@@ -154,7 +159,7 @@ class ColumnSampler {
   * \note If colsample_bynode_ < 1.0, this method creates a new feature set each time it
   * is called. Therefore, it should be called only once per node.
   */
  std::shared_ptr<std::vector<int>> GetFeatureSet(int depth) {
  std::shared_ptr<HostDeviceVector<int>> GetFeatureSet(int depth) {
    if (colsample_bylevel_ == 1.0f && colsample_bynode_ == 1.0f) {
      return feature_set_tree_;
    }
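The payoff of storing the feature set in a HostDeviceVector is on the consuming side: gpu_hist can ask the same object for a device view, so the host-to-device copy happens at most once per sampled set instead of on every use. A rough sketch of such a consumer, where the DeviceSpan accessor name and the kernel body are assumptions for illustration, not code from this diff:

// Illustrative consumer sketch (accessor names are assumptions).
auto p_feature_set = column_sampler.GetFeatureSet(depth);
auto d_features = p_feature_set->DeviceSpan(device_idx);  // cached on device
dh::LaunchN(device_idx, d_features.size(), [=] __device__(size_t i) {
  int fidx = d_features[i];
  // ... evaluate split candidates for feature fidx ...
});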
3 changes: 1 addition & 2 deletions src/tree/updater_colmaker.cc
@@ -631,8 +631,7 @@ class ColMaker: public TreeUpdater {
                     const std::vector<GradientPair> &gpair,
                     DMatrix *p_fmat,
                     RegTree *p_tree) {
    auto p_feature_set = column_sampler_.GetFeatureSet(depth);
    const auto& feat_set = *p_feature_set;
    auto &feat_set = column_sampler_.GetFeatureSet(depth)->HostVector();
    for (const auto &batch : p_fmat->GetSortedColumnBatches()) {
      this->UpdateSolution(batch, feat_set, gpair, p_fmat);
    }
12 changes: 12 additions & 0 deletions src/tree/updater_gpu_common.cuh
@@ -125,6 +125,18 @@ struct DeviceSplitCandidate {
  XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
};

struct DeviceSplitCandidateReduceOp {
  GPUTrainingParam param;
  DeviceSplitCandidateReduceOp(GPUTrainingParam param) : param(param) {}
  XGBOOST_DEVICE DeviceSplitCandidate operator()(
      const DeviceSplitCandidate& a, const DeviceSplitCandidate& b) const {
    DeviceSplitCandidate best;
    best.Update(a, param);
    best.Update(b, param);
    return best;
  }
};
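This functor is shaped for use as a binary reduction operator, for instance with thrust::reduce or cub::DeviceReduce, to pick the best candidate across per-feature splits. A hedged sketch under assumed inputs (the candidate buffer, its length, and param come from the caller; a default-constructed candidate acts as the identity, since IsValid() is false until Update accepts a better split):

#include <thrust/execution_policy.h>
#include <thrust/reduce.h>

// Illustrative sketch: reduce per-feature split candidates to the best one.
DeviceSplitCandidate BestSplit(const DeviceSplitCandidate *d_candidates,
                               int num_candidates, GPUTrainingParam param) {
  return thrust::reduce(thrust::device, d_candidates,
                        d_candidates + num_candidates,
                        DeviceSplitCandidate(),  // identity: invalid split
                        DeviceSplitCandidateReduceOp(param));
}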

struct DeviceNodeStats {
  GradientPair sum_gradients;
  float root_gain;