Use GPUSet in timer.h. Fix GPUSet range.
trivialfis committed Aug 24, 2018
1 parent 5581bb6 commit ac9c026
Showing 9 changed files with 76 additions and 60 deletions.
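For orientation (illustrative, not part of the diff): the changed call sites below all converge on one pattern for building a device set. A minimal sketch, assuming the GPUSet class from src/common/gpu_set.h as modified in this commit; gpu_id, n_gpus and n_rows are stand-ins for the usual training parameters.

#include "gpu_set.h"  // src/common/gpu_set.h from this commit

xgboost::GPUSet ConfigureDevices(int gpu_id, int n_gpus, int n_rows) {
  // Old call sites: GPUSet::Normalized(gpu_id, n_gpus, n_rows).
  // New pattern: build the device count with All(), then shift the range so
  // counting starts at gpu_id with Normalised().
  xgboost::GPUSet devices =
      xgboost::GPUSet::All(n_gpus, n_rows).Normalised(gpu_id);

  // Size() is now end - begin (clamped at zero) rather than *end alone, and
  // IsEmpty() is defined in terms of it; falling back to all visible devices
  // here is only for illustration.
  if (devices.IsEmpty()) {
    devices = xgboost::GPUSet::AllVisible();
  }
  return devices;
}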
16 changes: 8 additions & 8 deletions src/common/device_helpers.cuh
@@ -59,10 +59,10 @@ const T *Raw(const thrust::device_vector<T> &v) { // NOLINT
}

// if n_devices=-1, then use all visible devices
inline void SynchronizeNDevices(int n_devices, std::vector<int> dList) {
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
safe_cuda(cudaSetDevice(device_idx));
inline void SynchronizeNDevices(xgboost::GPUSet &devices) {
devices = devices.IsEmpty() ? xgboost::GPUSet::AllVisible() : devices;
for (auto d : devices.Unnormalised()) {
safe_cuda(cudaSetDevice(d));
safe_cuda(cudaDeviceSynchronize());
}
}
@@ -112,15 +112,15 @@ inline void CheckComputeCapability() {
oss << "CUDA Capability Major/Minor version number: " << prop.major << "."
<< prop.minor << " is insufficient. Need >=3.5";
int failed = prop.major < 3 || prop.major == 3 && prop.minor < 5;
if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx;
if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx;
}
}

DEV_INLINE void AtomicOrByte(unsigned int* __restrict__ buffer, size_t ibyte, unsigned char b) {
atomicOr(&buffer[ibyte / sizeof(unsigned int)], (unsigned int)b << (ibyte % (sizeof(unsigned int)) * 8));
}

/*!
/*!
* \brief Find the strict upper bound for an element in a sorted array
* using binary search.
* \param cuts pointer to the first element of the sorted array
@@ -460,7 +460,7 @@ class BulkAllocator {
BulkAllocator(BulkAllocator<MemoryT>&&) = delete;
void operator=(const BulkAllocator<MemoryT>&) = delete;
void operator=(BulkAllocator<MemoryT>&&) = delete;

~BulkAllocator() {
for (size_t i = 0; i < d_ptr_.size(); i++) {
if (!(d_ptr_[i] == nullptr)) {
46 changes: 26 additions & 20 deletions src/common/gpu_set.h
@@ -11,6 +11,7 @@
#include <string>

#include "common.h"
#include "span.h"

#if defined(__CUDACC__)
#include <thrust/system/cuda/error.h>
@@ -38,34 +39,28 @@ inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,

namespace xgboost {

// set of devices across which HostDeviceVector can be distributed;
// currently implemented as a range, but can be changed later to something else,
// e.g. a bitset
/* \brief set of devices across which HostDeviceVector can be distributed.
*
* Currently implemented as a range, but can be changed later to something else,
* e.g. a bitset
*/
class GPUSet {
public:
explicit GPUSet(int start = 0, int ndevices = 0)
: devices_(start, ndevices) {}
: devices_(start, start + ndevices) {}

static GPUSet Empty() { return GPUSet(); }

static GPUSet Range(int start, int ndevices) {
return ndevices <= 0 ? Empty() : GPUSet{start, ndevices};
return ndevices <= 0 ? Empty() : GPUSet{start, start + ndevices};
}

// counting from gpu_id
static GPUSet Normalized(int gpu_id, int ndevices,
int num_rows = std::numeric_limits<int>::max()) {
/* \brief ndevices and num_rows both are upper bounds. */
static GPUSet All(int ndevices, int num_rows = std::numeric_limits<int>::max()) {
int n_devices_visible = AllVisible().Size();
ndevices = ndevices < 0 ? n_devices_visible : ndevices;
// fix-up device number to be limited by number of rows
ndevices = ndevices > num_rows ? num_rows : ndevices;

return GPUSet{gpu_id, ndevices};
}

// n_gpus and num_rows both for upper bound
static GPUSet All(int ndevices, int num_rows = std::numeric_limits<int>::max()) {
return Normalized(0, ndevices, num_rows);
return GPUSet{0, ndevices};
}

static GPUSet AllVisible() {
@@ -75,22 +70,33 @@ class GPUSet {
#endif
return GPUSet{0, n_visgpus};
}
// ensure gpu_id is correct, so not dependent upon user knowing details
/* \brief Ensure gpu_id is correct, so not dependent upon user knowing details */
static int GetDeviceIdx(int gpu_id) {
return (std::abs(gpu_id) + 0) % AllVisible().Size();
}
/* \brief Counting from gpu_id */
GPUSet Normalised(int gpu_id) {
return Range(gpu_id, *devices_.end() + gpu_id);
}
/* \brief Counting from 0 */
GPUSet Unnormalised() {
return Range(0, *devices_.end() - *devices_.begin());
}

int Size() const { return *devices_.end(); } // NOLINT
int Size() const {
int res = *devices_.end() - *devices_.begin();
return res < 0 ? 0 : res;
}

int operator[](int index) const {
CHECK(index >= 0 && index < *(devices_.end()));
return *devices_.begin() + index;
}

bool IsEmpty() const { return *(devices_.end()) == 0; } // NOLINT
bool IsEmpty() const { return Size() == 0; } // NOLINT

int Index(int device) const {
CHECK(device >= *devices_.begin() && device < *devices_.end());
CHECK(Contains(device));
return device - *devices_.begin();
}

2 changes: 1 addition & 1 deletion src/common/hist_util.cu
@@ -379,7 +379,7 @@ struct GPUSketcher {
}

GPUSketcher(tree::TrainParam param, size_t n_rows) : param_(std::move(param)) {
devices_ = GPUSet::Normalized(param_.gpu_id, param_.n_gpus, n_rows);
devices_ = GPUSet::All(param_.n_gpus, n_rows).Normalised(param_.gpu_id);
}

std::vector<std::unique_ptr<DeviceShard>> shards_;
11 changes: 6 additions & 5 deletions src/common/timer.h
@@ -7,7 +7,8 @@
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include "gpu_set.h"

namespace xgboost {
namespace common {
@@ -66,21 +67,21 @@ struct Monitor {
this->label = label;
}
void Start(const std::string &name) { timer_map[name].Start(); }
void Start(const std::string &name, std::vector<int> dList) {
void Start(const std::string &name, GPUSet devices) {
if (debug_verbose) {
#ifdef __CUDACC__
#include "device_helpers.cuh"
dh::SynchronizeNDevices(dList.size(), dList);
dh::SynchronizeNDevices(devices);
#endif
}
timer_map[name].Start();
}
void Stop(const std::string &name) { timer_map[name].Stop(); }
void Stop(const std::string &name, std::vector<int> dList) {
void Stop(const std::string &name, GPUSet devices) {
if (debug_verbose) {
#ifdef __CUDACC__
#include "device_helpers.cuh"
dh::SynchronizeNDevices(dList.size(), dList);
dh::SynchronizeNDevices(devices);
#endif
}
timer_map[name].Stop();
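Call sites that previously passed a std::vector<int> device list now hand Monitor a GPUSet directly, as in updater_gpu_hist.cu below. A small sketch of the new signature in use, assuming the Monitor and GPUSet definitions from this commit; the wrapper function itself is hypothetical.

#include "timer.h"  // src/common/timer.h from this commit

void TimedUpdate(xgboost::common::Monitor* monitor, xgboost::GPUSet devices) {
  // With debug_verbose set and CUDA enabled, Start()/Stop() call
  // dh::SynchronizeNDevices(devices) before touching the timer.
  monitor->Start("Update", devices);
  // ... per-device work ...
  monitor->Stop("Update", devices);
}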
2 changes: 1 addition & 1 deletion src/objective/regression_obj_gpu.cu
@@ -103,7 +103,7 @@ class GPURegLossObj : public ObjFunction {
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
param_.InitAllowUnknown(args);
// CHECK(param_.n_gpus != 0) << "Must have at least one device";
devices_ = GPUSet::Normalized(param_.gpu_id, param_.n_gpus);
devices_ = GPUSet::All(param_.n_gpus).Normalised(param_.gpu_id);
}

void GetGradient(HostDeviceVector<float>* preds,
2 changes: 1 addition & 1 deletion src/predictor/gpu_predictor.cu
@@ -465,7 +465,7 @@ class GPUPredictor : public xgboost::Predictor {
Predictor::Init(cfg, cache);
cpu_predictor->Init(cfg, cache);
param.InitAllowUnknown(cfg);
devices = GPUSet::Normalized(param.gpu_id, param.n_gpus);
devices = GPUSet::All(param.n_gpus).Normalised(param.gpu_id);
max_shared_memory_bytes = dh::MaxSharedMemory(param.gpu_id);
}

2 changes: 1 addition & 1 deletion src/tree/updater_gpu.cu
@@ -517,7 +517,7 @@ class GPUMaker : public TreeUpdater {
maxNodes = (1 << (param.max_depth + 1)) - 1;
maxLeaves = 1 << param.max_depth;

devices_ = GPUSet::Normalized(param.gpu_id, param.n_gpus);
devices_ = GPUSet::All(param.n_gpus).Normalised(param.gpu_id);
}

void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
46 changes: 23 additions & 23 deletions src/tree/updater_gpu_hist.cu
@@ -728,7 +728,7 @@ class GPUHistMaker : public TreeUpdater {
param_.InitAllowUnknown(args);
CHECK(param_.n_gpus != 0) << "Must have at least one device";
n_devices_ = param_.n_gpus;
devices_ = GPUSet::Normalized(param_.gpu_id, param_.n_gpus);
devices_ = GPUSet::All(param_.n_gpus).Normalised(param_.gpu_id);

dh::CheckComputeCapability();

@@ -743,7 +743,7 @@

void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
const std::vector<RegTree*>& trees) override {
monitor_.Start("Update", device_list_);
monitor_.Start("Update", devices_);
GradStats::CheckInfo(dmat->Info());
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
Expand All @@ -759,7 +759,7 @@ class GPUHistMaker : public TreeUpdater {
LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
}
param_.learning_rate = lr;
monitor_.Stop("Update", device_list_);
monitor_.Stop("Update", devices_);
}

void InitDataOnce(DMatrix* dmat) {
@@ -792,16 +792,16 @@ class GPUHistMaker : public TreeUpdater {
shard->InitRowPtrs(batch);
});

monitor_.Start("Quantiles", device_list_);
monitor_.Start("Quantiles", devices_);
common::DeviceSketch(batch, *info_, param_, &hmat_);
n_bins_ = hmat_.row_ptr.back();
monitor_.Stop("Quantiles", device_list_);
monitor_.Stop("Quantiles", devices_);

monitor_.Start("BinningCompression", device_list_);
monitor_.Start("BinningCompression", devices_);
dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
shard->InitCompressedData(hmat_, batch);
});
monitor_.Stop("BinningCompression", device_list_);
monitor_.Stop("BinningCompression", devices_);

CHECK(!iter->Next()) << "External memory not supported";

@@ -811,20 +811,20 @@

void InitData(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
const RegTree& tree) {
monitor_.Start("InitDataOnce", device_list_);
monitor_.Start("InitDataOnce", devices_);
if (!initialised_) {
this->InitDataOnce(dmat);
}
monitor_.Stop("InitDataOnce", device_list_);
monitor_.Stop("InitDataOnce", devices_);

column_sampler_.Init(info_->num_col_, param_);

// Copy gpair & reset memory
monitor_.Start("InitDataReset", device_list_);
monitor_.Start("InitDataReset", devices_);

gpair->Reshard(devices_);
dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {shard->Reset(gpair); });
monitor_.Stop("InitDataReset", device_list_);
monitor_.Stop("InitDataReset", devices_);
}

void AllReduceHist(int nidx) {
@@ -1036,12 +1036,12 @@ class GPUHistMaker : public TreeUpdater {
RegTree* p_tree) {
auto& tree = *p_tree;

monitor_.Start("InitData", device_list_);
monitor_.Start("InitData", devices_);
this->InitData(gpair, p_fmat, *p_tree);
monitor_.Stop("InitData", device_list_);
monitor_.Start("InitRoot", device_list_);
monitor_.Stop("InitData", devices_);
monitor_.Start("InitRoot", devices_);
this->InitRoot(p_tree);
monitor_.Stop("InitRoot", device_list_);
monitor_.Stop("InitRoot", devices_);

auto timestamp = qexpand_->size();
auto num_leaves = 1;
@@ -1051,9 +1051,9 @@
qexpand_->pop();
if (!candidate.IsValid(param_, num_leaves)) continue;
// std::cout << candidate;
monitor_.Start("ApplySplit", device_list_);
monitor_.Start("ApplySplit", devices_);
this->ApplySplit(candidate, p_tree);
monitor_.Stop("ApplySplit", device_list_);
monitor_.Stop("ApplySplit", devices_);
num_leaves++;

auto left_child_nidx = tree[candidate.nid].LeftChild();
@@ -1062,12 +1062,12 @@
// Only create child entries if needed
if (ExpandEntry::ChildIsValid(param_, tree.GetDepth(left_child_nidx),
num_leaves)) {
monitor_.Start("BuildHist", device_list_);
monitor_.Start("BuildHist", devices_);
this->BuildHistLeftRight(candidate.nid, left_child_nidx,
right_child_nidx);
monitor_.Stop("BuildHist", device_list_);
monitor_.Stop("BuildHist", devices_);

monitor_.Start("EvaluateSplits", device_list_);
monitor_.Start("EvaluateSplits", devices_);
auto splits =
this->EvaluateSplits({left_child_nidx, right_child_nidx}, p_tree);
qexpand_->push(ExpandEntry(left_child_nidx,
Expand All @@ -1076,21 +1076,21 @@ class GPUHistMaker : public TreeUpdater {
qexpand_->push(ExpandEntry(right_child_nidx,
tree.GetDepth(right_child_nidx), splits[1],
timestamp++));
monitor_.Stop("EvaluateSplits", device_list_);
monitor_.Stop("EvaluateSplits", devices_);
}
}
}

bool UpdatePredictionCache(
const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) override {
monitor_.Start("UpdatePredictionCache", device_list_);
monitor_.Start("UpdatePredictionCache", devices_);
if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
return false;
p_out_preds->Reshard(devices_);
dh::ExecuteShards(&shards_, [&](std::unique_ptr<DeviceShard>& shard) {
shard->UpdatePredictionCache(p_out_preds->DevicePointer(shard->device_idx));
});
monitor_.Stop("UpdatePredictionCache", device_list_);
monitor_.Stop("UpdatePredictionCache", devices_);
return true;
}

9 changes: 9 additions & 0 deletions tests/cpp/common/test_gpuset.cc
@@ -9,14 +9,23 @@ TEST(GPUSet, Basic) {

devices = GPUSet{0, 1};
ASSERT_TRUE(devices != GPUSet::Empty());
EXPECT_EQ(devices.Size(), 1);

EXPECT_ANY_THROW(devices.Index(1));
EXPECT_ANY_THROW(devices.Index(-1));

devices = GPUSet::Range(1, 0);
EXPECT_EQ(devices, GPUSet::Empty());
EXPECT_EQ(devices.Size(), 0);

EXPECT_FALSE(devices.Contains(1));

devices = GPUSet::Range(2, 8);
EXPECT_EQ(devices.Size(), 10);
devices = devices.Unnormalised();

EXPECT_EQ(*devices.begin(), 0);
EXPECT_EQ(*devices.end(), devices.Size());
}

} // namespace xgboost
