more explicit sharding methods
rongou committed Apr 29, 2019
1 parent 5e582b0 commit 1ccb8d8
Showing 12 changed files with 199 additions and 77 deletions.
7 changes: 5 additions & 2 deletions src/common/host_device_vector.cc
@@ -154,10 +154,13 @@ bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
}

template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const { }
void HostDeviceVector<T>::Shard(const GPUDistribution& distribution) const { }

template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet devices) const { }
void HostDeviceVector<T>::Shard(GPUSet devices) const { }

template <typename T>
void Reshard(const GPUDistribution &distribution, bool preserve) { }

// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
37 changes: 25 additions & 12 deletions src/common/host_device_vector.cu
@@ -318,7 +318,7 @@ struct HostDeviceVectorImpl {
// Data is on device;
if (distribution_ != other->distribution_) {
distribution_ = GPUDistribution();
Reshard(other->Distribution());
Shard(other->Distribution());
size_d_ = other->size_d_;
}
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
@@ -358,19 +358,27 @@ struct HostDeviceVectorImpl {
return data_h_;
}

void Reshard(const GPUDistribution& distribution) {
void Shard(const GPUDistribution& distribution) {
if (distribution_ == distribution) { return; }
CHECK(distribution_.IsEmpty() || distribution.IsEmpty());
if (distribution.IsEmpty()) {
LazySyncHost(GPUAccess::kWrite);
}
CHECK(distribution_.IsEmpty());
distribution_ = distribution;
InitShards();
}

void Reshard(GPUSet new_devices) {
void Shard(GPUSet new_devices) {
if (distribution_.Devices() == new_devices) { return; }
Reshard(GPUDistribution::Block(new_devices));
Shard(GPUDistribution::Block(new_devices));
}

void Reshard(const GPUDistribution &distribution, bool preserve) {
if (distribution_ == distribution) { return; }
if (preserve) {
LazySyncHost(GPUAccess::kWrite);
}
distribution_ = distribution;
shards_.clear();
perm_h_.Grant(kWrite);
InitShards();
}

void Resize(size_t new_size, T v) {
@@ -586,13 +594,18 @@ bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
}

template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet new_devices) const {
impl_->Reshard(new_devices);
void HostDeviceVector<T>::Shard(GPUSet new_devices) const {
impl_->Shard(new_devices);
}

template <typename T>
void HostDeviceVector<T>::Shard(const GPUDistribution &distribution) const {
impl_->Shard(distribution);
}

template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const {
impl_->Reshard(distribution);
void HostDeviceVector<T>::Reshard(const GPUDistribution &distribution, bool preserve) {
impl_->Reshard(distribution, preserve);
}

template <typename T>
16 changes: 11 additions & 5 deletions src/common/host_device_vector.h
@@ -14,7 +14,7 @@
* Initialization/Allocation:<br/>
* One can choose to initialize the vector on CPU or GPU during constructor.
* (use the 'devices' argument) Or, can choose to use the 'Resize' method to
* allocate/resize memory explicitly, and use the 'Reshard' method
* allocate/resize memory explicitly, and use the 'Shard' method
* to specify the devices.
*
* Accessing underlying data:<br/>
@@ -98,6 +98,8 @@ class GPUDistribution {
offsets_(std::move(offsets)) {}

public:
static GPUDistribution Empty() { return GPUDistribution(); }

static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }

static GPUDistribution Overlap(GPUSet devices, int overlap) {
@@ -250,11 +252,15 @@ class HostDeviceVector {

/*!
* \brief Specify memory distribution.
*
* If GPUSet::Empty() is used, all data will be drawn back to CPU.
*/
void Reshard(const GPUDistribution& distribution) const;
void Reshard(GPUSet devices) const;
void Shard(const GPUDistribution &distribution) const;
void Shard(GPUSet devices) const;

/*!
* \brief Change memory distribution.
*/
void Reshard(const GPUDistribution &distribution, bool preserve=true);

void Resize(size_t new_size, T v = T());

private:
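For orientation: after this commit the header separates a one-time Shard (place data on a device set or distribution) from an explicit Reshard(distribution, preserve), which changes an existing distribution and, with preserve=true, syncs the data back to the host first. Below is a minimal usage sketch, not part of the commit; it assumes a CUDA-enabled build with src/common on the include path, and the device ids, sizes, and GPUSet(start, count) usage are illustrative assumptions.

// Sketch only -- not part of this commit. Assumes a CUDA-enabled build with
// src/common on the include path; device ids and sizes are illustrative.
#include "common/host_device_vector.h"

namespace xgboost {

void ShardingSketch() {
  HostDeviceVector<bst_float> vec;
  vec.Resize(1024, 0.0f);                 // allocate on the host

  // Initial placement: block-distribute across two devices (assumed present).
  vec.Shard(GPUSet(0, 2));                // shorthand for Shard(GPUDistribution::Block(GPUSet(0, 2)))

  // Shard() only sets an initial distribution; to change it, use the new
  // Reshard() overload. With preserve=true the data is synced to the host
  // before the shards are rebuilt.
  vec.Reshard(GPUDistribution::Block(GPUSet(0, 1)), /*preserve=*/true);

  // An empty distribution draws all data back to the CPU.
  vec.Reshard(GPUDistribution::Empty());
}

}  // namespace xgboost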
42 changes: 21 additions & 21 deletions src/common/transform.h
@@ -57,13 +57,13 @@ class Transform {
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, GPUSet devices, bool reshard) :
Evaluator(Functor func, Range range, GPUSet devices, bool shard) :
func_(func), range_{std::move(range)},
reshard_{reshard},
shard_{shard},
distribution_{std::move(GPUDistribution::Block(devices))} {}
Evaluator(Functor func, Range range, GPUDistribution dist,
bool reshard) :
func_(func), range_{std::move(range)}, reshard_{reshard},
bool shard) :
func_(func), range_{std::move(range)}, shard_{shard},
distribution_{std::move(dist)} {}

/*!
@@ -106,25 +106,25 @@ class Transform {
return Span<T const> {_vec->ConstHostPointer(),
static_cast<typename Span<T>::index_type>(_vec->Size())};
}
// Recursive unpack for Reshard.
// Recursive unpack for Shard.
template <typename T>
void UnpackReshard(GPUDistribution dist, const HostDeviceVector<T>* vector) const {
vector->Reshard(dist);
void UnpackShard(GPUDistribution dist, const HostDeviceVector<T> *vector) const {
vector->Shard(dist);
}
template <typename Head, typename... Rest>
void UnpackReshard(GPUDistribution dist,
const HostDeviceVector<Head>* _vector,
const HostDeviceVector<Rest>*... _vectors) const {
_vector->Reshard(dist);
UnpackReshard(dist, _vectors...);
void UnpackShard(GPUDistribution dist,
const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->Shard(dist);
UnpackShard(dist, _vectors...);
}

#if defined(__CUDACC__)
template <typename std::enable_if<CompiledWithCuda>::type* = nullptr,
typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
if (reshard_)
UnpackReshard(distribution_, _vectors...);
if (shard_)
UnpackShard(distribution_, _vectors...);

GPUSet devices = distribution_.Devices();
size_t range_size = *range_.end() - *range_.begin();
@@ -170,8 +170,8 @@ class Transform {
Functor func_;
/*! \brief Range object specifying parallel threads index range. */
Range range_;
/*! \brief Whether resharding for vectors is required. */
bool reshard_;
/*! \brief Whether sharding for vectors is required. */
bool shard_;
GPUDistribution distribution_;
};

@@ -187,19 +187,19 @@ class Transform {
* \param range Range object specifying parallel threads index range.
* \param devices GPUSet specifying GPUs to use, when compiling for CPU,
* this should be GPUSet::Empty().
* \param reshard Whether Reshard for HostDeviceVector is needed.
* \param shard Whether Shard for HostDeviceVector is needed.
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUSet const devices,
bool const reshard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), reshard};
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), shard};
}
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUDistribution const dist,
bool const reshard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), reshard};
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), shard};
}
};

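A note on the renamed flag: when shard is true (the default), Evaluator::Eval first calls Shard on every HostDeviceVector argument with the evaluator's distribution and only then runs the functor on each device. The sketch below mirrors the call pattern in the softmax objective later in this diff; the ScaleInPlace helper and the 0.5f factor are illustrative assumptions, not part of the commit.

// Sketch only -- not part of this commit. Assumes a CUDA-enabled build with
// src/common on the include path; the helper and the functor body are made up.
#include "common/host_device_vector.h"
#include "common/transform.h"

namespace xgboost {

void ScaleInPlace(HostDeviceVector<bst_float>* io_preds, GPUSet devices) {
  const auto ndata = static_cast<int64_t>(io_preds->Size());
  common::Transform<>::Init(
      [=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
        _preds[_idx] *= 0.5f;  // arbitrary element-wise work
      },
      common::Range{0, ndata},
      devices,           // wrapped into GPUDistribution::Block(devices)
      /*shard=*/true)    // shard io_preds across `devices` before launching
      .Eval(io_preds);
}

}  // namespace xgboost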
6 changes: 3 additions & 3 deletions src/metric/elementwise_metric.cu
@@ -111,9 +111,9 @@ class ElementWiseMetricsReduction {
allocators_.clear();
allocators_.resize(devices.Size());
}
preds.Reshard(devices);
labels.Reshard(devices);
weights.Reshard(devices);
preds.Shard(devices);
labels.Shard(devices);
weights.Shard(devices);
std::vector<PackedReduceResult> res_per_device(devices.Size());

#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
6 changes: 3 additions & 3 deletions src/metric/multiclass_metric.cu
@@ -134,9 +134,9 @@ class MultiClassMetricsReduction {
allocators_.clear();
allocators_.resize(devices.Size());
}
preds.Reshard(GPUDistribution::Granular(devices, n_class));
labels.Reshard(devices);
weights.Reshard(devices);
preds.Shard(GPUDistribution::Granular(devices, n_class));
labels.Shard(devices);
weights.Shard(devices);
std::vector<PackedReduceResult> res_per_device(devices.Size());

#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
16 changes: 8 additions & 8 deletions src/objective/multiclass_obj.cu
@@ -39,7 +39,7 @@ struct SoftmaxMultiClassParam : public dmlc::Parameter<SoftmaxMultiClassParam> {
.describe("gpu to use for objective function evaluation");
}
};
// TODO(trivialfis): Currently the resharding in softmax is less than ideal
// TODO(trivialfis): Currently the sharding in softmax is less than ideal
// due to repeated copying data between CPU and GPUs. Maybe we just use single
// GPU?
class SoftmaxMultiClassObj : public ObjFunction {
@@ -63,11 +63,11 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(preds.Size() / nclass);

out_gpair->Reshard(GPUDistribution::Granular(devices_, nclass));
info.labels_.Reshard(GPUDistribution::Block(devices_));
info.weights_.Reshard(GPUDistribution::Block(devices_));
preds.Reshard(GPUDistribution::Granular(devices_, nclass));
label_correct_.Reshard(GPUDistribution::Block(devices_));
out_gpair->Shard(GPUDistribution::Granular(devices_, nclass));
info.labels_.Shard(GPUDistribution::Block(devices_));
info.weights_.Shard(GPUDistribution::Block(devices_));
preds.Shard(GPUDistribution::Granular(devices_, nclass));
label_correct_.Shard(GPUDistribution::Block(devices_));

out_gpair->Resize(preds.Size());
label_correct_.Fill(1);
@@ -136,8 +136,8 @@ class SoftmaxMultiClassObj : public ObjFunction {
common::Range{0, ndata}, GPUDistribution::Granular(devices_, nclass))
.Eval(io_preds);
} else {
io_preds->Reshard(GPUDistribution::Granular(devices_, nclass));
max_preds_.Reshard(GPUDistribution::Block(devices_));
io_preds->Shard(GPUDistribution::Granular(devices_, nclass));
max_preds_.Shard(GPUDistribution::Block(devices_));
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx,
common::Span<const bst_float> _preds,
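The multiclass call sites above pair GPUDistribution::Granular for per-class vectors with GPUDistribution::Block for per-row vectors; as I read GPUDistribution, the granularity rounds each device's slice to a multiple of nclass, so one row's class scores never straddle two devices. A small hedged sketch of that pairing follows; the helper name and parameters are illustrative, not part of the commit.

// Sketch only -- mirrors the softmax sharding pattern above; assumes a
// CUDA-enabled build with src/common on the include path.
#include "common/host_device_vector.h"

namespace xgboost {

void ShardForSoftmax(HostDeviceVector<bst_float>* preds,    // size = nrow * nclass
                     HostDeviceVector<bst_float>* labels,   // size = nrow
                     HostDeviceVector<bst_float>* weights,  // size = nrow (or 0)
                     GPUSet devices, int nclass) {
  // Per-class data: slice boundaries fall on multiples of nclass so a row's
  // class scores stay on one device.
  preds->Shard(GPUDistribution::Granular(devices, nclass));
  // Per-row data: a plain block split is enough.
  labels->Shard(GPUDistribution::Block(devices));
  weights->Shard(GPUDistribution::Block(devices));
}

}  // namespace xgboost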
10 changes: 5 additions & 5 deletions src/predictor/gpu_predictor.cu
@@ -327,11 +327,11 @@ class GPUPredictor : public xgboost::Predictor {

for (const auto &batch : dmat->GetRowBatches()) {
CHECK_EQ(i_batch, 0) << "External memory not supported";
// out_preds have been resharded and resized in InitOutPredictions()
batch.offset.Reshard(GPUDistribution::Overlap(devices_, 1));
// out_preds have been sharded and resized in InitOutPredictions()
batch.offset.Shard(GPUDistribution::Overlap(devices_, 1));
std::vector<size_t> device_offsets;
DeviceOffsets(batch.offset, &device_offsets);
batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets));
batch.data.Shard(GPUDistribution::Explicit(devices_, device_offsets));
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
shard.PredictInternal(batch, dmat->Info(), out_preds, model,
h_tree_segments, h_nodes, tree_begin, tree_end);
@@ -373,7 +373,7 @@ class GPUPredictor : public xgboost::Predictor {
size_t n_classes = model.param.num_output_group;
size_t n = n_classes * info.num_row_;
const HostDeviceVector<bst_float>& base_margin = info.base_margin_;
out_preds->Reshard(GPUDistribution::Granular(devices_, n_classes));
out_preds->Shard(GPUDistribution::Granular(devices_, n_classes));
out_preds->Resize(n);
if (base_margin.Size() != 0) {
CHECK_EQ(out_preds->Size(), n);
@@ -392,7 +392,7 @@
const HostDeviceVector<bst_float>& y = it->second.predictions;
if (y.Size() != 0) {
monitor_.StartCuda("PredictFromCache");
out_preds->Reshard(y.Distribution());
out_preds->Shard(y.Distribution());
out_preds->Resize(y.Size());
out_preds->Copy(y);
monitor_.StopCuda("PredictFromCache");
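In the predictor hunks above, the CSR row-pointer vector is sharded with Overlap(devices_, 1) because neighbouring device slices share one boundary offset, and the entry vector then needs an Explicit distribution whose per-device boundaries are derived from those row pointers (DeviceOffsets in the diff). A type-generic sketch of that pairing follows; the element types and helper name are assumptions, since the diff does not show them.

// Sketch only -- mirrors the batch.offset / batch.data sharding above; element
// types are left generic because the diff does not show them.
#include <vector>
#include "common/host_device_vector.h"

namespace xgboost {

template <typename OffsetT, typename EntryT>
void ShardCSRBatch(const HostDeviceVector<OffsetT>& offsets,      // row pointers, nrow + 1 entries
                   const HostDeviceVector<EntryT>& data,          // CSR entries
                   GPUSet devices,
                   const std::vector<size_t>& device_offsets) {   // per-device entry boundaries
  // Neighbouring devices both need the shared boundary row pointer, hence the
  // one-element overlap.
  offsets.Shard(GPUDistribution::Overlap(devices, 1));
  // Per-device entry counts follow from the row pointers, so the data vector
  // gets an explicitly specified distribution.
  data.Shard(GPUDistribution::Explicit(devices, device_offsets));
}

}  // namespace xgboost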
4 changes: 2 additions & 2 deletions src/tree/updater_gpu.cu
@@ -566,7 +566,7 @@ class GPUMaker : public TreeUpdater {
int maxNodes_;
int maxLeaves_;

// devices are only used for resharding the HostDeviceVector passed as a parameter;
// devices are only used for sharding the HostDeviceVector passed as a parameter;
// the algorithm works with a single GPU only
GPUSet devices_;

@@ -594,7 +594,7 @@ class GPUMaker : public TreeUpdater {
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();

gpair->Reshard(devices_);
gpair->Shard(devices_);

try {
// build tree
4 changes: 2 additions & 2 deletions src/tree/updater_gpu_hist.cu
@@ -836,7 +836,7 @@ struct DeviceShard {
for (auto i = 0ull; i < nidxs.size(); i++) {
auto nidx = nidxs[i];
auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx));
p_feature_set->Reshard(GPUSet(device_id, 1));
p_feature_set->Shard(GPUSet(device_id, 1));
auto d_feature_set = p_feature_set->DeviceSpan(device_id);
auto d_split_candidates =
d_split_candidates_all.subspan(i * num_columns, d_feature_set.size());
@@ -1527,7 +1527,7 @@ class GPUHistMakerSpecialised{
return false;
}
monitor_.StartCuda("UpdatePredictionCache");
p_out_preds->Reshard(dist_.Devices());
p_out_preds->Shard(dist_.Devices());
dh::ExecuteIndexShards(
&shards_,
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {