Commit 7f399ea

Use double for GPU Hist node sum. (dmlc#7507)

trivialfis authored Dec 22, 2021
1 parent eabec37 commit 7f399ea
Showing 5 changed files with 85 additions and 98 deletions.
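
Note on the motivation: this change is about numeric precision, not behavior. Node-level gradient sums were previously accumulated and stored as single-precision GradientPair, and over millions of rows the rounding error of float accumulation can grow large enough to affect split evaluation. The commit switches every node sum to GradientPairPrecise (double) while leaving the per-bin histogram type GradientSumT unchanged. A minimal standalone sketch of the failure mode (illustrative only, not code from this commit):

#include <cstdio>

int main() {
  const long n = 10000000;  // ten million rows
  const float g = 1e-4f;    // a typical small per-row gradient
  float sum_f = 0.0f;
  double sum_d = 0.0;
  for (long i = 0; i < n; ++i) {
    sum_f += g;  // float accumulator: each add rounds, and the bias compounds
    sum_d += g;  // double accumulator: rounding error stays negligible
  }
  // The true sum is ~1000; the float total visibly drifts away from it.
  std::printf("float:  %f\ndouble: %f\n", sum_f, sum_d);
  return 0;
}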
67 changes: 31 additions & 36 deletions src/tree/gpu_hist/evaluate_splits.cu
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2020 by XGBoost Contributors
+ * Copyright 2020-2021 by XGBoost Contributors
 */
 #include <limits>
 #include "evaluate_splits.cuh"
@@ -9,15 +9,13 @@ namespace xgboost {
 namespace tree {
 
 // With constraints
-template <typename GradientPairT>
-XGBOOST_DEVICE float
-LossChangeMissing(const GradientPairT &scan, const GradientPairT &missing,
-                  const GradientPairT &parent_sum,
-                  const GPUTrainingParam &param,
-                  bst_node_t nidx,
-                  bst_feature_t fidx,
-                  TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
-                  bool &missing_left_out) {  // NOLINT
+XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan,
+                                       const GradientPairPrecise &missing,
+                                       const GradientPairPrecise &parent_sum,
+                                       const GPUTrainingParam &param, bst_node_t nidx,
+                                       bst_feature_t fidx,
+                                       TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
+                                       bool &missing_left_out) {  // NOLINT
   float parent_gain = CalcGain(param, parent_sum);
   float missing_left_gain =
       evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan + missing),
@@ -72,41 +70,41 @@ ReduceFeature(common::Span<const GradientSumT> feature_histogram,
 }
 
 template <typename GradientSumT, typename TempStorageT> struct OneHotBin {
-  GradientSumT __device__ operator()(
-      bool thread_active, uint32_t scan_begin,
-      SumCallbackOp<GradientSumT>*,
-      GradientSumT const &missing,
-      EvaluateSplitInputs<GradientSumT> const &inputs, TempStorageT *) {
+  GradientSumT __device__ operator()(bool thread_active, uint32_t scan_begin,
+                                     SumCallbackOp<GradientSumT> *,
+                                     GradientPairPrecise const &missing,
+                                     EvaluateSplitInputs<GradientSumT> const &inputs,
+                                     TempStorageT *) {
     GradientSumT bin = thread_active
                            ? inputs.gradient_histogram[scan_begin + threadIdx.x]
                            : GradientSumT();
-    auto rest = inputs.parent_sum - bin - missing;
-    return rest;
+    auto rest = inputs.parent_sum - GradientPairPrecise(bin) - missing;
+    return GradientSumT{rest};
   }
 };
 
 template <typename GradientSumT>
 struct UpdateOneHot {
   void __device__ operator()(bool missing_left, uint32_t scan_begin, float gain,
-                             bst_feature_t fidx, GradientSumT const &missing,
+                             bst_feature_t fidx, GradientPairPrecise const &missing,
                              GradientSumT const &bin,
                              EvaluateSplitInputs<GradientSumT> const &inputs,
                              DeviceSplitCandidate *best_split) {
     int split_gidx = (scan_begin + threadIdx.x);
     float fvalue = inputs.feature_values[split_gidx];
-    GradientSumT left = missing_left ? bin + missing : bin;
-    GradientSumT right = inputs.parent_sum - left;
-    best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx,
-                       GradientPair(left), GradientPair(right), true,
+    GradientPairPrecise left =
+        missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
+    GradientPairPrecise right = inputs.parent_sum - left;
+    best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, true,
                        inputs.param);
   }
 };
 
 template <typename GradientSumT, typename TempStorageT, typename ScanT>
 struct NumericBin {
   GradientSumT __device__ operator()(bool thread_active, uint32_t scan_begin,
-                                     SumCallbackOp<GradientSumT>* prefix_callback,
-                                     GradientSumT const &missing,
+                                     SumCallbackOp<GradientSumT> *prefix_callback,
+                                     GradientPairPrecise const &missing,
                                      EvaluateSplitInputs<GradientSumT> inputs,
                                      TempStorageT *temp_storage) {
     GradientSumT bin = thread_active
@@ -120,7 +118,7 @@ struct NumericBin {
 template <typename GradientSumT>
 struct UpdateNumeric {
   void __device__ operator()(bool missing_left, uint32_t scan_begin, float gain,
-                             bst_feature_t fidx, GradientSumT const &missing,
+                             bst_feature_t fidx, GradientPairPrecise const &missing,
                              GradientSumT const &bin,
                              EvaluateSplitInputs<GradientSumT> const &inputs,
                              DeviceSplitCandidate *best_split) {
@@ -133,11 +131,11 @@ struct UpdateNumeric {
     } else {
       fvalue = inputs.feature_values[split_gidx];
     }
-    GradientSumT left = missing_left ? bin + missing : bin;
-    GradientSumT right = inputs.parent_sum - left;
-    best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue,
-                       fidx, GradientPair(left), GradientPair(right),
-                       false, inputs.param);
+    GradientPairPrecise left =
+        missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
+    GradientPairPrecise right = inputs.parent_sum - left;
+    best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, false,
+                       inputs.param);
   }
 };
 
@@ -164,7 +162,7 @@ __device__ void EvaluateFeature(
       ReduceFeature<BLOCK_THREADS, ReduceT, TempStorageT, GradientSumT>(
          feature_hist, temp_storage);
 
-  GradientSumT const missing = inputs.parent_sum - feature_sum;
+  GradientPairPrecise const missing = inputs.parent_sum - GradientPairPrecise{feature_sum};
   float const null_gain = -std::numeric_limits<bst_float>::infinity();
 
   SumCallbackOp<GradientSumT> prefix_op = SumCallbackOp<GradientSumT>();
@@ -177,11 +175,8 @@
     bool missing_left = true;
    float gain = null_gain;
     if (thread_active) {
-      gain = LossChangeMissing(bin, missing, inputs.parent_sum, inputs.param,
-                               inputs.nidx,
-                               fidx,
-                               evaluator,
-                               missing_left);
+      gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum, inputs.param,
+                               inputs.nidx, fidx, evaluator, missing_left);
     }
 
     __syncthreads();
2 changes: 1 addition & 1 deletion src/tree/gpu_hist/evaluate_splits.cuh
@@ -15,7 +15,7 @@ namespace tree {
 template <typename GradientSumT>
 struct EvaluateSplitInputs {
   int nidx;
-  GradientSumT parent_sum;
+  GradientPairPrecise parent_sum;
   GPUTrainingParam param;
   common::Span<const bst_feature_t> feature_set;
   common::Span<FeatureType const> feature_types;
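
With parent_sum now a GradientPairPrecise while gradient_histogram stays in GradientSumT, the kernels above promote each bin to double at the point of arithmetic (GradientPairPrecise{bin}) instead of truncating the node sum to float. A simplified standalone sketch of that boundary (Pair, PairF, and PairD are illustrative stand-ins, not XGBoost's actual templates):

#include <cstdio>

template <typename T> struct Pair {
  T grad{0}, hess{0};
  Pair() = default;
  Pair(T g, T h) : grad{g}, hess{h} {}
  // Explicit cross-precision construction, mirroring GradientPairPrecise{bin}.
  template <typename U>
  explicit Pair(Pair<U> const &o)
      : grad{static_cast<T>(o.grad)}, hess{static_cast<T>(o.hess)} {}
  Pair operator-(Pair const &o) const { return {grad - o.grad, hess - o.hess}; }
};
using PairF = Pair<float>;   // stand-in for GradientSumT
using PairD = Pair<double>;  // stand-in for GradientPairPrecise

int main() {
  PairD parent_sum{1.0, 1.5};               // node sum, held in double
  PairF bin{0.25f, 0.5f};                   // histogram bin, single precision
  PairD missing = parent_sum - PairD{bin};  // promote at the boundary
  std::printf("missing = (%f, %f)\n", missing.grad, missing.hess);
  return 0;
}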
8 changes: 4 additions & 4 deletions src/tree/updater_gpu_common.cuh
@@ -61,8 +61,8 @@ struct DeviceSplitCandidate {
   float fvalue {0};
   bool is_cat { false };
 
-  GradientPair left_sum;
-  GradientPair right_sum;
+  GradientPairPrecise left_sum;
+  GradientPairPrecise right_sum;
 
   XGBOOST_DEVICE DeviceSplitCandidate() {}  // NOLINT
 
@@ -78,8 +78,8 @@ struct DeviceSplitCandidate {
 
   XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in,
                              float fvalue_in, int findex_in,
-                             GradientPair left_sum_in,
-                             GradientPair right_sum_in,
+                             GradientPairPrecise left_sum_in,
+                             GradientPairPrecise right_sum_in,
                              bool cat,
                              const GPUTrainingParam& param) {
     if (loss_chg_in > loss_chg &&
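
Storing the left/right sums in DeviceSplitCandidate as GradientPairPrecise doubles their footprint from 8 to 16 bytes per pair, which is why the cudaMemcpyAsync in updater_gpu_hist.cu below switches its sizeof accordingly. A quick check with simplified stand-in structs (assumed layouts; the real types are templates over float/double in xgboost/base.h):

#include <cstdio>

struct GradientPairF { float grad, hess; };   // 2 x 4 bytes
struct GradientPairD { double grad, hess; };  // 2 x 8 bytes

int main() {
  std::printf("float pair:  %zu bytes\n", sizeof(GradientPairF));  // 8
  std::printf("double pair: %zu bytes\n", sizeof(GradientPairD));  // 16
  return 0;
}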
88 changes: 40 additions & 48 deletions src/tree/updater_gpu_hist.cu
@@ -173,7 +173,7 @@ struct GPUHistMakerDevice {
   dh::caching_device_vector<int> monotone_constraints;
 
   /*! \brief Sum gradient for each node. */
-  std::vector<GradientPair> node_sum_gradients;
+  std::vector<GradientPairPrecise> node_sum_gradients;
 
   TrainParam param;
 
@@ -239,8 +239,7 @@
     dh::safe_cuda(cudaSetDevice(device_id));
     tree_evaluator = TreeEvaluator(param, dmat->Info().num_col_, device_id);
     this->interaction_constraints.Reset();
-    std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
-              GradientPair());
+    std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), GradientPairPrecise{});
 
     if (d_gpair.size() != dh_gpair->Size()) {
       d_gpair.resize(dh_gpair->Size());
@@ -260,7 +259,7 @@
   }
 
 
-  DeviceSplitCandidate EvaluateRootSplit(GradientPair root_sum) {
+  DeviceSplitCandidate EvaluateRootSplit(GradientPairPrecise root_sum) {
     int nidx = RegTree::kRoot;
     dh::TemporaryArray<DeviceSplitCandidate> splits_out(1);
     GPUTrainingParam gpu_param(param);
@@ -269,16 +268,15 @@
     common::Span<bst_feature_t> feature_set =
         interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
     auto matrix = page->GetDeviceAccessor(device_id);
-    EvaluateSplitInputs<GradientSumT> inputs{
-        nidx,
-        {root_sum.GetGrad(), root_sum.GetHess()},
-        gpu_param,
-        feature_set,
-        feature_types,
-        matrix.feature_segments,
-        matrix.gidx_fvalue_map,
-        matrix.min_fvalue,
-        hist.GetNodeHistogram(nidx)};
+    EvaluateSplitInputs<GradientSumT> inputs{nidx,
+                                             root_sum,
+                                             gpu_param,
+                                             feature_set,
+                                             feature_types,
+                                             matrix.feature_segments,
+                                             matrix.gidx_fvalue_map,
+                                             matrix.min_fvalue,
+                                             hist.GetNodeHistogram(nidx)};
     auto gain_calc = tree_evaluator.GetEvaluator<GPUTrainingParam>();
     EvaluateSingleSplit(dh::ToSpan(splits_out), gain_calc, inputs);
     std::vector<DeviceSplitCandidate> result(1);
@@ -307,28 +305,24 @@
         left_nidx);
     auto matrix = page->GetDeviceAccessor(device_id);
 
-    EvaluateSplitInputs<GradientSumT> left{
-        left_nidx,
-        {candidate.split.left_sum.GetGrad(),
-         candidate.split.left_sum.GetHess()},
-        gpu_param,
-        left_feature_set,
-        feature_types,
-        matrix.feature_segments,
-        matrix.gidx_fvalue_map,
-        matrix.min_fvalue,
-        hist.GetNodeHistogram(left_nidx)};
-    EvaluateSplitInputs<GradientSumT> right{
-        right_nidx,
-        {candidate.split.right_sum.GetGrad(),
-         candidate.split.right_sum.GetHess()},
-        gpu_param,
-        right_feature_set,
-        feature_types,
-        matrix.feature_segments,
-        matrix.gidx_fvalue_map,
-        matrix.min_fvalue,
-        hist.GetNodeHistogram(right_nidx)};
+    EvaluateSplitInputs<GradientSumT> left{left_nidx,
+                                           candidate.split.left_sum,
+                                           gpu_param,
+                                           left_feature_set,
+                                           feature_types,
+                                           matrix.feature_segments,
+                                           matrix.gidx_fvalue_map,
+                                           matrix.min_fvalue,
+                                           hist.GetNodeHistogram(left_nidx)};
+    EvaluateSplitInputs<GradientSumT> right{right_nidx,
+                                            candidate.split.right_sum,
+                                            gpu_param,
+                                            right_feature_set,
+                                            feature_types,
+                                            matrix.feature_segments,
+                                            matrix.gidx_fvalue_map,
+                                            matrix.min_fvalue,
+                                            hist.GetNodeHistogram(right_nidx)};
     auto d_splits_out = dh::ToSpan(splits_out);
     EvaluateSplits(d_splits_out, tree_evaluator.GetEvaluator<GPUTrainingParam>(), left, right);
     dh::TemporaryArray<GPUExpandEntry> entries(2);
@@ -502,12 +496,11 @@
     auto d_ridx = row_partitioner->GetRows();
 
     GPUTrainingParam param_d(param);
-    dh::TemporaryArray<GradientPair> device_node_sum_gradients(node_sum_gradients.size());
+    dh::TemporaryArray<GradientPairPrecise> device_node_sum_gradients(node_sum_gradients.size());
 
-    dh::safe_cuda(
-        cudaMemcpyAsync(device_node_sum_gradients.data().get(), node_sum_gradients.data(),
-                        sizeof(GradientPair) * node_sum_gradients.size(),
-                        cudaMemcpyHostToDevice));
+    dh::safe_cuda(cudaMemcpyAsync(device_node_sum_gradients.data().get(), node_sum_gradients.data(),
+                                  sizeof(GradientPairPrecise) * node_sum_gradients.size(),
+                                  cudaMemcpyHostToDevice));
     auto d_position = row_partitioner->GetPosition();
     auto d_node_sum_gradients = device_node_sum_gradients.data().get();
     auto evaluator = tree_evaluator.GetEvaluator<GPUTrainingParam>();
@@ -623,13 +616,12 @@
   GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
     constexpr bst_node_t kRootNIdx = 0;
     dh::XGBCachingDeviceAllocator<char> alloc;
-    GradientPair root_sum = dh::Reduce(
-        thrust::cuda::par(alloc),
-        thrust::device_ptr<GradientPair const>(gpair.data()),
-        thrust::device_ptr<GradientPair const>(gpair.data() + gpair.size()),
-        GradientPair{}, thrust::plus<GradientPair>{});
-    rabit::Allreduce<rabit::op::Sum, float>(reinterpret_cast<float*>(&root_sum),
-                                            2);
+    auto gpair_it = dh::MakeTransformIterator<GradientPairPrecise>(
+        dh::tbegin(gpair), [] __device__(auto const& gpair) { return GradientPairPrecise{gpair}; });
+    GradientPairPrecise root_sum =
+        dh::Reduce(thrust::cuda::par(alloc), gpair_it, gpair_it + gpair.size(),
+                   GradientPairPrecise{}, thrust::plus<GradientPairPrecise>{});
+    rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double*>(&root_sum), 2);
 
     this->BuildHist(kRootNIdx);
     this->AllReduceHist(kRootNIdx, reducer);
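
The InitRoot change above is the core of the fix: instead of reducing the raw float GradientPair array and allreducing two floats, each pair is widened to GradientPairPrecise through a transform iterator, so the reduction accumulator itself is double, and the two doubles of root_sum are then allreduced. dh::MakeTransformIterator and dh::Reduce are XGBoost's thin wrappers over Thrust; a sketch of the same pattern in plain Thrust (simplified pair types, illustrative only):

#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <cstdio>

struct PairF { float grad, hess; };  // stand-in for GradientPair
struct PairD {                       // stand-in for GradientPairPrecise
  double grad, hess;
  __host__ __device__ PairD operator+(PairD const& o) const {
    return {grad + o.grad, hess + o.hess};
  }
};

// Widens a single-precision pair to double before it enters the reduction.
struct Widen {
  __host__ __device__ PairD operator()(PairF const& g) const {
    return {static_cast<double>(g.grad), static_cast<double>(g.hess)};
  }
};

int main() {
  thrust::device_vector<PairF> gpair(1 << 20, PairF{1e-4f, 2e-4f});
  PairD root_sum = thrust::transform_reduce(
      gpair.begin(), gpair.end(), Widen{}, PairD{0.0, 0.0}, thrust::plus<PairD>{});
  std::printf("root sum: grad=%f hess=%f\n", root_sum.grad, root_sum.hess);
  return 0;
}

Widening before the reduction matters more than widening the result: it is the intermediate partial sums, not the final pair, that lose precision when kept in float.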
18 changes: 9 additions & 9 deletions tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -17,7 +17,7 @@ auto ZeroParam() {
 
 void TestEvaluateSingleSplit(bool is_categorical) {
   thrust::device_vector<DeviceSplitCandidate> out_splits(1);
-  GradientPair parent_sum(0.0, 1.0);
+  GradientPairPrecise parent_sum(0.0, 1.0);
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
 
@@ -73,7 +73,7 @@ TEST(GpuHist, EvaluateCategoricalSplit) {
 
 TEST(GpuHist, EvaluateSingleSplitMissing) {
   thrust::device_vector<DeviceSplitCandidate> out_splits(1);
-  GradientPair parent_sum(1.0, 1.5);
+  GradientPairPrecise parent_sum(1.0, 1.5);
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
 
@@ -104,8 +104,8 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
   EXPECT_EQ(result.findex, 0);
   EXPECT_EQ(result.fvalue, 1.0);
   EXPECT_EQ(result.dir, kRightDir);
-  EXPECT_EQ(result.left_sum, GradientPair(-0.5, 0.5));
-  EXPECT_EQ(result.right_sum, GradientPair(1.5, 1.0));
+  EXPECT_EQ(result.left_sum, GradientPairPrecise(-0.5, 0.5));
+  EXPECT_EQ(result.right_sum, GradientPairPrecise(1.5, 1.0));
 }
 
 TEST(GpuHist, EvaluateSingleSplitEmpty) {
@@ -130,7 +130,7 @@ TEST(GpuHist, EvaluateSingleSplitEmpty) {
 // Feature 0 has a better split, but the algorithm must select feature 1
 TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
   thrust::device_vector<DeviceSplitCandidate> out_splits(1);
-  GradientPair parent_sum(0.0, 1.0);
+  GradientPairPrecise parent_sum(0.0, 1.0);
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
   GPUTrainingParam param{tparam};
@@ -164,14 +164,14 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
   DeviceSplitCandidate result = out_splits[0];
   EXPECT_EQ(result.findex, 1);
   EXPECT_EQ(result.fvalue, 11.0);
-  EXPECT_EQ(result.left_sum, GradientPair(-0.5, 0.5));
-  EXPECT_EQ(result.right_sum, GradientPair(0.5, 0.5));
+  EXPECT_EQ(result.left_sum, GradientPairPrecise(-0.5, 0.5));
+  EXPECT_EQ(result.right_sum, GradientPairPrecise(0.5, 0.5));
 }
 
 // Features 0 and 1 have identical gain, the algorithm must select 0
 TEST(GpuHist, EvaluateSingleSplitBreakTies) {
   thrust::device_vector<DeviceSplitCandidate> out_splits(1);
-  GradientPair parent_sum(0.0, 1.0);
+  GradientPairPrecise parent_sum(0.0, 1.0);
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
   GPUTrainingParam param{tparam};
@@ -209,7 +209,7 @@
 
 TEST(GpuHist, EvaluateSplits) {
   thrust::device_vector<DeviceSplitCandidate> out_splits(2);
-  GradientPair parent_sum(0.0, 1.0);
+  GradientPairPrecise parent_sum(0.0, 1.0);
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
   GPUTrainingParam param{tparam};
