Reduce device synchronisation #5631

Merged · 4 commits · May 7, 2020
16 changes: 8 additions & 8 deletions src/common/column_matrix.h
@@ -82,14 +82,16 @@ template <typename BinIdxType>
 class DenseColumn: public Column<BinIdxType> {
  public:
   DenseColumn(ColumnType type, common::Span<const BinIdxType> index,
-              uint32_t index_base,
-              const std::vector<bool>::const_iterator missing_flags)
+              uint32_t index_base, const std::vector<bool>& missing_flags,
+              size_t feature_offset)
       : Column<BinIdxType>(type, index, index_base),
-        missing_flags_(missing_flags) {}
-  bool IsMissing(size_t idx) const { return missing_flags_[idx]; }
+        missing_flags_(missing_flags),
+        feature_offset_(feature_offset) {}
+  bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; }
  private:
   /* flags for missing values in dense columns */
-  std::vector<bool>::const_iterator missing_flags_;
+  const std::vector<bool>& missing_flags_;
+  size_t feature_offset_;
 };

 /*! \brief a collection of columns, with support for construction from
@@ -208,10 +210,8 @@ class ColumnMatrix {
                              column_size };
     std::unique_ptr<const Column<BinIdxType> > res;
     if (type_[fid] == ColumnType::kDenseColumn) {
-      std::vector<bool>::const_iterator column_iterator = missing_flags_.begin();
-      advance(column_iterator, feature_offset);  // increment iterator to right position
       res.reset(new DenseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
-                                            column_iterator));
+                                            missing_flags_, feature_offset));
     } else {
       res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
                                              {&row_ind_[feature_offset], column_size}));
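For context, this change replaces a pre-advanced std::vector<bool>::const_iterator with a reference to the shared flag vector plus a per-feature offset, so the position arithmetic moves into IsMissing and no iterator has to be materialised per column. A minimal standalone sketch of the two access patterns (illustrative only, not code from this repository):

#include <cstddef>
#include <vector>

// Before (sketch): each column stored an iterator advanced to its offset:
//   auto it = missing_flags.begin();
//   std::advance(it, feature_offset);
//   bool missing = it[idx];
// After (sketch): store the vector by reference and index lazily.
struct ColumnFlags {
  const std::vector<bool>& flags;
  std::size_t offset;
  bool IsMissing(std::size_t idx) const { return flags[offset + idx]; }
};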
9 changes: 9 additions & 0 deletions src/common/device_helpers.cuh
@@ -503,6 +503,15 @@ struct PinnedMemory {
     return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
   }

+  template <typename T>
+  xgboost::common::Span<T> GetSpan(size_t size, T init) {
+    auto result = this->GetSpan<T>(size);
+    for (auto &e : result) {
+      e = init;
+    }
+    return result;
+  }
+
   void Free() {
     if (temp_storage != nullptr) {
       safe_cuda(cudaFreeHost(temp_storage));
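The new overload returns a span of pinned host memory with every element set to init, saving callers a separate fill loop. A hedged usage sketch, assuming the dh::PinnedMemory interface shown above:

dh::PinnedMemory pinned;  // reused across calls; backing storage is host-pinned

// One zero-initialised pinned int64_t: a safe destination for cudaMemcpyAsync,
// with a known value even if the copy is skipped.
auto counts = pinned.GetSpan<int64_t>(1, int64_t{0});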
120 changes: 120 additions & 0 deletions src/tree/gpu_hist/driver.cuh
@@ -0,0 +1,120 @@
/*!
 * Copyright 2020 by XGBoost Contributors
 */
#ifndef DRIVER_CUH_
#define DRIVER_CUH_
#include <xgboost/span.h>
#include <queue>
#include "../param.h"
#include "evaluate_splits.cuh"

namespace xgboost {
namespace tree {
struct ExpandEntry {
  int nid;
  int depth;
  DeviceSplitCandidate split;
  ExpandEntry() = default;
  XGBOOST_DEVICE ExpandEntry(int nid, int depth, DeviceSplitCandidate split)
      : nid(nid), depth(depth), split(std::move(split)) {}
  bool IsValid(const TrainParam& param, int num_leaves) const {
    if (split.loss_chg <= kRtEps) return false;
    if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
      return false;
    }
    if (split.loss_chg < param.min_split_loss) {
      return false;
    }
    if (param.max_depth > 0 && depth == param.max_depth) {
      return false;
    }
    if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
      return false;
    }
    return true;
  }

  static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) {
    if (param.max_depth > 0 && depth >= param.max_depth) return false;
    if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
    return true;
  }

  friend std::ostream& operator<<(std::ostream& os, const ExpandEntry& e) {
    os << "ExpandEntry: \n";
    os << "nidx: " << e.nid << "\n";
    os << "depth: " << e.depth << "\n";
    os << "loss: " << e.split.loss_chg << "\n";
    os << "left_sum: " << e.split.left_sum << "\n";
    os << "right_sum: " << e.split.right_sum << "\n";
    return os;
  }
};

inline bool DepthWise(const ExpandEntry& lhs, const ExpandEntry& rhs) {
  return lhs.depth > rhs.depth;  // favor small depth
}

inline bool LossGuide(const ExpandEntry& lhs, const ExpandEntry& rhs) {
  if (lhs.split.loss_chg == rhs.split.loss_chg) {
    return lhs.nid > rhs.nid;  // favor small nid (earlier-created node)
  } else {
    return lhs.split.loss_chg < rhs.split.loss_chg;  // favor large loss_chg
  }
}

// Drives execution of tree building on device
class Driver {
  using ExpandQueue =
      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
                          std::function<bool(ExpandEntry, ExpandEntry)>>;

 public:
  explicit Driver(TrainParam::TreeGrowPolicy policy)
      : policy_(policy),
        queue_(policy == TrainParam::kDepthWise ? DepthWise : LossGuide) {}
  template <typename EntryIterT>
  void Push(EntryIterT begin, EntryIterT end) {
    for (auto it = begin; it != end; ++it) {
      const ExpandEntry& e = *it;
      if (e.split.loss_chg > kRtEps) {
        queue_.push(e);
      }
    }
  }
  void Push(const std::vector<ExpandEntry> &entries) {
    this->Push(entries.begin(), entries.end());
  }
  // Return the set of nodes to be expanded
  // This set has no dependencies between entries so they may be expanded in
  // parallel or asynchronously
  std::vector<ExpandEntry> Pop() {
    if (queue_.empty()) return {};
    // Return a single entry for loss guided mode
    if (policy_ == TrainParam::kLossGuide) {
      ExpandEntry e = queue_.top();
      queue_.pop();
      return {e};
    }
    // Return nodes on same level for depth wise
    std::vector<ExpandEntry> result;
    ExpandEntry e = queue_.top();
    int level = e.depth;
    while (e.depth == level && !queue_.empty()) {
      queue_.pop();
      result.emplace_back(e);
      if (!queue_.empty()) {
        e = queue_.top();
      }
    }
    return result;
  }

 private:
  TrainParam::TreeGrowPolicy policy_;
  ExpandQueue queue_;
};
}  // namespace tree
}  // namespace xgboost

#endif  // DRIVER_CUH_
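The Driver separates split selection from node expansion: loss-guided mode pops one best candidate at a time, while depth-wise mode pops every remaining node on the current level, and Pop's comment guarantees the entries in one batch are independent. A hedged sketch of the intended call pattern; ApplySplit and EvaluateChildren are placeholders standing in for the tree updater's real expansion logic, not functions from this PR:

// Placeholders, not part of this PR:
void ApplySplit(const ExpandEntry& e);
std::vector<ExpandEntry> EvaluateChildren(const ExpandEntry& e);

// Hypothetical outer loop, assuming depth-wise growth.
void GrowTree(Driver* driver, const ExpandEntry& root_entry) {
  driver->Push({root_entry});
  while (true) {
    auto batch = driver->Pop();  // depth-wise: all remaining nodes on one level
    if (batch.empty()) break;
    std::vector<ExpandEntry> children;
    for (const auto& e : batch) {
      // Entries within `batch` have no mutual dependencies, so this loop
      // could run per-entry asynchronously.
      ApplySplit(e);
      for (const auto& c : EvaluateChildren(e)) children.push_back(c);
    }
    driver->Push(children);  // entries below kRtEps loss change are dropped
  }
}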
7 changes: 4 additions & 3 deletions src/tree/gpu_hist/row_partitioner.cuh
@@ -61,6 +61,7 @@ class RowPartitioner {
   dh::caching_device_vector<int64_t>
       left_counts_;  // Useful to keep a bunch of zeroed memory for sort position
   std::vector<cudaStream_t> streams_;
+  dh::PinnedMemory pinned_;

  public:
   RowPartitioner(int device_idx, size_t num_rows);
@@ -129,12 +130,12 @@ class RowPartitioner {
       d_position[idx] = new_position;
     });
     // Overlap device to host memory copy (left_count) with sort
-    int64_t left_count;
+    int64_t &left_count = pinned_.GetSpan<int64_t>(1)[0];
     dh::safe_cuda(cudaMemcpyAsync(&left_count, d_left_count, sizeof(int64_t),
                                   cudaMemcpyDeviceToHost, streams_[0]));

-    SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count,
-                        streams_[1]);
+    SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count, streams_[1]);

     dh::safe_cuda(cudaStreamSynchronize(streams_[0]));
     CHECK_LE(left_count, segment.Size());
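This is the synchronisation the PR title refers to: cudaMemcpyAsync into pageable host memory (the old stack variable left_count) silently degrades to a synchronous copy, whereas a pinned destination lets the copy on streams_[0] genuinely overlap the sort on streams_[1]. A minimal standalone CUDA sketch of the same overlap pattern (illustrative only, not code from this PR):

#include <cuda_runtime.h>
#include <cstdint>

// Overlap a device-to-host copy with independent work on a second stream.
// d_count is a device pointer; s0/s1 are pre-created streams.
void OverlapCopyWithWork(const int64_t* d_count, cudaStream_t s0, cudaStream_t s1) {
  int64_t* h_count = nullptr;
  // Pinned host memory: without it, cudaMemcpyAsync falls back to a
  // synchronous copy and the overlap is lost.
  cudaMallocHost(reinterpret_cast<void**>(&h_count), sizeof(int64_t));

  cudaMemcpyAsync(h_count, d_count, sizeof(int64_t),
                  cudaMemcpyDeviceToHost, s0);
  // ... launch the sort (or other independent kernels) on s1 here ...

  cudaStreamSynchronize(s0);  // wait only for the copy, not for s1's work
  // *h_count is now valid on the host.
  cudaFreeHost(h_count);
}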