-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add ellpack page #4833
add ellpack page #4833
Changes from all commits
5e81760
7fd5453
992d229
b4cb3a9
47cc2fd
8b07fe3
f70bc6e
82ec6ac
ac6eccf
fd464b0
a3ff3dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,8 @@ | |
namespace xgboost { | ||
// forward declare learner. | ||
class LearnerImpl; | ||
// forward declare dmatrix. | ||
class DMatrix; | ||
|
||
/*! \brief data type accepted by xgboost interface */ | ||
enum DataType { | ||
|
@@ -86,7 +88,7 @@ class MetaInfo { | |
* \return The pre-defined root index of i-th instance. | ||
*/ | ||
inline unsigned GetRoot(size_t i) const { | ||
return root_index_.size() != 0 ? root_index_[i] : 0U; | ||
return !root_index_.empty() ? root_index_[i] : 0U; | ||
} | ||
/*! \brief get sorted indexes (argsort) of labels by absolute value (used by cox loss) */ | ||
inline const std::vector<size_t>& LabelAbsSort() const { | ||
|
@@ -166,7 +168,7 @@ class SparsePage { | |
/*! \brief the data of the segments */ | ||
HostDeviceVector<Entry> data; | ||
|
||
size_t base_rowid; | ||
size_t base_rowid{}; | ||
|
||
/*! \brief an instance of sparse vector in the batch */ | ||
using Inst = common::Span<Entry const>; | ||
|
@@ -215,23 +217,23 @@ class SparsePage { | |
const int nthread = omp_get_max_threads(); | ||
builder.InitBudget(num_columns, nthread); | ||
long batch_size = static_cast<long>(this->Size()); // NOLINT(*) | ||
#pragma omp parallel for schedule(static) | ||
#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static) | ||
for (long i = 0; i < batch_size; ++i) { // NOLINT(*) | ||
int tid = omp_get_thread_num(); | ||
auto inst = (*this)[i]; | ||
for (bst_uint j = 0; j < inst.size(); ++j) { | ||
builder.AddBudget(inst[j].index, tid); | ||
for (const auto& entry : inst) { | ||
builder.AddBudget(entry.index, tid); | ||
} | ||
} | ||
builder.InitStorage(); | ||
#pragma omp parallel for schedule(static) | ||
#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static) | ||
for (long i = 0; i < batch_size; ++i) { // NOLINT(*) | ||
int tid = omp_get_thread_num(); | ||
auto inst = (*this)[i]; | ||
for (bst_uint j = 0; j < inst.size(); ++j) { | ||
for (const auto& entry : inst) { | ||
builder.Push( | ||
inst[j].index, | ||
Entry(static_cast<bst_uint>(this->base_rowid + i), inst[j].fvalue), | ||
entry.index, | ||
Entry(static_cast<bst_uint>(this->base_rowid + i), entry.fvalue), | ||
tid); | ||
} | ||
} | ||
|
@@ -240,7 +242,7 @@ class SparsePage { | |
|
||
void SortRows() { | ||
auto ncol = static_cast<bst_omp_uint>(this->Size()); | ||
#pragma omp parallel for schedule(dynamic, 1) | ||
#pragma omp parallel for default(none) shared(ncol) schedule(dynamic, 1) | ||
for (bst_omp_uint i = 0; i < ncol; ++i) { | ||
if (this->offset.HostVector()[i] < this->offset.HostVector()[i + 1]) { | ||
std::sort( | ||
|
@@ -287,10 +289,29 @@ class SortedCSCPage : public SparsePage { | |
explicit SortedCSCPage(SparsePage page) : SparsePage(std::move(page)) {} | ||
}; | ||
|
||
class EllpackPageImpl; | ||
/*! | ||
* \brief A page stored in ELLPACK format. | ||
* | ||
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid | ||
* including CUDA-specific implementation details in the header. | ||
*/ | ||
class EllpackPage { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this be more generalized so we can support multiple binned matrix types fairly easily. perhaps, make this an adapter, with a factory method to create the underlying implementation:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can probably wait until we have another binned page. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree, let's keep this in mind but not for this PR. |
||
public: | ||
explicit EllpackPage(DMatrix* dmat); | ||
~EllpackPage(); | ||
|
||
const EllpackPageImpl* Impl() const { return impl_.get(); } | ||
EllpackPageImpl* Impl() { return impl_.get(); } | ||
|
||
private: | ||
std::unique_ptr<EllpackPageImpl> impl_; | ||
}; | ||
|
||
template<typename T> | ||
class BatchIteratorImpl { | ||
public: | ||
virtual ~BatchIteratorImpl() {} | ||
virtual ~BatchIteratorImpl() = default; | ||
virtual T& operator*() = 0; | ||
virtual const T& operator*() const = 0; | ||
virtual void operator++() = 0; | ||
|
@@ -412,7 +433,7 @@ class DMatrix { | |
bool silent, | ||
bool load_row_split, | ||
const std::string& file_format = "auto", | ||
const size_t page_size = kPageSize); | ||
size_t page_size = kPageSize); | ||
|
||
/*! | ||
* \brief create a new DMatrix, by wrapping a row_iterator, and meta info. | ||
|
@@ -438,7 +459,7 @@ class DMatrix { | |
*/ | ||
static DMatrix* Create(dmlc::Parser<uint32_t>* parser, | ||
const std::string& cache_prefix = "", | ||
const size_t page_size = kPageSize); | ||
size_t page_size = kPageSize); | ||
|
||
/*! \brief page size 32 MB */ | ||
static const size_t kPageSize = 32UL << 20UL; | ||
|
@@ -447,6 +468,7 @@ class DMatrix { | |
virtual BatchSet<SparsePage> GetRowBatches() = 0; | ||
virtual BatchSet<CSCPage> GetColumnBatches() = 0; | ||
virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0; | ||
virtual BatchSet<EllpackPage> GetEllpackBatches() = 0; | ||
}; | ||
|
||
template<> | ||
|
@@ -463,6 +485,11 @@ template<> | |
inline BatchSet<SortedCSCPage> DMatrix::GetBatches() { | ||
return GetSortedColumnBatches(); | ||
} | ||
|
||
template<> | ||
inline BatchSet<EllpackPage> DMatrix::GetBatches() { | ||
return GetEllpackBatches(); | ||
} | ||
} // namespace xgboost | ||
|
||
namespace dmlc { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -99,15 +99,15 @@ struct SketchContainer { | |
std::vector<std::mutex> col_locks_; // NOLINT | ||
static constexpr int kOmpNumColsParallelizeLimit = 1000; | ||
|
||
SketchContainer(const tree::TrainParam ¶m, DMatrix *dmat) : | ||
SketchContainer(int max_bin, DMatrix *dmat) : | ||
col_locks_(dmat->Info().num_col_) { | ||
const MetaInfo &info = dmat->Info(); | ||
// Initialize Sketches for this dmatrix | ||
sketches_.resize(info.num_col_); | ||
#pragma omp parallel for default(none) shared(info, param) schedule(static) \ | ||
#pragma omp parallel for default(none) shared(info, max_bin) schedule(static) \ | ||
if (info.num_col_ > kOmpNumColsParallelizeLimit) // NOLINT | ||
for (int icol = 0; icol < info.num_col_; ++icol) { // NOLINT | ||
sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin)); | ||
sketches_[icol].Init(info.num_row_, 1.0 / (8 * max_bin)); | ||
} | ||
} | ||
|
||
|
@@ -130,7 +130,7 @@ struct GPUSketcher { | |
bool has_weights_{false}; | ||
size_t row_stride_{0}; | ||
|
||
tree::TrainParam param_; | ||
const int max_bin_; | ||
SketchContainer *sketch_container_; | ||
dh::device_vector<size_t> row_ptrs_{}; | ||
dh::device_vector<Entry> entries_{}; | ||
|
@@ -148,11 +148,11 @@ struct GPUSketcher { | |
public: | ||
DeviceShard(int device, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This DeviceShard class is now redundant right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes there are 4-5 places we still have a |
||
bst_uint n_rows, | ||
tree::TrainParam param, | ||
int max_bin, | ||
SketchContainer* sketch_container) : | ||
device_(device), | ||
n_rows_(n_rows), | ||
param_(std::move(param)), | ||
max_bin_(max_bin), | ||
sketch_container_(sketch_container) { | ||
} | ||
|
||
|
@@ -183,7 +183,7 @@ struct GPUSketcher { | |
} | ||
|
||
constexpr int kFactor = 8; | ||
double eps = 1.0 / (kFactor * param_.max_bin); | ||
double eps = 1.0 / (kFactor * max_bin_); | ||
size_t dummy_nlevel; | ||
WXQSketch::LimitSizeLevel(gpu_batch_nrows_, eps, &dummy_nlevel, &n_cuts_); | ||
|
||
|
@@ -362,7 +362,7 @@ struct GPUSketcher { | |
// add cuts into sketches | ||
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin()); | ||
#pragma omp parallel for default(none) schedule(static) \ | ||
if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT | ||
if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT | ||
trivialfis marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for (int icol = 0; icol < num_cols_; ++icol) { | ||
WXQSketch::SummaryContainer summary; | ||
summary.Reserve(n_cuts_); | ||
|
@@ -403,10 +403,8 @@ struct GPUSketcher { | |
}; | ||
|
||
void SketchBatch(const SparsePage &batch, const MetaInfo &info) { | ||
auto device = generic_param_.gpu_id; | ||
|
||
// create device shard | ||
shard_.reset(new DeviceShard(device, batch.Size(), param_, sketch_container_.get())); | ||
shard_.reset(new DeviceShard(device_, batch.Size(), max_bin_, sketch_container_.get())); | ||
|
||
// compute sketches for the shard | ||
shard_->Init(batch, info, gpu_batch_nrows_); | ||
|
@@ -417,39 +415,40 @@ struct GPUSketcher { | |
row_stride_ = shard_->GetRowStride(); | ||
} | ||
|
||
GPUSketcher(const tree::TrainParam ¶m, const GenericParameter &generic_param, int gpu_nrows) | ||
: param_(param), generic_param_(generic_param), gpu_batch_nrows_(gpu_nrows), row_stride_(0) { | ||
} | ||
GPUSketcher(int device, int max_bin, int gpu_nrows) | ||
: device_(device), max_bin_(max_bin), gpu_batch_nrows_(gpu_nrows), row_stride_(0) {} | ||
|
||
/* Builds the sketches on the GPU for the dmatrix and returns the row stride | ||
* for the entire dataset */ | ||
size_t Sketch(DMatrix *dmat, DenseCuts *hmat) { | ||
const MetaInfo &info = dmat->Info(); | ||
|
||
row_stride_ = 0; | ||
sketch_container_.reset(new SketchContainer(param_, dmat)); | ||
sketch_container_.reset(new SketchContainer(max_bin_, dmat)); | ||
for (const auto &batch : dmat->GetBatches<SparsePage>()) { | ||
this->SketchBatch(batch, info); | ||
} | ||
|
||
hmat->Init(&sketch_container_->sketches_, param_.max_bin); | ||
hmat->Init(&sketch_container_->sketches_, max_bin_); | ||
|
||
return row_stride_; | ||
} | ||
|
||
private: | ||
std::unique_ptr<DeviceShard> shard_; | ||
const tree::TrainParam ¶m_; | ||
const GenericParameter &generic_param_; | ||
const int device_; | ||
const int max_bin_; | ||
int gpu_batch_nrows_; | ||
size_t row_stride_; | ||
std::unique_ptr<SketchContainer> sketch_container_; | ||
}; | ||
|
||
size_t DeviceSketch | ||
(const tree::TrainParam ¶m, const GenericParameter &learner_param, int gpu_batch_nrows, | ||
DMatrix *dmat, HistogramCuts *hmat) { | ||
GPUSketcher sketcher(param, learner_param, gpu_batch_nrows); | ||
size_t DeviceSketch(int device, | ||
int max_bin, | ||
int gpu_batch_nrows, | ||
DMatrix* dmat, | ||
HistogramCuts* hmat) { | ||
GPUSketcher sketcher(device, max_bin, gpu_batch_nrows); | ||
// We only need to return the result in HistogramCuts container, so it is safe to | ||
// use a pointer of local HistogramCutsDense | ||
DenseCuts dense_cuts(hmat); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
/*! | ||
* Copyright 2019 XGBoost contributors | ||
* | ||
* \file ellpack_page.cc | ||
*/ | ||
#ifndef XGBOOST_USE_CUDA | ||
|
||
#include <xgboost/data.h> | ||
|
||
// dummy implementation of EllpackPage in case CUDA is not used | ||
namespace xgboost { | ||
|
||
class EllpackPageImpl {}; | ||
|
||
EllpackPage::EllpackPage(DMatrix* dmat) { | ||
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required"; | ||
} | ||
|
||
EllpackPage::~EllpackPage() { | ||
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required"; | ||
} | ||
|
||
} // namespace xgboost | ||
|
||
#endif // XGBOOST_USE_CUDA |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you explain briefly why this class needs pimpl?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mainly to get CUDA-specific implementation details out of the header, since DMatrix is used all over the place in CPU-only code.