add ellpack page
rongou committed Sep 9, 2019
1 parent 0fc7dcf commit a90d099
Showing 13 changed files with 682 additions and 68 deletions.
54 changes: 41 additions & 13 deletions include/xgboost/data.h
@@ -26,6 +26,8 @@
namespace xgboost {
// forward declare learner.
class LearnerImpl;
+// forward declare dmatrix.
+class DMatrix;

/*! \brief data type accepted by xgboost interface */
enum DataType {
@@ -86,7 +88,7 @@ class MetaInfo {
* \return The pre-defined root index of i-th instance.
*/
inline unsigned GetRoot(size_t i) const {
-    return root_index_.size() != 0 ? root_index_[i] : 0U;
+    return !root_index_.empty() ? root_index_[i] : 0U;
}
/*! \brief get sorted indexes (argsort) of labels by absolute value (used by cox loss) */
inline const std::vector<size_t>& LabelAbsSort() const {
@@ -166,7 +168,7 @@ class SparsePage {
/*! \brief the data of the segments */
HostDeviceVector<Entry> data;

-  size_t base_rowid;
+  size_t base_rowid{};

/*! \brief an instance of sparse vector in the batch */
using Inst = common::Span<Entry const>;
@@ -215,23 +217,23 @@ class SparsePage {
const int nthread = omp_get_max_threads();
builder.InitBudget(num_columns, nthread);
long batch_size = static_cast<long>(this->Size()); // NOLINT(*)
-#pragma omp parallel for schedule(static)
+#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static)
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
auto inst = (*this)[i];
-      for (bst_uint j = 0; j < inst.size(); ++j) {
-        builder.AddBudget(inst[j].index, tid);
+      for (const auto& entry : inst) {
+        builder.AddBudget(entry.index, tid);
}
}
builder.InitStorage();
-#pragma omp parallel for schedule(static)
+#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static)
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
auto inst = (*this)[i];
-      for (bst_uint j = 0; j < inst.size(); ++j) {
+      for (const auto& entry : inst) {
builder.Push(
-            inst[j].index,
-            Entry(static_cast<bst_uint>(this->base_rowid + i), inst[j].fvalue),
+            entry.index,
+            Entry(static_cast<bst_uint>(this->base_rowid + i), entry.fvalue),
tid);
}
}
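
The pragmas above now carry default(none), so every variable used inside the parallel region must be listed in an explicit data-sharing clause; a forgotten variable becomes a compile error instead of an implicitly shared one. A minimal standalone illustration of the idiom (not part of this diff):

// Illustrative sketch of default(none): `n` must be declared shared and
// `total` must appear in a clause (here a reduction), or compilation fails.
#include <omp.h>

long SumUpTo(long n) {
  long total = 0;
#pragma omp parallel for default(none) shared(n) reduction(+ : total) schedule(static)
  for (long i = 0; i < n; ++i) {  // loop index is predetermined private
    total += i;
  }
  return total;
}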
@@ -240,7 +242,7 @@

void SortRows() {
auto ncol = static_cast<bst_omp_uint>(this->Size());
-#pragma omp parallel for schedule(dynamic, 1)
+#pragma omp parallel for default(none) shared(ncol) schedule(dynamic, 1)
for (bst_omp_uint i = 0; i < ncol; ++i) {
if (this->offset.HostVector()[i] < this->offset.HostVector()[i + 1]) {
std::sort(
@@ -287,10 +289,30 @@ class SortedCSCPage : public SparsePage {
explicit SortedCSCPage(SparsePage page) : SparsePage(std::move(page)) {}
};

+class EllpackPageImpl;
+class EllpackPage {
+ public:
+  explicit EllpackPage(DMatrix* dmat);
+  ~EllpackPage();
+  EllpackPage() = delete;
+  // no copy
+  EllpackPage(const EllpackPage&) = delete;
+  EllpackPage& operator=(const EllpackPage&) = delete;
+  // movable
+  EllpackPage(EllpackPage&&) = default;
+  EllpackPage& operator=(EllpackPage&&) = default;
+
+  const EllpackPageImpl* Impl() const { return impl_.get(); }
+  EllpackPageImpl* Impl() { return impl_.get(); }
+
+ private:
+  std::unique_ptr<EllpackPageImpl> impl_;
+};

template<typename T>
class BatchIteratorImpl {
public:
-  virtual ~BatchIteratorImpl() {}
+  virtual ~BatchIteratorImpl() = default;
virtual T& operator*() = 0;
virtual const T& operator*() const = 0;
virtual void operator++() = 0;
@@ -412,7 +434,7 @@ class DMatrix {
bool silent,
bool load_row_split,
const std::string& file_format = "auto",
-                       const size_t page_size = kPageSize);
+                       size_t page_size = kPageSize);

/*!
* \brief create a new DMatrix, by wrapping a row_iterator, and meta info.
@@ -438,7 +460,7 @@
*/
static DMatrix* Create(dmlc::Parser<uint32_t>* parser,
const std::string& cache_prefix = "",
-                         const size_t page_size = kPageSize);
+                         size_t page_size = kPageSize);

/*! \brief page size 32 MB */
static const size_t kPageSize = 32UL << 20UL;
@@ -447,6 +469,7 @@
virtual BatchSet<SparsePage> GetRowBatches() = 0;
virtual BatchSet<CSCPage> GetColumnBatches() = 0;
virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0;
+  virtual BatchSet<EllpackPage> GetEllpackBatches() = 0;
};

template<>
@@ -463,6 +486,11 @@ template<>
inline BatchSet<SortedCSCPage> DMatrix::GetBatches() {
return GetSortedColumnBatches();
}

+template<>
+inline BatchSet<EllpackPage> DMatrix::GetBatches() {
+  return GetEllpackBatches();
+}
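
With this specialization, callers request ELLPACK pages through the same GetBatches interface as the existing page types. A hedged usage sketch, assuming a CUDA build and an already-constructed DMatrix (the CPU stub in ellpack_page.cc below aborts instead):

// Sketch: iterate ELLPACK batches like any other page type. `dmat` is an
// assumed valid DMatrix*; EllpackPageImpl remains opaque at this layer.
#include <xgboost/data.h>

void VisitEllpackPages(xgboost::DMatrix* dmat) {
  for (auto& page : dmat->GetBatches<xgboost::EllpackPage>()) {
    xgboost::EllpackPageImpl* impl = page.Impl();  // device-side data lives here
    (void)impl;  // placeholder for real work on the GPU representation
  }
}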
} // namespace xgboost

namespace dmlc {
43 changes: 21 additions & 22 deletions src/common/hist_util.cu
@@ -99,15 +99,15 @@ struct SketchContainer {
std::vector<std::mutex> col_locks_; // NOLINT
static constexpr int kOmpNumColsParallelizeLimit = 1000;

-  SketchContainer(const tree::TrainParam &param, DMatrix *dmat) :
+  SketchContainer(int max_bin, DMatrix *dmat) :
col_locks_(dmat->Info().num_col_) {
const MetaInfo &info = dmat->Info();
// Initialize Sketches for this dmatrix
sketches_.resize(info.num_col_);
-#pragma omp parallel for default(none) shared(info, param) schedule(static) \
+#pragma omp parallel for default(none) shared(info, max_bin) schedule(static) \
if (info.num_col_ > kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < info.num_col_; ++icol) { // NOLINT
-      sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
+      sketches_[icol].Init(info.num_row_, 1.0 / (8 * max_bin));
}
}

@@ -130,7 +130,7 @@ struct GPUSketcher {
bool has_weights_{false};
size_t row_stride_{0};

-    tree::TrainParam param_;
+    const int max_bin_;
SketchContainer *sketch_container_;
dh::device_vector<size_t> row_ptrs_{};
dh::device_vector<Entry> entries_{};
@@ -148,11 +148,11 @@ struct GPUSketcher {
public:
DeviceShard(int device,
bst_uint n_rows,
-                tree::TrainParam param,
+                int max_bin,
SketchContainer* sketch_container) :
device_(device),
n_rows_(n_rows),
-        param_(std::move(param)),
+        max_bin_(max_bin),
sketch_container_(sketch_container) {
}

@@ -183,7 +183,7 @@ struct GPUSketcher {
}

constexpr int kFactor = 8;
-      double eps = 1.0 / (kFactor * param_.max_bin);
+      double eps = 1.0 / (kFactor * max_bin_);
size_t dummy_nlevel;
WXQSketch::LimitSizeLevel(gpu_batch_nrows_, eps, &dummy_nlevel, &n_cuts_);

@@ -362,7 +362,7 @@ struct GPUSketcher {
// add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
#pragma omp parallel for default(none) schedule(static) \
-    if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT
+    if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit)  // NOLINT
for (int icol = 0; icol < num_cols_; ++icol) {
WXQSketch::SummaryContainer summary;
summary.Reserve(n_cuts_);
@@ -403,10 +403,8 @@ struct GPUSketcher {
};

void SketchBatch(const SparsePage &batch, const MetaInfo &info) {
-    auto device = generic_param_.gpu_id;
-
// create device shard
-    shard_.reset(new DeviceShard(device, batch.Size(), param_, sketch_container_.get()));
+    shard_.reset(new DeviceShard(device_, batch.Size(), max_bin_, sketch_container_.get()));

// compute sketches for the shard
shard_->Init(batch, info, gpu_batch_nrows_);
@@ -417,39 +415,40 @@ struct GPUSketcher {
row_stride_ = shard_->GetRowStride();
}

-  GPUSketcher(const tree::TrainParam &param, const GenericParameter &generic_param, int gpu_nrows)
-      : param_(param), generic_param_(generic_param), gpu_batch_nrows_(gpu_nrows), row_stride_(0) {
-  }
+  GPUSketcher(int device, int max_bin, int gpu_nrows)
+      : device_(device), max_bin_(max_bin), gpu_batch_nrows_(gpu_nrows), row_stride_(0) {}

/* Builds the sketches on the GPU for the dmatrix and returns the row stride
* for the entire dataset */
size_t Sketch(DMatrix *dmat, DenseCuts *hmat) {
const MetaInfo &info = dmat->Info();

row_stride_ = 0;
-    sketch_container_.reset(new SketchContainer(param_, dmat));
+    sketch_container_.reset(new SketchContainer(max_bin_, dmat));
for (const auto &batch : dmat->GetBatches<SparsePage>()) {
this->SketchBatch(batch, info);
}

-    hmat->Init(&sketch_container_->sketches_, param_.max_bin);
+    hmat->Init(&sketch_container_->sketches_, max_bin_);

return row_stride_;
}

private:
std::unique_ptr<DeviceShard> shard_;
-  const tree::TrainParam &param_;
-  const GenericParameter &generic_param_;
+  const int device_;
+  const int max_bin_;
int gpu_batch_nrows_;
size_t row_stride_;
std::unique_ptr<SketchContainer> sketch_container_;
};

-size_t DeviceSketch
-    (const tree::TrainParam &param, const GenericParameter &learner_param, int gpu_batch_nrows,
-     DMatrix *dmat, HistogramCuts *hmat) {
-  GPUSketcher sketcher(param, learner_param, gpu_batch_nrows);
+size_t DeviceSketch(int device,
+                    int max_bin,
+                    int gpu_batch_nrows,
+                    DMatrix* dmat,
+                    HistogramCuts* hmat) {
+  GPUSketcher sketcher(device, max_bin, gpu_batch_nrows);
// We only need to return the result in HistogramCuts container, so it is safe to
// use a pointer of local HistogramCutsDense
DenseCuts dense_cuts(hmat);
9 changes: 5 additions & 4 deletions src/common/hist_util.h
@@ -290,10 +290,11 @@ class DenseCuts : public CutsBuilder {
*
* \return The row stride across the entire dataset.
*/
-size_t DeviceSketch
-    (const tree::TrainParam& param, const GenericParameter &learner_param, int gpu_batch_nrows,
-     DMatrix* dmat, HistogramCuts* hmat);
-
+size_t DeviceSketch(int device,
+                    int max_bin,
+                    int gpu_batch_nrows,
+                    DMatrix* dmat,
+                    HistogramCuts* hmat);
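
A call site for the refactored signature then needs only the device ordinal and bin count; a sketch with illustrative values (the namespace and a constructed dmat are assumed):

// Hypothetical call: sketch quantiles for 256 bins on device 0; the returned
// value is the row stride across the whole dataset. Values are illustrative.
xgboost::common::HistogramCuts cuts;
size_t row_stride =
    xgboost::common::DeviceSketch(/*device=*/0, /*max_bin=*/256,
                                  /*gpu_batch_nrows=*/0, dmat, &cuts);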

/*!
* \brief preprocessed global index matrix, in CSR format
25 changes: 25 additions & 0 deletions src/data/ellpack_page.cc
@@ -0,0 +1,25 @@
+/*!
+ * Copyright 2019 XGBoost contributors
+ *
+ * \file ellpack_page.cc
+ */
+#ifndef XGBOOST_USE_CUDA
+
+#include <xgboost/data.h>
+
+// dummy implementation of EllpackPage in case CUDA is not used
+namespace xgboost {
+
+class EllpackPageImpl {};
+
+EllpackPage::EllpackPage(DMatrix* dmat) {
+  LOG(FATAL) << "Not implemented.";
+}
+
+EllpackPage::~EllpackPage() {
+  LOG(FATAL) << "Not implemented.";
+}
+
+}  // namespace xgboost
+
+#endif  // XGBOOST_USE_CUDA
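
For context, ELLPACK stores a sparse matrix densely with a fixed row stride, the length of the longest row, padding shorter rows so that row i always starts at i * row_stride; the constant stride is what makes GPU row access regular. A minimal illustrative sketch of the packing, not the actual EllpackPageImpl layout:

// Illustrative ELLPACK packing: each row gets `row_stride` slots; unused
// slots hold a sentinel so lookups can stop at padding.
#include <cstddef>
#include <limits>
#include <vector>

std::vector<unsigned> ToEllpack(const std::vector<std::vector<unsigned>>& rows,
                                std::size_t row_stride) {
  const unsigned kSentinel = std::numeric_limits<unsigned>::max();
  std::vector<unsigned> buffer(rows.size() * row_stride, kSentinel);
  for (std::size_t i = 0; i < rows.size(); ++i) {
    for (std::size_t j = 0; j < rows[i].size() && j < row_stride; ++j) {
      buffer[i * row_stride + j] = rows[i][j];  // e.g. compressed bin index
    }
  }
  return buffer;
}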