add ellpack page #4833

Merged · 11 commits · Sep 17, 2019
1 change: 1 addition & 0 deletions amalgamation/xgboost-all0.cc
@@ -29,6 +29,7 @@

// data
#include "../src/data/data.cc"
#include "../src/data/ellpack_page.cc"
#include "../src/data/simple_csr_source.cc"
#include "../src/data/simple_dmatrix.cc"
#include "../src/data/sparse_page_raw_format.cc"
47 changes: 34 additions & 13 deletions include/xgboost/data.h
@@ -26,6 +26,8 @@
namespace xgboost {
// forward declare learner.
class LearnerImpl;
// forward declare dmatrix.
class DMatrix;

/*! \brief data type accepted by xgboost interface */
enum DataType {
@@ -86,7 +88,7 @@ class MetaInfo {
* \return The pre-defined root index of i-th instance.
*/
inline unsigned GetRoot(size_t i) const {
return root_index_.size() != 0 ? root_index_[i] : 0U;
return !root_index_.empty() ? root_index_[i] : 0U;
}
/*! \brief get sorted indexes (argsort) of labels by absolute value (used by cox loss) */
inline const std::vector<size_t>& LabelAbsSort() const {
@@ -166,7 +168,7 @@ class SparsePage {
/*! \brief the data of the segments */
HostDeviceVector<Entry> data;

size_t base_rowid;
size_t base_rowid{};

/*! \brief an instance of sparse vector in the batch */
using Inst = common::Span<Entry const>;
@@ -215,23 +217,23 @@
const int nthread = omp_get_max_threads();
builder.InitBudget(num_columns, nthread);
long batch_size = static_cast<long>(this->Size()); // NOLINT(*)
#pragma omp parallel for schedule(static)
#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static)
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
auto inst = (*this)[i];
for (bst_uint j = 0; j < inst.size(); ++j) {
builder.AddBudget(inst[j].index, tid);
for (const auto& entry : inst) {
builder.AddBudget(entry.index, tid);
}
}
builder.InitStorage();
#pragma omp parallel for schedule(static)
#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static)
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
auto inst = (*this)[i];
for (bst_uint j = 0; j < inst.size(); ++j) {
for (const auto& entry : inst) {
builder.Push(
inst[j].index,
Entry(static_cast<bst_uint>(this->base_rowid + i), inst[j].fvalue),
entry.index,
Entry(static_cast<bst_uint>(this->base_rowid + i), entry.fvalue),
tid);
}
}
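A note on the recurring pragma change in this PR: adding default(none) forces every variable used in the parallel region to have an explicit data-sharing clause (here shared(batch_size, builder)), so a future edit that captures a new variable fails to compile instead of being silently shared across threads.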
@@ -240,7 +242,7 @@

void SortRows() {
auto ncol = static_cast<bst_omp_uint>(this->Size());
#pragma omp parallel for schedule(dynamic, 1)
#pragma omp parallel for default(none) shared(ncol) schedule(dynamic, 1)
for (bst_omp_uint i = 0; i < ncol; ++i) {
if (this->offset.HostVector()[i] < this->offset.HostVector()[i + 1]) {
std::sort(
@@ -287,10 +289,23 @@ class SortedCSCPage : public SparsePage {
explicit SortedCSCPage(SparsePage page) : SparsePage(std::move(page)) {}
};

class EllpackPageImpl;
[Member] Can you explain briefly why this class needs pimpl?

[Contributor Author] Mainly to get CUDA-specific implementation details out of the header, since DMatrix is used all over the place in CPU-only code.

class EllpackPage {
[Contributor] Should this be more generalized so we can support multiple binned matrix types fairly easily? Perhaps make this an adapter, with a factory method to create the underlying implementation:

struct binned_page {
   // common interface applicable to all pages
   virtual int num_bins() = 0;
   virtual int get_feature_bin(int bidx) = 0;
   virtual float get_feature_value(int fbin) = 0;
   // specifics for a page
   virtual float get_feature_value(int ridx, int fidx);
   // etc...
};

struct ellpack_page : binned_page {
   // realize interface
};

struct csr_page : binned_page {
   // realize interface
};

// etc...
static binned_page *create_binned_page(pertinent_data_after_quantile_generation);

[Contributor Author] We can probably wait until we have another binned page.

[Member] Agree, let's keep this in mind but not for this PR.

public:
explicit EllpackPage(DMatrix* dmat);
~EllpackPage();

const EllpackPageImpl* Impl() const { return impl_.get(); }
EllpackPageImpl* Impl() { return impl_.get(); }

private:
std::unique_ptr<EllpackPageImpl> impl_;
};
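To make the pimpl split concrete, here is a minimal sketch of the CUDA side; the EllpackPageImpl members and constructor body are illustrative assumptions, not the PR's actual implementation:

// ellpack_page.cu -- compiled only in CUDA builds, so device-specific
// details never appear in the CPU-visible header.
#include <xgboost/data.h>

namespace xgboost {

class EllpackPageImpl {
 public:
  explicit EllpackPageImpl(DMatrix* dmat) {
    // build the device-side ELLPACK buffer from dmat (illustrative)
  }
  // device memory handles, row stride, bin cuts, etc.
};

EllpackPage::EllpackPage(DMatrix* dmat) : impl_{new EllpackPageImpl(dmat)} {}

// Defined out of line, where EllpackPageImpl is a complete type; this is
// what lets the header hold std::unique_ptr<EllpackPageImpl> against only
// a forward declaration.
EllpackPage::~EllpackPage() = default;

}  // namespace xgboost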

template<typename T>
class BatchIteratorImpl {
public:
virtual ~BatchIteratorImpl() {}
virtual ~BatchIteratorImpl() = default;
virtual T& operator*() = 0;
virtual const T& operator*() const = 0;
virtual void operator++() = 0;
@@ -412,7 +427,7 @@ class DMatrix {
bool silent,
bool load_row_split,
const std::string& file_format = "auto",
const size_t page_size = kPageSize);
size_t page_size = kPageSize);

/*!
* \brief create a new DMatrix, by wrapping a row_iterator, and meta info.
@@ -438,7 +453,7 @@
*/
static DMatrix* Create(dmlc::Parser<uint32_t>* parser,
const std::string& cache_prefix = "",
const size_t page_size = kPageSize);
size_t page_size = kPageSize);

/*! \brief page size 32 MB */
static const size_t kPageSize = 32UL << 20UL;
@@ -447,6 +462,7 @@
virtual BatchSet<SparsePage> GetRowBatches() = 0;
virtual BatchSet<CSCPage> GetColumnBatches() = 0;
virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0;
virtual BatchSet<EllpackPage> GetEllpackBatches() = 0;
};

template<>
@@ -463,6 +479,11 @@ template<>
inline BatchSet<SortedCSCPage> DMatrix::GetBatches() {
return GetSortedColumnBatches();
}

template<>
inline BatchSet<EllpackPage> DMatrix::GetBatches() {
return GetEllpackBatches();
}
} // namespace xgboost

namespace dmlc {
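With the EllpackPage specialization in place, callers can iterate ELLPACK batches the same way the existing SparsePage loop later in this diff does; a hypothetical caller, assuming a valid DMatrix* dmat:

for (const auto& page : dmat->GetBatches<EllpackPage>()) {
  const EllpackPageImpl* impl = page.Impl();
  // hand the device-side ELLPACK representation to the GPU hist builder
}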
43 changes: 21 additions & 22 deletions src/common/hist_util.cu
@@ -99,15 +99,15 @@ struct SketchContainer {
std::vector<std::mutex> col_locks_; // NOLINT
static constexpr int kOmpNumColsParallelizeLimit = 1000;

SketchContainer(const tree::TrainParam &param, DMatrix *dmat) :
SketchContainer(int max_bin, DMatrix *dmat) :
col_locks_(dmat->Info().num_col_) {
const MetaInfo &info = dmat->Info();
// Initialize Sketches for this dmatrix
sketches_.resize(info.num_col_);
#pragma omp parallel for default(none) shared(info, param) schedule(static) \
#pragma omp parallel for default(none) shared(info, max_bin) schedule(static) \
if (info.num_col_ > kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < info.num_col_; ++icol) { // NOLINT
sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
sketches_[icol].Init(info.num_row_, 1.0 / (8 * max_bin));
}
}
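For scale: with XGBoost's default max_bin of 256, each sketch is initialized with eps = 1 / (8 * 256) = 1/2048 ≈ 4.9e-4, i.e. a rank-error budget of roughly 0.05% of num_row_.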

@@ -130,7 +130,7 @@ struct GPUSketcher {
bool has_weights_{false};
size_t row_stride_{0};

tree::TrainParam param_;
const int max_bin_;
SketchContainer *sketch_container_;
dh::device_vector<size_t> row_ptrs_{};
dh::device_vector<Entry> entries_{};
@@ -148,11 +148,11 @@ struct GPUSketcher {
public:
DeviceShard(int device,
[Member] This DeviceShard class is now redundant right?

[Contributor Author] Yes there are 4-5 places we still have a DeviceShard that's no longer necessary. I'll send out a followup PR to clean them up.

bst_uint n_rows,
tree::TrainParam param,
int max_bin,
SketchContainer* sketch_container) :
device_(device),
n_rows_(n_rows),
param_(std::move(param)),
max_bin_(max_bin),
sketch_container_(sketch_container) {
}

@@ -183,7 +183,7 @@ struct GPUSketcher {
}

constexpr int kFactor = 8;
double eps = 1.0 / (kFactor * param_.max_bin);
double eps = 1.0 / (kFactor * max_bin_);
size_t dummy_nlevel;
WXQSketch::LimitSizeLevel(gpu_batch_nrows_, eps, &dummy_nlevel, &n_cuts_);

@@ -362,7 +362,7 @@ struct GPUSketcher {
// add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
#pragma omp parallel for default(none) schedule(static) \
if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT
if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT
for (int icol = 0; icol < num_cols_; ++icol) {
WXQSketch::SummaryContainer summary;
summary.Reserve(n_cuts_);
Expand Down Expand Up @@ -403,10 +403,8 @@ struct GPUSketcher {
};

void SketchBatch(const SparsePage &batch, const MetaInfo &info) {
auto device = generic_param_.gpu_id;

// create device shard
shard_.reset(new DeviceShard(device, batch.Size(), param_, sketch_container_.get()));
shard_.reset(new DeviceShard(device_, batch.Size(), max_bin_, sketch_container_.get()));

// compute sketches for the shard
shard_->Init(batch, info, gpu_batch_nrows_);
@@ -417,39 +415,40 @@
row_stride_ = shard_->GetRowStride();
}

GPUSketcher(const tree::TrainParam &param, const GenericParameter &generic_param, int gpu_nrows)
: param_(param), generic_param_(generic_param), gpu_batch_nrows_(gpu_nrows), row_stride_(0) {
}
GPUSketcher(int device, int max_bin, int gpu_nrows)
: device_(device), max_bin_(max_bin), gpu_batch_nrows_(gpu_nrows), row_stride_(0) {}

/* Builds the sketches on the GPU for the dmatrix and returns the row stride
* for the entire dataset */
size_t Sketch(DMatrix *dmat, DenseCuts *hmat) {
const MetaInfo &info = dmat->Info();

row_stride_ = 0;
sketch_container_.reset(new SketchContainer(param_, dmat));
sketch_container_.reset(new SketchContainer(max_bin_, dmat));
for (const auto &batch : dmat->GetBatches<SparsePage>()) {
this->SketchBatch(batch, info);
}

hmat->Init(&sketch_container_->sketches_, param_.max_bin);
hmat->Init(&sketch_container_->sketches_, max_bin_);

return row_stride_;
}

private:
std::unique_ptr<DeviceShard> shard_;
const tree::TrainParam &param_;
const GenericParameter &generic_param_;
const int device_;
const int max_bin_;
int gpu_batch_nrows_;
size_t row_stride_;
std::unique_ptr<SketchContainer> sketch_container_;
};

size_t DeviceSketch
(const tree::TrainParam &param, const GenericParameter &learner_param, int gpu_batch_nrows,
DMatrix *dmat, HistogramCuts *hmat) {
GPUSketcher sketcher(param, learner_param, gpu_batch_nrows);
size_t DeviceSketch(int device,
int max_bin,
int gpu_batch_nrows,
DMatrix* dmat,
HistogramCuts* hmat) {
GPUSketcher sketcher(device, max_bin, gpu_batch_nrows);
// We only need to return the result in HistogramCuts container, so it is safe to
// use a pointer of local HistogramCutsDense
DenseCuts dense_cuts(hmat);
9 changes: 5 additions & 4 deletions src/common/hist_util.h
@@ -290,10 +290,11 @@ class DenseCuts : public CutsBuilder {
*
* \return The row stride across the entire dataset.
*/
size_t DeviceSketch
(const tree::TrainParam& param, const GenericParameter &learner_param, int gpu_batch_nrows,
DMatrix* dmat, HistogramCuts* hmat);

size_t DeviceSketch(int device,
int max_bin,
int gpu_batch_nrows,
DMatrix* dmat,
HistogramCuts* hmat);

/*!
* \brief preprocessed global index matrix, in CSR format
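A sketch of calling the refactored entry point; the argument values here are illustrative, and in this PR the device id and bin count come from the caller's GenericParameter::gpu_id and TrainParam::max_bin rather than literals:

HistogramCuts cuts;
size_t row_stride = DeviceSketch(/*device=*/0, /*max_bin=*/256,
                                 /*gpu_batch_nrows=*/0, dmat, &cuts);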
25 changes: 25 additions & 0 deletions src/data/ellpack_page.cc
@@ -0,0 +1,25 @@
/*!
* Copyright 2019 XGBoost contributors
*
* \file ellpack_page.cc
*/
#ifndef XGBOOST_USE_CUDA

#include <xgboost/data.h>

// dummy implementation of EllpackPage in case CUDA is not used
namespace xgboost {

class EllpackPageImpl {};

EllpackPage::EllpackPage(DMatrix* dmat) {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
}

EllpackPage::~EllpackPage() {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
}

} // namespace xgboost

#endif // XGBOOST_USE_CUDA
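Because the whole file is guarded by #ifndef XGBOOST_USE_CUDA, a CPU-only build (which pulls this file in via the amalgamation change above) still links the EllpackPage symbols but aborts with a clear message if one is ever constructed; in a CUDA build this translation unit compiles to nothing and the real definitions are expected to come from the corresponding .cu file.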