Skip to content

Commit

Permalink
Support external memory in CPU histogram building. (#7372)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Nov 22, 2021
1 parent d33854a commit 176110a
Show file tree
Hide file tree
Showing 6 changed files with 304 additions and 161 deletions.
171 changes: 92 additions & 79 deletions src/common/hist_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -133,148 +133,161 @@ struct Prefetch {

constexpr size_t Prefetch::kNoPrefetchSize;


template<typename FPType, bool do_prefetch, typename BinIdxType, bool any_missing = true>
void BuildHistKernel(const std::vector<GradientPair>& gpair,
template <typename FPType, bool do_prefetch, typename BinIdxType,
bool first_page, bool any_missing = true>
void BuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow<FPType> hist) {
const GHistIndexMatrix &gmat, GHistRow<FPType> hist) {
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
const size_t* row_ptr = gmat.row_ptr.data();
const uint32_t* offsets = gmat.index.Offset();
const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a singe row FP array
const size_t *rid = row_indices.begin;
auto const *pgh = reinterpret_cast<const float *>(gpair.data());
const BinIdxType *gradient_index = gmat.index.data<BinIdxType>();

auto const &row_ptr = gmat.row_ptr.data();
auto base_rowid = gmat.base_rowid;
const uint32_t *offsets = gmat.index.Offset();
auto get_row_ptr = [&](size_t ridx) {
return first_page ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](size_t ridx) {
return first_page ? ridx : (ridx - base_rowid);
};

const size_t n_features =
get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]);
auto hist_data = reinterpret_cast<FPType *>(hist.data());
const uint32_t two{2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a singe row FP array

for (size_t i = 0; i < size; ++i) {
const size_t icol_start = any_missing ? row_ptr[rid[i]] : rid[i] * n_features;
const size_t icol_end = any_missing ? row_ptr[rid[i]+1] : icol_start + n_features;
const size_t icol_start =
any_missing ? get_row_ptr(rid[i]) : get_rid(rid[i]) * n_features;
const size_t icol_end =
any_missing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;

const size_t row_size = icol_end - icol_start;
const size_t idx_gh = two * rid[i];

if (do_prefetch) {
const size_t icol_start_prefetch = any_missing ? row_ptr[rid[i+Prefetch::kPrefetchOffset]] :
rid[i + Prefetch::kPrefetchOffset] * n_features;
const size_t icol_end_prefetch = any_missing ? row_ptr[rid[i+Prefetch::kPrefetchOffset]+1] :
icol_start_prefetch + n_features;
const size_t icol_start_prefetch =
any_missing
? get_row_ptr(rid[i + Prefetch::kPrefetchOffset])
: get_rid(rid[i + Prefetch::kPrefetchOffset]) * n_features;
const size_t icol_end_prefetch =
any_missing ? get_row_ptr(rid[i + Prefetch::kPrefetchOffset] + 1)
: icol_start_prefetch + n_features;

PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prefetch; j < icol_end_prefetch;
j+=Prefetch::GetPrefetchStep<uint32_t>()) {
j += Prefetch::GetPrefetchStep<uint32_t>()) {
PREFETCH_READ_T0(gradient_index + j);
}
}
const BinIdxType* gr_index_local = gradient_index + icol_start;
const BinIdxType *gr_index_local = gradient_index + icol_start;

for (size_t j = 0; j < row_size; ++j) {
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[j]) + (
any_missing ? 0 : offsets[j]));

hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[j]) +
(any_missing ? 0 : offsets[j]));
hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin + 1] += pgh[idx_gh + 1];
}
}
}

template<typename FPType, bool do_prefetch, bool any_missing>
void BuildHistDispatch(const std::vector<GradientPair>& gpair,
template <typename FPType, bool do_prefetch, bool any_missing>
void BuildHistDispatch(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, GHistRow<FPType> hist) {
switch (gmat.index.GetBinTypeSize()) {
const GHistIndexMatrix &gmat, GHistRow<FPType> hist) {
auto first_page = gmat.base_rowid == 0;
if (first_page) {
switch (gmat.index.GetBinTypeSize()) {
case kUint8BinsTypeSize:
BuildHistKernel<FPType, do_prefetch, uint8_t, any_missing>(gpair, row_indices,
gmat, hist);
BuildHistKernel<FPType, do_prefetch, uint8_t, true, any_missing>(
gpair, row_indices, gmat, hist);
break;
case kUint16BinsTypeSize:
BuildHistKernel<FPType, do_prefetch, uint16_t, any_missing>(gpair, row_indices,
gmat, hist);
BuildHistKernel<FPType, do_prefetch, uint16_t, true, any_missing>(
gpair, row_indices, gmat, hist);
break;
case kUint32BinsTypeSize:
BuildHistKernel<FPType, do_prefetch, uint32_t, any_missing>(gpair, row_indices,
gmat, hist);
BuildHistKernel<FPType, do_prefetch, uint32_t, true, any_missing>(
gpair, row_indices, gmat, hist);
break;
default:
CHECK(false); // no default behavior
}
} else {
switch (gmat.index.GetBinTypeSize()) {
case kUint8BinsTypeSize:
BuildHistKernel<FPType, do_prefetch, uint8_t, false, any_missing>(
gpair, row_indices, gmat, hist);
break;
case kUint16BinsTypeSize:
BuildHistKernel<FPType, do_prefetch, uint16_t, false, any_missing>(
gpair, row_indices, gmat, hist);
break;
case kUint32BinsTypeSize:
BuildHistKernel<FPType, do_prefetch, uint32_t, false, any_missing>(
gpair, row_indices, gmat, hist);
break;
default:
CHECK(false); // no default behavior
}
}
}

template <typename GradientSumT>
template <bool any_missing>
void GHistBuilder<GradientSumT>::BuildHist(
const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat,
GHistRowT hist) {
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRowT hist) const {
const size_t nrows = row_indices.Size();
const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);

// if need to work with all rows from bin-matrix (e.g. root node)
const bool contiguousBlock = (row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);
const bool contiguousBlock =
(row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);

if (contiguousBlock) {
// contiguous memory access, built-in HW prefetching is enough
BuildHistDispatch<GradientSumT, false, any_missing>(gpair, row_indices, gmat, hist);
BuildHistDispatch<GradientSumT, false, any_missing>(gpair, row_indices,
gmat, hist);
} else {
const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size);
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end);
const RowSetCollection::Elem span1(row_indices.begin,
row_indices.end - no_prefetch_size);
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size,
row_indices.end);

BuildHistDispatch<GradientSumT, true, any_missing>(gpair, span1, gmat, hist);
BuildHistDispatch<GradientSumT, true, any_missing>(gpair, span1, gmat,
hist);
// no prefetching to avoid loading extra memory
BuildHistDispatch<GradientSumT, false, any_missing>(gpair, span2, gmat, hist);
BuildHistDispatch<GradientSumT, false, any_missing>(gpair, span2, gmat,
hist);
}
}

template void
GHistBuilder<float>::BuildHist<true>(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat,
GHistRow<float> hist);
GHistRow<float> hist) const;
template void
GHistBuilder<float>::BuildHist<false>(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat,
GHistRow<float> hist);
GHistRow<float> hist) const;
template void
GHistBuilder<double>::BuildHist<true>(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat,
GHistRow<double> hist);
GHistRow<double> hist) const;
template void
GHistBuilder<double>::BuildHist<false>(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat,
GHistRow<double> hist);

template<typename GradientSumT>
void GHistBuilder<GradientSumT>::SubtractionTrick(GHistRowT self,
GHistRowT sibling,
GHistRowT parent) {
const size_t size = self.size();
CHECK_EQ(sibling.size(), size);
CHECK_EQ(parent.size(), size);

const size_t block_size = 1024; // aproximatly 1024 values per block
size_t n_blocks = size/block_size + !!(size%block_size);

ParallelFor(omp_ulong(n_blocks), [&](omp_ulong iblock) {
const size_t ibegin = iblock*block_size;
const size_t iend = (((iblock+1)*block_size > size) ? size : ibegin + block_size);
SubtractionHist(self, parent, sibling, ibegin, iend);
});
}
template
void GHistBuilder<float>::SubtractionTrick(GHistRow<float> self,
GHistRow<float> sibling,
GHistRow<float> parent);
template
void GHistBuilder<double>::SubtractionTrick(GHistRow<double> self,
GHistRow<double> sibling,
GHistRow<double> parent);

GHistRow<double> hist) const;
} // namespace common
} // namespace xgboost
18 changes: 5 additions & 13 deletions src/common/hist_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ class ParallelGHistBuilder {
}

// Reduce following bins (begin, end] for nid-node in dst across threads
void ReduceHist(size_t nid, size_t begin, size_t end) {
void ReduceHist(size_t nid, size_t begin, size_t end) const {
CHECK_GT(end, begin);
CHECK_LT(nid, nodes_);

Expand All @@ -486,7 +486,6 @@ class ParallelGHistBuilder {
}
}

protected:
void MatchThreadsToNodes(const BlockedSpace2d& space) {
const size_t space_size = space.Size();
const size_t chunck_size = space_size / nthreads_ + !!(space_size % nthreads_);
Expand Down Expand Up @@ -533,6 +532,7 @@ class ParallelGHistBuilder {
}
}

private:
void MatchNodeNidPairToHist() {
size_t hist_allocated_additionally = 0;

Expand Down Expand Up @@ -586,26 +586,18 @@ class GHistBuilder {
using GHistRowT = GHistRow<GradientSumT>;

GHistBuilder() = default;
GHistBuilder(size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}
explicit GHistBuilder(uint32_t nbins): nbins_{nbins} {}

// construct a histogram via histogram aggregation
template <bool any_missing>
void BuildHist(const std::vector<GradientPair>& gpair,
void BuildHist(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRowT hist);
// construct a histogram via subtraction trick
void SubtractionTrick(GHistRowT self,
GHistRowT sibling,
GHistRowT parent);

const GHistIndexMatrix &gmat, GHistRowT hist) const;
uint32_t GetNumBins() const {
return nbins_;
}

private:
/*! \brief number of threads for parallel computation */
size_t nthread_ { 0 };
/*! \brief number of all bins over all features */
uint32_t nbins_ { 0 };
};
Expand Down
Loading

0 comments on commit 176110a

Please sign in to comment.