Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write ELLPACK pages to disk #4879

Merged
merged 47 commits into from
Oct 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
6db7333
add ellpack source
rongou Sep 19, 2019
78bceb9
add placeholders
rongou Sep 20, 2019
1f2db17
fix cpu code
rongou Sep 20, 2019
b189287
Merge branch 'master' into ellpack-source
rongou Sep 23, 2019
3143767
add a test for ellpack source
rongou Sep 23, 2019
db549c6
Merge branch 'master' into ellpack-source
rongou Sep 24, 2019
d130311
fix cpu build
rongou Sep 24, 2019
0e83453
more templating
rongou Sep 24, 2019
64e5e86
fix
rongou Sep 24, 2019
b41a5cc
Merge branch 'master' into ellpack-source
rongou Sep 25, 2019
8839c1e
add batch param
rongou Sep 25, 2019
051f868
extract function to parse cache info
rongou Sep 25, 2019
1864847
more place holders
rongou Sep 28, 2019
759a390
better templating
rongou Sep 28, 2019
63d6f4d
fix windows build
rongou Sep 28, 2019
4fb5607
add stubs
rongou Sep 30, 2019
92aa9b8
Merge branch 'master' into ellpack-source
rongou Sep 30, 2019
3b69eaa
Merge branch 'master' into ellpack-source
rongou Oct 2, 2019
c727ed3
implementing push
rongou Oct 2, 2019
b34f3c2
remove member variables from ellpack page impl
rongou Oct 2, 2019
f9af5ec
extract ellpack matrix info struct
rongou Oct 3, 2019
842b201
fix build for older compilers
rongou Oct 3, 2019
39b0a63
initialize ellpack matrix info separately
rongou Oct 3, 2019
6c452bd
add IsDense() to DMatrix
rongou Oct 4, 2019
39c1235
construct ellpack info separately
rongou Oct 4, 2019
20c5764
clear ellpack page
rongou Oct 4, 2019
e74d981
push batch to ellpack page
rongou Oct 4, 2019
a124fbc
write out the page
rongou Oct 4, 2019
308a403
Merge branch 'master' into ellpack-source
rongou Oct 7, 2019
4014f0a
implement size()
rongou Oct 7, 2019
0d35f89
check magic in sparse page source
rongou Oct 7, 2019
1540a47
pimpl ellpack page source
rongou Oct 7, 2019
9744786
narrow ellpack page interface
rongou Oct 7, 2019
aae7b99
make sparse page source reusable
rongou Oct 7, 2019
a87480c
almost there
rongou Oct 7, 2019
f70c7eb
got ellpack batch test passing
rongou Oct 8, 2019
869777f
re-enable hist test
rongou Oct 8, 2019
ec86cf9
working now
rongou Oct 8, 2019
b7f3383
minor cleanups
rongou Oct 8, 2019
fa2d97a
Merge branch 'master' into ellpack-source
rongou Oct 10, 2019
3a77a2e
fix clang tidy error
rongou Oct 10, 2019
020d376
update submodule
rongou Oct 11, 2019
84620d5
add comments
rongou Oct 11, 2019
24919c2
Merge branch 'master' into ellpack-source
rongou Oct 14, 2019
8863c63
Merge branch 'master' into ellpack-source
rongou Oct 16, 2019
da75a23
address review comments
rongou Oct 16, 2019
49b6c42
Merge branch 'master' into ellpack-source
rongou Oct 17, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion amalgamation/xgboost-all0.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@

#if DMLC_ENABLE_STD_THREAD
#include "../src/data/sparse_page_dmatrix.cc"
#include "../src/data/sparse_page_writer.cc"
#endif

// trees
Expand Down
76 changes: 59 additions & 17 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,18 @@ struct Entry {
}
};

/*!
* \brief Parameters for constructing batches.
*
* Passed to DMatrix::GetBatches() and to the EllpackPage constructor that builds a page from an
* existing DMatrix.
*
* The members have no default initializers, so a default-initialized automatic instance
* (`BatchParam p;`) holds indeterminate values. Value-initialize it (`BatchParam{}`), as the
* default argument of DMatrix::GetBatches() does, to get every field set to zero.
*/
struct BatchParam {
/*! \brief The GPU device to use. */
int gpu_id;
/*! \brief Maximum number of bins per feature for histograms. */
int max_bin;
/*! \brief Number of rows in a GPU batch, used for finding quantiles on GPU. */
int gpu_batch_nrows;
};

/*!
* \brief In-memory storage unit of sparse batch, stored in CSR format.
*/
Expand Down Expand Up @@ -195,14 +207,17 @@ class SparsePage {
SparsePage() {
this->Clear();
}
/*! \return number of instance in the page */

/*!
 * \return Number of instances in the page, i.e. one less than the number of entries
 *         stored in `offset`.
 */
inline size_t Size() const {
  const size_t n_offsets = offset.Size();
  return n_offsets - 1;
}

/*!
 * \return Estimated memory cost of this page in bytes: the storage taken by the
 *         offsets plus the storage taken by the entries.
 */
inline size_t MemCostBytes() const {
  const size_t offset_bytes = offset.Size() * sizeof(size_t);
  const size_t data_bytes = data.Size() * sizeof(Entry);
  return offset_bytes + data_bytes;
}

/*! \brief clear the page */
inline void Clear() {
base_rowid = 0;
Expand All @@ -212,6 +227,11 @@ class SparsePage {
data.HostVector().clear();
}

/*! \brief Set the base row id for this page. */
inline void SetBaseRowId(size_t row_id) {
base_rowid = row_id;
}

SparsePage GetTranspose(int num_columns) const;

void SortRows() {
Expand Down Expand Up @@ -242,13 +262,6 @@ class SparsePage {
* \param batch The row batch to be pushed
*/
void PushCSC(const SparsePage& batch);
/*!
* \brief Push one instance into page
* \param inst an instance row
*/
void Push(const Inst &inst);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where has this method been moved to? or, is this permanently deleted?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method is not being used so I removed it. Can add it back if you guys think we should keep it.


size_t Size() { return offset.Size() - 1; }
};

class CSCPage: public SparsePage {
Expand All @@ -272,9 +285,31 @@ class EllpackPageImpl;
*/
class EllpackPage {
public:
explicit EllpackPage(DMatrix* dmat);
/*!
* \brief Default constructor.
*
* This is used in the external memory case. An empty ELLPACK page is constructed with its content
* set later by the reader.
*/
EllpackPage();

/*!
* \brief Constructor from an existing DMatrix.
*
* This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
* in CSR format.
*/
explicit EllpackPage(DMatrix* dmat, const BatchParam& param);

/*! \brief Destructor. */
~EllpackPage();

/*! \return Number of instances in the page. */
size_t Size() const;

/*! \brief Set the base row id for this page. */
void SetBaseRowId(size_t row_id);

const EllpackPageImpl* Impl() const { return impl_.get(); }
EllpackPageImpl* Impl() { return impl_.get(); }

Expand Down Expand Up @@ -360,7 +395,8 @@ class DataSource : public dmlc::DataIter<T> {
* There are two ways to create a customized DMatrix that reads in a user-defined format.
*
* - Provide a dmlc::Parser and pass into the DMatrix::Create
* - Alternatively, if data can be represented by an URL, define a new dmlc::Parser and register by DMLC_REGISTER_DATA_PARSER;
* - Alternatively, if data can be represented by an URL, define a new dmlc::Parser and register by
* DMLC_REGISTER_DATA_PARSER;
* - This works best for user defined data input source, such as data-base, filesystem.
* - Provide a DataSource, that can be passed to DMatrix::Create
* This can be used to reuse an in-memory data structure as a DMatrix.
Expand All @@ -377,7 +413,7 @@ class DMatrix {
* \brief Gets batches. Use range based for loop over BatchSet to access individual batches.
*/
template<typename T>
BatchSet<T> GetBatches();
BatchSet<T> GetBatches(const BatchParam& param = {});
// the following are column meta data, should be able to answer them fast.
/*! \return Whether the data columns are stored in a single column block. */
virtual bool SingleColBlock() const = 0;
Expand All @@ -393,6 +429,12 @@ class DMatrix {
* \return The created DMatrix.
*/
virtual void SaveToLocalFile(const std::string& fname);

/*! \brief Whether the matrix is dense. */
bool IsDense() const {
return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
}

/*!
* \brief Load DMatrix from URI.
* \param uri The URI of input.
Expand Down Expand Up @@ -442,27 +484,27 @@ class DMatrix {
virtual BatchSet<SparsePage> GetRowBatches() = 0;
virtual BatchSet<CSCPage> GetColumnBatches() = 0;
virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0;
virtual BatchSet<EllpackPage> GetEllpackBatches() = 0;
virtual BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) = 0;
};

template<>
inline BatchSet<SparsePage> DMatrix::GetBatches() {
inline BatchSet<SparsePage> DMatrix::GetBatches(const BatchParam&) {
return GetRowBatches();
}

template<>
inline BatchSet<CSCPage> DMatrix::GetBatches() {
inline BatchSet<CSCPage> DMatrix::GetBatches(const BatchParam&) {
return GetColumnBatches();
}

template<>
inline BatchSet<SortedCSCPage> DMatrix::GetBatches() {
inline BatchSet<SortedCSCPage> DMatrix::GetBatches(const BatchParam&) {
return GetSortedColumnBatches();
}

template<>
inline BatchSet<EllpackPage> DMatrix::GetBatches() {
return GetEllpackBatches();
inline BatchSet<EllpackPage> DMatrix::GetBatches(const BatchParam& param) {
return GetEllpackBatches(param);
}
} // namespace xgboost

Expand Down
17 changes: 13 additions & 4 deletions src/common/device_helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -519,16 +519,21 @@ class BulkAllocator {
}

public:
BulkAllocator() = default;
BulkAllocator() = default;
// prevent accidental copying, moving or assignment of this object
BulkAllocator(const BulkAllocator&) = delete;
BulkAllocator(BulkAllocator&&) = delete;
void operator=(const BulkAllocator&) = delete;
void operator=(BulkAllocator&&) = delete;

~BulkAllocator() {
for (size_t i = 0; i < d_ptr_.size(); i++) {
if (!(d_ptr_[i] == nullptr)) {
/*!
* \brief Clear the bulk allocator.
*
* This frees the GPU memory managed by this allocator.
*/
void Clear() {
for (size_t i = 0; i < d_ptr_.size(); i++) { // NOLINT(modernize-loop-convert)
if (d_ptr_[i] != nullptr) {
safe_cuda(cudaSetDevice(device_idx_[i]));
XGBDeviceAllocator<char> allocator;
allocator.deallocate(thrust::device_ptr<char>(d_ptr_[i]), size_[i]);
Expand All @@ -537,6 +542,10 @@ class BulkAllocator {
}
}

/*! \brief Destructor; delegates all cleanup to Clear(). */
~BulkAllocator() { this->Clear(); }

// returns sum of bytes for all allocations
size_t Size() {
return std::accumulate(size_.begin(), size_.end(), static_cast<size_t>(0));
Expand Down
42 changes: 4 additions & 38 deletions src/data/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
#endif // DMLC_ENABLE_STD_THREAD

namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::CSCPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SortedCSCPage>);
DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::EllpackPage>);
} // namespace dmlc

namespace xgboost {
Expand Down Expand Up @@ -298,31 +301,6 @@ DMatrix* DMatrix::Create(std::unique_ptr<DataSource<SparsePage>>&& source,
} // namespace xgboost

namespace xgboost {
data::SparsePageFormat* data::SparsePageFormat::Create(const std::string& name) {
trivialfis marked this conversation as resolved.
Show resolved Hide resolved
auto *e = ::dmlc::Registry< ::xgboost::data::SparsePageFormatReg>::Get()->Find(name);
if (e == nullptr) {
LOG(FATAL) << "Unknown format type " << name;
}
return (e->body)();
}

/*!
 * \brief Extract the pair of format names encoded in a cache prefix.
 *
 * The last occurrence of ".fmt-" in \p cache_prefix marks the start of the format
 * specification, which runs to the end of the string:
 *  - "<prefix>.fmt-A-B" yields (A, B), splitting on the last '-' of the specification;
 *  - "<prefix>.fmt-A" (no further '-') yields (A, A);
 *  - a prefix without ".fmt-" yields ("raw", "raw").
 *
 * \param cache_prefix The cache prefix to inspect.
 * \return The two format names.
 */
std::pair<std::string, std::string>
data::SparsePageFormat::DecideFormat(const std::string& cache_prefix) {
  const size_t marker_pos = cache_prefix.rfind(".fmt-");
  if (marker_pos == std::string::npos) {
    // No explicit format requested: fall back to "raw" for both names.
    const std::string fallback = "raw";
    return std::make_pair(fallback, fallback);
  }

  // Skip the 5 characters of ".fmt-" and keep the remainder of the string.
  const std::string spec = cache_prefix.substr(marker_pos + 5);
  const size_t dash_pos = spec.rfind('-');
  if (dash_pos == std::string::npos) {
    // A single name is used for both elements.
    return std::make_pair(spec, spec);
  }
  return std::make_pair(spec.substr(0, dash_pos), spec.substr(dash_pos + 1));
}
SparsePage SparsePage::GetTranspose(int num_columns) const {
SparsePage transpose;
common::ParallelGroupBuilder<Entry> builder(&transpose.offset.HostVector(),
Expand Down Expand Up @@ -445,18 +423,6 @@ void SparsePage::PushCSC(const SparsePage &batch) {
self_offset = std::move(offset);
}

void SparsePage::Push(const Inst &inst) {
auto& data_vec = data.HostVector();
auto& offset_vec = offset.HostVector();
offset_vec.push_back(offset_vec.back() + inst.size());
size_t begin = data_vec.size();
data_vec.resize(begin + inst.size());
if (inst.size() != 0) {
std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
sizeof(Entry) * inst.size());
}
}

namespace data {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(sparse_page_raw_format);
Expand Down
6 changes: 2 additions & 4 deletions src/data/ellpack_page.cc
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
/*!
* Copyright 2019 XGBoost contributors
*
* \file ellpack_page.cc
*/
#ifndef XGBOOST_USE_CUDA

#include <xgboost/data.h>

// dummy implementation of ELlpackPage in case CUDA is not used
// dummy implementation of EllpackPage in case CUDA is not used
namespace xgboost {

// Empty stand-in for the ELLPACK page implementation; this definition is only active when
// XGBOOST_USE_CUDA is not defined.
class EllpackPageImpl {};

EllpackPage::EllpackPage(DMatrix* dmat) {
EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param) {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
}

Expand Down
Loading