-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement cudf construction with adapters. (#5189)
- Loading branch information
1 parent
9559f81
commit 87ebfc1
Showing
14 changed files
with
705 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
// Copyright (c) 2014-2019 by Contributors | ||
|
||
#include "xgboost/data.h" | ||
#include "xgboost/c_api.h" | ||
#include "c_api_error.h" | ||
#include "../data/simple_csr_source.h" | ||
#include "../data/device_adapter.cuh" | ||
|
||
namespace xgboost { | ||
/*!
 * \brief Build a DMatrix from a JSON string describing device columns.
 *
 * The string is handed to data::CudfAdapter, which parses it, and the adapter
 * is then passed to DMatrix::Create.
 *
 * \param c_json_strs NUL-terminated JSON text; must not be null.
 * \param has_missing Not used by this function.
 * \param missing     Forwarded to DMatrix::Create as its second argument.
 * \param out         Receives a heap-allocated std::shared_ptr<DMatrix>;
 *                    must not be null.
 * \return The value produced by the API_BEGIN()/API_END() macro pair
 *         (presumably 0 on success and non-zero when an error was caught --
 *         confirm against c_api_error.h).
 */
XGB_DLL int XGDMatrixCreateFromArrayInterfaces(char const* c_json_strs,
                                               bst_int has_missing,
                                               bst_float missing,
                                               DMatrixHandle* out) {
  API_BEGIN();
  // Constructing std::string from a null pointer is undefined behaviour, and
  // the handle is written unconditionally below, so reject null arguments
  // here where the failure can still be reported to the caller.
  CHECK(c_json_strs != nullptr) << "Invalid pointer argument: c_json_strs";
  CHECK(out != nullptr) << "Invalid pointer argument: out";
  std::string json_str{c_json_strs};
  data::CudfAdapter adapter(json_str);
  // NOTE(review): the literal 1 is presumably the thread count -- confirm
  // against the declaration of DMatrix::Create.
  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, 1));
  API_END();
}
} // namespace xgboost |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
/*! | ||
* Copyright (c) 2019 by Contributors | ||
* \file device_adapter.cuh | ||
*/ | ||
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_ | ||
#define XGBOOST_DATA_DEVICE_ADAPTER_H_ | ||
#include <limits> | ||
#include <memory> | ||
#include <string> | ||
#include "columnar.h" | ||
#include "adapter.h" | ||
#include "../common/device_helpers.cuh" | ||
|
||
namespace xgboost { | ||
namespace data { | ||
|
||
/*!
 * \brief Lightweight view over a set of columns, addressed by a flat index.
 *
 * The flat index space is the concatenation of all columns: column_ptr holds
 * the starting flat index of each column, and GetElement() maps a flat index
 * back to a (row, column, value) triple. The class stores spans only and does
 * not own the memory they point to.
 */
class CudfAdapterBatch : public detail::NoMetaInfo {
 public:
  CudfAdapterBatch() = default;
  /*!
   * \param columns      Column descriptors.
   * \param column_ptr   Starting flat index of every column, followed by the
   *                     total element count.
   * \param num_elements Total number of elements across all columns.
   */
  CudfAdapterBatch(common::Span<Columnar> columns,
                   common::Span<size_t> column_ptr, size_t num_elements)
      : columns_(columns),
        column_ptr_(column_ptr),
        num_elements_(num_elements) {}

  /*! \brief Total number of elements addressable through GetElement(). */
  size_t Size() const { return num_elements_; }

  /*!
   * \brief Fetch the element at flat index idx.
   *
   * An element whose validity bit is cleared is returned with a NaN value;
   * a column without a validity mask is treated as fully valid.
   */
  __device__ COOTuple GetElement(size_t idx) const {
    // The owning column is the last one whose start offset is <= idx.
    size_t column_idx =
        dh::UpperBound(column_ptr_.data(), column_ptr_.size(), idx) - 1;
    auto& column = columns_[column_idx];
    size_t row_idx = idx - column_ptr_[column_idx];
    float value = column.valid.Data() == nullptr || column.valid.Check(row_idx)
                      ? column.GetElement(row_idx)
                      : std::numeric_limits<float>::quiet_NaN();
    return COOTuple(row_idx, column_idx, value);
  }

 private:
  common::Span<Columnar> columns_;
  common::Span<size_t> column_ptr_;
  // Zero by default so that Size() on a default-constructed batch is
  // well-defined instead of reading an indeterminate value.
  size_t num_elements_{0};
};
|
||
/*!
 * \brief Single-batch adapter built from a JSON list of column descriptions.
 *
 * The constructor parses the JSON text, validates the columns, copies the
 * column descriptors and their start offsets into device vectors owned by
 * this object, and exposes them through one CudfAdapterBatch.
 */
class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
 public:
  /*!
   * \param cuda_interfaces_str JSON array with one object per column.
   *
   * Fails a CHECK when the array is empty, when the first column's typestr
   * is not three characters long or starts with '>', when a column's data
   * pointer cannot be attributed to a device, or when the columns differ in
   * device or in size.
   */
  explicit CudfAdapter(std::string cuda_interfaces_str) {
    Json interfaces =
        Json::Load({cuda_interfaces_str.c_str(), cuda_interfaces_str.size()});
    std::vector<Json> const& json_columns = get<Array>(interfaces);
    size_t n_columns = json_columns.size();
    CHECK_GT(n_columns, 0) << "Number of columns must not equal to 0.";

    auto const& typestr = get<String const>(json_columns[0]["typestr"]);
    CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
    CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();

    // The first column fixes the device and the row count that every other
    // column has to match.
    auto first_column = Columnar(get<Object const>(json_columns[0]));
    device_idx_ = dh::CudaGetPointerDevice(first_column.data);
    CHECK_NE(device_idx_, -1);
    dh::safe_cuda(cudaSetDevice(device_idx_));
    num_rows_ = first_column.size;

    std::vector<Columnar> columns;
    columns.reserve(n_columns);
    std::vector<size_t> column_ptr({0});
    column_ptr.reserve(n_columns + 1);
    for (auto& json_col : json_columns) {
      auto column = Columnar(get<Object const>(json_col));
      CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
          << "All columns should use the same device.";
      // Compare against the first column's size directly. Raising num_rows_
      // to the running maximum before this check would let a column longer
      // than its predecessors slip through.
      CHECK_EQ(num_rows_, static_cast<size_t>(column.size))
          << "All columns should have same number of rows.";
      columns.push_back(column);
      column_ptr.emplace_back(column_ptr.back() + column.size);
    }

    // Copy the host-side descriptors and offsets into device memory.
    columns_ = columns;
    column_ptr_ = column_ptr;
    batch_ = CudfAdapterBatch(dh::ToSpan(columns_), dh::ToSpan(column_ptr_),
                              column_ptr.back());
  }

  /*! \brief The one batch served by this adapter. */
  const CudfAdapterBatch& Value() const override { return batch_; }

  size_t NumRows() const { return num_rows_; }
  size_t NumColumns() const { return columns_.size(); }
  /*! \brief Ordinal of the device holding the column data. */
  size_t DeviceIdx() const { return device_idx_; }

  // Cudf is column major
  bool IsRowMajor() const { return false; }

 private:
  CudfAdapterBatch batch_;
  dh::device_vector<Columnar> columns_;
  dh::device_vector<size_t> column_ptr_;  // Exclusive scan of column sizes
  size_t num_rows_{0};
  int device_idx_{-1};
};
}; // namespace data | ||
} // namespace xgboost | ||
#endif // XGBOOST_DATA_DEVICE_ADAPTER_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
/*! | ||
* Copyright 2019 by Contributors | ||
* \file simple_dmatrix.cu | ||
*/ | ||
#include <thrust/copy.h> | ||
#include <thrust/execution_policy.h> | ||
#include <thrust/sort.h> | ||
#include <xgboost/data.h> | ||
#include "../common/random.h" | ||
#include "./simple_dmatrix.h" | ||
#include "device_adapter.cuh" | ||
|
||
namespace xgboost { | ||
namespace data { | ||
|
||
/*!
 * \brief Whether a value should be stored in the matrix.
 *
 * A value is rejected when common::CheckNAN reports it as NaN or when it
 * compares equal to the caller-supplied missing marker.
 */
XGBOOST_DEVICE bool IsValid(float value, float missing) {
  const bool is_nan = common::CheckNAN(value);
  return !(is_nan || value == missing);
}
|
||
/*!
 * \brief Turn per-row counts of valid elements into row offsets, in place.
 *
 * Every element of the batch that passes IsValid() adds one to
 * offset[row_idx]; an exclusive scan over the whole span then converts the
 * counts into start offsets.
 *
 * \param batch      Source of elements; must provide Size() and a device-side
 *                   GetElement(idx).
 * \param offset     Device span updated in place. Counts are accumulated on
 *                   top of its current contents, so the caller has to supply
 *                   zeros for the result to be row offsets.
 * \param device_idx Device ordinal passed to dh::LaunchN.
 * \param missing    Value treated as missing in addition to NaN.
 */
template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
                     int device_idx, float missing) {
  // The counters are reinterpreted as unsigned long long for atomicAdd; that
  // is only sound when both types have the same width.
  static_assert(sizeof(bst_row_t) == sizeof(unsigned long long),  // NOLINT
                "bst_row_t must be 64 bits wide to be updated by atomicAdd.");

  // Count elements per row
  dh::LaunchN(device_idx, batch.Size(), [=] __device__(size_t idx) {
    auto element = batch.GetElement(idx);
    if (IsValid(element.value, missing)) {
      atomicAdd(reinterpret_cast<unsigned long long*>(  // NOLINT
                    &offset[element.row_idx]),
                static_cast<unsigned long long>(1));  // NOLINT
    }
  });

  // Counts -> start offsets.
  dh::XGBCachingDeviceAllocator<char> alloc;
  thrust::exclusive_scan(thrust::cuda::par(alloc),
                         thrust::device_pointer_cast(offset.data()),
                         thrust::device_pointer_cast(offset.data() + offset.size()),
                         thrust::device_pointer_cast(offset.data()));
}
|
||
/*!
 * \brief Scatter the valid elements of a column-major batch into CSR storage.
 *
 * \param adapter    Provides NumColumns() and the batch via Value().
 * \param data       Destination entries, indexed through the row offsets.
 * \param device_idx Device ordinal passed to dh::LaunchN.
 * \param missing    Value treated as missing in addition to NaN.
 * \param row_ptr    Start offset of every row in data; it is copied, not
 *                   modified.
 *
 * NOTE(review): the second pass assumes the batch enumerates its elements
 * column by column and that the elements of one column have distinct row
 * indices (CudfAdapterBatch::GetElement behaves this way). The plain,
 * non-atomic cursor increment relies on that -- confirm for other adapters.
 */
template <typename AdapterT>
void CopyDataColumnMajor(AdapterT* adapter, common::Span<Entry> data,
                         int device_idx, float missing,
                         common::Span<size_t> row_ptr) {
  auto& batch = adapter->Value();

  // Pass 1: count how many elements each input column contributes.
  dh::device_vector<size_t> column_sizes(adapter->NumColumns());
  auto d_column_sizes = column_sizes.data().get();
  dh::LaunchN(device_idx, batch.Size(), [=] __device__(size_t idx) {
    const auto& element = batch.GetElement(idx);
    auto* counter = reinterpret_cast<unsigned long long*>(  // NOLINT
        &d_column_sizes[element.column_idx]);
    atomicAdd(counter, static_cast<unsigned long long>(1));  // NOLINT
  });

  // The per-column launch sizes are needed on the host.
  thrust::host_vector<size_t> host_column_sizes = column_sizes;

  // Pass 2: walk the columns one launch at a time. A private copy of the row
  // offsets serves as a per-row write cursor that advances with every entry
  // stored.
  dh::device_vector<size_t> temp_row_ptr(
      thrust::device_pointer_cast(row_ptr.data()),
      thrust::device_pointer_cast(row_ptr.data() + row_ptr.size()));
  auto d_temp_row_ptr = temp_row_ptr.data().get();
  size_t column_begin = 0;
  for (auto column_size : host_column_sizes) {
    dh::LaunchN(device_idx, column_size, [=] __device__(size_t idx) {
      const auto& element = batch.GetElement(idx + column_begin);
      if (IsValid(element.value, missing)) {
        size_t& write_pos = d_temp_row_ptr[element.row_idx];
        data[write_pos] = Entry(element.column_idx, element.value);
        write_pos += 1;
      }
    });
    column_begin += column_size;
  }
}
|
||
/*!
 * \brief Build a SimpleDMatrix from a device adapter.
 *
 * Limitations of the current implementation:
 *  - meta info is not populated, as no on-device data source provides it;
 *  - exactly one batch is expected (enforced by a CHECK);
 *  - the adapter must report its row and column counts, they are not
 *    inferred;
 *  - only column-major adapters are handled, row-major input is fatal.
 *
 * \param adapter Source of the data; also supplies the device ordinal.
 * \param missing Value treated as missing in addition to NaN.
 * \param nthread Not used by this constructor.
 */
template <typename AdapterT>
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
  // Keep the typed pointer returned by new instead of casting source_.get()
  // back to the derived type afterwards.
  auto* source = new SimpleCSRSource();
  source_.reset(source);
  SimpleCSRSource& mat = *source;
  CHECK(adapter->NumRows() != kAdapterUnknownSize);
  CHECK(adapter->NumColumns() != kAdapterUnknownSize);

  adapter->BeforeFirst();
  adapter->Next();
  auto& batch = adapter->Value();
  mat.page_.offset.SetDevice(adapter->DeviceIdx());
  mat.page_.data.SetDevice(adapter->DeviceIdx());

  // Enforce single batch
  CHECK(!adapter->Next());

  // One offset per row plus the terminating total.
  mat.page_.offset.Resize(adapter->NumRows() + 1);
  auto s_offset = mat.page_.offset.DeviceSpan();
  CountRowOffsets(batch, s_offset, adapter->DeviceIdx(), missing);

  // After the scan the last offset is the number of stored entries. Only a
  // read is needed, so the const accessor is used.
  // NOTE(review): the mutable HostVector() is understood to hand write access
  // to the host while s_offset is still used below -- confirm against
  // HostDeviceVector.
  mat.info.num_nonzero_ = mat.page_.offset.ConstHostVector().back();
  mat.page_.data.Resize(mat.info.num_nonzero_);
  if (adapter->IsRowMajor()) {
    LOG(FATAL) << "Not implemented.";
  } else {
    CopyDataColumnMajor(adapter, mat.page_.data.DeviceSpan(),
                        adapter->DeviceIdx(), missing, s_offset);
  }

  mat.info.num_col_ = adapter->NumColumns();
  mat.info.num_row_ = adapter->NumRows();
  // Synchronise worker columns
  rabit::Allreduce<rabit::op::Max>(&mat.info.num_col_, 1);
}
|
||
// Explicit instantiation of the adapter constructor for CudfAdapter; the
// constructor template is defined in this translation unit.
template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter,
                                      float missing, int nthread);
} // namespace data | ||
} // namespace xgboost |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.