From 87ebfc131510e246551ae7b09ff84b00ed3eb323 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 9 Jan 2020 20:23:06 +1300 Subject: [PATCH] Implement cudf construction with adapters. (#5189) --- include/xgboost/data.h | 2 +- src/c_api/c_api.cc | 9 +- src/c_api/c_api.cu | 20 ++ src/common/device_helpers.cuh | 5 + src/data/adapter.h | 4 +- src/data/columnar.h | 1 + src/data/data.cu | 15 ++ src/data/device_adapter.cuh | 95 +++++++ src/data/simple_dmatrix.cu | 120 +++++++++ tests/cpp/data/test_adapter.cc | 4 +- tests/cpp/data/test_columnar.h | 65 +++++ tests/cpp/data/test_device_adapter.cu | 55 ++++ tests/cpp/data/test_simple_csr_source.cu | 26 +- tests/cpp/data/test_simple_dmatrix.cu | 318 +++++++++++++++++++++++ 14 files changed, 705 insertions(+), 34 deletions(-) create mode 100644 src/c_api/c_api.cu create mode 100644 src/data/device_adapter.cuh create mode 100644 src/data/simple_dmatrix.cu create mode 100644 tests/cpp/data/test_columnar.h create mode 100644 tests/cpp/data/test_device_adapter.cu create mode 100644 tests/cpp/data/test_simple_dmatrix.cu diff --git a/include/xgboost/data.h b/include/xgboost/data.h index a3b870712acd..08983950cb9a 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -148,7 +148,7 @@ struct Entry { * \param index The feature or row index. * \param fvalue The feature value. */ - Entry(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {} + XGBOOST_DEVICE Entry(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {} /*! \brief reversely compare feature values */ inline static bool CmpValue(const Entry& a, const Entry& b) { return a.fvalue < b.fvalue; diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index baa3dfc54412..98a0f1dc3ee4 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -204,15 +204,14 @@ int XGDMatrixCreateFromDataIter( API_END(); } +#ifndef XGBOOST_USE_CUDA XGB_DLL int XGDMatrixCreateFromArrayInterfaces( - char const* c_json_strs, bst_int has_missing, bst_float missing, DMatrixHandle* out) { + char const* c_json_strs, bst_int has_missing, bst_float missing, DMatrixHandle* out) { API_BEGIN(); - std::string json_str {c_json_strs}; - std::unique_ptr source (new data::SimpleCSRSource()); - source->CopyFrom(json_str, has_missing, missing); - *out = new std::shared_ptr(DMatrix::Create(std::move(source))); + LOG(FATAL) << "Xgboost not compiled with cuda"; API_END(); } +#endif XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr, const unsigned* indices, diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu new file mode 100644 index 000000000000..9e085b113b05 --- /dev/null +++ b/src/c_api/c_api.cu @@ -0,0 +1,20 @@ +// Copyright (c) 2014-2019 by Contributors + +#include "xgboost/data.h" +#include "xgboost/c_api.h" +#include "c_api_error.h" +#include "../data/simple_csr_source.h" +#include "../data/device_adapter.cuh" + +namespace xgboost { +XGB_DLL int XGDMatrixCreateFromArrayInterfaces(char const* c_json_strs, + bst_int has_missing, + bst_float missing, + DMatrixHandle* out) { + API_BEGIN(); + std::string json_str{c_json_strs}; + data::CudfAdapter adapter(json_str); + *out = new std::shared_ptr(DMatrix::Create(&adapter, missing, 1)); + API_END(); +} +} // namespace xgboost diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index c695087ac03f..6ec021ef3465 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -372,6 +373,10 @@ public: safe_cuda(cudaGetDevice(¤t_device)); stats_.RegisterDeallocation(ptr, n, current_device); } + size_t PeakMemory() + { + return stats_.peak_allocated_bytes; + } void Log() { if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) return; diff --git a/src/data/adapter.h b/src/data/adapter.h index 671a440feddb..ea89ba4fa339 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -4,9 +4,11 @@ */ #ifndef XGBOOST_DATA_ADAPTER_H_ #define XGBOOST_DATA_ADAPTER_H_ +#include #include #include #include + namespace xgboost { namespace data { @@ -56,7 +58,7 @@ namespace data { constexpr size_t kAdapterUnknownSize = std::numeric_limits::max(); struct COOTuple { - COOTuple(size_t row_idx, size_t column_idx, float value) + XGBOOST_DEVICE COOTuple(size_t row_idx, size_t column_idx, float value) : row_idx(row_idx), column_idx(column_idx), value(value) {} size_t row_idx{0}; diff --git a/src/data/columnar.h b/src/data/columnar.h index 84398598513e..4b6c0952d711 100644 --- a/src/data/columnar.h +++ b/src/data/columnar.h @@ -225,6 +225,7 @@ class Columnar { using index_type = int32_t; public: + Columnar() = default; explicit Columnar(std::map const& column) { ArrayInterfaceHandler::Validate(column); data = ArrayInterfaceHandler::GetPtrFromArrayData(column); diff --git a/src/data/data.cu b/src/data/data.cu index d95a983d2b13..94dab0ec2894 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -9,6 +9,8 @@ #include "xgboost/json.h" #include "columnar.h" #include "../common/device_helpers.cuh" +#include "device_adapter.cuh" +#include "simple_dmatrix.h" namespace xgboost { @@ -67,4 +69,17 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { LOG(FATAL) << "Unknown metainfo: " << key; } } + +template +DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, + const std::string& cache_prefix, size_t page_size) { + CHECK_EQ(cache_prefix.size(), 0) + << "Device memory construction is not currently supported with external " + "memory."; + return new data::SimpleDMatrix(adapter, missing, nthread); +} + +template DMatrix* DMatrix::Create( + data::CudfAdapter* adapter, float missing, int nthread, + const std::string& cache_prefix, size_t page_size); } // namespace xgboost diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh new file mode 100644 index 000000000000..8304ce7e2bb7 --- /dev/null +++ b/src/data/device_adapter.cuh @@ -0,0 +1,95 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file device_adapter.cuh + */ +#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_ +#define XGBOOST_DATA_DEVICE_ADAPTER_H_ +#include +#include +#include +#include "columnar.h" +#include "adapter.h" +#include "../common/device_helpers.cuh" + +namespace xgboost { +namespace data { + +class CudfAdapterBatch : public detail::NoMetaInfo { + public: + CudfAdapterBatch() = default; + CudfAdapterBatch(common::Span columns, + common::Span column_ptr, size_t num_elements) + : columns_(columns),column_ptr_(column_ptr), num_elements(num_elements) {} + size_t Size()const { return num_elements; } + __device__ COOTuple GetElement(size_t idx)const + { + size_t column_idx = + dh::UpperBound(column_ptr_.data(), column_ptr_.size(), idx) - 1; + auto& column = columns_[column_idx]; + size_t row_idx = idx - column_ptr_[column_idx]; + float value = column.valid.Data() == nullptr || column.valid.Check(row_idx) + ? column.GetElement(row_idx) + : std::numeric_limits::quiet_NaN(); + return COOTuple(row_idx, column_idx, value); + } + + private: + common::Span columns_; + common::Span column_ptr_; + size_t num_elements; +}; + +class CudfAdapter : public detail::SingleBatchDataIter { + public: + explicit CudfAdapter(std::string cuda_interfaces_str) { + Json interfaces = + Json::Load({cuda_interfaces_str.c_str(), cuda_interfaces_str.size()}); + std::vector const& json_columns = get(interfaces); + size_t n_columns = json_columns.size(); + CHECK_GT(n_columns, 0) << "Number of columns must not equal to 0."; + + auto const& typestr = get(json_columns[0]["typestr"]); + CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat(); + CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian(); + std::vector columns; + std::vector column_ptr({0}); + auto first_column = Columnar(get(json_columns[0])); + device_idx_ = dh::CudaGetPointerDevice(first_column.data); + CHECK_NE(device_idx_, -1); + dh::safe_cuda(cudaSetDevice(device_idx_)); + num_rows_ = first_column.size; + for (auto& json_col : json_columns) { + auto column = Columnar(get(json_col)); + columns.push_back(column); + column_ptr.emplace_back(column_ptr.back() + column.size); + num_rows_ = std::max(num_rows_, size_t(column.size)); + CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data)) + << "All columns should use the same device."; + CHECK_EQ(num_rows_, column.size) + << "All columns should have same number of rows."; + } + columns_ = columns; + column_ptr_ = column_ptr; + batch = CudfAdapterBatch(dh::ToSpan(columns_), dh::ToSpan(column_ptr_), + column_ptr.back()); + } + const CudfAdapterBatch& Value() const override { return batch; } + + size_t NumRows() const { return num_rows_; } + size_t NumColumns() const { return columns_.size(); } + size_t DeviceIdx()const { + return device_idx_; + } + + // Cudf is column major + bool IsRowMajor() { return false; } + private: + CudfAdapterBatch batch; + dh::device_vector columns_; + dh::device_vector column_ptr_; // Exclusive scan of column sizes + size_t num_rows_{0}; + int device_idx_; +}; +}; // namespace data +} // namespace xgboost +#endif // XGBOOST_DATA_DEVICE_ADAPTER_H_ diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu new file mode 100644 index 000000000000..cfa5a2c58959 --- /dev/null +++ b/src/data/simple_dmatrix.cu @@ -0,0 +1,120 @@ +/*! + * Copyright 2019 by Contributors + * \file simple_dmatrix.cu + */ +#include +#include +#include +#include +#include "../common/random.h" +#include "./simple_dmatrix.h" +#include "device_adapter.cuh" + +namespace xgboost { +namespace data { + +XGBOOST_DEVICE bool IsValid(float value, float missing) { + if (common::CheckNAN(value) || value == missing) { + return false; + } + return true; +} + +template +void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, + int device_idx, float missing) { + // Count elements per row + dh::LaunchN(device_idx, batch.Size(), [=] __device__(size_t idx) { + auto element = batch.GetElement(idx); + if (IsValid(element.value, missing)) { + atomicAdd(reinterpret_cast( // NOLINT + &offset[element.row_idx]), + static_cast(1)); // NOLINT + } + }); + + dh::XGBCachingDeviceAllocator alloc; + thrust::exclusive_scan(thrust::cuda::par(alloc), + thrust::device_pointer_cast(offset.data()), + thrust::device_pointer_cast(offset.data() + offset.size()), + thrust::device_pointer_cast(offset.data())); +} + +template +void CopyDataColumnMajor(AdapterT* adapter, common::Span data, + int device_idx, float missing, + common::Span row_ptr) { + // Step 1: Get the sizes of the input columns + dh::device_vector column_sizes(adapter->NumColumns()); + auto d_column_sizes = column_sizes.data().get(); + auto& batch = adapter->Value(); + // Populate column sizes + dh::LaunchN(device_idx, batch.Size(), [=] __device__(size_t idx) { + const auto& e = batch.GetElement(idx); + atomicAdd(reinterpret_cast( // NOLINT + &d_column_sizes[e.column_idx]), + static_cast(1)); // NOLINT + }); + + thrust::host_vector host_column_sizes = column_sizes; + + // Step 2: Iterate over columns, place elements in correct row, increment + // temporary row pointers + dh::device_vector temp_row_ptr( + thrust::device_pointer_cast(row_ptr.data()), + thrust::device_pointer_cast(row_ptr.data() + row_ptr.size())); + auto d_temp_row_ptr = temp_row_ptr.data().get(); + size_t begin = 0; + for (auto size : host_column_sizes) { + size_t end = begin + size; + dh::LaunchN(device_idx, end - begin, [=] __device__(size_t idx) { + const auto& e = batch.GetElement(idx + begin); + if (!IsValid(e.value, missing)) return; + data[d_temp_row_ptr[e.row_idx]] = Entry(e.column_idx, e.value); + d_temp_row_ptr[e.row_idx] += 1; + }); + + begin = end; + } +} + +// Does not currently support metainfo as no on-device data source contains this +// Current implementation assumes a single batch. More batches can +// be supported in future. Does not currently support inferring row/column size +template +SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) { + source_.reset(new SimpleCSRSource()); + SimpleCSRSource& mat = *reinterpret_cast(source_.get()); + CHECK(adapter->NumRows() != kAdapterUnknownSize); + CHECK(adapter->NumColumns() != kAdapterUnknownSize); + + adapter->BeforeFirst(); + adapter->Next(); + auto& batch = adapter->Value(); + mat.page_.offset.SetDevice(adapter->DeviceIdx()); + mat.page_.data.SetDevice(adapter->DeviceIdx()); + + // Enforce single batch + CHECK(!adapter->Next()); + mat.page_.offset.Resize(adapter->NumRows() + 1); + auto s_offset = mat.page_.offset.DeviceSpan(); + CountRowOffsets(batch, s_offset, adapter->DeviceIdx(), missing); + mat.info.num_nonzero_ = mat.page_.offset.HostVector().back(); + mat.page_.data.Resize(mat.info.num_nonzero_); + if (adapter->IsRowMajor()) { + LOG(FATAL) << "Not implemented."; + } else { + CopyDataColumnMajor(adapter, mat.page_.data.DeviceSpan(), + adapter->DeviceIdx(), missing, s_offset); + } + + mat.info.num_col_ = adapter->NumColumns(); + mat.info.num_row_ = adapter->NumRows(); + // Synchronise worker columns + rabit::Allreduce(&mat.info.num_col_, 1); +} + +template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing, + int nthread); +} // namespace data +} // namespace xgboost diff --git a/tests/cpp/data/test_adapter.cc b/tests/cpp/data/test_adapter.cc index 1b1077115481..5c73030a393d 100644 --- a/tests/cpp/data/test_adapter.cc +++ b/tests/cpp/data/test_adapter.cc @@ -6,7 +6,7 @@ #include "../../../src/common/timer.h" #include "../helpers.h" using namespace xgboost; // NOLINT -TEST(c_api, CSRAdapter) { +TEST(adapter, CSRAdapter) { int m = 3; int n = 2; std::vector data = {1, 2, 3, 4, 5}; @@ -29,7 +29,7 @@ TEST(c_api, CSRAdapter) { EXPECT_EQ(line2 .GetElement(0).column_idx, 1); } -TEST(c_api, CSCAdapterColsMoreThanRows) { +TEST(adapter, CSCAdapterColsMoreThanRows) { std::vector data = {1, 2, 3, 4, 5, 6, 7, 8}; std::vector row_idx = {0, 1, 0, 1, 0, 1, 0, 1}; std::vector col_ptr = {0, 2, 4, 6, 8}; diff --git a/tests/cpp/data/test_columnar.h b/tests/cpp/data/test_columnar.h new file mode 100644 index 000000000000..ad0923eb1bfc --- /dev/null +++ b/tests/cpp/data/test_columnar.h @@ -0,0 +1,65 @@ +// Copyright (c) 2019 by Contributors +#include +#include +#include +#include + +#include +#include "../../../src/common/bitfield.h" +#include "../../../src/common/device_helpers.cuh" +#include "../../../src/data/simple_csr_source.h" +#include "../../../src/data/columnar.h" + +namespace xgboost { + +template +Json GenerateDenseColumn(std::string const& typestr, size_t kRows, + thrust::device_vector* out_d_data) { + auto& d_data = *out_d_data; + d_data.resize(kRows); + Json column { Object() }; + std::vector j_shape {Json(Integer(static_cast(kRows)))}; + column["shape"] = Array(j_shape); + column["strides"] = Array(std::vector{Json(Integer(static_cast(sizeof(T))))}); + + d_data.resize(kRows); + thrust::sequence(thrust::device, d_data.begin(), d_data.end(), 0.0f, 2.0f); + + auto p_d_data = dh::Raw(d_data); + + std::vector j_data { + Json(Integer(reinterpret_cast(p_d_data))), + Json(Boolean(false))}; + column["data"] = j_data; + + column["version"] = Integer(static_cast(1)); + column["typestr"] = String(typestr); + return column; +} + +template +Json GenerateSparseColumn(std::string const& typestr, size_t kRows, + thrust::device_vector* out_d_data) { + auto& d_data = *out_d_data; + Json column { Object() }; + std::vector j_shape {Json(Integer(static_cast(kRows)))}; + column["shape"] = Array(j_shape); + column["strides"] = Array(std::vector{Json(Integer(static_cast(sizeof(T))))}); + + d_data.resize(kRows); + for (size_t i = 0; i < d_data.size(); ++i) { + d_data[i] = i * 2.0; + } + + auto p_d_data = dh::Raw(d_data); + + std::vector j_data { + Json(Integer(reinterpret_cast(p_d_data))), + Json(Boolean(false))}; + column["data"] = j_data; + + column["version"] = Integer(static_cast(1)); + column["typestr"] = String(typestr); + return column; +} +} // namespace xgboost diff --git a/tests/cpp/data/test_device_adapter.cu b/tests/cpp/data/test_device_adapter.cu new file mode 100644 index 000000000000..919e6de80f11 --- /dev/null +++ b/tests/cpp/data/test_device_adapter.cu @@ -0,0 +1,55 @@ +// Copyright (c) 2019 by Contributors +#include +#include +#include "../../../src/data/adapter.h" +#include "../../../src/data/simple_dmatrix.h" +#include "../../../src/common/timer.h" +#include "../helpers.h" +#include +#include "../../../src/data/device_adapter.cuh" +#include "test_columnar.h" +using namespace xgboost; // NOLINT + +void TestCudfAdapter() +{ + constexpr size_t kRowsA {16}; + constexpr size_t kRowsB {16}; + std::vector columns; + thrust::device_vector d_data_0(kRowsA); + thrust::device_vector d_data_1(kRowsB); + + columns.emplace_back(GenerateDenseColumn("(" -Json GenerateDenseColumn(std::string const& typestr, size_t kRows, - thrust::device_vector* out_d_data) { - auto& d_data = *out_d_data; - Json column { Object() }; - std::vector j_shape {Json(Integer(static_cast(kRows)))}; - column["shape"] = Array(j_shape); - column["strides"] = Array(std::vector{Json(Integer(static_cast(sizeof(T))))}); - - d_data.resize(kRows); - for (size_t i = 0; i < d_data.size(); ++i) { - d_data[i] = i * 2.0; - } - - auto p_d_data = dh::Raw(d_data); - - std::vector j_data { - Json(Integer(reinterpret_cast(p_d_data))), - Json(Boolean(false))}; - column["data"] = j_data; - - column["version"] = Integer(static_cast(1)); - column["typestr"] = String(typestr); - return column; -} void TestGetElement() { thrust::device_vector data; diff --git a/tests/cpp/data/test_simple_dmatrix.cu b/tests/cpp/data/test_simple_dmatrix.cu new file mode 100644 index 000000000000..df347d349e8c --- /dev/null +++ b/tests/cpp/data/test_simple_dmatrix.cu @@ -0,0 +1,318 @@ +// Copyright by Contributors +#include +#include +#include "../../../src/data/simple_dmatrix.h" + +#include +#include "../../../src/data/device_adapter.cuh" +#include "../helpers.h" +#include "test_columnar.h" + +using namespace xgboost; // NOLINT + +TEST(SimpleDMatrix, FromColumnarDenseBasic) { + constexpr size_t kRows{16}; + std::vector columns; + thrust::device_vector d_data_0(kRows); + thrust::device_vector d_data_1(kRows); + + columns.emplace_back(GenerateDenseColumn("("::quiet_NaN(), + -1); + EXPECT_EQ(dmat.Info().num_col_, 2); + EXPECT_EQ(dmat.Info().num_row_, 16); + EXPECT_EQ(dmat.Info().num_nonzero_, 32); +} + +void TestDenseColumn(DMatrix* dmat, size_t n_rows, size_t n_cols) { + for (auto& batch : dmat->GetBatches()) { + for (auto i = 0ull; i < batch.Size(); i++) { + auto inst = batch[i]; + for (auto j = 0ull; j < inst.size(); j++) { + EXPECT_EQ(inst[j].fvalue, i * 2); + EXPECT_EQ(inst[j].index, j); + } + } + } + ASSERT_EQ(dmat->Info().num_row_, n_rows); + ASSERT_EQ(dmat->Info().num_col_, n_cols); +} + +TEST(SimpleDMatrix, FromColumnarDense) { + constexpr size_t kRows{16}; + constexpr size_t kCols{2}; + std::vector columns; + thrust::device_vector d_data_0(kRows); + thrust::device_vector d_data_1(kRows); + columns.emplace_back(GenerateDenseColumn("("::quiet_NaN(), + -1); + TestDenseColumn(&dmat, kRows, kCols); + } + + // with missing value specified + { + data::CudfAdapter adapter(str); + data::SimpleDMatrix dmat(&adapter, 4.0, -1); + + ASSERT_EQ(dmat.Info().num_row_, kRows); + ASSERT_EQ(dmat.Info().num_col_, kCols); + ASSERT_EQ(dmat.Info().num_nonzero_, kCols * kRows - 2); + } + + { + // no missing value, but has NaN + d_data_0[3] = std::numeric_limits::quiet_NaN(); + ASSERT_TRUE(std::isnan(d_data_0[3])); // removes 6.0 + data::CudfAdapter adapter(str); + data::SimpleDMatrix dmat(&adapter, std::numeric_limits::quiet_NaN(), + -1); + ASSERT_EQ(dmat.Info().num_nonzero_, kRows * kCols - 1); + ASSERT_EQ(dmat.Info().num_row_, kRows); + ASSERT_EQ(dmat.Info().num_col_, kCols); + } +} + +TEST(SimpleDMatrix, FromColumnarWithEmptyRows) { + constexpr size_t kRows = 102; + constexpr size_t kCols = 24; + + std::vector v_columns(kCols); + std::vector> columns_data(kCols); + std::vector> column_bitfields( + kCols); + + RBitField8::value_type constexpr kUCOne = 1; + + for (size_t i = 0; i < kCols; ++i) { + auto& col = v_columns[i]; + col = Object(); + auto& data = columns_data[i]; + data.resize(kRows); + thrust::sequence(data.begin(), data.end(), 0); + dh::safe_cuda(cudaDeviceSynchronize()); + dh::safe_cuda(cudaGetLastError()); + + ASSERT_EQ(data.size(), kRows); + + auto p_d_data = raw_pointer_cast(data.data()); + std::vector j_data{ + Json(Integer(reinterpret_cast(p_d_data))), + Json(Boolean(false))}; + col["data"] = j_data; + std::vector j_shape{Json(Integer(static_cast(kRows)))}; + col["shape"] = Array(j_shape); + col["version"] = Integer(static_cast(1)); + col["typestr"] = String("(1)); + auto& mask_storage = column_bitfields[i]; + mask_storage.resize(16); // 16 bytes + + mask_storage[0] = ~(kUCOne << 2); // 3^th row is missing + mask_storage[1] = ~(kUCOne << 3); // 12^th row is missing + size_t last_ind = 12; + mask_storage[last_ind] = ~(kUCOne << 5); + std::set missing_row_index{0, 1, last_ind}; + + for (size_t j = 0; j < mask_storage.size(); ++j) { + if (missing_row_index.find(j) == missing_row_index.cend()) { + // all other rows are valid + mask_storage[j] = ~0; + } + } + + j_mask["data"] = std::vector{ + Json( + Integer(reinterpret_cast(mask_storage.data().get()))), + Json(Boolean(false))}; + j_mask["shape"] = Array( + std::vector{Json(Integer(static_cast(kRows)))}); + j_mask["typestr"] = String("|i1"); + } + + Json column_arr{Array(v_columns)}; + std::stringstream ss; + Json::Dump(column_arr, &ss); + std::string str = ss.str(); + data::CudfAdapter adapter(str); + data::SimpleDMatrix dmat(&adapter, std::numeric_limits::quiet_NaN(), + -1); + + for (auto& batch : dmat.GetBatches()) { + for (auto i = 0ull; i < batch.Size(); i++) { + auto inst = batch[i]; + for (auto j = 0ull; j < inst.size(); j++) { + EXPECT_EQ(inst[j].fvalue, i); + EXPECT_EQ(inst[j].index, j); + } + } + } + ASSERT_EQ(dmat.Info().num_nonzero_, (kRows - 3) * kCols); + ASSERT_EQ(dmat.Info().num_row_, kRows); + ASSERT_EQ(dmat.Info().num_col_, kCols); +} + +TEST(SimpleCSRSource, FromColumnarSparse) { + constexpr size_t kRows = 32; + constexpr size_t kCols = 2; + RBitField8::value_type constexpr kUCOne = 1; + + std::vector> columns_data(kCols); + std::vector> column_bitfields(kCols); + + { + // column 0 + auto& mask = column_bitfields[0]; + mask.resize(8); + + for (size_t j = 0; j < mask.size(); ++j) { + mask[j] = ~0; + } + // the 2^th entry of first column is invalid + // [0 0 0 0 0 1 0 0] + mask[0] = ~(kUCOne << 2); + } + { + // column 1 + auto& mask = column_bitfields[1]; + mask.resize(8); + + for (size_t j = 0; j < mask.size(); ++j) { + mask[j] = ~0; + } + // the 19^th entry of second column is invalid + // [~0~], [~0~], [0 0 0 0 1 0 0 0] + mask[2] = ~(kUCOne << 3); + } + + for (size_t c = 0; c < kCols; ++c) { + columns_data[c].resize(kRows); + thrust::sequence(columns_data[c].begin(), columns_data[c].end(), 0); + } + + std::vector j_columns(kCols); + + for (size_t c = 0; c < kCols; ++c) { + auto& column = j_columns[c]; + column = Object(); + column["version"] = Integer(static_cast(1)); + column["typestr"] = String(" j_data { + Json(Integer(reinterpret_cast(p_d_data))), + Json(Boolean(false))}; + column["data"] = j_data; + std::vector j_shape {Json(Integer(static_cast(kRows)))}; + column["shape"] = Array(j_shape); + column["version"] = Integer(static_cast(1)); + column["typestr"] = String("(1)); + j_mask["data"] = std::vector{ + Json(Integer(reinterpret_cast(column_bitfields[c].data().get()))), + Json(Boolean(false))}; + j_mask["shape"] = Array(std::vector{Json(Integer(static_cast(kRows)))}); + j_mask["typestr"] = String("|i1"); + } + + Json column_arr {Array(j_columns)}; + + std::stringstream ss; + Json::Dump(column_arr, &ss); + std::string str = ss.str(); + + { + data::CudfAdapter adapter(str); + data::SimpleDMatrix dmat(&adapter, std::numeric_limits::quiet_NaN(), -1); + + ASSERT_EQ(dmat.Info().num_row_, kRows); + ASSERT_EQ(dmat.Info().num_nonzero_, (kRows*kCols)-2); + } + + { + data::CudfAdapter adapter(str); + data::SimpleDMatrix dmat(&adapter, 2.0, -1); + for (auto& batch : dmat.GetBatches()) { + for (auto i = 0ull; i < batch.Size(); i++) { + auto inst = batch[i]; + for (auto e : inst) { + ASSERT_NE(e.fvalue, 2.0); + } + } + } + } + + { + // no missing value, but has NaN + data::CudfAdapter adapter(str); + columns_data[0][4] = std::numeric_limits::quiet_NaN(); // 0^th column 4^th row + data::SimpleDMatrix dmat(&adapter, std::numeric_limits::quiet_NaN(), + -1); + ASSERT_TRUE(std::isnan(columns_data[0][4])); + + // Two invalid entries and one NaN, in CSC + // 0^th column: 0, 1, 4, 5, 6, ..., kRows + // 1^th column: 0, 1, 2, 3, ..., 19, 21, ..., kRows + ASSERT_EQ(dmat.Info().num_nonzero_, kRows * kCols - 3); + } +} + + +TEST(SimpleDMatrix, FromColumnarSparseBasic) { + constexpr size_t kRows{16}; + std::vector columns; + thrust::device_vector d_data_0(kRows); + thrust::device_vector d_data_1(kRows); + + columns.emplace_back(GenerateSparseColumn("("::quiet_NaN(), + -1); + EXPECT_EQ(dmat.Info().num_col_, 2); + EXPECT_EQ(dmat.Info().num_row_, 16); + EXPECT_EQ(dmat.Info().num_nonzero_, 32); + + for (auto& batch : dmat.GetBatches()) { + for (auto i = 0ull; i < batch.Size(); i++) { + auto inst = batch[i]; + for (auto j = 0ull; j < inst.size(); j++) { + EXPECT_EQ(inst[j].fvalue, i * 2); + EXPECT_EQ(inst[j].index, j); + } + } + } +}