From 0cdd826921929cba7560f8911c78b5a9a3442b9c Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 10 Oct 2019 01:17:52 -0400 Subject: [PATCH 1/7] Ignore columnar alignment requirement. * Ignore the 8 bytes requirement. * Fix stride size. * tests. --- src/common/bitfield.h | 2 +- src/data/columnar.h | 37 +++++++------ src/data/simple_csr_source.cu | 3 +- tests/cpp/data/test_simple_csr_source.cu | 67 +++++++++++++++++------- tests/python-gpu/test_from_columnar.py | 7 +++ 5 files changed, 79 insertions(+), 37 deletions(-) diff --git a/src/common/bitfield.h b/src/common/bitfield.h index a4a1091483c4..b5ff34febc6b 100644 --- a/src/common/bitfield.h +++ b/src/common/bitfield.h @@ -216,7 +216,7 @@ struct RBitsPolicy : public BitFieldContainer> { } }; -// Format: BitField, underlying type must be unsigned. +// Format: BitField, underlying type must be unsigned. using LBitField64 = BitFieldContainer>; using RBitField8 = BitFieldContainer>; diff --git a/src/data/columnar.h b/src/data/columnar.h index 18c23b350704..52e64542e182 100644 --- a/src/data/columnar.h +++ b/src/data/columnar.h @@ -35,7 +35,7 @@ struct ColumnarErrors { return "Memory should be contigious."; } static char const* TypestrFormat() { - return "`typestr` should be of format ."; + return "`typestr' should be of format ."; } // Not supported in Apache Arrow. static char const* BigEndian() { @@ -50,7 +50,7 @@ struct ColumnarErrors { return str.c_str(); } static char const* Version() { - return "Only version 1 of __cuda_array_interface__ is being supported."; + return "Only version 1 of `__cuda_array_interface__' is supported."; } static char const* ofType(std::string const& type) { static std::string str; @@ -110,23 +110,23 @@ class ArrayInterfaceHandler { static void Validate(std::map const& array) { if (array.find("version") == array.cend()) { - LOG(FATAL) << "Missing version field for array interface"; + LOG(FATAL) << "Missing `version' field for array interface"; } auto version = get(array.at("version")); CHECK_EQ(version, 1) << ColumnarErrors::Version(); if (array.find("typestr") == array.cend()) { - LOG(FATAL) << "Missing typestr field for array interface"; + LOG(FATAL) << "Missing `typestr' field for array interface"; } auto typestr = get(array.at("typestr")); CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat(); CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian(); if (array.find("shape") == array.cend()) { - LOG(FATAL) << "Missing shape field for array interface"; + LOG(FATAL) << "Missing `shape' field for array interface"; } if (array.find("data") == array.cend()) { - LOG(FATAL) << "Missing data field for array interface"; + LOG(FATAL) << "Missing `data' field for array interface"; } } @@ -143,22 +143,29 @@ class ArrayInterfaceHandler { auto j_shape = get(j_mask.at("shape")); CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1); - CHECK_EQ(get(j_shape.front()) % 8, 0) << - "Length of validity mask must be a multiple of 8 bytes."; - int64_t size = get(j_shape.at(0)) * - sizeof(unsigned char) / sizeof(RBitField8::value_type); auto typestr = get(j_mask.at("typestr")); + // For now this is just 1, we can support different size of interger in mask. + int64_t const type_length = typestr.at(2) - 48; + // shape represents how many bits is in the mask. (This is a grey area, don't be + // suprised if it suddently represents something else when supporting a new + // implementation). + int64_t const size = get(j_shape.at(0)) * type_length / + sizeof(RBitField8::value_type); + + if (j_mask.find("strides") != j_mask.cend()) { + auto strides = get(column.at("strides")); + CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1); + CHECK_EQ(get(strides.at(0)), type_length) << ColumnarErrors::Contigious(); + } if (typestr.at(1) == 't') { - CHECK_EQ(typestr.at(2), '1') << "There can be only 1 bit in each entry of bitfield."; + CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per bitfield."; } else if (typestr.at(1) == 'i') { CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer."; } else { LOG(FATAL) << "mask must be of integer type or bit field type."; } - // For now this is just 1 - int64_t const type_length = typestr.at(2) - 48; s_mask = {p_mask, size / type_length}; } } @@ -178,8 +185,8 @@ class ArrayInterfaceHandler { if (column.find("strides") != column.cend()) { auto strides = get(column.at("strides")); - CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1); - CHECK_EQ(get(strides.at(0)), 4) << ColumnarErrors::Contigious(); + CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1); + CHECK_EQ(get(strides.at(0)), sizeof(T)) << ColumnarErrors::Contigious(); } auto length = get(j_shape.at(0)); diff --git a/src/data/simple_csr_source.cu b/src/data/simple_csr_source.cu index b1ba5e2b63eb..93a9462bfbc8 100644 --- a/src/data/simple_csr_source.cu +++ b/src/data/simple_csr_source.cu @@ -186,7 +186,8 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector const& columns, // one copy seems easier. this->info.num_nonzero_ = tmp_offset.back(); - int device = this->page_.offset.DeviceIdx(); + // Device is obtained and set in `CountValid' + int32_t const device = this->page_.offset.DeviceIdx(); this->page_.data.SetDevice(device); this->page_.data.Resize(this->info.num_nonzero_); auto s_data = this->page_.data.DeviceSpan(); diff --git a/tests/cpp/data/test_simple_csr_source.cu b/tests/cpp/data/test_simple_csr_source.cu index 7a6579aea347..c5b5be99edc8 100644 --- a/tests/cpp/data/test_simple_csr_source.cu +++ b/tests/cpp/data/test_simple_csr_source.cu @@ -47,7 +47,7 @@ Json GenerateDenseColumn(std::string const& typestr, size_t kRows, Json column { Object() }; std::vector j_shape {Json(Integer(static_cast(kRows)))}; column["shape"] = Array(j_shape); - column["strides"] = Array(std::vector{Json(Integer(static_cast(4)))}); + column["strides"] = Array(std::vector{Json(Integer(static_cast(sizeof(T))))}); d_data.resize(kRows); for (size_t i = 0; i < d_data.size(); ++i) { @@ -66,6 +66,29 @@ Json GenerateDenseColumn(std::string const& typestr, size_t kRows, return column; } +void TestDenseColumn(std::unique_ptr const& source, + size_t n_rows, size_t n_cols) { + auto const& data = source->page_.data.HostVector(); + auto const& offset = source->page_.offset.HostVector(); + + for (size_t i = 0; i < n_rows; i++) { + auto const idx = i * n_cols; + auto const e_0 = data.at(idx); + ASSERT_NEAR(e_0.fvalue, i * 2.0, kRtEps) << "idx: " << idx; + ASSERT_EQ(e_0.index, 0); // feature 0 + + auto e_1 = data.at(idx+1); + ASSERT_NEAR(e_1.fvalue, i * 2.0, kRtEps); + ASSERT_EQ(e_1.index, 1); // feature 1 + } + ASSERT_EQ(offset.back(), n_rows * n_cols); + for (size_t i = 0; i < n_rows + 1; ++i) { + ASSERT_EQ(offset[i], i * n_cols); + } + ASSERT_EQ(source->info.num_row_, n_rows); + ASSERT_EQ(source->info.num_col_, n_cols); +} + TEST(SimpleCSRSource, FromColumnarDense) { constexpr size_t kRows {16}; constexpr size_t kCols {2}; @@ -85,25 +108,7 @@ TEST(SimpleCSRSource, FromColumnarDense) { { std::unique_ptr source (new data::SimpleCSRSource()); source->CopyFrom(str.c_str(), false); - - auto const& data = source->page_.data.HostVector(); - auto const& offset = source->page_.offset.HostVector(); - for (size_t i = 0; i < kRows; i++) { - auto const idx = i * kCols; - auto const e_0 = data.at(idx); - ASSERT_NEAR(e_0.fvalue, i * 2.0, kRtEps) << "idx: " << idx; - ASSERT_EQ(e_0.index, 0); // feature 0 - - auto e_1 = data.at(idx+1); - ASSERT_NEAR(e_1.fvalue, i * 2.0, kRtEps); - ASSERT_EQ(e_1.index, 1); // feature 1 - } - ASSERT_EQ(offset.back(), kRows * kCols); - for (size_t i = 0; i < kRows + 1; ++i) { - ASSERT_EQ(offset[i], i * kCols); - } - ASSERT_EQ(source->info.num_row_, kRows); - ASSERT_EQ(source->info.num_col_, kCols); + TestDenseColumn(source, kRows, kCols); } // with missing value specified @@ -348,4 +353,26 @@ TEST(SimpleCSRSource, FromColumnarSparse) { } } +TEST(SimpleCSRSource, Types) { + // Test with different types of different size + constexpr size_t kRows {16}; + constexpr size_t kCols {2}; + std::vector columns; + thrust::device_vector d_data_0(kRows); + thrust::device_vector d_data_1(kRows); + + columns.emplace_back(GenerateDenseColumn("(" source (new data::SimpleCSRSource()); + source->CopyFrom(str.c_str(), false); + TestDenseColumn(source, kRows, kCols); +} + } // namespace xgboost \ No newline at end of file diff --git a/tests/python-gpu/test_from_columnar.py b/tests/python-gpu/test_from_columnar.py index 53fdfcc10035..8038fbd98e35 100644 --- a/tests/python-gpu/test_from_columnar.py +++ b/tests/python-gpu/test_from_columnar.py @@ -69,3 +69,10 @@ def test_from_cudf(self): with pytest.raises(Exception): dtrain = xgb.DMatrix(cd, label=cd) + + # Test when number of elements is less than 8 + X = cudf.DataFrame({'x': cudf.Series([0, 1, 2, np.NAN, 4], + dtype=np.int32)}) + dtrain = xgb.DMatrix(X) + assert dtrain.num_col() == 1 + assert dtrain.num_row() == 5 From 8672abce60a528db1c08264cf4c64af91e89b9f5 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 10 Oct 2019 01:42:48 -0400 Subject: [PATCH 2/7] Typo. --- src/data/columnar.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/columnar.h b/src/data/columnar.h index 52e64542e182..dcdbdbb78f52 100644 --- a/src/data/columnar.h +++ b/src/data/columnar.h @@ -159,7 +159,7 @@ class ArrayInterfaceHandler { } if (typestr.at(1) == 't') { - CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per bitfield."; + CHECK_EQ(typestr.at(2), '1') << "mask with bitfield type should be of 1 byte per bitfield."; } else if (typestr.at(1) == 'i') { CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer."; } else { From 208aab58255c80c49f5c145afa0503400e8648b9 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 10 Oct 2019 01:56:47 -0400 Subject: [PATCH 3/7] Lint. --- src/data/columnar.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/columnar.h b/src/data/columnar.h index dcdbdbb78f52..7ca7e828f7e9 100644 --- a/src/data/columnar.h +++ b/src/data/columnar.h @@ -159,9 +159,9 @@ class ArrayInterfaceHandler { } if (typestr.at(1) == 't') { - CHECK_EQ(typestr.at(2), '1') << "mask with bitfield type should be of 1 byte per bitfield."; + CHECK_EQ(typestr.at(2), '1') << "mask with bitfield type should be of 1 byte per bitfield."; } else if (typestr.at(1) == 'i') { - CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer."; + CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer."; } else { LOG(FATAL) << "mask must be of integer type or bit field type."; } From 3368457c37b92adc866e396b0df3e68673bd398b Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 11 Oct 2019 02:47:04 -0400 Subject: [PATCH 4/7] Clarify the mask object. --- src/common/bitfield.h | 17 ++++-------- src/data/columnar.h | 35 +++++++++++++++++------- src/data/simple_csr_source.h | 5 ++-- tests/cpp/common/test_bitfield.cu | 10 +++++++ tests/cpp/data/test_simple_csr_source.cu | 12 ++++---- 5 files changed, 50 insertions(+), 29 deletions(-) diff --git a/src/common/bitfield.h b/src/common/bitfield.h index b5ff34febc6b..54e00aba6f95 100644 --- a/src/common/bitfield.h +++ b/src/common/bitfield.h @@ -19,6 +19,7 @@ #endif // defined(__CUDACC__) #include "xgboost/span.h" +#include "common.h" namespace xgboost { @@ -84,17 +85,11 @@ struct BitFieldContainer { XGBOOST_DEVICE BitFieldContainer(common::Span bits) : bits_{bits} {} XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {} + /*\brief Compute the size of needed memory allocation. The returned value is in terms + * of number of elements with `BitFieldContainer::value_type'. + */ static size_t ComputeStorageSize(size_t size) { - auto pos = ToBitPos(size); - if (size < kValueSize) { - return 1; - } - - if (pos.bit_pos != 0) { - return pos.int_pos + 2; - } else { - return pos.int_pos + 1; - } + return common::DivRoundUp(size, kValueSize); } #if defined(__CUDA_ARCH__) __device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) { @@ -218,7 +213,7 @@ struct RBitsPolicy : public BitFieldContainer> { // Format: BitField, underlying type must be unsigned. using LBitField64 = BitFieldContainer>; -using RBitField8 = BitFieldContainer>; +using RBitField8 = BitFieldContainer>; #if defined(__CUDACC__) diff --git a/src/data/columnar.h b/src/data/columnar.h index 7ca7e828f7e9..16f8336434b9 100644 --- a/src/data/columnar.h +++ b/src/data/columnar.h @@ -35,7 +35,7 @@ struct ColumnarErrors { return "Memory should be contigious."; } static char const* TypestrFormat() { - return "`typestr' should be of format ."; + return "`typestr' should be of format ."; } // Not supported in Apache Arrow. static char const* BigEndian() { @@ -132,8 +132,8 @@ class ArrayInterfaceHandler { // Find null mask (validity mask) field // Mask object is also an array interface, but with different requirements. - static void ExtractMask(std::map const& column, - common::Span* p_out) { + static size_t ExtractMask(std::map const &column, + common::Span *p_out) { auto& s_mask = *p_out; if (column.find("mask") != column.cend()) { auto const& j_mask = get(column.at("mask")); @@ -146,11 +146,20 @@ class ArrayInterfaceHandler { auto typestr = get(j_mask.at("typestr")); // For now this is just 1, we can support different size of interger in mask. int64_t const type_length = typestr.at(2) - 48; - // shape represents how many bits is in the mask. (This is a grey area, don't be - // suprised if it suddently represents something else when supporting a new - // implementation). - int64_t const size = get(j_shape.at(0)) * type_length / - sizeof(RBitField8::value_type); + /* + * shape represents how many bits is in the mask. (This is a grey area, don't be + * suprised if it suddently represents something else when supporting a new + * implementation). Quoting from numpy array interface: + * + * The shape of this object should be "broadcastable" to the shape of the original + * array. + * + * And that's the only requirement. + */ + int64_t const n_bits = get(j_shape.at(0)); + // The size of span required to cover all bits. Here with 8 bits bitfield, we + // assume 1 byte alignment. + int64_t const span_size = RBitField8::ComputeStorageSize(n_bits); if (j_mask.find("strides") != j_mask.cend()) { auto strides = get(column.at("strides")); @@ -166,8 +175,10 @@ class ArrayInterfaceHandler { LOG(FATAL) << "mask must be of integer type or bit field type."; } - s_mask = {p_mask, size / type_length}; + s_mask = {p_mask, span_size}; + return n_bits; } + return 0; } template @@ -204,10 +215,14 @@ class ArrayInterfaceHandler { foreign_col.size = s_data.size(); common::Span s_mask; - ArrayInterfaceHandler::ExtractMask(column, &s_mask); + size_t n_bits = ArrayInterfaceHandler::ExtractMask(column, &s_mask); foreign_col.valid = RBitField8(s_mask); + if (s_mask.data()) { + CHECK_EQ(n_bits, foreign_col.data.size()); + } + return foreign_col; } }; diff --git a/src/data/simple_csr_source.h b/src/data/simple_csr_source.h index 5236021cd613..1ae0d189685b 100644 --- a/src/data/simple_csr_source.h +++ b/src/data/simple_csr_source.h @@ -16,9 +16,10 @@ #include #include -#include "columnar.h" - namespace xgboost { + +class Json; + namespace data { /*! * \brief The simplest form of data holder, can be used to create DMatrix. diff --git a/tests/cpp/common/test_bitfield.cu b/tests/cpp/common/test_bitfield.cu index e4ad58b0044c..d641debd8b7e 100644 --- a/tests/cpp/common/test_bitfield.cu +++ b/tests/cpp/common/test_bitfield.cu @@ -17,6 +17,16 @@ __global__ void TestSetKernel(LBitField64 bits) { } } +TEST(BitField, StorageSize) { + size_t constexpr kElements { 16 }; + size_t size = LBitField64::ComputeStorageSize(kElements); + ASSERT_EQ(1, size); + size = RBitField8::ComputeStorageSize(4); + ASSERT_EQ(1, size); + size = RBitField8::ComputeStorageSize(kElements); + ASSERT_EQ(2, size); +} + TEST(BitField, GPU_Set) { dh::device_vector storage; uint32_t constexpr kBits = 128; diff --git a/tests/cpp/data/test_simple_csr_source.cu b/tests/cpp/data/test_simple_csr_source.cu index c5b5be99edc8..1443739f874e 100644 --- a/tests/cpp/data/test_simple_csr_source.cu +++ b/tests/cpp/data/test_simple_csr_source.cu @@ -150,9 +150,9 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) { std::vector v_columns (kCols); std::vector> columns_data(kCols); - std::vector> column_bitfields(kCols); + std::vector> column_bitfields(kCols); - unsigned char constexpr kUCOne = 1; + RBitField8::value_type constexpr kUCOne = 1; for (size_t i = 0; i < kCols; ++i) { auto& col = v_columns[i]; @@ -198,7 +198,7 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) { j_mask["data"] = std::vector{ Json(Integer(reinterpret_cast(mask_storage.data().get()))), Json(Boolean(false))}; - j_mask["shape"] = Array(std::vector{Json(Integer(static_cast(16)))}); + j_mask["shape"] = Array(std::vector{Json(Integer(static_cast(kRows)))}); j_mask["typestr"] = String("|i1"); } @@ -225,10 +225,10 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) { TEST(SimpleCSRSource, FromColumnarSparse) { constexpr size_t kRows = 32; constexpr size_t kCols = 2; - unsigned char constexpr kUCOne = 1; + RBitField8::value_type constexpr kUCOne = 1; std::vector> columns_data(kCols); - std::vector> column_bitfields(kCols); + std::vector> column_bitfields(kCols); { // column 0 @@ -283,7 +283,7 @@ TEST(SimpleCSRSource, FromColumnarSparse) { j_mask["data"] = std::vector{ Json(Integer(reinterpret_cast(column_bitfields[c].data().get()))), Json(Boolean(false))}; - j_mask["shape"] = Array(std::vector{Json(Integer(static_cast(8)))}); + j_mask["shape"] = Array(std::vector{Json(Integer(static_cast(kRows)))}); j_mask["typestr"] = String("|i1"); } From 0cc512efbb302987ee615c2e4fb9ef298bb0e92f Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 11 Oct 2019 03:01:41 -0400 Subject: [PATCH 5/7] Check for matching shape. --- src/data/columnar.h | 4 +++- tests/cpp/data/test_simple_csr_source.cu | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/data/columnar.h b/src/data/columnar.h index 16f8336434b9..381f4c58c8da 100644 --- a/src/data/columnar.h +++ b/src/data/columnar.h @@ -220,7 +220,9 @@ class ArrayInterfaceHandler { foreign_col.valid = RBitField8(s_mask); if (s_mask.data()) { - CHECK_EQ(n_bits, foreign_col.data.size()); + CHECK_EQ(n_bits, foreign_col.data.size()) + << "Shape of bit mask doesn't match data shape. " + << "XGBoost doesn't support internal broadcasting."; } return foreign_col; diff --git a/tests/cpp/data/test_simple_csr_source.cu b/tests/cpp/data/test_simple_csr_source.cu index 1443739f874e..47bd19d0492b 100644 --- a/tests/cpp/data/test_simple_csr_source.cu +++ b/tests/cpp/data/test_simple_csr_source.cu @@ -38,6 +38,15 @@ TEST(ArrayInterfaceHandler, Error) { Json(Boolean(false))}; column["data"] = j_data; EXPECT_NO_THROW(ArrayInterfaceHandler::ExtractArray(column_obj)); + + std::vector j_mask_shape {Json(Integer(static_cast(kRows - 1)))}; + column["mask"] = Object(); + column["mask"]["shape"] = j_mask_shape; + column["mask"]["data"] = j_data; + column["mask"]["typestr"] = String("(1)); + // shape of mask and data doesn't match. + EXPECT_THROW(ArrayInterfaceHandler::ExtractArray(column_obj), dmlc::Error); } template From e585e7005f0e160f4649fcccf39d518bf4ce62d4 Mon Sep 17 00:00:00 2001 From: fis Date: Sat, 12 Oct 2019 04:24:10 -0400 Subject: [PATCH 6/7] Better error msg for wrong type. --- src/data/columnar.h | 55 +++++++++++++++++--------- tests/python-gpu/test_from_columnar.py | 10 +++++ 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/src/data/columnar.h b/src/data/columnar.h index 381f4c58c8da..5c221d36b6af 100644 --- a/src/data/columnar.h +++ b/src/data/columnar.h @@ -60,22 +60,6 @@ struct ColumnarErrors { str += " type."; return str.c_str(); } - static std::string UnknownTypeStr(std::string const& typestr) { - return "typestr from array interface: " + typestr + " is not supported."; - } -}; - -// TODO(trivialfis): Abstract this into a class that accept a json -// object and turn it into an array (for cupy and numba). -class ArrayInterfaceHandler { - public: - template - static constexpr char TypeChar() { - return - (std::is_floating_point::value ? 'f' : - (std::is_integral::value ? - (std::is_signed::value ? 'i' : 'u') : '\0')); - } static std::string TypeStr(char c) { switch (c) { @@ -89,12 +73,46 @@ class ArrayInterfaceHandler { return "Unsigned integer"; case 'f': return "Floating point"; + case 'c': + return "Complex floating point"; + case 'm': + return "Timedelta"; + case 'M': + return "Datetime"; + case 'O': + return "Object"; + case 'S': + return "String"; + case 'U': + return "Unicode"; + case 'V': + return "Other"; default: - LOG(FATAL) << "Invalid type code: " << c << " in typestr of input array interface."; + LOG(FATAL) << "Invalid type code: " << c << " in `typestr' of input array." + << "\nPlease verify the `__cuda_array_interface__' of your input data complies to: " + << "https://docs.scipy.org/doc/numpy/reference/arrays.interface.html" + << "\nOr open an issue."; return ""; } } + static std::string UnSupportedType(std::string const& typestr) { + return TypeStr(typestr.at(1)) + " is not supported."; + } +}; + +// TODO(trivialfis): Abstract this into a class that accept a json +// object and turn it into an array (for cupy and numba). +class ArrayInterfaceHandler { + public: + template + static constexpr char TypeChar() { + return + (std::is_floating_point::value ? 'f' : + (std::is_integral::value ? + (std::is_signed::value ? 'i' : 'u') : '\0')); + } + template static PtrType GetPtrFromArrayData(std::map const& obj) { if (obj.find("data") == obj.cend()) { @@ -230,6 +248,7 @@ class ArrayInterfaceHandler { }; #define DISPATCH_TYPE(__dispatched_func, __typestr, ...) { \ + CHECK_EQ(__typestr.size(), 3) << ColumnarErrors::TypestrFormat(); \ if (__typestr.at(1) == 'f' && __typestr.at(2) == '4') { \ __dispatched_func(__VA_ARGS__); \ } else if (__typestr.at(1) == 'f' && __typestr.at(2) == '8') { \ @@ -251,7 +270,7 @@ class ArrayInterfaceHandler { } else if (__typestr.at(1) == 'u' && __typestr.at(2) == '8') { \ __dispatched_func(__VA_ARGS__); \ } else { \ - LOG(FATAL) << ColumnarErrors::UnknownTypeStr(__typestr); \ + LOG(FATAL) << ColumnarErrors::UnSupportedType(__typestr); \ } \ } diff --git a/tests/python-gpu/test_from_columnar.py b/tests/python-gpu/test_from_columnar.py index 8038fbd98e35..cd5a567c8af3 100644 --- a/tests/python-gpu/test_from_columnar.py +++ b/tests/python-gpu/test_from_columnar.py @@ -76,3 +76,13 @@ def test_from_cudf(self): dtrain = xgb.DMatrix(X) assert dtrain.num_col() == 1 assert dtrain.num_row() == 5 + + # Boolean is not supported. + X_boolean = cudf.DataFrame({'x': cudf.Series([True, False])}) + with pytest.raises(Exception): + dtrain = xgb.DMatrix(X_boolean) + + y_boolean = cudf.DataFrame({ + 'x': cudf.Series([True, False, True, True, True])}) + with pytest.raises(Exception): + dtrain = xgb.DMatrix(X_boolean, label=y_boolean) From 9e5768ad101354a399b95ab1d6abc8c79c75b208 Mon Sep 17 00:00:00 2001 From: fis Date: Sat, 12 Oct 2019 06:14:36 -0400 Subject: [PATCH 7/7] Lint. --- src/data/columnar.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/data/columnar.h b/src/data/columnar.h index 5c221d36b6af..652fd207c207 100644 --- a/src/data/columnar.h +++ b/src/data/columnar.h @@ -89,7 +89,8 @@ struct ColumnarErrors { return "Other"; default: LOG(FATAL) << "Invalid type code: " << c << " in `typestr' of input array." - << "\nPlease verify the `__cuda_array_interface__' of your input data complies to: " + << "\nPlease verify the `__cuda_array_interface__' " + << "of your input data complies to: " << "https://docs.scipy.org/doc/numpy/reference/arrays.interface.html" << "\nOr open an issue."; return "";