From 3d46bd0fa55601a0b115a5c14f3a396d62c12a39 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sun, 13 Oct 2019 06:41:43 -0400 Subject: [PATCH] Ignore columnar alignment requirement. (#4928) * Better error message for wrong type. * Fix stride size. --- src/common/bitfield.h | 19 ++-- src/data/columnar.h | 120 ++++++++++++++++------- src/data/simple_csr_source.cu | 3 +- src/data/simple_csr_source.h | 5 +- tests/cpp/common/test_bitfield.cu | 10 ++ tests/cpp/data/test_simple_csr_source.cu | 88 ++++++++++++----- tests/python-gpu/test_from_columnar.py | 17 ++++ 7 files changed, 183 insertions(+), 79 deletions(-) diff --git a/src/common/bitfield.h b/src/common/bitfield.h index a4a1091483c4..54e00aba6f95 100644 --- a/src/common/bitfield.h +++ b/src/common/bitfield.h @@ -19,6 +19,7 @@ #endif // defined(__CUDACC__) #include "xgboost/span.h" +#include "common.h" namespace xgboost { @@ -84,17 +85,11 @@ struct BitFieldContainer { XGBOOST_DEVICE BitFieldContainer(common::Span bits) : bits_{bits} {} XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {} + /*\brief Compute the size of needed memory allocation. The returned value is in terms + * of number of elements with `BitFieldContainer::value_type'. + */ static size_t ComputeStorageSize(size_t size) { - auto pos = ToBitPos(size); - if (size < kValueSize) { - return 1; - } - - if (pos.bit_pos != 0) { - return pos.int_pos + 2; - } else { - return pos.int_pos + 1; - } + return common::DivRoundUp(size, kValueSize); } #if defined(__CUDA_ARCH__) __device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) { @@ -216,9 +211,9 @@ struct RBitsPolicy : public BitFieldContainer> { } }; -// Format: BitField, underlying type must be unsigned. +// Format: BitField, underlying type must be unsigned. using LBitField64 = BitFieldContainer>; -using RBitField8 = BitFieldContainer>; +using RBitField8 = BitFieldContainer>; #if defined(__CUDACC__) diff --git a/src/data/columnar.h b/src/data/columnar.h index 18c23b350704..652fd207c207 100644 --- a/src/data/columnar.h +++ b/src/data/columnar.h @@ -35,7 +35,7 @@ struct ColumnarErrors { return "Memory should be contigious."; } static char const* TypestrFormat() { - return "`typestr` should be of format ."; + return "`typestr' should be of format ."; } // Not supported in Apache Arrow. static char const* BigEndian() { @@ -50,7 +50,7 @@ struct ColumnarErrors { return str.c_str(); } static char const* Version() { - return "Only version 1 of __cuda_array_interface__ is being supported."; + return "Only version 1 of `__cuda_array_interface__' is supported."; } static char const* ofType(std::string const& type) { static std::string str; @@ -60,22 +60,6 @@ struct ColumnarErrors { str += " type."; return str.c_str(); } - static std::string UnknownTypeStr(std::string const& typestr) { - return "typestr from array interface: " + typestr + " is not supported."; - } -}; - -// TODO(trivialfis): Abstract this into a class that accept a json -// object and turn it into an array (for cupy and numba). -class ArrayInterfaceHandler { - public: - template - static constexpr char TypeChar() { - return - (std::is_floating_point::value ? 'f' : - (std::is_integral::value ? - (std::is_signed::value ? 'i' : 'u') : '\0')); - } static std::string TypeStr(char c) { switch (c) { @@ -89,12 +73,47 @@ class ArrayInterfaceHandler { return "Unsigned integer"; case 'f': return "Floating point"; + case 'c': + return "Complex floating point"; + case 'm': + return "Timedelta"; + case 'M': + return "Datetime"; + case 'O': + return "Object"; + case 'S': + return "String"; + case 'U': + return "Unicode"; + case 'V': + return "Other"; default: - LOG(FATAL) << "Invalid type code: " << c << " in typestr of input array interface."; + LOG(FATAL) << "Invalid type code: " << c << " in `typestr' of input array." + << "\nPlease verify the `__cuda_array_interface__' " + << "of your input data complies to: " + << "https://docs.scipy.org/doc/numpy/reference/arrays.interface.html" + << "\nOr open an issue."; return ""; } } + static std::string UnSupportedType(std::string const& typestr) { + return TypeStr(typestr.at(1)) + " is not supported."; + } +}; + +// TODO(trivialfis): Abstract this into a class that accept a json +// object and turn it into an array (for cupy and numba). +class ArrayInterfaceHandler { + public: + template + static constexpr char TypeChar() { + return + (std::is_floating_point::value ? 'f' : + (std::is_integral::value ? + (std::is_signed::value ? 'i' : 'u') : '\0')); + } + template static PtrType GetPtrFromArrayData(std::map const& obj) { if (obj.find("data") == obj.cend()) { @@ -110,30 +129,30 @@ class ArrayInterfaceHandler { static void Validate(std::map const& array) { if (array.find("version") == array.cend()) { - LOG(FATAL) << "Missing version field for array interface"; + LOG(FATAL) << "Missing `version' field for array interface"; } auto version = get(array.at("version")); CHECK_EQ(version, 1) << ColumnarErrors::Version(); if (array.find("typestr") == array.cend()) { - LOG(FATAL) << "Missing typestr field for array interface"; + LOG(FATAL) << "Missing `typestr' field for array interface"; } auto typestr = get(array.at("typestr")); CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat(); CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian(); if (array.find("shape") == array.cend()) { - LOG(FATAL) << "Missing shape field for array interface"; + LOG(FATAL) << "Missing `shape' field for array interface"; } if (array.find("data") == array.cend()) { - LOG(FATAL) << "Missing data field for array interface"; + LOG(FATAL) << "Missing `data' field for array interface"; } } // Find null mask (validity mask) field // Mask object is also an array interface, but with different requirements. - static void ExtractMask(std::map const& column, - common::Span* p_out) { + static size_t ExtractMask(std::map const &column, + common::Span *p_out) { auto& s_mask = *p_out; if (column.find("mask") != column.cend()) { auto const& j_mask = get(column.at("mask")); @@ -143,24 +162,42 @@ class ArrayInterfaceHandler { auto j_shape = get(j_mask.at("shape")); CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1); - CHECK_EQ(get(j_shape.front()) % 8, 0) << - "Length of validity mask must be a multiple of 8 bytes."; - int64_t size = get(j_shape.at(0)) * - sizeof(unsigned char) / sizeof(RBitField8::value_type); auto typestr = get(j_mask.at("typestr")); + // For now this is just 1, we can support different size of interger in mask. + int64_t const type_length = typestr.at(2) - 48; + /* + * shape represents how many bits is in the mask. (This is a grey area, don't be + * suprised if it suddently represents something else when supporting a new + * implementation). Quoting from numpy array interface: + * + * The shape of this object should be "broadcastable" to the shape of the original + * array. + * + * And that's the only requirement. + */ + int64_t const n_bits = get(j_shape.at(0)); + // The size of span required to cover all bits. Here with 8 bits bitfield, we + // assume 1 byte alignment. + int64_t const span_size = RBitField8::ComputeStorageSize(n_bits); + + if (j_mask.find("strides") != j_mask.cend()) { + auto strides = get(column.at("strides")); + CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1); + CHECK_EQ(get(strides.at(0)), type_length) << ColumnarErrors::Contigious(); + } if (typestr.at(1) == 't') { - CHECK_EQ(typestr.at(2), '1') << "There can be only 1 bit in each entry of bitfield."; + CHECK_EQ(typestr.at(2), '1') << "mask with bitfield type should be of 1 byte per bitfield."; } else if (typestr.at(1) == 'i') { - CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer."; + CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer."; } else { LOG(FATAL) << "mask must be of integer type or bit field type."; } - // For now this is just 1 - int64_t const type_length = typestr.at(2) - 48; - s_mask = {p_mask, size / type_length}; + s_mask = {p_mask, span_size}; + return n_bits; } + return 0; } template @@ -178,8 +215,8 @@ class ArrayInterfaceHandler { if (column.find("strides") != column.cend()) { auto strides = get(column.at("strides")); - CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1); - CHECK_EQ(get(strides.at(0)), 4) << ColumnarErrors::Contigious(); + CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1); + CHECK_EQ(get(strides.at(0)), sizeof(T)) << ColumnarErrors::Contigious(); } auto length = get(j_shape.at(0)); @@ -197,15 +234,22 @@ class ArrayInterfaceHandler { foreign_col.size = s_data.size(); common::Span s_mask; - ArrayInterfaceHandler::ExtractMask(column, &s_mask); + size_t n_bits = ArrayInterfaceHandler::ExtractMask(column, &s_mask); foreign_col.valid = RBitField8(s_mask); + if (s_mask.data()) { + CHECK_EQ(n_bits, foreign_col.data.size()) + << "Shape of bit mask doesn't match data shape. " + << "XGBoost doesn't support internal broadcasting."; + } + return foreign_col; } }; #define DISPATCH_TYPE(__dispatched_func, __typestr, ...) { \ + CHECK_EQ(__typestr.size(), 3) << ColumnarErrors::TypestrFormat(); \ if (__typestr.at(1) == 'f' && __typestr.at(2) == '4') { \ __dispatched_func(__VA_ARGS__); \ } else if (__typestr.at(1) == 'f' && __typestr.at(2) == '8') { \ @@ -227,7 +271,7 @@ class ArrayInterfaceHandler { } else if (__typestr.at(1) == 'u' && __typestr.at(2) == '8') { \ __dispatched_func(__VA_ARGS__); \ } else { \ - LOG(FATAL) << ColumnarErrors::UnknownTypeStr(__typestr); \ + LOG(FATAL) << ColumnarErrors::UnSupportedType(__typestr); \ } \ } diff --git a/src/data/simple_csr_source.cu b/src/data/simple_csr_source.cu index b1ba5e2b63eb..93a9462bfbc8 100644 --- a/src/data/simple_csr_source.cu +++ b/src/data/simple_csr_source.cu @@ -186,7 +186,8 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector const& columns, // one copy seems easier. this->info.num_nonzero_ = tmp_offset.back(); - int device = this->page_.offset.DeviceIdx(); + // Device is obtained and set in `CountValid' + int32_t const device = this->page_.offset.DeviceIdx(); this->page_.data.SetDevice(device); this->page_.data.Resize(this->info.num_nonzero_); auto s_data = this->page_.data.DeviceSpan(); diff --git a/src/data/simple_csr_source.h b/src/data/simple_csr_source.h index 5236021cd613..1ae0d189685b 100644 --- a/src/data/simple_csr_source.h +++ b/src/data/simple_csr_source.h @@ -16,9 +16,10 @@ #include #include -#include "columnar.h" - namespace xgboost { + +class Json; + namespace data { /*! * \brief The simplest form of data holder, can be used to create DMatrix. diff --git a/tests/cpp/common/test_bitfield.cu b/tests/cpp/common/test_bitfield.cu index e4ad58b0044c..d641debd8b7e 100644 --- a/tests/cpp/common/test_bitfield.cu +++ b/tests/cpp/common/test_bitfield.cu @@ -17,6 +17,16 @@ __global__ void TestSetKernel(LBitField64 bits) { } } +TEST(BitField, StorageSize) { + size_t constexpr kElements { 16 }; + size_t size = LBitField64::ComputeStorageSize(kElements); + ASSERT_EQ(1, size); + size = RBitField8::ComputeStorageSize(4); + ASSERT_EQ(1, size); + size = RBitField8::ComputeStorageSize(kElements); + ASSERT_EQ(2, size); +} + TEST(BitField, GPU_Set) { dh::device_vector storage; uint32_t constexpr kBits = 128; diff --git a/tests/cpp/data/test_simple_csr_source.cu b/tests/cpp/data/test_simple_csr_source.cu index 7a6579aea347..47bd19d0492b 100644 --- a/tests/cpp/data/test_simple_csr_source.cu +++ b/tests/cpp/data/test_simple_csr_source.cu @@ -38,6 +38,15 @@ TEST(ArrayInterfaceHandler, Error) { Json(Boolean(false))}; column["data"] = j_data; EXPECT_NO_THROW(ArrayInterfaceHandler::ExtractArray(column_obj)); + + std::vector j_mask_shape {Json(Integer(static_cast(kRows - 1)))}; + column["mask"] = Object(); + column["mask"]["shape"] = j_mask_shape; + column["mask"]["data"] = j_data; + column["mask"]["typestr"] = String("(1)); + // shape of mask and data doesn't match. + EXPECT_THROW(ArrayInterfaceHandler::ExtractArray(column_obj), dmlc::Error); } template @@ -47,7 +56,7 @@ Json GenerateDenseColumn(std::string const& typestr, size_t kRows, Json column { Object() }; std::vector j_shape {Json(Integer(static_cast(kRows)))}; column["shape"] = Array(j_shape); - column["strides"] = Array(std::vector{Json(Integer(static_cast(4)))}); + column["strides"] = Array(std::vector{Json(Integer(static_cast(sizeof(T))))}); d_data.resize(kRows); for (size_t i = 0; i < d_data.size(); ++i) { @@ -66,6 +75,29 @@ Json GenerateDenseColumn(std::string const& typestr, size_t kRows, return column; } +void TestDenseColumn(std::unique_ptr const& source, + size_t n_rows, size_t n_cols) { + auto const& data = source->page_.data.HostVector(); + auto const& offset = source->page_.offset.HostVector(); + + for (size_t i = 0; i < n_rows; i++) { + auto const idx = i * n_cols; + auto const e_0 = data.at(idx); + ASSERT_NEAR(e_0.fvalue, i * 2.0, kRtEps) << "idx: " << idx; + ASSERT_EQ(e_0.index, 0); // feature 0 + + auto e_1 = data.at(idx+1); + ASSERT_NEAR(e_1.fvalue, i * 2.0, kRtEps); + ASSERT_EQ(e_1.index, 1); // feature 1 + } + ASSERT_EQ(offset.back(), n_rows * n_cols); + for (size_t i = 0; i < n_rows + 1; ++i) { + ASSERT_EQ(offset[i], i * n_cols); + } + ASSERT_EQ(source->info.num_row_, n_rows); + ASSERT_EQ(source->info.num_col_, n_cols); +} + TEST(SimpleCSRSource, FromColumnarDense) { constexpr size_t kRows {16}; constexpr size_t kCols {2}; @@ -85,25 +117,7 @@ TEST(SimpleCSRSource, FromColumnarDense) { { std::unique_ptr source (new data::SimpleCSRSource()); source->CopyFrom(str.c_str(), false); - - auto const& data = source->page_.data.HostVector(); - auto const& offset = source->page_.offset.HostVector(); - for (size_t i = 0; i < kRows; i++) { - auto const idx = i * kCols; - auto const e_0 = data.at(idx); - ASSERT_NEAR(e_0.fvalue, i * 2.0, kRtEps) << "idx: " << idx; - ASSERT_EQ(e_0.index, 0); // feature 0 - - auto e_1 = data.at(idx+1); - ASSERT_NEAR(e_1.fvalue, i * 2.0, kRtEps); - ASSERT_EQ(e_1.index, 1); // feature 1 - } - ASSERT_EQ(offset.back(), kRows * kCols); - for (size_t i = 0; i < kRows + 1; ++i) { - ASSERT_EQ(offset[i], i * kCols); - } - ASSERT_EQ(source->info.num_row_, kRows); - ASSERT_EQ(source->info.num_col_, kCols); + TestDenseColumn(source, kRows, kCols); } // with missing value specified @@ -145,9 +159,9 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) { std::vector v_columns (kCols); std::vector> columns_data(kCols); - std::vector> column_bitfields(kCols); + std::vector> column_bitfields(kCols); - unsigned char constexpr kUCOne = 1; + RBitField8::value_type constexpr kUCOne = 1; for (size_t i = 0; i < kCols; ++i) { auto& col = v_columns[i]; @@ -193,7 +207,7 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) { j_mask["data"] = std::vector{ Json(Integer(reinterpret_cast(mask_storage.data().get()))), Json(Boolean(false))}; - j_mask["shape"] = Array(std::vector{Json(Integer(static_cast(16)))}); + j_mask["shape"] = Array(std::vector{Json(Integer(static_cast(kRows)))}); j_mask["typestr"] = String("|i1"); } @@ -220,10 +234,10 @@ TEST(SimpleCSRSource, FromColumnarWithEmptyRows) { TEST(SimpleCSRSource, FromColumnarSparse) { constexpr size_t kRows = 32; constexpr size_t kCols = 2; - unsigned char constexpr kUCOne = 1; + RBitField8::value_type constexpr kUCOne = 1; std::vector> columns_data(kCols); - std::vector> column_bitfields(kCols); + std::vector> column_bitfields(kCols); { // column 0 @@ -278,7 +292,7 @@ TEST(SimpleCSRSource, FromColumnarSparse) { j_mask["data"] = std::vector{ Json(Integer(reinterpret_cast(column_bitfields[c].data().get()))), Json(Boolean(false))}; - j_mask["shape"] = Array(std::vector{Json(Integer(static_cast(8)))}); + j_mask["shape"] = Array(std::vector{Json(Integer(static_cast(kRows)))}); j_mask["typestr"] = String("|i1"); } @@ -348,4 +362,26 @@ TEST(SimpleCSRSource, FromColumnarSparse) { } } +TEST(SimpleCSRSource, Types) { + // Test with different types of different size + constexpr size_t kRows {16}; + constexpr size_t kCols {2}; + std::vector columns; + thrust::device_vector d_data_0(kRows); + thrust::device_vector d_data_1(kRows); + + columns.emplace_back(GenerateDenseColumn("(" source (new data::SimpleCSRSource()); + source->CopyFrom(str.c_str(), false); + TestDenseColumn(source, kRows, kCols); +} + } // namespace xgboost \ No newline at end of file diff --git a/tests/python-gpu/test_from_columnar.py b/tests/python-gpu/test_from_columnar.py index 53fdfcc10035..cd5a567c8af3 100644 --- a/tests/python-gpu/test_from_columnar.py +++ b/tests/python-gpu/test_from_columnar.py @@ -69,3 +69,20 @@ def test_from_cudf(self): with pytest.raises(Exception): dtrain = xgb.DMatrix(cd, label=cd) + + # Test when number of elements is less than 8 + X = cudf.DataFrame({'x': cudf.Series([0, 1, 2, np.NAN, 4], + dtype=np.int32)}) + dtrain = xgb.DMatrix(X) + assert dtrain.num_col() == 1 + assert dtrain.num_row() == 5 + + # Boolean is not supported. + X_boolean = cudf.DataFrame({'x': cudf.Series([True, False])}) + with pytest.raises(Exception): + dtrain = xgb.DMatrix(X_boolean) + + y_boolean = cudf.DataFrame({ + 'x': cudf.Series([True, False, True, True, True])}) + with pytest.raises(Exception): + dtrain = xgb.DMatrix(X_boolean, label=y_boolean)