Skip to content

Commit

Permalink
Ignore columnar alignment requirement. (#4928)
Browse files Browse the repository at this point in the history
* Better error message for wrong type.
* Fix stride size.
  • Loading branch information
trivialfis authored Oct 13, 2019
1 parent 05d4751 commit 3d46bd0
Show file tree
Hide file tree
Showing 7 changed files with 183 additions and 79 deletions.
19 changes: 7 additions & 12 deletions src/common/bitfield.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#endif // defined(__CUDACC__)

#include "xgboost/span.h"
#include "common.h"

namespace xgboost {

Expand Down Expand Up @@ -84,17 +85,11 @@ struct BitFieldContainer {
XGBOOST_DEVICE BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}

/*\brief Compute the size of needed memory allocation. The returned value is in terms
* of number of elements with `BitFieldContainer::value_type'.
*/
static size_t ComputeStorageSize(size_t size) {
auto pos = ToBitPos(size);
if (size < kValueSize) {
return 1;
}

if (pos.bit_pos != 0) {
return pos.int_pos + 2;
} else {
return pos.int_pos + 1;
}
return common::DivRoundUp(size, kValueSize);
}
#if defined(__CUDA_ARCH__)
__device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
Expand Down Expand Up @@ -216,9 +211,9 @@ struct RBitsPolicy : public BitFieldContainer<VT, RBitsPolicy<VT>> {
}
};

// Format: <Direction>BitField<size of underlying type>, underlying type must be unsigned.
// Format: <Direction>BitField<size of underlying type in bits>, underlying type must be unsigned.
using LBitField64 = BitFieldContainer<uint64_t, LBitsPolicy<uint64_t>>;
using RBitField8 = BitFieldContainer<unsigned char, RBitsPolicy<unsigned char>>;
using RBitField8 = BitFieldContainer<uint8_t, RBitsPolicy<unsigned char>>;

#if defined(__CUDACC__)

Expand Down
120 changes: 82 additions & 38 deletions src/data/columnar.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ struct ColumnarErrors {
return "Memory should be contigious.";
}
static char const* TypestrFormat() {
return "`typestr` should be of format <endian><type><size>.";
return "`typestr' should be of format <endian><type><size of type in bytes>.";
}
// Not supported in Apache Arrow.
static char const* BigEndian() {
Expand All @@ -50,7 +50,7 @@ struct ColumnarErrors {
return str.c_str();
}
static char const* Version() {
return "Only version 1 of __cuda_array_interface__ is being supported.";
return "Only version 1 of `__cuda_array_interface__' is supported.";
}
static char const* ofType(std::string const& type) {
static std::string str;
Expand All @@ -60,22 +60,6 @@ struct ColumnarErrors {
str += " type.";
return str.c_str();
}
static std::string UnknownTypeStr(std::string const& typestr) {
return "typestr from array interface: " + typestr + " is not supported.";
}
};

// TODO(trivialfis): Abstract this into a class that accept a json
// object and turn it into an array (for cupy and numba).
class ArrayInterfaceHandler {
public:
template <typename T>
static constexpr char TypeChar() {
return
(std::is_floating_point<T>::value ? 'f' :
(std::is_integral<T>::value ?
(std::is_signed<T>::value ? 'i' : 'u') : '\0'));
}

static std::string TypeStr(char c) {
switch (c) {
Expand All @@ -89,12 +73,47 @@ class ArrayInterfaceHandler {
return "Unsigned integer";
case 'f':
return "Floating point";
case 'c':
return "Complex floating point";
case 'm':
return "Timedelta";
case 'M':
return "Datetime";
case 'O':
return "Object";
case 'S':
return "String";
case 'U':
return "Unicode";
case 'V':
return "Other";
default:
LOG(FATAL) << "Invalid type code: " << c << " in typestr of input array interface.";
LOG(FATAL) << "Invalid type code: " << c << " in `typestr' of input array."
<< "\nPlease verify the `__cuda_array_interface__' "
<< "of your input data complies to: "
<< "https://docs.scipy.org/doc/numpy/reference/arrays.interface.html"
<< "\nOr open an issue.";
return "";
}
}

static std::string UnSupportedType(std::string const& typestr) {
return TypeStr(typestr.at(1)) + " is not supported.";
}
};

// TODO(trivialfis): Abstract this into a class that accept a json
// object and turn it into an array (for cupy and numba).
class ArrayInterfaceHandler {
public:
template <typename T>
static constexpr char TypeChar() {
return
(std::is_floating_point<T>::value ? 'f' :
(std::is_integral<T>::value ?
(std::is_signed<T>::value ? 'i' : 'u') : '\0'));
}

template <typename PtrType>
static PtrType GetPtrFromArrayData(std::map<std::string, Json> const& obj) {
if (obj.find("data") == obj.cend()) {
Expand All @@ -110,30 +129,30 @@ class ArrayInterfaceHandler {

static void Validate(std::map<std::string, Json> const& array) {
if (array.find("version") == array.cend()) {
LOG(FATAL) << "Missing version field for array interface";
LOG(FATAL) << "Missing `version' field for array interface";
}
auto version = get<Integer const>(array.at("version"));
CHECK_EQ(version, 1) << ColumnarErrors::Version();

if (array.find("typestr") == array.cend()) {
LOG(FATAL) << "Missing typestr field for array interface";
LOG(FATAL) << "Missing `typestr' field for array interface";
}
auto typestr = get<String const>(array.at("typestr"));
CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();

if (array.find("shape") == array.cend()) {
LOG(FATAL) << "Missing shape field for array interface";
LOG(FATAL) << "Missing `shape' field for array interface";
}
if (array.find("data") == array.cend()) {
LOG(FATAL) << "Missing data field for array interface";
LOG(FATAL) << "Missing `data' field for array interface";
}
}

// Find null mask (validity mask) field
// Mask object is also an array interface, but with different requirements.
static void ExtractMask(std::map<std::string, Json> const& column,
common::Span<RBitField8::value_type>* p_out) {
static size_t ExtractMask(std::map<std::string, Json> const &column,
common::Span<RBitField8::value_type> *p_out) {
auto& s_mask = *p_out;
if (column.find("mask") != column.cend()) {
auto const& j_mask = get<Object const>(column.at("mask"));
Expand All @@ -143,24 +162,42 @@ class ArrayInterfaceHandler {

auto j_shape = get<Array const>(j_mask.at("shape"));
CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(j_shape.front()) % 8, 0) <<
"Length of validity mask must be a multiple of 8 bytes.";
int64_t size = get<Integer>(j_shape.at(0)) *
sizeof(unsigned char) / sizeof(RBitField8::value_type);
auto typestr = get<String const>(j_mask.at("typestr"));
// For now this is just 1, we can support different size of interger in mask.
int64_t const type_length = typestr.at(2) - 48;
/*
* shape represents how many bits is in the mask. (This is a grey area, don't be
* suprised if it suddently represents something else when supporting a new
* implementation). Quoting from numpy array interface:
*
* The shape of this object should be "broadcastable" to the shape of the original
* array.
*
* And that's the only requirement.
*/
int64_t const n_bits = get<Integer>(j_shape.at(0));
// The size of span required to cover all bits. Here with 8 bits bitfield, we
// assume 1 byte alignment.
int64_t const span_size = RBitField8::ComputeStorageSize(n_bits);

if (j_mask.find("strides") != j_mask.cend()) {
auto strides = get<Array const>(column.at("strides"));
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), type_length) << ColumnarErrors::Contigious();
}

if (typestr.at(1) == 't') {
CHECK_EQ(typestr.at(2), '1') << "There can be only 1 bit in each entry of bitfield.";
CHECK_EQ(typestr.at(2), '1') << "mask with bitfield type should be of 1 byte per bitfield.";
} else if (typestr.at(1) == 'i') {
CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer.";
CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer.";
} else {
LOG(FATAL) << "mask must be of integer type or bit field type.";
}

// For now this is just 1
int64_t const type_length = typestr.at(2) - 48;
s_mask = {p_mask, size / type_length};
s_mask = {p_mask, span_size};
return n_bits;
}
return 0;
}

template <typename T>
Expand All @@ -178,8 +215,8 @@ class ArrayInterfaceHandler {

if (column.find("strides") != column.cend()) {
auto strides = get<Array const>(column.at("strides"));
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), 4) << ColumnarErrors::Contigious();
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), sizeof(T)) << ColumnarErrors::Contigious();
}

auto length = get<Integer const>(j_shape.at(0));
Expand All @@ -197,15 +234,22 @@ class ArrayInterfaceHandler {
foreign_col.size = s_data.size();

common::Span<RBitField8::value_type> s_mask;
ArrayInterfaceHandler::ExtractMask(column, &s_mask);
size_t n_bits = ArrayInterfaceHandler::ExtractMask(column, &s_mask);

foreign_col.valid = RBitField8(s_mask);

if (s_mask.data()) {
CHECK_EQ(n_bits, foreign_col.data.size())
<< "Shape of bit mask doesn't match data shape. "
<< "XGBoost doesn't support internal broadcasting.";
}

return foreign_col;
}
};

#define DISPATCH_TYPE(__dispatched_func, __typestr, ...) { \
CHECK_EQ(__typestr.size(), 3) << ColumnarErrors::TypestrFormat(); \
if (__typestr.at(1) == 'f' && __typestr.at(2) == '4') { \
__dispatched_func<float>(__VA_ARGS__); \
} else if (__typestr.at(1) == 'f' && __typestr.at(2) == '8') { \
Expand All @@ -227,7 +271,7 @@ class ArrayInterfaceHandler {
} else if (__typestr.at(1) == 'u' && __typestr.at(2) == '8') { \
__dispatched_func<uint64_t>(__VA_ARGS__); \
} else { \
LOG(FATAL) << ColumnarErrors::UnknownTypeStr(__typestr); \
LOG(FATAL) << ColumnarErrors::UnSupportedType(__typestr); \
} \
}

Expand Down
3 changes: 2 additions & 1 deletion src/data/simple_csr_source.cu
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector<Json> const& columns,
// one copy seems easier.
this->info.num_nonzero_ = tmp_offset.back();

int device = this->page_.offset.DeviceIdx();
// Device is obtained and set in `CountValid'
int32_t const device = this->page_.offset.DeviceIdx();
this->page_.data.SetDevice(device);
this->page_.data.Resize(this->info.num_nonzero_);
auto s_data = this->page_.data.DeviceSpan();
Expand Down
5 changes: 3 additions & 2 deletions src/data/simple_csr_source.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
#include <vector>
#include <limits>

#include "columnar.h"

namespace xgboost {

class Json;

namespace data {
/*!
* \brief The simplest form of data holder, can be used to create DMatrix.
Expand Down
10 changes: 10 additions & 0 deletions tests/cpp/common/test_bitfield.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@ __global__ void TestSetKernel(LBitField64 bits) {
}
}

TEST(BitField, StorageSize) {
size_t constexpr kElements { 16 };
size_t size = LBitField64::ComputeStorageSize(kElements);
ASSERT_EQ(1, size);
size = RBitField8::ComputeStorageSize(4);
ASSERT_EQ(1, size);
size = RBitField8::ComputeStorageSize(kElements);
ASSERT_EQ(2, size);
}

TEST(BitField, GPU_Set) {
dh::device_vector<LBitField64::value_type> storage;
uint32_t constexpr kBits = 128;
Expand Down
Loading

0 comments on commit 3d46bd0

Please sign in to comment.