Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore columnar alignment requirement. #4928

Merged
merged 7 commits into from
Oct 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 7 additions & 12 deletions src/common/bitfield.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#endif // defined(__CUDACC__)

#include "xgboost/span.h"
#include "common.h"

namespace xgboost {

Expand Down Expand Up @@ -84,17 +85,11 @@ struct BitFieldContainer {
XGBOOST_DEVICE BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}

/*\brief Compute the size of needed memory allocation. The returned value is in terms
* of number of elements with `BitFieldContainer::value_type'.
*/
static size_t ComputeStorageSize(size_t size) {
auto pos = ToBitPos(size);
if (size < kValueSize) {
return 1;
}

if (pos.bit_pos != 0) {
return pos.int_pos + 2;
} else {
return pos.int_pos + 1;
}
return common::DivRoundUp(size, kValueSize);
}
#if defined(__CUDA_ARCH__)
__device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
Expand Down Expand Up @@ -216,9 +211,9 @@ struct RBitsPolicy : public BitFieldContainer<VT, RBitsPolicy<VT>> {
}
};

// Format: <Direction>BitField<size of underlying type>, underlying type must be unsigned.
// Format: <Direction>BitField<size of underlying type in bits>, underlying type must be unsigned.
using LBitField64 = BitFieldContainer<uint64_t, LBitsPolicy<uint64_t>>;
using RBitField8 = BitFieldContainer<unsigned char, RBitsPolicy<unsigned char>>;
using RBitField8 = BitFieldContainer<uint8_t, RBitsPolicy<unsigned char>>;

#if defined(__CUDACC__)

Expand Down
120 changes: 82 additions & 38 deletions src/data/columnar.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ struct ColumnarErrors {
return "Memory should be contigious.";
}
static char const* TypestrFormat() {
return "`typestr` should be of format <endian><type><size>.";
return "`typestr' should be of format <endian><type><size of type in bytes>.";
}
// Not supported in Apache Arrow.
static char const* BigEndian() {
Expand All @@ -50,7 +50,7 @@ struct ColumnarErrors {
return str.c_str();
}
static char const* Version() {
return "Only version 1 of __cuda_array_interface__ is being supported.";
return "Only version 1 of `__cuda_array_interface__' is supported.";
}
static char const* ofType(std::string const& type) {
static std::string str;
Expand All @@ -60,22 +60,6 @@ struct ColumnarErrors {
str += " type.";
return str.c_str();
}
static std::string UnknownTypeStr(std::string const& typestr) {
return "typestr from array interface: " + typestr + " is not supported.";
}
};

// TODO(trivialfis): Abstract this into a class that accept a json
// object and turn it into an array (for cupy and numba).
class ArrayInterfaceHandler {
public:
template <typename T>
static constexpr char TypeChar() {
return
(std::is_floating_point<T>::value ? 'f' :
(std::is_integral<T>::value ?
(std::is_signed<T>::value ? 'i' : 'u') : '\0'));
}

static std::string TypeStr(char c) {
switch (c) {
Expand All @@ -89,12 +73,47 @@ class ArrayInterfaceHandler {
return "Unsigned integer";
case 'f':
return "Floating point";
case 'c':
return "Complex floating point";
case 'm':
return "Timedelta";
case 'M':
return "Datetime";
case 'O':
return "Object";
case 'S':
return "String";
case 'U':
return "Unicode";
case 'V':
return "Other";
default:
LOG(FATAL) << "Invalid type code: " << c << " in typestr of input array interface.";
LOG(FATAL) << "Invalid type code: " << c << " in `typestr' of input array."
<< "\nPlease verify the `__cuda_array_interface__' "
<< "of your input data complies to: "
<< "https://docs.scipy.org/doc/numpy/reference/arrays.interface.html"
<< "\nOr open an issue.";
return "";
}
}

static std::string UnSupportedType(std::string const& typestr) {
return TypeStr(typestr.at(1)) + " is not supported.";
}
};

// TODO(trivialfis): Abstract this into a class that accept a json
// object and turn it into an array (for cupy and numba).
class ArrayInterfaceHandler {
public:
template <typename T>
static constexpr char TypeChar() {
return
(std::is_floating_point<T>::value ? 'f' :
(std::is_integral<T>::value ?
(std::is_signed<T>::value ? 'i' : 'u') : '\0'));
}

template <typename PtrType>
static PtrType GetPtrFromArrayData(std::map<std::string, Json> const& obj) {
if (obj.find("data") == obj.cend()) {
Expand All @@ -110,30 +129,30 @@ class ArrayInterfaceHandler {

static void Validate(std::map<std::string, Json> const& array) {
if (array.find("version") == array.cend()) {
LOG(FATAL) << "Missing version field for array interface";
LOG(FATAL) << "Missing `version' field for array interface";
}
auto version = get<Integer const>(array.at("version"));
CHECK_EQ(version, 1) << ColumnarErrors::Version();

if (array.find("typestr") == array.cend()) {
LOG(FATAL) << "Missing typestr field for array interface";
LOG(FATAL) << "Missing `typestr' field for array interface";
}
auto typestr = get<String const>(array.at("typestr"));
CHECK_EQ(typestr.size(), 3) << ColumnarErrors::TypestrFormat();
CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();

if (array.find("shape") == array.cend()) {
LOG(FATAL) << "Missing shape field for array interface";
LOG(FATAL) << "Missing `shape' field for array interface";
}
if (array.find("data") == array.cend()) {
LOG(FATAL) << "Missing data field for array interface";
LOG(FATAL) << "Missing `data' field for array interface";
}
}

// Find null mask (validity mask) field
// Mask object is also an array interface, but with different requirements.
static void ExtractMask(std::map<std::string, Json> const& column,
common::Span<RBitField8::value_type>* p_out) {
static size_t ExtractMask(std::map<std::string, Json> const &column,
common::Span<RBitField8::value_type> *p_out) {
auto& s_mask = *p_out;
if (column.find("mask") != column.cend()) {
auto const& j_mask = get<Object const>(column.at("mask"));
Expand All @@ -143,24 +162,42 @@ class ArrayInterfaceHandler {

auto j_shape = get<Array const>(j_mask.at("shape"));
CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(j_shape.front()) % 8, 0) <<
"Length of validity mask must be a multiple of 8 bytes.";
int64_t size = get<Integer>(j_shape.at(0)) *
sizeof(unsigned char) / sizeof(RBitField8::value_type);
auto typestr = get<String const>(j_mask.at("typestr"));
// For now this is just 1, we can support different size of interger in mask.
int64_t const type_length = typestr.at(2) - 48;
/*
* shape represents how many bits is in the mask. (This is a grey area, don't be
* suprised if it suddently represents something else when supporting a new
* implementation). Quoting from numpy array interface:
*
* The shape of this object should be "broadcastable" to the shape of the original
* array.
*
* And that's the only requirement.
*/
int64_t const n_bits = get<Integer>(j_shape.at(0));
// The size of span required to cover all bits. Here with 8 bits bitfield, we
// assume 1 byte alignment.
int64_t const span_size = RBitField8::ComputeStorageSize(n_bits);

if (j_mask.find("strides") != j_mask.cend()) {
auto strides = get<Array const>(column.at("strides"));
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), type_length) << ColumnarErrors::Contigious();
}

if (typestr.at(1) == 't') {
CHECK_EQ(typestr.at(2), '1') << "There can be only 1 bit in each entry of bitfield.";
CHECK_EQ(typestr.at(2), '1') << "mask with bitfield type should be of 1 byte per bitfield.";
} else if (typestr.at(1) == 'i') {
CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer.";
CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer.";
} else {
LOG(FATAL) << "mask must be of integer type or bit field type.";
}

// For now this is just 1
int64_t const type_length = typestr.at(2) - 48;
s_mask = {p_mask, size / type_length};
s_mask = {p_mask, span_size};
return n_bits;
}
return 0;
}

template <typename T>
Expand All @@ -178,8 +215,8 @@ class ArrayInterfaceHandler {

if (column.find("strides") != column.cend()) {
auto strides = get<Array const>(column.at("strides"));
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), 4) << ColumnarErrors::Contigious();
CHECK_EQ(strides.size(), 1) << ColumnarErrors::Dimension(1);
CHECK_EQ(get<Integer>(strides.at(0)), sizeof(T)) << ColumnarErrors::Contigious();
}

auto length = get<Integer const>(j_shape.at(0));
Expand All @@ -197,15 +234,22 @@ class ArrayInterfaceHandler {
foreign_col.size = s_data.size();

common::Span<RBitField8::value_type> s_mask;
ArrayInterfaceHandler::ExtractMask(column, &s_mask);
size_t n_bits = ArrayInterfaceHandler::ExtractMask(column, &s_mask);

foreign_col.valid = RBitField8(s_mask);

if (s_mask.data()) {
CHECK_EQ(n_bits, foreign_col.data.size())
<< "Shape of bit mask doesn't match data shape. "
<< "XGBoost doesn't support internal broadcasting.";
}

return foreign_col;
}
};

#define DISPATCH_TYPE(__dispatched_func, __typestr, ...) { \
CHECK_EQ(__typestr.size(), 3) << ColumnarErrors::TypestrFormat(); \
if (__typestr.at(1) == 'f' && __typestr.at(2) == '4') { \
__dispatched_func<float>(__VA_ARGS__); \
} else if (__typestr.at(1) == 'f' && __typestr.at(2) == '8') { \
Expand All @@ -227,7 +271,7 @@ class ArrayInterfaceHandler {
} else if (__typestr.at(1) == 'u' && __typestr.at(2) == '8') { \
__dispatched_func<uint64_t>(__VA_ARGS__); \
} else { \
LOG(FATAL) << ColumnarErrors::UnknownTypeStr(__typestr); \
LOG(FATAL) << ColumnarErrors::UnSupportedType(__typestr); \
} \
}

Expand Down
3 changes: 2 additions & 1 deletion src/data/simple_csr_source.cu
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector<Json> const& columns,
// one copy seems easier.
this->info.num_nonzero_ = tmp_offset.back();

int device = this->page_.offset.DeviceIdx();
// Device is obtained and set in `CountValid'
int32_t const device = this->page_.offset.DeviceIdx();
this->page_.data.SetDevice(device);
this->page_.data.Resize(this->info.num_nonzero_);
auto s_data = this->page_.data.DeviceSpan();
Expand Down
5 changes: 3 additions & 2 deletions src/data/simple_csr_source.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
#include <vector>
#include <limits>

#include "columnar.h"

namespace xgboost {

class Json;

namespace data {
/*!
* \brief The simplest form of data holder, can be used to create DMatrix.
Expand Down
10 changes: 10 additions & 0 deletions tests/cpp/common/test_bitfield.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@ __global__ void TestSetKernel(LBitField64 bits) {
}
}

TEST(BitField, StorageSize) {
size_t constexpr kElements { 16 };
size_t size = LBitField64::ComputeStorageSize(kElements);
ASSERT_EQ(1, size);
size = RBitField8::ComputeStorageSize(4);
ASSERT_EQ(1, size);
size = RBitField8::ComputeStorageSize(kElements);
ASSERT_EQ(2, size);
}

TEST(BitField, GPU_Set) {
dh::device_vector<LBitField64::value_type> storage;
uint32_t constexpr kBits = 128;
Expand Down
Loading