Ignore columnar alignment requirement. (#4928)

* Better error message for wrong type. * Fix stride size.
dmlc · Oct 13, 2019 · 3d46bd0 · 3d46bd0
1 parent 05d4751
commit 3d46bd0
Show file tree

Hide file tree

Showing 7 changed files with 183 additions and 79 deletions.
diff --git a/src/common/bitfield.h b/src/common/bitfield.h
@@ -19,6 +19,7 @@
 #endif  // defined(__CUDACC__)
 
 #include "xgboost/span.h"
+#include "common.h"
 
 namespace xgboost {
 
@@ -84,17 +85,11 @@ struct BitFieldContainer {
   XGBOOST_DEVICE BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
   XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}
 
+  /*\brief Compute the size of needed memory allocation.  The returned value is in terms
+   *       of number of elements with `BitFieldContainer::value_type'.
+   */
   static size_t ComputeStorageSize(size_t size) {
-    auto pos = ToBitPos(size);
-    if (size < kValueSize) {
-      return 1;
-    }
-
-    if (pos.bit_pos != 0) {
-      return pos.int_pos + 2;
-    } else {
-      return pos.int_pos + 1;
-    }
+    return common::DivRoundUp(size, kValueSize);
   }
 #if defined(__CUDA_ARCH__)
   __device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
@@ -216,9 +211,9 @@ struct RBitsPolicy : public BitFieldContainer<VT, RBitsPolicy<VT>> {
   }
 };
 
-// Format: <Direction>BitField<size of underlying type>, underlying type must be unsigned.
+// Format: <Direction>BitField<size of underlying type in bits>, underlying type must be unsigned.
 using LBitField64 = BitFieldContainer<uint64_t, LBitsPolicy<uint64_t>>;
-using RBitField8 = BitFieldContainer<unsigned char, RBitsPolicy<unsigned char>>;
+using RBitField8 = BitFieldContainer<uint8_t, RBitsPolicy<unsigned char>>;
 
 #if defined(__CUDACC__)
 

diff --git a/src/data/columnar.h b/src/data/columnar.h
@@ -35,7 +35,7 @@ struct ColumnarErrors {
     return "Memory should be contigious.";
   }
   static char const* TypestrFormat() {
-    return "`typestr` should be of format <endian><type><size>.";
+    return "`typestr' should be of format <endian><type><size of type in bytes>.";
   }
   // Not supported in Apache Arrow.
   static char const* BigEndian() {
@@ -50,7 +50,7 @@ struct ColumnarErrors {
     return str.c_str();
   }
   static char const* Version() {
-    return "Only version 1 of __cuda_array_interface__ is being supported.";
+    return "Only version 1 of `__cuda_array_interface__' is supported.";
   }
   static char const* ofType(std::string const& type) {
     static std::string str;
@@ -60,22 +60,6 @@ struct ColumnarErrors {
     str += " type.";
     return str.c_str();
   }
-  static std::string UnknownTypeStr(std::string const& typestr) {
-    return "typestr from array interface: " + typestr + " is not supported.";
-  }
-};
-
-// TODO(trivialfis): Abstract this into a class that accept a json
-// object and turn it into an array (for cupy and numba).
-class ArrayInterfaceHandler {
- public:
-  template <typename T>
-  static constexpr char TypeChar() {
-    return
-        (std::is_floating_point<T>::value ? 'f' :
-         (std::is_integral<T>::value ?
-          (std::is_signed<T>::value ? 'i' : 'u') : '\0'));
-  }
 
   static std::string TypeStr(char c) {
     switch (c) {
@@ -89,12 +73,47 @@ class ArrayInterfaceHandler {
         return "Unsigned integer";
       case 'f':
         return "Floating point";
+      case 'c':
+        return "Complex floating point";
+      case 'm':
+        return "Timedelta";
+      case 'M':
+        return "Datetime";
+      case 'O':
+        return "Object";
+      case 'S':
+        return "String";
+      case 'U':
+        return "Unicode";
+      case 'V':
+        return "Other";
       default:
-        LOG(FATAL) << "Invalid type code: " << c << " in typestr of input array interface.";
+        LOG(FATAL) << "Invalid type code: " << c << " in `typestr' of input array."
+                   << "\nPlease verify the `__cuda_array_interface__' "
+                   << "of your input data complies to: "
+                   << "https://docs.scipy.org/doc/numpy/reference/arrays.interface.html"
+                   << "\nOr open an issue.";
         return "";
     }
   }
 
+  static std::string UnSupportedType(std::string const& typestr) {
+    return TypeStr(typestr.at(1)) + " is not supported.";
+  }
+};
+
+// TODO(trivialfis): Abstract this into a class that accept a json
+// object and turn it into an array (for cupy and numba).
+class ArrayInterfaceHandler {
+ public:
+  template <typename T>
+  static constexpr char TypeChar() {
+    return
+        (std::is_floating_point<T>::value ? 'f' :
+         (std::is_integral<T>::value ?
+          (std::is_signed<T>::value ? 'i' : 'u') : '\0'));
+  }
+
   template <typename PtrType>
   static PtrType GetPtrFromArrayData(std::map<std::string, Json> const& obj) {
     if (obj.find("data") == obj.cend()) {
@@ -110,30 +129,30 @@ class ArrayInterfaceHandler {
 
   static void Validate(std::map<std::string, Json> const& array) {
     if (array.find("version") == array.cend()) {
-      LOG(FATAL) << "Missing version field for array interface";
+      LOG(FATAL) << "Missing `version' field for array interface";
     }
     auto version = get<Integer const>(array.at("version"));
     CHECK_EQ(version, 1) << ColumnarErrors::Version();
 
     if (array.find("typestr") == array.cend()) {
-      LOG(FATAL) << "Missing typestr field for array interface";
+      LOG(FATAL) << "Missing `typestr' field for array interface";
     }
     auto typestr = get<String const>(array.at("typestr"));
     CHECK_EQ(typestr.size(),    3) << ColumnarErrors::TypestrFormat();
     CHECK_NE(typestr.front(), '>') << ColumnarErrors::BigEndian();
 
     if (array.find("shape") == array.cend()) {
-      LOG(FATAL) << "Missing shape field for array interface";
+      LOG(FATAL) << "Missing `shape' field for array interface";
     }
     if (array.find("data") == array.cend()) {
-      LOG(FATAL) << "Missing data field for array interface";
+      LOG(FATAL) << "Missing `data' field for array interface";
     }
   }
 
   // Find null mask (validity mask) field
   // Mask object is also an array interface, but with different requirements.
-  static void ExtractMask(std::map<std::string, Json> const& column,
-                             common::Span<RBitField8::value_type>* p_out) {
+  static size_t ExtractMask(std::map<std::string, Json> const &column,
+                            common::Span<RBitField8::value_type> *p_out) {
     auto& s_mask = *p_out;
     if (column.find("mask") != column.cend()) {
       auto const& j_mask = get<Object const>(column.at("mask"));
@@ -143,24 +162,42 @@ class ArrayInterfaceHandler {
 
       auto j_shape = get<Array const>(j_mask.at("shape"));
       CHECK_EQ(j_shape.size(), 1) << ColumnarErrors::Dimension(1);
-      CHECK_EQ(get<Integer>(j_shape.front()) % 8, 0) <<
-          "Length of validity mask must be a multiple of 8 bytes.";
-      int64_t size = get<Integer>(j_shape.at(0)) *
-                     sizeof(unsigned char) / sizeof(RBitField8::value_type);
       auto typestr = get<String const>(j_mask.at("typestr"));
+      // For now this is just 1, we can support different size of interger in mask.
+      int64_t const type_length = typestr.at(2) - 48;
+      /*
+       * shape represents how many bits is in the mask. (This is a grey area, don't be
+       * suprised if it suddently represents something else when supporting a new
+       * implementation).  Quoting from numpy array interface:
+       *
+       *   The shape of this object should be "broadcastable" to the shape of the original
+       *   array.
+       *
+       * And that's the only requirement.
+       */
+      int64_t const n_bits = get<Integer>(j_shape.at(0));
+      // The size of span required to cover all bits.  Here with 8 bits bitfield, we
+      // assume 1 byte alignment.
+      int64_t const span_size = RBitField8::ComputeStorageSize(n_bits);
+
+      if (j_mask.find("strides") != j_mask.cend()) {
+        auto strides = get<Array const>(column.at("strides"));
+        CHECK_EQ(strides.size(),                        1) << ColumnarErrors::Dimension(1);
+        CHECK_EQ(get<Integer>(strides.at(0)), type_length) << ColumnarErrors::Contigious();
+      }
 
       if (typestr.at(1) == 't') {
-        CHECK_EQ(typestr.at(2),   '1') << "There can be only 1 bit in each entry of bitfield.";
+        CHECK_EQ(typestr.at(2), '1') << "mask with bitfield type should be of 1 byte per bitfield.";
       } else if (typestr.at(1) == 'i') {
-        CHECK_EQ(typestr.at(2),   '1') << "mask with integer type should be of 1 byte per integer.";
+        CHECK_EQ(typestr.at(2), '1') << "mask with integer type should be of 1 byte per integer.";
       } else {
         LOG(FATAL) << "mask must be of integer type or bit field type.";
       }
 
-      // For now this is just 1
-      int64_t const type_length = typestr.at(2) - 48;
-      s_mask = {p_mask, size / type_length};
+      s_mask = {p_mask, span_size};
+      return n_bits;
     }
+    return 0;
   }
 
   template <typename T>
@@ -178,8 +215,8 @@ class ArrayInterfaceHandler {
 
     if (column.find("strides") != column.cend()) {
       auto strides = get<Array const>(column.at("strides"));
-      CHECK_EQ(strides.size(), 1)              << ColumnarErrors::Dimension(1);
-      CHECK_EQ(get<Integer>(strides.at(0)), 4) << ColumnarErrors::Contigious();
+      CHECK_EQ(strides.size(),         1)              << ColumnarErrors::Dimension(1);
+      CHECK_EQ(get<Integer>(strides.at(0)), sizeof(T)) << ColumnarErrors::Contigious();
     }
 
     auto length = get<Integer const>(j_shape.at(0));
@@ -197,15 +234,22 @@ class ArrayInterfaceHandler {
     foreign_col.size  = s_data.size();
 
     common::Span<RBitField8::value_type> s_mask;
-    ArrayInterfaceHandler::ExtractMask(column, &s_mask);
+    size_t n_bits = ArrayInterfaceHandler::ExtractMask(column, &s_mask);
 
     foreign_col.valid = RBitField8(s_mask);
 
+    if (s_mask.data()) {
+      CHECK_EQ(n_bits, foreign_col.data.size())
+          << "Shape of bit mask doesn't match data shape. "
+          << "XGBoost doesn't support internal broadcasting.";
+    }
+
     return foreign_col;
   }
 };
 
 #define DISPATCH_TYPE(__dispatched_func, __typestr, ...) {              \
+    CHECK_EQ(__typestr.size(), 3) << ColumnarErrors::TypestrFormat();   \
     if (__typestr.at(1) == 'f' && __typestr.at(2) == '4') {             \
       __dispatched_func<float>(__VA_ARGS__);                            \
     } else if (__typestr.at(1) == 'f' && __typestr.at(2) == '8') {      \
@@ -227,7 +271,7 @@ class ArrayInterfaceHandler {
     } else if (__typestr.at(1) == 'u' && __typestr.at(2) == '8') {      \
       __dispatched_func<uint64_t>(__VA_ARGS__);                         \
     } else {                                                            \
-      LOG(FATAL) << ColumnarErrors::UnknownTypeStr(__typestr);          \
+      LOG(FATAL) << ColumnarErrors::UnSupportedType(__typestr);         \
     }                                                                   \
   }
 

diff --git a/src/data/simple_csr_source.cu b/src/data/simple_csr_source.cu
@@ -186,7 +186,8 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector<Json> const& columns,
   // one copy seems easier.
   this->info.num_nonzero_ = tmp_offset.back();
 
-  int device = this->page_.offset.DeviceIdx();
+  // Device is obtained and set in `CountValid'
+  int32_t const device = this->page_.offset.DeviceIdx();
   this->page_.data.SetDevice(device);
   this->page_.data.Resize(this->info.num_nonzero_);
   auto s_data = this->page_.data.DeviceSpan();

diff --git a/src/data/simple_csr_source.h b/src/data/simple_csr_source.h
@@ -16,9 +16,10 @@
 #include <vector>
 #include <limits>
 
-#include "columnar.h"
-
 namespace xgboost {
+
+class Json;
+
 namespace data {
 /*!
  * \brief The simplest form of data holder, can be used to create DMatrix.

diff --git a/tests/cpp/common/test_bitfield.cu b/tests/cpp/common/test_bitfield.cu
@@ -17,6 +17,16 @@ __global__ void TestSetKernel(LBitField64 bits) {
   }
 }
 
+TEST(BitField, StorageSize) {
+  size_t constexpr kElements { 16 };
+  size_t size = LBitField64::ComputeStorageSize(kElements);
+  ASSERT_EQ(1, size);
+  size = RBitField8::ComputeStorageSize(4);
+  ASSERT_EQ(1, size);
+  size = RBitField8::ComputeStorageSize(kElements);
+  ASSERT_EQ(2, size);
+}
+
 TEST(BitField, GPU_Set) {
   dh::device_vector<LBitField64::value_type> storage;
   uint32_t constexpr kBits = 128;