-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add ellpack page #4833
add ellpack page #4833
Changes from all commits
5e81760
7fd5453
992d229
b4cb3a9
47cc2fd
8b07fe3
f70bc6e
82ec6ac
ac6eccf
fd464b0
a3ff3dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,8 @@ | |
namespace xgboost { | ||
// forward declare learner. | ||
class LearnerImpl; | ||
// forward declare dmatrix. | ||
class DMatrix; | ||
|
||
/*! \brief data type accepted by xgboost interface */ | ||
enum DataType { | ||
|
@@ -86,7 +88,7 @@ class MetaInfo { | |
* \return The pre-defined root index of i-th instance. | ||
*/ | ||
inline unsigned GetRoot(size_t i) const { | ||
return root_index_.size() != 0 ? root_index_[i] : 0U; | ||
return !root_index_.empty() ? root_index_[i] : 0U; | ||
} | ||
/*! \brief get sorted indexes (argsort) of labels by absolute value (used by cox loss) */ | ||
inline const std::vector<size_t>& LabelAbsSort() const { | ||
|
@@ -166,7 +168,7 @@ class SparsePage { | |
/*! \brief the data of the segments */ | ||
HostDeviceVector<Entry> data; | ||
|
||
size_t base_rowid; | ||
size_t base_rowid{}; | ||
|
||
/*! \brief an instance of sparse vector in the batch */ | ||
using Inst = common::Span<Entry const>; | ||
|
@@ -215,23 +217,23 @@ class SparsePage { | |
const int nthread = omp_get_max_threads(); | ||
builder.InitBudget(num_columns, nthread); | ||
long batch_size = static_cast<long>(this->Size()); // NOLINT(*) | ||
#pragma omp parallel for schedule(static) | ||
#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static) | ||
for (long i = 0; i < batch_size; ++i) { // NOLINT(*) | ||
int tid = omp_get_thread_num(); | ||
auto inst = (*this)[i]; | ||
for (bst_uint j = 0; j < inst.size(); ++j) { | ||
builder.AddBudget(inst[j].index, tid); | ||
for (const auto& entry : inst) { | ||
builder.AddBudget(entry.index, tid); | ||
} | ||
} | ||
builder.InitStorage(); | ||
#pragma omp parallel for schedule(static) | ||
#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static) | ||
for (long i = 0; i < batch_size; ++i) { // NOLINT(*) | ||
int tid = omp_get_thread_num(); | ||
auto inst = (*this)[i]; | ||
for (bst_uint j = 0; j < inst.size(); ++j) { | ||
for (const auto& entry : inst) { | ||
builder.Push( | ||
inst[j].index, | ||
Entry(static_cast<bst_uint>(this->base_rowid + i), inst[j].fvalue), | ||
entry.index, | ||
Entry(static_cast<bst_uint>(this->base_rowid + i), entry.fvalue), | ||
tid); | ||
} | ||
} | ||
|
@@ -240,7 +242,7 @@ class SparsePage { | |
|
||
void SortRows() { | ||
auto ncol = static_cast<bst_omp_uint>(this->Size()); | ||
#pragma omp parallel for schedule(dynamic, 1) | ||
#pragma omp parallel for default(none) shared(ncol) schedule(dynamic, 1) | ||
for (bst_omp_uint i = 0; i < ncol; ++i) { | ||
if (this->offset.HostVector()[i] < this->offset.HostVector()[i + 1]) { | ||
std::sort( | ||
|
@@ -287,10 +289,29 @@ class SortedCSCPage : public SparsePage { | |
explicit SortedCSCPage(SparsePage page) : SparsePage(std::move(page)) {} | ||
}; | ||
|
||
class EllpackPageImpl; | ||
/*! | ||
* \brief A page stored in ELLPACK format. | ||
* | ||
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid | ||
* including CUDA-specific implementation details in the header. | ||
*/ | ||
class EllpackPage { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this be more generalized so we can support multiple binned matrix types fairly easily. perhaps, make this an adapter, with a factory method to create the underlying implementation:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can probably wait until we have another binned page. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree, let's keep this in mind but not for this PR. |
||
public: | ||
explicit EllpackPage(DMatrix* dmat); | ||
~EllpackPage(); | ||
|
||
const EllpackPageImpl* Impl() const { return impl_.get(); } | ||
EllpackPageImpl* Impl() { return impl_.get(); } | ||
|
||
private: | ||
std::unique_ptr<EllpackPageImpl> impl_; | ||
}; | ||
|
||
template<typename T> | ||
class BatchIteratorImpl { | ||
public: | ||
virtual ~BatchIteratorImpl() {} | ||
virtual ~BatchIteratorImpl() = default; | ||
virtual T& operator*() = 0; | ||
virtual const T& operator*() const = 0; | ||
virtual void operator++() = 0; | ||
|
@@ -412,7 +433,7 @@ class DMatrix { | |
bool silent, | ||
bool load_row_split, | ||
const std::string& file_format = "auto", | ||
const size_t page_size = kPageSize); | ||
size_t page_size = kPageSize); | ||
|
||
/*! | ||
* \brief create a new DMatrix, by wrapping a row_iterator, and meta info. | ||
|
@@ -438,7 +459,7 @@ class DMatrix { | |
*/ | ||
static DMatrix* Create(dmlc::Parser<uint32_t>* parser, | ||
const std::string& cache_prefix = "", | ||
const size_t page_size = kPageSize); | ||
size_t page_size = kPageSize); | ||
|
||
/*! \brief page size 32 MB */ | ||
static const size_t kPageSize = 32UL << 20UL; | ||
|
@@ -447,6 +468,7 @@ class DMatrix { | |
virtual BatchSet<SparsePage> GetRowBatches() = 0; | ||
virtual BatchSet<CSCPage> GetColumnBatches() = 0; | ||
virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0; | ||
virtual BatchSet<EllpackPage> GetEllpackBatches() = 0; | ||
}; | ||
|
||
template<> | ||
|
@@ -463,6 +485,11 @@ template<> | |
inline BatchSet<SortedCSCPage> DMatrix::GetBatches() { | ||
return GetSortedColumnBatches(); | ||
} | ||
|
||
template<> | ||
inline BatchSet<EllpackPage> DMatrix::GetBatches() { | ||
return GetEllpackBatches(); | ||
} | ||
} // namespace xgboost | ||
|
||
namespace dmlc { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -99,15 +99,15 @@ struct SketchContainer { | |
std::vector<std::mutex> col_locks_; // NOLINT | ||
static constexpr int kOmpNumColsParallelizeLimit = 1000; | ||
|
||
SketchContainer(const tree::TrainParam ¶m, DMatrix *dmat) : | ||
SketchContainer(int max_bin, DMatrix *dmat) : | ||
col_locks_(dmat->Info().num_col_) { | ||
const MetaInfo &info = dmat->Info(); | ||
// Initialize Sketches for this dmatrix | ||
sketches_.resize(info.num_col_); | ||
#pragma omp parallel for default(none) shared(info, param) schedule(static) \ | ||
#pragma omp parallel for default(none) shared(info, max_bin) schedule(static) \ | ||
if (info.num_col_ > kOmpNumColsParallelizeLimit) // NOLINT | ||
for (int icol = 0; icol < info.num_col_; ++icol) { // NOLINT | ||
sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin)); | ||
sketches_[icol].Init(info.num_row_, 1.0 / (8 * max_bin)); | ||
} | ||
} | ||
|
||
|
@@ -130,7 +130,7 @@ struct GPUSketcher { | |
bool has_weights_{false}; | ||
size_t row_stride_{0}; | ||
|
||
tree::TrainParam param_; | ||
const int max_bin_; | ||
SketchContainer *sketch_container_; | ||
dh::device_vector<size_t> row_ptrs_{}; | ||
dh::device_vector<Entry> entries_{}; | ||
|
@@ -148,11 +148,11 @@ struct GPUSketcher { | |
public: | ||
DeviceShard(int device, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This DeviceShard class is now redundant right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes there are 4-5 places we still have a |
||
bst_uint n_rows, | ||
tree::TrainParam param, | ||
int max_bin, | ||
SketchContainer* sketch_container) : | ||
device_(device), | ||
n_rows_(n_rows), | ||
param_(std::move(param)), | ||
max_bin_(max_bin), | ||
sketch_container_(sketch_container) { | ||
} | ||
|
||
|
@@ -183,7 +183,7 @@ struct GPUSketcher { | |
} | ||
|
||
constexpr int kFactor = 8; | ||
double eps = 1.0 / (kFactor * param_.max_bin); | ||
double eps = 1.0 / (kFactor * max_bin_); | ||
size_t dummy_nlevel; | ||
WXQSketch::LimitSizeLevel(gpu_batch_nrows_, eps, &dummy_nlevel, &n_cuts_); | ||
|
||
|
@@ -362,7 +362,7 @@ struct GPUSketcher { | |
// add cuts into sketches | ||
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin()); | ||
#pragma omp parallel for default(none) schedule(static) \ | ||
if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT | ||
if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit) // NOLINT | ||
trivialfis marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for (int icol = 0; icol < num_cols_; ++icol) { | ||
WXQSketch::SummaryContainer summary; | ||
summary.Reserve(n_cuts_); | ||
|
@@ -403,10 +403,8 @@ struct GPUSketcher { | |
}; | ||
|
||
void SketchBatch(const SparsePage &batch, const MetaInfo &info) { | ||
auto device = generic_param_.gpu_id; | ||
|
||
// create device shard | ||
shard_.reset(new DeviceShard(device, batch.Size(), param_, sketch_container_.get())); | ||
shard_.reset(new DeviceShard(device_, batch.Size(), max_bin_, sketch_container_.get())); | ||
|
||
// compute sketches for the shard | ||
shard_->Init(batch, info, gpu_batch_nrows_); | ||
|
@@ -417,39 +415,40 @@ struct GPUSketcher { | |
row_stride_ = shard_->GetRowStride(); | ||
} | ||
|
||
GPUSketcher(const tree::TrainParam ¶m, const GenericParameter &generic_param, int gpu_nrows) | ||
: param_(param), generic_param_(generic_param), gpu_batch_nrows_(gpu_nrows), row_stride_(0) { | ||
} | ||
GPUSketcher(int device, int max_bin, int gpu_nrows) | ||
: device_(device), max_bin_(max_bin), gpu_batch_nrows_(gpu_nrows), row_stride_(0) {} | ||
|
||
/* Builds the sketches on the GPU for the dmatrix and returns the row stride | ||
* for the entire dataset */ | ||
size_t Sketch(DMatrix *dmat, DenseCuts *hmat) { | ||
const MetaInfo &info = dmat->Info(); | ||
|
||
row_stride_ = 0; | ||
sketch_container_.reset(new SketchContainer(param_, dmat)); | ||
sketch_container_.reset(new SketchContainer(max_bin_, dmat)); | ||
for (const auto &batch : dmat->GetBatches<SparsePage>()) { | ||
this->SketchBatch(batch, info); | ||
} | ||
|
||
hmat->Init(&sketch_container_->sketches_, param_.max_bin); | ||
hmat->Init(&sketch_container_->sketches_, max_bin_); | ||
|
||
return row_stride_; | ||
} | ||
|
||
private: | ||
std::unique_ptr<DeviceShard> shard_; | ||
const tree::TrainParam ¶m_; | ||
const GenericParameter &generic_param_; | ||
const int device_; | ||
const int max_bin_; | ||
int gpu_batch_nrows_; | ||
size_t row_stride_; | ||
std::unique_ptr<SketchContainer> sketch_container_; | ||
}; | ||
|
||
size_t DeviceSketch | ||
(const tree::TrainParam ¶m, const GenericParameter &learner_param, int gpu_batch_nrows, | ||
DMatrix *dmat, HistogramCuts *hmat) { | ||
GPUSketcher sketcher(param, learner_param, gpu_batch_nrows); | ||
size_t DeviceSketch(int device, | ||
int max_bin, | ||
int gpu_batch_nrows, | ||
DMatrix* dmat, | ||
HistogramCuts* hmat) { | ||
GPUSketcher sketcher(device, max_bin, gpu_batch_nrows); | ||
// We only need to return the result in HistogramCuts container, so it is safe to | ||
// use a pointer of local HistogramCutsDense | ||
DenseCuts dense_cuts(hmat); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
/*! | ||
* Copyright 2019 XGBoost contributors | ||
* | ||
* \file ellpack_page.cc | ||
*/ | ||
#ifndef XGBOOST_USE_CUDA | ||
|
||
#include <xgboost/data.h> | ||
|
||
// dummy implementation of EllpackPage in case CUDA is not used | ||
namespace xgboost { | ||
|
||
class EllpackPageImpl {}; | ||
|
||
EllpackPage::EllpackPage(DMatrix* dmat) { | ||
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required"; | ||
} | ||
|
||
EllpackPage::~EllpackPage() { | ||
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required"; | ||
} | ||
|
||
} // namespace xgboost | ||
|
||
#endif // XGBOOST_USE_CUDA |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you explain briefly why this class needs pimpl?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mainly to get CUDA-specific implementation details out of the header, since DMatrix is used all over the place in CPU-only code.