Rebase
RAMitchell committed Aug 29, 2018
1 parent bf950c2 commit 0fb1ef1
Showing 10 changed files with 64 additions and 75 deletions.
21 changes: 5 additions & 16 deletions include/xgboost/data.h
@@ -236,31 +236,20 @@ class SparsePage {
* \param inst an instance row
*/
inline void Push(const Inst &inst) {
<<<<<<< HEAD
offset.push_back(offset.back() + inst.size());
size_t begin = data.size();
data.resize(begin + inst.size());
if (inst.size() != 0) {
std::memcpy(dmlc::BeginPtr(data) + begin, inst.data(),
sizeof(Entry) * inst.size());
=======
auto& data_vec = data.HostVector();
auto& offset_vec = offset.HostVector();
offset_vec.push_back(offset_vec.back() + inst.length);
offset_vec.push_back(offset_vec.back() + inst.size());
size_t begin = data.Size();
data_vec.resize(begin + inst.length);
if (inst.length != 0) {
std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data,
sizeof(Entry) * inst.length);
>>>>>>> Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage.
data_vec.resize(begin + inst.size());
if (inst.size() != 0) {
std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
sizeof(Entry) * inst.size());
}
}

size_t Size() { return offset.Size() - 1; }
};



/*!
* \brief This is data structure that user can pass to DMatrix::Create
* to create a DMatrix for training, user can create this data structure
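
For readability, this is how the resolved Push reads once the conflict markers and the pre-HostDeviceVector branch above are dropped — a reconstruction from the hunk, assuming `data` is a HostDeviceVector<Entry> and `offset` a HostDeviceVector<size_t>:

    inline void Push(const Inst &inst) {
      // Mutable host views of the HostDeviceVector members; the vectors are
      // grown on the host here and synced to devices lazily, elsewhere.
      auto& data_vec = data.HostVector();
      auto& offset_vec = offset.HostVector();
      offset_vec.push_back(offset_vec.back() + inst.size());
      size_t begin = data.Size();
      data_vec.resize(begin + inst.size());
      if (inst.size() != 0) {
        std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
                    sizeof(Entry) * inst.size());
      }
    }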
8 changes: 4 additions & 4 deletions src/c_api/c_api.cc
@@ -338,10 +338,10 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
CHECK_LE(mat.info.num_row_, num_row);
// provision for empty rows at the bottom of matrix
for (uint64_t i = mat.info.num_row_; i < static_cast<uint64_t>(num_row); ++i) {
mat.page_.offset.push_back(mat.page_.offset.back());
offset_vec.push_back(offset_vec.back());
}
mat.info.num_row_ = num_row;
CHECK_EQ(mat.info.num_row_, mat.page_.offset.size() - 1); // sanity check
CHECK_EQ(mat.info.num_row_, offset_vec.size() - 1); // sanity check
}
mat.info.num_col_ = ncol;
mat.info.num_nonzero_ = nelem;
@@ -707,9 +707,9 @@ XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle,
const int ridx = idxset[i];
auto inst = batch[ridx];
CHECK_LT(static_cast<xgboost::bst_ulong>(ridx), batch.Size());
ret.page_.data.insert(ret.page_.data.end(), inst.data(),
data_vec.insert(data_vec.end(), inst.data(),
inst.data() + inst.size());
ret.page_.offset.push_back(ret.page_.offset.back() + inst.size());
offset_vec.push_back(offset_vec.back() + inst.size());
ret.info.num_nonzero_ += inst.size();

if (src_labels.size() != 0) {
30 changes: 17 additions & 13 deletions src/common/hist_util.cu
@@ -118,7 +118,7 @@ struct GPUSketcher {

void Init(const SparsePage& row_batch, const MetaInfo& info) {
num_cols_ = info.num_col_;
has_weights_ = info.weights_.size() > 0;
has_weights_ = info.weights_.Size() > 0;

// find the batch size
if (param_.gpu_batch_nrows == 0) {
@@ -282,19 +282,23 @@ struct GPUSketcher {
size_t batch_row_end = std::min((gpu_batch + 1) * gpu_batch_nrows_,
static_cast<size_t>(n_rows_));
size_t batch_nrows = batch_row_end - batch_row_begin;
size_t n_entries =
row_batch.offset[row_begin_ + batch_row_end] -
row_batch.offset[row_begin_ + batch_row_begin];

const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();

size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
offset_vec[row_begin_ + batch_row_begin];
// copy the batch to the GPU
dh::safe_cuda
(cudaMemcpy(entries_.data().get(),
&row_batch.data[row_batch.offset[row_begin_ + batch_row_begin]],
n_entries * sizeof(Entry), cudaMemcpyDefault));
dh::safe_cuda(
cudaMemcpy(entries_.data().get(),
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
// copy the weights if necessary
if (has_weights_) {
const auto& weights_vec = info.weights_.HostVector();
dh::safe_cuda
(cudaMemcpy(weights_.data().get(),
info.weights_.data() + row_begin_ + batch_row_begin,
weights_vec.data() + row_begin_ + batch_row_begin,
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
}

@@ -310,7 +314,7 @@ struct GPUSketcher {
row_ptrs_.data().get() + batch_row_begin,
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
gpu_batch_nrows_, num_cols_,
row_batch.offset[row_begin_ + batch_row_begin], batch_nrows);
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
dh::safe_cuda(cudaGetLastError()); // NOLINT
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT

@@ -331,10 +335,10 @@ struct GPUSketcher {
void Sketch(const SparsePage& row_batch, const MetaInfo& info) {
// copy rows to the device
dh::safe_cuda(cudaSetDevice(device_));
const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs_.resize(n_rows_ + 1);
thrust::copy(row_batch.offset.data() + row_begin_,
row_batch.offset.data() + row_end_ + 1,
row_ptrs_.begin());
thrust::copy(offset_vec.data() + row_begin_,
offset_vec.data() + row_end_ + 1, row_ptrs_.begin());

size_t gpu_nbatches = dh::DivRoundUp(n_rows_, gpu_batch_nrows_);

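
The edits in this file all follow one pattern: take const host views of the SparsePage once, then copy the needed slice to the GPU. A minimal sketch of that pattern, with the row range (`row_begin`, `row_end`) and the device buffer `entries` as illustrative stand-ins for the sketcher's actual members:

    // Sketch only: copy the entries of rows [row_begin, row_end) from a
    // SparsePage into a preallocated thrust::device_vector<Entry>.
    const auto& offset_vec = row_batch.offset.HostVector();
    const auto& data_vec = row_batch.data.HostVector();
    size_t n_entries = offset_vec[row_end] - offset_vec[row_begin];
    dh::safe_cuda(cudaMemcpy(entries.data().get(),
                             data_vec.data() + offset_vec[row_begin],
                             n_entries * sizeof(Entry), cudaMemcpyDefault));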
46 changes: 23 additions & 23 deletions src/common/host_device_vector.cu
@@ -45,7 +45,7 @@ struct HostDeviceVectorImpl {
void Init(HostDeviceVectorImpl<T>* vec, int device) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = device % dh::NVisibleDevices();
device_ = device;
index_ = vec_->distribution_.devices_.Index(device);
LazyResize(vec_->Size());
perm_d_ = vec_->perm_h_.Complementary();
@@ -193,10 +193,19 @@ struct HostDeviceVectorImpl {
}

common::Span<T> DeviceSpan(int device) {
CHECK(devices_.Contains(device));
LazySyncDevice(device);
return { shards_[devices_.Index(device)].data_.data().get(),
static_cast<typename common::Span<T>::index_type>(Size()) };
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return {shards_[devices.Index(device)].data_.data().get(),
static_cast<typename common::Span<T>::index_type>(Size())};
}

common::Span<const T> ConstDeviceSpan(int device) {
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return{ shards_[devices.Index(device)].data_.data().get(),
static_cast<typename common::Span<const T>::index_type>(Size()) };
}

size_t DeviceSize(int device) {
@@ -229,15 +238,10 @@ struct HostDeviceVectorImpl {

void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
CHECK_EQ(end - begin, Size());
<<<<<<< HEAD
if (on_h_) {
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
(end - begin) * sizeof(T),
cudaMemcpyDeviceToHost));
=======
if (perm_h_.CanWrite()) {
thrust::copy(begin, end, data_h_.begin());
>>>>>>> Added read-only state for HostDeviceVector sync.
} else {
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
shard.ScatterFrom(begin.get());
@@ -440,15 +444,19 @@ common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
return impl_->DeviceSpan(device);
}

template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
return impl_->ConstDeviceSpan(device);
}

template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
return impl_->ConstDevicePointer(device);
}

template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) { return impl_->DeviceStart(device); }
const T* HostDeviceVector<T>::DevicePointer(int device) const {
return impl_->DevicePointer(device);
size_t HostDeviceVector<T>::DeviceStart(int device) const {
return impl_->DeviceStart(device);
}

template <typename T>
@@ -544,16 +552,8 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
<<<<<<< HEAD
<<<<<<< HEAD
template class HostDeviceVector<unsigned int>;
template class HostDeviceVector<int>;
=======
template class HostDeviceVector<uint32_t>;
>>>>>>> Fixed linter and test errors.
=======
template class HostDeviceVector<int>;
>>>>>>> Fixed explicit template instantiation errors for HostDeviceVector.
template class HostDeviceVector<Entry>;
template class HostDeviceVector<size_t>;

2 changes: 2 additions & 0 deletions src/common/host_device_vector.h
@@ -194,6 +194,8 @@ class HostDeviceVector {
GPUSet Devices() const;
const GPUDistribution& Distribution() const;
common::Span<T> DeviceSpan(int device);
common::Span<const T> ConstDeviceSpan(int device) const;
common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
T* DevicePointer(int device);
const T* ConstDevicePointer(int device) const;
const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
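
A short usage sketch of the split this header introduces between writable and read-only device views; `vec` is a hypothetical HostDeviceVector<bst_float> assumed to be already sized and distributed to device 0:

    // ConstDeviceSpan requests only read access (GPUAccess::kRead), so a valid
    // host copy is preserved; DeviceSpan requests write access (kWrite) and
    // makes the device copy authoritative, as in host_device_vector.cu above.
    common::Span<const bst_float> in = vec.ConstDeviceSpan(0);
    common::Span<bst_float> out = vec.DeviceSpan(0);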
2 changes: 1 addition & 1 deletion src/objective/hinge.cc
@@ -38,7 +38,7 @@ class HingeObj : public ObjFunction {
auto& gpair = out_gpair->HostVector();

for (size_t i = 0; i < preds_h.size(); ++i) {
auto y = info.labels_[i] * 2.0 - 1.0;
auto y = labels_h[i] * 2.0 - 1.0;
bst_float p = preds_h[i];
bst_float w = info.GetWeight(i);
bst_float g, h;
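
The `labels_h` reference used in the rewritten line is declared just above the visible hunk; a hedged reconstruction of that declaration, assuming MetaInfo::labels_ is now a HostDeviceVector<bst_float>:

    // Assumed declaration (outside the shown hunk): labels are read through the
    // HostDeviceVector host view instead of being indexed directly.
    const auto& labels_h = info.labels_.HostVector();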
13 changes: 4 additions & 9 deletions src/objective/regression_obj_gpu.cu
@@ -120,14 +120,14 @@ class GPURegLossObj : public ObjFunction {
#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1)
for (int i = 0; i < devices_.Size(); ++i) {
int d = devices_[i];
dh::safe_cuda(cudaSetDevice(d % dh::NVisibleDevices()));
dh::safe_cuda(cudaSetDevice(d));
const int block = 256;
size_t n = preds.DeviceSize(d);
if (n > 0) {
get_gradient_k<Loss><<<dh::DivRoundUp(n, block), block>>>(
out_gpair->DeviceSpan(d), label_correct_.DeviceSpan(d),
preds->DeviceSpan(d), labels_.DeviceSpan(d),
info.weights_.Size() > 0 ? weights_.DevicePointer(d) : nullptr, n,
preds.ConstDeviceSpan(d), info.labels_.ConstDeviceSpan(d),
info.weights_.Size() > 0 ? info.weights_.DevicePointer(d) : nullptr, n,
param_.scale_pos_weight);
dh::safe_cuda(cudaGetLastError());
}
@@ -157,17 +157,12 @@ class GPURegLossObj : public ObjFunction {
#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1)
for (int i = 0; i < devices_.Size(); ++i) {
int d = devices_[i];
dh::safe_cuda(cudaSetDevice(d % dh::NVisibleDevices()));
dh::safe_cuda(cudaSetDevice(d));
const int block = 256;
size_t n = preds->DeviceSize(d);
if (n > 0) {
<<<<<<< HEAD
pred_transform_k<Loss><<<dh::DivRoundUp(n, block), block>>>(
preds->DeviceSpan(d), n);
=======
pred_transform_k<Loss><<<dh::DivRoundUp(n, block), block>>>
(preds->DevicePointer(d), n);
>>>>>>> Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage.
dh::safe_cuda(cudaGetLastError());
}
dh::safe_cuda(cudaDeviceSynchronize());
4 changes: 2 additions & 2 deletions src/predictor/gpu_predictor.cu
@@ -76,8 +76,8 @@ struct DeviceMatrix {
auto end_itr = begin_itr + batch.Size() + 1;
IncrementOffset(begin_itr, end_itr, batch.base_rowid);
}
dh::safe_cuda(cudaMemcpy(data.Data() + data_offset, batch.data.data(),
sizeof(Entry) * batch.data.size(),
dh::safe_cuda(cudaMemcpy(data.Data() + data_offset, data_vec.data(),
sizeof(Entry) * data_vec.size(),
cudaMemcpyHostToDevice));
// Copy data
data_offset += batch.data.Size();
7 changes: 3 additions & 4 deletions src/tree/updater_gpu_hist.cu
@@ -387,11 +387,7 @@ struct DeviceShard {

void InitRowPtrs(const SparsePage& row_batch) {
dh::safe_cuda(cudaSetDevice(device_idx));
thrust::device_vector<float> cuts_d(hmat.cut);
thrust::device_vector<size_t> cut_row_ptrs_d(hmat.row_ptr);

auto& offset_vec = row_batch.offset.HostVector();
auto& data_vec = row_batch.data.HostVector();
// find the maximum row size
thrust::device_vector<size_t> row_ptr_d(
&offset_vec[row_begin_idx], &offset_vec[row_end_idx + 1]);
@@ -437,6 +433,9 @@ struct DeviceShard {
(dh::TotalMemory(device_idx) / (16 * row_stride * sizeof(Entry)),
static_cast<size_t>(n_rows));

const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();

thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);

size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows);
6 changes: 3 additions & 3 deletions tests/cpp/common/test_host_device_vector.cu
@@ -52,7 +52,7 @@ void InitHostDeviceVector(size_t n, const GPUDistribution& distribution,
void PlusOne(HostDeviceVector<int> *v) {
int n_devices = v->Devices().Size();
for (int i = 0; i < n_devices; ++i) {
dh::safe_cuda(cudaSetDevice(i % dh::NVisibleDevices()));
dh::safe_cuda(cudaSetDevice(i));
thrust::transform(v->tbegin(i), v->tend(i), v->tbegin(i),
[=]__device__(unsigned int a){ return a + 1; });
}
@@ -66,7 +66,7 @@ void CheckDevice(HostDeviceVector<int> *v,
ASSERT_EQ(v->Devices().Size(), n_devices);
for (int i = 0; i < n_devices; ++i) {
ASSERT_EQ(v->DeviceSize(i), sizes.at(i));
dh::safe_cuda(cudaSetDevice(i % dh::NVisibleDevices()));
dh::safe_cuda(cudaSetDevice(i));
ASSERT_TRUE(thrust::equal(v->tcbegin(i), v->tcend(i),
thrust::make_counting_iterator(first + starts[i])));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
@@ -76,7 +76,7 @@
ASSERT_EQ(v->HostCanAccess(GPUAccess::kRead), access == GPUAccess::kRead);
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
for (int i = 0; i < n_devices; ++i) {
dh::safe_cuda(cudaSetDevice(i % dh::NVisibleDevices()));
dh::safe_cuda(cudaSetDevice(i));
ASSERT_TRUE(thrust::equal(v->tbegin(i), v->tend(i),
thrust::make_counting_iterator(first + starts[i])));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
