Fixes for large size clusters. #10880
Changes from all commits
a27cd7c
b516ad0
d673a51
6edd4f1
f067799
1954830
9e4c76f
@@ -123,7 +123,8 @@ RabitTracker::RabitTracker(Json const& config) : Tracker{config} {
     listener_ = TCPSocket::Create(addr.IsV4() ? SockDomain::kV4 : SockDomain::kV6);
     return listener_.Bind(host_, &this->port_);
   } << [&] {
-    return listener_.Listen();
+    CHECK_GT(this->n_workers_, 0);
+    return listener_.Listen(this->n_workers_);
Reviewer: What happens now if […]?

Author: Yeah, the listener should put them in the backlog as long as the queue is not full. The tracker handles workers whenever they connect, first come, first served.
   };
   SafeColl(rc);
 }
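For context on what the new backlog argument controls, here is a minimal POSIX sketch (illustrative only; the function name and setup are not XGBoost's TCPSocket API). listen()'s second argument bounds the kernel queue of pending, not-yet-accepted connections, so sizing it to the worker count lets all workers connect at once and wait their turn:

```cpp
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#include <cstdint>

// Illustrative: with backlog == n_workers, simultaneous connection
// attempts are queued by the OS until the tracker accept()s each one,
// instead of being refused when many workers start at the same time.
int MakeTrackerListener(int port, int n_workers) {
  int fd = socket(AF_INET, SOCK_STREAM, 0);
  if (fd < 0) return -1;
  sockaddr_in addr{};
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = INADDR_ANY;
  addr.sin_port = htons(static_cast<std::uint16_t>(port));
  if (bind(fd, reinterpret_cast<sockaddr*>(&addr), sizeof(addr)) != 0 ||
      listen(fd, n_workers) != 0) {  // backlog sized to the cluster
    close(fd);
    return -1;
  }
  return fd;
}
```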
@@ -203,13 +203,6 @@ __global__ void LaunchNKernel(size_t begin, size_t end, L lambda) {
     lambda(i);
   }
 }
-template <typename L>
-__global__ void LaunchNKernel(int device_idx, size_t begin, size_t end,
-                              L lambda) {
-  for (auto i : GridStrideRange(begin, end)) {
-    lambda(i, device_idx);
-  }
-}
Comment on lines -206 to -212:

Reviewer: From what I can see this is being removed because it's not used anywhere, right?

Author: It's not used, small cleanup.
 /* \brief A wrapper around kernel launching syntax, used to guard against empty input.
  *
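For reference, the overload that remains builds on the grid-stride loop idiom. A self-contained sketch of that pattern (the helper and kernel names here are illustrative, and it assumes nvcc with --extended-lambda for the device lambda):

```cuda
#include <cstdio>

// Illustrative grid-stride loop: each thread starts at its global index
// and advances by the total thread count of the grid, so a single launch
// covers [begin, end) for any grid size.
template <typename L>
__global__ void LaunchNSketch(size_t begin, size_t end, L lambda) {
  size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  for (size_t i = begin + blockIdx.x * blockDim.x + threadIdx.x; i < end;
       i += stride) {
    lambda(i);
  }
}

int main() {
  float* data;
  cudaMallocManaged(&data, 1024 * sizeof(float));
  auto fill = [=] __device__(size_t i) { data[i] = 2.0f * i; };
  LaunchNSketch<<<8, 128>>>(size_t{0}, size_t{1024}, fill);
  cudaDeviceSynchronize();
  std::printf("data[1023] = %f\n", data[1023]);
  cudaFree(data);
}
```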
@@ -16,8 +16,9 @@
 #include "../../../../src/data/ellpack_page.cuh"
 #include "../../../../src/tree/gpu_hist/expand_entry.cuh"  // for GPUExpandEntry
 #include "../../../../src/tree/gpu_hist/row_partitioner.cuh"
-#include "../../../../src/tree/param.h"  // for TrainParam
-#include "../../helpers.h"  // for RandomDataGenerator
+#include "../../../../src/tree/param.h"    // for TrainParam
+#include "../../collective/test_worker.h"  // for TestDistributedGlobal
+#include "../../helpers.h"                 // for RandomDataGenerator
 namespace xgboost::tree {
 void TestUpdatePositionBatch() {
@@ -61,7 +62,9 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
   thrust::device_vector<cuda_impl::RowIndexT> ridx_tmp(ridx_in.size());
   thrust::device_vector<cuda_impl::RowIndexT> counts(segments.size());

-  auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
+  auto op = [=] __device__(auto ridx, int split_index, int data) {
+    return ridx % 2 == 0;
+  };
   std::vector<int> op_data(segments.size());
   std::vector<PerNodeData<int>> h_batch_info(segments.size());
   dh::TemporaryArray<PerNodeData<int>> d_batch_info(segments.size());
@@ -79,7 +82,9 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
                              dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op,
                              &tmp);

-  auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; };
+  auto op_without_data = [=] __device__(auto ridx) {
+    return ridx % 2 == 0;
+  };
   for (size_t i = 0; i < segments.size(); i++) {
     auto begin = ridx.begin() + segments[i].begin;
     auto end = ridx.begin() + segments[i].end;
@@ -93,7 +98,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
   }
 }

-TEST(GpuHist, SortPositionBatch) {
+TEST(RowPartitioner, SortPositionBatch) {
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 3}, {3, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 1}, {3, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}});
@@ -178,4 +183,34 @@ void TestExternalMemory() {
 }  // anonymous namespace

 TEST(RowPartitioner, LeafPartitionExternalMemory) { TestExternalMemory(); }

+namespace {
+void TestEmptyNode(std::int32_t n_workers) {
+  collective::TestDistributedGlobal(n_workers, [] {
+    auto ctx = MakeCUDACtx(DistGpuIdx());
+    RowPartitioner partitioner;
+    bst_idx_t n_samples = (collective::GetRank() == 0) ? 0 : 1024;
+    bst_idx_t base_rowid = 0;
+    partitioner.Reset(&ctx, n_samples, base_rowid);
+    std::vector<RegTree::Node> splits(1);
+    partitioner.UpdatePositionBatch(
+        &ctx, {0}, {1}, {2}, splits,
+        [] XGBOOST_DEVICE(bst_idx_t ridx, std::int32_t /*nidx_in_batch*/, RegTree::Node) {
+          return ridx < 3;
+        });
+    ASSERT_EQ(partitioner.GetNumNodes(), 3);
Reviewer: Cumulative sum of left and right indices, in line 197 above: […]

Author: The […]

(A host-side sketch of this cumulative-sum bookkeeping follows the diff below.)
+    if (collective::GetRank() == 0) {
+      for (std::size_t i = 0; i < 3; ++i) {
+        ASSERT_TRUE(partitioner.GetRows(i).empty());
+      }
+    }
+    ctx.CUDACtx()->Stream().Sync();
+  });
+}
+}  // anonymous namespace
+
+TEST(RowPartitioner, MGPUEmpty) {
+  std::int32_t n_workers = curt::AllVisibleGPUs();
+  TestEmptyNode(n_workers);
+}
 }  // namespace xgboost::tree
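As a rough illustration of the cumulative-sum bookkeeping the reviewer refers to: counting how many rows satisfy the predicate (a prefix sum of the flags, computed with parallel scans on device) gives exactly the offset where the right partition begins. A plain host-side sketch, not the CUDA implementation (names are hypothetical):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Host-side sketch: rows passing pred go to the left half, the rest to
// the right; the count of passing rows (the cumulative sum of the
// predicate flags) is the start offset of the right half.
std::vector<int> SortPositionSketch(const std::vector<int>& ridx,
                                    bool (*pred)(int)) {
  std::size_t n_left = 0;
  for (int r : ridx) n_left += pred(r);
  std::vector<int> out(ridx.size());
  std::size_t l = 0, r = n_left;  // independent write cursors per half
  for (int v : ridx) (pred(v) ? out[l++] : out[r++]) = v;
  return out;
}

int main() {
  auto out = SortPositionSketch({0, 1, 2, 3, 4, 5},
                                [](int r) { return r % 2 == 0; });
  for (int v : out) std::cout << v << ' ';  // prints: 0 2 4 1 3 5
  std::cout << '\n';
}
```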
Reviewer: Since you're imposing a limit of 256, should that be added to the method docs as well?

Author: Will add a brief mention, thank you for pointing this out.

Author: Done.
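For illustration only, the requested note could look like the sketch below; the method name, the wording, and what the 256 limit actually governs are assumptions here, not taken from the patch:

```cpp
// Hypothetical sketch of the requested doc comment (assumed semantics):
/**
 * \brief Update row positions for a batch of nodes.
 *
 * \note The batch is limited to 256 nodes per call; larger batches must
 *       be split by the caller (assumption for illustration only).
 */
```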