From 0e8df9efb5d39aeb084bc9fb71b7f644fae53837 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 4 Aug 2022 10:28:37 +0800 Subject: [PATCH 1/7] Use CUDA docker image 11.0.3 tag. --- Jenkinsfile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a36720058a43..dcaff6ee11e5 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -7,7 +7,7 @@ dockerRun = 'tests/ci_build/ci_build.sh' // Which CUDA version to use when building reference distribution wheel -ref_cuda_ver = '11.0' +ref_cuda_ver = '11.0.3' import groovy.transform.Field @@ -61,9 +61,9 @@ pipeline { 'build-cpu-rabit-mock': { BuildCPUMock() }, // Build reference, distribution-ready Python wheel with CUDA 11.0 // using CentOS 7 image - 'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0', build_rmm: true) }, - 'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '11.0') }, - 'build-jvm-packages-gpu-cuda11.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.1', cuda_version: '11.0') }, + 'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0.3', build_rmm: true) }, + 'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '11.0.3') }, + 'build-jvm-packages-gpu-cuda11.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.1', cuda_version: '11.0.3') }, 'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.1') }, 'build-jvm-doc': { BuildJVMDoc() } ]) @@ -78,9 +78,9 @@ pipeline { 'test-python-cpu': { TestPythonCPU() }, 'test-python-cpu-arm64': { TestPythonCPUARM64() }, // artifact_cuda_version doesn't apply to RMM tests; RMM tests will always match CUDA version between artifact and host env - 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', test_rmm: true) }, - 'test-python-mgpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', multi_gpu: true, test_rmm: true) }, - 'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0', test_rmm: true) }, + 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', test_rmm: true) }, + 'test-python-mgpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', multi_gpu: true, test_rmm: true) }, + 'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0.3', host_cuda_version: '11.0.3', test_rmm: true) }, 'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '3.0.0') } ]) } @@ -124,7 +124,7 @@ def ClangTidy() { echo "Running clang-tidy job..." def container_type = "clang_tidy" def docker_binary = "docker" - def dockerArgs = "--build-arg CUDA_VERSION_ARG=11.0" + def dockerArgs = "--build-arg CUDA_VERSION_ARG=11.0.3" sh """ ${dockerRun} ${container_type} ${docker_binary} ${dockerArgs} python3 tests/ci_build/tidy.py --cuda-archs 75 """ @@ -446,7 +446,7 @@ def DeployJVMPackages(args) { if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) { echo 'Deploying to xgboost-maven-repo S3 repo...' sh """ - ${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION_ARG=11.0 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} + ${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION_ARG=11.0.3 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} """ } deleteDir() From 65c7fda61ce5b8efe3f0e7021ab69d1b244b916a Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 4 Aug 2022 10:46:16 +0800 Subject: [PATCH 2/7] Unknown package. --- tests/ci_build/Dockerfile.gpu_build_centos7 | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index 138edacc29fc..c52837a455cc 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -4,7 +4,6 @@ ARG CUDA_VERSION_ARG # Install all basic requirements RUN \ - rpm --erase gpg-pubkey-7fa2af80* && \ curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ yum install -y epel-release centos-release-scl && \ From 9f29da1b1e1f7f49612989a0b31e503a640baa2f Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 4 Aug 2022 10:55:58 +0800 Subject: [PATCH 3/7] Unknown package. --- tests/ci_build/Dockerfile.jvm_gpu_build | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/tests/ci_build/Dockerfile.jvm_gpu_build index 896a406b09cd..eb7ed9f73942 100644 --- a/tests/ci_build/Dockerfile.jvm_gpu_build +++ b/tests/ci_build/Dockerfile.jvm_gpu_build @@ -4,7 +4,6 @@ ARG CUDA_VERSION_ARG # Install all basic requirements RUN \ - rpm --erase gpg-pubkey-7fa2af80* && \ curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ yum install -y epel-release centos-release-scl && \ From aa418457bcd5ed4cf3fc72cbebeffd3feaad1bcf Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 4 Aug 2022 11:20:01 +0800 Subject: [PATCH 4/7] NCCL version. --- tests/ci_build/Dockerfile.gpu_build_centos7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index c52837a455cc..182b3ab2eb0a 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -21,7 +21,7 @@ RUN \ # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=2.7.3-1 && \ + export NCCL_VERSION=2.12.12-1 && \ wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ yum -y update && \ From 266222b5288adffe2e6a14ce1dcb2e6bf1b59623 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 4 Aug 2022 11:22:05 +0800 Subject: [PATCH 5/7] NCCL version. --- tests/ci_build/Dockerfile.gpu_build | 2 +- tests/ci_build/Dockerfile.rmm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci_build/Dockerfile.gpu_build b/tests/ci_build/Dockerfile.gpu_build index 2f463ce837b6..ff25f31df251 100644 --- a/tests/ci_build/Dockerfile.gpu_build +++ b/tests/ci_build/Dockerfile.gpu_build @@ -24,7 +24,7 @@ RUN \ # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=2.7.5-1 && \ + export NCCL_VERSION=2.12.12-1 && \ apt-get update && \ apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index 7810cbf2992d..3affdae2c77b 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -18,7 +18,7 @@ RUN \ # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=2.7.5-1 && \ + export NCCL_VERSION=2.12.12-1 && \ apt-get update && \ apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} From e007d7164b6e0b3d9185e0087bd86ea9d23770b0 Mon Sep 17 00:00:00 2001 From: fis Date: Sun, 7 Aug 2022 01:30:44 +0800 Subject: [PATCH 6/7] 2.13 --- tests/ci_build/Dockerfile.gpu_build | 2 +- tests/ci_build/Dockerfile.gpu_build_centos7 | 2 +- tests/ci_build/Dockerfile.jvm_gpu_build | 2 +- tests/ci_build/Dockerfile.rmm | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci_build/Dockerfile.gpu_build b/tests/ci_build/Dockerfile.gpu_build index ff25f31df251..b9eaa0a59294 100644 --- a/tests/ci_build/Dockerfile.gpu_build +++ b/tests/ci_build/Dockerfile.gpu_build @@ -24,7 +24,7 @@ RUN \ # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=2.12.12-1 && \ + export NCCL_VERSION=2.13.4-1 && \ apt-get update && \ apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index 182b3ab2eb0a..611a0d5d5956 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -21,7 +21,7 @@ RUN \ # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=2.12.12-1 && \ + export NCCL_VERSION=2.13.4-1 && \ wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ yum -y update && \ diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/tests/ci_build/Dockerfile.jvm_gpu_build index eb7ed9f73942..cddbb1f65729 100644 --- a/tests/ci_build/Dockerfile.jvm_gpu_build +++ b/tests/ci_build/Dockerfile.jvm_gpu_build @@ -24,7 +24,7 @@ RUN \ # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=2.12.12-1 && \ + export NCCL_VERSION=2.13.4-1 && \ yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \ yum -y update && \ yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index 3affdae2c77b..237aa11b7ba0 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -18,7 +18,7 @@ RUN \ # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=2.12.12-1 && \ + export NCCL_VERSION=2.13.4-1 && \ apt-get update && \ apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} From 47b01da94173eb2975086d218a0adf3d7d79242e Mon Sep 17 00:00:00 2001 From: fis Date: Sun, 7 Aug 2022 01:34:28 +0800 Subject: [PATCH 7/7] Early return if running in local mode. --- src/common/device_helpers.cu | 3 +++ src/common/device_helpers.cuh | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu index 5f0e922233b9..12ee25e87ee3 100644 --- a/src/common/device_helpers.cu +++ b/src/common/device_helpers.cu @@ -33,6 +33,9 @@ std::string PrintUUID(xgboost::common::Span uuid) { void NcclAllReducer::DoInit(int _device_ordinal) { int32_t const rank = rabit::GetRank(); int32_t const world = rabit::GetWorldSize(); + if (world == 1) { + return; + } std::vector uuids(world * kUuidLength, 0); auto s_uuid = xgboost::common::Span{uuids.data(), uuids.size()}; diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index f3d387983b61..5c922bbf3092 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -754,6 +754,9 @@ class AllReducerBase : public xgboost::common::Crtp { void Init(int _device_ordinal) { device_ordinal_ = _device_ordinal; dh::safe_cuda(cudaSetDevice(device_ordinal_)); + if (rabit::GetWorldSize() == 1) { + return; + } this->Underlying().DoInit(_device_ordinal); initialised_ = true; } @@ -769,6 +772,9 @@ class AllReducerBase : public xgboost::common::Crtp { */ void AllGather(void const *data, size_t length_bytes, std::vector *segments, dh::caching_device_vector *recvbuf) { + if (rabit::GetWorldSize() == 1) { + return; + } CHECK(initialised_); dh::safe_cuda(cudaSetDevice(device_ordinal_)); this->Underlying().DoAllGather(data, length_bytes, segments, recvbuf); @@ -784,6 +790,9 @@ class AllReducerBase : public xgboost::common::Crtp { */ void AllGather(uint32_t const *data, size_t length, dh::caching_device_vector *recvbuf) { + if (rabit::GetWorldSize() == 1) { + return; + } CHECK(initialised_); dh::safe_cuda(cudaSetDevice(device_ordinal_)); this->Underlying().DoAllGather(data, length, recvbuf); @@ -798,6 +807,9 @@ class AllReducerBase : public xgboost::common::Crtp { * \param count Number of elements. */ void AllReduceSum(const double *sendbuff, double *recvbuff, int count) { + if (rabit::GetWorldSize() == 1) { + return; + } CHECK(initialised_); dh::safe_cuda(cudaSetDevice(device_ordinal_)); this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count); @@ -814,6 +826,9 @@ class AllReducerBase : public xgboost::common::Crtp { * \param count Number of elements. */ void AllReduceSum(const float *sendbuff, float *recvbuff, int count) { + if (rabit::GetWorldSize() == 1) { + return; + } CHECK(initialised_); dh::safe_cuda(cudaSetDevice(device_ordinal_)); this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count); @@ -831,6 +846,9 @@ class AllReducerBase : public xgboost::common::Crtp { * \param count Number of. */ void AllReduceSum(const int64_t *sendbuff, int64_t *recvbuff, int count) { + if (rabit::GetWorldSize() == 1) { + return; + } CHECK(initialised_); dh::safe_cuda(cudaSetDevice(device_ordinal_)); this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count); @@ -847,6 +865,9 @@ class AllReducerBase : public xgboost::common::Crtp { * \param count Number of elements. */ void AllReduceSum(const uint32_t *sendbuff, uint32_t *recvbuff, int count) { + if (rabit::GetWorldSize() == 1) { + return; + } CHECK(initialised_); dh::safe_cuda(cudaSetDevice(device_ordinal_)); this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count); @@ -863,6 +884,9 @@ class AllReducerBase : public xgboost::common::Crtp { * \param count Number of elements. */ void AllReduceSum(const uint64_t *sendbuff, uint64_t *recvbuff, int count) { + if (rabit::GetWorldSize() == 1) { + return; + } CHECK(initialised_); dh::safe_cuda(cudaSetDevice(device_ordinal_)); this->Underlying().DoAllReduceSum(sendbuff, recvbuff, count); @@ -886,6 +910,9 @@ class AllReducerBase : public xgboost::common::Crtp { !std::is_same::value> // NOLINT * = nullptr> void AllReduceSum(const T *sendbuff, T *recvbuff, int count) { // NOLINT + if (rabit::GetWorldSize() == 1) { + return; + } CHECK(initialised_); dh::safe_cuda(cudaSetDevice(device_ordinal_)); static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); // NOLINT