diff --git a/.github/runs-on.yml b/.github/runs-on.yml index e21895ee8c3b..bdcdabf45204 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -5,7 +5,12 @@ images: platform: "linux" arch: "x64" owner: "492475357299" # XGBooost CI - name: "xgboost-ci-runs-on-linux-*" + name: "xgboost-ci-runs-on-linux-amd64-*" + linux-arm64: + platform: "linux" + arch: "arm64" + owner: "492475357299" # XGBooost CI + name: "xgboost-ci-runs-on-linux-arm64-*" windows-amd64: platform: "windows" arch: "x64" @@ -26,7 +31,7 @@ runners: linux-arm64-cpu: cpu: 16 family: ["c6g", "c7g"] - image: ubuntu24-full-arm64 + image: linux-arm64 windows-gpu: family: ["g4dn.2xlarge"] image: windows-amd64 diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index 8b7c71a82bf8..26ceaf758f3a 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -3,7 +3,7 @@ name: XGBoost-i386-test on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -12,32 +12,16 @@ concurrency: jobs: build-32bit: name: Build 32-bit - runs-on: ubuntu-latest - services: - registry: - image: registry:2 - ports: - - 5000:5000 + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=i386-build-32bit steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: - submodules: 'true' - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - with: - driver-opts: network=host - - name: Build and push container - uses: docker/build-push-action@v6 - with: - context: . - file: ops/docker/dockerfile/Dockerfile.i386 - push: true - tags: localhost:5000/xgboost/build-32bit:latest - cache-from: type=gha - cache-to: type=gha,mode=max - - name: Build XGBoost - run: | - docker run --rm -v $PWD:/workspace -w /workspace \ - -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ - localhost:5000/xgboost/build-32bit:latest \ - bash ops/script/build_via_cmake.sh + submodules: "true" + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh + - run: bash ops/pipeline/test-cpp-i386.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 53e695721887..afc8b1cccfc1 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -12,40 +12,12 @@ concurrency: env: BRANCH_NAME: >- ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} - USE_DOCKER_CACHE: 1 jobs: - build-containers: - name: Build CI containers (${{ matrix.container_id }}) - runs-on: - - runs-on - - runner=${{ matrix.runner }} - - run-id=${{ github.run_id }} - - tag=jvm-tests-build-containers-${{ matrix.container_id }} - strategy: - matrix: - container_id: - - xgb-ci.manylinux2014_x86_64 - - xgb-ci.jvm - - xgb-ci.jvm_gpu_build - runner: [linux-amd64-cpu] - include: - - container_id: xgb-ci.manylinux2014_aarch64 - runner: linux-arm64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build ${{ matrix.container_id }} - run: bash ops/docker_build.sh ${{ matrix.container_id }} - build-jvm-manylinux2014: name: >- Build libxgboost4j.so targeting glibc 2.17 (arch ${{ 
matrix.arch }}, runner ${{ matrix.runner }}) - needs: build-containers runs-on: - runs-on - runner=${{ matrix.runner }} @@ -65,19 +37,12 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.manylinux2014_${{ matrix.arch }} + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} - - name: Upload libxgboost4j.so - run: | - libname=lib/libxgboost4j_linux_${{ matrix.arch }}_${{ github.sha }}.so - mv -v lib/libxgboost4j.so ${libname} - bash ops/pipeline/publish-artifact.sh ${libname} \ - s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ build-jvm-gpu: name: Build libxgboost4j.so with CUDA - needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu @@ -88,12 +53,15 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - run: bash ops/pipeline/build-jvm-gpu.sh - name: Stash files run: | - bash ops/pipeline/stash-artifacts.sh stash build-jvm-gpu lib/libxgboost4j.so + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-jvm-gpu \ + lib/libxgboost4j.so build-jvm-mac: name: "Build libxgboost4j.dylib for ${{ matrix.description }}" @@ -104,11 +72,11 @@ jobs: include: - description: "MacOS (Apple Silicon)" script: ops/pipeline/build-jvm-macos-apple-silicon.sh - libname: libxgboost4j_m1_${{ github.sha }}.dylib + libname: libxgboost4j_m1.dylib runner: macos-14 - description: "MacOS (Intel)" script: ops/pipeline/build-jvm-macos-intel.sh - libname: libxgboost4j_intel_${{ github.sha }}.dylib + libname: libxgboost4j_intel.dylib runner: macos-13 steps: - uses: actions/checkout@v4 @@ -116,10 +84,13 @@ jobs: submodules: "true" - run: bash ${{ matrix.script }} - name: Upload libxgboost4j.dylib + if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') run: | mv -v lib/libxgboost4j.dylib ${{ matrix.libname }} - bash ops/pipeline/publish-artifact.sh ${{ matrix.libname }} \ - s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket xgboost-nightly-builds \ + --prefix ${{ env.BRANCH_NAME }}/${{ github.sha }} --make-public \ + ${{ matrix.libname }} env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} @@ -137,21 +108,25 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - name: Unstash files run: | - bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so + python3 ops/pipeline/manage-artifacts.py download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-jvm-gpu \ + --dest-dir lib \ + libxgboost4j.so - run: bash ops/pipeline/build-jvm-doc.sh - name: Upload JVM doc run: | - bash ops/pipeline/publish-artifact.sh \ - jvm-packages/${{ env.BRANCH_NAME }}.tar.bz2 \ - s3://xgboost-docs/ + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket 
xgboost-docs \ + --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \ + jvm-packages/${{ env.BRANCH_NAME }}.tar.bz2 build-test-jvm-packages: name: Build and test JVM packages (Linux, Scala ${{ matrix.scala_version }}) - needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu @@ -166,16 +141,18 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.jvm + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - name: Build and test JVM packages (Scala ${{ matrix.scala_version }}) run: bash ops/pipeline/build-test-jvm-packages.sh env: SCALA_VERSION: ${{ matrix.scala_version }} - name: Stash files run: | - bash ops/pipeline/stash-artifacts.sh stash \ - build-test-jvm-packages lib/libxgboost4j.so + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-test-jvm-packages \ + lib/libxgboost4j.so if: matrix.scala_version == '2.13' build-test-jvm-packages-other-os: @@ -194,7 +171,7 @@ jobs: with: distribution: 'temurin' java-version: '8' - - uses: dmlc/xgboost-devops/miniforge-setup@main + - uses: dmlc/xgboost-devops/actions/miniforge-setup@main with: environment-name: minimal environment-file: ops/conda_env/minimal.yml @@ -210,11 +187,10 @@ jobs: mvn test -B -pl :xgboost4j_2.12 - name: Publish artifact xgboost4j.dll to S3 run: | - cd lib/ - Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` - s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` - --acl public-read --region us-west-2 + python ops/pipeline/manage-artifacts.py upload ` + --s3-bucket xgboost-nightly-builds ` + --prefix ${{ env.BRANCH_NAME }}/${{ github.sha }} --make-public ` + lib/xgboost4j.dll if: | (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && matrix.os == 'windows-latest' @@ -239,11 +215,15 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - name: Unstash files run: | - bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so + python3 ops/pipeline/manage-artifacts.py download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-jvm-gpu \ + --dest-dir lib \ + libxgboost4j.so - run: bash ops/pipeline/test-jvm-gpu.sh env: SCALA_VERSION: ${{ matrix.scala_version }} @@ -273,13 +253,15 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh ${{ matrix.variant.container_id }} + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - name: Unstash files run: | - bash ops/pipeline/stash-artifacts.sh \ - unstash ${{ matrix.variant.artifact_from }} \ - lib/libxgboost4j.so + python3 ops/pipeline/manage-artifacts.py download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/${{ matrix.variant.artifact_from }} \ + --dest-dir lib \ + libxgboost4j.so ls -lh lib/libxgboost4j.so - name: Deploy JVM packages to S3 run: | diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 2c400b073988..73636e7ce66d 100644 --- a/.github/workflows/lint.yml +++ 
b/.github/workflows/lint.yml @@ -14,26 +14,8 @@ env: ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} jobs: - build-containers: - name: Build CI containers - env: - CONTAINER_ID: xgb-ci.clang_tidy - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - - tag=lint-build-containers - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build ${{ env.CONTAINER_ID }} - run: bash ops/docker_build.sh ${{ env.CONTAINER_ID }} - clang-tidy: name: Run clang-tidy - needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu @@ -44,8 +26,8 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.clang_tidy + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - run: bash ops/pipeline/run-clang-tidy.sh python-mypy-lint: @@ -55,7 +37,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost-devops/miniforge-setup@main + - uses: dmlc/xgboost-devops/actions/miniforge-setup@main with: environment-name: python_lint environment-file: ops/conda_env/python_lint.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cbed730405fa..fd1b94c7af4c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,44 +12,10 @@ concurrency: env: BRANCH_NAME: >- ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} - USE_DOCKER_CACHE: 1 jobs: - build-containers: - name: Build CI containers (${{ matrix.container_id }}) - runs-on: - - runs-on - - runner=${{ matrix.runner }} - - run-id=${{ github.run_id }} - - tag=main-build-containers-${{ matrix.container_id }} - strategy: - matrix: - container_id: - - xgb-ci.gpu_build_rockylinux8 - - xgb-ci.gpu_build_rockylinux8_dev_ver - - xgb-ci.gpu_build_r_rockylinux8 - - xgb-ci.gpu - - xgb-ci.cpu - - xgb-ci.manylinux_2_28_x86_64 - - xgb-ci.manylinux2014_x86_64 - runner: [linux-amd64-cpu] - include: - - container_id: xgb-ci.manylinux2014_aarch64 - runner: linux-arm64-cpu - - container_id: xgb-ci.aarch64 - runner: linux-arm64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build ${{ matrix.container_id }} - run: bash ops/docker_build.sh ${{ matrix.container_id }} - build-cpu: name: Build CPU - needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu @@ -60,15 +26,18 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.cpu + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - run: bash ops/pipeline/build-cpu.sh - name: Stash CLI executable - run: bash ops/pipeline/stash-artifacts.sh stash build-cpu ./xgboost + run: | + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-cpu \ + ./xgboost build-cpu-arm64: name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel - needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-arm64-cpu @@ -79,21 +48,18 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: 
Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.aarch64 + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - run: bash ops/pipeline/build-cpu-arm64.sh - name: Stash files run: | - bash ops/pipeline/stash-artifacts.sh stash build-cpu-arm64 \ - ./xgboost python-package/dist/*.whl - - name: Upload Python wheel - run: | - bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ - s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-cpu-arm64 \ + ./xgboost python-package/dist/*.whl build-cuda: name: Build CUDA + manylinux_2_28_x86_64 wheel - needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu @@ -104,26 +70,19 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8 - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 - - run: bash ops/pipeline/build-cuda.sh + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh + - run: | + bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 disable-rmm - name: Stash files run: | - bash ops/pipeline/stash-artifacts.sh stash build-cuda \ + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-cuda \ build/testxgboost ./xgboost python-package/dist/*.whl - - name: Upload Python wheel - run: | - for file in python-package/dist/*.whl python-package/dist/meta.json - do - bash ops/pipeline/publish-artifact.sh "${file}" \ - s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ - done build-cuda-with-rmm: name: Build CUDA with RMM - needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu @@ -134,24 +93,19 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8 - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - run: | - bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8 + bash ops/pipeline/build-cuda.sh xgb-ci.gpu_build_rockylinux8 enable-rmm - name: Stash files run: | - bash ops/pipeline/stash-artifacts.sh \ - stash build-cuda-with-rmm build/testxgboost - - name: Upload Python wheel - run: | - bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ - s3://xgboost-nightly-builds/experimental_build_with_rmm/ + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-cuda-with-rmm \ + build/testxgboost build-cuda-with-rmm-dev: name: Build CUDA with RMM (dev) - needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu @@ -162,16 +116,14 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8_dev_ver - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - run: | - bash 
ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8_dev_ver + bash ops/pipeline/build-cuda.sh \ + xgb-ci.gpu_build_rockylinux8_dev_ver enable-rmm build-manylinux2014: name: Build manylinux2014_${{ matrix.arch }} wheel - needs: build-containers runs-on: - runs-on - runner=${{ matrix.runner }} @@ -191,20 +143,12 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.manylinux2014_${{ matrix.arch }} + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} - - name: Upload Python wheel - run: | - for wheel in python-package/dist/*.whl - do - bash ops/pipeline/publish-artifact.sh "${wheel}" \ - s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ - done build-gpu-rpkg: name: Build GPU-enabled R package - needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu @@ -215,13 +159,9 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.gpu_build_r_rockylinux8 + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - run: bash ops/pipeline/build-gpu-rpkg.sh - - name: Upload R tarball - run: | - bash ops/pipeline/publish-artifact.sh xgboost_r_gpu_linux_*.tar.gz \ - s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ test-cpp-gpu: @@ -253,12 +193,15 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.gpu + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - name: Unstash gtest run: | - bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ - build/testxgboost + python3 ops/pipeline/manage-artifacts.py download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \ + --dest-dir build \ + testxgboost chmod +x build/testxgboost - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} @@ -300,12 +243,16 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh ${{ matrix.container }} + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - name: Unstash Python wheel run: | - bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ - python-package/dist/*.whl ./xgboost + python3 ops/pipeline/manage-artifacts.py download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \ + --dest-dir wheelhouse \ + *.whl xgboost + mv -v wheelhouse/xgboost . 
chmod +x ./xgboost - name: Run Python tests, ${{ matrix.description }} run: bash ops/pipeline/test-python-wheel.sh ${{ matrix.suite }} ${{ matrix.container }} diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml index 67c1bf57d3a2..54d0078a6164 100644 --- a/.github/workflows/misc.yml +++ b/.github/workflows/misc.yml @@ -24,12 +24,8 @@ jobs: - name: Install system packages run: | sudo apt-get install -y --no-install-recommends ninja-build - - name: Build and install XGBoost - run: bash ops/script/build_via_cmake.sh -DUSE_OPENMP=OFF - - name: Run gtest binary - run: | - cd build - ctest --extra-verbose + - name: Build and test XGBoost + run: bash ops/pipeline/build-test-cpu-nonomp.sh c-api-demo: name: Test installing XGBoost lib + building the C API demo @@ -41,7 +37,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost-devops/miniforge-setup@main + - uses: dmlc/xgboost-devops/actions/miniforge-setup@main with: environment-name: cpp_test environment-file: ops/conda_env/cpp_test.yml diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index dc8de819e2bb..180b1a855733 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -25,7 +25,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost-devops/miniforge-setup@main + - uses: dmlc/xgboost-devops/actions/miniforge-setup@main with: environment-name: sdist_test environment-file: ops/conda_env/sdist_test.yml @@ -44,7 +44,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost-devops/miniforge-setup@main + - uses: dmlc/xgboost-devops/actions/miniforge-setup@main with: environment-name: macos_cpu_test environment-file: ops/conda_env/macos_cpu_test.yml diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml index ab13dfa395cd..33eabbd09dca 100644 --- a/.github/workflows/python_wheels_macos.yml +++ b/.github/workflows/python_wheels_macos.yml @@ -37,7 +37,7 @@ jobs: uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 - name: Install libomp run: brew install libomp - - uses: dmlc/xgboost-devops/miniforge-setup@main + - uses: dmlc/xgboost-devops/actions/miniforge-setup@main with: environment-name: minimal environment-file: ops/conda_env/minimal.yml @@ -46,8 +46,10 @@ jobs: - name: Upload Python wheel if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') run: | - python -m pip install awscli - python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 + python ops/pipeline/manage-artifacts.py upload \ + --s3-bucket xgboost-nightly-builds \ + --prefix ${{ env.BRANCH_NAME }}/${{ github.sha }} --make-public \ + wheelhouse/*.whl env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index 22456b1b68e5..8efdc98d7fd9 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -25,7 +25,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost-devops/miniforge-setup@main + - uses: dmlc/xgboost-devops/actions/miniforge-setup@main with: environment-name: linux_sycl_test environment-file: ops/conda_env/linux_sycl_test.yml @@ -40,7 +40,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 
'true' - - uses: dmlc/xgboost-devops/miniforge-setup@main + - uses: dmlc/xgboost-devops/actions/miniforge-setup@main with: environment-name: linux_sycl_test environment-file: ops/conda_env/linux_sycl_test.yml diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index f97daf761abf..53a1b5c0520b 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -30,8 +30,12 @@ jobs: submodules: "true" - run: powershell ops/pipeline/build-win64-gpu.ps1 - name: Stash files + shell: powershell run: | - powershell ops/pipeline/stash-artifacts.ps1 stash build-win64-gpu ` + conda activate + python ops/pipeline/manage-artifacts.py upload ` + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} ` + --prefix cache/${{ github.run_id }}/build-win64-gpu ` build/testxgboost.exe xgboost.exe ` (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) @@ -47,7 +51,16 @@ jobs: with: submodules: "true" - name: Unstash files + shell: powershell run: | - powershell ops/pipeline/stash-artifacts.ps1 unstash build-win64-gpu ` - build/testxgboost.exe xgboost.exe python-package/dist/*.whl + conda activate + python ops/pipeline/manage-artifacts.py download ` + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} ` + --prefix cache/${{ github.run_id }}/build-win64-gpu ` + --dest-dir build ` + *.whl testxgboost.exe xgboost.exe + Move-Item -Path build/xgboost.exe -Destination . + New-Item -ItemType Directory -Path python-package/dist/ -Force + Move-Item -Path (Get-ChildItem build/*.whl | Select-Object -Expand FullName) ` + -Destination python-package/dist/ - run: powershell ops/pipeline/test-win64-gpu.ps1 diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 8bd8caabc20f..1fa0d7a9cf46 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -21,7 +21,7 @@ S3method(variable.names,xgb.Booster) export("xgb.attr<-") export("xgb.attributes<-") export("xgb.config<-") -export("xgb.parameters<-") +export("xgb.model.parameters<-") export(getinfo) export(setinfo) export(xgb.Callback) @@ -61,6 +61,7 @@ export(xgb.is.same.Booster) export(xgb.load) export(xgb.load.raw) export(xgb.model.dt.tree) +export(xgb.params) export(xgb.plot.deepness) export(xgb.plot.importance) export(xgb.plot.multi.trees) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index d83b6b184329..961715a2e02e 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -61,7 +61,7 @@ #' will be the same as parameter `begin_iteration`, then next one will add +1, and so on). #' #' - iter_feval Evaluation metrics for `evals` that were supplied, either -#' determined by the objective, or by parameter `feval`. +#' determined by the objective, or by parameter `custom_metric`. 
#' #' For [xgb.train()], this will be a named vector with one entry per element in #' `evals`, where the names are determined as 'evals name' + '-' + 'metric name' - for @@ -204,10 +204,9 @@ #' dm <- xgb.DMatrix(x, label = y, nthread = 1) #' model <- xgb.train( #' data = dm, -#' params = list(objective = "reg:squarederror", nthread = 1), +#' params = xgb.params(objective = "reg:squarederror", nthread = 1), #' nrounds = 5, -#' callbacks = list(ssq_callback), -#' keep_extra_attributes = TRUE +#' callbacks = list(ssq_callback) #' ) #' #' # Result from 'f_after_iter' will be available as an attribute @@ -451,7 +450,7 @@ xgb.cb.print.evaluation <- function(period = 1, showsd = TRUE) { #' Callback for logging the evaluation history #' #' @details This callback creates a table with per-iteration evaluation metrics (see parameters -#' `evals` and `feval` in [xgb.train()]). +#' `evals` and `custom_metric` in [xgb.train()]). #' #' Note: in the column names of the final data.table, the dash '-' character is replaced with #' the underscore '_' in order to make the column names more like regular R identifiers. @@ -563,7 +562,7 @@ xgb.cb.reset.parameters <- function(new_params) { } }, f_before_iter = function(env, model, data, evals, iteration) { - pars <- lapply(env$new_params, function(p) { + params <- lapply(env$new_params, function(p) { if (is.function(p)) { return(p(iteration, env$end_iteration)) } else { @@ -572,10 +571,10 @@ xgb.cb.reset.parameters <- function(new_params) { }) if (inherits(model, "xgb.Booster")) { - xgb.parameters(model) <- pars + xgb.model.parameters(model) <- params } else { for (fd in model) { - xgb.parameters(fd$bst) <- pars + xgb.model.parameters(fd$bst) <- params } } return(FALSE) @@ -957,7 +956,7 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { #' label = 1 * (iris$Species == "versicolor"), #' nthread = nthread #' ) -#' param <- list( +#' param <- xgb.params( #' booster = "gblinear", #' objective = "reg:logistic", #' eval_metric = "auc", @@ -971,11 +970,10 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { #' # rate does not break the convergence, but allows us to illustrate the typical pattern of #' # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations. #' bst <- xgb.train( -#' param, +#' c(param, list(eta = 1.)), #' dtrain, -#' list(tr = dtrain), +#' evals = list(tr = dtrain), #' nrounds = 200, -#' eta = 1., #' callbacks = list(xgb.cb.gblinear.history()) #' ) #' @@ -986,14 +984,18 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { #' # With the deterministic coordinate descent updater, it is safer to use higher learning rates. 
#' # Will try the classical componentwise boosting which selects a single best feature per round: #' bst <- xgb.train( -#' param, +#' c( +#' param, +#' xgb.params( +#' eta = 0.8, +#' updater = "coord_descent", +#' feature_selector = "thrifty", +#' top_k = 1 +#' ) +#' ), #' dtrain, -#' list(tr = dtrain), +#' evals = list(tr = dtrain), #' nrounds = 200, -#' eta = 0.8, -#' updater = "coord_descent", -#' feature_selector = "thrifty", -#' top_k = 1, #' callbacks = list(xgb.cb.gblinear.history()) #' ) #' matplot(xgb.gblinear.history(bst), type = "l") @@ -1003,11 +1005,10 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { #' #' # For xgb.cv: #' bst <- xgb.cv( -#' param, +#' c(param, list(eta = 0.8)), #' dtrain, #' nfold = 5, #' nrounds = 100, -#' eta = 0.8, #' callbacks = list(xgb.cb.gblinear.history()) #' ) #' # coefficients in the CV fold #3 @@ -1017,7 +1018,7 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { #' #### Multiclass classification: #' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread) #' -#' param <- list( +#' param <- xgb.params( #' booster = "gblinear", #' objective = "multi:softprob", #' num_class = 3, @@ -1029,11 +1030,10 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { #' # For the default linear updater 'shotgun' it sometimes is helpful #' # to use smaller eta to reduce instability #' bst <- xgb.train( -#' param, +#' c(param, list(eta = 0.5)), #' dtrain, -#' list(tr = dtrain), +#' evals = list(tr = dtrain), #' nrounds = 50, -#' eta = 0.5, #' callbacks = list(xgb.cb.gblinear.history()) #' ) #' @@ -1044,11 +1044,10 @@ xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { #' #' # CV: #' bst <- xgb.cv( -#' param, +#' c(param, list(eta = 0.5)), #' dtrain, #' nfold = 5, #' nrounds = 70, -#' eta = 0.5, #' callbacks = list(xgb.cb.gblinear.history(FALSE)) #' ) #' # 1st fold of 1st class diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 008a88dcd715..81e15f7e4421 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -71,21 +71,13 @@ NVL <- function(x, val) { # Merges booster params with whatever is provided in ... # plus runs some checks -check.booster.params <- function(params, ...) { +check.booster.params <- function(params) { if (!identical(class(params), "list")) stop("params must be a list") # in R interface, allow for '.' instead of '_' in parameter names names(params) <- gsub(".", "_", names(params), fixed = TRUE) - # merge parameters from the params and the dots-expansion - dot_params <- list(...) - names(dot_params) <- gsub(".", "_", names(dot_params), fixed = TRUE) - if (length(intersect(names(params), - names(dot_params))) > 0) - stop("Same parameters in 'params' and in the call are not allowed. Please check your 'params' list.") - params <- c(params, dot_params) - # providing a parameter multiple times makes sense only for 'eval_metric' name_freqs <- table(names(params)) multi_names <- setdiff(names(name_freqs[name_freqs > 1]), 'eval_metric') @@ -110,7 +102,6 @@ check.booster.params <- function(params, ...) { } # monotone_constraints parser - if (!is.null(params[['monotone_constraints']]) && typeof(params[['monotone_constraints']]) != "character") { vec2str <- paste(params[['monotone_constraints']], collapse = ',') @@ -144,55 +135,56 @@ check.booster.params <- function(params, ...) { # Performs some checks related to custom objective function. 
-# WARNING: has side-effects and can modify 'params' and 'obj' in its calling frame -check.custom.obj <- function(env = parent.frame()) { - if (!is.null(env$params[['objective']]) && !is.null(env$obj)) - stop("Setting objectives in 'params' and 'obj' at the same time is not allowed") +check.custom.obj <- function(params, objective) { + if (!is.null(params[['objective']]) && !is.null(objective)) + stop("Setting objectives in 'params' and 'objective' at the same time is not allowed") - if (!is.null(env$obj) && typeof(env$obj) != 'closure') - stop("'obj' must be a function") + if (!is.null(objective) && typeof(objective) != 'closure') + stop("'objective' must be a function") # handle the case when custom objective function was provided through params - if (!is.null(env$params[['objective']]) && - typeof(env$params$objective) == 'closure') { - env$obj <- env$params$objective - env$params$objective <- NULL + if (!is.null(params[['objective']]) && + typeof(params$objective) == 'closure') { + objective <- params$objective + params$objective <- NULL } + return(list(params = params, objective = objective)) } # Performs some checks related to custom evaluation function. -# WARNING: has side-effects and can modify 'params' and 'feval' in its calling frame -check.custom.eval <- function(env = parent.frame()) { - if (!is.null(env$params[['eval_metric']]) && !is.null(env$feval)) - stop("Setting evaluation metrics in 'params' and 'feval' at the same time is not allowed") +check.custom.eval <- function(params, custom_metric, maximize, early_stopping_rounds, callbacks) { + if (!is.null(params[['eval_metric']]) && !is.null(custom_metric)) + stop("Setting evaluation metrics in 'params' and 'custom_metric' at the same time is not allowed") - if (!is.null(env$feval) && typeof(env$feval) != 'closure') - stop("'feval' must be a function") + if (!is.null(custom_metric) && typeof(custom_metric) != 'closure') + stop("'custom_metric' must be a function") # handle a situation when custom eval function was provided through params - if (!is.null(env$params[['eval_metric']]) && - typeof(env$params$eval_metric) == 'closure') { - env$feval <- env$params$eval_metric - env$params$eval_metric <- NULL + if (!is.null(params[['eval_metric']]) && + typeof(params$eval_metric) == 'closure') { + custom_metric <- params$eval_metric + params$eval_metric <- NULL } - # require maximize to be set when custom feval and early stopping are used together - if (!is.null(env$feval) && - is.null(env$maximize) && ( - !is.null(env$early_stopping_rounds) || - has.callbacks(env$callbacks, "early_stop"))) + # require maximize to be set when custom metric and early stopping are used together + if (!is.null(custom_metric) && + is.null(maximize) && ( + !is.null(early_stopping_rounds) || + has.callbacks(callbacks, "early_stop"))) stop("Please set 'maximize' to indicate whether the evaluation metric needs to be maximized or not") + + return(list(params = params, custom_metric = custom_metric)) } # Update a booster handle for an iteration with dtrain data -xgb.iter.update <- function(bst, dtrain, iter, obj) { +xgb.iter.update <- function(bst, dtrain, iter, objective) { if (!inherits(dtrain, "xgb.DMatrix")) { stop("dtrain must be of xgb.DMatrix class") } handle <- xgb.get.handle(bst) - if (is.null(obj)) { + if (is.null(objective)) { .Call(XGBoosterUpdateOneIter_R, handle, as.integer(iter), dtrain) } else { pred <- predict( @@ -201,12 +193,12 @@ xgb.iter.update <- function(bst, dtrain, iter, obj) { outputmargin = TRUE, training = TRUE ) - gpair <- 
obj(pred, dtrain) - n_samples <- dim(dtrain)[1] + gpair <- objective(pred, dtrain) + n_samples <- dim(dtrain)[1L] grad <- gpair$grad hess <- gpair$hess - if ((is.matrix(grad) && dim(grad)[1] != n_samples) || + if ((is.matrix(grad) && dim(grad)[1L] != n_samples) || (is.vector(grad) && length(grad) != n_samples) || (is.vector(grad) != is.vector(hess))) { warning(paste( @@ -230,14 +222,14 @@ xgb.iter.update <- function(bst, dtrain, iter, obj) { # Evaluate one iteration. # Returns a named vector of evaluation metrics # with the names in a 'datasetname-metricname' format. -xgb.iter.eval <- function(bst, evals, iter, feval) { +xgb.iter.eval <- function(bst, evals, iter, custom_metric) { handle <- xgb.get.handle(bst) if (length(evals) == 0) return(NULL) evnames <- names(evals) - if (is.null(feval)) { + if (is.null(custom_metric)) { msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), evals, as.list(evnames)) mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2) res <- structure(as.numeric(mat[2, ]), names = mat[1, ]) @@ -246,7 +238,7 @@ xgb.iter.eval <- function(bst, evals, iter, feval) { w <- evals[[j]] ## predict using all trees preds <- predict(bst, w, outputmargin = TRUE, iterationrange = "all") - eval_res <- feval(preds, w) + eval_res <- custom_metric(preds, w) out <- eval_res$value names(out) <- paste0(evnames[j], "-", eval_res$metric) out @@ -285,7 +277,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { return(generate.group.folds(nfold, group)) } objective <- params$objective - if (!is.character(objective)) { + if (stratified && !is.character(objective)) { warning("Will use unstratified splitting (custom objective used)") stratified <- FALSE } @@ -498,11 +490,13 @@ NULL #' #' bst <- xgb.train( #' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = 2, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = 2, +#' objective = "binary:logistic" +#' ) #' ) #' #' # Save as a stand-alone file; load it with xgb.load() @@ -535,44 +529,88 @@ NULL NULL # Lookup table for the deprecated parameters bookkeeping -depr_par_lut <- matrix(c( - 'print.every.n', 'print_every_n', - 'early.stop.round', 'early_stopping_rounds', - 'training.data', 'data', - 'with.stats', 'with_stats', - 'numberOfClusters', 'n_clusters', - 'features.keep', 'features_keep', - 'plot.height', 'plot_height', - 'plot.width', 'plot_width', - 'n_first_tree', 'trees', - 'dummy', 'DUMMY', - 'watchlist', 'evals' -), ncol = 2, byrow = TRUE) -colnames(depr_par_lut) <- c('old', 'new') +deprecated_train_params <- list( + 'print.every.n' = 'print_every_n', + 'early.stop.round' = 'early_stopping_rounds', + 'training.data' = 'data', + 'dtrain' = 'data', + 'watchlist' = 'evals', + 'feval' = 'custom_metric' +) +deprecated_dttree_params <- list( + 'n_first_tree' = 'trees' +) +deprecated_plot_params <- list( + 'plot.height' = 'plot_height', + 'plot.width' = 'plot_width' +) +deprecated_multitrees_params <- c( + deprecated_plot_params, + list('features.keep' = 'features_keep') +) +deprecated_dump_params <- list( + 'with.stats' = 'with_stats' +) +deprecated_plottree_params <- c( + deprecated_plot_params, + deprecated_dump_params +) # Checks the dot-parameters for deprecated names # (including partial matching), gives a deprecation warning, # and sets new parameters to the old parameters' values within its parent frame. 
# WARNING: has side-effects -check.deprecation <- function(..., env = parent.frame()) { - pars <- list(...) - # exact and partial matches - all_match <- pmatch(names(pars), depr_par_lut[, 1]) - # indices of matched pars' names - idx_pars <- which(!is.na(all_match)) - if (length(idx_pars) == 0) return() - # indices of matched LUT rows - idx_lut <- all_match[idx_pars] - # which of idx_lut were the exact matches? - ex_match <- depr_par_lut[idx_lut, 1] %in% names(pars) - for (i in seq_along(idx_pars)) { - pars_par <- names(pars)[idx_pars[i]] - old_par <- depr_par_lut[idx_lut[i], 1] - new_par <- depr_par_lut[idx_lut[i], 2] - if (!ex_match[i]) { - warning("'", pars_par, "' was partially matched to '", old_par, "'") +check.deprecation <- function( + deprecated_list, + fn_call, + ..., + env = parent.frame(), + allow_unrecognized = FALSE +) { + params <- list(...) + if (length(params) == 0) { + return(NULL) + } + if (is.null(names(params)) || min(nchar(names(params))) == 0L) { + stop("Passed invalid positional arguments") + } + all_match <- pmatch(names(params), names(deprecated_list)) + # throw error on unrecognized parameters + if (!allow_unrecognized && anyNA(all_match)) { + names_unrecognized <- names(params)[is.na(all_match)] + # make it informative if they match something that goes under 'params' + if (deprecated_list[[1L]] == deprecated_train_params[[1L]]) { + names_params <- formalArgs(xgb.params) + names_params <- c(names_params, gsub("_", ".", names_params, fixed = TRUE)) + names_under_params <- intersect(names_unrecognized, names_params) + if (length(names_under_params)) { + stop( + "Passed invalid function arguments: ", + paste(head(names_under_params), collapse = ", "), + ". These should be passed as a list to argument 'params'." + ) + } + } + # otherwise throw a generic error + stop( + "Passed unrecognized parameters: ", + paste(head(names_unrecognized), collapse = ", ") + ) + } + + matched_params <- deprecated_list[all_match[!is.na(all_match)]] + idx_orig <- seq_along(params)[!is.na(all_match)] + function_args_passed <- names(as.list(fn_call))[-1L] + for (idx in seq_along(matched_params)) { + match_old <- names(matched_params)[[idx]] + match_new <- matched_params[[idx]] + warning( + "Parameter '", match_old, "' has been renamed to '", + match_new, "' and will be removed in a future version." 
+ ) + if (match_new %in% function_args_passed) { + stop("Passed both '", match_new, "' and '", match_old, "'.") } - .Deprecated(new_par, old = old_par, package = 'xgboost') - stop() + env[[match_new]] <- params[[idx_orig[idx]]] } } diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index b38cd42bcef3..6ffaa299b500 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -20,20 +20,20 @@ xgb.Booster <- function(params, cachelist, modelfile) { .Call(XGBoosterLoadModel_R, xgb.get.handle(bst), enc2utf8(modelfile[1])) niter <- xgb.get.num.boosted.rounds(bst) if (length(params) > 0) { - xgb.parameters(bst) <- params + xgb.model.parameters(bst) <- params } return(list(bst = bst, niter = niter)) } else if (is.raw(modelfile)) { ## A memory buffer bst <- xgb.load.raw(modelfile) niter <- xgb.get.num.boosted.rounds(bst) - xgb.parameters(bst) <- params + xgb.model.parameters(bst) <- params return(list(bst = bst, niter = niter)) } else if (inherits(modelfile, "xgb.Booster")) { ## A booster object bst <- .Call(XGDuplicate_R, modelfile) niter <- xgb.get.num.boosted.rounds(bst) - xgb.parameters(bst) <- params + xgb.model.parameters(bst) <- params return(list(bst = bst, niter = niter)) } else { stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object") @@ -42,7 +42,7 @@ xgb.Booster <- function(params, cachelist, modelfile) { ## Create new model bst <- .Call(XGBoosterCreate_R, cachelist) if (length(params) > 0) { - xgb.parameters(bst) <- params + xgb.model.parameters(bst) <- params } return(list(bst = bst, niter = 0L)) } @@ -196,7 +196,7 @@ xgb.get.handle <- function(object) { #' of the most important features first. See below about the format of the returned results. #' #' The `predict()` method uses as many threads as defined in `xgb.Booster` object (all by default). -#' If you want to change their number, assign a new number to `nthread` using [xgb.parameters<-()]. +#' If you want to change their number, assign a new number to `nthread` using [xgb.model.parameters<-()]. #' Note that converting a matrix to [xgb.DMatrix()] uses multiple threads too. 
#' #' @return @@ -264,11 +264,13 @@ xgb.get.handle <- function(object) { #' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), -#' max_depth = 2, -#' eta = 0.5, -#' nthread = nthread, #' nrounds = 5, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 0.5, +#' nthread = nthread, +#' objective = "binary:logistic" +#' ) #' ) #' #' # use all trees by default @@ -307,13 +309,15 @@ xgb.get.handle <- function(object) { #' #' bst <- xgb.train( #' data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), -#' max_depth = 4, -#' eta = 0.5, -#' nthread = 2, #' nrounds = 10, -#' subsample = 0.5, -#' objective = "multi:softprob", -#' num_class = num_class +#' params = xgb.params( +#' max_depth = 4, +#' eta = 0.5, +#' nthread = 2, +#' subsample = 0.5, +#' objective = "multi:softprob", +#' num_class = num_class +#' ) #' ) #' #' # predict for softmax returns num_class probability numbers per case: @@ -329,13 +333,15 @@ xgb.get.handle <- function(object) { #' #' bst <- xgb.train( #' data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), -#' max_depth = 4, -#' eta = 0.5, -#' nthread = 2, #' nrounds = 10, -#' subsample = 0.5, -#' objective = "multi:softmax", -#' num_class = num_class +#' params = xgb.params( +#' max_depth = 4, +#' eta = 0.5, +#' nthread = 2, +#' subsample = 0.5, +#' objective = "multi:softmax", +#' num_class = num_class +#' ) #' ) #' #' pred <- predict(bst, as.matrix(iris[, -5])) @@ -631,7 +637,7 @@ validate.features <- function(bst, newdata) { #' and its serialization is handled externally. #' Also, setting an attribute that has the same name as one of XGBoost's parameters wouldn't #' change the value of that parameter for a model. -#' Use [xgb.parameters<-()] to set or change model parameters. +#' Use [xgb.model.parameters<-()] to set or change model parameters. #' #' The `xgb.attributes<-` setter either updates the existing or adds one or several attributes, #' but it doesn't delete the other existing attributes. 
@@ -662,11 +668,13 @@ validate.features <- function(bst, newdata) { #' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = 2, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = 2, +#' objective = "binary:logistic" +#' ) #' ) #' #' xgb.attr(bst, "my_attribute") <- "my attribute value" @@ -768,11 +776,13 @@ xgb.attributes <- function(object) { #' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = nthread, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = nthread, +#' objective = "binary:logistic" +#' ) #' ) #' #' config <- xgb.config(bst) @@ -821,18 +831,20 @@ xgb.config <- function(object) { #' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = 2, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = 2, +#' objective = "binary:logistic" +#' ) #' ) #' -#' xgb.parameters(bst) <- list(eta = 0.1) +#' xgb.model.parameters(bst) <- list(eta = 0.1) #' -#' @rdname xgb.parameters +#' @rdname xgb.model.parameters #' @export -`xgb.parameters<-` <- function(object, value) { +`xgb.model.parameters<-` <- function(object, value) { if (length(value) == 0) return(object) p <- as.list(value) if (is.null(names(p)) || any(nchar(names(p)) == 0)) { @@ -897,7 +909,7 @@ setinfo.xgb.Booster <- function(object, name, info) { #' @param model,x A fitted `xgb.Booster` model. #' @return The number of rounds saved in the model as an integer. #' @details Note that setting booster parameters related to training -#' continuation / updates through [xgb.parameters<-()] will reset the +#' continuation / updates through [xgb.model.parameters<-()] will reset the #' number of rounds to zero. #' @export #' @rdname xgb.get.num.boosted.rounds @@ -936,7 +948,7 @@ length.xgb.Booster <- function(x) { #' x <- as.matrix(mtcars[, -1]) #' #' dm <- xgb.DMatrix(x, label = y, nthread = 1) -#' model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 5) +#' model <- xgb.train(data = dm, params = xgb.params(nthread = 1), nrounds = 5) #' model_slice <- xgb.slice.Booster(model, 1, 3) #' # Prediction for first three rounds #' predict(model, x, predleaf = TRUE)[, 1:3] @@ -1080,7 +1092,7 @@ xgb.best_iteration <- function(bst) { #' x <- as.matrix(mtcars[, -1]) #' #' dm <- xgb.DMatrix(data = x, label = y, nthread = 1) -#' params <- list(booster = "gblinear", nthread = 1) +#' params <- xgb.params(booster = "gblinear", nthread = 1) #' model <- xgb.train(data = dm, params = params, nrounds = 2) #' coef(model) #' @export @@ -1163,8 +1175,8 @@ coef.xgb.Booster <- function(object, ...) 
{ #' #' model <- xgb.train( #' data = dm, -#' params = list(nthread = 1), -#' nround = 3 +#' params = xgb.params(nthread = 1), +#' nrounds = 3 #' ) #' #' # Set an arbitrary attribute kept at the C level @@ -1225,9 +1237,9 @@ xgb.copy.Booster <- function(model) { #' x <- as.matrix(mtcars[, -1]) #' #' model <- xgb.train( -#' params = list(nthread = 1), +#' params = xgb.params(nthread = 1), #' data = xgb.DMatrix(x, label = y, nthread = 1), -#' nround = 3 +#' nrounds = 3 #' ) #' #' model_shallow_copy <- model @@ -1266,11 +1278,13 @@ xgb.is.same.Booster <- function(obj1, obj2) { #' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = 2, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = 2, +#' objective = "binary:logistic" +#' ) #' ) #' #' attr(bst, "myattr") <- "memo" diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 280fcf52ee3e..66bd7205570b 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -569,7 +569,6 @@ xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) { tmp <- .process.df.for.dmatrix(lst$data, lst$feature_types) lst$feature_types <- tmp$feature_types .Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst) - rm(tmp) } else if (is.matrix(lst$data)) { .Call(XGProxyDMatrixSetDataDense_R, proxy_handle, lst$data) } else if (inherits(lst$data, "dgRMatrix")) { @@ -688,7 +687,7 @@ xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) { #' dm <- xgb.ExtMemDMatrix(data_iterator, cache_prefix, nthread = 1) #' #' # After construction, can be used as a regular DMatrix -#' params <- list(nthread = 1, objective = "reg:squarederror") +#' params <- xgb.params(nthread = 1, objective = "reg:squarederror") #' model <- xgb.train(data = dm, nrounds = 2, params = params) #' #' # Predictions can also be called on it, and should be the same @@ -1136,7 +1135,7 @@ setinfo.xgb.DMatrix <- function(object, name, info) { #' # DMatrix is not quantized right away, but will be once a hist model is generated #' model <- xgb.train( #' data = dm, -#' params = list(tree_method = "hist", max_bin = 8, nthread = 1), +#' params = xgb.params(tree_method = "hist", max_bin = 8, nthread = 1), #' nrounds = 3 #' ) #' diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R index 2c4015c5f2de..85d9d560cef0 100644 --- a/R-package/R/xgb.create.features.R +++ b/R-package/R/xgb.create.features.R @@ -42,7 +42,6 @@ #' #' @param model Decision tree boosting model learned on the original data. #' @param data Original data (usually provided as a `dgCMatrix` matrix). -#' @param ... Currently not used. #' #' @return A `dgCMatrix` matrix including both the original data and the new features. 
#' @@ -53,10 +52,10 @@ #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) #' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2)) #' -#' param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') +#' param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic', nthread = 1) #' nrounds = 4 #' -#' bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) +#' bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds) #' #' # Model accuracy without new features #' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / @@ -68,12 +67,12 @@ #' #' # learning with new features #' new.dtrain <- xgb.DMatrix( -#' data = new.features.train, label = agaricus.train$label, nthread = 2 +#' data = new.features.train, label = agaricus.train$label #' ) #' new.dtest <- xgb.DMatrix( -#' data = new.features.test, label = agaricus.test$label, nthread = 2 +#' data = new.features.test, label = agaricus.test$label #' ) -#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2) +#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds) #' #' # Model accuracy with new features #' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / @@ -84,8 +83,7 @@ #' accuracy.after, "!\n")) #' #' @export -xgb.create.features <- function(model, data, ...) { - check.deprecation(...) +xgb.create.features <- function(model, data) { pred_with_leaf <- predict.xgb.Booster(model, data, predleaf = TRUE) cols <- lapply(as.data.frame(pred_with_leaf), factor) cbind(data, sparse.model.matrix(~ . -1, cols)) # nolint diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 67821919f71c..9b4095150117 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -2,30 +2,12 @@ #' #' The cross validation function of xgboost. #' -#' @param params The list of parameters. The complete list of parameters is available in the -#' [online documentation](http://xgboost.readthedocs.io/en/latest/parameter.html). -#' Below is a shorter summary: -#' - `objective`: Objective function, common ones are -#' - `reg:squarederror`: Regression with squared loss. -#' - `binary:logistic`: Logistic regression for classification. -#' -#' See [xgb.train()] for complete list of objectives. -#' - `eta`: Step size of each boosting step -#' - `max_depth`: Maximum depth of the tree -#' - `nthread`: Number of threads used in training. If not set, all threads are used -#' -#' See [xgb.train()] for further details. -#' See also demo for walkthrough example in R. -#' -#' Note that, while `params` accepts a `seed` entry and will use such parameter for model training if -#' supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG -#' system - thus, for reproducible results, one needs to call the [set.seed()] function beforehand. +#' @inheritParams xgb.train #' @param data An `xgb.DMatrix` object, with corresponding fields like `label` or bounds as required #' for model training by the objective. #' #' Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix` #' or `xgb.ExtMemDMatrix` are not supported here. -#' @param nrounds The max number of iterations. #' @param nfold The original dataset is randomly partitioned into `nfold` equal size subsamples. #' @param prediction A logical value indicating whether to return the test fold predictions #' from each CV model. 
This parameter engages the [xgb.cb.cv.predict()] callback. @@ -41,10 +23,6 @@ #' - `auc`: Area under curve #' - `aucpr`: Area under PR curve #' - `merror`: Exact matching error used to evaluate multi-class classification -#' @param obj Customized objective function. Returns gradient and second order -#' gradient with given prediction and dtrain. -#' @param feval Customized evaluation function. Returns -#' `list(metric='metric-name', value='metric-value')` with given prediction and dtrain. #' @param stratified Logical flag indicating whether sampling of folds should be stratified #' by the values of outcome labels. For real-valued labels in regression objectives, #' stratification will be done by discretizing the labels into up to 5 buckets beforehand. @@ -68,24 +46,10 @@ #' (the default) all indices not specified in `folds` will be used for training. #' #' This is not supported when `data` has `group` field. -#' @param verbose Logical flag. Should statistics be printed during the process? -#' @param print_every_n Print each nth iteration evaluation messages when `verbose > 0`. -#' Default is 1 which means all messages are printed. This parameter is passed to the -#' [xgb.cb.print.evaluation()] callback. -#' @param early_stopping_rounds If `NULL`, the early stopping function is not triggered. -#' If set to an integer `k`, training with a validation set will stop if the performance -#' doesn't improve for `k` rounds. -#' Setting this parameter engages the [xgb.cb.early.stop()] callback. -#' @param maximize If `feval` and `early_stopping_rounds` are set, -#' then this parameter must be set as well. -#' When it is `TRUE`, it means the larger the evaluation score the better. -#' This parameter is passed to the [xgb.cb.early.stop()] callback. #' @param callbacks A list of callback functions to perform various task during boosting. #' See [xgb.Callback()]. Some of the callbacks are automatically created depending on the #' parameters' values. User can provide either existing or their own callback methods in order #' to customize the training process. -#' @param ... Other parameters to pass to `params`. -#' #' @details #' The original sample is randomly partitioned into `nfold` equal size subsamples. #' @@ -129,35 +93,42 @@ #' cv <- xgb.cv( #' data = dtrain, #' nrounds = 3, -#' nthread = 2, +#' params = xgb.params( +#' nthread = 2, +#' max_depth = 3, +#' eta = 1, +#' objective = "binary:logistic" +#' ), #' nfold = 5, -#' metrics = list("rmse","auc"), -#' max_depth = 3, -#' eta = 1,objective = "binary:logistic" +#' metrics = list("rmse","auc") #' ) #' print(cv) #' print(cv, verbose = TRUE) #' #' @export -xgb.cv <- function(params = list(), data, nrounds, nfold, +xgb.cv <- function(params = xgb.params(), data, nrounds, nfold, prediction = FALSE, showsd = TRUE, metrics = list(), - obj = NULL, feval = NULL, stratified = "auto", folds = NULL, train_folds = NULL, - verbose = TRUE, print_every_n = 1L, + objective = NULL, custom_metric = NULL, stratified = "auto", + folds = NULL, train_folds = NULL, verbose = TRUE, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) { + check.deprecation(deprecated_train_params, match.call(), ...) - check.deprecation(...) stopifnot(inherits(data, "xgb.DMatrix")) if (inherits(data, "xgb.DMatrix") && .Call(XGCheckNullPtr_R, data)) { stop("'data' is an invalid 'xgb.DMatrix' object. Must be constructed again.") } - params <- check.booster.params(params, ...) 
+ params <- check.booster.params(params) # TODO: should we deprecate the redundant 'metrics' parameter? for (m in metrics) params <- c(params, list("eval_metric" = m)) - check.custom.obj() - check.custom.eval() + tmp <- check.custom.obj(params, objective) + params <- tmp$params + objective <- tmp$objective + tmp <- check.custom.eval(params, custom_metric, maximize, early_stopping_rounds, callbacks) + params <- tmp$params + custom_metric <- tmp$custom_metric if (stratified == "auto") { if (is.character(params$objective)) { @@ -276,13 +247,13 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, bst = fd$bst, dtrain = fd$dtrain, iter = iteration - 1, - obj = obj + objective = objective ) xgb.iter.eval( bst = fd$bst, evals = fd$evals, iter = iteration - 1, - feval = feval + custom_metric = custom_metric ) }) msg <- simplify2array(msg) @@ -342,11 +313,13 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, #' cv <- xgb.cv( #' data = xgb.DMatrix(train$data, label = train$label), #' nfold = 5, -#' max_depth = 2, -#' eta = 1, -#' nthread = 2, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = 2, +#' objective = "binary:logistic" +#' ) #' ) #' print(cv) #' print(cv, verbose = TRUE) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 76271ec515c9..cf601e6388c1 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -16,8 +16,7 @@ #' #' Format 'dot' for a single tree can be passed directly to packages that consume this format #' for graph visualization, such as function `DiagrammeR::grViz()` -#' @param ... Currently not used -#' +#' @inheritParams xgb.train #' @return #' If fname is not provided or set to `NULL` the function will return the model #' as a character vector. Otherwise it will return `TRUE`. @@ -32,11 +31,13 @@ #' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = 2, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = 2, +#' objective = "binary:logistic" +#' ) #' ) #' #' # save the model in file 'xgb.model.dump' @@ -56,7 +57,7 @@ #' @export xgb.dump <- function(model, fname = NULL, fmap = "", with_stats = FALSE, dump_format = c("text", "json", "dot"), ...) { - check.deprecation(...) + check.deprecation(deprecated_dump_params, match.call(), ...) 
dump_format <- match.arg(dump_format) if (!inherits(model, "xgb.Booster")) stop("model: argument must be of type xgb.Booster") diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 548421d2c83c..c1b45e81bb8c 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -46,11 +46,13 @@ #' #' bst <- xgb.train( #' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = 2, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = 2, +#' objective = "binary:logistic" +#' ) #' ) #' #' xgb.importance(model = bst) @@ -58,10 +60,13 @@ #' # binomial classification using "gblinear": #' bst <- xgb.train( #' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), -#' booster = "gblinear", -#' eta = 0.3, -#' nthread = 1, -#' nrounds = 20,objective = "binary:logistic" +#' nrounds = 20, +#' params = xgb.params( +#' booster = "gblinear", +#' eta = 0.3, +#' nthread = 1, +#' objective = "binary:logistic" +#' ) #' ) #' #' xgb.importance(model = bst) @@ -74,12 +79,14 @@ #' as.matrix(iris[, -5]), #' label = as.numeric(iris$Species) - 1 #' ), -#' max_depth = 3, -#' eta = 0.2, -#' nthread = 2, #' nrounds = nrounds, -#' objective = "multi:softprob", -#' num_class = nclass +#' params = xgb.params( +#' max_depth = 3, +#' eta = 0.2, +#' nthread = 2, +#' objective = "multi:softprob", +#' num_class = nclass +#' ) #' ) #' #' # all classes clumped together: @@ -102,12 +109,14 @@ #' scale(as.matrix(iris[, -5])), #' label = as.numeric(iris$Species) - 1 #' ), -#' booster = "gblinear", -#' eta = 0.2, -#' nthread = 1, #' nrounds = 15, -#' objective = "multi:softprob", -#' num_class = nclass +#' params = xgb.params( +#' booster = "gblinear", +#' eta = 0.2, +#' nthread = 1, +#' objective = "multi:softprob", +#' num_class = nclass +#' ) #' ) #' #' xgb.importance(model = mbst) diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index 29ab2dadaf72..b08a308a3dd8 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -32,11 +32,13 @@ #' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = nthread, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = nthread, +#' objective = "binary:logistic" +#' ) #' ) #' #' fname <- file.path(tempdir(), "xgb.ubj") diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index db2972da7513..12ed705ba8f7 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -13,8 +13,7 @@ #' @param use_int_id A logical flag indicating whether nodes in columns "Yes", "No", and #' "Missing" should be represented as integers (when `TRUE`) or as "Tree-Node" #' character strings (when `FALSE`, default). -#' @param ... Currently not used. -#' +#' @inheritParams xgb.train #' @return #' A `data.table` with detailed information about tree nodes. It has the following columns: #' - `Tree`: integer ID of a tree in a model (zero-based index). 
@@ -44,11 +43,13 @@ #' #' bst <- xgb.train( #' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = nthread, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = nthread, +#' objective = "binary:logistic" +#' ) #' ) #' #' # This bst model already has feature_names stored with it, so those would be used when @@ -66,7 +67,7 @@ #' @export xgb.model.dt.tree <- function(model = NULL, text = NULL, trees = NULL, use_int_id = FALSE, ...) { - check.deprecation(...) + check.deprecation(deprecated_dttree_params, match.call(), ...) if (!inherits(model, "xgb.Booster") && !is.character(text)) { stop("Either 'model' must be an object of class xgb.Booster\n", diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index c8aa92f22f6b..d6ba9c3d2411 100644 --- a/R-package/R/xgb.plot.deepness.R +++ b/R-package/R/xgb.plot.deepness.R @@ -51,12 +51,14 @@ #' ## Change max_depth to a higher number to get a more significant result #' bst <- xgb.train( #' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), -#' max_depth = 6, -#' nthread = nthread, #' nrounds = 50, -#' objective = "binary:logistic", -#' subsample = 0.5, -#' min_child_weight = 2 +#' params = xgb.params( +#' max_depth = 6, +#' nthread = nthread, +#' objective = "binary:logistic", +#' subsample = 0.5, +#' min_child_weight = 2 +#' ) #' ) #' #' xgb.plot.deepness(bst) diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index 11be29a7cb68..750f386dd6f2 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -52,11 +52,13 @@ #' #' bst <- xgb.train( #' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), -#' max_depth = 3, -#' eta = 1, -#' nthread = nthread, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 3, +#' eta = 1, +#' nthread = nthread, +#' objective = "binary:logistic" +#' ) #' ) #' #' importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst) @@ -74,7 +76,7 @@ #' @export xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL, rel_to_first = FALSE, left_margin = 10, cex = NULL, plot = TRUE, ...) { - check.deprecation(...) + check.deprecation(deprecated_plot_params, match.call(), ..., allow_unrecognized = TRUE) if (!is.data.table(importance_matrix)) { stop("importance_matrix: must be a data.table") } diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index 8b4f0eeed037..1c57dd84babd 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -36,13 +36,15 @@ #' #' bst <- xgb.train( #' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), -#' max_depth = 15, -#' eta = 1, -#' nthread = nthread, #' nrounds = 30, -#' objective = "binary:logistic", -#' min_child_weight = 50, -#' verbose = 0 +#' verbose = 0, +#' params = xgb.params( +#' max_depth = 15, +#' eta = 1, +#' nthread = nthread, +#' objective = "binary:logistic", +#' min_child_weight = 50 +#' ) #' ) #' #' p <- xgb.plot.multi.trees(model = bst, features_keep = 3) @@ -65,7 +67,7 @@ xgb.plot.multi.trees <- function(model, features_keep = 5, plot_width = NULL, pl if (!requireNamespace("DiagrammeR", quietly = TRUE)) { stop("DiagrammeR is required for xgb.plot.multi.trees") } - check.deprecation(...) + check.deprecation(deprecated_multitrees_params, match.call(), ...) 
tree.matrix <- xgb.model.dt.tree(model = model) # first number of the path represents the tree, then the following numbers are related to the path to follow diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 4184c6f5ea6a..bb678968db88 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -84,12 +84,14 @@ #' bst <- xgb.train( #' data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), #' nrounds = nrounds, -#' eta = 0.1, -#' max_depth = 3, -#' subsample = 0.5, -#' objective = "binary:logistic", -#' nthread = nthread, -#' verbose = 0 +#' verbose = 0, +#' params = xgb.params( +#' eta = 0.1, +#' max_depth = 3, +#' subsample = 0.5, +#' objective = "binary:logistic", +#' nthread = nthread +#' ) #' ) #' #' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") @@ -109,13 +111,15 @@ #' mbst <- xgb.train( #' data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1), #' nrounds = nrounds, -#' max_depth = 2, -#' eta = 0.3, -#' subsample = 0.5, -#' nthread = nthread, -#' objective = "multi:softprob", -#' num_class = nclass, -#' verbose = 0 +#' verbose = 0, +#' params = xgb.params( +#' max_depth = 2, +#' eta = 0.3, +#' subsample = 0.5, +#' nthread = nthread, +#' objective = "multi:softprob", +#' num_class = nclass +#' ) #' ) #' trees0 <- seq(from = 0, by = nclass, length.out = nrounds) #' col <- rgb(0, 0, 1, 0.5) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index f5d53bb3432e..f18e17a9e1fe 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -29,7 +29,7 @@ #' splits. When this option is on, the model dump contains two additional #' values: gain is the approximate loss function gain we get in each split; #' cover is the sum of second order gradient in each node. -#' @param ... Currently not used. +#' @inheritParams xgb.train #' @return #' #' Rendered graph object which is an htmlwidget of ' class `grViz`. Similar to @@ -41,11 +41,13 @@ #' #' bst <- xgb.train( #' data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), -#' max_depth = 3, -#' eta = 1, -#' nthread = 2, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 3, +#' eta = 1, +#' nthread = 2, +#' objective = "binary:logistic" +#' ) #' ) #' #' # plot the first tree @@ -67,7 +69,7 @@ xgb.plot.tree <- function(model, plot_width = NULL, plot_height = NULL, with_stats = FALSE, ...) { - check.deprecation(...) + check.deprecation(deprecated_plottree_params, match.call(), ...) 
if (!inherits(model, "xgb.Booster")) { stop("model has to be an object of the class xgb.Booster") } diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index 195a58e4881c..855cf964b37b 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -44,11 +44,13 @@ #' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = nthread, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = nthread, +#' objective = "binary:logistic" +#' ) #' ) #' #' fname <- file.path(tempdir(), "xgb.ubj") diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index 197c0980d9ff..3c10cd9f2a5b 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -23,11 +23,13 @@ #' #' bst <- xgb.train( #' data = xgb.DMatrix(train$data, label = train$label), -#' max_depth = 2, -#' eta = 1, -#' nthread = nthread, #' nrounds = 2, -#' objective = "binary:logistic" +#' params = xgb.params( +#' max_depth = 2, +#' eta = 1, +#' nthread = nthread, +#' objective = "binary:logistic" +#' ) #' ) #' #' raw <- xgb.save.raw(bst) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index cafdde2da856..9fe8fb3c8f29 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -3,153 +3,66 @@ #' `xgb.train()` is an advanced interface for training an xgboost model. #' The [xgboost()] function is a simpler wrapper for `xgb.train()`. #' -#' @param params the list of parameters. The complete list of parameters is -#' available in the [online documentation](http://xgboost.readthedocs.io/en/latest/parameter.html). -#' Below is a shorter summary: -#' -#' **1. General Parameters** -#' -#' - `booster`: Which booster to use, can be `gbtree` or `gblinear`. Default: `gbtree`. -#' -#' **2. Booster Parameters** -#' -#' **2.1. Parameters for Tree Booster** -#' - `eta`: The learning rate: scale the contribution of each tree by a factor of `0 < eta < 1` -#' when it is added to the current approximation. -#' Used to prevent overfitting by making the boosting process more conservative. -#' Lower value for `eta` implies larger value for `nrounds`: low `eta` value means model -#' more robust to overfitting but slower to compute. Default: 0.3. -#' - `gamma`: Minimum loss reduction required to make a further partition on a leaf node of the tree. -#' the larger, the more conservative the algorithm will be. -#' - `max_depth`: Maximum depth of a tree. Default: 6. -#' - `min_child_weight`: Minimum sum of instance weight (hessian) needed in a child. -#' If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, -#' then the building process will give up further partitioning. -#' In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. -#' The larger, the more conservative the algorithm will be. Default: 1. -#' - `subsample`: Subsample ratio of the training instance. -#' Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees -#' and this will prevent overfitting. It makes computation shorter (because less data to analyse). -#' It is advised to use this parameter with `eta` and increase `nrounds`. Default: 1. -#' - `colsample_bytree`: Subsample ratio of columns when constructing each tree. Default: 1. -#' - `lambda`: L2 regularization term on weights. Default: 1. -#' - `alpha`: L1 regularization term on weights. 
(there is no L1 reg on bias because it is not important). Default: 0. -#' - `num_parallel_tree`: Experimental parameter. number of trees to grow per round. -#' Useful to test Random Forest through XGBoost. -#' (set `colsample_bytree < 1`, `subsample < 1` and `round = 1`) accordingly. -#' Default: 1. -#' - `monotone_constraints`: A numerical vector consists of `1`, `0` and `-1` with its length -#' equals to the number of features in the training data. -#' `1` is increasing, `-1` is decreasing and `0` is no constraint. -#' - `interaction_constraints`: A list of vectors specifying feature indices of permitted interactions. -#' Each item of the list represents one permitted interaction where specified features are allowed to interact with each other. -#' Feature index values should start from `0` (`0` references the first column). -#' Leave argument unspecified for no interaction constraints. -#' -#' **2.2. Parameters for Linear Booster** -#' -#' - `lambda`: L2 regularization term on weights. Default: 0. -#' - `lambda_bias`: L2 regularization term on bias. Default: 0. -#' - `alpha`: L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0. -#' -#' **3. Task Parameters** -#' -#' - `objective`: Specifies the learning task and the corresponding learning objective. -#' users can pass a self-defined function to it. The default objective options are below: -#' - `reg:squarederror`: Regression with squared loss (default). -#' - `reg:squaredlogerror`: Regression with squared log loss \eqn{1/2 \cdot (\log(pred + 1) - \log(label + 1))^2}. -#' All inputs are required to be greater than -1. -#' Also, see metric rmsle for possible issue with this objective. -#' - `reg:logistic`: Logistic regression. -#' - `reg:pseudohubererror`: Regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss. -#' - `binary:logistic`: Logistic regression for binary classification. Output probability. -#' - `binary:logitraw`: Logistic regression for binary classification, output score before logistic transformation. -#' - `binary:hinge`: Hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities. -#' - `count:poisson`: Poisson regression for count data, output mean of Poisson distribution. -#' The parameter `max_delta_step` is set to 0.7 by default in poisson regression -#' (used to safeguard optimization). -#' - `survival:cox`: Cox regression for right censored survival time data (negative values are considered right censored). -#' Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional -#' hazard function \eqn{h(t) = h_0(t) \cdot HR}. -#' - `survival:aft`: Accelerated failure time model for censored survival time data. See -#' [Survival Analysis with Accelerated Failure Time](https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html) -#' for details. -#' The parameter `aft_loss_distribution` specifies the Probability Density Function -#' used by `survival:aft` and the `aft-nloglik` metric. -#' - `multi:softmax`: Set xgboost to do multiclass classification using the softmax objective. -#' Class is represented by a number and should be from 0 to `num_class - 1`. -#' - `multi:softprob`: Same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be -#' further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging -#' to each class. 
-#' - `rank:pairwise`: Set XGBoost to do ranking task by minimizing the pairwise loss. -#' - `rank:ndcg`: Use LambdaMART to perform list-wise ranking where -#' [Normalized Discounted Cumulative Gain (NDCG)](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) is maximized. -#' - `rank:map`: Use LambdaMART to perform list-wise ranking where -#' [Mean Average Precision (MAP)](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision) -#' is maximized. -#' - `reg:gamma`: Gamma regression with log-link. Output is a mean of gamma distribution. -#' It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be -#' [gamma-distributed](https://en.wikipedia.org/wiki/Gamma_distribution#Applications). -#' - `reg:tweedie`: Tweedie regression with log-link. -#' It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be -#' [Tweedie-distributed](https://en.wikipedia.org/wiki/Tweedie_distribution#Applications). -#' -#' For custom objectives, one should pass a function taking as input the current predictions (as a numeric -#' vector or matrix) and the training data (as an `xgb.DMatrix` object) that will return a list with elements -#' `grad` and `hess`, which should be numeric vectors or matrices with number of rows matching to the numbers -#' of rows in the training data (same shape as the predictions that are passed as input to the function). -#' For multi-valued custom objectives, should have shape `[nrows, ntargets]`. Note that negative values of -#' the Hessian will be clipped, so one might consider using the expected Hessian (Fisher information) if the -#' objective is non-convex. -#' -#' See the tutorials [Custom Objective and Evaluation Metric](https://xgboost.readthedocs.io/en/stable/tutorials/custom_metric_obj.html) -#' and [Advanced Usage of Custom Objectives](https://xgboost.readthedocs.io/en/latest/tutorials/advanced_custom_obj.html) -#' for more information about custom objectives. -#' -#' - `base_score`: The initial prediction score of all instances, global bias. Default: 0.5. -#' - `eval_metric`: Evaluation metrics for validation data. -#' Users can pass a self-defined function to it. -#' Default: metric will be assigned according to objective -#' (rmse for regression, and error for classification, mean average precision for ranking). -#' List is provided in detail section. +#' @param params List of XGBoost parameters which control the model building process. +#' See the [online documentation](http://xgboost.readthedocs.io/en/latest/parameter.html) +#' and the documentation for [xgb.params()] for details. +#' +#' Should be passed as list with named entries. Parameters that are not specified in this +#' list will use their default values. +#' +#' A list of named parameters can be created through the function [xgb.params()], which +#' accepts all valid parameters as function arguments. #' @param data Training dataset. `xgb.train()` accepts only an `xgb.DMatrix` as the input. -#' [xgboost()], in addition, also accepts `matrix`, `dgCMatrix`, or name of a local data file. +#' +#' Note that there is a function [xgboost()] which is meant to accept R data objects +#' as inputs, such as data frames and matrices. #' @param nrounds Max number of boosting iterations. #' @param evals Named list of `xgb.DMatrix` datasets to use for evaluating model performance. 
-#' Metrics specified in either `eval_metric` or `feval` will be computed for each -#' of these datasets during each boosting iteration, and stored in the end as a field named -#' `evaluation_log` in the resulting object. When either `verbose>=1` or -#' [xgb.cb.print.evaluation()] callback is engaged, the performance results are continuously -#' printed out during the training. +#' Metrics specified in either `eval_metric` (under params) or `custom_metric` (function +#' argument here) will be computed for each of these datasets during each boosting iteration, +#' and stored in the end as a field named `evaluation_log` in the resulting object. +#' +#' When either `verbose>=1` or [xgb.cb.print.evaluation()] callback is engaged, the performance +#' results are continuously printed out during the training. +#' #' E.g., specifying `evals=list(validation1=mat1, validation2=mat2)` allows to track -#' the performance of each round's model on mat1 and mat2. -#' @param obj Customized objective function. Should take two arguments: the first one will be the +#' the performance of each round's model on `mat1` and `mat2`. +#' @param objective Customized objective function. Should take two arguments: the first one will be the #' current predictions (either a numeric vector or matrix depending on the number of targets / classes), #' and the second one will be the `data` DMatrix object that is used for training. #' #' It should return a list with two elements `grad` and `hess` (in that order), as either #' numeric vectors or numeric matrices depending on the number of targets / classes (same #' dimension as the predictions that are passed as first argument). -#' @param feval Customized evaluation function. Just like `obj`, should take two arguments, with -#' the first one being the predictions and the second one the `data` DMatrix. +#' @param custom_metric Customized evaluation function. Just like `objective`, should take two arguments, +#' with the first one being the predictions and the second one the `data` DMatrix. #' #' Should return a list with two elements `metric` (name that will be displayed for this metric, #' should be a string / character), and `value` (the number that the function calculates, should #' be a numeric scalar). #' -#' Note that even if passing `feval`, objectives also have an associated default metric that +#' Note that even if passing `custom_metric`, objectives also have an associated default metric that #' will be evaluated in addition to it. In order to disable the built-in metric, one can pass #' parameter `disable_default_eval_metric = TRUE`. #' @param verbose If 0, xgboost will stay silent. If 1, it will print information about performance. #' If 2, some additional information will be printed out. #' Note that setting `verbose > 0` automatically engages the #' `xgb.cb.print.evaluation(period=1)` callback function. -#' @param print_every_n Print each nth iteration evaluation messages when `verbose>0`. -#' Default is 1 which means all messages are printed. This parameter is passed to the -#' [xgb.cb.print.evaluation()] callback. -#' @param early_stopping_rounds If `NULL`, the early stopping function is not triggered. -#' If set to an integer `k`, training with a validation set will stop if the performance -#' doesn't improve for `k` rounds. Setting this parameter engages the [xgb.cb.early.stop()] callback. 
+#' @param print_every_n When passing `verbose>0`, evaluation logs (metrics calculated on the +#' data passed under `evals`) will be printed every nth iteration according to the value passed +#' here. The first and last iteration are always included regardless of this 'n'. +#' +#' Only has an effect when passing data under `evals` and when passing `verbose>0`. The parameter +#' is passed to the [xgb.cb.print.evaluation()] callback. +#' @param early_stopping_rounds Number of boosting rounds after which training will be stopped +#' if there is no improvement in performance (as measured by the evaluation metric that is +#' supplied or selected by default for the objective) on the evaluation data passed under +#' `evals`. +#' +#' Must pass `evals` in order to use this functionality. Setting this parameter adds the +#' [xgb.cb.early.stop()] callback. +#' +#' If `NULL`, early stopping will not be used. #' @param maximize If `feval` and `early_stopping_rounds` are set, then this parameter must be set as well. #' When it is `TRUE`, it means the larger the evaluation score the better. #' This parameter is passed to the [xgb.cb.early.stop()] callback. @@ -168,44 +81,27 @@ #' such as an evaluation log (a `data.table` object) - be aware that these objects are kept #' as R attributes, and thus do not get saved when using XGBoost's own serializaters like #' [xgb.save()] (but are kept when using R serializers like [saveRDS()]). -#' @param ... other parameters to pass to `params`. +#' @param ... Not used. +#' -#' @return An object of class `xgb.Booster`. +#' Some arguments are currently deprecated or have been renamed. If a deprecated argument +#' is passed, will throw a warning and use its current equivalent. +#' +#' If some additional argument is passed that is neither a current function argument nor +#' a deprecated argument, an error will be thrown. +#' @return An object of class `xgb.Booster`. #' @details -#' These are the training functions for [xgboost()]. -#' -#' The `xgb.train()` interface supports advanced features such as `evals`, -#' customized objective and evaluation metric functions, therefore it is more flexible -#' than the [xgboost()] interface. +#' Compared to [xgboost()], the `xgb.train()` interface supports advanced features such as +#' `evals`, customized objective and evaluation metric functions, among others, with the +#' difference that these work with `xgb.DMatrix` objects and do not follow typical R idioms. #' #' Parallelization is automatically enabled if OpenMP is present. #' Number of threads can also be manually specified via the `nthread` parameter. #' -#' While in other interfaces, the default random seed defaults to zero, in R, if a parameter `seed` +#' While in other XGBoost language bindings the default random seed is zero, in R, if a parameter `seed` #' is not manually supplied, it will generate a random seed through R's own random number generator, #' whose seed in turn is controllable through `set.seed`. If `seed` is passed, it will override the #' RNG from R. #' -#' The evaluation metric is chosen automatically by XGBoost (according to the objective) -#' when the `eval_metric` parameter is not provided. -#' User may set one or several `eval_metric` parameters. -#' Note that when using a customized metric, only this single metric can be used. -#' The following is the list of built-in metrics for which XGBoost provides optimized implementation: -#' - `rmse`: Root mean square error.
\url{https://en.wikipedia.org/wiki/Root_mean_square_error} -#' - `logloss`: Negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood} -#' - `mlogloss`: Multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html} -#' - `error`: Binary classification error rate. It is calculated as `(# wrong cases) / (# all cases)`. -#' By default, it uses the 0.5 threshold for predicted values to define negative and positive instances. -#' Different threshold (e.g., 0.) could be specified as `error@0`. -#' - `merror`: Multiclass classification error rate. It is calculated as `(# wrong cases) / (# all cases)`. -#' - `mae`: Mean absolute error. -#' - `mape`: Mean absolute percentage error. -#' - `auc`: Area under the curve. -#' \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. -#' - `aucpr`: Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation. -#' - `ndcg`: Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG} -#' #' The following callbacks are automatically created when certain parameters are set: #' - [xgb.cb.print.evaluation()] is turned on when `verbose > 0` and the `print_every_n` #' parameter is passed to it. @@ -226,7 +122,7 @@ #' is assigned from the `params` argument to this function, and is only meant to serve as a #' reference for what went into the booster, but is not used in other methods that take a booster #' object - so for example, changing the booster's configuration requires calling `xgb.config<-` -#' or `xgb.parameters<-`, while simply modifying `attributes(model)$params$<...>` will have no +#' or `xgb.model.parameters<-`, while simply modifying `attributes(model)$params$<...>` will have no #' effect elsewhere. #' #' @seealso [xgb.Callback()], [predict.xgb.Booster()], [xgb.cv()] @@ -252,7 +148,7 @@ #' evals <- list(train = dtrain, eval = dtest) #' #' ## A simple xgb.train example: -#' param <- list( +#' param <- xgb.params( #' max_depth = 2, #' eta = 1, #' nthread = nthread, @@ -276,9 +172,9 @@ #' return(list(metric = "error", value = err)) #' } #' -#' # These functions could be used by passing them either: -#' # as 'objective' and 'eval_metric' parameters in the params list: -#' param <- list( +#' # These functions could be used by passing them as 'objective' and +#' # 'eval_metric' parameters in the params list: +#' param <- xgb.params( #' max_depth = 2, #' eta = 1, #' nthread = nthread, @@ -287,26 +183,16 @@ #' ) #' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0) #' -#' # or through the ... arguments: -#' param <- list(max_depth = 2, eta = 1, nthread = nthread) -#' bst <- xgb.train( -#' param, -#' dtrain, -#' nrounds = 2, -#' evals = evals, -#' verbose = 0, -#' objective = logregobj, -#' eval_metric = evalerror -#' ) -#' -#' # or as dedicated 'obj' and 'feval' parameters of xgb.train: +#' # ... 
or as dedicated 'objective' and 'custom_metric' parameters of xgb.train: #' bst <- xgb.train( -#' param, dtrain, nrounds = 2, evals = evals, obj = logregobj, feval = evalerror +#' within(param, rm("objective", "eval_metric")), +#' dtrain, nrounds = 2, evals = evals, +#' objective = logregobj, custom_metric = evalerror #' ) #' #' #' ## An xgb.train example of using variable learning rates at each iteration: -#' param <- list( +#' param <- xgb.params( #' max_depth = 2, #' eta = 1, #' nthread = nthread, @@ -328,30 +214,21 @@ #' bst <- xgb.train( #' param, dtrain, nrounds = 25, evals = evals, early_stopping_rounds = 3 #' ) -#' -#' ## An 'xgboost' interface example: -#' bst <- xgboost( -#' x = agaricus.train$data, -#' y = factor(agaricus.train$label), -#' params = list(max_depth = 2, eta = 1), -#' nthread = nthread, -#' nrounds = 2 -#' ) -#' pred <- predict(bst, agaricus.test$data) -#' #' @export -xgb.train <- function(params = list(), data, nrounds, evals = list(), - obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L, +xgb.train <- function(params = xgb.params(), data, nrounds, evals = list(), + objective = NULL, custom_metric = NULL, verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, save_period = NULL, save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...) { + check.deprecation(deprecated_train_params, match.call(), ...) - check.deprecation(...) - - params <- check.booster.params(params, ...) - - check.custom.obj() - check.custom.eval() + params <- check.booster.params(params) + tmp <- check.custom.obj(params, objective) + params <- tmp$params + objective <- tmp$objective + tmp <- check.custom.eval(params, custom_metric, maximize, early_stopping_rounds, callbacks) + params <- tmp$params + custom_metric <- tmp$custom_metric # data & evals checks dtrain <- data @@ -456,7 +333,7 @@ xgb.train <- function(params = list(), data, nrounds, evals = list(), bst = bst, dtrain = dtrain, iter = iteration - 1, - obj = obj + objective = objective ) bst_evaluation <- NULL @@ -465,7 +342,7 @@ xgb.train <- function(params = list(), data, nrounds, evals = list(), bst = bst, evals = evals, iter = iteration - 1, - feval = feval + custom_metric = custom_metric ) } @@ -512,3 +389,435 @@ xgb.train <- function(params = list(), data, nrounds, evals = list(), return(bst) } + +# nolint start: line_length_linter. +#' @title XGBoost Parameters +#' @description Convenience function to generate a list of named XGBoost parameters, which +#' can be passed as argument `params` to [xgb.train()]. See the [online documentation]( +#' https://xgboost.readthedocs.io/en/stable/parameter.html) for more details. +#' +#' The purpose of this function is to enable IDE autocompletions and to provide in-package +#' documentation for all the possible parameters that XGBoost accepts. The output from this +#' function is just a regular R list containing the parameters that were set to non-default +#' values. Note that this function will not perform any validation on the supplied arguments. +#' +#' If passing `NULL` for a given parameter (the default for all of them), then the default +#' value for that parameter will be used. Default values are automatically determined by the +#' XGBoost core library upon calls to [xgb.train()] or [xgb.cv()], and are subject to change +#' over XGBoost library versions. +#' @return A list with the entries that were passed non-NULL values. It is intended to +#' be passed as argument `params` to [xgb.train()] or [xgb.cv()]. 
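
A minimal sketch of the calling convention introduced above: parameters are assembled with the new `xgb.params()` helper and custom functions go through the renamed `objective`/`custom_metric` arguments (formerly `obj`/`feval`). The `logregobj`/`evalerror` helpers below simply mirror the examples already in the roxygen docs; this is illustrative, not part of the patch:

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)

# Built-in objective, parameters collected with xgb.params()
params <- xgb.params(max_depth = 2, eta = 1, nthread = 1, objective = "binary:logistic")
bst <- xgb.train(params = params, data = dtrain, nrounds = 2, verbose = 0)

# Custom objective / metric now passed as 'objective' and 'custom_metric'
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  list(grad = preds - labels, hess = preds * (1 - preds))
}
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  list(metric = "error", value = mean((preds > 0) != labels))
}
bst2 <- xgb.train(
  params = xgb.params(max_depth = 2, eta = 1, nthread = 1),
  data = dtrain, nrounds = 2, evals = list(train = dtrain),
  objective = logregobj, custom_metric = evalerror, verbose = 0
)
```

Note that the patch's own example strips `objective`/`eval_metric` from `params` before passing the R functions; the sketch does the same by simply not setting them there.
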
+#' @export +#' @param objective (default=`"reg:squarederror"`) +#' Specify the learning task and the corresponding learning objective or a custom objective function to be used. +#' +#' For custom objective, see [Custom Objective and Evaluation Metric](https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html) +#' and [Custom objective and metric](https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html#custom-obj-metric) for more information, +#' along with the end note for function signatures. +#' +#' Supported values are: +#' - `"reg:squarederror"`: regression with squared loss. +#' - `"reg:squaredlogerror"`: regression with squared log loss \eqn{\frac{1}{2}[log(pred + 1) - log(label + 1)]^2}. All input labels are required to be greater than -1. Also, see metric `rmsle` for possible issue with this objective. +#' - `"reg:logistic"`: logistic regression, output probability +#' - `"reg:pseudohubererror"`: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss. +#' - `"reg:absoluteerror"`: Regression with L1 error. When tree model is used, leaf value is refreshed after tree construction. If used in distributed training, the leaf value is calculated as the mean value from all workers, which is not guaranteed to be optimal. +#' +#' Version added: 1.7.0 +#' - `"reg:quantileerror"`: Quantile loss, also known as "pinball loss". See later sections for its parameter and [Quantile Regression](https://xgboost.readthedocs.io/en/latest/python/examples/quantile_regression.html#sphx-glr-python-examples-quantile-regression-py) for a worked example. +#' +#' Version added: 2.0.0 +#' - `"binary:logistic"`: logistic regression for binary classification, output probability +#' - `"binary:logitraw"`: logistic regression for binary classification, output score before logistic transformation +#' - `"binary:hinge"`: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities. +#' - `"count:poisson"`: Poisson regression for count data, output mean of Poisson distribution. #' `"max_delta_step"` is set to 0.7 by default in Poisson regression (used to safeguard optimization) +#' - `"survival:cox"`: Cox regression for right censored survival time data (negative values are considered right censored). +#' +#' Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional hazard function `h(t) = h0(t) * HR`). +#' - `"survival:aft"`: Accelerated failure time model for censored survival time data. +#' See [Survival Analysis with Accelerated Failure Time](https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html) for details. +#' - `"multi:softmax"`: set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes) +#' - `"multi:softprob"`: same as softmax, but output a vector of `ndata * nclass`, which can be further reshaped to `ndata * nclass` matrix. The result contains predicted probability of each data point belonging to each class. +#' - `"rank:ndcg"`: Use LambdaMART to perform pair-wise ranking where [Normalized Discounted Cumulative Gain (NDCG)](http://en.wikipedia.org/wiki/NDCG) is maximized. This objective supports position debiasing for click data. 
+#' - `"rank:map"`: Use LambdaMART to perform pair-wise ranking where [Mean Average Precision (MAP)](http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision) is maximized +#' - `"rank:pairwise"`: Use LambdaRank to perform pair-wise ranking using the `ranknet` objective. +#' - `"reg:gamma"`: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be [gamma-distributed](https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications). +#' - `"reg:tweedie"`: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be [Tweedie-distributed](https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications). +#' @param verbosity (default=1) +#' Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 +#' (debug). Sometimes XGBoost tries to change configurations based on heuristics, which +#' is displayed as warning message. If there's unexpected behaviour, please try to +#' increase value of verbosity. +#' @param nthread (default to maximum number of threads available if not set) +#' Number of parallel threads used to run XGBoost. When choosing it, please keep thread +#' contention and hyperthreading in mind. +#' @param seed Random number seed. If not specified, will take a random seed through R's own RNG engine. +#' @param booster (default= `"gbtree"`) +#' Which booster to use. Can be `"gbtree"`, `"gblinear"` or `"dart"`; `"gbtree"` and `"dart"` use tree based models while `"gblinear"` uses linear functions. +#' @param eta,learning_rate (two aliases for the same parameter) (for Tree Booster) (default=0.3) +#' Step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and `eta` shrinks the feature weights to make the boosting process more conservative. +#' +#' range: \eqn{[0,1]} +#' +#' Note: should only pass one of `eta` or `learning_rate`. Both refer to the same parameter and there's thus no difference between one or the other. +#' @param gamma,min_split_loss (two aliases for the same parameter) (for Tree Booster) (default=0, alias: `gamma`) +#' Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger `min_split_loss` is, the more conservative the algorithm will be. Note that a tree where no splits were made might still contain a single terminal node with a non-zero score. +#' +#' range: \eqn{[0, \infty)} +#' +#' Note: should only pass one of `gamma` or `min_split_loss`. Both refer to the same parameter and there's thus no difference between one or the other. +#' @param max_depth (for Tree Booster) (default=6) +#' Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 indicates no limit on depth. Beware that XGBoost aggressively consumes memory when training a deep tree. `"exact"` tree method requires non-zero value. +#' +#' range: \eqn{[0, \infty)} +#' @param min_child_weight (for Tree Booster) (default=1) +#' Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than `min_child_weight`, then the building process will give up further partitioning. In linear regression task, this simply corresponds to minimum number of instances needed to be in each node. 
The larger `min_child_weight` is, the more conservative the algorithm will be. +#' +#' range: \eqn{[0, \infty)} +#' @param max_delta_step (for Tree Booster) (default=0) +#' Maximum delta step we allow each leaf output to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update. +#' +#' range: \eqn{[0, \infty)} +#' @param subsample (for Tree Booster) (default=1) +#' Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. and this will prevent overfitting. Subsampling will occur once in every boosting iteration. +#' +#' range: \eqn{(0,1]} +#' @param sampling_method (for Tree Booster) (default= `"uniform"`) +#' The method to use to sample the training instances. +#' - `"uniform"`: each training instance has an equal probability of being selected. Typically set +#' `"subsample"` >= 0.5 for good results. +#' - `"gradient_based"`: the selection probability for each training instance is proportional to the +#' \bold{regularized absolute value} of gradients (more specifically, \eqn{\sqrt{g^2+\lambda h^2}}). +#' `"subsample"` may be set to as low as 0.1 without loss of model accuracy. Note that this +#' sampling method is only supported when `"tree_method"` is set to `"hist"` and the device is `"cuda"`; other tree +#' methods only support `"uniform"` sampling. +#' @param colsample_bytree,colsample_bylevel,colsample_bynode (for Tree Booster) (default=1) +#' This is a family of parameters for subsampling of columns. +#' - All `"colsample_by*"` parameters have a range of \eqn{(0, 1]}, the default value of 1, and specify the fraction of columns to be subsampled. +#' - `"colsample_bytree"` is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed. +#' - `"colsample_bylevel"` is the subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree. +#' - `"colsample_bynode"` is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level. This is not supported by the exact tree method. +#' - `"colsample_by*"` parameters work cumulatively. For instance, +#' the combination `{'colsample_bytree'=0.5, 'colsample_bylevel'=0.5, 'colsample_bynode'=0.5}` with 64 features will leave 8 features to choose from at +#' each split. +#' +#' One can set the `"feature_weights"` for DMatrix to +#' define the probability of each feature being selected when using column sampling. +#' @param lambda,reg_lambda (two aliases for the same parameter) +#' +#' - For tree-based boosters: +#' - L2 regularization term on weights. Increasing this value will make model more conservative. +#' - default: 1 +#' - range: \eqn{[0, \infty]} +#' - For linear booster: +#' - L2 regularization term on weights. Increasing this value will make model more conservative. Normalised to number of training examples. +#' - default: 0 +#' - range: \eqn{[0, \infty)} +#' +#' Note: should only pass one of `lambda` or `reg_lambda`. 
Both refer to the same parameter and there's thus no difference between one or the other. +#' @param alpha,reg_alpha (two aliases for the same parameter) +#' - L1 regularization term on weights. Increasing this value will make model more conservative. +#' - For the linear booster, it's normalised to number of training examples. +#' - default: 0 +#' - range: \eqn{[0, \infty)} +#' +#' Note: should only pass one of `alpha` or `reg_alpha`. Both refer to the same parameter and there's thus no difference between one or the other. +#' @param tree_method (for Tree Booster) (default= `"auto"`) +#' The tree construction algorithm used in XGBoost. See description in the [reference paper](http://arxiv.org/abs/1603.02754) and [Tree Methods](https://xgboost.readthedocs.io/en/latest/treemethod.html). +#' +#' Choices: `"auto"`, `"exact"`, `"approx"`, `"hist"`, this is a combination of commonly +#' used updaters. For other updaters like `"refresh"`, set the parameter `updater` +#' directly. +#' - `"auto"`: Same as the `"hist"` tree method. +#' - `"exact"`: Exact greedy algorithm. Enumerates all split candidates. +#' - `"approx"`: Approximate greedy algorithm using quantile sketch and gradient histogram. +#' - `"hist"`: Faster histogram optimized approximate greedy algorithm. +#' @param scale_pos_weight (for Tree Booster) (default=1) +#' Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: `sum(negative instances) / sum(positive instances)`. See [Parameters Tuning](https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html) for more discussion. Also, see Higgs Kaggle competition demo for examples: [R](https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R), [py1](https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py), [py2](https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py), [py3](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py). +#' @param updater Has different meanings depending on the type of booster. +#' +#' - For tree-based boosters: +#' A comma separated string defining the sequence of tree updaters to run, providing a modular way to construct and to modify the trees. This is an advanced parameter that is usually set automatically, depending on some other parameters. However, it could be also set explicitly by a user. The following updaters exist: +#' - `"grow_colmaker"`: non-distributed column-based construction of trees. +#' - `"grow_histmaker"`: distributed tree construction with row-based data splitting based on global proposal of histogram counting. +#' - `"grow_quantile_histmaker"`: Grow tree using quantized histogram. +#' - `"grow_gpu_hist"`: Enabled when `tree_method` is set to `"hist"` along with `device="cuda"`. +#' - `"grow_gpu_approx"`: Enabled when `tree_method` is set to `"approx"` along with `device="cuda"`. +#' - `"sync"`: synchronizes trees in all distributed nodes. +#' - `"refresh"`: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed. +#' - `"prune"`: prunes the splits where loss < `min_split_loss` (or `gamma`) and nodes that have depth greater than `max_depth`. +#' +#' - For `booster="gblinear"`: +#' (default= `"shotgun"`) Choice of algorithm to fit linear model +#' - `"shotgun"`: Parallel coordinate descent algorithm based on shotgun algorithm. 
Uses 'hogwild' parallelism and therefore produces a nondeterministic solution on each run. +#' - `"coord_descent"`: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution. When the `device` parameter is set to `"cuda"` or `"gpu"`, a GPU variant would be used. +#' @param refresh_leaf (for Tree Booster) (default=1) +#' This is a parameter of the `"refresh"` updater. When this flag is 1, tree leafs as well as tree nodes' stats are updated. When it is 0, only node stats are updated. +#' @param grow_policy (for Tree Booster) (default= `"depthwise"`) +#' - Controls a way new nodes are added to the tree. +#' - Currently supported only if `tree_method` is set to `"hist"` or `"approx"`. +#' - Choices: `"depthwise"`, `"lossguide"` +#' - `"depthwise"`: split at nodes closest to the root. +#' - `"lossguide"`: split at nodes with highest loss change. +#' @param max_leaves (for Tree Booster) (default=0) +#' Maximum number of nodes to be added. Not used by `"exact"` tree method. +#' @param max_bin (for Tree Booster) (default=256) +#' - Only used if `tree_method` is set to `"hist"` or `"approx"`. +#' - Maximum number of discrete bins to bucket continuous features. +#' - Increasing this number improves the optimality of splits at the cost of higher computation time. +#' @param num_parallel_tree (for Tree Booster) (default=1) +#' Number of parallel trees constructed during each iteration. This option is used to support boosted random forest. +#' @param monotone_constraints (for Tree Booster) +#' Constraint of variable monotonicity. See [Monotonic Constraints](https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html) for more information. +#' @param interaction_constraints (for Tree Booster) +#' Constraints for interaction representing permitted interactions. The constraints must +#' be specified in the form of a nest list, e.g. `list(c(0, 1), c(2, 3, 4))`, where each inner +#' list is a group of indices of features (base-0 numeration) that are allowed to interact with each other. +#' See [Feature Interaction Constraints](https://xgboost.readthedocs.io/en/latest/tutorials/feature_interaction_constraint.html) for more information. +#' @param multi_strategy (for Tree Booster) (default = `"one_output_per_tree"`) +#' The strategy used for training multi-target models, including multi-target regression +#' and multi-class classification. See [Multiple Outputs](https://xgboost.readthedocs.io/en/latest/tutorials/multioutput.html) for more information. +#' - `"one_output_per_tree"`: One model for each target. +#' - `"multi_output_tree"`: Use multi-target trees. +#' +#' Version added: 2.0.0 +#' +#' Note: This parameter is working-in-progress. +#' @param base_score +#' - The initial prediction score of all instances, global bias +#' - The parameter is automatically estimated for selected objectives before training. To +#' disable the estimation, specify a real number argument. +#' - If `base_margin` is supplied, `base_score` will not be added. +#' - For sufficient number of iterations, changing this value will not have too much effect. +#' @param eval_metric (default according to objective) +#' - Evaluation metrics for validation data, a default metric will be assigned according to objective (rmse for regression, and logloss for classification, `mean average precision` for ``rank:map``, etc.) +#' - User can add multiple evaluation metrics. 
+#' - The choices are listed below: +#' - `"rmse"`: [root mean square error](http://en.wikipedia.org/wiki/Root_mean_square_error) +#' - `"rmsle"`: root mean square log error: \eqn{\sqrt{\frac{1}{N}[log(pred + 1) - log(label + 1)]^2}}. Default metric of `"reg:squaredlogerror"` objective. This metric reduces errors generated by outliers in dataset. But because `log` function is employed, `"rmsle"` might output `nan` when prediction value is less than -1. See `"reg:squaredlogerror"` for other requirements. +#' - `"mae"`: [mean absolute error](https://en.wikipedia.org/wiki/Mean_absolute_error) +#' - `"mape"`: [mean absolute percentage error](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error) +#' - `"mphe"`: [mean Pseudo Huber error](https://en.wikipedia.org/wiki/Huber_loss). Default metric of `"reg:pseudohubererror"` objective. +#' - `"logloss"`: [negative log-likelihood](http://en.wikipedia.org/wiki/Log-likelihood) +#' - `"error"`: Binary classification error rate. It is calculated as `#(wrong cases)/#(all cases)`. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. +#' - `"error@t"`: a different than 0.5 binary classification threshold value could be specified by providing a numerical value through 't'. +#' - `"merror"`: Multiclass classification error rate. It is calculated as `#(wrong cases)/#(all cases)`. +#' - `"mlogloss"`: [Multiclass logloss](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html). +#' - `"auc"`: [Receiver Operating Characteristic Area under the Curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve). +#' Available for classification and learning-to-rank tasks. +#' - When used with binary classification, the objective should be `"binary:logistic"` or similar functions that work on probability. +#' - When used with multi-class classification, objective should be `"multi:softprob"` instead of `"multi:softmax"`, as the latter doesn't output probability. Also the AUC is calculated by 1-vs-rest with reference class weighted by class prevalence. +#' - When used with LTR task, the AUC is computed by comparing pairs of documents to count correctly sorted pairs. This corresponds to pairwise learning to rank. The implementation has some issues with average AUC around groups and distributed workers not being well-defined. +#' - On a single machine the AUC calculation is exact. In a distributed environment the AUC is a weighted average over the AUC of training rows on each node - therefore, distributed AUC is an approximation sensitive to the distribution of data across workers. Use another metric in distributed environments if precision and reproducibility are important. +#' - When input dataset contains only negative or positive samples, the output is `NaN`. The behavior is implementation defined, for instance, `scikit-learn` returns \eqn{0.5} instead. +#' - `"aucpr"`: [Area under the PR curve](https://en.wikipedia.org/wiki/Precision_and_recall). +#' Available for classification and learning-to-rank tasks. +#' +#' After XGBoost 1.6, both of the requirements and restrictions for using `"aucpr"` in classification problem are similar to `"auc"`. For ranking task, only binary relevance label \eqn{y \in [0, 1]} is supported. Different from `"map"` (mean average precision), `"aucpr"` calculates the *interpolated* area under precision recall curve using continuous interpolation. 
+#' +#' - `"pre"`: Precision at \eqn{k}. Supports only learning to rank task. +#' - `"ndcg"`: [Normalized Discounted Cumulative Gain](http://en.wikipedia.org/wiki/NDCG) +#' - `"map"`: [Mean Average Precision](http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision) +#' +#' The `average precision` is defined as: +#' +#' \eqn{AP@l = \frac{1}{min{(l, N)}}\sum^l_{k=1}P@k \cdot I_{(k)}} +#' +#' where \eqn{I_{(k)}} is an indicator function that equals to \eqn{1} when the document at \eqn{k} is relevant and \eqn{0} otherwise. The \eqn{P@k} is the precision at \eqn{k}, and \eqn{N} is the total number of relevant documents. Lastly, the `mean average precision` is defined as the weighted average across all queries. +#' +#' - `"ndcg@n"`, `"map@n"`, `"pre@n"`: \eqn{n} can be assigned as an integer to cut off the top positions in the lists for evaluation. +#' - `"ndcg-"`, `"map-"`, `"ndcg@n-"`, `"map@n-"`: In XGBoost, the NDCG and MAP evaluate the score of a list without any positive samples as \eqn{1}. By appending "-" to the evaluation metric name, we can ask XGBoost to evaluate these scores as \eqn{0} to be consistent under some conditions. +#' - `"poisson-nloglik"`: negative log-likelihood for Poisson regression +#' - `"gamma-nloglik"`: negative log-likelihood for gamma regression +#' - `"cox-nloglik"`: negative partial log-likelihood for Cox proportional hazards regression +#' - `"gamma-deviance"`: residual deviance for gamma regression +#' - `"tweedie-nloglik"`: negative log-likelihood for Tweedie regression (at a specified value of the `tweedie_variance_power` parameter) +#' - `"aft-nloglik"`: Negative log likelihood of Accelerated Failure Time model. +#' See [Survival Analysis with Accelerated Failure Time](https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html) for details. +#' - `"interval-regression-accuracy"`: Fraction of data points whose predicted labels fall in the interval-censored labels. +#' Only applicable for interval-censored data. See [Survival Analysis with Accelerated Failure Time](https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html) for details. +#' @param seed_per_iteration (default= `FALSE`) +#' Seed PRNG deterministically via iterator number. +#' @param device (default= `"cpu"`) +#' Device for XGBoost to run. User can set it to one of the following values: +#' - `"cpu"`: Use CPU. +#' - `"cuda"`: Use a GPU (CUDA device). +#' - `"cuda:<ordinal>"`: `<ordinal>` is an integer that specifies the ordinal of the GPU (which GPU do you want to use if you have more than one device). +#' - `"gpu"`: Default GPU device selection from the list of available and supported devices. Only `"cuda"` devices are supported currently. +#' - `"gpu:<ordinal>"`: Default GPU device selection from the list of available and supported devices. Only `"cuda"` devices are supported currently. +#' +#' For more information about GPU acceleration, see [XGBoost GPU Support](https://xgboost.readthedocs.io/en/latest/gpu/index.html). In distributed environments, ordinal selection is handled by distributed frameworks instead of XGBoost. As a result, using `"cuda:<ordinal>"` will result in an error. Use `"cuda"` instead. +#' +#' Version added: 2.0.0 +#' +#' Note: if XGBoost was installed from CRAN, it won't have GPU support enabled, thus only `"cpu"` will be available. +#' To get GPU support, the R package for XGBoost must be installed from source or from the GitHub releases - see +#' [instructions](https://xgboost.readthedocs.io/en/latest/install.html#r).
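
To make the metric and device options above concrete, here is a hedged sketch (assuming the `xgb.params()` helper and the `xgb.cv()` signature from this patch; the agaricus data is the same toy dataset used throughout the package examples, and `device = "cuda"` would only apply to a GPU-enabled build):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)

params <- xgb.params(
  objective   = "binary:logistic",
  eval_metric = "error@0.6",   # thresholded binary error, as documented above
  nthread     = 1,
  device      = "cpu"          # "cuda" would target a GPU build, if available
)

# xgb.cv() can append further metrics through its 'metrics' argument
cv <- xgb.cv(
  params  = params,
  data    = dtrain,
  nrounds = 3,
  nfold   = 3,
  metrics = list("auc", "logloss"),
  verbose = FALSE
)
print(cv)
```
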
+#' @param disable_default_eval_metric (default= `FALSE`)
+#' Flag to disable the default metric. Set to 1 or `TRUE` to disable.
+#' @param use_rmm Whether to use RAPIDS Memory Manager (RMM) to allocate cache GPU
+#' memory. The primary memory is always allocated on the RMM pool when XGBoost is built
+#' (compiled) with the RMM plugin enabled. Valid values are `TRUE` and `FALSE`. See
+#' [Using XGBoost with RAPIDS Memory Manager (RMM) plugin](https://xgboost.readthedocs.io/en/latest/python/rmm-examples/index.html) for details.
+#' @param max_cached_hist_node (for Non-Exact Tree Methods) (default = 65536)
+#' Maximum number of cached nodes for histogram. This can be used with the `"hist"` and the
+#' `"approx"` tree methods.
+#'
+#' Version added: 2.0.0
+#'
+#' - For most cases, this parameter should not be set, except when growing deep
+#' trees. After 3.0, this parameter affects GPU algorithms as well.
+#' @param extmem_single_page (for Non-Exact Tree Methods) (default = `FALSE`)
+#' This parameter is only used for the `"hist"` tree method with `device="cuda"` and
+#' `subsample != 1.0`. Before 3.0, pages were always concatenated.
+#'
+#' Version added: 3.0.0
+#'
+#' Whether the GPU-based `"hist"` tree method should concatenate the training data into a
+#' single batch instead of fetching data on-demand when external memory is used. For GPU
+#' devices that don't support address translation services, external memory training is
+#' expensive. This parameter can be used in combination with subsampling to reduce overall
+#' memory usage without significant overhead. See [Using XGBoost External Memory Version](https://xgboost.readthedocs.io/en/latest/tutorials/external_memory.html) for
+#' more information.
+#' @param max_cat_to_onehot (for Non-Exact Tree Methods)
+#' A threshold for deciding whether XGBoost should use a one-hot encoding based split for
+#' categorical data. When the number of categories is less than the threshold, one-hot
+#' encoding is chosen; otherwise, the categories will be partitioned into children nodes.
+#'
+#' Version added: 1.6.0
+#' @param max_cat_threshold (for Non-Exact Tree Methods)
+#' Maximum number of categories considered for each split. Used only by partition-based
+#' splits to prevent over-fitting.
+#'
+#' Version added: 1.7.0
+#' @param sample_type (for Dart Booster) (default= `"uniform"`)
+#' Type of sampling algorithm.
+#' - `"uniform"`: dropped trees are selected uniformly.
+#' - `"weighted"`: dropped trees are selected in proportion to weight.
+#' @param normalize_type (for Dart Booster) (default= `"tree"`)
+#' Type of normalization algorithm.
+#' - `"tree"`: new trees have the same weight as each of the dropped trees.
+#' - The weight of new trees is `1 / (k + learning_rate)`.
+#' - Dropped trees are scaled by a factor of `k / (k + learning_rate)`.
+#' - `"forest"`: new trees have the same weight as the sum of the dropped trees (forest).
+#' - The weight of new trees is `1 / (1 + learning_rate)`.
+#' - Dropped trees are scaled by a factor of `1 / (1 + learning_rate)`.
+#' @param rate_drop (for Dart Booster) (default=0.0)
+#' Dropout rate (a fraction of previous trees to drop during the dropout).
+#'
+#' range: \eqn{[0.0, 1.0]}
+#' @param one_drop (for Dart Booster) (default=0)
+#' When this flag is enabled, at least one tree is always dropped during the dropout (allows Binomial-plus-one or epsilon-dropout from the original DART paper).
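[Editor's note: for illustration only, the DART-specific options above would typically be combined through `xgb.params()` along these lines; this sketch is not part of the diff.]

```r
# Sketch: configuring the DART booster options described above
dart_params <- xgb.params(
  booster = "dart",
  objective = "binary:logistic",
  sample_type = "weighted",    # drop trees in proportion to their weight
  normalize_type = "forest",   # weight new trees like the sum of dropped trees
  rate_drop = 0.1,
  one_drop = TRUE,             # always drop at least one tree
  nthread = 1
)
# The resulting list is then passed as xgb.train(params = dart_params, ...)
```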
+#' @param skip_drop (for Dart Booster) (default=0.0)
+#' Probability of skipping the dropout procedure during a boosting iteration.
+#' - If a dropout is skipped, new trees are added in the same manner as `"gbtree"`.
+#' - Note that non-zero `skip_drop` has higher priority than `rate_drop` or `one_drop`.
+#'
+#' range: \eqn{[0.0, 1.0]}
+#' @param feature_selector (for Linear Booster) (default= `"cyclic"`)
+#' Feature selection and ordering method.
+#' - `"cyclic"`: Deterministic selection by cycling through features one at a time.
+#' - `"shuffle"`: Similar to `"cyclic"` but with random feature shuffling prior to each update.
+#' - `"random"`: A random (with replacement) coordinate selector.
+#' - `"greedy"`: Select the coordinate with the greatest gradient magnitude. It has `O(num_feature^2)` complexity. It is fully deterministic. It allows restricting the selection to `top_k` features per group with the largest magnitude of univariate weight change, by setting the `top_k` parameter. Doing so would reduce the complexity to `O(num_feature*top_k)`.
+#' - `"thrifty"`: Thrifty, approximately-greedy feature selector. Prior to cyclic updates, reorders features in descending magnitude of their univariate weight changes. This operation is multithreaded and is a linear complexity approximation of the quadratic greedy selection. It allows restricting the selection to `top_k` features per group with the largest magnitude of univariate weight change, by setting the `top_k` parameter.
+#' @param top_k (for Linear Booster) (default=0)
+#' The number of top features to select in the `"greedy"` and `"thrifty"` feature selectors. A value of 0 means using all the features.
+#' @param num_class Number of classes when using multi-class classification objectives (e.g. `objective="multi:softprob"`).
+#' @param tweedie_variance_power (for Tweedie Regression (`"objective=reg:tweedie"`)) (default=1.5)
+#' - Parameter that controls the variance of the Tweedie distribution `var(y) ~ E(y)^tweedie_variance_power`
+#' - range: \eqn{(1,2)}
+#' - Set closer to 2 to shift towards a gamma distribution
+#' - Set closer to 1 to shift towards a Poisson distribution.
+#' @param huber_slope (for using Pseudo-Huber (`"reg:pseudohubererror"`)) (default = 1.0)
+#' A parameter used for Pseudo-Huber loss to define the \eqn{\delta} term.
+#' @param quantile_alpha (for using Quantile Loss (`"reg:quantileerror"`))
+#' A scalar or a list of targeted quantiles (passed as a numeric vector).
+#'
+#' Version added: 2.0.0
+#' @param aft_loss_distribution (for using AFT Survival Loss (`"survival:aft"`) and the Negative Log Likelihood of AFT metric (`"aft-nloglik"`))
+#' Probability Density Function: `"normal"`, `"logistic"`, or `"extreme"`.
+#' @param lambdarank_pair_method (for learning to rank (`"rank:ndcg"`, `"rank:map"`, `"rank:pairwise"`)) (default = `"topk"`)
+#' How to construct pairs for pair-wise learning.
+#' - `"mean"`: Sample `lambdarank_num_pair_per_sample` pairs for each document in the query list.
+#' - `"topk"`: Focus on the top-`lambdarank_num_pair_per_sample` documents. Construct \eqn{|query|} pairs for each document at the top-`lambdarank_num_pair_per_sample` ranked by the model.
+#' @param lambdarank_num_pair_per_sample (for learning to rank (`"rank:ndcg"`, `"rank:map"`, `"rank:pairwise"`))
+#' It specifies the number of pairs sampled for each document when the pair method is `"mean"`, or the truncation level for queries when the pair method is `"topk"`.
For example, to train with `ndcg@6`, set `"lambdarank_num_pair_per_sample"` to \eqn{6} and `lambdarank_pair_method` to `"topk"`. +#' +#' range = \eqn{[1, \infty)} +#' @param lambdarank_normalization (for learning to rank (`"rank:ndcg"`, `"rank:map"`, `"rank:pairwise"`)) (default = `TRUE`) +#' Whether to normalize the leaf value by lambda gradient. This can sometimes stagnate the training progress. +#' +#' Version added: 2.1.0 +#' @param lambdarank_unbiased (for learning to rank (`"rank:ndcg"`, `"rank:map"`, `"rank:pairwise"`)) (default = `FALSE`) +#' Specify whether do we need to debias input click data. +#' @param lambdarank_bias_norm (for learning to rank (`"rank:ndcg"`, `"rank:map"`, `"rank:pairwise"`)) (default = 2.0) +#' \eqn{L_p} normalization for position debiasing, default is \eqn{L_2}. Only relevant when `lambdarank_unbiased` is set to `TRUE`. +#' @param ndcg_exp_gain (for learning to rank (`"rank:ndcg"`, `"rank:map"`, `"rank:pairwise"`)) (default = `TRUE`) +#' Whether we should use exponential gain function for `NDCG`. There are two forms of gain function for `NDCG`, one is using relevance value directly while the other is using\eqn{2^{rel} - 1} to emphasize on retrieving relevant documents. When `ndcg_exp_gain` is `TRUE` (the default), relevance degree cannot be greater than 31. +xgb.params <- function( + objective = NULL, + verbosity = NULL, + nthread = NULL, + seed = NULL, + booster = NULL, + eta = NULL, + learning_rate = NULL, + gamma = NULL, + min_split_loss = NULL, + max_depth = NULL, + min_child_weight = NULL, + max_delta_step = NULL, + subsample = NULL, + sampling_method = NULL, + colsample_bytree = NULL, + colsample_bylevel = NULL, + colsample_bynode = NULL, + lambda = NULL, + reg_lambda = NULL, + alpha = NULL, + reg_alpha = NULL, + tree_method = NULL, + scale_pos_weight = NULL, + updater = NULL, + refresh_leaf = NULL, + grow_policy = NULL, + max_leaves = NULL, + max_bin = NULL, + num_parallel_tree = NULL, + monotone_constraints = NULL, + interaction_constraints = NULL, + multi_strategy = NULL, + base_score = NULL, + eval_metric = NULL, + seed_per_iteration = NULL, + device = NULL, + disable_default_eval_metric = NULL, + use_rmm = NULL, + max_cached_hist_node = NULL, + extmem_single_page = NULL, + max_cat_to_onehot = NULL, + max_cat_threshold = NULL, + sample_type = NULL, + normalize_type = NULL, + rate_drop = NULL, + one_drop = NULL, + skip_drop = NULL, + feature_selector = NULL, + top_k = NULL, + num_class = NULL, + tweedie_variance_power = NULL, + huber_slope = NULL, + quantile_alpha = NULL, + aft_loss_distribution = NULL, + lambdarank_pair_method = NULL, + lambdarank_num_pair_per_sample = NULL, + lambdarank_normalization = NULL, + lambdarank_unbiased = NULL, + lambdarank_bias_norm = NULL, + ndcg_exp_gain = NULL +) { +# nolint end + out <- as.list(environment()) + out <- out[!sapply(out, is.null)] + return(out) +} diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index c22752a3f506..b62c25266269 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -22,6 +22,10 @@ prescreen.parameters <- function(params) { prescreen.objective <- function(objective) { if (!is.null(objective)) { + if (!is.character(objective) || length(objective) != 1L || is.na(objective)) { + stop("'objective' must be a single character/string variable.") + } + if (objective %in% .OBJECTIVES_NON_DEFAULT_MODE()) { stop( "Objectives with non-default prediction mode (", @@ -30,8 +34,8 @@ prescreen.objective <- function(objective) { ) } - if (!is.character(objective) || 
length(objective) != 1L || is.na(objective)) { - stop("'objective' must be a single character/string variable.") + if (objective %in% .RANKING_OBJECTIVES()) { + stop("Ranking objectives are not supported in 'xgboost()'. Try 'xgb.train()'.") } } } @@ -501,7 +505,7 @@ check.nthreads <- function(nthreads) { return(as.integer(nthreads)) } -check.can.use.qdm <- function(x, params) { +check.can.use.qdm <- function(x, params, eval_set) { if ("booster" %in% names(params)) { if (params$booster == "gblinear") { return(FALSE) @@ -512,6 +516,9 @@ check.can.use.qdm <- function(x, params) { return(FALSE) } } + if (NROW(eval_set)) { + return(FALSE) + } return(TRUE) } @@ -717,6 +724,129 @@ process.x.and.col.args <- function( return(lst_args) } +process.eval.set <- function(eval_set, lst_args) { + if (!NROW(eval_set)) { + return(NULL) + } + nrows <- nrow(lst_args$dmatrix_args$data) + is_classif <- hasName(lst_args$metadata, "y_levels") + processed_y <- lst_args$dmatrix_args$label + eval_set <- as.vector(eval_set) + if (length(eval_set) == 1L) { + + eval_set <- as.numeric(eval_set) + if (is.na(eval_set) || eval_set < 0 || eval_set >= 1) { + stop("'eval_set' as a fraction must be a number between zero and one (non-inclusive).") + } + if (eval_set == 0) { + return(NULL) + } + nrow_eval <- as.integer(round(nrows * eval_set, 0)) + if (nrow_eval < 1) { + warning( + "Desired 'eval_set' fraction amounts to zero observations.", + " Will not create evaluation set." + ) + return(NULL) + } + nrow_train <- nrows - nrow_eval + if (nrow_train < 2L) { + stop("Desired 'eval_set' fraction would leave less than 2 observations for training data.") + } + if (is_classif && nrow_train < length(lst_args$metadata$y_levels)) { + stop("Desired 'eval_set' fraction would not leave enough samples for each class of 'y'.") + } + + seed <- lst_args$params$seed + if (!is.null(seed)) { + set.seed(seed) + } + + idx_shuffled <- sample(nrows, nrows, replace = FALSE) + idx_eval <- idx_shuffled[seq(1L, nrow_eval)] + idx_train <- idx_shuffled[seq(nrow_eval + 1L, nrows)] + # Here we want the training set to include all of the classes of 'y' for classification + # objectives. If that condition doesn't hold with the random sample, then it forcibly + # makes a new random selection in such a way that the condition would always hold, by + # first sampling one random example of 'y' for training and then choosing the evaluation + # set from the remaining rows. The procedure here is quite inefficient, but there aren't + # enough random-related functions in base R to be able to construct an efficient version. + if (is_classif && length(unique(processed_y[idx_train])) < length(lst_args$metadata$y_levels)) { + # These are defined in order to avoid NOTEs from CRAN checks + # when using non-standard data.table evaluation with column names. 
+ idx <- NULL + y <- NULL + ranked_idx <- NULL + chosen <- NULL + + dt <- data.table::data.table(y = processed_y, idx = seq(1L, nrows))[ + , .( + ranked_idx = seq(1L, .N), + chosen = rep(sample(.N, 1L), .N), + idx + ) + , by = y + ] + min_idx_train <- dt[ranked_idx == chosen, idx] + rem_idx <- dt[ranked_idx != chosen, idx] + if (length(rem_idx) == nrow_eval) { + idx_train <- min_idx_train + idx_eval <- rem_idx + } else { + rem_idx <- rem_idx[sample(length(rem_idx), length(rem_idx), replace = FALSE)] + idx_eval <- rem_idx[seq(1L, nrow_eval)] + idx_train <- c(min_idx_train, rem_idx[seq(nrow_eval + 1L, length(rem_idx))]) + } + } + + } else { + + if (any(eval_set != floor(eval_set))) { + stop("'eval_set' as indices must contain only integers.") + } + eval_set <- as.integer(eval_set) + idx_min <- min(eval_set) + if (is.na(idx_min) || idx_min < 1L) { + stop("'eval_set' contains invalid indices.") + } + idx_max <- max(eval_set) + if (is.na(idx_max) || idx_max > nrows) { + stop("'eval_set' contains row indices beyond the size of the input data.") + } + idx_train <- seq(1L, nrows)[-eval_set] + if (is_classif && length(unique(processed_y[idx_train])) < length(lst_args$metadata$y_levels)) { + warning("'eval_set' indices will leave some classes of 'y' outside of the training data.") + } + idx_eval <- eval_set + + } + + # Note: slicing is done in the constructed DMatrix object instead of in the + # original input, because objects from 'Matrix' might change class after + # being sliced (e.g. 'dgRMatrix' turns into 'dgCMatrix'). + return(list(idx_train = idx_train, idx_eval = idx_eval)) +} + +check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) { + if (is.null(early_stopping_rounds)) { + return(NULL) + } + if (is.null(eval_set)) { + stop("'early_stopping_rounds' requires passing 'eval_set'.") + } + if (NROW(early_stopping_rounds) != 1L) { + stop("'early_stopping_rounds' must be NULL or an integer greater than zero.") + } + early_stopping_rounds <- as.integer(early_stopping_rounds) + if (is.na(early_stopping_rounds) || early_stopping_rounds <= 0L) { + stop( + "'early_stopping_rounds' must be NULL or an integer greater than zero. Got: ", + early_stopping_rounds + ) + } + return(early_stopping_rounds) +} + #' Fit XGBoost Model #' #' @export @@ -778,7 +908,7 @@ process.x.and.col.args <- function( #' @param objective Optimization objective to minimize based on the supplied data, to be passed #' by name as a string / character (e.g. `reg:absoluteerror`). See the #' [Learning Task Parameters](https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters) -#' page for more detailed information on allowed values. +#' page and the [xgb.params()] documentation for more detailed information on allowed values. #' #' If `NULL` (the default), will be automatically determined from `y` according to the following #' logic: @@ -808,6 +938,35 @@ process.x.and.col.args <- function( #' 2 (info), and 3 (debug). #' @param monitor_training Whether to monitor objective optimization progress on the input data. #' Note that same 'x' and 'y' data are used for both model fitting and evaluation. +#' @param eval_set Subset of the data to use as evaluation set. Can be passed as: +#' - A vector of row indices (base-1 numeration) indicating the observations that are to be designed +#' as evaluation data. +#' - A number between zero and one indicating a random fraction of the input data to use as +#' evaluation data. 
Note that the selection will be done uniformly at random, regardless of +#' argument `weights`. +#' +#' If passed, this subset of the data will be excluded from the training procedure, and the +#' evaluation metric(s) supplied under `eval_metric` will be calculated on this dataset after each +#' boosting iteration (pass `verbosity>0` to have these metrics printed during training). If +#' `eval_metric` is not passed, a default metric will be selected according to `objective`. +#' +#' If passing a fraction, in classification problems, the evaluation set will be chosen in such a +#' way that at least one observation of each class will be kept in the training data. +#' +#' For more elaborate evaluation variants (e.g. custom metrics, multiple evaluation sets, etc.), +#' one might want to use [xgb.train()] instead. +#' @param early_stopping_rounds Number of boosting rounds after which training will be stopped +#' if there is no improvement in performance (as measured by the last metric passed under +#' `eval_metric`, or by the default metric for the objective if `eval_metric` is not passed) on the +#' evaluation data from `eval_set`. Must pass `eval_set` in order to use this functionality. +#' +#' If `NULL`, early stopping will not be used. +#' @param print_every_n When passing `verbosity>0` and either `monitor_training=TRUE` or `eval_set`, +#' evaluation logs (metrics calculated on the training and/or evaluation data) will be printed every +#' nth iteration according to the value passed here. The first and last iteration are always +#' included regardless of this 'n'. +#' +#' Only has an effect when passing `verbosity>0`. #' @param nthreads Number of parallel threads to use. If passing zero, will use all CPU threads. #' @param seed Seed to use for random number generation. If passing `NULL`, will draw a random #' number using R's PRNG system to use as seed. @@ -893,8 +1052,11 @@ xgboost <- function( objective = NULL, nrounds = 100L, weights = NULL, - verbosity = 0L, + verbosity = if (is.null(eval_set)) 0L else 1L, monitor_training = verbosity > 0, + eval_set = NULL, + early_stopping_rounds = NULL, + print_every_n = 1L, nthreads = parallel::detectCores(), seed = 0L, monotone_constraints = NULL, @@ -907,7 +1069,7 @@ xgboost <- function( params <- list(...) 
params <- prescreen.parameters(params) prescreen.objective(objective) - use_qdm <- check.can.use.qdm(x, params) + use_qdm <- check.can.use.qdm(x, params, eval_set) lst_args <- process.y.margin.and.objective(y, base_margin, objective, params) lst_args <- process.row.weights(weights, lst_args) lst_args <- process.x.and.col.args( @@ -918,8 +1080,9 @@ xgboost <- function( lst_args, use_qdm ) + eval_set <- process.eval.set(eval_set, lst_args) - if (use_qdm && "max_bin" %in% names(params)) { + if (use_qdm && hasName(params, "max_bin")) { lst_args$dmatrix_args$max_bin <- params$max_bin } @@ -929,18 +1092,27 @@ xgboost <- function( lst_args$params$seed <- seed params <- c(lst_args$params, params) + params$verbosity <- verbosity fn_dm <- if (use_qdm) xgb.QuantileDMatrix else xgb.DMatrix dm <- do.call(fn_dm, lst_args$dmatrix_args) + if (!is.null(eval_set)) { + dm_eval <- xgb.slice.DMatrix(dm, eval_set$idx_eval) + dm <- xgb.slice.DMatrix(dm, eval_set$idx_train) + } evals <- list() if (monitor_training) { evals <- list(train = dm) } + if (!is.null(eval_set)) { + evals <- c(evals, list(eval = dm_eval)) + } model <- xgb.train( params = params, data = dm, nrounds = nrounds, verbose = verbosity, + print_every_n = print_every_n, evals = evals ) attributes(model)$metadata <- lst_args$metadata diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index 4ce043799436..c4e9026d77f3 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -82,11 +82,13 @@ data(agaricus.train, package = "xgboost") bst <- xgb.train( data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), - max_depth = 2, - eta = 1, - nthread = 2, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = 2, + objective = "binary:logistic" + ) ) # Save as a stand-alone file; load it with xgb.load() diff --git a/R-package/man/coef.xgb.Booster.Rd b/R-package/man/coef.xgb.Booster.Rd index 295c766e6413..011139804479 100644 --- a/R-package/man/coef.xgb.Booster.Rd +++ b/R-package/man/coef.xgb.Booster.Rd @@ -48,7 +48,7 @@ y <- mtcars[, 1] x <- as.matrix(mtcars[, -1]) dm <- xgb.DMatrix(data = x, label = y, nthread = 1) -params <- list(booster = "gblinear", nthread = 1) +params <- xgb.params(booster = "gblinear", nthread = 1) model <- xgb.train(data = dm, params = params, nrounds = 2) coef(model) } diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd index 5cdfed97f504..2ef3f1b4bf61 100644 --- a/R-package/man/predict.xgb.Booster.Rd +++ b/R-package/man/predict.xgb.Booster.Rd @@ -205,7 +205,7 @@ Since it quadratically depends on the number of features, it is recommended to p of the most important features first. See below about the format of the returned results. The \code{predict()} method uses as many threads as defined in \code{xgb.Booster} object (all by default). -If you want to change their number, assign a new number to \code{nthread} using \code{\link[=xgb.parameters<-]{xgb.parameters<-()}}. +If you want to change their number, assign a new number to \code{nthread} using \code{\link[=xgb.model.parameters<-]{xgb.model.parameters<-()}}. Note that converting a matrix to \code{\link[=xgb.DMatrix]{xgb.DMatrix()}} uses multiple threads too. 
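[Editor's note: a one-line sketch of the renamed setter mentioned just above, illustrative only; `bst` stands for any fitted `xgb.Booster`.]

```r
# Make predict() use a single thread by updating the model's parameters in-place
xgb.model.parameters(bst) <- list(nthread = 1)
```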
} \examples{ @@ -223,11 +223,13 @@ test <- agaricus.test bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - max_depth = 2, - eta = 0.5, - nthread = nthread, nrounds = 5, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 0.5, + nthread = nthread, + objective = "binary:logistic" + ) ) # use all trees by default @@ -266,13 +268,15 @@ set.seed(11) bst <- xgb.train( data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), - max_depth = 4, - eta = 0.5, - nthread = 2, nrounds = 10, - subsample = 0.5, - objective = "multi:softprob", - num_class = num_class + params = xgb.params( + max_depth = 4, + eta = 0.5, + nthread = 2, + subsample = 0.5, + objective = "multi:softprob", + num_class = num_class + ) ) # predict for softmax returns num_class probability numbers per case: @@ -288,13 +292,15 @@ set.seed(11) bst <- xgb.train( data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), - max_depth = 4, - eta = 0.5, - nthread = 2, nrounds = 10, - subsample = 0.5, - objective = "multi:softmax", - num_class = num_class + params = xgb.params( + max_depth = 4, + eta = 0.5, + nthread = 2, + subsample = 0.5, + objective = "multi:softmax", + num_class = num_class + ) ) pred <- predict(bst, as.matrix(iris[, -5])) diff --git a/R-package/man/print.xgb.Booster.Rd b/R-package/man/print.xgb.Booster.Rd index a1e1e7f7226b..e797bd60f6d4 100644 --- a/R-package/man/print.xgb.Booster.Rd +++ b/R-package/man/print.xgb.Booster.Rd @@ -23,11 +23,13 @@ train <- agaricus.train bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - max_depth = 2, - eta = 1, - nthread = 2, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = 2, + objective = "binary:logistic" + ) ) attr(bst, "myattr") <- "memo" diff --git a/R-package/man/print.xgb.cv.Rd b/R-package/man/print.xgb.cv.Rd index fbc4b9e32151..a8c1d70ef4a7 100644 --- a/R-package/man/print.xgb.cv.Rd +++ b/R-package/man/print.xgb.cv.Rd @@ -27,11 +27,13 @@ train <- agaricus.train cv <- xgb.cv( data = xgb.DMatrix(train$data, label = train$label), nfold = 5, - max_depth = 2, - eta = 1, - nthread = 2, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = 2, + objective = "binary:logistic" + ) ) print(cv) print(cv, verbose = TRUE) diff --git a/R-package/man/xgb.Callback.Rd b/R-package/man/xgb.Callback.Rd index 8cee8c729698..1d108010798b 100644 --- a/R-package/man/xgb.Callback.Rd +++ b/R-package/man/xgb.Callback.Rd @@ -120,7 +120,7 @@ example by using the early stopping callback \code{\link[=xgb.cb.early.stop]{xgb \item iteration Index of the iteration number that is being executed (first iteration will be the same as parameter \code{begin_iteration}, then next one will add +1, and so on). \item iter_feval Evaluation metrics for \code{evals} that were supplied, either -determined by the objective, or by parameter \code{feval}. +determined by the objective, or by parameter \code{custom_metric}. 
For \code{\link[=xgb.train]{xgb.train()}}, this will be a named vector with one entry per element in \code{evals}, where the names are determined as 'evals name' + '-' + 'metric name' - for @@ -220,10 +220,9 @@ x <- as.matrix(mtcars[, -1]) dm <- xgb.DMatrix(x, label = y, nthread = 1) model <- xgb.train( data = dm, - params = list(objective = "reg:squarederror", nthread = 1), + params = xgb.params(objective = "reg:squarederror", nthread = 1), nrounds = 5, - callbacks = list(ssq_callback), - keep_extra_attributes = TRUE + callbacks = list(ssq_callback) ) # Result from 'f_after_iter' will be available as an attribute diff --git a/R-package/man/xgb.ExtMemDMatrix.Rd b/R-package/man/xgb.ExtMemDMatrix.Rd index a4555f571a76..d5d71ef3c0c2 100644 --- a/R-package/man/xgb.ExtMemDMatrix.Rd +++ b/R-package/man/xgb.ExtMemDMatrix.Rd @@ -108,7 +108,7 @@ cache_prefix <- tempdir() dm <- xgb.ExtMemDMatrix(data_iterator, cache_prefix, nthread = 1) # After construction, can be used as a regular DMatrix -params <- list(nthread = 1, objective = "reg:squarederror") +params <- xgb.params(nthread = 1, objective = "reg:squarederror") model <- xgb.train(data = dm, nrounds = 2, params = params) # Predictions can also be called on it, and should be the same diff --git a/R-package/man/xgb.attr.Rd b/R-package/man/xgb.attr.Rd index 40dedeea94e2..015da9458b9b 100644 --- a/R-package/man/xgb.attr.Rd +++ b/R-package/man/xgb.attr.Rd @@ -49,7 +49,7 @@ would not be saved by \code{\link[=xgb.save]{xgb.save()}} because an XGBoost mod and its serialization is handled externally. Also, setting an attribute that has the same name as one of XGBoost's parameters wouldn't change the value of that parameter for a model. -Use \code{\link[=xgb.parameters<-]{xgb.parameters<-()}} to set or change model parameters. +Use \code{\link[=xgb.model.parameters<-]{xgb.model.parameters<-()}} to set or change model parameters. The \verb{xgb.attributes<-} setter either updates the existing or adds one or several attributes, but it doesn't delete the other existing attributes. @@ -66,11 +66,13 @@ train <- agaricus.train bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - max_depth = 2, - eta = 1, - nthread = 2, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = 2, + objective = "binary:logistic" + ) ) xgb.attr(bst, "my_attribute") <- "my attribute value" diff --git a/R-package/man/xgb.cb.evaluation.log.Rd b/R-package/man/xgb.cb.evaluation.log.Rd index 037dc7cbc2f4..2fe6289ac29b 100644 --- a/R-package/man/xgb.cb.evaluation.log.Rd +++ b/R-package/man/xgb.cb.evaluation.log.Rd @@ -14,7 +14,7 @@ Callback for logging the evaluation history } \details{ This callback creates a table with per-iteration evaluation metrics (see parameters -\code{evals} and \code{feval} in \code{\link[=xgb.train]{xgb.train()}}). +\code{evals} and \code{custom_metric} in \code{\link[=xgb.train]{xgb.train()}}). Note: in the column names of the final data.table, the dash '-' character is replaced with the underscore '_' in order to make the column names more like regular R identifiers. 
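[Editor's note: a minimal sketch of the kind of `custom_metric` function referred to above, following the signature described later in this patch for `xgb.train()`/`xgb.cv()`; the helper name `misclass_rate` is made up for illustration.]

```r
# Custom evaluation metric: receives predictions and the data DMatrix, and
# returns list(metric = <display name>, value = <numeric scalar>).
# Assumes probability predictions, e.g. from a "binary:logistic" objective.
misclass_rate <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  list(metric = "misclass", value = mean((preds > 0.5) != labels))
}
# Passed as e.g.: xgb.train(..., custom_metric = misclass_rate, evals = list(train = dtrain))
```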
diff --git a/R-package/man/xgb.cb.gblinear.history.Rd b/R-package/man/xgb.cb.gblinear.history.Rd index c2b7709aac62..a5c6cd17a011 100644 --- a/R-package/man/xgb.cb.gblinear.history.Rd +++ b/R-package/man/xgb.cb.gblinear.history.Rd @@ -59,7 +59,7 @@ dtrain <- xgb.DMatrix( label = 1 * (iris$Species == "versicolor"), nthread = nthread ) -param <- list( +param <- xgb.params( booster = "gblinear", objective = "reg:logistic", eval_metric = "auc", @@ -73,11 +73,10 @@ param <- list( # rate does not break the convergence, but allows us to illustrate the typical pattern of # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations. bst <- xgb.train( - param, + c(param, list(eta = 1.)), dtrain, - list(tr = dtrain), + evals = list(tr = dtrain), nrounds = 200, - eta = 1., callbacks = list(xgb.cb.gblinear.history()) ) @@ -88,14 +87,18 @@ matplot(coef_path, type = "l") # With the deterministic coordinate descent updater, it is safer to use higher learning rates. # Will try the classical componentwise boosting which selects a single best feature per round: bst <- xgb.train( - param, + c( + param, + xgb.params( + eta = 0.8, + updater = "coord_descent", + feature_selector = "thrifty", + top_k = 1 + ) + ), dtrain, - list(tr = dtrain), + evals = list(tr = dtrain), nrounds = 200, - eta = 0.8, - updater = "coord_descent", - feature_selector = "thrifty", - top_k = 1, callbacks = list(xgb.cb.gblinear.history()) ) matplot(xgb.gblinear.history(bst), type = "l") @@ -105,11 +108,10 @@ matplot(xgb.gblinear.history(bst), type = "l") # For xgb.cv: bst <- xgb.cv( - param, + c(param, list(eta = 0.8)), dtrain, nfold = 5, nrounds = 100, - eta = 0.8, callbacks = list(xgb.cb.gblinear.history()) ) # coefficients in the CV fold #3 @@ -119,7 +121,7 @@ matplot(xgb.gblinear.history(bst)[[3]], type = "l") #### Multiclass classification: dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread) -param <- list( +param <- xgb.params( booster = "gblinear", objective = "multi:softprob", num_class = 3, @@ -131,11 +133,10 @@ param <- list( # For the default linear updater 'shotgun' it sometimes is helpful # to use smaller eta to reduce instability bst <- xgb.train( - param, + c(param, list(eta = 0.5)), dtrain, - list(tr = dtrain), + evals = list(tr = dtrain), nrounds = 50, - eta = 0.5, callbacks = list(xgb.cb.gblinear.history()) ) @@ -146,11 +147,10 @@ matplot(xgb.gblinear.history(bst, class_index = 2), type = "l") # CV: bst <- xgb.cv( - param, + c(param, list(eta = 0.5)), dtrain, nfold = 5, nrounds = 70, - eta = 0.5, callbacks = list(xgb.cb.gblinear.history(FALSE)) ) # 1st fold of 1st class diff --git a/R-package/man/xgb.config.Rd b/R-package/man/xgb.config.Rd index 5ac223b4d8a8..12c23d2b29dd 100644 --- a/R-package/man/xgb.config.Rd +++ b/R-package/man/xgb.config.Rd @@ -37,11 +37,13 @@ train <- agaricus.train bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - max_depth = 2, - eta = 1, - nthread = nthread, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = nthread, + objective = "binary:logistic" + ) ) config <- xgb.config(bst) diff --git a/R-package/man/xgb.copy.Booster.Rd b/R-package/man/xgb.copy.Booster.Rd index 2bab71cd2a52..20e6d0633c11 100644 --- a/R-package/man/xgb.copy.Booster.Rd +++ b/R-package/man/xgb.copy.Booster.Rd @@ -30,8 +30,8 @@ dm <- xgb.DMatrix(x, label = y, nthread = 1) model <- xgb.train( data = dm, - params = list(nthread = 1), - nround = 3 + params = xgb.params(nthread = 1), + 
nrounds = 3 ) # Set an arbitrary attribute kept at the C level diff --git a/R-package/man/xgb.create.features.Rd b/R-package/man/xgb.create.features.Rd index 282593ebd000..ebb210435b43 100644 --- a/R-package/man/xgb.create.features.Rd +++ b/R-package/man/xgb.create.features.Rd @@ -4,14 +4,12 @@ \alias{xgb.create.features} \title{Create new features from a previously learned model} \usage{ -xgb.create.features(model, data, ...) +xgb.create.features(model, data) } \arguments{ \item{model}{Decision tree boosting model learned on the original data.} \item{data}{Original data (usually provided as a \code{dgCMatrix} matrix).} - -\item{...}{Currently not used.} } \value{ A \code{dgCMatrix} matrix including both the original data and the new features. @@ -64,10 +62,10 @@ data(agaricus.test, package = "xgboost") dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2)) -param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') +param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic', nthread = 1) nrounds = 4 -bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) +bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds) # Model accuracy without new features accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / @@ -79,12 +77,12 @@ new.features.test <- xgb.create.features(model = bst, agaricus.test$data) # learning with new features new.dtrain <- xgb.DMatrix( - data = new.features.train, label = agaricus.train$label, nthread = 2 + data = new.features.train, label = agaricus.train$label ) new.dtest <- xgb.DMatrix( - data = new.features.test, label = agaricus.test$label, nthread = 2 + data = new.features.test, label = agaricus.test$label ) -bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2) +bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds) # Model accuracy with new features accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index c5686e201ec7..02a1a876b6a9 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -5,15 +5,15 @@ \title{Cross Validation} \usage{ xgb.cv( - params = list(), + params = xgb.params(), data, nrounds, nfold, prediction = FALSE, showsd = TRUE, metrics = list(), - obj = NULL, - feval = NULL, + objective = NULL, + custom_metric = NULL, stratified = "auto", folds = NULL, train_folds = NULL, @@ -26,28 +26,15 @@ xgb.cv( ) } \arguments{ -\item{params}{The list of parameters. The complete list of parameters is available in the -\href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. -Below is a shorter summary: -\itemize{ -\item \code{objective}: Objective function, common ones are -\itemize{ -\item \code{reg:squarederror}: Regression with squared loss. -\item \code{binary:logistic}: Logistic regression for classification. -} +\item{params}{List of XGBoost parameters which control the model building process. +See the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation} +and the documentation for \code{\link[=xgb.params]{xgb.params()}} for details. -See \code{\link[=xgb.train]{xgb.train()}} for complete list of objectives. -\item \code{eta}: Step size of each boosting step -\item \code{max_depth}: Maximum depth of the tree -\item \code{nthread}: Number of threads used in training. 
If not set, all threads are used -} +Should be passed as list with named entries. Parameters that are not specified in this +list will use their default values. -See \code{\link[=xgb.train]{xgb.train()}} for further details. -See also demo for walkthrough example in R. - -Note that, while \code{params} accepts a \code{seed} entry and will use such parameter for model training if -supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG -system - thus, for reproducible results, one needs to call the \code{\link[=set.seed]{set.seed()}} function beforehand.} +A list of named parameters can be created through the function \code{\link[=xgb.params]{xgb.params()}}, which +accepts all valid parameters as function arguments.} \item{data}{An \code{xgb.DMatrix} object, with corresponding fields like \code{label} or bounds as required for model training by the objective. @@ -55,7 +42,7 @@ for model training by the objective. Note that only the basic \code{xgb.DMatrix} class is supported - variants such as \code{xgb.QuantileDMatrix} or \code{xgb.ExtMemDMatrix} are not supported here.} -\item{nrounds}{The max number of iterations.} +\item{nrounds}{Max number of boosting iterations.} \item{nfold}{The original dataset is randomly partitioned into \code{nfold} equal size subsamples.} @@ -78,11 +65,24 @@ Possible options are: \item \code{merror}: Exact matching error used to evaluate multi-class classification }} -\item{obj}{Customized objective function. Returns gradient and second order -gradient with given prediction and dtrain.} +\item{objective}{Customized objective function. Should take two arguments: the first one will be the +current predictions (either a numeric vector or matrix depending on the number of targets / classes), +and the second one will be the \code{data} DMatrix object that is used for training. + +It should return a list with two elements \code{grad} and \code{hess} (in that order), as either +numeric vectors or numeric matrices depending on the number of targets / classes (same +dimension as the predictions that are passed as first argument).} + +\item{custom_metric}{Customized evaluation function. Just like \code{objective}, should take two arguments, +with the first one being the predictions and the second one the \code{data} DMatrix. -\item{feval}{Customized evaluation function. Returns -\code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.} +Should return a list with two elements \code{metric} (name that will be displayed for this metric, +should be a string / character), and \code{value} (the number that the function calculates, should +be a numeric scalar). + +Note that even if passing \code{custom_metric}, objectives also have an associated default metric that +will be evaluated in addition to it. In order to disable the built-in metric, one can pass +parameter \code{disable_default_eval_metric = TRUE}.} \item{stratified}{Logical flag indicating whether sampling of folds should be stratified by the values of outcome labels. For real-valued labels in regression objectives, @@ -110,19 +110,29 @@ the resulting DMatrices.} This is not supported when \code{data} has \code{group} field.} -\item{verbose}{Logical flag. Should statistics be printed during the process?} +\item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance. +If 2, some additional information will be printed out. 
+Note that setting \code{verbose > 0} automatically engages the
+\code{xgb.cb.print.evaluation(period=1)} callback function.}
+
+\item{print_every_n}{When passing \code{verbose>0}, evaluation logs (metrics calculated on the
+data passed under \code{evals}) will be printed every nth iteration according to the value passed
+here. The first and last iteration are always included regardless of this 'n'.
+
+Only has an effect when passing data under \code{evals} and when passing \code{verbose>0}. The parameter
+is passed to the \code{\link[=xgb.cb.print.evaluation]{xgb.cb.print.evaluation()}} callback.}
-\item{print_every_n}{Print each nth iteration evaluation messages when \code{verbose > 0}.
-Default is 1 which means all messages are printed. This parameter is passed to the
-\code{\link[=xgb.cb.print.evaluation]{xgb.cb.print.evaluation()}} callback.}
+\item{early_stopping_rounds}{Number of boosting rounds after which training will be stopped
+if there is no improvement in performance (as measured by the evaluation metric that is
+supplied or selected by default for the objective) on the evaluation data passed under
+\code{evals}.
-\item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered.
-If set to an integer \code{k}, training with a validation set will stop if the performance
-doesn't improve for \code{k} rounds.
-Setting this parameter engages the \code{\link[=xgb.cb.early.stop]{xgb.cb.early.stop()}} callback.}
+Must pass \code{evals} in order to use this functionality. Setting this parameter adds the
+\code{\link[=xgb.cb.early.stop]{xgb.cb.early.stop()}} callback.
-\item{maximize}{If \code{feval} and \code{early_stopping_rounds} are set,
-then this parameter must be set as well.
+If \code{NULL}, early stopping will not be used.}
+
+\item{maximize}{If \code{custom_metric} and \code{early_stopping_rounds} are set, then this parameter must be set as well.
 When it is \code{TRUE}, it means the larger the evaluation score the better.
 This parameter is passed to the \code{\link[=xgb.cb.early.stop]{xgb.cb.early.stop()}} callback.}
@@ -131,7 +141,13 @@ See \code{\link[=xgb.Callback]{xgb.Callback()}}. Some of the callbacks are autom
 parameters' values. User can provide either existing or their own callback methods in order
 to customize the training process.}
-\item{...}{Other parameters to pass to \code{params}.}
+\item{...}{Not used.
+
+Some arguments are currently deprecated or have been renamed. If a deprecated argument
+is passed, will throw a warning and use its current equivalent.
+ +If some additional argument is passed that is neither a current function argument nor +a deprecated argument, an error will be thrown.} } \value{ An object of class 'xgb.cv.synchronous' with the following elements: @@ -181,11 +197,14 @@ dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) cv <- xgb.cv( data = dtrain, nrounds = 3, - nthread = 2, + params = xgb.params( + nthread = 2, + max_depth = 3, + eta = 1, + objective = "binary:logistic" + ), nfold = 5, - metrics = list("rmse","auc"), - max_depth = 3, - eta = 1,objective = "binary:logistic" + metrics = list("rmse","auc") ) print(cv) print(cv, verbose = TRUE) diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 8bd8e5d6c0d6..a3b622947eac 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -33,7 +33,13 @@ cover is the sum of second order gradient in each node.} Format 'dot' for a single tree can be passed directly to packages that consume this format for graph visualization, such as function \code{DiagrammeR::grViz()}} -\item{...}{Currently not used} +\item{...}{Not used. + +Some arguments are currently deprecated or have been renamed. If a deprecated argument +is passed, will throw a warning and use its current equivalent. + +If some additional argument is passed that is neither a current function argument nor +a deprecated argument, an error will be thrown.} } \value{ If fname is not provided or set to \code{NULL} the function will return the model @@ -52,11 +58,13 @@ test <- agaricus.test bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - max_depth = 2, - eta = 1, - nthread = 2, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = 2, + objective = "binary:logistic" + ) ) # save the model in file 'xgb.model.dump' diff --git a/R-package/man/xgb.get.DMatrix.qcut.Rd b/R-package/man/xgb.get.DMatrix.qcut.Rd index daa8edf71093..d9d21b1912a8 100644 --- a/R-package/man/xgb.get.DMatrix.qcut.Rd +++ b/R-package/man/xgb.get.DMatrix.qcut.Rd @@ -44,7 +44,7 @@ dm <- xgb.DMatrix(x, label = y, nthread = 1) # DMatrix is not quantized right away, but will be once a hist model is generated model <- xgb.train( data = dm, - params = list(tree_method = "hist", max_bin = 8, nthread = 1), + params = xgb.params(tree_method = "hist", max_bin = 8, nthread = 1), nrounds = 3 ) diff --git a/R-package/man/xgb.get.num.boosted.rounds.Rd b/R-package/man/xgb.get.num.boosted.rounds.Rd index ba1c5e11a96b..039c0fe5c1c7 100644 --- a/R-package/man/xgb.get.num.boosted.rounds.Rd +++ b/R-package/man/xgb.get.num.boosted.rounds.Rd @@ -20,6 +20,6 @@ Get number of boosting in a fitted booster } \details{ Note that setting booster parameters related to training -continuation / updates through \code{\link[=xgb.parameters<-]{xgb.parameters<-()}} will reset the +continuation / updates through \code{\link[=xgb.model.parameters<-]{xgb.model.parameters<-()}} will reset the number of rounds to zero. 
} diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 76574b9cbf06..f26067d7fef9 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -72,11 +72,13 @@ data(agaricus.train, package = "xgboost") bst <- xgb.train( data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), - max_depth = 2, - eta = 1, - nthread = 2, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = 2, + objective = "binary:logistic" + ) ) xgb.importance(model = bst) @@ -84,10 +86,13 @@ xgb.importance(model = bst) # binomial classification using "gblinear": bst <- xgb.train( data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), - booster = "gblinear", - eta = 0.3, - nthread = 1, - nrounds = 20,objective = "binary:logistic" + nrounds = 20, + params = xgb.params( + booster = "gblinear", + eta = 0.3, + nthread = 1, + objective = "binary:logistic" + ) ) xgb.importance(model = bst) @@ -100,12 +105,14 @@ mbst <- xgb.train( as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1 ), - max_depth = 3, - eta = 0.2, - nthread = 2, nrounds = nrounds, - objective = "multi:softprob", - num_class = nclass + params = xgb.params( + max_depth = 3, + eta = 0.2, + nthread = 2, + objective = "multi:softprob", + num_class = nclass + ) ) # all classes clumped together: @@ -128,12 +135,14 @@ mbst <- xgb.train( scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1 ), - booster = "gblinear", - eta = 0.2, - nthread = 1, nrounds = 15, - objective = "multi:softprob", - num_class = nclass + params = xgb.params( + booster = "gblinear", + eta = 0.2, + nthread = 1, + objective = "multi:softprob", + num_class = nclass + ) ) xgb.importance(model = mbst) diff --git a/R-package/man/xgb.is.same.Booster.Rd b/R-package/man/xgb.is.same.Booster.Rd index 4ef0182077ca..9b7e47491c36 100644 --- a/R-package/man/xgb.is.same.Booster.Rd +++ b/R-package/man/xgb.is.same.Booster.Rd @@ -40,9 +40,9 @@ y <- mtcars$mpg x <- as.matrix(mtcars[, -1]) model <- xgb.train( - params = list(nthread = 1), + params = xgb.params(nthread = 1), data = xgb.DMatrix(x, label = y, nthread = 1), - nround = 3 + nrounds = 3 ) model_shallow_copy <- model diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index bb898d6f5bfb..3059d530b017 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -38,11 +38,13 @@ test <- agaricus.test bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - max_depth = 2, - eta = 1, - nthread = nthread, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = nthread, + objective = "binary:logistic" + ) ) fname <- file.path(tempdir(), "xgb.ubj") diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 97533c883874..f55fc17a4e7b 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -28,7 +28,13 @@ is zero-based (e.g., use \code{trees = 0:4} for the first five trees).} "Missing" should be represented as integers (when \code{TRUE}) or as "Tree-Node" character strings (when \code{FALSE}, default).} -\item{...}{Currently not used.} +\item{...}{Not used. + +Some arguments are currently deprecated or have been renamed. If a deprecated argument +is passed, will throw a warning and use its current equivalent. 
+ +If some additional argument is passed that is neither a current function argument nor +a deprecated argument, an error will be thrown.} } \value{ A \code{data.table} with detailed information about tree nodes. It has the following columns: @@ -64,11 +70,13 @@ data.table::setDTthreads(nthread) bst <- xgb.train( data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), - max_depth = 2, - eta = 1, - nthread = nthread, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = nthread, + objective = "binary:logistic" + ) ) # This bst model already has feature_names stored with it, so those would be used when diff --git a/R-package/man/xgb.parameters.Rd b/R-package/man/xgb.model.parameters.Rd similarity index 83% rename from R-package/man/xgb.parameters.Rd rename to R-package/man/xgb.model.parameters.Rd index 65426792e0dd..5f7c11a1eb95 100644 --- a/R-package/man/xgb.parameters.Rd +++ b/R-package/man/xgb.model.parameters.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.Booster.R -\name{xgb.parameters<-} -\alias{xgb.parameters<-} +\name{xgb.model.parameters<-} +\alias{xgb.model.parameters<-} \title{Accessors for model parameters} \usage{ -xgb.parameters(object) <- value +xgb.model.parameters(object) <- value } \arguments{ \item{object}{Object of class \code{xgb.Booster}. \strong{Will be modified in-place}.} @@ -36,13 +36,15 @@ train <- agaricus.train bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - max_depth = 2, - eta = 1, - nthread = 2, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = 2, + objective = "binary:logistic" + ) ) -xgb.parameters(bst) <- list(eta = 0.1) +xgb.model.parameters(bst) <- list(eta = 0.1) } diff --git a/R-package/man/xgb.params.Rd b/R-package/man/xgb.params.Rd new file mode 100644 index 000000000000..051fba6c8bd0 --- /dev/null +++ b/R-package/man/xgb.params.Rd @@ -0,0 +1,539 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.train.R +\name{xgb.params} +\alias{xgb.params} +\title{XGBoost Parameters} +\usage{ +xgb.params( + objective = NULL, + verbosity = NULL, + nthread = NULL, + seed = NULL, + booster = NULL, + eta = NULL, + learning_rate = NULL, + gamma = NULL, + min_split_loss = NULL, + max_depth = NULL, + min_child_weight = NULL, + max_delta_step = NULL, + subsample = NULL, + sampling_method = NULL, + colsample_bytree = NULL, + colsample_bylevel = NULL, + colsample_bynode = NULL, + lambda = NULL, + reg_lambda = NULL, + alpha = NULL, + reg_alpha = NULL, + tree_method = NULL, + scale_pos_weight = NULL, + updater = NULL, + refresh_leaf = NULL, + grow_policy = NULL, + max_leaves = NULL, + max_bin = NULL, + num_parallel_tree = NULL, + monotone_constraints = NULL, + interaction_constraints = NULL, + multi_strategy = NULL, + base_score = NULL, + eval_metric = NULL, + seed_per_iteration = NULL, + device = NULL, + disable_default_eval_metric = NULL, + use_rmm = NULL, + max_cached_hist_node = NULL, + extmem_single_page = NULL, + max_cat_to_onehot = NULL, + max_cat_threshold = NULL, + sample_type = NULL, + normalize_type = NULL, + rate_drop = NULL, + one_drop = NULL, + skip_drop = NULL, + feature_selector = NULL, + top_k = NULL, + num_class = NULL, + tweedie_variance_power = NULL, + huber_slope = NULL, + quantile_alpha = NULL, + aft_loss_distribution = NULL, + lambdarank_pair_method = NULL, + lambdarank_num_pair_per_sample = NULL, + 
+ lambdarank_normalization = NULL,
+ lambdarank_unbiased = NULL,
+ lambdarank_bias_norm = NULL,
+ ndcg_exp_gain = NULL
+)
+}
+\arguments{
+\item{objective}{(default=\code{"reg:squarederror"})
+Specify the learning task and the corresponding learning objective or a custom objective function to be used.
+
+For custom objective, see \href{https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html}{Custom Objective and Evaluation Metric}
+and \href{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html#custom-obj-metric}{Custom objective and metric} for more information,
+along with the end note for function signatures.
+
+Supported values are:
+\itemize{
+\item \code{"reg:squarederror"}: regression with squared loss.
+\item \code{"reg:squaredlogerror"}: regression with squared log loss \eqn{\frac{1}{2}[log(pred + 1) - log(label + 1)]^2}. All input labels are required to be greater than -1. Also, see metric \code{rmsle} for possible issue with this objective.
+\item \code{"reg:logistic"}: logistic regression, output probability
+\item \code{"reg:pseudohubererror"}: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
+\item \code{"reg:absoluteerror"}: Regression with L1 error. When tree model is used, leaf value is refreshed after tree construction. If used in distributed training, the leaf value is calculated as the mean value from all workers, which is not guaranteed to be optimal.
+
+Version added: 1.7.0
+\item \code{"reg:quantileerror"}: Quantile loss, also known as "pinball loss". See later sections for its parameter and \href{https://xgboost.readthedocs.io/en/latest/python/examples/quantile_regression.html#sphx-glr-python-examples-quantile-regression-py}{Quantile Regression} for a worked example.
+
+Version added: 2.0.0
+\item \code{"binary:logistic"}: logistic regression for binary classification, output probability
+\item \code{"binary:logitraw"}: logistic regression for binary classification, output score before logistic transformation
+\item \code{"binary:hinge"}: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
+\item \code{"count:poisson"}: Poisson regression for count data, output mean of Poisson distribution. \code{"max_delta_step"} is set to 0.7 by default in Poisson regression (used to safeguard optimization)
+\item \code{"survival:cox"}: Cox regression for right censored survival time data (negative values are considered right censored).
+
+Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional hazard function \code{h(t) = h0(t) * HR}).
+\item \code{"survival:aft"}: Accelerated failure time model for censored survival time data.
+See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} for details.
+\item \code{"multi:softmax"}: set XGBoost to do multiclass classification using the softmax objective; you also need to set \code{num_class} (number of classes)
+\item \code{"multi:softprob"}: same as softmax, but outputs a vector of \code{ndata * nclass}, which can be further reshaped to an \code{ndata * nclass} matrix. The result contains the predicted probability of each data point belonging to each class.
+\item \code{"rank:ndcg"}: Use LambdaMART to perform pair-wise ranking where \href{http://en.wikipedia.org/wiki/NDCG}{Normalized Discounted Cumulative Gain (NDCG)} is maximized.
This objective supports position debiasing for click data. +\item \code{"rank:map"}: Use LambdaMART to perform pair-wise ranking where \href{http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision}{Mean Average Precision (MAP)} is maximized +\item \code{"rank:pairwise"}: Use LambdaRank to perform pair-wise ranking using the \code{ranknet} objective. +\item \code{"reg:gamma"}: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications}{gamma-distributed}. +\item \code{"reg:tweedie"}: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications}{Tweedie-distributed}. +}} + +\item{verbosity}{(default=1) +Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 +(debug). Sometimes XGBoost tries to change configurations based on heuristics, which +is displayed as warning message. If there's unexpected behaviour, please try to +increase value of verbosity.} + +\item{nthread}{(default to maximum number of threads available if not set) +Number of parallel threads used to run XGBoost. When choosing it, please keep thread +contention and hyperthreading in mind.} + +\item{seed}{Random number seed. If not specified, will take a random seed through R's own RNG engine.} + +\item{booster}{(default= \code{"gbtree"}) +Which booster to use. Can be \code{"gbtree"}, \code{"gblinear"} or \code{"dart"}; \code{"gbtree"} and \code{"dart"} use tree based models while \code{"gblinear"} uses linear functions.} + +\item{eta, learning_rate}{(two aliases for the same parameter) (for Tree Booster) (default=0.3) +Step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and \code{eta} shrinks the feature weights to make the boosting process more conservative. + +range: \eqn{[0,1]} + +Note: should only pass one of \code{eta} or \code{learning_rate}. Both refer to the same parameter and there's thus no difference between one or the other.} + +\item{gamma, min_split_loss}{(two aliases for the same parameter) (for Tree Booster) (default=0, alias: \code{gamma}) +Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger \code{min_split_loss} is, the more conservative the algorithm will be. Note that a tree where no splits were made might still contain a single terminal node with a non-zero score. + +range: \eqn{[0, \infty)} + +Note: should only pass one of \code{gamma} or \code{min_split_loss}. Both refer to the same parameter and there's thus no difference between one or the other.} + +\item{max_depth}{(for Tree Booster) (default=6) +Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 indicates no limit on depth. Beware that XGBoost aggressively consumes memory when training a deep tree. \code{"exact"} tree method requires non-zero value. + +range: \eqn{[0, \infty)}} + +\item{min_child_weight}{(for Tree Booster) (default=1) +Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than \code{min_child_weight}, then the building process will give up further partitioning. 
In a linear regression task, this simply corresponds to the minimum number of instances needed to be in each node. The larger \code{min_child_weight} is, the more conservative the algorithm will be. + +range: \eqn{[0, \infty)}} + +\item{max_delta_step}{(for Tree Booster) (default=0) +Maximum delta step we allow each leaf output to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help make the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Setting it to a value of 1-10 might help control the update. + +range: \eqn{[0, \infty)}} + +\item{subsample}{(for Tree Booster) (default=1) +Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees, and this will prevent overfitting. Subsampling will occur once in every boosting iteration. + +range: \eqn{(0,1]}} + +\item{sampling_method}{(for Tree Booster) (default= \code{"uniform"}) +The method to use to sample the training instances. +\itemize{ +\item \code{"uniform"}: each training instance has an equal probability of being selected. Typically set +\code{"subsample"} >= 0.5 for good results. +\item \code{"gradient_based"}: the selection probability for each training instance is proportional to the +\bold{regularized absolute value} of gradients (more specifically, \eqn{\sqrt{g^2+\lambda h^2}}). +\code{"subsample"} may be set to as low as 0.1 without loss of model accuracy. Note that this +sampling method is only supported when \code{"tree_method"} is set to \code{"hist"} and the device is \code{"cuda"}; other tree +methods only support \code{"uniform"} sampling. +}} + +\item{colsample_bytree, colsample_bylevel, colsample_bynode}{(for Tree Booster) (default=1) +This is a family of parameters for subsampling of columns. +\itemize{ +\item All \code{"colsample_by*"} parameters have a range of \eqn{(0, 1]}, the default value of 1, and specify the fraction of columns to be subsampled. +\item \code{"colsample_bytree"} is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed. +\item \code{"colsample_bylevel"} is the subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree. +\item \code{"colsample_bynode"} is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level. This is not supported by the exact tree method. +\item \code{"colsample_by*"} parameters work cumulatively. For instance, +the combination \verb{\{'colsample_bytree'=0.5, 'colsample_bylevel'=0.5, 'colsample_bynode'=0.5\}} with 64 features will leave 8 features to choose from at +each split. +} + +One can set the \code{"feature_weights"} for DMatrix to +define the probability of each feature being selected when using column sampling.} + +\item{lambda, reg_lambda}{(two aliases for the same parameter) +\itemize{ +\item For tree-based boosters: +\itemize{ +\item L2 regularization term on weights. Increasing this value will make model more conservative. +\item default: 1 +\item range: \eqn{[0, \infty]} +} +\item For linear booster: +\itemize{ +\item L2 regularization term on weights. Increasing this value will make model more conservative.
Normalised to number of training examples. +\item default: 0 +\item range: \eqn{[0, \infty)} +} +} + +Note: should only pass one of \code{lambda} or \code{reg_lambda}. Both refer to the same parameter and there's thus no difference between one or the other.} + +\item{alpha, reg_alpha}{(two aliases for the same parameter) +\itemize{ +\item L1 regularization term on weights. Increasing this value will make model more conservative. +\item For the linear booster, it's normalised to number of training examples. +\item default: 0 +\item range: \eqn{[0, \infty)} +} + +Note: should only pass one of \code{alpha} or \code{reg_alpha}. Both refer to the same parameter and there's thus no difference between one or the other.} + +\item{tree_method}{(for Tree Booster) (default= \code{"auto"}) +The tree construction algorithm used in XGBoost. See description in the \href{http://arxiv.org/abs/1603.02754}{reference paper} and \href{https://xgboost.readthedocs.io/en/latest/treemethod.html}{Tree Methods}. + +Choices: \code{"auto"}, \code{"exact"}, \code{"approx"}, \code{"hist"}, this is a combination of commonly +used updaters. For other updaters like \code{"refresh"}, set the parameter \code{updater} +directly. +\itemize{ +\item \code{"auto"}: Same as the \code{"hist"} tree method. +\item \code{"exact"}: Exact greedy algorithm. Enumerates all split candidates. +\item \code{"approx"}: Approximate greedy algorithm using quantile sketch and gradient histogram. +\item \code{"hist"}: Faster histogram optimized approximate greedy algorithm. +}} + +\item{scale_pos_weight}{(for Tree Booster) (default=1) +Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: \verb{sum(negative instances) / sum(positive instances)}. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html}{Parameters Tuning} for more discussion. Also, see Higgs Kaggle competition demo for examples: \href{https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{R}, \href{https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py}{py1}, \href{https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py}{py2}, \href{https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py}{py3}.} + +\item{updater}{Has different meanings depending on the type of booster. +\itemize{ +\item For tree-based boosters: +A comma separated string defining the sequence of tree updaters to run, providing a modular way to construct and to modify the trees. This is an advanced parameter that is usually set automatically, depending on some other parameters. However, it could be also set explicitly by a user. The following updaters exist: +\itemize{ +\item \code{"grow_colmaker"}: non-distributed column-based construction of trees. +\item \code{"grow_histmaker"}: distributed tree construction with row-based data splitting based on global proposal of histogram counting. +\item \code{"grow_quantile_histmaker"}: Grow tree using quantized histogram. +\item \code{"grow_gpu_hist"}: Enabled when \code{tree_method} is set to \code{"hist"} along with \code{device="cuda"}. +\item \code{"grow_gpu_approx"}: Enabled when \code{tree_method} is set to \code{"approx"} along with \code{device="cuda"}. +\item \code{"sync"}: synchronizes trees in all distributed nodes. +\item \code{"refresh"}: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed. 
+\item \code{"prune"}: prunes the splits where loss < \code{min_split_loss} (or \code{gamma}) and nodes that have depth greater than \code{max_depth}. +} +\item For \code{booster="gblinear"}: +(default= \code{"shotgun"}) Choice of algorithm to fit linear model +\itemize{ +\item \code{"shotgun"}: Parallel coordinate descent algorithm based on shotgun algorithm. Uses 'hogwild' parallelism and therefore produces a nondeterministic solution on each run. +\item \code{"coord_descent"}: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution. When the \code{device} parameter is set to \code{"cuda"} or \code{"gpu"}, a GPU variant would be used. +} +}} + +\item{refresh_leaf}{(for Tree Booster) (default=1) +This is a parameter of the \code{"refresh"} updater. When this flag is 1, tree leafs as well as tree nodes' stats are updated. When it is 0, only node stats are updated.} + +\item{grow_policy}{(for Tree Booster) (default= \code{"depthwise"}) +\itemize{ +\item Controls a way new nodes are added to the tree. +\item Currently supported only if \code{tree_method} is set to \code{"hist"} or \code{"approx"}. +\item Choices: \code{"depthwise"}, \code{"lossguide"} +\itemize{ +\item \code{"depthwise"}: split at nodes closest to the root. +\item \code{"lossguide"}: split at nodes with highest loss change. +} +}} + +\item{max_leaves}{(for Tree Booster) (default=0) +Maximum number of nodes to be added. Not used by \code{"exact"} tree method.} + +\item{max_bin}{(for Tree Booster) (default=256) +\itemize{ +\item Only used if \code{tree_method} is set to \code{"hist"} or \code{"approx"}. +\item Maximum number of discrete bins to bucket continuous features. +\item Increasing this number improves the optimality of splits at the cost of higher computation time. +}} + +\item{num_parallel_tree}{(for Tree Booster) (default=1) +Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.} + +\item{monotone_constraints}{(for Tree Booster) +Constraint of variable monotonicity. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html}{Monotonic Constraints} for more information.} + +\item{interaction_constraints}{(for Tree Booster) +Constraints for interaction representing permitted interactions. The constraints must +be specified in the form of a nest list, e.g. \code{list(c(0, 1), c(2, 3, 4))}, where each inner +list is a group of indices of features (base-0 numeration) that are allowed to interact with each other. +See \href{https://xgboost.readthedocs.io/en/latest/tutorials/feature_interaction_constraint.html}{Feature Interaction Constraints} for more information.} + +\item{multi_strategy}{(for Tree Booster) (default = \code{"one_output_per_tree"}) +The strategy used for training multi-target models, including multi-target regression +and multi-class classification. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/multioutput.html}{Multiple Outputs} for more information. +\itemize{ +\item \code{"one_output_per_tree"}: One model for each target. +\item \code{"multi_output_tree"}: Use multi-target trees. +} + +Version added: 2.0.0 + +Note: This parameter is working-in-progress.} + +\item{base_score}{\itemize{ +\item The initial prediction score of all instances, global bias +\item The parameter is automatically estimated for selected objectives before training. To +disable the estimation, specify a real number argument. 
+\item If \code{base_margin} is supplied, \code{base_score} will not be added. +\item For sufficient number of iterations, changing this value will not have too much effect. +}} + +\item{eval_metric}{(default according to objective) +\itemize{ +\item Evaluation metrics for validation data, a default metric will be assigned according to objective (rmse for regression, and logloss for classification, \verb{mean average precision} for \code{rank:map}, etc.) +\item User can add multiple evaluation metrics. +\item The choices are listed below: +\itemize{ +\item \code{"rmse"}: \href{http://en.wikipedia.org/wiki/Root_mean_square_error}{root mean square error} +\item \code{"rmsle"}: root mean square log error: \eqn{\sqrt{\frac{1}{N}[log(pred + 1) - log(label + 1)]^2}}. Default metric of \code{"reg:squaredlogerror"} objective. This metric reduces errors generated by outliers in dataset. But because \code{log} function is employed, \code{"rmsle"} might output \code{nan} when prediction value is less than -1. See \code{"reg:squaredlogerror"} for other requirements. +\item \code{"mae"}: \href{https://en.wikipedia.org/wiki/Mean_absolute_error}{mean absolute error} +\item \code{"mape"}: \href{https://en.wikipedia.org/wiki/Mean_absolute_percentage_error}{mean absolute percentage error} +\item \code{"mphe"}: \href{https://en.wikipedia.org/wiki/Huber_loss}{mean Pseudo Huber error}. Default metric of \code{"reg:pseudohubererror"} objective. +\item \code{"logloss"}: \href{http://en.wikipedia.org/wiki/Log-likelihood}{negative log-likelihood} +\item \code{"error"}: Binary classification error rate. It is calculated as \verb{#(wrong cases)/#(all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. +\item \code{"error@t"}: a different than 0.5 binary classification threshold value could be specified by providing a numerical value through 't'. +\item \code{"merror"}: Multiclass classification error rate. It is calculated as \verb{#(wrong cases)/#(all cases)}. +\item \code{"mlogloss"}: \href{http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}{Multiclass logloss}. +\item \code{"auc"}: \href{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve}{Receiver Operating Characteristic Area under the Curve}. +Available for classification and learning-to-rank tasks. +\itemize{ +\item When used with binary classification, the objective should be \code{"binary:logistic"} or similar functions that work on probability. +\item When used with multi-class classification, objective should be \code{"multi:softprob"} instead of \code{"multi:softmax"}, as the latter doesn't output probability. Also the AUC is calculated by 1-vs-rest with reference class weighted by class prevalence. +\item When used with LTR task, the AUC is computed by comparing pairs of documents to count correctly sorted pairs. This corresponds to pairwise learning to rank. The implementation has some issues with average AUC around groups and distributed workers not being well-defined. +\item On a single machine the AUC calculation is exact. In a distributed environment the AUC is a weighted average over the AUC of training rows on each node - therefore, distributed AUC is an approximation sensitive to the distribution of data across workers. Use another metric in distributed environments if precision and reproducibility are important. 
+\item When input dataset contains only negative or positive samples, the output is \code{NaN}. The behavior is implementation defined, for instance, \code{scikit-learn} returns \eqn{0.5} instead. +} +\item \code{"aucpr"}: \href{https://en.wikipedia.org/wiki/Precision_and_recall}{Area under the PR curve}. +Available for classification and learning-to-rank tasks. + +After XGBoost 1.6, both of the requirements and restrictions for using \code{"aucpr"} in classification problem are similar to \code{"auc"}. For ranking task, only binary relevance label \eqn{y \in [0, 1]} is supported. Different from \code{"map"} (mean average precision), \code{"aucpr"} calculates the \emph{interpolated} area under precision recall curve using continuous interpolation. +\item \code{"pre"}: Precision at \eqn{k}. Supports only learning to rank task. +\item \code{"ndcg"}: \href{http://en.wikipedia.org/wiki/NDCG}{Normalized Discounted Cumulative Gain} +\item \code{"map"}: \href{http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision}{Mean Average Precision} + +The \verb{average precision} is defined as: + +\eqn{AP@l = \frac{1}{\min(l, N)} \sum^l_{k=1} P@k \cdot I_{(k)}} + +where \eqn{I_{(k)}} is an indicator function that equals \eqn{1} when the document at \eqn{k} is relevant and \eqn{0} otherwise. The \eqn{P@k} is the precision at \eqn{k}, and \eqn{N} is the total number of relevant documents. Lastly, the \verb{mean average precision} is defined as the weighted average across all queries. +\item \code{"ndcg@n"}, \code{"map@n"}, \code{"pre@n"}: \eqn{n} can be assigned as an integer to cut off the top positions in the lists for evaluation. +\item \code{"ndcg-"}, \code{"map-"}, \code{"ndcg@n-"}, \code{"map@n-"}: In XGBoost, the NDCG and MAP evaluate the score of a list without any positive samples as \eqn{1}. By appending "-" to the evaluation metric name, we can ask XGBoost to evaluate these scores as \eqn{0} to be consistent under some conditions. +\item \code{"poisson-nloglik"}: negative log-likelihood for Poisson regression +\item \code{"gamma-nloglik"}: negative log-likelihood for gamma regression +\item \code{"cox-nloglik"}: negative partial log-likelihood for Cox proportional hazards regression +\item \code{"gamma-deviance"}: residual deviance for gamma regression +\item \code{"tweedie-nloglik"}: negative log-likelihood for Tweedie regression (at a specified value of the \code{tweedie_variance_power} parameter) +\item \code{"aft-nloglik"}: Negative log likelihood of Accelerated Failure Time model. +See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} for details. +\item \code{"interval-regression-accuracy"}: Fraction of data points whose predicted labels fall in the interval-censored labels. +Only applicable for interval-censored data. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} for details. +} +}} + +\item{seed_per_iteration}{(default= \code{FALSE}) +Seed the PRNG deterministically via iteration number.} + +\item{device}{(default= \code{"cpu"}) +Device for XGBoost to run. User can set it to one of the following values: +\itemize{ +\item \code{"cpu"}: Use CPU. +\item \code{"cuda"}: Use a GPU (CUDA device). +\item \code{"cuda:<ordinal>"}: \verb{<ordinal>} is an integer that specifies the ordinal of the GPU (which GPU to use if you have more than one device).
+\item \code{"gpu"}: Default GPU device selection from the list of available and supported devices. Only \code{"cuda"} devices are supported currently. +\item \code{"gpu:"}: Default GPU device selection from the list of available and supported devices. Only \code{"cuda"} devices are supported currently. +} + +For more information about GPU acceleration, see \href{https://xgboost.readthedocs.io/en/latest/gpu/index.html}{XGBoost GPU Support}. In distributed environments, ordinal selection is handled by distributed frameworks instead of XGBoost. As a result, using \code{"cuda:"} will result in an error. Use \code{"cuda"} instead. + +Version added: 2.0.0 + +Note: if XGBoost was installed from CRAN, it won't have GPU support enabled, thus only \code{"cpu"} will be available. +To get GPU support, the R package for XGBoost must be installed from source or from the GitHub releases - see +\href{https://xgboost.readthedocs.io/en/latest/install.html#r}{instructions}.} + +\item{disable_default_eval_metric}{(default= \code{FALSE}) +Flag to disable default metric. Set to 1 or \code{TRUE} to disable.} + +\item{use_rmm}{Whether to use RAPIDS Memory Manager (RMM) to allocate cache GPU +memory. The primary memory is always allocated on the RMM pool when XGBoost is built +(compiled) with the RMM plugin enabled. Valid values are \code{TRUE} and \code{FALSE}. See +\href{https://xgboost.readthedocs.io/en/latest/python/rmm-examples/index.html}{Using XGBoost with RAPIDS Memory Manager (RMM) plugin} for details.} + +\item{max_cached_hist_node}{(for Non-Exact Tree Methods) (default = 65536) +Maximum number of cached nodes for histogram. This can be used with the \code{"hist"} and the +\code{"approx"} tree methods. + +Version added: 2.0.0 +\itemize{ +\item For most of the cases this parameter should not be set except for growing deep +trees. After 3.0, this parameter affects GPU algorithms as well. +}} + +\item{extmem_single_page}{(for Non-Exact Tree Methods) (default = \code{FALSE}) +This parameter is only used for the \code{"hist"} tree method with \code{device="cuda"} and +\code{subsample != 1.0}. Before 3.0, pages were always concatenated. + +Version added: 3.0.0 + +Whether the GPU-based \code{"hist"} tree method should concatenate the training data into a +single batch instead of fetching data on-demand when external memory is used. For GPU +devices that don't support address translation services, external memory training is +expensive. This parameter can be used in combination with subsampling to reduce overall +memory usage without significant overhead. See \href{https://xgboost.readthedocs.io/en/latest/tutorials/external_memory.html}{Using XGBoost External Memory Version} for +more information.} + +\item{max_cat_to_onehot}{(for Non-Exact Tree Methods) +A threshold for deciding whether XGBoost should use one-hot encoding based split for +categorical data. When number of categories is lesser than the threshold then one-hot +encoding is chosen, otherwise the categories will be partitioned into children nodes. + +Version added: 1.6.0} + +\item{max_cat_threshold}{(for Non-Exact Tree Methods) +Maximum number of categories considered for each split. Used only by partition-based +splits for preventing over-fitting. + +Version added: 1.7.0} + +\item{sample_type}{(for Dart Booster) (default= \code{"uniform"}) +Type of sampling algorithm. +\itemize{ +\item \code{"uniform"}: dropped trees are selected uniformly. +\item \code{"weighted"}: dropped trees are selected in proportion to weight. 
+}} + +\item{normalize_type}{(for Dart Booster) (default= \code{"tree"}) +Type of normalization algorithm. +\itemize{ +\item \code{"tree"}: new trees have the same weight of each of dropped trees. +\itemize{ +\item Weight of new trees are \code{1 / (k + learning_rate)}. +\item Dropped trees are scaled by a factor of \code{k / (k + learning_rate)}. +} +\item \code{"forest"}: new trees have the same weight of sum of dropped trees (forest). +\itemize{ +\item Weight of new trees are \code{1 / (1 + learning_rate)}. +\item Dropped trees are scaled by a factor of \code{1 / (1 + learning_rate)}. +} +}} + +\item{rate_drop}{(for Dart Booster) (default=0.0) +Dropout rate (a fraction of previous trees to drop during the dropout). + +range: \eqn{[0.0, 1.0]}} + +\item{one_drop}{(for Dart Booster) (default=0) +When this flag is enabled, at least one tree is always dropped during the dropout (allows Binomial-plus-one or epsilon-dropout from the original DART paper).} + +\item{skip_drop}{(for Dart Booster) (default=0.0) +Probability of skipping the dropout procedure during a boosting iteration. +\itemize{ +\item If a dropout is skipped, new trees are added in the same manner as \code{"gbtree"}. +\item Note that non-zero \code{skip_drop} has higher priority than \code{rate_drop} or \code{one_drop}. +} + +range: \eqn{[0.0, 1.0]}} + +\item{feature_selector}{(for Linear Booster) (default= \code{"cyclic"}) +Feature selection and ordering method +\itemize{ +\item \code{"cyclic"}: Deterministic selection by cycling through features one at a time. +\item \code{"shuffle"}: Similar to \code{"cyclic"} but with random feature shuffling prior to each update. +\item \code{"random"}: A random (with replacement) coordinate selector. +\item \code{"greedy"}: Select coordinate with the greatest gradient magnitude. It has \code{O(num_feature^2)} complexity. It is fully deterministic. It allows restricting the selection to \code{top_k} features per group with the largest magnitude of univariate weight change, by setting the \code{top_k} parameter. Doing so would reduce the complexity to \code{O(num_feature*top_k)}. +\item \code{"thrifty"}: Thrifty, approximately-greedy feature selector. Prior to cyclic updates, reorders features in descending magnitude of their univariate weight changes. This operation is multithreaded and is a linear complexity approximation of the quadratic greedy selection. It allows restricting the selection to \code{top_k} features per group with the largest magnitude of univariate weight change, by setting the \code{top_k} parameter. +}} + +\item{top_k}{(for Linear Booster) (default=0) +The number of top features to select in \code{greedy} and \code{thrifty} feature selector. The value of 0 means using all the features.} + +\item{num_class}{Number of classes when using multi-class classification objectives (e.g. \code{objective="multi:softprob"})} + +\item{tweedie_variance_power}{(for Tweedie Regression (\code{"objective=reg:tweedie"})) (default=1.5) +\itemize{ +\item Parameter that controls the variance of the Tweedie distribution \code{var(y) ~ E(y)^tweedie_variance_power} +\item range: \eqn{(1,2)} +\item Set closer to 2 to shift towards a gamma distribution +\item Set closer to 1 to shift towards a Poisson distribution. 
+}} + +\item{huber_slope}{(for using Pseudo-Huber (\code{"reg:pseudohubererror"})) (default = 1.0) +A parameter used for Pseudo-Huber loss to define the \eqn{\delta} term.} + +\item{quantile_alpha}{(for using Quantile Loss (\code{"reg:quantileerror"})) +A scalar or a list of targeted quantiles (passed as a numeric vector). + +Version added: 2.0.0} + +\item{aft_loss_distribution}{(for using AFT Survival Loss (\code{"survival:aft"}) and Negative Log Likelihood of AFT metric (\code{"aft-nloglik"})) +Probability Density Function, \code{"normal"}, \code{"logistic"}, or \code{"extreme"}.} + +\item{lambdarank_pair_method}{(for learning to rank (\code{"rank:ndcg"}, \code{"rank:map"}, \code{"rank:pairwise"})) (default = \code{"topk"}) +How to construct pairs for pair-wise learning. +\itemize{ +\item \code{"mean"}: Sample \code{lambdarank_num_pair_per_sample} pairs for each document in the query list. +\item \code{"topk"}: Focus on top-\code{lambdarank_num_pair_per_sample} documents. Construct \eqn{|query|} pairs for each document at the top-\code{lambdarank_num_pair_per_sample} ranked by the model. +}} + +\item{lambdarank_num_pair_per_sample}{(for learning to rank (\code{"rank:ndcg"}, \code{"rank:map"}, \code{"rank:pairwise"})) +It specifies the number of pairs sampled for each document when the pair method is \code{"mean"}, or the truncation level for queries when the pair method is \code{"topk"}. For example, to train with \verb{ndcg@6}, set \code{"lambdarank_num_pair_per_sample"} to \eqn{6} and \code{lambdarank_pair_method} to \code{"topk"}. + +range = \eqn{[1, \infty)}} + +\item{lambdarank_normalization}{(for learning to rank (\code{"rank:ndcg"}, \code{"rank:map"}, \code{"rank:pairwise"})) (default = \code{TRUE}) +Whether to normalize the leaf value by lambda gradient. This can sometimes stagnate the training progress. + +Version added: 2.1.0} + +\item{lambdarank_unbiased}{(for learning to rank (\code{"rank:ndcg"}, \code{"rank:map"}, \code{"rank:pairwise"})) (default = \code{FALSE}) +Specify whether we need to debias input click data.} + +\item{lambdarank_bias_norm}{(for learning to rank (\code{"rank:ndcg"}, \code{"rank:map"}, \code{"rank:pairwise"})) (default = 2.0) +\eqn{L_p} normalization for position debiasing, default is \eqn{L_2}. Only relevant when \code{lambdarank_unbiased} is set to \code{TRUE}.} + +\item{ndcg_exp_gain}{(for learning to rank (\code{"rank:ndcg"}, \code{"rank:map"}, \code{"rank:pairwise"})) (default = \code{TRUE}) +Whether we should use an exponential gain function for \code{NDCG}. There are two forms of gain function for \code{NDCG}, one is using the relevance value directly while the other is using \eqn{2^{rel} - 1} to emphasize retrieving relevant documents. When \code{ndcg_exp_gain} is \code{TRUE} (the default), relevance degree cannot be greater than 31.} +} +\value{ +A list with the entries that were passed non-NULL values. It is intended to +be passed as argument \code{params} to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}. +} +\description{ +Convenience function to generate a list of named XGBoost parameters, which +can be passed as argument \code{params} to \code{\link[=xgb.train]{xgb.train()}}. See the \href{https://xgboost.readthedocs.io/en/stable/parameter.html}{online documentation} for more details. + +The purpose of this function is to enable IDE autocompletions and to provide in-package +documentation for all the possible parameters that XGBoost accepts.
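For instance, a minimal usage sketch (illustrative only, not part of this diff; the agaricus data shipped with the package and the parameter values are assumptions):

  library(xgboost)
  data(agaricus.train, package = "xgboost")
  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
  # Build a named parameter list with IDE-discoverable arguments ...
  params <- xgb.params(
    objective = "binary:logistic",
    max_depth = 3,
    eta = 0.3,
    nthread = 2
  )
  # ... and pass it to xgb.train() as the 'params' argument.
  bst <- xgb.train(params = params, data = dtrain, nrounds = 10)
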
The output from this +function is just a regular R list containing the parameters that were set to non-default +values. Note that this function will not perform any validation on the supplied arguments. + +If passing \code{NULL} for a given parameter (the default for all of them), then the default +value for that parameter will be used. Default values are automatically determined by the +XGBoost core library upon calls to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}, and are subject to change +over XGBoost library versions. +} diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd index 536bb98c8436..e8729b7ca9be 100644 --- a/R-package/man/xgb.plot.deepness.Rd +++ b/R-package/man/xgb.plot.deepness.Rd @@ -76,12 +76,14 @@ data.table::setDTthreads(nthread) ## Change max_depth to a higher number to get a more significant result bst <- xgb.train( data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), - max_depth = 6, - nthread = nthread, nrounds = 50, - objective = "binary:logistic", - subsample = 0.5, - min_child_weight = 2 + params = xgb.params( + max_depth = 6, + nthread = nthread, + objective = "binary:logistic", + subsample = 0.5, + min_child_weight = 2 + ) ) xgb.plot.deepness(bst) diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index 6b26bec2a86d..7b9dc40d2450 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -90,11 +90,13 @@ data.table::setDTthreads(nthread) bst <- xgb.train( data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), - max_depth = 3, - eta = 1, - nthread = nthread, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 3, + eta = 1, + nthread = nthread, + objective = "binary:logistic" + ) ) importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst) diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd index df72ee452ee6..989096e60e42 100644 --- a/R-package/man/xgb.plot.multi.trees.Rd +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -26,7 +26,13 @@ The values are passed to \code{DiagrammeR::render_graph()}.} \item{render}{Should the graph be rendered or not? The default is \code{TRUE}.} -\item{...}{Currently not used.} +\item{...}{Not used. + +Some arguments are currently deprecated or have been renamed. If a deprecated argument +is passed, will throw a warning and use its current equivalent. + +If some additional argument is passed that is neither a current function argument nor +a deprecated argument, an error will be thrown.} } \value{ Rendered graph object which is an htmlwidget of ' class \code{grViz}. 
Similar to @@ -64,13 +70,15 @@ data.table::setDTthreads(nthread) bst <- xgb.train( data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), - max_depth = 15, - eta = 1, - nthread = nthread, nrounds = 30, - objective = "binary:logistic", - min_child_weight = 50, - verbose = 0 + verbose = 0, + params = xgb.params( + max_depth = 15, + eta = 1, + nthread = nthread, + objective = "binary:logistic", + min_child_weight = 50 + ) ) p <- xgb.plot.multi.trees(model = bst, features_keep = 3) diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd index 969a7d103c62..7bdd5ad2bfac 100644 --- a/R-package/man/xgb.plot.shap.Rd +++ b/R-package/man/xgb.plot.shap.Rd @@ -137,12 +137,14 @@ nrounds <- 20 bst <- xgb.train( data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), nrounds = nrounds, - eta = 0.1, - max_depth = 3, - subsample = 0.5, - objective = "binary:logistic", - nthread = nthread, - verbose = 0 + verbose = 0, + params = xgb.params( + eta = 0.1, + max_depth = 3, + subsample = 0.5, + objective = "binary:logistic", + nthread = nthread + ) ) xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") @@ -162,13 +164,15 @@ is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values mbst <- xgb.train( data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1), nrounds = nrounds, - max_depth = 2, - eta = 0.3, - subsample = 0.5, - nthread = nthread, - objective = "multi:softprob", - num_class = nclass, - verbose = 0 + verbose = 0, + params = xgb.params( + max_depth = 2, + eta = 0.3, + subsample = 0.5, + nthread = nthread, + objective = "multi:softprob", + num_class = nclass + ) ) trees0 <- seq(from = 0, by = nclass, length.out = nrounds) col <- rgb(0, 0, 1, 0.5) diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index c58187d0f520..ebcc8603fe98 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -29,7 +29,13 @@ splits. When this option is on, the model dump contains two additional values: gain is the approximate loss function gain we get in each split; cover is the sum of second order gradient in each node.} -\item{...}{Currently not used.} +\item{...}{Not used. + +Some arguments are currently deprecated or have been renamed. If a deprecated argument +is passed, will throw a warning and use its current equivalent. + +If some additional argument is passed that is neither a current function argument nor +a deprecated argument, an error will be thrown.} } \value{ Rendered graph object which is an htmlwidget of ' class \code{grViz}. 
Similar to @@ -62,11 +68,13 @@ data(agaricus.train, package = "xgboost") bst <- xgb.train( data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), - max_depth = 3, - eta = 1, - nthread = 2, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 3, + eta = 1, + nthread = 2, + objective = "binary:logistic" + ) ) # plot the first tree diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index ec9ab63f717c..738f947405c7 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -52,11 +52,13 @@ test <- agaricus.test bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - max_depth = 2, - eta = 1, - nthread = nthread, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = nthread, + objective = "binary:logistic" + ) ) fname <- file.path(tempdir(), "xgb.ubj") diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index d5b0d7cc9d6c..24f190a88c30 100644 --- a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -34,11 +34,13 @@ test <- agaricus.test bst <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - max_depth = 2, - eta = 1, - nthread = nthread, nrounds = 2, - objective = "binary:logistic" + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = nthread, + objective = "binary:logistic" + ) ) raw <- xgb.save.raw(bst) diff --git a/R-package/man/xgb.slice.Booster.Rd b/R-package/man/xgb.slice.Booster.Rd index d245ced1bccf..294a51b5aa32 100644 --- a/R-package/man/xgb.slice.Booster.Rd +++ b/R-package/man/xgb.slice.Booster.Rd @@ -47,7 +47,7 @@ y <- mtcars$mpg x <- as.matrix(mtcars[, -1]) dm <- xgb.DMatrix(x, label = y, nthread = 1) -model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 5) +model <- xgb.train(data = dm, params = xgb.params(nthread = 1), nrounds = 5) model_slice <- xgb.slice.Booster(model, 1, 3) # Prediction for first three rounds predict(model, x, predleaf = TRUE)[, 1:3] diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index be4290d9806d..2a2d45c78924 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -5,12 +5,12 @@ \title{eXtreme Gradient Boosting Training} \usage{ xgb.train( - params = list(), + params = xgb.params(), data, nrounds, evals = list(), - obj = NULL, - feval = NULL, + objective = NULL, + custom_metric = NULL, verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL, @@ -23,137 +23,35 @@ xgb.train( ) } \arguments{ -\item{params}{the list of parameters. The complete list of parameters is -available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. -Below is a shorter summary: +\item{params}{List of XGBoost parameters which control the model building process. +See the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation} +and the documentation for \code{\link[=xgb.params]{xgb.params()}} for details. -\strong{1. General Parameters} -\itemize{ -\item \code{booster}: Which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}. -} - -\strong{2. Booster Parameters} +Should be passed as list with named entries. Parameters that are not specified in this +list will use their default values. -\strong{2.1. Parameters for Tree Booster} -\itemize{ -\item \code{eta}: The learning rate: scale the contribution of each tree by a factor of \verb{0 < eta < 1} -when it is added to the current approximation. 
-Used to prevent overfitting by making the boosting process more conservative. -Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model -more robust to overfitting but slower to compute. Default: 0.3. -\item \code{gamma}: Minimum loss reduction required to make a further partition on a leaf node of the tree. -the larger, the more conservative the algorithm will be. -\item \code{max_depth}: Maximum depth of a tree. Default: 6. -\item \code{min_child_weight}: Minimum sum of instance weight (hessian) needed in a child. -If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, -then the building process will give up further partitioning. -In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. -The larger, the more conservative the algorithm will be. Default: 1. -\item \code{subsample}: Subsample ratio of the training instance. -Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees -and this will prevent overfitting. It makes computation shorter (because less data to analyse). -It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1. -\item \code{colsample_bytree}: Subsample ratio of columns when constructing each tree. Default: 1. -\item \code{lambda}: L2 regularization term on weights. Default: 1. -\item \code{alpha}: L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0. -\item \code{num_parallel_tree}: Experimental parameter. number of trees to grow per round. -Useful to test Random Forest through XGBoost. -(set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. -Default: 1. -\item \code{monotone_constraints}: A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length -equals to the number of features in the training data. -\code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint. -\item \code{interaction_constraints}: A list of vectors specifying feature indices of permitted interactions. -Each item of the list represents one permitted interaction where specified features are allowed to interact with each other. -Feature index values should start from \code{0} (\code{0} references the first column). -Leave argument unspecified for no interaction constraints. -} - -\strong{2.2. Parameters for Linear Booster} -\itemize{ -\item \code{lambda}: L2 regularization term on weights. Default: 0. -\item \code{lambda_bias}: L2 regularization term on bias. Default: 0. -\item \code{alpha}: L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0. -} - -\strong{3. Task Parameters} -\itemize{ -\item \code{objective}: Specifies the learning task and the corresponding learning objective. -users can pass a self-defined function to it. The default objective options are below: -\itemize{ -\item \code{reg:squarederror}: Regression with squared loss (default). -\item \code{reg:squaredlogerror}: Regression with squared log loss \eqn{1/2 \cdot (\log(pred + 1) - \log(label + 1))^2}. -All inputs are required to be greater than -1. -Also, see metric rmsle for possible issue with this objective. -\item \code{reg:logistic}: Logistic regression. -\item \code{reg:pseudohubererror}: Regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss. 
-\item \code{binary:logistic}: Logistic regression for binary classification. Output probability. -\item \code{binary:logitraw}: Logistic regression for binary classification, output score before logistic transformation. -\item \code{binary:hinge}: Hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities. -\item \code{count:poisson}: Poisson regression for count data, output mean of Poisson distribution. -The parameter \code{max_delta_step} is set to 0.7 by default in poisson regression -(used to safeguard optimization). -\item \code{survival:cox}: Cox regression for right censored survival time data (negative values are considered right censored). -Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional -hazard function \eqn{h(t) = h_0(t) \cdot HR}. -\item \code{survival:aft}: Accelerated failure time model for censored survival time data. See -\href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time} -for details. -The parameter \code{aft_loss_distribution} specifies the Probability Density Function -used by \code{survival:aft} and the \code{aft-nloglik} metric. -\item \code{multi:softmax}: Set xgboost to do multiclass classification using the softmax objective. -Class is represented by a number and should be from 0 to \code{num_class - 1}. -\item \code{multi:softprob}: Same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be -further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging -to each class. -\item \code{rank:pairwise}: Set XGBoost to do ranking task by minimizing the pairwise loss. -\item \code{rank:ndcg}: Use LambdaMART to perform list-wise ranking where -\href{https://en.wikipedia.org/wiki/Discounted_cumulative_gain}{Normalized Discounted Cumulative Gain (NDCG)} is maximized. -\item \code{rank:map}: Use LambdaMART to perform list-wise ranking where -\href{https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision}{Mean Average Precision (MAP)} -is maximized. -\item \code{reg:gamma}: Gamma regression with log-link. Output is a mean of gamma distribution. -It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be -\href{https://en.wikipedia.org/wiki/Gamma_distribution#Applications}{gamma-distributed}. -\item \code{reg:tweedie}: Tweedie regression with log-link. -It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be -\href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}. -} - -For custom objectives, one should pass a function taking as input the current predictions (as a numeric -vector or matrix) and the training data (as an \code{xgb.DMatrix} object) that will return a list with elements -\code{grad} and \code{hess}, which should be numeric vectors or matrices with number of rows matching to the numbers -of rows in the training data (same shape as the predictions that are passed as input to the function). -For multi-valued custom objectives, should have shape \verb{[nrows, ntargets]}. Note that negative values of -the Hessian will be clipped, so one might consider using the expected Hessian (Fisher information) if the -objective is non-convex. 
- -See the tutorials \href{https://xgboost.readthedocs.io/en/stable/tutorials/custom_metric_obj.html}{Custom Objective and Evaluation Metric} -and \href{https://xgboost.readthedocs.io/en/latest/tutorials/advanced_custom_obj.html}{Advanced Usage of Custom Objectives} -for more information about custom objectives. -\item \code{base_score}: The initial prediction score of all instances, global bias. Default: 0.5. -\item \code{eval_metric}: Evaluation metrics for validation data. -Users can pass a self-defined function to it. -Default: metric will be assigned according to objective -(rmse for regression, and error for classification, mean average precision for ranking). -List is provided in detail section. -}} +A list of named parameters can be created through the function \code{\link[=xgb.params]{xgb.params()}}, which +accepts all valid parameters as function arguments.} \item{data}{Training dataset. \code{xgb.train()} accepts only an \code{xgb.DMatrix} as the input. -\code{\link[=xgboost]{xgboost()}}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.} + +Note that there is a function \code{\link[=xgboost]{xgboost()}} which is meant to accept R data objects +as inputs, such as data frames and matrices.} \item{nrounds}{Max number of boosting iterations.} \item{evals}{Named list of \code{xgb.DMatrix} datasets to use for evaluating model performance. -Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each -of these datasets during each boosting iteration, and stored in the end as a field named -\code{evaluation_log} in the resulting object. When either \code{verbose>=1} or -\code{\link[=xgb.cb.print.evaluation]{xgb.cb.print.evaluation()}} callback is engaged, the performance results are continuously -printed out during the training. +Metrics specified in either \code{eval_metric} (under params) or \code{custom_metric} (function +argument here) will be computed for each of these datasets during each boosting iteration, +and stored in the end as a field named \code{evaluation_log} in the resulting object. + +When either \code{verbose>=1} or \code{\link[=xgb.cb.print.evaluation]{xgb.cb.print.evaluation()}} callback is engaged, the performance +results are continuously printed out during the training. + E.g., specifying \code{evals=list(validation1=mat1, validation2=mat2)} allows to track -the performance of each round's model on mat1 and mat2.} +the performance of each round's model on \code{mat1} and \code{mat2}.} -\item{obj}{Customized objective function. Should take two arguments: the first one will be the +\item{objective}{Customized objective function. Should take two arguments: the first one will be the current predictions (either a numeric vector or matrix depending on the number of targets / classes), and the second one will be the \code{data} DMatrix object that is used for training. @@ -161,14 +59,14 @@ It should return a list with two elements \code{grad} and \code{hess} (in that o numeric vectors or numeric matrices depending on the number of targets / classes (same dimension as the predictions that are passed as first argument).} -\item{feval}{Customized evaluation function. Just like \code{obj}, should take two arguments, with -the first one being the predictions and the second one the \code{data} DMatrix. +\item{custom_metric}{Customized evaluation function. Just like \code{objective}, should take two arguments, +with the first one being the predictions and the second one the \code{data} DMatrix. 
Should return a list with two elements \code{metric} (name that will be displayed for this metric, should be a string / character), and \code{value} (the number that the function calculates, should be a numeric scalar). -Note that even if passing \code{feval}, objectives also have an associated default metric that +Note that even if passing \code{custom_metric}, objectives also have an associated default metric that will be evaluated in addition to it. In order to disable the built-in metric, one can pass parameter \code{disable_default_eval_metric = TRUE}.} @@ -177,13 +75,22 @@ If 2, some additional information will be printed out. Note that setting \code{verbose > 0} automatically engages the \code{xgb.cb.print.evaluation(period=1)} callback function.} -\item{print_every_n}{Print each nth iteration evaluation messages when \code{verbose>0}. -Default is 1 which means all messages are printed. This parameter is passed to the -\code{\link[=xgb.cb.print.evaluation]{xgb.cb.print.evaluation()}} callback.} +\item{print_every_n}{When passing \code{verbose>0}, evaluation logs (metrics calculated on the +data passed under \code{evals}) will be printed every nth iteration according to the value passed +here. The first and last iteration are always included regardless of this 'n'. + +Only has an effect when passing data under \code{evals} and when passing \code{verbose>0}. The parameter +is passed to the \code{\link[=xgb.cb.print.evaluation]{xgb.cb.print.evaluation()}} callback.} -\item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered. -If set to an integer \code{k}, training with a validation set will stop if the performance -doesn't improve for \code{k} rounds. Setting this parameter engages the \code{\link[=xgb.cb.early.stop]{xgb.cb.early.stop()}} callback.} +\item{early_stopping_rounds}{Number of boosting rounds after which training will be stopped +if there is no improvement in performance (as measured by the evaluation metric that is +supplied or selected by default for the objective) on the evaluation data passed under +\code{evals}. + +Must pass \code{evals} in order to use this functionality. Setting this parameter adds the +\code{\link[=xgb.cb.early.stop]{xgb.cb.early.stop()}} callback. + +If \code{NULL}, early stopping will not be used.} \item{maximize}{If \code{feval} and \code{early_stopping_rounds} are set, then this parameter must be set as well. When it is \code{TRUE}, it means the larger the evaluation score the better. @@ -208,7 +115,13 @@ such as an evaluation log (a \code{data.table} object) - be aware that these obj as R attributes, and thus do not get saved when using XGBoost's own serializaters like \code{\link[=xgb.save]{xgb.save()}} (but are kept when using R serializers like \code{\link[=saveRDS]{saveRDS()}}).} -\item{...}{other parameters to pass to \code{params}.} +\item{...}{Not used. + +Some arguments are currently deprecated or have been renamed. If a deprecated argument +is passed, will throw a warning and use its current equivalent. + +If some additional argument is passed that is neither a current function argument nor +a deprecated argument, an error will be thrown.} } \value{ An object of class \code{xgb.Booster}. @@ -218,41 +131,18 @@ An object of class \code{xgb.Booster}. The \code{\link[=xgboost]{xgboost()}} function is a simpler wrapper for \code{xgb.train()}. } \details{ -These are the training functions for \code{\link[=xgboost]{xgboost()}}.
- -The \code{xgb.train()} interface supports advanced features such as \code{evals}, -customized objective and evaluation metric functions, therefore it is more flexible -than the \code{\link[=xgboost]{xgboost()}} interface. +Compared to \code{\link[=xgboost]{xgboost()}}, the \code{xgb.train()} interface supports advanced features such as +\code{evals}, customized objective and evaluation metric functions, among others, with the +difference that these work with \code{xgb.DMatrix} objects and do not follow typical R idioms. Parallelization is automatically enabled if OpenMP is present. Number of threads can also be manually specified via the \code{nthread} parameter. -While in other interfaces, the default random seed defaults to zero, in R, if a parameter \code{seed} +While in other XGBoost language bindings the default random seed is zero, in R, if a parameter \code{seed} is not manually supplied, it will generate a random seed through R's own random number generator, whose seed in turn is controllable through \code{set.seed}. If \code{seed} is passed, it will override the RNG from R. -The evaluation metric is chosen automatically by XGBoost (according to the objective) -when the \code{eval_metric} parameter is not provided. -User may set one or several \code{eval_metric} parameters. -Note that when using a customized metric, only this single metric can be used. -The following is the list of built-in metrics for which XGBoost provides optimized implementation: -\itemize{ -\item \code{rmse}: Root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error} -\item \code{logloss}: Negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood} -\item \code{mlogloss}: Multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html} -\item \code{error}: Binary classification error rate. It is calculated as \verb{(# wrong cases) / (# all cases)}. -By default, it uses the 0.5 threshold for predicted values to define negative and positive instances. -Different threshold (e.g., 0.) could be specified as \verb{error@0}. -\item \code{merror}: Multiclass classification error rate. It is calculated as \verb{(# wrong cases) / (# all cases)}. -\item \code{mae}: Mean absolute error. -\item \code{mape}: Mean absolute percentage error. -\item \code{auc}: Area under the curve. -\url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. -\item \code{aucpr}: Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation. -\item \code{ndcg}: Normalized Discounted Cumulative Gain (for ranking task).
\url{https://en.wikipedia.org/wiki/NDCG} -} - The following callbacks are automatically created when certain parameters are set: \itemize{ \item \code{\link[=xgb.cb.print.evaluation]{xgb.cb.print.evaluation()}} is turned on when \code{verbose > 0} and the \code{print_every_n} @@ -275,7 +165,7 @@ Be aware that one such R attribute that is automatically added is \code{params} is assigned from the \code{params} argument to this function, and is only meant to serve as a reference for what went into the booster, but is not used in other methods that take a booster object - so for example, changing the booster's configuration requires calling \verb{xgb.config<-} -or \verb{xgb.parameters<-}, while simply modifying \verb{attributes(model)$params$<...>} will have no +or \verb{xgb.model.parameters<-}, while simply modifying \verb{attributes(model)$params$<...>} will have no effect elsewhere. } \examples{ @@ -295,7 +185,7 @@ dtest <- with( evals <- list(train = dtrain, eval = dtest) ## A simple xgb.train example: -param <- list( +param <- xgb.params( max_depth = 2, eta = 1, nthread = nthread, @@ -319,9 +209,9 @@ evalerror <- function(preds, dtrain) { return(list(metric = "error", value = err)) } -# These functions could be used by passing them either: -# as 'objective' and 'eval_metric' parameters in the params list: -param <- list( +# These functions could be used by passing them as 'objective' and +# 'eval_metric' parameters in the params list: +param <- xgb.params( max_depth = 2, eta = 1, nthread = nthread, @@ -330,26 +220,16 @@ param <- list( ) bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0) -# or through the ... arguments: -param <- list(max_depth = 2, eta = 1, nthread = nthread) +# ... or as dedicated 'objective' and 'custom_metric' parameters of xgb.train: bst <- xgb.train( - param, - dtrain, - nrounds = 2, - evals = evals, - verbose = 0, - objective = logregobj, - eval_metric = evalerror -) - -# or as dedicated 'obj' and 'feval' parameters of xgb.train: -bst <- xgb.train( - param, dtrain, nrounds = 2, evals = evals, obj = logregobj, feval = evalerror + within(param, rm("objective", "eval_metric")), + dtrain, nrounds = 2, evals = evals, + objective = logregobj, custom_metric = evalerror ) ## An xgb.train example of using variable learning rates at each iteration: -param <- list( +param <- xgb.params( max_depth = 2, eta = 1, nthread = nthread, @@ -371,17 +251,6 @@ bst <- xgb.train( bst <- xgb.train( param, dtrain, nrounds = 25, evals = evals, early_stopping_rounds = 3 ) - -## An 'xgboost' interface example: -bst <- xgboost( - x = agaricus.train$data, - y = factor(agaricus.train$label), - params = list(max_depth = 2, eta = 1), - nthread = nthread, - nrounds = 2 -) -pred <- predict(bst, agaricus.test$data) - } \references{ Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System", diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index ab6c9ac1a8ef..058090e1ad1f 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -10,8 +10,11 @@ xgboost( objective = NULL, nrounds = 100L, weights = NULL, - verbosity = 0L, + verbosity = if (is.null(eval_set)) 0L else 1L, monitor_training = verbosity > 0, + eval_set = NULL, + early_stopping_rounds = NULL, + print_every_n = 1L, nthreads = parallel::detectCores(), seed = 0L, monotone_constraints = NULL, @@ -66,7 +69,7 @@ set as the last level.} \item{objective}{Optimization objective to minimize based on the supplied data, to be passed by name as a string / character (e.g. 
\code{reg:absoluteerror}). See the \href{https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters}{Learning Task Parameters} -page for more detailed information on allowed values. +page and the \code{\link[=xgb.params]{xgb.params()}} documentation for more detailed information on allowed values. If \code{NULL} (the default), will be automatically determined from \code{y} according to the following logic: @@ -103,6 +106,40 @@ If not \code{NULL}, should be passed as a numeric vector with length matching to \item{monitor_training}{Whether to monitor objective optimization progress on the input data. Note that same 'x' and 'y' data are used for both model fitting and evaluation.} +\item{eval_set}{Subset of the data to use as evaluation set. Can be passed as: +\itemize{ +\item A vector of row indices (base-1 numeration) indicating the observations that are to be designated +as evaluation data. +\item A number between zero and one indicating a random fraction of the input data to use as +evaluation data. Note that the selection will be done uniformly at random, regardless of +argument \code{weights}. +} + +If passed, this subset of the data will be excluded from the training procedure, and the +evaluation metric(s) supplied under \code{eval_metric} will be calculated on this dataset after each +boosting iteration (pass \code{verbosity>0} to have these metrics printed during training). If +\code{eval_metric} is not passed, a default metric will be selected according to \code{objective}. + +If passing a fraction, in classification problems, the evaluation set will be chosen in such a +way that at least one observation of each class will be kept in the training data. + +For more elaborate evaluation variants (e.g. custom metrics, multiple evaluation sets, etc.), +one might want to use \code{\link[=xgb.train]{xgb.train()}} instead.} + +\item{early_stopping_rounds}{Number of boosting rounds after which training will be stopped +if there is no improvement in performance (as measured by the last metric passed under +\code{eval_metric}, or by the default metric for the objective if \code{eval_metric} is not passed) on the +evaluation data from \code{eval_set}. Must pass \code{eval_set} in order to use this functionality. + +If \code{NULL}, early stopping will not be used.} + +\item{print_every_n}{When passing \code{verbosity>0} and either \code{monitor_training=TRUE} or \code{eval_set}, +evaluation logs (metrics calculated on the training and/or evaluation data) will be printed every +nth iteration according to the value passed here. The first and last iteration are always +included regardless of this 'n'. + +Only has an effect when passing \code{verbosity>0}.} + \item{nthreads}{Number of parallel threads to use. If passing zero, will use all CPU threads.} \item{seed}{Seed to use for random number generation.
If passing \code{NULL}, will draw a random diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 0e7234a18708..adb9649bf33d 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -687,6 +687,7 @@ XGB_DLL SEXP XGProxyDMatrixSetDataDense_R(SEXP handle, SEXP R_mat) { { std::string array_str = MakeArrayInterfaceFromRMat(R_mat); res_code = XGProxyDMatrixSetDataDense(proxy_dmat, array_str.c_str()); + R_SetExternalPtrProtected(handle, R_mat); } CHECK_CALL(res_code); R_API_END(); @@ -707,6 +708,7 @@ XGB_DLL SEXP XGProxyDMatrixSetDataCSR_R(SEXP handle, SEXP lst) { array_str_indices.c_str(), array_str_data.c_str(), ncol); + R_SetExternalPtrProtected(handle, lst); } CHECK_CALL(res_code); R_API_END(); @@ -720,6 +722,7 @@ XGB_DLL SEXP XGProxyDMatrixSetDataColumnar_R(SEXP handle, SEXP lst) { { std::string sinterface = MakeArrayInterfaceFromRDataFrame(lst); res_code = XGProxyDMatrixSetDataColumnar(proxy_dmat, sinterface.c_str()); + R_SetExternalPtrProtected(handle, lst); } CHECK_CALL(res_code); R_API_END(); @@ -733,17 +736,20 @@ struct _RDataIterator { SEXP f_reset; SEXP calling_env; SEXP continuation_token; + SEXP proxy_dmat; _RDataIterator( - SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP continuation_token) : + SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP continuation_token, SEXP proxy_dmat) : f_next(f_next), f_reset(f_reset), calling_env(calling_env), - continuation_token(continuation_token) {} + continuation_token(continuation_token), proxy_dmat(proxy_dmat) {} void reset() { + R_SetExternalPtrProtected(this->proxy_dmat, R_NilValue); SafeExecFun(this->f_reset, this->calling_env, this->continuation_token); } int next() { + R_SetExternalPtrProtected(this->proxy_dmat, R_NilValue); SEXP R_res = Rf_protect( SafeExecFun(this->f_next, this->calling_env, this->continuation_token)); int res = Rf_asInteger(R_res); @@ -771,7 +777,7 @@ SEXP XGDMatrixCreateFromCallbackGeneric_R( int res_code; try { - _RDataIterator data_iterator(f_next, f_reset, calling_env, continuation_token); + _RDataIterator data_iterator(f_next, f_reset, calling_env, continuation_token, proxy_dmat); std::string str_cache_prefix; xgboost::Json jconfig{xgboost::Object{}}; diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 03a346d02076..fdec78e88083 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -17,9 +17,15 @@ test_that("train and predict binary classification", { nrounds <- 2 expect_output( bst <- xgb.train( - data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, - eta = 1, nthread = n_threads, nrounds = nrounds, - objective = "binary:logistic", eval_metric = "error", + data = xgb.DMatrix(train$data, label = train$label), + nrounds = nrounds, + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = n_threads, + objective = "binary:logistic", + eval_metric = "error" + ), evals = list(train = xgb.DMatrix(train$data, label = train$label)) ), "train-error" @@ -104,14 +110,16 @@ test_that("dart prediction works", { set.seed(1994) booster_by_xgboost <- xgb.train( data = xgb.DMatrix(d, label = y), - max_depth = 2, - booster = "dart", - rate_drop = 0.5, - one_drop = TRUE, - eta = 1, - nthread = n_threads, nrounds = nrounds, - objective = "reg:squarederror" + params = xgb.params( + max_depth = 2, + booster = "dart", + rate_drop = 0.5, + one_drop = TRUE, + eta = 1, + nthread = n_threads, + objective = "reg:squarederror" + ) ) pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, 
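For reference, a self-contained sketch of the xgb.params() + xgb.train() calling pattern that the updated tests use throughout, assuming the agaricus data bundled with the package:

library(xgboost)
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)

params <- xgb.params(
  objective = "binary:logistic",
  eval_metric = "error",
  max_depth = 2,
  eta = 1,
  nthread = 1
)

bst <- xgb.train(params, dtrain, nrounds = 2, evals = list(train = dtrain), verbose = 0)
head(predict(bst, agaricus.test$data))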
iterationrange = NULL) pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, iterationrange = c(1, nrounds)) @@ -123,7 +131,7 @@ test_that("dart prediction works", { set.seed(1994) dtrain <- xgb.DMatrix(data = d, label = y, nthread = n_threads) booster_by_train <- xgb.train( - params = list( + params = xgb.params( booster = "dart", max_depth = 2, eta = 1, @@ -150,8 +158,11 @@ test_that("train and predict softprob", { expect_output( bst <- xgb.train( data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), - max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5, - objective = "multi:softprob", num_class = 3, eval_metric = "merror", + nrounds = 5, + params = xgb.params( + max_depth = 3, eta = 0.5, nthread = n_threads, + objective = "multi:softprob", num_class = 3, eval_metric = "merror" + ), evals = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb)) ), "train-merror" @@ -186,9 +197,14 @@ test_that("train and predict softprob", { y <- sample.int(10, 100, replace = TRUE) - 1 dtrain <- xgb.DMatrix(data = d, label = y, nthread = n_threads) booster <- xgb.train( - params = list(tree_method = "hist", nthread = n_threads), - data = dtrain, nrounds = 4, num_class = 10, - objective = "multi:softprob" + params = xgb.params( + objective = "multi:softprob", + num_class = 10, + tree_method = "hist", + nthread = n_threads + ), + data = dtrain, + nrounds = 4 ) predt <- predict(booster, as.matrix(d), strict_shape = FALSE) expect_equal(ncol(predt), 10) @@ -201,8 +217,11 @@ test_that("train and predict softmax", { expect_output( bst <- xgb.train( data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), - max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5, - objective = "multi:softmax", num_class = 3, eval_metric = "merror", + nrounds = 5, + params = xgb.params( + max_depth = 3, eta = 0.5, nthread = n_threads, + objective = "multi:softmax", num_class = 3, eval_metric = "merror" + ), evals = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb)) ), "train-merror" @@ -222,11 +241,16 @@ test_that("train and predict RF", { lb <- train$label # single iteration bst <- xgb.train( - data = xgb.DMatrix(train$data, label = lb), max_depth = 5, - nthread = n_threads, - nrounds = 1, objective = "binary:logistic", eval_metric = "error", - num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1, - evals = list(train = xgb.DMatrix(train$data, label = lb)) + data = xgb.DMatrix(train$data, label = lb), + nrounds = 1, + params = xgb.params( + max_depth = 5, + nthread = n_threads, + objective = "binary:logistic", eval_metric = "error", + num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1 + ), + evals = list(train = xgb.DMatrix(train$data, label = lb)), + verbose = 0 ) expect_equal(xgb.get.num.boosted.rounds(bst), 1) @@ -246,10 +270,14 @@ test_that("train and predict RF with softprob", { set.seed(11) bst <- xgb.train( data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), - max_depth = 3, eta = 0.9, nthread = n_threads, nrounds = nrounds, - objective = "multi:softprob", eval_metric = "merror", - num_class = 3, verbose = 0, - num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5, + nrounds = nrounds, + verbose = 0, + params = xgb.params( + max_depth = 3, eta = 0.9, nthread = n_threads, + objective = "multi:softprob", eval_metric = "merror", + num_class = 3, + num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5 + ), evals = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb)) ) expect_equal(xgb.get.num.boosted.rounds(bst), 15) @@ -268,9 +296,13 @@ 
test_that("train and predict RF with softprob", { test_that("use of multiple eval metrics works", { expect_output( bst <- xgb.train( - data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, - eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic", - eval_metric = "error", eval_metric = "auc", eval_metric = "logloss", + data = xgb.DMatrix(train$data, label = train$label), + nrounds = 2, + params = list( + max_depth = 2, + eta = 1, nthread = n_threads, objective = "binary:logistic", + eval_metric = "error", eval_metric = "auc", eval_metric = "logloss" + ), evals = list(train = xgb.DMatrix(train$data, label = train$label)) ), "train-error.*train-auc.*train-logloss" @@ -280,9 +312,13 @@ test_that("use of multiple eval metrics works", { expect_equal(colnames(attributes(bst)$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss")) expect_output( bst2 <- xgb.train( - data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, - eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic", - eval_metric = list("error", "auc", "logloss"), + data = xgb.DMatrix(train$data, label = train$label), + nrounds = 2, + params = xgb.params( + max_depth = 2, + eta = 1, nthread = n_threads, objective = "binary:logistic", + eval_metric = list("error", "auc", "logloss") + ), evals = list(train = xgb.DMatrix(train$data, label = train$label)) ), "train-error.*train-auc.*train-logloss" @@ -296,18 +332,18 @@ test_that("use of multiple eval metrics works", { test_that("training continuation works", { dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads) evals <- list(train = dtrain) - param <- list( + params <- xgb.params( objective = "binary:logistic", max_depth = 2, eta = 1, nthread = n_threads ) # for the reference, use 4 iterations at once: set.seed(11) - bst <- xgb.train(param, dtrain, nrounds = 4, evals = evals, verbose = 0) + bst <- xgb.train(params, dtrain, nrounds = 4, evals = evals, verbose = 0) # first two iterations: set.seed(11) - bst1 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0) + bst1 <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0) # continue for two more: - bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = bst1) + bst2 <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = bst1) if (!windows_flag && !solaris_flag) { expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2)) } @@ -315,7 +351,7 @@ test_that("training continuation works", { expect_equal(dim(attributes(bst2)$evaluation_log), c(4, 2)) expect_equal(attributes(bst2)$evaluation_log, attributes(bst)$evaluation_log) # test continuing from raw model data - bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = xgb.save.raw(bst1)) + bst2 <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = xgb.save.raw(bst1)) if (!windows_flag && !solaris_flag) { expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2)) } @@ -323,7 +359,7 @@ test_that("training continuation works", { # test continuing from a model in file fname <- file.path(tempdir(), "xgboost.json") xgb.save(bst1, fname) - bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = fname) + bst2 <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = fname) if (!windows_flag && !solaris_flag) { expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2)) } @@ -334,9 +370,15 @@ test_that("xgb.cv works", { 
set.seed(11) expect_output( cv <- xgb.cv( - data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, nfold = 5, - eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", - eval_metric = "error", verbose = TRUE + data = xgb.DMatrix(train$data, label = train$label), + nfold = 5, + nrounds = 2, + params = xgb.params( + max_depth = 2, + eta = 1., nthread = n_threads, objective = "binary:logistic", + eval_metric = "error" + ), + verbose = TRUE ), "train-error:" ) @@ -355,14 +397,24 @@ test_that("xgb.cv works with stratified folds", { dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads) set.seed(314159) cv <- xgb.cv( - data = dtrain, max_depth = 2, nfold = 5, - eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", + data = dtrain, + nrounds = 2, + nfold = 5, + params = xgb.params( + max_depth = 2, + eta = 1., nthread = n_threads, objective = "binary:logistic" + ), verbose = FALSE, stratified = FALSE ) set.seed(314159) cv2 <- xgb.cv( - data = dtrain, max_depth = 2, nfold = 5, - eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", + data = dtrain, + nfold = 5, + nrounds = 2, + params = xgb.params( + max_depth = 2, + eta = 1., nthread = n_threads, objective = "binary:logistic" + ), verbose = FALSE, stratified = TRUE ) # Stratified folds should result in a different evaluation logs @@ -373,8 +425,12 @@ test_that("train and predict with non-strict classes", { # standard dense matrix input train_dense <- as.matrix(train$data) bst <- xgb.train( - data = xgb.DMatrix(train_dense, label = train$label), max_depth = 2, - eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic", + data = xgb.DMatrix(train_dense, label = train$label), + nrounds = 2, + params = xgb.params( + max_depth = 2, + eta = 1, nthread = n_threads, objective = "binary:logistic" + ), verbose = 0 ) pr0 <- predict(bst, train_dense) @@ -384,8 +440,12 @@ test_that("train and predict with non-strict classes", { expect_true(is.matrix(train_dense)) expect_error( bst <- xgb.train( - data = xgb.DMatrix(train_dense, label = train$label), max_depth = 2, - eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic", + data = xgb.DMatrix(train_dense, label = train$label), + nrounds = 2, + params = xgb.params( + max_depth = 2, + eta = 1, nthread = n_threads, objective = "binary:logistic" + ), verbose = 0 ), regexp = NA @@ -398,8 +458,12 @@ test_that("train and predict with non-strict classes", { expect_true(is.matrix(train_dense)) expect_error( bst <- xgb.train( - data = xgb.DMatrix(train_dense, label = train$label), max_depth = 2, - eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic", + data = xgb.DMatrix(train_dense, label = train$label), + nrounds = 2, + params = xgb.params( + max_depth = 2, + eta = 1, nthread = n_threads, objective = "binary:logistic" + ), verbose = 0 ), regexp = NA @@ -418,16 +482,16 @@ test_that("max_delta_step works", { agaricus.train$data, label = agaricus.train$label, nthread = n_threads ) evals <- list(train = dtrain) - param <- list( + params <- xgb.params( objective = "binary:logistic", eval_metric = "logloss", max_depth = 2, nthread = n_threads, eta = 0.5 ) nrounds <- 5 # model with no restriction on max_delta_step - bst1 <- xgb.train(param, dtrain, nrounds, evals = evals, verbose = 1) + bst1 <- xgb.train(params, dtrain, nrounds, evals = evals, verbose = 0) # model with restricted max_delta_step - bst2 <- xgb.train(param, dtrain, nrounds, evals = evals, verbose = 1, 
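A minimal xgb.cv() sketch in the same style, again assuming the agaricus data; `prediction = TRUE` keeps one out-of-fold prediction per training row:

library(xgboost)
data(agaricus.train, package = "xgboost")

dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)

cv <- xgb.cv(
  data = dtrain,
  nfold = 5,
  nrounds = 2,
  params = xgb.params(
    objective = "binary:logistic",
    eval_metric = "error",
    max_depth = 2,
    eta = 1,
    nthread = 1
  ),
  verbose = FALSE,
  prediction = TRUE
)

cv$evaluation_log           # mean/std of train and test error per round
length(cv$cv_predict$pred)  # one out-of-fold prediction per training row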
max_delta_step = 1) + bst2 <- xgb.train(c(params, list(max_delta_step = 1)), dtrain, nrounds, evals = evals, verbose = 0) # the no-restriction model is expected to have consistently lower loss during the initial iterations expect_true(all(attributes(bst1)$evaluation_log$train_logloss < attributes(bst2)$evaluation_log$train_logloss)) expect_lt(mean(attributes(bst1)$evaluation_log$train_logloss) / mean(attributes(bst2)$evaluation_log$train_logloss), 0.8) @@ -447,13 +511,13 @@ test_that("colsample_bytree works", { evals <- list(train = dtrain, eval = dtest) ## Use colsample_bytree = 0.01, so that roughly one out of 100 features is chosen for ## each tree - param <- list( + params <- xgb.params( max_depth = 2, eta = 0, nthread = n_threads, colsample_bytree = 0.01, objective = "binary:logistic", eval_metric = "auc" ) set.seed(2) - bst <- xgb.train(param, dtrain, nrounds = 100, evals = evals, verbose = 0) + bst <- xgb.train(params, dtrain, nrounds = 100, evals = evals, verbose = 0) xgb.importance(model = bst) # If colsample_bytree works properly, a variety of features should be used # in the 100 trees @@ -462,9 +526,12 @@ test_that("colsample_bytree works", { test_that("Configuration works", { bst <- xgb.train( - data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, - eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic", - eval_metric = "error", eval_metric = "auc", eval_metric = "logloss" + data = xgb.DMatrix(train$data, label = train$label), + nrounds = 2, + params = xgb.params( + max_depth = 2, + eta = 1, nthread = n_threads, objective = "binary:logistic" + ) ) config <- xgb.config(bst) xgb.config(bst) <- config @@ -514,8 +581,11 @@ test_that("strict_shape works", { bst <- xgb.train( data = xgb.DMatrix(X, label = y), - max_depth = 2, nrounds = n_rounds, nthread = n_threads, - objective = "multi:softprob", num_class = 3, eval_metric = "merror" + nrounds = n_rounds, + params = xgb.params( + max_depth = 2, nthread = n_threads, + objective = "multi:softprob", num_class = 3 + ) ) test_strict_shape(bst, X, 3) @@ -528,9 +598,12 @@ test_that("strict_shape works", { y <- agaricus.train$label bst <- xgb.train( - data = xgb.DMatrix(X, label = y), max_depth = 2, nthread = n_threads, - nrounds = n_rounds, objective = "binary:logistic", - eval_metric = "error", eval_metric = "auc", eval_metric = "logloss" + data = xgb.DMatrix(X, label = y), + nrounds = n_rounds, + params = xgb.params( + max_depth = 2, nthread = n_threads, + objective = "binary:logistic" + ) ) test_strict_shape(bst, X, 1) @@ -547,8 +620,12 @@ test_that("'predict' accepts CSR data", { x_csr <- as(x_csc, "RsparseMatrix") x_spv <- as(x_csc, "sparseVector") bst <- xgb.train( - data = xgb.DMatrix(X, label = y), objective = "binary:logistic", - nrounds = 5L, verbose = FALSE, nthread = n_threads, + data = xgb.DMatrix(X, label = y), + nrounds = 5L, verbose = FALSE, + params = xgb.params( + objective = "binary:logistic", + nthread = n_threads + ) ) p_csc <- predict(bst, x_csc) p_csr <- predict(bst, x_csr) @@ -564,7 +641,7 @@ test_that("Quantile regression accepts multiple quantiles", { dm <- xgb.DMatrix(data = x, label = y) model <- xgb.train( data = dm, - params = list( + params = xgb.params( objective = "reg:quantileerror", tree_method = "exact", quantile_alpha = c(0.05, 0.5, 0.95), @@ -591,7 +668,7 @@ test_that("Can use multi-output labels with built-in objectives", { y_mirrored <- cbind(y, -y) dm <- xgb.DMatrix(x, label = y_mirrored, nthread = n_threads) model <- xgb.train( - params = list( + params = 
xgb.params( tree_method = "hist", multi_strategy = "multi_output_tree", objective = "reg:squarederror", @@ -613,7 +690,7 @@ test_that("Can use multi-output labels with custom objectives", { y_mirrored <- cbind(y, -y) dm <- xgb.DMatrix(x, label = y_mirrored, nthread = n_threads) model <- xgb.train( - params = list( + params = xgb.params( tree_method = "hist", multi_strategy = "multi_output_tree", base_score = 0, @@ -645,11 +722,13 @@ test_that("Can use ranking objectives with either 'qid' or 'group'", { dmat_qid <- xgb.DMatrix(x, label = y, qid = qid) dmat_gr <- xgb.DMatrix(x, label = y, group = gr) - params <- list(tree_method = "hist", - lambdarank_num_pair_per_sample = 8, - objective = "rank:ndcg", - lambdarank_pair_method = "topk", - nthread = n_threads) + params <- xgb.params( + tree_method = "hist", + lambdarank_num_pair_per_sample = 8, + objective = "rank:ndcg", + lambdarank_pair_method = "topk", + nthread = n_threads + ) set.seed(123) model_qid <- xgb.train(params, dmat_qid, nrounds = 5) set.seed(123) @@ -667,7 +746,7 @@ test_that("Can predict on data.frame objects", { x_mat <- as.matrix(x_df) dm <- xgb.DMatrix(x_mat, label = y, nthread = n_threads) model <- xgb.train( - params = list( + params = xgb.params( tree_method = "hist", objective = "reg:squarederror", nthread = n_threads @@ -687,7 +766,7 @@ test_that("'base_margin' gives the same result in DMatrix as in inplace_predict" x <- as.matrix(mtcars[, -1]) dm <- xgb.DMatrix(x, label = y, nthread = n_threads) model <- xgb.train( - params = list( + params = xgb.params( tree_method = "hist", objective = "reg:squarederror", nthread = n_threads @@ -714,7 +793,7 @@ test_that("Coefficients from gblinear have the expected shape and names", { dm <- xgb.DMatrix(x, label = y, nthread = 1) model <- xgb.train( data = dm, - params = list( + params = xgb.params( booster = "gblinear", nthread = 1 ), @@ -735,7 +814,7 @@ test_that("Coefficients from gblinear have the expected shape and names", { mm <- model.matrix(~., data = iris[, -5]) model <- xgb.train( data = dm, - params = list( + params = xgb.params( booster = "gblinear", objective = "multi:softprob", num_class = 3, @@ -772,7 +851,7 @@ test_that("Deep copies work as expected", { dm <- xgb.DMatrix(x, label = y, nthread = 1) model <- xgb.train( data = dm, - params = list(nthread = 1), + params = xgb.params(nthread = 1), nrounds = 3 ) @@ -795,7 +874,7 @@ test_that("Pointer comparison works as expected", { y <- mtcars$mpg x <- as.matrix(mtcars[, -1]) model <- xgb.train( - params = list(nthread = 1), + params = xgb.params(nthread = 1), data = xgb.DMatrix(x, label = y, nthread = 1), nrounds = 3 ) @@ -824,7 +903,7 @@ test_that("DMatrix field are set to booster when training", { dm_both <- xgb.DMatrix(x, label = y, feature_names = c("a", "b", "c"), nthread = 1) setinfo(dm_both, "feature_type", c("q", "c", "q")) - params <- list(nthread = 1) + params <- xgb.params(nthread = 1) model_unnamed <- xgb.train(data = dm_unnamed, params = params, nrounds = 3) model_feature_names <- xgb.train(data = dm_feature_names, params = params, nrounds = 3) model_feature_types <- xgb.train(data = dm_feature_types, params = params, nrounds = 3) @@ -853,7 +932,7 @@ test_that("Seed in params override PRNG from R", { agaricus.train$data, label = agaricus.train$label, nthread = 1L ), - params = list( + params = xgb.params( objective = "binary:logistic", max_depth = 3L, subsample = 0.1, @@ -869,7 +948,7 @@ test_that("Seed in params override PRNG from R", { agaricus.train$data, label = agaricus.train$label, nthread = 1L ), - 
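The equivalence of the `qid` and `group` ways of describing query structure can be sketched with a small synthetic ranking problem; the data below is made up purely for illustration:

library(xgboost)
set.seed(123)

x   <- matrix(rnorm(60), nrow = 20)
y   <- sample(0:3, 20, replace = TRUE)  # graded relevance labels
qid <- rep(1:4, each = 5)               # query id per row (rows already grouped by query)
gr  <- rep(5L, 4)                       # the same structure expressed as group sizes

dmat_qid <- xgb.DMatrix(x, label = y, qid = qid, nthread = 1)
dmat_gr  <- xgb.DMatrix(x, label = y, group = gr, nthread = 1)

params <- xgb.params(
  objective = "rank:ndcg",
  lambdarank_pair_method = "topk",
  lambdarank_num_pair_per_sample = 8,
  tree_method = "hist",
  nthread = 1
)

model_qid <- xgb.train(params, dmat_qid, nrounds = 5)
model_gr  <- xgb.train(params, dmat_gr,  nrounds = 5)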
params = list( + params = xgb.params( objective = "binary:logistic", max_depth = 3L, subsample = 0.1, @@ -890,7 +969,7 @@ test_that("Seed in params override PRNG from R", { agaricus.train$data, label = agaricus.train$label, nthread = 1L ), - params = list( + params = xgb.params( objective = "binary:logistic", max_depth = 3L, subsample = 0.1, @@ -913,7 +992,7 @@ test_that("xgb.cv works for AFT", { X <- matrix(c(1, -1, -1, 1, 0, 1, 1, 0), nrow = 4, byrow = TRUE) # 4x2 matrix dtrain <- xgb.DMatrix(X, nthread = n_threads) - params <- list(objective = 'survival:aft', learning_rate = 0.2, max_depth = 2L) + params <- xgb.params(objective = 'survival:aft', learning_rate = 0.2, max_depth = 2L, nthread = n_threads) # data must have bounds expect_error( @@ -921,8 +1000,7 @@ test_that("xgb.cv works for AFT", { params = params, data = dtrain, nround = 5L, - nfold = 4L, - nthread = n_threads + nfold = 4L ) ) @@ -933,7 +1011,7 @@ test_that("xgb.cv works for AFT", { expect_warning( xgb.cv( params = params, data = dtrain, nround = 5L, nfold = 4L, - nthread = n_threads, stratified = TRUE, verbose = FALSE + stratified = TRUE, verbose = FALSE ) ) @@ -951,9 +1029,10 @@ test_that("xgb.cv works for ranking", { dm <- xgb.DMatrix(x, label = y, group = group) res <- xgb.cv( data = dm, - params = list( + params = xgb.params( objective = "rank:pairwise", - max_depth = 3 + max_depth = 3, + nthread = 1L ), nrounds = 3, nfold = 2, @@ -970,7 +1049,7 @@ test_that("Row names are preserved in outputs", { dm <- xgb.DMatrix(x, label = y, nthread = 1) model <- xgb.train( data = dm, - params = list( + params = xgb.params( objective = "multi:softprob", num_class = 3, max_depth = 2, @@ -990,7 +1069,7 @@ test_that("Row names are preserved in outputs", { dm <- xgb.DMatrix(data = x, label = y) model <- xgb.train( data = dm, - params = list( + params = xgb.params( max_depth = 2, nthread = 1 ), diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R index bf95a170dcfc..7eef3cb46aa9 100644 --- a/R-package/tests/testthat/test_callbacks.R +++ b/R-package/tests/testthat/test_callbacks.R @@ -24,15 +24,17 @@ evals <- list(train = dtrain, test = dtest) err <- function(label, pr) sum((pr > 0.5) != label) / length(label) -param <- list(objective = "binary:logistic", eval_metric = "error", - max_depth = 2, nthread = n_threads) +params <- xgb.params( + objective = "binary:logistic", eval_metric = "error", + max_depth = 2, nthread = n_threads +) test_that("xgb.cb.print.evaluation works as expected for xgb.train", { logs1 <- capture.output({ model <- xgb.train( data = dtrain, - params = list( + params = xgb.params( objective = "binary:logistic", eval_metric = "auc", max_depth = 2, @@ -50,7 +52,7 @@ test_that("xgb.cb.print.evaluation works as expected for xgb.train", { logs2 <- capture.output({ model <- xgb.train( data = dtrain, - params = list( + params = xgb.params( objective = "binary:logistic", eval_metric = "auc", max_depth = 2, @@ -71,7 +73,7 @@ test_that("xgb.cb.print.evaluation works as expected for xgb.cv", { logs1 <- capture.output({ model <- xgb.cv( data = dtrain, - params = list( + params = xgb.params( objective = "binary:logistic", eval_metric = "auc", max_depth = 2, @@ -89,7 +91,7 @@ test_that("xgb.cb.print.evaluation works as expected for xgb.cv", { logs2 <- capture.output({ model <- xgb.cv( data = dtrain, - params = list( + params = xgb.params( objective = "binary:logistic", eval_metric = "auc", max_depth = 2, @@ -109,7 +111,7 @@ test_that("xgb.cb.print.evaluation works as expected for 
xgb.cv", { test_that("xgb.cb.evaluation.log works as expected for xgb.train", { model <- xgb.train( data = dtrain, - params = list( + params = xgb.params( objective = "binary:logistic", eval_metric = "auc", max_depth = 2, @@ -129,7 +131,7 @@ test_that("xgb.cb.evaluation.log works as expected for xgb.train", { test_that("xgb.cb.evaluation.log works as expected for xgb.cv", { model <- xgb.cv( data = dtrain, - params = list( + params = xgb.params( objective = "binary:logistic", eval_metric = "auc", max_depth = 2, @@ -150,12 +152,14 @@ test_that("xgb.cb.evaluation.log works as expected for xgb.cv", { }) -param <- list(objective = "binary:logistic", eval_metric = "error", - max_depth = 4, nthread = n_threads) +params <- xgb.params( + objective = "binary:logistic", eval_metric = "error", + max_depth = 4, nthread = n_threads +) test_that("can store evaluation_log without printing", { expect_silent( - bst <- xgb.train(param, dtrain, nrounds = 10, evals = evals, eta = 1, verbose = 0) + bst <- xgb.train(params, dtrain, nrounds = 10, evals = evals, verbose = 0) ) expect_false(is.null(attributes(bst)$evaluation_log)) expect_false(is.null(attributes(bst)$evaluation_log$train_error)) @@ -165,15 +169,16 @@ test_that("can store evaluation_log without printing", { test_that("xgb.cb.reset.parameters works as expected", { # fixed eta + params <- c(params, list(eta = 0.9)) set.seed(111) - bst0 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, eta = 0.9, verbose = 0) + bst0 <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0) expect_false(is.null(attributes(bst0)$evaluation_log)) expect_false(is.null(attributes(bst0)$evaluation_log$train_error)) # same eta but re-set as a vector parameter in the callback set.seed(111) my_par <- list(eta = c(0.9, 0.9)) - bst1 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + bst1 <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0, callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bst1)$evaluation_log$train_error)) expect_equal(attributes(bst0)$evaluation_log$train_error, @@ -182,7 +187,7 @@ test_that("xgb.cb.reset.parameters works as expected", { # same eta but re-set via a function in the callback set.seed(111) my_par <- list(eta = function(itr, itr_end) 0.9) - bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + bst2 <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0, callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bst2)$evaluation_log$train_error)) expect_equal(attributes(bst0)$evaluation_log$train_error, @@ -191,7 +196,7 @@ test_that("xgb.cb.reset.parameters works as expected", { # different eta re-set as a vector parameter in the callback set.seed(111) my_par <- list(eta = c(0.6, 0.5)) - bst3 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + bst3 <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0, callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bst3)$evaluation_log$train_error)) expect_false(all(attributes(bst0)$evaluation_log$train_error == attributes(bst3)$evaluation_log$train_error)) @@ -199,18 +204,18 @@ test_that("xgb.cb.reset.parameters works as expected", { # resetting multiple parameters at the same time runs with no error my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8)) expect_error( - bst4 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + bst4 <- xgb.train(params, dtrain, nrounds 
= 2, evals = evals, verbose = 0, callbacks = list(xgb.cb.reset.parameters(my_par))) , NA) # NA = no error # CV works as well expect_error( - bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2, verbose = 0, + bst4 <- xgb.cv(params, dtrain, nfold = 2, nrounds = 2, verbose = 0, callbacks = list(xgb.cb.reset.parameters(my_par))) , NA) # NA = no error # expect no learning with 0 learning rate my_par <- list(eta = c(0., 0.)) - bstX <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + bstX <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0, callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bstX)$evaluation_log$train_error)) er <- unique(attributes(bstX)$evaluation_log$train_error) @@ -223,15 +228,15 @@ test_that("xgb.cb.save.model works as expected", { files <- unname(sapply(files, function(f) file.path(tempdir(), f))) for (f in files) if (file.exists(f)) file.remove(f) - bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, eta = 1, verbose = 0, + bst <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0, save_period = 1, save_name = file.path(tempdir(), "xgboost_%02d.json")) expect_true(file.exists(files[1])) expect_true(file.exists(files[2])) b1 <- xgb.load(files[1]) - xgb.parameters(b1) <- list(nthread = 2) + xgb.model.parameters(b1) <- list(nthread = 2) expect_equal(xgb.get.num.boosted.rounds(b1), 1) b2 <- xgb.load(files[2]) - xgb.parameters(b2) <- list(nthread = 2) + xgb.model.parameters(b2) <- list(nthread = 2) expect_equal(xgb.get.num.boosted.rounds(b2), 2) xgb.config(b2) <- xgb.config(bst) @@ -239,7 +244,7 @@ test_that("xgb.cb.save.model works as expected", { expect_equal(xgb.save.raw(bst), xgb.save.raw(b2)) # save_period = 0 saves the last iteration's model - bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, eta = 1, verbose = 0, + bst <- xgb.train(params, dtrain, nrounds = 2, evals = evals, verbose = 0, save_period = 0, save_name = file.path(tempdir(), 'xgboost.json')) expect_true(file.exists(files[3])) b2 <- xgb.load(files[3]) @@ -250,9 +255,10 @@ test_that("xgb.cb.save.model works as expected", { }) test_that("early stopping xgb.train works", { + params <- c(params, list(eta = 0.3)) set.seed(11) expect_output( - bst <- xgb.train(param, dtrain, nrounds = 20, evals = evals, eta = 0.3, + bst <- xgb.train(params, dtrain, nrounds = 20, evals = evals, early_stopping_rounds = 3, maximize = FALSE) , "Stopping. 
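The save-model callback and the renamed `xgb.model.parameters<-` setter can be sketched as follows; the exact file name produced by the "%02d" pattern (here assumed to be "xgboost_01.json" for the first round) is an assumption of this sketch:

library(xgboost)
data(agaricus.train, package = "xgboost")

dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 2)
params <- xgb.params(objective = "binary:logistic", max_depth = 2, nthread = 2)

bst <- xgb.train(
  params, dtrain, nrounds = 2, evals = list(train = dtrain), verbose = 0,
  save_period = 1, save_name = file.path(tempdir(), "xgboost_%02d.json")
)

b1 <- xgb.load(file.path(tempdir(), "xgboost_01.json"))
xgb.model.parameters(b1) <- list(nthread = 2)  # runtime parameters are not stored with the model
xgb.get.num.boosted.rounds(b1)                 # 1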
Best iteration") expect_false(is.null(xgb.attr(bst, "best_iteration"))) @@ -266,12 +272,12 @@ test_that("early stopping xgb.train works", { set.seed(11) expect_silent( - bst0 <- xgb.train(param, dtrain, nrounds = 20, evals = evals, eta = 0.3, + bst0 <- xgb.train(params, dtrain, nrounds = 20, evals = evals, early_stopping_rounds = 3, maximize = FALSE, verbose = 0) ) expect_equal(attributes(bst)$evaluation_log, attributes(bst0)$evaluation_log) - fname <- file.path(tempdir(), "model.bin") + fname <- file.path(tempdir(), "model.ubj") xgb.save(bst, fname) loaded <- xgb.load(fname) @@ -282,10 +288,22 @@ test_that("early stopping xgb.train works", { test_that("early stopping using a specific metric works", { set.seed(11) expect_output( - bst <- xgb.train(param[-2], dtrain, nrounds = 20, evals = evals, eta = 0.6, - eval_metric = "logloss", eval_metric = "auc", - callbacks = list(xgb.cb.early.stop(stopping_rounds = 3, maximize = FALSE, - metric_name = 'test_logloss'))) + bst <- xgb.train( + c( + within(params, rm("eval_metric")), + list( + eta = 0.6, + eval_metric = "logloss", + eval_metric = "auc" + ) + ), + dtrain, + nrounds = 20, + evals = evals, + callbacks = list( + xgb.cb.early.stop(stopping_rounds = 3, maximize = FALSE, metric_name = 'test_logloss') + ) + ) , "Stopping. Best iteration") expect_false(is.null(xgb.attr(bst, "best_iteration"))) expect_lt(xgb.attr(bst, "best_iteration"), 19) @@ -308,13 +326,16 @@ test_that("early stopping works with titanic", { dtx <- model.matrix(~ 0 + ., data = titanic[, c("Pclass", "Sex")]) dty <- titanic$Survived - xgboost::xgb.train( + xgb.train( data = xgb.DMatrix(dtx, label = dty), - objective = "binary:logistic", - eval_metric = "auc", + params = xgb.params( + objective = "binary:logistic", + eval_metric = "auc", + nthread = n_threads + ), nrounds = 100, early_stopping_rounds = 3, - nthread = n_threads, + verbose = 0, evals = list(train = xgb.DMatrix(dtx, label = dty)) ) @@ -324,9 +345,18 @@ test_that("early stopping works with titanic", { test_that("early stopping xgb.cv works", { set.seed(11) expect_output( - cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.3, nrounds = 20, - early_stopping_rounds = 3, maximize = FALSE) - , "Stopping. Best iteration") + { + cv <- xgb.cv( + c(params, list(eta = 0.3)), + dtrain, + nfold = 5, + nrounds = 20, + early_stopping_rounds = 3, + maximize = FALSE + ) + }, + "Stopping. 
Best iteration" + ) expect_false(is.null(cv$early_stop$best_iteration)) expect_lt(cv$early_stop$best_iteration, 19) # the best error is min error: @@ -334,9 +364,10 @@ test_that("early stopping xgb.cv works", { }) test_that("prediction in xgb.cv works", { + params <- c(params, list(eta = 0.5)) set.seed(11) nrounds <- 4 - cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0) + cv <- xgb.cv(params, dtrain, nfold = 5, nrounds = nrounds, prediction = TRUE, verbose = 0) expect_false(is.null(cv$evaluation_log)) expect_false(is.null(cv$cv_predict$pred)) expect_length(cv$cv_predict$pred, nrow(train$data)) @@ -346,7 +377,7 @@ test_that("prediction in xgb.cv works", { # save CV models set.seed(11) - cvx <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0, + cvx <- xgb.cv(params, dtrain, nfold = 5, nrounds = nrounds, prediction = TRUE, verbose = 0, callbacks = list(xgb.cb.cv.predict(save_models = TRUE))) expect_equal(cv$evaluation_log, cvx$evaluation_log) expect_length(cvx$cv_predict$models, 5) @@ -355,19 +386,20 @@ test_that("prediction in xgb.cv works", { test_that("prediction in xgb.cv works for gblinear too", { set.seed(11) - p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = n_threads) - cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0) + p <- xgb.params(booster = 'gblinear', objective = "reg:logistic", eta = 0.5, nthread = n_threads) + cv <- xgb.cv(p, dtrain, nfold = 5, nrounds = 2, prediction = TRUE, verbose = 0) expect_false(is.null(cv$evaluation_log)) expect_false(is.null(cv$cv_predict$pred)) expect_length(cv$cv_predict$pred, nrow(train$data)) }) test_that("prediction in early-stopping xgb.cv works", { + params <- c(params, list(eta = 0.1, base_score = 0.5)) set.seed(11) expect_output( - cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.1, nrounds = 20, + cv <- xgb.cv(params, dtrain, nfold = 5, nrounds = 20, early_stopping_rounds = 5, maximize = FALSE, stratified = FALSE, - prediction = TRUE, base_score = 0.5, verbose = TRUE) + prediction = TRUE, verbose = TRUE) , "Stopping. 
Best iteration") expect_false(is.null(cv$early_stop$best_iteration)) @@ -387,11 +419,22 @@ test_that("prediction in xgb.cv for softprob works", { lb <- as.numeric(iris$Species) - 1 set.seed(11) expect_warning( - cv <- xgb.cv(data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), nfold = 4, - eta = 0.5, nrounds = 5, max_depth = 3, nthread = n_threads, - subsample = 0.8, gamma = 2, verbose = 0, - prediction = TRUE, objective = "multi:softprob", num_class = 3) - , NA) + { + cv <- xgb.cv( + data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), + nfold = 4, + nrounds = 5, + params = xgb.params( + objective = "multi:softprob", num_class = 3, + eta = 0.5, max_depth = 3, nthread = n_threads, + subsample = 0.8, gamma = 2 + ), + verbose = 0, + prediction = TRUE + ) + }, + NA + ) expect_false(is.null(cv$cv_predict$pred)) expect_equal(dim(cv$cv_predict$pred), c(nrow(iris), 3)) expect_lt(diff(range(rowSums(cv$cv_predict$pred))), 1e-6) @@ -404,7 +447,7 @@ test_that("prediction in xgb.cv works for multi-quantile", { dm <- xgb.DMatrix(x, label = y, nthread = 1) cv <- xgb.cv( data = dm, - params = list( + params = xgb.params( objective = "reg:quantileerror", quantile_alpha = c(0.1, 0.2, 0.5, 0.8, 0.9), nthread = 1 @@ -424,7 +467,7 @@ test_that("prediction in xgb.cv works for multi-output", { dm <- xgb.DMatrix(x, label = cbind(y, -y), nthread = 1) cv <- xgb.cv( data = dm, - params = list( + params = xgb.params( tree_method = "hist", multi_strategy = "multi_output_tree", objective = "reg:squarederror", @@ -445,7 +488,7 @@ test_that("prediction in xgb.cv works for multi-quantile", { dm <- xgb.DMatrix(x, label = y, nthread = 1) cv <- xgb.cv( data = dm, - params = list( + params = xgb.params( objective = "reg:quantileerror", quantile_alpha = c(0.1, 0.2, 0.5, 0.8, 0.9), nthread = 1 @@ -465,7 +508,7 @@ test_that("prediction in xgb.cv works for multi-output", { dm <- xgb.DMatrix(x, label = cbind(y, -y), nthread = 1) cv <- xgb.cv( data = dm, - params = list( + params = xgb.params( tree_method = "hist", multi_strategy = "multi_output_tree", objective = "reg:squarederror", diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index cf3a347d4d9d..1d08b8ebf280 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -33,7 +33,7 @@ param <- list(max_depth = 2, eta = 1, nthread = n_threads, num_round <- 2 test_that("custom objective works", { - bst <- xgb.train(param, dtrain, num_round, evals) + bst <- xgb.train(param, dtrain, num_round, evals, verbose = 0) expect_equal(class(bst), "xgb.Booster") expect_false(is.null(attributes(bst)$evaluation_log)) expect_false(is.null(attributes(bst)$evaluation_log$eval_error)) @@ -41,14 +41,14 @@ test_that("custom objective works", { }) test_that("custom objective in CV works", { - cv <- xgb.cv(param, dtrain, num_round, nfold = 10, verbose = FALSE) + cv <- xgb.cv(param, dtrain, num_round, nfold = 10, verbose = FALSE, stratified = FALSE) expect_false(is.null(cv$evaluation_log)) expect_equal(dim(cv$evaluation_log), c(2, 5)) expect_lt(cv$evaluation_log[num_round, test_error_mean], 0.03) }) test_that("custom objective with early stop works", { - bst <- xgb.train(param, dtrain, 10, evals) + bst <- xgb.train(param, dtrain, 10, evals, verbose = 0) expect_equal(class(bst), "xgb.Booster") train_log <- attributes(bst)$evaluation_log$train_error expect_true(all(diff(train_log) <= 0)) @@ -66,7 +66,7 @@ test_that("custom objective using DMatrix attr works", { return(list(grad 
= grad, hess = hess)) } param$objective <- logregobjattr - bst <- xgb.train(param, dtrain, num_round, evals) + bst <- xgb.train(param, dtrain, num_round, evals, verbose = 0) expect_equal(class(bst), "xgb.Booster") }) @@ -89,7 +89,9 @@ test_that("custom objective with multi-class shape", { } param$objective <- fake_softprob param$eval_metric <- fake_merror - bst <- xgb.train(param, dtrain, 1, num_class = n_classes) + expect_warning({ + bst <- xgb.train(c(param, list(num_class = n_classes)), dtrain, nrounds = 1) + }) }) softmax <- function(values) { @@ -168,13 +170,29 @@ test_that("custom metric with multi-target passes reshaped data to feval", { num_class = 3L, base_score = 0, disable_default_eval_metric = TRUE, + eval_metric = multinomial.ll, max_depth = 123, seed = 123 ), data = dtrain, nrounds = 2L, evals = list(Train = dtrain), - eval_metric = multinomial.ll, + verbose = 0 + ) + + model <- xgb.train( + params = list( + objective = "multi:softmax", + num_class = 3L, + base_score = 0, + disable_default_eval_metric = TRUE, + max_depth = 123, + seed = 123 + ), + data = dtrain, + nrounds = 2L, + evals = list(Train = dtrain), + custom_metric = multinomial.ll, verbose = 0 ) }) diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 6aa8cda4b666..ead67d86a258 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -41,13 +41,13 @@ test_that("xgb.DMatrix: basic construction", { params <- list(tree_method = "hist", nthread = n_threads) bst_fd <- xgb.train( - params, nrounds = 8, fd, evals = list(train = fd) + params, nrounds = 8, fd, evals = list(train = fd), verbose = 0 ) bst_dgr <- xgb.train( - params, nrounds = 8, fdgr, evals = list(train = fdgr) + params, nrounds = 8, fdgr, evals = list(train = fdgr), verbose = 0 ) bst_dgc <- xgb.train( - params, nrounds = 8, fdgc, evals = list(train = fdgc) + params, nrounds = 8, fdgc, evals = list(train = fdgc), verbose = 0 ) raw_fd <- xgb.save.raw(bst_fd, raw_format = "ubj") @@ -103,8 +103,10 @@ test_that("xgb.DMatrix: saving, loading", { on.exit(unlink(tmp_file)) expect_true(xgb.DMatrix.save(dtest1, tmp_file)) # read from a local file + xgb.set.config(verbosity = 2) expect_output(dtest3 <- xgb.DMatrix(tmp_file), "entries loaded from") - expect_output(dtest3 <- xgb.DMatrix(tmp_file, silent = TRUE), NA) + xgb.set.config(verbosity = 1) + expect_output(dtest3 <- xgb.DMatrix(tmp_file), NA) unlink(tmp_file) expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label')) @@ -128,6 +130,7 @@ test_that("xgb.DMatrix: saving, loading", { expect_equal(length(cnames), 126) tmp_file <- tempfile('xgb.DMatrix_') xgb.DMatrix.save(dtrain, tmp_file) + xgb.set.config(verbosity = 0) dtrain <- xgb.DMatrix(tmp_file) expect_equal(colnames(dtrain), cnames) diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R index b59de8b62f15..226439319dc8 100644 --- a/R-package/tests/testthat/test_glm.R +++ b/R-package/tests/testthat/test_glm.R @@ -21,12 +21,12 @@ test_that("gblinear works", { VERB <- 0 # chatterbox switch param$updater <- 'shotgun' - bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'shuffle') + bst <- xgb.train(c(param, list(feature_selector = 'shuffle')), dtrain, n, evals, verbose = VERB) ypred <- predict(bst, dtest) expect_equal(length(getinfo(dtest, 'label')), 1611) expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) - bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'cyclic', + bst 
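A sketch of the dedicated `objective` and `custom_metric` arguments of xgb.train(); `logregobj` and `evalerror` below are the usual logistic-loss gradient/hessian and error-rate helpers, written out here only for illustration:

library(xgboost)
data(agaricus.train, package = "xgboost")

dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)

logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))  # raw margins -> probabilities
  list(grad = preds - labels, hess = preds * (1 - preds))
}
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- sum(labels != (preds > 0)) / length(labels)
  list(metric = "error", value = err)
}

params <- xgb.params(max_depth = 2, eta = 1, nthread = 1)
bst <- xgb.train(
  params, dtrain, nrounds = 2,
  evals = list(train = dtrain),
  objective = logregobj,      # dedicated argument, replacing the old 'obj'
  custom_metric = evalerror,  # dedicated argument, replacing the old 'feval'
  verbose = 0
)
attributes(bst)$evaluation_log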
<- xgb.train(c(param, list(feature_selector = 'cyclic')), dtrain, n, evals, verbose = VERB, callbacks = list(xgb.cb.gblinear.history())) expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) h <- xgb.gblinear.history(bst) @@ -34,17 +34,17 @@ test_that("gblinear works", { expect_is(h, "matrix") param$updater <- 'coord_descent' - bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'cyclic') + bst <- xgb.train(c(param, list(feature_selector = 'cyclic')), dtrain, n, evals, verbose = VERB) expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) - bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'shuffle') + bst <- xgb.train(c(param, list(feature_selector = 'shuffle')), dtrain, n, evals, verbose = VERB) expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) - bst <- xgb.train(param, dtrain, 2, evals, verbose = VERB, feature_selector = 'greedy') + bst <- xgb.train(c(param, list(feature_selector = 'greedy')), dtrain, 2, evals, verbose = VERB) expect_lt(attributes(bst)$evaluation_log$eval_error[2], ERR_UL) - bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'thrifty', - top_k = 50, callbacks = list(xgb.cb.gblinear.history(sparse = TRUE))) + bst <- xgb.train(c(param, list(feature_selector = 'thrifty', top_k = 50)), dtrain, n, evals, verbose = VERB, + callbacks = list(xgb.cb.gblinear.history(sparse = TRUE))) expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) h <- xgb.gblinear.history(bst) expect_equal(dim(h), c(n, ncol(dtrain) + 1)) @@ -61,7 +61,7 @@ test_that("gblinear early stopping works", { agaricus.test$data, label = agaricus.test$label, nthread = n_threads ) - param <- list( + param <- xgb.params( objective = "binary:logistic", eval_metric = "error", booster = "gblinear", nthread = n_threads, eta = 0.8, alpha = 0.0001, lambda = 0.0001, updater = "coord_descent" @@ -70,14 +70,16 @@ test_that("gblinear early stopping works", { es_round <- 1 n <- 10 booster <- xgb.train( - param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round + param, dtrain, nrounds = n, evals = list(eval = dtest, train = dtrain), + early_stopping_rounds = es_round, verbose = 0 ) expect_equal(xgb.attr(booster, "best_iteration"), 4) predt_es <- predict(booster, dtrain) n <- xgb.attr(booster, "best_iteration") + es_round + 1 booster <- xgb.train( - param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round + param, dtrain, nrounds = n, evals = list(eval = dtest, train = dtrain), + early_stopping_rounds = es_round, verbose = 0 ) predt <- predict(booster, dtrain) expect_equal(predt_es, predt) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index bfffe9e7878c..b03282cfd6d0 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -25,15 +25,26 @@ if (isTRUE(VCD_AVAILABLE)) { label <- df[, ifelse(Improved == "Marked", 1, 0)] # binary - bst.Tree <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = label), max_depth = 9, - eta = 1, nthread = 2, nrounds = nrounds, verbose = 0, - objective = "binary:logistic", booster = "gbtree", - base_score = 0.5) - - bst.GLM <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = label), - eta = 1, nthread = 1, nrounds = nrounds, verbose = 0, - objective = "binary:logistic", booster = "gblinear", - base_score = 0.5) + bst.Tree <- xgb.train( + data = xgb.DMatrix(sparse_matrix, label = label), + nrounds = nrounds, verbose = 0, + params = 
xgb.params( + max_depth = 9, + eta = 1, nthread = 2, + objective = "binary:logistic", booster = "gbtree", + base_score = 0.5 + ) + ) + + bst.GLM <- xgb.train( + data = xgb.DMatrix(sparse_matrix, label = label), + nrounds = nrounds, verbose = 0, + params = xgb.params( + eta = 1, nthread = 1, + objective = "binary:logistic", booster = "gblinear", + base_score = 0.5 + ) + ) feature.names <- colnames(sparse_matrix) @@ -45,13 +56,25 @@ if (isTRUE(VCD_AVAILABLE)) { # multiclass mlabel <- as.numeric(iris$Species) - 1 nclass <- 3 -mbst.Tree <- xgb.train(data = xgb.DMatrix(as.matrix(iris[, -5]), label = mlabel), verbose = 0, - max_depth = 3, eta = 0.5, nthread = 2, nrounds = nrounds, - objective = "multi:softprob", num_class = nclass, base_score = 0) - -mbst.GLM <- xgb.train(data = xgb.DMatrix(as.matrix(iris[, -5]), label = mlabel), verbose = 0, - booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds, - objective = "multi:softprob", num_class = nclass, base_score = 0) +mbst.Tree <- xgb.train( + data = xgb.DMatrix(as.matrix(iris[, -5]), label = mlabel), + verbose = 0, + nrounds = nrounds, + params = xgb.params( + max_depth = 3, eta = 0.5, nthread = 2, + objective = "multi:softprob", num_class = nclass, base_score = 0 + ) +) + +mbst.GLM <- xgb.train( + data = xgb.DMatrix(as.matrix(iris[, -5]), label = mlabel), + verbose = 0, + nrounds = nrounds, + params = xgb.params( + booster = "gblinear", eta = 0.1, nthread = 1, + objective = "multi:softprob", num_class = nclass, base_score = 0 + ) +) test_that("xgb.dump works", { .skip_if_vcd_not_available() @@ -74,9 +97,15 @@ test_that("xgb.dump works for gblinear", { expect_length(xgb.dump(bst.GLM), 14) # also make sure that it works properly for a sparse model where some coefficients # are 0 from setting large L1 regularization: - bst.GLM.sp <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = label), eta = 1, - nthread = 2, nrounds = 1, - alpha = 2, objective = "binary:logistic", booster = "gblinear") + bst.GLM.sp <- xgb.train( + data = xgb.DMatrix(sparse_matrix, label = label), + nrounds = 1, + params = xgb.params( + eta = 1, + nthread = 2, + alpha = 2, objective = "binary:logistic", booster = "gblinear" + ) + ) d.sp <- xgb.dump(bst.GLM.sp) expect_length(d.sp, 14) expect_gt(sum(d.sp == "0"), 0) @@ -327,7 +356,7 @@ test_that("xgb.importance works with and without feature names", { importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees) importance_from_dump <- function() { - model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees) + model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE) imp <- xgb.model.dt.tree( text = model_text_dump, trees = trees @@ -352,11 +381,13 @@ test_that("xgb.importance works with and without feature names", { expect_equal(importance_from_dump(), importance, tolerance = 1e-6) ## decision stump - m <- xgboost::xgb.train( + m <- xgb.train( data = xgb.DMatrix(as.matrix(data.frame(x = c(0, 1))), label = c(1, 2)), nrounds = 1, - base_score = 0.5, - nthread = 2 + params = xgb.params( + base_score = 0.5, + nthread = 2 + ) ) df <- xgb.model.dt.tree(model = m) expect_equal(df$Feature, "Leaf") @@ -384,9 +415,15 @@ test_that("xgb.importance works with GLM model", { test_that("xgb.model.dt.tree and xgb.importance work with a single split model", { .skip_if_vcd_not_available() - bst1 <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = label), max_depth = 1, - eta = 1, nthread = 2, nrounds = 1, verbose = 0, - objective = "binary:logistic") + bst1 <- xgb.train( + 
data = xgb.DMatrix(sparse_matrix, label = label), + nrounds = 1, verbose = 0, + params = xgb.params( + max_depth = 1, + eta = 1, nthread = 2, + objective = "binary:logistic" + ) + ) expect_error(dt <- xgb.model.dt.tree(model = bst1), regexp = NA) # no error expect_equal(nrow(dt), 3) expect_error(imp <- xgb.importance(model = bst1), regexp = NA) # no error @@ -406,7 +443,7 @@ test_that("xgb.plot.importance de-duplicates features", { test_that("xgb.plot.tree works with and without feature names", { .skip_if_vcd_not_available() - expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree.unnamed)) + expect_silent(xgb.plot.tree(model = bst.Tree.unnamed)) expect_silent(xgb.plot.tree(model = bst.Tree)) ## Categorical @@ -425,8 +462,9 @@ test_that("xgb.plot.tree works with and without feature names", { test_that("xgb.plot.multi.trees works with and without feature names", { .skip_if_vcd_not_available() - xgb.plot.multi.trees(model = bst.Tree.unnamed, feature_names = feature.names, features_keep = 3) + xgb.plot.multi.trees(model = bst.Tree.unnamed, features_keep = 3) xgb.plot.multi.trees(model = bst.Tree, features_keep = 3) + expect_true(TRUE) }) test_that("xgb.plot.deepness works", { @@ -544,18 +582,76 @@ test_that("xgb.plot.shap.summary ignores categorical features", { }) test_that("check.deprecation works", { - ttt <- function(a = NNULL, DUMMY = NULL, ...) { - check.deprecation(...) - as.list((environment())) - } - res <- ttt(a = 1, DUMMY = 2, z = 3) - expect_equal(res, list(a = 1, DUMMY = 2)) - expect_error( - res <- ttt(a = 1, dummy = 22, z = 3), - ) - expect_error( - res <- ttt(a = 1, dumm = 22, z = 3), + data(mtcars) + dm <- xgb.DMatrix(mtcars[, -1L], label = mtcars$mpg) + params <- xgb.params(nthread = 1, max_depth = 2, eval_metric = "rmse") + args_train <- list( + data = dm, + params = params, + nrounds = 10, + verbose = 0 ) + + # with exact name + expect_warning({ + model <- xgb.train( + data = dm, + params = params, + nrounds = 10, + watchlist = list(tr = dm), + verbose = 0 + ) + }, regexp = "watchlist") + expect_true(hasName(attributes(model), "evaluation_log")) + expect_equal(names(attributes(model)$evaluation_log), c("iter", "tr_rmse")) + + # with partial name match + expect_warning({ + model <- xgb.train( + data = dm, + params = params, + nrounds = 10, + watchlis = list(train = dm), + verbose = 0 + ) + }, regexp = "watchlist") + expect_true(hasName(attributes(model), "evaluation_log")) + expect_equal(names(attributes(model)$evaluation_log), c("iter", "train_rmse")) + + # error is thrown if argument cannot be matched + expect_error({ + model <- xgb.train( + data = dm, + params = params, + nrounds = 10, + watchlistt = list(train = dm), + verbose = 0 + ) + }, regexp = "unrecognized") + + # error should suggest to put under 'params' if it goes there + expect_error({ + model <- xgb.train( + data = dm, + nthread = 1, max_depth = 2, eval_metric = "rmse", + nrounds = 10, + watchlistt = list(train = dm), + verbose = 0 + ) + }, regexp = "should be passed as a list to argument 'params'") + + # can take more than one deprecated parameter + expect_warning({ + model <- xgb.train( + training.data = dm, + params = params, + nrounds = 10, + watchlis = list(tr = dm), + verbose = 0 + ) + }, regexp = "training.data") + expect_true(hasName(attributes(model), "evaluation_log")) + expect_equal(names(attributes(model)$evaluation_log), c("iter", "tr_rmse")) }) test_that('convert.labels works', { @@ -659,3 +755,35 @@ test_that("validate.features works as expected", { validate.features(model, 
tmp) }, "Feature types") }) + +test_that("Parameters constructor works as expected", { + empty_list <- list() + names(empty_list) <- character() + + params <- xgb.params() + expect_equal(params, empty_list) + + params <- xgb.params(max_depth = 2) + expect_equal(params, list(max_depth = 2)) + + params <- xgb.params(max_depth = NULL) + expect_equal(params, empty_list) + + max_depth <- 3 + params <- xgb.params(max_depth = max_depth) + expect_equal(params, list(max_depth = 3)) + + four <- 4L + params <- xgb.params(max_depth = four) + expect_equal(params, list(max_depth = 4L)) + + params <- xgb.params(objective = "binary:logistic", nthread = 10) + expect_equal(params, list(objective = "binary:logistic", nthread = 10)) + + expect_error({ + xgb.params(max_xgboost = 10) + }) + expect_error({ + xgb.params(max_depth = 2, max_depth = 3) + }) +}) diff --git a/R-package/tests/testthat/test_interaction_constraints.R b/R-package/tests/testthat/test_interaction_constraints.R index cfffb029ce84..d28f1e618d49 100644 --- a/R-package/tests/testthat/test_interaction_constraints.R +++ b/R-package/tests/testthat/test_interaction_constraints.R @@ -13,9 +13,15 @@ train <- matrix(c(x1, x2, x3), ncol = 3) test_that("interaction constraints for regression", { # Fit a model that only allows interaction between x1 and x2 - bst <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 3, - eta = 0.1, nthread = 2, nrounds = 100, verbose = 0, - interaction_constraints = list(c(0, 1))) + bst <- xgb.train( + data = xgb.DMatrix(train, label = y), + nrounds = 100, verbose = 0, + params = xgb.params( + max_depth = 3, + eta = 0.1, nthread = 2, + interaction_constraints = list(c(0, 1)) + ) + ) # Set all observations to have the same x3 values then increment # by the same amount @@ -52,13 +58,20 @@ test_that("interaction constraints scientific representation", { with_inc <- xgb.train( data = dtrain, - tree_method = 'hist', - interaction_constraints = inc, nrounds = 10, - nthread = n_threads + params = xgb.params( + tree_method = 'hist', + interaction_constraints = inc, + nthread = n_threads + ) ) without_inc <- xgb.train( - data = dtrain, tree_method = 'hist', nrounds = 10, nthread = n_threads + data = dtrain, + nrounds = 10, + params = xgb.params( + tree_method = 'hist', + nthread = n_threads + ) ) expect_equal(xgb.save.raw(with_inc), xgb.save.raw(without_inc)) }) diff --git a/R-package/tests/testthat/test_interactions.R b/R-package/tests/testthat/test_interactions.R index 1380225c79f7..a01adcc532d4 100644 --- a/R-package/tests/testthat/test_interactions.R +++ b/R-package/tests/testthat/test_interactions.R @@ -123,7 +123,7 @@ test_that("multiclass feature interactions work", { dm <- xgb.DMatrix( as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1, nthread = n_threads ) - param <- list( + param <- xgb.params( eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3, nthread = n_threads ) b <- xgb.train(param, dm, 40) @@ -152,10 +152,12 @@ test_that("SHAP single sample works", { test <- agaricus.test booster <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - max_depth = 2, nrounds = 4, - objective = "binary:logistic", - nthread = n_threads + params = xgb.params( + max_depth = 2, + objective = "binary:logistic", + nthread = n_threads + ) ) predt <- predict( diff --git a/R-package/tests/testthat/test_io.R b/R-package/tests/testthat/test_io.R index 36a6d7572f2f..3265ca0197af 100644 --- a/R-package/tests/testthat/test_io.R +++ b/R-package/tests/testthat/test_io.R @@ -9,8 +9,11 @@ 
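The behaviour that the parameters-constructor test checks can be summarized in a short sketch: xgb.params() returns a plain named list, silently drops NULL entries, and rejects names it does not recognize.

library(xgboost)

xgb.params(max_depth = 2, objective = "binary:logistic")  # a plain named list
xgb.params(max_depth = NULL)                              # NULL entries are dropped -> empty list
try(xgb.params(max_xgboost = 10))                         # unrecognized parameter names raise an error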
test_that("load/save raw works", { nrounds <- 8 booster <- xgb.train( data = xgb.DMatrix(train$data, label = train$label), - nrounds = nrounds, objective = "binary:logistic", - nthread = 2 + nrounds = nrounds, + params = xgb.params( + objective = "binary:logistic", + nthread = 2 + ) ) json_bytes <- xgb.save.raw(booster, raw_format = "json") @@ -34,7 +37,7 @@ test_that("saveRDS preserves C and R attributes", { dm <- xgb.DMatrix(x, label = y, nthread = 1) model <- xgb.train( data = dm, - params = list(nthread = 1, max_depth = 2), + params = xgb.params(nthread = 1, max_depth = 2), nrounds = 5 ) attributes(model)$my_attr <- "qwerty" diff --git a/R-package/tests/testthat/test_model_compatibility.R b/R-package/tests/testthat/test_model_compatibility.R index 613ba066f459..9bab6e0c91a7 100644 --- a/R-package/tests/testthat/test_model_compatibility.R +++ b/R-package/tests/testthat/test_model_compatibility.R @@ -87,7 +87,7 @@ test_that("Models from previous versions of XGBoost can be loaded", { booster <- readRDS(model_file) } else { booster <- xgb.load(model_file) - xgb.parameters(booster) <- list(nthread = 2) + xgb.model.parameters(booster) <- list(nthread = 2) } predict(booster, newdata = pred_data) run_booster_check(booster, name) diff --git a/R-package/tests/testthat/test_monotone.R b/R-package/tests/testthat/test_monotone.R index 671c02bd0658..70d67ceb7e22 100644 --- a/R-package/tests/testthat/test_monotone.R +++ b/R-package/tests/testthat/test_monotone.R @@ -7,9 +7,15 @@ train <- matrix(x, ncol = 1) test_that("monotone constraints for regression", { - bst <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 2, - eta = 0.1, nthread = 2, nrounds = 100, verbose = 0, - monotone_constraints = -1) + bst <- xgb.train( + data = xgb.DMatrix(train, label = y), + nrounds = 100, verbose = 0, + params = xgb.params( + max_depth = 2, + eta = 0.1, nthread = 2, + monotone_constraints = -1 + ) + ) pred <- predict(bst, train) diff --git a/R-package/tests/testthat/test_parameter_exposure.R b/R-package/tests/testthat/test_parameter_exposure.R index ed5c28ca5aaa..aacefe3a83ce 100644 --- a/R-package/tests/testthat/test_parameter_exposure.R +++ b/R-package/tests/testthat/test_parameter_exposure.R @@ -10,13 +10,17 @@ dtest <- xgb.DMatrix( agaricus.test$data, label = agaricus.test$label, nthread = 2 ) -bst <- xgb.train(data = dtrain, - max_depth = 2, - eta = 1, - nrounds = 10, - nthread = 1, - verbose = 0, - objective = "binary:logistic") +bst <- xgb.train( + data = dtrain, + verbose = 0, + nrounds = 10, + params = xgb.params( + max_depth = 2, + eta = 1, + nthread = 1, + objective = "binary:logistic" + ) +) test_that("call is exposed to R", { expect_false(is.null(attributes(bst)$call)) diff --git a/R-package/tests/testthat/test_poisson_regression.R b/R-package/tests/testthat/test_poisson_regression.R index adf199c052fb..c045091d044b 100644 --- a/R-package/tests/testthat/test_poisson_regression.R +++ b/R-package/tests/testthat/test_poisson_regression.R @@ -6,7 +6,8 @@ test_that("Poisson regression works", { data(mtcars) bst <- xgb.train( data = xgb.DMatrix(as.matrix(mtcars[, -11]), label = mtcars[, 11]), - objective = 'count:poisson', nrounds = 10, verbose = 0, nthread = 2 + nrounds = 10, verbose = 0, + params = xgb.params(objective = 'count:poisson', nthread = 2) ) expect_equal(class(bst), "xgb.Booster") pred <- predict(bst, as.matrix(mtcars[, -11])) @@ -21,7 +22,7 @@ test_that("Poisson regression is centered around mean", { x <- matrix(rnorm(m * n), nrow = m) model <- xgb.train( data = xgb.DMatrix(x, label 
= y), - params = list(objective = "count:poisson", gamma = 1e4), + params = xgb.params(objective = "count:poisson", gamma = 1e4), nrounds = 1 ) model_json <- xgb.save.raw(model, "json") |> rawToChar() |> jsonlite::fromJSON() @@ -41,7 +42,7 @@ test_that("Poisson regression is centered around mean", { w <- y + 1 model_weighted <- xgb.train( data = xgb.DMatrix(x, label = y, weight = w), - params = list(objective = "count:poisson", gamma = 1e4), + params = xgb.params(objective = "count:poisson", gamma = 1e4), nrounds = 1 ) model_json <- xgb.save.raw(model_weighted, "json") |> rawToChar() |> jsonlite::fromJSON() diff --git a/R-package/tests/testthat/test_ranking.R b/R-package/tests/testthat/test_ranking.R index 0e7db42da0b2..3a87bc60944b 100644 --- a/R-package/tests/testthat/test_ranking.R +++ b/R-package/tests/testthat/test_ranking.R @@ -15,7 +15,7 @@ test_that('Test ranking with unweighted data', { params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1, eval_metric = 'auc', eval_metric = 'aucpr', nthread = n_threads) - bst <- xgb.train(params, dtrain, nrounds = 10, evals = list(train = dtrain)) + bst <- xgb.train(params, dtrain, nrounds = 10, evals = list(train = dtrain), verbose = 0) # Check if the metric is monotone increasing expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0)) expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0)) @@ -39,7 +39,7 @@ test_that('Test ranking with weighted data', { eta = 1, tree_method = "exact", objective = "rank:pairwise", max_depth = 1, eval_metric = "auc", eval_metric = "aucpr", nthread = n_threads ) - bst <- xgb.train(params, dtrain, nrounds = 10, evals = list(train = dtrain)) + bst <- xgb.train(params, dtrain, nrounds = 10, evals = list(train = dtrain), verbose = 0) # Check if the metric is monotone increasing expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0)) expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0)) diff --git a/R-package/tests/testthat/test_unicode.R b/R-package/tests/testthat/test_unicode.R index 718d58109163..efdb32ac31f3 100644 --- a/R-package/tests/testthat/test_unicode.R +++ b/R-package/tests/testthat/test_unicode.R @@ -8,15 +8,21 @@ set.seed(1994) test_that("Can save and load models with Unicode paths", { nrounds <- 2 - bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, - eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic", - eval_metric = "error") + bst <- xgb.train( + data = xgb.DMatrix(train$data, label = train$label), + nrounds = nrounds, + params = xgb.params( + max_depth = 2, + eta = 1, nthread = 2, + objective = "binary:logistic" + ) + ) tmpdir <- tempdir() lapply(c("모델.json", "がうる・ぐら.json", "类继承.ubj"), function(x) { path <- file.path(tmpdir, x) xgb.save(bst, path) bst2 <- xgb.load(path) - xgb.parameters(bst2) <- list(nthread = 2) + xgb.model.parameters(bst2) <- list(nthread = 2) expect_equal(predict(bst, test$data), predict(bst2, test$data)) }) }) diff --git a/R-package/tests/testthat/test_xgboost.R b/R-package/tests/testthat/test_xgboost.R index 8f0c1e7ba9a7..b4e79decbb43 100644 --- a/R-package/tests/testthat/test_xgboost.R +++ b/R-package/tests/testthat/test_xgboost.R @@ -945,3 +945,70 @@ test_that("Column names from multiquantile are added to leaf predictions", { expect_equal(dim(pred), c(nrow(x), 1L, 3L)) expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75")) }) + +test_that("Evaluation fraction leaves examples of all classes for training", { + # With 
minimal sample leave no remainder + lst_args <- list( + dmatrix_args = list( + data = matrix(seq(1, 4), ncol = 1L), + label = c(0, 0, 1, 1) + ), + metadata = list( + y_levels = c("a", "b") + ), + params = list( + seed = 123 + ) + ) + for (retry in seq_len(10)) { + lst_args$params$seed <- retry + res <- process.eval.set(0.5, lst_args) + expect_equal(length(intersect(res$idx_train, res$idx_eval)), 0) + expect_equal(length(res$idx_train), 2L) + expect_equal(length(res$idx_eval), 2L) + expect_true(length(intersect(c(1L, 2L), res$idx_train)) >= 1L) + expect_true(length(intersect(c(3L, 4L), res$idx_train)) >= 1L) + } + + # With minimal sample leaving some remainder + lst_args <- list( + dmatrix_args = list( + data = matrix(seq(1, 5), ncol = 1L), + label = c(0, 0, 1, 1, 1) + ), + metadata = list( + y_levels = c("a", "b") + ), + params = list( + seed = 123 + ) + ) + for (retry in seq_len(20)) { + lst_args$params$seed <- retry + res <- process.eval.set(0.4, lst_args) + expect_equal(length(intersect(res$idx_train, res$idx_eval)), 0) + expect_equal(length(res$idx_train), 3L) + expect_equal(length(res$idx_eval), 2L) + expect_true(length(intersect(c(1L, 2L), res$idx_train)) >= 1L) + expect_true(length(intersect(c(3L, 4L, 5L), res$idx_train)) >= 1L) + } +}) + +test_that("'eval_set' as fraction works", { + y <- iris$Species + x <- iris[, -5L] + model <- xgboost( + x, + y, + base_margin = matrix(0.1, nrow = nrow(x), ncol = 3L), + eval_set = 0.2, + nthreads = 1L, + nrounds = 4L, + max_depth = 2L + ) + expect_true(hasName(attributes(model), "evaluation_log")) + evaluation_log <- attributes(model)$evaluation_log + expect_equal(nrow(evaluation_log), 4L) + expect_true(hasName(evaluation_log, "eval_mlogloss")) + expect_equal(length(attributes(model)$metadata$y_levels), 3L) +}) diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd index 8347d0ee0a84..ed82e3b8ce49 100644 --- a/R-package/vignettes/discoverYourData.Rmd +++ b/R-package/vignettes/discoverYourData.Rmd @@ -174,7 +174,7 @@ The code below is very usual. 
For more information, you can look at the document ```{r} bst <- xgboost(x = sparse_matrix, y = output_vector, - params = list(max_depth = 4, eta = 1), + max_depth = 4, eta = 1, nthread = 2, nrounds = 10) ``` @@ -302,12 +302,10 @@ test <- agaricus.test bst <- xgboost( x = train$data, y = factor(train$label, levels = c(0, 1)), - params = list( - max_depth = 4, - num_parallel_tree = 1000, - subsample = 0.5, - colsample_bytree = 0.5 - ), + max_depth = 4, + num_parallel_tree = 1000, + subsample = 0.5, + colsample_bytree = 0.5, nrounds = 1, nthread = 2 ) @@ -316,7 +314,7 @@ bst <- xgboost( bst <- xgboost( x = train$data, y = factor(train$label, levels = c(0, 1)), - params = list(max_depth = 4), + max_depth = 4, nrounds = 3, nthread = 2 ) diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index 911234b3da70..444f7ba96fa8 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -157,12 +157,12 @@ as a named list: ```{r} bstTrInterface <- xgb.train( data = xgb.DMatrix(train$data, label = train$label, nthread = 1) - , params = list( + , params = xgb.params( objective = "binary:logistic" , max_depth = 2 , eta = 1 + , nthread = 2 ) - , nthread = 2 , nrounds = 2 ) ``` @@ -343,9 +343,9 @@ For a better understanding of the learning progression, you may want to have som ```{r evals2, message=F, warning=F} bst <- xgb.train( data = dtrain - , max_depth = 2 , params = list( eta = 1 + , max_depth = 2 , nthread = 2 , objective = "binary:logistic" , eval_metric = "error" @@ -475,7 +475,7 @@ An interesting test to see how identical our saved model is to the original one # can be modified like this: RhpcBLASctl::omp_set_num_threads(1) bst2 <- xgb.load(fname) -xgb.parameters(bst2) <- list(nthread = 2) +xgb.model.parameters(bst2) <- list(nthread = 2) pred2 <- predict(bst2, test$data) # And now the test @@ -500,7 +500,7 @@ print(class(rawVec)) # load binary model to R bst3 <- xgb.load.raw(rawVec) -xgb.parameters(bst3) <- list(nthread = 2) +xgb.model.parameters(bst3) <- list(nthread = 2) pred3 <- predict(bst3, test$data) # pred2 should be identical to pred diff --git a/R-package/vignettes/xgboostfromJSON.Rmd b/R-package/vignettes/xgboostfromJSON.Rmd index e5331b0ff38c..2fbffb3b46b9 100644 --- a/R-package/vignettes/xgboostfromJSON.Rmd +++ b/R-package/vignettes/xgboostfromJSON.Rmd @@ -54,10 +54,12 @@ data <- data.frame(dates = dates, labels = labels) bst <- xgb.train( data = xgb.DMatrix(as.matrix(data$dates), label = labels, missing = NA), - nthread = 2, nrounds = 1, - objective = "binary:logistic", - max_depth = 1 + params = xgb.params( + objective = "binary:logistic", + nthread = 2, + max_depth = 1 + ) ) ``` diff --git a/demo/README.md b/demo/README.md index 2be1141dded0..b0c644b0c802 100644 --- a/demo/README.md +++ b/demo/README.md @@ -24,36 +24,31 @@ Code Examples ------------- ### Features Walkthrough +_Note: for the R package, see the in-package examples and vignettes instead_ + This is a list of short codes introducing different functionalities of xgboost packages. 
* Basic walkthrough of packages [python](guide-python/basic_walkthrough.py) - [R](../R-package/demo/basic_walkthrough.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl) [PHP](https://github.com/bpachev/xgboost-php/blob/master/demo/titanic_demo.php) * Customize loss function, and evaluation metric [python](guide-python/custom_objective.py) - [R](../R-package/demo/custom_objective.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/custom_objective.jl) * Boosting from existing prediction [python](guide-python/boost_from_prediction.py) - [R](../R-package/demo/boost_from_prediction.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) * Predicting using first n trees [python](guide-python/predict_first_ntree.py) - [R](../R-package/demo/predict_first_ntree.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/predict_first_ntree.jl) * Generalized Linear Model [python](guide-python/generalized_linear_model.py) - [R](../R-package/demo/generalized_linear_model.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl) * Cross validation [python](guide-python/cross_validation.py) - [R](../R-package/demo/cross_validation.R) [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl) * Predicting leaf indices [python](guide-python/predict_leaf_indices.py) - [R](../R-package/demo/predict_leaf_indices.R) ### Basic Examples by Tasks diff --git a/doc/R-package/index.rst b/doc/R-package/index.rst index bf9c1f8d9007..18de8d1c0902 100644 --- a/doc/R-package/index.rst +++ b/doc/R-package/index.rst @@ -14,7 +14,6 @@ Get Started *********** * Checkout the :doc:`Installation Guide ` contains instructions to install xgboost, and :doc:`Tutorials ` for examples on how to use XGBoost for various tasks. * Read the `API documentation `_. -* Please visit `Walk-through Examples `_. ********* Tutorials diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index d6effa0b09d4..c9c79231a2ec 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -11,78 +11,83 @@ project. :backlinks: none :local: -************** -GitHub Actions -************** -We make the extensive use of `GitHub Actions `_ to host our -CI pipelines. Most of the tests listed in the configuration files run automatically for every -incoming pull requests and every update to branches. A few tests however require manual activation: - -* R tests with ``noLD`` option: Run R tests using a custom-built R with compilation flag - ``--disable-long-double``. See `this page `_ for more - details about noLD. This is a requirement for keeping XGBoost on CRAN (the R package index). - To invoke this test suite for a particular pull request, simply add a review comment - ``/gha run r-nold-test``. (Ordinary comment won't work. It needs to be a review comment.) - -******************************* -Self-Hosted Runners with RunsOn -******************************* - -`RunsOn `_ is a SaaS (Software as a Service) app that lets us to easily create -self-hosted runners to use with GitHub Actions pipelines. RunsOn uses -`Amazon Web Services (AWS) `_ under the hood to provision runners with -access to various amount of CPUs, memory, and NVIDIA GPUs. Thanks to this app, we are able to test -GPU-accelerated and distributed algorithms of XGBoost while using the familar interface of -GitHub Actions. - -In GitHub Actions, jobs run on Microsoft-hosted runners by default. 
-To opt into self-hosted runners (enabled by RunsOn), we use the following special syntax:
+****************
+Tips for testing
+****************
+
+====================================
+Running R tests with ``noLD`` option
+====================================
+You can run R tests using a custom-built R with compilation flag
+``--disable-long-double``. See `this page `_ for more
+details about noLD. This is a requirement for keeping XGBoost on CRAN (the R package index).
+Unlike other tests, this test must be invoked manually. Simply add a review comment
+``/gha run r-nold-test`` to a pull request to kick off the test.
+(Ordinary comment won't work. It needs to be a review comment.)
+
+===============================
+Making changes to CI containers
+===============================
+Many of the CI pipelines use Docker containers to ensure a consistent testing environment
+with a variety of software packages. We have a separate repo,
+`dmlc/xgboost-devops `_, to host the logic for
+building and publishing CI containers.
+
+To make changes to the CI container, carry out the following steps:
+
+1. Identify which container needs updating. Example:
+   ``492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main``
+2. Clone `dmlc/xgboost-devops `_ and make changes to the
+   corresponding Dockerfile. Example: ``containers/dockerfile/Dockerfile.gpu``.
+3. Locally build the container to ensure that it builds successfully.
+   Consult :ref:`build_run_docker_locally` for this step.
+4. Submit a pull request to `dmlc/xgboost-devops `_ with
+   the proposed changes to the Dockerfile. Make note of the pull request number. Example: ``#204``
+5. Clone `dmlc/xgboost `_ and update all references to the
+   old container to point to the new container. More specifically, all Docker tags of format
+   ``492475357299.dkr.ecr.us-west-2.amazonaws.com/[container_id]:main`` should have the last
+   component replaced with ``PR-#``, where ``#`` is the pull request number. For the example above,
+   we'd replace ``492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main`` with
+   ``492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:PR-204``.
+6. Now submit a pull request to `dmlc/xgboost `_. The CI will
+   run tests using the new container. Verify that all tests pass.
+7. Merge the pull request in ``dmlc/xgboost-devops``. Wait until the CI completes on the ``main`` branch.
+8. Go back to the pull request for ``dmlc/xgboost`` and change the container references back
+   to ``:main``.
+9. Merge the pull request in ``dmlc/xgboost``.
-.. code-block:: yaml
-
-  runs-on:
-  - runs-on
-  - runner=runner-name
-  - run-id=${{ github.run_id }}
-  - tag=[unique tag that uniquely identifies the job in the GH Action workflow]
+.. _build_run_docker_locally:
+
-where the runner is defined in ``.github/runs-on.yml``.
+===========================================
+Reproducing CI testing environments locally
+===========================================
+You can reproduce the same testing environment as the CI pipelines by building and running Docker
+containers locally.
-*********************************************************
-Reproduce CI testing environments using Docker containers
-*********************************************************
-In our CI pipelines, we use Docker containers extensively to package many software packages together.
-You can reproduce the same testing environment as the CI pipelines by running Docker locally.
+**Prerequisites**
-=============
-Prerequisites
-=============
1.
Install Docker: https://docs.docker.com/engine/install/ubuntu/ 2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html. The runtime lets you access NVIDIA GPUs inside a Docker container. -.. _build_run_docker_locally: - -============================================== -Building and Running Docker containers locally -============================================== -For your convenience, we provide three wrapper scripts: - -* ``ops/docker_build.py``: Build a Docker container -* ``ops/docker_build.sh``: Wrapper for ``ops/docker_build.py`` with a more concise interface -* ``ops/docker_run.py``: Run a command inside a Docker container - -**To build a Docker container**, invoke ``docker_build.sh`` as follows: +--------------------------- +To build a Docker container +--------------------------- +Clone the repository `dmlc/xgboost-devops `_ +and invoke ``containers/docker_build.sh`` as follows: .. code-block:: bash - export BRANCH_NAME="master" # Relevant for CI, for local testing, use "master" - bash ops/docker_build.sh CONTAINER_ID + # The following env vars are only relevant for CI + # For local testing, set them to "main" + export GITHUB_SHA="main" + export BRANCH_NAME="main" + bash containers/docker_build.sh CONTAINER_ID where ``CONTAINER_ID`` identifies for the container. The wrapper script will look up the YAML file -``ops/docker/ci_container.yml``. For example, when ``CONTAINER_ID`` is set to ``xgb-ci.gpu``, -the script will use the corresponding entry from ``ci_container.yml``: +``containers/ci_container.yml``. For example, when ``CONTAINER_ID`` is set to ``xgb-ci.gpu``, +the script will use the corresponding entry from ``containers/ci_container.yml``: .. code-block:: yaml @@ -94,9 +99,9 @@ the script will use the corresponding entry from ``ci_container.yml``: RAPIDS_VERSION_ARG: "24.10" The ``container_def`` entry indicates where the Dockerfile is located. The container -definition will be fetched from ``ops/docker/dockerfile/Dockerfile.CONTAINER_DEF`` where +definition will be fetched from ``containers/dockerfile/Dockerfile.CONTAINER_DEF`` where ``CONTAINER_DEF`` is the value of ``container_def`` entry. In this example, the Dockerfile -is ``ops/docker/dockerfile/Dockerfile.gpu``. +is ``containers/dockerfile/Dockerfile.gpu``. The ``build_args`` entry lists all the build arguments for the Docker build. In this example, the build arguments are: @@ -108,38 +113,21 @@ the build arguments are: The build arguments provide inputs to the ``ARG`` instructions in the Dockerfile. -.. note:: Inspect the logs from the CI pipeline to find what's going on under the hood - - When invoked, ``ops/docker_build.sh`` logs the precise commands that it runs under the hood. - Using the example above: - - .. code-block:: bash - - # docker_build.sh calls docker_build.py... - python3 ops/docker_build.py --container-def gpu --container-id xgb-ci.gpu \ - --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg NCCL_VERSION_ARG=2.23.4-1 \ - --build-arg RAPIDS_VERSION_ARG=24.10 +When ``containers/docker_build.sh`` completes, you will have access to the container with tag +``492475357299.dkr.ecr.us-west-2.amazonaws.com/[container_id]:main``. The prefix +``492475357299.dkr.ecr.us-west-2.amazonaws.com/`` was added so that the container could +later be uploaded to AWS Elastic Container Registry (ECR), a private Docker registry. - ... - - # .. and docker_build.py in turn calls "docker build"... 
- docker build --build-arg CUDA_VERSION_ARG=12.4.1 \ - --build-arg NCCL_VERSION_ARG=2.23.4-1 \ - --build-arg RAPIDS_VERSION_ARG=24.10 \ - --load --progress=plain \ - --ulimit nofile=1024000:1024000 \ - -t xgb-ci.gpu \ - -f ops/docker/dockerfile/Dockerfile.gpu \ - ops/ - - The logs come in handy when debugging the container builds. In addition, you can change - the build arguments to make changes to the container. - -**To run commands within a Docker container**, invoke ``docker_run.py`` as follows: +----------------------------------------- +To run commands within a Docker container +----------------------------------------- +Invoke ``ops/docker_run.py`` from the main ``dmlc/xgboost`` repo as follows: .. code-block:: bash - python3 ops/docker_run.py --container-id "ID of the container" [--use-gpus] \ + python3 ops/docker_run.py \ + --container-tag 492475357299.dkr.ecr.us-west-2.amazonaws.com/[container_id]:main \ + [--use-gpus] \ -- "command to run inside the container" where ``--use-gpus`` should be specified to expose NVIDIA GPUs to the Docker container. @@ -149,83 +137,151 @@ For example: .. code-block:: bash # Run without GPU - python3 ops/docker_run.py --container-id xgb-ci.cpu \ - -- bash ops/script/build_via_cmake.sh + python3 ops/docker_run.py \ + --container-tag 492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.cpu:main \ + -- bash ops/pipeline/build-cpu-impl.sh cpu # Run with NVIDIA GPU - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + python3 ops/docker_run.py \ + --container-tag 492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main \ + --use-gpus \ -- bash ops/pipeline/test-python-wheel-impl.sh gpu -The ``docker_run.py`` script will convert these commands to the following invocations -of ``docker run``: - -.. code-block:: bash - - docker run --rm --pid=host \ - -w /workspace -v /path/to/xgboost:/workspace \ - -e CI_BUILD_UID= -e CI_BUILD_USER= \ - -e CI_BUILD_GID= -e CI_BUILD_GROUP= \ - xgb-ci.cpu \ - bash ops/script/build_via_cmake.sh - - docker run --rm --pid=host --gpus all \ - -w /workspace -v /path/to/xgboost:/workspace \ - -e CI_BUILD_UID= -e CI_BUILD_USER= \ - -e CI_BUILD_GID= -e CI_BUILD_GROUP= \ - xgb-ci.gpu \ - bash ops/pipeline/test-python-wheel-impl.sh gpu - Optionally, you can specify ``--run-args`` to pass extra arguments to ``docker run``: .. code-block:: bash # Allocate extra space in /dev/shm to enable NCCL # Also run the container with elevated privileges - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + python3 ops/docker_run.py \ + --container-tag 492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main \ + --use-gpus \ --run-args='--shm-size=4g --privileged' \ -- bash ops/pipeline/test-python-wheel-impl.sh gpu -which translates to +See :ref:`ci_container_infra` to read about how containers are built and managed in the CI pipelines. -.. code-block:: bash +-------------------------------------------- +Examples: useful tasks for local development +-------------------------------------------- - docker run --rm --pid=host --gpus all \ - -w /workspace -v /path/to/xgboost:/workspace \ - -e CI_BUILD_UID= -e CI_BUILD_USER= \ - -e CI_BUILD_GID= -e CI_BUILD_GROUP= \ - --shm-size=4g --privileged \ - xgb-ci.gpu \ - bash ops/pipeline/test-python-wheel-impl.sh gpu +* Build XGBoost with GPU support + package it as a Python wheel + + .. 
code-block:: bash
+
+    export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
+    python3 ops/docker_run.py \
+      --container-tag ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8:main \
+      -- ops/pipeline/build-cuda-impl.sh
+
+* Run Python tests
+
+  .. code-block:: bash
+
+    export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
+    python3 ops/docker_run.py \
+      --container-tag ${DOCKER_REGISTRY}/xgb-ci.cpu:main \
+      -- ops/pipeline/test-python-wheel-impl.sh cpu
+
+* Run Python tests with GPU algorithm
+
+  .. code-block:: bash
+
+    export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
+    python3 ops/docker_run.py \
+      --container-tag ${DOCKER_REGISTRY}/xgb-ci.gpu:main \
+      --use-gpus \
+      -- ops/pipeline/test-python-wheel-impl.sh gpu
+
+* Run Python tests with GPU algorithm, with multiple GPUs
+
+  .. code-block:: bash
+
+    export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
+    python3 ops/docker_run.py \
+      --container-tag ${DOCKER_REGISTRY}/xgb-ci.gpu:main \
+      --use-gpus \
+      --run-args='--shm-size=4g' \
+      -- ops/pipeline/test-python-wheel-impl.sh mgpu
+      # --shm-size=4g is needed for multi-GPU algorithms to function
+
+* Build and test JVM packages
+
+  .. code-block:: bash
+
+    export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
+    export SCALA_VERSION=2.12  # Specify Scala version (2.12 or 2.13)
+    python3 ops/docker_run.py \
+      --container-tag ${DOCKER_REGISTRY}/xgb-ci.jvm:main \
+      --run-args "-e SCALA_VERSION" \
+      -- ops/pipeline/build-test-jvm-packages-impl.sh
+
+* Build and test JVM packages, with GPU support
+
+  .. code-block:: bash
+
+    export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com
+    export SCALA_VERSION=2.12  # Specify Scala version (2.12 or 2.13)
+    export USE_CUDA=1
+    python3 ops/docker_run.py \
+      --container-tag ${DOCKER_REGISTRY}/xgb-ci.jvm_gpu_build:main \
+      --use-gpus \
+      --run-args "-e SCALA_VERSION -e USE_CUDA --shm-size=4g" \
+      -- ops/pipeline/build-test-jvm-packages-impl.sh
+      # --shm-size=4g is needed for multi-GPU algorithms to function
+
+*****************************
+Tour of the CI infrastructure
+*****************************
+
+==============
+GitHub Actions
+==============
+We make extensive use of `GitHub Actions `_ to host our
+CI pipelines. Most of the tests listed in the configuration files run automatically for every
+incoming pull request and every update to branches.
+
+===============================
+Self-Hosted Runners with RunsOn
+===============================
+`RunsOn `_ is a SaaS (Software as a Service) app that lets us easily create
+self-hosted runners to use with GitHub Actions pipelines. RunsOn uses
+`Amazon Web Services (AWS) `_ under the hood to provision runners with
+access to various amounts of CPUs, memory, and NVIDIA GPUs. Thanks to this app, we are able to test
+GPU-accelerated and distributed algorithms of XGBoost while using the familiar interface of
+GitHub Actions.
+
+In GitHub Actions, jobs run on Microsoft-hosted runners by default.
+To opt into self-hosted runners (enabled by RunsOn), we use the following special syntax:
+
+.. code-block:: yaml
-*******************************
+
+  runs-on:
+    - runs-on
+    - runner=runner-name
+    - run-id=${{ github.run_id }}
+    - tag=[unique tag that uniquely identifies the job in the GH Action workflow]
+
+where the runner is defined in ``.github/runs-on.yml``.
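+
+For illustration, a complete job entry using this selector might look like the following sketch.
+(The job name and tag below are hypothetical placeholders, and ``linux-amd64-cpu`` is used only as
+an example; the runner name must match an entry defined in ``.github/runs-on.yml``.)
+
+.. code-block:: yaml
+
+  jobs:
+    example-job:
+      name: Example job on a self-hosted runner
+      runs-on:
+        - runs-on
+        - runner=linux-amd64-cpu
+        - run-id=${{ github.run_id }}
+        - tag=example-job
+      steps:
+        - uses: actions/checkout@v4
+          with:
+            submodules: "true"
+        # Placeholder for the actual build or test steps of the pipeline
+        - run: echo "running on a RunsOn-provisioned runner"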
+ +=================================================================== The Lay of the Land: how CI pipelines are organized in the codebase -******************************************************************* +=================================================================== The XGBoost project stores the configuration for its CI pipelines as part of the codebase. The git repository therefore stores not only the change history for its source code but also the change history for the CI pipelines. -================= -File Organization -================= - The CI pipelines are organized into the following directories and files: * ``.github/workflows/``: Definition of CI pipelines, using the GitHub Actions syntax * ``.github/runs-on.yml``: Configuration for the RunsOn service. Specifies the spec for the self-hosted CI runners. * ``ops/conda_env/``: Definitions for Conda environments -* ``ops/packer/``: Packer scripts to build VM images for Amazon EC2 * ``ops/patch/``: Patch files * ``ops/pipeline/``: Shell scripts defining CI/CD pipelines. Most of these scripts can be run locally (to assist with development and debugging); a few must run in the CI. * ``ops/script/``: Various utility scripts useful for testing -* ``ops/docker/dockerfile/``: Dockerfiles to define containers -* ``ops/docker/ci_container.yml``: Defines the mapping between Dockerfiles and containers. - Also specifies the build arguments to be used with each container. See - :ref:`build_run_docker_locally` to learn how this YAML file is used in the context of - a container build. -* ``ops/docker_build.*``: Wrapper scripts to build and test CI containers. See - :ref:`build_run_docker_locally` for the detailed description. +* ``ops/docker_run.py``: Wrapper script to run commands inside a container To inspect a given CI pipeline, inspect files in the following order: @@ -255,78 +311,93 @@ To inspect a given CI pipeline, inspect files in the following order: :align: center :figwidth: 80 % -=================================== -Primitives used in the CI pipelines -=================================== +Many of the CI pipelines use Docker containers to ensure consistent testing environment +with a variety of software packages. We have a separate repo, +`dmlc/xgboost-devops `_, that +hosts the code for building the CI containers. The repository is organized as follows: ------------------------- -Build and run containers ------------------------- +* ``actions/``: Custom actions to be used with GitHub Actions. See :ref:`custom_actions` + for more details. +* ``containers/dockerfile/``: Dockerfiles to define containers +* ``containers/ci_container.yml``: Defines the mapping between Dockerfiles and containers. + Also specifies the build arguments to be used with each container. +* ``containers/docker_build.{py,sh}``: Wrapper scripts to build and test CI containers. +* ``vm_images/``: Defines bootstrap scripts to build VM images for Amazon EC2. See + :ref:`vm_images` to learn about how VM images relate to container images. See :ref:`build_run_docker_locally` to learn about the utility scripts for building and using containers. -**What's the relationship between the VM image (for Amazon EC2) and the container image?** -In ``ops/packer/`` directory, we define Packer scripts to build VM images for Amazon EC2. -The VM image contains the minimal set of drivers and system software that are needed to -run the containers. 
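+
+To see how these pieces fit together, here is a rough sketch of the ``steps`` portion of a typical
+job in ``.github/workflows/`` (the pipeline script name below is an illustrative placeholder;
+consult the actual workflow files for the real definitions). A job usually checks out the source,
+logs into the Docker registry, and then delegates to a script in ``ops/pipeline/``, which in turn
+uses ``ops/docker_run.py`` to run the actual work inside a CI container:
+
+.. code-block:: yaml
+
+  steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: "true"
+    # Log into the private Docker registry (AWS ECR) that hosts the CI containers
+    - run: bash ops/pipeline/login-docker-registry.sh
+    # Delegate the build/test work to a pipeline script (placeholder name)
+    - run: bash ops/pipeline/example-pipeline-script.sh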
+=========================================== +Artifact sharing between jobs via Amazon S3 +=========================================== -We update container images much more often than VM images. Whereas VM images are -updated sparingly (once in a few months), container images are updated each time a branch -or a pull request is updated. This way, developers can make changes to containers and -see the results of the changes immediately in the CI run. +We make artifacts from one workflow job available to another job, by uploading the +artifacts to `Amazon S3 `_. In the CI, we utilize the +script ``ops/pipeline/manage-artifacts.py`` to coordinate artifact sharing. ------------------------------------------- -Stash artifacts, to move them between jobs ------------------------------------------- +**To upload files to S3**: In the workflow YAML, add the following lines: -This primitive is useful when one pipeline job needs to consume the output -from another job. -We use `Amazon S3 `_ to store the stashed files. - -**To stash a file**: - -.. code-block:: bash +.. code-block:: yaml - REMOTE_PREFIX="remote directory to place the artifact(s)" - bash ops/pipeline/stash-artifacts.sh stash "${REMOTE_PREFIX}" path/to/file + - name: Upload files to S3 + run: | + REMOTE_PREFIX="remote directory to place the artifact(s)" + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/${REMOTE_PREFIX} \ + path/to/file -The ``REMOTE_PREFIX`` argument, which is the second command-line argument -for ``stash-artifacts.sh``, specifies the remote directory in which the artifact(s) -should be placed. More precisely, the artifact(s) will be placed in -``s3://{RUNS_ON_S3_BUCKET_CACHE}/cache/{GITHUB_REPOSITORY}/stash/{GITHUB_RUN_ID}/{REMOTE_PREFIX}/`` -where ``RUNS_ON_S3_BUCKET_CACHE``, ``GITHUB_REPOSITORY``, and ``GITHUB_RUN_ID`` are set by -the CI. (RunsOn provisions an S3 bucket to stage cache, and its name is stored in the environment -variable ``RUNS_ON_S3_BUCKET_CACHE``.) +The ``--prefix`` argument specifies the remote directory in which the artifact(s) +should be placed. The artifact(s) will be placed in +``s3://{RUNS_ON_S3_BUCKET_CACHE}/cache/{GITHUB_RUN_ID}/{REMOTE_PREFIX}/`` +where ``RUNS_ON_S3_BUCKET_CACHE`` and ``GITHUB_RUN_ID`` are set by the CI. You can upload multiple files, possibly with wildcard globbing: -.. code-block:: bash +.. code-block:: yaml - REMOTE_PREFIX="build-cuda" - bash ops/pipeline/stash-artifacts.sh stash "${REMOTE_PREFIX}" \ - build/testxgboost python-package/dist/*.whl + - name: Upload files to S3 + run: | + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-cuda \ + build/testxgboost python-package/dist/*.whl -**To unstash a file**: +**To download files from S3**: In the workflow YAML, add the following lines: -.. code-block:: bash +.. code-block:: yaml - REMOTE_PREFIX="remote directory to place the artifact(s)" - bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" path/to/file + - name: Download files from S3 + run: | + REMOTE_PREFIX="remote directory where the artifact(s) were placed" + python3 ops/pipeline/manage-artifacts.py download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/${REMOTE_PREFIX} \ + --dest-dir path/to/destination_directory \ + artifacts -You can also use the wildcard globbing. 
The script will download the matching artifacts -from the remote directory. +You can also use the wildcard globbing. The script will locate all artifacts +under the given prefix that matches the wildcard pattern. -.. code-block:: bash +.. code-block:: yaml - REMOTE_PREFIX="build-cuda" - # Download all files whose path matches the wildcard pattern python-package/dist/*.whl - bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" \ - python-package/dist/*.whl + - name: Download files from S3 + run: | + # Locate all artifacts with name *.whl under prefix + # cache/${GITHUB_RUN_ID}/${REMOTE_PREFIX} and + # download them to wheelhouse/. + python3 ops/pipeline/manage-artifacts.py download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/${REMOTE_PREFIX} \ + --dest-dir wheelhouse/ \ + *.whl ------------------------------------------ -Custom actions in ``dmlc/xgboost-devops`` ------------------------------------------ +.. _custom_actions: + +================================= +Custom actions for GitHub Actions +================================= XGBoost implements a few custom `composite actions `_ @@ -334,22 +405,137 @@ to reduce duplicated code within workflow YAML files. The custom actions are hos `dmlc/xgboost-devops `_, to make it easy to test changes to the custom actions in a pull request or a fork. -In a workflow file, we'd refer to ``dmlc/xgboost-devops/{custom-action}@main``. For example: +In a workflow file, we'd refer to ``dmlc/xgboost-devops/actions/{custom-action}@main``. For example: .. code-block:: yaml - - uses: dmlc/xgboost-devops/miniforge-setup@main + - uses: dmlc/xgboost-devops/actions/miniforge-setup@main with: environment-name: cpp_test environment-file: ops/conda_env/cpp_test.yml Each custom action consists of two components: -* Main script (``dmlc/xgboost-devops/{custom-action}/action.yml``): dispatches to a specific version +* Main script (``dmlc/xgboost-devops/actions/{custom-action}/action.yml``): dispatches to a specific version of the implementation script (see the next item). The main script clones ``xgboost-devops`` from a specified fork at a particular ref, allowing us to easily test changes to the custom action. -* Implementation script (``dmlc/xgboost-devops/impls/{custom-action}/action.yml``): Implements the +* Implementation script (``dmlc/xgboost-devops/actions/impls/{custom-action}/action.yml``): Implements the custom script. This design was inspired by Mike Sarahan's work in `rapidsai/shared-actions `_. + + +.. _ci_container_infra: + +============================================================= +Infra for building and publishing CI containers and VM images +============================================================= + +-------------------------- +Notes on Docker containers +-------------------------- +**CI pipeline for containers** + +The `dmlc/xgboost-devops `_ repo hosts a CI pipeline to build new +Docker containers at a regular schedule. New containers are built in the following occasions: + +* New commits are added to the ``main`` branch of ``dmlc/xgboost-devops``. +* New pull requests are submitted to ``dmlc/xgboost-devops``. +* Every week, at a set day and hour. + +This setup ensures that the CI containers remain up-to-date. + +**How wrapper scripts work** + +The wrapper scripts ``docker_build.sh``, ``docker_build.py`` (in ``dmlc/xgboost-devops``) and ``docker_run.py`` +(in ``dmlc/xgboost``) are designed to transparently log what commands are being carried out under the hood. 
+For example, when you run ``bash containers/docker_build.sh xgb-ci.gpu``, the logs will show the following:
+
+.. code-block:: bash
+
+  # docker_build.sh calls docker_build.py...
+  python3 containers/docker_build.py --container-def gpu \
+    --container-tag 492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main \
+    --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+    --build-arg RAPIDS_VERSION_ARG=24.10
+
+  ...
+
+  # .. and docker_build.py in turn calls "docker build"...
+  docker build --build-arg CUDA_VERSION_ARG=12.4.1 \
+    --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+    --build-arg RAPIDS_VERSION_ARG=24.10 \
+    --load --progress=plain \
+    --ulimit nofile=1024000:1024000 \
+    -t 492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main \
+    -f containers/dockerfile/Dockerfile.gpu \
+    containers/
+
+The logs come in handy when debugging the container builds.
+
+Here is an example with ``docker_run.py``:
+
+.. code-block:: bash
+
+  # Run without GPU
+  python3 ops/docker_run.py \
+    --container-tag 492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.cpu:main \
+    -- bash ops/pipeline/build-cpu-impl.sh cpu
+
+  # Run with NVIDIA GPU
+  # Allocate extra space in /dev/shm to enable NCCL
+  # Also run the container with elevated privileges
+  python3 ops/docker_run.py \
+    --container-tag 492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main \
+    --use-gpus \
+    --run-args='--shm-size=4g --privileged' \
+    -- bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+which are translated to the following ``docker run`` invocations:
+
+.. code-block:: bash
+
+  docker run --rm --pid=host \
+    -w /workspace -v /path/to/xgboost:/workspace \
+    -e CI_BUILD_UID= -e CI_BUILD_USER= \
+    -e CI_BUILD_GID= -e CI_BUILD_GROUP= \
+    492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.cpu:main \
+    bash ops/pipeline/build-cpu-impl.sh cpu
+
+  docker run --rm --pid=host --gpus all \
+    -w /workspace -v /path/to/xgboost:/workspace \
+    -e CI_BUILD_UID= -e CI_BUILD_USER= \
+    -e CI_BUILD_GID= -e CI_BUILD_GROUP= \
+    --shm-size=4g --privileged \
+    492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main \
+    bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+
+.. _vm_images:
+
+------------------
+Notes on VM images
+------------------
+In the ``vm_images/`` directory of `dmlc/xgboost-devops `_,
+we define Packer scripts to build images for Virtual Machines (VM) on
+`Amazon EC2 `_.
+The VM image contains the minimal set of drivers and system software that are needed to
+run the containers.
+
+We update container images much more often than VM images. Whereas it takes only 10 minutes to
+build a new container image, it takes 1-2 hours to build a new VM image.
+
+To enable a quick development iteration cycle, we place most of
+the development environment in containers and keep VM images small.
+Packages needed for testing should be baked into containers, not VM images.
+Developers can make changes to containers and see the results of the changes quickly.
+
+.. note:: Special note for the Windows platform
+
+  We do not use containers when testing XGBoost on Windows. All software must be baked into
+  the VM image. Containers are not used because
+  `NVIDIA Container Toolkit `_
+  does not yet support Windows natively.
+
+The `dmlc/xgboost-devops `_ repo hosts a CI pipeline to build new
+VM images on a regular schedule (currently monthly).
diff --git a/doc/get_started.rst b/doc/get_started.rst index 69254777ddc5..5c717adafe05 100644 --- a/doc/get_started.rst +++ b/doc/get_started.rst @@ -44,7 +44,8 @@ R train <- agaricus.train test <- agaricus.test # fit model - bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nrounds = 2, + bst <- xgboost(x = train$data, y = factor(train$label), + max.depth = 2, eta = 1, nrounds = 2, nthread = 2, objective = "binary:logistic") # predict pred <- predict(bst, test$data) diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst index cfdd20da074d..0e2793cfae9f 100644 --- a/doc/python/python_intro.rst +++ b/doc/python/python_intro.rst @@ -166,10 +166,6 @@ Support Matrix +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | dlpack | CPA | CPA | | CPA | FF | FF | +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ -| datatable.Frame | T | FF | | NPA | FF | | -+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ -| datatable.Table | T | FF | | NPA | FF | | -+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | modin.DataFrame | NPA | FF | NPA | NPA | FF | | +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | modin.Series | NPA | FF | NPA | NPA | FF | | diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 6ae1dea8d3ce..5adc689554d2 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -254,22 +254,6 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const float *data, // NOLINT bst_ulong nrow, bst_ulong ncol, float missing, DMatrixHandle *out, int nthread); -/*! - * \brief create matrix content from python data table - * \param data pointer to pointer to column data - * \param feature_stypes pointer to strings - * \param nrow number of rows - * \param ncol number columns - * \param out created dmatrix - * \param nthread number of threads (up to maximum cores available, if <=0 use all cores) - * \return 0 when success, -1 when failure happens - */ -XGB_DLL int XGDMatrixCreateFromDT(void** data, - const char ** feature_stypes, - bst_ulong nrow, - bst_ulong ncol, - DMatrixHandle* out, - int nthread); /*! * \brief Create DMatrix from CUDA columnar format. 
(cuDF) diff --git a/ops/conda_env/linux_cpu_test.yml b/ops/conda_env/linux_cpu_test.yml index 1ec2a5447604..8621cc808693 100644 --- a/ops/conda_env/linux_cpu_test.yml +++ b/ops/conda_env/linux_cpu_test.yml @@ -39,5 +39,3 @@ dependencies: - cloudpickle - modin - pyspark>=3.4.0 -- pip: - - datatable diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml deleted file mode 100644 index 348bf90f8a1f..000000000000 --- a/ops/docker/ci_container.yml +++ /dev/null @@ -1,72 +0,0 @@ -## List of CI containers with definitions and build arguments - -# Each container will be built using the definition from -# ops/docker/dockerfile/Dockerfile.CONTAINER_DEF - -rapids_versions: - stable: &rapids_version "24.10" - dev: &dev_rapids_version "24.12" - -xgb-ci.gpu_build_rockylinux8: - container_def: gpu_build_rockylinux8 - build_args: - CUDA_VERSION_ARG: "12.4.1" - NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: *rapids_version - -xgb-ci.gpu_build_rockylinux8_dev_ver: - container_def: gpu_build_rockylinux8 - build_args: - CUDA_VERSION_ARG: "12.4.1" - NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: *dev_rapids_version - -xgb-ci.gpu_build_r_rockylinux8: - container_def: gpu_build_r_rockylinux8 - build_args: - CUDA_VERSION_ARG: "12.4.1" - R_VERSION_ARG: "4.3.2" - -xgb-ci.gpu: - container_def: gpu - build_args: - CUDA_VERSION_ARG: "12.4.1" - NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: *rapids_version - -xgb-ci.gpu_dev_ver: - container_def: gpu - build_args: - CUDA_VERSION_ARG: "12.4.1" - NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: *dev_rapids_version - RAPIDSAI_CONDA_CHANNEL_ARG: "rapidsai-nightly" - -xgb-ci.clang_tidy: - container_def: clang_tidy - build_args: - CUDA_VERSION_ARG: "12.4.1" - -xgb-ci.cpu: - container_def: cpu - -xgb-ci.aarch64: - container_def: aarch64 - -xgb-ci.manylinux_2_28_x86_64: - container_def: manylinux_2_28_x86_64 - -xgb-ci.manylinux2014_x86_64: - container_def: manylinux2014_x86_64 - -xgb-ci.manylinux2014_aarch64: - container_def: manylinux2014_aarch64 - -xgb-ci.jvm: - container_def: jvm - -xgb-ci.jvm_gpu_build: - container_def: jvm_gpu_build - build_args: - CUDA_VERSION_ARG: "12.4.1" - NCCL_VERSION_ARG: "2.23.4-1" diff --git a/ops/docker/docker_cache_ecr.yml b/ops/docker/docker_cache_ecr.yml deleted file mode 100644 index e20f35fc8020..000000000000 --- a/ops/docker/docker_cache_ecr.yml +++ /dev/null @@ -1,4 +0,0 @@ -## Constants for AWS ECR (Elastic Container Registry), used for the Docker cache - -DOCKER_CACHE_ECR_ID: "492475357299" -DOCKER_CACHE_ECR_REGION: "us-west-2" diff --git a/ops/docker/dockerfile/Dockerfile.aarch64 b/ops/docker/dockerfile/Dockerfile.aarch64 deleted file mode 100644 index 9dff2a05230b..000000000000 --- a/ops/docker/dockerfile/Dockerfile.aarch64 +++ /dev/null @@ -1,38 +0,0 @@ -FROM quay.io/pypa/manylinux_2_28_aarch64 - -SHELL ["/bin/bash", "-c"] # Use Bash as shell - -# Install all basic requirements -RUN \ - dnf -y update && \ - dnf -y install dnf-plugins-core && \ - dnf config-manager --set-enabled powertools && \ - dnf install -y tar unzip wget xz git which ninja-build gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh && \ - bash conda.sh -b -p /opt/miniforge - -ENV PATH=/opt/miniforge/bin:$PATH -ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc -ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ -ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp -ENV GOSU_VERSION=1.10 - -# 
Create new Conda environment -COPY conda_env/aarch64_test.yml /scripts/ -RUN mamba create -n aarch64_test && \ - mamba env update -n aarch64_test --file=/scripts/aarch64_test.yml && \ - mamba clean --all --yes - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.clang_tidy b/ops/docker/dockerfile/Dockerfile.clang_tidy deleted file mode 100644 index de7d9bd3f254..000000000000 --- a/ops/docker/dockerfile/Dockerfile.clang_tidy +++ /dev/null @@ -1,50 +0,0 @@ -ARG CUDA_VERSION_ARG=notset -FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu22.04 -ARG CUDA_VERSION_ARG - -# Environment -ENV DEBIAN_FRONTEND=noninteractive - -# Install all basic requirements -RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ - apt-get update && \ - apt-get install -y wget git python3 python3-pip software-properties-common \ - apt-transport-https ca-certificates gnupg-agent && \ - apt-get install -y ninja-build - -# Install clang-tidy: https://apt.llvm.org/ -RUN \ - apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-19 main" && \ - wget -O llvm-snapshot.gpg.key https://apt.llvm.org/llvm-snapshot.gpg.key && \ - apt-key add ./llvm-snapshot.gpg.key && \ - rm llvm-snapshot.gpg.key && \ - apt-get update && \ - apt-get install -y clang-tidy-19 clang-19 libomp-19-dev - -# Set default clang-tidy version -RUN \ - update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-19 100 && \ - update-alternatives --install /usr/bin/clang clang /usr/bin/clang-19 100 - -RUN \ - apt-get install libgtest-dev libgmock-dev -y - -# Install Python packages -RUN \ - pip3 install cmake - -ENV GOSU_VERSION=1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.cpu b/ops/docker/dockerfile/Dockerfile.cpu deleted file mode 100644 index a426ce5da30c..000000000000 --- a/ops/docker/dockerfile/Dockerfile.cpu +++ /dev/null @@ -1,57 +0,0 @@ -FROM ubuntu:22.04 - -# Environment -ENV DEBIAN_FRONTEND=noninteractive -SHELL ["/bin/bash", "-c"] - -# Install all basic requirements -RUN \ - apt-get update && \ - apt-get install -y software-properties-common && \ - add-apt-repository ppa:ubuntu-toolchain-r/test && \ - apt-get update && \ - apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libidn12 cmake ninja-build gcc-10 g++-10 openjdk-8-jdk-headless && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge - -ENV PATH=/opt/miniforge/bin:$PATH -ENV CC=gcc-10 -ENV CXX=g++-10 -ENV CPP=cpp-10 - -ENV GOSU_VERSION=1.10 -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ 
- -# Install gRPC -# Patch Abseil to apply https://github.com/abseil/abseil-cpp/issues/1629 -RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ - --recurse-submodules --depth 1 && \ - pushd grpc && \ - pushd third_party/abseil-cpp && \ - git fetch origin master && \ - git cherry-pick -n cfde5f74e276049727f9556f13473a59fe77d9eb && \ - popd && \ - cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc -DCMAKE_CXX_VISIBILITY_PRESET=hidden && \ - cmake --build build --target install && \ - popd && \ - rm -rf grpc - -# Create new Conda environment -COPY conda_env/linux_cpu_test.yml /scripts/ -RUN mamba create -n linux_cpu_test && \ - mamba env update -n linux_cpu_test --file=/scripts/linux_cpu_test.yml && \ - mamba clean --all --yes - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu deleted file mode 100644 index 96a532fc2ff1..000000000000 --- a/ops/docker/dockerfile/Dockerfile.gpu +++ /dev/null @@ -1,54 +0,0 @@ -ARG CUDA_VERSION_ARG=notset -FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 -ARG CUDA_VERSION_ARG -ARG RAPIDS_VERSION_ARG - # Should be first 4 digits (e.g. 24.06) -ARG NCCL_VERSION_ARG -ARG RAPIDSAI_CONDA_CHANNEL_ARG="rapidsai" - -# Environment -ENV DEBIAN_FRONTEND=noninteractive -SHELL ["/bin/bash", "-c"] - -# Install all basic requirements -RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ - apt-get update && \ - apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ - apt-get install libnccl2 libnccl-dev -y --allow-change-held-packages && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge - -ENV PATH=/opt/miniforge/bin:$PATH - -# Create new Conda environment with cuDF, Dask, and cuPy -RUN \ - export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ - export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \ - mamba create -y -n gpu_test -c ${RAPIDSAI_CONDA_CHANNEL_ARG} -c conda-forge -c nvidia \ - python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ - "nccl>=${NCCL_SHORT_VER}" \ - "dask<=2024.10.0" \ - "distributed<=2024.10.0" \ - "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ - numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ - python-kubernetes urllib3 graphviz hypothesis loky \ - "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all --yes - -ENV GOSU_VERSION=1.10 -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT 
["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 deleted file mode 100644 index 2d18b1eeb315..000000000000 --- a/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 +++ /dev/null @@ -1,58 +0,0 @@ -ARG CUDA_VERSION_ARG=notset -FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 -ARG CUDA_VERSION_ARG -ARG R_VERSION_ARG - -# Install all basic requirements -RUN \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ - > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ - dnf -y update && \ - dnf -y install dnf-plugins-core && \ - dnf config-manager --set-enabled powertools && \ - dnf install -y tar unzip wget xz git which ninja-build readline-devel libX11-devel libXt-devel \ - xorg-x11-server-devel openssl-devel zlib-devel bzip2-devel xz-devel \ - pcre2-devel libcurl-devel texlive-* \ - gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ \ - gcc-toolset-10-gcc-gfortran gcc-toolset-10-libquadmath-devel \ - gcc-toolset-10-runtime gcc-toolset-10-libstdc++-devel - -ENV PATH=/opt/miniforge/bin:/usr/local/ninja:/opt/software/packages/bin:/opt/R/$R_VERSION_ARG/bin:$PATH -ENV LD_LIBRARY_PATH=/opt/software/packages/lib:/opt/R/$R_VERSION_ARG/lib64:$LD_LIBRARY_PATH -ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc -ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ -ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp -ENV F77=/opt/rh/gcc-toolset-10/root/usr/bin/gfortran -ENV FC=/opt/rh/gcc-toolset-10/root/usr/bin/gfortran - -RUN \ - wget -nv -nc https://cran.r-project.org/src/base/R-4/R-$R_VERSION_ARG.tar.gz && \ - tar xf R-$R_VERSION_ARG.tar.gz && \ - cd R-$R_VERSION_ARG && \ - ./configure --prefix=/opt/R/$R_VERSION_ARG --enable-R-shlib --with-pcrel && \ - make -j$(nproc) && \ - make install - -run \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge && \ - /opt/miniforge/bin/python -m pip install auditwheel awscli && \ - # CMake - wget -nv -nc https://cmake.org/files/v3.29/cmake-3.29.5-linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.29.5-linux-x86_64.sh --skip-license --prefix=/usr - -ENV GOSU_VERSION=1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 deleted file mode 100644 index b686bfbb2b0d..000000000000 --- a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 +++ /dev/null @@ -1,82 +0,0 @@ -ARG CUDA_VERSION_ARG=notset -FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 -ARG CUDA_VERSION_ARG -ARG NCCL_VERSION_ARG -ARG RAPIDS_VERSION_ARG - -# Install all basic requirements -RUN \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ - > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ - dnf -y update && \ - dnf -y install dnf-plugins-core && \ - dnf config-manager --set-enabled powertools && \ - dnf install -y 
tar unzip wget xz git which ninja-build gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge && \ - /opt/miniforge/bin/python -m pip install awscli && \ - # CMake - wget -nv -nc https://cmake.org/files/v3.29/cmake-3.29.5-linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.29.5-linux-x86_64.sh --skip-license --prefix=/usr - -# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) -RUN \ - export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=$NCCL_VERSION_ARG && \ - dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \ - dnf -y update && \ - dnf install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} - -ENV PATH=/opt/miniforge/bin:/usr/local/ninja:$PATH -ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc -ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ -ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp -ENV CUDAHOSTCXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ - -ENV GOSU_VERSION=1.10 - -# Install gRPC -# Patch Abseil to apply https://github.com/abseil/abseil-cpp/issues/1629 -RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ - --recurse-submodules --depth 1 && \ - pushd grpc && \ - pushd third_party/abseil-cpp && \ - git fetch origin master && \ - git cherry-pick -n cfde5f74e276049727f9556f13473a59fe77d9eb && \ - popd && \ - cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc -DCMAKE_CXX_VISIBILITY_PRESET=hidden && \ - cmake --build build --target install && \ - popd && \ - rm -rf grpc - -# Install RMM -# Patch out -Werror -# Patch CCCL 2.5.0 to apply https://github.com/NVIDIA/cccl/pull/1957 -RUN git clone -b branch-${RAPIDS_VERSION_ARG} https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \ - pushd rmm && \ - find . -name CMakeLists.txt -print0 | xargs -0 sed -i 's/-Werror//g' && \ - mkdir build && \ - pushd build && \ - cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=/opt/rmm -DCUDA_STATIC_RUNTIME=ON && \ - pushd _deps/cccl-src/ && \ - git fetch origin main && \ - git cherry-pick -n 9fcb32c228865f21f2b002b29d38a06b4c6fbd73 && \ - popd && \ - cmake --build . 
--target install && \ - popd && \ - popd && \ - rm -rf rmm - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.i386 b/ops/docker/dockerfile/Dockerfile.i386 deleted file mode 100644 index f128a008fa6c..000000000000 --- a/ops/docker/dockerfile/Dockerfile.i386 +++ /dev/null @@ -1,8 +0,0 @@ -FROM i386/debian:sid - -ENV DEBIAN_FRONTEND=noninteractive -SHELL ["/bin/bash", "-c"] - -RUN \ - apt-get update && \ - apt-get install -y tar unzip wget git build-essential ninja-build cmake diff --git a/ops/docker/dockerfile/Dockerfile.jvm b/ops/docker/dockerfile/Dockerfile.jvm deleted file mode 100644 index 9fd62e52de93..000000000000 --- a/ops/docker/dockerfile/Dockerfile.jvm +++ /dev/null @@ -1,43 +0,0 @@ -FROM rockylinux:8 - -# Install all basic requirements -RUN \ - dnf -y update && \ - dnf -y install dnf-plugins-core && \ - dnf config-manager --set-enabled powertools && \ - dnf install -y tar unzip make bzip2 wget xz git which ninja-build java-1.8.0-openjdk-devel \ - gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ \ - gcc-toolset-10-runtime gcc-toolset-10-libstdc++-devel && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge && \ - # CMake - wget -nv -nc https://cmake.org/files/v3.29/cmake-3.29.5-linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.29.5-linux-x86_64.sh --skip-license --prefix=/usr && \ - # Maven - wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.9.7/binaries/apache-maven-3.9.7-bin.tar.gz && \ - tar xvf apache-maven-3.9.7-bin.tar.gz -C /opt && \ - ln -s /opt/apache-maven-3.9.7/ /opt/maven - -ENV PATH=/opt/miniforge/bin:/opt/maven/bin:$PATH -ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc -ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ -ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp - -# Install Python packages -RUN pip install numpy pytest scipy scikit-learn wheel kubernetes awscli - -ENV GOSU_VERSION=1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.jvm_gpu_build b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build deleted file mode 100644 index 4983493a6878..000000000000 --- a/ops/docker/dockerfile/Dockerfile.jvm_gpu_build +++ /dev/null @@ -1,54 +0,0 @@ -ARG CUDA_VERSION_ARG=notset -FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 -ARG CUDA_VERSION_ARG -ARG NCCL_VERSION_ARG - -# Install all basic requirements -RUN \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ - > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ - dnf -y update && \ - dnf -y install dnf-plugins-core && \ - dnf config-manager 
--set-enabled powertools && \ - dnf install -y tar unzip wget xz git which ninja-build java-1.8.0-openjdk-devel gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge && \ - # CMake - wget -nv -nc https://cmake.org/files/v3.29/cmake-3.29.5-linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.29.5-linux-x86_64.sh --skip-license --prefix=/usr && \ - # Maven - wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.9.7/binaries/apache-maven-3.9.7-bin.tar.gz && \ - tar xvf apache-maven-3.9.7-bin.tar.gz -C /opt && \ - ln -s /opt/apache-maven-3.9.7/ /opt/maven - -# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) -RUN \ - export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=$NCCL_VERSION_ARG && \ - dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \ - dnf -y update && \ - dnf install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT} - -ENV PATH=/opt/miniforge/bin:/opt/maven/bin:$PATH -ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc -ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ -ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp - -# Install Python packages -RUN pip install numpy pytest scipy scikit-learn wheel kubernetes awscli - -ENV GOSU_VERSION=1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 deleted file mode 100644 index 7800033f552d..000000000000 --- a/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 +++ /dev/null @@ -1,17 +0,0 @@ -FROM quay.io/pypa/manylinux2014_aarch64 - -RUN yum update -y && yum install -y java-1.8.0-openjdk-devel - -# Install lightweight sudo (not bound to TTY) -ENV GOSU_VERSION=1.10 -RUN set -ex; \ - curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 deleted file mode 100644 index 8214b598d8d4..000000000000 --- a/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 +++ /dev/null @@ -1,17 +0,0 @@ -FROM quay.io/pypa/manylinux2014_x86_64 - -RUN yum update -y && yum install -y java-1.8.0-openjdk-devel - -# Install lightweight sudo (not bound to TTY) -ENV GOSU_VERSION=1.10 -RUN set -ex; \ - curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running 
locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 deleted file mode 100644 index f5dac54b9b8f..000000000000 --- a/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 +++ /dev/null @@ -1,15 +0,0 @@ -FROM quay.io/pypa/manylinux_2_28_x86_64 - -# Install lightweight sudo (not bound to TTY) -ENV GOSU_VERSION=1.10 -RUN set -ex; \ - curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/entrypoint.sh b/ops/docker/entrypoint.sh deleted file mode 100755 index 40135c197c73..000000000000 --- a/ops/docker/entrypoint.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -# This wrapper script propagates the user information from the host -# to the container. This way, any files generated by processes running -# in the container will be accessible in the host. - -set -euo pipefail - -COMMAND=("$@") - -if ! touch /this_is_writable_file_system; then - echo "You can't write to your filesystem!" - echo "If you are in Docker you should check you do not have too many images" \ - "with too many files in them. Docker has some issue with it." - exit 1 -else - rm /this_is_writable_file_system -fi - -## Assumption: the host passes correct user information via environment variables -## CI_BUILD_UID, CI_BUILD_GID, CI_BUILD_USER, CI_BUILD_GROUP - -if [[ -n ${CI_BUILD_UID:-} ]] && [[ -n ${CI_BUILD_GID:-} ]] -then - groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true - useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \ - "${CI_BUILD_USER}" || true - export HOME="/home/${CI_BUILD_USER}" - shopt -s dotglob - cp -r /root/* "$HOME/" - chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" "$HOME" - - # Allows project-specific customization - if [[ -e "/workspace/.pre_entry.sh" ]]; then - gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" /workspace/.pre_entry.sh - fi - - # Enable passwordless sudo capabilities for the user - chown root:"${CI_BUILD_GID}" "$(which gosu)" - chmod +s "$(which gosu)"; sync - - exec gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" "${COMMAND[@]}" -else - exec "${COMMAND[@]}" -fi diff --git a/ops/docker/extract_build_args.jq b/ops/docker/extract_build_args.jq deleted file mode 100644 index b35240edb626..000000000000 --- a/ops/docker/extract_build_args.jq +++ /dev/null @@ -1,12 +0,0 @@ -## Example input: -## xgb-ci.gpu_build_r_rockylinux8 -## Example output: -## --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg R_VERSION_ARG=4.3.2 -def compute_build_args($input; $container_id): - $input | - .[$container_id] | - select(.build_args != null) | - .build_args | - to_entries | - map("--build-arg " + .key + "=" + .value) | - join(" "); diff --git a/ops/docker/extract_build_args.sh b/ops/docker/extract_build_args.sh deleted file mode 100755 index 42a83047742c..000000000000 --- a/ops/docker/extract_build_args.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -## Extract container definition and build args from ops/docker/ci_container.yml, -## given the container ID. 
-## -## Example input: -## xgb-ci.clang_tidy -## Example output: -## CONTAINER_DEF='clang_tidy' BUILD_ARGS='--build-arg CUDA_VERSION_ARG=12.4.1' - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 [container_id]" - exit 1 -fi - -CONTAINER_ID="$1" -CONTAINER_DEF=$( - yq -o json ops/docker/ci_container.yml | - jq -r --arg container_id "${CONTAINER_ID}" '.[$container_id].container_def' -) -BUILD_ARGS=$( - yq -o json ops/docker/ci_container.yml | - jq -r --arg container_id "${CONTAINER_ID}" \ - 'include "ops/docker/extract_build_args"; - compute_build_args(.; $container_id)' -) -echo "CONTAINER_DEF='${CONTAINER_DEF}' BUILD_ARGS='${BUILD_ARGS}'" diff --git a/ops/docker_build.py b/ops/docker_build.py deleted file mode 100644 index 1fed975ce223..000000000000 --- a/ops/docker_build.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -Wrapper script to build a Docker container with layer caching -""" - -import argparse -import itertools -import pathlib -import subprocess -import sys -from typing import Optional - -from docker_run import OPS_DIR, fancy_print_cli_args - - -def parse_build_args(raw_build_args: list[str]) -> dict[str, str]: - parsed_build_args = dict() - for arg in raw_build_args: - try: - key, value = arg.split("=", maxsplit=1) - except ValueError as e: - raise ValueError( - f"Build argument must be of form KEY=VALUE. Got: {arg}" - ) from e - parsed_build_args[key] = value - return parsed_build_args - - -def docker_build( - container_id: str, - *, - build_args: dict[str, str], - dockerfile_path: pathlib.Path, - docker_context_path: pathlib.Path, - cache_from: Optional[str], - cache_to: Optional[str], -) -> None: - ## Set up command-line arguments to be passed to `docker build` - # Build args - docker_build_cli_args = list( - itertools.chain.from_iterable( - [["--build-arg", f"{k}={v}"] for k, v in build_args.items()] - ) - ) - # When building an image using a non-default driver, we need to specify - # `--load` to load it to the image store. - # See https://docs.docker.com/build/builders/drivers/ - docker_build_cli_args.append("--load") - # Layer caching - if cache_from: - docker_build_cli_args.extend(["--cache-from", cache_from]) - if cache_to: - docker_build_cli_args.extend(["--cache-to", cache_to]) - # Remaining CLI args - docker_build_cli_args.extend( - [ - "--progress=plain", - "--ulimit", - "nofile=1024000:1024000", - "-t", - container_id, - "-f", - str(dockerfile_path), - str(docker_context_path), - ] - ) - cli_args = ["docker", "build"] + docker_build_cli_args - fancy_print_cli_args(cli_args) - subprocess.run(cli_args, check=True, encoding="utf-8") - - -def main(args: argparse.Namespace) -> None: - # Dockerfile to be used in docker build - dockerfile_path = ( - OPS_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}" - ) - docker_context_path = OPS_DIR - - build_args = parse_build_args(args.build_arg) - - docker_build( - args.container_id, - build_args=build_args, - dockerfile_path=dockerfile_path, - docker_context_path=docker_context_path, - cache_from=args.cache_from, - cache_to=args.cache_to, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Build a Docker container") - parser.add_argument( - "--container-def", - type=str, - required=True, - help=( - "String uniquely identifying the container definition. The container " - "definition will be fetched from " - "docker/dockerfile/Dockerfile.CONTAINER_DEF." 
- ), - ) - parser.add_argument( - "--container-id", - type=str, - required=True, - help="String ID to assign to the newly built container", - ) - parser.add_argument( - "--build-arg", - type=str, - default=[], - action="append", - help=( - "Build-time variable(s) to be passed to `docker build`. Each variable " - "should be specified as a key-value pair in the form KEY=VALUE. " - "The variables should match the ARG instructions in the Dockerfile. " - "When passing multiple variables, specify --build-arg multiple times. " - "Example: --build-arg CUDA_VERSION_ARG=12.5 --build-arg RAPIDS_VERSION_ARG=24.10'" - ), - ) - parser.add_argument( - "--cache-from", - type=str, - help="Use an external cache source for the Docker build", - ) - parser.add_argument( - "--cache-to", - type=str, - help="Export layers from the container to an external cache destination", - ) - - if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - - parsed_args = parser.parse_args() - main(parsed_args) diff --git a/ops/docker_build.sh b/ops/docker_build.sh deleted file mode 100755 index 7d83daec9574..000000000000 --- a/ops/docker_build.sh +++ /dev/null @@ -1,149 +0,0 @@ -#!/bin/bash -## Build a CI container and cache the layers in AWS ECR (Elastic Container Registry). -## This script provides a convenient wrapper for ops/docker_build.py. -## Build-time variables (--build-arg) and container defintion are fetched from -## ops/docker/ci_container.yml. -## -## Note. This script takes in some inputs via environment variables. - -USAGE_DOC=$( -cat <<-EOF -Usage: ops/docker_build.sh [container_id] - -In addition, the following environment variables should be set. - - BRANCH_NAME: Name of the current git branch or pull request (Required) - - USE_DOCKER_CACHE: If set to 1, enable caching -EOF -) - -ECR_LIFECYCLE_RULE=$( -cat <<-EOF -{ - "rules": [ - { - "rulePriority": 1, - "selection": { - "tagStatus": "any", - "countType": "sinceImagePushed", - "countUnit": "days", - "countNumber": 30 - }, - "action": { - "type": "expire" - } - } - ] -} -EOF -) - -set -euo pipefail - -for arg in "BRANCH_NAME" -do - if [[ -z "${!arg:-}" ]] - then - echo -e "Error: $arg must be set.\n\n${USAGE_DOC}" - exit 1 - fi -done - -if [[ "$#" -lt 1 ]] -then - echo "${USAGE_DOC}" - exit 2 -fi -CONTAINER_ID="$1" - -# Fetch CONTAINER_DEF and BUILD_ARGS -source <(ops/docker/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 - -if [[ "${USE_DOCKER_CACHE:-}" != "1" ]] # Any value other than 1 is considered false -then - USE_DOCKER_CACHE=0 -fi - -if [[ ${USE_DOCKER_CACHE} -eq 0 ]] -then - echo "USE_DOCKER_CACHE not set; caching disabled" -else - DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/docker/docker_cache_ecr.yml) - DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/docker/docker_cache_ecr.yml) - DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com" - echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}" - # Login for Docker registry - echo "aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} |" \ - "docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO}" - aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} \ - | docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO} -fi - -# Pull pre-built container from the cache -# First try locating one for the particular branch or pull request -CACHE_FROM_CMD="" -IS_CACHED=0 -if [[ ${USE_DOCKER_CACHE} -eq 1 ]] -then - DOCKER_TAG="${BRANCH_NAME//\//-}" # Slashes are not allowed in Docker tag - 
DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" - echo "docker pull --quiet ${DOCKER_URL}" - if time docker pull --quiet "${DOCKER_URL}" - then - echo "Found a cached container for the branch ${BRANCH_NAME}: ${DOCKER_URL}" - IS_CACHED=1 - else - # If there's no pre-built container from the cache, - # use the pre-built container from the master branch. - DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:master" - echo "Could not find a cached container for the branch ${BRANCH_NAME}." \ - "Using a cached container from the master branch: ${DOCKER_URL}" - echo "docker pull --quiet ${DOCKER_URL}" - if time docker pull --quiet "${DOCKER_URL}" - then - IS_CACHED=1 - else - echo "Could not find a cached container for the master branch either." - IS_CACHED=0 - fi - fi - if [[ $IS_CACHED -eq 1 ]] - then - CACHE_FROM_CMD="--cache-from type=registry,ref=${DOCKER_URL}" - fi -fi - -# Run Docker build -set -x -python3 ops/docker_build.py \ - --container-def ${CONTAINER_DEF} \ - --container-id ${CONTAINER_ID} \ - ${BUILD_ARGS} \ - --cache-to type=inline \ - ${CACHE_FROM_CMD} -set +x - -# Now cache the new container -if [[ ${USE_DOCKER_CACHE} -eq 1 ]] -then - DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" - echo "docker tag ${CONTAINER_ID} ${DOCKER_URL}" - docker tag "${CONTAINER_ID}" "${DOCKER_URL}" - - # Attempt to create Docker repository; it will fail if the repository already exists - echo "aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION}" - if aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION} - then - # Repository was created. Now set expiration policy - echo "aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID}" \ - "--region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin" - echo "${ECR_LIFECYCLE_RULE}" | aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID} \ - --region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin - fi - - echo "docker push --quiet ${DOCKER_URL}" - if ! 
time docker push --quiet "${DOCKER_URL}" - then - echo "ERROR: could not update Docker cache ${DOCKER_URL}" - exit 1 - fi -fi diff --git a/ops/docker_run.py b/ops/docker_run.py index 7e61c5a14f39..06f9d6cc8dc8 100644 --- a/ops/docker_run.py +++ b/ops/docker_run.py @@ -24,7 +24,7 @@ ) -def parse_run_args(raw_run_args: str) -> list[str]: +def parse_run_args(*, raw_run_args: str) -> list[str]: return [x for x in raw_run_args.split() if x] @@ -39,7 +39,7 @@ def get_user_ids() -> dict[str, str]: } -def fancy_print_cli_args(cli_args: list[str]) -> None: +def fancy_print_cli_args(*, cli_args: list[str]) -> None: print( "=" * LINEWIDTH + "\n" @@ -52,9 +52,9 @@ def fancy_print_cli_args(cli_args: list[str]) -> None: def docker_run( - container_id: str, - command_args: list[str], *, + container_tag: str, + command_args: list[str], use_gpus: bool, workdir: pathlib.Path, user_ids: dict[str, str], @@ -71,16 +71,16 @@ def docker_run( itertools.chain.from_iterable([["-e", f"{k}={v}"] for k, v in user_ids.items()]) ) docker_run_cli_args.extend(extra_args) - docker_run_cli_args.append(container_id) + docker_run_cli_args.append(container_tag) docker_run_cli_args.extend(command_args) cli_args = ["docker", "run"] + docker_run_cli_args - fancy_print_cli_args(cli_args) + fancy_print_cli_args(cli_args=cli_args) subprocess.run(cli_args, check=True, encoding="utf-8") -def main(args: argparse.Namespace) -> None: - run_args = parse_run_args(args.run_args) +def main(*, args: argparse.Namespace) -> None: + run_args = parse_run_args(raw_run_args=args.run_args) user_ids = get_user_ids() if args.use_gpus: @@ -90,8 +90,8 @@ def main(args: argparse.Namespace) -> None: run_args.append("-it") docker_run( - args.container_id, - args.command_args, + container_tag=args.container_tag, + command_args=args.command_args, use_gpus=args.use_gpus, workdir=args.workdir, user_ids=user_ids, @@ -102,17 +102,20 @@ def main(args: argparse.Namespace) -> None: if __name__ == "__main__": parser = argparse.ArgumentParser( usage=( - f"{sys.argv[0]} --container-id CONTAINER_ID [--use-gpus] [--interactive] " + f"{sys.argv[0]} --container-tag CONTAINER_TAG [--use-gpus] [--interactive] " "[--workdir WORKDIR] [--run-args RUN_ARGS] -- COMMAND_ARG " "[COMMAND_ARG ...]" ), description="Run tasks inside a Docker container", ) parser.add_argument( - "--container-id", + "--container-tag", type=str, required=True, - help="String ID of the container to run.", + help=( + "Container tag to identify the container, e.g. " + "492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main" + ), ) parser.add_argument( "--use-gpus", @@ -165,4 +168,4 @@ def main(args: argparse.Namespace) -> None: sys.exit(1) parsed_args = parser.parse_args() - main(parsed_args) + main(args=parsed_args) diff --git a/ops/patch/cpu_only_pypkg.patch b/ops/patch/cpu_only_pypkg.patch index 765ac5c098d0..66d669d161f8 100644 --- a/ops/patch/cpu_only_pypkg.patch +++ b/ops/patch/cpu_only_pypkg.patch @@ -34,10 +34,10 @@ index 1fc0bb5a0..f1c68470b 100644 +Note. ``xgboost-cpu`` does not provide an sdist (source distribution). You may install sdist +from https://pypi.org/project/xgboost/. 
diff --git python-package/pyproject.toml python-package/pyproject.toml -index 46c1451c2..c5dc908d9 100644 +index 32abff1c6..5206f2e31 100644 --- python-package/pyproject.toml +++ python-package/pyproject.toml -@@ -6,7 +6,7 @@ backend-path = ["."] +@@ -7,7 +7,7 @@ backend-path = ["."] build-backend = "packager.pep517" [project] @@ -46,10 +46,13 @@ index 46c1451c2..c5dc908d9 100644 description = "XGBoost Python Package" readme = { file = "README.rst", content-type = "text/x-rst" } authors = [ -@@ -82,3 +82,6 @@ class-attribute-naming-style = "snake_case" +@@ -71,6 +71,9 @@ disable = [ + dummy-variables-rgx = "(unused|)_.*" + reports = false - # Allow single-letter variables - variable-rgx = "[a-zA-Z_][a-z0-9_]{0,30}$" -+ +[tool.hatch.build.targets.wheel] +packages = ["xgboost/"] ++ + [tool.pylint.basic] + # Enforce naming convention + const-naming-style = "UPPER_CASE" diff --git a/ops/patch/manylinux2014_warning.patch b/ops/patch/manylinux2014_warning.patch index 679205988b7a..0302b5e10d6c 100644 --- a/ops/patch/manylinux2014_warning.patch +++ b/ops/patch/manylinux2014_warning.patch @@ -1,8 +1,8 @@ diff --git python-package/xgboost/core.py python-package/xgboost/core.py -index e8bc735e6..030972ef2 100644 +index 079246239..2f1764812 100644 --- python-package/xgboost/core.py +++ python-package/xgboost/core.py -@@ -262,6 +262,18 @@ Likely cause: +@@ -281,6 +281,18 @@ Likely cause: ) raise ValueError(msg) @@ -15,7 +15,7 @@ index e8bc735e6..030972ef2 100644 + "features such as GPU algorithms or federated learning are not available. " + "To use these features, please upgrade to a recent Linux distro with glibc " + "2.28+, and install the 'manylinux_2_28' variant.", -+ FutureWarning ++ FutureWarning, + ) + return lib diff --git a/ops/patch/remove_nccl_dep.patch b/ops/patch/remove_nccl_dep.patch index c5a8fe3acee1..80fd48cc1faf 100644 --- a/ops/patch/remove_nccl_dep.patch +++ b/ops/patch/remove_nccl_dep.patch @@ -1,8 +1,8 @@ diff --git python-package/pyproject.toml python-package/pyproject.toml -index 20d3f9974..953087ff4 100644 +index b9f08dda6..32abff1c6 100644 --- python-package/pyproject.toml +++ python-package/pyproject.toml -@@ -30,7 +30,6 @@ classifiers = [ +@@ -32,7 +32,6 @@ classifiers = [ dependencies = [ "numpy", "scipy", diff --git a/ops/pipeline/build-cpu-arm64-impl.sh b/ops/pipeline/build-cpu-arm64-impl.sh new file mode 100755 index 000000000000..ae0aa7d5b4ce --- /dev/null +++ b/ops/pipeline/build-cpu-arm64-impl.sh @@ -0,0 +1,32 @@ +#!/bin/bash +## Build and test XGBoost with ARM64 CPU +## Companion script for ops/pipeline/build-cpu-arm64.sh + +set -euox pipefail + +source activate aarch64_test + +echo "--- Build libxgboost from the source" +mkdir -p build +pushd build +cmake .. \ + -GNinja \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -DBUILD_DEPRECATED_CLI=ON +time ninja -v + +echo "--- Run Google Test" +ctest --extra-verbose +popd + +echo "--- Build binary wheel" +pushd python-package +rm -rfv dist/* +pip wheel --no-deps -v . 
--wheel-dir dist/ +popd diff --git a/ops/pipeline/build-cpu-arm64.sh b/ops/pipeline/build-cpu-arm64.sh index ff948ca0c77a..9801790baaaa 100755 --- a/ops/pipeline/build-cpu-arm64.sh +++ b/ops/pipeline/build-cpu-arm64.sh @@ -1,6 +1,7 @@ #!/bin/bash +## Build and test XGBoost with ARM64 CPU -set -euox pipefail +set -euo pipefail if [[ -z "${GITHUB_SHA:-}" ]] then @@ -8,45 +9,40 @@ then exit 1 fi +source ops/pipeline/classify-git-branch.sh +source ops/pipeline/get-docker-registry-details.sh + WHEEL_TAG=manylinux_2_28_aarch64 +CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.aarch64:main echo "--- Build CPU code targeting ARM64" - -echo "--- Build libxgboost from the source" -python3 ops/docker_run.py \ - --container-id xgb-ci.aarch64 \ - -- ops/script/build_via_cmake.sh \ - --conda-env=aarch64_test \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOL=ON - -echo "--- Run Google Test" -python3 ops/docker_run.py \ - --container-id xgb-ci.aarch64 \ - -- bash -c "cd build && ctest --extra-verbose" - -echo "--- Build binary wheel" +set -x python3 ops/docker_run.py \ - --container-id xgb-ci.aarch64 \ - -- bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" -python3 ops/script/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} + --container-tag ${CONTAINER_TAG} \ + -- ops/pipeline/build-cpu-arm64-impl.sh echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" python3 ops/docker_run.py \ - --container-id xgb-ci.aarch64 \ - -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/script/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} + --container-tag ${CONTAINER_TAG} \ + -- auditwheel repair --only-plat \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} --remove \ + wheelhouse/*.whl mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -python3 ops/docker_run.py \ - --container-id xgb-ci.aarch64 \ - -- bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" +if ! unzip -l ./python-package/dist/*.whl | grep libgomp > /dev/null; then + echo "error: libgomp.so was not vendored in the wheel" + exit -1 +fi + +# Check size of wheel +pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl + +echo "--- Upload Python wheel" +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket xgboost-nightly-builds \ + --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \ + python-package/dist/*.whl +fi diff --git a/ops/pipeline/build-cpu-impl.sh b/ops/pipeline/build-cpu-impl.sh new file mode 100755 index 000000000000..55e205d3edfa --- /dev/null +++ b/ops/pipeline/build-cpu-impl.sh @@ -0,0 +1,57 @@ +#!/bin/bash +## Build and test XGBoost with AMD64 CPU +## Companion script for ops/pipeline/build-cpu.sh + +set -euox pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {cpu,cpu-sanitizer}" + exit 1 +fi +suite="$1" + +mkdir -p build +pushd build + +case "${suite}" in + cpu) + echo "--- Build libxgboost from the source" + cmake .. 
\ + -GNinja \ + -DHIDE_CXX_SYMBOLS=ON \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -DBUILD_DEPRECATED_CLI=ON \ + -DCMAKE_PREFIX_PATH='/opt/grpc' \ + -DPLUGIN_FEDERATED=ON + time ninja -v + echo "--- Run Google Test" + ctest --extra-verbose + ;; + cpu-sanitizer) + echo "--- Run Google Test with sanitizer" + cmake .. \ + -GNinja \ + -DHIDE_CXX_SYMBOLS=ON \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -DBUILD_DEPRECATED_CLI=ON \ + -DUSE_SANITIZER=ON \ + -DENABLED_SANITIZERS="address;leak;undefined" \ + -DCMAKE_BUILD_TYPE=Debug \ + -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ + time ninja -v + ./testxgboost --gtest_filter=-*DeathTest* + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac + +popd diff --git a/ops/pipeline/build-cpu.sh b/ops/pipeline/build-cpu.sh index dc0572f0ca4d..04fd4944eae7 100755 --- a/ops/pipeline/build-cpu.sh +++ b/ops/pipeline/build-cpu.sh @@ -1,8 +1,15 @@ #!/bin/bash +## Build and test XGBoost with AMD64 CPU -set -euox pipefail +set -euo pipefail + +source ops/pipeline/classify-git-branch.sh +source ops/pipeline/get-docker-registry-details.sh + +CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.cpu:main echo "--- Build CPU code" +set -x # This step is not necessary, but here we include it, to ensure that # DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use @@ -10,32 +17,20 @@ echo "--- Build CPU code" # include/dmlc/build_config_default.h. rm -fv dmlc-core/include/dmlc/build_config_default.h -# Sanitizer tests -echo "--- Run Google Test with sanitizer enabled" +# Test with sanitizer +export ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer +export ASAN_OPTIONS='symbolize=1' +export UBSAN_OPTIONS='print_stacktrace=1:log_path=ubsan_error.log' # Work around https://github.com/google/sanitizers/issues/1614 sudo sysctl vm.mmap_rnd_bits=28 python3 ops/docker_run.py \ - --container-id xgb-ci.cpu \ - -- ops/script/build_via_cmake.sh \ - -DUSE_SANITIZER=ON \ - -DENABLED_SANITIZERS="address;leak;undefined" \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ -python3 ops/docker_run.py \ - --container-id xgb-ci.cpu \ - --run-args '-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer - -e ASAN_OPTIONS=symbolize=1 - -e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log - --cap-add SYS_PTRACE' \ - -- bash -c \ - "cd build && ./testxgboost --gtest_filter=-*DeathTest*" + --container-tag ${CONTAINER_TAG} \ + --run-args '-e ASAN_SYMBOLIZER_PATH -e ASAN_OPTIONS -e UBSAN_OPTIONS + --cap-add SYS_PTRACE' \ + -- bash ops/pipeline/build-cpu-impl.sh cpu-sanitizer -echo "--- Run Google Test" -python3 ops/docker_run.py \ - --container-id xgb-ci.cpu \ - -- ops/script/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH=/opt/grpc \ - -DPLUGIN_FEDERATED=ON +# Test without sanitizer +rm -rf build/ python3 ops/docker_run.py \ - --container-id xgb-ci.cpu \ - -- bash -c "cd build && ctest --extra-verbose" + --container-tag ${CONTAINER_TAG} \ + -- bash ops/pipeline/build-cpu-impl.sh cpu diff --git a/ops/pipeline/build-cuda-impl.sh b/ops/pipeline/build-cuda-impl.sh new file mode 100755 index 000000000000..198936852948 --- /dev/null +++ b/ops/pipeline/build-cuda-impl.sh @@ -0,0 +1,51 @@ +#!/bin/bash +## Build XGBoost with CUDA +## Companion script for ops/pipeline/build-cuda.sh + +set -euox pipefail + +if [[ "${BUILD_ONLY_SM75:-}" == 1 ]] +then + cmake_args='-DGPU_COMPUTE_VER=75' +else + cmake_args='' +fi + 
+if [[ "${USE_RMM:-}" == 1 ]] +then + cmake_prefix_path='/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake' + cmake_args="${cmake_args} -DPLUGIN_RMM=ON" +else + cmake_prefix_path='/opt/grpc;/workspace/cccl' +fi + +# Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until +# https://github.com/dmlc/xgboost/issues/10400 is fixed +echo "--- Build libxgboost from the source" +mkdir -p build +pushd build +cmake .. \ + -GNinja \ + -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -DBUILD_DEPRECATED_CLI=ON \ + ${cmake_args} +time ninja -v +popd + +echo "--- Build binary wheel" +pushd python-package +rm -rfv dist/* +pip wheel --no-deps -v . --wheel-dir dist/ +popd diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh deleted file mode 100755 index 479c9a1b1a28..000000000000 --- a/ops/pipeline/build-cuda-with-rmm.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -## Build XGBoost with CUDA + RMM support - -set -euo pipefail - -if [[ -z "${GITHUB_SHA:-}" ]] -then - echo "Make sure to set environment variable GITHUB_SHA" - exit 1 -fi - -if [[ "$#" -lt 1 ]] -then - echo "Usage: $0 [container_id]" - exit 1 -fi -container_id="$1" - -source ops/pipeline/classify-git-branch.sh - -set -x - -WHEEL_TAG=manylinux_2_28_x86_64 - -echo "--- Build with CUDA with RMM" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -echo "--- Build libxgboost from the source" -python3 ops/docker_run.py \ - --container-id "${container_id}" \ - -- ops/script/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DPLUGIN_RMM=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} - -echo "--- Build binary wheel" -python3 ops/docker_run.py \ - --container-id "${container_id}" \ - -- bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -python3 ops/script/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} - -echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -python3 ops/docker_run.py \ - --container-id xgb-ci.${WHEEL_TAG} \ - -- auditwheel repair \ - --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/script/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} -mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -python3 ops/docker_run.py \ - --container-id xgb-ci.${WHEEL_TAG} \ - -- bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 49475c01c69e..5e2f2401f1eb 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -1,7 +1,7 @@ #!/bin/bash ## Build XGBoost with CUDA -set -euox pipefail +set -euo pipefail if [[ -z "${GITHUB_SHA:-}" ]] then @@ -9,77 +9,90 @@ then exit 1 fi -WHEEL_TAG=manylinux_2_28_x86_64 +if [[ "$#" -lt 2 ]] +then + echo "Usage: $0 [container_id] {enable-rmm,disable-rmm}" + exit 2 +fi +container_id="$1" +rmm_flag="$2" + +# Validate RMM flag +case "${rmm_flag}" in + enable-rmm) + export USE_RMM=1 + ;; + disable-rmm) + export USE_RMM=0 + ;; + *) + echo "Unrecognized argument: $rmm_flag" + exit 3 + ;; +esac source ops/pipeline/classify-git-branch.sh +source ops/pipeline/get-docker-registry-details.sh + +WHEEL_TAG=manylinux_2_28_x86_64 +BUILD_CONTAINER_TAG="${DOCKER_REGISTRY_URL}/${container_id}:main" +MANYLINUX_CONTAINER_TAG="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:main" echo "--- Build with CUDA" if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] then - arch_flag="-DGPU_COMPUTE_VER=75" + export BUILD_ONLY_SM75=1 else - arch_flag="" + export BUILD_ONLY_SM75=0 +fi + +if [[ ${USE_RMM} == 0 ]] +then + # Work around https://github.com/NVIDIA/cccl/issues/1956 + # TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ + git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet fi -echo "--- Build libxgboost from the source" set -x -# Work around https://github.com/NVIDIA/cccl/issues/1956 -# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ -git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet -python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_rockylinux8 \ - -- ops/script/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} -echo "--- Build binary wheel" python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_rockylinux8 \ - -- bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -python3 ops/script/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} + --container-tag ${BUILD_CONTAINER_TAG} \ + --run-args='-e BUILD_ONLY_SM75 -e USE_RMM' \ + -- ops/pipeline/build-cuda-impl.sh echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" python3 ops/docker_run.py \ - --container-id xgb-ci.manylinux_2_28_x86_64 \ - -- auditwheel repair \ + --container-tag ${MANYLINUX_CONTAINER_TAG} \ + -- auditwheel repair --only-plat \ --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/script/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} +python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} --remove \ + wheelhouse/*.whl mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -python3 ops/docker_run.py \ - --container-id xgb-ci.manylinux_2_28_x86_64 \ - -- bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" +if ! unzip -l ./python-package/dist/*.whl | grep libgomp > /dev/null; then + echo "error: libgomp.so was not vendored in the wheel" + exit -1 +fi -# Generate the meta info which includes xgboost version and the commit info -python3 ops/docker_run.py \ ---container-id xgb-ci.gpu_build_rockylinux8 \ --- python ops/script/format_wheel_meta.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} \ - --meta-path python-package/dist/ +# Check size of wheel +pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl -echo "--- Upload Python wheel" -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +if [[ $USE_RMM == 0 ]] then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress - aws s3 cp python-package/dist/meta.json s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress + # Generate the meta info which includes xgboost version and the commit info + echo "--- Generate meta info" + python3 ops/script/format_wheel_meta.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} \ + --meta-path python-package/dist/ + + echo "--- Upload Python wheel" + if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] + then + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket xgboost-nightly-builds \ + --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \ + python-package/dist/*.whl python-package/dist/meta.json + fi fi diff --git a/ops/pipeline/build-gpu-rpkg-impl.sh b/ops/pipeline/build-gpu-rpkg-impl.sh index 2815b8f448f1..2b803b926271 100755 --- a/ops/pipeline/build-gpu-rpkg-impl.sh +++ b/ops/pipeline/build-gpu-rpkg-impl.sh @@ -33,4 +33,4 @@ cp -v lib/xgboost.so xgboost_rpack/src/ echo 'all:' > xgboost_rpack/src/Makefile echo 'all:' > xgboost_rpack/src/Makefile.win mv xgboost_rpack/ xgboost/ -tar cvzf xgboost_r_gpu_linux_${commit_hash}.tar.gz xgboost/ +tar cvzf xgboost_r_gpu_linux.tar.gz xgboost/ diff --git a/ops/pipeline/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh index d1384ef766a6..07a08ff15385 100755 --- a/ops/pipeline/build-gpu-rpkg.sh +++ b/ops/pipeline/build-gpu-rpkg.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euox pipefail +set -euo pipefail if [[ -z "${GITHUB_SHA:-}" ]] then @@ -8,8 +8,22 @@ then exit 1 fi +source ops/pipeline/classify-git-branch.sh +source 
ops/pipeline/get-docker-registry-details.sh + +CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.gpu_build_r_rockylinux8:main + echo "--- Build XGBoost R package with CUDA" +set -x python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_r_rockylinux8 \ + --container-tag ${CONTAINER_TAG} \ -- ops/pipeline/build-gpu-rpkg-impl.sh \ ${GITHUB_SHA} + +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket xgboost-nightly-builds \ + --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \ + xgboost_r_gpu_linux.tar.gz +fi diff --git a/ops/pipeline/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh index 00fdac7a1353..a61f903cb5b9 100755 --- a/ops/pipeline/build-jvm-doc.sh +++ b/ops/pipeline/build-jvm-doc.sh @@ -3,9 +3,7 @@ ## Note: this script assumes that the user has already built libxgboost4j.so ## and place it in the lib/ directory. -set -euox pipefail - -echo "--- Build JVM packages doc" +set -euo pipefail if [[ -z ${BRANCH_NAME:-} ]] then @@ -19,6 +17,12 @@ then exit 2 fi +source ops/pipeline/get-docker-registry-details.sh + +CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.jvm_gpu_build:main + +echo "--- Build JVM packages doc" +set -x python3 ops/docker_run.py \ - --container-id xgb-ci.jvm_gpu_build \ + --container-tag ${CONTAINER_TAG} \ -- ops/pipeline/build-jvm-doc-impl.sh ${BRANCH_NAME} diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh index 7656a3d2f188..3d6f446eb462 100755 --- a/ops/pipeline/build-jvm-gpu.sh +++ b/ops/pipeline/build-jvm-gpu.sh @@ -4,6 +4,9 @@ set -euo pipefail source ops/pipeline/classify-git-branch.sh +source ops/pipeline/get-docker-registry-details.sh + +CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.jvm_gpu_build:main echo "--- Build libxgboost4j.so with CUDA" @@ -29,5 +32,5 @@ mkdir -p build-gpu/ # TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet --depth 1 python3 ops/docker_run.py \ - --container-id xgb-ci.jvm_gpu_build \ + --container-tag ${CONTAINER_TAG} \ -- bash -c "${COMMAND}" diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh index e69dd3682b90..068fb5fb0c44 100755 --- a/ops/pipeline/build-jvm-manylinux2014.sh +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -1,7 +1,7 @@ #!/bin/bash ## Build libxgboost4j.so targeting glibc 2.17 systems -set -euox pipefail +set -euo pipefail if [[ $# -ne 1 ]] then @@ -10,16 +10,30 @@ then fi arch=$1 +container_id="xgb-ci.manylinux2014_${arch}" -image="xgb-ci.manylinux2014_${arch}" +source ops/pipeline/classify-git-branch.sh +source ops/pipeline/get-docker-registry-details.sh + +CONTAINER_TAG="${DOCKER_REGISTRY_URL}/${container_id}:main" # Build XGBoost4J binary echo "--- Build libxgboost4j.so (targeting glibc 2.17)" set -x mkdir build python3 ops/docker_run.py \ - --container-id ${image} \ + --container-tag "${CONTAINER_TAG}" \ -- bash -c \ "cd build && cmake .. 
-DJVM_BINDINGS=ON -DUSE_OPENMP=ON && make -j$(nproc)" ldd lib/libxgboost4j.so objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu + +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + libname=lib/libxgboost4j_linux_${arch}.so + mv -v lib/libxgboost4j.so ${libname} + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket xgboost-nightly-builds \ + --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \ + ${libname} +fi diff --git a/ops/pipeline/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh index a8f5af8bc3cd..fbc349568e72 100755 --- a/ops/pipeline/build-manylinux2014.sh +++ b/ops/pipeline/build-manylinux2014.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euox pipefail +set -euo pipefail if [[ -z "${GITHUB_SHA:-}" ]] then @@ -16,29 +16,32 @@ fi arch="$1" -WHEEL_TAG="manylinux2014_${arch}" -image="xgb-ci.${WHEEL_TAG}" +source ops/pipeline/classify-git-branch.sh +source ops/pipeline/get-docker-registry-details.sh +WHEEL_TAG="manylinux2014_${arch}" +container_id="xgb-ci.${WHEEL_TAG}" python_bin="/opt/python/cp310-cp310/bin/python" +CONTAINER_TAG="${DOCKER_REGISTRY_URL}/${container_id}:main" echo "--- Build binary wheel for ${WHEEL_TAG}" +set -x # Patch to add warning about manylinux2014 variant patch -p0 < ops/patch/remove_nccl_dep.patch patch -p0 < ops/patch/manylinux2014_warning.patch python3 ops/docker_run.py \ - --container-id ${image} \ + --container-tag "${CONTAINER_TAG}" \ -- bash -c \ "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" git checkout python-package/pyproject.toml python-package/xgboost/core.py # discard the patch python3 ops/docker_run.py \ - --container-id ${image} \ - -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/script/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} + --container-tag "${CONTAINER_TAG}" \ + -- auditwheel repair --only-plat \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} --remove \ + wheelhouse/*.whl rm -rf python-package/dist/ mkdir python-package/dist/ mv -v wheelhouse/*.whl python-package/dist/ @@ -48,17 +51,24 @@ echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" patch -p0 < ops/patch/remove_nccl_dep.patch patch -p0 < ops/patch/cpu_only_pypkg.patch python3 ops/docker_run.py \ - --container-id ${image} \ + --container-tag "${CONTAINER_TAG}" \ -- bash -c \ "cd python-package && ${python_bin} -m pip wheel --no-deps -v . 
--wheel-dir dist/" git checkout python-package/pyproject.toml # discard the patch python3 ops/docker_run.py \ - --container-id ${image} \ - -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl -python3 ops/script/rename_whl.py \ - --wheel-path wheelhouse/xgboost_cpu-*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} + --container-tag "${CONTAINER_TAG}" \ + -- auditwheel repair --only-plat \ + --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl +python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} --remove \ + wheelhouse/xgboost_cpu-*.whl rm -v python-package/dist/xgboost_cpu-*.whl mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/ + +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket xgboost-nightly-builds \ + --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \ + python-package/dist/*.whl +fi diff --git a/ops/pipeline/build-python-wheels-macos.sh b/ops/pipeline/build-python-wheels-macos.sh index 697514c0c3ad..ef1cdabaad56 100755 --- a/ops/pipeline/build-python-wheels-macos.sh +++ b/ops/pipeline/build-python-wheels-macos.sh @@ -13,13 +13,13 @@ commit_id=$2 if [[ "$platform_id" == macosx_* ]]; then if [[ "$platform_id" == macosx_arm64 ]]; then # MacOS, Apple Silicon - wheel_tag=macosx_12_0_arm64 + WHEEL_TAG=macosx_12_0_arm64 cpython_ver=310 cibw_archs=arm64 export MACOSX_DEPLOYMENT_TARGET=12.0 elif [[ "$platform_id" == macosx_x86_64 ]]; then # MacOS, Intel - wheel_tag=macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64 + WHEEL_TAG=macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64 cpython_ver=310 cibw_archs=x86_64 export MACOSX_DEPLOYMENT_TARGET=10.15 @@ -42,10 +42,6 @@ export CIBW_REPAIR_WHEEL_COMMAND_MACOS="delocate-wheel --require-archs {delocate python -m pip install cibuildwheel python -m cibuildwheel python-package --output-dir wheelhouse -python ops/script/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${commit_id} \ - --platform-tag ${wheel_tag} # List dependencies of libxgboost.dylib mkdir tmp diff --git a/ops/pipeline/build-test-cpu-nonomp.sh b/ops/pipeline/build-test-cpu-nonomp.sh new file mode 100755 index 000000000000..5bd6fa7f9d32 --- /dev/null +++ b/ops/pipeline/build-test-cpu-nonomp.sh @@ -0,0 +1,19 @@ +#!/bin/bash +## Ensure that XGBoost can function with OpenMP disabled + +set -euox pipefail + +mkdir -p build +pushd build +cmake .. \ + -GNinja \ + -DUSE_OPENMP=OFF \ + -DHIDE_CXX_SYMBOLS=ON \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -DBUILD_DEPRECATED_CLI=ON +time ninja -v +ctest --extra-verbose +popd diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh index ed95ba3368ab..61550d61bbae 100755 --- a/ops/pipeline/build-test-jvm-packages-impl.sh +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -1,6 +1,6 @@ #!/bin/bash ## Build and test JVM packages. -## Companion script for build-test-jvm-packages.sh. +## Companion script for ops/pipeline/build-test-jvm-packages.sh. ## ## Note. This script takes in all inputs via environment variables. 
diff --git a/ops/pipeline/build-test-jvm-packages.sh b/ops/pipeline/build-test-jvm-packages.sh index d04cc3510de5..aea905e00294 100755 --- a/ops/pipeline/build-test-jvm-packages.sh +++ b/ops/pipeline/build-test-jvm-packages.sh @@ -12,6 +12,8 @@ EOF set -euo pipefail +source ops/pipeline/get-docker-registry-details.sh + for arg in "SCALA_VERSION" do if [[ -z "${!arg:-}" ]] @@ -21,8 +23,10 @@ do fi done +CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.jvm:main + set -x -python3 ops/docker_run.py --container-id xgb-ci.jvm \ +python3 ops/docker_run.py --container-tag ${CONTAINER_TAG} \ --run-args "-e SCALA_VERSION=${SCALA_VERSION}" \ -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1 index 76cc955059b8..894304f2c4f9 100644 --- a/ops/pipeline/build-win64-gpu.ps1 +++ b/ops/pipeline/build-win64-gpu.ps1 @@ -27,20 +27,19 @@ if ($LASTEXITCODE -ne 0) { throw "Last command failed" } Write-Host "--- Build binary wheel" cd ../python-package conda activate -pip install --user -v "pip>=23" -pip --version pip wheel --no-deps -v . --wheel-dir dist/ if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -python ../ops/script/rename_whl.py ` - --wheel-path (Get-ChildItem dist/*.whl | Select-Object -Expand FullName) ` - --commit-hash $Env:GITHUB_SHA ` - --platform-tag win_amd64 +python -m wheel tags --python-tag py3 --abi-tag none ` + --platform win_amd64 --remove ` + (Get-ChildItem dist/*.whl | Select-Object -Expand FullName) if ($LASTEXITCODE -ne 0) { throw "Last command failed" } Write-Host "--- Upload Python wheel" cd .. if ( $is_release_branch -eq 1 ) { - aws s3 cp (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) ` - s3://xgboost-nightly-builds/$Env:BRANCH_NAME/ --acl public-read --no-progress + python ops/pipeline/manage-artifacts.py upload ` + --s3-bucket 'xgboost-nightly-builds' ` + --prefix "$Env:BRANCH_NAME/$Env:GITHUB_SHA" --make-public ` + (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) if ($LASTEXITCODE -ne 0) { throw "Last command failed" } } diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh index e821f334b9d2..f76724a702cb 100755 --- a/ops/pipeline/deploy-jvm-packages.sh +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -1,9 +1,10 @@ #!/bin/bash ## Deploy JVM packages to S3 bucket -set -euox pipefail +set -euo pipefail source ops/pipeline/enforce-ci.sh +source ops/pipeline/get-docker-registry-details.sh if [[ "$#" -lt 3 ]] then @@ -15,9 +16,13 @@ variant="$1" container_id="$2" scala_version="$3" +CONTAINER_TAG="${DOCKER_REGISTRY_URL}/${container_id}:main" + +set -x + if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" - python3 ops/docker_run.py --container-id "${container_id}" \ + python3 ops/docker_run.py --container-tag "${CONTAINER_TAG}" \ -- ops/pipeline/deploy-jvm-packages-impl.sh "${variant}" "${scala_version}" fi diff --git a/ops/pipeline/enforce-ci.ps1 b/ops/pipeline/enforce-ci.ps1 index 0528472be6cb..e2eb734d1229 100644 --- a/ops/pipeline/enforce-ci.ps1 +++ b/ops/pipeline/enforce-ci.ps1 @@ -1,7 +1,7 @@ ## Ensure that a script is running inside the CI. ## Usage: . ops/pipeline/enforce-ci.ps1 -if ( -Not $Env:GITHUB_ACTION ) { +if ( -Not $Env:GITHUB_ACTIONS ) { $script_name = (Split-Path -Path $PSCommandPath -Leaf) Write-Host "$script_name is not meant to run locally; it should run inside GitHub Actions." 
Write-Host "Please inspect the content of $script_name and locate the desired command manually." diff --git a/ops/pipeline/enforce-ci.sh b/ops/pipeline/enforce-ci.sh index 1e853a5ea266..292d6baec079 100755 --- a/ops/pipeline/enforce-ci.sh +++ b/ops/pipeline/enforce-ci.sh @@ -5,7 +5,7 @@ set -euo pipefail -if [[ -z ${GITHUB_ACTION:-} ]] +if [[ -z ${GITHUB_ACTIONS:-} ]] then echo "$0 is not meant to run locally; it should run inside GitHub Actions." echo "Please inspect the content of $0 and locate the desired command manually." diff --git a/ops/pipeline/get-docker-registry-details.sh b/ops/pipeline/get-docker-registry-details.sh new file mode 100755 index 000000000000..000db9a2655a --- /dev/null +++ b/ops/pipeline/get-docker-registry-details.sh @@ -0,0 +1,5 @@ +## Get details for AWS ECR (Elastic Container Registry) in environment variables + +ECR_AWS_ACCOUNT_ID="492475357299" +ECR_AWS_REGION="us-west-2" +DOCKER_REGISTRY_URL="${ECR_AWS_ACCOUNT_ID}.dkr.ecr.${ECR_AWS_REGION}.amazonaws.com" diff --git a/ops/pipeline/login-docker-registry.sh b/ops/pipeline/login-docker-registry.sh new file mode 100755 index 000000000000..a03987f484b8 --- /dev/null +++ b/ops/pipeline/login-docker-registry.sh @@ -0,0 +1,11 @@ +## Log into AWS ECR (Elastic Container Registry) to be able to pull containers from it +## Note. Requires valid AWS credentials + +set -euo pipefail + +source ops/pipeline/get-docker-registry-details.sh + +echo "aws ecr get-login-password --region ${ECR_AWS_REGION} |" \ + "docker login --username AWS --password-stdin ${DOCKER_REGISTRY_URL}" +aws ecr get-login-password --region ${ECR_AWS_REGION} \ + | docker login --username AWS --password-stdin ${DOCKER_REGISTRY_URL} diff --git a/ops/pipeline/manage-artifacts.py b/ops/pipeline/manage-artifacts.py new file mode 100644 index 000000000000..3f94678421b8 --- /dev/null +++ b/ops/pipeline/manage-artifacts.py @@ -0,0 +1,163 @@ +""" +Upload an artifact to an S3 bucket for later use +Note. This script takes in all inputs via environment variables + except the path to the artifact(s). 
+""" + +import argparse +import os +import subprocess +import sys +from pathlib import Path +from urllib.parse import SplitResult, urlsplit, urlunsplit + + +def resolve(x: Path) -> Path: + return x.expanduser().resolve() + + +def path_equals(a: Path, b: Path) -> bool: + return resolve(a) == resolve(b) + + +def compute_s3_url(*, s3_bucket: str, prefix: str, artifact: str) -> str: + if prefix == "": + return f"s3://{s3_bucket}/{artifact}" + return f"s3://{s3_bucket}/{prefix}/{artifact}" + + +def aws_s3_upload(*, src: Path, dest: str, make_public: bool) -> None: + cli_args = ["aws", "s3", "cp", "--no-progress", str(src), dest] + if make_public: + cli_args.extend(["--acl", "public-read"]) + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def aws_s3_download(*, src: str, dest_dir: Path) -> None: + cli_args = ["aws", "s3", "cp", "--no-progress", src, str(dest_dir)] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def aws_s3_download_with_wildcard(*, src: str, dest_dir: Path) -> None: + parsed_src = urlsplit(src) + src_dir = urlunsplit( + SplitResult( + scheme="s3", + netloc=parsed_src.netloc, + path=os.path.dirname(parsed_src.path), + query="", + fragment="", + ) + ) + src_glob = os.path.basename(parsed_src.path) + cli_args = [ + "aws", + "s3", + "cp", + "--recursive", + "--no-progress", + "--exclude", + "'*'", + "--include", + src_glob, + src_dir, + str(dest_dir), + ] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def upload(*, args: argparse.Namespace) -> None: + print(f"Uploading artifacts to prefix {args.prefix}...") + for artifact in args.artifacts: + artifact_path = Path(artifact) + s3_url = compute_s3_url( + s3_bucket=args.s3_bucket, prefix=args.prefix, artifact=artifact_path.name + ) + aws_s3_upload(src=artifact_path, dest=s3_url, make_public=args.make_public) + + +def download(*, args: argparse.Namespace) -> None: + print(f"Downloading artifacts from prefix {args.prefix}...") + dest_dir = Path(args.dest_dir) + print(f"mkdir -p {str(dest_dir)}") + dest_dir.mkdir(parents=True, exist_ok=True) + for artifact in args.artifacts: + s3_url = compute_s3_url( + s3_bucket=args.s3_bucket, prefix=args.prefix, artifact=artifact + ) + if "*" in artifact: + aws_s3_download_with_wildcard(src=s3_url, dest_dir=dest_dir) + else: + aws_s3_download(src=s3_url, dest_dir=dest_dir) + + +if __name__ == "__main__": + # Ensure that the current working directory is the project root + if not (Path.cwd() / "ops").is_dir() or not path_equals( + Path(__file__).parent.parent, Path.cwd() / "ops" + ): + x = Path(__file__).name + raise RuntimeError(f"Script {x} must be run at the project's root directory") + + root_parser = argparse.ArgumentParser() + subparser_factory = root_parser.add_subparsers(required=True, dest="command") + parsers = {} + for command in ["upload", "download"]: + parsers[command] = subparser_factory.add_parser(command) + parsers[command].add_argument( + "--s3-bucket", + type=str, + required=True, + help="Name of the S3 bucket to store the artifact", + ) + parsers[command].add_argument( + "--prefix", + type=str, + required=True, + help=( + "Where the artifact(s) would be stored. The artifact(s) will be stored at " + "s3://[s3-bucket]/[prefix]/[filename]." 
+ ), + ) + parsers[command].add_argument( + "artifacts", + type=str, + nargs="+", + metavar="artifact", + help=f"Artifact(s) to {command}", + ) + + parsers["upload"].add_argument( + "--make-public", action="store_true", help="Make artifact publicly accessible" + ) + parsers["download"].add_argument( + "--dest-dir", type=str, required=True, help="Where to download artifact(s)" + ) + + if len(sys.argv) == 1: + print("1. Upload artifact(s)") + parsers["upload"].print_help() + print("\n2. Download artifact(s)") + parsers["download"].print_help() + sys.exit(1) + + parsed_args = root_parser.parse_args() + if parsed_args.command == "upload": + upload(args=parsed_args) + elif parsed_args.command == "download": + download(args=parsed_args) diff --git a/ops/pipeline/publish-artifact.sh b/ops/pipeline/publish-artifact.sh deleted file mode 100755 index adcb3c521d2a..000000000000 --- a/ops/pipeline/publish-artifact.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -## Publish artifacts in an S3 bucket -## Meant to be used inside GitHub Actions - -set -euo pipefail - -source ops/pipeline/enforce-ci.sh - -if [[ $# -ne 2 ]] -then - echo "Usage: $0 [artifact] [s3_url]" - exit 1 -fi - -artifact="$1" -s3_url="$2" - -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "aws s3 cp ${artifact} ${s3_url} --acl public-read --no-progress" - aws s3 cp "${artifact}" "${s3_url}" --acl public-read --no-progress -fi diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh index 676f302009ce..3f2019f3a330 100755 --- a/ops/pipeline/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -1,9 +1,13 @@ #!/bin/bash -set -euox pipefail +set -euo pipefail -echo "--- Run clang-tidy" +source ops/pipeline/get-docker-registry-details.sh + +CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.clang_tidy:main +echo "--- Run clang-tidy" +set -x python3 ops/docker_run.py \ - --container-id xgb-ci.clang_tidy \ + --container-tag ${CONTAINER_TAG} \ -- python3 ops/script/run_clang_tidy.py --cuda-archs 75 diff --git a/ops/pipeline/stash-artifacts.ps1 b/ops/pipeline/stash-artifacts.ps1 deleted file mode 100644 index 9b9989bf376d..000000000000 --- a/ops/pipeline/stash-artifacts.ps1 +++ /dev/null @@ -1,49 +0,0 @@ -[CmdletBinding()] -Param( - [Parameter( - Mandatory=$true, - Position=0 - )][string]$command, - [Parameter( - Mandatory=$true, - Position=1 - )][string]$remote_prefix, - [Parameter( - Mandatory=$true, - Position=2, - ValueFromRemainingArguments=$true - )][string[]]$artifacts -) - -## Convenience wrapper for ops/pipeline/stash-artifacts.py -## Meant to be used inside GitHub Actions - -$ErrorActionPreference = "Stop" - -. ops/pipeline/enforce-ci.ps1 - -foreach ($env in "GITHUB_REPOSITORY", "GITHUB_RUN_ID", "RUNS_ON_S3_BUCKET_CACHE") { - $val = [Environment]::GetEnvironmentVariable($env) - if ($val -eq $null) { - Write-Host "Error: $env must be set." 
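The manage-artifacts.py helper added above replaces the stash-artifacts.* and publish-artifact.sh scripts removed in this diff; it maps each artifact name onto a flat key under the requested S3 prefix, so upload and download are symmetric. A small self-contained sketch of that mapping, reusing the compute_s3_url rule from the script (the bucket, prefix, and file name are hypothetical):

    def compute_s3_url(*, s3_bucket: str, prefix: str, artifact: str) -> str:
        # Same rule as in ops/pipeline/manage-artifacts.py.
        if prefix == "":
            return f"s3://{s3_bucket}/{artifact}"
        return f"s3://{s3_bucket}/{prefix}/{artifact}"

    url = compute_s3_url(
        s3_bucket="my-ci-bucket",
        prefix="cache/12345/build-cuda",
        artifact="xgboost.whl",
    )
    assert url == "s3://my-ci-bucket/cache/12345/build-cuda/xgboost.whl"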
- exit 1 - } -} - -$artifact_stash_prefix = "cache/${Env:GITHUB_REPOSITORY}/stash/${Env:GITHUB_RUN_ID}" - -conda activate - -Write-Host @" -python ops/pipeline/stash-artifacts.py ` - --command "${command}" ` - --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` - --prefix "${artifact_stash_prefix}/${remote_prefix}" ` - -- $artifacts -"@ -python ops/pipeline/stash-artifacts.py ` - --command "${command}" ` - --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` - --prefix "${artifact_stash_prefix}/${remote_prefix}" ` - -- $artifacts -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/ops/pipeline/stash-artifacts.py b/ops/pipeline/stash-artifacts.py deleted file mode 100644 index 151e187513da..000000000000 --- a/ops/pipeline/stash-artifacts.py +++ /dev/null @@ -1,144 +0,0 @@ -""" -Stash an artifact in an S3 bucket for later use - -Note. This script takes in all inputs via environment variables - except the path to the artifact(s). -""" - -import argparse -import os -import subprocess -from pathlib import Path -from urllib.parse import SplitResult, urlsplit, urlunsplit - - -def resolve(x: Path) -> Path: - return x.expanduser().resolve() - - -def path_equals(a: Path, b: Path) -> bool: - return resolve(a) == resolve(b) - - -def compute_s3_url(s3_bucket: str, prefix: str, artifact: Path) -> str: - filename = artifact.name - relative_path = resolve(artifact).relative_to(Path.cwd()) - if resolve(artifact.parent) == resolve(Path.cwd()): - full_prefix = prefix - else: - full_prefix = f"{prefix}/{str(relative_path.parent)}" - return f"s3://{s3_bucket}/{full_prefix}/{filename}" - - -def aws_s3_upload(src: Path, dest: str) -> None: - cli_args = ["aws", "s3", "cp", "--no-progress", str(src), dest] - print(" ".join(cli_args)) - subprocess.run( - cli_args, - check=True, - encoding="utf-8", - ) - - -def aws_s3_download(src: str, dest: Path) -> None: - cli_args = ["aws", "s3", "cp", "--no-progress", src, str(dest)] - print(" ".join(cli_args)) - subprocess.run( - cli_args, - check=True, - encoding="utf-8", - ) - - -def aws_s3_download_with_wildcard(src: str, dest: Path) -> None: - parsed_src = urlsplit(src) - src_dir = urlunsplit( - SplitResult( - scheme="s3", - netloc=parsed_src.netloc, - path=os.path.dirname(parsed_src.path), - query="", - fragment="", - ) - ) - dest_dir = dest.parent - src_glob = os.path.basename(parsed_src.path) - cli_args = [ - "aws", - "s3", - "cp", - "--recursive", - "--no-progress", - "--exclude", - "'*'", - "--include", - src_glob, - src_dir, - str(dest_dir), - ] - print(" ".join(cli_args)) - subprocess.run( - cli_args, - check=True, - encoding="utf-8", - ) - - -def upload(args: argparse.Namespace) -> None: - print(f"Stashing artifacts to prefix {args.prefix}...") - for artifact in args.artifacts: - artifact_path = Path(artifact) - s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) - aws_s3_upload(artifact_path, s3_url) - - -def download(args: argparse.Namespace) -> None: - print(f"Unstashing artifacts from prefix {args.prefix}...") - for artifact in args.artifacts: - artifact_path = Path(artifact) - print(f"mkdir -p {str(artifact_path.parent)}") - artifact_path.parent.mkdir(parents=True, exist_ok=True) - s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) - if "*" in artifact: - aws_s3_download_with_wildcard(s3_url, artifact_path) - else: - aws_s3_download(s3_url, artifact_path) - - -if __name__ == "__main__": - # Ensure that the current working directory is the project root - if not (Path.cwd() / "ops").is_dir() or not path_equals( - 
Path(__file__).parent.parent, Path.cwd() / "ops" - ): - x = Path(__file__).name - raise RuntimeError(f"Script {x} must be run at the project's root directory") - - parser = argparse.ArgumentParser() - parser.add_argument( - "--command", - type=str, - choices=["stash", "unstash"], - required=True, - help="Whether to stash or unstash the artifact", - ) - parser.add_argument( - "--s3-bucket", - type=str, - required=True, - help="Name of the S3 bucket to store the artifact", - ) - parser.add_argument( - "--prefix", - type=str, - required=True, - help=( - "Where the artifact would be stored. The artifact will be stored in " - "s3://[s3-bucket]/[prefix]." - ), - ) - parser.add_argument("artifacts", type=str, nargs="+", metavar="artifact") - parsed_args = parser.parse_args() - if parsed_args.command == "stash": - upload(parsed_args) - elif parsed_args.command == "unstash": - download(parsed_args) diff --git a/ops/pipeline/stash-artifacts.sh b/ops/pipeline/stash-artifacts.sh deleted file mode 100755 index 98c9695c4227..000000000000 --- a/ops/pipeline/stash-artifacts.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -## Convenience wrapper for ops/pipeline/stash-artifacts.py -## Meant to be used inside GitHub Actions - -set -euo pipefail - -source ops/pipeline/enforce-ci.sh - -if [[ "$#" -lt 3 ]] -then - echo "Usage: $0 {stash,unstash} [remote_prefix] [artifact] [artifact ...]" - exit 1 -fi - -command="$1" -remote_prefix="$2" -shift 2 - -for arg in "GITHUB_REPOSITORY" "GITHUB_RUN_ID" "RUNS_ON_S3_BUCKET_CACHE" -do - if [[ -z "${!arg:-}" ]] - then - echo "Error: $arg must be set." - exit 2 - fi -done - -artifact_stash_prefix="cache/${GITHUB_REPOSITORY}/stash/${GITHUB_RUN_ID}" - -set -x -python3 ops/pipeline/stash-artifacts.py \ - --command "${command}" \ - --s3-bucket "${RUNS_ON_S3_BUCKET_CACHE}" \ - --prefix "${artifact_stash_prefix}/${remote_prefix}" \ - -- "$@" diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh index 9a0cd4743c18..9fdcd314264d 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -7,36 +7,34 @@ then echo "Usage: $0 {gpu,gpu-rmm,mgpu}" exit 1 fi -arg=$1 +suite=$1 -case "${arg}" in +source ops/pipeline/get-docker-registry-details.sh + +CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.gpu:main + +case "${suite}" in gpu) echo "--- Run Google Tests, using a single GPU" - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - -- nvidia-smi - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + python3 ops/docker_run.py --container-tag ${CONTAINER_TAG} --use-gpus \ -- build/testxgboost ;; gpu-rmm) echo "--- Run Google Tests, using a single GPU, RMM enabled" - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - -- nvidia-smi - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + python3 ops/docker_run.py --container-tag ${CONTAINER_TAG} --use-gpus \ -- build/testxgboost --use-rmm-pool ;; mgpu) echo "--- Run Google Tests, using multiple GPUs" - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - -- nvidia-smi - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + python3 ops/docker_run.py --container-tag ${CONTAINER_TAG} --use-gpus \ --run-args='--shm-size=4g' \ -- build/testxgboost --gtest_filter=*MGPU* ;; *) - echo "Unrecognized arg: ${arg}" + echo "Unrecognized suite: ${suite}" exit 2 ;; esac diff --git a/ops/pipeline/test-cpp-i386-impl.sh b/ops/pipeline/test-cpp-i386-impl.sh new file mode 100755 index 000000000000..1f7653fd5e1e --- /dev/null +++ 
b/ops/pipeline/test-cpp-i386-impl.sh @@ -0,0 +1,22 @@ +#!/bin/bash +## Run C++ tests for i386 +## Companion script for ops/pipeline/test-cpp-i386.sh + +set -euox pipefail + +export CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move -Wno-narrowing' + +mkdir -p build +pushd build + +cmake .. \ + -GNinja \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=ON +time ninja -v +# TODO(hcho3): Run gtest for i386 +# ./testxgboost + +popd diff --git a/ops/pipeline/test-cpp-i386.sh b/ops/pipeline/test-cpp-i386.sh new file mode 100755 index 000000000000..19223041c3fb --- /dev/null +++ b/ops/pipeline/test-cpp-i386.sh @@ -0,0 +1,13 @@ +#!/bin/bash +## Run C++ tests for i386 + +set -euo pipefail + +source ops/pipeline/get-docker-registry-details.sh + +CONTAINER_TAG="${DOCKER_REGISTRY_URL}/xgb-ci.i386:main" + +set -x +python3 ops/docker_run.py \ + --container-tag ${CONTAINER_TAG} \ + -- bash ops/pipeline/test-cpp-i386-impl.sh diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh index 380db97c787c..0f517832113f 100755 --- a/ops/pipeline/test-jvm-gpu.sh +++ b/ops/pipeline/test-jvm-gpu.sh @@ -23,10 +23,12 @@ do fi done +source ops/pipeline/get-docker-registry-details.sh + +CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.jvm_gpu_build:main + set -x -python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ - -- nvidia-smi -python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ +python3 ops/docker_run.py --container-tag ${CONTAINER_TAG} --use-gpus \ --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1 --shm-size=4g --privileged" \ -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/test-python-wheel-impl.sh b/ops/pipeline/test-python-wheel-impl.sh index 75bfa5fbaffb..837ff03b24d7 100755 --- a/ops/pipeline/test-python-wheel-impl.sh +++ b/ops/pipeline/test-python-wheel-impl.sh @@ -34,7 +34,7 @@ export PYSPARK_DRIVER_PYTHON=$(which python) export PYSPARK_PYTHON=$(which python) export SPARK_TESTING=1 -pip install -v ./python-package/dist/*.whl +pip install -v ./wheelhouse/*.whl case "$suite" in gpu) diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh index b4dd59b7cb0e..56d54fd65d02 100755 --- a/ops/pipeline/test-python-wheel.sh +++ b/ops/pipeline/test-python-wheel.sh @@ -19,7 +19,10 @@ else gpu_option="" fi +source ops/pipeline/get-docker-registry-details.sh +CONTAINER_TAG="${DOCKER_REGISTRY_URL}/${container_id}:main" + set -x -python3 ops/docker_run.py --container-id "${container_id}" ${gpu_option} \ +python3 ops/docker_run.py --container-tag "${CONTAINER_TAG}" ${gpu_option} \ --run-args='--shm-size=4g --privileged' \ -- bash ops/pipeline/test-python-wheel-impl.sh "${suite}" diff --git a/ops/script/build_via_cmake.sh b/ops/script/build_via_cmake.sh deleted file mode 100755 index 00a571584ea4..000000000000 --- a/ops/script/build_via_cmake.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -if [[ "$#" -lt 1 ]] -then - conda_env="" -else - conda_env="$1" -fi - -if [[ "${conda_env}" == --conda-env=* ]] -then - conda_env=$(echo "${conda_env}" | sed 's/^--conda-env=//g' -) - echo "Activating Conda environment ${conda_env}" - shift 1 - cmake_args="$@" - - # Workaround for file permission error - if [[ -n ${CI_BUILD_UID:-} ]] - then - gosu root chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" /opt/miniforge/envs - fi - - # Don't activate Conda env if it's already 
activated - if [[ -z ${CONDA_PREFIX:-} ]] - then - source activate ${conda_env} - fi - cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX" -else - cmake_args="$@" - cmake_prefix_flag='' -fi - -rm -rf build -mkdir build -cd build -# Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until -# https://github.com/dmlc/xgboost/issues/10400 is fixed -set -x -cmake .. ${cmake_args} \ - -DGOOGLE_TEST=ON \ - -DUSE_DMLC_GTEST=ON \ - -DENABLE_ALL_WARNINGS=ON \ - -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ - -GNinja \ - ${cmake_prefix_flag} \ - -DHIDE_CXX_SYMBOLS=ON \ - -DBUILD_DEPRECATED_CLI=ON -ninja clean -time ninja -v -cd .. -set +x diff --git a/ops/script/format_wheel_meta.py b/ops/script/format_wheel_meta.py index a7def879905e..8b37e81bc9a7 100644 --- a/ops/script/format_wheel_meta.py +++ b/ops/script/format_wheel_meta.py @@ -27,6 +27,7 @@ def main(args: argparse.Namespace) -> None: version = tokens[1].split("+")[0] meta_info = { + "wheel_path": f"{args.commit_hash}/{wheel_name}", "wheel_name": wheel_name, "platform_tag": args.platform_tag, "version": version, diff --git a/ops/script/lint_python.py b/ops/script/lint_python.py index f418fbf1075f..8c30d261b520 100644 --- a/ops/script/lint_python.py +++ b/ops/script/lint_python.py @@ -22,7 +22,6 @@ class LintersPaths: "tests/python/test_collective.py", "tests/python/test_data_iterator.py", "tests/python/test_dmatrix.py", - "tests/python/test_dt.py", "tests/python/test_demos.py", "tests/python/test_eval_metrics.py", "tests/python/test_early_stopping.py", @@ -94,7 +93,6 @@ class LintersPaths: "python-package/", # tests "tests/python/test_collective.py", - "tests/python/test_dt.py", "tests/python/test_demos.py", "tests/python/test_data_iterator.py", "tests/python/test_multi_target.py", diff --git a/ops/script/rename_whl.py b/ops/script/rename_whl.py deleted file mode 100644 index d4467720c738..000000000000 --- a/ops/script/rename_whl.py +++ /dev/null @@ -1,62 +0,0 @@ -import argparse -import pathlib - - -def main(args: argparse.Namespace) -> None: - wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() - if not wheel_path.exists(): - raise ValueError(f"Wheel cannot be found at path {wheel_path}") - if not wheel_path.is_file(): - raise ValueError(f"Path {wheel_path} is not a valid file") - wheel_dir, wheel_name = wheel_path.parent, wheel_path.name - - tokens = wheel_name.split("-") - assert len(tokens) == 5 - version = tokens[1].split("+")[0] - keywords = { - "pkg_name": tokens[0], - "version": version, - "commit_id": args.commit_hash, - "platform_tag": args.platform_tag, - } - new_wheel_name = ( - "{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl".format( - **keywords - ) - ) - new_wheel_path = wheel_dir / new_wheel_name - print(f"Renaming {wheel_name} to {new_wheel_name}...") - if new_wheel_name == wheel_name: - print("Skipping, as the old name is identical to the new name.") - else: - if new_wheel_path.is_file(): - new_wheel_path.unlink() - wheel_path.rename(new_wheel_path) - - filesize = new_wheel_path.stat().st_size / 1024 / 1024 # MiB - print(f"Wheel size: {filesize:.2f} MiB") - - if filesize > 300: - raise RuntimeError( - f"Limit of wheel size set by PyPI is exceeded. 
{new_wheel_name}: {filesize:.2f} MiB" - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Format a Python wheel's name using the git commit hash and platform tag" - ) - parser.add_argument( - "--wheel-path", type=str, required=True, help="Path to the wheel" - ) - parser.add_argument( - "--commit-hash", type=str, required=True, help="Git commit hash" - ) - parser.add_argument( - "--platform-tag", - type=str, - required=True, - help="Platform tag (e.g. manylinux_2_28_x86_64)", - ) - parsed_args = parser.parse_args() - main(parsed_args) diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 565b61eb0669..b9f08dda6a83 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -43,7 +43,6 @@ repository = "https://github.com/dmlc/xgboost" pandas = ["pandas>=1.2"] scikit-learn = ["scikit-learn"] dask = ["dask", "pandas", "distributed"] -datatable = ["datatable"] plotting = ["graphviz", "matplotlib"] pyspark = ["pyspark", "scikit-learn", "cloudpickle"] @@ -86,3 +85,9 @@ class-attribute-naming-style = "snake_case" # Allow single-letter variables variable-rgx = "[a-zA-Z_][a-z0-9_]{0,30}$" + +[tool.pydistcheck] +inspect = true +ignore = ["compiled-objects-have-debug-symbols"] +max_allowed_size_compressed = '300M' +max_allowed_size_uncompressed = '500M' diff --git a/python-package/xgboost/_data_utils.py b/python-package/xgboost/_data_utils.py new file mode 100644 index 000000000000..23483af0751e --- /dev/null +++ b/python-package/xgboost/_data_utils.py @@ -0,0 +1,136 @@ +"""Helpers for interfacing array like objects.""" + +import copy +import ctypes +import json +from typing import Literal, Optional, Protocol, Tuple, Type, TypedDict, Union, cast + +import numpy as np + +from ._typing import CNumericPtr, DataType, NumpyOrCupy +from .compat import import_cupy + + +class _ArrayLikeArg(Protocol): + @property + def __array_interface__(self) -> "ArrayInf": ... + + +ArrayInf = TypedDict( + "ArrayInf", + { + "data": Tuple[int, bool], + "typestr": str, + "version": Literal[3], + "strides": Optional[Tuple[int, ...]], + "shape": Tuple[int, ...], + "mask": Union["ArrayInf", None, _ArrayLikeArg], + }, +) + + +def array_hasobject(data: DataType) -> bool: + """Whether the numpy array has object dtype.""" + return hasattr(data.dtype, "hasobject") and data.dtype.hasobject + + +def cuda_array_interface(data: DataType) -> bytes: + """Make cuda array interface str.""" + if array_hasobject(data): + raise ValueError("Input data contains `object` dtype. 
Expecting numeric data.") + interface = data.__cuda_array_interface__ + if "mask" in interface: + interface["mask"] = interface["mask"].__cuda_array_interface__ + interface_str = bytes(json.dumps(interface), "utf-8") + return interface_str + + +def from_array_interface(interface: ArrayInf, zero_copy: bool = False) -> NumpyOrCupy: + """Convert array interface to numpy or cupy array""" + + class Array: + """Wrapper type for communicating with numpy and cupy.""" + + _interface: Optional[ArrayInf] = None + + @property + def __array_interface__(self) -> Optional[ArrayInf]: + return self._interface + + @__array_interface__.setter + def __array_interface__(self, interface: ArrayInf) -> None: + self._interface = copy.copy(interface) + # Convert some fields to tuple as required by numpy + self._interface["shape"] = tuple(self._interface["shape"]) + self._interface["data"] = ( + self._interface["data"][0], + self._interface["data"][1], + ) + strides = self._interface.get("strides", None) + if strides is not None: + self._interface["strides"] = tuple(strides) + + @property + def __cuda_array_interface__(self) -> Optional[ArrayInf]: + return self.__array_interface__ + + @__cuda_array_interface__.setter + def __cuda_array_interface__(self, interface: ArrayInf) -> None: + self.__array_interface__ = interface + + arr = Array() + + if "stream" in interface: + # CUDA stream is presented, this is a __cuda_array_interface__. + arr.__cuda_array_interface__ = interface + out = import_cupy().array(arr, copy=not zero_copy) + else: + arr.__array_interface__ = interface + out = np.array(arr, copy=not zero_copy) + + return out + + +def make_array_interface( + ptr: CNumericPtr, shape: Tuple[int, ...], dtype: Type[np.number], is_cuda: bool +) -> ArrayInf: + """Make an __(cuda)_array_interface__ from a pointer.""" + # Use an empty array to handle typestr and descr + if is_cuda: + empty = import_cupy().empty(shape=(0,), dtype=dtype) + array = empty.__cuda_array_interface__ # pylint: disable=no-member + else: + empty = np.empty(shape=(0,), dtype=dtype) + array = empty.__array_interface__ # pylint: disable=no-member + + addr = ctypes.cast(ptr, ctypes.c_void_p).value + length = int(np.prod(shape)) + # Handle empty dataset. + assert addr is not None or length == 0 + + if addr is None: + return array + + array["data"] = (addr, True) + if is_cuda: + array["stream"] = 2 + array["shape"] = shape + array["strides"] = None + return array + + +def array_interface_dict(data: np.ndarray) -> ArrayInf: + """Convert array interface into a Python dictionary.""" + if array_hasobject(data): + raise ValueError("Input data contains `object` dtype. 
Expecting numeric data.") + arrinf = data.__array_interface__ + if "mask" in arrinf: + arrinf["mask"] = arrinf["mask"].__array_interface__ + return cast(ArrayInf, arrinf) + + +def array_interface(data: np.ndarray) -> bytes: + """Make array interface str.""" + interface = array_interface_dict(data) + interface_str = bytes(json.dumps(interface), "utf-8") + return interface_str diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 07924623955d..ad0c77edeae6 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -36,6 +36,12 @@ import numpy as np import scipy.sparse +from ._data_utils import ( + array_interface, + cuda_array_interface, + from_array_interface, + make_array_interface, +) from ._typing import ( _T, ArrayLike, @@ -58,7 +64,7 @@ TransformedData, c_bst_ulong, ) -from .compat import PANDAS_INSTALLED, DataFrame, import_cupy, py_str +from .compat import PANDAS_INSTALLED, DataFrame, py_str from .libpath import find_lib_path @@ -377,20 +383,6 @@ def _numpy2ctypes_type(dtype: Type[np.number]) -> Type[CNumeric]: return _NUMPY_TO_CTYPES_MAPPING[dtype] -def _array_hasobject(data: DataType) -> bool: - return hasattr(data.dtype, "hasobject") and data.dtype.hasobject - - -def _cuda_array_interface(data: DataType) -> bytes: - if _array_hasobject(data): - raise ValueError("Input data contains `object` dtype. Expecting numeric data.") - interface = data.__cuda_array_interface__ - if "mask" in interface: - interface["mask"] = interface["mask"].__cuda_array_interface__ - interface_str = bytes(json.dumps(interface), "utf-8") - return interface_str - - def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.ndarray: """Convert a ctypes pointer array to a numpy array.""" ctype: Type[CNumeric] = _numpy2ctypes_type(dtype) @@ -427,76 +419,6 @@ def c_array( return (ctype * len(values))(*values) -def from_array_interface(interface: dict) -> NumpyOrCupy: - """Convert array interface to numpy or cupy array""" - - class Array: - """Wrapper type for communicating with numpy and cupy.""" - - _interface: Optional[dict] = None - - @property - def __array_interface__(self) -> Optional[dict]: - return self._interface - - @__array_interface__.setter - def __array_interface__(self, interface: dict) -> None: - self._interface = copy.copy(interface) - # converts some fields to tuple as required by numpy - self._interface["shape"] = tuple(self._interface["shape"]) - self._interface["data"] = tuple(self._interface["data"]) - if self._interface.get("strides", None) is not None: - self._interface["strides"] = tuple(self._interface["strides"]) - - @property - def __cuda_array_interface__(self) -> Optional[dict]: - return self.__array_interface__ - - @__cuda_array_interface__.setter - def __cuda_array_interface__(self, interface: dict) -> None: - self.__array_interface__ = interface - - arr = Array() - - if "stream" in interface: - # CUDA stream is presented, this is a __cuda_array_interface__. 
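The helpers moved into _data_utils.py above implement NumPy's __array_interface__ protocol in both directions: array_interface_dict exports the interface dict from an ndarray, and from_array_interface rebuilds an array by wrapping such a dict in an object that exposes it. A simplified CPU-only round-trip sketch (mask handling, the typed dict, and the cupy/CUDA branch are omitted):

    import numpy as np

    def array_interface_dict(data: np.ndarray) -> dict:
        # Simplified version of xgboost._data_utils.array_interface_dict.
        if data.dtype.hasobject:
            raise ValueError("Input data contains `object` dtype. Expecting numeric data.")
        return dict(data.__array_interface__)

    def from_array_interface(interface: dict, zero_copy: bool = False) -> np.ndarray:
        # Same trick as the new from_array_interface: wrap the dict in an object
        # exposing __array_interface__ so np.array() can consume it.
        class _Wrapper:
            __array_interface__ = interface

        return np.array(_Wrapper(), copy=not zero_copy)

    x = np.arange(6, dtype=np.float32).reshape(2, 3)
    y = from_array_interface(array_interface_dict(x), zero_copy=True)
    np.testing.assert_array_equal(x, y)  # y views the same buffer as x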
- arr.__cuda_array_interface__ = interface - out = import_cupy().array(arr, copy=True) - else: - arr.__array_interface__ = interface - out = np.array(arr, copy=True) - - return out - - -def make_array_interface( - ptr: CNumericPtr, shape: Tuple[int, ...], dtype: Type[np.number], is_cuda: bool -) -> Dict[str, Union[int, tuple, None]]: - """Make an __(cuda)_array_interface__ from a pointer.""" - # Use an empty array to handle typestr and descr - if is_cuda: - empty = import_cupy().empty(shape=(0,), dtype=dtype) - array = empty.__cuda_array_interface__ # pylint: disable=no-member - else: - empty = np.empty(shape=(0,), dtype=dtype) - array = empty.__array_interface__ # pylint: disable=no-member - - addr = ctypes.cast(ptr, ctypes.c_void_p).value - length = int(np.prod(shape)) - # Handle empty dataset. - assert addr is not None or length == 0 - - if addr is None: - return array - - array["data"] = (addr, True) - if is_cuda: - array["stream"] = 2 - array["shape"] = shape - array["strides"] = None - return array - - def _prediction_output( shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool ) -> NumpyOrCupy: @@ -1495,11 +1417,8 @@ def __init__(self) -> None: # pylint: disable=super-init-not-called def _ref_data_from_cuda_interface(self, data: DataType) -> None: """Reference data from CUDA array interface.""" - interface = data.__cuda_array_interface__ - interface_str = bytes(json.dumps(interface), "utf-8") - _check_call( - _LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str) - ) + arrinf = cuda_array_interface(data) + _check_call(_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, arrinf)) def _ref_data_from_cuda_columnar(self, data: DataType, cat_codes: list) -> None: """Reference data from CUDA columnar format.""" @@ -1510,11 +1429,7 @@ def _ref_data_from_cuda_columnar(self, data: DataType, cat_codes: list) -> None: def _ref_data_from_array(self, data: np.ndarray) -> None: """Reference data from numpy array.""" - from .data import _array_interface - - _check_call( - _LIB.XGProxyDMatrixSetDataDense(self.handle, _array_interface(data)) - ) + _check_call(_LIB.XGProxyDMatrixSetDataDense(self.handle, array_interface(data))) def _ref_data_from_pandas(self, data: DataType) -> None: """Reference data from a pandas DataFrame. The input is a PandasTransformed @@ -1527,13 +1442,11 @@ def _ref_data_from_pandas(self, data: DataType) -> None: def _ref_data_from_csr(self, csr: scipy.sparse.csr_matrix) -> None: """Reference data from scipy csr.""" - from .data import _array_interface - _LIB.XGProxyDMatrixSetDataCSR( self.handle, - _array_interface(csr.indptr), - _array_interface(csr.indices), - _array_interface(csr.data), + array_interface(csr.indptr), + array_interface(csr.indices), + array_interface(csr.data), ctypes.c_size_t(csr.shape[1]), ) @@ -2311,19 +2224,14 @@ def boost( The second order of gradient. """ - from .data import ( - _array_interface, - _cuda_array_interface, - _ensure_np_dtype, - _is_cupy_alike, - ) + from .data import _ensure_np_dtype, _is_cupy_alike self._assign_dmatrix_features(dtrain) def is_flatten(array: NumpyOrCupy) -> bool: return len(array.shape) == 1 or array.shape[1] == 1 - def array_interface(array: NumpyOrCupy) -> bytes: + def grad_arrinf(array: NumpyOrCupy) -> bytes: # Can we check for __array_interface__ instead of a specific type instead? msg = ( "Expecting `np.ndarray` or `cupy.ndarray` for gradient and hessian." 
@@ -2343,9 +2251,9 @@ def array_interface(array: NumpyOrCupy) -> bytes: if isinstance(array, np.ndarray): array, _ = _ensure_np_dtype(array, array.dtype) - interface = _array_interface(array) + interface = array_interface(array) elif _is_cupy_alike(array): - interface = _cuda_array_interface(array) + interface = cuda_array_interface(array) else: raise TypeError(msg) @@ -2356,8 +2264,8 @@ def array_interface(array: NumpyOrCupy) -> bytes: self.handle, dtrain.handle, iteration, - array_interface(grad), - array_interface(hess), + grad_arrinf(grad), + grad_arrinf(hess), ) ) @@ -2675,7 +2583,6 @@ def inplace_predict( from .data import ( PandasTransformed, - _array_interface, _arrow_transform, _is_arrow, _is_cudf_df, @@ -2724,7 +2631,7 @@ def inplace_predict( _check_call( _LIB.XGBoosterPredictFromDense( self.handle, - _array_interface(data), + array_interface(data), args, p_handle, ctypes.byref(shape), @@ -2753,9 +2660,9 @@ def inplace_predict( _check_call( _LIB.XGBoosterPredictFromCSR( self.handle, - _array_interface(data.indptr), - _array_interface(data.indices), - _array_interface(data.data), + array_interface(data.indptr), + array_interface(data.indices), + array_interface(data.data), c_bst_ulong(data.shape[1]), args, p_handle, @@ -2769,7 +2676,7 @@ def inplace_predict( from .data import _transform_cupy_array data = _transform_cupy_array(data) - interface_str = _cuda_array_interface(data) + interface_str = cuda_array_interface(data) _check_call( _LIB.XGBoosterPredictFromCudaArray( self.handle, @@ -3294,6 +3201,8 @@ def get_split_value_histogram( a histogram of used splitting values for the specified feature either as numpy array or pandas DataFrame. """ + from .data import CAT_T + xgdump = self.get_dump(fmap=fmap) values = [] # pylint: disable=consider-using-f-string @@ -3321,7 +3230,7 @@ def get_split_value_histogram( except (ValueError, AttributeError, TypeError): # None.index: attr err, None[0]: type err, fn.index(-1): value err feature_t = None - if feature_t == "c": # categorical + if feature_t == CAT_T: # categorical raise ValueError( "Split value historgam doesn't support categorical split." ) diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py index b2fc191f1c02..345c7ccc990d 100644 --- a/python-package/xgboost/dask/__init__.py +++ b/python-package/xgboost/dask/__init__.py @@ -1616,7 +1616,7 @@ def _client_sync(self, func: Callable, **kwargs: Any) -> Any: @xgboost_model_doc( """Implementation of the Scikit-Learn API for XGBoost.""", ["estimators", "model"] ) -class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase): +class DaskXGBRegressor(XGBRegressorBase, DaskScikitLearnBase): """dummy doc string to workaround pylint, replaced by the decorator.""" async def _fit_async( @@ -1707,7 +1707,7 @@ def fit( "Implementation of the scikit-learn API for XGBoost classification.", ["estimators", "model"], ) -class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase): +class DaskXGBClassifier(XGBClassifierBase, DaskScikitLearnBase): # pylint: disable=missing-class-docstring async def _fit_async( self, @@ -1911,7 +1911,7 @@ def _argmax(x: Any) -> Any: For the dask implementation, group is not supported, use qid instead. 
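The base-class reordering for the Dask estimators above (e.g. DaskXGBRegressor now lists XGBRegressorBase before DaskScikitLearnBase) changes Python's method resolution order, so attributes shared by both bases now resolve to the scikit-learn side first. A toy illustration of why the order matters; these classes are not the real XGBoost hierarchy:

    class SklearnBase:
        def tag(self) -> str:
            return "sklearn"

    class DaskBase:
        def tag(self) -> str:
            return "dask"

    class OldOrder(DaskBase, SklearnBase):  # Dask mixin listed first
        pass

    class NewOrder(SklearnBase, DaskBase):  # scikit-learn base listed first
        pass

    assert OldOrder().tag() == "dask"
    assert NewOrder().tag() == "sklearn"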
""", ) -class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn): +class DaskXGBRanker(XGBRankerMixIn, DaskScikitLearnBase): @_deprecate_positional_args def __init__( self, diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 1085f28f8ff5..d49ff5e43899 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -10,6 +10,12 @@ import numpy as np +from ._data_utils import ( + array_hasobject, + array_interface, + array_interface_dict, + cuda_array_interface, +) from ._typing import ( CupyT, DataType, @@ -30,9 +36,7 @@ DataIter, DataSplitMode, DMatrix, - _array_hasobject, _check_call, - _cuda_array_interface, _ProxyDMatrix, c_str, from_pystr_to_cstr, @@ -81,21 +85,6 @@ def is_scipy_csr(data: DataType) -> bool: return is_array or is_matrix -def _array_interface_dict(data: np.ndarray) -> dict: - if _array_hasobject(data): - raise ValueError("Input data contains `object` dtype. Expecting numeric data.") - interface = data.__array_interface__ - if "mask" in interface: - interface["mask"] = interface["mask"].__array_interface__ - return interface - - -def _array_interface(data: np.ndarray) -> bytes: - interface = _array_interface_dict(data) - interface_str = bytes(json.dumps(interface), "utf-8") - return interface_str - - def transform_scipy_sparse(data: DataType, is_csr: bool) -> DataType: """Ensure correct data alignment and data type for scipy sparse inputs. Input should be either csr or csc matrix. @@ -136,9 +125,9 @@ def _from_scipy_csr( data = transform_scipy_sparse(data, True) _check_call( _LIB.XGDMatrixCreateFromCSR( - _array_interface(data.indptr), - _array_interface(data.indices), - _array_interface(data.data), + array_interface(data.indptr), + array_interface(data.indices), + array_interface(data.data), c_bst_ulong(data.shape[1]), make_jcargs( missing=float(missing), @@ -184,9 +173,9 @@ def _from_scipy_csc( transform_scipy_sparse(data, False) _check_call( _LIB.XGDMatrixCreateFromCSC( - _array_interface(data.indptr), - _array_interface(data.indices), - _array_interface(data.data), + array_interface(data.indptr), + array_interface(data.indices), + array_interface(data.data), c_bst_ulong(data.shape[0]), make_jcargs( missing=float(missing), @@ -225,7 +214,7 @@ def _is_np_array_like(data: DataType) -> TypeGuard[np.ndarray]: def _ensure_np_dtype( data: DataType, dtype: Optional[NumpyDType] ) -> Tuple[np.ndarray, Optional[NumpyDType]]: - if _array_hasobject(data) or data.dtype in [np.float16, np.bool_]: + if array_hasobject(data) or data.dtype in [np.float16, np.bool_]: dtype = np.float32 data = data.astype(dtype, copy=False) if not data.flags.aligned: @@ -261,7 +250,7 @@ def _from_numpy_array( handle = ctypes.c_void_p() _check_call( _LIB.XGDMatrixCreateFromDense( - _array_interface(data), + array_interface(data), make_jcargs( missing=float(missing), nthread=int(nthread), @@ -604,7 +593,7 @@ def __init__(self, columns: List[np.ndarray]) -> None: def array_interface(self) -> bytes: """Return a byte string for JSON encoded array interface.""" - aitfs = list(map(_array_interface_dict, self.columns)) + aitfs = list(map(array_interface_dict, self.columns)) sarrays = bytes(json.dumps(aitfs), "utf-8") return sarrays @@ -732,110 +721,6 @@ def _from_pandas_series( ) -def _is_dt_df(data: DataType) -> bool: - return lazy_isinstance(data, "datatable", "Frame") or lazy_isinstance( - data, "datatable", "DataTable" - ) - - -def _transform_dt_df( - data: DataType, - feature_names: Optional[FeatureNames], - feature_types: Optional[FeatureTypes], - meta: 
Optional[str] = None, - meta_type: Optional[NumpyDType] = None, -) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]: - """Validate feature names and types if data table""" - _dt_type_mapper = {"bool": "bool", "int": "int", "real": "float"} - _dt_type_mapper2 = {"bool": "i", "int": "int", "real": "float"} - if meta and data.shape[1] > 1: - raise ValueError("DataTable for meta info cannot have multiple columns") - if meta: - meta_type = "float" if meta_type is None else meta_type - # below requires new dt version - # extract first column - data = data.to_numpy()[:, 0].astype(meta_type) - return data, None, None - - data_types_names = tuple(lt.name for lt in data.ltypes) - bad_fields = [ - data.names[i] - for i, type_name in enumerate(data_types_names) - if type_name not in _dt_type_mapper - ] - if bad_fields: - msg = """DataFrame.types for data must be int, float or bool. - Did not expect the data types in fields """ - raise ValueError(msg + ", ".join(bad_fields)) - - if feature_names is None and meta is None: - feature_names = data.names - - # always return stypes for dt ingestion - if feature_types is not None: - raise ValueError("DataTable has own feature types, cannot pass them in.") - feature_types = np.vectorize(_dt_type_mapper2.get)(data_types_names).tolist() - - return data, feature_names, feature_types - - -def _from_dt_df( - *, - data: DataType, - missing: Optional[FloatCompatible], - nthread: int, - feature_names: Optional[FeatureNames], - feature_types: Optional[FeatureTypes], - enable_categorical: bool, -) -> DispatchedDataBackendReturnType: - if enable_categorical: - raise ValueError("categorical data in datatable is not supported yet.") - data, feature_names, feature_types = _transform_dt_df( - data=data, - feature_names=feature_names, - feature_types=feature_types, - meta=None, - meta_type=None, - ) - - ptrs = (ctypes.c_void_p * data.ncols)() - if hasattr(data, "internal") and hasattr(data.internal, "column"): - # datatable>0.8.0 - for icol in range(data.ncols): - col = data.internal.column(icol) - ptr = col.data_pointer - ptrs[icol] = ctypes.c_void_p(ptr) - else: - # datatable<=0.8.0 - from datatable.internal import ( - frame_column_data_r, # pylint: disable=no-name-in-module - ) - - for icol in range(data.ncols): - ptrs[icol] = frame_column_data_r(data, icol) - - # always return stypes for dt ingestion - feature_type_strings = (ctypes.c_char_p * data.ncols)() - for icol in range(data.ncols): - feature_type_strings[icol] = ctypes.c_char_p( - data.stypes[icol].name.encode("utf-8") - ) - - _warn_unused_missing(data, missing) - handle = ctypes.c_void_p() - _check_call( - _LIB.XGDMatrixCreateFromDT( - ptrs, - feature_type_strings, - c_bst_ulong(data.shape[0]), - c_bst_ulong(data.shape[1]), - ctypes.byref(handle), - ctypes.c_int(nthread), - ) - ) - return handle, feature_names, feature_types - - def _is_arrow(data: DataType) -> bool: return lazy_isinstance(data, "pyarrow.lib", "Table") or lazy_isinstance( data, "pyarrow._dataset", "Dataset" @@ -1032,7 +917,7 @@ def _transform_cupy_array(data: DataType) -> CupyT: if not hasattr(data, "__cuda_array_interface__") and hasattr(data, "__array__"): data = cupy.array(data, copy=False) - if _array_hasobject(data) or data.dtype in [cupy.bool_]: + if array_hasobject(data) or data.dtype in [cupy.bool_]: data = data.astype(cupy.float32, copy=False) return data @@ -1046,7 +931,7 @@ def _from_cupy_array( ) -> DispatchedDataBackendReturnType: """Initialize DMatrix from cupy ndarray.""" data = _transform_cupy_array(data) - 
interface_str = _cuda_array_interface(data) + interface_str = cuda_array_interface(data) handle = ctypes.c_void_p() config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8") _check_call( @@ -1297,16 +1182,6 @@ def dispatch_data_backend( raise TypeError("cupyx CSC is not supported yet.") if _is_dlpack(data): return _from_dlpack(data, missing, threads, feature_names, feature_types) - if _is_dt_df(data): - _warn_unused_missing(data, missing) - return _from_dt_df( - data=data, - missing=missing, - nthread=threads, - feature_names=feature_names, - feature_types=feature_types, - enable_categorical=enable_categorical, - ) if _is_modin_df(data): return _from_pandas_df( data=data, @@ -1372,7 +1247,7 @@ def _meta_from_numpy( interface = data.__array_interface__ if interface.get("mask", None) is not None: raise ValueError("Masked array is not supported.") - interface_str = _array_interface(data) + interface_str = array_interface(data) _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str)) @@ -1394,7 +1269,7 @@ def _meta_from_cudf_df(data: DataType, field: str, handle: ctypes.c_void_p) -> N _meta_from_cudf_series(data.iloc[:, 0], field, handle) else: data = data.values - interface = _cuda_array_interface(data) + interface = cuda_array_interface(data) _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) @@ -1409,15 +1284,6 @@ def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) - _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) -def _meta_from_dt( - data: DataType, field: str, dtype: Optional[NumpyDType], handle: ctypes.c_void_p -) -> None: - data, _, _ = _transform_dt_df( - data=data, feature_names=None, feature_types=None, meta=field, meta_type=dtype - ) - _meta_from_numpy(data, field, dtype, handle) - - def dispatch_meta_backend( matrix: DMatrix, data: DataType, name: str, dtype: Optional[NumpyDType] = None ) -> None: @@ -1459,9 +1325,6 @@ def dispatch_meta_backend( if _is_cudf_df(data): _meta_from_cudf_df(data, name, handle) return - if _is_dt_df(data): - _meta_from_dt(data, name, dtype, handle) - return if _is_modin_df(data): _meta_from_pandas_df(data, name, dtype=dtype, handle=handle) return diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index b197539bfc1f..62d91be29bac 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -536,13 +536,15 @@ def task(i: int) -> float: information) instead. """ +TDoc = TypeVar("TDoc", bound=Type) + def xgboost_model_doc( header: str, items: List[str], extra_parameters: Optional[str] = None, end_note: Optional[str] = None, -) -> Callable[[Type], Type]: +) -> Callable[[TDoc], TDoc]: """Obtain documentation for Scikit-Learn wrappers Parameters @@ -568,7 +570,7 @@ def get_doc(item: str) -> str: } return __doc[item] - def adddoc(cls: Type) -> Type: + def adddoc(cls: TDoc) -> TDoc: doc = [ """ Parameters diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 0821aee913c3..53779403917b 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -152,10 +152,6 @@ def no_modin() -> PytestSkip: return {"reason": "Failed import modin.", "condition": True} -def no_dt() -> PytestSkip: - return no_mod("datatable") - - def no_matplotlib() -> PytestSkip: reason = "Matplotlib is not installed." 
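The TypeVar introduced in sklearn.py above (TDoc, bound to Type) makes the xgboost_model_doc decorator type-preserving: static checkers now see the decorated class as its concrete type rather than a bare Type. A small sketch of the same pattern with a hypothetical decorator:

    from typing import Type, TypeVar

    TClass = TypeVar("TClass", bound=Type)

    def add_doc(header: str):
        # Callable[[TClass], TClass] keeps the concrete class type,
        # whereas Callable[[Type], Type] erases it for static checkers.
        def decorate(cls: TClass) -> TClass:
            cls.__doc__ = header + (cls.__doc__ or "")
            return cls

        return decorate

    @add_doc("Example header.\n")
    class Model:
        """Original docstring."""

    assert Model.__doc__.startswith("Example header.")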
try: diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index d3e11d2f894c..14154066908e 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -583,17 +583,6 @@ XGB_DLL int XGDMatrixCreateFromMat_omp(const bst_float* data, // NOLINT API_END(); } -XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes, - xgboost::bst_ulong nrow, - xgboost::bst_ulong ncol, DMatrixHandle* out, - int nthread) { - API_BEGIN(); - data::DataTableAdapter adapter(data, feature_stypes, nrow, ncol); - xgboost_CHECK_C_ARG_PTR(out); - *out = new std::shared_ptr(DMatrix::Create(&adapter, std::nan(""), nthread)); - API_END(); -} - XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle, const int *idxset, xgboost::bst_ulong len, DMatrixHandle *out) { xgboost_CHECK_C_ARG_PTR(out); diff --git a/src/common/quantile.cc b/src/common/quantile.cc index eb02924aaf8a..61e12100b17b 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -112,7 +112,6 @@ void HostSketchContainer::PushAdapterBatch(Batch const &batch, size_t base_rowid INSTANTIATE(ArrayAdapterBatch) INSTANTIATE(CSRArrayAdapterBatch) INSTANTIATE(CSCAdapterBatch) -INSTANTIATE(DataTableAdapterBatch) INSTANTIATE(SparsePageAdapterBatch) INSTANTIATE(ColumnarAdapterBatch) diff --git a/src/data/adapter.h b/src/data/adapter.h index 1467d3376886..741fd69cebb2 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -536,131 +536,6 @@ class CSCArrayAdapter : public detail::SingleBatchDataIter [[nodiscard]] const CSCArrayAdapterBatch& Value() const override { return batch_; } }; -class DataTableAdapterBatch : public detail::NoMetaInfo { - enum class DTType : std::uint8_t { - kFloat32 = 0, - kFloat64 = 1, - kBool8 = 2, - kInt32 = 3, - kInt8 = 4, - kInt16 = 5, - kInt64 = 6, - kUnknown = 7 - }; - - static DTType DTGetType(std::string type_string) { - if (type_string == "float32") { - return DTType::kFloat32; - } else if (type_string == "float64") { - return DTType::kFloat64; - } else if (type_string == "bool8") { - return DTType::kBool8; - } else if (type_string == "int32") { - return DTType::kInt32; - } else if (type_string == "int8") { - return DTType::kInt8; - } else if (type_string == "int16") { - return DTType::kInt16; - } else if (type_string == "int64") { - return DTType::kInt64; - } else { - LOG(FATAL) << "Unknown data table type."; - return DTType::kUnknown; - } - } - - public: - DataTableAdapterBatch(void const* const* const data, char const* const* feature_stypes, - std::size_t num_rows, std::size_t num_features) - : data_(data), num_rows_(num_rows) { - CHECK(feature_types_.empty()); - std::transform(feature_stypes, feature_stypes + num_features, - std::back_inserter(feature_types_), - [](char const* stype) { return DTGetType(stype); }); - } - - private: - class Line { - std::size_t row_idx_; - void const* const* const data_; - std::vector const& feature_types_; - - float DTGetValue(void const* column, DTType dt_type, std::size_t ridx) const { - float missing = std::numeric_limits::quiet_NaN(); - switch (dt_type) { - case DTType::kFloat32: { - float val = reinterpret_cast(column)[ridx]; - return std::isfinite(val) ? val : missing; - } - case DTType::kFloat64: { - double val = reinterpret_cast(column)[ridx]; - return std::isfinite(val) ? static_cast(val) : missing; - } - case DTType::kBool8: { - bool val = reinterpret_cast(column)[ridx]; - return static_cast(val); - } - case DTType::kInt32: { - int32_t val = reinterpret_cast(column)[ridx]; - return val != (-2147483647 - 1) ? 
static_cast(val) : missing; - } - case DTType::kInt8: { - int8_t val = reinterpret_cast(column)[ridx]; - return val != -128 ? static_cast(val) : missing; - } - case DTType::kInt16: { - int16_t val = reinterpret_cast(column)[ridx]; - return val != -32768 ? static_cast(val) : missing; - } - case DTType::kInt64: { - int64_t val = reinterpret_cast(column)[ridx]; - return val != -9223372036854775807 - 1 ? static_cast(val) : missing; - } - default: { - LOG(FATAL) << "Unknown data table type."; - return 0.0f; - } - } - } - - public: - Line(std::size_t ridx, void const* const* const data, std::vector const& ft) - : row_idx_{ridx}, data_{data}, feature_types_{ft} {} - [[nodiscard]] std::size_t Size() const { return feature_types_.size(); } - [[nodiscard]] COOTuple GetElement(std::size_t idx) const { - return COOTuple{row_idx_, idx, DTGetValue(data_[idx], feature_types_[idx], row_idx_)}; - } - }; - - public: - [[nodiscard]] size_t Size() const { return num_rows_; } - [[nodiscard]] const Line GetLine(std::size_t ridx) const { return {ridx, data_, feature_types_}; } - static constexpr bool kIsRowMajor = true; - - private: - void const* const* const data_; - - std::vector feature_types_; - std::size_t num_rows_; -}; - -class DataTableAdapter : public detail::SingleBatchDataIter { - public: - DataTableAdapter(void** data, const char** feature_stypes, std::size_t num_rows, - std::size_t num_features) - : batch_(data, feature_stypes, num_rows, num_features), - num_rows_(num_rows), - num_columns_(num_features) {} - [[nodiscard]] const DataTableAdapterBatch& Value() const override { return batch_; } - [[nodiscard]] std::size_t NumRows() const { return num_rows_; } - [[nodiscard]] std::size_t NumColumns() const { return num_columns_; } - - private: - DataTableAdapterBatch batch_; - std::size_t num_rows_; - std::size_t num_columns_; -}; - class ColumnarAdapterBatch : public detail::NoMetaInfo { common::Span> columns_; diff --git a/src/data/data.cc b/src/data/data.cc index 713ad4a1a514..ed3a17eaf841 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -878,8 +878,8 @@ DMatrix* TryLoadBinary(std::string fname, bool silent) { if (magic == data::SimpleDMatrix::kMagic) { DMatrix* dmat = new data::SimpleDMatrix(&is); if (!silent) { - LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " - << dmat->Info().num_nonzero_ << " entries loaded from " << fname; + LOG(INFO) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " + << dmat->Info().num_nonzero_ << " entries loaded from " << fname; } return dmat; } @@ -993,7 +993,6 @@ INSTANTIATION_CREATE(DenseAdapter) INSTANTIATION_CREATE(ArrayAdapter) INSTANTIATION_CREATE(CSRAdapter) INSTANTIATION_CREATE(CSCAdapter) -INSTANTIATION_CREATE(DataTableAdapter) INSTANTIATION_CREATE(FileAdapter) INSTANTIATION_CREATE(CSRArrayAdapter) INSTANTIATION_CREATE(CSCArrayAdapter) @@ -1271,8 +1270,6 @@ template uint64_t SparsePage::Push(const data::CSRArrayAdapterBatch& batch, floa template uint64_t SparsePage::Push(const data::CSCArrayAdapterBatch& batch, float missing, int nthread); template uint64_t SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread); -template uint64_t SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing, - int nthread); template uint64_t SparsePage::Push(const data::FileAdapterBatch& batch, float missing, int nthread); template uint64_t SparsePage::Push(const data::ColumnarAdapterBatch& batch, float missing, std::int32_t nthread); diff --git 
a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 0edc0bbc79de..241dea89b13c 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -357,8 +357,6 @@ template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, i DataSplitMode data_split_mode); template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread, DataSplitMode data_split_mode); -template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread, - DataSplitMode data_split_mode); template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread, DataSplitMode data_split_mode); template SimpleDMatrix::SimpleDMatrix(ColumnarAdapter* adapter, float missing, int nthread, diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index 57565a85e214..b022349ae1c6 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -25,34 +25,6 @@ #include "../../../src/data/sparse_page_dmatrix.h" // for SparsePageDMatrix #include "../helpers.h" -TEST(CAPI, XGDMatrixCreateFromMatDT) { - std::vector col0 = {0, -1, 3}; - std::vector col1 = {-4.0f, 2.0f, 0.0f}; - const char *col0_type = "int32"; - const char *col1_type = "float32"; - std::vector data = {col0.data(), col1.data()}; - std::vector types = {col0_type, col1_type}; - DMatrixHandle handle; - XGDMatrixCreateFromDT(data.data(), types.data(), 3, 2, &handle, - 0); - std::shared_ptr *dmat = - static_cast *>(handle); - xgboost::MetaInfo &info = (*dmat)->Info(); - ASSERT_EQ(info.num_col_, 2ul); - ASSERT_EQ(info.num_row_, 3ul); - ASSERT_EQ(info.num_nonzero_, 6ul); - - for (const auto &batch : (*dmat)->GetBatches()) { - auto page = batch.GetView(); - ASSERT_EQ(page[0][0].fvalue, 0.0f); - ASSERT_EQ(page[0][1].fvalue, -4.0f); - ASSERT_EQ(page[2][0].fvalue, 3.0f); - ASSERT_EQ(page[2][1].fvalue, 0.0f); - } - - delete dmat; -} - TEST(CAPI, XGDMatrixCreateFromMatOmp) { std::vector num_rows = {100, 11374, 15000}; for (auto row : num_rows) { diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py index ea9dade9673e..d1005b2014db 100644 --- a/tests/python-gpu/test_gpu_prediction.py +++ b/tests/python-gpu/test_gpu_prediction.py @@ -477,7 +477,7 @@ def test_predict_leaf_dart(self, param: dict, dataset: tm.TestDataset) -> None: ) @settings(deadline=None, max_examples=20, print_blob=True) def test_predict_categorical_split(self, df): - from sklearn.metrics import mean_squared_error + from sklearn.metrics import root_mean_squared_error df = df.astype("category") x0, x1 = df["x0"].to_numpy(), df["x1"].to_numpy() @@ -504,7 +504,7 @@ def test_predict_categorical_split(self, df): ) bst.set_param({"device": "cuda:0"}) pred = bst.predict(dtrain) - rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False) + rmse = root_mean_squared_error(y_true=y, y_pred=pred) np.testing.assert_almost_equal( rmse, eval_history["train"]["rmse"][-1], decimal=5 ) diff --git a/tests/python/test_dt.py b/tests/python/test_dt.py deleted file mode 100644 index 1e583b0579d1..000000000000 --- a/tests/python/test_dt.py +++ /dev/null @@ -1,41 +0,0 @@ -import numpy as np -import pytest - -import xgboost as xgb - -dt = pytest.importorskip("datatable") -pd = pytest.importorskip("pandas") - - -class TestDataTable: - def test_dt(self) -> None: - df = pd.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"]) - dtable = dt.Frame(df) - labels = dt.Frame([1, 2]) - dm = xgb.DMatrix(dtable, label=labels) - assert dm.feature_names == ["a", "b", 
"c"] - assert dm.feature_types == ["int", "float", "i"] - assert dm.num_row() == 2 - assert dm.num_col() == 3 - - np.testing.assert_array_equal(np.array([1, 2]), dm.get_label()) - - # overwrite feature_names - dm = xgb.DMatrix(dtable, label=pd.Series([1, 2]), feature_names=["x", "y", "z"]) - assert dm.feature_names == ["x", "y", "z"] - assert dm.num_row() == 2 - assert dm.num_col() == 3 - - # incorrect dtypes - df = pd.DataFrame([[1, 2.0, "x"], [2, 3.0, "y"]], columns=["a", "b", "c"]) - dtable = dt.Frame(df) - with pytest.raises(ValueError): - xgb.DMatrix(dtable) - - df = pd.DataFrame({"A=1": [1, 2, 3], "A=2": [4, 5, 6]}) - dtable = dt.Frame(df) - dm = xgb.DMatrix(dtable) - assert dm.feature_names == ["A=1", "A=2"] - assert dm.feature_types == ["int", "int"] - assert dm.num_row() == 3 - assert dm.num_col() == 2