From 74e4c8328450fe8618cdd312b33dd5ce7805073c Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 6 Jun 2024 11:20:16 -0700 Subject: [PATCH 1/4] remove deprecated code and avoid default dask tokenization --- python-package/xgboost/core.py | 8 ++++++++ .../test_gpu_with_dask/test_gpu_with_dask.py | 10 +++++----- .../test_distributed/test_with_dask/test_with_dask.py | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 76251d65c522..8c3fcb82a23d 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -7,6 +7,7 @@ import os import re import sys +import uuid import warnings import weakref from abc import ABC, abstractmethod @@ -3143,3 +3144,10 @@ def get_split_value_histogram( UserWarning, ) return nph_stacked + + def __dask_tokenize__(self): + # TODO: Implement proper tokenization to avoid + # unnecessary re-computation in Dask. However, + # default tokenzation causes problems after + # https://github.com/dask/dask/pull/10883 + return uuid.uuid4() diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index 905947d874ee..c872ec9d7b1d 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -248,10 +248,10 @@ def test_categorical(self, local_cuda_client: Client) -> None: import dask_cudf X, y = make_categorical(local_cuda_client, 10000, 30, 13) - X = dask_cudf.from_dask_dataframe(X) + X = X.to_backend("cudf") X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, True) - X_onehot = dask_cudf.from_dask_dataframe(X_onehot) + X_onehot = X_onehot.to_backend("cudf") run_categorical(local_cuda_client, "hist", "cuda", X, X_onehot, y) @given( @@ -383,9 +383,9 @@ def test_dask_classifier(self, model: str, local_cuda_client: Client) -> None: X_, y_, w_ = 
generate_array(with_weights=True) y_ = (y_ * 10).astype(np.int32) - X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_)) - y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_)) - w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_)) + X = dd.from_dask_array(X_).to_backend("cudf") + y = dd.from_dask_array(y_).to_backend("cudf") + w = dd.from_dask_array(w_).to_backend("cudf") run_dask_classifier(X, y, w, model, "hist", "cuda", local_cuda_client, 10) def test_empty_dmatrix(self, local_cuda_client: Client) -> None: diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 56abccb95ef5..2948ef657781 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -395,6 +395,7 @@ def check_model_output(model: xgb.dask.Booster) -> None: assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"]) booster = reg.get_booster() + #import pdb; pdb.set_trace() predt = xgb.dask.predict(client, booster, X).compute().values inpredt = xgb.dask.inplace_predict(client, booster, X).compute().values From 6bd0645c6f88c5fb47355997bdb8838d4454bb08 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 6 Jun 2024 11:23:45 -0700 Subject: [PATCH 2/4] remove comment --- tests/test_distributed/test_with_dask/test_with_dask.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 2948ef657781..56abccb95ef5 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -395,7 +395,6 @@ def check_model_output(model: xgb.dask.Booster) -> None: assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"]) booster = reg.get_booster() - #import pdb; pdb.set_trace() predt = xgb.dask.predict(client, booster, X).compute().values inpredt = 
xgb.dask.inplace_predict(client, booster, X).compute().values From 92a440bad18be27af6491605f0a40653f468f0ed Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Jun 2024 14:59:40 +0800 Subject: [PATCH 3/4] Remove dask pin. --- tests/ci_build/Dockerfile.gpu | 2 +- tests/ci_build/Dockerfile.gpu_dev_ver | 2 +- tests/ci_build/conda_env/linux_cpu_test.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index f68ba9d6b14b..6a37d21ffe71 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -25,7 +25,7 @@ RUN \ mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \ python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \ "nccl>=${NCCL_SHORT_VER}" \ - dask=2024.1.1 \ + dask \ dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ "pyspark>=3.4.0" cloudpickle cuda-python && \ diff --git a/tests/ci_build/Dockerfile.gpu_dev_ver b/tests/ci_build/Dockerfile.gpu_dev_ver index a592d4891093..dfcb379642f2 100644 --- a/tests/ci_build/Dockerfile.gpu_dev_ver +++ b/tests/ci_build/Dockerfile.gpu_dev_ver @@ -28,7 +28,7 @@ RUN \ mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \ python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cudatoolkit=$CUDA_VERSION_ARG \ "nccl>=${NCCL_SHORT_VER}" \ - dask=2024.1.1 \ + dask \ "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ "pyspark>=3.4.0" cloudpickle cuda-python && \ diff --git a/tests/ci_build/conda_env/linux_cpu_test.yml b/tests/ci_build/conda_env/linux_cpu_test.yml index d87d8fdef6b4..fd630c85a07f 100644 --- a/tests/ci_build/conda_env/linux_cpu_test.yml +++ 
b/tests/ci_build/conda_env/linux_cpu_test.yml @@ -17,8 +17,8 @@ dependencies: - scikit-learn - pandas - matplotlib -- dask>=2022.6 -- distributed>=2022.6 +- dask +- distributed - python-graphviz - hypothesis>=6.46 - astroid From 1d8aa74588dad090ffe881edf4115ff2c2d6a68f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 14 Jun 2024 17:04:37 +0800 Subject: [PATCH 4/4] Typing. --- python-package/xgboost/core.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 2f11b8d18e82..1722670fdf5c 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -3145,9 +3145,8 @@ def get_split_value_histogram( ) return nph_stacked - def __dask_tokenize__(self): - # TODO: Implement proper tokenization to avoid - # unnecessary re-computation in Dask. However, - # default tokenzation causes problems after + def __dask_tokenize__(self) -> uuid.UUID: + # TODO: Implement proper tokenization to avoid unnecessary re-computation in + # Dask. However, default tokenization causes problems after # https://github.com/dask/dask/pull/10883 return uuid.uuid4()