Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[jvm-packages] [pyspark] Make QDM optional based on cuDF check #8471

Merged
merged 6 commits into from
Nov 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions python-package/xgboost/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
pandas_concat = None
PANDAS_INSTALLED = False


# sklearn
try:
from sklearn.base import BaseEstimator as XGBModelBase
Expand Down Expand Up @@ -72,6 +73,22 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
XGBStratifiedKFold = None


_logger = logging.getLogger(__name__)


def is_cudf_available() -> bool:
"""Check cuDF package available or not"""
if importlib.util.find_spec("cudf") is None:
return False
Comment on lines +81 to +82
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check avoids printing an import error when cuDF is not installed (specifically for the Databricks runtime).

try:
import cudf

return True
except ImportError:
_logger.exception("Importing cuDF failed, use DMatrix instead of QDM")
WeichenXu123 marked this conversation as resolved.
Show resolved Hide resolved
return False


class XGBoostLabelEncoder(LabelEncoder):
"""Label encoder with JSON serialization methods."""

Expand Down
13 changes: 12 additions & 1 deletion python-package/xgboost/spark/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
ShortType,
)
from scipy.special import expit, softmax # pylint: disable=no-name-in-module
from xgboost.compat import is_cudf_available
from xgboost.core import Booster
from xgboost.training import train as worker_train

Expand Down Expand Up @@ -755,7 +756,8 @@ def _fit(self, dataset):
k: v for k, v in train_call_kwargs_params.items() if v is not None
}
dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}
use_qdm = booster_params.get("tree_method", None) in ("hist", "gpu_hist")

use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")

def _train_booster(pandas_df_iter):
"""Takes in an RDD partition and outputs a booster for that partition after
Expand All @@ -769,6 +771,15 @@ def _train_booster(pandas_df_iter):

gpu_id = None

# If cuDF is not installed, use DMatrix instead of QDM,
# because without cuDF, DMatrix performs better than QDM.
# Note: `is_cudf_available` is checked on the Spark worker side because
# a Spark worker might have a different Python environment from the driver.
if use_gpu:
use_qdm = use_hist and is_cudf_available()
else:
use_qdm = use_hist

if use_qdm and (booster_params.get("max_bin", None) is not None):
dmatrix_kwargs["max_bin"] = booster_params["max_bin"]

Expand Down