From bce3da7a60a28dc82b63adaa49c12713294e9310 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 16 Nov 2022 22:51:30 +0800 Subject: [PATCH 1/6] init Signed-off-by: Weichen Xu --- python-package/xgboost/compat.py | 5 +++++ python-package/xgboost/spark/core.py | 10 +++++++++- python-package/xgboost/spark/data.py | 4 ++-- python-package/xgboost/spark/params.py | 16 ++++++++++++++++ 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 275b6621064d..5393014f835f 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -43,6 +43,11 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool: pandas_concat = None PANDAS_INSTALLED = False + +# cuDF +CUDF_INSTALLED = importlib.util.find_spec("cudf") is not None + + # sklearn try: from sklearn.base import BaseEstimator as XGBModelBase diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 1e544c34f228..11ad72b6457b 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -37,6 +37,7 @@ import xgboost from xgboost import XGBClassifier, XGBRanker, XGBRegressor +from xgboost.compat import CUDF_INSTALLED from .data import ( _read_csr_matrix_from_unwrapped_spark_vec, @@ -56,6 +57,7 @@ HasEnableSparseDataOptim, HasFeaturesCols, HasQueryIdCol, + UseQuantileDMatrix, ) from .utils import ( CommunicatorContext, @@ -150,6 +152,7 @@ class _SparkXGBParams( HasFeaturesCols, HasEnableSparseDataOptim, HasQueryIdCol, + UseQuantileDMatrix, ): num_workers = Param( Params._dummy(), @@ -755,7 +758,12 @@ def _fit(self, dataset): k: v for k, v in train_call_kwargs_params.items() if v is not None } dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None} - use_qdm = booster_params.get("tree_method", None) in ("hist", "gpu_hist") + + if self.isDefined(self.use_quantile_dmatrix): + use_qdm = self.getOrDefault(self.use_quantile_dmatrix) + else: + use_qdm = CUDF_INSTALLED and \ + booster_params.get("tree_method", None) in ("hist", "gpu_hist") def _train_booster(pandas_df_iter): """Takes in an RDD partition and outputs a booster for that partition after diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py index b2cf3e654e37..87ae145d196c 100644 --- a/python-package/xgboost/spark/data.py +++ b/python-package/xgboost/spark/data.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd from scipy.sparse import csr_matrix -from xgboost.compat import concat +from xgboost.compat import CUDF_INSTALLED, concat from xgboost import DataIter, DMatrix, QuantileDMatrix @@ -81,7 +81,7 @@ def _fetch(self, data: Optional[Sequence[pd.DataFrame]]) -> Optional[pd.DataFram if not data: return None - if self._device_id is not None: + if self._device_id is not None and CUDF_INSTALLED: import cudf # pylint: disable=import-error import cupy as cp # pylint: disable=import-error diff --git a/python-package/xgboost/spark/params.py b/python-package/xgboost/spark/params.py index 77cfcd137d99..04a7ca9d73ef 100644 --- a/python-package/xgboost/spark/params.py +++ b/python-package/xgboost/spark/params.py @@ -85,3 +85,19 @@ class HasQueryIdCol(Params): "query id column name", typeConverter=TypeConverters.toString, ) + + +class UseQuantileDMatrix(Params): + """ + Mixin for param use_quantile_dmatrix: + """ + use_quantile_dmatrix = Param( + Params._dummy(), + "use_quantile_dmatrix", + "This stores the boolean config of constructing quantile DMatrix instead of plain " + "DMatrix as xgboost training input. By default, if 'tree_method' param is 'hist' or " + "'gpu_hist' and 'cuDF' package is installed, the config is ON, otherwise the config " + "is OFF. Note that if you do not install 'cuDF' package, turning on this config " + "might result in performance degradation.", + typeConverter=TypeConverters.toBoolean, + ) From ae7a4078f64e9fb7b9e56a243e1ddcec8167680f Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 23 Nov 2022 10:36:30 +0800 Subject: [PATCH 2/6] update Signed-off-by: Weichen Xu --- python-package/xgboost/spark/core.py | 10 ++++------ python-package/xgboost/spark/params.py | 16 ---------------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 11ad72b6457b..b0da765bbd3c 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -152,7 +152,6 @@ class _SparkXGBParams( HasFeaturesCols, HasEnableSparseDataOptim, HasQueryIdCol, - UseQuantileDMatrix, ): num_workers = Param( Params._dummy(), @@ -759,11 +758,10 @@ def _fit(self, dataset): } dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None} - if self.isDefined(self.use_quantile_dmatrix): - use_qdm = self.getOrDefault(self.use_quantile_dmatrix) - else: - use_qdm = CUDF_INSTALLED and \ - booster_params.get("tree_method", None) in ("hist", "gpu_hist") + # If cuDF is not installed, then using DMatrix instead of QDM, + # because without cuDF, DMatrix performs better than QDM. + use_qdm = CUDF_INSTALLED and \ + booster_params.get("tree_method", None) in ("hist", "gpu_hist") def _train_booster(pandas_df_iter): """Takes in an RDD partition and outputs a booster for that partition after diff --git a/python-package/xgboost/spark/params.py b/python-package/xgboost/spark/params.py index 04a7ca9d73ef..77cfcd137d99 100644 --- a/python-package/xgboost/spark/params.py +++ b/python-package/xgboost/spark/params.py @@ -85,19 +85,3 @@ class HasQueryIdCol(Params): "query id column name", typeConverter=TypeConverters.toString, ) - - -class UseQuantileDMatrix(Params): - """ - Mixin for param use_quantile_dmatrix: - """ - use_quantile_dmatrix = Param( - Params._dummy(), - "use_quantile_dmatrix", - "This stores the boolean config of constructing quantile DMatrix instead of plain " - "DMatrix as xgboost training input. By default, if 'tree_method' param is 'hist' or " - "'gpu_hist' and 'cuDF' package is installed, the config is ON, otherwise the config " - "is OFF. Note that if you do not install 'cuDF' package, turning on this config " - "might result in performance degradation.", - typeConverter=TypeConverters.toBoolean, - ) From 1b7e2bc2639d88586ca6f6751afa9127faecb8dd Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 24 Nov 2022 11:34:11 +0800 Subject: [PATCH 3/6] update Signed-off-by: Weichen Xu --- python-package/xgboost/compat.py | 16 ++++++++++++---- python-package/xgboost/spark/core.py | 14 ++++++++------ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 5393014f835f..3902b85287da 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -44,10 +44,6 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool: PANDAS_INSTALLED = False -# cuDF -CUDF_INSTALLED = importlib.util.find_spec("cudf") is not None - - # sklearn try: from sklearn.base import BaseEstimator as XGBModelBase @@ -77,6 +73,18 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool: XGBStratifiedKFold = None +def is_cudf_installed(): + """Check cuDF installed or not""" + # Checking by `importing` instead of check `importlib.util.find_spec("cudf") is not None` + # because user might install cudf successfully but importing cudf raises issues (e.g. saying + # running on mismatched cuda version) + try: + import cudf + return True + except ImportError: + return False + + class XGBoostLabelEncoder(LabelEncoder): """Label encoder with JSON serialization methods.""" diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index b0da765bbd3c..2dae2eac2688 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -37,7 +37,7 @@ import xgboost from xgboost import XGBClassifier, XGBRanker, XGBRegressor -from xgboost.compat import CUDF_INSTALLED +from xgboost.compat import is_cudf_installed from .data import ( _read_csr_matrix_from_unwrapped_spark_vec, @@ -57,7 +57,6 @@ HasEnableSparseDataOptim, HasFeaturesCols, HasQueryIdCol, - UseQuantileDMatrix, ) from .utils import ( CommunicatorContext, @@ -758,10 +757,7 @@ def _fit(self, dataset): } dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None} - # If cuDF is not installed, then using DMatrix instead of QDM, - # because without cuDF, DMatrix performs better than QDM. - use_qdm = CUDF_INSTALLED and \ - booster_params.get("tree_method", None) in ("hist", "gpu_hist") + use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist") def _train_booster(pandas_df_iter): """Takes in an RDD partition and outputs a booster for that partition after @@ -775,6 +771,12 @@ def _train_booster(pandas_df_iter): gpu_id = None + # If cuDF is not installed, then using DMatrix instead of QDM, + # because without cuDF, DMatrix performs better than QDM. + # Note: Checking `is_cudf_installed` in spark worker side because + # spark worker might has different python environment with driver side. + use_qdm = use_hist and is_cudf_installed() + if use_qdm and (booster_params.get("max_bin", None) is not None): dmatrix_kwargs["max_bin"] = booster_params["max_bin"] From 061747784ceeb04e6695513b58f62338993c7968 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 24 Nov 2022 18:55:12 +0800 Subject: [PATCH 4/6] update Signed-off-by: Weichen Xu --- python-package/xgboost/compat.py | 6 +++++- python-package/xgboost/spark/data.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 3902b85287da..3177fae9f9e0 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -73,7 +73,10 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool: XGBStratifiedKFold = None -def is_cudf_installed(): +logger = logging.getLogger(__name__) + + +def is_cudf_installed() -> bool: """Check cuDF installed or not""" # Checking by `importing` instead of check `importlib.util.find_spec("cudf") is not None` # because user might install cudf successfully but importing cudf raises issues (e.g. saying @@ -82,6 +85,7 @@ def is_cudf_installed(): import cudf return True except ImportError: + _logger.exception("Importing cuDF failed, use DMatrix instead of QDM") return False diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py index 87ae145d196c..b2cf3e654e37 100644 --- a/python-package/xgboost/spark/data.py +++ b/python-package/xgboost/spark/data.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd from scipy.sparse import csr_matrix -from xgboost.compat import CUDF_INSTALLED, concat +from xgboost.compat import concat from xgboost import DataIter, DMatrix, QuantileDMatrix @@ -81,7 +81,7 @@ def _fetch(self, data: Optional[Sequence[pd.DataFrame]]) -> Optional[pd.DataFram if not data: return None - if self._device_id is not None and CUDF_INSTALLED: + if self._device_id is not None: import cudf # pylint: disable=import-error import cupy as cp # pylint: disable=import-error From 62d184811c83fca9f6f78cf65f098423486a35ed Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 24 Nov 2022 20:55:01 +0800 Subject: [PATCH 5/6] update Signed-off-by: Weichen Xu --- python-package/xgboost/compat.py | 2 +- python-package/xgboost/spark/core.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 3177fae9f9e0..096ab067fbdd 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -73,7 +73,7 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool: XGBStratifiedKFold = None -logger = logging.getLogger(__name__) +_logger = logging.getLogger(__name__) def is_cudf_installed() -> bool: diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 2dae2eac2688..751a08082c0a 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -775,7 +775,10 @@ def _train_booster(pandas_df_iter): # because without cuDF, DMatrix performs better than QDM. # Note: Checking `is_cudf_installed` in spark worker side because # spark worker might has different python environment with driver side. - use_qdm = use_hist and is_cudf_installed() + if use_gpu: + use_qdm = use_hist and is_cudf_installed() + else: + use_qdm = use_hist if use_qdm and (booster_params.get("max_bin", None) is not None): dmatrix_kwargs["max_bin"] = booster_params["max_bin"] From 943921c3c06bbc6275084d335aa9a8947b56bcbc Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 25 Nov 2022 00:30:53 +0800 Subject: [PATCH 6/6] update Signed-off-by: Weichen Xu --- python-package/xgboost/compat.py | 10 +++++----- python-package/xgboost/spark/core.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 096ab067fbdd..fab734a01361 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -76,13 +76,13 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool: _logger = logging.getLogger(__name__) -def is_cudf_installed() -> bool: - """Check cuDF installed or not""" - # Checking by `importing` instead of check `importlib.util.find_spec("cudf") is not None` - # because user might install cudf successfully but importing cudf raises issues (e.g. saying - # running on mismatched cuda version) +def is_cudf_available() -> bool: + """Check cuDF package available or not""" + if importlib.util.find_spec("cudf") is None: + return False try: import cudf + return True except ImportError: _logger.exception("Importing cuDF failed, use DMatrix instead of QDM") diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 751a08082c0a..56f63ac0f277 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -32,12 +32,12 @@ ShortType, ) from scipy.special import expit, softmax # pylint: disable=no-name-in-module +from xgboost.compat import is_cudf_available from xgboost.core import Booster from xgboost.training import train as worker_train import xgboost from xgboost import XGBClassifier, XGBRanker, XGBRegressor -from xgboost.compat import is_cudf_installed from .data import ( _read_csr_matrix_from_unwrapped_spark_vec, @@ -773,10 +773,10 @@ def _train_booster(pandas_df_iter): # If cuDF is not installed, then using DMatrix instead of QDM, # because without cuDF, DMatrix performs better than QDM. - # Note: Checking `is_cudf_installed` in spark worker side because + # Note: Checking `is_cudf_available` in spark worker side because # spark worker might has different python environment with driver side. if use_gpu: - use_qdm = use_hist and is_cudf_installed() + use_qdm = use_hist and is_cudf_available() else: use_qdm = use_hist