From bce3da7a60a28dc82b63adaa49c12713294e9310 Mon Sep 17 00:00:00 2001
From: Weichen Xu <weichen.xu@databricks.com>
Date: Wed, 16 Nov 2022 22:51:30 +0800
Subject: [PATCH 1/6] Add use_quantile_dmatrix param and CUDF_INSTALLED flag

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
---
 python-package/xgboost/compat.py       |  5 +++++
 python-package/xgboost/spark/core.py   | 10 +++++++++-
 python-package/xgboost/spark/data.py   |  4 ++--
 python-package/xgboost/spark/params.py | 16 ++++++++++++++++
 4 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py
index 275b6621064d..5393014f835f 100644
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -43,6 +43,11 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
     pandas_concat = None
     PANDAS_INSTALLED = False
 
+
+# cuDF
+CUDF_INSTALLED = importlib.util.find_spec("cudf") is not None
+
+
 # sklearn
 try:
     from sklearn.base import BaseEstimator as XGBModelBase
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 1e544c34f228..11ad72b6457b 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -37,6 +37,7 @@
 
 import xgboost
 from xgboost import XGBClassifier, XGBRanker, XGBRegressor
+from xgboost.compat import CUDF_INSTALLED
 
 from .data import (
     _read_csr_matrix_from_unwrapped_spark_vec,
@@ -56,6 +57,7 @@
     HasEnableSparseDataOptim,
     HasFeaturesCols,
     HasQueryIdCol,
+    UseQuantileDMatrix,
 )
 from .utils import (
     CommunicatorContext,
@@ -150,6 +152,7 @@ class _SparkXGBParams(
     HasFeaturesCols,
     HasEnableSparseDataOptim,
     HasQueryIdCol,
+    UseQuantileDMatrix,
 ):
     num_workers = Param(
         Params._dummy(),
@@ -755,7 +758,12 @@ def _fit(self, dataset):
             k: v for k, v in train_call_kwargs_params.items() if v is not None
         }
         dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}
-        use_qdm = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
+
+        if self.isDefined(self.use_quantile_dmatrix):
+            use_qdm = self.getOrDefault(self.use_quantile_dmatrix)
+        else:
+            use_qdm = CUDF_INSTALLED and \
+                      booster_params.get("tree_method", None) in ("hist", "gpu_hist")
 
         def _train_booster(pandas_df_iter):
             """Takes in an RDD partition and outputs a booster for that partition after
diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py
index b2cf3e654e37..87ae145d196c 100644
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pandas as pd
 from scipy.sparse import csr_matrix
-from xgboost.compat import concat
+from xgboost.compat import CUDF_INSTALLED, concat
 
 from xgboost import DataIter, DMatrix, QuantileDMatrix
 
@@ -81,7 +81,7 @@ def _fetch(self, data: Optional[Sequence[pd.DataFrame]]) -> Optional[pd.DataFram
         if not data:
             return None
 
-        if self._device_id is not None:
+        if self._device_id is not None and CUDF_INSTALLED:
             import cudf  # pylint: disable=import-error
             import cupy as cp  # pylint: disable=import-error
 
diff --git a/python-package/xgboost/spark/params.py b/python-package/xgboost/spark/params.py
index 77cfcd137d99..04a7ca9d73ef 100644
--- a/python-package/xgboost/spark/params.py
+++ b/python-package/xgboost/spark/params.py
@@ -85,3 +85,19 @@ class HasQueryIdCol(Params):
         "query id column name",
         typeConverter=TypeConverters.toString,
     )
+
+
+class UseQuantileDMatrix(Params):
+    """
+    Mixin for param use_quantile_dmatrix:
+    """
+    use_quantile_dmatrix = Param(
+        Params._dummy(),
+        "use_quantile_dmatrix",
+        "This stores the boolean config of constructing quantile DMatrix instead of plain "
+        "DMatrix as xgboost training input. By default, if 'tree_method' param is 'hist' or "
+        "'gpu_hist' and 'cuDF' package is installed, the config is ON, otherwise the config "
+        "is OFF. Note that if you do not install 'cuDF' package, turning on this config "
+        "might result in performance degradation.",
+        typeConverter=TypeConverters.toBoolean,
+    )

From ae7a4078f64e9fb7b9e56a243e1ddcec8167680f Mon Sep 17 00:00:00 2001
From: Weichen Xu <weichen.xu@databricks.com>
Date: Wed, 23 Nov 2022 10:36:30 +0800
Subject: [PATCH 2/6] Remove use_quantile_dmatrix param; gate QDM on cuDF

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
---
 python-package/xgboost/spark/core.py   | 10 ++++------
 python-package/xgboost/spark/params.py | 16 ----------------
 2 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 11ad72b6457b..b0da765bbd3c 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -152,7 +152,6 @@ class _SparkXGBParams(
     HasFeaturesCols,
     HasEnableSparseDataOptim,
     HasQueryIdCol,
-    UseQuantileDMatrix,
 ):
     num_workers = Param(
         Params._dummy(),
@@ -759,11 +758,10 @@ def _fit(self, dataset):
         }
         dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}
 
-        if self.isDefined(self.use_quantile_dmatrix):
-            use_qdm = self.getOrDefault(self.use_quantile_dmatrix)
-        else:
-            use_qdm = CUDF_INSTALLED and \
-                      booster_params.get("tree_method", None) in ("hist", "gpu_hist")
+        # If cuDF is not installed, then using DMatrix instead of QDM,
+        # because without cuDF, DMatrix performs better than QDM.
+        use_qdm = CUDF_INSTALLED and \
+                  booster_params.get("tree_method", None) in ("hist", "gpu_hist")
 
         def _train_booster(pandas_df_iter):
             """Takes in an RDD partition and outputs a booster for that partition after
diff --git a/python-package/xgboost/spark/params.py b/python-package/xgboost/spark/params.py
index 04a7ca9d73ef..77cfcd137d99 100644
--- a/python-package/xgboost/spark/params.py
+++ b/python-package/xgboost/spark/params.py
@@ -85,19 +85,3 @@ class HasQueryIdCol(Params):
         "query id column name",
         typeConverter=TypeConverters.toString,
     )
-
-
-class UseQuantileDMatrix(Params):
-    """
-    Mixin for param use_quantile_dmatrix:
-    """
-    use_quantile_dmatrix = Param(
-        Params._dummy(),
-        "use_quantile_dmatrix",
-        "This stores the boolean config of constructing quantile DMatrix instead of plain "
-        "DMatrix as xgboost training input. By default, if 'tree_method' param is 'hist' or "
-        "'gpu_hist' and 'cuDF' package is installed, the config is ON, otherwise the config "
-        "is OFF. Note that if you do not install 'cuDF' package, turning on this config "
-        "might result in performance degradation.",
-        typeConverter=TypeConverters.toBoolean,
-    )

From 1b7e2bc2639d88586ca6f6751afa9127faecb8dd Mon Sep 17 00:00:00 2001
From: Weichen Xu <weichen.xu@databricks.com>
Date: Thu, 24 Nov 2022 11:34:11 +0800
Subject: [PATCH 3/6] Check cuDF by importing it on the Spark worker

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
---
 python-package/xgboost/compat.py     | 16 ++++++++++++----
 python-package/xgboost/spark/core.py | 14 ++++++++------
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py
index 5393014f835f..3902b85287da 100644
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -44,10 +44,6 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
     PANDAS_INSTALLED = False
 
 
-# cuDF
-CUDF_INSTALLED = importlib.util.find_spec("cudf") is not None
-
-
 # sklearn
 try:
     from sklearn.base import BaseEstimator as XGBModelBase
@@ -77,6 +73,18 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
     XGBStratifiedKFold = None
 
 
+def is_cudf_installed():
+    """Check cuDF installed or not"""
+    # Checking by `importing` instead of check `importlib.util.find_spec("cudf") is not None`
+    # because user might install cudf successfully but importing cudf raises issues (e.g. saying
+    # running on mismatched cuda version)
+    try:
+        import cudf
+        return True
+    except ImportError:
+        return False
+
+
 class XGBoostLabelEncoder(LabelEncoder):
     """Label encoder with JSON serialization methods."""
 
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index b0da765bbd3c..2dae2eac2688 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -37,7 +37,7 @@
 
 import xgboost
 from xgboost import XGBClassifier, XGBRanker, XGBRegressor
-from xgboost.compat import CUDF_INSTALLED
+from xgboost.compat import is_cudf_installed
 
 from .data import (
     _read_csr_matrix_from_unwrapped_spark_vec,
@@ -57,7 +57,6 @@
     HasEnableSparseDataOptim,
     HasFeaturesCols,
     HasQueryIdCol,
-    UseQuantileDMatrix,
 )
 from .utils import (
     CommunicatorContext,
@@ -758,10 +757,7 @@ def _fit(self, dataset):
         }
         dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}
 
-        # If cuDF is not installed, then using DMatrix instead of QDM,
-        # because without cuDF, DMatrix performs better than QDM.
-        use_qdm = CUDF_INSTALLED and \
-                  booster_params.get("tree_method", None) in ("hist", "gpu_hist")
+        use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
 
         def _train_booster(pandas_df_iter):
             """Takes in an RDD partition and outputs a booster for that partition after
@@ -775,6 +771,12 @@ def _train_booster(pandas_df_iter):
 
             gpu_id = None
 
+            # If cuDF is not installed, then using DMatrix instead of QDM,
+            # because without cuDF, DMatrix performs better than QDM.
+            # Note: Checking `is_cudf_installed` in spark worker side because
+            # spark worker might has different python environment with driver side.
+            use_qdm = use_hist and is_cudf_installed()
+
             if use_qdm and (booster_params.get("max_bin", None) is not None):
                 dmatrix_kwargs["max_bin"] = booster_params["max_bin"]
 

From 061747784ceeb04e6695513b58f62338993c7968 Mon Sep 17 00:00:00 2001
From: Weichen Xu <weichen.xu@databricks.com>
Date: Thu, 24 Nov 2022 18:55:12 +0800
Subject: [PATCH 4/6] Log cuDF import failure; drop CUDF_INSTALLED in data.py

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
---
 python-package/xgboost/compat.py     | 6 +++++-
 python-package/xgboost/spark/data.py | 4 ++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py
index 3902b85287da..3177fae9f9e0 100644
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -73,7 +73,10 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
     XGBStratifiedKFold = None
 
 
-def is_cudf_installed():
+logger = logging.getLogger(__name__)
+
+
+def is_cudf_installed() -> bool:
     """Check cuDF installed or not"""
     # Checking by `importing` instead of check `importlib.util.find_spec("cudf") is not None`
     # because user might install cudf successfully but importing cudf raises issues (e.g. saying
@@ -82,6 +85,7 @@ def is_cudf_installed():
         import cudf
         return True
     except ImportError:
+        _logger.exception("Importing cuDF failed, use DMatrix instead of QDM")
         return False
 
 
diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py
index 87ae145d196c..b2cf3e654e37 100644
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pandas as pd
 from scipy.sparse import csr_matrix
-from xgboost.compat import CUDF_INSTALLED, concat
+from xgboost.compat import concat
 
 from xgboost import DataIter, DMatrix, QuantileDMatrix
 
@@ -81,7 +81,7 @@ def _fetch(self, data: Optional[Sequence[pd.DataFrame]]) -> Optional[pd.DataFram
         if not data:
             return None
 
-        if self._device_id is not None and CUDF_INSTALLED:
+        if self._device_id is not None:
             import cudf  # pylint: disable=import-error
             import cupy as cp  # pylint: disable=import-error
 

From 62d184811c83fca9f6f78cf65f098423486a35ed Mon Sep 17 00:00:00 2001
From: Weichen Xu <weichen.xu@databricks.com>
Date: Thu, 24 Nov 2022 20:55:01 +0800
Subject: [PATCH 5/6] Fix _logger name; require cuDF for QDM only on GPU

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
---
 python-package/xgboost/compat.py     | 2 +-
 python-package/xgboost/spark/core.py | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py
index 3177fae9f9e0..096ab067fbdd 100644
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -73,7 +73,7 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
     XGBStratifiedKFold = None
 
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 def is_cudf_installed() -> bool:
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 2dae2eac2688..751a08082c0a 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -775,7 +775,10 @@ def _train_booster(pandas_df_iter):
             # because without cuDF, DMatrix performs better than QDM.
             # Note: Checking `is_cudf_installed` in spark worker side because
             # spark worker might has different python environment with driver side.
-            use_qdm = use_hist and is_cudf_installed()
+            if use_gpu:
+                use_qdm = use_hist and is_cudf_installed()
+            else:
+                use_qdm = use_hist
 
             if use_qdm and (booster_params.get("max_bin", None) is not None):
                 dmatrix_kwargs["max_bin"] = booster_params["max_bin"]

From 943921c3c06bbc6275084d335aa9a8947b56bcbc Mon Sep 17 00:00:00 2001
From: Weichen Xu <weichen.xu@databricks.com>
Date: Fri, 25 Nov 2022 00:30:53 +0800
Subject: [PATCH 6/6] Rename to is_cudf_available; pre-check with find_spec

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
---
 python-package/xgboost/compat.py     | 10 +++++-----
 python-package/xgboost/spark/core.py |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py
index 096ab067fbdd..fab734a01361 100644
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -76,13 +76,13 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
 _logger = logging.getLogger(__name__)
 
 
-def is_cudf_installed() -> bool:
-    """Check cuDF installed or not"""
-    # Checking by `importing` instead of check `importlib.util.find_spec("cudf") is not None`
-    # because user might install cudf successfully but importing cudf raises issues (e.g. saying
-    # running on mismatched cuda version)
+def is_cudf_available() -> bool:
+    """Check whether the cuDF package is installed and can be imported."""
+    if importlib.util.find_spec("cudf") is None:
+        return False
     try:
         import cudf
+
         return True
     except ImportError:
         _logger.exception("Importing cuDF failed, use DMatrix instead of QDM")
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 751a08082c0a..56f63ac0f277 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -32,12 +32,12 @@
     ShortType,
 )
 from scipy.special import expit, softmax  # pylint: disable=no-name-in-module
+from xgboost.compat import is_cudf_available
 from xgboost.core import Booster
 from xgboost.training import train as worker_train
 
 import xgboost
 from xgboost import XGBClassifier, XGBRanker, XGBRegressor
-from xgboost.compat import is_cudf_installed
 
 from .data import (
     _read_csr_matrix_from_unwrapped_spark_vec,
@@ -773,10 +773,10 @@ def _train_booster(pandas_df_iter):
 
             # If cuDF is not installed, then using DMatrix instead of QDM,
             # because without cuDF, DMatrix performs better than QDM.
-            # Note: Checking `is_cudf_installed` in spark worker side because
+            # Note: `is_cudf_available` is called on the Spark worker side because
             # spark worker might has different python environment with driver side.
             if use_gpu:
-                use_qdm = use_hist and is_cudf_installed()
+                use_qdm = use_hist and is_cudf_available()
             else:
                 use_qdm = use_hist