From cec1b57919cf3ca4b42c4e86529c992ccf175edf Mon Sep 17 00:00:00 2001 From: viclafargue Date: Mon, 16 May 2022 18:16:37 +0200 Subject: [PATCH 1/2] Fix KBinsDiscretizer bin_edges_ --- .../sklearn/preprocessing/_discretization.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py b/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py index 50ba5f1109..ee22a9e3c3 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py @@ -135,7 +135,7 @@ class KBinsDiscretizer(TransformerMixin, """ - bin_edges_ = CumlArrayDescriptor() + bin_edges_internal_ = CumlArrayDescriptor() n_bins_ = CumlArrayDescriptor() @_deprecate_pos_args(version="21.06") @@ -234,7 +234,7 @@ def fit(self, X, y=None) -> "KBinsDiscretizer": 'decreasing the number of bins.' % jj) n_bins[jj] = len(bin_edges[jj]) - 1 - self.bin_edges_ = bin_edges + self.bin_edges_internal_ = bin_edges self.n_bins_ = n_bins if 'onehot' in self.encode: @@ -303,7 +303,7 @@ def transform(self, X) -> SparseCumlArray: raise ValueError("Incorrect number of features. Expecting {}, " "received {}.".format(n_features, Xt.shape[1])) - bin_edges = self.bin_edges_ + bin_edges = self.bin_edges_internal_ for jj in range(Xt.shape[1]): # Values which are close to a bin edge are susceptible to numeric # instability. Add eps to X so these values are binned correctly @@ -353,9 +353,13 @@ def inverse_transform(self, Xt) -> SparseCumlArray: "received {}.".format(n_features, Xinv.shape[1])) for jj in range(n_features): - bin_edges = self.bin_edges_[jj] + bin_edges = self.bin_edges_internal_[jj] bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5 idxs = np.asnumpy(Xinv[:, jj]) Xinv[:, jj] = bin_centers[idxs.astype(np.int32)] return Xinv + + @property + def bin_edges_(self): + return self.bin_edges_internal_ From eb1d6e0d35f05780f1321cc7dbd1880fc8839abb Mon Sep 17 00:00:00 2001 From: viclafargue Date: Tue, 17 May 2022 10:43:58 +0200 Subject: [PATCH 2/2] Update docstring --- .../_thirdparty/sklearn/preprocessing/_discretization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py b/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py index ee22a9e3c3..c2ef7ab17a 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py @@ -93,7 +93,7 @@ class KBinsDiscretizer(TransformerMixin, np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf]) You can combine ``KBinsDiscretizer`` with - :class:`sklearn.compose.ColumnTransformer` if you only want to preprocess + :class:`cuml.compose.ColumnTransformer` if you only want to preprocess part of the features. ``KBinsDiscretizer`` might produce constant features (e.g., when @@ -104,12 +104,12 @@ class KBinsDiscretizer(TransformerMixin, Examples -------- >>> from cuml.preprocessing import KBinsDiscretizer - >>> import numpy as np + >>> import cupy as cp >>> X = [[-2, 1, -4, -1], ... [-1, 2, -3, -0.5], ... [ 0, 3, -2, 0.5], ... [ 1, 4, -1, 2]] - >>> X = np.array(X) + >>> X = cp.array(X) >>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform') >>> est.fit(X) KBinsDiscretizer(...)