From 32fe2691f7eea7d2d2ed3bf3460965450f2ba256 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 23 Oct 2024 13:02:21 +0200 Subject: [PATCH 001/131] add finiteness_checker pybind11 bindings --- onedal/dal.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/onedal/dal.cpp b/onedal/dal.cpp index 814b22aa8b..14e0aed35d 100644 --- a/onedal/dal.cpp +++ b/onedal/dal.cpp @@ -75,6 +75,9 @@ namespace oneapi::dal::python { #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240001 ONEDAL_PY_INIT_MODULE(logistic_regression); #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240001 + #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 + ONEDAL_PY_INIT_MODULE(finiteness_checker); + #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 #endif // ONEDAL_DATA_PARALLEL_SPMD #ifdef ONEDAL_DATA_PARALLEL_SPMD @@ -133,6 +136,9 @@ namespace oneapi::dal::python { #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240001 init_logistic_regression(m); #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240001 + #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 + init_finiteness_checker(m); + #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 } #endif // ONEDAL_DATA_PARALLEL_SPMD From cdbf1b5e5bfdc8036beee80545ea11e553ceac99 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 23 Oct 2024 13:04:00 +0200 Subject: [PATCH 002/131] added finiteness checker --- onedal/primitives/finiteness_checker.cpp | 96 ++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 onedal/primitives/finiteness_checker.cpp diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp new file mode 100644 index 0000000000..6aaf7c52d6 --- /dev/null +++ b/onedal/primitives/finiteness_checker.cpp @@ -0,0 +1,96 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "oneapi/dal/algo/finiteness_checker.hpp" + +#include "onedal/common.hpp" +#include "onedal/version.hpp" + +namespace py = pybind11; + +namespace oneapi::dal::python { + +template +struct method2t { + method2t(const Task& task, const Ops& ops) : ops(ops) {} + + template + auto operator()(const py::dict& params) { + using namespace finiteness_checker; + + const auto method = params["method"].cast(); + + ONEDAL_PARAM_DISPATCH_VALUE(method, "dense", ops, Float, method::dense); + ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); + ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); + } + + Ops ops; +}; + +struct params2desc { + template + auto operator()(const pybind11::dict& params) { + using namespace dal::finiteness_checker; + + auto desc = descriptor(); + desc.set_allow_NaN(params["allow_nan"].cast()); + return desc; + } +}; + +template +void init_compute_ops(py::module_& m) { + m.def("compute", + [](const Policy& policy, + const py::dict& params, + const table& data) { + using namespace finiteness_checker; + using input_t = compute_input; + + compute_ops ops(policy, input_t{ data}, params2desc{}); + return fptype2t{ method2t{ Task{}, ops } }(params); + }); +} + +template +void init_compute_result(py::module_& m) { + using namespace finiteness_checker; + using result_t = compute_result; + + py::class_(m, "compute_result") + .def(py::init()) + .DEF_ONEDAL_PY_PROPERTY(finite, result_t) +} + +ONEDAL_PY_TYPE2STR(finiteness_checker::task::compute, "compute"); + +ONEDAL_PY_DECLARE_INSTANTIATOR(init_compute_ops); +ONEDAL_PY_DECLARE_INSTANTIATOR(init_compute_result); + +ONEDAL_PY_INIT_MODULE(finiteness_checker) { + using namespace dal::detail; + using namespace finiteness_checker; + using namespace dal::finiteness; + + using task_list = types; + auto sub = m.def_submodule("finiteness_checker"); + + ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_list, task_list); + ONEDAL_PY_INSTANTIATE(init_compute_result, sub, task_list); +} + +} // namespace oneapi::dal::python From 62674a24547cf4f7771efbd48657666ed41a97fe Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:37:53 +0200 Subject: [PATCH 003/131] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index 6aaf7c52d6..51a3ef161a 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -14,7 +14,7 @@ * limitations under the License. *******************************************************************************/ -#include "oneapi/dal/algo/finiteness_checker.hpp" +#include "oneapi/dal/algo/finiteness_checker/compute.hpp" #include "onedal/common.hpp" #include "onedal/version.hpp" From c75c23b34e714ac22eace32d4a44ae5699286262 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:46:49 +0200 Subject: [PATCH 004/131] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index 51a3ef161a..761ee28de9 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -14,7 +14,12 @@ * limitations under the License. 
*******************************************************************************/ +// fix error with missing headers +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 +#include "oneapi/dal/algo/finiteness_checker.hpp +#else #include "oneapi/dal/algo/finiteness_checker/compute.hpp" +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 #include "onedal/common.hpp" #include "onedal/version.hpp" From 6a20938aba804e69b09bf5d15c12f3128982df7d Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:47:36 +0200 Subject: [PATCH 005/131] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index 761ee28de9..531554f857 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -16,9 +16,9 @@ // fix error with missing headers #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 -#include "oneapi/dal/algo/finiteness_checker.hpp + #include "oneapi/dal/algo/finiteness_checker.hpp #else -#include "oneapi/dal/algo/finiteness_checker/compute.hpp" + #include "oneapi/dal/algo/finiteness_checker/compute.hpp" #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 #include "onedal/common.hpp" From 382d7a1268a4612f6eec162a30c02b18bcc0e041 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:47:47 +0200 Subject: [PATCH 006/131] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index 531554f857..ebc7bfd798 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -16,7 +16,7 @@ // fix error with missing headers #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 - #include "oneapi/dal/algo/finiteness_checker.hpp + #include "oneapi/dal/algo/finiteness_checker.hpp" #else #include "oneapi/dal/algo/finiteness_checker/compute.hpp" #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 From c8ffd9c0c2c9a132449020fa2ffc492b7c9bd1fb Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:54:20 +0200 Subject: [PATCH 007/131] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index ebc7bfd798..92a17a875d 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -52,7 +52,7 @@ struct params2desc { using namespace dal::finiteness_checker; auto desc = descriptor(); - desc.set_allow_NaN(params["allow_nan"].cast()); + desc.set_allow_NaN(params["allow_nan"].cast()); return desc; } }; From 9aa13d5e72340509c33986befce7ff5f3169a325 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:58:13 +0200 Subject: [PATCH 008/131] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index 92a17a875d..7189aec5d9 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -78,7 +78,7 @@ void init_compute_result(py::module_& m) { py::class_(m, "compute_result") .def(py::init()) - 
.DEF_ONEDAL_PY_PROPERTY(finite, result_t) + .DEF_ONEDAL_PY_PROPERTY(finite, result_t); } ONEDAL_PY_TYPE2STR(finiteness_checker::task::compute, "compute"); @@ -89,7 +89,7 @@ ONEDAL_PY_DECLARE_INSTANTIATOR(init_compute_result); ONEDAL_PY_INIT_MODULE(finiteness_checker) { using namespace dal::detail; using namespace finiteness_checker; - using namespace dal::finiteness; + using namespace dal::finiteness_checker; using task_list = types; auto sub = m.def_submodule("finiteness_checker"); From 84e15d598392ebf5da945468cd1cf110a25d3764 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 14:21:02 +0200 Subject: [PATCH 009/131] Rename finiteness_checker.cpp to finiteness_checker.cpp --- onedal/{primitives => utils}/finiteness_checker.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename onedal/{primitives => utils}/finiteness_checker.cpp (100%) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/utils/finiteness_checker.cpp similarity index 100% rename from onedal/primitives/finiteness_checker.cpp rename to onedal/utils/finiteness_checker.cpp From 63073c60d17c192781e30db5425eeee4832761d9 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Thu, 24 Oct 2024 10:58:08 +0200 Subject: [PATCH 010/131] Update finiteness_checker.cpp --- onedal/utils/finiteness_checker.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/onedal/utils/finiteness_checker.cpp b/onedal/utils/finiteness_checker.cpp index 7189aec5d9..6bc6a2e66b 100644 --- a/onedal/utils/finiteness_checker.cpp +++ b/onedal/utils/finiteness_checker.cpp @@ -94,8 +94,10 @@ ONEDAL_PY_INIT_MODULE(finiteness_checker) { using task_list = types; auto sub = m.def_submodule("finiteness_checker"); - ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_list, task_list); - ONEDAL_PY_INSTANTIATE(init_compute_result, sub, task_list); + #ifndef ONEDAL_DATA_PARALLEL_SPMD + ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_list, task_list); + ONEDAL_PY_INSTANTIATE(init_compute_result, sub, task_list); + #endif } } // namespace oneapi::dal::python From 3dddf2dc3469f197c7e539c73f407670173c9864 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 1 Nov 2024 00:30:15 +0100 Subject: [PATCH 011/131] add next step --- onedal/utils/validation.py | 41 +++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index bde2390e80..eb313cd980 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -20,6 +20,10 @@ import numpy as np from scipy import sparse as sp +from onedal import _backend +from ..common._policy import _get_policy +from ..datatypes import _convert_to_supported, to_table + if np.lib.NumpyVersion(np.__version__) >= np.lib.NumpyVersion("2.0.0a0"): # numpy_version >= 2.0 @@ -31,7 +35,9 @@ from sklearn.preprocessing import LabelEncoder from sklearn.utils.validation import check_array -from daal4py.sklearn.utils.validation import _assert_all_finite +from daal4py.sklearn.utils.validation import ( + _assert_all_finite as _daal4py_assert_all_finite, +) class DataConversionWarning(UserWarning): @@ -135,10 +141,10 @@ def _check_array( if force_all_finite: if sp.issparse(array): if hasattr(array, "data"): - _assert_all_finite(array.data) + _daal4py_assert_all_finite(array.data) force_all_finite = False else: - _assert_all_finite(array) + _daal4py_assert_all_finite(array) force_all_finite = False array = check_array( array=array, @@ -200,7 +206,7 @@ def _check_X_y( if y_numeric and y.dtype.kind == "O": y = 
y.astype(np.float64) if force_all_finite: - _assert_all_finite(y) + _daal4py_assert_all_finite(y) lengths = [X.shape[0], y.shape[0]] uniques = np.unique(lengths) @@ -285,7 +291,7 @@ def _type_of_target(y): # check float and contains non-integer float values if y.dtype.kind == "f" and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] - _assert_all_finite(y) + _daal4py_assert_all_finite(y) return "continuous" + suffix if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): @@ -430,3 +436,28 @@ def _is_csr(x): return isinstance(x, sp.csr_matrix) or ( hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) ) + + +def _assert_all_finite(X, allow_nan=False, input_name=""): + # NOTE: This function does not respond to target_offload, as the memory movement + # is likely to cause a significant reduction in performance + # requires extracting the queue to generate a policy for converting the data to fp32 + X = to_table(_convert_to_supported(_get_policy(None, X), X)) + if not _backend.finiteness_checker(allow_nan=allow_nan).compute(X).finite: + type_err = "infinity" if allow_nan else "NaN, infinity" + padded_input_name = input_name + " " if input_name else "" + msg_err = f"Input {padded_input_name}contains {type_err}." + raise ValueError(msg_err) + + +def assert_all_finite( + X, + *, + allow_nan=False, + input_name="", +): + _assert_all_finite( + X.data if sp.issparse(X) else X, + allow_nan=allow_nan, + input_name=input_name, + ) From 1e1213e60e2d52310b26625a1c749379affcd007 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 1 Nov 2024 00:37:07 +0100 Subject: [PATCH 012/131] follow conventions --- onedal/utils/validation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index eb313cd980..3a9d849486 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -442,8 +442,11 @@ def _assert_all_finite(X, allow_nan=False, input_name=""): # NOTE: This function does not respond to target_offload, as the memory movement # is likely to cause a significant reduction in performance # requires extracting the queue to generate a policy for converting the data to fp32 - X = to_table(_convert_to_supported(_get_policy(None, X), X)) - if not _backend.finiteness_checker(allow_nan=allow_nan).compute(X).finite: + policy = _get_policy(None, X) + X = to_table(_convert_to_supported(policy, X)) + if not _backend.finiteness_checker.compute( + policy, {"allow_nan": allow_nan}, X + ).finite: type_err = "infinity" if allow_nan else "NaN, infinity" padded_input_name = input_name + " " if input_name else "" msg_err = f"Input {padded_input_name}contains {type_err}." 
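
For orientation between the patches: the two patches above introduce a Python-level finiteness check that dispatches to the oneDAL backend instead of daal4py. Below is a minimal usage sketch, assuming a build of the package that already contains assert_all_finite in onedal.utils.validation with the signature shown in the diff; the array contents, dtype, and the allow_nan flag are illustrative only and are not part of the patch series.

    # Usage sketch only -- assumes onedal is built with the patches above applied.
    import numpy as np

    from onedal.utils.validation import assert_all_finite

    X = np.random.default_rng(0).standard_normal((1000, 20)).astype(np.float32)

    # Clean data passes silently; the check is dispatched to the oneDAL backend.
    assert_all_finite(X)

    # NaN (or inf) raises ValueError unless allow_nan=True, which tolerates NaN
    # but still rejects infinities.
    X[0, 0] = np.nan
    try:
        assert_all_finite(X, input_name="X")
    except ValueError as exc:
        print(exc)  # "Input X contains NaN, infinity."
    assert_all_finite(X, allow_nan=True)

The later patches shown in this series mostly adjust internals (module placement, backend parameter dispatch, floating-point type selection), so the public call pattern above stays the same.
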
From 053171340099a68ced8fec11f79371f6bac253ef Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 1 Nov 2024 00:38:57 +0100 Subject: [PATCH 013/131] make xtable explicit --- onedal/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 3a9d849486..67c7a2dee0 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -443,9 +443,9 @@ def _assert_all_finite(X, allow_nan=False, input_name=""): # is likely to cause a significant reduction in performance # requires extracting the queue to generate a policy for converting the data to fp32 policy = _get_policy(None, X) - X = to_table(_convert_to_supported(policy, X)) + X_table = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute( - policy, {"allow_nan": allow_nan}, X + policy, {"allow_nan": allow_nan}, X_table ).finite: type_err = "infinity" if allow_nan else "NaN, infinity" padded_input_name = input_name + " " if input_name else "" From e831167b32b85135b9e685c7dd83227db89603e2 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 1 Nov 2024 00:42:29 +0100 Subject: [PATCH 014/131] remove comment --- onedal/utils/validation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 67c7a2dee0..10bb920291 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -441,7 +441,6 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): # NOTE: This function does not respond to target_offload, as the memory movement # is likely to cause a significant reduction in performance - # requires extracting the queue to generate a policy for converting the data to fp32 policy = _get_policy(None, X) X_table = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute( From d6eb1d05e9de1c6bc0a1f9683659ddef4540480d Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 00:57:56 +0100 Subject: [PATCH 015/131] Update validation.py --- onedal/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 10bb920291..f4597cd01c 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -21,8 +21,8 @@ import numpy as np from scipy import sparse as sp from onedal import _backend -from ..common._policy import _get_policy -from ..datatypes import _convert_to_supported, to_table +from onedal.common._policy import _get_policy +from onedal.datatypes import _convert_to_supported, to_table if np.lib.NumpyVersion(np.__version__) >= np.lib.NumpyVersion("2.0.0a0"): From fb30d6e69a2c6244112079a9c6a0dd75cd9a3a85 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:34:52 +0100 Subject: [PATCH 016/131] Update __init__.py --- onedal/utils/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/onedal/utils/__init__.py b/onedal/utils/__init__.py index 0a1b05fbc2..0bc9ed35a3 100644 --- a/onedal/utils/__init__.py +++ b/onedal/utils/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +import scipy.sparse as sp from .validation import ( _check_array, @@ -22,7 +23,6 @@ _column_or_1d, _is_arraylike, _is_arraylike_not_scalar, - _is_csr, _is_integral_float, _is_multilabel, _num_features, @@ -31,6 +31,12 @@ _validate_targets, ) +def _is_csr(x): + """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" + return isinstance(x, sp.csr_matrix) or ( + hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) + ) + __all__ = [ "_column_or_1d", "_validate_targets", From 63a18c2f66ad93720408c33aa3a3b05f74d58f48 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:35:12 +0100 Subject: [PATCH 017/131] Update validation.py --- onedal/utils/validation.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index f4597cd01c..1421bfaefc 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -431,13 +431,6 @@ def _num_samples(x): raise TypeError(message) from type_error -def _is_csr(x): - """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" - return isinstance(x, sp.csr_matrix) or ( - hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) - ) - - def _assert_all_finite(X, allow_nan=False, input_name=""): # NOTE: This function does not respond to target_offload, as the memory movement # is likely to cause a significant reduction in performance From 76c0856a12c04d4d3eb13d3c21382b1b84a23dc7 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:40:03 +0100 Subject: [PATCH 018/131] Update __init__.py --- onedal/utils/__init__.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/onedal/utils/__init__.py b/onedal/utils/__init__.py index 0bc9ed35a3..a7e1495cf9 100644 --- a/onedal/utils/__init__.py +++ b/onedal/utils/__init__.py @@ -13,8 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -import scipy.sparse as sp +def _is_csr(x): + """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" + return isinstance(x, sp.csr_matrix) or ( + hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) + ) from .validation import ( _check_array, _check_classification_targets, @@ -23,6 +27,7 @@ _column_or_1d, _is_arraylike, _is_arraylike_not_scalar, + _is_csr, _is_integral_float, _is_multilabel, _num_features, @@ -31,12 +36,6 @@ _validate_targets, ) -def _is_csr(x): - """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" - return isinstance(x, sp.csr_matrix) or ( - hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) - ) - __all__ = [ "_column_or_1d", "_validate_targets", From 7deb2bbce9c0435b2484ae0fcfc754f5521bb01d Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:40:24 +0100 Subject: [PATCH 019/131] Update __init__.py --- onedal/utils/__init__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/onedal/utils/__init__.py b/onedal/utils/__init__.py index a7e1495cf9..0a1b05fbc2 100644 --- a/onedal/utils/__init__.py +++ b/onedal/utils/__init__.py @@ -14,11 +14,6 @@ # limitations under the License. 
# ============================================================================== -def _is_csr(x): - """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" - return isinstance(x, sp.csr_matrix) or ( - hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) - ) from .validation import ( _check_array, _check_classification_targets, From ed46b2907bb0a00678dab9c2516543941471b64a Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:41:17 +0100 Subject: [PATCH 020/131] Update validation.py --- onedal/utils/validation.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 1421bfaefc..f4597cd01c 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -431,6 +431,13 @@ def _num_samples(x): raise TypeError(message) from type_error +def _is_csr(x): + """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" + return isinstance(x, sp.csr_matrix) or ( + hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) + ) + + def _assert_all_finite(X, allow_nan=False, input_name=""): # NOTE: This function does not respond to target_offload, as the memory movement # is likely to cause a significant reduction in performance From 67d6273f3520232daad4f7f16b49291240600e16 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:42:45 +0100 Subject: [PATCH 021/131] Update _data_conversion.py --- onedal/datatypes/_data_conversion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 0caac10884..011a2eb89d 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -17,11 +17,11 @@ import warnings import numpy as np +import scipy.sparse as sp from daal4py.sklearn._utils import make2d from onedal import _backend, _is_dpc_backend -from ..utils import _is_csr from ..utils._dpep_helpers import is_dpctl_available dpctl_available = is_dpctl_available("0.14") @@ -46,7 +46,7 @@ def convert_one_to_table(arg): if isinstance(arg, dpt.usm_ndarray): return _backend.dpctl_to_table(arg) - if not _is_csr(arg): + if not sp.issparse(arg): arg = make2d(arg) return _backend.to_table(arg) From 8abead922bd8c2fceff7e8e6dffe4b76389fe1d4 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:58:03 +0100 Subject: [PATCH 022/131] Update _data_conversion.py --- onedal/datatypes/_data_conversion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 386101eb14..12dc24eca3 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -103,7 +103,7 @@ def convert_one_to_table(arg, sua_iface=None): if sua_iface: return _backend.sua_iface_to_table(arg) - if not sp.sparse(arg): + if not sp.issparse(arg): arg = make2d(arg) return _backend.to_table(arg) @@ -130,7 +130,7 @@ def convert_one_to_table(arg, sua_iface=None): "SYCL usm array conversion to table requires the DPC backend" ) - if not sp.sparse(arg): + if not sp.issparse(arg): arg = make2d(arg) return _backend.to_table(arg) From 47d0f8bf7f0544089bcc2626dc06863be663757b Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 00:39:18 +0100 Subject: [PATCH 023/131] Update policy_common.cpp --- onedal/common/policy_common.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 
bfb3c02cbd..3d8443378d 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -31,6 +31,10 @@ constexpr const char py_capsule_name[] = "PyCapsule"; constexpr const char get_capsule_name[] = "_get_capsule"; constexpr const char queue_capsule_name[] = "SyclQueueRef"; constexpr const char context_capsule_name[] = "SyclContextRef"; +constexpr const char device_name[] = "sycl_device"; +constexpr const char filter_name[] = "filter_selector"; + + sycl::queue extract_queue(py::capsule capsule) { constexpr const char* gtr_name = queue_capsule_name; @@ -79,7 +83,12 @@ sycl::queue get_queue_from_python(const py::object& syclobj) { const auto caps = syclobj.cast(); return extract_from_capsule(std::move(caps)); } - else { + else if (py::hasattr(syclobj, device_name) && py::hasattr(syclobj.attr(device_name), filter_name)) { + auto attr = syclobj.attr(device_name).attr(filter_name); + return get_queue_by_filter_string(attr.cast()); + } + else + { throw std::runtime_error("Unable to interpret \"syclobj\""); } } From e48c2bdca15b554e9b325508b8827465ae6d34bf Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 00:45:56 +0100 Subject: [PATCH 024/131] Update policy_common.cpp --- onedal/common/policy_common.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 3d8443378d..364f248992 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -32,7 +32,7 @@ constexpr const char get_capsule_name[] = "_get_capsule"; constexpr const char queue_capsule_name[] = "SyclQueueRef"; constexpr const char context_capsule_name[] = "SyclContextRef"; constexpr const char device_name[] = "sycl_device"; -constexpr const char filter_name[] = "filter_selector"; +constexpr const char get_filter_name[] = "get_filter_string"; @@ -83,9 +83,9 @@ sycl::queue get_queue_from_python(const py::object& syclobj) { const auto caps = syclobj.cast(); return extract_from_capsule(std::move(caps)); } - else if (py::hasattr(syclobj, device_name) && py::hasattr(syclobj.attr(device_name), filter_name)) { - auto attr = syclobj.attr(device_name).attr(filter_name); - return get_queue_by_filter_string(attr.cast()); + else if (py::hasattr(syclobj, device_name) && py::hasattr(syclobj.attr(device_name), get_filter_name)) { + auto attr = syclobj.attr(device_name).attr(get_filter_name); + return get_queue_by_filter_string(attr().cast()); } else { From c6751c4bc2dea6fd8e38c470d9f398bb0b8f8161 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 00:47:04 +0100 Subject: [PATCH 025/131] Update _policy.py --- onedal/common/_policy.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/onedal/common/_policy.py b/onedal/common/_policy.py index 90705854f6..abd267f4a6 100644 --- a/onedal/common/_policy.py +++ b/onedal/common/_policy.py @@ -48,12 +48,7 @@ def __init__(self): if _is_dpc_backend: - from onedal._device_offload import DummySyclQueue - class _DataParallelInteropPolicy(_backend.data_parallel_policy): def __init__(self, queue): self._queue = queue - if isinstance(queue, DummySyclQueue): - super().__init__(self._queue.sycl_device.get_filter_string()) - return super().__init__(self._queue) From f3e4a3a678298b7a7b135bae67ef29e293a45ee5 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 01:01:33 +0100 Subject: [PATCH 026/131] Update policy_common.cpp --- onedal/common/policy_common.cpp | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff 
--git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 364f248992..3bd18c3689 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -34,8 +34,6 @@ constexpr const char context_capsule_name[] = "SyclContextRef"; constexpr const char device_name[] = "sycl_device"; constexpr const char get_filter_name[] = "get_filter_string"; - - sycl::queue extract_queue(py::capsule capsule) { constexpr const char* gtr_name = queue_capsule_name; constexpr std::size_t gtr_size = sizeof(queue_capsule_name); @@ -74,6 +72,20 @@ sycl::queue get_queue_by_get_capsule(const py::object& syclobj) { return extract_from_capsule(std::move(capsule)); } +sycl::queue get_queue_by_filter_string(const std::string& filter) { + filter_selector_wrapper selector{ filter }; + return sycl::queue{ selector }; +} + +sycl::queue get_queue_by_device_id(std::uint32_t id) { + if (auto device = get_device_by_id(id)) { + return sycl::queue{ device.value() }; + } + else { + throw std::runtime_error(unknown_device); + } +} + sycl::queue get_queue_from_python(const py::object& syclobj) { static auto pycapsule = py::cast(py_capsule_name); if (py::hasattr(syclobj, get_capsule_name)) { @@ -93,20 +105,6 @@ sycl::queue get_queue_from_python(const py::object& syclobj) { } } -sycl::queue get_queue_by_filter_string(const std::string& filter) { - filter_selector_wrapper selector{ filter }; - return sycl::queue{ selector }; -} - -sycl::queue get_queue_by_device_id(std::uint32_t id) { - if (auto device = get_device_by_id(id)) { - return sycl::queue{ device.value() }; - } - else { - throw std::runtime_error(unknown_device); - } -} - std::string get_device_name(const sycl::queue& queue) { const auto& device = queue.get_device(); if (device.is_gpu()) { From 39cdb5f3c48810a178b12608fa18eb2a8edecfd0 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 01:28:12 +0100 Subject: [PATCH 027/131] Rename finiteness_checker.cpp to finiteness_checker.cpp --- onedal/{utils => primitives}/finiteness_checker.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename onedal/{utils => primitives}/finiteness_checker.cpp (100%) diff --git a/onedal/utils/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp similarity index 100% rename from onedal/utils/finiteness_checker.cpp rename to onedal/primitives/finiteness_checker.cpp From 0f39613063f153d054826cbcac9f931232c14177 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 01:33:21 +0100 Subject: [PATCH 028/131] Create finiteness_checker.py --- onedal/primitives/finiteness_checker.py | 48 +++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 onedal/primitives/finiteness_checker.py diff --git a/onedal/primitives/finiteness_checker.py b/onedal/primitives/finiteness_checker.py new file mode 100644 index 0000000000..c1a2b5c364 --- /dev/null +++ b/onedal/primitives/finiteness_checker.py @@ -0,0 +1,48 @@ +# ============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import scipy.sparse as sp + +from onedal import _backend +from onedal.common._policy import _get_policy +from onedal.datatypes import _convert_to_supported, to_table + + +def _assert_all_finite(X, allow_nan=False, input_name=""): + # NOTE: This function does not respond to target_offload, as the memory movement + # is likely to cause a significant reduction in performance + policy = _get_policy(None, X) + X_table = to_table(_convert_to_supported(policy, X)) + if not _backend.finiteness_checker.compute( + policy, {"allow_nan": allow_nan}, X_table + ).finite: + type_err = "infinity" if allow_nan else "NaN, infinity" + padded_input_name = input_name + " " if input_name else "" + msg_err = f"Input {padded_input_name}contains {type_err}." + raise ValueError(msg_err) + + +def assert_all_finite( + X, + *, + allow_nan=False, + input_name="", +): + _assert_all_finite( + X.data if sp.issparse(X) else X, + allow_nan=allow_nan, + input_name=input_name, + ) From b42cfe365d6dba0735dee79e732b6f1bddd9b1dc Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 01:33:45 +0100 Subject: [PATCH 029/131] Update validation.py --- onedal/utils/validation.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index f4597cd01c..bb501617fa 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -20,10 +20,6 @@ import numpy as np from scipy import sparse as sp -from onedal import _backend -from onedal.common._policy import _get_policy -from onedal.datatypes import _convert_to_supported, to_table - if np.lib.NumpyVersion(np.__version__) >= np.lib.NumpyVersion("2.0.0a0"): # numpy_version >= 2.0 @@ -436,30 +432,3 @@ def _is_csr(x): return isinstance(x, sp.csr_matrix) or ( hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) ) - - -def _assert_all_finite(X, allow_nan=False, input_name=""): - # NOTE: This function does not respond to target_offload, as the memory movement - # is likely to cause a significant reduction in performance - policy = _get_policy(None, X) - X_table = to_table(_convert_to_supported(policy, X)) - if not _backend.finiteness_checker.compute( - policy, {"allow_nan": allow_nan}, X_table - ).finite: - type_err = "infinity" if allow_nan else "NaN, infinity" - padded_input_name = input_name + " " if input_name else "" - msg_err = f"Input {padded_input_name}contains {type_err}." 
- raise ValueError(msg_err) - - -def assert_all_finite( - X, - *, - allow_nan=False, - input_name="", -): - _assert_all_finite( - X.data if sp.issparse(X) else X, - allow_nan=allow_nan, - input_name=input_name, - ) From 0ed615e9b44825e483aaad292187296416a08960 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 01:34:51 +0100 Subject: [PATCH 030/131] Update __init__.py --- onedal/primitives/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 39213819b5..c501a78d67 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -15,13 +15,16 @@ # ============================================================================== from .get_tree import get_tree_state_cls, get_tree_state_reg +from .finiteness_checker import assert_all_finite, _assert_all_finite from .kernel_functions import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel __all__ = [ + "assert_all_finite", "get_tree_state_cls", "get_tree_state_reg", "linear_kernel", "rbf_kernel", "poly_kernel", "sigmoid_kernel", + "_assert_all_finite", ] From f101affd5068f017edd6f399666528920a4e309f Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 13:32:43 +0100 Subject: [PATCH 031/131] attempt at fixing circular imports again --- onedal/common/_policy.py | 1 + onedal/datatypes/_data_conversion.py | 31 ++++++++-------- onedal/primitives/finiteness_checker.py | 48 ------------------------- onedal/utils/validation.py | 31 ++++++++++++++++ 4 files changed, 49 insertions(+), 62 deletions(-) delete mode 100644 onedal/primitives/finiteness_checker.py diff --git a/onedal/common/_policy.py b/onedal/common/_policy.py index abd267f4a6..0d7d8ca6a3 100644 --- a/onedal/common/_policy.py +++ b/onedal/common/_policy.py @@ -48,6 +48,7 @@ def __init__(self): if _is_dpc_backend: + class _DataParallelInteropPolicy(_backend.data_parallel_policy): def __init__(self, queue): self._queue = queue diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 12dc24eca3..af5b41eb6b 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -31,13 +31,23 @@ def _apply_and_pass(func, *args, **kwargs): if _is_dpc_backend: - from ..utils._dpep_helpers import dpctl_available, dpnp_available + try: + import dpnp - if dpctl_available: - import dpctl.tensor as dpt + def _onedal_gpu_table_to_array(table, xp=None): + # By default DPNP ndarray created with a copy. + # TODO: + # investigate why dpnp.array(table, copy=False) doesn't work. + # Work around with using dpctl.tensor.asarray. + if xp == dpnp: + return dpnp.array(dpnp.dpctl.tensor.asarray(table), copy=False) + else: + return xp.asarray(table) - if dpnp_available: - import dpnp + except ImportError: + + def _onedal_gpu_table_to_array(table, xp=None): + return xp.asarray(table) from ..common._policy import _HostInteropPolicy @@ -86,15 +96,8 @@ def convert_one_from_table(table, sycl_queue=None, sua_iface=None, xp=None): _backend.from_table(table), usm_type="device", sycl_queue=sycl_queue ) else: - xp_name = xp.__name__ - if dpnp_available and xp_name == "dpnp": - # By default DPNP ndarray created with a copy. - # TODO: - # investigate why dpnp.array(table, copy=False) doesn't work. - # Work around with using dpctl.tensor.asarray. 
- return dpnp.array(dpt.asarray(table), copy=False) - else: - return xp.asarray(table) + return _onedal_gpu_table_to_array(table, xp=xp) + return _backend.from_table(table) def convert_one_to_table(arg, sua_iface=None): diff --git a/onedal/primitives/finiteness_checker.py b/onedal/primitives/finiteness_checker.py deleted file mode 100644 index c1a2b5c364..0000000000 --- a/onedal/primitives/finiteness_checker.py +++ /dev/null @@ -1,48 +0,0 @@ -# ============================================================================== -# Copyright 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import scipy.sparse as sp - -from onedal import _backend -from onedal.common._policy import _get_policy -from onedal.datatypes import _convert_to_supported, to_table - - -def _assert_all_finite(X, allow_nan=False, input_name=""): - # NOTE: This function does not respond to target_offload, as the memory movement - # is likely to cause a significant reduction in performance - policy = _get_policy(None, X) - X_table = to_table(_convert_to_supported(policy, X)) - if not _backend.finiteness_checker.compute( - policy, {"allow_nan": allow_nan}, X_table - ).finite: - type_err = "infinity" if allow_nan else "NaN, infinity" - padded_input_name = input_name + " " if input_name else "" - msg_err = f"Input {padded_input_name}contains {type_err}." - raise ValueError(msg_err) - - -def assert_all_finite( - X, - *, - allow_nan=False, - input_name="", -): - _assert_all_finite( - X.data if sp.issparse(X) else X, - allow_nan=allow_nan, - input_name=input_name, - ) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index bb501617fa..c620b7b2e4 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -35,6 +35,10 @@ _assert_all_finite as _daal4py_assert_all_finite, ) +from onedal import _backend +from onedal.common._policy import _get_policy +from onedal.datatypes import _convert_to_supported, to_table + class DataConversionWarning(UserWarning): """Warning used to notify implicit data conversions happening in the code.""" @@ -432,3 +436,30 @@ def _is_csr(x): return isinstance(x, sp.csr_matrix) or ( hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) ) + + +def _assert_all_finite(X, allow_nan=False, input_name=""): + # NOTE: This function does not respond to target_offload, as the memory movement + # is likely to cause a significant reduction in performance + policy = _get_policy(None, X) + X_table = to_table(_convert_to_supported(policy, X)) + if not _backend.finiteness_checker.compute( + policy, {"allow_nan": allow_nan}, X_table + ).finite: + type_err = "infinity" if allow_nan else "NaN, infinity" + padded_input_name = input_name + " " if input_name else "" + msg_err = f"Input {padded_input_name}contains {type_err}." 
+ raise ValueError(msg_err) + + +def assert_all_finite( + X, + *, + allow_nan=False, + input_name="", +): + _assert_all_finite( + X.data if sp.issparse(X) else X, + allow_nan=allow_nan, + input_name=input_name, + ) From 24c0e9472a85b2023ddb21a27fe6a783adb5cc1c Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 13:33:06 +0100 Subject: [PATCH 032/131] fix isort --- onedal/primitives/__init__.py | 2 +- onedal/utils/validation.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index c501a78d67..79d72e2f16 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -14,8 +14,8 @@ # limitations under the License. # ============================================================================== +from .finiteness_checker import _assert_all_finite, assert_all_finite from .get_tree import get_tree_state_cls, get_tree_state_reg -from .finiteness_checker import assert_all_finite, _assert_all_finite from .kernel_functions import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel __all__ = [ diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index c620b7b2e4..4c5cc9746f 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -34,7 +34,6 @@ from daal4py.sklearn.utils.validation import ( _assert_all_finite as _daal4py_assert_all_finite, ) - from onedal import _backend from onedal.common._policy import _get_policy from onedal.datatypes import _convert_to_supported, to_table From 3f96166299d3ac5f07931ba64e5b0e96af345496 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 13:35:06 +0100 Subject: [PATCH 033/131] remove __init__ changes --- onedal/primitives/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 79d72e2f16..39213819b5 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -14,17 +14,14 @@ # limitations under the License. 
# ============================================================================== -from .finiteness_checker import _assert_all_finite, assert_all_finite from .get_tree import get_tree_state_cls, get_tree_state_reg from .kernel_functions import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel __all__ = [ - "assert_all_finite", "get_tree_state_cls", "get_tree_state_reg", "linear_kernel", "rbf_kernel", "poly_kernel", "sigmoid_kernel", - "_assert_all_finite", ] From d98505388701b670e037148e14490163e5675590 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 13:35:50 +0100 Subject: [PATCH 034/131] last move --- onedal/{primitives => utils}/finiteness_checker.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename onedal/{primitives => utils}/finiteness_checker.cpp (100%) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/utils/finiteness_checker.cpp similarity index 100% rename from onedal/primitives/finiteness_checker.cpp rename to onedal/utils/finiteness_checker.cpp From 90ec48b46bc0c06a1da5b07e7b5d93efc12c12b7 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 14:39:03 +0100 Subject: [PATCH 035/131] Update policy_common.cpp --- onedal/common/policy_common.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 3bd18c3689..828be51547 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -87,11 +87,10 @@ sycl::queue get_queue_by_device_id(std::uint32_t id) { } sycl::queue get_queue_from_python(const py::object& syclobj) { - static auto pycapsule = py::cast(py_capsule_name); if (py::hasattr(syclobj, get_capsule_name)) { return get_queue_by_get_capsule(syclobj); } - else if (py::isinstance(syclobj, pycapsule)) { + else if (py::isinstance(syclobj, py::capsule)) { const auto caps = syclobj.cast(); return extract_from_capsule(std::move(caps)); } From 8c2c854c06b0e4486aae563418ea047d24f528df Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 14:59:19 +0100 Subject: [PATCH 036/131] Update policy_common.cpp --- onedal/common/policy_common.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 828be51547..224e7a04e1 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -19,7 +19,6 @@ #endif // ONEDAL_DATA_PARALLEL #include - #include "onedal/common/policy_common.hpp" namespace oneapi::dal::python { @@ -90,7 +89,7 @@ sycl::queue get_queue_from_python(const py::object& syclobj) { if (py::hasattr(syclobj, get_capsule_name)) { return get_queue_by_get_capsule(syclobj); } - else if (py::isinstance(syclobj, py::capsule)) { + else if (py::isinstance(syclobj) { const auto caps = syclobj.cast(); return extract_from_capsule(std::move(caps)); } From 6fa38d7f49d95a831d663101e076530297980865 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 15:07:44 +0100 Subject: [PATCH 037/131] Update policy_common.cpp --- onedal/common/policy_common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 224e7a04e1..b10c60880d 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -89,7 +89,7 @@ sycl::queue get_queue_from_python(const py::object& syclobj) { if (py::hasattr(syclobj, get_capsule_name)) { return get_queue_by_get_capsule(syclobj); } - else if (py::isinstance(syclobj) { + else if 
(py::isinstance(syclobj)) { const auto caps = syclobj.cast(); return extract_from_capsule(std::move(caps)); } From 9c1ca9c3f29d3f00f5b10444e3e78101fb39adc0 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 17:22:59 +0100 Subject: [PATCH 038/131] Update policy_common.cpp --- onedal/common/policy_common.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index b10c60880d..284762b035 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -19,6 +19,7 @@ #endif // ONEDAL_DATA_PARALLEL #include + #include "onedal/common/policy_common.hpp" namespace oneapi::dal::python { From 4b67dbde880bfa8c3d5373473a589bd2f6577c56 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 19:27:45 +0100 Subject: [PATCH 039/131] Update validation.py --- onedal/utils/validation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 4c5cc9746f..2ea8de8f51 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -438,8 +438,6 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): - # NOTE: This function does not respond to target_offload, as the memory movement - # is likely to cause a significant reduction in performance policy = _get_policy(None, X) X_table = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute( From fa59a3c0103e9bd9d31ac1c0bf94cc9d1f86ae26 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 22:23:58 +0100 Subject: [PATCH 040/131] add testing --- onedal/utils/tests/test_validation.py | 115 ++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 onedal/utils/tests/test_validation.py diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py new file mode 100644 index 0000000000..406a2fd7bc --- /dev/null +++ b/onedal/utils/tests/test_validation.py @@ -0,0 +1,115 @@ +# ============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import time + +import numpy as np +import numpy.random as rand +import pytest +from numpy.testing import assert_raises + +from onedal.tests.utils._dataframes_support import ( + _convert_to_dataframe, + get_dataframes_and_queues, +) +from onedal.utils.validation import assert_all_finite, _assert_all_finite + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize( + "shape", + [ + [16, 2048], + [ + 2**16 + 3, + ], + [1000, 1000], + [ + 3, + ], + ], +) +@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize( + "dataframe, queue", get_dataframes_and_queues("numpy,dpnp,dpctl") +) +def test_sum_infinite_actually_finite(dtype, shape, allow_nan, dataframe, queue): + X = np.array(shape, dtype=dtype) + X.fill(np.finfo(dtype).max) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + _assert_all_finite(X, allow_nan=allow_nan) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize( + "shape", + [ + [16, 2048], + [ + 2**16 + 3, + ], + [1000, 1000], + [ + 3, + ], + ], +) +@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize("check", ["inf", "NaN", None]) +@pytest.mark.parametrize("seed", [0, int(time.time())]) +@pytest.mark.parametrize( + "dataframe, queue", get_dataframes_and_queues("numpy,dpnp,dpctl") +) +def test_assert_finite_random_location( + dtype, shape, allow_nan, check, seed, dataframe, queue +): + rand.seed(seed) + X = rand.uniform(high=np.finfo(dtype).max, size=shape).astype(dtype) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + + if check: + loc = rand.randint(0, X.size - 1) + X.reshape((-1,))[loc] = float(check) + + if check is None or (allow_nan and check == "NaN"): + _assert_all_finite(X, allow_nan=allow_nan) + else: + assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize("check", ["inf", "NaN", None]) +@pytest.mark.parametrize("seed", [0, int(time.time())]) +@pytest.mark.parametrize( + "dataframe, queue", get_dataframes_and_queues("numpy,dpnp,dpctl") +) +def test_assert_finite_random_shape_and_location( + dtype, allow_nan, check, seed, dataframe, queue +): + lb, ub = 2, 1048576 # lb is a patching condition, ub 2^20 + rand.seed(seed) + X = rand.uniform(high=np.finfo(dtype).max, size=rand.randint(lb, ub)).astype(dtype) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + + if check: + loc = rand.randint(0, X.size - 1) + X[loc] = float(check) + + if check is None or (allow_nan and check == "NaN"): + _assert_all_finite(X, allow_nan=allow_nan) + else: + assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) From 3330b3312f07a751859d8e9c7639512e5d035ed3 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 22:24:38 +0100 Subject: [PATCH 041/131] isort --- onedal/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 406a2fd7bc..5788a9ccc3 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -25,7 +25,7 @@ _convert_to_dataframe, get_dataframes_and_queues, ) -from onedal.utils.validation import assert_all_finite, _assert_all_finite +from onedal.utils.validation import 
_assert_all_finite, assert_all_finite @pytest.mark.parametrize("dtype", [np.float32, np.float64]) From 48959403bde34845dd7bcc9bb357cc6e79eb846e Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 22:53:23 +0100 Subject: [PATCH 042/131] attempt to fix module error --- onedal/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 2ea8de8f51..9b33d49fe0 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -440,7 +440,7 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): policy = _get_policy(None, X) X_table = to_table(_convert_to_supported(policy, X)) - if not _backend.finiteness_checker.compute( + if not _backend.finiteness_checker.compute.compute( policy, {"allow_nan": allow_nan}, X_table ).finite: type_err = "infinity" if allow_nan else "NaN, infinity" From 0c6dd5d284155478773d1d4cf88c4fab3c9b6558 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 23:20:51 +0100 Subject: [PATCH 043/131] add fptype --- onedal/utils/validation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 9b33d49fe0..f6e62bef14 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -439,10 +439,12 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): policy = _get_policy(None, X) - X_table = to_table(_convert_to_supported(policy, X)) - if not _backend.finiteness_checker.compute.compute( - policy, {"allow_nan": allow_nan}, X_table - ).finite: + X_t = to_table(_convert_to_supported(policy, X)) + params = { + "fptype": "float" if X_t.dtype.name == "float32" else "double", + "allow_nan": allow_nan, + } + if not _backend.finiteness_checker.compute.compute(policy, params, X_t).finite: type_err = "infinity" if allow_nan else "NaN, infinity" padded_input_name = input_name + " " if input_name else "" msg_err = f"Input {padded_input_name}contains {type_err}." 
From e2182fa81ffc0b35b485a01f43b1d0dca5bb79e1 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 23:40:24 +0100 Subject: [PATCH 044/131] fix typo --- onedal/utils/validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index f6e62bef14..1ce7e5378d 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -438,12 +438,12 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): - policy = _get_policy(None, X) - X_t = to_table(_convert_to_supported(policy, X)) params = { - "fptype": "float" if X_t.dtype.name == "float32" else "double", + "fptype": "float" if X.dtype.name == "float32" else "double", "allow_nan": allow_nan, } + policy = _get_policy(None, X) + X_t = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute.compute(policy, params, X_t).finite: type_err = "infinity" if allow_nan else "NaN, infinity" padded_input_name = input_name + " " if input_name else "" From 982ef2c8e57e56d4d018b72fa7cd3e7ba58e0ebb Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 3 Nov 2024 00:02:35 +0100 Subject: [PATCH 045/131] Update validation.py --- onedal/utils/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 1ce7e5378d..6298f3ee5a 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -438,11 +438,12 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): + policy = _get_policy(None, X) params = { "fptype": "float" if X.dtype.name == "float32" else "double", + "method": "dense", "allow_nan": allow_nan, } - policy = _get_policy(None, X) X_t = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute.compute(policy, params, X_t).finite: type_err = "infinity" if allow_nan else "NaN, infinity" From 2fb52a82bc27226d53ddfa27a462840e2011c9cb Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sun, 3 Nov 2024 02:38:41 +0100 Subject: [PATCH 046/131] remove sua_ifcae from to_table --- onedal/datatypes/_data_conversion.py | 39 +++++++++++----------------- onedal/datatypes/table.cpp | 11 ++++---- onedal/datatypes/tests/test_data.py | 12 ++++----- sklearnex/tests/test_memory_usage.py | 6 ++--- 4 files changed, 30 insertions(+), 38 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index af5b41eb6b..2ef6903041 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -19,15 +19,29 @@ import numpy as np import scipy.sparse as sp -from daal4py.sklearn._utils import make2d from onedal import _backend, _is_dpc_backend +def make2d(X): + # generalized for array-like inputs + if hasattr(X, "reshape") and hasattr(X, "ndim") and X.ndim == 1: + return X.reshape((-1, 1)) + if np.isscalar(X): + return np.atleast_2d(X) + return X + + def _apply_and_pass(func, *args, **kwargs): if len(args) == 1: return func(args[0], **kwargs) return tuple(map(lambda arg: func(arg, **kwargs), args)) +def convert_one_to_table(arg): + return _backend.to_table(arg if sp.issparse(arg) else make2d(arg)) + +def to_table(*args): + return _apply_and_pass(convert_one_to_table, *args) + if _is_dpc_backend: @@ -100,16 +114,6 @@ def convert_one_from_table(table, sycl_queue=None, sua_iface=None, xp=None): return _backend.from_table(table) - def convert_one_to_table(arg, sua_iface=None): - # Note: currently only oneDAL homogen tables are 
supported and the - # contiuginity of the input array should be checked in advance. - if sua_iface: - return _backend.sua_iface_to_table(arg) - - if not sp.issparse(arg): - arg = make2d(arg) - return _backend.to_table(arg) - else: def _convert_to_supported(policy, *data): @@ -127,22 +131,9 @@ def convert_one_from_table(table, sycl_queue=None, sua_iface=None, xp=None): ) return _backend.from_table(table) - def convert_one_to_table(arg, sua_iface=None): - if sua_iface: - raise RuntimeError( - "SYCL usm array conversion to table requires the DPC backend" - ) - - if not sp.issparse(arg): - arg = make2d(arg) - return _backend.to_table(arg) - def from_table(*args, sycl_queue=None, sua_iface=None, xp=None): return _apply_and_pass( convert_one_from_table, *args, sycl_queue=sycl_queue, sua_iface=sua_iface, xp=xp ) - -def to_table(*args, sua_iface=None): - return _apply_and_pass(convert_one_to_table, *args, sua_iface=sua_iface) diff --git a/onedal/datatypes/table.cpp b/onedal/datatypes/table.cpp index 9771306118..ce0f15936b 100644 --- a/onedal/datatypes/table.cpp +++ b/onedal/datatypes/table.cpp @@ -78,6 +78,12 @@ ONEDAL_PY_INIT_MODULE(table) { #endif // ONEDAL_DATA_PARALLEL m.def("to_table", [](py::object obj) { + #ifdef ONEDAL_DATA_PARALLEL + if (py::hasattr(obj, "__sycl_usm_array_interface__")) { + return convert_from_sua_iface(obj); + } + #endif // ONEDAL_DATA_PARALLEL + auto* obj_ptr = obj.ptr(); return convert_to_table(obj_ptr); }); @@ -87,11 +93,6 @@ ONEDAL_PY_INIT_MODULE(table) { return obj_ptr; }); -#ifdef ONEDAL_DATA_PARALLEL - m.def("sua_iface_to_table", [](py::object obj) { - return convert_from_sua_iface(obj); - }); -#endif // ONEDAL_DATA_PARALLEL } } // namespace oneapi::dal::python diff --git a/onedal/datatypes/tests/test_data.py b/onedal/datatypes/tests/test_data.py index 471d6f0a64..de47e18ad4 100644 --- a/onedal/datatypes/tests/test_data.py +++ b/onedal/datatypes/tests/test_data.py @@ -68,7 +68,7 @@ def fit(self, X, y=None): X = xp.astype(X, dtype=xp.float64) dtype = get_dtype(X) params = bs_DBSCAN._get_onedal_params(dtype) - X_table = to_table(X, sua_iface=sua_iface) + X_table = to_table(X) # TODO: # check other candidates for the dummy base oneDAL func. # oneDAL backend func is needed to check result table checks. @@ -251,7 +251,7 @@ def test_input_sua_iface_zero_copy(dataframe, queue, order, dtype): sua_iface, X_dp_namespace, _ = _get_sycl_namespace(X_dp) - X_table = to_table(X_dp, sua_iface=sua_iface) + X_table = to_table(X_dp) _assert_sua_iface_fields(X_dp, X_table) X_dp_from_table = from_table( @@ -339,7 +339,7 @@ def test_sua_iface_interop_invalid_shape(dataframe, queue, data_shape): "Unable to convert from SUA interface: only 1D & 2D tensors are allowed" ) with pytest.raises(ValueError, match=expected_err_msg): - to_table(X, sua_iface=sua_iface) + to_table(X) @pytest.mark.skipif( @@ -368,7 +368,7 @@ def test_sua_iface_interop_unsupported_dtypes(dataframe, queue, dtype): expected_err_msg = "Unable to convert from SUA interface: unknown data type" with pytest.raises(ValueError, match=expected_err_msg): - to_table(X, sua_iface=sua_iface) + to_table(X) @pytest.mark.parametrize( @@ -393,7 +393,7 @@ def test_to_table_non_contiguous_input(dataframe, queue): else: expected_err_msg = "Numpy input Could not convert Python object to onedal table." 
with pytest.raises(ValueError, match=expected_err_msg): - to_table(X, sua_iface=sua_iface) + to_table(X) @pytest.mark.skipif( @@ -411,4 +411,4 @@ def test_sua_iface_interop_if_no_dpc_backend(dataframe, queue, dtype): expected_err_msg = "SYCL usm array conversion to table requires the DPC backend" with pytest.raises(RuntimeError, match=expected_err_msg): - to_table(X, sua_iface=sua_iface) + to_table(X) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 4035832d37..6e7fdb72b5 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -142,8 +142,8 @@ class DummyEstimatorWithTableConversions(BaseEstimator): def fit(self, X, y=None): sua_iface, xp, _ = _get_sycl_namespace(X) - X_table = to_table(X, sua_iface=sua_iface) - y_table = to_table(y, sua_iface=sua_iface) + X_table = to_table(X) + y_table = to_table(y) # The presence of the fitted attributes (ending with a trailing # underscore) is required for the correct check. The cleanup of # the memory will occur at the estimator instance deletion. @@ -160,7 +160,7 @@ def predict(self, X): # fitted attributes (ending with a trailing underscore). check_is_fitted(self) sua_iface, xp, _ = _get_sycl_namespace(X) - X_table = to_table(X, sua_iface=sua_iface) + X_table = to_table(X) returned_X = from_table( X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp ) From 28dc267ab319edf2cef611340c0ab634eae036c4 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sun, 3 Nov 2024 02:42:29 +0100 Subject: [PATCH 047/131] isort and black --- onedal/datatypes/_data_conversion.py | 3 ++- onedal/datatypes/table.cpp | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 2ef6903041..c08196f1d6 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -36,9 +36,11 @@ def _apply_and_pass(func, *args, **kwargs): return func(args[0], **kwargs) return tuple(map(lambda arg: func(arg, **kwargs), args)) + def convert_one_to_table(arg): return _backend.to_table(arg if sp.issparse(arg) else make2d(arg)) + def to_table(*args): return _apply_and_pass(convert_one_to_table, *args) @@ -136,4 +138,3 @@ def from_table(*args, sycl_queue=None, sua_iface=None, xp=None): return _apply_and_pass( convert_one_from_table, *args, sycl_queue=sycl_queue, sua_iface=sua_iface, xp=xp ) - diff --git a/onedal/datatypes/table.cpp b/onedal/datatypes/table.cpp index ce0f15936b..113d881228 100644 --- a/onedal/datatypes/table.cpp +++ b/onedal/datatypes/table.cpp @@ -92,7 +92,6 @@ ONEDAL_PY_INIT_MODULE(table) { auto* obj_ptr = convert_to_pyobject(t); return obj_ptr; }); - } } // namespace oneapi::dal::python From 2f85fd4713535424395acfe5d0f72d1451c27d16 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 3 Nov 2024 08:19:57 +0100 Subject: [PATCH 048/131] Update test_memory_usage.py --- sklearnex/tests/test_memory_usage.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 6e7fdb72b5..6e3ef2b3f7 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -142,6 +142,14 @@ class DummyEstimatorWithTableConversions(BaseEstimator): def fit(self, X, y=None): sua_iface, xp, _ = _get_sycl_namespace(X) + assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] + assert y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS'] + if not (X.flags['C_CONTIGUOUS'] or 
X.flags['F_CONTIGUOUS']): + X = xp.copy(X) + if not (y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS']): + y = xp.copy(y) + assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] + assert y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS'] X_table = to_table(X) y_table = to_table(y) # The presence of the fitted attributes (ending with a trailing @@ -160,6 +168,10 @@ def predict(self, X): # fitted attributes (ending with a trailing underscore). check_is_fitted(self) sua_iface, xp, _ = _get_sycl_namespace(X) + assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] + if not (X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS']): + X = xp.copy(X) + assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] X_table = to_table(X) returned_X = from_table( X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp From 8659248f70dc78cc94058690e217fa6383747b9b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sun, 3 Nov 2024 09:19:39 +0100 Subject: [PATCH 049/131] format --- sklearnex/tests/test_memory_usage.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 6e3ef2b3f7..214c03a6ba 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -142,14 +142,14 @@ class DummyEstimatorWithTableConversions(BaseEstimator): def fit(self, X, y=None): sua_iface, xp, _ = _get_sycl_namespace(X) - assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] - assert y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS'] - if not (X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS']): + assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] + assert y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"] + if not (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): X = xp.copy(X) - if not (y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS']): + if not (y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"]): y = xp.copy(y) - assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] - assert y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS'] + assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] + assert y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"] X_table = to_table(X) y_table = to_table(y) # The presence of the fitted attributes (ending with a trailing @@ -168,10 +168,10 @@ def predict(self, X): # fitted attributes (ending with a trailing underscore). 
check_is_fitted(self) sua_iface, xp, _ = _get_sycl_namespace(X) - assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] - if not (X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS']): + assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] + if not (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): X = xp.copy(X) - assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] + assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] X_table = to_table(X) returned_X = from_table( X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp From 3827d6f38cfcd5ef065d8d6a3ea34bc749de436a Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 3 Nov 2024 11:01:26 +0100 Subject: [PATCH 050/131] Update _data_conversion.py --- onedal/datatypes/_data_conversion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index c08196f1d6..0deacf4c74 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -24,8 +24,9 @@ def make2d(X): # generalized for array-like inputs + # dpnp -1 indexing is broken, use size if hasattr(X, "reshape") and hasattr(X, "ndim") and X.ndim == 1: - return X.reshape((-1, 1)) + return X.reshape((X.size, 1)) if np.isscalar(X): return np.atleast_2d(X) return X From 55fa7d214f7a2f0398f1a83a7961a8491c587269 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 3 Nov 2024 12:28:38 +0100 Subject: [PATCH 051/131] Update _data_conversion.py --- onedal/datatypes/_data_conversion.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 0deacf4c74..353fef7e9c 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -22,16 +22,6 @@ from onedal import _backend, _is_dpc_backend -def make2d(X): - # generalized for array-like inputs - # dpnp -1 indexing is broken, use size - if hasattr(X, "reshape") and hasattr(X, "ndim") and X.ndim == 1: - return X.reshape((X.size, 1)) - if np.isscalar(X): - return np.atleast_2d(X) - return X - - def _apply_and_pass(func, *args, **kwargs): if len(args) == 1: return func(args[0], **kwargs) @@ -39,7 +29,7 @@ def _apply_and_pass(func, *args, **kwargs): def convert_one_to_table(arg): - return _backend.to_table(arg if sp.issparse(arg) else make2d(arg)) + return _backend.to_table(np.atleast_2d(arg) if np.isscalar(arg) else arg) def to_table(*args): From 175cd7899f2a3851c60cd1964c7f7fe1f48712f3 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 3 Nov 2024 13:33:34 +0100 Subject: [PATCH 052/131] Update test_validation.py --- onedal/utils/tests/test_validation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 5788a9ccc3..6f9f1c383f 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -78,12 +78,13 @@ def test_assert_finite_random_location( ): rand.seed(seed) X = rand.uniform(high=np.finfo(dtype).max, size=shape).astype(dtype) - X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) if check: loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + if check is None or (allow_nan and check == "NaN"): _assert_all_finite(X, allow_nan=allow_nan) else: @@ -103,12 +104,13 @@ def test_assert_finite_random_shape_and_location( lb, ub = 2, 1048576 # lb is a 
patching condition, ub 2^20 rand.seed(seed) X = rand.uniform(high=np.finfo(dtype).max, size=rand.randint(lb, ub)).astype(dtype) - X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) if check: loc = rand.randint(0, X.size - 1) X[loc] = float(check) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + if check is None or (allow_nan and check == "NaN"): _assert_all_finite(X, allow_nan=allow_nan) else: From 7016ad0871a5f4c5f1d0c53bad5709752a88361c Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sun, 3 Nov 2024 14:33:38 +0100 Subject: [PATCH 053/131] remove unnecessary code --- onedal/datatypes/_data_conversion.py | 1 - sklearnex/tests/test_memory_usage.py | 12 ------------ 2 files changed, 13 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 353fef7e9c..018b79524e 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -17,7 +17,6 @@ import warnings import numpy as np -import scipy.sparse as sp from onedal import _backend, _is_dpc_backend diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 214c03a6ba..6e7fdb72b5 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -142,14 +142,6 @@ class DummyEstimatorWithTableConversions(BaseEstimator): def fit(self, X, y=None): sua_iface, xp, _ = _get_sycl_namespace(X) - assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] - assert y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"] - if not (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): - X = xp.copy(X) - if not (y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"]): - y = xp.copy(y) - assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] - assert y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"] X_table = to_table(X) y_table = to_table(y) # The presence of the fitted attributes (ending with a trailing @@ -168,10 +160,6 @@ def predict(self, X): # fitted attributes (ending with a trailing underscore). 
check_is_fitted(self) sua_iface, xp, _ = _get_sycl_namespace(X) - assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] - if not (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): - X = xp.copy(X) - assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] X_table = to_table(X) returned_X = from_table( X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp From fb7375f796834d6dd6a2ed490bdcc38a018f80e3 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 06:57:01 +0100 Subject: [PATCH 054/131] make reviewer changes --- onedal/utils/finiteness_checker.cpp | 2 +- onedal/utils/tests/test_validation.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/onedal/utils/finiteness_checker.cpp b/onedal/utils/finiteness_checker.cpp index 6bc6a2e66b..2b8d84bd6f 100644 --- a/onedal/utils/finiteness_checker.cpp +++ b/onedal/utils/finiteness_checker.cpp @@ -66,7 +66,7 @@ void init_compute_ops(py::module_& m) { using namespace finiteness_checker; using input_t = compute_input; - compute_ops ops(policy, input_t{ data}, params2desc{}); + compute_ops ops(policy, input_t{ data }, params2desc{}); return fptype2t{ method2t{ Task{}, ops } }(params); }); } diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 6f9f1c383f..5f92a64bf7 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -19,7 +19,6 @@ import numpy as np import numpy.random as rand import pytest -from numpy.testing import assert_raises from onedal.tests.utils._dataframes_support import ( _convert_to_dataframe, @@ -88,7 +87,9 @@ def test_assert_finite_random_location( if check is None or (allow_nan and check == "NaN"): _assert_all_finite(X, allow_nan=allow_nan) else: - assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + with pytest.raises(ValueError, match=msg_err): + _assert_all_finite(X, allow_nan=allow_nan) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -114,4 +115,6 @@ def test_assert_finite_random_shape_and_location( if check is None or (allow_nan and check == "NaN"): _assert_all_finite(X, allow_nan=allow_nan) else: - assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." 
+ with pytest.raises(ValueError, match=msg_err): + _assert_all_finite(X, allow_nan=allow_nan) From 30816bf546a8b5aa5470a34ec0b4e6c82577a3c9 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 15:43:29 +0100 Subject: [PATCH 055/131] make dtype check change --- onedal/datatypes/table.cpp | 4 ++++ onedal/utils/validation.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/onedal/datatypes/table.cpp b/onedal/datatypes/table.cpp index 113d881228..634cc99a1d 100644 --- a/onedal/datatypes/table.cpp +++ b/onedal/datatypes/table.cpp @@ -72,6 +72,10 @@ ONEDAL_PY_INIT_MODULE(table) { const auto column_count = t.get_column_count(); return py::make_tuple(row_count, column_count); }); + table_obj.def_property_readonly("dtype", [](const table& t){ + // returns a numpy dtype, even if source was not from numpy + return convert_dal_to_npy_type(t.get_metadata().get_data_type(0)); + }); #ifdef ONEDAL_DATA_PARALLEL define_sycl_usm_array_property(table_obj); diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 5294483ac2..836dd84a75 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -447,12 +447,12 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): policy = _get_policy(None, X) + X_t = to_table(_convert_to_supported(policy, X)) params = { - "fptype": "float" if X.dtype.name == "float32" else "double", + "fptype": "float" if X_t.dtype == np.float32 else "double", "method": "dense", "allow_nan": allow_nan, } - X_t = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute.compute(policy, params, X_t).finite: type_err = "infinity" if allow_nan else "NaN, infinity" padded_input_name = input_name + " " if input_name else "" From abb3b1683f71fe758beec194795ab6a8b24545f3 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 16:06:59 +0100 Subject: [PATCH 056/131] add sparse testing --- onedal/utils/tests/test_validation.py | 29 +++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 5f92a64bf7..aefa1dbb36 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -19,6 +19,7 @@ import numpy as np import numpy.random as rand import pytest +import scipy.sparse as sp from onedal.tests.utils._dataframes_support import ( _convert_to_dataframe, @@ -118,3 +119,31 @@ def test_assert_finite_random_shape_and_location( msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." with pytest.raises(ValueError, match=msg_err): _assert_all_finite(X, allow_nan=allow_nan) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize("check", ["inf", "NaN", None]) +@pytest.mark.parametrize("seed", [0, int(time.time())]) +def test_assert_finite_sparse(dtype, allow_nan, check, seed): + lb, ub = 2, 1048576 # lb is a patching condition, ub 2^20 + rand.seed(seed) + X = sp.random( + rand.randint(lb, ub), + rand.randint(lb, ub), + format="csr", + dtype=dtype, + random_state=rand.default_rng(seed), + ) + + if check: + locx = rand.randint(0, X.shape[0] - 1) + locy = rand.randint(0, X.shape[1] - 1) + X[locx, locy] = float(check) + + if check is None or (allow_nan and check == "NaN"): + assert_all_finite(X, allow_nan=allow_nan) + else: + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." 
+ with pytest.raises(ValueError, match=msg_err): + assert_all_finite(X, allow_nan=allow_nan) From 97aef73e5866db07206fdf47571f9fb94f93185c Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 17:06:17 +0100 Subject: [PATCH 057/131] try again --- onedal/datatypes/table.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/datatypes/table.cpp b/onedal/datatypes/table.cpp index 634cc99a1d..a06a08710d 100644 --- a/onedal/datatypes/table.cpp +++ b/onedal/datatypes/table.cpp @@ -74,7 +74,7 @@ ONEDAL_PY_INIT_MODULE(table) { }); table_obj.def_property_readonly("dtype", [](const table& t){ // returns a numpy dtype, even if source was not from numpy - return convert_dal_to_npy_type(t.get_metadata().get_data_type(0)); + return py::dtype(convert_dal_to_npy_type(t.get_metadata().get_data_type(0))); }); #ifdef ONEDAL_DATA_PARALLEL From 6e29651587f42226b06c2d733d386a0bc19e0168 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 17:29:19 +0100 Subject: [PATCH 058/131] try again --- onedal/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index aefa1dbb36..d953038f33 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -126,7 +126,7 @@ def test_assert_finite_random_shape_and_location( @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) def test_assert_finite_sparse(dtype, allow_nan, check, seed): - lb, ub = 2, 1048576 # lb is a patching condition, ub 2^20 + lb, ub = 2, 256 rand.seed(seed) X = sp.random( rand.randint(lb, ub), From 59363a8126643a1eb5aff981d1d7ce09cdbf711b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 17:30:46 +0100 Subject: [PATCH 059/131] try again --- onedal/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index d953038f33..7662f486f3 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -126,7 +126,7 @@ def test_assert_finite_random_shape_and_location( @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) def test_assert_finite_sparse(dtype, allow_nan, check, seed): - lb, ub = 2, 256 + lb, ub = 2, 2056 rand.seed(seed) X = sp.random( rand.randint(lb, ub), From 12de7038d719510df8043ae3dbce216afb39c6b2 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 07:18:21 +0100 Subject: [PATCH 060/131] temporary commit --- sklearnex/utils/validation.py | 40 ++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index b2d1898643..e41dec4a18 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -14,4 +14,42 @@ # limitations under the License. 
# =============================================================================== -from daal4py.sklearn.utils.validation import _assert_all_finite +import scipy.sparse as sp +from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite +from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite +from daal4py.sklearn._utils import sklearn_check_version + +if sklearn_check_version("1.6"): + from sklearn.utils.validation import validate_data as _sklearn_validate_data + _finite_keyword = "ensure_all_finite" + +else: + from sklearn.base import BaseEstimator + _sklearn_validate_data = BaseEstimator._validate_data + _finite_keyword = "force_all_finite" + + + +def validate_data(*args, **kwargs): + # force finite check to not occur in sklearn, default is True + force_all_finite = _finite_keyword not in kwargs or kwargs[_finite_keyword] + kwargs[_finite_keyword] = False + out = _sklearn_validate_data(*args, **kwargs) + if force_all_finite: + # run local finite check + for arg in out: + assert_all_finite(arg) + return out + + +def assert_all_finite( + X, + *, + allow_nan=False, + input_name="", +): + _assert_all_finite( + X.data if sp.issparse(X) else X, + allow_nan=allow_nan, + input_name=input_name, + ) \ No newline at end of file From 07ec3d88ca0a5754edcf42a060ce03f1ab438dd7 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 10:58:56 +0100 Subject: [PATCH 061/131] first attempt --- sklearnex/utils/validation.py | 137 +++++++++++++++++++++++++++++++--- 1 file changed, 125 insertions(+), 12 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index e41dec4a18..16b398380e 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -16,30 +16,107 @@ import scipy.sparse as sp from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite -from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite + from daal4py.sklearn._utils import sklearn_check_version +from onedal.utils._array_api import _is_numpy_namespace +from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite + +from ._array_api import get_namespace if sklearn_check_version("1.6"): from sklearn.utils.validation import validate_data as _sklearn_validate_data + _finite_keyword = "ensure_all_finite" else: from sklearn.base import BaseEstimator + _sklearn_validate_data = BaseEstimator._validate_data _finite_keyword = "force_all_finite" +def _is_contiguous(X): + # array_api does not have a `strides` or `flags` attribute for testing memory + # order. When dlpack support is brought in for oneDAL, the dlpack object can + # then be inspected and this must be updated. _is_contiguous is therefore + # conservative in verifying attributes and does not support array_api. This + # will block onedal_assert_all_finite from being used for array api inputs. 
+ if hasattr(X, "flags") and X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]: + return True + return False -def validate_data(*args, **kwargs): - # force finite check to not occur in sklearn, default is True - force_all_finite = _finite_keyword not in kwargs or kwargs[_finite_keyword] - kwargs[_finite_keyword] = False - out = _sklearn_validate_data(*args, **kwargs) - if force_all_finite: - # run local finite check - for arg in out: - assert_all_finite(arg) - return out + +def _assert_all_finite_core(X, *, xp, allow_nan, input_name=""): + # This is a reproduction of code from sklearn.utils.validation + # necessary for older sklearn versions (<1.2) and for dpnp inputs + # which do not conform to the array_api standard, and cannot be + # checked in sklearn. + first_pass_isfinite = xp.isfinite(xp.sum(X)) + if first_pass_isfinite: + return + + has_inf = xp.any(xp.isinf(X)) + has_nan_error = False if allow_nan else xp.any(xp.isnan(X)) + if has_inf or has_nan_error: + type_err = "infinity" if allow_nan else "NaN, infinity" + padded_input_name = input_name + " " if input_name else "" + msg_err = f"Input {padded_input_name}contains {type_err}." + raise ValueError(msg_err) + + +if sklearn_check_version("1.2"): + + def _array_api_assert_all_finite( + X, *, xp, is_array_api_compliant, allow_nan=False, input_name="" + ): + if _is_numpy_namespace(xp) or is_array_api_compliant: + _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) + elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: + return + # handle dpnp inputs + _assert_all_finite_core(X, xp, allow_nan, input_name=input_name) + +else: + + def _array_api_assert_all_finite( + X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" + ): + + if _is_numpy_namespace(xp): + _sklearn_assert_all_finite(X, allow_nan, input_name=input_name) + elif is_array_api_compliant and not xp.isdtype( + X, ("real floating", "complex floating") + ): + return + elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: + return + + # handle array_api and dpnp inputs + _assert_all_finite_core(X, xp, allow_nan, input_name=input_name) + + +def _assert_all_finite( + X, + *, + allow_nan=False, + input_name="", +): + # array_api compliance in sklearn varies betweeen the support sklearn versions + # therefore a separate check matching sklearn's assert_all_finite is necessary + # when the data is not float32 or float64 but of a float type. The onedal + # assert_all_finite is only for float32 and float64 contiguous arrays. 
+ + # initial match to daal4py, can be optimized later + xp, is_array_api_compliant = get_namespace(X) + if X.size < 32768 or X.dtype not in [xp.float32, xp.float64] or not _is_contiguous(X): + + # all non-numpy arrays for sklearn 1.0 and dpnp for sklearn are not handeled properly + # separate function for import-time sklearn version check + _array_api_assert_all_finite( + X, xp, is_array_api_compliant, allow_nan=allow_nan, input_name=input_name + ) + else: + _onedal_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) def assert_all_finite( @@ -52,4 +129,40 @@ def assert_all_finite( X.data if sp.issparse(X) else X, allow_nan=allow_nan, input_name=input_name, - ) \ No newline at end of file + ) + + +def validate_data( + _estimator, + /, + X="no_validation", + y="no_validation", + reset=True, + validate_separately=False, + skip_check_array=False, + **check_params, +): + # force finite check to not occur in sklearn, default is True + # `ensure_all_finite` is the most up-to-date keyword name in sklearn + # _finite_keyword provides backward compatability for `force_all_finite` + force_all_finite = ( + "ensure_all_finite" not in check_params or check_params["ensure_all_finite"] + ) + check_params[_finite_keyword] = False + out = _sklearn_validate_data( + _estimator, + X=X, + y=y, + reset=reset, + validate_separate=validate_separately, + skip_check_array=skip_check_array, + **check_params, + ) + if force_all_finite: + # run local finite check + arg = iter(out) + if not isinstance(X, str) or X != "no_validation": + assert_all_finite(next(arg), input_name="X") + if y is not None or not isinstance(y, str) or y != "no_validation": + assert_all_finite(next(arg), input_name="y") + return out From 32c565d42ad0d07ed37d5a2ea264c32b25510676 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 11:18:13 +0100 Subject: [PATCH 062/131] missing change? --- sklearnex/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/__init__.py b/sklearnex/utils/__init__.py index 4c3fe21154..686e089adf 100755 --- a/sklearnex/utils/__init__.py +++ b/sklearnex/utils/__init__.py @@ -14,6 +14,6 @@ # limitations under the License. 
# =============================================================================== -from .validation import _assert_all_finite +from .validation import assert_all_finite -__all__ = ["_assert_all_finite"] +__all__ = ["assert_all_finite"] From 5093ed7d8e35559c7966d3e4fd573cd2a6f19b80 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 12:15:56 +0100 Subject: [PATCH 063/131] modify DummyEstimator for testing --- sklearnex/tests/test_memory_usage.py | 44 +++++----------------------- sklearnex/tests/utils/__init__.py | 2 ++ sklearnex/tests/utils/base.py | 35 ++++++++++++++++++++++ 3 files changed, 44 insertions(+), 37 deletions(-) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 6e7fdb72b5..570e061040 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -38,7 +38,12 @@ from onedal.utils._array_api import _get_sycl_namespace from onedal.utils._dpep_helpers import dpctl_available, dpnp_available from sklearnex import config_context -from sklearnex.tests.utils import PATCHED_FUNCTIONS, PATCHED_MODELS, SPECIAL_INSTANCES +from sklearnex.tests.utils import ( + PATCHED_FUNCTIONS, + PATCHED_MODELS, + SPECIAL_INSTANCES, + DummyEstimator, +) from sklearnex.utils._array_api import get_namespace if dpctl_available: @@ -132,41 +137,6 @@ def gen_functions(functions): ORDER_DICT = {"F": np.asfortranarray, "C": np.ascontiguousarray} -if _is_dpc_backend: - - from sklearn.utils.validation import check_is_fitted - - from onedal.datatypes import from_table, to_table - - class DummyEstimatorWithTableConversions(BaseEstimator): - - def fit(self, X, y=None): - sua_iface, xp, _ = _get_sycl_namespace(X) - X_table = to_table(X) - y_table = to_table(y) - # The presence of the fitted attributes (ending with a trailing - # underscore) is required for the correct check. The cleanup of - # the memory will occur at the estimator instance deletion. - self.x_attr_ = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp - ) - self.y_attr_ = from_table( - y_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp - ) - return self - - def predict(self, X): - # Checks if the estimator is fitted by verifying the presence of - # fitted attributes (ending with a trailing underscore). 
- check_is_fitted(self) - sua_iface, xp, _ = _get_sycl_namespace(X) - X_table = to_table(X) - returned_X = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp - ) - return returned_X - - def gen_clsf_data(n_samples, n_features, dtype=None): data, label = make_classification( n_classes=2, n_samples=n_samples, n_features=n_features, random_state=777 @@ -370,7 +340,7 @@ def test_table_conversions_memory_leaks(dataframe, queue, order, data_shape, dty pytest.skip("SYCL device memory leak check requires the level zero sysman") _kfold_function_template( - DummyEstimatorWithTableConversions, + DummyEstimator, dataframe, data_shape, queue, diff --git a/sklearnex/tests/utils/__init__.py b/sklearnex/tests/utils/__init__.py index 60ca67fa37..db728fe913 100644 --- a/sklearnex/tests/utils/__init__.py +++ b/sklearnex/tests/utils/__init__.py @@ -21,6 +21,7 @@ SPECIAL_INSTANCES, UNPATCHED_FUNCTIONS, UNPATCHED_MODELS, + DummyEstimator, _get_processor_info, call_method, gen_dataset, @@ -39,6 +40,7 @@ "gen_models_info", "gen_dataset", "sklearn_clone_dict", + "DummyEstimator", ] _IS_INTEL = "GenuineIntel" in _get_processor_info() diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 1949519585..248eb85a59 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -32,7 +32,9 @@ ) from sklearn.datasets import load_diabetes, load_iris from sklearn.neighbors._base import KNeighborsMixin +from sklearn.utils.validation import check_is_fitted +from onedal.datatypes import from_table, to_table from onedal.tests.utils._dataframes_support import _convert_to_dataframe from sklearnex import get_patch_map, patch_sklearn, sklearn_is_patched, unpatch_sklearn from sklearnex.basic_statistics import BasicStatistics, IncrementalBasicStatistics @@ -44,6 +46,7 @@ NearestNeighbors, ) from sklearnex.svm import SVC, NuSVC +from sklearnex.utils.validation import validate_data def _load_all_models(with_sklearnex=True, estimator=True): @@ -369,3 +372,35 @@ def _get_processor_info(): ) return proc + + +class DummyEstimator(BaseEstimator): + + def fit(self, X, y=None): + X_array, y_array = validate_data(self, X, y) + + sua_iface, xp, _ = _get_sycl_namespace(X_array) + X_table = to_table(X_array) + y_table = to_table(y_array) + # The presence of the fitted attributes (ending with a trailing + # underscore) is required for the correct check. The cleanup of + # the memory will occur at the estimator instance deletion. + self.x_attr_ = from_table( + X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + self.y_attr_ = from_table( + y_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + return self + + def predict(self, X): + # Checks if the estimator is fitted by verifying the presence of + # fitted attributes (ending with a trailing underscore). 
+ check_is_fitted(self) + X_array = validate_data(self, X, reset=False) + sua_iface, xp, _ = _get_sycl_namespace(X_array) + X_table = to_table(X_array) + returned_X = from_table( + X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + return returned_X From f04deba338611c4367d3c7ca91f9fcfaf3e1c432 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 12:21:32 +0100 Subject: [PATCH 064/131] generalize DummyEstimator --- sklearnex/tests/utils/base.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 248eb85a59..1d4eb3d0cf 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -385,12 +385,19 @@ def fit(self, X, y=None): # The presence of the fitted attributes (ending with a trailing # underscore) is required for the correct check. The cleanup of # the memory will occur at the estimator instance deletion. - self.x_attr_ = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp - ) - self.y_attr_ = from_table( - y_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp - ) + if sua_iface: + self.x_attr_ = from_table( + X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + self.y_attr_ = from_table( + y_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + else: + self.x_attr = from_table(X_table) + self.y_attr = from_table(y_table) + + assert type(self.x_attr) == type(X) + return self def predict(self, X): @@ -400,7 +407,13 @@ def predict(self, X): X_array = validate_data(self, X, reset=False) sua_iface, xp, _ = _get_sycl_namespace(X_array) X_table = to_table(X_array) - returned_X = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp - ) + if sua_iface: + returned_X = from_table( + X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + else: + returned_X = from_table(X_table) + + assert type(returned_X) == type(X) + return returned_X From 740a5e762788d989186222b79c9f467d4c0973c4 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 12:42:42 +0100 Subject: [PATCH 065/131] switch test --- sklearnex/utils/tests/test_finite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 2874ec3400..eaa39fe2c0 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -21,7 +21,7 @@ import pytest from numpy.testing import assert_raises -from sklearnex.utils import _assert_all_finite +from sklearnex.utils import assert_all_finite @pytest.mark.parametrize("dtype", [np.float32, np.float64]) From 27050bd5a4329dcc30d8f9ec39efce6212cd8694 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 12:43:23 +0100 Subject: [PATCH 066/131] further testing changes --- sklearnex/utils/tests/test_finite.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index eaa39fe2c0..487bb39369 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -65,7 +65,7 @@ def test_assert_finite_random_location(dtype, shape, allow_nan, check, seed): X.reshape((-1,))[loc] = float(check) if check is None or (allow_nan and check == "NaN"): - _assert_all_finite(X, allow_nan=allow_nan) + assert_all_finite(X, allow_nan=allow_nan) else: assert_raises(ValueError, 
_assert_all_finite, X, allow_nan=allow_nan) @@ -84,6 +84,6 @@ def test_assert_finite_random_shape_and_location(dtype, allow_nan, check, seed): X[loc] = float(check) if check is None or (allow_nan and check == "NaN"): - _assert_all_finite(X, allow_nan=allow_nan) + assert_all_finite(X, allow_nan=allow_nan) else: - assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) + assert_raises(ValueError, assert_all_finite, X, allow_nan=allow_nan) From 53c8f7b7152d53019819fe7cbb30b382cf7b4e66 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 13:34:29 +0100 Subject: [PATCH 067/131] add initial validate_data test, will be refactored --- sklearnex/utils/tests/test_finite.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 487bb39369..6468fde2cc 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -21,6 +21,8 @@ import pytest from numpy.testing import assert_raises +from onedal.tests.utils._dataframes_support import get_dataframes_and_queues +from sklearnex.tests.utils import DummyEstimator, gen_dataset from sklearnex.utils import assert_all_finite @@ -39,7 +41,7 @@ def test_sum_infinite_actually_finite(dtype, shape, allow_nan): X = np.array(shape, dtype=dtype) X.fill(np.finfo(dtype).max) - _assert_all_finite(X, allow_nan=allow_nan) + assert_all_finite(X, allow_nan=allow_nan) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -67,7 +69,7 @@ def test_assert_finite_random_location(dtype, shape, allow_nan, check, seed): if check is None or (allow_nan and check == "NaN"): assert_all_finite(X, allow_nan=allow_nan) else: - assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) + assert_raises(ValueError, assert_all_finite, X, allow_nan=allow_nan) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -87,3 +89,13 @@ def test_assert_finite_random_shape_and_location(dtype, allow_nan, check, seed): assert_all_finite(X, allow_nan=allow_nan) else: assert_raises(ValueError, assert_all_finite, X, allow_nan=allow_nan) + + +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_validate_data(dtype, dataframe, queue): + est = DummyEstimator() + X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] + est.fit(X, y) + output = est.predict(X) + assert type(X) == type(output) From 90f59c442021b4c529e64ef9f4844296f412c014 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 15:10:04 +0100 Subject: [PATCH 068/131] fixes for CI --- sklearnex/utils/validation.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 16b398380e..7bcfc3fdf6 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -67,7 +67,7 @@ def _assert_all_finite_core(X, *, xp, allow_nan, input_name=""): if sklearn_check_version("1.2"): def _array_api_assert_all_finite( - X, *, xp, is_array_api_compliant, allow_nan=False, input_name="" + X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" ): if _is_numpy_namespace(xp) or is_array_api_compliant: _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) @@ -137,26 +137,18 @@ def validate_data( /, X="no_validation", y="no_validation", - reset=True, - validate_separately=False, - skip_check_array=False, - **check_params, 
+ **kwargs, ): # force finite check to not occur in sklearn, default is True # `ensure_all_finite` is the most up-to-date keyword name in sklearn # _finite_keyword provides backward compatability for `force_all_finite` - force_all_finite = ( - "ensure_all_finite" not in check_params or check_params["ensure_all_finite"] - ) - check_params[_finite_keyword] = False + force_all_finite = "ensure_all_finite" not in kwargs or kwargs["ensure_all_finite"] + kwargs[_finite_keyword] = False out = _sklearn_validate_data( _estimator, X=X, y=y, - reset=reset, - validate_separate=validate_separately, - skip_check_array=skip_check_array, - **check_params, + **kwargs, ) if force_all_finite: # run local finite check From 7f170e2efc494d66b1a7b9b1f29c87eb1c3f9edf Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 16:36:38 +0100 Subject: [PATCH 069/131] Update validation.py --- sklearnex/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 7bcfc3fdf6..0fc31d53c0 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -1,4 +1,4 @@ -# =============================================================================== +the# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -46,7 +46,7 @@ def _is_contiguous(X): return False -def _assert_all_finite_core(X, *, xp, allow_nan, input_name=""): +def _assert_all_finite_core(X, xp, allow_nan, *, input_name=""): # This is a reproduction of code from sklearn.utils.validation # necessary for older sklearn versions (<1.2) and for dpnp inputs # which do not conform to the array_api standard, and cannot be From 81e2bbc763b21bdd29b40e1a72c1ac41355de569 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 16:54:08 +0100 Subject: [PATCH 070/131] Update validation.py --- sklearnex/utils/validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 0fc31d53c0..3e65223331 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -1,4 +1,4 @@ -the# =============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -46,7 +46,7 @@ def _is_contiguous(X): return False -def _assert_all_finite_core(X, xp, allow_nan, *, input_name=""): +def _assert_all_finite_core(X, xp, *, allow_nan=False, input_name=""): # This is a reproduction of code from sklearn.utils.validation # necessary for older sklearn versions (<1.2) and for dpnp inputs # which do not conform to the array_api standard, and cannot be @@ -74,7 +74,7 @@ def _array_api_assert_all_finite( elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: return # handle dpnp inputs - _assert_all_finite_core(X, xp, allow_nan, input_name=input_name) + _assert_all_finite_core(X, xp, allow_nan=allow_nan, input_name=input_name) else: From 116bdba61f83fda8d66566cfd6bbeb999ca532df Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 17:18:33 +0100 Subject: [PATCH 071/131] Update test_memory_usage.py --- sklearnex/tests/test_memory_usage.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearnex/tests/test_memory_usage.py 
b/sklearnex/tests/test_memory_usage.py index 570e061040..be501be218 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -35,7 +35,6 @@ get_dataframes_and_queues, ) from onedal.tests.utils._device_selection import get_queues, is_dpctl_device_available -from onedal.utils._array_api import _get_sycl_namespace from onedal.utils._dpep_helpers import dpctl_available, dpnp_available from sklearnex import config_context from sklearnex.tests.utils import ( From 076ebc401b4e9fbd872f9f1bc971bad1eb095f32 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 17:19:15 +0100 Subject: [PATCH 072/131] Update base.py --- sklearnex/tests/utils/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 1d4eb3d0cf..35ba2811e2 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -36,6 +36,7 @@ from onedal.datatypes import from_table, to_table from onedal.tests.utils._dataframes_support import _convert_to_dataframe +from onedal.utils._array_api import _get_sycl_namespace from sklearnex import get_patch_map, patch_sklearn, sklearn_is_patched, unpatch_sklearn from sklearnex.basic_statistics import BasicStatistics, IncrementalBasicStatistics from sklearnex.linear_model import LogisticRegression From e1d074365e51fa77fa75f6457c090346eb6d527a Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 18:03:03 +0100 Subject: [PATCH 073/131] Update base.py --- sklearnex/tests/utils/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 35ba2811e2..0d58b5189b 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -396,7 +396,7 @@ def fit(self, X, y=None): else: self.x_attr = from_table(X_table) self.y_attr = from_table(y_table) - + assert type(X_array) == type(X) assert type(self.x_attr) == type(X) return self @@ -414,7 +414,7 @@ def predict(self, X): ) else: returned_X = from_table(X_table) - + assert type(X_array) == type(X) assert type(returned_X) == type(X) return returned_X From f59cdd33d29321c3989d0b4415b99b5055408f23 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 22:38:30 +0100 Subject: [PATCH 074/131] improve tests --- sklearnex/tests/utils/base.py | 23 +++----- sklearnex/utils/tests/test_finite.py | 83 +++++++++++++++++++++++----- 2 files changed, 77 insertions(+), 29 deletions(-) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 0d58b5189b..e484423cfc 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -378,26 +378,24 @@ def _get_processor_info(): class DummyEstimator(BaseEstimator): def fit(self, X, y=None): - X_array, y_array = validate_data(self, X, y) + X, y = validate_data(self, X, y) - sua_iface, xp, _ = _get_sycl_namespace(X_array) - X_table = to_table(X_array) - y_table = to_table(y_array) + sua_iface, xp, _ = _get_sycl_namespace(X) + X_table = to_table(X) + y_table = to_table(y) # The presence of the fitted attributes (ending with a trailing # underscore) is required for the correct check. The cleanup of # the memory will occur at the estimator instance deletion. 
if sua_iface: self.x_attr_ = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp ) self.y_attr_ = from_table( - y_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + y_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp ) else: self.x_attr = from_table(X_table) self.y_attr = from_table(y_table) - assert type(X_array) == type(X) - assert type(self.x_attr) == type(X) return self @@ -405,16 +403,13 @@ def predict(self, X): # Checks if the estimator is fitted by verifying the presence of # fitted attributes (ending with a trailing underscore). check_is_fitted(self) - X_array = validate_data(self, X, reset=False) - sua_iface, xp, _ = _get_sycl_namespace(X_array) - X_table = to_table(X_array) + sua_iface, xp, _ = _get_sycl_namespace(X) + X_table = to_table(X) if sua_iface: returned_X = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp ) else: returned_X = from_table(X_table) - assert type(X_array) == type(X) - assert type(returned_X) == type(X) return returned_X diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 6468fde2cc..5c3ee2d50e 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -19,11 +19,15 @@ import numpy as np import numpy.random as rand import pytest -from numpy.testing import assert_raises -from onedal.tests.utils._dataframes_support import get_dataframes_and_queues +from daal4py.sklearn._utils import sklearn_check_version +from onedal.tests.utils._dataframes_support import ( + _convert_to_dataframe, + get_dataframes_and_queues, +) +from sklearnex import config_context from sklearnex.tests.utils import DummyEstimator, gen_dataset -from sklearnex.utils import assert_all_finite +from sklearnex.utils import validate_data @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -39,9 +43,11 @@ ) @pytest.mark.parametrize("allow_nan", [False, True]) def test_sum_infinite_actually_finite(dtype, shape, allow_nan): + est = DummyEstimator() X = np.array(shape, dtype=dtype) X.fill(np.finfo(dtype).max) - assert_all_finite(X, allow_nan=allow_nan) + X_array = validate_data(est, X, allow_nan=allow_nan) + assert type(X_array) == type(X) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -58,7 +64,11 @@ def test_sum_infinite_actually_finite(dtype, shape, allow_nan): @pytest.mark.parametrize("allow_nan", [False, True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) -def test_assert_finite_random_location(dtype, shape, allow_nan, check, seed): +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +def test_validate_data_random_location( + dataframe, queue, dtype, shape, allow_nan, check, seed +): + est = DummyEstimator() rand.seed(seed) X = rand.uniform(high=np.finfo(dtype).max, size=shape).astype(dtype) @@ -66,17 +76,29 @@ def test_assert_finite_random_location(dtype, shape, allow_nan, check, seed): loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) + X = _convert_to_dataframe( + X, + target_df=dataframe, + sycl_queue=queue, + ) + if check is None or (allow_nan and check == "NaN"): - assert_all_finite(X, allow_nan=allow_nan) + validate_data(est, X, allow_nan=allow_nan) else: - assert_raises(ValueError, assert_all_finite, X, allow_nan=allow_nan) + msg_err = "Input contains " + ("infinity" 
if allow_nan else "NaN, infinity") + "." + with pytest.raises(ValueError, match=msg_err): + validate_data(est, X, allow_nan=allow_nan) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("allow_nan", [False, True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) -def test_assert_finite_random_shape_and_location(dtype, allow_nan, check, seed): +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +def test_validate_data_random_shape_and_location( + dataframe, queue, dtype, allow_nan, check, seed +): + est = DummyEstimator() lb, ub = 32768, 1048576 # lb is a patching condition, ub 2^20 rand.seed(seed) X = rand.uniform(high=np.finfo(dtype).max, size=rand.randint(lb, ub)).astype(dtype) @@ -85,17 +107,48 @@ def test_assert_finite_random_shape_and_location(dtype, allow_nan, check, seed): loc = rand.randint(0, X.size - 1) X[loc] = float(check) + X = _convert_to_dataframe( + X, + target_df=dataframe, + sycl_queue=queue, + ) + if check is None or (allow_nan and check == "NaN"): - assert_all_finite(X, allow_nan=allow_nan) + validate_data(est, X) else: - assert_raises(ValueError, assert_all_finite, X, allow_nan=allow_nan) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + with pytest.raises(ValueError, match=msg_err): + validate_data(est, X, allow_nan=allow_nan) -@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_validate_data(dtype, dataframe, queue): +@pytest.mark.parametrize("array_api_dispatch", [True, False]) +@pytest.mark.parametrize( + "dataframe, queue", get_dataframes_and_queues("numpy,dpctl,dpnp") +) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): est = DummyEstimator() X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] - est.fit(X, y) - output = est.predict(X) - assert type(X) == type(output) + + dispatch = {} + if sklearn_check_version("1.2"): + dispatch["array_api_dispatch"] = array_api_dispatch + + with config_context(**dispatch): + validate_data(est, X, y) + est.fit(X, y) + X_array = validate_data(est, X, reset=False) + X_out = est.predict(X) + + if ( + sklearn_check_version("1.2") or dataframe != "array_api" + ) and dataframe != "pandas": + assert type(X) == type( + X_array + ), f"validate_data converted {type(X)} to {type(X_array)}" + assert type(X) == type(X_out), f"from_array converted {type(X)} to {type(X_out)}" + else: + # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays + assert isinstance(X_array, np.ndarray) + assert isinstance(X_out, np.ndarray) From 7f9ea25aceaff20983895aab9770311211fb9211 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 22:56:10 +0100 Subject: [PATCH 075/131] fix logic --- sklearnex/utils/tests/test_finite.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 5c3ee2d50e..9ddbed4d67 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -141,9 +141,9 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): X_array = validate_data(est, X, reset=False) X_out = est.predict(X) - if ( - sklearn_check_version("1.2") or dataframe != "array_api" - ) and dataframe != "pandas": + if dataframe != "pandas" and 
not ( + dataframe == "array_api" and sklearn_check_version("1.2") and array_api_dispatch + ): assert type(X) == type( X_array ), f"validate_data converted {type(X)} to {type(X_array)}" From 51247c050952481babace230e099f26750806ae5 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:00:48 +0100 Subject: [PATCH 076/131] fix logic --- sklearnex/utils/tests/test_finite.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 9ddbed4d67..cd400c855c 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -142,7 +142,9 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): X_out = est.predict(X) if dataframe != "pandas" and not ( - dataframe == "array_api" and sklearn_check_version("1.2") and array_api_dispatch + dataframe == "array_api" + and not sklearn_check_version("1.2") + and not array_api_dispatch ): assert type(X) == type( X_array From 6e5c0efeae8743c2406cf0e89aca19197cc9654f Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:09:24 +0100 Subject: [PATCH 077/131] fix logic again --- sklearnex/utils/tests/test_finite.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index cd400c855c..9a789f274f 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -141,16 +141,15 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): X_array = validate_data(est, X, reset=False) X_out = est.predict(X) - if dataframe != "pandas" and not ( + if dataframe == "pandas" or ( dataframe == "array_api" - and not sklearn_check_version("1.2") - and not array_api_dispatch + and not (sklearn_check_version("1.2") and array_api_dispatch) ): + # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays + assert isinstance(X_array, np.ndarray) + assert isinstance(X_out, np.ndarray) + else: assert type(X) == type( X_array ), f"validate_data converted {type(X)} to {type(X_array)}" assert type(X) == type(X_out), f"from_array converted {type(X)} to {type(X_out)}" - else: - # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays - assert isinstance(X_array, np.ndarray) - assert isinstance(X_out, np.ndarray) From 8d47744f25c0b32e9b0ad639e772107710c56e98 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:21:14 +0100 Subject: [PATCH 078/131] rename file --- sklearnex/utils/tests/{test_finite.py => test_validation.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sklearnex/utils/tests/{test_finite.py => test_validation.py} (100%) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_validation.py similarity index 100% rename from sklearnex/utils/tests/test_finite.py rename to sklearnex/utils/tests/test_validation.py From 1ae9af5aa01ea34228e52e55f304b9c5e436e3fb Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:25:59 +0100 Subject: [PATCH 079/131] Revert "rename file" This reverts commit 8d47744f25c0b32e9b0ad639e772107710c56e98. 
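
The behavior these test patches converge on: validate_data should keep the
input container type for supported inputs and reject non-finite values by
default. A minimal sketch of that contract (illustrative only; the data and
the DummyEstimator test helper stand in for a real estimator workload, and
this snippet is not part of any diff in the series):

    import numpy as np

    from sklearnex.tests.utils import DummyEstimator
    from sklearnex.utils.validation import validate_data

    est = DummyEstimator()
    X = np.full((100, 10), np.finfo(np.float64).max)  # large but finite values

    # finite data passes and the returned array keeps the input type
    X_checked = validate_data(est, X)
    assert type(X_checked) is type(X)

    # a single non-finite value is rejected with a ValueError
    X[0, 0] = np.inf
    try:
        validate_data(est, X)
    except ValueError as exc:
        print(exc)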
--- sklearnex/utils/tests/{test_validation.py => test_finite.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sklearnex/utils/tests/{test_validation.py => test_finite.py} (100%) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_finite.py similarity index 100% rename from sklearnex/utils/tests/test_validation.py rename to sklearnex/utils/tests/test_finite.py From bf9b46e84bdc0833463aa99ad7a61090fc7bbd30 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:26:43 +0100 Subject: [PATCH 080/131] remove duplication --- sklearnex/utils/tests/test_finite.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 9a789f274f..3fea947cd7 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -123,10 +123,7 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("array_api_dispatch", [True, False]) -@pytest.mark.parametrize( - "dataframe, queue", get_dataframes_and_queues("numpy,dpctl,dpnp") -) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): est = DummyEstimator() X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] From 3101c3fb0b5bbcc4f3a8386de28da538c5ed4467 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:41:44 +0100 Subject: [PATCH 081/131] fix imports --- sklearnex/utils/tests/test_finite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 3fea947cd7..d9d8d461fe 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -27,7 +27,7 @@ ) from sklearnex import config_context from sklearnex.tests.utils import DummyEstimator, gen_dataset -from sklearnex.utils import validate_data +from sklearnex.utils.validation import validate_data @pytest.mark.parametrize("dtype", [np.float32, np.float64]) From ee799f60c000651eb828bd7586a91825706b644b Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 23:42:45 +0100 Subject: [PATCH 082/131] Rename test_finite.py to test_validation.py --- sklearnex/utils/tests/{test_finite.py => test_validation.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sklearnex/utils/tests/{test_finite.py => test_validation.py} (100%) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_validation.py similarity index 100% rename from sklearnex/utils/tests/test_finite.py rename to sklearnex/utils/tests/test_validation.py From db4a6c6fe00883b42b8c580b11ecee8b169bc237 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:43:15 +0100 Subject: [PATCH 083/131] Revert "Rename test_finite.py to test_validation.py" This reverts commit ee799f60c000651eb828bd7586a91825706b644b. 
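
For reference while reading the validation.py hunks in the following patches:
the finite check only routes to the oneDAL backend for sufficiently large,
contiguous float32/float64 arrays, and everything else falls back to
scikit-learn's _assert_all_finite. A simplified restatement of that gate
(the helper name below is illustrative and does not exist in validation.py):

    import numpy as np

    def uses_onedal_finite_check(X) -> bool:
        # mirrors the condition in sklearnex/utils/validation.py:
        # X.size < 32768, a non-fp32/fp64 dtype, or a non-contiguous
        # layout all force the scikit-learn fallback path
        contiguous = hasattr(X, "flags") and (
            X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]
        )
        return (
            X.size >= 32768
            and X.dtype in (np.float32, np.float64)
            and contiguous
        )

    print(uses_onedal_finite_check(np.ones((1024, 64))))  # True
    print(uses_onedal_finite_check(np.ones((10, 10))))  # False: below the size cutoff
    print(uses_onedal_finite_check(np.ones((1024, 64))[:, ::2]))  # False: not contiguous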
--- sklearnex/utils/tests/{test_validation.py => test_finite.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sklearnex/utils/tests/{test_validation.py => test_finite.py} (100%) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_finite.py similarity index 100% rename from sklearnex/utils/tests/test_validation.py rename to sklearnex/utils/tests/test_finite.py From b5acbac8782f6022eff6ee85425d593ce9826e6e Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 06:07:57 +0100 Subject: [PATCH 084/131] updates --- sklearnex/utils/tests/test_finite.py | 36 +++++++++++++++++----------- sklearnex/utils/validation.py | 11 +++++---- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index d9d8d461fe..180b256771 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -41,12 +41,12 @@ [1000, 1000], ], ) -@pytest.mark.parametrize("allow_nan", [False, True]) -def test_sum_infinite_actually_finite(dtype, shape, allow_nan): +@pytest.mark.parametrize("ensure_all_finite", [False, True]) +def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): est = DummyEstimator() X = np.array(shape, dtype=dtype) X.fill(np.finfo(dtype).max) - X_array = validate_data(est, X, allow_nan=allow_nan) + X_array = validate_data(est, X, ensure_all_finite=ensure_all_finite) assert type(X_array) == type(X) @@ -61,12 +61,12 @@ def test_sum_infinite_actually_finite(dtype, shape, allow_nan): [1000, 1000], ], ) -@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize("ensure_all_finite", [False, True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) def test_validate_data_random_location( - dataframe, queue, dtype, shape, allow_nan, check, seed + dataframe, queue, dtype, shape, ensure_all_finite, check, seed ): est = DummyEstimator() rand.seed(seed) @@ -82,21 +82,25 @@ def test_validate_data_random_location( sycl_queue=queue, ) - if check is None or (allow_nan and check == "NaN"): - validate_data(est, X, allow_nan=allow_nan) + if check is None or (ensure_all_finite and check == "NaN"): + validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + msg_err = ( + "Input contains " + + ("infinity" if ensure_all_finite else "NaN, infinity") + + "." 
+ ) with pytest.raises(ValueError, match=msg_err): - validate_data(est, X, allow_nan=allow_nan) + validate_data(est, X, ensure_all_finite=ensure_all_finite) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize("ensure_all_finite", [False, True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) def test_validate_data_random_shape_and_location( - dataframe, queue, dtype, allow_nan, check, seed + dataframe, queue, dtype, ensure_all_finite, check, seed ): est = DummyEstimator() lb, ub = 32768, 1048576 # lb is a patching condition, ub 2^20 @@ -113,12 +117,16 @@ def test_validate_data_random_shape_and_location( sycl_queue=queue, ) - if check is None or (allow_nan and check == "NaN"): + if check is None or (ensure_all_finite and check == "NaN"): validate_data(est, X) else: - msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + msg_err = ( + "Input contains " + + ("infinity" if ensure_all_finite else "NaN, infinity") + + "." + ) with pytest.raises(ValueError, match=msg_err): - validate_data(est, X, allow_nan=allow_nan) + validate_data(est, X, ensure_all_finite=ensure_all_finite) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 3e65223331..34bb988748 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -142,7 +142,9 @@ def validate_data( # force finite check to not occur in sklearn, default is True # `ensure_all_finite` is the most up-to-date keyword name in sklearn # _finite_keyword provides backward compatability for `force_all_finite` - force_all_finite = "ensure_all_finite" not in kwargs or kwargs["ensure_all_finite"] + ensure_all_finite = "ensure_all_finite" not in kwargs or kwargs.pop( + "ensure_all_finite" + ) kwargs[_finite_keyword] = False out = _sklearn_validate_data( _estimator, @@ -150,11 +152,12 @@ def validate_data( y=y, **kwargs, ) - if force_all_finite: + if ensure_all_finite: # run local finite check + allow_nan = ensure_all_finite == "allow-nan" arg = iter(out) if not isinstance(X, str) or X != "no_validation": - assert_all_finite(next(arg), input_name="X") + assert_all_finite(next(arg), allow_nan=allow_nan, input_name="X") if y is not None or not isinstance(y, str) or y != "no_validation": - assert_all_finite(next(arg), input_name="y") + assert_all_finite(next(arg), allow_nan=allow_nan, input_name="y") return out From ed57c15e7e08dee51970b4db316aaea16343d7c0 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Thu, 21 Nov 2024 06:14:53 +0100 Subject: [PATCH 085/131] Update validation.py --- sklearnex/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 34bb988748..e3dd92b7ed 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -142,7 +142,7 @@ def validate_data( # force finite check to not occur in sklearn, default is True # `ensure_all_finite` is the most up-to-date keyword name in sklearn # _finite_keyword provides backward compatability for `force_all_finite` - ensure_all_finite = "ensure_all_finite" not in kwargs or kwargs.pop( + ensure_all_finite = True if "ensure_all_finite" not in kwargs else kwargs.pop( "ensure_all_finite" ) kwargs[_finite_keyword] = False From 
414f8979da5d44d8d0d19255d1b5f621733d8065 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 07:07:15 +0100 Subject: [PATCH 086/131] fixes for some test failures --- sklearnex/utils/tests/test_finite.py | 29 ++++++--------- sklearnex/utils/validation.py | 55 +++++++++++++++------------- 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 180b256771..f75ff33301 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -41,11 +41,12 @@ [1000, 1000], ], ) -@pytest.mark.parametrize("ensure_all_finite", [False, True]) +@pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): est = DummyEstimator() X = np.array(shape, dtype=dtype) X.fill(np.finfo(dtype).max) + X = np.atleast_2d(X) X_array = validate_data(est, X, ensure_all_finite=ensure_all_finite) assert type(X_array) == type(X) @@ -61,7 +62,7 @@ def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): [1000, 1000], ], ) -@pytest.mark.parametrize("ensure_all_finite", [False, True]) +@pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) @@ -77,25 +78,22 @@ def test_validate_data_random_location( X.reshape((-1,))[loc] = float(check) X = _convert_to_dataframe( - X, + np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, ) - if check is None or (ensure_all_finite and check == "NaN"): + allow_nan = ensure_all_finite == "allow_nan" + if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - msg_err = ( - "Input contains " - + ("infinity" if ensure_all_finite else "NaN, infinity") - + "." - ) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("ensure_all_finite", [False, True]) +@pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) @@ -112,19 +110,16 @@ def test_validate_data_random_shape_and_location( X[loc] = float(check) X = _convert_to_dataframe( - X, + np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, ) - if check is None or (ensure_all_finite and check == "NaN"): + allow_nan = ensure_all_finite == "allow_nan" + if check is None or (allow_nan and check == "NaN"): validate_data(est, X) else: - msg_err = ( - "Input contains " - + ("infinity" if ensure_all_finite else "NaN, infinity") - + "." - ) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index e3dd92b7ed..804fafdb48 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -37,20 +37,20 @@ def _is_contiguous(X): # array_api does not have a `strides` or `flags` attribute for testing memory - # order. 
When dlpack support is brought in for oneDAL, the dlpack object can - # then be inspected and this must be updated. _is_contiguous is therefore - # conservative in verifying attributes and does not support array_api. This - # will block onedal_assert_all_finite from being used for array api inputs. + # order. When dlpack support is brought in for oneDAL, the dlpack python capsule + # can then be inspected for strides and this must be updated. _is_contiguous is + # therefore conservative in verifying attributes and does not support array_api. + # This will block onedal_assert_all_finite from being used for array_api inputs. if hasattr(X, "flags") and X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]: return True return False -def _assert_all_finite_core(X, xp, *, allow_nan=False, input_name=""): - # This is a reproduction of code from sklearn.utils.validation - # necessary for older sklearn versions (<1.2) and for dpnp inputs - # which do not conform to the array_api standard, and cannot be - # checked in sklearn. +def _sycl_usm_assert_all_finite(X, xp, *, allow_nan=False, input_name=""): + # This is a reproduction of code from sklearn.utils.validation necessary for + # non-contiguous or non-fp32/fp64 dpctl inputs when sklearn version is <1.2 or + # for non-contiguous or non-fp32/fp64 dpnp inputs, as these cannot be checked + # for finiteness in sklearn nor onedal while preserving their object type. first_pass_isfinite = xp.isfinite(xp.sum(X)) if first_pass_isfinite: return @@ -66,7 +66,7 @@ def _assert_all_finite_core(X, xp, *, allow_nan=False, input_name=""): if sklearn_check_version("1.2"): - def _array_api_assert_all_finite( + def _general_assert_all_finite( X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" ): if _is_numpy_namespace(xp) or is_array_api_compliant: @@ -74,11 +74,11 @@ def _array_api_assert_all_finite( elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: return # handle dpnp inputs - _assert_all_finite_core(X, xp, allow_nan=allow_nan, input_name=input_name) + _sycl_usm_assert_all_finite(X, xp, allow_nan=allow_nan, input_name=input_name) else: - def _array_api_assert_all_finite( + def _general_assert_all_finite( X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" ): @@ -90,9 +90,8 @@ def _array_api_assert_all_finite( return elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: return - - # handle array_api and dpnp inputs - _assert_all_finite_core(X, xp, allow_nan, input_name=input_name) + # handle dpctl and dpnp inputs + _sycl_usm_assert_all_finite(X, xp, allow_nan, input_name=input_name) def _assert_all_finite( @@ -101,18 +100,22 @@ def _assert_all_finite( allow_nan=False, input_name="", ): - # array_api compliance in sklearn varies betweeen the support sklearn versions - # therefore a separate check matching sklearn's assert_all_finite is necessary - # when the data is not float32 or float64 but of a float type. The onedal - # assert_all_finite is only for float32 and float64 contiguous arrays. - - # initial match to daal4py, can be optimized later + # unlike sklearnex, sklearn does not support sycl_usm_ndarrays by default + # therefore a separate finite check implementation matching sklearn's + # `_assert_all_finite` is necessary when the data is not float32 or float64 or + # non-contiguous. The onedal assert_all_finite is only for float32 and float64 + # contiguous arrays. 
+ + # size check is an initial match to daal4py for performance reasons, can be + # optimized later xp, is_array_api_compliant = get_namespace(X) if X.size < 32768 or X.dtype not in [xp.float32, xp.float64] or not _is_contiguous(X): - # all non-numpy arrays for sklearn 1.0 and dpnp for sklearn are not handeled properly - # separate function for import-time sklearn version check - _array_api_assert_all_finite( + # all sycl_usm_ndarrays for sklearn < 1.2 and dpnp for sklearn > 1.2 are not + # handled properly, it calls a separate function for an import-time sklearn + # version check before possible hand-off to sklearn's _assert_all_finite or to + # _assert_all_finite_core. + _general_assert_all_finite( X, xp, is_array_api_compliant, allow_nan=allow_nan, input_name=input_name ) else: @@ -142,8 +145,8 @@ def validate_data( # force finite check to not occur in sklearn, default is True # `ensure_all_finite` is the most up-to-date keyword name in sklearn # _finite_keyword provides backward compatability for `force_all_finite` - ensure_all_finite = True if "ensure_all_finite" not in kwargs else kwargs.pop( - "ensure_all_finite" + ensure_all_finite = ( + True if "ensure_all_finite" not in kwargs else kwargs.pop("ensure_all_finite") ) kwargs[_finite_keyword] = False out = _sklearn_validate_data( From 83253b3cba87bbec4e5a16b5a75519013e93a5b2 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 07:13:13 +0100 Subject: [PATCH 087/131] fix text --- sklearnex/utils/validation.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 804fafdb48..5e85bc559d 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -50,7 +50,7 @@ def _sycl_usm_assert_all_finite(X, xp, *, allow_nan=False, input_name=""): # This is a reproduction of code from sklearn.utils.validation necessary for # non-contiguous or non-fp32/fp64 dpctl inputs when sklearn version is <1.2 or # for non-contiguous or non-fp32/fp64 dpnp inputs, as these cannot be checked - # for finiteness in sklearn nor onedal while preserving their object type. + # for finiteness in onedal or by sklearn (while preserving their object type). first_pass_isfinite = xp.isfinite(xp.sum(X)) if first_pass_isfinite: return @@ -100,12 +100,6 @@ def _assert_all_finite( allow_nan=False, input_name="", ): - # unlike sklearnex, sklearn does not support sycl_usm_ndarrays by default - # therefore a separate finite check implementation matching sklearn's - # `_assert_all_finite` is necessary when the data is not float32 or float64 or - # non-contiguous. The onedal assert_all_finite is only for float32 and float64 - # contiguous arrays. - # size check is an initial match to daal4py for performance reasons, can be # optimized later xp, is_array_api_compliant = get_namespace(X) @@ -114,7 +108,7 @@ def _assert_all_finite( # all sycl_usm_ndarrays for sklearn < 1.2 and dpnp for sklearn > 1.2 are not # handled properly, it calls a separate function for an import-time sklearn # version check before possible hand-off to sklearn's _assert_all_finite or to - # _assert_all_finite_core. + # _sycl_usm_assert_all_finite. 
_general_assert_all_finite( X, xp, is_array_api_compliant, allow_nan=allow_nan, input_name=input_name ) From b22e23a47d1cb88d94d71dc29cf61f2f3f39fcc3 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 08:22:54 +0100 Subject: [PATCH 088/131] fixes for some failures --- sklearnex/utils/tests/test_finite.py | 7 +++++-- sklearnex/utils/validation.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index f75ff33301..a790301a27 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -87,7 +87,9 @@ def test_validate_data_random_location( if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + msg_err = ( + "Input X contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + ) with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) @@ -119,7 +121,8 @@ def test_validate_data_random_shape_and_location( if check is None or (allow_nan and check == "NaN"): validate_data(est, X) else: - msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + type_err = "infinity" if allow_nan else "NaN, infinity" + msg_err = f"Input X contains {type_err}." with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 5e85bc559d..61cb9acba8 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -153,8 +153,8 @@ def validate_data( # run local finite check allow_nan = ensure_all_finite == "allow-nan" arg = iter(out) - if not isinstance(X, str) or X != "no_validation": + if X != "no_validation": assert_all_finite(next(arg), allow_nan=allow_nan, input_name="X") - if y is not None or not isinstance(y, str) or y != "no_validation": + if y is not None and y != "no_validation": assert_all_finite(next(arg), allow_nan=allow_nan, input_name="y") return out From 2f8ec169a563ccc1c0d6fadb9dc27ee68d25fec3 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 08:23:45 +0100 Subject: [PATCH 089/131] make consistent --- sklearnex/utils/tests/test_finite.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index a790301a27..157b79f6c7 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -87,9 +87,8 @@ def test_validate_data_random_location( if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - msg_err = ( - "Input X contains " + ("infinity" if allow_nan else "NaN, infinity") + "." - ) + type_err = "infinity" if allow_nan else "NaN, infinity" + msg_err = f"Input X contains {type_err}." 
with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) From 1fd9973d018eb1b059c85c555216ce2e9377daae Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 09:21:14 +0100 Subject: [PATCH 090/131] fix bad logic --- sklearnex/utils/validation.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 61cb9acba8..996299f37b 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -94,7 +94,7 @@ def _general_assert_all_finite( _sycl_usm_assert_all_finite(X, xp, allow_nan, input_name=input_name) -def _assert_all_finite( +def _sklearnex_assert_all_finite( X, *, allow_nan=False, @@ -122,7 +122,7 @@ def assert_all_finite( allow_nan=False, input_name="", ): - _assert_all_finite( + _sklearnex_assert_all_finite( X.data if sp.issparse(X) else X, allow_nan=allow_nan, input_name=input_name, @@ -139,9 +139,7 @@ def validate_data( # force finite check to not occur in sklearn, default is True # `ensure_all_finite` is the most up-to-date keyword name in sklearn # _finite_keyword provides backward compatability for `force_all_finite` - ensure_all_finite = ( - True if "ensure_all_finite" not in kwargs else kwargs.pop("ensure_all_finite") - ) + ensure_all_finite = kwargs.pop("ensure_all_finite", True) kwargs[_finite_keyword] = False out = _sklearn_validate_data( _estimator, @@ -153,8 +151,8 @@ def validate_data( # run local finite check allow_nan = ensure_all_finite == "allow-nan" arg = iter(out) - if X != "no_validation": + if not isinstance(X, str) or X != "no_validation": assert_all_finite(next(arg), allow_nan=allow_nan, input_name="X") - if y is not None and y != "no_validation": + if not (y is None or isinstance(y, str) and y == "no_validation"): assert_all_finite(next(arg), allow_nan=allow_nan, input_name="y") return out From c20c8cc5891d6b41e5ffb36898617c6d310344b2 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 10:03:43 +0100 Subject: [PATCH 091/131] fix in string --- sklearnex/utils/tests/test_finite.py | 4 ++-- sklearnex/utils/validation.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 157b79f6c7..c2dec65e00 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -83,7 +83,7 @@ def test_validate_data_random_location( sycl_queue=queue, ) - allow_nan = ensure_all_finite == "allow_nan" + allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: @@ -116,7 +116,7 @@ def test_validate_data_random_shape_and_location( sycl_queue=queue, ) - allow_nan = ensure_all_finite == "allow_nan" + allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): validate_data(est, X) else: diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 996299f37b..10257623a0 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -46,7 +46,7 @@ def _is_contiguous(X): return False -def _sycl_usm_assert_all_finite(X, xp, *, allow_nan=False, input_name=""): +def _assert_all_finite(X, xp, *, allow_nan=False, input_name=""): # This is a reproduction of code from sklearn.utils.validation necessary for # non-contiguous or non-fp32/fp64 dpctl inputs when sklearn version is <1.2 or # for non-contiguous or 
non-fp32/fp64 dpnp inputs, as these cannot be checked @@ -74,7 +74,7 @@ def _general_assert_all_finite( elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: return # handle dpnp inputs - _sycl_usm_assert_all_finite(X, xp, allow_nan=allow_nan, input_name=input_name) + _assert_all_finite(X, xp, allow_nan=allow_nan, input_name=input_name) else: @@ -91,7 +91,7 @@ def _general_assert_all_finite( elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: return # handle dpctl and dpnp inputs - _sycl_usm_assert_all_finite(X, xp, allow_nan, input_name=input_name) + _assert_all_finite(X, xp, allow_nan, input_name=input_name) def _sklearnex_assert_all_finite( From 1ce1b10df9ebd80cf5bf445373ff6e157cf4a207 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 10:57:15 +0100 Subject: [PATCH 092/131] attempt tp see if dataframe conversion is causing the issue --- sklearnex/utils/tests/test_finite.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index c2dec65e00..e8995fe6d0 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -77,11 +77,12 @@ def test_validate_data_random_location( loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) - X = _convert_to_dataframe( + _ = _convert_to_dataframe( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) + ) #test to see if convert_to_dataframe is causing problems + X = np.atleast_2d(X) allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): @@ -110,11 +111,12 @@ def test_validate_data_random_shape_and_location( loc = rand.randint(0, X.size - 1) X[loc] = float(check) - X = _convert_to_dataframe( + _ = _convert_to_dataframe( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) + ) #test to see if convert_to_dataframe is causing problems + X = np.atleast_2d(X) allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): From 5355039022d9f39c447f39c91ed46d65f4555810 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 13:46:18 +0100 Subject: [PATCH 093/131] fix iter problem --- sklearnex/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 10257623a0..acdd21323c 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -41,7 +41,7 @@ def _is_contiguous(X): # can then be inspected for strides and this must be updated. _is_contiguous is # therefore conservative in verifying attributes and does not support array_api. # This will block onedal_assert_all_finite from being used for array_api inputs. 
- if hasattr(X, "flags") and X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]: + if hasattr(X, "flags") and (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): return True return False @@ -150,7 +150,7 @@ def validate_data( if ensure_all_finite: # run local finite check allow_nan = ensure_all_finite == "allow-nan" - arg = iter(out) + arg = iter(out if isinstance(out, tuple) else (out,)) if not isinstance(X, str) or X != "no_validation": assert_all_finite(next(arg), allow_nan=allow_nan, input_name="X") if not (y is None or isinstance(y, str) and y == "no_validation"): From b5b84427f2b8c5d5ce39f34f75076190a36ffd6f Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 14:26:05 +0100 Subject: [PATCH 094/131] fix testing issues --- sklearnex/utils/tests/test_finite.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index e8995fe6d0..f20d95a05c 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -44,7 +44,7 @@ @pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): est = DummyEstimator() - X = np.array(shape, dtype=dtype) + X = np.empty(shape, dtype=dtype) X.fill(np.finfo(dtype).max) X = np.atleast_2d(X) X_array = validate_data(est, X, ensure_all_finite=ensure_all_finite) @@ -120,7 +120,7 @@ def test_validate_data_random_shape_and_location( allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): - validate_data(est, X) + validate_data(est, X, ensure_all_finite=ensure_all_finite) else: type_err = "infinity" if allow_nan else "NaN, infinity" msg_err = f"Input X contains {type_err}." 
@@ -129,26 +129,25 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("array_api_dispatch", [True, False]) +@pytest.mark.parametrize("array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False]) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): est = DummyEstimator() X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] dispatch = {} - if sklearn_check_version("1.2"): + if array_api_dispatch: + pytest.skip(dataframe == "pandas", "pandas inputs do not work with sklearn's array_api_dispatch") dispatch["array_api_dispatch"] = array_api_dispatch with config_context(**dispatch): - validate_data(est, X, y) - est.fit(X, y) + X_out, y_out = validate_data(est, X, y) + # check sklearn validate_data operations work underneath X_array = validate_data(est, X, reset=False) - X_out = est.predict(X) if dataframe == "pandas" or ( dataframe == "array_api" - and not (sklearn_check_version("1.2") and array_api_dispatch) - ): + and not array_api_dispatch): # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays assert isinstance(X_array, np.ndarray) assert isinstance(X_out, np.ndarray) From d025c89547d7eb5f21deba665dd42ed173925400 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 14:27:38 +0100 Subject: [PATCH 095/131] formatting --- sklearnex/utils/tests/test_finite.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index f20d95a05c..884b3ec6c5 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -81,7 +81,7 @@ def test_validate_data_random_location( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) #test to see if convert_to_dataframe is causing problems + ) # test to see if convert_to_dataframe is causing problems X = np.atleast_2d(X) allow_nan = ensure_all_finite == "allow-nan" @@ -115,7 +115,7 @@ def test_validate_data_random_shape_and_location( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) #test to see if convert_to_dataframe is causing problems + ) # test to see if convert_to_dataframe is causing problems X = np.atleast_2d(X) allow_nan = ensure_all_finite == "allow-nan" @@ -129,7 +129,9 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False]) +@pytest.mark.parametrize( + "array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False] +) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): est = DummyEstimator() @@ -137,7 +139,10 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): dispatch = {} if array_api_dispatch: - pytest.skip(dataframe == "pandas", "pandas inputs do not work with sklearn's array_api_dispatch") + pytest.skip( + dataframe == "pandas", + "pandas inputs do not work with sklearn's array_api_dispatch", + ) dispatch["array_api_dispatch"] = array_api_dispatch with config_context(**dispatch): @@ -145,9 +150,7 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): # check sklearn validate_data operations work underneath X_array = 
validate_data(est, X, reset=False) - if dataframe == "pandas" or ( - dataframe == "array_api" - and not array_api_dispatch): + if dataframe == "pandas" or (dataframe == "array_api" and not array_api_dispatch): # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays assert isinstance(X_array, np.ndarray) assert isinstance(X_out, np.ndarray) From 428bfb6f5a0db7df71a546d80e80221afbf8a32b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 14:31:43 +0100 Subject: [PATCH 096/131] revert change --- sklearnex/utils/tests/test_finite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 884b3ec6c5..6be0f50841 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -77,7 +77,7 @@ def test_validate_data_random_location( loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) - _ = _convert_to_dataframe( + X = _convert_to_dataframe( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, @@ -111,7 +111,7 @@ def test_validate_data_random_shape_and_location( loc = rand.randint(0, X.size - 1) X[loc] = float(check) - _ = _convert_to_dataframe( + X = _convert_to_dataframe( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, From da2313873bb0db18bbfbe88a4b0756b735cb5533 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 21 Nov 2024 05:38:41 -0800 Subject: [PATCH 097/131] fixes for pandas --- sklearnex/utils/tests/test_finite.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 6be0f50841..637d12b631 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -139,10 +139,8 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): dispatch = {} if array_api_dispatch: - pytest.skip( - dataframe == "pandas", - "pandas inputs do not work with sklearn's array_api_dispatch", - ) + if dataframe == "pandas": + pytest.skip("pandas inputs do not work with sklearn's array_api_dispatch") dispatch["array_api_dispatch"] = array_api_dispatch with config_context(**dispatch): From 1d0c330f513acd12af54ef5ca43286bf941585f9 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 21 Nov 2024 05:42:04 -0800 Subject: [PATCH 098/131] there is a slowdown with pandas that needs to be solved --- sklearnex/utils/tests/test_finite.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 637d12b631..2ad2341d6f 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -81,8 +81,7 @@ def test_validate_data_random_location( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) # test to see if convert_to_dataframe is causing problems - X = np.atleast_2d(X) + ) allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): @@ -115,8 +114,7 @@ def test_validate_data_random_shape_and_location( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) # test to see if convert_to_dataframe is causing problems - X = np.atleast_2d(X) + ) allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): From f3f63a6a11955670c3763c8cfd2932a0d4864aa7 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 14:56:33 +0100 Subject: [PATCH 099/131] swap to transpose for speed --- 
sklearnex/utils/tests/test_finite.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 2ad2341d6f..2904ff2bf3 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -77,8 +77,10 @@ def test_validate_data_random_location( loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) + # column heavy pandas inputs are very slow in sklearn's check_array + # transpose inputs to guarantee fast processing in tests X = _convert_to_dataframe( - np.atleast_2d(X), + np.atleast_2d(X).T, target_df=dataframe, sycl_queue=queue, ) @@ -111,7 +113,7 @@ def test_validate_data_random_shape_and_location( X[loc] = float(check) X = _convert_to_dataframe( - np.atleast_2d(X), + np.atleast_2d(X).T, target_df=dataframe, sycl_queue=queue, ) From 56c80545af46e1116f9445300fa4517f14476d32 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 14:58:07 +0100 Subject: [PATCH 100/131] more clarity --- sklearnex/utils/tests/test_finite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 2904ff2bf3..fdaec2e2e4 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -77,8 +77,8 @@ def test_validate_data_random_location( loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) - # column heavy pandas inputs are very slow in sklearn's check_array - # transpose inputs to guarantee fast processing in tests + # column heavy pandas inputs are very slow in sklearn's check_array even without + # the finite check, just transpose inputs to guarantee fast processing in tests X = _convert_to_dataframe( np.atleast_2d(X).T, target_df=dataframe, From 1580d770ed403475853cf4909438f61d028b1744 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 14:24:05 +0100 Subject: [PATCH 101/131] add _check_sample_weight --- sklearnex/utils/validation.py | 120 +++++++++++++++++----------------- 1 file changed, 59 insertions(+), 61 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index acdd21323c..72876bcae6 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -14,11 +14,16 @@ # limitations under the License. # =============================================================================== +import numbers +import warnings + +import numpy as np import scipy.sparse as sp from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite +from sklearn.utils.validation import _num_samples, check_array, check_non_negative from daal4py.sklearn._utils import sklearn_check_version -from onedal.utils._array_api import _is_numpy_namespace +from onedal.utils._array_api import _get_sycl_namespace, _is_numpy_namespace from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite from ._array_api import get_namespace @@ -41,57 +46,7 @@ def _is_contiguous(X): # can then be inspected for strides and this must be updated. _is_contiguous is # therefore conservative in verifying attributes and does not support array_api. # This will block onedal_assert_all_finite from being used for array_api inputs. 
- if hasattr(X, "flags") and (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): - return True - return False - - -def _assert_all_finite(X, xp, *, allow_nan=False, input_name=""): - # This is a reproduction of code from sklearn.utils.validation necessary for - # non-contiguous or non-fp32/fp64 dpctl inputs when sklearn version is <1.2 or - # for non-contiguous or non-fp32/fp64 dpnp inputs, as these cannot be checked - # for finiteness in onedal or by sklearn (while preserving their object type). - first_pass_isfinite = xp.isfinite(xp.sum(X)) - if first_pass_isfinite: - return - - has_inf = xp.any(xp.isinf(X)) - has_nan_error = False if allow_nan else xp.any(xp.isnan(X)) - if has_inf or has_nan_error: - type_err = "infinity" if allow_nan else "NaN, infinity" - padded_input_name = input_name + " " if input_name else "" - msg_err = f"Input {padded_input_name}contains {type_err}." - raise ValueError(msg_err) - - -if sklearn_check_version("1.2"): - - def _general_assert_all_finite( - X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" - ): - if _is_numpy_namespace(xp) or is_array_api_compliant: - _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) - elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: - return - # handle dpnp inputs - _assert_all_finite(X, xp, allow_nan=allow_nan, input_name=input_name) - -else: - - def _general_assert_all_finite( - X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" - ): - - if _is_numpy_namespace(xp): - _sklearn_assert_all_finite(X, allow_nan, input_name=input_name) - elif is_array_api_compliant and not xp.isdtype( - X, ("real floating", "complex floating") - ): - return - elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: - return - # handle dpctl and dpnp inputs - _assert_all_finite(X, xp, allow_nan, input_name=input_name) + return hasattr(X, "flags") and (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]) def _sklearnex_assert_all_finite( @@ -102,16 +57,9 @@ def _sklearnex_assert_all_finite( ): # size check is an initial match to daal4py for performance reasons, can be # optimized later - xp, is_array_api_compliant = get_namespace(X) + xp, _ = get_namespace(X) if X.size < 32768 or X.dtype not in [xp.float32, xp.float64] or not _is_contiguous(X): - - # all sycl_usm_ndarrays for sklearn < 1.2 and dpnp for sklearn > 1.2 are not - # handled properly, it calls a separate function for an import-time sklearn - # version check before possible hand-off to sklearn's _assert_all_finite or to - # _sycl_usm_assert_all_finite. 
- _general_assert_all_finite( - X, xp, is_array_api_compliant, allow_nan=allow_nan, input_name=input_name - ) + _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) else: _onedal_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) @@ -141,6 +89,7 @@ def validate_data( # _finite_keyword provides backward compatability for `force_all_finite` ensure_all_finite = kwargs.pop("ensure_all_finite", True) kwargs[_finite_keyword] = False + out = _sklearn_validate_data( _estimator, X=X, @@ -156,3 +105,52 @@ def validate_data( if not (y is None or isinstance(y, str) and y == "no_validation"): assert_all_finite(next(arg), allow_nan=allow_nan, input_name="y") return out + + +def _check_sample_weight( + sample_weight, X, dtype=None, copy=False, only_non_negative=False +): + + n_samples = _num_samples(X) + xp, _ = get_namespace(X) + + if dtype is not None and dtype not in [xp.float32, xp.float64]: + dtype = xp.float64 + + if sample_weight is None: + sample_weight = xp.ones(n_samples, dtype=dtype) + elif isinstance(sample_weight, numbers.Number): + sample_weight = xp.full(n_samples, sample_weight, dtype=dtype) + else: + if dtype is None: + dtype = [xp.float64, xp.float32] + + # create param dict such that the variable finite_keyword can + # be added to it without direct sklearn_check_version maintenance + params = { + "accept_sparse": False, + "ensure_2d": False, + "dtype": dtype, + "order": "C", + "copy": copy, + "input_name": "sample_weight", + _finite_keyword: False, + } + + sample_weight = check_array(sample_weight, **params) + assert_all_finite(sample_weight, input_name="sample_weight") + + if sample_weight.ndim != 1: + raise ValueError("Sample weights must be 1D array or scalar") + + if sample_weight.shape != (n_samples,): + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) + + if only_non_negative: + check_non_negative(sample_weight, "`sample_weight`") + + return sample_weight From ffc9f1f33c361495177e8277f9d6fdda4bcce449 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:07:22 +0100 Subject: [PATCH 102/131] add more testing' --- .../{test_finite.py => test_validation.py} | 76 ++++++++++++++++++- sklearnex/utils/validation.py | 2 +- 2 files changed, 75 insertions(+), 3 deletions(-) rename sklearnex/utils/tests/{test_finite.py => test_validation.py} (66%) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_validation.py similarity index 66% rename from sklearnex/utils/tests/test_finite.py rename to sklearnex/utils/tests/test_validation.py index fdaec2e2e4..31530c4866 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_validation.py @@ -27,7 +27,7 @@ ) from sklearnex import config_context from sklearnex.tests.utils import DummyEstimator, gen_dataset -from sklearnex.utils.validation import validate_data +from sklearnex.utils.validation import _check_sample_weight, validate_data @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -129,11 +129,83 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize( "array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False] ) -@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("seed", [0, int(time.time())]) +@pytest.mark.parametrize( + "dataframe, queue", + 
get_dataframes_and_queues( + "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") + ), +) +def test__check_sample_weights_random_shape_and_location( + dataframe, queue, dtype, array_api_dispatch, check, seed +): + # This testing assumes that array api inputs to validate_data will only occur + # with sklearn array_api support which began in sklearn 1.2. This would assume + # that somewhere upstream of the validate_data call, a data conversion of dpnp, + # dpctl, or array_api inputs to numpy inputs would have occurred. + + lb, ub = 32768, 1048576 # lb is a patching condition, ub 2^20 + rand.seed(seed) + shape = (rand.randint(lb, ub), 2) + X = rand.uniform(high=np.finfo(dtype).max, size=shape).astype(dtype) + sample_weight = rand.uniform(high=np.finfo(dtype).max, size=shape[0]).astype(dtype) + + if check: + loc = rand.randint(0, shape[0] - 1) + sample_weight[loc] = float(check) + + X = _convert_to_dataframe( + X, + target_df=dataframe, + sycl_queue=queue, + ) + sample_weight = _convert_to_dataframe( + sample_weight, + target_df=dataframe, + sycl_queue=queue, + ) + + dispatch = {} + if array_api_dispatch: + if dataframe == "pandas": + pytest.skip("pandas inputs do not work with sklearn's array_api_dispatch") + dispatch["array_api_dispatch"] = array_api_dispatch + + with config_context(**dispatch): + + if check is None: + X_out = _check_sample_weight(X, sample_weight) + if dataframe == "pandas" or ( + dataframe == "array_api" and not array_api_dispatch + ): + assert isinstance(X, np.ndarray) + else: + assert type(X_out) == type(X) + else: + msg_err = "Input sample_weight contains NaN, infinity." + with pytest.raises(ValueError, match=msg_err): + X_out = _check_sample_weight(X, sample_weight) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize( + "array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False] +) +@pytest.mark.parametrize( + "dataframe, queue", + get_dataframes_and_queues( + "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") + ), +) def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): + # This testing assumes that array api inputs to validate_data will only occur + # with sklearn array_api support which began in sklearn 1.2. This would assume + # that somewhere upstream of the validate_data call, a data conversion of dpnp, + # dpctl, or array_api inputs to numpy inputs would have occurred. 
est = DummyEstimator() X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 72876bcae6..f0ed55d86a 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -125,7 +125,7 @@ def _check_sample_weight( if dtype is None: dtype = [xp.float64, xp.float32] - # create param dict such that the variable finite_keyword can + # create param dict such that the variable _finite_keyword can # be added to it without direct sklearn_check_version maintenance params = { "accept_sparse": False, From d184ed044c1bd26b6f38362a6d706331e49714db Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:09:29 +0100 Subject: [PATCH 103/131] rename --- sklearnex/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 31530c4866..13934acc7c 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -140,7 +140,7 @@ def test_validate_data_random_shape_and_location( "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") ), ) -def test__check_sample_weights_random_shape_and_location( +def test__check_sample_weight_random_shape_and_location( dataframe, queue, dtype, array_api_dispatch, check, seed ): # This testing assumes that array api inputs to validate_data will only occur From c68616f26b77f39e2dfcc7f502efb5079583070b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:18:58 +0100 Subject: [PATCH 104/131] remove unnecessary imports --- sklearnex/utils/validation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index f0ed55d86a..17a83ea054 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -15,15 +15,12 @@ # =============================================================================== import numbers -import warnings -import numpy as np import scipy.sparse as sp from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite from sklearn.utils.validation import _num_samples, check_array, check_non_negative from daal4py.sklearn._utils import sklearn_check_version -from onedal.utils._array_api import _get_sycl_namespace, _is_numpy_namespace from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite from ._array_api import get_namespace From e7ea94e3fea7d28e213bd36a8816499742dfc15f Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:42:59 +0100 Subject: [PATCH 105/131] fix test slowness --- sklearnex/utils/tests/test_validation.py | 40 +++++++++--------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 13934acc7c..d1976decce 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -130,9 +130,6 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) -@pytest.mark.parametrize( - "array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False] -) @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize( "dataframe, queue", @@ -141,7 +138,7 @@ def test_validate_data_random_shape_and_location( ), ) def 
test__check_sample_weight_random_shape_and_location( - dataframe, queue, dtype, array_api_dispatch, check, seed + dataframe, queue, dtype, check, seed ): # This testing assumes that array api inputs to validate_data will only occur # with sklearn array_api support which began in sklearn 1.2. This would assume @@ -170,21 +167,17 @@ def test__check_sample_weight_random_shape_and_location( ) dispatch = {} - if array_api_dispatch: - if dataframe == "pandas": - pytest.skip("pandas inputs do not work with sklearn's array_api_dispatch") - dispatch["array_api_dispatch"] = array_api_dispatch + if dataframe in ["array_api", "dpctl"]: + dispatch["array_api_dispatch"] = True with config_context(**dispatch): if check is None: X_out = _check_sample_weight(X, sample_weight) - if dataframe == "pandas" or ( - dataframe == "array_api" and not array_api_dispatch - ): - assert isinstance(X, np.ndarray) - else: + if dispatch: assert type(X_out) == type(X) + else: + assert isinstance(X, np.ndarray) else: msg_err = "Input sample_weight contains NaN, infinity." with pytest.raises(ValueError, match=msg_err): @@ -192,16 +185,13 @@ def test__check_sample_weight_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize( - "array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False] -) @pytest.mark.parametrize( "dataframe, queue", get_dataframes_and_queues( "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") ), ) -def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): +def test_validate_data_output(dtype, dataframe, queue): # This testing assumes that array api inputs to validate_data will only occur # with sklearn array_api support which began in sklearn 1.2. 
This would assume # that somewhere upstream of the validate_data call, a data conversion of dpnp, @@ -210,22 +200,20 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] dispatch = {} - if array_api_dispatch: - if dataframe == "pandas": - pytest.skip("pandas inputs do not work with sklearn's array_api_dispatch") - dispatch["array_api_dispatch"] = array_api_dispatch + if dataframe in ["array_api", "dpctl"]: + dispatch["array_api_dispatch"] = True with config_context(**dispatch): X_out, y_out = validate_data(est, X, y) # check sklearn validate_data operations work underneath X_array = validate_data(est, X, reset=False) - if dataframe == "pandas" or (dataframe == "array_api" and not array_api_dispatch): - # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays - assert isinstance(X_array, np.ndarray) - assert isinstance(X_out, np.ndarray) - else: + if dispatch: assert type(X) == type( X_array ), f"validate_data converted {type(X)} to {type(X_array)}" assert type(X) == type(X_out), f"from_array converted {type(X)} to {type(X_out)}" + else: + # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays + assert isinstance(X_array, np.ndarray) + assert isinstance(X_out, np.ndarray) From dbe108dd0d9c09bc4ec9801c9dce9a71739e874b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:45:40 +0100 Subject: [PATCH 106/131] focus get_dataframes_and_queues --- sklearnex/utils/tests/test_validation.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index d1976decce..64bc18e280 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -65,7 +65,12 @@ def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): @pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) -@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +@pytest.mark.parametrize( + "dataframe, queue", + get_dataframes_and_queues( + "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") + ), +) def test_validate_data_random_location( dataframe, queue, dtype, shape, ensure_all_finite, check, seed ): @@ -99,7 +104,12 @@ def test_validate_data_random_location( @pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) -@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +@pytest.mark.parametrize( + "dataframe, queue", + get_dataframes_and_queues( + "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") + ), +) def test_validate_data_random_shape_and_location( dataframe, queue, dtype, ensure_all_finite, check, seed ): From 7284b59910cd839d571a6c586a9b302fcb5d5760 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:50:22 +0100 Subject: [PATCH 107/131] put config_context around --- sklearnex/utils/tests/test_validation.py | 44 +++++++++++++++--------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 64bc18e280..3c1978e127 100644 --- a/sklearnex/utils/tests/test_validation.py +++ 
b/sklearnex/utils/tests/test_validation.py @@ -90,14 +90,20 @@ def test_validate_data_random_location( sycl_queue=queue, ) - allow_nan = ensure_all_finite == "allow-nan" - if check is None or (allow_nan and check == "NaN"): - validate_data(est, X, ensure_all_finite=ensure_all_finite) - else: - type_err = "infinity" if allow_nan else "NaN, infinity" - msg_err = f"Input X contains {type_err}." - with pytest.raises(ValueError, match=msg_err): + dispatch = {} + if sklearn_check_version("1.2") and dataframe != "pandas": + dispatch["array_api_dispatch"] = True + + with config_context(**dispatch): + + allow_nan = ensure_all_finite == "allow-nan" + if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) + else: + type_err = "infinity" if allow_nan else "NaN, infinity" + msg_err = f"Input X contains {type_err}." + with pytest.raises(ValueError, match=msg_err): + validate_data(est, X, ensure_all_finite=ensure_all_finite) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -128,14 +134,20 @@ def test_validate_data_random_shape_and_location( sycl_queue=queue, ) - allow_nan = ensure_all_finite == "allow-nan" - if check is None or (allow_nan and check == "NaN"): - validate_data(est, X, ensure_all_finite=ensure_all_finite) - else: - type_err = "infinity" if allow_nan else "NaN, infinity" - msg_err = f"Input X contains {type_err}." - with pytest.raises(ValueError, match=msg_err): + dispatch = {} + if sklearn_check_version("1.2") and dataframe != "pandas": + dispatch["array_api_dispatch"] = True + + with config_context(**dispatch): + + allow_nan = ensure_all_finite == "allow-nan" + if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) + else: + type_err = "infinity" if allow_nan else "NaN, infinity" + msg_err = f"Input X contains {type_err}." 
+ with pytest.raises(ValueError, match=msg_err): + validate_data(est, X, ensure_all_finite=ensure_all_finite) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -177,7 +189,7 @@ def test__check_sample_weight_random_shape_and_location( ) dispatch = {} - if dataframe in ["array_api", "dpctl"]: + if sklearn_check_version("1.2") and dataframe != "pandas": dispatch["array_api_dispatch"] = True with config_context(**dispatch): @@ -210,7 +222,7 @@ def test_validate_data_output(dtype, dataframe, queue): X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] dispatch = {} - if dataframe in ["array_api", "dpctl"]: + if sklearn_check_version("1.2") and dataframe != "pandas": dispatch["array_api_dispatch"] = True with config_context(**dispatch): From e1be91d13c5cef9815ecc9d8a7c3ced8e7386efa Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 24 Nov 2024 14:28:54 +0100 Subject: [PATCH 108/131] Update test_validation.py --- sklearnex/utils/tests/test_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 3c1978e127..dc6117e6d4 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -195,7 +195,7 @@ def test__check_sample_weight_random_shape_and_location( with config_context(**dispatch): if check is None: - X_out = _check_sample_weight(X, sample_weight) + X_out = _check_sample_weight(sample_weight, X) if dispatch: assert type(X_out) == type(X) else: @@ -203,7 +203,7 @@ def test__check_sample_weight_random_shape_and_location( else: msg_err = "Input sample_weight contains NaN, infinity." with pytest.raises(ValueError, match=msg_err): - X_out = _check_sample_weight(X, sample_weight) + X_out = _check_sample_weight(sample_weight, X) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) From 8a0f9e9dd1219d2ad1e514c9fecd9055cdfb0d60 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 24 Nov 2024 15:20:57 +0100 Subject: [PATCH 109/131] Update base.py --- sklearnex/tests/utils/base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index e484423cfc..706de39a91 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -47,7 +47,6 @@ NearestNeighbors, ) from sklearnex.svm import SVC, NuSVC -from sklearnex.utils.validation import validate_data def _load_all_models(with_sklearnex=True, estimator=True): @@ -378,8 +377,6 @@ def _get_processor_info(): class DummyEstimator(BaseEstimator): def fit(self, X, y=None): - X, y = validate_data(self, X, y) - sua_iface, xp, _ = _get_sycl_namespace(X) X_table = to_table(X) y_table = to_table(y) From 52722077467d2844dcce2233f24efb7e5dafd7f4 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 24 Nov 2024 21:07:29 +0100 Subject: [PATCH 110/131] Update test_validation.py --- sklearnex/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index dc6117e6d4..d366a74560 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -199,7 +199,7 @@ def test__check_sample_weight_random_shape_and_location( if dispatch: assert type(X_out) == type(X) else: - assert isinstance(X, np.ndarray) + assert isinstance(X_out, np.ndarray) else: msg_err = "Input sample_weight contains NaN, infinity." 
with pytest.raises(ValueError, match=msg_err): From 56b5c4c4730de70243cb158e88ddda6ac38bc082 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Mon, 25 Nov 2024 06:46:06 +0100 Subject: [PATCH 111/131] generalize regex --- sklearnex/utils/tests/test_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index d366a74560..3f7fb0758d 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -100,8 +100,8 @@ def test_validate_data_random_location( if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - type_err = "infinity" if allow_nan else "NaN, infinity" - msg_err = f"Input X contains {type_err}." + type_err = "infinity" if allow_nan else "[NaN|infinity]" + msg_err = f"Input X contains {type_err}" with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) @@ -144,7 +144,7 @@ def test_validate_data_random_shape_and_location( if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - type_err = "infinity" if allow_nan else "NaN, infinity" + type_err = "infinity" if allow_nan else "[NaN|infinity]" msg_err = f"Input X contains {type_err}." with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) @@ -201,7 +201,7 @@ def test__check_sample_weight_random_shape_and_location( else: assert isinstance(X_out, np.ndarray) else: - msg_err = "Input sample_weight contains NaN, infinity." + msg_err = "Input sample_weight contains [NaN|infinity]" with pytest.raises(ValueError, match=msg_err): X_out = _check_sample_weight(sample_weight, X) From 0d1b30607d0fa12c93eb7eeaf9c1b818cee44467 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Mon, 25 Nov 2024 10:29:42 +0100 Subject: [PATCH 112/131] add fixes for sklearn 1.0 and input_name --- sklearnex/utils/validation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 17a83ea054..76470091ce 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -56,7 +56,10 @@ def _sklearnex_assert_all_finite( # optimized later xp, _ = get_namespace(X) if X.size < 32768 or X.dtype not in [xp.float32, xp.float64] or not _is_contiguous(X): - _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) + if sklearn_check_version("1.1"): + _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) + else: + _sklearn_assert_all_finite(X, allow_nan=allow_nan) else: _onedal_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) @@ -122,17 +125,16 @@ def _check_sample_weight( if dtype is None: dtype = [xp.float64, xp.float32] - # create param dict such that the variable _finite_keyword can - # be added to it without direct sklearn_check_version maintenance params = { "accept_sparse": False, "ensure_2d": False, "dtype": dtype, "order": "C", "copy": copy, - "input_name": "sample_weight", _finite_keyword: False, } + if sklearn_check_version("1.1"): + params["input_name"] = "sample_weight" sample_weight = check_array(sample_weight, **params) assert_all_finite(sample_weight, input_name="sample_weight") From 8ff312eecb289815b8e5ff65c558a39dadb1a72d Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Mon, 25 Nov 2024 10:35:24 +0100 Subject: [PATCH 113/131] fixes for 
test failures --- sklearnex/utils/tests/test_validation.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 3f7fb0758d..92ba0d742a 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -29,6 +29,13 @@ from sklearnex.tests.utils import DummyEstimator, gen_dataset from sklearnex.utils.validation import _check_sample_weight, validate_data +# array_api support starts in sklearn 1.2, and array_api_strict conformance starts in sklearn 1.3 +_dataframes_supported = ( + "numpy,pandas" + + (",dpctl" if sklearn_check_version("1.2") else "") + + (",array_api" if sklearn_check_version("1.3") else "") +) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize( @@ -67,9 +74,7 @@ def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize( "dataframe, queue", - get_dataframes_and_queues( - "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") - ), + get_dataframes_and_queues(_dataframes_supported), ) def test_validate_data_random_location( dataframe, queue, dtype, shape, ensure_all_finite, check, seed @@ -112,9 +117,7 @@ def test_validate_data_random_location( @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize( "dataframe, queue", - get_dataframes_and_queues( - "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") - ), + get_dataframes_and_queues(_dataframes_supported), ) def test_validate_data_random_shape_and_location( dataframe, queue, dtype, ensure_all_finite, check, seed @@ -155,9 +158,7 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize( "dataframe, queue", - get_dataframes_and_queues( - "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") - ), + get_dataframes_and_queues(_dataframes_supported), ) def test__check_sample_weight_random_shape_and_location( dataframe, queue, dtype, check, seed @@ -209,9 +210,7 @@ def test__check_sample_weight_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize( "dataframe, queue", - get_dataframes_and_queues( - "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") - ), + get_dataframes_and_queues(_dataframes_supported), ) def test_validate_data_output(dtype, dataframe, queue): # This testing assumes that array api inputs to validate_data will only occur From 87b7e3b461c431d07da1114c15ea8e9ca3c9c4b9 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 25 Nov 2024 21:42:18 +0100 Subject: [PATCH 114/131] Update validation.py --- onedal/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 836dd84a75..38dcfd3fb3 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -449,7 +449,7 @@ def _assert_all_finite(X, allow_nan=False, input_name=""): policy = _get_policy(None, X) X_t = to_table(_convert_to_supported(policy, X)) params = { - "fptype": "float" if X_t.dtype == np.float32 else "double", + "fptype": X_t.dtype, "method": "dense", "allow_nan": allow_nan, } From 29e8f8c1a34aad809695d86d55bb197bb6e3fae1 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 25 Nov 2024 21:42:56 +0100 Subject: [PATCH 
115/131] Update test_validation.py --- onedal/utils/tests/test_validation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 7662f486f3..37486f0337 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -137,9 +137,8 @@ def test_assert_finite_sparse(dtype, allow_nan, check, seed): ) if check: - locx = rand.randint(0, X.shape[0] - 1) - locy = rand.randint(0, X.shape[1] - 1) - X[locx, locy] = float(check) + locx = rand.randint(0, X.data.shape[0] - 1) + X.data[locx] = float(check) if check is None or (allow_nan and check == "NaN"): assert_all_finite(X, allow_nan=allow_nan) From 27ce5fc64fa75c2bcde2eac87f7fec7899cf2416 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 27 Nov 2024 11:49:57 +0100 Subject: [PATCH 116/131] Update validation.py --- sklearnex/utils/validation.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 76470091ce..479c0b300d 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -20,8 +20,7 @@ from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite from sklearn.utils.validation import _num_samples, check_array, check_non_negative -from daal4py.sklearn._utils import sklearn_check_version -from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite +from daal4py.sklearn._utils import daal_check_version, sklearn_check_version from ._array_api import get_namespace @@ -37,13 +36,26 @@ _finite_keyword = "force_all_finite" -def _is_contiguous(X): - # array_api does not have a `strides` or `flags` attribute for testing memory - # order. When dlpack support is brought in for oneDAL, the dlpack python capsule - # can then be inspected for strides and this must be updated. _is_contiguous is - # therefore conservative in verifying attributes and does not support array_api. - # This will block onedal_assert_all_finite from being used for array_api inputs. - return hasattr(X, "flags") and (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]) +if daal_check_version(2024, "P", 700): + from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite + + def _onedal_supported_format(X, xp=None): + # array_api does not have a `strides` or `flags` attribute for testing memory + # order. When dlpack support is brought in for oneDAL, general support for + # array_api can be enabled and the hasattr check can be removed. + # _onedal_supported_format is therefore conservative in verifying attributes and + # does not support array_api. This will block onedal_assert_all_finite from being + # used for array_api inputs but will allow dpnp ndarrays and dpctl tensors. 
+ return X.dtype in [xp.float32, xp.float64] and hasattr(X, "flags") + +else: + from daal4py.utils.validation import _assert_all_finite as _onedal_assert_all_finite + from onedal.utils._array_api import _is_numpy_namespace + + def _onedal_supported_format(X, xp=None): + # daal4py _assert_all_finite only supports numpy namespaces, use internally- + # defined check to validate inputs, otherwise offload to sklearn + return X.dtype in [xp.float32, xp.float64] and _is_numpy_namespace(xp) def _sklearnex_assert_all_finite( @@ -55,7 +67,7 @@ def _sklearnex_assert_all_finite( # size check is an initial match to daal4py for performance reasons, can be # optimized later xp, _ = get_namespace(X) - if X.size < 32768 or X.dtype not in [xp.float32, xp.float64] or not _is_contiguous(X): + if X.size < 32768 or not _onedal_supported_format(X, xp): if sklearn_check_version("1.1"): _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) else: From 5d31988df229e45c116bfbb8a0c21db0ee3bbc32 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 27 Nov 2024 11:50:39 +0100 Subject: [PATCH 117/131] formattintg --- sklearnex/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 479c0b300d..19f8fed17a 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -38,7 +38,7 @@ if daal_check_version(2024, "P", 700): from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite - + def _onedal_supported_format(X, xp=None): # array_api does not have a `strides` or `flags` attribute for testing memory # order. When dlpack support is brought in for oneDAL, general support for @@ -51,7 +51,7 @@ def _onedal_supported_format(X, xp=None): else: from daal4py.utils.validation import _assert_all_finite as _onedal_assert_all_finite from onedal.utils._array_api import _is_numpy_namespace - + def _onedal_supported_format(X, xp=None): # daal4py _assert_all_finite only supports numpy namespaces, use internally- # defined check to validate inputs, otherwise offload to sklearn From c4dccd61076198dd8d225071c7ada649b6223685 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 27 Nov 2024 11:55:47 +0100 Subject: [PATCH 118/131] make suggested changes --- sklearnex/utils/tests/test_validation.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 92ba0d742a..70da28dbce 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -42,9 +42,7 @@ "shape", [ [16, 2048], - [ - 2**16 + 3, - ], + [2**16 + 3], [1000, 1000], ], ) @@ -63,9 +61,7 @@ def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): "shape", [ [16, 2048], - [ - 2**16 + 3, - ], + [2**16 + 3], [1000, 1000], ], ) From f83f1ef1a3217a1997434c09fbe8efc6777ecdc8 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 27 Nov 2024 11:59:43 +0100 Subject: [PATCH 119/131] follow changes made in #2126 --- onedal/utils/tests/test_validation.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 37486f0337..31eb8da2cc 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -33,9 +33,7 @@ "shape", [ [16, 2048], - [ - 2**16 + 3, - ], + [65539], # 2**16 + 3, [1000, 1000], [ 3, @@ -58,9 +56,7 @@ def 
test_sum_infinite_actually_finite(dtype, shape, allow_nan, dataframe, queue) "shape", [ [16, 2048], - [ - 2**16 + 3, - ], + [65539], # 2**16 + 3, [1000, 1000], [ 3, From e43c047cb109bb85eb822dd974f80b452b756230 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 27 Nov 2024 12:09:44 +0100 Subject: [PATCH 120/131] fix future device problem --- sklearnex/utils/validation.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 19f8fed17a..80edd2ec57 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -130,9 +130,17 @@ def _check_sample_weight( dtype = xp.float64 if sample_weight is None: - sample_weight = xp.ones(n_samples, dtype=dtype) + if hasattr(X, "device"): + sample_weight = xp.ones(n_samples, dtype=dtype, device=X.device) + else: + sample_weight = xp.ones(n_samples, dtype=dtype) elif isinstance(sample_weight, numbers.Number): - sample_weight = xp.full(n_samples, sample_weight, dtype=dtype) + if hasattr(X, "device"): + sample_weight = xp.full( + n_samples, sample_weight, dtype=dtype, device=X.device + ) + else: + sample_weight = xp.full(n_samples, sample_weight, dtype=dtype) else: if dtype is None: dtype = [xp.float64, xp.float32] From 5c81f9df84f6ca603f8a16516cb6391ed9be2684 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 27 Nov 2024 17:55:17 +0100 Subject: [PATCH 121/131] Update validation.py --- sklearnex/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 80edd2ec57..c2ba2c1dc5 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -36,7 +36,7 @@ _finite_keyword = "force_all_finite" -if daal_check_version(2024, "P", 700): +if daal_check_version((2024, "P", 700)): from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite def _onedal_supported_format(X, xp=None): From 164435de60077f573f19244dcb682b36b4f6b513 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 4 Dec 2024 21:54:14 +0100 Subject: [PATCH 122/131] minor changes based on #2206, suggestions --- sklearnex/utils/tests/test_validation.py | 2 +- sklearnex/utils/validation.py | 35 ++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 70da28dbce..c770abd495 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -1,5 +1,5 @@ # ============================================================================== -# Copyright 2024 Intel Corporation +# Copyright 2024 UXL Foundation Contributors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index c2ba2c1dc5..4e908a31ce 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -46,7 +46,13 @@ def _onedal_supported_format(X, xp=None): # _onedal_supported_format is therefore conservative in verifying attributes and # does not support array_api. This will block onedal_assert_all_finite from being # used for array_api inputs but will allow dpnp ndarrays and dpctl tensors. 
- return X.dtype in [xp.float32, xp.float64] and hasattr(X, "flags") + # only check contiguous arrays to prevent unnecessary copying of data, even if + # non-contiguous arrays can now be converted to oneDAL tables. + return ( + X.dtype in [xp.float32, xp.float64] + and hasattr(X, "flags") + and (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]) + ) else: from daal4py.utils.validation import _assert_all_finite as _onedal_assert_all_finite @@ -108,14 +114,37 @@ def validate_data( y=y, **kwargs, ) + + check_x = not isinstance(X, str) or X != "no_validation" + check_y = not (y is None or isinstance(y, str) and y == "no_validation") + if ensure_all_finite: # run local finite check allow_nan = ensure_all_finite == "allow-nan" arg = iter(out if isinstance(out, tuple) else (out,)) - if not isinstance(X, str) or X != "no_validation": + if check_x: assert_all_finite(next(arg), allow_nan=allow_nan, input_name="X") - if not (y is None or isinstance(y, str) and y == "no_validation"): + if check_y: assert_all_finite(next(arg), allow_nan=allow_nan, input_name="y") + + if check_y and "dtype" in kwargs: + # validate_data does not do full dtype conversions, as it uses check_X_y + # oneDAL can make tables from [int32, float32, float64], requiring + # a dtype check and conversion. This will query the array_namespace and + # convert y as necessary. This is done after assert_all_finite, because + # int y arrays do not need to finite check, and this will lead to a speedup + # in comparison to sklearn + dtype = kwargs["dtype"] + if not isinstance(dtype, (tuple, list)): + dtype = tuple(dtype) + + outx, outy = out if check_x else (None, out) + if outy.dtype not in dtype: + yp, _ = get_namespace(outy) + # use asarray rather than astype because of numpy support + outy = yp.asarray(outy, dtype=dtype[0]) + out = (outx, outy) if check_x else outy + return out From 4aff9e025f23137e877739df0bf3fc4cba3a65c2 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 5 Dec 2024 10:05:36 +0100 Subject: [PATCH 123/131] remove xp as keyword --- sklearnex/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 4e908a31ce..cfc5106aab 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -39,7 +39,7 @@ if daal_check_version((2024, "P", 700)): from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite - def _onedal_supported_format(X, xp=None): + def _onedal_supported_format(X, xp): # array_api does not have a `strides` or `flags` attribute for testing memory # order. When dlpack support is brought in for oneDAL, general support for # array_api can be enabled and the hasattr check can be removed. 
@@ -58,7 +58,7 @@ def _onedal_supported_format(X, xp=None): from daal4py.utils.validation import _assert_all_finite as _onedal_assert_all_finite from onedal.utils._array_api import _is_numpy_namespace - def _onedal_supported_format(X, xp=None): + def _onedal_supported_format(X, xp): # daal4py _assert_all_finite only supports numpy namespaces, use internally- # defined check to validate inputs, otherwise offload to sklearn return X.dtype in [xp.float32, xp.float64] and _is_numpy_namespace(xp) From db11608bd83ff0bb657fca137949fe3cc5a2af94 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 5 Dec 2024 10:15:17 +0100 Subject: [PATCH 124/131] only_non_negative -> ensure_non_negative --- sklearnex/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index cfc5106aab..3361ae0c39 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -149,7 +149,7 @@ def validate_data( def _check_sample_weight( - sample_weight, X, dtype=None, copy=False, only_non_negative=False + sample_weight, X, dtype=None, copy=False, ensure_non_negative=False ): n_samples = _num_samples(X) @@ -198,7 +198,7 @@ def _check_sample_weight( ) ) - if only_non_negative: + if ensure_non_negative: check_non_negative(sample_weight, "`sample_weight`") return sample_weight From bf62e50b0d84db076a408523f1d4f229746544db Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 5 Dec 2024 10:42:58 +0100 Subject: [PATCH 125/131] add commentary --- sklearnex/utils/validation.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 3361ae0c39..298a1c2eda 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -121,6 +121,11 @@ def validate_data( if ensure_all_finite: # run local finite check allow_nan = ensure_all_finite == "allow-nan" + # the return object from validate_data can (annoyingly) be a single + # element (either x or y) or both (as a tuple). An iterator along with + # check_x and check_y can go through the output properly without + # stacking layers of if statements to make sure the proper input_name + # is used arg = iter(out if isinstance(out, tuple) else (out,)) if check_x: assert_all_finite(next(arg), allow_nan=allow_nan, input_name="X") @@ -131,9 +136,7 @@ def validate_data( # validate_data does not do full dtype conversions, as it uses check_X_y # oneDAL can make tables from [int32, float32, float64], requiring # a dtype check and conversion. This will query the array_namespace and - # convert y as necessary. This is done after assert_all_finite, because - # int y arrays do not need to finite check, and this will lead to a speedup - # in comparison to sklearn + # convert y as necessary. This is important especially for regressors. 
dtype = kwargs["dtype"] if not isinstance(dtype, (tuple, list)): dtype = tuple(dtype) From 993a27207cea1722851406fd214a8f1eb0bcba77 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 5 Dec 2024 10:43:12 +0100 Subject: [PATCH 126/131] formatting --- sklearnex/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 298a1c2eda..b5c8b89251 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -121,7 +121,7 @@ def validate_data( if ensure_all_finite: # run local finite check allow_nan = ensure_all_finite == "allow-nan" - # the return object from validate_data can (annoyingly) be a single + # the return object from validate_data can (annoyingly) be a single # element (either x or y) or both (as a tuple). An iterator along with # check_x and check_y can go through the output properly without # stacking layers of if statements to make sure the proper input_name From c034883d96b1d2bfa369e3f7a1f84d2aca5fb81d Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 5 Dec 2024 11:16:32 +0100 Subject: [PATCH 127/131] address changes --- sklearnex/tests/utils/base.py | 5 ++++- sklearnex/utils/tests/test_validation.py | 22 +++++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 706de39a91..1fd5b25e92 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -388,7 +388,10 @@ def fit(self, X, y=None): X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp ) self.y_attr_ = from_table( - y_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp + y_table, + sua_iface=sua_iface, + sycl_queue=y.sycl_queue if y else X.sycl_queue, + xp=xp, ) else: self.x_attr = from_table(X_table) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index c770abd495..8b6391958d 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -225,12 +225,16 @@ def test_validate_data_output(dtype, dataframe, queue): # check sklearn validate_data operations work underneath X_array = validate_data(est, X, reset=False) - if dispatch: - assert type(X) == type( - X_array - ), f"validate_data converted {type(X)} to {type(X_array)}" - assert type(X) == type(X_out), f"from_array converted {type(X)} to {type(X_out)}" - else: - # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays - assert isinstance(X_array, np.ndarray) - assert isinstance(X_out, np.ndarray) + for orig, first, second in ((X, X_out, X_array), (y, y_out, None)): + if dispatch: + assert type(orig) == type( + first + ), f"validate_data converted {type(orig)} to {type(first)}" + if second: + assert type(orig) == type( + second + ), f"from_array converted {type(orig)} to {type(second)}" + else: + # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays + assert isinstance(first, np.ndarray) + assert second is None or isinstance(second, np.ndarray) From e5c9b8bd22f93272c73290d5a852c19279f51fa5 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Thu, 5 Dec 2024 13:59:03 +0100 Subject: [PATCH 128/131] Update test_validation.py --- sklearnex/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 8b6391958d..aa0f9d6894 100644 --- a/sklearnex/utils/tests/test_validation.py +++ 
b/sklearnex/utils/tests/test_validation.py @@ -230,7 +230,7 @@ def test_validate_data_output(dtype, dataframe, queue): assert type(orig) == type( first ), f"validate_data converted {type(orig)} to {type(first)}" - if second: + if second is not None: assert type(orig) == type( second ), f"from_array converted {type(orig)} to {type(second)}" From c55843bc755e9df994fa7ad57280a5f48e2e146b Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Thu, 5 Dec 2024 16:03:34 +0100 Subject: [PATCH 129/131] Update base.py --- sklearnex/tests/utils/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 1fd5b25e92..33d3804b8f 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -390,7 +390,7 @@ def fit(self, X, y=None): self.y_attr_ = from_table( y_table, sua_iface=sua_iface, - sycl_queue=y.sycl_queue if y else X.sycl_queue, + sycl_queue=X.sycl_queue if y is None else y.sycl_queue, xp=xp, ) else: From ac6d8317f857fb3eaac8dac4ed29d93024cc9b78 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Thu, 5 Dec 2024 19:26:05 +0100 Subject: [PATCH 130/131] Update test_validation.py --- sklearnex/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index aa0f9d6894..37d0a6df6e 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -1,5 +1,5 @@ # ============================================================================== -# Copyright 2024 UXL Foundation Contributors +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 1305ca1dac41c02aad3abc933c3f63b4a36fe81a Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 9 Dec 2024 14:25:28 +0100 Subject: [PATCH 131/131] Update sklearnex/utils/validation.py Co-authored-by: ethanglaser <42726565+ethanglaser@users.noreply.github.com> --- sklearnex/utils/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index b5c8b89251..4d12602d74 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -121,7 +121,8 @@ def validate_data( if ensure_all_finite: # run local finite check allow_nan = ensure_all_finite == "allow-nan" - # the return object from validate_data can (annoyingly) be a single + # the return object from validate_data can be a single + # element (either x or y) or both (as a tuple). An iterator along with # check_x and check_y can go through the output properly without # stacking layers of if statements to make sure the proper input_name
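
For reference, the short sketch below shows how the reworked validation helpers from this series are exercised. It mirrors the calls made in sklearnex/utils/tests/test_validation.py rather than defining any new behavior, and it assumes a sklearnex build with these patches applied (oneDAL >= 2024.7 for the oneDAL-backed finiteness check). TrivialEstimator is a stand-in introduced here purely for illustration; the tests above use their own DummyEstimator for the same purpose.

# Illustrative sketch only, not part of the patch series.
import numpy as np
from sklearn.base import BaseEstimator

from sklearnex.utils.validation import _check_sample_weight, validate_data


class TrivialEstimator(BaseEstimator):
    # minimal estimator; validate_data only needs a BaseEstimator instance to
    # record n_features_in_ and feature names during validation
    pass


est = TrivialEstimator()
X = np.random.default_rng(0).uniform(high=np.finfo(np.float64).max, size=(1000, 1000))

# finite float64 numpy input passes validation and stays a numpy array
X_out = validate_data(est, X, ensure_all_finite=True)
assert isinstance(X_out, np.ndarray)

# a single non-finite element raises sklearn's usual error; for large contiguous
# float32/float64 inputs the check is routed through the oneDAL backend
X[10, 10] = np.nan
try:
    validate_data(est, X, ensure_all_finite=True)
except ValueError as err:
    print(err)  # e.g. "Input X contains NaN."

# sample weights go through the same finiteness handling; a scalar is broadcast
weights = _check_sample_weight(5.0, X, dtype=np.float64)
assert weights.shape == (X.shape[0],)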