diff --git a/CHANGELOG.md b/CHANGELOG.md
index a474691f9a..21ec5613c7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -56,6 +56,7 @@
 - PR #3008: Check number of columns in check_array validator
 - PR #3012: Increasing learning rate for SGD log loss and invscaling pytests
 - PR #2950: Fix includes in UMAP
+- PR #3194: Fix cuDF to cuPy conversion (missing value)
 - PR #3021: Fix a hang in cuML RF experimental backend
 - PR #3039: Update RF and decision tree parameter initializations in benchmark codes
 - PR #3060: Speed up test suite `test_fil`
diff --git a/python/cuml/common/input_utils.py b/python/cuml/common/input_utils.py
index adfea879ad..b2549f591f 100644
--- a/python/cuml/common/input_utils.py
+++ b/python/cuml/common/input_utils.py
@@ -398,11 +398,27 @@ def input_to_cupy_array(X,
                         check_cols=False,
                         check_rows=False,
                         fail_on_order=False,
-                        force_contiguous=True) -> cuml_array:
+                        force_contiguous=True,
+                        fail_on_null=True) -> cuml_array:
     """
     Identical to input_to_cuml_array but it returns a cupy array instead of
     CumlArray
+
+    If fail_on_null is False, cuDF inputs containing nulls are converted to
+    float64 with the nulls replaced by NaN instead of raising ValueError.
     """
+    if not fail_on_null and isinstance(X, (cudf.DataFrame, cudf.Series)):
+        try:
+            X = X.values
+        except ValueError:
+            # A ValueError from `.values` is treated as "X holds nulls":
+            # promote to float64 so the nulls can be expressed as NaN.
+            # astype(copy=False) may hand back columns shared with the
+            # caller's object, so fill out of place instead of with
+            # inplace=True to leave the caller's data untouched.
+            X = X.astype('float64', copy=False)
+            X = X.fillna(cp.nan).values
+
     out_data = input_to_cuml_array(X,
                                    order=order,
                                    deepcopy=deepcopy,
diff --git a/python/cuml/test/test_input_utils.py b/python/cuml/test/test_input_utils.py
index 1a5446e7fb..bb66fa299d 100644
--- a/python/cuml/test/test_input_utils.py
+++ b/python/cuml/test/test_input_utils.py
@@ -23,6 +23,7 @@
 
 from cuml.common import input_to_cuml_array, CumlArray
 from cuml.common import input_to_host_array
+from cuml.common.input_utils import input_to_cupy_array
 from cuml.common import has_cupy
 from cuml.common.input_utils import convert_dtype
 from cuml.common.memory_utils import _check_array_contiguity
@@ -344,3 +345,36 @@ def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
                                 order=order)
     else:
         return result, np.array(cp.asnumpy(rand_mat), order=order)
+
+
+def test_tocupy_missing_values_handling():
+    # Without nulls the data is passed through and the dtype is kept.
+    df = cudf.DataFrame(data=[[7, 2, 3], [4, 5, 6], [10, 5, 9]])
+    array, n_rows, n_cols, dtype = input_to_cupy_array(df, fail_on_null=False)
+    assert isinstance(array, cp.ndarray)
+    assert str(array.dtype) == 'int64'
+
+    # A null in a DataFrame forces float64 and shows up as NaN.
+    df = cudf.DataFrame(data=[[7, 2, 3], [4, None, 6], [10, 5, 9]])
+    array, n_rows, n_cols, dtype = input_to_cupy_array(df, fail_on_null=False)
+    assert isinstance(array, cp.ndarray)
+    assert str(array.dtype) == 'float64'
+    assert cp.isnan(array[1, 1])
+
+    # Same for a Series.
+    df = cudf.Series(data=[7, None, 3])
+    array, n_rows, n_cols, dtype = input_to_cupy_array(df, fail_on_null=False)
+    assert str(array.dtype) == 'float64'
+    assert cp.isnan(array[1])
+
+    # Already-float64 input must not have its nulls overwritten in place.
+    df = cudf.Series(data=[7.0, None, 3.0])
+    array, n_rows, n_cols, dtype = input_to_cupy_array(df, fail_on_null=False)
+    assert cp.isnan(array[1])
+    assert df.null_count == 1
+
+    # By default nulls are rejected. The input is built outside the
+    # `raises` block so that only the conversion can raise ValueError.
+    df = cudf.Series(data=[7, None, 3])
+    with pytest.raises(ValueError):
+        input_to_cupy_array(df, fail_on_null=True)
diff --git a/python/cuml/thirdparty_adapters/adapters.py b/python/cuml/thirdparty_adapters/adapters.py
index 8b4f2d35a3..fb50d86cdb 100644
--- a/python/cuml/thirdparty_adapters/adapters.py
+++ b/python/cuml/thirdparty_adapters/adapters.py
@@ -277,7 +277,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
     else:
         X, n_rows, n_cols, dtype = input_to_cupy_array(array,
                                                        order=order,
-                                                       deepcopy=copy)
+                                                       deepcopy=copy,
+                                                       fail_on_null=False)
     if correct_dtype != dtype:
         X = X.astype(correct_dtype)
     check_finite(X, force_all_finite)