Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix cuDF to cuPy conversion (missing value) #3194

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
- PR #3008: Check number of columns in check_array validator
- PR #3012: Increasing learning rate for SGD log loss and invscaling pytests
- PR #2950: Fix includes in UMAP
- PR #3194: Fix cuDF to cuPy conversion (missing value)
- PR #3021: Fix a hang in cuML RF experimental backend
- PR #3039: Update RF and decision tree parameter initializations in benchmark codes
- PR #3060: Speed up test suite `test_fil`
Expand Down
12 changes: 11 additions & 1 deletion python/cuml/common/input_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,11 +398,21 @@ def input_to_cupy_array(X,
check_cols=False,
check_rows=False,
fail_on_order=False,
force_contiguous=True) -> cuml_array:
force_contiguous=True,
fail_on_nan=True) -> cuml_array:
viclafargue marked this conversation as resolved.
Show resolved Hide resolved
"""
Identical to input_to_cuml_array but it returns a cupy array instead of
CumlArray
"""
if not fail_on_nan:
if isinstance(X, (cudf.DataFrame, cudf.Series)):
try:
X = X.values
except ValueError:
X = X.astype('float64', copy=False)
X.fillna(cp.nan, inplace=True)
X = X.values

out_data = input_to_cuml_array(X,
order=order,
deepcopy=deepcopy,
Expand Down
24 changes: 24 additions & 0 deletions python/cuml/test/test_input_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from cuml.common import input_to_cuml_array, CumlArray
from cuml.common import input_to_host_array
from cuml.common.input_utils import input_to_cupy_array
from cuml.common import has_cupy
from cuml.common.input_utils import convert_dtype
from cuml.common.memory_utils import _check_array_contiguity
Expand Down Expand Up @@ -344,3 +345,26 @@ def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
order=order)
else:
return result, np.array(cp.asnumpy(rand_mat), order=order)


def test_tocupy_missing_values_handling():
    """Check how input_to_cupy_array treats cuDF inputs with missing values.

    With ``fail_on_nan=False``:
      * a DataFrame without nulls is converted and keeps its int64 dtype;
      * a DataFrame or Series containing ``None`` is converted to float64
        and the missing entry shows up as NaN.

    With ``fail_on_nan=True`` a Series containing ``None`` must raise
    ``ValueError``.
    """
    # No missing value: the integer dtype is expected to be preserved.
    df = cudf.DataFrame(data=[[7, 2, 3], [4, 5, 6], [10, 5, 9]])
    array, _, _, _ = input_to_cupy_array(df, fail_on_nan=False)
    assert isinstance(array, cp.ndarray)
    assert str(array.dtype) == 'int64'

    # One missing value in a DataFrame: expect float64 with NaN in its place.
    df = cudf.DataFrame(data=[[7, 2, 3], [4, None, 6], [10, 5, 9]])
    array, _, _, _ = input_to_cupy_array(df, fail_on_nan=False)
    assert isinstance(array, cp.ndarray)
    assert str(array.dtype) == 'float64'
    assert cp.isnan(array[1, 1])

    # Same expectation for a Series input.
    series = cudf.Series(data=[7, None, 3])
    array, _, _, _ = input_to_cupy_array(series, fail_on_nan=False)
    assert str(array.dtype) == 'float64'
    assert cp.isnan(array[1])

    # Build the input outside the raises-block so that only the conversion
    # call itself can satisfy the ValueError expectation.
    series = cudf.Series(data=[7, None, 3])
    with pytest.raises(ValueError):
        input_to_cupy_array(series, fail_on_nan=True)
3 changes: 2 additions & 1 deletion python/cuml/thirdparty_adapters/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,8 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
else:
X, n_rows, n_cols, dtype = input_to_cupy_array(array,
order=order,
deepcopy=copy)
deepcopy=copy,
fail_on_nan=False)
if correct_dtype != dtype:
X = X.astype(correct_dtype)
check_finite(X, force_all_finite)
Expand Down