Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix convert_dtypes with convert_integer=False/convert_floating=True #15964

Merged
merged 8 commits into from
Jul 15, 2024
34 changes: 19 additions & 15 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6235,13 +6235,13 @@ def rank(

def convert_dtypes(
    self,
    infer_objects: bool = True,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
    convert_floating: bool = True,
    dtype_backend=None,
) -> Self:
    """
    Convert columns to the best possible nullable dtypes.

    Floating-point columns whose values are all whole numbers are cast
    to ``int64``. This happens only when both ``convert_integer`` and
    ``convert_floating`` are true; otherwise a plain copy is returned.

    All other dtypes are always returned as-is as all dtypes in
    cudf are nullable.

    Parameters
    ----------
    infer_objects, convert_string, convert_boolean, dtype_backend
        Accepted in the signature but not consulted by this
        implementation.
    convert_integer : bool, default True
        Must be true (together with ``convert_floating``) for
        whole-valued float columns to be cast to ``int64``.
    convert_floating : bool, default True
        Must be true (together with ``convert_integer``) for float
        columns to be examined at all.

    Returns
    -------
    Self
        A new object of the same type as ``self``.
    """
    if not (convert_floating and convert_integer):
        return self.copy()

    cols = []
    for col in self._columns:
        if col.dtype.kind == "f":
            # Fill nulls in a temporary used only for the whole-number
            # check. Rebinding ``col`` here would leak the filled data
            # into the result and turn nulls into 0.0 for columns that
            # are *not* converted.
            filled = col.fillna(0)
            as_int = filled.astype("int64")
            if cp.allclose(filled, as_int):
                # NOTE(review): the int64 column is built from the
                # null-filled data (unchanged from before), so null
                # entries come out as 0 -- confirm this is intended.
                cols.append(as_int)
                continue
        # Non-float columns and non-integral float columns pass through
        # untouched.
        cols.append(col)
    return self._from_data_like_self(
        self._data._from_columns_like_self(cols, verify=False)
    )

@_warn_no_dask_cudf
def __dask_tokenize__(self):
Expand Down
13 changes: 13 additions & 0 deletions python/cudf/cudf/tests/series/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,18 @@ def test_convert_dtypes(data, dtype):
assert_eq(expect, got)


def test_convert_integer_false_convert_floating_true():
    """cudf matches pandas when integer conversion is off but floating is on.

    Both libraries are given the same whole-valued float data and the
    same keyword arguments; the cudf result is compared after conversion
    to pandas nullable dtypes.
    """
    # The long literal is not representable in float64 and evaluates to
    # exactly 1.0, so every value in the input is a whole number.
    values = [1.000000000000000000000000001, 1]
    options = {"convert_integer": False, "convert_floating": True}

    expected = pd.Series(values).convert_dtypes(**options)

    converted = cudf.Series(values).convert_dtypes(**options)
    result = converted.to_pandas(nullable=True)

    assert_eq(result, expected)


# Now write the same test, but construct a DataFrame
# as input instead of parametrizing:
Loading