From 5efd72f64e3b1e25337c30ba0ab246051d3fe396 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 1 Jul 2024 07:37:12 -1000 Subject: [PATCH] Ensure cudf objects can astype to any type when empty (#16106) pandas allows objects to `astype` to any other type if the object is empty. The PR mirrors that behavior for cudf. This PR also more consistently uses `astype` instead of `as_*_column` and fixes a bug in `IntervalDtype.__eq__` discovered when writing a unit test for this bug. Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16106 --- python/cudf/cudf/core/column/column.py | 9 ++++++ python/cudf/cudf/core/column/datetime.py | 36 +++++++++++---------- python/cudf/cudf/core/column/decimal.py | 2 +- python/cudf/cudf/core/column/interval.py | 26 +++++++-------- python/cudf/cudf/core/column/timedelta.py | 34 +++++++++++--------- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/dtypes.py | 2 +- python/cudf/cudf/core/frame.py | 4 +-- python/cudf/cudf/core/indexing_utils.py | 2 +- python/cudf/cudf/core/series.py | 8 +++-- python/cudf/cudf/core/tools/numeric.py | 14 ++++---- python/cudf/cudf/tests/test_interval.py | 6 ++++ python/cudf/cudf/tests/test_series.py | 39 +++++++++++++++++++++++ 13 files changed, 121 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5db6fd904a9..e7a2863da8c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -959,6 +959,15 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: + if len(self) == 0: + dtype = cudf.dtype(dtype) + if self.dtype == dtype: + if copy: + return self.copy() + else: + return self + else: + return column_empty(0, dtype=dtype, masked=self.nullable) if copy: col = self.copy() else: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 121076b69ce..c10aceba9f4 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -280,8 +280,8 @@ def __contains__(self, item: ScalarLike) -> bool: return False elif ts.tzinfo is not None: ts = ts.tz_convert(None) - return ts.to_numpy().astype("int64") in self.as_numerical_column( - "int64" + return ts.to_numpy().astype("int64") in cast( + "cudf.core.column.NumericalColumn", self.astype("int64") ) @functools.cached_property @@ -503,9 +503,9 @@ def mean( self, skipna=None, min_count: int = 0, dtype=np.float64 ) -> ScalarLike: return pd.Timestamp( - self.as_numerical_column("int64").mean( - skipna=skipna, min_count=min_count, dtype=dtype - ), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).mean(skipna=skipna, min_count=min_count, dtype=dtype), unit=self.time_unit, ).as_unit(self.time_unit) @@ -517,7 +517,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").std( + cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], @@ -525,7 +525,9 @@ def std( def median(self, skipna: bool | None = None) -> pd.Timestamp: return pd.Timestamp( - self.as_numerical_column("int64").median(skipna=skipna), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).median(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -534,18 +536,18 @@ def cov(self, other: DatetimeColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def corr(self, other: DatetimeColumn) -> float: if not isinstance(other, DatetimeColumn): raise TypeError( f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").corr( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def quantile( self, @@ -554,7 +556,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical_column("int64").quantile( + result = self.astype("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -645,12 +647,12 @@ def indices_of( ) -> cudf.core.column.NumericalColumn: value = column.as_column( pd.to_datetime(value), dtype=self.dtype - ).as_numerical_column("int64") - return self.as_numerical_column("int64").indices_of(value) + ).astype("int64") + return self.astype("int64").indices_of(value) @property def is_unique(self) -> bool: - return self.as_numerical_column("int64").is_unique + return self.astype("int64").is_unique def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index d66908b5f94..3e238d65cff 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -157,7 +157,7 @@ def normalize_binop_value(self, other): "Decimal columns only support binary operations with " "integer numerical columns." ) - other = other.as_decimal_column( + other = other.astype( self.dtype.__class__(self.dtype.__class__.MAX_PRECISION, 0) ) elif not isinstance(other, DecimalBaseColumn): diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index f24ca3fdad1..d09a1f66539 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -4,7 +4,7 @@ import cudf from cudf.core.column import StructColumn -from cudf.core.dtypes import CategoricalDtype, IntervalDtype +from cudf.core.dtypes import IntervalDtype class IntervalColumn(StructColumn): @@ -87,20 +87,16 @@ def copy(self, deep=True): def as_interval_column(self, dtype): if isinstance(dtype, IntervalDtype): - if isinstance(self.dtype, CategoricalDtype): - new_struct = self._get_decategorized_column() - return IntervalColumn.from_struct_column(new_struct) - else: - return IntervalColumn( - size=self.size, - dtype=dtype, - mask=self.mask, - offset=self.offset, - null_count=self.null_count, - children=tuple( - child.astype(dtype.subtype) for child in self.children - ), - ) + return IntervalColumn( + size=self.size, + dtype=dtype, + mask=self.mask, + offset=self.offset, + null_count=self.null_count, + children=tuple( + child.astype(dtype.subtype) for child in self.children + ), + ) else: raise ValueError("dtype must be IntervalDtype") diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 8f41bcb6422..5a0171bbbdc 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -107,7 +107,9 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool: # np.timedelta64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item.view("int64") in self.as_numerical_column("int64") + return item.view("int64") in cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ) @property def values(self): @@ -132,9 +134,7 @@ def to_arrow(self) -> pa.Array: self.mask_array_view(mode="read").copy_to_host() ) data = pa.py_buffer( - self.as_numerical_column("int64") - .data_array_view(mode="read") - .copy_to_host() + self.astype("int64").data_array_view(mode="read").copy_to_host() ) pa_dtype = np_to_pa_dtype(self.dtype) return pa.Array.from_buffers( @@ -295,13 +295,17 @@ def as_timedelta_column( def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").mean(skipna=skipna, dtype=dtype), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ).as_unit(self.time_unit) def median(self, skipna: bool | None = None) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").median(skipna=skipna), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).median(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -315,7 +319,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical_column("int64").quantile( + result = self.astype("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -337,7 +341,7 @@ def sum( # Since sum isn't overridden in Numerical[Base]Column, mypy only # sees the signature from Reducible (which doesn't have the extra # parameters from ColumnBase._reduce) so we have to ignore this. - self.as_numerical_column("int64").sum( # type: ignore + self.astype("int64").sum( # type: ignore skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, @@ -351,7 +355,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").std( + cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype ), unit=self.time_unit, @@ -362,18 +366,18 @@ def cov(self, other: TimeDeltaColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def corr(self, other: TimeDeltaColumn) -> float: if not isinstance(other, TimeDeltaColumn): raise TypeError( f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").corr( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def components(self) -> dict[str, ColumnBase]: """ diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4dfeb68b7ba..b249410c2e4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2404,7 +2404,7 @@ def scatter_by_map( if isinstance(map_index, cudf.core.column.StringColumn): cat_index = cast( cudf.core.column.CategoricalColumn, - map_index.as_categorical_column("category"), + map_index.astype("category"), ) map_index = cat_index.codes warnings.warn( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 034849d0e71..de715191c08 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -937,7 +937,7 @@ def to_pandas(self) -> pd.IntervalDtype: def __eq__(self, other): if isinstance(other, str): # This means equality isn't transitive but mimics pandas - return other == self.name + return other in (self.name, str(self)) return ( type(self) == type(other) and self.subtype == other.subtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9bac75dc6ac..253d200f7d4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -927,7 +927,7 @@ def from_arrow(cls, data: pa.Table) -> Self: # of column is 0 (i.e., empty) then we will have an # int8 column in result._data[name] returned by libcudf, # which needs to be type-casted to 'category' dtype. - result[name] = result[name].as_categorical_column("category") + result[name] = result[name].astype("category") elif ( pandas_dtypes.get(name) == "empty" and np_dtypes.get(name) == "object" @@ -936,7 +936,7 @@ def from_arrow(cls, data: pa.Table) -> Self: # is specified as 'empty' and np_dtypes as 'object', # hence handling this special case to type-cast the empty # float column to str column. - result[name] = result[name].as_string_column(cudf.dtype("str")) + result[name] = result[name].astype(cudf.dtype("str")) elif name in data.column_names and isinstance( data[name].type, ( diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 73a1cd26367..a5fed02cbed 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -229,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: else: key = cudf.core.column.as_column(key) if isinstance(key, cudf.core.column.CategoricalColumn): - key = key.as_numerical_column(key.codes.dtype) + key = key.astype(key.codes.dtype) if is_bool_dtype(key.dtype): return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 97b6bbec2d4..4a60470fafa 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3107,10 +3107,12 @@ def value_counts( # Pandas returns an IntervalIndex as the index of res # this condition makes sure we do too if bins is given if bins is not None and len(res) == len(res.index.categories): - int_index = IntervalColumn.as_interval_column( - res.index._column, res.index.categories.dtype + interval_col = IntervalColumn.from_struct_column( + res.index._column._get_decategorized_column() + ) + res.index = cudf.IntervalIndex._from_data( + {res.index.name: interval_col} ) - res.index = int_index res.name = result_name return res diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 68b23f1e059..ef6b86a04a7 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -115,11 +115,11 @@ def to_numeric(arg, errors="raise", downcast=None): dtype = col.dtype if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): - col = col.as_numerical_column(cudf.dtype("int64")) + col = col.astype(cudf.dtype("int64")) elif isinstance(dtype, CategoricalDtype): cat_dtype = col.dtype.type if _is_non_decimal_numeric_dtype(cat_dtype): - col = col.as_numerical_column(cat_dtype) + col = col.astype(cat_dtype) else: try: col = _convert_str_col( @@ -146,8 +146,8 @@ def to_numeric(arg, errors="raise", downcast=None): raise ValueError("Unrecognized datatype") # str->float conversion may require lower precision - if col.dtype == cudf.dtype("f"): - col = col.as_numerical_column("d") + if col.dtype == cudf.dtype("float32"): + col = col.astype("float64") if downcast: if downcast == "float": @@ -205,7 +205,7 @@ def _convert_str_col(col, errors, _downcast=None): is_integer = libstrings.is_integer(col) if is_integer.all(): - return col.as_numerical_column(dtype=cudf.dtype("i8")) + return col.astype(dtype=cudf.dtype("i8")) col = _proc_inf_empty_strings(col) @@ -218,9 +218,9 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." ) ) - return col.as_numerical_column(dtype=cudf.dtype("f")) + return col.astype(dtype=cudf.dtype("float32")) else: - return col.as_numerical_column(dtype=cudf.dtype("d")) + return col.astype(dtype=cudf.dtype("float64")) else: if errors == "coerce": col = libcudf.string_casting.stod(col) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 1b395c09ba8..5eeea87d8e0 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -188,3 +188,9 @@ def test_from_pandas_intervaldtype(): result = cudf.from_pandas(dtype) expected = cudf.IntervalDtype("int64", closed="left") assert_eq(result, expected) + + +def test_intervaldtype_eq_string_with_attributes(): + dtype = cudf.IntervalDtype("int64", closed="left") + assert dtype == "interval" + assert dtype == "interval[int64, left]" diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 467d0c46ae7..f2501041f25 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2873,3 +2873,42 @@ def test_nunique_all_null(dropna): result = pd_ser.nunique(dropna=dropna) expected = cudf_ser.nunique(dropna=dropna) assert result == expected + + +@pytest.mark.parametrize( + "type1", + [ + "category", + "interval[int64, right]", + "int64", + "float64", + "str", + "datetime64[ns]", + "timedelta64[ns]", + ], +) +@pytest.mark.parametrize( + "type2", + [ + "category", + "interval[int64, right]", + "int64", + "float64", + "str", + "datetime64[ns]", + "timedelta64[ns]", + ], +) +@pytest.mark.parametrize( + "as_dtype", [lambda x: x, cudf.dtype], ids=["string", "object"] +) +@pytest.mark.parametrize("copy", [True, False]) +def test_empty_astype_always_castable(type1, type2, as_dtype, copy): + ser = cudf.Series([], dtype=as_dtype(type1)) + result = ser.astype(as_dtype(type2), copy=copy) + expected = cudf.Series([], dtype=as_dtype(type2)) + assert_eq(result, expected) + if not copy and cudf.dtype(type1) == cudf.dtype(type2): + assert ser._column is result._column + else: + assert ser._column is not result._column