From 26dfdc8e4b74b8189ea0f14d7929ed74c677dc5f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 28 Feb 2024 12:45:38 -0800 Subject: [PATCH 1/3] Allow to_pandas to return pandas.ArrowDtype --- python/cudf/cudf/core/_base_index.py | 8 +++- python/cudf/cudf/core/column/categorical.py | 8 +++- python/cudf/cudf/core/column/column.py | 15 ++++++-- python/cudf/cudf/core/column/datetime.py | 41 +++++++++++++-------- python/cudf/cudf/core/column/interval.py | 8 +++- python/cudf/cudf/core/column/numerical.py | 11 +++++- python/cudf/cudf/core/column/string.py | 11 +++++- python/cudf/cudf/core/column/struct.py | 15 ++++++-- python/cudf/cudf/core/column/timedelta.py | 24 ++++++++---- python/cudf/cudf/core/dataframe.py | 12 +++++- python/cudf/cudf/core/index.py | 27 ++++++++++---- python/cudf/cudf/core/multiindex.py | 6 ++- python/cudf/cudf/core/series.py | 20 ++++++++-- 13 files changed, 156 insertions(+), 50 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 58e2241e810..68084227d72 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -910,7 +910,7 @@ def notna(self): """ raise NotImplementedError - def to_pandas(self, *, nullable: bool = False): + def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False): """ Convert to a Pandas Index. @@ -924,6 +924,12 @@ def to_pandas(self, *, nullable: bool = False): If ``nullable`` is ``False``, the resulting index will either convert null values to ``np.nan`` or ``None`` depending on the dtype. 
+ arrow_type : bool, Default False + Return the Index with a ``pandas.ArrowDtype`` + + Notes + ----- + nullable and arrow_type cannot both be set to ``True`` Examples -------- diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9ecd461cf99..4c64e7085c9 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -770,10 +770,16 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: ) def to_pandas( - self, *, index: Optional[pd.Index] = None, nullable: bool = False + self, + *, + index: Optional[pd.Index] = None, + nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") + elif arrow_type: + raise NotImplementedError(f"{arrow_type=} is not implemented.") if self.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cecdaf70750..65e2ecfb6e6 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -199,6 +199,7 @@ def to_pandas( *, index: Optional[pd.Index] = None, nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: """Convert object to pandas type. 
@@ -208,11 +209,17 @@ def to_pandas( # way if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - pd_series = self.to_arrow().to_pandas() + pa_array = self.to_arrow() + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(pa_array), index=index + ) + else: + pd_series = pa_array.to_pandas() - if index is not None: - pd_series.index = index - return pd_series + if index is not None: + pd_series.index = index + return pd_series @property def values_host(self) -> "np.ndarray": diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b03b21a7aba..f76eea68e66 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -318,18 +318,23 @@ def to_pandas( *, index: Optional[pd.Index] = None, nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - # `copy=True` workaround until following issue is fixed: - # https://issues.apache.org/jira/browse/ARROW-9772 - - return pd.Series( - self.to_arrow(), - copy=True, - dtype=self.dtype, - index=index, - ) + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index + ) + else: + # `copy=True` workaround until following issue is fixed: + # https://issues.apache.org/jira/browse/ARROW-9772 + return pd.Series( + self.to_arrow(), + copy=True, + dtype=self.dtype, + index=index, + ) @property def values(self): @@ -723,15 +728,21 @@ def to_pandas( *, index: Optional[pd.Index] = None, nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - series = self._local_time.to_pandas().dt.tz_localize( - self.dtype.tz, ambiguous="NaT", nonexistent="NaT" - ) - if index is not None: - series.index = index - return series + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index 
+ ) + else: + series = self._local_time.to_pandas().dt.tz_localize( + self.dtype.tz, ambiguous="NaT", nonexistent="NaT" + ) + if index is not None: + series.index = index + return series def to_arrow(self): return pa.compute.assume_timezone( diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 5d93fa26298..0f6e2474c9b 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -105,7 +105,11 @@ def as_interval_column(self, dtype): raise ValueError("dtype must be IntervalDtype") def to_pandas( - self, *, index: Optional[pd.Index] = None, nullable: bool = False + self, + *, + index: Optional[pd.Index] = None, + nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: # Note: This does not handle null values in the interval column. # However, this exact sequence (calling __from_arrow__ on the output of @@ -114,6 +118,8 @@ def to_pandas( # directly is problematic), so we're stuck with this for now. if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") + elif arrow_type: + raise NotImplementedError(f"{arrow_type=} is not implemented.") return pd.Series( self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b80dd626066..82d82593c77 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -690,8 +690,17 @@ def to_pandas( *, index: Optional[pd.Index] = None, nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: - if nullable and self.dtype in np_dtypes_to_pandas_dtypes: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." 
+ ) + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index + ) + elif nullable and self.dtype in np_dtypes_to_pandas_dtypes: pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype] arrow_array = self.to_arrow() pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2373f94ee97..dea60f58690 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5791,8 +5791,17 @@ def to_pandas( *, index: Optional[pd.Index] = None, nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index + ) + elif nullable: pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow()) pd_series = pd.Series(pandas_array, copy=False) else: diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 69e9a50956b..288895c8e1a 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -58,14 +58,23 @@ def to_arrow(self): ) def to_pandas( - self, *, index: Optional[pd.Index] = None, nullable: bool = False + self, + *, + index: Optional[pd.Index] = None, + nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - - return pd.Series(self.to_arrow().tolist(), dtype="object", index=index) + pa_array = self.to_arrow() + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(pa_array), index=index + ) + else: + return pd.Series(pa_array.tolist(), dtype="object", index=index) 
@cached_property def memory_usage(self): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index b911c86fa01..b83f201c2fd 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -147,20 +147,28 @@ def to_arrow(self) -> pa.Array: ) def to_pandas( - self, *, index: Optional[pd.Index] = None, nullable: bool = False + self, + *, + index: Optional[pd.Index] = None, + nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: # `copy=True` workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - - return pd.Series( - self.to_arrow(), - copy=True, - dtype=self.dtype, - index=index, - ) + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index + ) + else: + return pd.Series( + self.to_arrow(), + copy=True, + dtype=self.dtype, + index=index, + ) def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: reflect, op = self._check_reflected_op(op) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9b4a79c6841..adc1c2dba3d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5203,7 +5203,9 @@ def describe( return res @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.DataFrame: """ Convert to a Pandas DataFrame. @@ -5218,11 +5220,17 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame: If ``nullable`` is ``False``, the resulting columns will either convert null values to ``np.nan`` or ``None`` depending on the dtype. 
+ arrow_type : bool, Default False + Return the columns with a ``pandas.ArrowDtype`` Returns ------- out : Pandas DataFrame + Notes + ----- + nullable and arrow_type cannot both be set to ``True`` + Examples -------- >>> import cudf @@ -5271,7 +5279,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame: for i, col_key in enumerate(self._data): out_data[i] = self._data[col_key].to_pandas( - index=out_index, nullable=nullable + index=out_index, nullable=nullable, arrow_type=arrow_type ) out_df = pd.DataFrame(out_data, index=out_index) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1b9893d1256..4fa1b7c5ba9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -483,9 +483,13 @@ def dtype(self): return _maybe_convert_to_default_type(dtype) @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.RangeIndex: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.RangeIndex: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") + elif arrow_type: + raise NotImplementedError(f"{arrow_type=} is not implemented.") return pd.RangeIndex( start=self._start, stop=self._stop, @@ -1521,9 +1525,12 @@ def _clean_nulls_from_index(self): def any(self): return self._values.any() - def to_pandas(self, *, nullable: bool = False) -> pd.Index: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.Index: return pd.Index( - self._values.to_pandas(nullable=nullable), name=self.name + self._values.to_pandas(nullable=nullable, arrow_type=arrow_type), + name=self.name, ) def append(self, other): @@ -2094,7 +2101,9 @@ def isocalendar(self): return cudf.core.tools.datetimes._to_iso_calendar(self) @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.DatetimeIndex: if nullable: raise 
NotImplementedError(f"{nullable=} is not implemented.") @@ -2104,7 +2113,9 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex: else None ) return pd.DatetimeIndex( - self._values.to_pandas(), name=self.name, freq=freq + self._values.to_pandas(arrow_type=arrow_type), + name=self.name, + freq=freq, ) @_cudf_nvtx_annotate @@ -2426,11 +2437,13 @@ def __getitem__(self, index): return value @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.TimedeltaIndex: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.TimedeltaIndex: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") return pd.TimedeltaIndex( - self._values.to_pandas(), + self._values.to_pandas(arrow_type=arrow_type), name=self.name, ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index df1b1ea10cd..70112044f75 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1574,10 +1574,12 @@ def droplevel(self, level=-1): return mi @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.MultiIndex: result = self.to_frame( index=False, name=list(range(self.nlevels)) - ).to_pandas(nullable=nullable) + ).to_pandas(nullable=nullable, arrow_type=arrow_type) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3f51ecdf7dc..629ca1ea0d1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1983,10 +1983,14 @@ def any(self, axis=0, bool_only=None, skipna=True, **kwargs): @_cudf_nvtx_annotate def to_pandas( - self, *, index: bool = True, nullable: bool = False + self, + *, + index: bool = True, + nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: """ - Convert to a Pandas Series. 
+ Convert to a pandas Series. Parameters ---------- @@ -2003,10 +2007,16 @@ def to_pandas( If ``nullable`` is ``False``, the resulting series will either convert null values to ``np.nan`` or ``None`` depending on the dtype. + arrow_type : bool, Default False + Return the Series with a ``pandas.ArrowDtype`` Returns ------- - out : Pandas Series + out : pandas Series + + Notes + ----- + nullable and arrow_type cannot both be set to ``True`` Examples -------- @@ -2048,7 +2058,9 @@ def to_pandas( index = self.index.to_pandas() else: index = None # type: ignore[assignment] - s = self._column.to_pandas(index=index, nullable=nullable) + s = self._column.to_pandas( + index=index, nullable=nullable, arrow_type=arrow_type + ) s.name = self.name return s From e5379ccf284c58ed322838055a0953b022fb6a33 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 28 Feb 2024 13:47:24 -0800 Subject: [PATCH 2/3] Add testing and fix some impl --- python/cudf/cudf/core/_base_index.py | 2 ++ python/cudf/cudf/core/column/column.py | 6 +++- python/cudf/cudf/core/column/datetime.py | 16 ++++++--- python/cudf/cudf/core/column/interval.py | 4 +++ python/cudf/cudf/core/column/struct.py | 6 +++- python/cudf/cudf/core/column/timedelta.py | 9 +++-- python/cudf/cudf/core/dataframe.py | 10 ++++-- python/cudf/cudf/core/index.py | 42 +++++++++++++--------- python/cudf/cudf/core/series.py | 11 ++++-- python/cudf/cudf/tests/test_dataframe.py | 41 +++++++++++++++++++++ python/cudf/cudf/tests/test_index.py | 38 ++++++++++++++++++++ python/cudf/cudf/tests/test_multiindex.py | 34 ++++++++++++++++++ python/cudf/cudf/tests/test_series.py | 43 ++++++++++++++++++++++- 13 files changed, 232 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 68084227d72..de44f392eef 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -943,6 +943,8 @@ def 
to_pandas(self, *, nullable: bool = False, arrow_type: bool = False): >>> type(idx) + >>> idx.to_pandas(arrow_type=True) + Index([-3, 10, 15, 20], dtype='int64[pyarrow]') """ raise NotImplementedError diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 65e2ecfb6e6..be196833f32 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -207,7 +207,11 @@ def to_pandas( """ # This default implementation does not handle nulls in any meaningful # way - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") pa_array = self.to_arrow() if arrow_type: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index f76eea68e66..85f07064c97 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -320,9 +320,13 @@ def to_pandas( nullable: bool = False, arrow_type: bool = False, ) -> pd.Series: - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - if arrow_type: + elif arrow_type: return pd.Series( pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index ) @@ -730,9 +734,13 @@ def to_pandas( nullable: bool = False, arrow_type: bool = False, ) -> pd.Series: - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." 
+ ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - if arrow_type: + elif arrow_type: return pd.Series( pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index ) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 0f6e2474c9b..dcec8957bb2 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -116,6 +116,10 @@ def to_pandas( # self.to_arrow) is currently the best known way to convert interval # types into pandas (trying to convert the underlying numerical columns # directly is problematic), so we're stuck with this for now. + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") elif arrow_type: diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 288895c8e1a..1b2ffcc2700 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -66,7 +66,11 @@ def to_pandas( ) -> pd.Series: # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." 
+ ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") pa_array = self.to_arrow() if arrow_type: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index b83f201c2fd..dab2723795e 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -155,10 +155,13 @@ def to_pandas( ) -> pd.Series: # `copy=True` workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 - - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - if arrow_type: + elif arrow_type: return pd.Series( pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index adc1c2dba3d..9eff2cce6ac 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5244,8 +5244,7 @@ def to_pandas( >>> type(pdf) - ``nullable`` parameter can be used to control - whether dtype can be Pandas Nullable or not: + ``nullable=True`` converts the result to pandas nullable types: >>> df = cudf.DataFrame({'a': [0, None, 2], 'b': [True, False, None]}) >>> df @@ -5273,6 +5272,13 @@ def to_pandas( a float64 b object dtype: object + + ``arrow_type=True`` converts the result to ``pandas.ArrowDtype``: + + >>> df.to_pandas(arrow_type=True).dtypes + a int64[pyarrow] + b bool[pyarrow] + dtype: object """ out_data = {} out_index = self.index.to_pandas() diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 4fa1b7c5ba9..9d481037ec6 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2104,19 +2104,23 @@ def isocalendar(self): def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.DatetimeIndex: - if nullable: + if arrow_type and 
nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - freq = ( - self._freq._maybe_as_fast_pandas_offset() - if self._freq is not None - else None - ) - return pd.DatetimeIndex( - self._values.to_pandas(arrow_type=arrow_type), - name=self.name, - freq=freq, - ) + result = self._values.to_pandas(arrow_type=arrow_type) + if arrow_type: + return pd.Index(result, name=self.name) + else: + freq = ( + self._freq._maybe_as_fast_pandas_offset() + if self._freq is not None + else None + ) + return pd.DatetimeIndex(result, name=self.name, freq=freq) @_cudf_nvtx_annotate def _get_dt_field(self, field): @@ -2440,12 +2444,18 @@ def __getitem__(self, index): def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.TimedeltaIndex: - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - return pd.TimedeltaIndex( - self._values.to_pandas(arrow_type=arrow_type), - name=self.name, - ) + + result = self._values.to_pandas(arrow_type=arrow_type) + if arrow_type: + return pd.Index(result, name=self.name) + else: + return pd.TimedeltaIndex(result, name=self.name) @property # type: ignore @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 629ca1ea0d1..b3c11138e4a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2031,8 +2031,7 @@ def to_pandas( >>> type(pds) - ``nullable`` parameter can be used to control - whether dtype can be Pandas Nullable or not: + ``nullable=True`` converts the result to pandas nullable types: >>> ser = cudf.Series([10, 20, None, 30]) >>> ser @@ -2053,6 +2052,14 @@ def to_pandas( 2 NaN 3 30.0 dtype: float64 + + ``arrow_type=True`` converts the result to ``pandas.ArrowDtype``: + >>> 
ser.to_pandas(arrow_type=True) + 0 10 + 1 20 + 2 <NA> + 3 30 + dtype: int64[pyarrow] """ if index is True: index = self.index.to_pandas() diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2084db89909..47e1ab2834e 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10862,3 +10862,44 @@ def test_dataframe_duplicate_index_reindex(): lfunc_args_and_kwargs=([10, 11, 12, 13], {}), rfunc_args_and_kwargs=([10, 11, 12, 13], {}), ) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_dataframe_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = pa.array([scalar, None]) + df = cudf.DataFrame({"a": pa_array}) + with pytest.raises(ValueError): + df.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_dataframe_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + df = cudf.DataFrame({"a": pa_array}) + result = df.to_pandas(arrow_type=True) + expected = pd.DataFrame({"a": pd.arrays.ArrowExtensionArray(pa_array)}) + pd.testing.assert_frame_equal(result, expected) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index cced05d2217..51e9a3022f4 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3,6 +3,7 @@ """ Test related to Index """ +import datetime import operator import re @@ -3138,3 +3139,40 @@ def test_from_pandas_rangeindex_return_rangeindex(): def test_index_to_pandas_nullable_notimplemented(idx): with pytest.raises(NotImplementedError): idx.to_pandas(nullable=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), 
+ datetime.timedelta(1), + {"1": 2}, + ], +) +def test_index_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = pa.array([scalar, None]) + idx = cudf.Index(pa_array) + with pytest.raises(ValueError): + idx.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + ], +) +def test_index_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + idx = cudf.Index(pa_array) + result = idx.to_pandas(arrow_type=True) + expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array)) + pd.testing.assert_index_equal(result, expected) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index a13fe333107..4926d79e734 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -3,6 +3,7 @@ """ Test related to MultiIndex """ +import datetime import itertools import operator import pickle @@ -13,6 +14,7 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -2118,3 +2120,35 @@ def test_multiindex_from_arrays(array): def test_multiindex_from_arrays_wrong_arg(arg): with pytest.raises(TypeError): cudf.MultiIndex.from_arrays(arg) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + ], +) +def test_index_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = pa.array([scalar, None]) + midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]]) + with pytest.raises(ValueError): + midx.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [1, 1.0, "a", datetime.datetime(2020, 1, 1), datetime.timedelta(1)], +) +def test_index_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]]) + result = midx.to_pandas(arrow_type=True) 
+ expected = pd.MultiIndex( + levels=[pd.arrays.ArrowExtensionArray(pa_array)], codes=[[0]] + ) + pd.testing.assert_index_equal(result, expected) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index caf8947e3b0..6b5c0406deb 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,5 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - +import datetime import decimal import hashlib import operator @@ -2708,3 +2708,44 @@ def test_series_from_large_string(): expected = pd.Series(pa_large_string_array) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_series_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = pa.array([scalar, None]) + ser = cudf.Series(pa_array) + with pytest.raises(ValueError): + ser.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_series_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + ser = cudf.Series(pa_array) + result = ser.to_pandas(arrow_type=True) + expected = pd.Series(pd.arrays.ArrowExtensionArray(pa_array)) + pd.testing.assert_series_equal(result, expected) From b00e9e09ca21bdea8109145dbf184dfca4a539ea Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 1 Mar 2024 08:44:45 -0600 Subject: [PATCH 3/3] Update python/cudf/cudf/core/series.py --- python/cudf/cudf/core/series.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b3c11138e4a..cb5008af3ad 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2054,6 +2054,7 @@ def to_pandas( dtype: float64 ``arrow_type=True`` converts the 
result to ``pandas.ArrowDtype``: + >>> ser.to_pandas(arrow_type=True) 0 10 1 20