From e644273bad5c4b59edd16b4cb30f370f71e03b12 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 14 Apr 2023 21:29:35 -0400 Subject: [PATCH 1/3] BUG: pd.NA showing up as NaN in Categorical repr --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/arrays/categorical.py | 30 +++++++++++++++----- pandas/tests/arrays/categorical/test_repr.py | 17 +++++++++++ 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e860d59f2e5bd..51be96542ee38 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -272,6 +272,7 @@ Bug fixes Categorical ^^^^^^^^^^^ +- Bug in :meth:`Categorical.__repr__` and :meth:`Series.__repr__`, where :class:`Categorical`'s having categories backed by a :class:`pandas.api.extensions.ExtensionDtype` had null values show up as "NaN" instead of ``ExtensionDtype.na_value`` (:issue:`52681`) - Bug in :meth:`Series.map` , where the value of the ``na_action`` parameter was not used if the series held a :class:`Categorical` (:issue:`22527`). 
- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index adb083c16a838..cbbcb2e43fb53 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1599,7 +1599,14 @@ def _internal_get_values(self): if needs_i8_conversion(self.categories.dtype): return self.categories.take(self._codes, fill_value=NaT) elif is_integer_dtype(self.categories) and -1 in self._codes: - return self.categories.astype("object").take(self._codes, fill_value=np.nan) + fill_value = self.categories.dtype.na_value + if is_extension_array_dtype(self.categories.dtype): + # Nullable integer dtype + # Don't astype to object + return self.categories.take(self._codes, fill_value=fill_value) + return self.categories.astype("object").take( + self._codes, fill_value=fill_value + ) return np.array(self) def check_for_ordered(self, op) -> None: @@ -1911,14 +1918,18 @@ def _formatter(self, boxed: bool = False): # Defer to CategoricalFormatter's formatter. return None - def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str: + def _tidy_repr( + self, max_vals: int = 10, footer: bool = True, na_rep: str = "NaN" + ) -> str: """ a short repr displaying only max_vals and an optional (but default footer) """ num = max_vals // 2 - head = self[:num]._get_repr(length=False, footer=False) - tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) + head = self[:num]._get_repr(length=False, footer=False, na_rep=na_rep) + tail = self[-(max_vals - num) :]._get_repr( + length=False, footer=False, na_rep=na_rep + ) result = f"{head[:-1]}, ..., {tail[1:]}" if footer: @@ -2001,12 +2012,17 @@ def __repr__(self) -> str: String representation. 
""" _maxlen = 10 + na_repr = "NaN" + if is_extension_array_dtype(self.categories.dtype): + na_repr = repr(self.categories.dtype.na_value) if len(self._codes) > _maxlen: - result = self._tidy_repr(_maxlen) + result = self._tidy_repr(_maxlen, na_rep=na_repr) elif len(self._codes) > 0: - result = self._get_repr(length=len(self) > _maxlen) + result = self._get_repr(length=len(self) > _maxlen, na_rep=na_repr) else: - msg = self._get_repr(length=False, footer=True).replace("\n", ", ") + msg = self._get_repr(length=False, footer=True, na_rep=na_repr).replace( + "\n", ", " + ) result = f"[], {msg}" return result diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index ffc44b30a3870..1ee90d53f9509 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,15 +1,19 @@ import numpy as np +import pytest from pandas import ( + NA, Categorical, CategoricalDtype, CategoricalIndex, Series, + array, date_range, option_context, period_range, timedelta_range, ) +import pandas._testing as tm class TestCategoricalReprWithFactor: @@ -253,6 +257,19 @@ def test_categorical_repr_int_with_nan(self): Categories (2, int64): [1, 2]""" assert repr(s) == s_exp + @pytest.mark.parametrize("values_dtype", tm.ALL_INT_EA_DTYPES) + def test_categorical_repr_nullable_int_NA(self, values_dtype): + arr = array([1, 2, np.nan], dtype=values_dtype) + c = Categorical(arr) + c_exp = f"""[1, 2, {NA}]\nCategories (2, {values_dtype}): [1, 2]""" + assert repr(c) == c_exp + + s = Series([1, 2, np.nan], dtype=values_dtype).astype("category") + s_exp = """0 1\n1 2\n2 +dtype: category +Categories (2, Int64): [1, 2]""" + assert repr(s) == s_exp + def test_categorical_repr_period(self): idx = period_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx) From 70d9ddffaff10761cc7db93c4641ddb4247e23db Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 15 
Apr 2023 07:05:47 -0400 Subject: [PATCH 2/3] fix tests and address code review --- pandas/core/arrays/categorical.py | 8 +++----- pandas/tests/arrays/categorical/test_repr.py | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index cbbcb2e43fb53..e5628930cd2e5 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1599,14 +1599,12 @@ def _internal_get_values(self): if needs_i8_conversion(self.categories.dtype): return self.categories.take(self._codes, fill_value=NaT) elif is_integer_dtype(self.categories) and -1 in self._codes: - fill_value = self.categories.dtype.na_value - if is_extension_array_dtype(self.categories.dtype): + if isinstance(self.categories.dtype, ExtensionDtype): # Nullable integer dtype # Don't astype to object + fill_value = self.categories.dtype.na_value return self.categories.take(self._codes, fill_value=fill_value) - return self.categories.astype("object").take( - self._codes, fill_value=fill_value - ) + return self.categories.astype("object").take(self._codes, fill_value=np.nan) return np.array(self) def check_for_ordered(self, op) -> None: diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 1ee90d53f9509..ffb8162b7964b 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -265,9 +265,9 @@ def test_categorical_repr_nullable_int_NA(self, values_dtype): assert repr(c) == c_exp s = Series([1, 2, np.nan], dtype=values_dtype).astype("category") - s_exp = """0 1\n1 2\n2 + s_exp = f"""0 1\n1 2\n2 dtype: category -Categories (2, Int64): [1, 2]""" +Categories (2, {values_dtype}): [1, 2]""" assert repr(s) == s_exp def test_categorical_repr_period(self): From 2d8c80b81b792c1c8cd0fce7e41b5d2b83ccc2cb Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 15 Apr 2023 
08:20:06 -0400 Subject: [PATCH 3/3] fix np.nan showing up as nan --- pandas/core/arrays/categorical.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e5628930cd2e5..606b247442a43 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2011,8 +2011,10 @@ def __repr__(self) -> str: """ _maxlen = 10 na_repr = "NaN" - if is_extension_array_dtype(self.categories.dtype): - na_repr = repr(self.categories.dtype.na_value) + if isinstance(self.categories.dtype, ExtensionDtype): + # np.nan should show up as NaN, not as nan + if self.categories.dtype.na_value is not np.nan: + na_repr = repr(self.categories.dtype.na_value) if len(self._codes) > _maxlen: result = self._tidy_repr(_maxlen, na_rep=na_repr) elif len(self._codes) > 0: