From ccc1a25962f2d79d82b375a12c14f5ebd46f304c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 16 Dec 2021 22:27:43 +0100 Subject: [PATCH] Bug in concat casting all na levels to float (#44902) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/arrays/categorical.py | 9 ++++++++- pandas/tests/arrays/categorical/test_missing.py | 14 ++++++++++++++ pandas/tests/reshape/concat/test_datetimes.py | 13 +++++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 5c4b0e7e8371f..31c1336b02f54 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -809,6 +809,7 @@ Reshaping - Bug in :func:`crosstab` would fail when inputs are lists or tuples (:issue:`44076`) - Bug in :meth:`DataFrame.append` failing to retain ``index.name`` when appending a list of :class:`Series` objects (:issue:`44109`) - Fixed metadata propagation in :meth:`Dataframe.apply` method, consequently fixing the same issue for :meth:`Dataframe.transform`, :meth:`Dataframe.nunique` and :meth:`Dataframe.mode` (:issue:`28283`) +- Bug in :func:`concat` casting levels of :class:`MultiIndex` to float if they only consist of missing values (:issue:`44900`) - Bug in :meth:`DataFrame.stack` with ``ExtensionDtype`` columns incorrectly raising (:issue:`43561`) - Bug in :meth:`Series.unstack` with object doing unwanted type inference on resulting columns (:issue:`44595`) - Bug in :class:`MultiIndex` failing join operations with overlapping ``IntervalIndex`` levels (:issue:`44096`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 02db03aa4e273..f9d066f1e694d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -422,7 +422,14 @@ def __init__( # We remove null values here, then below will re-insert # them, grep "full_codes" arr_list = [values[idx] for idx in np.where(~null_mask)[0]] - arr 
= sanitize_array(arr_list, None) + + # GH#44900 Do not cast to float if we have only missing values + if arr_list or arr.dtype == "object": + sanitize_dtype = None + else: + sanitize_dtype = arr.dtype + + arr = sanitize_array(arr_list, None, dtype=sanitize_dtype) values = arr if dtype.categories is None: diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index f419aa6f181f2..fb5330a9665ff 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -197,3 +197,17 @@ def test_compare_categorical_with_missing(self, a1, a2, categories): result = Series(a1, dtype=cat_type) == Series(a2, dtype=cat_type) expected = Series(a1) == Series(a2) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "na_value, dtype", + [ + (pd.NaT, "datetime64[ns]"), + (None, "float64"), + (np.nan, "float64"), + (pd.NA, "float64"), + ], + ) + def test_categorical_only_missing_values_no_cast(self, na_value, dtype): + # GH#44900 + result = Categorical([na_value, na_value]) + tm.assert_index_equal(result.categories, Index([], dtype=dtype)) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index c4fe16b43313a..1af54a1d5cf4a 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -528,3 +528,16 @@ def test_concat_timedelta64_block(): result = concat([df, df]) tm.assert_frame_equal(result.iloc[:10], df) tm.assert_frame_equal(result.iloc[10:], df) + + +def test_concat_multiindex_datetime_nat(): + # GH#44900 + left = DataFrame({"a": 1}, index=MultiIndex.from_tuples([(1, pd.NaT)])) + right = DataFrame( + {"b": 2}, index=MultiIndex.from_tuples([(1, pd.NaT), (2, pd.NaT)]) + ) + result = concat([left, right], axis="columns") + expected = DataFrame( + {"a": [1.0, np.nan], "b": 2}, MultiIndex.from_tuples([(1, pd.NaT), (2, pd.NaT)]) + ) + 
tm.assert_frame_equal(result, expected)