From e9ac68a9ca77e992a823bc137991dd3236a79145 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Mon, 27 Feb 2023 01:02:04 +0100
Subject: [PATCH] ENH: Improve replace lazy copy for categoricals

---
 pandas/core/internals/blocks.py        |  3 +--
 pandas/tests/copy_view/test_replace.py | 32 ++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 35a7855b8240f..70367fdab0048 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -701,8 +701,7 @@ def replace_list(
             # TODO: avoid special-casing
             # GH49404
             if using_cow and inplace:
-                # TODO(CoW): Optimize
-                blk = self.copy()
+                blk = self.copy(deep=self.refs.has_reference())
             else:
                 blk = self if inplace else self.copy()
             values = cast(Categorical, blk.values)
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
index 7cd197541ac33..2c44752459c77 100644
--- a/pandas/tests/copy_view/test_replace.py
+++ b/pandas/tests/copy_view/test_replace.py
@@ -112,6 +112,38 @@ def test_replace_to_replace_wrong_dtype(using_copy_on_write):
     assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
 
 
+def test_replace_list_categorical(using_copy_on_write):
+    df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
+    arr = get_array(df, "a")
+    df.replace(["c"], value="a", inplace=True)
+    assert np.shares_memory(arr.codes, get_array(df, "a").codes)
+    if using_copy_on_write:
+        assert df._mgr._has_no_reference(0)
+
+    df_orig = df.copy()
+    df2 = df.replace(["b"], value="a")
+    assert not np.shares_memory(arr.codes, get_array(df2, "a").codes)
+
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_replace_list_inplace_refs_categorical(using_copy_on_write):
+    df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
+    view = df[:]
+    df_orig = df.copy()
+    df.replace(["c"], value="a", inplace=True)
+    if using_copy_on_write:
+        assert not np.shares_memory(
+            get_array(view, "a").codes, get_array(df, "a").codes
+        )
+        tm.assert_frame_equal(df_orig, view)
+    else:
+        # This could be inplace
+        assert not np.shares_memory(
+            get_array(view, "a").codes, get_array(df, "a").codes
+        )
+
+
 def test_replace_inplace(using_copy_on_write):
     df = DataFrame({"a": [1.5, 2, 3]})
     arr_a = get_array(df, "a")