From dfc73b8461d174df6168ccca5e54b4e08ee4319a Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 8 Dec 2020 17:39:00 -0800 Subject: [PATCH 1/2] BUG: item_cache invalidation on DataFrame.insert --- pandas/core/internals/managers.py | 10 +++++++++- pandas/tests/frame/indexing/test_insert.py | 14 ++++++++++++++ pandas/tests/frame/test_block_internals.py | 15 +++++++++------ pandas/tests/io/sas/test_sas7bdat.py | 7 +++++-- 4 files changed, 37 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0b3f1079cdb16..e939c43015aed 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -18,6 +18,7 @@ from pandas._libs import internals as libinternals, lib from pandas._typing import ArrayLike, DtypeObj, Label, Shape +from pandas.errors import PerformanceWarning from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -1222,7 +1223,14 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): self._known_consolidated = False if len(self.blocks) > 100: - self._consolidate_inplace() + warnings.warn( + "DataFrame is highly fragmented. This is usually the result " + "of calling `frame.insert` many times, which has poor performance. " + "Consider using pd.concat instead. To get a de-fragmented frame, " + "use `newframe = frame.copy()`", + PerformanceWarning, + stacklevel=5, + ) def reindex_axis( self, diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 622c93d1c2fdc..6e4deb5469777 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + from pandas import DataFrame, Index import pandas._testing as tm @@ -66,3 +68,15 @@ def test_insert_with_columns_dups(self): [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] ) tm.assert_frame_equal(df, exp) + + def test_insert_item_cache(self): + df = DataFrame(np.random.randn(4, 3)) + ser = df[0] + + with tm.assert_produces_warning(PerformanceWarning): + for n in range(100): + df[n + 3] = df[1] * n + + ser.values[0] = 99 + + assert df.iloc[0, 0] == df[0][0] diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 5513262af8100..8954d8a0e7598 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import ( Categorical, @@ -329,12 +331,13 @@ def test_strange_column_corruption_issue(self): df[0] = np.nan wasCol = {} - for i, dt in enumerate(df.index): - for col in range(100, 200): - if col not in wasCol: - wasCol[col] = 1 - df[col] = np.nan - df[col][dt] = i + with tm.assert_produces_warning(PerformanceWarning): + for i, dt in enumerate(df.index): + for col in range(100, 200): + if col not in wasCol: + wasCol[col] = 1 + df[col] = np.nan + df[col][dt] = i myid = 100 diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index cca62c5af59a1..1ce1ba9d2caae 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas.errors import EmptyDataError +from pandas.errors import EmptyDataError, PerformanceWarning import pandas.util._test_decorators as td import pandas as pd @@ -194,7 +194,10 @@ def test_compact_numerical_values(datapath): def test_many_columns(datapath): # Test for looking for column information in more places (PR #22628) fname = datapath("io", "sas", "data", "many_columns.sas7bdat") - df = pd.read_sas(fname, encoding="latin-1") + with tm.assert_produces_warning(PerformanceWarning): + # Many DataFrame.insert calls + df = pd.read_sas(fname, encoding="latin-1") + fname = datapath("io", "sas", "data", "many_columns.csv") df0 = pd.read_csv(fname, encoding="latin-1") tm.assert_frame_equal(df, df0) From 0549739ee44863387bf610a917d5ad9862a23aa7 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Dec 2020 09:56:36 -0800 Subject: [PATCH 2/2] Whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 90f611c55e710..d0afc24aaecac 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -127,7 +127,7 @@ Interval Indexing ^^^^^^^^ - +- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - -