Skip to content

Commit

Permalink
BUG: item_cache invalidation on DataFrame.insert (#38380)
Browse files: browse the repository at this point in the history
* BUG: item_cache invalidation on DataFrame.insert

* Whatsnew
  • Loading branch information
jbrockmendel authored Dec 11, 2020
1 parent 5f6bf14 commit d0db009
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 10 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ Interval

Indexing
^^^^^^^^

- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
-
-

Expand Down
10 changes: 9 additions & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from pandas._libs import internals as libinternals, lib
from pandas._typing import ArrayLike, DtypeObj, Label, Shape
from pandas.errors import PerformanceWarning
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -1222,7 +1223,14 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False):
self._known_consolidated = False

if len(self.blocks) > 100:
self._consolidate_inplace()
warnings.warn(
"DataFrame is highly fragmented. This is usually the result "
"of calling `frame.insert` many times, which has poor performance. "
"Consider using pd.concat instead. To get a de-fragmented frame, "
"use `newframe = frame.copy()`",
PerformanceWarning,
stacklevel=5,
)

def reindex_axis(
self,
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/frame/indexing/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import numpy as np
import pytest

from pandas.errors import PerformanceWarning

from pandas import DataFrame, Index
import pandas._testing as tm

Expand Down Expand Up @@ -66,3 +68,15 @@ def test_insert_with_columns_dups(self):
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
)
tm.assert_frame_equal(df, exp)

def test_insert_item_cache(self):
# Regression test for item_cache invalidation on DataFrame.insert (#38380).
df = DataFrame(np.random.randn(4, 3))
# Grab column 0 before any of the column insertions below happen.
ser = df[0]

# The repeated column assignments are expected to emit a PerformanceWarning.
with tm.assert_produces_warning(PerformanceWarning):
for n in range(100):
# Add a new column labelled n + 3, computed from column 1.
df[n + 3] = df[1] * n

# Mutate the first element through the Series obtained earlier.
ser.values[0] = 99

# Positional and label-based lookups of the same cell must agree.
assert df.iloc[0, 0] == df[0][0]
15 changes: 9 additions & 6 deletions pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import numpy as np
import pytest

from pandas.errors import PerformanceWarning

import pandas as pd
from pandas import (
Categorical,
Expand Down Expand Up @@ -329,12 +331,13 @@ def test_strange_column_corruption_issue(self):
df[0] = np.nan
wasCol = {}

for i, dt in enumerate(df.index):
for col in range(100, 200):
if col not in wasCol:
wasCol[col] = 1
df[col] = np.nan
df[col][dt] = i
with tm.assert_produces_warning(PerformanceWarning):
for i, dt in enumerate(df.index):
for col in range(100, 200):
if col not in wasCol:
wasCol[col] = 1
df[col] = np.nan
df[col][dt] = i

myid = 100

Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/io/sas/test_sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import numpy as np
import pytest

from pandas.errors import EmptyDataError
from pandas.errors import EmptyDataError, PerformanceWarning
import pandas.util._test_decorators as td

import pandas as pd
Expand Down Expand Up @@ -194,7 +194,10 @@ def test_compact_numerical_values(datapath):
def test_many_columns(datapath):
# Test for looking for column information in more places (PR #22628)
fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
df = pd.read_sas(fname, encoding="latin-1")
with tm.assert_produces_warning(PerformanceWarning):
# read_sas performs many DataFrame.insert calls, hence the PerformanceWarning
df = pd.read_sas(fname, encoding="latin-1")

fname = datapath("io", "sas", "data", "many_columns.csv")
df0 = pd.read_csv(fname, encoding="latin-1")
tm.assert_frame_equal(df, df0)
Expand Down

0 comments on commit d0db009

Please sign in to comment.