BUG: fix DataFrame.__getitem__ and .loc with non-list listlikes (pandas-dev#21313)

* BUG: fix DataFrame.__getitem__ and .loc with non-list listlikes

close pandas-dev#21294
close pandas-dev#21428
toobaz authored and victor committed Sep 30, 2018
1 parent 0502f16 commit 70d8be2
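
For context, a brief caller-side sketch of the behaviour this commit targets (aside, not part of the diff; the frames below are made up and assume a pandas build containing this commit):

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})

# GH 21294: dicts, dict keys, sets and iterators are now accepted by
# DataFrame.__getitem__ as list-likes of column labels, consistently
# with Series.__getitem__
df[{'A': 0, 'B': 1}.keys()]    # equivalent to df[['A', 'B']]
df.loc[:, iter(['A', 'B'])]    # .loc also accepts iterables of labels

# GH 21428: DataFrame[np.nan] no longer breaks when columns are non-unique
df2 = pd.DataFrame([[0, 1, 2, 3]], columns=[np.nan, 1, 2, 2])
df2[np.nan]                    # the single NaN-labelled column, as a Series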
Showing 5 changed files with 122 additions and 106 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -355,6 +355,8 @@ Indexing
- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`)
- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`)
- Bug in :class:`DataFrame` when setting values with ``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`)
- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`)
- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`)
- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`)

-
108 changes: 60 additions & 48 deletions pandas/core/frame.py
@@ -2672,68 +2672,80 @@ def _ixs(self, i, axis=0):
def __getitem__(self, key):
key = com._apply_if_callable(key, self)

# shortcut if we are an actual column
is_mi_columns = isinstance(self.columns, MultiIndex)
# shortcut if the key is in columns
try:
if key in self.columns and not is_mi_columns:
return self._getitem_column(key)
except:
if self.columns.is_unique and key in self.columns:
if self.columns.nlevels > 1:
return self._getitem_multilevel(key)
return self._get_item_cache(key)
except (TypeError, ValueError):
# The TypeError correctly catches non hashable "key" (e.g. list)
# The ValueError can be removed once GH #21729 is fixed
pass

# see if we can slice the rows
# Do we have a slicer (on rows)?
indexer = convert_to_index_sliceable(self, key)
if indexer is not None:
return self._getitem_slice(indexer)
return self._slice(indexer, axis=0)

if isinstance(key, (Series, np.ndarray, Index, list)):
# either boolean or fancy integer index
return self._getitem_array(key)
elif isinstance(key, DataFrame):
# Do we have a (boolean) DataFrame?
if isinstance(key, DataFrame):
return self._getitem_frame(key)
elif is_mi_columns:
return self._getitem_multilevel(key)

# Do we have a (boolean) 1d indexer?
if com.is_bool_indexer(key):
return self._getitem_bool_array(key)

# We are left with two options: a single key, and a collection of keys,
# We interpret tuples as collections only for non-MultiIndex
is_single_key = isinstance(key, tuple) or not is_list_like(key)

if is_single_key:
if self.columns.nlevels > 1:
return self._getitem_multilevel(key)
indexer = self.columns.get_loc(key)
if is_integer(indexer):
indexer = [indexer]
else:
return self._getitem_column(key)
if is_iterator(key):
key = list(key)
indexer = self.loc._convert_to_indexer(key, axis=1,
raise_missing=True)

def _getitem_column(self, key):
""" return the actual column """
# take() does not accept boolean indexers
if getattr(indexer, "dtype", None) == bool:
indexer = np.where(indexer)[0]

# get column
if self.columns.is_unique:
return self._get_item_cache(key)
data = self._take(indexer, axis=1)

# duplicate columns & possible reduce dimensionality
result = self._constructor(self._data.get(key))
if result.columns.is_unique:
result = result[key]
if is_single_key:
# What does looking for a single key in a non-unique index return?
# The behavior is inconsistent. It returns a Series, except when
# - the key itself is repeated (test on data.shape, #9519), or
# - we have a MultiIndex on columns (test on self.columns, #21309)
if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
data = data[key]

return result

def _getitem_slice(self, key):
return self._slice(key, axis=0)
return data
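
As a rough caller-side sketch of the new dispatch above (aside, not part of the diff; the frame is made up):

import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})

df['A']                # hashable key in unique columns -> cached column (Series)
df[:1]                 # slice -> row slice via _slice(..., axis=0)
df[df > 0]             # DataFrame key -> _getitem_frame
df[[True, False]]      # 1d boolean indexer -> _getitem_bool_array
df[['B', 'A']]         # collection of labels -> _convert_to_indexer + _take(axis=1)
df[iter(['A', 'C'])]   # iterators are materialised into a list first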

def _getitem_array(self, key):
def _getitem_bool_array(self, key):
# also raises Exception if object array with NA values
if com.is_bool_indexer(key):
# warning here just in case -- previously __setitem__ was
# reindexing but __getitem__ was not; it seems more reasonable to
# go with the __setitem__ behavior since that is more consistent
# with all other indexing behavior
if isinstance(key, Series) and not key.index.equals(self.index):
warnings.warn("Boolean Series key will be reindexed to match "
"DataFrame index.", UserWarning, stacklevel=3)
elif len(key) != len(self.index):
raise ValueError('Item wrong length %d instead of %d.' %
(len(key), len(self.index)))
# check_bool_indexer will throw exception if Series key cannot
# be reindexed to match DataFrame rows
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
return self._take(indexer, axis=0)
else:
indexer = self.loc._convert_to_indexer(key, axis=1,
raise_missing=True)
return self._take(indexer, axis=1)
# warning here just in case -- previously __setitem__ was
# reindexing but __getitem__ was not; it seems more reasonable to
# go with the __setitem__ behavior since that is more consistent
# with all other indexing behavior
if isinstance(key, Series) and not key.index.equals(self.index):
warnings.warn("Boolean Series key will be reindexed to match "
"DataFrame index.", UserWarning, stacklevel=3)
elif len(key) != len(self.index):
raise ValueError('Item wrong length %d instead of %d.' %
(len(key), len(self.index)))

# check_bool_indexer will throw exception if Series key cannot
# be reindexed to match DataFrame rows
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
return self._take(indexer, axis=0)
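
A small usage sketch of the boolean-array path above (made-up data): a boolean Series whose index does not match the frame's index is reindexed with a UserWarning, while a plain boolean list of the wrong length raises ValueError.

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3]}, index=['x', 'y', 'z'])

mask = pd.Series([True, False, True], index=['z', 'y', 'x'])
df[mask]             # UserWarning: Boolean Series key will be reindexed ...

df[[True, False]]    # ValueError: Item wrong length 2 instead of 3.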

def _getitem_multilevel(self, key):
loc = self.columns.get_loc(key)
26 changes: 8 additions & 18 deletions pandas/core/sparse/frame.py
@@ -148,9 +148,10 @@ def _init_dict(self, data, index, columns, dtype=None):
if index is None:
index = extract_index(list(data.values()))

sp_maker = lambda x: SparseArray(x, kind=self._default_kind,
fill_value=self._default_fill_value,
copy=True, dtype=dtype)
def sp_maker(x):
return SparseArray(x, kind=self._default_kind,
fill_value=self._default_fill_value,
copy=True, dtype=dtype)
sdict = {}
for k, v in compat.iteritems(data):
if isinstance(v, Series):
@@ -397,9 +398,10 @@ def _sanitize_column(self, key, value, **kwargs):
sanitized_column : SparseArray
"""
sp_maker = lambda x, index=None: SparseArray(
x, index=index, fill_value=self._default_fill_value,
kind=self._default_kind)
def sp_maker(x, index=None):
return SparseArray(x, index=index,
fill_value=self._default_fill_value,
kind=self._default_kind)
if isinstance(value, SparseSeries):
clean = value.reindex(self.index).as_sparse_array(
fill_value=self._default_fill_value, kind=self._default_kind)
@@ -428,18 +430,6 @@ def _sanitize_column(self, key, value, **kwargs):
# always return a SparseArray!
return clean

def __getitem__(self, key):
"""
Retrieve column or slice from DataFrame
"""
if isinstance(key, slice):
date_rng = self.index[key]
return self.reindex(date_rng)
elif isinstance(key, (np.ndarray, list, Series)):
return self._getitem_array(key)
else:
return self._get_item_cache(key)
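
With the SparseDataFrame-specific __getitem__ removed above, column selection falls through to the shared DataFrame.__getitem__; a minimal sketch (made-up sparse frame, assuming the inherited path handles sparse frames as this commit's tests expect):

import pandas as pd

sdf = pd.SparseDataFrame({'A': [1.0, None], 'B': [None, 2.0]})
sdf['A']           # single column, as before
sdf[['A', 'B']]    # list-likes now go through the DataFrame implementation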

def get_value(self, index, col, takeable=False):
"""
Quickly retrieve single value at passed column and index
31 changes: 20 additions & 11 deletions pandas/tests/frame/test_constructors.py
@@ -501,9 +501,11 @@ def test_constructor_dict_of_tuples(self):
tm.assert_frame_equal(result, expected, check_dtype=False)

def test_constructor_dict_multiindex(self):
check = lambda result, expected: tm.assert_frame_equal(
result, expected, check_dtype=True, check_index_type=True,
check_column_type=True, check_names=True)
def check(result, expected):
return tm.assert_frame_equal(result, expected, check_dtype=True,
check_index_type=True,
check_column_type=True,
check_names=True)
d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2},
('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4},
('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}}
@@ -1655,19 +1657,21 @@ def check(df):
for i in range(len(df.columns)):
df.iloc[:, i]

# allow single nans to succeed
indexer = np.arange(len(df.columns))[isna(df.columns)]

if len(indexer) == 1:
tm.assert_series_equal(df.iloc[:, indexer[0]],
df.loc[:, np.nan])

# multiple nans should fail
else:

# No NaN found -> error
if len(indexer) == 0:
def f():
df.loc[:, np.nan]
pytest.raises(TypeError, f)
# single nan should result in Series
elif len(indexer) == 1:
tm.assert_series_equal(df.iloc[:, indexer[0]],
df.loc[:, np.nan])
# multiple nans should result in DataFrame
else:
tm.assert_frame_equal(df.iloc[:, indexer],
df.loc[:, np.nan])

df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan])
check(df)
@@ -1683,6 +1687,11 @@ def f():
columns=[np.nan, 1.1, 2.2, np.nan])
check(df)

# GH 21428 (non-unique columns)
df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]],
columns=[np.nan, 1, 2, 2])
check(df)
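
The branches above spell out the intended behaviour for NaN column labels; as a standalone illustration (aside, not part of the diff; made-up values mirroring the test data):

import numpy as np
import pandas as pd

df = pd.DataFrame([[1, 2, 3]], columns=[np.nan, 1.1, 2.2])
df.loc[:, np.nan]    # exactly one NaN label -> Series

df = pd.DataFrame([[1, 2, 3]], columns=[np.nan, 1.1, np.nan])
df.loc[:, np.nan]    # repeated NaN label -> DataFrame of both columns

df = pd.DataFrame([[1, 2, 3]], columns=[1.1, 2.2, 3.3])
df.loc[:, np.nan]    # no NaN label -> raises (TypeError, per the test above)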

def test_constructor_lists_to_object_dtype(self):
# from #1074
d = DataFrame({'a': [np.nan, False]})
61 changes: 32 additions & 29 deletions pandas/tests/frame/test_indexing.py
@@ -92,45 +92,46 @@ def test_get(self):
result = df.get(None)
assert result is None

def test_getitem_iterator(self):
def test_loc_iterable(self):
idx = iter(['A', 'B', 'C'])
result = self.frame.loc[:, idx]
expected = self.frame.loc[:, ['A', 'B', 'C']]
assert_frame_equal(result, expected)

idx = iter(['A', 'B', 'C'])
result = self.frame.loc[:, idx]
expected = self.frame.loc[:, ['A', 'B', 'C']]
assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"idx_type",
[list, iter, Index, set,
lambda l: dict(zip(l, range(len(l)))),
lambda l: dict(zip(l, range(len(l)))).keys()],
ids=["list", "iter", "Index", "set", "dict", "dict_keys"])
@pytest.mark.parametrize("levels", [1, 2])
def test_getitem_listlike(self, idx_type, levels):
# GH 21294

if levels == 1:
frame, missing = self.frame, 'food'
else:
# MultiIndex columns
frame = DataFrame(randn(8, 3),
columns=Index([('foo', 'bar'), ('baz', 'qux'),
('peek', 'aboo')],
name=('sth', 'sth2')))
missing = ('good', 'food')

def test_getitem_list(self):
self.frame.columns.name = 'foo'
keys = [frame.columns[1], frame.columns[0]]
idx = idx_type(keys)
idx_check = list(idx_type(keys))

result = self.frame[['B', 'A']]
result2 = self.frame[Index(['B', 'A'])]
result = frame[idx]

expected = self.frame.loc[:, ['B', 'A']]
expected.columns.name = 'foo'
expected = frame.loc[:, idx_check]
expected.columns.names = frame.columns.names

assert_frame_equal(result, expected)
assert_frame_equal(result2, expected)

assert result.columns.name == 'foo'

with tm.assert_raises_regex(KeyError, 'not in index'):
self.frame[['B', 'A', 'food']]
idx = idx_type(keys + [missing])
with tm.assert_raises_regex(KeyError, 'not in index'):
self.frame[Index(['B', 'A', 'foo'])]

# tuples
df = DataFrame(randn(8, 3),
columns=Index([('foo', 'bar'), ('baz', 'qux'),
('peek', 'aboo')], name=('sth', 'sth2')))

result = df[[('foo', 'bar'), ('baz', 'qux')]]
expected = df.iloc[:, :2]
assert_frame_equal(result, expected)
assert result.columns.names == ('sth', 'sth2')
frame[idx]
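
Put plainly, the parametrised test above asserts that all of the following spellings select the same columns (the frame and the missing 'food' label mirror the test but are otherwise made up):

import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})
keys = ['B', 'A']

df[keys]
df[iter(keys)]
df[pd.Index(keys)]
df[set(keys)]                  # set iteration order is not guaranteed
df[{'B': 0, 'A': 1}]           # a dict selects by its keys
df[{'B': 0, 'A': 1}.keys()]

df[['B', 'A', 'food']]         # missing label -> KeyError('... not in index')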

def test_getitem_callable(self):
# GH 12533
@@ -223,7 +224,8 @@ def test_setitem_callable(self):

def test_setitem_other_callable(self):
# GH 13299
inc = lambda x: x + 1
def inc(x):
return x + 1

df = pd.DataFrame([[-1, 1], [1, -1]])
df[df > 0] = inc
@@ -2082,7 +2084,8 @@ def test_reindex_level(self):
icol = ['jim', 'joe', 'jolie']

def verify_first_level(df, level, idx, check_index_type=True):
f = lambda val: np.nonzero(df[level] == val)[0]
def f(val):
return np.nonzero(df[level] == val)[0]
i = np.concatenate(list(map(f, idx)))
left = df.set_index(icol).reindex(idx, level=level)
right = df.iloc[i].set_index(icol)