From 32ee9732b823448b87848f6bcaefdc762868999c Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sat, 7 Jul 2018 16:24:56 +0200 Subject: [PATCH] BUG: fix DataFrame.__getitem__ and .loc with non-list listlikes (#21313) * BUG: fix DataFrame.__getitem__ and .loc with non-list listlikes close #21294 close #21428 --- doc/source/whatsnew/v0.24.0.txt | 2 + pandas/core/frame.py | 108 +++++++++++++----------- pandas/core/sparse/frame.py | 26 ++---- pandas/tests/frame/test_constructors.py | 31 ++++--- pandas/tests/frame/test_indexing.py | 61 ++++++------- 5 files changed, 122 insertions(+), 106 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 79e075d8a89c2..7362e11b22189 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -355,6 +355,8 @@ Indexing - When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index, rather than falling back to positional indexing (:issue:`21593`) - Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) - Bug in :class:`DataFrame` when setting values with ``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) +- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) +- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) - Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c4e0849aef2da..41934f6adbbd3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2672,68 +2672,80 @@ def _ixs(self, i, axis=0): def __getitem__(self, key): key = com._apply_if_callable(key, self) - # shortcut if we are an actual column - is_mi_columns = isinstance(self.columns, MultiIndex) + # shortcut if the key is in columns try: - if key in self.columns and not is_mi_columns: - return self._getitem_column(key) - except: + if self.columns.is_unique and key in self.columns: + if self.columns.nlevels > 1: + return self._getitem_multilevel(key) + return self._get_item_cache(key) + except (TypeError, ValueError): + # The TypeError correctly catches non hashable "key" (e.g. list) + # The ValueError can be removed once GH #21729 is fixed pass - # see if we can slice the rows + # Do we have a slicer (on rows)? indexer = convert_to_index_sliceable(self, key) if indexer is not None: - return self._getitem_slice(indexer) + return self._slice(indexer, axis=0) - if isinstance(key, (Series, np.ndarray, Index, list)): - # either boolean or fancy integer index - return self._getitem_array(key) - elif isinstance(key, DataFrame): + # Do we have a (boolean) DataFrame? + if isinstance(key, DataFrame): return self._getitem_frame(key) - elif is_mi_columns: - return self._getitem_multilevel(key) + + # Do we have a (boolean) 1d indexer? + if com.is_bool_indexer(key): + return self._getitem_bool_array(key) + + # We are left with two options: a single key, and a collection of keys, + # We interpret tuples as collections only for non-MultiIndex + is_single_key = isinstance(key, tuple) or not is_list_like(key) + + if is_single_key: + if self.columns.nlevels > 1: + return self._getitem_multilevel(key) + indexer = self.columns.get_loc(key) + if is_integer(indexer): + indexer = [indexer] else: - return self._getitem_column(key) + if is_iterator(key): + key = list(key) + indexer = self.loc._convert_to_indexer(key, axis=1, + raise_missing=True) - def _getitem_column(self, key): - """ return the actual column """ + # take() does not accept boolean indexers + if getattr(indexer, "dtype", None) == bool: + indexer = np.where(indexer)[0] - # get column - if self.columns.is_unique: - return self._get_item_cache(key) + data = self._take(indexer, axis=1) - # duplicate columns & possible reduce dimensionality - result = self._constructor(self._data.get(key)) - if result.columns.is_unique: - result = result[key] + if is_single_key: + # What does looking for a single key in a non-unique index return? + # The behavior is inconsistent. It returns a Series, except when + # - the key itself is repeated (test on data.shape, #9519), or + # - we have a MultiIndex on columns (test on self.columns, #21309) + if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): + data = data[key] - return result - - def _getitem_slice(self, key): - return self._slice(key, axis=0) + return data - def _getitem_array(self, key): + def _getitem_bool_array(self, key): # also raises Exception if object array with NA values - if com.is_bool_indexer(key): - # warning here just in case -- previously __setitem__ was - # reindexing but __getitem__ was not; it seems more reasonable to - # go with the __setitem__ behavior since that is more consistent - # with all other indexing behavior - if isinstance(key, Series) and not key.index.equals(self.index): - warnings.warn("Boolean Series key will be reindexed to match " - "DataFrame index.", UserWarning, stacklevel=3) - elif len(key) != len(self.index): - raise ValueError('Item wrong length %d instead of %d.' % - (len(key), len(self.index))) - # check_bool_indexer will throw exception if Series key cannot - # be reindexed to match DataFrame rows - key = check_bool_indexer(self.index, key) - indexer = key.nonzero()[0] - return self._take(indexer, axis=0) - else: - indexer = self.loc._convert_to_indexer(key, axis=1, - raise_missing=True) - return self._take(indexer, axis=1) + # warning here just in case -- previously __setitem__ was + # reindexing but __getitem__ was not; it seems more reasonable to + # go with the __setitem__ behavior since that is more consistent + # with all other indexing behavior + if isinstance(key, Series) and not key.index.equals(self.index): + warnings.warn("Boolean Series key will be reindexed to match " + "DataFrame index.", UserWarning, stacklevel=3) + elif len(key) != len(self.index): + raise ValueError('Item wrong length %d instead of %d.' % + (len(key), len(self.index))) + + # check_bool_indexer will throw exception if Series key cannot + # be reindexed to match DataFrame rows + key = check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + return self._take(indexer, axis=0) def _getitem_multilevel(self, key): loc = self.columns.get_loc(key) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 28299fbe61daf..1feddf004058a 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -148,9 +148,10 @@ def _init_dict(self, data, index, columns, dtype=None): if index is None: index = extract_index(list(data.values())) - sp_maker = lambda x: SparseArray(x, kind=self._default_kind, - fill_value=self._default_fill_value, - copy=True, dtype=dtype) + def sp_maker(x): + return SparseArray(x, kind=self._default_kind, + fill_value=self._default_fill_value, + copy=True, dtype=dtype) sdict = {} for k, v in compat.iteritems(data): if isinstance(v, Series): @@ -397,9 +398,10 @@ def _sanitize_column(self, key, value, **kwargs): sanitized_column : SparseArray """ - sp_maker = lambda x, index=None: SparseArray( - x, index=index, fill_value=self._default_fill_value, - kind=self._default_kind) + def sp_maker(x, index=None): + return SparseArray(x, index=index, + fill_value=self._default_fill_value, + kind=self._default_kind) if isinstance(value, SparseSeries): clean = value.reindex(self.index).as_sparse_array( fill_value=self._default_fill_value, kind=self._default_kind) @@ -428,18 +430,6 @@ def _sanitize_column(self, key, value, **kwargs): # always return a SparseArray! return clean - def __getitem__(self, key): - """ - Retrieve column or slice from DataFrame - """ - if isinstance(key, slice): - date_rng = self.index[key] - return self.reindex(date_rng) - elif isinstance(key, (np.ndarray, list, Series)): - return self._getitem_array(key) - else: - return self._get_item_cache(key) - def get_value(self, index, col, takeable=False): """ Quickly retrieve single value at passed column and index diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e7fb765128738..bef38288ff3a5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -501,9 +501,11 @@ def test_constructor_dict_of_tuples(self): tm.assert_frame_equal(result, expected, check_dtype=False) def test_constructor_dict_multiindex(self): - check = lambda result, expected: tm.assert_frame_equal( - result, expected, check_dtype=True, check_index_type=True, - check_column_type=True, check_names=True) + def check(result, expected): + return tm.assert_frame_equal(result, expected, check_dtype=True, + check_index_type=True, + check_column_type=True, + check_names=True) d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2}, ('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4}, ('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}} @@ -1655,19 +1657,21 @@ def check(df): for i in range(len(df.columns)): df.iloc[:, i] - # allow single nans to succeed indexer = np.arange(len(df.columns))[isna(df.columns)] - if len(indexer) == 1: - tm.assert_series_equal(df.iloc[:, indexer[0]], - df.loc[:, np.nan]) - - # multiple nans should fail - else: - + # No NaN found -> error + if len(indexer) == 0: def f(): df.loc[:, np.nan] pytest.raises(TypeError, f) + # single nan should result in Series + elif len(indexer) == 1: + tm.assert_series_equal(df.iloc[:, indexer[0]], + df.loc[:, np.nan]) + # multiple nans should result in DataFrame + else: + tm.assert_frame_equal(df.iloc[:, indexer], + df.loc[:, np.nan]) df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan]) check(df) @@ -1683,6 +1687,11 @@ def f(): columns=[np.nan, 1.1, 2.2, np.nan]) check(df) + # GH 21428 (non-unique columns) + df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], + columns=[np.nan, 1, 2, 2]) + check(df) + def test_constructor_lists_to_object_dtype(self): # from #1074 d = DataFrame({'a': [np.nan, False]}) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 90f3bede482c6..9ca2b7e3c8a6a 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -92,45 +92,46 @@ def test_get(self): result = df.get(None) assert result is None - def test_getitem_iterator(self): + def test_loc_iterable(self): idx = iter(['A', 'B', 'C']) result = self.frame.loc[:, idx] expected = self.frame.loc[:, ['A', 'B', 'C']] assert_frame_equal(result, expected) - idx = iter(['A', 'B', 'C']) - result = self.frame.loc[:, idx] - expected = self.frame.loc[:, ['A', 'B', 'C']] - assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "idx_type", + [list, iter, Index, set, + lambda l: dict(zip(l, range(len(l)))), + lambda l: dict(zip(l, range(len(l)))).keys()], + ids=["list", "iter", "Index", "set", "dict", "dict_keys"]) + @pytest.mark.parametrize("levels", [1, 2]) + def test_getitem_listlike(self, idx_type, levels): + # GH 21294 + + if levels == 1: + frame, missing = self.frame, 'food' + else: + # MultiIndex columns + frame = DataFrame(randn(8, 3), + columns=Index([('foo', 'bar'), ('baz', 'qux'), + ('peek', 'aboo')], + name=('sth', 'sth2'))) + missing = ('good', 'food') - def test_getitem_list(self): - self.frame.columns.name = 'foo' + keys = [frame.columns[1], frame.columns[0]] + idx = idx_type(keys) + idx_check = list(idx_type(keys)) - result = self.frame[['B', 'A']] - result2 = self.frame[Index(['B', 'A'])] + result = frame[idx] - expected = self.frame.loc[:, ['B', 'A']] - expected.columns.name = 'foo' + expected = frame.loc[:, idx_check] + expected.columns.names = frame.columns.names assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) - assert result.columns.name == 'foo' - - with tm.assert_raises_regex(KeyError, 'not in index'): - self.frame[['B', 'A', 'food']] + idx = idx_type(keys + [missing]) with tm.assert_raises_regex(KeyError, 'not in index'): - self.frame[Index(['B', 'A', 'foo'])] - - # tuples - df = DataFrame(randn(8, 3), - columns=Index([('foo', 'bar'), ('baz', 'qux'), - ('peek', 'aboo')], name=('sth', 'sth2'))) - - result = df[[('foo', 'bar'), ('baz', 'qux')]] - expected = df.iloc[:, :2] - assert_frame_equal(result, expected) - assert result.columns.names == ('sth', 'sth2') + frame[idx] def test_getitem_callable(self): # GH 12533 @@ -223,7 +224,8 @@ def test_setitem_callable(self): def test_setitem_other_callable(self): # GH 13299 - inc = lambda x: x + 1 + def inc(x): + return x + 1 df = pd.DataFrame([[-1, 1], [1, -1]]) df[df > 0] = inc @@ -2082,7 +2084,8 @@ def test_reindex_level(self): icol = ['jim', 'joe', 'jolie'] def verify_first_level(df, level, idx, check_index_type=True): - f = lambda val: np.nonzero(df[level] == val)[0] + def f(val): + return np.nonzero(df[level] == val)[0] i = np.concatenate(list(map(f, idx))) left = df.set_index(icol).reindex(idx, level=level) right = df.iloc[i].set_index(icol)