Skip to content

Commit

Permalink
DEPR: Error with ambiguous groupby strings (#22415)
Browse files Browse the repository at this point in the history
xref gh-14432.
  • Loading branch information
gfyoung authored and jreback committed Aug 22, 2018
1 parent d83f533 commit 25e6a21
Show file tree
Hide file tree
Showing 12 changed files with 51 additions and 270 deletions.
5 changes: 2 additions & 3 deletions doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,8 @@ consider the following ``DataFrame``:
.. versionadded:: 0.20

A string passed to ``groupby`` may refer to either a column or an index level.
If a string matches both a column name and an index level name then a warning is
issued and the column takes precedence. This will result in an ambiguity error
in a future version.
If a string matches both a column name and an index level name, a
``ValueError`` will be raised.

.. ipython:: python
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,7 @@ Removal of prior version deprecations/changes
- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`)
- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`)
- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`)
- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`)
-

.. _whatsnew_0240.performance:
Expand Down
7 changes: 2 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4393,7 +4393,6 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
kind='quicksort', na_position='last'):
inplace = validate_bool_kwarg(inplace, 'inplace')
axis = self._get_axis_number(axis)
stacklevel = 2 # Number of stack levels from df.sort_values

if not isinstance(by, list):
by = [by]
Expand All @@ -4405,8 +4404,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,

keys = []
for x in by:
k = self._get_label_or_level_values(x, axis=axis,
stacklevel=stacklevel)
k = self._get_label_or_level_values(x, axis=axis)
keys.append(k)
indexer = lexsort_indexer(keys, orders=ascending,
na_position=na_position)
Expand All @@ -4415,8 +4413,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
from pandas.core.sorting import nargsort

by = by[0]
k = self._get_label_or_level_values(by, axis=axis,
stacklevel=stacklevel)
k = self._get_label_or_level_values(by, axis=axis)

if isinstance(ascending, (tuple, list)):
ascending = ascending[0]
Expand Down
37 changes: 9 additions & 28 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1412,33 +1412,23 @@ def _is_label_or_level_reference(self, key, axis=0):
return (self._is_level_reference(key, axis=axis) or
self._is_label_reference(key, axis=axis))

def _check_label_or_level_ambiguity(self, key, axis=0, stacklevel=1):
def _check_label_or_level_ambiguity(self, key, axis=0):
"""
Check whether `key` matches both a level of the input `axis` and a
label of the other axis and raise a ``FutureWarning`` if this is the
case.
Check whether `key` is ambiguous.
Note: This method will be altered to raise an ambiguity exception in
a future version.
By ambiguous, we mean that it matches both a level of the input
`axis` and a label of the other axis.
Parameters
----------
key: str or object
label or level name
axis: int, default 0
Axis that levels are associated with (0 for index, 1 for columns)
stacklevel: int, default 1
Stack level used when a FutureWarning is raised (see below).
Returns
-------
ambiguous: bool
Raises
------
FutureWarning
if `key` is ambiguous. This will become an ambiguity error in a
future version
ValueError: `key` is ambiguous
"""

axis = self._get_axis_number(axis)
Expand All @@ -1464,21 +1454,15 @@ def _check_label_or_level_ambiguity(self, key, axis=0, stacklevel=1):
('an', 'index'))

msg = ("'{key}' is both {level_article} {level_type} level and "
"{label_article} {label_type} label.\n"
"Defaulting to {label_type}, but this will raise an "
"ambiguity error in a future version"
"{label_article} {label_type} label, which is ambiguous."
).format(key=key,
level_article=level_article,
level_type=level_type,
label_article=label_article,
label_type=label_type)
raise ValueError(msg)

warnings.warn(msg, FutureWarning, stacklevel=stacklevel + 1)
return True
else:
return False

def _get_label_or_level_values(self, key, axis=0, stacklevel=1):
def _get_label_or_level_values(self, key, axis=0):
"""
Return a 1-D array of values associated with `key`, a label or level
from the given `axis`.
Expand All @@ -1497,8 +1481,6 @@ def _get_label_or_level_values(self, key, axis=0, stacklevel=1):
Label or level name.
axis: int, default 0
Axis that levels are associated with (0 for index, 1 for columns)
stacklevel: int, default 1
Stack level used when a FutureWarning is raised (see below).
Returns
-------
Expand All @@ -1524,8 +1506,7 @@ def _get_label_or_level_values(self, key, axis=0, stacklevel=1):
.format(type=type(self)))

if self._is_label_reference(key, axis=axis):
self._check_label_or_level_ambiguity(key, axis=axis,
stacklevel=stacklevel + 1)
self._check_label_or_level_ambiguity(key, axis=axis)
values = self.xs(key, axis=other_axes[0])._values
elif self._is_level_reference(key, axis=axis):
values = self.axes[axis].get_level_values(key)._values
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,9 +571,7 @@ def is_in_obj(gpr):
elif is_in_axis(gpr): # df.groupby('name')
if gpr in obj:
if validate:
stacklevel = 5 # Number of stack levels from df.groupby
obj._check_label_or_level_ambiguity(
gpr, stacklevel=stacklevel)
obj._check_label_or_level_ambiguity(gpr)
in_axis, name, gpr = True, gpr, obj[gpr]
exclusions.append(name)
elif obj._is_level_reference(gpr):
Expand Down
16 changes: 5 additions & 11 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,6 @@ def _get_merge_keys(self):
left_drop = []

left, right = self.left, self.right
stacklevel = 5 # Number of stack levels from df.merge

is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
is_rkey = lambda x: is_array_like(x) and len(x) == len(right)
Expand All @@ -837,8 +836,7 @@ def _get_merge_keys(self):
else:
if rk is not None:
right_keys.append(
right._get_label_or_level_values(
rk, stacklevel=stacklevel))
right._get_label_or_level_values(rk))
join_names.append(rk)
else:
# work-around for merge_asof(right_index=True)
Expand All @@ -848,8 +846,7 @@ def _get_merge_keys(self):
if not is_rkey(rk):
if rk is not None:
right_keys.append(
right._get_label_or_level_values(
rk, stacklevel=stacklevel))
right._get_label_or_level_values(rk))
else:
# work-around for merge_asof(right_index=True)
right_keys.append(right.index)
Expand All @@ -862,8 +859,7 @@ def _get_merge_keys(self):
else:
right_keys.append(rk)
if lk is not None:
left_keys.append(left._get_label_or_level_values(
lk, stacklevel=stacklevel))
left_keys.append(left._get_label_or_level_values(lk))
join_names.append(lk)
else:
# work-around for merge_asof(left_index=True)
Expand All @@ -875,8 +871,7 @@ def _get_merge_keys(self):
left_keys.append(k)
join_names.append(None)
else:
left_keys.append(left._get_label_or_level_values(
k, stacklevel=stacklevel))
left_keys.append(left._get_label_or_level_values(k))
join_names.append(k)
if isinstance(self.right.index, MultiIndex):
right_keys = [lev._values.take(lab)
Expand All @@ -890,8 +885,7 @@ def _get_merge_keys(self):
right_keys.append(k)
join_names.append(None)
else:
right_keys.append(right._get_label_or_level_values(
k, stacklevel=stacklevel))
right_keys.append(right._get_label_or_level_values(k))
join_names.append(k)
if isinstance(self.left.index, MultiIndex):
left_keys = [lev._values.take(lab)
Expand Down
33 changes: 1 addition & 32 deletions pandas/tests/frame/test_sort_values_level_as_str.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pytest

from pandas import DataFrame, Index
from pandas import DataFrame
from pandas.errors import PerformanceWarning
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal
Expand Down Expand Up @@ -93,34 +93,3 @@ def test_sort_column_level_and_index_label(
assert_frame_equal(result, expected)
else:
assert_frame_equal(result, expected)


def test_sort_values_column_index_level_precedence():
# GH 14353, when a string passed as the `by` parameter
# matches a column and an index level the column takes
# precedence

# Construct DataFrame with index and column named 'idx'
idx = Index(np.arange(1, 7), name='idx')
df = DataFrame({'A': np.arange(11, 17),
'idx': np.arange(6, 0, -1)},
index=idx)

# Sorting by 'idx' should sort by the idx column and raise a
# FutureWarning
with tm.assert_produces_warning(FutureWarning):
result = df.sort_values(by='idx')

# This should be equivalent to sorting by the 'idx' index level in
# descending order
expected = df.sort_index(level='idx', ascending=False)
assert_frame_equal(result, expected)

# Perform same test with MultiIndex
df_multi = df.set_index('A', append=True)

with tm.assert_produces_warning(FutureWarning):
result = df_multi.sort_values(by='idx')

expected = df_multi.sort_index(level='idx', ascending=False)
assert_frame_equal(result, expected)
75 changes: 28 additions & 47 deletions pandas/tests/generic/test_label_or_level_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,31 +166,24 @@ def test_is_label_or_level_reference_panel_error(panel):
def test_check_label_or_level_ambiguity_df(df_ambig, axis):

# Transpose frame if axis == 1
if axis in {1, 'columns'}:
if axis in {1, "columns"}:
df_ambig = df_ambig.T

# df_ambig has both an on-axis level and off-axis label named L1
# Therefore L1 is ambiguous
with tm.assert_produces_warning(FutureWarning,
clear=True) as w:
if axis in {0, "index"}:
msg = "'L1' is both an index level and a column label"
else:
msg = "'L1' is both a column level and an index label"

assert df_ambig._check_label_or_level_ambiguity('L1', axis=axis)
warning_msg = w[0].message.args[0]
if axis in {0, 'index'}:
assert warning_msg.startswith("'L1' is both an index level "
"and a column label")
else:
assert warning_msg.startswith("'L1' is both a column level "
"and an index label")
# df_ambig has both an on-axis level and off-axis label named L1
# Therefore, L1 is ambiguous.
with tm.assert_raises_regex(ValueError, msg):
df_ambig._check_label_or_level_ambiguity("L1", axis=axis)

# df_ambig has an on-axis level named L2 and it is not ambiguous
# No warning should be raised
with tm.assert_produces_warning(None):
assert not df_ambig._check_label_or_level_ambiguity('L2', axis=axis)
# df_ambig has an on-axis level named L2,, and it is not ambiguous.
df_ambig._check_label_or_level_ambiguity("L2", axis=axis)

# df_ambig has an off-axis label named L3 and it is not ambiguous
with tm.assert_produces_warning(None):
assert not df_ambig._is_level_reference('L3', axis=axis)
# df_ambig has an off-axis label named L3, and it is not ambiguous
assert not df_ambig._check_label_or_level_ambiguity("L3", axis=axis)


# Series
Expand All @@ -200,17 +193,15 @@ def test_check_label_or_level_ambiguity_series(df):
# A series has no columns and therefore references are never ambiguous

# Make series with L1 as index
s = df.set_index('L1').L2
with tm.assert_produces_warning(None):
assert not s._check_label_or_level_ambiguity('L1', axis=0)
assert not s._check_label_or_level_ambiguity('L2', axis=0)
s = df.set_index("L1").L2
s._check_label_or_level_ambiguity("L1", axis=0)
s._check_label_or_level_ambiguity("L2", axis=0)

# Make series with L1 and L2 as index
s = df.set_index(['L1', 'L2']).L3
with tm.assert_produces_warning(None):
assert not s._check_label_or_level_ambiguity('L1', axis=0)
assert not s._check_label_or_level_ambiguity('L2', axis=0)
assert not s._check_label_or_level_ambiguity('L3', axis=0)
s = df.set_index(["L1", "L2"]).L3
s._check_label_or_level_ambiguity("L1", axis=0)
s._check_label_or_level_ambiguity("L2", axis=0)
s._check_label_or_level_ambiguity("L3", axis=0)


def test_check_label_or_level_ambiguity_series_axis1_error(df):
Expand All @@ -229,7 +220,7 @@ def test_check_label_or_level_ambiguity_panel_error(panel):
.format(type=type(panel)))

with tm.assert_raises_regex(NotImplementedError, msg):
panel._check_label_or_level_ambiguity('L1', axis=0)
panel._check_label_or_level_ambiguity("L1", axis=0)


# Test _get_label_or_level_values
Expand All @@ -241,19 +232,16 @@ def assert_label_values(frame, labels, axis):
else:
expected = frame.loc[label]._values

result = frame._get_label_or_level_values(label, axis=axis,
stacklevel=2)
result = frame._get_label_or_level_values(label, axis=axis)
assert array_equivalent(expected, result)


def assert_level_values(frame, levels, axis):
for level in levels:
if axis in {0, 'index'}:
if axis in {0, "index"}:
expected = frame.index.get_level_values(level=level)._values
else:
expected = (frame.columns
.get_level_values(level=level)
._values)
expected = frame.columns.get_level_values(level=level)._values

result = frame._get_label_or_level_values(level, axis=axis)
assert array_equivalent(expected, result)
Expand Down Expand Up @@ -281,18 +269,11 @@ def test_get_label_or_level_values_df_ambig(df_ambig, axis):
if axis in {1, 'columns'}:
df_ambig = df_ambig.T

# df has both an on-axis level and off-axis label named L1
# Therefore L1 is ambiguous but will default to label
with tm.assert_produces_warning(FutureWarning):
assert_label_values(df_ambig, ['L1'], axis=axis)

# df has an on-axis level named L2 and it is not ambiguous
with tm.assert_produces_warning(None):
assert_level_values(df_ambig, ['L2'], axis=axis)
# df has an on-axis level named L2, and it is not ambiguous.
assert_level_values(df_ambig, ['L2'], axis=axis)

# df has an off-axis label named L3 and it is not ambiguous
with tm.assert_produces_warning(None):
assert_label_values(df_ambig, ['L3'], axis=axis)
# df has an off-axis label named L3, and it is not ambiguous.
assert_label_values(df_ambig, ['L3'], axis=axis)


def test_get_label_or_level_values_df_duplabels(df_duplabels, axis):
Expand Down
13 changes: 2 additions & 11 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,18 +568,9 @@ def test_as_index():
'B': [101, 205]},
columns=['cat', 'A', 'B'])

for name in [None, 'X', 'B', 'cat']:
for name in [None, 'X', 'B']:
df.index = Index(list("abc"), name=name)

if name in group_columns and name in df.index.names:
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = df.groupby(
group_columns, as_index=False, observed=True).sum()

else:
result = df.groupby(
group_columns, as_index=False, observed=True).sum()
result = df.groupby(group_columns, as_index=False, observed=True).sum()

tm.assert_frame_equal(result, expected)

Expand Down
Loading

0 comments on commit 25e6a21

Please sign in to comment.