DEPR: Error with ambiguous groupby strings (#22415)

xref gh-14432.
pandas-dev · Aug 22, 2018 · 25e6a21 · 25e6a21
1 parent d83f533
commit 25e6a21
Show file tree

Hide file tree

Showing 12 changed files with 51 additions and 270 deletions.
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -106,9 +106,8 @@ consider the following ``DataFrame``:
    .. versionadded:: 0.20
 
    A string passed to ``groupby`` may refer to either a column or an index level.
-   If a string matches both a column name and an index level name then a warning is
-   issued and the column takes precedence. This will result in an ambiguity error
-   in a future version.
+   If a string matches both a column name and an index level name, a
+   ``ValueError`` will be raised.
 
 .. ipython:: python
 

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -523,6 +523,7 @@ Removal of prior version deprecations/changes
 - :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`)
 - Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`)
 - Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`)
+- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`)
 -
 
 .. _whatsnew_0240.performance:

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4393,7 +4393,6 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
                     kind='quicksort', na_position='last'):
         inplace = validate_bool_kwarg(inplace, 'inplace')
         axis = self._get_axis_number(axis)
-        stacklevel = 2  # Number of stack levels from df.sort_values
 
         if not isinstance(by, list):
             by = [by]
@@ -4405,8 +4404,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
 
             keys = []
             for x in by:
-                k = self._get_label_or_level_values(x, axis=axis,
-                                                    stacklevel=stacklevel)
+                k = self._get_label_or_level_values(x, axis=axis)
                 keys.append(k)
             indexer = lexsort_indexer(keys, orders=ascending,
                                       na_position=na_position)
@@ -4415,8 +4413,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
             from pandas.core.sorting import nargsort
 
             by = by[0]
-            k = self._get_label_or_level_values(by, axis=axis,
-                                                stacklevel=stacklevel)
+            k = self._get_label_or_level_values(by, axis=axis)
 
             if isinstance(ascending, (tuple, list)):
                 ascending = ascending[0]

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1412,33 +1412,23 @@ def _is_label_or_level_reference(self, key, axis=0):
         return (self._is_level_reference(key, axis=axis) or
                 self._is_label_reference(key, axis=axis))
 
-    def _check_label_or_level_ambiguity(self, key, axis=0, stacklevel=1):
+    def _check_label_or_level_ambiguity(self, key, axis=0):
         """
-        Check whether `key` matches both a level of the input `axis` and a
-        label of the other axis and raise a ``FutureWarning`` if this is the
-        case.
+        Check whether `key` is ambiguous.
 
-        Note: This method will be altered to raise an ambiguity exception in
-        a future version.
+        By ambiguous, we mean that it matches both a level of the input
+        `axis` and a label of the other axis.
 
         Parameters
         ----------
         key: str or object
             label or level name
         axis: int, default 0
             Axis that levels are associated with (0 for index, 1 for columns)
-        stacklevel: int, default 1
-            Stack level used when a FutureWarning is raised (see below).
-
-        Returns
-        -------
-        ambiguous: bool
 
         Raises
         ------
-        FutureWarning
-            if `key` is ambiguous. This will become an ambiguity error in a
-            future version
+        ValueError: `key` is ambiguous
         """
 
         axis = self._get_axis_number(axis)
@@ -1464,21 +1454,15 @@ def _check_label_or_level_ambiguity(self, key, axis=0, stacklevel=1):
                                          ('an', 'index'))
 
             msg = ("'{key}' is both {level_article} {level_type} level and "
-                   "{label_article} {label_type} label.\n"
-                   "Defaulting to {label_type}, but this will raise an "
-                   "ambiguity error in a future version"
+                   "{label_article} {label_type} label, which is ambiguous."
                    ).format(key=key,
                             level_article=level_article,
                             level_type=level_type,
                             label_article=label_article,
                             label_type=label_type)
+            raise ValueError(msg)
 
-            warnings.warn(msg, FutureWarning, stacklevel=stacklevel + 1)
-            return True
-        else:
-            return False
-
-    def _get_label_or_level_values(self, key, axis=0, stacklevel=1):
+    def _get_label_or_level_values(self, key, axis=0):
         """
         Return a 1-D array of values associated with `key`, a label or level
         from the given `axis`.
@@ -1497,8 +1481,6 @@ def _get_label_or_level_values(self, key, axis=0, stacklevel=1):
             Label or level name.
         axis: int, default 0
             Axis that levels are associated with (0 for index, 1 for columns)
-        stacklevel: int, default 1
-            Stack level used when a FutureWarning is raised (see below).
 
         Returns
         -------
@@ -1524,8 +1506,7 @@ def _get_label_or_level_values(self, key, axis=0, stacklevel=1):
                 .format(type=type(self)))
 
         if self._is_label_reference(key, axis=axis):
-            self._check_label_or_level_ambiguity(key, axis=axis,
-                                                 stacklevel=stacklevel + 1)
+            self._check_label_or_level_ambiguity(key, axis=axis)
             values = self.xs(key, axis=other_axes[0])._values
         elif self._is_level_reference(key, axis=axis):
             values = self.axes[axis].get_level_values(key)._values

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -571,9 +571,7 @@ def is_in_obj(gpr):
         elif is_in_axis(gpr):  # df.groupby('name')
             if gpr in obj:
                 if validate:
-                    stacklevel = 5  # Number of stack levels from df.groupby
-                    obj._check_label_or_level_ambiguity(
-                        gpr, stacklevel=stacklevel)
+                    obj._check_label_or_level_ambiguity(gpr)
                 in_axis, name, gpr = True, gpr, obj[gpr]
                 exclusions.append(name)
             elif obj._is_level_reference(gpr):

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -811,7 +811,6 @@ def _get_merge_keys(self):
         left_drop = []
 
         left, right = self.left, self.right
-        stacklevel = 5  # Number of stack levels from df.merge
 
         is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
         is_rkey = lambda x: is_array_like(x) and len(x) == len(right)
@@ -837,8 +836,7 @@ def _get_merge_keys(self):
                     else:
                         if rk is not None:
                             right_keys.append(
-                                right._get_label_or_level_values(
-                                    rk, stacklevel=stacklevel))
+                                right._get_label_or_level_values(rk))
                             join_names.append(rk)
                         else:
                             # work-around for merge_asof(right_index=True)
@@ -848,8 +846,7 @@ def _get_merge_keys(self):
                     if not is_rkey(rk):
                         if rk is not None:
                             right_keys.append(
-                                right._get_label_or_level_values(
-                                    rk, stacklevel=stacklevel))
+                                right._get_label_or_level_values(rk))
                         else:
                             # work-around for merge_asof(right_index=True)
                             right_keys.append(right.index)
@@ -862,8 +859,7 @@ def _get_merge_keys(self):
                     else:
                         right_keys.append(rk)
                     if lk is not None:
-                        left_keys.append(left._get_label_or_level_values(
-                            lk, stacklevel=stacklevel))
+                        left_keys.append(left._get_label_or_level_values(lk))
                         join_names.append(lk)
                     else:
                         # work-around for merge_asof(left_index=True)
@@ -875,8 +871,7 @@ def _get_merge_keys(self):
                     left_keys.append(k)
                     join_names.append(None)
                 else:
-                    left_keys.append(left._get_label_or_level_values(
-                        k, stacklevel=stacklevel))
+                    left_keys.append(left._get_label_or_level_values(k))
                     join_names.append(k)
             if isinstance(self.right.index, MultiIndex):
                 right_keys = [lev._values.take(lab)
@@ -890,8 +885,7 @@ def _get_merge_keys(self):
                     right_keys.append(k)
                     join_names.append(None)
                 else:
-                    right_keys.append(right._get_label_or_level_values(
-                        k, stacklevel=stacklevel))
+                    right_keys.append(right._get_label_or_level_values(k))
                     join_names.append(k)
             if isinstance(self.left.index, MultiIndex):
                 left_keys = [lev._values.take(lab)

diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from pandas import DataFrame, Index
+from pandas import DataFrame
 from pandas.errors import PerformanceWarning
 from pandas.util import testing as tm
 from pandas.util.testing import assert_frame_equal
@@ -93,34 +93,3 @@ def test_sort_column_level_and_index_label(
             assert_frame_equal(result, expected)
     else:
         assert_frame_equal(result, expected)
-
-
-def test_sort_values_column_index_level_precedence():
-    # GH 14353, when a string passed as the `by` parameter
-    # matches a column and an index level the column takes
-    # precedence
-
-    # Construct DataFrame with index and column named 'idx'
-    idx = Index(np.arange(1, 7), name='idx')
-    df = DataFrame({'A': np.arange(11, 17),
-                    'idx': np.arange(6, 0, -1)},
-                   index=idx)
-
-    # Sorting by 'idx' should sort by the idx column and raise a
-    # FutureWarning
-    with tm.assert_produces_warning(FutureWarning):
-        result = df.sort_values(by='idx')
-
-    # This should be equivalent to sorting by the 'idx' index level in
-    # descending order
-    expected = df.sort_index(level='idx', ascending=False)
-    assert_frame_equal(result, expected)
-
-    # Perform same test with MultiIndex
-    df_multi = df.set_index('A', append=True)
-
-    with tm.assert_produces_warning(FutureWarning):
-        result = df_multi.sort_values(by='idx')
-
-    expected = df_multi.sort_index(level='idx', ascending=False)
-    assert_frame_equal(result, expected)
diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py
@@ -166,31 +166,24 @@ def test_is_label_or_level_reference_panel_error(panel):
 def test_check_label_or_level_ambiguity_df(df_ambig, axis):
 
     # Transpose frame if axis == 1
-    if axis in {1, 'columns'}:
+    if axis in {1, "columns"}:
         df_ambig = df_ambig.T
 
-    # df_ambig has both an on-axis level and off-axis label named L1
-    # Therefore L1 is ambiguous
-    with tm.assert_produces_warning(FutureWarning,
-                                    clear=True) as w:
+    if axis in {0, "index"}:
+        msg = "'L1' is both an index level and a column label"
+    else:
+        msg = "'L1' is both a column level and an index label"
 
-        assert df_ambig._check_label_or_level_ambiguity('L1', axis=axis)
-        warning_msg = w[0].message.args[0]
-        if axis in {0, 'index'}:
-            assert warning_msg.startswith("'L1' is both an index level "
-                                          "and a column label")
-        else:
-            assert warning_msg.startswith("'L1' is both a column level "
-                                          "and an index label")
+    # df_ambig has both an on-axis level and off-axis label named L1
+    # Therefore, L1 is ambiguous.
+    with tm.assert_raises_regex(ValueError, msg):
+        df_ambig._check_label_or_level_ambiguity("L1", axis=axis)
 
-    # df_ambig has an on-axis level named L2 and it is not ambiguous
-    # No warning should be raised
-    with tm.assert_produces_warning(None):
-        assert not df_ambig._check_label_or_level_ambiguity('L2', axis=axis)
+    # df_ambig has an on-axis level named L2,, and it is not ambiguous.
+    df_ambig._check_label_or_level_ambiguity("L2", axis=axis)
 
-    # df_ambig has an off-axis label named L3 and it is not ambiguous
-    with tm.assert_produces_warning(None):
-        assert not df_ambig._is_level_reference('L3', axis=axis)
+    # df_ambig has an off-axis label named L3, and it is not ambiguous
+    assert not df_ambig._check_label_or_level_ambiguity("L3", axis=axis)
 
 
 # Series
@@ -200,17 +193,15 @@ def test_check_label_or_level_ambiguity_series(df):
     # A series has no columns and therefore references are never ambiguous
 
     # Make series with L1 as index
-    s = df.set_index('L1').L2
-    with tm.assert_produces_warning(None):
-        assert not s._check_label_or_level_ambiguity('L1', axis=0)
-        assert not s._check_label_or_level_ambiguity('L2', axis=0)
+    s = df.set_index("L1").L2
+    s._check_label_or_level_ambiguity("L1", axis=0)
+    s._check_label_or_level_ambiguity("L2", axis=0)
 
     # Make series with L1 and L2 as index
-    s = df.set_index(['L1', 'L2']).L3
-    with tm.assert_produces_warning(None):
-        assert not s._check_label_or_level_ambiguity('L1', axis=0)
-        assert not s._check_label_or_level_ambiguity('L2', axis=0)
-        assert not s._check_label_or_level_ambiguity('L3', axis=0)
+    s = df.set_index(["L1", "L2"]).L3
+    s._check_label_or_level_ambiguity("L1", axis=0)
+    s._check_label_or_level_ambiguity("L2", axis=0)
+    s._check_label_or_level_ambiguity("L3", axis=0)
 
 
 def test_check_label_or_level_ambiguity_series_axis1_error(df):
@@ -229,7 +220,7 @@ def test_check_label_or_level_ambiguity_panel_error(panel):
            .format(type=type(panel)))
 
     with tm.assert_raises_regex(NotImplementedError, msg):
-        panel._check_label_or_level_ambiguity('L1', axis=0)
+        panel._check_label_or_level_ambiguity("L1", axis=0)
 
 
 # Test _get_label_or_level_values
@@ -241,19 +232,16 @@ def assert_label_values(frame, labels, axis):
         else:
             expected = frame.loc[label]._values
 
-        result = frame._get_label_or_level_values(label, axis=axis,
-                                                  stacklevel=2)
+        result = frame._get_label_or_level_values(label, axis=axis)
         assert array_equivalent(expected, result)
 
 
 def assert_level_values(frame, levels, axis):
     for level in levels:
-        if axis in {0, 'index'}:
+        if axis in {0, "index"}:
             expected = frame.index.get_level_values(level=level)._values
         else:
-            expected = (frame.columns
-                        .get_level_values(level=level)
-                        ._values)
+            expected = frame.columns.get_level_values(level=level)._values
 
         result = frame._get_label_or_level_values(level, axis=axis)
         assert array_equivalent(expected, result)
@@ -281,18 +269,11 @@ def test_get_label_or_level_values_df_ambig(df_ambig, axis):
     if axis in {1, 'columns'}:
         df_ambig = df_ambig.T
 
-    # df has both an on-axis level and off-axis label named L1
-    # Therefore L1 is ambiguous but will default to label
-    with tm.assert_produces_warning(FutureWarning):
-        assert_label_values(df_ambig, ['L1'], axis=axis)
-
-    # df has an on-axis level named L2 and it is not ambiguous
-    with tm.assert_produces_warning(None):
-        assert_level_values(df_ambig, ['L2'], axis=axis)
+    # df has an on-axis level named L2, and it is not ambiguous.
+    assert_level_values(df_ambig, ['L2'], axis=axis)
 
-    # df has an off-axis label named L3 and it is not ambiguous
-    with tm.assert_produces_warning(None):
-        assert_label_values(df_ambig, ['L3'], axis=axis)
+    # df has an off-axis label named L3, and it is not ambiguous.
+    assert_label_values(df_ambig, ['L3'], axis=axis)
 
 
 def test_get_label_or_level_values_df_duplabels(df_duplabels, axis):

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -568,18 +568,9 @@ def test_as_index():
          'B': [101, 205]},
         columns=['cat', 'A', 'B'])
 
-    for name in [None, 'X', 'B', 'cat']:
+    for name in [None, 'X', 'B']:
         df.index = Index(list("abc"), name=name)
-
-        if name in group_columns and name in df.index.names:
-            with tm.assert_produces_warning(FutureWarning,
-                                            check_stacklevel=False):
-                result = df.groupby(
-                    group_columns, as_index=False, observed=True).sum()
-
-        else:
-            result = df.groupby(
-                group_columns, as_index=False, observed=True).sum()
+        result = df.groupby(group_columns, as_index=False, observed=True).sum()
 
         tm.assert_frame_equal(result, expected)