Fix bug where df.agg(..., axis=1) gives wrong result

pandas-dev · May 27, 2018 · 9c26e30 · 9c26e30
1 parent bc37ea2
commit 9c26e30
Show file tree

Hide file tree

Showing 5 changed files with 174 additions and 7 deletions.
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -94,3 +94,8 @@ Categorical
 ^^^^^^^^^^^
 
 -
+
+Numeric
+^^^^^^^
+
+- :meth:`~DataFrame.agg` now handles built-in methods like ``sum`` in the same manner when axis=1 as when axis=0 (:issue:`21222`)
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -149,3 +149,20 @@ def tz_aware_fixture(request):
     Fixture for trying explicit timezones: {0}
     """
     return request.param
+
+
+@pytest.fixture(
+    # params: Python 3.5 randomizes dict access and xdist doesn't like that
+    # in fixtures. In order to get predetermined values we need to sort
+    # the list deterministically
+    # GH21222
+    params=list(sorted(pd.core.base.SelectionMixin._cython_table.items(),
+                       key=lambda x: x[0].__name__)),
+    ids=lambda x: "({}-{!r})_fixture".format(x[0].__name__, x[1]),
+)
+def cython_table_items(request):
+    """
+    Fixture for returning the items in
+    pandas.core.base.SelectionMixin._cython_table
+    """
+    return request.param
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5818,17 +5818,20 @@ def _gotitem(self,
     def aggregate(self, func, axis=0, *args, **kwargs):
         axis = self._get_axis_number(axis)
 
-        # TODO: flipped axis
         result = None
-        if axis == 0:
-            try:
-                result, how = self._aggregate(func, axis=0, *args, **kwargs)
-            except TypeError:
-                pass
+        try:
+            result, how = self._aggregate(func, axis=axis, *args, **kwargs)
+        except TypeError:
+            pass
         if result is None:
             return self.apply(func, axis=axis, args=args, **kwargs)
         return result
 
+    @Appender(NDFrame._aggregate.__doc__, indents=2)
+    def _aggregate(self, arg, axis=0, *args, **kwargs):
+        obj = self.T if axis == 1 else self
+        return super(DataFrame, obj)._aggregate(arg, *args, **kwargs)
+
     agg = aggregate
 
     def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,

diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
@@ -946,7 +946,7 @@ def test_agg_dict_nested_renaming_depr(self):
         df = pd.DataFrame({'A': range(5), 'B': 5})
 
         # nested renaming
-        with tm.assert_produces_warning(FutureWarning):
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
             df.agg({'A': {'foo': 'min'},
                     'B': {'bar': 'max'}})
 
@@ -1056,3 +1056,73 @@ def test_non_callable_aggregates(self):
         expected = df.size
 
         assert result == expected
+
+    @pytest.mark.parametrize("frame, expected_dict", [
+        [DataFrame(), {
+            'sum': Series(),
+            'max': Series(),
+            'min': Series(),
+            'all': Series(dtype=bool),
+            'any': Series(dtype=bool),
+            'mean': Series(),
+            'prod': Series(),
+            'std': Series(),
+            'var': Series(),
+            'median': Series(),
+            'cumprod': DataFrame(),
+            'cumsum': DataFrame(),
+        }],
+        [DataFrame([[np.nan, 1], [1, 2]]), {
+            'sum': Series([1., 3]),
+            'max': Series([1., 2]),
+            'min': Series([1., 1]),
+            'all': Series([True, True]),
+            'any': Series([True, True]),
+            'mean': Series([1, 1.5]),
+            'prod': Series([1., 2]),
+            'std': Series([np.nan, 0.707107]),
+            'var': Series([np.nan, 0.5]),
+            'median': Series([1, 1.5]),
+            'cumprod': DataFrame([[np.nan, 1], [1., 2.]]),
+            'cumsum': DataFrame([[np.nan, 1], [1., 3.]]),
+        }],
+        [DataFrame([['a', 'b'], ['b', 'a']]), {
+            'sum': Series(['ab', 'ba']),
+            'max': Series(['b', 'b']),
+            'min': Series(['a', 'a']),
+            'all': Series([True, True]),
+            'any': Series([True, True]),
+            'mean': Series([], index=pd.Index([], dtype='int64')),
+            'prod': Series([], index=pd.Index([], dtype='int64')),
+            'std': Series([], index=pd.Index([], dtype='int64')),
+            'var': Series([], index=pd.Index([], dtype='int64')),
+            'median': Series([], index=pd.Index([], dtype='int64')),
+            'cumprod': TypeError,
+            'cumsum': DataFrame([['a', 'b'], ['ab', 'ba']]),
+        }],
+    ])
+    @pytest.mark.parametrize("axis", [0, 1], ids=lambda x: "axis {}".format(x))
+    def test_agg_cython_table(self, cython_table_items,
+                              frame, expected_dict, axis):
+        # GH21222
+        # test if using items in pandas.core.base.SelectionMixin._cython_table
+        # in agg gives correct results
+        np_func, str_func = cython_table_items
+        expected = expected_dict[str_func]
+
+        if isinstance(expected, type) and issubclass(expected, Exception):
+            with pytest.raises(expected):
+                # e.g. DataFrame(['a b'.split()]).cumprod() will raise
+                frame.agg(np_func, axis=axis)
+            with pytest.raises(expected):
+                frame.agg(str_func, axis=axis)
+            return
+
+        result = frame.agg(np_func, axis=axis)
+        result_str_func = frame.agg(str_func, axis=axis)
+        if str_func in ('cumprod', 'cumsum'):
+            tm.assert_frame_equal(result, expected)
+            tm.assert_frame_equal(result_str_func, expected)
+        else:
+            tm.assert_series_equal(result, expected)
+            tm.assert_series_equal(result_str_func, expected)
diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py
@@ -331,6 +331,78 @@ def test_non_callable_aggregates(self):
                                        ('mean', 1.5)]))
         assert_series_equal(result[expected.index], expected)
 
+    @pytest.mark.parametrize("series, expected_dict", [
+        [Series(), {
+            'sum': 0,
+            'max': np.nan,
+            'min': np.nan,
+            'all': True,
+            'any': False,
+            'mean': np.nan,
+            'prod': 1,
+            'std': np.nan,
+            'var': np.nan,
+            'median': np.nan,
+            'cumprod': Series([], Index([])),
+            'cumsum': Series([], Index([])),
+        }],
+        [Series([np.nan, 1, 2, 3]), {
+            'sum': 6,
+            'max': 3,
+            'min': 1,
+            'all': True,
+            'any': True,
+            'mean': 2,
+            'prod': 6,
+            'std': 1,
+            'var': 1,
+            'median': 2,
+            'cumprod': Series([np.nan, 1, 2, 6]),
+            'cumsum': Series([np.nan, 1, 3, 6]),
+        }],
+        [Series('a b c'.split()), {
+            'sum': 'abc',
+            'max': 'c',
+            'min': 'a',
+            'all': 'c',  # see GH12863
+            'any': 'a',
+            'mean': TypeError,  # mean raises TypeError
+            'prod': TypeError,
+            'std': TypeError,
+            'var': TypeError,
+            'median': TypeError,
+            'cumprod': TypeError,
+            'cumsum': Series(['a', 'ab', 'abc']),
+        }],
+    ])
+    def test_agg_cython_table(self, cython_table_items,
+                              series, expected_dict):
+        # GH21222
+        # test if using items in pandas.core.base.SelectionMixin._cython_table
+        # in agg gives correct results
+        np_func, str_func = cython_table_items
+        expected = expected_dict[str_func]
+
+        if isinstance(expected, type) and issubclass(expected, Exception):
+            with pytest.raises(expected):
+                # e.g. Series('a b'.split()).cumprod() will raise
+                series.agg(np_func)
+            with pytest.raises(expected):
+                series.agg(str_func)
+            return
+
+        result = series.agg(np_func)
+        result_str_func = series.agg(str_func)
+        if str_func in ('cumprod', 'cumsum'):
+            tm.assert_series_equal(result, expected)
+            tm.assert_series_equal(result_str_func, expected)
+        elif tm.is_number(expected):
+            assert np.isclose(result, expected, equal_nan=True)
+            assert np.isclose(result_str_func, expected, equal_nan=True)
+        else:
+            assert result == expected
+            assert result_str_func == expected
+
 
 class TestSeriesMap(TestData):
-Original file line number
+Diff line change
@@ Expand Up / @@ -94,3 +94,8 @@ Categorical @@
     ^^^^^^^^^^^
     -
+    Numeric
+    ^^^^^^^
+    - :meth:`~DataFrame.agg` now handles built-in methods like ``sum`` in the same manner when axis=1 as when axis=0 (:issue:`21222`)