Skip to content

Commit

Permalink
Fix bug where df.agg(..., axis=1) gives wrong result
Browse files Browse the repository at this point in the history
  • Loading branch information
tp committed May 27, 2018
1 parent bc37ea2 commit 9c26e30
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 7 deletions.
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.23.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,8 @@ Categorical
^^^^^^^^^^^

-

Numeric
^^^^^^^

- :meth:`~DataFrame.agg` now handles built-in methods like ``sum`` in the same manner when axis=1 as when axis=0 (:issue:`21222`)
17 changes: 17 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,20 @@ def tz_aware_fixture(request):
Fixture for trying explicit timezones: {0}
"""
return request.param


@pytest.fixture(
# params: Python 3.5 randomizes dict access and xdist doesn't like that
# in fixtures. In order to get predetermined values we need to sort
# the list deterministically
# GH21222
params=list(sorted(pd.core.base.SelectionMixin._cython_table.items(),
key=lambda x: x[0].__name__)),
ids=lambda x: "({}-{!r})_fixture".format(x[0].__name__, x[1]),
)
def cython_table_items(request):
"""
Fixture for returning the items in
pandas.core.base.SelectionMixin._cython_table
"""
return request.param
15 changes: 9 additions & 6 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5818,17 +5818,20 @@ def _gotitem(self,
def aggregate(self, func, axis=0, *args, **kwargs):
axis = self._get_axis_number(axis)

# TODO: flipped axis
result = None
if axis == 0:
try:
result, how = self._aggregate(func, axis=0, *args, **kwargs)
except TypeError:
pass
try:
result, how = self._aggregate(func, axis=axis, *args, **kwargs)
except TypeError:
pass
if result is None:
return self.apply(func, axis=axis, args=args, **kwargs)
return result

@Appender(NDFrame._aggregate.__doc__, indents=2)
def _aggregate(self, arg, axis=0, *args, **kwargs):
obj = self.T if axis == 1 else self
return super(DataFrame, obj)._aggregate(arg, *args, **kwargs)

agg = aggregate

def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,
Expand Down
72 changes: 71 additions & 1 deletion pandas/tests/frame/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -946,7 +946,7 @@ def test_agg_dict_nested_renaming_depr(self):
df = pd.DataFrame({'A': range(5), 'B': 5})

# nested renaming
with tm.assert_produces_warning(FutureWarning):
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
df.agg({'A': {'foo': 'min'},
'B': {'bar': 'max'}})

Expand Down Expand Up @@ -1056,3 +1056,73 @@ def test_non_callable_aggregates(self):
expected = df.size

assert result == expected

@pytest.mark.parametrize("frame, expected_dict", [
[DataFrame(), {
'sum': Series(),
'max': Series(),
'min': Series(),
'all': Series(dtype=bool),
'any': Series(dtype=bool),
'mean': Series(),
'prod': Series(),
'std': Series(),
'var': Series(),
'median': Series(),
'cumprod': DataFrame(),
'cumsum': DataFrame(),
}],
[DataFrame([[np.nan, 1], [1, 2]]), {
'sum': Series([1., 3]),
'max': Series([1., 2]),
'min': Series([1., 1]),
'all': Series([True, True]),
'any': Series([True, True]),
'mean': Series([1, 1.5]),
'prod': Series([1., 2]),
'std': Series([np.nan, 0.707107]),
'var': Series([np.nan, 0.5]),
'median': Series([1, 1.5]),
'cumprod': DataFrame([[np.nan, 1], [1., 2.]]),
'cumsum': DataFrame([[np.nan, 1], [1., 3.]]),
}],
[DataFrame([['a', 'b'], ['b', 'a']]), {
'sum': Series(['ab', 'ba']),
'max': Series(['b', 'b']),
'min': Series(['a', 'a']),
'all': Series([True, True]),
'any': Series([True, True]),
'mean': Series([], index=pd.Index([], dtype='int64')),
'prod': Series([], index=pd.Index([], dtype='int64')),
'std': Series([], index=pd.Index([], dtype='int64')),
'var': Series([], index=pd.Index([], dtype='int64')),
'median': Series([], index=pd.Index([], dtype='int64')),
'cumprod': TypeError,
'cumsum': DataFrame([['a', 'b'], ['ab', 'ba']]),
}],
])
@pytest.mark.parametrize("axis", [0, 1], ids=lambda x: "axis {}".format(x))
def test_agg_cython_table(self, cython_table_items,
frame, expected_dict, axis):
# GH21222
# test if using items in pandas.core.base.SelectionMixin._cython_table
# in agg gives correct results
np_func, str_func = cython_table_items
expected = expected_dict[str_func]

if isinstance(expected, type) and issubclass(expected, Exception):
with pytest.raises(expected):
# e.g. DataFrame(['a b'.split()]).cumprod() will raise
frame.agg(np_func, axis=axis)
with pytest.raises(expected):
frame.agg(str_func, axis=axis)
return

result = frame.agg(np_func, axis=axis)
result_str_func = frame.agg(str_func, axis=axis)
if str_func in ('cumprod', 'cumsum'):
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result_str_func, expected)
else:
tm.assert_series_equal(result, expected)
tm.assert_series_equal(result_str_func, expected)
72 changes: 72 additions & 0 deletions pandas/tests/series/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,78 @@ def test_non_callable_aggregates(self):
('mean', 1.5)]))
assert_series_equal(result[expected.index], expected)

@pytest.mark.parametrize("series, expected_dict", [
[Series(), {
'sum': 0,
'max': np.nan,
'min': np.nan,
'all': True,
'any': False,
'mean': np.nan,
'prod': 1,
'std': np.nan,
'var': np.nan,
'median': np.nan,
'cumprod': Series([], Index([])),
'cumsum': Series([], Index([])),
}],
[Series([np.nan, 1, 2, 3]), {
'sum': 6,
'max': 3,
'min': 1,
'all': True,
'any': True,
'mean': 2,
'prod': 6,
'std': 1,
'var': 1,
'median': 2,
'cumprod': Series([np.nan, 1, 2, 6]),
'cumsum': Series([np.nan, 1, 3, 6]),
}],
[Series('a b c'.split()), {
'sum': 'abc',
'max': 'c',
'min': 'a',
'all': 'c', # see GH12863
'any': 'a',
'mean': TypeError, # mean raises TypeError
'prod': TypeError,
'std': TypeError,
'var': TypeError,
'median': TypeError,
'cumprod': TypeError,
'cumsum': Series(['a', 'ab', 'abc']),
}],
])
def test_agg_cython_table(self, cython_table_items,
series, expected_dict):
# GH21222
# test if using items in pandas.core.base.SelectionMixin._cython_table
# in agg gives correct results
np_func, str_func = cython_table_items
expected = expected_dict[str_func]

if isinstance(expected, type) and issubclass(expected, Exception):
with pytest.raises(expected):
# e.g. Series('a b'.split()).cumprod() will raise
series.agg(np_func)
with pytest.raises(expected):
series.agg(str_func)
return

result = series.agg(np_func)
result_str_func = series.agg(str_func)
if str_func in ('cumprod', 'cumsum'):
tm.assert_series_equal(result, expected)
tm.assert_series_equal(result_str_func, expected)
elif tm.is_number(expected):
assert np.isclose(result, expected, equal_nan=True)
assert np.isclose(result_str_func, expected, equal_nan=True)
else:
assert result == expected
assert result_str_func == expected


class TestSeriesMap(TestData):

Expand Down

0 comments on commit 9c26e30

Please sign in to comment.