From aaa04ad35ada10f2469e31a40a15848a82c717a7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 30 Nov 2017 08:19:38 -0500 Subject: [PATCH] API/BUG: .apply will correctly infer output shape when axis=1 closes #16353 closes #17348 closes #17437 closes #18573 closes #17970 closes #17892 closes #17602 closes #18775 closes #18901 closes #18919 --- doc/source/whatsnew/v0.23.0.txt | 50 +++++++++- pandas/core/apply.py | 83 ++++++++++++----- pandas/tests/frame/test_apply.py | 154 +++++++++++++++++++++++++++++-- 3 files changed, 253 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6407a33c442d0d..aad8531042233d 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -88,7 +88,7 @@ Previous Behavior: 4 NaN dtype: float64 -Current Behavior +Current Behavior: .. ipython:: python @@ -113,7 +113,7 @@ Previous Behavior: 3 2.5 dtype: float64 -Current Behavior +Current Behavior: .. ipython:: python @@ -167,6 +167,52 @@ If installed, we now require: | openpyxl | 2.4.0 | | +-----------------+-----------------+----------+ +.. _whatsnew_0230.api_breaking.apply: + +Apply Changes +~~~~~~~~~~~~~ + +:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined-function that returned a list-like with ``axis=1``. Several bugs and inconsistencies +are resolved. If the applied function returns a Series, then pandas will return a DataFrame; otherwise a Series will be returned, this includes the case +where a list-like (e.g. ``tuple`` or ``list`` is returned), (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, +:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`) + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + df + +Previous Behavior. If the returned shape happened to match the index, this would return a list-like. + +.. code-block:: python + + In [3]: df.apply(lambda x: [1, 2, 3], axis=1) + Out[3]: + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 + + In [4]: df.apply(lambda x: [1, 2], axis=1) + Out[4]: + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + 3 [1, 2] + 4 [1, 2] + 5 [1, 2] + dtype: object + + +New Behavior. The behavior is consistent. These will *always* return a ``Series``. + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1) + df.apply(lambda x: [1, 2], axis=1) Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2f43087f7dff9e..8c65e3f9dc9b82 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -19,18 +19,20 @@ def frame_apply(obj, func, axis=0, broadcast=False, klass = FrameColumnApply return klass(obj, func, broadcast=broadcast, - raw=raw, reduce=reduce, args=args, kwds=kwds) + raw=raw, reduce=reduce, + args=args, kwds=kwds) class FrameApply(object): - def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): + def __init__(self, obj, func, broadcast, raw, reduce, + args, kwds): self.obj = obj self.broadcast = broadcast self.raw = raw self.reduce = reduce - self.args = args + self.args = args self.ignore_failures = kwds.pop('ignore_failures', False) self.kwds = kwds @@ -94,6 +96,13 @@ def get_result(self): return self.apply_standard() def apply_empty_result(self): + """ + we have an empty result; at least 1 axis is 0 + + we will try to apply the function to an empty + series in order to see if this is a reduction function + """ + from pandas import Series reduce = self.reduce @@ -113,6 +122,8 @@ def apply_empty_result(self): return self.obj.copy() def apply_raw(self): + """ apply to the values as a numpy array """ + try: result = lib.reduce(self.values, self.f, axis=self.axis) except Exception: @@ -207,19 +218,57 @@ def wrap_results(self, results, res_index, res_columns): from pandas import Series if len(results) > 0 and is_sequence(results[0]): - if not isinstance(results[0], Series): - index = res_columns + + # map to rows + if self.axis == 0: + result = self.obj._constructor(data=results) + + if not isinstance(results[0], Series): + try: + result.index = res_columns + except ValueError: + pass + + try: + result.columns = res_index + except ValueError: + pass + + # map to columns else: - index = None - result = self.obj._constructor(data=results, index=index) - result.columns = res_index + def infer_to_same_shape(): + result = self.obj._constructor(data=results) + result = result.T + + # try to assign the result indices; + # this may fail, if so we have + # received an invalid return shape + try: + result.index = res_index + except ValueError: + pass + + try: + result.columns = res_columns + except ValueError: + pass + + # infer dtypes + result = result.infer_objects() - if self.axis == 1: - result = result.T - result = result._convert( - datetime=True, timedelta=True, copy=False) + return result + # we have a non-series and don't want inference + if not isinstance(results[0], Series): + result = Series(results) + result.index = res_index + + # we may want to infer results + else: + result = infer_to_same_shape() + + # dict of scalars else: result = Series(results) @@ -270,16 +319,6 @@ def result_columns(self): class FrameColumnApply(FrameApply): axis = 1 - def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): - super(FrameColumnApply, self).__init__(obj, func, broadcast, - raw, reduce, args, kwds) - - # skip if we are mixed datelike and trying reduce across axes - # GH6125 - if self.reduce: - if self.obj._is_mixed_type and self.obj._is_datelike_mixed_type: - self.reduce = False - def apply_broadcast(self): return self._apply_broadcast(self.obj.T).T diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 65dd166e1f6a87..d3e879c612893f 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -350,11 +350,10 @@ def test_apply_attach_name(self): result = self.frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) - expected = DataFrame(np.tile(self.frame.index, - (len(self.frame.columns), 1)).T, - index=self.frame.index, - columns=self.frame.columns) - assert_frame_equal(result, expected) + expected = Series(np.repeat(t[0], len(self.frame.columns)) + for t in self.frame.itertuples()) + expected.index = self.frame.index + assert_series_equal(result, expected) def test_apply_multi_index(self): s = DataFrame([[1, 2], [3, 4], [5, 6]]) @@ -367,10 +366,10 @@ def test_apply_dict(self): # GH 8735 A = DataFrame([['foo', 'bar'], ['spam', 'eggs']]) - A_dicts = pd.Series([dict([(0, 'foo'), (1, 'spam')]), - dict([(0, 'bar'), (1, 'eggs')])]) + A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]), + dict([(0, 'bar'), (1, 'eggs')])]) B = DataFrame([[0, 1], [2, 3]]) - B_dicts = pd.Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) + B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) fn = lambda x: x.to_dict() for df, dicts in [(A, A_dicts), (B, B_dicts)]: @@ -472,6 +471,141 @@ def test_apply_non_numpy_dtype(self): assert_frame_equal(result, df) +class TestInferOutputShape(object): + # the user has supplied an opaque UDF where + # they are transforming the input that requires + # us to infer the output + + def test_infer_row_shape(self): + # gh-17437 + # if row shape is changing, infer it + df = pd.DataFrame(np.random.rand(10, 2)) + result = df.apply(np.fft.fft, axis=0) + assert result.shape == (10, 2) + + result = df.apply(np.fft.rfft, axis=0) + assert result.shape == (6, 2) + + def test_with_dictlike_columns(self): + # gh 17602 + + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1) + expected = Series([{'s': 3} for t in df.itertuples()]) + assert_series_equal(result, expected) + + df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), + pd.Timestamp('2017-05-02 00:00:00')] + assert_series_equal(result, expected) + + # compose a series + result = (df['a'] + df['b']).apply(lambda x: {'s': x}) + expected = Series([{'s': 3}, {'s': 3}]) + assert_series_equal(result, expected) + + # gh-18775 + df = DataFrame() + df["author"] = ["X", "Y", "Z"] + df["publisher"] = ["BBC", "NBC", "N24"] + df["date"] = pd.to_datetime(['17-10-2010 07:15:30', + '13-05-2011 08:20:35', + '15-01-2013 09:09:09']) + result = df.apply(lambda x: {}, axis=1) + expected = Series([{}, {}, {}]) + assert_series_equal(result, expected) + + def test_with_listlike_columns(self): + # gh-17348 + df = DataFrame({'a': Series(np.random.randn(4)), + 'b': ['a', 'list', 'of', 'words'], + 'ts': date_range('2016-10-01', periods=4, freq='H')}) + + result = df[['a', 'b']].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[['a', 'b']].itertuples()]) + assert_series_equal(result, expected) + + result = df[['a', 'ts']].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()]) + assert_series_equal(result, expected) + + # gh-18919 + df = DataFrame({'x': Series([['a', 'b'], ['q']]), + 'y': Series([['z'], ['q', 't']])}) + df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')]) + + result = df.apply( + lambda row: [el for el in row['x'] if el in row['y']], + axis=1) + expected = Series([[], ['q']], index=df.index) + assert_series_equal(result, expected) + + def test_infer_output_shape_columns(self): + # gh-18573 + + df = DataFrame({'number': [1., 2.], + 'string': ['foo', 'bar'], + 'datetime': [pd.Timestamp('2017-11-29 03:30:00'), + pd.Timestamp('2017-11-29 03:45:00')]}) + result = df.apply(lambda row: (row.number, row.string), axis=1) + expected = Series([t[2:] for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_infer_output_shape_listlike_columns(self): + # gh-16353 + + df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + assert_series_equal(result, expected) + + # gh-17970 + df = DataFrame({"a": [1, 2, 3]}, index=list('abc')) + + result = df.apply(lambda row: np.ones(1), axis=1) + expected = Series([np.ones(1) for t in df.itertuples()], + index=df.index) + assert_series_equal(result, expected) + + result = df.apply(lambda row: np.ones(2), axis=1) + expected = Series([np.ones(2) for t in df.itertuples()], + index=df.index) + assert_series_equal(result, expected) + + # gh-17892 + df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'), + pd.Timestamp('2010-02-04'), + pd.Timestamp('2010-02-05'), + pd.Timestamp('2010-02-06')], + 'b': [9, 5, 4, 3], + 'c': [5, 3, 4, 2], + 'd': [1, 2, 3, 4]}) + + def fun(x): + return (1, 2) + + result = df.apply(fun, axis=1) + expected = Series([(1, 2) for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_consistent_coerce_for_shapes(self): + # we want column names to NOT be propagated + # just because the shape matches the input shape + df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + assert_series_equal(result, expected) + + def zip_frames(*frames): """ take a list of frames, zip the columns together for each @@ -649,13 +783,13 @@ def test_non_callable_aggregates(self): # Function aggregate result = df.agg({'A': 'count'}) - expected = pd.Series({'A': 2}) + expected = Series({'A': 2}) assert_series_equal(result, expected) # Non-function aggregate result = df.agg({'A': 'size'}) - expected = pd.Series({'A': 3}) + expected = Series({'A': 3}) assert_series_equal(result, expected)