From f7f266c32072082fe505a055c32a0e60423b90fe Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Aug 2018 03:49:10 -0700 Subject: [PATCH] dispatch scalar DataFrame ops to Series (#22163) --- doc/source/whatsnew/v0.24.0.txt | 41 +++++- pandas/core/frame.py | 8 ++ pandas/core/ops.py | 11 +- pandas/tests/arithmetic/test_datetime64.py | 47 +++++-- pandas/tests/arithmetic/test_numeric.py | 9 +- pandas/tests/frame/test_arithmetic.py | 68 ++++++++- pandas/tests/frame/test_indexing.py | 28 +++- pandas/tests/frame/test_operators.py | 13 +- .../indexes/timedeltas/test_arithmetic.py | 9 +- pandas/tests/internals/test_internals.py | 25 +++- pandas/tests/test_arithmetic.py | 129 +++--------------- pandas/tests/test_expressions.py | 4 +- 12 files changed, 238 insertions(+), 154 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 9b70bda82e247..3ebdf853a9c64 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -216,7 +216,7 @@ New Behavior: idx = pd.interval_range(0, 4) idx.values -This mirrors ``CateogricalIndex.values``, which returns a ``Categorical``. +This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. For situations where you need an ``ndarray`` of ``Interval`` objects, use :meth:`numpy.asarray` or ``idx.astype(object)``. @@ -406,6 +406,34 @@ Previous Behavior: In [3]: pi - pi[0] Out[3]: Int64Index([0, 1, 2], dtype='int64') + +.. _whatsnew_0240.api.timedelta64_subtract_nan: + +Addition/Subtraction of ``NaN`` from :class:`DataFrame` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Adding or subtracting ``NaN`` from a :class:`DataFrame` column with +``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning +all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and +``Series`` behavior (:issue:`22163`). + +.. ipython:: python + + df = pd.DataFrame([pd.Timedelta(days=1)]) + df - np.nan + +Previous Behavior: + +.. 
code-block:: ipython + + In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) + + In [5]: df - np.nan + Out[5]: + 0 + 0 NaT + + .. _whatsnew_0240.api.extension: ExtensionType Changes @@ -539,6 +567,16 @@ Datetimelike - Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) - Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) - Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`,:issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`,:issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`,:issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype instead of ``timedelta64[ns]`` dtype (:issue:`8554`,:issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`,:issue:`22163`) +- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`,:issue:`22163`) +- Bug in :class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`,:issue:`22163`) +- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` 
(:issue:`15697`,:issue:`22163`) +- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`,:issue:`22163`) +- Timedelta ^^^^^^^^^ @@ -586,6 +624,7 @@ Numeric when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``), a ``TypeError`` was wrongly raised. For all three methods such calculation are now done correctly. (:issue:`16679`). - Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`) +- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`,:issue:`22163`) - Strings diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 700562386c838..b35bc8325d560 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4835,6 +4835,14 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): return self._constructor(new_data) def _combine_const(self, other, func, errors='raise', try_cast=True): + if lib.is_scalar(other) or np.ndim(other) == 0: + new_data = {i: func(self.iloc[:, i], other) + for i, col in enumerate(self.columns)} + + result = self._constructor(new_data, index=self.index, copy=False) + result.columns = self.columns + return result + new_data = self._data.eval(func=func, other=other, errors=errors, try_cast=try_cast) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index dc139a8e14f66..10418ccbb1f64 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1350,7 +1350,7 @@ def na_op(x, y): with np.errstate(all='ignore'): result = method(y) if result is NotImplemented: - raise TypeError("invalid type comparison") + return invalid_comparison(x, y, op) else: result = op(x, y) @@ -1366,6 +1366,10 @@ def wrapper(self, other, axis=None): res_name = get_op_result_name(self, other) + if isinstance(other, list): + # TODO: same for 
tuples? + other = np.asarray(other) + if isinstance(other, ABCDataFrame): # pragma: no cover # Defer to DataFrame implementation; fail early return NotImplemented @@ -1459,8 +1463,6 @@ def wrapper(self, other, axis=None): else: values = self.get_values() - if isinstance(other, list): - other = np.asarray(other) with np.errstate(all='ignore'): res = na_op(values, other) @@ -1741,7 +1743,8 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): if fill_value is not None: self = self.fillna(fill_value) - return self._combine_const(other, na_op, try_cast=True) + pass_op = op if lib.is_scalar(other) else na_op + return self._combine_const(other, pass_op, try_cast=True) f.__name__ = op_name diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index a27199b58cf5e..879a4e1b4af1a 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -63,6 +63,15 @@ def test_tz_aware_scalar_comparison(self, timestamps): expected = pd.DataFrame({'test': [False, False]}) tm.assert_frame_equal(df == -1, expected) + def test_dt64_nat_comparison(self): + # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly + ts = pd.Timestamp.now() + df = pd.DataFrame([ts, pd.NaT]) + expected = pd.DataFrame([True, False]) + + result = df == ts + tm.assert_frame_equal(result, expected) + class TestDatetime64SeriesComparison(object): # TODO: moved from tests.series.test_operators; needs cleanup @@ -640,10 +649,22 @@ def test_dti_cmp_object_dtype(self): # Arithmetic class TestFrameArithmetic(object): + def test_dt64arr_sub_dtscalar(self, box): + # GH#8554, GH#22163 DataFrame op should _not_ return dt64 dtype + idx = pd.date_range('2013-01-01', periods=3) + idx = tm.box_expected(idx, box) + + ts = pd.Timestamp('2013-01-01') + # TODO: parametrize over scalar types + + expected = pd.TimedeltaIndex(['0 Days', '1 Day', '2 Days']) + expected = tm.box_expected(expected, box) + + result = idx - ts 
+ tm.assert_equal(result, expected) - @pytest.mark.xfail(reason='GH#7996 datetime64 units not converted to nano', - strict=True) def test_df_sub_datetime64_not_ns(self): + # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano df = pd.DataFrame(pd.date_range('20130101', periods=3)) dt64 = np.datetime64('2013-01-01') assert dt64.dtype == 'datetime64[D]' @@ -992,9 +1013,11 @@ def test_dti_add_sub_float(self, op, other): with pytest.raises(TypeError): op(dti, other) - def test_dti_add_timestamp_raises(self): + def test_dti_add_timestamp_raises(self, box): + # GH#22163 ensure DataFrame doesn't cast Timestamp to i8 idx = DatetimeIndex(['2011-01-01', '2011-01-02']) - msg = "cannot add DatetimeIndex and Timestamp" + idx = tm.box_expected(idx, box) + msg = "cannot add" with tm.assert_raises_regex(TypeError, msg): idx + Timestamp('2011-01-01') @@ -1090,13 +1113,17 @@ def test_dti_add_intarray_no_freq(self, box): # ------------------------------------------------------------- # Binary operations DatetimeIndex and timedelta-like - def test_dti_add_timedeltalike(self, tz_naive_fixture, delta): + def test_dti_add_timedeltalike(self, tz_naive_fixture, delta, box): + # GH#22005, GH#22163 check DataFrame doesn't raise TypeError tz = tz_naive_fixture rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) + rng = tm.box_expected(rng, box) + result = rng + delta expected = pd.date_range('2000-01-01 02:00', '2000-02-01 02:00', tz=tz) - tm.assert_index_equal(result, expected) + expected = tm.box_expected(expected, box) + tm.assert_equal(result, expected) def test_dti_iadd_timedeltalike(self, tz_naive_fixture, delta): tz = tz_naive_fixture @@ -1662,14 +1689,8 @@ def test_dti_with_offset_series(self, tz_naive_fixture, names): res3 = dti - other tm.assert_series_equal(res3, expected_sub) - @pytest.mark.parametrize('box', [ - pd.Index, - pd.Series, - pytest.param(pd.DataFrame, - marks=pytest.mark.xfail(reason="Returns object dtype", - strict=True)) - ], ids=lambda x: 
x.__name__) def test_dti_add_offset_tzaware(self, tz_aware_fixture, box): + # GH#21610, GH#22163 ensure DataFrame doesn't return object-dtype timezone = tz_aware_fixture if timezone == 'US/Pacific': dates = date_range('2012-11-01', periods=3, tz=timezone) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 71742d428ea3e..85a0a8dffc55f 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -58,13 +58,6 @@ def test_ops_series(self): tm.assert_series_equal(expected, td * other) tm.assert_series_equal(expected, other * td) - @pytest.mark.parametrize('box', [ - pd.Index, - Series, - pytest.param(pd.DataFrame, - marks=pytest.mark.xfail(reason="block.eval incorrect", - strict=True)) - ]) @pytest.mark.parametrize('index', [ pd.Int64Index(range(1, 11)), pd.UInt64Index(range(1, 11)), @@ -79,7 +72,7 @@ def test_ops_series(self): def test_numeric_arr_mul_tdscalar(self, scalar_td, index, box): # GH#19333 - if (box is Series and + if (box in [Series, pd.DataFrame] and type(scalar_td) is timedelta and index.dtype == 'f8'): raise pytest.xfail(reason="Cannot multiply timedelta by float") diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 9a17dc580ff6c..f142f770a0c54 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -17,6 +17,53 @@ # Comparisons class TestFrameComparisons(object): + def test_flex_comparison_nat(self): + # GH#15697, GH#22163 df.eq(pd.NaT) should behave like df == pd.NaT, + # and _definitely_ not be NaN + df = pd.DataFrame([pd.NaT]) + + result = df == pd.NaT + # result.iloc[0, 0] is a np.bool_ object + assert result.iloc[0, 0].item() is False + + result = df.eq(pd.NaT) + assert result.iloc[0, 0].item() is False + + result = df != pd.NaT + assert result.iloc[0, 0].item() is True + + result = df.ne(pd.NaT) + assert result.iloc[0, 0].item() is True + + def test_mixed_comparison(self): 
+ # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, + # not raise TypeError + # (this appears to be fixed before #22163, not sure when) + df = pd.DataFrame([['1989-08-01', 1], ['1989-08-01', 2]]) + other = pd.DataFrame([['a', 'b'], ['c', 'd']]) + + result = df == other + assert not result.any().any() + + result = df != other + assert result.all().all() + + def test_df_numeric_cmp_dt64_raises(self): + # GH#8932, GH#22163 + ts = pd.Timestamp.now() + df = pd.DataFrame({'x': range(5)}) + with pytest.raises(TypeError): + df > ts + with pytest.raises(TypeError): + df < ts + with pytest.raises(TypeError): + ts < df + with pytest.raises(TypeError): + ts > df + + assert not (df == ts).any().any() + assert (df != ts).all().all() + def test_df_boolean_comparison_error(self): # GH#4576 # boolean comparisons with a tuple/list give unexpected results @@ -32,8 +79,8 @@ def test_df_float_none_comparison(self): df = pd.DataFrame(np.random.randn(8, 3), index=range(8), columns=['A', 'B', 'C']) - with pytest.raises(TypeError): - df.__eq__(None) + result = df.__eq__(None) + assert not result.any().any() def test_df_string_comparison(self): df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}]) @@ -251,3 +298,20 @@ def test_arith_flex_zero_len_raises(self): with tm.assert_raises_regex(NotImplementedError, 'fill_value'): df_len0.sub(df['A'], axis=None, fill_value=3) + + +class TestFrameArithmetic(object): + def test_df_bool_mul_int(self): + # GH#22047, GH#22163 multiplication by 1 should result in int dtype, + # not object dtype + df = pd.DataFrame([[False, True], [False, False]]) + result = df * 1 + + # On appveyor this comes back as np.int32 instead of np.int64, + # so we check dtype.kind instead of just dtype + kinds = result.dtypes.apply(lambda x: x.kind) + assert (kinds == 'i').all() + + result = 1 * df + kinds = result.dtypes.apply(lambda x: x.kind) + assert (kinds == 'i').all() diff --git a/pandas/tests/frame/test_indexing.py 
b/pandas/tests/frame/test_indexing.py index d885df76967b8..6a4cf1ffc6071 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -273,6 +273,8 @@ def test_getitem_boolean(self): # test df[df > 0] for df in [self.tsframe, self.mixed_frame, self.mixed_float, self.mixed_int]: + if compat.PY3 and df is self.mixed_frame: + continue data = df._get_numeric_data() bif = df[df > 0] @@ -2468,8 +2470,11 @@ def test_boolean_indexing_mixed(self): assert_frame_equal(df2, expected) df['foo'] = 'test' - with tm.assert_raises_regex(TypeError, 'boolean setting ' - 'on mixed-type'): + msg = ("boolean setting on mixed-type|" + "not supported between|" + "unorderable types") + with tm.assert_raises_regex(TypeError, msg): + # TODO: This message should be the same in PY2/PY3 df[df > 0.3] = 1 def test_where(self): @@ -2502,6 +2507,10 @@ def _check_get(df, cond, check_dtypes=True): # check getting for df in [default_frame, self.mixed_frame, self.mixed_float, self.mixed_int]: + if compat.PY3 and df is self.mixed_frame: + with pytest.raises(TypeError): + df > 0 + continue cond = df > 0 _check_get(df, cond) @@ -2549,6 +2558,10 @@ def _check_align(df, cond, other, check_dtypes=True): assert (rs.dtypes == df.dtypes).all() for df in [self.mixed_frame, self.mixed_float, self.mixed_int]: + if compat.PY3 and df is self.mixed_frame: + with pytest.raises(TypeError): + df > 0 + continue # other is a frame cond = (df > 0)[1:] @@ -2594,6 +2607,10 @@ def _check_set(df, cond, check_dtypes=True): for df in [default_frame, self.mixed_frame, self.mixed_float, self.mixed_int]: + if compat.PY3 and df is self.mixed_frame: + with pytest.raises(TypeError): + df > 0 + continue cond = df > 0 _check_set(df, cond) @@ -2759,9 +2776,14 @@ def test_where_datetime(self): C=np.random.randn(5))) stamp = datetime(2013, 1, 3) - result = df[df > stamp] + with pytest.raises(TypeError): + df > stamp + + result = df[df.iloc[:, :-1] > stamp] + expected = df.copy() expected.loc[[0, 1], 'A'] = 
np.nan + expected.loc[:, 'C'] = np.nan assert_frame_equal(result, expected) def test_where_none(self): diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 2fc59c5003a4d..da4424b1ae626 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -150,10 +150,15 @@ def test_timestamp_compare(self): right_f = getattr(operator, right) # no nats - expected = left_f(df, Timestamp('20010109')) - result = right_f(Timestamp('20010109'), df) - assert_frame_equal(result, expected) - + if left in ['eq', 'ne']: + expected = left_f(df, Timestamp('20010109')) + result = right_f(Timestamp('20010109'), df) + assert_frame_equal(result, expected) + else: + with pytest.raises(TypeError): + left_f(df, Timestamp('20010109')) + with pytest.raises(TypeError): + right_f(Timestamp('20010109'), df) # nats expected = left_f(df, Timestamp('nat')) result = right_f(Timestamp('nat'), df) diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 21d895fa59021..f3bc523ca525e 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -473,7 +473,6 @@ def test_timedelta_ops_with_missing_values(self): scalar1 = pd.to_timedelta('00:00:01') scalar2 = pd.to_timedelta('00:00:02') timedelta_NaT = pd.to_timedelta('NaT') - NA = np.nan actual = scalar1 + scalar1 assert actual == scalar2 @@ -541,10 +540,10 @@ def test_timedelta_ops_with_missing_values(self): actual = df1 - timedelta_NaT tm.assert_frame_equal(actual, dfn) - actual = df1 + NA - tm.assert_frame_equal(actual, dfn) - actual = df1 - NA - tm.assert_frame_equal(actual, dfn) + with pytest.raises(TypeError): + df1 + np.nan + with pytest.raises(TypeError): + df1 - np.nan actual = df1 + pd.NaT # NaT is datetime, not timedelta tm.assert_frame_equal(actual, dfn) diff --git a/pandas/tests/internals/test_internals.py 
b/pandas/tests/internals/test_internals.py index 0b06775326ab1..34f22513106ba 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1235,16 +1235,31 @@ def test_binop_other(self, op, value, dtype): (operator.truediv, 'bool'), (operator.mod, 'i8'), (operator.mod, 'complex128'), - (operator.mod, '