dispatch scalar DataFrame ops to Series (#22163)

pandas-dev · Aug 14, 2018 · f7f266c · f7f266c
1 parent 1ea9664
commit f7f266c
Show file tree

Hide file tree

Showing 12 changed files with 238 additions and 154 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -216,7 +216,7 @@ New Behavior:
    idx = pd.interval_range(0, 4)
    idx.values
 
-This mirrors ``CateogricalIndex.values``, which returns a ``Categorical``.
+This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``.
 
 For situations where you need an ``ndarray`` of ``Interval`` objects, use
 :meth:`numpy.asarray` or ``idx.astype(object)``.
@@ -406,6 +406,34 @@ Previous Behavior:
     In [3]: pi - pi[0]
     Out[3]: Int64Index([0, 1, 2], dtype='int64')
 
+
+.. _whatsnew_0240.api.timedelta64_subtract_nan
+
+Addition/Subtraction of ``NaN`` from :class:``DataFrame``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Adding or subtracting ``NaN`` from a :class:`DataFrame` column with
+`timedelta64[ns]` dtype will now raise a ``TypeError`` instead of returning
+all-``NaT``.  This is for compatibility with ``TimedeltaIndex`` and
+``Series`` behavior (:issue:`22163`)
+
+.. ipython:: python
+
+    df = pd.DataFrame([pd.Timedelta(days=1)])
+    df - np.nan
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [4]: df = pd.DataFrame([pd.Timedelta(days=1)])
+
+    In [5]: df - np.nan
+    Out[5]:
+        0
+    0 NaT
+
+
 .. _whatsnew_0240.api.extension:
 
 ExtensionType Changes
@@ -539,6 +567,16 @@ Datetimelike
 - Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`)
 - Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`)
 - Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`)
+- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`,:issue:`22163`)
+- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`,:issue:`22163`)
+- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`,:issue:`22163`)
+- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype instead of ``timedelta64[ns]`` dtype (:issue:`8554`,:issue:`22163`)
+- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`,:issue:`22163`)
+- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`,:issue:`22163`)
+- Bug in :class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`,:issue:`22163`)
+- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`,:issue:`22163`)
+- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`,:issue:`22163`)
+-
 
 Timedelta
 ^^^^^^^^^
@@ -586,6 +624,7 @@ Numeric
   when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``),
   a ``TypeError`` was wrongly raised. For all three methods such calculation are now done correctly. (:issue:`16679`).
 - Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`)
+- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`,:issue:`22163`)
 -
 
 Strings

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4835,6 +4835,14 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True):
         return self._constructor(new_data)
 
     def _combine_const(self, other, func, errors='raise', try_cast=True):
+        if lib.is_scalar(other) or np.ndim(other) == 0:
+            new_data = {i: func(self.iloc[:, i], other)
+                        for i, col in enumerate(self.columns)}
+
+            result = self._constructor(new_data, index=self.index, copy=False)
+            result.columns = self.columns
+            return result
+
         new_data = self._data.eval(func=func, other=other,
                                    errors=errors,
                                    try_cast=try_cast)

diff --git a/pandas/core/ops.py b/pandas/core/ops.py
@@ -1350,7 +1350,7 @@ def na_op(x, y):
                 with np.errstate(all='ignore'):
                     result = method(y)
                 if result is NotImplemented:
-                    raise TypeError("invalid type comparison")
+                    return invalid_comparison(x, y, op)
             else:
                 result = op(x, y)
 
@@ -1366,6 +1366,10 @@ def wrapper(self, other, axis=None):
 
         res_name = get_op_result_name(self, other)
 
+        if isinstance(other, list):
+            # TODO: same for tuples?
+            other = np.asarray(other)
+
         if isinstance(other, ABCDataFrame):  # pragma: no cover
             # Defer to DataFrame implementation; fail early
             return NotImplemented
@@ -1459,8 +1463,6 @@ def wrapper(self, other, axis=None):
 
         else:
             values = self.get_values()
-            if isinstance(other, list):
-                other = np.asarray(other)
 
             with np.errstate(all='ignore'):
                 res = na_op(values, other)
@@ -1741,7 +1743,8 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
             if fill_value is not None:
                 self = self.fillna(fill_value)
 
-            return self._combine_const(other, na_op, try_cast=True)
+            pass_op = op if lib.is_scalar(other) else na_op
+            return self._combine_const(other, pass_op, try_cast=True)
 
     f.__name__ = op_name
 

diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py
@@ -63,6 +63,15 @@ def test_tz_aware_scalar_comparison(self, timestamps):
         expected = pd.DataFrame({'test': [False, False]})
         tm.assert_frame_equal(df == -1, expected)
 
+    def test_dt64_nat_comparison(self):
+        # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly
+        ts = pd.Timestamp.now()
+        df = pd.DataFrame([ts, pd.NaT])
+        expected = pd.DataFrame([True, False])
+
+        result = df == ts
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDatetime64SeriesComparison(object):
     # TODO: moved from tests.series.test_operators; needs cleanup
@@ -640,10 +649,22 @@ def test_dti_cmp_object_dtype(self):
 # Arithmetic
 
 class TestFrameArithmetic(object):
+    def test_dt64arr_sub_dtscalar(self, box):
+        # GH#8554, GH#22163 DataFrame op should _not_ return dt64 dtype
+        idx = pd.date_range('2013-01-01', periods=3)
+        idx = tm.box_expected(idx, box)
+
+        ts = pd.Timestamp('2013-01-01')
+        # TODO: parametrize over scalar types
+
+        expected = pd.TimedeltaIndex(['0 Days', '1 Day', '2 Days'])
+        expected = tm.box_expected(expected, box)
+
+        result = idx - ts
+        tm.assert_equal(result, expected)
 
-    @pytest.mark.xfail(reason='GH#7996 datetime64 units not converted to nano',
-                       strict=True)
     def test_df_sub_datetime64_not_ns(self):
+        # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano
         df = pd.DataFrame(pd.date_range('20130101', periods=3))
         dt64 = np.datetime64('2013-01-01')
         assert dt64.dtype == 'datetime64[D]'
@@ -992,9 +1013,11 @@ def test_dti_add_sub_float(self, op, other):
         with pytest.raises(TypeError):
             op(dti, other)
 
-    def test_dti_add_timestamp_raises(self):
+    def test_dti_add_timestamp_raises(self, box):
+        # GH#22163 ensure DataFrame doesn't cast Timestamp to i8
         idx = DatetimeIndex(['2011-01-01', '2011-01-02'])
-        msg = "cannot add DatetimeIndex and Timestamp"
+        idx = tm.box_expected(idx, box)
+        msg = "cannot add"
         with tm.assert_raises_regex(TypeError, msg):
             idx + Timestamp('2011-01-01')
 
@@ -1090,13 +1113,17 @@ def test_dti_add_intarray_no_freq(self, box):
     # -------------------------------------------------------------
     # Binary operations DatetimeIndex and timedelta-like
 
-    def test_dti_add_timedeltalike(self, tz_naive_fixture, delta):
+    def test_dti_add_timedeltalike(self, tz_naive_fixture, delta, box):
+        # GH#22005, GH#22163 check DataFrame doesn't raise TypeError
         tz = tz_naive_fixture
         rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz)
+        rng = tm.box_expected(rng, box)
+
         result = rng + delta
         expected = pd.date_range('2000-01-01 02:00',
                                  '2000-02-01 02:00', tz=tz)
-        tm.assert_index_equal(result, expected)
+        expected = tm.box_expected(expected, box)
+        tm.assert_equal(result, expected)
 
     def test_dti_iadd_timedeltalike(self, tz_naive_fixture, delta):
         tz = tz_naive_fixture
@@ -1662,14 +1689,8 @@ def test_dti_with_offset_series(self, tz_naive_fixture, names):
             res3 = dti - other
         tm.assert_series_equal(res3, expected_sub)
 
-    @pytest.mark.parametrize('box', [
-        pd.Index,
-        pd.Series,
-        pytest.param(pd.DataFrame,
-                     marks=pytest.mark.xfail(reason="Returns object dtype",
-                                             strict=True))
-    ], ids=lambda x: x.__name__)
     def test_dti_add_offset_tzaware(self, tz_aware_fixture, box):
+        # GH#21610, GH#22163 ensure DataFrame doesn't return object-dtype
         timezone = tz_aware_fixture
         if timezone == 'US/Pacific':
             dates = date_range('2012-11-01', periods=3, tz=timezone)

diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py
@@ -58,13 +58,6 @@ def test_ops_series(self):
         tm.assert_series_equal(expected, td * other)
         tm.assert_series_equal(expected, other * td)
 
-    @pytest.mark.parametrize('box', [
-        pd.Index,
-        Series,
-        pytest.param(pd.DataFrame,
-                     marks=pytest.mark.xfail(reason="block.eval incorrect",
-                                             strict=True))
-    ])
     @pytest.mark.parametrize('index', [
         pd.Int64Index(range(1, 11)),
         pd.UInt64Index(range(1, 11)),
@@ -79,7 +72,7 @@ def test_ops_series(self):
     def test_numeric_arr_mul_tdscalar(self, scalar_td, index, box):
         # GH#19333
 
-        if (box is Series and
+        if (box in [Series, pd.DataFrame] and
                 type(scalar_td) is timedelta and index.dtype == 'f8'):
             raise pytest.xfail(reason="Cannot multiply timedelta by float")
 

diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -17,6 +17,53 @@
 # Comparisons
 
 class TestFrameComparisons(object):
+    def test_flex_comparison_nat(self):
+        # GH#15697, GH#22163 df.eq(pd.NaT) should behave like df == pd.NaT,
+        # and _definitely_ not be NaN
+        df = pd.DataFrame([pd.NaT])
+
+        result = df == pd.NaT
+        # result.iloc[0, 0] is a np.bool_ object
+        assert result.iloc[0, 0].item() is False
+
+        result = df.eq(pd.NaT)
+        assert result.iloc[0, 0].item() is False
+
+        result = df != pd.NaT
+        assert result.iloc[0, 0].item() is True
+
+        result = df.ne(pd.NaT)
+        assert result.iloc[0, 0].item() is True
+
+    def test_mixed_comparison(self):
+        # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
+        # not raise TypeError
+        # (this appears to be fixed before #22163, not sure when)
+        df = pd.DataFrame([['1989-08-01', 1], ['1989-08-01', 2]])
+        other = pd.DataFrame([['a', 'b'], ['c', 'd']])
+
+        result = df == other
+        assert not result.any().any()
+
+        result = df != other
+        assert result.all().all()
+
+    def test_df_numeric_cmp_dt64_raises(self):
+        # GH#8932, GH#22163
+        ts = pd.Timestamp.now()
+        df = pd.DataFrame({'x': range(5)})
+        with pytest.raises(TypeError):
+            df > ts
+        with pytest.raises(TypeError):
+            df < ts
+        with pytest.raises(TypeError):
+            ts < df
+        with pytest.raises(TypeError):
+            ts > df
+
+        assert not (df == ts).any().any()
+        assert (df != ts).all().all()
+
     def test_df_boolean_comparison_error(self):
         # GH#4576
         # boolean comparisons with a tuple/list give unexpected results
@@ -32,8 +79,8 @@ def test_df_float_none_comparison(self):
         df = pd.DataFrame(np.random.randn(8, 3), index=range(8),
                           columns=['A', 'B', 'C'])
 
-        with pytest.raises(TypeError):
-            df.__eq__(None)
+        result = df.__eq__(None)
+        assert not result.any().any()
 
     def test_df_string_comparison(self):
         df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
@@ -251,3 +298,20 @@ def test_arith_flex_zero_len_raises(self):
 
         with tm.assert_raises_regex(NotImplementedError, 'fill_value'):
             df_len0.sub(df['A'], axis=None, fill_value=3)
+
+
+class TestFrameArithmetic(object):
+    def test_df_bool_mul_int(self):
+        # GH#22047, GH#22163 multiplication by 1 should result in int dtype,
+        # not object dtype
+        df = pd.DataFrame([[False, True], [False, False]])
+        result = df * 1
+
+        # On appveyor this comes back as np.int32 instead of np.int64,
+        # so we check dtype.kind instead of just dtype
+        kinds = result.dtypes.apply(lambda x: x.kind)
+        assert (kinds == 'i').all()
+
+        result = 1 * df
+        kinds = result.dtypes.apply(lambda x: x.kind)
+        assert (kinds == 'i').all()
diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py
@@ -273,6 +273,8 @@ def test_getitem_boolean(self):
         # test df[df > 0]
         for df in [self.tsframe, self.mixed_frame,
                    self.mixed_float, self.mixed_int]:
+            if compat.PY3 and df is self.mixed_frame:
+                continue
 
             data = df._get_numeric_data()
             bif = df[df > 0]
@@ -2468,8 +2470,11 @@ def test_boolean_indexing_mixed(self):
         assert_frame_equal(df2, expected)
 
         df['foo'] = 'test'
-        with tm.assert_raises_regex(TypeError, 'boolean setting '
-                                    'on mixed-type'):
+        msg = ("boolean setting on mixed-type|"
+               "not supported between|"
+               "unorderable types")
+        with tm.assert_raises_regex(TypeError, msg):
+            # TODO: This message should be the same in PY2/PY3
             df[df > 0.3] = 1
 
     def test_where(self):
@@ -2502,6 +2507,10 @@ def _check_get(df, cond, check_dtypes=True):
         # check getting
         for df in [default_frame, self.mixed_frame,
                    self.mixed_float, self.mixed_int]:
+            if compat.PY3 and df is self.mixed_frame:
+                with pytest.raises(TypeError):
+                    df > 0
+                continue
             cond = df > 0
             _check_get(df, cond)
 
@@ -2549,6 +2558,10 @@ def _check_align(df, cond, other, check_dtypes=True):
                 assert (rs.dtypes == df.dtypes).all()
 
         for df in [self.mixed_frame, self.mixed_float, self.mixed_int]:
+            if compat.PY3 and df is self.mixed_frame:
+                with pytest.raises(TypeError):
+                    df > 0
+                continue
 
             # other is a frame
             cond = (df > 0)[1:]
@@ -2594,6 +2607,10 @@ def _check_set(df, cond, check_dtypes=True):
 
         for df in [default_frame, self.mixed_frame, self.mixed_float,
                    self.mixed_int]:
+            if compat.PY3 and df is self.mixed_frame:
+                with pytest.raises(TypeError):
+                    df > 0
+                continue
 
             cond = df > 0
             _check_set(df, cond)
@@ -2759,9 +2776,14 @@ def test_where_datetime(self):
                             C=np.random.randn(5)))
 
         stamp = datetime(2013, 1, 3)
-        result = df[df > stamp]
+        with pytest.raises(TypeError):
+            df > stamp
+
+        result = df[df.iloc[:, :-1] > stamp]
+
         expected = df.copy()
         expected.loc[[0, 1], 'A'] = np.nan
+        expected.loc[:, 'C'] = np.nan
         assert_frame_equal(result, expected)
 
     def test_where_none(self):