implement+test mean for datetimelike EA/Index/Series (#24757)

pandas-dev · Jun 10, 2019 · efc7f2f · efc7f2f
1 parent 959e799
commit efc7f2f
Show file tree

Hide file tree

Showing 9 changed files with 181 additions and 1 deletion.
diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst
@@ -403,6 +403,13 @@ Conversion
    DatetimeIndex.to_series
    DatetimeIndex.to_frame
 
+Methods
+~~~~~~~
+.. autosummary::
+    :toctree: api/
+
+    DatetimeIndex.mean
+
 TimedeltaIndex
 --------------
 .. autosummary::
@@ -435,6 +442,13 @@ Conversion
    TimedeltaIndex.ceil
    TimedeltaIndex.to_frame
 
+Methods
+~~~~~~~
+.. autosummary::
+    :toctree: api/
+
+    TimedeltaIndex.mean
+
 .. currentmodule:: pandas
 
 PeriodIndex

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -96,6 +96,7 @@ Other Enhancements
 - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)
 - :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`)
 - :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`)
+- :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a `mean` method (:issue:`24757`)
 -
 
 .. _whatsnew_0250.api_breaking:

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -1382,7 +1382,7 @@ def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise',
     def _reduce(self, name, axis=0, skipna=True, **kwargs):
         op = getattr(self, name, None)
         if op:
-            return op(axis=axis, skipna=skipna, **kwargs)
+            return op(skipna=skipna, **kwargs)
         else:
             return super()._reduce(name, skipna, **kwargs)
 
@@ -1438,6 +1438,54 @@ def max(self, axis=None, skipna=True, *args, **kwargs):
         # Don't have to worry about NA `result`, since no NA went in.
         return self._box_func(result)
 
+    def mean(self, skipna=True):
+        """
+        Return the mean value of the Array.
+
+        .. versionadded:: 0.25.0
+
+        Parameters
+        ----------
+        skipna : bool, default True
+            Whether to ignore any NaT elements
+
+        Returns
+        -------
+        scalar (Timestamp or Timedelta)
+
+        See Also
+        --------
+        numpy.ndarray.mean
+        Series.mean : Return the mean value in a Series.
+
+        Notes
+        -----
+        mean is only defined for Datetime and Timedelta dtypes, not for Period.
+        """
+        if is_period_dtype(self):
+            # See discussion in GH#24757
+            raise TypeError(
+                "mean is not implemented for {cls} since the meaning is "
+                "ambiguous.  An alternative is "
+                "obj.to_timestamp(how='start').mean()"
+                .format(cls=type(self).__name__))
+
+        mask = self.isna()
+        if skipna:
+            values = self[~mask]
+        elif mask.any():
+            return NaT
+        else:
+            values = self
+
+        if not len(values):
+            # short-circut for empty max / min
+            return NaT
+
+        result = nanops.nanmean(values.view('i8'), skipna=skipna)
+        # Don't have to worry about NA `result`, since no NA went in.
+        return self._box_func(result)
+
 
 # -------------------------------------------------------------------
 # Shared Constructor Helpers

diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
@@ -73,6 +73,7 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin):
     _maybe_mask_results = ea_passthrough(
         DatetimeLikeArrayMixin._maybe_mask_results)
     __iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__)
+    mean = ea_passthrough(DatetimeLikeArrayMixin.mean)
 
     @property
     def freq(self):

diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -203,6 +203,7 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin):
     to_frame
     month_name
     day_name
+    mean
 
     See Also
     --------

diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
@@ -129,6 +129,7 @@ class TimedeltaIndex(DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index,
     floor
     ceil
     to_frame
+    mean
 
     See Also
     --------

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -3729,6 +3729,10 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
         elif is_datetime64_dtype(delegate):
             # use DatetimeIndex implementation to handle skipna correctly
             delegate = DatetimeIndex(delegate)
+        elif is_timedelta64_dtype(delegate) and hasattr(TimedeltaIndex, name):
+            # use TimedeltaIndex to handle skipna correctly
+            # TODO: remove hasattr check after TimedeltaIndex has `std` method
+            delegate = TimedeltaIndex(delegate)
 
         # dispatch to numpy arrays
         elif isinstance(delegate, np.ndarray):

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -1205,6 +1205,47 @@ def test_mean_corner(self, float_frame, float_string_frame):
         means = float_frame.mean(0)
         assert means['bool'] == float_frame['bool'].values.mean()
 
+    def test_mean_datetimelike(self):
+        # GH#24757 check that datetimelike are excluded by default, handled
+        #  correctly with numeric_only=True
+
+        df = pd.DataFrame({
+            'A': np.arange(3),
+            'B': pd.date_range('2016-01-01', periods=3),
+            'C': pd.timedelta_range('1D', periods=3),
+            'D': pd.period_range('2016', periods=3, freq='A')
+        })
+        result = df.mean(numeric_only=True)
+        expected = pd.Series({'A': 1.})
+        tm.assert_series_equal(result, expected)
+
+        result = df.mean()
+        expected = pd.Series({
+            'A': 1.,
+            'C': df.loc[1, 'C']
+        })
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.xfail(reason="casts to object-dtype and then tries to "
+                              "add timestamps",
+                       raises=TypeError, strict=True)
+    def test_mean_datetimelike_numeric_only_false(self):
+        df = pd.DataFrame({
+            'A': np.arange(3),
+            'B': pd.date_range('2016-01-01', periods=3),
+            'C': pd.timedelta_range('1D', periods=3),
+            'D': pd.period_range('2016', periods=3, freq='A')
+        })
+
+        result = df.mean(numeric_only=False)
+        expected = pd.Series({
+            'A': 1,
+            'B': df.loc[1, 'B'],
+            'C': df.loc[1, 'C'],
+            'D': df.loc[1, 'D']
+        })
+        tm.assert_series_equal(result, expected)
+
     def test_stats_mixed_type(self, float_string_frame):
         # don't blow up
         float_string_frame.std(1)

diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py
@@ -10,9 +10,78 @@
 
 import pandas as pd
 from pandas import DataFrame, Series
+from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
 import pandas.util.testing as tm
 
 
+class TestDatetimeLikeStatReductions:
+
+    @pytest.mark.parametrize('box', [Series, pd.Index, DatetimeArray])
+    def test_dt64_mean(self, tz_naive_fixture, box):
+        tz = tz_naive_fixture
+
+        dti = pd.date_range('2001-01-01', periods=11, tz=tz)
+        # shuffle so that we are not just working with monotone-increasing
+        dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
+        dtarr = dti._data
+
+        obj = box(dtarr)
+        assert obj.mean() == pd.Timestamp('2001-01-06', tz=tz)
+        assert obj.mean(skipna=False) == pd.Timestamp('2001-01-06', tz=tz)
+
+        # dtarr[-2] will be the first date 2001-01-1
+        dtarr[-2] = pd.NaT
+
+        obj = box(dtarr)
+        assert obj.mean() == pd.Timestamp('2001-01-06 07:12:00', tz=tz)
+        assert obj.mean(skipna=False) is pd.NaT
+
+    @pytest.mark.parametrize('box', [Series, pd.Index, PeriodArray])
+    def test_period_mean(self, box):
+        # GH#24757
+        dti = pd.date_range('2001-01-01', periods=11)
+        # shuffle so that we are not just working with monotone-increasing
+        dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
+
+        # use hourly frequency to avoid rounding errors in expected results
+        #  TODO: flesh this out with different frequencies
+        parr = dti._data.to_period('H')
+        obj = box(parr)
+        with pytest.raises(TypeError, match="ambiguous"):
+            obj.mean()
+        with pytest.raises(TypeError, match="ambiguous"):
+            obj.mean(skipna=True)
+
+        # parr[-2] will be the first date 2001-01-1
+        parr[-2] = pd.NaT
+
+        with pytest.raises(TypeError, match="ambiguous"):
+            obj.mean()
+        with pytest.raises(TypeError, match="ambiguous"):
+            obj.mean(skipna=True)
+
+    @pytest.mark.parametrize('box', [Series, pd.Index, TimedeltaArray])
+    def test_td64_mean(self, box):
+        tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4],
+                                unit='D')
+
+        tdarr = tdi._data
+        obj = box(tdarr)
+
+        result = obj.mean()
+        expected = np.array(tdarr).mean()
+        assert result == expected
+
+        tdarr[0] = pd.NaT
+        assert obj.mean(skipna=False) is pd.NaT
+
+        result2 = obj.mean(skipna=True)
+        assert result2 == tdi[1:].mean()
+
+        # exact equality fails by 1 nanosecond
+        assert result2.round('us') == (result * 11. / 10).round('us')
+
+
 class TestSeriesStatReductions:
     # Note: the name TestSeriesStatReductions indicates these tests
     #  were moved from a series-specific test file, _not_ that these tests are