def ensure_int64_or_float64(arr, copy=False):
    """
    Ensure that an array of some integer dtype has an int64 dtype if possible.

    If the cast to int64 is not possible under NumPy's 'safe' casting
    rule, potentially because of overflow, convert the array to float64
    instead.

    Parameters
    ----------
    arr : array-like
        The array whose data type we want to enforce. Objects that do not
        provide an ``astype`` method (e.g. lists or tuples) are first
        converted with ``numpy.asarray``.
    copy : bool, default False
        Whether to copy the original array or reuse it in place,
        if possible.

    Returns
    -------
    out_arr : array
        The input cast to int64 if possible without overflow,
        otherwise the input cast to float64.
    """
    if not hasattr(arr, 'astype'):
        # Plain sequences have no ``astype``; coerce them so the documented
        # "array-like" contract holds. Imported locally because it is only
        # needed on this fallback path.
        import numpy as np
        arr = np.asarray(arr)

    try:
        # casting='safe' makes the cast raise TypeError rather than silently
        # wrapping values that do not fit inside int64 (GH 22487).
        return arr.astype('int64', copy=copy, casting='safe')
    except TypeError:
        return arr.astype('float64', copy=copy)
def test_groupby_mean_no_overflow():
    """Group means of integers too large for int64 must not overflow."""
    # Regression test for (#22487)
    # The last value exceeds the int64 maximum, so it cannot be stored
    # as int64 without wrapping around.
    connection_counts = [4970, 4749, 4719, 4704, 18446744073699999744]
    frame = pd.DataFrame({
        "user": ["A"] * len(connection_counts),
        "connections": connection_counts,
    })

    group_means = frame.groupby('user')['connections'].mean()
    mean_for_a = group_means['A']

    assert mean_for_a == 3689348814740003840