From 0c8ab058a3eae667f41ae811b388cea58a6a8eb3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 27 Sep 2023 18:04:07 -0400 Subject: [PATCH] REF: Add tests.groupby.methods --- pandas/tests/groupby/methods/__init__.py | 0 .../groupby/{ => methods}/test_any_all.py | 0 pandas/tests/groupby/methods/test_corrwith.py | 24 + pandas/tests/groupby/methods/test_cum.py | 291 ++++++ pandas/tests/groupby/methods/test_describe.py | 221 +++++ .../{ => methods}/test_groupby_shift_diff.py | 0 .../groupby/methods/test_idxmin_idxmax.py | 76 ++ .../groupby/methods/test_is_monotonic.py | 78 ++ pandas/tests/groupby/methods/test_mean.py | 28 + pandas/tests/groupby/methods/test_median.py | 38 + .../groupby/{ => methods}/test_min_max.py | 17 + .../methods/test_nlargest_nsmallest.py | 115 +++ .../tests/groupby/{ => methods}/test_nth.py | 0 .../groupby/{ => methods}/test_nunique.py | 0 .../groupby/{ => methods}/test_quantile.py | 0 .../tests/groupby/{ => methods}/test_rank.py | 0 .../groupby/{ => methods}/test_sample.py | 0 .../tests/groupby/{ => methods}/test_size.py | 0 .../tests/groupby/{ => methods}/test_skew.py | 0 pandas/tests/groupby/methods/test_sum.py | 72 ++ .../{ => methods}/test_value_counts.py | 0 pandas/tests/groupby/test_function.py | 883 ------------------ 22 files changed, 960 insertions(+), 883 deletions(-) create mode 100644 pandas/tests/groupby/methods/__init__.py rename pandas/tests/groupby/{ => methods}/test_any_all.py (100%) create mode 100644 pandas/tests/groupby/methods/test_corrwith.py create mode 100644 pandas/tests/groupby/methods/test_cum.py create mode 100644 pandas/tests/groupby/methods/test_describe.py rename pandas/tests/groupby/{ => methods}/test_groupby_shift_diff.py (100%) create mode 100644 pandas/tests/groupby/methods/test_idxmin_idxmax.py create mode 100644 pandas/tests/groupby/methods/test_is_monotonic.py create mode 100644 pandas/tests/groupby/methods/test_mean.py create mode 100644 pandas/tests/groupby/methods/test_median.py rename pandas/tests/groupby/{ => methods}/test_min_max.py (94%) create mode 100644 pandas/tests/groupby/methods/test_nlargest_nsmallest.py rename pandas/tests/groupby/{ => methods}/test_nth.py (100%) rename pandas/tests/groupby/{ => methods}/test_nunique.py (100%) rename pandas/tests/groupby/{ => methods}/test_quantile.py (100%) rename pandas/tests/groupby/{ => methods}/test_rank.py (100%) rename pandas/tests/groupby/{ => methods}/test_sample.py (100%) rename pandas/tests/groupby/{ => methods}/test_size.py (100%) rename pandas/tests/groupby/{ => methods}/test_skew.py (100%) create mode 100644 pandas/tests/groupby/methods/test_sum.py rename pandas/tests/groupby/{ => methods}/test_value_counts.py (100%) diff --git a/pandas/tests/groupby/methods/__init__.py b/pandas/tests/groupby/methods/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/groupby/test_any_all.py b/pandas/tests/groupby/methods/test_any_all.py similarity index 100% rename from pandas/tests/groupby/test_any_all.py rename to pandas/tests/groupby/methods/test_any_all.py diff --git a/pandas/tests/groupby/methods/test_corrwith.py b/pandas/tests/groupby/methods/test_corrwith.py new file mode 100644 index 0000000000000..53e8bdc4534dc --- /dev/null +++ b/pandas/tests/groupby/methods/test_corrwith.py @@ -0,0 +1,24 @@ +import numpy as np + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +def test_corrwith_with_1_axis(): + # GH 47723 + df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) + gb = df.groupby("a") + + msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.corrwith(df, axis=1) + index = Index( + data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], + name=("a", None), + ) + expected = Series([np.nan] * 6, index=index) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_cum.py b/pandas/tests/groupby/methods/test_cum.py new file mode 100644 index 0000000000000..eecb82cd5050b --- /dev/null +++ b/pandas/tests/groupby/methods/test_cum.py @@ -0,0 +1,291 @@ +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +@pytest.fixture( + params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"], + ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], +) +def dtypes_for_minmax(request): + """ + Fixture of dtypes with min and max values used for testing + cummin and cummax + """ + dtype = request.param + + np_type = dtype + if dtype == "Int64": + np_type = np.int64 + elif dtype == "Float64": + np_type = np.float64 + + min_val = ( + np.iinfo(np_type).min + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).min + ) + max_val = ( + np.iinfo(np_type).max + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).max + ) + + return (dtype, min_val, max_val) + + +def test_groupby_cumprod(): + # GH 4095 + df = DataFrame({"key": ["b"] * 10, "value": 2}) + + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) + expected.name = "value" + tm.assert_series_equal(actual, expected) + + df = DataFrame({"key": ["b"] * 100, "value": 2}) + df["value"] = df["value"].astype(float) + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) + expected.name = "value" + tm.assert_series_equal(actual, expected) + + +def test_groupby_cumprod_overflow(): + # GH#37493 if we overflow we return garbage consistent with numpy + df = DataFrame({"key": ["b"] * 4, "value": 100_000}) + actual = df.groupby("key")["value"].cumprod() + expected = Series( + [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], + name="value", + ) + tm.assert_series_equal(actual, expected) + + numpy_result = df.groupby("key", group_keys=False)["value"].apply( + lambda x: x.cumprod() + ) + numpy_result.name = "value" + tm.assert_series_equal(actual, numpy_result) + + +def test_groupby_cumprod_nan_influences_other_columns(): + # GH#48064 + df = DataFrame( + { + "a": 1, + "b": [1, np.nan, 2], + "c": [1, 2, 3.0], + } + ) + result = df.groupby("a").cumprod(numeric_only=True, skipna=False) + expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]}) + tm.assert_frame_equal(result, expected) + + +def test_cummin(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + min_val = dtypes_for_minmax[1] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + + df = base_df.astype(dtype) + + expected = DataFrame({"B": expected_mins}).astype(dtype) + result = df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test w/ min value for dtype + df.loc[[2, 6], "B"] = min_val + df.loc[[1, 5], "B"] = min_val + 1 + expected.loc[[2, 3, 6, 7], "B"] = min_val + expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val + result = df.groupby("A").cummin() + tm.assert_frame_equal(result, expected, check_exact=True) + expected = ( + df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + ) + tm.assert_frame_equal(result, expected, check_exact=True) + + # Test nan in some values + # Explicit cast to float to avoid implicit cast when setting nan + base_df = base_df.astype({"B": "float"}) + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) + result = base_df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + expected = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # GH 15561 + df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) + expected = Series(pd.to_datetime("2001"), index=[0], name="b") + + result = df.groupby("a")["b"].cummin() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) + result = df.groupby("a").b.cummin() + expected = Series([1, 2, 1], name="b") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) +def test_cummin_max_all_nan_column(method, dtype): + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) + base_df["B"] = base_df["B"].astype(dtype) + grouped = base_df.groupby("A") + + expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) + result = getattr(grouped, method)() + tm.assert_frame_equal(expected, result) + + result = getattr(grouped["B"], method)().to_frame() + tm.assert_frame_equal(expected, result) + + +def test_cummax(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + max_val = dtypes_for_minmax[2] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + df = base_df.astype(dtype) + + expected = DataFrame({"B": expected_maxs}).astype(dtype) + result = df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test w/ max value for dtype + df.loc[[2, 6], "B"] = max_val + expected.loc[[2, 3, 6, 7], "B"] = max_val + result = df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + expected = ( + df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # Test nan in some values + # Explicit cast to float to avoid implicit cast when setting nan + base_df = base_df.astype({"B": "float"}) + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) + result = base_df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + expected = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # GH 15561 + df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) + expected = Series(pd.to_datetime("2001"), index=[0], name="b") + + result = df.groupby("a")["b"].cummax() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) + result = df.groupby("a").b.cummax() + expected = Series([2, 1, 2], name="b") + tm.assert_series_equal(result, expected) + + +def test_cummax_i8_at_implementation_bound(): + # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT + # for int64 dtype GH#46382 + ser = Series([pd.NaT._value + n for n in range(5)]) + df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")}) + gb = df.groupby("A") + + res = gb.cummax() + exp = df[["B", "C"]] + tm.assert_frame_equal(res, exp) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) +@pytest.mark.parametrize( + "groups,expected_data", + [ + ([1, 1, 1], [1, None, None]), + ([1, 2, 3], [1, None, 2]), + ([1, 3, 3], [1, None, None]), + ], +) +def test_cummin_max_skipna(method, dtype, groups, expected_data): + # GH-34047 + df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) + orig = df.copy() + gb = df.groupby(groups)["a"] + + result = getattr(gb, method)(skipna=False) + expected = Series(expected_data, dtype=dtype, name="a") + + # check we didn't accidentally alter df + tm.assert_frame_equal(df, orig) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +def test_cummin_max_skipna_multiple_cols(method): + # Ensure missing value in "a" doesn't cause "b" to be nan-filled + df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) + gb = df.groupby([1, 1, 1])[["a", "b"]] + + result = getattr(gb, method)(skipna=False) + expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["cumprod", "cumsum"]) +def test_numpy_compat(func): + # see gh-12811 + df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + g = df.groupby("A") + + msg = "numpy operations are not valid with groupby" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(g, func)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(g, func)(foo=1) + + +@td.skip_if_32bit +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize( + "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)] +) +def test_nullable_int_not_cast_as_float(method, dtype, val): + data = [val, pd.NA] + df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) + grouped = df.groupby("grp") + + result = grouped.transform(method) + expected = DataFrame({"b": data}, dtype=dtype) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py new file mode 100644 index 0000000000000..f38de8faddb59 --- /dev/null +++ b/pandas/tests/groupby/methods/test_describe.py @@ -0,0 +1,221 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Timestamp, +) +import pandas._testing as tm + + +def test_apply_describe_bug(mframe): + grouped = mframe.groupby(level="first") + grouped.describe() # it works! + + +def test_series_describe_multikey(): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) + tm.assert_series_equal(result["std"], grouped.std(), check_names=False) + tm.assert_series_equal(result["min"], grouped.min(), check_names=False) + + +def test_series_describe_single(): + ts = tm.makeTimeSeries() + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe().stack(future_stack=True) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) +def test_series_describe_as_index(as_index, keys): + # GH#49256 + df = DataFrame( + { + "key1": ["one", "two", "two", "three", "two"], + "key2": ["one", "two", "two", "three", "two"], + "foo2": [1, 2, 4, 4, 6], + } + ) + gb = df.groupby(keys, as_index=as_index)["foo2"] + result = gb.describe() + expected = DataFrame( + { + "key1": ["one", "three", "two"], + "count": [1.0, 1.0, 3.0], + "mean": [1.0, 4.0, 4.0], + "std": [np.nan, np.nan, 2.0], + "min": [1.0, 4.0, 2.0], + "25%": [1.0, 4.0, 3.0], + "50%": [1.0, 4.0, 4.0], + "75%": [1.0, 4.0, 5.0], + "max": [1.0, 4.0, 6.0], + } + ) + if len(keys) == 2: + expected.insert(1, "key2", expected["key1"]) + if as_index: + expected = expected.set_index(keys) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_multikey(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + desc_groups = [] + for col in tsframe: + group = grouped[col].describe() + # GH 17464 - Remove duplicate MultiIndex levels + group_col = MultiIndex( + levels=[[col], group.columns], + codes=[[0] * len(group.columns), range(len(group.columns))], + ) + group = DataFrame(group.values, columns=group_col, index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) + result = groupedT.describe() + expected = tsframe.describe().T + # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ + expected.index = MultiIndex( + levels=[[0, 1], expected.index], + codes=[[0, 0, 1, 1], range(len(expected.index))], + ) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_tupleindex(): + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame( + { + "x": [1, 2, 3, 4, 5] * 3, + "y": [10, 20, 30, 40, 50] * 3, + "z": [100, 200, 300, 400, 500] * 3, + } + ) + df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={"k": "key"}) + msg = "Names should be list-like for a MultiIndex" + with pytest.raises(ValueError, match=msg): + df1.groupby("k").describe() + with pytest.raises(ValueError, match=msg): + df2.groupby("key").describe() + + +def test_frame_describe_unstacked_format(): + # GH 4792 + prices = { + Timestamp("2011-01-06 10:59:05", tz=None): 24990, + Timestamp("2011-01-06 12:43:33", tz=None): 25499, + Timestamp("2011-01-06 12:54:09", tz=None): 25499, + } + volumes = { + Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + } + df = DataFrame({"PRICE": prices, "VOLUME": volumes}) + result = df.groupby("PRICE").VOLUME.describe() + data = [ + df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist(), + ] + expected = DataFrame( + data, + index=Index([24990, 25499], name="PRICE"), + columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.filterwarnings( + "ignore:" + "indexing past lexsort depth may impact performance:" + "pandas.errors.PerformanceWarning" +) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_describe_with_duplicate_output_column_names(as_index, keys): + # GH 35314 + df = DataFrame( + { + "a1": [99, 99, 99, 88, 88, 88], + "a2": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + }, + columns=["a1", "a2", "b", "b"], + copy=False, + ) + if keys == ["a1"]: + df = df.drop(columns="a2") + + expected = ( + DataFrame.from_records( + [ + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ], + ) + .set_index([0, 1]) + .T + ) + expected.columns.names = [None, None] + if len(keys) == 2: + expected.index = MultiIndex( + levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] + ) + else: + expected.index = Index([88, 99], name="a1") + + if not as_index: + expected = expected.reset_index() + + result = df.groupby(keys, as_index=as_index).describe() + + tm.assert_frame_equal(result, expected) + + +def test_describe_duplicate_columns(): + # GH#50806 + df = DataFrame([[0, 1, 2, 3]]) + df.columns = [0, 1, 2, 0] + gb = df.groupby(df[1]) + result = gb.describe(percentiles=[]) + + columns = ["count", "mean", "std", "min", "50%", "max"] + frames = [ + DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) + for val in (0.0, 2.0, 3.0) + ] + expected = pd.concat(frames, axis=1) + expected.columns = MultiIndex( + levels=[[0, 2], columns], + codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], + ) + expected.index.names = [1] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/methods/test_groupby_shift_diff.py similarity index 100% rename from pandas/tests/groupby/test_groupby_shift_diff.py rename to pandas/tests/groupby/methods/test_groupby_shift_diff.py diff --git a/pandas/tests/groupby/methods/test_idxmin_idxmax.py b/pandas/tests/groupby/methods/test_idxmin_idxmax.py new file mode 100644 index 0000000000000..f9b4d73fd97ab --- /dev/null +++ b/pandas/tests/groupby/methods/test_idxmin_idxmax.py @@ -0,0 +1,76 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + date_range, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), + ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), + ], +) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_idxmin_idxmax_returns_int_types(func, values, numeric_only): + # GH 25444 + df = DataFrame( + { + "name": ["A", "A", "B", "B"], + "c_int": [1, 2, 3, 4], + "c_float": [4.02, 3.03, 2.04, 1.05], + "c_date": ["2019", "2018", "2016", "2017"], + } + ) + df["c_date"] = pd.to_datetime(df["c_date"]) + df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific") + df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0] + df["c_period"] = df["c_date"].dt.to_period("W") + df["c_Integer"] = df["c_int"].astype("Int64") + df["c_Floating"] = df["c_float"].astype("Float64") + + result = getattr(df.groupby("name"), func)(numeric_only=numeric_only) + + expected = DataFrame(values, index=Index(["A", "B"], name="name")) + if numeric_only: + expected = expected.drop(columns=["c_date"]) + else: + expected["c_date_tz"] = expected["c_date"] + expected["c_timedelta"] = expected["c_date"] + expected["c_period"] = expected["c_date"] + expected["c_Integer"] = expected["c_int"] + expected["c_Floating"] = expected["c_float"] + + tm.assert_frame_equal(result, expected) + + +def test_idxmin_idxmax_axis1(): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] + ) + df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] + + gb = df.groupby("A") + + warn_msg = "DataFrameGroupBy.idxmax with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + res = gb.idxmax(axis=1) + + alt = df.iloc[:, 1:].idxmax(axis=1) + indexer = res.index.get_level_values(1) + + tm.assert_series_equal(alt[indexer], res.droplevel("A")) + + df["E"] = date_range("2016-01-01", periods=10) + gb2 = df.groupby("A") + + msg = "'>' not supported between instances of 'Timestamp' and 'float'" + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + gb2.idxmax(axis=1) diff --git a/pandas/tests/groupby/methods/test_is_monotonic.py b/pandas/tests/groupby/methods/test_is_monotonic.py new file mode 100644 index 0000000000000..3428fc90f6e51 --- /dev/null +++ b/pandas/tests/groupby/methods/test_is_monotonic.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) +def test_is_monotonic_increasing(in_vals, out_vals): + # GH 17015 + source_dict = { + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } + df = DataFrame(source_dict) + result = df.groupby("B").C.is_monotonic_increasing + index = Index(list("abcd"), name="B") + expected = Series(index=index, data=out_vals, name="C") + tm.assert_series_equal(result, expected) + + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) +def test_is_monotonic_decreasing(in_vals, out_vals): + # GH 17015 + source_dict = { + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } + + df = DataFrame(source_dict) + result = df.groupby("B").C.is_monotonic_decreasing + index = Index(list("abcd"), name="B") + expected = Series(index=index, data=out_vals, name="C") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_mean.py b/pandas/tests/groupby/methods/test_mean.py new file mode 100644 index 0000000000000..4123264ddcb27 --- /dev/null +++ b/pandas/tests/groupby/methods/test_mean.py @@ -0,0 +1,28 @@ +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +def test_groupby_mean_no_overflow(): + # Regression test for (#22487) + df = DataFrame( + { + "user": ["A", "A", "A", "A", "A"], + "connections": [4970, 4749, 4719, 4704, 18446744073699999744], + } + ) + assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 + + +def test_mean_on_timedelta(): + # GH 17382 + df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5}) + result = df.groupby("cat")["time"].mean() + expected = Series( + pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat") + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_median.py b/pandas/tests/groupby/methods/test_median.py new file mode 100644 index 0000000000000..148be348c4ac0 --- /dev/null +++ b/pandas/tests/groupby/methods/test_median.py @@ -0,0 +1,38 @@ +import numpy as np + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +def test_cython_median(): + arr = np.random.default_rng(2).standard_normal(1000) + arr[::2] = np.nan + df = DataFrame(arr) + + labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) + labels[::17] = np.nan + + result = df.groupby(labels).median() + msg = "using DataFrameGroupBy.median" + with tm.assert_produces_warning(FutureWarning, match=msg): + exp = df.groupby(labels).agg(np.nanmedian) + tm.assert_frame_equal(result, exp) + + df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5))) + msg = "using DataFrameGroupBy.median" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = df.groupby(labels).agg(np.median) + xp = df.groupby(labels).median() + tm.assert_frame_equal(rs, xp) + + +def test_median_empty_bins(observed): + df = DataFrame(np.random.default_rng(2).integers(0, 44, 500)) + + grps = range(0, 55, 5) + bins = pd.cut(df[0], grps) + + result = df.groupby(bins, observed=observed).median() + expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/methods/test_min_max.py similarity index 94% rename from pandas/tests/groupby/test_min_max.py rename to pandas/tests/groupby/methods/test_min_max.py index 37eb52be0b37b..5b6fdec306515 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/methods/test_min_max.py @@ -1,3 +1,5 @@ +from io import StringIO + import numpy as np import pytest @@ -270,3 +272,18 @@ def test_groupby_min_max_categorical(func): index=idx, ) tm.assert_frame_equal(result, expected) + + +def test_max_nan_bug(): + raw = """,Date,app,File +-04-23,2013-04-23 00:00:00,,log080001.log +-05-06,2013-05-06 00:00:00,,log.log +-05-07,2013-05-07 00:00:00,OE,xlsx""" + + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + df = pd.read_csv(StringIO(raw), parse_dates=[0]) + gb = df.groupby("Date") + r = gb[["File"]].max() + e = gb["File"].max().to_frame() + tm.assert_frame_equal(r, e) + assert not r["File"].isna().any() diff --git a/pandas/tests/groupby/methods/test_nlargest_nsmallest.py b/pandas/tests/groupby/methods/test_nlargest_nsmallest.py new file mode 100644 index 0000000000000..061326bd36c53 --- /dev/null +++ b/pandas/tests/groupby/methods/test_nlargest_nsmallest.py @@ -0,0 +1,115 @@ +import numpy as np +import pytest + +from pandas import ( + MultiIndex, + Series, + date_range, +) +import pandas._testing as tm + + +def test_nlargest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nlargest(3) + e = Series( + [7, 5, 3, 10, 9, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [3, 2, 1, 3, 3, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), + ) + tm.assert_series_equal(gb.nlargest(3, keep="last"), e) + + +def test_nlargest_mi_grouper(): + # see gh-21411 + npr = np.random.default_rng(2) + + dts = date_range("20180101", periods=10) + iterables = [dts, ["one", "two"]] + + idx = MultiIndex.from_product(iterables, names=["first", "second"]) + s = Series(npr.standard_normal(20), index=idx) + + result = s.groupby("first").nlargest(1) + + exp_idx = MultiIndex.from_tuples( + [ + (dts[0], dts[0], "one"), + (dts[1], dts[1], "one"), + (dts[2], dts[2], "one"), + (dts[3], dts[3], "two"), + (dts[4], dts[4], "one"), + (dts[5], dts[5], "one"), + (dts[6], dts[6], "one"), + (dts[7], dts[7], "one"), + (dts[8], dts[8], "one"), + (dts[9], dts[9], "one"), + ], + names=["first", "first", "second"], + ) + + exp_values = [ + 0.18905338179353307, + -0.41306354339189344, + 1.799707382720902, + 0.7738065867276614, + 0.28121066979764925, + 0.9775674511260357, + -0.3288239040579627, + 0.45495807124085547, + 0.5452887139646817, + 0.12682784711186987, + ] + + expected = Series(exp_values, index=exp_idx) + tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3) + + +def test_nsmallest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nsmallest(3) + e = Series( + [1, 2, 3, 0, 4, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [0, 1, 1, 0, 1, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), + ) + tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) + + +@pytest.mark.parametrize( + "data, groups", + [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], +) +@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES]) +@pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) +def test_nlargest_and_smallest_noop(data, groups, dtype, method): + # GH 15272, GH 16345, GH 29129 + # Test nlargest/smallest when it results in a noop, + # i.e. input is sorted and group size <= n + if dtype is not None: + data = np.array(data, dtype=dtype) + if method == "nlargest": + data = list(reversed(data)) + ser = Series(data, name="a") + result = getattr(ser.groupby(groups), method)(n=2) + expidx = np.array(groups, dtype=np.int_) if isinstance(groups, list) else groups + expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/methods/test_nth.py similarity index 100% rename from pandas/tests/groupby/test_nth.py rename to pandas/tests/groupby/methods/test_nth.py diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/methods/test_nunique.py similarity index 100% rename from pandas/tests/groupby/test_nunique.py rename to pandas/tests/groupby/methods/test_nunique.py diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py similarity index 100% rename from pandas/tests/groupby/test_quantile.py rename to pandas/tests/groupby/methods/test_quantile.py diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/methods/test_rank.py similarity index 100% rename from pandas/tests/groupby/test_rank.py rename to pandas/tests/groupby/methods/test_rank.py diff --git a/pandas/tests/groupby/test_sample.py b/pandas/tests/groupby/methods/test_sample.py similarity index 100% rename from pandas/tests/groupby/test_sample.py rename to pandas/tests/groupby/methods/test_sample.py diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/methods/test_size.py similarity index 100% rename from pandas/tests/groupby/test_size.py rename to pandas/tests/groupby/methods/test_size.py diff --git a/pandas/tests/groupby/test_skew.py b/pandas/tests/groupby/methods/test_skew.py similarity index 100% rename from pandas/tests/groupby/test_skew.py rename to pandas/tests/groupby/methods/test_skew.py diff --git a/pandas/tests/groupby/methods/test_sum.py b/pandas/tests/groupby/methods/test_sum.py new file mode 100644 index 0000000000000..112075add67d1 --- /dev/null +++ b/pandas/tests/groupby/methods/test_sum.py @@ -0,0 +1,72 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("min_count", [0, 10]) +def test_groupby_sum_mincount_boolean(min_count): + b = True + a = False + na = np.nan + dfg = pd.array([b, b, na, na, a, a, b], dtype="boolean") + + df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg}) + result = df.groupby("A").sum(min_count=min_count) + if min_count == 0: + expected = DataFrame( + {"B": pd.array([3, 0, 0], dtype="Int64")}, + index=Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + else: + expected = DataFrame( + {"B": pd.array([pd.NA] * 3, dtype="Int64")}, + index=Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_below_mincount_nullable_integer(): + # https://github.com/pandas-dev/pandas/issues/32861 + df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") + grouped = df.groupby("a") + idx = Index([0, 1, 2], name="a", dtype="Int64") + + result = grouped["b"].sum(min_count=2) + expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") + tm.assert_series_equal(result, expected) + + result = grouped.sum(min_count=2) + expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_timedelta_with_nat(): + # GH#42659 + df = DataFrame( + { + "a": [1, 1, 2, 2], + "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], + } + ) + td3 = pd.Timedelta(days=3) + + gb = df.groupby("a") + + res = gb.sum() + expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a")) + tm.assert_frame_equal(res, expected) + + res = gb["b"].sum() + tm.assert_series_equal(res, expected["b"]) + + res = gb["b"].sum(min_count=2) + expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) + tm.assert_series_equal(res, expected) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py similarity index 100% rename from pandas/tests/groupby/test_value_counts.py rename to pandas/tests/groupby/methods/test_value_counts.py diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 41bbfcf6840a9..4876267c72f12 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,12 +1,10 @@ import builtins -from io import StringIO import re import numpy as np import pytest from pandas._libs import lib -from pandas.errors import UnsupportedFunctionCall import pandas as pd from pandas import ( @@ -22,37 +20,6 @@ from pandas.util import _test_decorators as td -@pytest.fixture( - params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"], - ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], -) -def dtypes_for_minmax(request): - """ - Fixture of dtypes with min and max values used for testing - cummin and cummax - """ - dtype = request.param - - np_type = dtype - if dtype == "Int64": - np_type = np.int64 - elif dtype == "Float64": - np_type = np.float64 - - min_val = ( - np.iinfo(np_type).min - if np.dtype(np_type).kind == "i" - else np.finfo(np_type).min - ) - max_val = ( - np.iinfo(np_type).max - if np.dtype(np_type).kind == "i" - else np.finfo(np_type).max - ) - - return (dtype, min_val, max_val) - - def test_intercept_builtin_sum(): s = Series([1.0, 2.0, np.nan, 3.0]) grouped = s.groupby([0, 1, 2, 2]) @@ -372,39 +339,6 @@ def test_cython_api2(): tm.assert_frame_equal(result, expected) -def test_cython_median(): - arr = np.random.default_rng(2).standard_normal(1000) - arr[::2] = np.nan - df = DataFrame(arr) - - labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) - labels[::17] = np.nan - - result = df.groupby(labels).median() - msg = "using DataFrameGroupBy.median" - with tm.assert_produces_warning(FutureWarning, match=msg): - exp = df.groupby(labels).agg(np.nanmedian) - tm.assert_frame_equal(result, exp) - - df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5))) - msg = "using DataFrameGroupBy.median" - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = df.groupby(labels).agg(np.median) - xp = df.groupby(labels).median() - tm.assert_frame_equal(rs, xp) - - -def test_median_empty_bins(observed): - df = DataFrame(np.random.default_rng(2).integers(0, 44, 500)) - - grps = range(0, 55, 5) - bins = pd.cut(df[0], grps) - - result = df.groupby(bins, observed=observed).median() - expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] ) @@ -478,72 +412,6 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i): assert res.iloc[0].b == data["expected"] -@pytest.mark.parametrize( - "func, values", - [ - ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), - ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), - ], -) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_idxmin_idxmax_returns_int_types(func, values, numeric_only): - # GH 25444 - df = DataFrame( - { - "name": ["A", "A", "B", "B"], - "c_int": [1, 2, 3, 4], - "c_float": [4.02, 3.03, 2.04, 1.05], - "c_date": ["2019", "2018", "2016", "2017"], - } - ) - df["c_date"] = pd.to_datetime(df["c_date"]) - df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific") - df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0] - df["c_period"] = df["c_date"].dt.to_period("W") - df["c_Integer"] = df["c_int"].astype("Int64") - df["c_Floating"] = df["c_float"].astype("Float64") - - result = getattr(df.groupby("name"), func)(numeric_only=numeric_only) - - expected = DataFrame(values, index=Index(["A", "B"], name="name")) - if numeric_only: - expected = expected.drop(columns=["c_date"]) - else: - expected["c_date_tz"] = expected["c_date"] - expected["c_timedelta"] = expected["c_date"] - expected["c_period"] = expected["c_date"] - expected["c_Integer"] = expected["c_int"] - expected["c_Floating"] = expected["c_float"] - - tm.assert_frame_equal(result, expected) - - -def test_idxmin_idxmax_axis1(): - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] - ) - df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] - - gb = df.groupby("A") - - warn_msg = "DataFrameGroupBy.idxmax with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - res = gb.idxmax(axis=1) - - alt = df.iloc[:, 1:].idxmax(axis=1) - indexer = res.index.get_level_values(1) - - tm.assert_series_equal(alt[indexer], res.droplevel("A")) - - df["E"] = date_range("2016-01-01", periods=10) - gb2 = df.groupby("A") - - msg = "'>' not supported between instances of 'Timestamp' and 'float'" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - gb2.idxmax(axis=1) - - @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_axis1_numeric_only(request, groupby_func, numeric_only): if groupby_func in ("idxmax", "idxmin"): @@ -625,54 +493,6 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only): tm.assert_equal(result, expected) -def test_groupby_cumprod(): - # GH 4095 - df = DataFrame({"key": ["b"] * 10, "value": 2}) - - actual = df.groupby("key")["value"].cumprod() - expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) - expected.name = "value" - tm.assert_series_equal(actual, expected) - - df = DataFrame({"key": ["b"] * 100, "value": 2}) - df["value"] = df["value"].astype(float) - actual = df.groupby("key")["value"].cumprod() - expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) - expected.name = "value" - tm.assert_series_equal(actual, expected) - - -def test_groupby_cumprod_overflow(): - # GH#37493 if we overflow we return garbage consistent with numpy - df = DataFrame({"key": ["b"] * 4, "value": 100_000}) - actual = df.groupby("key")["value"].cumprod() - expected = Series( - [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], - name="value", - ) - tm.assert_series_equal(actual, expected) - - numpy_result = df.groupby("key", group_keys=False)["value"].apply( - lambda x: x.cumprod() - ) - numpy_result.name = "value" - tm.assert_series_equal(actual, numpy_result) - - -def test_groupby_cumprod_nan_influences_other_columns(): - # GH#48064 - df = DataFrame( - { - "a": 1, - "b": [1, np.nan, 2], - "c": [1, 2, 3.0], - } - ) - result = df.groupby("a").cumprod(numeric_only=True, skipna=False) - expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]}) - tm.assert_frame_equal(result, expected) - - def scipy_sem(*args, **kwargs): from scipy.stats import sem @@ -708,627 +528,12 @@ def test_ops_general(op, targop): tm.assert_frame_equal(result, expected) -def test_max_nan_bug(): - raw = """,Date,app,File --04-23,2013-04-23 00:00:00,,log080001.log --05-06,2013-05-06 00:00:00,,log.log --05-07,2013-05-07 00:00:00,OE,xlsx""" - - with tm.assert_produces_warning(UserWarning, match="Could not infer format"): - df = pd.read_csv(StringIO(raw), parse_dates=[0]) - gb = df.groupby("Date") - r = gb[["File"]].max() - e = gb["File"].max().to_frame() - tm.assert_frame_equal(r, e) - assert not r["File"].isna().any() - - -def test_nlargest(): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list("a" * 5 + "b" * 5)) - gb = a.groupby(b) - r = gb.nlargest(3) - e = Series( - [7, 5, 3, 10, 9, 6], - index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), - ) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series( - [3, 2, 1, 3, 3, 2], - index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), - ) - tm.assert_series_equal(gb.nlargest(3, keep="last"), e) - - -def test_nlargest_mi_grouper(): - # see gh-21411 - npr = np.random.default_rng(2) - - dts = date_range("20180101", periods=10) - iterables = [dts, ["one", "two"]] - - idx = MultiIndex.from_product(iterables, names=["first", "second"]) - s = Series(npr.standard_normal(20), index=idx) - - result = s.groupby("first").nlargest(1) - - exp_idx = MultiIndex.from_tuples( - [ - (dts[0], dts[0], "one"), - (dts[1], dts[1], "one"), - (dts[2], dts[2], "one"), - (dts[3], dts[3], "two"), - (dts[4], dts[4], "one"), - (dts[5], dts[5], "one"), - (dts[6], dts[6], "one"), - (dts[7], dts[7], "one"), - (dts[8], dts[8], "one"), - (dts[9], dts[9], "one"), - ], - names=["first", "first", "second"], - ) - - exp_values = [ - 0.18905338179353307, - -0.41306354339189344, - 1.799707382720902, - 0.7738065867276614, - 0.28121066979764925, - 0.9775674511260357, - -0.3288239040579627, - 0.45495807124085547, - 0.5452887139646817, - 0.12682784711186987, - ] - - expected = Series(exp_values, index=exp_idx) - tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3) - - -def test_nsmallest(): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list("a" * 5 + "b" * 5)) - gb = a.groupby(b) - r = gb.nsmallest(3) - e = Series( - [1, 2, 3, 0, 4, 6], - index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), - ) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series( - [0, 1, 1, 0, 1, 2], - index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), - ) - tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) - - -@pytest.mark.parametrize( - "data, groups", - [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], -) -@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES]) -@pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) -def test_nlargest_and_smallest_noop(data, groups, dtype, method): - # GH 15272, GH 16345, GH 29129 - # Test nlargest/smallest when it results in a noop, - # i.e. input is sorted and group size <= n - if dtype is not None: - data = np.array(data, dtype=dtype) - if method == "nlargest": - data = list(reversed(data)) - ser = Series(data, name="a") - result = getattr(ser.groupby(groups), method)(n=2) - expidx = np.array(groups, dtype=np.int_) if isinstance(groups, list) else groups - expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("func", ["cumprod", "cumsum"]) -def test_numpy_compat(func): - # see gh-12811 - df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) - g = df.groupby("A") - - msg = "numpy operations are not valid with groupby" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(g, func)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(g, func)(foo=1) - - -def test_cummin(dtypes_for_minmax): - dtype = dtypes_for_minmax[0] - min_val = dtypes_for_minmax[1] - - # GH 15048 - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) - expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] - - df = base_df.astype(dtype) - - expected = DataFrame({"B": expected_mins}).astype(dtype) - result = df.groupby("A").cummin() - tm.assert_frame_equal(result, expected) - result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test w/ min value for dtype - df.loc[[2, 6], "B"] = min_val - df.loc[[1, 5], "B"] = min_val + 1 - expected.loc[[2, 3, 6, 7], "B"] = min_val - expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val - result = df.groupby("A").cummin() - tm.assert_frame_equal(result, expected, check_exact=True) - expected = ( - df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - ) - tm.assert_frame_equal(result, expected, check_exact=True) - - # Test nan in some values - # Explicit cast to float to avoid implicit cast when setting nan - base_df = base_df.astype({"B": "float"}) - base_df.loc[[0, 2, 4, 6], "B"] = np.nan - expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) - result = base_df.groupby("A").cummin() - tm.assert_frame_equal(result, expected) - expected = ( - base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # GH 15561 - df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) - expected = Series(pd.to_datetime("2001"), index=[0], name="b") - - result = df.groupby("a")["b"].cummin() - tm.assert_series_equal(expected, result) - - # GH 15635 - df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) - result = df.groupby("a").b.cummin() - expected = Series([1, 2, 1], name="b") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) -def test_cummin_max_all_nan_column(method, dtype): - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) - base_df["B"] = base_df["B"].astype(dtype) - grouped = base_df.groupby("A") - - expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) - result = getattr(grouped, method)() - tm.assert_frame_equal(expected, result) - - result = getattr(grouped["B"], method)().to_frame() - tm.assert_frame_equal(expected, result) - - -def test_cummax(dtypes_for_minmax): - dtype = dtypes_for_minmax[0] - max_val = dtypes_for_minmax[2] - - # GH 15048 - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) - expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] - - df = base_df.astype(dtype) - - expected = DataFrame({"B": expected_maxs}).astype(dtype) - result = df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test w/ max value for dtype - df.loc[[2, 6], "B"] = max_val - expected.loc[[2, 3, 6, 7], "B"] = max_val - result = df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - expected = ( - df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # Test nan in some values - # Explicit cast to float to avoid implicit cast when setting nan - base_df = base_df.astype({"B": "float"}) - base_df.loc[[0, 2, 4, 6], "B"] = np.nan - expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) - result = base_df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - expected = ( - base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # GH 15561 - df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) - expected = Series(pd.to_datetime("2001"), index=[0], name="b") - - result = df.groupby("a")["b"].cummax() - tm.assert_series_equal(expected, result) - - # GH 15635 - df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) - result = df.groupby("a").b.cummax() - expected = Series([2, 1, 2], name="b") - tm.assert_series_equal(result, expected) - - -def test_cummax_i8_at_implementation_bound(): - # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT - # for int64 dtype GH#46382 - ser = Series([pd.NaT._value + n for n in range(5)]) - df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")}) - gb = df.groupby("A") - - res = gb.cummax() - exp = df[["B", "C"]] - tm.assert_frame_equal(res, exp) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) -@pytest.mark.parametrize( - "groups,expected_data", - [ - ([1, 1, 1], [1, None, None]), - ([1, 2, 3], [1, None, 2]), - ([1, 3, 3], [1, None, None]), - ], -) -def test_cummin_max_skipna(method, dtype, groups, expected_data): - # GH-34047 - df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) - orig = df.copy() - gb = df.groupby(groups)["a"] - - result = getattr(gb, method)(skipna=False) - expected = Series(expected_data, dtype=dtype, name="a") - - # check we didn't accidentally alter df - tm.assert_frame_equal(df, orig) - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -def test_cummin_max_skipna_multiple_cols(method): - # Ensure missing value in "a" doesn't cause "b" to be nan-filled - df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) - gb = df.groupby([1, 1, 1])[["a", "b"]] - - result = getattr(gb, method)(skipna=False) - expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) - - tm.assert_frame_equal(result, expected) - - -@td.skip_if_32bit -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize( - "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)] -) -def test_nullable_int_not_cast_as_float(method, dtype, val): - data = [val, pd.NA] - df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) - grouped = df.groupby("grp") - - result = grouped.transform(method) - expected = DataFrame({"b": data}, dtype=dtype) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "in_vals, out_vals", - [ - # Basics: strictly increasing (T), strictly decreasing (F), - # abs val increasing (F), non-strictly increasing (T) - ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), - # Test with inf vals - ( - [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], - [True, False, True, False], - ), - # Test with nan vals; should always be False - ( - [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False], - ), - ], -) -def test_is_monotonic_increasing(in_vals, out_vals): - # GH 17015 - source_dict = { - "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], - "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], - "C": in_vals, - } - df = DataFrame(source_dict) - result = df.groupby("B").C.is_monotonic_increasing - index = Index(list("abcd"), name="B") - expected = Series(index=index, data=out_vals, name="C") - tm.assert_series_equal(result, expected) - - # Also check result equal to manually taking x.is_monotonic_increasing. - expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "in_vals, out_vals", - [ - # Basics: strictly decreasing (T), strictly increasing (F), - # abs val decreasing (F), non-strictly increasing (T) - ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), - # Test with inf vals - ( - [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], - [True, True, False, True], - ), - # Test with nan vals; should always be False - ( - [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False], - ), - ], -) -def test_is_monotonic_decreasing(in_vals, out_vals): - # GH 17015 - source_dict = { - "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], - "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], - "C": in_vals, - } - - df = DataFrame(source_dict) - result = df.groupby("B").C.is_monotonic_decreasing - index = Index(list("abcd"), name="B") - expected = Series(index=index, data=out_vals, name="C") - tm.assert_series_equal(result, expected) - - -# describe -# -------------------------------- - - -def test_apply_describe_bug(mframe): - grouped = mframe.groupby(level="first") - grouped.describe() # it works! - - -def test_series_describe_multikey(): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) - tm.assert_series_equal(result["std"], grouped.std(), check_names=False) - tm.assert_series_equal(result["min"], grouped.min(), check_names=False) - - -def test_series_describe_single(): - ts = tm.makeTimeSeries() - grouped = ts.groupby(lambda x: x.month) - result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack(future_stack=True) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) -def test_series_describe_as_index(as_index, keys): - # GH#49256 - df = DataFrame( - { - "key1": ["one", "two", "two", "three", "two"], - "key2": ["one", "two", "two", "three", "two"], - "foo2": [1, 2, 4, 4, 6], - } - ) - gb = df.groupby(keys, as_index=as_index)["foo2"] - result = gb.describe() - expected = DataFrame( - { - "key1": ["one", "three", "two"], - "count": [1.0, 1.0, 3.0], - "mean": [1.0, 4.0, 4.0], - "std": [np.nan, np.nan, 2.0], - "min": [1.0, 4.0, 2.0], - "25%": [1.0, 4.0, 3.0], - "50%": [1.0, 4.0, 4.0], - "75%": [1.0, 4.0, 5.0], - "max": [1.0, 4.0, 6.0], - } - ) - if len(keys) == 2: - expected.insert(1, "key2", expected["key1"]) - if as_index: - expected = expected.set_index(keys) - tm.assert_frame_equal(result, expected) - - def test_series_index_name(df): grouped = df.loc[:, ["C"]].groupby(df["A"]) result = grouped.agg(lambda x: x.mean()) assert result.index.name == "A" -def test_frame_describe_multikey(tsframe): - grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - desc_groups = [] - for col in tsframe: - group = grouped[col].describe() - # GH 17464 - Remove duplicate MultiIndex levels - group_col = MultiIndex( - levels=[[col], group.columns], - codes=[[0] * len(group.columns), range(len(group.columns))], - ) - group = DataFrame(group.values, columns=group_col, index=group.index) - desc_groups.append(group) - expected = pd.concat(desc_groups, axis=1) - tm.assert_frame_equal(result, expected) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) - result = groupedT.describe() - expected = tsframe.describe().T - # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ - expected.index = MultiIndex( - levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))], - ) - tm.assert_frame_equal(result, expected) - - -def test_frame_describe_tupleindex(): - # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame( - { - "x": [1, 2, 3, 4, 5] * 3, - "y": [10, 20, 30, 40, 50] * 3, - "z": [100, 200, 300, 400, 500] * 3, - } - ) - df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={"k": "key"}) - msg = "Names should be list-like for a MultiIndex" - with pytest.raises(ValueError, match=msg): - df1.groupby("k").describe() - with pytest.raises(ValueError, match=msg): - df2.groupby("key").describe() - - -def test_frame_describe_unstacked_format(): - # GH 4792 - prices = { - Timestamp("2011-01-06 10:59:05", tz=None): 24990, - Timestamp("2011-01-06 12:43:33", tz=None): 25499, - Timestamp("2011-01-06 12:54:09", tz=None): 25499, - } - volumes = { - Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, - Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, - Timestamp("2011-01-06 12:54:09", tz=None): 100000000, - } - df = DataFrame({"PRICE": prices, "VOLUME": volumes}) - result = df.groupby("PRICE").VOLUME.describe() - data = [ - df[df.PRICE == 24990].VOLUME.describe().values.tolist(), - df[df.PRICE == 25499].VOLUME.describe().values.tolist(), - ] - expected = DataFrame( - data, - index=Index([24990, 25499], name="PRICE"), - columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.filterwarnings( - "ignore:" - "indexing past lexsort depth may impact performance:" - "pandas.errors.PerformanceWarning" -) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) -def test_describe_with_duplicate_output_column_names(as_index, keys): - # GH 35314 - df = DataFrame( - { - "a1": [99, 99, 99, 88, 88, 88], - "a2": [99, 99, 99, 88, 88, 88], - "b": [1, 2, 3, 4, 5, 6], - "c": [10, 20, 30, 40, 50, 60], - }, - columns=["a1", "a2", "b", "b"], - copy=False, - ) - if keys == ["a1"]: - df = df.drop(columns="a2") - - expected = ( - DataFrame.from_records( - [ - ("b", "count", 3.0, 3.0), - ("b", "mean", 5.0, 2.0), - ("b", "std", 1.0, 1.0), - ("b", "min", 4.0, 1.0), - ("b", "25%", 4.5, 1.5), - ("b", "50%", 5.0, 2.0), - ("b", "75%", 5.5, 2.5), - ("b", "max", 6.0, 3.0), - ("b", "count", 3.0, 3.0), - ("b", "mean", 5.0, 2.0), - ("b", "std", 1.0, 1.0), - ("b", "min", 4.0, 1.0), - ("b", "25%", 4.5, 1.5), - ("b", "50%", 5.0, 2.0), - ("b", "75%", 5.5, 2.5), - ("b", "max", 6.0, 3.0), - ], - ) - .set_index([0, 1]) - .T - ) - expected.columns.names = [None, None] - if len(keys) == 2: - expected.index = MultiIndex( - levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] - ) - else: - expected.index = Index([88, 99], name="a1") - - if not as_index: - expected = expected.reset_index() - - result = df.groupby(keys, as_index=as_index).describe() - - tm.assert_frame_equal(result, expected) - - -def test_describe_duplicate_columns(): - # GH#50806 - df = DataFrame([[0, 1, 2, 3]]) - df.columns = [0, 1, 2, 0] - gb = df.groupby(df[1]) - result = gb.describe(percentiles=[]) - - columns = ["count", "mean", "std", "min", "50%", "max"] - frames = [ - DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) - for val in (0.0, 2.0, 3.0) - ] - expected = pd.concat(frames, axis=1) - expected.columns = MultiIndex( - levels=[[0, 2], columns], - codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], - ) - expected.index.names = [1] - tm.assert_frame_equal(result, expected) - - -def test_groupby_mean_no_overflow(): - # Regression test for (#22487) - df = DataFrame( - { - "user": ["A", "A", "A", "A", "A"], - "connections": [4970, 4749, 4719, 4704, 18446744073699999744], - } - ) - assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 - - @pytest.mark.parametrize( "values", [ @@ -1360,78 +565,6 @@ def test_apply_to_nullable_integer_returns_float(values, function): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("min_count", [0, 10]) -def test_groupby_sum_mincount_boolean(min_count): - b = True - a = False - na = np.nan - dfg = pd.array([b, b, na, na, a, a, b], dtype="boolean") - - df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg}) - result = df.groupby("A").sum(min_count=min_count) - if min_count == 0: - expected = DataFrame( - {"B": pd.array([3, 0, 0], dtype="Int64")}, - index=Index([1, 2, 3], name="A"), - ) - tm.assert_frame_equal(result, expected) - else: - expected = DataFrame( - {"B": pd.array([pd.NA] * 3, dtype="Int64")}, - index=Index([1, 2, 3], name="A"), - ) - tm.assert_frame_equal(result, expected) - - -def test_groupby_sum_below_mincount_nullable_integer(): - # https://github.com/pandas-dev/pandas/issues/32861 - df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") - grouped = df.groupby("a") - idx = Index([0, 1, 2], name="a", dtype="Int64") - - result = grouped["b"].sum(min_count=2) - expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") - tm.assert_series_equal(result, expected) - - result = grouped.sum(min_count=2) - expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx) - tm.assert_frame_equal(result, expected) - - -def test_mean_on_timedelta(): - # GH 17382 - df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5}) - result = df.groupby("cat")["time"].mean() - expected = Series( - pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat") - ) - tm.assert_series_equal(result, expected) - - -def test_groupby_sum_timedelta_with_nat(): - # GH#42659 - df = DataFrame( - { - "a": [1, 1, 2, 2], - "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], - } - ) - td3 = pd.Timedelta(days=3) - - gb = df.groupby("a") - - res = gb.sum() - expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a")) - tm.assert_frame_equal(res, expected) - - res = gb["b"].sum() - tm.assert_series_equal(res, expected["b"]) - - res = gb["b"].sum(min_count=2) - expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) - tm.assert_series_equal(res, expected) - - @pytest.mark.parametrize( "kernel, has_arg", [ @@ -1673,22 +806,6 @@ def test_groupby_empty_dataset(dtype, kwargs): tm.assert_frame_equal(result, expected) -def test_corrwith_with_1_axis(): - # GH 47723 - df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) - gb = df.groupby("a") - - msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.corrwith(df, axis=1) - index = Index( - data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], - name=("a", None), - ) - expected = Series([np.nan] * 6, index=index) - tm.assert_series_equal(result, expected) - - def test_multiindex_group_all_columns_when_empty(groupby_func): # GH 32464 df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"])