Skip to content

Commit

Permalink
Closes Bears-R-Us#3302 GroupBy.head (Bears-R-Us#3324)
Browse files Browse the repository at this point in the history
Co-authored-by: Amanda Potts <ajpotts@users.noreply.github.com>
  • Loading branch information
ajpotts and ajpotts authored Jun 27, 2024
1 parent e7a863e commit 2bb0045
Show file tree
Hide file tree
Showing 5 changed files with 621 additions and 53 deletions.
73 changes: 72 additions & 1 deletion PROTO_tests/tests/dataframe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1158,7 +1158,7 @@ def test_dropna(self):
if df.to_pandas(retain_index=True).dropna(axis=axis, thresh=thresh).empty:
assert (
df.dropna(axis=axis, thresh=thresh).to_pandas(retain_index=True).empty
== True
is True
)

else:
Expand Down Expand Up @@ -1310,6 +1310,77 @@ def test_sample_flags(self):
print(f"Failure with seed:\n{seed}")
assert res

@pytest.mark.parametrize("size", pytest.prob_size)
def test_head_tail(self, size):
    """DataFrameGroupBy.head/tail return the first/last n rows of each group;
    with sort_index=True the result must match pandas' groupby head/tail."""
    # Hoisted here once; the original imported it separately for head and tail.
    from pandas.testing import assert_frame_equal

    bool_col = ak.full(size, False, dtype=ak.bool)
    bool_col[::2] = True

    df = ak.DataFrame(
        {
            "a": ak.arange(size) % 3,
            "b": ak.arange(size, dtype="int64"),
            "c": ak.arange(size, dtype="float64"),
            "d": ak.random_strings_uniform(size=size, minlen=1, maxlen=2, seed=18),
            "e": bool_col,
        }
    )

    size_range = ak.arange(size)
    # head(sort_index=False) returns rows blocked by group key, so the
    # expected index is the first two positions of each key, concatenated.
    zeros_idx = size_range[df["a"] == 0][0:2]
    ones_idx = size_range[df["a"] == 1][0:2]
    twos_idx = size_range[df["a"] == 2][0:2]
    head_expected_idx = ak.concatenate([zeros_idx, ones_idx, twos_idx])

    def get_head_values(col):
        # Expected values of `col` for the head rows, in group-blocked order.
        zeros_values = df[col][zeros_idx]
        ones_values = df[col][ones_idx]
        twos_values = df[col][twos_idx]
        return ak.concatenate([zeros_values, ones_values, twos_values])

    head_df = df.groupby("a").head(n=2, sort_index=False)
    assert ak.all(head_df.index == head_expected_idx)
    for col in df.columns:
        assert ak.all(head_df[col] == get_head_values(col))

    # With sort_index=True the result should agree with pandas exactly.
    head_df_sorted = df.groupby("a").head(n=2, sort_index=True)
    assert_frame_equal(
        head_df_sorted.to_pandas(retain_index=True),
        df.to_pandas(retain_index=True).groupby("a").head(n=2),
    )

    # Now test tail: the last two positions of each key group.
    tail_zeros_idx = size_range[df["a"] == 0][-2:]
    tail_ones_idx = size_range[df["a"] == 1][-2:]
    tail_twos_idx = size_range[df["a"] == 2][-2:]
    tail_expected_idx = ak.concatenate([tail_zeros_idx, tail_ones_idx, tail_twos_idx])

    def get_tail_values(col):
        # Expected values of `col` for the tail rows, in group-blocked order.
        tail_zeros_values = df[col][tail_zeros_idx]
        tail_ones_values = df[col][tail_ones_idx]
        tail_twos_values = df[col][tail_twos_idx]
        return ak.concatenate(
            [tail_zeros_values, tail_ones_values, tail_twos_values]
        )

    tail_df = df.groupby("a").tail(n=2, sort_index=False)
    assert ak.all(tail_df.index == tail_expected_idx)
    for col in df.columns:
        assert ak.all(tail_df[col] == get_tail_values(col))

    tail_df_sorted = df.groupby("a").tail(n=2, sort_index=True)
    assert_frame_equal(
        tail_df_sorted.to_pandas(retain_index=True),
        df.to_pandas(retain_index=True).groupby("a").tail(n=2),
    )


def pda_to_str_helper(pda):
    """Return an ak string array labeling each element of *pda* as "str <value>"."""
    labels = ["str {}".format(value) for value in pda.to_list()]
    return ak.array(labels)
103 changes: 103 additions & 0 deletions PROTO_tests/tests/groupby_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import pytest

import arkouda as ak
from arkouda import GroupBy, concatenate
from arkouda import sort as aksort
from arkouda import sum as aksum
from arkouda.groupbyclass import GroupByReductionType
from arkouda.scipy import chisquare as akchisquare

Expand Down Expand Up @@ -625,6 +628,106 @@ def test_zero_length_groupby(self):
g = ak.GroupBy(ak.zeros(0, dtype=ak.int64))
str(g.segments) # passing condition, if this was deleted it will cause the test to fail

@pytest.mark.parametrize("dtype", ["bool", "str_", "int64", "float64"])
@pytest.mark.parametrize("size", pytest.prob_size)
def test_head_aggregation(self, size, dtype):
    """GroupBy.head returns the first 2 indices and values per group
    for each supported value dtype."""
    if np.issubdtype(dtype, np.number):
        a = ak.arange(size, dtype=dtype) % 3
    else:
        a = ak.arange(size, dtype=ak.int64) % 3

    # FIX: `dtype` is a string from parametrize; the original compared it to
    # arkouda dtype objects with `is`, which is always False (dead branches).
    # Compare against the parametrized string names instead.
    if dtype == "str_":
        v = ak.random_strings_uniform(size=size, minlen=1, maxlen=2)
    elif dtype == "bool":
        v = ak.full(size, False, dtype=ak.bool)
        v[::2] = True
    else:
        v = ak.arange(size, dtype=dtype)

    # Shuffle keys and values independently so group membership is scattered.
    rng = ak.random.default_rng(17)
    i = ak.arange(size)
    rng.shuffle(i)
    a = a[i]

    rng.shuffle(i)
    v = v[i]

    g = GroupBy(a)

    # First two original positions of each key group.
    size_range = ak.arange(size)
    zeros_idx = size_range[a == 0][0:2]
    ones_idx = size_range[a == 1][0:2]
    twos_idx = size_range[a == 2][0:2]
    expected_idx = concatenate([zeros_idx, ones_idx, twos_idx])

    unique_keys, idx = g.head(v, 2, return_indices=True)
    assert ak.all(unique_keys == ak.array([0, 1, 2]))
    assert ak.all(aksort(idx) == aksort(expected_idx))

    zeros_values = v[a == 0][0:2]
    ones_values = v[a == 1][0:2]
    twos_values = v[a == 2][0:2]
    expected_values = concatenate([zeros_values, ones_values, twos_values])

    unique_keys, values = g.head(v, 2, return_indices=False)
    assert len(values) == len(expected_values)
    assert ak.all(unique_keys == ak.array([0, 1, 2]))
    if dtype == "bool":
        # Bool values are not unique per row, so compare counts of True.
        assert aksum(values) == aksum(expected_values)
    else:
        # Order within groups is unspecified; compare as sets.
        assert set(values.to_list()) == set(expected_values.to_list())

@pytest.mark.parametrize("dtype", ["bool", "str_", "int64", "float64"])
@pytest.mark.parametrize("size", pytest.prob_size)
def test_tail_aggregation(self, size, dtype):
    """GroupBy.tail returns the last 2 indices and values per group
    for each supported value dtype."""
    if np.issubdtype(dtype, np.number):
        a = ak.arange(size, dtype=dtype) % 3
    else:
        a = ak.arange(size, dtype=ak.int64) % 3

    # FIX: `dtype` is a string from parametrize; the original compared it to
    # arkouda dtype objects with `is`, which is always False (dead branches).
    # Compare against the parametrized string names instead.
    if dtype == "str_":
        v = ak.random_strings_uniform(size=size, minlen=1, maxlen=2)
    elif dtype == "bool":
        v = ak.full(size, False, dtype=ak.bool)
        v[::2] = True
    else:
        v = ak.arange(size, dtype=dtype)

    # Shuffle keys and values independently so group membership is scattered.
    rng = ak.random.default_rng(17)
    i = ak.arange(size)
    rng.shuffle(i)
    a = a[i]

    rng.shuffle(i)
    v = v[i]

    g = GroupBy(a)

    # Last two original positions of each key group.
    size_range = ak.arange(size)
    zeros_idx = size_range[a == 0][-2:]
    ones_idx = size_range[a == 1][-2:]
    twos_idx = size_range[a == 2][-2:]
    expected_idx = concatenate([zeros_idx, ones_idx, twos_idx])

    unique_keys, idx = g.tail(v, 2, return_indices=True)
    assert ak.all(unique_keys == ak.array([0, 1, 2]))
    assert ak.all(aksort(idx) == aksort(expected_idx))

    zeros_values = v[a == 0][-2:]
    ones_values = v[a == 1][-2:]
    twos_values = v[a == 2][-2:]
    expected_values = concatenate([zeros_values, ones_values, twos_values])

    unique_keys, values = g.tail(v, 2, return_indices=False)
    assert len(values) == len(expected_values)
    assert ak.all(unique_keys == ak.array([0, 1, 2]))
    if dtype == "bool":
        # Bool values are not unique per row, so compare counts of True.
        assert aksum(values) == aksum(expected_values)
    else:
        # Order within groups is unspecified; compare as sets.
        assert set(values.to_list()) == set(expected_values.to_list())

def test_first_aggregation(self):
keys = ak.array([0, 1, 0, 1, 0, 1])
vals = ak.array([9, 8, 7, 6, 5, 4])
Expand Down
152 changes: 152 additions & 0 deletions arkouda/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from numpy._typing import _8Bit, _16Bit, _32Bit, _64Bit
from typeguard import typechecked

from arkouda import sort as aksort
from arkouda.categorical import Categorical
from arkouda.client import generic_msg, maxTransferBytes
from arkouda.client_dtypes import BitVector, Fields, IPv4
Expand Down Expand Up @@ -206,6 +207,157 @@ def size(self, as_series=None, sort_index=True):
else:
return self._return_agg_dataframe(self.gb.size(), "size", sort_index=sort_index)

def head(
    self,
    n: int = 5,
    sort_index: bool = True,
) -> DataFrame:
    """
    Return the first n rows from each group.

    Parameters
    ----------
    n : int, optional, default = 5
        Maximum number of rows to return for each group.
        If the number of rows in a group is less than n,
        all the values from that group will be returned.
    sort_index : bool, default = True
        If true, return the DataFrame with indices sorted.

    Returns
    -------
    arkouda.dataframe.DataFrame

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda import *
    >>> df = ak.DataFrame({"a": ak.arange(10) % 3, "b": ak.arange(10)})
    >>> df.groupby("a").head(2)

    +----+-----+-----+
    |    |   a |   b |
    +====+=====+=====+
    |  0 |   0 |   0 |
    +----+-----+-----+
    |  1 |   0 |   3 |
    +----+-----+-----+
    |  2 |   1 |   1 |
    +----+-----+-----+
    |  3 |   1 |   4 |
    +----+-----+-----+
    |  4 |   2 |   2 |
    +----+-----+-----+
    |  5 |   2 |   5 |
    +----+-----+-----+
    """
    # Ask the underlying GroupBy for the positions of the first n rows of
    # each group, then select those rows from the original DataFrame.
    _, row_indices = self.gb.head(self.df.index.values, n=n, return_indices=True)
    return self.df[aksort(row_indices) if sort_index else row_indices]

def tail(
    self,
    n: int = 5,
    sort_index: bool = True,
) -> DataFrame:
    """
    Return the last n rows from each group.

    Parameters
    ----------
    n : int, optional, default = 5
        Maximum number of rows to return for each group.
        If the number of rows in a group is less than n,
        all the rows from that group will be returned.
    sort_index : bool, default = True
        If true, return the DataFrame with indices sorted.

    Returns
    -------
    arkouda.dataframe.DataFrame

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda import *
    >>> df = ak.DataFrame({"a": ak.arange(10) % 3, "b": ak.arange(10)})
    >>> df.groupby("a").tail(2)

    +----+-----+-----+
    |    |   a |   b |
    +====+=====+=====+
    |  0 |   0 |   6 |
    +----+-----+-----+
    |  1 |   0 |   9 |
    +----+-----+-----+
    |  2 |   1 |   4 |
    +----+-----+-----+
    |  3 |   1 |   7 |
    +----+-----+-----+
    |  4 |   2 |   5 |
    +----+-----+-----+
    |  5 |   2 |   8 |
    +----+-----+-----+
    """
    # Ask the underlying GroupBy for the positions of the last n rows of
    # each group, then select those rows from the original DataFrame.
    _, row_indices = self.gb.tail(self.df.index.values, n=n, return_indices=True)
    return self.df[aksort(row_indices) if sort_index else row_indices]

def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None):
"""
Return a random sample from each group. You can either specify the number of elements
Expand Down
Loading

0 comments on commit 2bb0045

Please sign in to comment.