Skip to content

Commit

Permalink
Closes Bears-R-Us#3302 GroupBy.head (Bears-R-Us#3324)
Browse files Browse the repository at this point in the history
Co-authored-by: Amanda Potts <ajpotts@users.noreply.github.com>
  • Loading branch information
ajpotts and ajpotts authored Jun 27, 2024
1 parent e7a863e commit 2bb0045
Show file tree
Hide file tree
Showing 5 changed files with 621 additions and 53 deletions.
73 changes: 72 additions & 1 deletion PROTO_tests/tests/dataframe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1158,7 +1158,7 @@ def test_dropna(self):
if df.to_pandas(retain_index=True).dropna(axis=axis, thresh=thresh).empty:
assert (
df.dropna(axis=axis, thresh=thresh).to_pandas(retain_index=True).empty
== True
is True
)

else:
Expand Down Expand Up @@ -1310,6 +1310,77 @@ def test_sample_flags(self):
print(f"Failure with seed:\n{seed}")
assert res

@pytest.mark.parametrize("size", pytest.prob_size)
def test_head_tail(self, size):
    """DataFrameGroupBy.head/tail return the first/last n rows of each group;
    with sort_index=True the result must match pandas' groupby head/tail."""
    # Hoisted here once; the original imported it separately for head and tail.
    from pandas.testing import assert_frame_equal

    bool_col = ak.full(size, False, dtype=ak.bool)
    bool_col[::2] = True

    df = ak.DataFrame(
        {
            "a": ak.arange(size) % 3,
            "b": ak.arange(size, dtype="int64"),
            "c": ak.arange(size, dtype="float64"),
            "d": ak.random_strings_uniform(size=size, minlen=1, maxlen=2, seed=18),
            "e": bool_col,
        }
    )

    size_range = ak.arange(size)
    # head(sort_index=False) returns rows blocked by group key, so the
    # expected index is the first two positions of each key, concatenated.
    zeros_idx = size_range[df["a"] == 0][0:2]
    ones_idx = size_range[df["a"] == 1][0:2]
    twos_idx = size_range[df["a"] == 2][0:2]
    head_expected_idx = ak.concatenate([zeros_idx, ones_idx, twos_idx])

    def get_head_values(col):
        # Expected values of `col` for the head rows, in group-blocked order.
        zeros_values = df[col][zeros_idx]
        ones_values = df[col][ones_idx]
        twos_values = df[col][twos_idx]
        return ak.concatenate([zeros_values, ones_values, twos_values])

    head_df = df.groupby("a").head(n=2, sort_index=False)
    assert ak.all(head_df.index == head_expected_idx)
    for col in df.columns:
        assert ak.all(head_df[col] == get_head_values(col))

    # With sort_index=True the result should agree with pandas exactly.
    head_df_sorted = df.groupby("a").head(n=2, sort_index=True)
    assert_frame_equal(
        head_df_sorted.to_pandas(retain_index=True),
        df.to_pandas(retain_index=True).groupby("a").head(n=2),
    )

    # Now test tail: the last two positions of each key group.
    tail_zeros_idx = size_range[df["a"] == 0][-2:]
    tail_ones_idx = size_range[df["a"] == 1][-2:]
    tail_twos_idx = size_range[df["a"] == 2][-2:]
    tail_expected_idx = ak.concatenate([tail_zeros_idx, tail_ones_idx, tail_twos_idx])

    def get_tail_values(col):
        # Expected values of `col` for the tail rows, in group-blocked order.
        tail_zeros_values = df[col][tail_zeros_idx]
        tail_ones_values = df[col][tail_ones_idx]
        tail_twos_values = df[col][tail_twos_idx]
        return ak.concatenate(
            [tail_zeros_values, tail_ones_values, tail_twos_values]
        )

    tail_df = df.groupby("a").tail(n=2, sort_index=False)
    assert ak.all(tail_df.index == tail_expected_idx)
    for col in df.columns:
        assert ak.all(tail_df[col] == get_tail_values(col))

    tail_df_sorted = df.groupby("a").tail(n=2, sort_index=True)
    assert_frame_equal(
        tail_df_sorted.to_pandas(retain_index=True),
        df.to_pandas(retain_index=True).groupby("a").tail(n=2),
    )


def pda_to_str_helper(pda):
    """Return an ak string array labeling each element of *pda* as "str <value>"."""
    labels = ["str {}".format(value) for value in pda.to_list()]
    return ak.array(labels)
103 changes: 103 additions & 0 deletions PROTO_tests/tests/groupby_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import pytest

import arkouda as ak
from arkouda import GroupBy, concatenate
from arkouda import sort as aksort
from arkouda import sum as aksum
from arkouda.groupbyclass import GroupByReductionType
from arkouda.scipy import chisquare as akchisquare

Expand Down Expand Up @@ -625,6 +628,106 @@ def test_zero_length_groupby(self):
g = ak.GroupBy(ak.zeros(0, dtype=ak.int64))
str(g.segments) # passing condition, if this was deleted it will cause the test to fail

@pytest.mark.parametrize("dtype", ["bool", "str_", "int64", "float64"])
@pytest.mark.parametrize("size", pytest.prob_size)
def test_head_aggregation(self, size, dtype):
    """GroupBy.head returns the first 2 indices and values per group
    for each supported value dtype."""
    if np.issubdtype(dtype, np.number):
        a = ak.arange(size, dtype=dtype) % 3
    else:
        a = ak.arange(size, dtype=ak.int64) % 3

    # FIX: `dtype` is a string from parametrize; the original compared it to
    # arkouda dtype objects with `is`, which is always False (dead branches).
    # Compare against the parametrized string names instead.
    if dtype == "str_":
        v = ak.random_strings_uniform(size=size, minlen=1, maxlen=2)
    elif dtype == "bool":
        v = ak.full(size, False, dtype=ak.bool)
        v[::2] = True
    else:
        v = ak.arange(size, dtype=dtype)

    # Shuffle keys and values independently so group membership is scattered.
    rng = ak.random.default_rng(17)
    i = ak.arange(size)
    rng.shuffle(i)
    a = a[i]

    rng.shuffle(i)
    v = v[i]

    g = GroupBy(a)

    # First two original positions of each key group.
    size_range = ak.arange(size)
    zeros_idx = size_range[a == 0][0:2]
    ones_idx = size_range[a == 1][0:2]
    twos_idx = size_range[a == 2][0:2]
    expected_idx = concatenate([zeros_idx, ones_idx, twos_idx])

    unique_keys, idx = g.head(v, 2, return_indices=True)
    assert ak.all(unique_keys == ak.array([0, 1, 2]))
    assert ak.all(aksort(idx) == aksort(expected_idx))

    zeros_values = v[a == 0][0:2]
    ones_values = v[a == 1][0:2]
    twos_values = v[a == 2][0:2]
    expected_values = concatenate([zeros_values, ones_values, twos_values])

    unique_keys, values = g.head(v, 2, return_indices=False)
    assert len(values) == len(expected_values)
    assert ak.all(unique_keys == ak.array([0, 1, 2]))
    if dtype == "bool":
        # Bool values are not unique per row, so compare counts of True.
        assert aksum(values) == aksum(expected_values)
    else:
        # Order within groups is unspecified; compare as sets.
        assert set(values.to_list()) == set(expected_values.to_list())

@pytest.mark.parametrize("dtype", ["bool", "str_", "int64", "float64"])
@pytest.mark.parametrize("size", pytest.prob_size)
def test_tail_aggregation(self, size, dtype):
    """GroupBy.tail returns the last 2 indices and values per group
    for each supported value dtype."""
    if np.issubdtype(dtype, np.number):
        a = ak.arange(size, dtype=dtype) % 3
    else:
        a = ak.arange(size, dtype=ak.int64) % 3

    # FIX: `dtype` is a string from parametrize; the original compared it to
    # arkouda dtype objects with `is`, which is always False (dead branches).
    # Compare against the parametrized string names instead.
    if dtype == "str_":
        v = ak.random_strings_uniform(size=size, minlen=1, maxlen=2)
    elif dtype == "bool":
        v = ak.full(size, False, dtype=ak.bool)
        v[::2] = True
    else:
        v = ak.arange(size, dtype=dtype)

    # Shuffle keys and values independently so group membership is scattered.
    rng = ak.random.default_rng(17)
    i = ak.arange(size)
    rng.shuffle(i)
    a = a[i]

    rng.shuffle(i)
    v = v[i]

    g = GroupBy(a)

    # Last two original positions of each key group.
    size_range = ak.arange(size)
    zeros_idx = size_range[a == 0][-2:]
    ones_idx = size_range[a == 1][-2:]
    twos_idx = size_range[a == 2][-2:]
    expected_idx = concatenate([zeros_idx, ones_idx, twos_idx])

    unique_keys, idx = g.tail(v, 2, return_indices=True)
    assert ak.all(unique_keys == ak.array([0, 1, 2]))
    assert ak.all(aksort(idx) == aksort(expected_idx))

    zeros_values = v[a == 0][-2:]
    ones_values = v[a == 1][-2:]
    twos_values = v[a == 2][-2:]
    expected_values = concatenate([zeros_values, ones_values, twos_values])

    unique_keys, values = g.tail(v, 2, return_indices=False)
    assert len(values) == len(expected_values)
    assert ak.all(unique_keys == ak.array([0, 1, 2]))
    if dtype == "bool":
        # Bool values are not unique per row, so compare counts of True.
        assert aksum(values) == aksum(expected_values)
    else:
        # Order within groups is unspecified; compare as sets.
        assert set(values.to_list()) == set(expected_values.to_list())

def test_first_aggregation(self):
keys = ak.array([0, 1, 0, 1, 0, 1])
vals = ak.array([9, 8, 7, 6, 5, 4])
Expand Down
152 changes: 152 additions & 0 deletions arkouda/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from numpy._typing import _8Bit, _16Bit, _32Bit, _64Bit
from typeguard import typechecked

from arkouda import sort as aksort
from arkouda.categorical import Categorical
from arkouda.client import generic_msg, maxTransferBytes
from arkouda.client_dtypes import BitVector, Fields, IPv4
Expand Down Expand Up @@ -206,6 +207,157 @@ def size(self, as_series=None, sort_index=True):
else:
return self._return_agg_dataframe(self.gb.size(), "size", sort_index=sort_index)

def head(
    self,
    n: int = 5,
    sort_index: bool = True,
) -> DataFrame:
    """
    Return the first n rows from each group.

    Parameters
    ----------
    n : int, optional, default = 5
        Maximum number of rows to return for each group.
        If the number of rows in a group is less than n,
        all the values from that group will be returned.
    sort_index : bool, default = True
        If true, return the DataFrame with indices sorted.

    Returns
    -------
    arkouda.dataframe.DataFrame

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda import *
    >>> df = ak.DataFrame({"a": ak.arange(10) % 3, "b": ak.arange(10)})
    >>> df.groupby("a").head(2)

    +----+-----+-----+
    |    |   a |   b |
    +====+=====+=====+
    |  0 |   0 |   0 |
    +----+-----+-----+
    |  1 |   0 |   3 |
    +----+-----+-----+
    |  2 |   1 |   1 |
    +----+-----+-----+
    |  3 |   1 |   4 |
    +----+-----+-----+
    |  4 |   2 |   2 |
    +----+-----+-----+
    |  5 |   2 |   5 |
    +----+-----+-----+
    """
    # Ask the underlying GroupBy for the positions of the first n rows of
    # each group, then select those rows from the original DataFrame.
    _, row_indices = self.gb.head(self.df.index.values, n=n, return_indices=True)
    return self.df[aksort(row_indices) if sort_index else row_indices]

def tail(
    self,
    n: int = 5,
    sort_index: bool = True,
) -> DataFrame:
    """
    Return the last n rows from each group.

    Parameters
    ----------
    n : int, optional, default = 5
        Maximum number of rows to return for each group.
        If the number of rows in a group is less than n,
        all the rows from that group will be returned.
    sort_index : bool, default = True
        If true, return the DataFrame with indices sorted.

    Returns
    -------
    arkouda.dataframe.DataFrame

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda import *
    >>> df = ak.DataFrame({"a": ak.arange(10) % 3, "b": ak.arange(10)})
    >>> df.groupby("a").tail(2)

    +----+-----+-----+
    |    |   a |   b |
    +====+=====+=====+
    |  0 |   0 |   6 |
    +----+-----+-----+
    |  1 |   0 |   9 |
    +----+-----+-----+
    |  2 |   1 |   4 |
    +----+-----+-----+
    |  3 |   1 |   7 |
    +----+-----+-----+
    |  4 |   2 |   5 |
    +----+-----+-----+
    |  5 |   2 |   8 |
    +----+-----+-----+
    """
    # Ask the underlying GroupBy for the positions of the last n rows of
    # each group, then select those rows from the original DataFrame.
    _, row_indices = self.gb.tail(self.df.index.values, n=n, return_indices=True)
    return self.df[aksort(row_indices) if sort_index else row_indices]

def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None):
"""
Return a random sample from each group. You can either specify the number of elements
Expand Down
Loading

0 comments on commit 2bb0045

Please sign in to comment.