diff --git a/PROTO_tests/tests/groupby_test.py b/PROTO_tests/tests/groupby_test.py index 46705d24b6..6b2f4fefdd 100644 --- a/PROTO_tests/tests/groupby_test.py +++ b/PROTO_tests/tests/groupby_test.py @@ -8,6 +8,39 @@ from arkouda import sum as aksum from arkouda.groupbyclass import GroupByReductionType from arkouda.scipy import chisquare as akchisquare +from arkouda.dtypes import npstr + +# block of variables and functions used in test_unique + +UNIQUE_TYPES = [ak.categorical, ak.int64, ak.float64, npstr] +VOWELS_AND_SUCH = ["a", "e", "i", "o", "u", "AB", 47, 2, 3.14159] +PICKS = np.array([f"base {i}" for i in range(10)]) + +isSorted = lambda x: np.all(x[:-1] <= x[1:]) # short for is x[i] <= x[i+1] for all i + +# This function (almost) guarantees both a sorted and unsorted version of +# a 1d array. The only exception is an array of all identical values. +# The first "if" block skips the whole function in that case. Otherwise, +# if the sample is already sorted, a non-sorted permutation is generated, +# and the two are returned. If it isn't, a sorted version is created, +# and those two are returned. + + +def make_sorted_and_unsorted_data(sample): + if np.all(sample == sample[0]): + return sample, sample + if isSorted(sample): + s_a = sample[:] + us_a = np.random.permutation(sample) + while isSorted(us_a): + us_a = np.random.permutation(us_a) + else: + s_a = np.sort(sample) + us_a = sample[:] + return s_a, us_a + + +# end of block def to_tuple_dict(labels, values): @@ -757,6 +790,73 @@ def test_large_mean_aggregation(self): for m in means.to_list(): assert np.isclose(float(a[0]), m) + # ak.unique takes 1 pda argument and 3 booleans + # However, not all 8 combinations of the booleans are needed to + # cover the test space. + # Combinations TTF and TFF are supersets of all other possible + # combinations, so only those are tested below. + + @pytest.mark.parametrize("data_type", UNIQUE_TYPES) + @pytest.mark.parametrize("prob_size", pytest.prob_size) + def test_unique(self, data_type, prob_size): + Jenny = pytest.seed if pytest.seed is not None else 8675309 + T = True + F = False + np.random.seed(Jenny) + arrays = { + npstr: np.random.choice(VOWELS_AND_SUCH, prob_size), + ak.int64: np.random.randint(0, prob_size // 3, prob_size), + ak.float64: np.random.uniform(0, prob_size // 3, prob_size), + ak.categorical: np.random.choice(PICKS, prob_size), + } + nda = arrays[data_type] + np_unique = np.unique(nda) # get unique keys from np for comparison + s_nda, us_nda = make_sorted_and_unsorted_data(nda) + s_pda = ak.array(s_nda) + us_pda = ak.array(us_nda) + + # Categorical requires another step to make the pdarrays categorical + + if data_type == "categorical": + s_pda = ak.Categorical(s_pda) + us_pda = ak.Categorical(us_pda) + + # Call ak.unique with TTF and TFF + + ak_TTF = ak.unique(s_pda, T, T, F) + ak_TFF = ak.unique(us_pda, T, F, F) + + # Check for correct unique keys. + + assert np.all(np_unique == np.sort(ak_TFF[0].to_ndarray())) + assert np.all(np_unique == np.sort(ak_TTF[0].to_ndarray())) + + # Check groups and indices. If data was sorted, the group ndarray + # should just be list(range(len(nda))). + # For unsorted data, a reordered copy of the pdarray is created + # based on the returned permutation. + # In both cases, broadcasting the unique values using the returned + # indices should create the sorted/reordered array. + + # keys should always be returned sorted if data is int64 + + # sorted + + if data_type == ak.int64 : assert isSorted(ak_TFF[0].to_ndarray()) + srange = np.arange(len(nda)) + assert np.all(srange == ak_TTF[1].to_ndarray()) + indices = ak_TTF[2] + assert ak.all(s_pda == ak.broadcast(indices, ak_TTF[0], len(s_nda))) + + # unsorted + + aku = ak.unique(us_pda).to_ndarray() + if data_type == ak.int64 : assert isSorted(aku) + reordering = ak_TFF[1] + reordered = us_pda[reordering] + indices = ak_TFF[2] + assert ak.all(reordered == ak.broadcast(indices, ak_TFF[0], len(us_nda))) + def test_unique_aggregation(self): keys = ak.array([0, 1, 0, 1, 0, 1, 0, 1]) vals = ak.array([4, 3, 5, 3, 5, 2, 6, 2]) diff --git a/arkouda/groupbyclass.py b/arkouda/groupbyclass.py index e9b43fd22d..ef01d431ea 100644 --- a/arkouda/groupbyclass.py +++ b/arkouda/groupbyclass.py @@ -81,11 +81,11 @@ def unique( Input array. return_groups : bool, optional If True, also return grouping information for the array. + assume_sorted : bool, optional + If True, assume pda is sorted and skip sorting step return_indices: bool, optional Only applicable if return_groups is True. If True, return unique key indices along with other groups - assume_sorted : bool, optional - If True, assume pda is sorted and skip sorting step Returns -------