3231 unique unit tests (#3258)

* Adds PROTOs unit test for ak.unique * Adds PROTOs unit test for ak.unique * 3231 addresses feedback, splits 1 assert into 2 * addresses feedback re asserts * 3231 incorporates all feedback. * 3231 updates comments. * adds test that keys are always sorted for int64 --------- Co-authored-by: drculhane <drculhane@users.noreply.github.com> Co-authored-by: ajpotts <amanda.j.potts@gmail.com>
Bears-R-Us · Jun 27, 2024 · 714d587 · 714d587
1 parent dd93f86
commit 714d587
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 2 deletions.
diff --git a/PROTO_tests/tests/groupby_test.py b/PROTO_tests/tests/groupby_test.py
@@ -8,6 +8,39 @@
 from arkouda import sum as aksum
 from arkouda.groupbyclass import GroupByReductionType
 from arkouda.scipy import chisquare as akchisquare
+from arkouda.dtypes import npstr
+
+#  block of variables and functions used in test_unique
+
+UNIQUE_TYPES = [ak.categorical, ak.int64, ak.float64, npstr]
+VOWELS_AND_SUCH = ["a", "e", "i", "o", "u", "AB", 47, 2, 3.14159]
+PICKS = np.array([f"base {i}" for i in range(10)])
+
+isSorted = lambda x: np.all(x[:-1] <= x[1:])  # short for is x[i] <= x[i+1] for all i
+
+#  This function (almost) guarantees both a sorted and unsorted version of
+#  a 1d array.  The only exception is an array of all identical values.
+#  The first "if" block skips the whole function in that case.  Otherwise,
+#  if the sample is already sorted, a non-sorted permutation is generated,
+#  and the two are returned.  If it isn't, a sorted version is created,
+#  and those two are returned.
+
+
+def make_sorted_and_unsorted_data(sample):
+    if np.all(sample == sample[0]):
+        return sample, sample
+    if isSorted(sample):
+        s_a = sample[:]
+        us_a = np.random.permutation(sample)
+        while isSorted(us_a):
+            us_a = np.random.permutation(us_a)
+    else:
+        s_a = np.sort(sample)
+        us_a = sample[:]
+    return s_a, us_a
+
+
+#  end of block
 
 
 def to_tuple_dict(labels, values):
@@ -757,6 +790,73 @@ def test_large_mean_aggregation(self):
         for m in means.to_list():
             assert np.isclose(float(a[0]), m)
 
+    #   ak.unique takes 1 pda argument and 3 booleans
+    #      However, not all 8 combinations of the booleans are needed to
+    #      cover the test space.
+    #      Combinations TTF and TFF are supersets of all other possible
+    #      combinations, so only those are tested below.
+
+    @pytest.mark.parametrize("data_type", UNIQUE_TYPES)
+    @pytest.mark.parametrize("prob_size", pytest.prob_size)
+    def test_unique(self, data_type, prob_size):
+        Jenny = pytest.seed if pytest.seed is not None else 8675309
+        T = True
+        F = False
+        np.random.seed(Jenny)
+        arrays = {
+            npstr: np.random.choice(VOWELS_AND_SUCH, prob_size),
+            ak.int64: np.random.randint(0, prob_size // 3, prob_size),
+            ak.float64: np.random.uniform(0, prob_size // 3, prob_size),
+            ak.categorical: np.random.choice(PICKS, prob_size),
+        }
+        nda = arrays[data_type]
+        np_unique = np.unique(nda)  # get unique keys from np for comparison
+        s_nda, us_nda = make_sorted_and_unsorted_data(nda)
+        s_pda = ak.array(s_nda)
+        us_pda = ak.array(us_nda)
+
+        # Categorical requires another step to make the pdarrays categorical
+
+        if data_type == "categorical":
+            s_pda = ak.Categorical(s_pda)
+            us_pda = ak.Categorical(us_pda)
+
+        # Call ak.unique with TTF and TFF
+
+        ak_TTF = ak.unique(s_pda, T, T, F)
+        ak_TFF = ak.unique(us_pda, T, F, F)
+
+        # Check for correct unique keys.
+
+        assert np.all(np_unique == np.sort(ak_TFF[0].to_ndarray()))
+        assert np.all(np_unique == np.sort(ak_TTF[0].to_ndarray()))
+
+        # Check groups and indices.  If data was sorted, the group ndarray
+        # should just be list(range(len(nda))).  
+        # For unsorted data, a reordered copy of the pdarray is created
+        # based on the returned permutation.
+        # In both cases, broadcasting the unique values using the returned
+        # indices should create the sorted/reordered array.
+
+        # keys should always be returned sorted if data is int64
+
+        # sorted
+
+        if data_type == ak.int64 : assert isSorted(ak_TFF[0].to_ndarray())
+        srange = np.arange(len(nda))
+        assert np.all(srange == ak_TTF[1].to_ndarray())
+        indices = ak_TTF[2]
+        assert ak.all(s_pda == ak.broadcast(indices, ak_TTF[0], len(s_nda)))
+
+        # unsorted
+
+        aku = ak.unique(us_pda).to_ndarray()
+        if data_type == ak.int64 : assert isSorted(aku)
+        reordering = ak_TFF[1]
+        reordered = us_pda[reordering]
+        indices = ak_TFF[2]
+        assert ak.all(reordered == ak.broadcast(indices, ak_TFF[0], len(us_nda)))
+
     def test_unique_aggregation(self):
         keys = ak.array([0, 1, 0, 1, 0, 1, 0, 1])
         vals = ak.array([4, 3, 5, 3, 5, 2, 6, 2])

diff --git a/arkouda/groupbyclass.py b/arkouda/groupbyclass.py
@@ -81,11 +81,11 @@ def unique(
         Input array.
     return_groups : bool, optional
         If True, also return grouping information for the array.
+    assume_sorted : bool, optional
+        If True, assume pda is sorted and skip sorting step
     return_indices: bool, optional
         Only applicable if return_groups is True.
         If True, return unique key indices along with other groups
-    assume_sorted : bool, optional
-        If True, assume pda is sorted and skip sorting step
 
     Returns
     -------