JuliaData · bkamins · Jun 19, 2023 · Apr 7, 2023 · Apr 8, 2023 · Apr 8, 2023
diff --git a/src/abstractdataframe/sort.jl b/src/abstractdataframe/sort.jl
@@ -341,10 +341,15 @@ If `rev` is `true`, reverse sorting is performed. To enable reverse sorting only
 for some columns, pass `order(c, rev=true)` in `cols`, with `c` the
 corresponding column index (see example below).
 
+Since duplicate rows make multiple sorting orders valid, if `checkunique` is
+`true` the rows of the sorting columns are checked for uniqueness, and an
+`ArgumentError` is thrown if duplicates are found.
+
 The `by` keyword allows providing a function that will be applied to each
 cell before comparison; the `lt` keyword allows providing a custom "less
 than" function. If both `by` and `lt` are specified, the `lt` function is
-applied to the result of the `by` function.
+applied to the result of the `by` function. Neither `by` nor `lt` can be used
+if `checkunique` is `true`, including inside `order(...)` clauses.
 
 All the keyword arguments can be either a single value, which is applied to
 all columns, or a vector of length equal to the number of columns that the
@@ -407,17 +412,7 @@ function Base.issorted(df::AbstractDataFrame, cols=All();
     if cols isa MultiColumnIndex && !(cols isa AbstractVector)
         cols = index(df)[cols]
     end
-    if checkunique
-        newcols = Int[]
-
-        for col in cols
-            push!(newcols, index(df)[(_getcol(col))])
-        end
-        if !allunique(df, newcols)
-            throw(ArgumentError("Non-unique elements found. Multiple orders " *
-                                "are valid"))
-        end
-    end
+    checkunique && _perform_uniqueness_checks(df, cols, lt, by)
     if cols isa ColumnIndex
         return issorted(df[!, cols], lt=to_scalar(lt), by=to_scalar(by),
                         rev=to_scalar(rev), order=to_scalar(order))
@@ -603,17 +598,7 @@ function Base.sortperm(df::AbstractDataFrame, cols=All();
     end
     ord = ordering(df, cols, lt, by, rev, order)
     _alg = Sort.defalg(df, ord; alg=alg, cols=cols)
-    if checkunique
-        newcols = Int[]
-
-        for col in cols
-            push!(newcols, index(df)[(_getcol(col))])
-        end
-        if !allunique(df, newcols)
-            throw(ArgumentError("Non-unique elements found. Multiple orders " *
-                                "are valid"))
-        end
-    end
+    checkunique && _perform_uniqueness_checks(df, cols, lt, by)
     return _sortperm(df, _alg, ord)
 end
 
@@ -720,17 +705,7 @@ function Base.sort!(df::AbstractDataFrame, cols=All();
     end
     ord = ordering(df, cols, lt, by, rev, order)
     _alg = Sort.defalg(df, ord; alg=alg, cols=cols)
-    if checkunique
-        newcols = Int[]
-
-        for col in cols
-            push!(newcols, index(df)[(_getcol(col))])
-        end
-        if !allunique(df, newcols)
-            throw(ArgumentError("Non-unique elements found. Multiple orders " *
-                                "are valid"))
-        end
-    end
+    checkunique && _perform_uniqueness_checks(df, cols, lt, by)
     return sort!(df, _alg, ord)
 end
 
@@ -744,3 +719,54 @@ function Base.sort!(df::AbstractDataFrame, a::Base.Sort.Algorithm,
     end
     permute!(df, _sortperm(df, a, o))
 end
+
+# Return `true` if `col` is an `order(...)` clause whose `by` or `lt` keyword is set
+# to a non-default value. Plain column selectors carry no such keywords.
+_has_custom_by_or_lt(col) = false
+function _has_custom_by_or_lt(col::UserColOrdering)
+    return get(col.kwargs, :by, identity) != identity ||
+           get(col.kwargs, :lt, isless) != isless
+end
+
+# Internal function backing the `checkunique` keyword of the sorting functions.
+#
+# Throws an `ArgumentError` if `lt` or `by` is set to a non-default value (either
+# directly or inside an `order(...)` clause in `cols`), or if the rows of `df`,
+# restricted to the columns selected by `cols`, are not all unique.
+# Returns `nothing` otherwise.
+function _perform_uniqueness_checks(df, cols, lt, by)
+    if !(lt == isless && by == identity)
+        throw(ArgumentError("Passing either lt or by along with checkunique=" *
+                            "true is not supported."))
+    end
+    # Easiest case, cols contains column indices only (no order clauses)
+    if cols isa AbstractVector{<:ColumnIndex}
+        by_or_lt_set = false
+        col_idxs = cols
+    # Second easiest, single column or multicol index (no vector with order clauses)
+    elseif (cols isa MultiColumnIndex && !(cols isa AbstractVector)) ||
+           cols isa ColumnIndex
+        by_or_lt_set = false
+        col_idxs = index(df)[cols]
+    # Single order clause
+    elseif cols isa UserColOrdering
+        by_or_lt_set = _has_custom_by_or_lt(cols)
+        col_idxs = index(df)[_getcol(cols)]
+    # Column indices with order clauses mixed in. Every clause must be inspected:
+    # a custom by/lt in any position (not only the last one) is unsupported.
+    elseif cols isa AbstractVector
+        by_or_lt_set = any(_has_custom_by_or_lt, cols)
+        col_idxs = Int[index(df)[_getcol(col)] for col in cols]
+    else
+        throw(ArgumentError("Unsupported column selector passed along with " *
+                            "checkunique=true"))
+    end
+    if by_or_lt_set
+        throw(ArgumentError("Order clauses with either by or lt set in combination " *
+                            "with checkunique=true are not supported"))
+    end
+    if !allunique(df, col_idxs)
+        throw(ArgumentError("Non-unique elements found. Multiple orders are valid"))
+    end
+    return nothing
+end
diff --git a/test/sort.jl b/test/sort.jl
@@ -6,18 +6,17 @@ using DataFrames, Random, Test, CategoricalArrays
     dv1 = [9, 1, 8, missing, 3, 3, 7, missing]
     dv2 = [9, 1, 8, missing, 3, 3, 7, missing]
     dv3 = Vector{Union{Int, Missing}}(1:8)
+    dv4 = 8:-1:1
     cv1 = CategoricalArray(dv1, ordered=true)
 
-    d = DataFrame(dv1=dv1, dv2=dv2, dv3=dv3, cv1=cv1)
+    d = DataFrame(dv1=dv1, dv2=dv2, dv3=dv3, dv4=dv4, cv1=cv1)
 
     @test sort(DataFrame()) == DataFrame()
     @test sort!(DataFrame()) == DataFrame()
     @test isempty(sortperm(DataFrame()))
     @test issorted(DataFrame())
     @test sortperm(d) == sortperm(dv1)
     @test sortperm(d[:, [:dv3, :dv1]]) == sortperm(dv3)
-    @test_throws ArgumentError sortperm(d, :cv1, checkunique=true)
-    @test_throws ArgumentError sortperm(d, [:cv1, :dv1], checkunique=true)
     @test sort(d, :dv1)[!, :dv3] == sort(d, "dv1")[!, "dv3"] == sortperm(dv1)
     @test sort(d, :dv2)[!, :dv3] == sortperm(dv1)
     @test sort(d, :cv1)[!, :dv3] == sortperm(dv1)
@@ -32,7 +31,6 @@ using DataFrames, Random, Test, CategoricalArrays
     @test issorted(sort(df, rev=true), rev=true)
     @test issorted(sort(df, [:chrom, :pos])[:, [:chrom, :pos]])
     @test issorted(sort(df, ["chrom", "pos"])[:, ["chrom", "pos"]])
-    @test_throws ArgumentError issorted(sort(df), :rank, checkunique=true)
 
     ds = sort(df, [order(:rank, rev=true), :chrom, :pos])
     @test issorted(ds, [order(:rank, rev=true), :chrom, :pos])
@@ -145,6 +143,71 @@ using DataFrames, Random, Test, CategoricalArrays
     end
 end
 
+@testset "correctness of checkunique keyword" begin
+    dv1 = [9, 1, 8, missing, 3, 3, 7, missing]
+    dv2 = [9, 1, 8, missing, 3, 3, 7, missing]
+    dv3 = Vector{Union{Int, Missing}}(1:8)
+    dv4 = 8:-1:1
+    cv1 = CategoricalArray(dv1, ordered=true)
+
+    d = DataFrame(dv1=dv1, dv2=dv2, dv3=dv3, dv4=dv4, cv1=cv1)
+
+    # Each function is exercised with every kind of selector, in this order:
+    # Symbol, String, Vector{ColumnIndex}, order clause, Vector mixing both.
+    # dv1 and dv2 contain duplicates; dv3 (ascending) and dv4 (descending) do not.
+
+    # issorted
+    @test_throws ArgumentError issorted(d, :dv1, checkunique=true)
+    @test issorted(d, :dv3, checkunique=true)
+    @test issorted(d, "dv3", checkunique=true)
+    @test issorted(d, ["dv3", "dv4"], checkunique=true)
+    @test issorted(d, :dv4, rev = true, checkunique=true)
+    @test issorted(d, order(:dv4, rev=true), checkunique=true)
+    @test_throws ArgumentError issorted(d, order(:dv4, by=x -> -x), checkunique=true)
+    @test_throws ArgumentError issorted(d, order(:dv4, lt= >), checkunique=true)
+    @test issorted(d, [:dv3, order(:dv4, rev=true)], checkunique=true)
+    @test issorted(d, [:dv3, :dv4], rev = [false, true], checkunique=true)
+    @test_throws ArgumentError issorted(d, :dv3, by=x-> -x, checkunique=true)
+    @test_throws ArgumentError issorted(d, :dv3, lt = >, checkunique=true)
+    @test issorted(d, [order(:dv3, rev=false), order(:dv4, rev=true)], checkunique=true)
+    @test issorted(d, [order(:dv3, by=identity), order(:dv4, rev=true)], checkunique=true)
+
+    # sort
+    @test sort(d, :dv4, checkunique=true).dv4 == 1:8
+    @test sort(d, [:dv1, :dv3], checkunique=true).dv3 == sortperm(dv1)
+    # by/lt are tested on a unique column so the error cannot stem from duplicates
+    @test_throws ArgumentError sort(d, :dv1, checkunique=true)
+    @test_throws ArgumentError sort(d, "dv1", checkunique=true)
+    @test_throws ArgumentError sort(d, 1, checkunique=true)
+    @test_throws ArgumentError sort(d, [:dv1, :dv2], checkunique=true)
+    @test_throws ArgumentError sort(d, ["dv1", "dv2"], checkunique=true)
+    @test_throws ArgumentError sort(d, order(:dv1, rev=true), checkunique=true)
+    @test_throws ArgumentError sort(d, order(:dv4, by=x -> -x), checkunique=true)
+    @test_throws ArgumentError sort(d, order(:dv4, lt= >), checkunique=true)
+    @test_throws ArgumentError sort(d, [:dv2, order(:dv1, rev=true)], checkunique=true)
+    @test_throws ArgumentError sort(d, [:dv1, :dv2], rev = [false, false], checkunique=true)
+    @test_throws ArgumentError sort(d, :dv4, by = x -> -x, checkunique=true)
+    @test_throws ArgumentError sort(d, :dv4, lt = >, checkunique=true)
+
+    # sortperm
+    @test sortperm(d, :dv4, checkunique=true) == 8:-1:1
+    @test sortperm(d, order(:dv4, rev=true), checkunique=true) == 1:8
+    @test sortperm(d, [:dv1, :dv3], checkunique=true) == sortperm(dv1)
+    # by/lt are tested on a unique column so the error cannot stem from duplicates
+    @test_throws ArgumentError sortperm(d, :dv1, checkunique=true)
+    @test_throws ArgumentError sortperm(d, "dv1", checkunique=true)
+    @test_throws ArgumentError sortperm(d, 1, checkunique=true)
+    @test_throws ArgumentError sortperm(d, [:dv1, :dv2], checkunique=true)
+    @test_throws ArgumentError sortperm(d, ["dv1", "dv2"], checkunique=true)
+    @test_throws ArgumentError sortperm(d, order(:dv1, rev=true), checkunique=true)
+    @test_throws ArgumentError sortperm(d, order(:dv4, by=x -> -x), checkunique=true)
+    @test_throws ArgumentError sortperm(d, order(:dv4, lt= >), checkunique=true)
+    @test_throws ArgumentError sortperm(d, [:dv2, order(:dv1, rev=true)], checkunique=true)
+    @test_throws ArgumentError sortperm(d, [:dv1, :dv2], rev = [false, false], checkunique=true)
+    @test_throws ArgumentError sortperm(d, :dv4, by = x -> -x, checkunique=true)
+    @test_throws ArgumentError sortperm(d, :dv4, lt = >, checkunique=true)
+end
+
 @testset "non standard selectors" begin
     Random.seed!(1234)
     df = DataFrame(rand(1:2, 1000, 4), :auto)