diff --git a/NEWS.md b/NEWS.md index ddaab8785..0c6c5bbfb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,10 @@ treated as if they were wrapped in `Cols` and does not throw an error when a vector of duplicate indices is passed when doing column selection ([#3302](https://github.com/JuliaData/DataFrames.jl/pull/3302)) +* Added the kwarg `checkunique` to sorting related functions (`issorted`, + `sort`, `sort!` and `sortperm`) that throws an error when duplicate elements + make multiple sort orders valid + ([#3312](https://github.com/JuliaData/DataFrames.jl/pull/3312)) * `reduce` performing `vcat` on a collection of data frames now accepts `init` keyword argument ([#3310](https://github.com/JuliaData/DataFrames.jl/pull/3310)) @@ -28,6 +32,7 @@ from a data frame is its column or might alias with some of its columns ([#3304](https://github.com/JuliaData/DataFrames.jl/pull/3304)) + # DataFrames.jl v1.5 Release Notes ## New functionalities diff --git a/src/abstractdataframe/sort.jl b/src/abstractdataframe/sort.jl index fb9931386..5a6c676cc 100755 --- a/src/abstractdataframe/sort.jl +++ b/src/abstractdataframe/sort.jl @@ -14,6 +14,9 @@ # which allows a user to specify column specific orderings # with "order(column, rev=true, ...)" +import SortingAlgorithms.DataStructures.FasterForward, + SortingAlgorithms.DataStructures.FasterReverse + struct UserColOrdering{T<:ColumnIndex} col::T kwargs @@ -341,15 +344,24 @@ If `rev` is `true`, reverse sorting is performed. To enable reverse sorting only for some columns, pass `order(c, rev=true)` in `cols`, with `c` the corresponding column index (see example below). +Since having repeated elements makes multiple sorting orders valid, the +`checkunique` keyword allows for the situation to be caught. If `checkunique` is +`true` and duplicate elements are found an error will be thrown. The use of the +`checkunique` keyword is only supported when neither the `by` nor the `lt` +keywords are being used. Similarly, the use of `order(...)` clauses that specify +either `by` or `lt` are not supported, but specifying `rev` by itself is +allowed. + The `by` keyword allows providing a function that will be applied to each cell before comparison; the `lt` keyword allows providing a custom "less than" function. If both `by` and `lt` are specified, the `lt` function is applied to the result of the `by` function. -All the keyword arguments can be either a single value, which is applied to -all columns, or a vector of length equal to the number of columns that the -operation is performed on. In such a case each entry is used for the -column in the corresponding position in `cols`. +Keyword arguments specifying sorting order (`rev`, `lt` or `by`) can either be +a single value, or a vector of length equal to the number of columns the +operation is performed on. When a single value is passed, it applies to all +columns. When a vector is passed, each entry applies to the column in the +corresponding position in `cols`. """ """ @@ -357,7 +369,8 @@ column in the corresponding position in `cols`. lt::Union{Function, AbstractVector{<:Function}}=isless, by::Union{Function, AbstractVector{<:Function}}=identity, rev::Union{Bool, AbstractVector{Bool}}=false, - order::Union{Ordering, AbstractVector{<:Ordering}}=Forward) + order::Union{Ordering, AbstractVector{<:Ordering}}=Forward, + checkunique::Bool=false) Test whether data frame `df` sorted by column(s) `cols`. Checking against multiple columns is done lexicographically. @@ -397,7 +410,8 @@ function Base.issorted(df::AbstractDataFrame, cols=All(); lt::Union{Function, AbstractVector{<:Function}}=isless, by::Union{Function, AbstractVector{<:Function}}=identity, rev::Union{Bool, AbstractVector{Bool}}=false, - order::Union{Ordering, AbstractVector{<:Ordering}}=Forward) + order::Union{Ordering, AbstractVector{<:Ordering}}=Forward, + checkunique::Bool=false) to_scalar(x::AbstractVector) = only(x) to_scalar(x::Any) = x @@ -405,6 +419,7 @@ function Base.issorted(df::AbstractDataFrame, cols=All(); if cols isa MultiColumnIndex && !(cols isa AbstractVector) cols = index(df)[cols] end + checkunique && _perform_uniqueness_checks(df, cols, lt, by, order) if cols isa ColumnIndex return issorted(df[!, cols], lt=to_scalar(lt), by=to_scalar(by), rev=to_scalar(rev), order=to_scalar(order)) @@ -427,7 +442,8 @@ Base.issorted(::AbstractDataFrame, ::Base.Order.Ordering) = by::Union{Function, AbstractVector{<:Function}}=identity, rev::Union{Bool, AbstractVector{Bool}}=false, order::Union{Ordering, AbstractVector{<:Ordering}}=Forward, - view::Bool=false) + view::Bool=false, + checkunique::Bool=false) Return a data frame containing the rows in `df` sorted by column(s) `cols`. Sorting on multiple columns is done lexicographically. @@ -506,8 +522,10 @@ julia> sort(df, [:x, order(:y, rev=true)]) by::Union{Function, AbstractVector{<:Function}}=identity, rev::Union{Bool, AbstractVector{Bool}}=false, order::Union{Ordering, AbstractVector{<:Ordering}}=Forward, - view::Bool=false) - rowidxs = sortperm(df, cols, alg=alg, lt=lt, by=by, rev=rev, order=order) + view::Bool=false, + checkunique::Bool=false) + rowidxs = sortperm(df, cols, alg=alg, lt=lt, by=by, rev=rev, order=order, + checkunique=checkunique) return view ? Base.view(df, rowidxs, :) : df[rowidxs, :] end @@ -517,7 +535,8 @@ end lt::Union{Function, AbstractVector{<:Function}}=isless, by::Union{Function, AbstractVector{<:Function}}=identity, rev::Union{Bool, AbstractVector{Bool}}=false, - order::Union{Ordering, AbstractVector{<:Ordering}}=Forward) + order::Union{Ordering, AbstractVector{<:Ordering}}=Forward, + checkunique::Bool=false) Return a permutation vector of row indices of data frame `df` that puts them in sorted order according to column(s) `cols`. @@ -579,13 +598,15 @@ function Base.sortperm(df::AbstractDataFrame, cols=All(); lt::Union{Function, AbstractVector{<:Function}}=isless, by::Union{Function, AbstractVector{<:Function}}=identity, rev::Union{Bool, AbstractVector{Bool}}=false, - order::Union{Ordering, AbstractVector{<:Ordering}}=Forward) + order::Union{Ordering, AbstractVector{<:Ordering}}=Forward, + checkunique::Bool=false) # exclude AbstractVector as in that case cols can contain order(...) clauses if cols isa MultiColumnIndex && !(cols isa AbstractVector) cols = index(df)[cols] end ord = ordering(df, cols, lt, by, rev, order) _alg = Sort.defalg(df, ord; alg=alg, cols=cols) + checkunique && _perform_uniqueness_checks(df, cols, lt, by, order) return _sortperm(df, _alg, ord) end @@ -601,7 +622,8 @@ _sortperm(df::AbstractDataFrame, a::Algorithm, o::Ordering) = lt::Union{Function, AbstractVector{<:Function}}=isless, by::Union{Function, AbstractVector{<:Function}}=identity, rev::Union{Bool, AbstractVector{Bool}}=false, - order::Union{Ordering, AbstractVector{<:Ordering}}=Forward) + order::Union{Ordering, AbstractVector{<:Ordering}}=Forward, + checkunique::Bool=false) Sort data frame `df` by column(s) `cols` by permuting its rows in-place. Sorting on multiple columns is done lexicographicallly. @@ -682,7 +704,8 @@ function Base.sort!(df::AbstractDataFrame, cols=All(); lt::Union{Function, AbstractVector{<:Function}}=isless, by::Union{Function, AbstractVector{<:Function}}=identity, rev::Union{Bool, AbstractVector{Bool}}=false, - order::Union{Ordering, AbstractVector{<:Ordering}}=Forward) + order::Union{Ordering, AbstractVector{<:Ordering}}=Forward, + checkunique::Bool=false) # exclude AbstractVector as in that case cols can contain order(...) clauses if cols isa MultiColumnIndex && !(cols isa AbstractVector) @@ -690,8 +713,112 @@ function Base.sort!(df::AbstractDataFrame, cols=All(); end ord = ordering(df, cols, lt, by, rev, order) _alg = Sort.defalg(df, ord; alg=alg, cols=cols) + checkunique && _perform_uniqueness_checks(df, cols, lt, by, order) return sort!(df, _alg, ord) end -Base.sort!(df::AbstractDataFrame, a::Base.Sort.Algorithm, o::Base.Sort.Ordering) = +# this sort! method does not support uniqueness checks since they can't be carried out +# without knowledge of which columns are to be sorted. +function Base.sort!(df::AbstractDataFrame, a::Base.Sort.Algorithm, + o::Base.Sort.Ordering) permute!(df, _sortperm(df, a, o)) +end + +# Functions to verify whether an Ordering has user-defined `by` or `lt` functions +# The complexity checks exploit `Base.Order`'s way of constructing `Order` objects. +# `DirectOrdering` is by definition an order that uses `isless` and `identity` as its +# `lt` and `by` functions, so they are not complex. The `By` and `Lt` types have +# attributes storing the defined `by` and `lt` functions respectively, simple identity +# checks are enough. `ReverseOrdering`s wrap another type of ordering, so we +# perform the check on the wrapped type. +is_complex(o::Union{DirectOrdering, FasterForward, FasterReverse}) = false +is_complex(o::By) = o.by !== identity +is_complex(o::Lt) = o.lt !== isless +is_complex(o::ReverseOrdering) = is_complex(o.fwd) +is_complex(o::Perm) = is_complex(o.order) + +function is_complex(o::DFPerm) + if o.ord isa Ordering + return is_complex(o.ord) + elseif o.ord isa Tuple + return any(is_complex(ordering) for ordering in o.ord) + end + throw(ArgumentError("unsupported ord type")) +end + +function is_complex(o::Ordering) + throw(ArgumentError("The use of the keyword `checkunique` is currently " * + "not supported with Ordering type $(typeof(o))")) +end + +function is_complex(o::UserColOrdering) + has_lt = haskey(o.kwargs, :lt) + has_by = haskey(o.kwargs, :by) + if !has_lt && !has_by + return false + elseif has_lt && !has_by + return o.kwargs[:lt] !== isless + elseif has_by && !has_lt + return o.kwargs[:by] !== identity + else + @assert has_lt && has_by + return o.kwargs[:by] !== identity || o.kwargs[:lt] !== isless + end +end + +# Internal function that aids in uniqueness checks +# Converts column selectors to indices and checks necessary conditions for uniqueness +function _perform_uniqueness_checks(df::AbstractDataFrame, cols, + lt::Union{Function, AbstractVector{<:Function}}, + by::Union{Function, AbstractVector{<:Function}}, + order::Union{Ordering, AbstractVector{<:Ordering}}) + + if !(lt === isless && by === identity) + throw(ArgumentError("Passing either lt or by along with checkunique=" * + "true is not supported.")) + end + + # Validating the order argument + if order isa Ordering + order = [order] + end + for o in order + if is_complex(o) + throw(ArgumentError("Using either lt or by functions through the " * + "order keyword argument simultaneously with " * + "checkunique=true is not supported.")) + end + end + + # Easiest case, cols contains column indexes already + if cols isa AbstractVector{<:ColumnIndex} + by_or_lt_set = false + col_idxs = cols + # Second easiest, multicol index (no vector with orders clauses mixed in) + elseif (cols isa MultiColumnIndex && !(cols isa AbstractVector)) || cols isa ColumnIndex + by_or_lt_set = false + col_idxs = index(df)[cols] + elseif cols isa UserColOrdering + by_or_lt_set = is_complex(cols) + col_idxs = [index(df)[_getcol(cols)]] + # Mix of ColOrdering and other ColumnSelectors + else + @assert cols isa AbstractVector + newcols = Int[] + by_or_lt_set = false + for col in cols + if col isa UserColOrdering + by_or_lt_set = is_complex(col) || by_or_lt_set + end + + push!(newcols, index(df)[_getcol(col)]) + end + col_idxs = newcols + end + if by_or_lt_set + throw(ArgumentError("Order clauses with either by or lt set in combination " * + "with checkunique=true are not supported")) + end + allunique(df, col_idxs) || + throw(ArgumentError("Non-unique elements found. Multiple orders are valid.")) +end \ No newline at end of file diff --git a/test/sort.jl b/test/sort.jl index 36ff18170..b81e262e1 100644 --- a/test/sort.jl +++ b/test/sort.jl @@ -1,14 +1,15 @@ module TestSort -using DataFrames, Random, Test, CategoricalArrays, InlineStrings +using DataFrames, Random, Test, CategoricalArrays, SortingAlgorithms, InlineStrings @testset "standard tests" begin dv1 = [9, 1, 8, missing, 3, 3, 7, missing] dv2 = [9, 1, 8, missing, 3, 3, 7, missing] dv3 = Vector{Union{Int, Missing}}(1:8) + dv4 = 8:-1:1 cv1 = CategoricalArray(dv1, ordered=true) - d = DataFrame(dv1=dv1, dv2=dv2, dv3=dv3, cv1=cv1) + d = DataFrame(dv1=dv1, dv2=dv2, dv3=dv3, dv4=dv4, cv1=cv1) @test sort(DataFrame()) == DataFrame() @test sort!(DataFrame()) == DataFrame() @@ -142,6 +143,133 @@ using DataFrames, Random, Test, CategoricalArrays, InlineStrings end end +@testset "correctness of checkunique keyword" begin + dv1 = [9, 1, 8, missing, 3, 3, 7, missing] + dv2 = [9, 1, 8, missing, 3, 3, 7, missing] + dv3 = Vector{Union{Int, Missing}}(1:8) + dv4 = 8:-1:1 + cv1 = CategoricalArray(dv1, ordered=true) + + d = DataFrame(dv1=dv1, dv2=dv2, dv3=dv3, dv4=dv4, cv1=cv1) + + # Complex Orderings: ord(lt, by, rev, order) + orderings = Dict( + :cmplex => Base.Order.ord(isless, x -> -x, false), # Complex ∴ disallowed + :simple => Base.Order.ord(isless, identity, true), # Simple ∴ allowed + # Creating Order objects with their constructors directly + :by_simple => Base.Order.By(identity), # Simple ∴ allowed + :by_cmplex => Base.Order.By(x -> x), # Disallowed since x -> x !== identity + :lt_simple => Base.Order.Lt(isless), # Simple ∴ allowed + :lt_cmplex => Base.Order.Lt(<), # Disallowed since isless !== < + # A test for robustness wrt complex orderings + :sneaky_lt => Base.Order.ord(<, identity, true), # Disallowed + # Tests with other subtypes of Ordering + :perm_simple => Base.Order.Perm(Base.Order.ForwardOrdering(), dv3), + :perm_cmplex => Base.Order.Perm( + Base.Order.ord(isless, x -> -x, false), dv3 + ), + :fforward => SortingAlgorithms.DataStructures.FasterForward(), # Simple ∴ allowed + :freverse => SortingAlgorithms.DataStructures.FasterReverse() # Simple ∴ allowed + ) + + ords = [ + order(:dv4, lt = isless, by = identity, rev=true), + order(:dv4, lt = >), + order(:dv4, by = x -> -x), + order(:dv4, lt = isless, by = x -> -x), + ] + + # To test uncovered subtypes of Ordering throw error + struct ArbitraryOrderSubtype <: Base.Sort.Ordering end + + ## logic: + ### Test each every selector in the following order: + ### Symbol, String, Vect{ColumnIndex}, Order, Vect{ColIndex, Order} + + # issorted + # Possible selectors as cols + @test_throws ArgumentError issorted(d, :dv1, checkunique=true) + @test issorted(d, :dv3, checkunique=true) + @test issorted(d, "dv3", checkunique=true) + @test issorted(d, ["dv3", "dv4"], checkunique=true) + @test issorted(d, :dv4, rev = true, checkunique=true) + @test issorted(d, order(:dv4, rev=true), checkunique=true) + # Non-standard lt or by functions + @test_throws ArgumentError issorted(d, order(:dv4, by=x -> -x), checkunique=true) + @test_throws ArgumentError issorted(d, order(:dv4, lt= >), checkunique=true) + @test_throws ArgumentError issorted(d, :dv3, by=x-> -x, checkunique=true) + @test_throws ArgumentError issorted(d, :dv3, lt = >, checkunique=true) + # Mixed-in order clauses + @test issorted(d, [:dv3, order(:dv4, rev=true)], checkunique=true) + @test issorted(d, [:dv3, :dv4], rev = [false, true], checkunique=true) + @test issorted(d, [order(:dv3, rev=false), order(:dv4, rev=true)], checkunique=true) + @test issorted(d, [order(:dv3, by=identity), order(:dv4, rev=true)], checkunique=true) + @test_throws ArgumentError issorted(d, [order(:dv1, by = round), order(:dv2, rev=true)], checkunique=true) + # Orderings defined via Base.ord & Base.Order subtypes + @test issorted(d, :dv4, order=Base.Reverse, checkunique=true) + @test issorted(d, :dv4, order=orderings[:simple], checkunique=true) + @test_throws ArgumentError issorted(d, :dv3, order=orderings[:cmplex], checkunique=true) + @test issorted(d, :dv3, order=orderings[:by_simple], checkunique=true) + @test_throws ArgumentError issorted(d, :dv3, order=orderings[:by_cmplex], checkunique=true) + @test issorted(d, :dv3, order=orderings[:lt_simple], checkunique=true) + @test_throws ArgumentError issorted(d, :dv3, order=orderings[:lt_cmplex], checkunique=true) + @test_throws ArgumentError issorted(d, :dv3, order=orderings[:sneaky_lt], checkunique=true) + @test_throws ArgumentError issorted(d, :dv3, order=orderings[:sneaky_lt], checkunique=true) + @test issorted(d, :dv3, order=orderings[:fforward], checkunique=true) + @test issorted(d, :dv4, order=orderings[:freverse], checkunique=true) + @test issorted(d, :dv3, order=orderings[:perm_simple], checkunique=true) + @test_throws ArgumentError issorted(d, :dv3, order=orderings[:perm_cmplex], checkunique=true) + # UserColOrderings + @test issorted(d, ords[1], checkunique=true) + @test_throws ArgumentError issorted(d, ords[2], checkunique=true) + + # Checking correct complex order detection + @test DataFrames.is_complex.(ords) == [false, true, true, true] + @test DataFrames.is_complex(DataFrames.DFPerm([orderings[:cmplex], orderings[:by_cmplex]], d[!, [:dv3, :dv4]])) == true + @test DataFrames.is_complex(DataFrames.DFPerm(orderings[:cmplex], d[!, [:dv3]])) == true + # Error on unsupported Ordering subtypes + @test_throws ArgumentError DataFrames.is_complex(ArbitraryOrderSubtype()) + + # sort + @test_throws ArgumentError sort(d, :dv1, checkunique=true) + @test_throws ArgumentError sort(d, "dv1", checkunique=true) + @test_throws ArgumentError sort(d, 1, checkunique=true) + @test_throws ArgumentError sort(d, [:dv1, :dv2], checkunique=true) + @test_throws ArgumentError sort(d, ["dv1", "dv2"], checkunique=true) + @test_throws ArgumentError sort(d, order(:dv1, rev=true), checkunique=true) + @test_throws ArgumentError sort(d, order(:dv1, by=x -> -x), checkunique=true) + @test_throws ArgumentError sort(d, order(:dv1, lt= >), checkunique=true) + @test_throws ArgumentError sort(d, [:dv2, order(:dv1, rev=true)], checkunique=true) + @test_throws ArgumentError sort(d, [:dv1, :dv2], rev = [false, false], checkunique=true) + @test_throws ArgumentError sort(d, :dv1, by = x -> -x, checkunique=true) + @test_throws ArgumentError sort(d, :dv1, lt = >, checkunique=true) + + # sort! + @test_throws ArgumentError sort!(d, :dv1, checkunique=true) + @test_throws ArgumentError sort!(d[!, [:dv1, :dv2]], checkunique = true) + # Orderings defined via Base.ord + @test_throws ArgumentError sort!(d, :dv3, order=orderings[:cmplex], checkunique=true) + @test_throws ArgumentError sort!(d, :dv3, order=orderings[:by_cmplex], checkunique=true) + @test_throws ArgumentError sort!(d, :dv3, order=orderings[:lt_cmplex], checkunique=true) + @test_throws ArgumentError sort!(d, :dv3, order=orderings[:sneaky_lt], checkunique=true) + # UserColOrderings + @test_throws ArgumentError sort!(d, ords[2], checkunique=true) + + # sortperm + @test_throws ArgumentError sortperm(d, :dv1, checkunique=true) + @test_throws ArgumentError sortperm(d, "dv1", checkunique=true) + @test_throws ArgumentError sortperm(d, 1, checkunique=true) + @test_throws ArgumentError sortperm(d, [:dv1, :dv2], checkunique=true) + @test_throws ArgumentError sortperm(d, ["dv1", "dv2"], checkunique=true) + @test_throws ArgumentError sortperm(d, order(:dv1, rev=true), checkunique=true) + @test_throws ArgumentError sortperm(d, order(:dv1, by=x -> -x), checkunique=true) + @test_throws ArgumentError sortperm(d, order(:dv1, lt= >), checkunique=true) + @test_throws ArgumentError sortperm(d, [:dv2, order(:dv1, rev=true)], checkunique=true) + @test_throws ArgumentError sortperm(d, [:dv1, :dv2], rev = [false, false], checkunique=true) + @test_throws ArgumentError sortperm(d, :dv1, by = x -> -x, checkunique=true) + @test_throws ArgumentError sortperm(d, :dv1, lt = >, checkunique=true) +end + @testset "non standard selectors" begin Random.seed!(1234) df = DataFrame(rand(1:2, 1000, 4), :auto)