JuliaData · bkamins · Dec 27, 2022 · Dec 19, 2022 · Dec 19, 2022 · Dec 19, 2022
diff --git a/NEWS.md b/NEWS.md
@@ -10,6 +10,8 @@
 * Add support for `operator` keyword argument in `Cols`
   to take a set operation to apply to passed selectors (`union` by default)
   ([3224](https://github.com/JuliaData/DataFrames.jl/pull/3224))
+* Improve support for setting group order in `groupby`
+  ([#3253](https://github.com/JuliaData/DataFrames.jl/pull/3253))
 * Joining functions now support `order` keyword argument allowing the user
   to specify the order of the rows in the produced table
   ([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233))

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
@@ -1276,3 +1276,71 @@ two aspects:
   are exceptions to the standard operation specification syntax rules. They
   were added for user convenience.
 
+## Specifying group order in `groupby`
+
+By default the order of groups produced by `groupby` is undefined.
+If you want the groups to follow the order in which the grouping keys first
+appear in the source data frame, pass the `sort=false` keyword argument
+to `groupby`:
+
+```jldoctest sac
+julia> push!(df, ["a", 100, 100]) # push row with large integer values to disable default sorting
+7×3 DataFrame
+ Row │ customer_id  transaction_id  volume 
+     │ String       Int64           Int64  
+─────┼─────────────────────────────────────
+   1 │ a                        12       2
+   2 │ b                        15       3
+   3 │ b                        19       1
+   4 │ b                        17       4
+   5 │ c                        13       5
+   6 │ c                        11       9
+   7 │ a                       100     100
+
+julia> keys(groupby(df, :volume, sort=false))
+7-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
+ GroupKey: (volume = 2,)
+ GroupKey: (volume = 3,)
+ GroupKey: (volume = 1,)
+ GroupKey: (volume = 4,)
+ GroupKey: (volume = 5,)
+ GroupKey: (volume = 9,)
+ GroupKey: (volume = 100,)
+```
+
+If you want to have them sorted in ascending order, pass `sort=true`:
+
+```jldoctest sac
+julia> keys(groupby(df, :volume, sort=true))
+7-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
+ GroupKey: (volume = 1,)
+ GroupKey: (volume = 2,)
+ GroupKey: (volume = 3,)
+ GroupKey: (volume = 4,)
+ GroupKey: (volume = 5,)
+ GroupKey: (volume = 9,)
+ GroupKey: (volume = 100,)
+```
+
+You can also wrap a column name passed to `groupby` in [`order`](@ref), or pass
+a named tuple as the `sort` keyword argument containing one or more of the
+`alg`, `lt`, `by`, `rev`, and `order` fields, which are treated just like in
+[`sortperm`](@ref):
+
+```jldoctest sac
+julia> keys(groupby(df, [:customer_id, order(:volume, rev=true)]))
+7-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
+ GroupKey: (customer_id = "a", volume = 100)
+ GroupKey: (customer_id = "a", volume = 2)
+ GroupKey: (customer_id = "b", volume = 4)
+ GroupKey: (customer_id = "b", volume = 3)
+ GroupKey: (customer_id = "b", volume = 1)
+ GroupKey: (customer_id = "c", volume = 9)
+ GroupKey: (customer_id = "c", volume = 5)
+
+julia> keys(groupby(df, :customer_id, sort=(rev=true,)))
+3-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
+ GroupKey: (customer_id = "c",)
+ GroupKey: (customer_id = "b",)
+ GroupKey: (customer_id = "a",)
+```
+
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -139,6 +139,8 @@ include("subdataframe/subdataframe.jl")
 include("dataframerow/dataframerow.jl")
 include("dataframe/insertion.jl")
 
+include("abstractdataframe/sort.jl")
+
 include("groupeddataframe/groupeddataframe.jl")
 include("groupeddataframe/utils.jl")
 
@@ -165,8 +167,6 @@ include("groupeddataframe/show.jl")
 include("dataframerow/show.jl")
 include("abstractdataframe/io.jl")
 
-include("abstractdataframe/sort.jl")
-
 include("other/tables.jl")
 include("other/names.jl")
 include("other/metadata.jl")

diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
@@ -49,7 +49,7 @@ end
 
 """
     groupby(d::AbstractDataFrame, cols;
-            sort::Union{Bool, Nothing}=nothing,
+            sort::Union{Bool, Nothing, NamedTuple}=nothing,
             skipmissing::Bool=false)
 
 Return a `GroupedDataFrame` representing a view of an `AbstractDataFrame` split
@@ -58,25 +58,31 @@ into row groups.
 # Arguments
 - `df` : an `AbstractDataFrame` to split
 - `cols` : data frame columns to group by. Can be any column selector
-  ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
-- `sort` : if `sort=true` sort groups according to the values of the grouping columns
-  `cols`; if `sort=false` groups are created in their order of appearance in `df`
-  if `sort=nothing` (the default) then the fastest available grouping algorithm
-  is picked and in consequence the order of groups in the result is undefined
-  and may change in future releases; below a description of the current
-  implementation is provided.
+  ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). In particular if the selector
+  picks no columns then a single-group `GroupedDataFrame` is created. As a
+  special case, if `cols` is a single column or a vector of columns then
+  it can contain columns wrapped in [`order`](@ref) that will be used to
+  determine the order of groups if `sort` is `true` or a `NamedTuple` (if `sort`
+  is `false`, then passing `order` is an error; if `sort` is `nothing`
+  then it is set to `true` when `order` is passed).
+- `sort` : if `sort=true` sort groups according to the values of the grouping
+  columns `cols`; if `sort=false` groups are created in their order of
+  appearance in `df`; if `sort=nothing` (the default) then the fastest available
+  grouping algorithm is picked and in consequence the order of groups in the
+  result is undefined and may change in future releases; below a description of
+  the current implementation is provided. Additionally `sort` can be a
+  `NamedTuple` having some or all of `alg`, `lt`, `by`, `rev`, and `order`
+  fields. In this case the groups are sorted and their order follows the
+  [`sortperm`](@ref) order.
 - `skipmissing` : whether to skip groups with `missing` values in one of the
   grouping columns `cols`
 
 # Details
+
 An iterator over a `GroupedDataFrame` returns a `SubDataFrame` view
 for each grouping into `df`.
 Within each group, the order of rows in `df` is preserved.
 
-`cols` can be any valid data frame indexing expression.
-In particular if it is an empty vector then a single-group `GroupedDataFrame`
-is created.
-
 A `GroupedDataFrame` also supports indexing by groups, `select`, `transform`,
 and `combine` (which applies a function to each group and combines the result
 into a data frame).
@@ -104,7 +110,8 @@ and none of them is equal to `-0.0`.
 
 # See also
 
-[`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref)
+[`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref),
+[`transform!`](@ref)
 
 # Examples
 ```jldoctest
@@ -209,9 +216,29 @@ julia> for g in gd
 ```
 """
 function groupby(df::AbstractDataFrame, cols;
-                 sort::Union{Bool,Nothing}=nothing, skipmissing::Bool=false)
+                 sort::Union{Bool, Nothing, NamedTuple}=nothing,
+                 skipmissing::Bool=false)
     _check_consistency(df)
-    idxcols = index(df)[cols]
+    if cols isa UserColOrdering ||
+       (cols isa AbstractVector && any(x -> x isa UserColOrdering, cols))
+        if isnothing(sort) || sort === true
+            # if sort === true replace it with NamedTuple to avoid sorting
+            # in row_group_slots as we will perform sorting later
+            sort = NamedTuple()
+        elseif sort === false
+            throw(ArgumentError("passing `order` is only allowed if `sort` " *
+                                "is `true`, `nothing`, or a `NamedTuple`"))
+        end
+        gcols = if cols isa UserColOrdering
+                    cols.col
+                else
+                    Any[x isa UserColOrdering ? x.col : x for x in cols]
+                end
+    else
+        gcols = cols
+    end
+
+    idxcols = index(df)[gcols]
     if isempty(idxcols)
         return GroupedDataFrame(df, Symbol[], ones(Int, nrow(df)),
                                 nothing, nothing, nothing, nrow(df) == 0 ? 0 : 1,
@@ -222,17 +249,19 @@ function groupby(df::AbstractDataFrame, cols;
     groups = Vector{Int}(undef, nrow(df))
     ngroups, rhashes, gslots, sorted =
         row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false),
-                        groups, skipmissing, sort)
+                        groups, skipmissing, sort isa NamedTuple ? nothing : sort)
 
-    gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing, ngroups, nothing,
-                          Threads.ReentrantLock())
+    gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing,
+                          ngroups, nothing, Threads.ReentrantLock())
 
     # sort groups if row_group_slots hasn't already done that
-    if sort === true && !sorted
+    if (sort === true && !sorted) || (sort isa NamedTuple)
         # Find index of representative row for each group
         idx = Vector{Int}(undef, length(gd))
         fillfirst!(nothing, idx, 1:nrow(parent(gd)), gd)
-        group_invperm = invperm(sortperm(view(parent(gd)[!, gd.cols], idx, :)))
+        sort_kwargs = sort isa NamedTuple ? sort : NamedTuple()
+        group_invperm = invperm(sortperm(view(parent(gd), idx, :),
+                                         cols; sort_kwargs...))
         groups = gd.groups
         @inbounds for i in eachindex(groups)
             gix = groups[i]

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -4378,4 +4378,132 @@ end
     @test_throws ArgumentError combine(gdf, :a => (x -> [Dict('x' => 1)]) => AsTable)
 end
 
+@testset "sorting API" begin
+    # simple tests: string keys; plain and `sort=false` calls give first-appearance order here
+    df = DataFrame(x=["b", "c", "b", "a", "c"])
+    @test getindex.(keys(groupby(df, :x)), 1) == ["b", "c", "a"]
+    @test getindex.(keys(groupby(df, :x, sort=true)), 1) == ["a", "b", "c"]
+    @test getindex.(keys(groupby(df, :x, sort=NamedTuple())), 1) == ["a", "b", "c"]
+    @test getindex.(keys(groupby(df, :x, sort=false)), 1) == ["b", "c", "a"]
+    @test getindex.(keys(groupby(df, order(:x))), 1) == ["a", "b", "c"]
+    @test getindex.(keys(groupby(df, order(:x), sort=true)), 1) == ["a", "b", "c"]
+    @test_throws ArgumentError groupby(df, order(:x), sort=false)
+    @test getindex.(keys(groupby(df, order(:x), sort=NamedTuple())), 1) == ["a", "b", "c"]
+    @test getindex.(keys(groupby(df, [order(:x)])), 1) == ["a", "b", "c"]
+    @test getindex.(keys(groupby(df, [order(:x)], sort=true)), 1) == ["a", "b", "c"]
+    @test_throws ArgumentError groupby(df, [order(:x)], sort=false)
+    @test getindex.(keys(groupby(df, [order(:x)], sort=NamedTuple())), 1) == ["a", "b", "c"]
+    @test getindex.(keys(groupby(df, order(:x, rev=true))), 1) == ["c", "b", "a"]
+    @test getindex.(keys(groupby(df, order(:x, rev=true), sort=true)), 1) == ["c", "b", "a"]
+    @test getindex.(keys(groupby(df, order(:x, rev=true), sort=NamedTuple())), 1) == ["c", "b", "a"]
+    @test getindex.(keys(groupby(df, [order(:x, rev=true)])), 1) == ["c", "b", "a"]
+    @test getindex.(keys(groupby(df, [order(:x, rev=true)], sort=true)), 1) == ["c", "b", "a"]
+    @test getindex.(keys(groupby(df, [order(:x, rev=true)], sort=NamedTuple())), 1) == ["c", "b", "a"]
+    @test getindex.(keys(groupby(df, :x, sort=(;rev=true))), 1) == ["c", "b", "a"]
+    @test getindex.(keys(groupby(df, [:x], sort=(;rev=true))), 1) == ["c", "b", "a"]
+
+    # integer keys with a wide range (1 to 100): by default sorting is not applied
+    df = DataFrame(x=[2, 100, 2, 1, 100])
+    @test getindex.(keys(groupby(df, :x)), 1) == [2, 100, 1]
+    @test getindex.(keys(groupby(df, :x, sort=true)), 1) == [1, 2, 100]
+    @test getindex.(keys(groupby(df, :x, sort=NamedTuple())), 1) == [1, 2, 100]
+    @test getindex.(keys(groupby(df, :x, sort=false)), 1) == [2, 100, 1]
+    @test getindex.(keys(groupby(df, order(:x))), 1) == [1, 2, 100]
+    @test getindex.(keys(groupby(df, [order(:x)])), 1) == [1, 2, 100]
+    @test getindex.(keys(groupby(df, order(:x, rev=true))), 1) == [100, 2, 1]
+    @test getindex.(keys(groupby(df, [order(:x, rev=true)])), 1) == [100, 2, 1]
+    @test getindex.(keys(groupby(df, :x, sort=(;rev=true))), 1) == [100, 2, 1]
+    @test getindex.(keys(groupby(df, [:x], sort=(;rev=true))), 1) == [100, 2, 1]
+
+    # integer keys with a narrow range (1 to 3): by default sorting is applied
+    df = DataFrame(x=[2, 3, 2, 1, 3])
+    @test getindex.(keys(groupby(df, :x)), 1) == [1, 2, 3]
+    @test getindex.(keys(groupby(df, :x, sort=true)), 1) == [1, 2, 3]
+    @test getindex.(keys(groupby(df, :x, sort=NamedTuple())), 1) == [1, 2, 3]
+    @test getindex.(keys(groupby(df, :x, sort=false)), 1) == [2, 3, 1]
+    @test getindex.(keys(groupby(df, order(:x))), 1) == [1, 2, 3]
+    @test getindex.(keys(groupby(df, [order(:x)])), 1) == [1, 2, 3]
+    @test getindex.(keys(groupby(df, order(:x, rev=true))), 1) == [3, 2, 1]
+    @test getindex.(keys(groupby(df, [order(:x, rev=true)])), 1) == [3, 2, 1]
+    @test getindex.(keys(groupby(df, :x, sort=(;rev=true))), 1) == [3, 2, 1]
+    @test getindex.(keys(groupby(df, [:x], sort=(;rev=true))), 1) == [3, 2, 1]
+
+    # randomized tests on Int columns (df1) and on their padded string versions (df2)
+    Random.seed!(1234)
+    df1 = DataFrame(a=rand(-10:10, 100), b=rand(-10:10, 100), c=1:100)
+    df2 = string.(df1, pad=3)
+
+    for df in (df1, df2)
+        for col in (:a, "a", 1, :b, "b", 2, :c, "c", 3) 
+            gdf = groupby(df, order(col))
+            @test issorted(DataFrame(gdf)[:, col])
+            @test all(x -> issorted(x.c), gdf)  # c encodes the source row number
+            gdf = groupby(df, col, sort=true)
+            @test issorted(DataFrame(gdf)[:, col])
+            @test all(x -> issorted(x.c), gdf)
+            gdf = groupby(df, order(col), sort=true)
+            @test issorted(DataFrame(gdf)[:, col])
+            @test all(x -> issorted(x.c), gdf)
+            gdf = groupby(df, col, sort=NamedTuple())
+            @test issorted(DataFrame(gdf)[:, col])
+            @test all(x -> issorted(x.c), gdf)
+            gdf = groupby(df, order(col), sort=NamedTuple())
+            @test issorted(DataFrame(gdf)[:, col])
+            @test all(x -> issorted(x.c), gdf)
+            gdf = groupby(df, col, sort=(rev=true,))
+            @test issorted(DataFrame(gdf)[:, col], rev=true)
+            @test all(x -> issorted(x.c), gdf)
+            if eltype(df[!, col]) === Int  # Int columns of df1; otherwise strings of df2
+                gdf = groupby(df, order(col, by=abs), sort=(rev=true,))
+                @test issorted(DataFrame(gdf)[:, col], rev=true, by=abs)
+            else
+                gdf = groupby(df, order(col, by=abs∘(x -> parse(Int, x))), sort=(rev=true,))
+                @test issorted(DataFrame(gdf)[:, col], rev=true, by=abs∘(x -> parse(Int, x)))
+            end
+            @test all(x -> issorted(x.c), gdf)
+            gdf = groupby(df, col, sort=false)
+            @test getindex.(keys(gdf), 1) == unique(df[!, col])
+            @test all(x -> issorted(x.c), gdf)
+        end
+
+        gdf = groupby(df, [:a, :b], sort=true)
+        @test issorted(DataFrame(gdf), [:a, :b])
+        @test all(x -> issorted(x.c), gdf)
+        gdf = groupby(df, [:a, order(:b)])
+        @test issorted(DataFrame(gdf), [:a, :b])
+        @test all(x -> issorted(x.c), gdf)
+        gdf = groupby(df, [:a, order(:b)], sort=true)
+        @test issorted(DataFrame(gdf), [:a, :b])
+        @test all(x -> issorted(x.c), gdf)
+        gdf = groupby(df, [:a, :b], sort=NamedTuple())
+        @test issorted(DataFrame(gdf), [:a, :b])
+        @test all(x -> issorted(x.c), gdf)
+        gdf = groupby(df, [:a, order(:b)], sort=NamedTuple())
+        @test issorted(DataFrame(gdf), [:a, :b])
+        @test all(x -> issorted(x.c), gdf)
+        gdf = groupby(df, [:a, :b], sort=(rev=true,))
+        @test issorted(DataFrame(gdf), [:a, :b], rev=true)
+        @test all(x -> issorted(x.c), gdf)
+        if eltype(df[!, :a]) === Int
+            gdf = groupby(df, [order(:a, by=abs), :b], sort=(rev=true,))
+            @test issorted(DataFrame(gdf), [order(:a, by=abs), :b], rev=true)
+            @test all(x -> issorted(x.c), gdf)
+        else
+            gdf = groupby(df, [order(:a, by=abs∘(x -> parse(Int, x))), :b], sort=(rev=true,))
+            @test issorted(DataFrame(gdf), [order(:a, by=abs∘(x -> parse(Int, x))), :b], rev=true)
+            @test all(x -> issorted(x.c), gdf)
+        end
+        gdf = groupby(df, [:a, order(:b, rev=false)], sort=(rev=true,))
+        @test issorted(DataFrame(gdf), [:a, order(:b, rev=false)], rev=true)
+        @test all(x -> issorted(x.c), gdf)
+        gdf = groupby(df, [:a, :b], sort=false)
+        @test Tuple.(keys(gdf)) == unique(Tuple.(eachrow(df[!, [:a, :b]])))
+        @test all(x -> issorted(x.c), gdf)
+
+        @test_throws ArgumentError groupby(df, order(:a), sort=false)
+        @test_throws ArgumentError groupby(df, [:b, order(:a)], sort=false)
+        @test_throws MethodError groupby(df, :a, sort=(x=1,))  # unsupported `sort` field
+    end
+end
+
 end # module