JuliaData · bkamins · Dec 27, 2022 · Dec 19, 2022 · Dec 19, 2022 · Dec 19, 2022
diff --git a/NEWS.md b/NEWS.md
@@ -10,6 +10,8 @@
 * Add support for `operator` keyword argument in `Cols`
   to take a set operation to apply to passed selectors (`union` by default)
   ([3224](https://github.com/JuliaData/DataFrames.jl/pull/3224))
+* Improve support for setting group order in `groupby`
+  ([3253](https://github.com/JuliaData/DataFrames.jl/pull/3253))
 
 # DataFrames.jl v1.4.4 Patch Release Notes
 

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
@@ -1276,3 +1276,70 @@ two aspects:
   are exceptions to the standard operation specification syntax rules. They
   were added for user convenience.
 
+## Specifying group order in `groupby`
+
+By default the order of groups produced by `groupby` is undefined.
+If you want the order of groups to follow the order of first appearance in
+the source data frame of a grouping key then pass the `sort=false` keyword argument
+to `groupby`:
+
+```jldoctest sac
+julia> push!(df, ["a", 100, 100]) # push row with large integer values to disable default sorting
+7×3 DataFrame
+ Row │ customer_id  transaction_id  volume 
+     │ String       Int64           Int64  
+─────┼─────────────────────────────────────
+   1 │ a                        12       2
+   2 │ b                        15       3
+   3 │ b                        19       1
+   4 │ b                        17       4
+   5 │ c                        13       5
+   6 │ c                        11       9
+   7 │ a                       100     100
+
+julia> keys(groupby(df, :volume, sort=false))
+7-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
+ GroupKey: (volume = 2,)
+ GroupKey: (volume = 3,)
+ GroupKey: (volume = 1,)
+ GroupKey: (volume = 4,)
+ GroupKey: (volume = 5,)
+ GroupKey: (volume = 9,)
+ GroupKey: (volume = 100,)
+```
+
+If you want to have them sorted in ascending order pass `sort=true`:
+
+```jldoctest sac
+julia> keys(groupby(df, :volume, sort=true))
+7-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
+ GroupKey: (volume = 1,)
+ GroupKey: (volume = 2,)
+ GroupKey: (volume = 3,)
+ GroupKey: (volume = 4,)
+ GroupKey: (volume = 5,)
+ GroupKey: (volume = 9,)
+ GroupKey: (volume = 100,)
+```
+
+You can also use the [`order`](@ref) wrapper when passing a column name to group by
+or pass a named tuple containing one or more of `alg`, `lt`, `by`, `rev`, and
+`order` fields that will be treated just like in [`sortperm`](@ref):
+
+```
+julia> keys(groupby(df, :customer_id, sort=(rev=true,)))
+3-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
+ GroupKey: (customer_id = "c",)
+ GroupKey: (customer_id = "b",)
+ GroupKey: (customer_id = "a",)
+
+julia> keys(groupby(df, [:customer_id, order(:volume, rev=true)], sort=true))
+7-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
+ GroupKey: (customer_id = "a", volume = 100)
+ GroupKey: (customer_id = "a", volume = 2)
+ GroupKey: (customer_id = "b", volume = 4)
+ GroupKey: (customer_id = "b", volume = 3)
+ GroupKey: (customer_id = "b", volume = 1)
+ GroupKey: (customer_id = "c", volume = 9)
+ GroupKey: (customer_id = "c", volume = 5)
+```
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -139,6 +139,8 @@ include("subdataframe/subdataframe.jl")
 include("dataframerow/dataframerow.jl")
 include("dataframe/insertion.jl")
 
+include("abstractdataframe/sort.jl")
+
 include("groupeddataframe/groupeddataframe.jl")
 include("groupeddataframe/utils.jl")
 
@@ -165,8 +167,6 @@ include("groupeddataframe/show.jl")
 include("dataframerow/show.jl")
 include("abstractdataframe/io.jl")
 
-include("abstractdataframe/sort.jl")
-
 include("other/tables.jl")
 include("other/names.jl")
 include("other/metadata.jl")

diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
@@ -49,7 +49,7 @@ end
 
 """
     groupby(d::AbstractDataFrame, cols;
-            sort::Union{Bool, Nothing}=nothing,
+            sort::Union{Bool, Nothing, NamedTuple}=nothing,
             skipmissing::Bool=false)
 
 Return a `GroupedDataFrame` representing a view of an `AbstractDataFrame` split
@@ -64,18 +64,24 @@ into row groups.
   if `sort=nothing` (the default) then the fastest available grouping algorithm
   is picked and in consequence the order of groups in the result is undefined
   and may change in future releases; below a description of the current
-  implementation is provided.
+  implementation is provided. Additionally `sort` can be a `NamedTuple` having
+  some or all of `alg`, `lt`, `by`, `rev`, and `order` fields. In this case
+  the groups are sorted and their order follows the [`sortperm`](@ref) order.
 - `skipmissing` : whether to skip groups with `missing` values in one of the
   grouping columns `cols`
 
 # Details
+
 An iterator over a `GroupedDataFrame` returns a `SubDataFrame` view
 for each grouping into `df`.
 Within each group, the order of rows in `df` is preserved.
 
 `cols` can be any valid data frame indexing expression.
 In particular if it is an empty vector then a single-group `GroupedDataFrame`
-is created.
+is created. As a special case, `cols` can be a single column wrapped in
+[`order`](@ref) or a vector containing such wrapped columns; the wrappers
+determine the order of groups if `sort` is `true` or a `NamedTuple` (if
+`sort` is `nothing` or `false`, then passing `order` is an error).
 
 A `GroupedDataFrame` also supports indexing by groups, `select`, `transform`,
 and `combine` (which applies a function to each group and combines the result
@@ -209,9 +215,9 @@ julia> for g in gd
 ```
 """
 function groupby(df::AbstractDataFrame, cols;
-                 sort::Union{Bool,Nothing}=nothing, skipmissing::Bool=false)
+                 sort::Union{Bool,Nothing,NamedTuple}=nothing, skipmissing::Bool=false)
     _check_consistency(df)
-    idxcols = index(df)[cols]
+    idxcols = index(df)[normalize_grouping_cols(cols, sort === true || sort isa NamedTuple)]
     if isempty(idxcols)
         return GroupedDataFrame(df, Symbol[], ones(Int, nrow(df)),
                                 nothing, nothing, nothing, nrow(df) == 0 ? 0 : 1,
@@ -222,17 +228,19 @@ function groupby(df::AbstractDataFrame, cols;
     groups = Vector{Int}(undef, nrow(df))
     ngroups, rhashes, gslots, sorted =
         row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false),
-                        groups, skipmissing, sort)
+                        groups, skipmissing, sort isa NamedTuple ? nothing : sort)
 
     gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing, ngroups, nothing,
                           Threads.ReentrantLock())
 
     # sort groups if row_group_slots hasn't already done that
-    if sort === true && !sorted
+    if (sort === true && !sorted) || (sort isa NamedTuple)
         # Find index of representative row for each group
         idx = Vector{Int}(undef, length(gd))
         fillfirst!(nothing, idx, 1:nrow(parent(gd)), gd)
-        group_invperm = invperm(sortperm(view(parent(gd)[!, gd.cols], idx, :)))
+        sort_kwargs = sort isa NamedTuple ? sort : NamedTuple()
+        group_invperm = invperm(sortperm(view(parent(gd), idx, :),
+                                         cols; sort_kwargs...))
         groups = gd.groups
         @inbounds for i in eachindex(groups)
             gix = groups[i]
@@ -243,6 +251,25 @@ function groupby(df::AbstractDataFrame, cols;
     return gd
 end
 
+# Strip `order` wrappers from `cols`; they are accepted only when `sort` is `true`.
+normalize_grouping_cols(cols, sort::Bool) = cols
+
+function normalize_grouping_cols(cols::UserColOrdering, sort::Bool)
+    if !sort
+        throw(ArgumentError("passing `order` is only allowed if `sort` " *
+                            "is `true` or a `NamedTuple`"))
+    end
+    return cols.col
+end
+
+function normalize_grouping_cols(cols::AbstractVector, sort::Bool)
+    any(c -> c isa UserColOrdering, cols) || return cols
+    sort || throw(ArgumentError("passing `order` is only allowed if `sort` " *
+                                "is `true` or a `NamedTuple`"))
+    # unwrapped columns and plain selectors are collected into one `Any` vector
+    return Any[c isa UserColOrdering ? c.col : c for c in cols]
+end
+
 function genkeymap(gd, cols)
     # currently we use Dict{Any, Int} because then field :keymap in GroupedDataFrame
     # has a concrete type which makes the access to it faster as we do not have a dynamic

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -4378,4 +4378,45 @@ end
     @test_throws ArgumentError combine(gdf, :a => (x -> [Dict('x' => 1)]) => AsTable)
 end
 
+@testset "sorting API" begin
+    # seed the global RNG so the random columns below are reproducible
+    Random.seed!(1234)
+    df = DataFrame(a=rand(-10:10, 100), b=rand(-10:10, 100), c=1:100)
+    # `c` is increasing in `df`, so it has to stay sorted inside every group
+    rows_in_order(gd) = all(sdf -> issorted(sdf.c), gd)
+    # keyword arguments `issorted` needs to verify the order requested by `sort`
+    sortkw(srt) = srt isa NamedTuple ? srt : NamedTuple()
+
+    # single column selectors: by name, by string and by position
+    for col in (:a, "a", 1, :b, "b", 2, :c, "c", 3)
+        for srt in (true, NamedTuple(), (rev=true,))
+            gdf = groupby(df, col, sort=srt)
+            @test issorted(DataFrame(gdf)[:, col]; sortkw(srt)...)
+            @test rows_in_order(gdf)
+        end
+        # `order` wrapper combined with a `NamedTuple` passed as `sort`
+        gdf = groupby(df, order(col, by=abs), sort=(rev=true,))
+        @test issorted(DataFrame(gdf)[:, col], rev=true, by=abs)
+        @test rows_in_order(gdf)
+    end
+
+    # several grouping columns, optionally wrapped in `order`
+    for (cols, srt) in (([:a, :b], true),
+                        ([:a, :b], NamedTuple()),
+                        ([:a, :b], (rev=true,)),
+                        ([order(:a, by=abs), :b], (rev=true,)),
+                        ([:a, order(:b, rev=false)], (rev=true,)))
+        gdf = groupby(df, cols, sort=srt)
+        @test issorted(DataFrame(gdf), cols; sortkw(srt)...)
+        @test rows_in_order(gdf)
+    end
+
+    # `order` requires sorting, and unknown fields in `sort` are rejected
+    @test_throws ArgumentError groupby(df, order(:a))
+    @test_throws ArgumentError groupby(df, order(:a), sort=false)
+    @test_throws ArgumentError groupby(df, [:b, order(:a)])
+    @test_throws ArgumentError groupby(df, [:b, order(:a)], sort=false)
+    @test_throws MethodError groupby(df, :a, sort=(x=1,))
+end
+
 end # module