From a2ea650743eeafe50e41b378477c9a40db45fcdc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 7 Nov 2020 12:44:50 +0100
Subject: [PATCH 1/4] avoid CategoricalArrays dependency in aggregates

---
 src/groupeddataframe/fastaggregates.jl | 21 +++++++++--------
 test/grouping.jl                       | 32 +++++++++++++++++++++++++-
 2 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl
index 9d0d6e8cd4..351664546c 100644
--- a/src/groupeddataframe/fastaggregates.jl
+++ b/src/groupeddataframe/fastaggregates.jl
@@ -122,10 +122,10 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax))
             # !ismissing check is purely an optimization to avoid a copy later
             outcol = similar(incol, condf === !ismissing ? S : T, length(gd))
             # Comparison is possible only between CatValues from the same pool
-            if incol isa CategoricalVector
-                U = Union{CategoricalArrays.leveltype(outcol),
-                          eltype(outcol) >: Missing ? Missing : Union{}}
-                outcol = CategoricalArray{U, 1}(outcol.refs, incol.pool)
+            incolT = typeof(incol).name
+            if incolT.name === :CategoricalArray &&
+                nameof(incolT.module) === :CategoricalArrays
+                outcol = Vector{eltype(incol)}(undef, length(gd))
             end
             # It is safe to use a non-missing init value
             # since missing will poison the result if present
@@ -198,11 +198,14 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo
     if checkempty && any(iszero, counts)
         throw(ArgumentError("some groups contain only missing values"))
     end
-    # Undo pool sharing done by groupreduce_init
-    if res isa CategoricalVector && res.pool === incol.pool
-        V = Union{CategoricalArrays.leveltype(res),
-                  eltype(res) >: Missing ? Missing : Union{}}
-        res = CategoricalArray{V, 1}(res.refs, copy(res.pool))
+    # Reallocate Vector created in groupreduce_init with min or max
+    # for CategoricalVector
+    incolT = typeof(incol).name
+    if incolT.name === :CategoricalArray &&
+        nameof(incolT.module) === :CategoricalArrays && res isa Vector
+        @assert op === min || op === max
+        # use the fact that broadcasted identity will create CategoricalVector here
+        res = identity.(res)
     end
     if isconcretetype(eltype(res))
         return res
diff --git a/test/grouping.jl b/test/grouping.jl
index cafb5d7e08..26df83b15a 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -1,6 +1,6 @@
 module TestGrouping
 
-using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays
+using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI
 const ≅ = isequal
 
 """Check if passed data frames are `isequal` and have the same element types of columns"""
@@ -3173,4 +3173,34 @@ end
                     :min => min.(df.y, df.z), :max => max.(df.y, df.z), :y => df.y) |> sort
 end
 
+@testset "extra CategoricalArray aggregation tests" begin
+    for ord in (true, false)
+        df = DataFrame(id = [1, 1, 1, 2, 2, 2], x = categorical(1:6, ordered=ord))
+        gdf = groupby_checked(df, :id)
+        res = combine(gdf, :x .=> [minimum, maximum, first, last, length])
+        @test res == DataFrame(id=[1,2], x_minimum=[1,4], x_maximum=[3,6],
+                               x_first=[1,4], x_last=[3,6], x_length=[3,3])
+        @test res.x_minimum isa CategoricalVector
+        @test res.x_maximum isa CategoricalVector
+        @test res.x_first isa CategoricalVector
+        @test res.x_last isa CategoricalVector
+        @test isordered(res.x_minimum) == ord
+        @test isordered(res.x_maximum) == ord
+        @test isordered(res.x_first) == ord
+        @test isordered(res.x_last) == ord
+        @test DataAPI.refpool(res.x_minimum) == DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_maximum) == DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_first) == DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_last) == DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_minimum) !== DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_maximum) !== DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_first) !== DataAPI.refpool(df.x)
+        @test DataAPI.refpool(res.x_last) !== DataAPI.refpool(df.x)
+        @test res.x_minimum.pool != df.x.pool
+        @test res.x_maximum.pool != df.x.pool
+        @test res.x_first.pool != df.x.pool
+        @test res.x_last.pool != df.x.pool
+    end
+end
+
 end # module

From 5437a13d901ed3aa5763a258bce097d00cae0438 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 7 Nov 2020 13:00:49 +0100
Subject: [PATCH 2/4] improve performance

---
 src/groupeddataframe/fastaggregates.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl
index 351664546c..c9c9591edd 100644
--- a/src/groupeddataframe/fastaggregates.jl
+++ b/src/groupeddataframe/fastaggregates.jl
@@ -125,7 +125,8 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax))
             incolT = typeof(incol).name
             if incolT.name === :CategoricalArray &&
                 nameof(incolT.module) === :CategoricalArrays
-                outcol = Vector{eltype(incol)}(undef, length(gd))
+                # we know that CategoricalArray has `pool` field
+                outcol.pool = incol.pool
             end
             # It is safe to use a non-missing init value
             # since missing will poison the result if present
@@ -204,8 +205,9 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo
     if incolT.name === :CategoricalArray &&
         nameof(incolT.module) === :CategoricalArrays && res isa Vector
         @assert op === min || op === max
-        # use the fact that broadcasted identity will create CategoricalVector here
-        res = identity.(res)
+        # we know that CategoricalArray has `pool` field
+        @assert res.pool === incol.pool
+        res.pool = copy(incol.pool)
     end
     if isconcretetype(eltype(res))
         return res

From afb090de3ba89f8135351cbf1c06ffe4cbbd3311 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 7 Nov 2020 16:03:28 +0100
Subject: [PATCH 3/4] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/groupeddataframe/fastaggregates.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl
index c9c9591edd..4bce0e6c5f 100644
--- a/src/groupeddataframe/fastaggregates.jl
+++ b/src/groupeddataframe/fastaggregates.jl
@@ -122,9 +122,9 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax))
             # !ismissing check is purely an optimization to avoid a copy later
             outcol = similar(incol, condf === !ismissing ? S : T, length(gd))
             # Comparison is possible only between CatValues from the same pool
-            incolT = typeof(incol).name
-            if incolT.name === :CategoricalArray &&
-                nameof(incolT.module) === :CategoricalArrays
+            outcolT = typeof(outcol).name
+            if outcolT.name === :CategoricalArray &&
+                nameof(outcolT.module) === :CategoricalArrays
                 # we know that CategoricalArray has `pool` field
                 outcol.pool = incol.pool
             end
@@ -201,9 +201,9 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo
     end
     # Reallocate Vector created in groupreduce_init with min or max
     # for CategoricalVector
-    incolT = typeof(incol).name
-    if incolT.name === :CategoricalArray &&
-        nameof(incolT.module) === :CategoricalArrays && res isa Vector
+    resT = typeof(res).name
+    if resT.name === :CategoricalArray &&
+        nameof(resT.module) === :CategoricalArrays
         @assert op === min || op === max
         # we know that CategoricalArray has `pool` field
         @assert res.pool === incol.pool

From 1ee4e4283e01f402cf72b3525478ce36ef4df3a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 7 Nov 2020 16:04:03 +0100
Subject: [PATCH 4/4] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 test/grouping.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/grouping.jl b/test/grouping.jl
index 26df83b15a..6b2adedd21 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3196,10 +3196,10 @@ end
         @test DataAPI.refpool(res.x_maximum) !== DataAPI.refpool(df.x)
         @test DataAPI.refpool(res.x_first) !== DataAPI.refpool(df.x)
         @test DataAPI.refpool(res.x_last) !== DataAPI.refpool(df.x)
-        @test res.x_minimum.pool != df.x.pool
-        @test res.x_maximum.pool != df.x.pool
-        @test res.x_first.pool != df.x.pool
-        @test res.x_last.pool != df.x.pool
+        @test res.x_minimum.pool !== df.x.pool
+        @test res.x_maximum.pool !== df.x.pool
+        @test res.x_first.pool !== df.x.pool
+        @test res.x_last.pool !== df.x.pool
     end
 end