From a2ea650743eeafe50e41b378477c9a40db45fcdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 7 Nov 2020 12:44:50 +0100 Subject: [PATCH 1/4] avoid CategoricalArrays dependency in aggregates --- src/groupeddataframe/fastaggregates.jl | 21 +++++++++-------- test/grouping.jl | 32 +++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl index 9d0d6e8cd4..351664546c 100644 --- a/src/groupeddataframe/fastaggregates.jl +++ b/src/groupeddataframe/fastaggregates.jl @@ -122,10 +122,10 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax)) # !ismissing check is purely an optimization to avoid a copy later outcol = similar(incol, condf === !ismissing ? S : T, length(gd)) # Comparison is possible only between CatValues from the same pool - if incol isa CategoricalVector - U = Union{CategoricalArrays.leveltype(outcol), - eltype(outcol) >: Missing ? Missing : Union{}} - outcol = CategoricalArray{U, 1}(outcol.refs, incol.pool) + incolT = typeof(incol).name + if incolT.name === :CategoricalArray && + nameof(incolT.module) === :CategoricalArrays + outcol = Vector{eltype(incol)}(undef, length(gd)) end # It is safe to use a non-missing init value # since missing will poison the result if present @@ -198,11 +198,14 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo if checkempty && any(iszero, counts) throw(ArgumentError("some groups contain only missing values")) end - # Undo pool sharing done by groupreduce_init - if res isa CategoricalVector && res.pool === incol.pool - V = Union{CategoricalArrays.leveltype(res), - eltype(res) >: Missing ? Missing : Union{}} - res = CategoricalArray{V, 1}(res.refs, copy(res.pool)) + # Reallocate Vector created in groupreduce_init with min or max + # for CategoricalVector + incolT = typeof(incol).name + if incolT.name === :CategoricalArray && + nameof(incolT.module) === :CategoricalArrays && res isa Vector + @assert op === min || op === max + # use the fact that broadcasted identity will create CategoricalVector here + res = identity.(res) end if isconcretetype(eltype(res)) return res diff --git a/test/grouping.jl b/test/grouping.jl index cafb5d7e08..26df83b15a 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1,6 +1,6 @@ module TestGrouping -using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays +using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI const ≅ = isequal """Check if passed data frames are `isequal` and have the same element types of columns""" @@ -3173,4 +3173,34 @@ end :min => min.(df.y, df.z), :max => max.(df.y, df.z), :y => df.y) |> sort end +@testset "extra CategoricalArray aggregation tests" begin + for ord in (true, false) + df = DataFrame(id = [1, 1, 1, 2, 2, 2], x = categorical(1:6, ordered=ord)) + gdf = groupby_checked(df, :id) + res = combine(gdf, :x .=> [minimum, maximum, first, last, length]) + @test res == DataFrame(id=[1,2], x_minimum=[1,4], x_maximum=[3,6], + x_first=[1,4], x_last=[3,6], x_length=[3,3]) + @test res.x_minimum isa CategoricalVector + @test res.x_maximum isa CategoricalVector + @test res.x_first isa CategoricalVector + @test res.x_last isa CategoricalVector + @test isordered(res.x_minimum) == ord + @test isordered(res.x_maximum) == ord + @test isordered(res.x_first) == ord + @test isordered(res.x_last) == ord + @test DataAPI.refpool(res.x_minimum) == DataAPI.refpool(df.x) + @test DataAPI.refpool(res.x_maximum) == DataAPI.refpool(df.x) + @test DataAPI.refpool(res.x_first) == DataAPI.refpool(df.x) + @test DataAPI.refpool(res.x_last) == DataAPI.refpool(df.x) + @test DataAPI.refpool(res.x_minimum) !== DataAPI.refpool(df.x) + @test DataAPI.refpool(res.x_maximum) !== DataAPI.refpool(df.x) + @test DataAPI.refpool(res.x_first) !== DataAPI.refpool(df.x) + @test DataAPI.refpool(res.x_last) !== DataAPI.refpool(df.x) + @test res.x_minimum.pool != df.x.pool + @test res.x_maximum.pool != df.x.pool + @test res.x_first.pool != df.x.pool + @test res.x_last.pool != df.x.pool + end +end + end # module From 5437a13d901ed3aa5763a258bce097d00cae0438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 7 Nov 2020 13:00:49 +0100 Subject: [PATCH 2/4] improve performance --- src/groupeddataframe/fastaggregates.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl index 351664546c..c9c9591edd 100644 --- a/src/groupeddataframe/fastaggregates.jl +++ b/src/groupeddataframe/fastaggregates.jl @@ -125,7 +125,8 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax)) incolT = typeof(incol).name if incolT.name === :CategoricalArray && nameof(incolT.module) === :CategoricalArrays - outcol = Vector{eltype(incol)}(undef, length(gd)) + # we know that CategoricalArray has `pool` field + outcol.pool = incol.pool end # It is safe to use a non-missing init value # since missing will poison the result if present @@ -204,8 +205,9 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo if incolT.name === :CategoricalArray && nameof(incolT.module) === :CategoricalArrays && res isa Vector @assert op === min || op === max - # use the fact that broadcasted identity will create CategoricalVector here - res = identity.(res) + # we know that CategoricalArray has `pool` field + @assert res.pool === incol.pool + res.pool = copy(incol.pool) end if isconcretetype(eltype(res)) return res From afb090de3ba89f8135351cbf1c06ffe4cbbd3311 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 7 Nov 2020 16:03:28 +0100 Subject: [PATCH 3/4] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/groupeddataframe/fastaggregates.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl index c9c9591edd..4bce0e6c5f 100644 --- a/src/groupeddataframe/fastaggregates.jl +++ b/src/groupeddataframe/fastaggregates.jl @@ -122,9 +122,9 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax)) # !ismissing check is purely an optimization to avoid a copy later outcol = similar(incol, condf === !ismissing ? S : T, length(gd)) # Comparison is possible only between CatValues from the same pool - incolT = typeof(incol).name - if incolT.name === :CategoricalArray && - nameof(incolT.module) === :CategoricalArrays + outcolT = typeof(outcol).name + if outcolT.name === :CategoricalArray && + nameof(outcolT.module) === :CategoricalArrays # we know that CategoricalArray has `pool` field outcol.pool = incol.pool end @@ -201,9 +201,9 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo end # Reallocate Vector created in groupreduce_init with min or max # for CategoricalVector - incolT = typeof(incol).name - if incolT.name === :CategoricalArray && - nameof(incolT.module) === :CategoricalArrays && res isa Vector + resT = typeof(res).name + if resT.name === :CategoricalArray && + nameof(resT.module) === :CategoricalArrays @assert op === min || op === max # we know that CategoricalArray has `pool` field @assert res.pool === incol.pool From 1ee4e4283e01f402cf72b3525478ce36ef4df3a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 7 Nov 2020 16:04:03 +0100 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- test/grouping.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/grouping.jl b/test/grouping.jl index 26df83b15a..6b2adedd21 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -3196,10 +3196,10 @@ end @test DataAPI.refpool(res.x_maximum) !== DataAPI.refpool(df.x) @test DataAPI.refpool(res.x_first) !== DataAPI.refpool(df.x) @test DataAPI.refpool(res.x_last) !== DataAPI.refpool(df.x) - @test res.x_minimum.pool != df.x.pool - @test res.x_maximum.pool != df.x.pool - @test res.x_first.pool != df.x.pool - @test res.x_last.pool != df.x.pool + @test res.x_minimum.pool !== df.x.pool + @test res.x_maximum.pool !== df.x.pool + @test res.x_first.pool !== df.x.pool + @test res.x_last.pool !== df.x.pool end end