Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

avoid CategoricalArrays dependency in aggregates #2519

Merged
merged 5 commits into from
Nov 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions src/groupeddataframe/fastaggregates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,11 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax))
# !ismissing check is purely an optimization to avoid a copy later
outcol = similar(incol, condf === !ismissing ? S : T, length(gd))
# Comparison is possible only between CatValues from the same pool
if incol isa CategoricalVector
U = Union{CategoricalArrays.leveltype(outcol),
eltype(outcol) >: Missing ? Missing : Union{}}
outcol = CategoricalArray{U, 1}(outcol.refs, incol.pool)
outcolT = typeof(outcol).name
if outcolT.name === :CategoricalArray &&
nameof(outcolT.module) === :CategoricalArrays
# we know that CategoricalArray has `pool` field
outcol.pool = incol.pool
end
# It is safe to use a non-missing init value
# since missing will poison the result if present
Expand Down Expand Up @@ -198,11 +199,15 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo
if checkempty && any(iszero, counts)
throw(ArgumentError("some groups contain only missing values"))
end
# Undo pool sharing done by groupreduce_init
bkamins marked this conversation as resolved.
Show resolved Hide resolved
if res isa CategoricalVector && res.pool === incol.pool
V = Union{CategoricalArrays.leveltype(res),
eltype(res) >: Missing ? Missing : Union{}}
res = CategoricalArray{V, 1}(res.refs, copy(res.pool))
# Undo pool sharing done by groupreduce_init: for min or max on a
# CategoricalVector give the result its own copy of the pool
resT = typeof(res).name
if resT.name === :CategoricalArray &&
nameof(resT.module) === :CategoricalArrays
@assert op === min || op === max
# we know that CategoricalArray has `pool` field
@assert res.pool === incol.pool
res.pool = copy(incol.pool)
end
if isconcretetype(eltype(res))
return res
Expand Down
31 changes: 30 additions & 1 deletion test/grouping.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module TestGrouping

using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays
using Test, DataFrames, Random, Statistics, PooledArrays, CategoricalArrays, DataAPI
const ≅ = isequal

"""Check if passed data frames are `isequal` and have the same element types of columns"""
Expand Down Expand Up @@ -3178,6 +3178,35 @@ end
:min => min.(df.y, df.z), :max => max.(df.y, df.z), :y => df.y) |> sort
end

# Check that aggregating a CategoricalArray column per group yields categorical
# results that keep the orderedness of the source column and whose pools are
# equal to, but not the same object as, the pool of the source column.
# NOTE(review): only one `end` is visible for the `begin`/`for` pair in this
# extract — confirm the second closing `end` is present in the actual file.
@testset "extra CategoricalArray aggregation tests" begin
# run the same checks for an ordered and an unordered categorical column
for ord in (true, false)
df = DataFrame(id = [1, 1, 1, 2, 2, 2], x = categorical(1:6, ordered=ord))
gdf = groupby_checked(df, :id)
res = combine(gdf, :x .=> [minimum, maximum, first, last, length])
# aggregated values compare equal to the plain integer expectations
@test res == DataFrame(id=[1,2], x_minimum=[1,4], x_maximum=[3,6],
x_first=[1,4], x_last=[3,6], x_length=[3,3])
# min/max/first/last results stay categorical
@test res.x_minimum isa CategoricalVector
@test res.x_maximum isa CategoricalVector
@test res.x_first isa CategoricalVector
@test res.x_last isa CategoricalVector
# orderedness matches the source column
@test isordered(res.x_minimum) == ord
@test isordered(res.x_maximum) == ord
@test isordered(res.x_first) == ord
@test isordered(res.x_last) == ord
# reference pools compare equal to the source column's reference pool ...
@test DataAPI.refpool(res.x_minimum) == DataAPI.refpool(df.x)
@test DataAPI.refpool(res.x_maximum) == DataAPI.refpool(df.x)
@test DataAPI.refpool(res.x_first) == DataAPI.refpool(df.x)
@test DataAPI.refpool(res.x_last) == DataAPI.refpool(df.x)
# ... but are not the identical object
@test DataAPI.refpool(res.x_minimum) !== DataAPI.refpool(df.x)
@test DataAPI.refpool(res.x_maximum) !== DataAPI.refpool(df.x)
@test DataAPI.refpool(res.x_first) !== DataAPI.refpool(df.x)
@test DataAPI.refpool(res.x_last) !== DataAPI.refpool(df.x)
# the `pool` field itself must not be shared with the source column either
@test res.x_minimum.pool !== df.x.pool
@test res.x_maximum.pool !== df.x.pool
@test res.x_first.pool !== df.x.pool
@test res.x_last.pool !== df.x.pool
end

@testset "hashing of pooled vectors" begin
# test both hashrow calculation paths - the pool length threshold is 50%
for x in ([1:9; fill(1, 101)], [1:100;],
Expand Down