Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop using makeunique=true for grouping keys in combine #1938

Merged
merged 3 commits into from
Sep 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,10 @@ If the first argument is a vector, tuple or named tuple of such pairs, each pair
handled as described above. If a named tuple, field names are used to name
each generated column.

If the first argument is a callable, it is passed a `SubDataFrame` view for each group,
If the first argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group,
and the returned `DataFrame` then consists of the returned rows plus the grouping columns.
If the returned data frame contains columns with the same names as the grouping columns,
they are required to be equal.
Note that this second form is much slower than the first one due to type instability.

`f` can return a single value, a row or multiple rows. The type of the returned value
Expand Down Expand Up @@ -297,7 +299,16 @@ See [`by`](@ref) for more examples.
function Base.map(f::Any, gd::GroupedDataFrame)
if length(gd) > 0
idx, valscat = _combine(f, gd)
parent = hcat!(gd.parent[idx, gd.cols], valscat, makeunique=true)
keys = _names(gd.parent)[gd.cols]
for key in keys
if hasproperty(valscat, key) &&
!isequal(valscat[!, key], view(gd.parent[!, key], idx))
throw(ArgumentError("column :$key in returned data frame " *
"is not equal to grouping key :$key"))
end
end
parent = hcat!(gd.parent[idx, gd.cols],
without(valscat, intersect(keys, _names(valscat))))
if length(idx) == 0
return GroupedDataFrame(parent, collect(1:length(gd.cols)), idx,
Int[], Int[], Int[])
Expand Down Expand Up @@ -343,6 +354,8 @@ views into these columns.

If the last argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group,
and the returned `DataFrame` then consists of the returned rows plus the grouping columns.
If the returned data frame contains columns with the same names as the grouping columns,
they are required to be equal.
Note that this second form is much slower than the first one due to type instability.
A method is defined with `f` as the first argument, so do-block
notation can be used.
Expand Down Expand Up @@ -435,7 +448,16 @@ of `combine(map(f, groupby(df, cols)))`.
function combine(f::Any, gd::GroupedDataFrame)
if length(gd) > 0
idx, valscat = _combine(f, gd)
return hcat!(gd.parent[idx, gd.cols], valscat, makeunique=true)
keys = _names(gd.parent)[gd.cols]
for key in keys
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
if hasproperty(valscat, key) &&
!isequal(valscat[!, key], view(gd.parent[!, key], idx))
throw(ArgumentError("column :$key in returned data frame " *
"is not equal to grouping key :$key"))
end
end
return hcat!(gd.parent[idx, gd.cols],
without(valscat, intersect(keys, _names(valscat))))
else
return gd.parent[1:0, gd.cols]
end
Expand Down Expand Up @@ -948,6 +970,8 @@ views into these columns.

If the last argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group,
and the returned `DataFrame` then consists of the returned rows plus the grouping columns.
If the returned data frame contains columns with the same names as the grouping columns,
they are required to be equal.
Note that this second form is much slower than the first one due to type instability.
A method is defined with `f` as the first argument, so do-block
notation can be used.
Expand Down
29 changes: 24 additions & 5 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ end
for cols in ([:a, :b], [:b, :a], [:a, :c], [:c, :a],
[1, 2], [2, 1], [1, 3], [3, 1],
[true, true, false, false], [true, false, true, false])
colssym = names(df[:, cols])
hcatdf = hcat(df[:, cols], df, makeunique=true)
colssym = names(df[!, cols])
hcatdf = hcat(df[!, cols], df[!, Not(cols)])
nms = names(hcatdf)
res = unique(df[:, cols])
res.xmax = [maximum(df[(df[!, colssym[1]] .== a) .& (df[!, colssym[2]] .== b), :x])
Expand Down Expand Up @@ -163,7 +163,7 @@ end
df_comb = combine(identity, gd)
@test sort(df_comb, colssym) == shcatdf
df_ref = DataFrame(gd)
@test sort(hcat(df_ref[:, cols], df_ref, makeunique=true), colssym) == shcatdf
@test sort(hcat(df_ref[!, cols], df_ref[!, Not(cols)]), colssym) == shcatdf
@test df_ref.x == df_comb.x
@test combine(f1, gd) == res
@test combine(f2, gd) == res
Expand All @@ -183,7 +183,7 @@ end
end
@test combine(identity, gd) == shcatdf
df_ref = DataFrame(gd)
@test hcat(df_ref[:, cols], df_ref, makeunique=true) == shcatdf
@test hcat(df_ref[!, cols], df_ref[!, Not(cols)]) == shcatdf
@test combine(f1, gd) == sres
@test combine(f2, gd) == sres
@test rename(combine(f3, gd), :x1 => :xmax) == sres
Expand Down Expand Up @@ -342,7 +342,7 @@ end

# Test function returning DataFrameRow
res = by(d -> DataFrameRow(d, 1, :), df, :x)
@test res == DataFrame(x=df.x, x_1=df.x, y=df.y)
@test res == DataFrame(x=df.x, y=df.y)

# Test function returning Tuple
res = by(d -> (sum(d.y),), df, :x)
Expand Down Expand Up @@ -893,6 +893,25 @@ Base.isless(::TestType, ::TestType) = false
end
end

@testset "combine and map with columns named like grouping keys" begin
df = DataFrame(x=["a", "a", "b", missing], y=1:4)
gd = groupby(df, :x)
@test combine(identity, gd) ≅ df
@test combine(d -> d[:, [2, 1]], gd) ≅ df
@test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
@test map(identity, gd) ≅ gd
@test map(d -> d[:, [2, 1]], gd) ≅ gd
@test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)

gd = groupby(df, :x, skipmissing=true)
@test combine(identity, gd) == df[1:3, :]
@test combine(d -> d[:, [2, 1]], gd) == df[1:3, :]
@test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
@test map(identity, gd) == gd
@test map(d -> d[:, [2, 1]], gd) == gd
@test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
end

@testset "iteration protocol" begin
gd = groupby_checked(DataFrame(A = [:A, :A, :B, :B], B = 1:4), :A)
for v in gd
Expand Down