From 0bbd96ff247463e1da9e98f9327d85e1db9cd68d Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 3 Sep 2019 10:25:26 +0200 Subject: [PATCH 1/3] Stop using makeunique=true for grouping keys in combine Instead, check whether columns with the same names as grouping keys have been returned, and throw an error if they are not equal. This is breaking since columns named key_1, etc. won't be created, but in practice it should not be frequently used. Introducing a deprecation would be more annoying than helpful since it would require adding a keyword argument to disable adding grouping keys and force users to set it to silence the warning, which happens in most common cases. --- src/groupeddataframe/grouping.jl | 11 ++++++++++- test/grouping.jl | 21 ++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl index c825aa5d0b..76a84e4aab 100644 --- a/src/groupeddataframe/grouping.jl +++ b/src/groupeddataframe/grouping.jl @@ -435,7 +435,16 @@ of `combine(map(f, groupby(df, cols)))`. function combine(f::Any, gd::GroupedDataFrame) if length(gd) > 0 idx, valscat = _combine(f, gd) - return hcat!(gd.parent[idx, gd.cols], valscat, makeunique=true) + keys = _names(gd.parent)[gd.cols] + for key in keys + if hasproperty(valscat, key) && + !isequal(valscat[!, key], view(gd.parent[!, key], idx)) + throw(ArgumentError("column :$key in returned data frame " * + "is not equal to grouping key :$key")) + end + end + return hcat!(gd.parent[idx, gd.cols], + without(valscat, intersect(keys, _names(valscat)))) else return gd.parent[1:0, gd.cols] end diff --git a/test/grouping.jl b/test/grouping.jl index 31e44184f9..4ae2981c9c 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -104,8 +104,8 @@ end for cols in ([:a, :b], [:b, :a], [:a, :c], [:c, :a], [1, 2], [2, 1], [1, 3], [3, 1], [true, true, false, false], [true, false, true, false]) - colssym = names(df[:, cols]) - hcatdf = hcat(df[:, cols], df, makeunique=true) + colssym = names(df[!, cols]) + hcatdf = hcat(df[!, cols], df[!, Not(cols)]) nms = names(hcatdf) res = unique(df[:, cols]) res.xmax = [maximum(df[(df[!, colssym[1]] .== a) .& (df[!, colssym[2]] .== b), :x]) @@ -163,7 +163,7 @@ end df_comb = combine(identity, gd) @test sort(df_comb, colssym) == shcatdf df_ref = DataFrame(gd) - @test sort(hcat(df_ref[:, cols], df_ref, makeunique=true), colssym) == shcatdf + @test sort(hcat(df_ref[!, cols], df_ref[!, Not(cols)]), colssym) == shcatdf @test df_ref.x == df_comb.x @test combine(f1, gd) == res @test combine(f2, gd) == res @@ -183,7 +183,7 @@ end end @test combine(identity, gd) == shcatdf df_ref = DataFrame(gd) - @test hcat(df_ref[:, cols], df_ref, makeunique=true) == shcatdf + @test hcat(df_ref[!, cols], df_ref[!, Not(cols)]) == shcatdf @test combine(f1, gd) == sres @test combine(f2, gd) == sres @test rename(combine(f3, gd), :x1 => :xmax) == sres @@ -342,7 +342,7 @@ end # Test function returning DataFrameRow res = by(d -> DataFrameRow(d, 1, :), df, :x) - @test res == DataFrame(x=df.x, x_1=df.x, y=df.y) + @test res == DataFrame(x=df.x, y=df.y) # Test function returning Tuple res = by(d -> (sum(d.y),), df, :x) @@ -893,6 +893,17 @@ Base.isless(::TestType, ::TestType) = false end end +@testset "combine with columns named like grouping keys" begin + df = DataFrame(x=["a", "a", "b", missing], y=1:4) + gd = groupby(df, :x) + @test combine(identity, gd) ≅ df + @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) + + gd = groupby(df, :x, skipmissing=true) + @test combine(identity, gd) == df[1:3, :] + @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) +end + @testset "iteration protocol" begin gd = groupby_checked(DataFrame(A = [:A, :A, :B, :B], B = 1:4), :A) for v in gd From 6ebf3d2e86c1db920ed9cd5c70499b3377c5af5b Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 3 Sep 2019 11:40:02 +0200 Subject: [PATCH 2/3] Add more stringent test To check that names are used, and not only column indices. --- test/grouping.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/grouping.jl b/test/grouping.jl index 4ae2981c9c..f9445f9607 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -897,10 +897,12 @@ end df = DataFrame(x=["a", "a", "b", missing], y=1:4) gd = groupby(df, :x) @test combine(identity, gd) ≅ df + @test combine(d -> d[:, [2, 1]], gd) ≅ df @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) gd = groupby(df, :x, skipmissing=true) @test combine(identity, gd) == df[1:3, :] + @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :] @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) end From 9248c88dc642bd1940b66c79998b67fe6bf69c56 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 3 Sep 2019 21:08:47 +0200 Subject: [PATCH 3/3] Also fix map, add mention in docstring --- src/groupeddataframe/grouping.jl | 19 +++++++++++++++++-- test/grouping.jl | 8 +++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl index 76a84e4aab..2bf5454084 100644 --- a/src/groupeddataframe/grouping.jl +++ b/src/groupeddataframe/grouping.jl @@ -212,8 +212,10 @@ If the first argument is a vector, tuple or named tuple of such pairs, each pair handled as described above. If a named tuple, field names are used to name each generated column. -If the first argument is a callable, it is passed a `SubDataFrame` view for each group, +If the first argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group, and the returned `DataFrame` then consists of the returned rows plus the grouping columns. +If the returned data frame contains columns with the same names as the grouping columns, +they are required to be equal. Note that this second form is much slower than the first one due to type instability. `f` can return a single value, a row or multiple rows. The type of the returned value @@ -297,7 +299,16 @@ See [`by`](@ref) for more examples. function Base.map(f::Any, gd::GroupedDataFrame) if length(gd) > 0 idx, valscat = _combine(f, gd) - parent = hcat!(gd.parent[idx, gd.cols], valscat, makeunique=true) + keys = _names(gd.parent)[gd.cols] + for key in keys + if hasproperty(valscat, key) && + !isequal(valscat[!, key], view(gd.parent[!, key], idx)) + throw(ArgumentError("column :$key in returned data frame " * + "is not equal to grouping key :$key")) + end + end + parent = hcat!(gd.parent[idx, gd.cols], + without(valscat, intersect(keys, _names(valscat)))) if length(idx) == 0 return GroupedDataFrame(parent, collect(1:length(gd.cols)), idx, Int[], Int[], Int[]) @@ -343,6 +354,8 @@ views into these columns. If the last argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group, and the returned `DataFrame` then consists of the returned rows plus the grouping columns. +If the returned data frame contains columns with the same names as the grouping columns, +they are required to be equal. Note that this second form is much slower than the first one due to type instability. A method is defined with `f` as the first argument, so do-block notation can be used. @@ -957,6 +970,8 @@ views into these columns. If the last argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group, and the returned `DataFrame` then consists of the returned rows plus the grouping columns. +If the returned data frame contains columns with the same names as the grouping columns, +they are required to be equal. Note that this second form is much slower than the first one due to type instability. A method is defined with `f` as the first argument, so do-block notation can be used. diff --git a/test/grouping.jl b/test/grouping.jl index f9445f9607..72c4b8e494 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -893,17 +893,23 @@ Base.isless(::TestType, ::TestType) = false end end -@testset "combine with columns named like grouping keys" begin +@testset "combine and map with columns named like grouping keys" begin df = DataFrame(x=["a", "a", "b", missing], y=1:4) gd = groupby(df, :x) @test combine(identity, gd) ≅ df @test combine(d -> d[:, [2, 1]], gd) ≅ df @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) + @test map(identity, gd) ≅ gd + @test map(d -> d[:, [2, 1]], gd) ≅ gd + @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) gd = groupby(df, :x, skipmissing=true) @test combine(identity, gd) == df[1:3, :] @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :] @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) + @test map(identity, gd) == gd + @test map(d -> d[:, [2, 1]], gd) == gd + @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) end @testset "iteration protocol" begin