diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl index c825aa5d0b..2bf5454084 100644 --- a/src/groupeddataframe/grouping.jl +++ b/src/groupeddataframe/grouping.jl @@ -212,8 +212,10 @@ If the first argument is a vector, tuple or named tuple of such pairs, each pair handled as described above. If a named tuple, field names are used to name each generated column. -If the first argument is a callable, it is passed a `SubDataFrame` view for each group, +If the first argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group, and the returned `DataFrame` then consists of the returned rows plus the grouping columns. +If the returned data frame contains columns with the same names as the grouping columns, +they are required to be equal to the corresponding grouping columns. Note that this second form is much slower than the first one due to type instability. `f` can return a single value, a row or multiple rows. The type of the returned value @@ -297,7 +299,16 @@ See [`by`](@ref) for more examples. function Base.map(f::Any, gd::GroupedDataFrame) if length(gd) > 0 idx, valscat = _combine(f, gd) - parent = hcat!(gd.parent[idx, gd.cols], valscat, makeunique=true) + keys = _names(gd.parent)[gd.cols] + for key in keys + if hasproperty(valscat, key) && + !isequal(valscat[!, key], view(gd.parent[!, key], idx)) + throw(ArgumentError("column :$key in returned data frame " * + "is not equal to grouping key :$key")) + end + end + parent = hcat!(gd.parent[idx, gd.cols], + without(valscat, intersect(keys, _names(valscat)))) if length(idx) == 0 return GroupedDataFrame(parent, collect(1:length(gd.cols)), idx, Int[], Int[], Int[]) @@ -343,6 +354,8 @@ views into these columns. If the last argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group, and the returned `DataFrame` then consists of the returned rows plus the grouping columns. 
+If the returned data frame contains columns with the same names as the grouping columns, +they are required to be equal to the corresponding grouping columns. Note that this second form is much slower than the first one due to type instability. A method is defined with `f` as the first argument, so do-block notation can be used. @@ -435,7 +448,16 @@ of `combine(map(f, groupby(df, cols)))`. function combine(f::Any, gd::GroupedDataFrame) if length(gd) > 0 idx, valscat = _combine(f, gd) - return hcat!(gd.parent[idx, gd.cols], valscat, makeunique=true) + keys = _names(gd.parent)[gd.cols] + for key in keys + if hasproperty(valscat, key) && + !isequal(valscat[!, key], view(gd.parent[!, key], idx)) + throw(ArgumentError("column :$key in returned data frame " * + "is not equal to grouping key :$key")) + end + end + return hcat!(gd.parent[idx, gd.cols], + without(valscat, intersect(keys, _names(valscat)))) else return gd.parent[1:0, gd.cols] end @@ -948,6 +970,8 @@ views into these columns. If the last argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group, and the returned `DataFrame` then consists of the returned rows plus the grouping columns. +If the returned data frame contains columns with the same names as the grouping columns, +they are required to be equal to the corresponding grouping columns. Note that this second form is much slower than the first one due to type instability. A method is defined with `f` as the first argument, so do-block notation can be used. 
diff --git a/test/grouping.jl b/test/grouping.jl index 31e44184f9..72c4b8e494 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -104,8 +104,8 @@ end for cols in ([:a, :b], [:b, :a], [:a, :c], [:c, :a], [1, 2], [2, 1], [1, 3], [3, 1], [true, true, false, false], [true, false, true, false]) - colssym = names(df[:, cols]) - hcatdf = hcat(df[:, cols], df, makeunique=true) + colssym = names(df[!, cols]) + hcatdf = hcat(df[!, cols], df[!, Not(cols)]) nms = names(hcatdf) res = unique(df[:, cols]) res.xmax = [maximum(df[(df[!, colssym[1]] .== a) .& (df[!, colssym[2]] .== b), :x]) @@ -163,7 +163,7 @@ end df_comb = combine(identity, gd) @test sort(df_comb, colssym) == shcatdf df_ref = DataFrame(gd) - @test sort(hcat(df_ref[:, cols], df_ref, makeunique=true), colssym) == shcatdf + @test sort(hcat(df_ref[!, cols], df_ref[!, Not(cols)]), colssym) == shcatdf @test df_ref.x == df_comb.x @test combine(f1, gd) == res @test combine(f2, gd) == res @@ -183,7 +183,7 @@ end end @test combine(identity, gd) == shcatdf df_ref = DataFrame(gd) - @test hcat(df_ref[:, cols], df_ref, makeunique=true) == shcatdf + @test hcat(df_ref[!, cols], df_ref[!, Not(cols)]) == shcatdf @test combine(f1, gd) == sres @test combine(f2, gd) == sres @test rename(combine(f3, gd), :x1 => :xmax) == sres @@ -342,7 +342,7 @@ end # Test function returning DataFrameRow res = by(d -> DataFrameRow(d, 1, :), df, :x) - @test res == DataFrame(x=df.x, x_1=df.x, y=df.y) + @test res == DataFrame(x=df.x, y=df.y) # Test function returning Tuple res = by(d -> (sum(d.y),), df, :x) @@ -893,6 +893,25 @@ Base.isless(::TestType, ::TestType) = false end end +@testset "combine and map with columns named like grouping keys" begin + df = DataFrame(x=["a", "a", "b", missing], y=1:4) + gd = groupby(df, :x) + @test combine(identity, gd) ≅ df + @test combine(d -> d[:, [2, 1]], gd) ≅ df + @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) + @test map(identity, gd) ≅ gd + @test map(d -> d[:, [2, 1]], gd) ≅ gd + 
@test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) + + gd = groupby(df, :x, skipmissing=true) + @test combine(identity, gd) == df[1:3, :] + @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :] + @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) + @test map(identity, gd) == gd + @test map(d -> d[:, [2, 1]], gd) == gd + @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) +end + @testset "iteration protocol" begin gd = groupby_checked(DataFrame(A = [:A, :A, :B, :B], B = 1:4), :A) for v in gd