From 0bbd96ff247463e1da9e98f9327d85e1db9cd68d Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Tue, 3 Sep 2019 10:25:26 +0200
Subject: [PATCH 1/3] Stop using makeunique=true for grouping keys in combine

Instead, check whether columns with the same names as grouping keys
have been returned, and throw an error if they are not equal.
This is breaking since columns named key_1, etc. won't be created,
but in practice it should not be frequently used. Introducing a deprecation
would be more annoying than helpful since it would require adding a keyword
argument to disable adding grouping keys and force users to set it to silence
the warning, which happens in most common cases.
---
 src/groupeddataframe/grouping.jl | 11 ++++++++++-
 test/grouping.jl                 | 21 ++++++++++++++++-----
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
index c825aa5d0b..76a84e4aab 100644
--- a/src/groupeddataframe/grouping.jl
+++ b/src/groupeddataframe/grouping.jl
@@ -435,7 +435,16 @@ of `combine(map(f, groupby(df, cols)))`.
 function combine(f::Any, gd::GroupedDataFrame)
     if length(gd) > 0
         idx, valscat = _combine(f, gd)
-        return hcat!(gd.parent[idx, gd.cols], valscat, makeunique=true)
+        keys = _names(gd.parent)[gd.cols]
+        for key in keys
+            if hasproperty(valscat, key) &&
+               !isequal(valscat[!, key], view(gd.parent[!, key], idx))
+               throw(ArgumentError("column :$key in returned data frame " *
+                                   "is not equal to grouping key :$key"))
+            end
+        end
+        return hcat!(gd.parent[idx, gd.cols],
+                     without(valscat, intersect(keys, _names(valscat))))
     else
         return gd.parent[1:0, gd.cols]
     end
diff --git a/test/grouping.jl b/test/grouping.jl
index 31e44184f9..4ae2981c9c 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -104,8 +104,8 @@ end
     for cols in ([:a, :b], [:b, :a], [:a, :c], [:c, :a],
                  [1, 2], [2, 1], [1, 3], [3, 1],
                  [true, true, false, false], [true, false, true, false])
-        colssym = names(df[:, cols])
-        hcatdf = hcat(df[:, cols], df, makeunique=true)
+        colssym = names(df[!, cols])
+        hcatdf = hcat(df[!, cols], df[!, Not(cols)])
         nms = names(hcatdf)
         res = unique(df[:, cols])
         res.xmax = [maximum(df[(df[!, colssym[1]] .== a) .& (df[!, colssym[2]] .== b), :x])
@@ -163,7 +163,7 @@ end
         df_comb = combine(identity, gd)
         @test sort(df_comb, colssym) == shcatdf
         df_ref = DataFrame(gd)
-        @test sort(hcat(df_ref[:, cols], df_ref, makeunique=true), colssym) == shcatdf
+        @test sort(hcat(df_ref[!, cols], df_ref[!, Not(cols)]), colssym) == shcatdf
         @test df_ref.x == df_comb.x
         @test combine(f1, gd) == res
         @test combine(f2, gd) == res
@@ -183,7 +183,7 @@ end
         end
         @test combine(identity, gd) == shcatdf
         df_ref = DataFrame(gd)
-        @test hcat(df_ref[:, cols], df_ref, makeunique=true) == shcatdf
+        @test hcat(df_ref[!, cols], df_ref[!, Not(cols)]) == shcatdf
         @test combine(f1, gd) == sres
         @test combine(f2, gd) == sres
         @test rename(combine(f3, gd), :x1 => :xmax) == sres
@@ -342,7 +342,7 @@ end
 
     # Test function returning DataFrameRow
     res = by(d -> DataFrameRow(d, 1, :), df, :x)
-    @test res == DataFrame(x=df.x, x_1=df.x, y=df.y)
+    @test res == DataFrame(x=df.x, y=df.y)
 
     # Test function returning Tuple
     res = by(d -> (sum(d.y),), df, :x)
@@ -893,6 +893,17 @@ Base.isless(::TestType, ::TestType) = false
     end
 end
 
+@testset "combine with columns named like grouping keys" begin
+    df = DataFrame(x=["a", "a", "b", missing], y=1:4)
+    gd = groupby(df, :x)
+    @test combine(identity, gd) ≅ df
+    @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
+
+    gd = groupby(df, :x, skipmissing=true)
+    @test combine(identity, gd) == df[1:3, :]
+    @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
+end
+
 @testset "iteration protocol" begin
     gd = groupby_checked(DataFrame(A = [:A, :A, :B, :B], B = 1:4), :A)
     for v in gd

From 6ebf3d2e86c1db920ed9cd5c70499b3377c5af5b Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Tue, 3 Sep 2019 11:40:02 +0200
Subject: [PATCH 2/3] Add more stringent test

To check that names are used, and not only column indices.
---
 test/grouping.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/grouping.jl b/test/grouping.jl
index 4ae2981c9c..f9445f9607 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -897,10 +897,12 @@ end
     df = DataFrame(x=["a", "a", "b", missing], y=1:4)
     gd = groupby(df, :x)
     @test combine(identity, gd) ≅ df
+    @test combine(d -> d[:, [2, 1]], gd) ≅ df
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
 
     gd = groupby(df, :x, skipmissing=true)
     @test combine(identity, gd) == df[1:3, :]
+    @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :]
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
 end
 

From 9248c88dc642bd1940b66c79998b67fe6bf69c56 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Tue, 3 Sep 2019 21:08:47 +0200
Subject: [PATCH 3/3] Also fix map, add mention in docstring

---
 src/groupeddataframe/grouping.jl | 19 +++++++++++++++++--
 test/grouping.jl                 |  8 +++++++-
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
index 76a84e4aab..2bf5454084 100644
--- a/src/groupeddataframe/grouping.jl
+++ b/src/groupeddataframe/grouping.jl
@@ -212,8 +212,10 @@ If the first argument is a vector, tuple or named tuple of such pairs, each pair
 handled as described above. If a named tuple, field names are used to name
 each generated column.
 
-If the first argument is a callable, it is passed a `SubDataFrame` view for each group,
+If the first argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group,
 and the returned `DataFrame` then consists of the returned rows plus the grouping columns.
+If the returned data frame contains columns with the same names as the grouping columns,
+they are required to be equal.
 Note that this second form is much slower than the first one due to type instability.
 
 `f` can return a single value, a row or multiple rows. The type of the returned value
@@ -297,7 +299,16 @@ See [`by`](@ref) for more examples.
 function Base.map(f::Any, gd::GroupedDataFrame)
     if length(gd) > 0
         idx, valscat = _combine(f, gd)
-        parent = hcat!(gd.parent[idx, gd.cols], valscat, makeunique=true)
+        keys = _names(gd.parent)[gd.cols]
+        for key in keys
+            if hasproperty(valscat, key) &&
+               !isequal(valscat[!, key], view(gd.parent[!, key], idx))
+               throw(ArgumentError("column :$key in returned data frame " *
+                                   "is not equal to grouping key :$key"))
+            end
+        end
+        parent = hcat!(gd.parent[idx, gd.cols],
+                       without(valscat, intersect(keys, _names(valscat))))
         if length(idx) == 0
             return GroupedDataFrame(parent, collect(1:length(gd.cols)), idx,
                                     Int[], Int[], Int[])
@@ -343,6 +354,8 @@ views into these columns.
 
 If the last argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group,
 and the returned `DataFrame` then consists of the returned rows plus the grouping columns.
+If the returned data frame contains columns with the same names as the grouping columns,
+they are required to be equal.
 Note that this second form is much slower than the first one due to type instability.
 A method is defined with `f` as the first argument, so do-block
 notation can be used.
@@ -957,6 +970,8 @@ views into these columns.
 
 If the last argument is a callable `f`, it is passed a [`SubDataFrame`](@ref) view for each group,
 and the returned `DataFrame` then consists of the returned rows plus the grouping columns.
+If the returned data frame contains columns with the same names as the grouping columns,
+they are required to be equal.
 Note that this second form is much slower than the first one due to type instability.
 A method is defined with `f` as the first argument, so do-block
 notation can be used.
diff --git a/test/grouping.jl b/test/grouping.jl
index f9445f9607..72c4b8e494 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -893,17 +893,23 @@ Base.isless(::TestType, ::TestType) = false
     end
 end
 
-@testset "combine with columns named like grouping keys" begin
+@testset "combine and map with columns named like grouping keys" begin
     df = DataFrame(x=["a", "a", "b", missing], y=1:4)
     gd = groupby(df, :x)
     @test combine(identity, gd) ≅ df
     @test combine(d -> d[:, [2, 1]], gd) ≅ df
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
+    @test map(identity, gd) ≅ gd
+    @test map(d -> d[:, [2, 1]], gd) ≅ gd
+    @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
 
     gd = groupby(df, :x, skipmissing=true)
     @test combine(identity, gd) == df[1:3, :]
     @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :]
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
+    @test map(identity, gd) == gd
+    @test map(d -> d[:, [2, 1]], gd) == gd
+    @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
 end
 
 @testset "iteration protocol" begin