JuliaData · bkamins · Mar 25, 2021 · Mar 22, 2021 · Mar 23, 2021 · Mar 23, 2021
diff --git a/NEWS.md b/NEWS.md
@@ -26,6 +26,9 @@
   additional column to be added in the last position in the resulting data frame
   that will identify the source data frame.
   ([#2649](https://github.com/JuliaData/DataFrames.jl/pull/2649))
+* `GroupKey` and `DataFrameRow` now consistently behave like `NamedTuple`
+  in comparisons and implement `hash`, `==`, `isequal`, `<`, and `isless`
+  ([#2669](https://github.com/JuliaData/DataFrames.jl/pull/2669))
 
 ## Deprecated
 

diff --git a/src/dataframerow/dataframerow.jl b/src/dataframerow/dataframerow.jl
@@ -455,55 +455,57 @@ Base.merge(a::DataFrameRow, b::DataFrameRow) = merge(NamedTuple(a), NamedTuple(b
 Base.merge(a::DataFrameRow, b::Base.Iterators.Pairs) = merge(NamedTuple(a), b)
 Base.merge(a::DataFrameRow, itr) = merge(NamedTuple(a), itr)
 
-# hash of DataFrame rows based on its values
-# so that duplicate rows would have the same hash
-# table columns are passed as a tuple of vectors to ensure type specialization
-rowhash(cols::Tuple{AbstractVector}, r::Int, h::UInt = zero(UInt))::UInt =
-    hash(cols[1][r], h)
-function rowhash(cols::Tuple{Vararg{AbstractVector}}, r::Int, h::UInt = zero(UInt))::UInt
-    h = hash(cols[1][r], h)
-    rowhash(Base.tail(cols), r, h)
-end
-
-Base.hash(r::DataFrameRow, h::UInt = zero(UInt)) =
-    rowhash(ntuple(col -> parent(r)[!, parentcols(index(r), col)], length(r)), row(r), h)
-
-function Base.:(==)(r1::DataFrameRow, r2::DataFrameRow)
-    if parent(r1) === parent(r2)
-        parentcols(index(r1)) == parentcols(index(r2)) || return false
-        row(r1) == row(r2) && return true
-    else
-        _names(r1) == _names(r2) || return false
+Base.hash(r::DataFrameRow, h::UInt) = _nt_like_hash(r, h)
+
+_getnames(x::DataFrameRow) = _names(x)
+_getnames(x::NamedTuple) = propertynames(x)
+
+# this is required as == does not allow for comparison between tuples and vectors
+function _equal_names(r1, r2)
+    n1 = _getnames(r1)
+    n2 = _getnames(r2)
+    length(n1) == length(n2) || return false
+    for (a, b) in zip(n1, n2)
+        a == b || return false
     end
-    all(((a, b),) -> a == b, zip(r1, r2))
+    return true
 end
 
-function Base.isequal(r1::DataFrameRow, r2::DataFrameRow)
-    if parent(r1) === parent(r2)
-        parentcols(index(r1)) == parentcols(index(r2)) || return false
-        row(r1) == row(r2) && return true
-    else
-        _names(r1) == _names(r2) || return false
+for eqfun in (:isequal, :(==)),
+    (leftarg, rightarg) in ((:DataFrameRow, :DataFrameRow),
+                            (:DataFrameRow, :NamedTuple),
+                            (:NamedTuple, :DataFrameRow))
+    @eval function Base.$eqfun(r1::$leftarg, r2::$rightarg)
+        _equal_names(r1, r2) || return false
+        return all(((a, b),) -> $eqfun(a, b), zip(r1, r2))
     end
-    all(((a, b),) -> isequal(a, b), zip(r1, r2))
 end
 
-# lexicographic ordering on DataFrame rows, missing > !missing
-function Base.isless(r1::DataFrameRow, r2::DataFrameRow)
-    length(r1) == length(r2) ||
-        throw(ArgumentError("compared DataFrameRows must have the same number " *
-                            "of columns (got $(length(r1)) and $(length(r2)))"))
-    if _names(r1) != _names(r2)
-        mismatch = findfirst(i -> _names(r1)[i] != _names(r2)[i], 1:length(r1))
-        throw(ArgumentError("compared DataFrameRows must have the same colum " *
-                            "names but they differ in column number $mismatch " *
-                            "where the names are :$(names(r1)[mismatch]) and " *
-                            ":$(_names(r2)[mismatch]) respectively"))
-    end
-    for (a, b) in zip(r1, r2)
-        isequal(a, b) || return isless(a, b)
+for (eqfun, cmpfun) in ((:isequal, :isless), (:(==), :(<))),
+    (leftarg, rightarg) in ((:DataFrameRow, :DataFrameRow),
+                            (:DataFrameRow, :NamedTuple),
+                            (:NamedTuple, :DataFrameRow))
+    @eval function Base.$cmpfun(r1::$leftarg, r2::$rightarg)
+        if !_equal_names(r1, r2)
+            length(r1) == length(r2) ||
+                throw(ArgumentError("compared objects must have the same number " *
+                                    "of columns (got $(length(r1)) and $(length(r2)))"))
+            mismatch = findfirst(i -> _getnames(r1)[i] != _getnames(r2)[i], 1:length(r1))
+            throw(ArgumentError("compared objects must have the same property " *
+                                "names but they differ in column number $mismatch " *
+                                "where the names are :$(_getnames(r1)[mismatch]) and " *
+                                ":$(_getnames(r2)[mismatch]) respectively"))
+        end
+        for (a, b) in zip(r1, r2)
+            eq = $eqfun(a, b)
+            if ismissing(eq)
+                return missing
+            elseif !eq
+                return $cmpfun(a, b)
+            end
+        end
+        return false # here we know that r1 and r2 have equal lengths and all values were equal
     end
-    return false
 end
 
 function DataFrame(dfr::DataFrameRow)

diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl
@@ -440,57 +440,3 @@ function compute_indices(groups::AbstractVector{<:Integer}, ngroups::Integer)
 
     return rperm, starts, stops
 end
-
-# Build RowGroupDict for a given DataFrame, using all of its columns as grouping keys
-function group_rows(df::AbstractDataFrame)
-    groups = Vector{Int}(undef, nrow(df))
-    ngroups, rhashes, gslots, sorted =
-        row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), groups, false, false)
-    rperm, starts, stops = compute_indices(groups, ngroups)
-    return RowGroupDict(df, rhashes, gslots, groups, rperm, starts, stops)
-end
-
-# Find index of a row in gd that matches given row by content, 0 if not found
-function findrow(gd::RowGroupDict,
-                 df::AbstractDataFrame,
-                 gd_cols::Tuple{Vararg{AbstractVector}},
-                 df_cols::Tuple{Vararg{AbstractVector}},
-                 row::Int)
-    (gd.df === df) && return row # same table, return itself
-    # different tables, content matching required
-    rhash = rowhash(df_cols, row)
-    szm1 = length(gd.gslots)-1
-    slotix = ini_slotix = rhash & szm1 + 1
-    while true
-        g_row = gd.gslots[slotix]
-        if g_row == 0 || # not found
-            (rhash == gd.rhashes[g_row] &&
-            isequal_row(gd_cols, g_row, df_cols, row)) # found
-            return g_row
-        end
-        slotix = (slotix & szm1) + 1 # miss, try the next slot
-        (slotix == ini_slotix) && break
-    end
-    return 0 # not found
-end
-
-# Find indices of rows in 'gd' that match given row by content.
-# return empty set if no row matches
-function findrows(gd::RowGroupDict,
-                  df::AbstractDataFrame,
-                  gd_cols::Tuple{Vararg{AbstractVector}},
-                  df_cols::Tuple{Vararg{AbstractVector}},
-                  row::Int)
-    g_row = findrow(gd, df, gd_cols, df_cols, row)
-    (g_row == 0) && return view(gd.rperm, 0:-1)
-    gix = gd.groups[g_row]
-    return view(gd.rperm, gd.starts[gix]:gd.stops[gix])
-end
-
-function Base.getindex(gd::RowGroupDict, dfr::DataFrameRow)
-    g_row = findrow(gd, parent(dfr), ntuple(i -> gd.df[!, i], ncol(gd.df)),
-                    ntuple(i -> parent(dfr)[!, i], ncol(parent(dfr))), row(dfr))
-    (g_row == 0) && throw(KeyError(dfr))
-    gix = gd.groups[g_row]
-    return view(gd.rperm, gd.starts[gix]:gd.stops[gix])
-end
diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
@@ -521,6 +521,51 @@ end
 
 Base.getproperty(key::GroupKey, p::AbstractString) = getproperty(key, Symbol(p))
 
+Base.hash(key::GroupKey, h::UInt) = _nt_like_hash(key, h) # same helper as for DataFrameRow
+
+_getnames(x::GroupKey) = parent(x).cols # field names of the key; consumed by _equal_names
+
+for eqfun in (:isequal, :(==)),
+    (leftarg, rightarg) in ((:GroupKey, :GroupKey),
+                            (:DataFrameRow, :GroupKey),
+                            (:GroupKey, :DataFrameRow),
+                            (:NamedTuple, :GroupKey),
+                            (:GroupKey, :NamedTuple))
+    @eval function Base.$eqfun(k1::$leftarg, k2::$rightarg) # one method per (eqfun, pair)
+        _equal_names(k1, k2) || return false # different names or lengths: never equal
+        return all(((a, b),) -> $eqfun(a, b), zip(k1, k2)) # then compare values pairwise
+    end
+end
+
+for (eqfun, cmpfun) in ((:isequal, :isless), (:(==), :(<))),
+    (leftarg, rightarg) in ((:GroupKey, :GroupKey),
+                            (:DataFrameRow, :GroupKey),
+                            (:GroupKey, :DataFrameRow),
+                            (:NamedTuple, :GroupKey),
+                            (:GroupKey, :NamedTuple))
+    @eval function Base.$cmpfun(k1::$leftarg, k2::$rightarg)
+        if !_equal_names(k1, k2) # mismatching names always throw; only the message varies
+            length(k1) == length(k2) ||
+                throw(ArgumentError("compared objects must have the same number " *
+                                    "of columns (got $(length(k1)) and $(length(k2)))"))
+            mismatch = findfirst(i -> _getnames(k1)[i] != _getnames(k2)[i], 1:length(k1))
+            throw(ArgumentError("compared objects must have the same column " *
+                                "names but they differ in column number $mismatch " *
+                                "where the names are :$(_getnames(k1)[mismatch]) and " *
+                                ":$(_getnames(k2)[mismatch]) respectively"))
+        end
+        for (a, b) in zip(k1, k2)
+            eq = $eqfun(a, b)
+            if ismissing(eq)
+                return missing # equality undecidable at this position; propagate missing
+            elseif !eq
+                return $cmpfun(a, b) # the first differing position decides the order
+            end
+        end
+        return false # here we know that k1 and k2 have equal lengths and all values were equal
+    end
+end
+
 function Base.NamedTuple(key::GroupKey)
     N = NamedTuple{Tuple(parent(key).cols)}
     N(_groupvalues(parent(key), getfield(key, :idx)))

diff --git a/src/other/precompile.jl b/src/other/precompile.jl
@@ -1608,7 +1608,6 @@ function precompile(all=false)
         Base.precompile(Tuple{typeof(DataFrames._unstack),DataFrame,Array{Int,1},Int,GroupedDataFrame{DataFrame},Array{Any,1},GroupedDataFrame{DataFrame},Function,Bool,Bool})
         Base.precompile(Tuple{typeof(DataFrames._combine_multicol),String,Function,GroupedDataFrame{DataFrame},Nothing})
         Base.precompile(Tuple{DataFrames.Reduce{typeof(min),Nothing,Nothing},Array{Union{Missing, BigFloat},1},GroupedDataFrame{DataFrame}})
-        Base.precompile(Tuple{typeof(DataFrames.rowhash),Tuple{Array{Symbol,1}},Int,UInt})
         Base.precompile(Tuple{typeof(combine),GroupedDataFrame{DataFrame},Pair{InvertedIndex{Symbol},ByRow{typeof(/)}}})
         Base.precompile(Tuple{typeof(transform),DataFrame,Any,Any})
         Base.precompile(Tuple{typeof(DataFrames._combine_process_pair_symbol),Bool,GroupedDataFrame{DataFrame},Dict{Symbol,Tuple{Bool,Int}},Array{DataFrames.TransformationResult,1},Nothing,Symbol,Bool,Base.RefValue{SubArray{Int,1,Array{Int,1},Tuple{Array{Int,1}},false}},Union{Function, Type},Tuple{Array{Int,1}}})
@@ -2634,7 +2633,6 @@ function precompile(all=false)
         Base.precompile(Tuple{Core.kwftype(typeof(DataFrames.Type)),NamedTuple{(:a,),Tuple{Int}},Type{DataFrame}})
         Base.precompile(Tuple{Core.kwftype(typeof(DataFrames.Type)),NamedTuple{(:a, :b, :v1),Tuple{Array{Union{Missing, Symbol},1},Array{Union{Missing, Symbol},1},UnitRange{Int}}},Type{DataFrame}})
         Base.precompile(Tuple{typeof(getindex),DataFrame,Int,All{Tuple{}}})
-        Base.precompile(Tuple{typeof(DataFrames.group_rows),DataFrame})
         Base.precompile(Tuple{typeof(DataFrames._combine_process_pair_symbol),Bool,GroupedDataFrame{DataFrame},Dict{Symbol,Tuple{Bool,Int}},Array{DataFrames.TransformationResult,1},Nothing,Symbol,Bool,Complex{Float64},Union{Function, Type},Tuple{Array{Complex{Float64},1}}})
         Base.precompile(Tuple{typeof(push!),DataFrame,Tuple{Int,Char}})
         Base.precompile(Tuple{Core.kwftype(typeof(DataFrames.leftjoin)),NamedTuple{(:on,),Tuple{Array{Pair{Symbol,String},1}}},typeof(leftjoin),DataFrame,DataFrame})

diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -128,3 +128,14 @@ function tforeach(f, x::AbstractArray; basesize::Integer)
     end
     return
 end
+
+function _nt_like_hash(v, h::UInt)
+    length(v) == 0 && return hash(NamedTuple(), h)
+    # Non-empty: fold values last-to-first, seeded with the hash of an empty tuple.
+    h = hash((), h)
+    for i in length(v):-1:1
+        h = hash(v[i], h)
+    end
+    # Mix in the names; tests expect equality with the hash of the matching NamedTuple.
+    return xor(objectid(Tuple(propertynames(v))), h)
+end
diff --git a/test/dataframerow.jl b/test/dataframerow.jl
@@ -171,7 +171,7 @@ end
     @test_throws ArgumentError df[1, 1:2] < df[1, 2:3]
 end
 
-@testset "hashing" begin
+@testset "hashing of DataFrameRow and GroupKey" begin
     df = deepcopy(ref_df)
 
     @test hash(DataFrameRow(df, 1, :)) != hash(DataFrameRow(df, 2, :))
@@ -180,33 +180,17 @@ end
     @test hash(DataFrameRow(df, 2, :)) == hash(DataFrameRow(df, 5, :))
     @test hash(DataFrameRow(df, 2, :)) != hash(DataFrameRow(df, 6, :))
 
-    # check that hashrows() function generates the same hashes as DataFrameRow
-    df_rowhashes, _ = DataFrames.hashrows(Tuple(eachcol(df)), false)
-    @test df_rowhashes == [hash(dr) for dr in eachrow(df)]
-end
-
-@testset "grouping" begin
-    # test RowGroupDict
-    Random.seed!(1234)
-    df1 = DataFrame(d1=rand(1:2, 1000))
-    df2 = DataFrame(d1=[2, 3])
-
-    # test_group("group_rows")
-    gd = DataFrames.group_rows(df1)
-    @test length(unique(gd.groups)) == 2
-
-    # getting groups for the rows of the other frames
-    @test length(gd[DataFrameRow(df2, 1, :)]) > 0
-    @test_throws KeyError gd[DataFrameRow(df2, 2, :)]
-    @test isempty(DataFrames.findrows(gd, df2, (gd.df[!, 1],), (df2[!, 1],), 2))
+    df = DataFrame(reshape(1:24, 6, 4), :auto)
+    df.x2 = string.(df.x2)
+    df.x3 = categorical(df.x3)
+    df.x4 = Float64.(df.x4)
+    gks = keys(groupby(df, :))
 
-    # grouping empty frame
-    gd = DataFrames.group_rows(DataFrame(x=Int[]))
-    @test length(unique(gd.groups)) == 0
-
-    # grouping single row
-    gd = DataFrames.group_rows(df1[1:1, :])
-    @test length(unique(gd.groups)) == 1
+    for i in axes(df, 1), h in UInt(0):UInt(10)
+        @test hash(DataFrameRow(df, i, :), h) ==
+              hash(gks[i], h) ==
+              hash(NamedTuple(DataFrameRow(df, i, :)), h)
+    end
 end
 
 @testset "getproperty, setproperty! and propertynames" begin
@@ -587,4 +571,83 @@ end
     @test_throws ArgumentError r .+ 1
 end
 
+@testset "comparison tests: DataFrameRow, NamedTuple and GroupKey" begin
+    df = DataFrame(a=[1, 2], b=[missing, 3])
+    dfr = [df[1, :], df[2, :], df[1, 1:1], df[2, 1:1]] # rows 1-2 in full, then column :a only
+    nt = NamedTuple.(dfr)
+    gk = [keys(groupby(df, [:a, :b])); keys(groupby(df, :a))] # same values as group keys
+
+    for l in (dfr[1], nt[1], gk[1]), r in (dfr[1], nt[1], gk[1]) # (a=1, b=missing) each form
+        @test ismissing(l == r)
+        @test ismissing(l == (a=1, b=2))
+        @test l ≅ r
+        @test l ≇ (a=1, b=2)
+        # work around https://github.com/JuliaLang/julia/pull/40147
+        if !(l isa NamedTuple && r isa NamedTuple)
+            @test ismissing(l < r)
+        end
+        @test !isless(l, r)
+    end
+
+    for i in 2:4, l in (dfr, nt, gk), r in (dfr, nt, gk) # entries 2:4 hold no missing values
+        @test l[i] == r[i]
+        @test l[1] != l[i]
+        @test l[1] != r[i]
+        @test l[i] ≅ r[i]
+        @test l[1] ≇ l[i]
+        @test l[1] ≇ r[i]
+
+        @test !(l[i] < r[i])
+        @test !isless(l[i], r[i])
+
+        if i > 2 # entries 3 and 4 have one column, so ordering against entry 1 must throw
+            if l[1] isa NamedTuple && r[i] isa NamedTuple
+                @test_throws MethodError l[1] < r[i]
+                @test_throws MethodError isless(l[1], r[i])
+            else
+                @test_throws ArgumentError l[1] < r[i]
+                @test_throws ArgumentError isless(l[1], r[i])
+            end
+        end
+    end
+
+    for l in (dfr, nt, gk), r in (dfr, nt, gk) # ordering across every mix of the three types
+        @test l[1] < r[2]
+        @test isless(l[1], r[2])
+        @test !(l[2] < r[1])
+        @test !isless(l[2], r[1])
+
+        @test l[3] < r[4]
+        @test isless(l[3], r[4])
+        @test !(l[4] < r[3])
+        @test !isless(l[4], r[3])
+    end
+
+    @test !(dfr[1] == (x=1, b=missing)) # differing names: unequal, never missing
+    @test !(gk[1] == (x=1, b=missing))
+    @test !(dfr[1] ≅ (x=1, b=missing))
+    @test !(gk[1] ≅ (x=1, b=missing))
+
+    @test_throws ArgumentError dfr[1] < (x=1, b=missing) # differing names: ordering throws
+    @test_throws ArgumentError gk[1] < (x=1, b=missing)
+    @test_throws ArgumentError isless(dfr[1], (x=1, b=missing))
+    @test_throws ArgumentError isless(gk[1], (x=1, b=missing))
+
+    df2 = DataFrame(a=1, b=missing)
+    df3 = DataFrame(a=2, b=1)
+    dfr2 = df2[1, :]
+    dfr3 = df3[1, :]
+    nt2 = NamedTuple(dfr2)
+    nt3 = NamedTuple(dfr3)
+    gk2 = keys(groupby(df2, [:a, :b]))[1]
+    gk3 = keys(groupby(df3, [:a, :b]))[1]
+
+    for a in (dfr2, nt2, gk2), b in (dfr3, nt3, gk3) # :a differs (1 vs 2) before the missing
+        @test !(a == b)
+        @test !(a ≅ b)
+        @test a < b
+        @test isless(a, b)
+    end
+end
+
 end # module