Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add ==, isequal, <, and isless for DataFrameRow and GroupKey #2669

Merged
merged 16 commits into from
Mar 25, 2021
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
additional column to be added in the last position in the resulting data frame
that will identify the source data frame.
([#2649](https://github.com/JuliaData/DataFrames.jl/pull/2649))
* `GroupKey` and `DataFrameRow` are consistently behaving like `NamedTuple`
in comparisons and they now implement: `hash`, `==`, `isequal`, `<`, `isless`
([#2669](https://github.com/JuliaData/DataFrames.jl/pull/2669))

## Deprecated

Expand Down
86 changes: 44 additions & 42 deletions src/dataframerow/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -455,55 +455,57 @@ Base.merge(a::DataFrameRow, b::DataFrameRow) = merge(NamedTuple(a), NamedTuple(b
# Merging a DataFrameRow with key/value pairs: the row is first converted to a
# NamedTuple and the work is delegated to `merge` for that NamedTuple.
function Base.merge(a::DataFrameRow, b::Base.Iterators.Pairs)
    return merge(NamedTuple(a), b)
end

# Fallback for any other second argument: same conversion, same delegation.
function Base.merge(a::DataFrameRow, itr)
    return merge(NamedTuple(a), itr)
end

# Hash of row `r` of a table, computed from the values stored in that row, so
# that duplicate rows end up with the same hash.
# The table columns are passed as a tuple of vectors to ensure type specialization.
# `h` is the seed; the value of each column at row `r` is folded into it from the
# first column to the last.

# Base case: exactly one column left.
function rowhash(cols::Tuple{AbstractVector}, r::Int, h::UInt = zero(UInt))::UInt
    return hash(cols[1][r], h)
end

# Recursive case: mix in the first column, then continue with the remaining ones.
function rowhash(cols::Tuple{Vararg{AbstractVector}}, r::Int, h::UInt = zero(UInt))::UInt
    seeded = hash(cols[1][r], h)
    return rowhash(Base.tail(cols), r, seeded)
end

# Hash a DataFrameRow from its values: collect the parent columns selected by the
# row's index and hash the row position `row(r)` across them with `rowhash`.
function Base.hash(r::DataFrameRow, h::UInt = zero(UInt))
    source = parent(r)
    idx = index(r)
    cols = ntuple(col -> source[!, parentcols(idx, col)], length(r))
    return rowhash(cols, row(r), h)
end

function Base.:(==)(r1::DataFrameRow, r2::DataFrameRow)
if parent(r1) === parent(r2)
parentcols(index(r1)) == parentcols(index(r2)) || return false
row(r1) == row(r2) && return true
else
_names(r1) == _names(r2) || return false
# Hashing of a DataFrameRow is delegated to the shared `_nt_like_hash` helper.
function Base.hash(r::DataFrameRow, h::UInt)
    return _nt_like_hash(r, h)
end

# Names of the fields of `x`, used by the comparison helpers.
# For a DataFrameRow this forwards to `_names`.
function _getnames(x::DataFrameRow)
    return _names(x)
end

# For a NamedTuple this forwards to `propertynames`.
function _getnames(x::NamedTuple)
    return propertynames(x)
end

# Return `true` when `r1` and `r2` report the same names, in the same order,
# according to `_getnames`, and `false` otherwise.
#
# This is required as == does not allow for comparison between tuples and vectors
# (a tuple of names is never `==` to a vector of names), so the names are
# compared element by element instead.
function _equal_names(r1, r2)
    n1 = _getnames(r1)
    n2 = _getnames(r2)
    # different number of names: cannot be equal, and `zip` below would truncate
    length(n1) == length(n2) || return false
    for (a, b) in zip(n1, n2)
        a == b || return false
    end
    # only names are compared here; values are compared by the callers
    return true
end

function Base.isequal(r1::DataFrameRow, r2::DataFrameRow)
if parent(r1) === parent(r2)
parentcols(index(r1)) == parentcols(index(r2)) || return false
row(r1) == row(r2) && return true
else
_names(r1) == _names(r2) || return false
# Generate `isequal` and `==` methods for every listed pair of argument types
# (DataFrameRow with DataFrameRow, and DataFrameRow with NamedTuple in either order).
# Each generated method returns `false` when the names differ (see `_equal_names`);
# otherwise it compares the values pairwise with the same function, so `==` may
# return `missing` when `all` does.
for eqfun in (:isequal, :(==)),
    (leftarg, rightarg) in ((:DataFrameRow, :DataFrameRow),
                            (:DataFrameRow, :NamedTuple),
                            (:NamedTuple, :DataFrameRow))
    @eval function Base.$eqfun(r1::$leftarg, r2::$rightarg)
        _equal_names(r1, r2) || return false
        return all(((a, b),) -> $eqfun(a, b), zip(r1, r2))
    end
end

# lexicographic ordering on DataFrame rows, missing > !missing
function Base.isless(r1::DataFrameRow, r2::DataFrameRow)
length(r1) == length(r2) ||
throw(ArgumentError("compared DataFrameRows must have the same number " *
"of columns (got $(length(r1)) and $(length(r2)))"))
if _names(r1) != _names(r2)
mismatch = findfirst(i -> _names(r1)[i] != _names(r2)[i], 1:length(r1))
throw(ArgumentError("compared DataFrameRows must have the same colum " *
"names but they differ in column number $mismatch " *
"where the names are :$(names(r1)[mismatch]) and " *
":$(_names(r2)[mismatch]) respectively"))
end
for (a, b) in zip(r1, r2)
isequal(a, b) || return isless(a, b)
# Generate lexicographic ordering methods for every listed pair of argument types:
# `isless` (which uses `isequal` to detect ties) and `<` (which uses `==`).
#
# Each generated method:
# - throws `ArgumentError` when the two arguments do not have the same names
#   (reporting either the differing lengths or the first differing position);
# - walks the values pairwise and returns the comparison of the first pair that is
#   not equal;
# - returns `missing` as soon as the equality check itself yields `missing`;
# - returns `false` when every pair of values is equal.
for (eqfun, cmpfun) in ((:isequal, :isless), (:(==), :(<))),
    (leftarg, rightarg) in ((:DataFrameRow, :DataFrameRow),
                            (:DataFrameRow, :NamedTuple),
                            (:NamedTuple, :DataFrameRow))
    @eval function Base.$cmpfun(r1::$leftarg, r2::$rightarg)
        if !_equal_names(r1, r2)
            length(r1) == length(r2) ||
                throw(ArgumentError("compared objects must have the same number " *
                                    "of columns (got $(length(r1)) and $(length(r2)))"))
            # lengths match, so at least one name must differ: report the first one
            mismatch = findfirst(i -> _getnames(r1)[i] != _getnames(r2)[i], 1:length(r1))
            throw(ArgumentError("compared objects must have the same property " *
                                "names but they differ in column number $mismatch " *
                                "where the names are :$(_getnames(r1)[mismatch]) and " *
                                ":$(_getnames(r2)[mismatch]) respectively"))
        end
        for (a, b) in zip(r1, r2)
            eq = $eqfun(a, b)
            if ismissing(eq)
                return missing
            elseif !eq
                return $cmpfun(a, b)
            end
        end
        return false # here we know that r1 and r2 have equal lengths and all values were equal
    end
end

function DataFrame(dfr::DataFrameRow)
Expand Down
54 changes: 0 additions & 54 deletions src/dataframerow/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -440,57 +440,3 @@ function compute_indices(groups::AbstractVector{<:Integer}, ngroups::Integer)

return rperm, starts, stops
end

# Build RowGroupDict for a given DataFrame, using all of its columns as grouping keys
function group_rows(df::AbstractDataFrame)
groups = Vector{Int}(undef, nrow(df))
ngroups, rhashes, gslots, sorted =
row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), groups, false, false)
rperm, starts, stops = compute_indices(groups, ngroups)
return RowGroupDict(df, rhashes, gslots, groups, rperm, starts, stops)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think RowGroupDict can also be removed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed - I am running tests to double check. It is astonishing how much we have reworked internally in this release.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK - all seems clean. I will merge the PR after CI passes.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that this file contains only grouping code, we will be able to move it to the corresponding folder and rename it without touching anything else. :-)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK will move it in this PR as otherwise we will forget. Also nonunique uses it, but I think it is not a problem.

end

# Find index of a row in gd that matches given row by content, 0 if not found
#
# Arguments:
# - `gd`: row-group dictionary; its `df`, `gslots` and `rhashes` fields are read here
# - `df`: the table that `row` refers to
# - `gd_cols`, `df_cols`: tuples of column vectors handed to `isequal_row`
#   (`df_cols` is also what `rowhash` hashes)
# - `row`: index of the row to look up
#
# Returns `row` itself when `df` is the very same object as `gd.df`; otherwise the
# row index stored in the first probed slot that matches, or 0 when nothing matches.
function findrow(gd::RowGroupDict,
df::AbstractDataFrame,
gd_cols::Tuple{Vararg{AbstractVector}},
df_cols::Tuple{Vararg{AbstractVector}},
row::Int)
(gd.df === df) && return row # same table, return itself
# different tables, content matching required
rhash = rowhash(df_cols, row)
# NOTE(review): using `szm1` as a bit mask presumably relies on length(gd.gslots)
# being a power of two — TODO confirm where gslots is allocated
szm1 = length(gd.gslots)-1
# `&` binds tighter than `+` in Julia, so this is (rhash & szm1) + 1: a 1-based slot
slotix = ini_slotix = rhash & szm1 + 1
while true
g_row = gd.gslots[slotix]
# a slot holding 0 ends the search; g_row (== 0) is then returned as "not found"
if g_row == 0 || # not found
(rhash == gd.rhashes[g_row] &&
isequal_row(gd_cols, g_row, df_cols, row)) # found
return g_row
end
slotix = (slotix & szm1) + 1 # miss, try the next slot
# stop once the probe has wrapped around to the starting slot
(slotix == ini_slotix) && break
end
return 0 # not found
end

# Find indices of rows in 'gd' that match given row by content.
# The result is always a view into `gd.rperm`: an empty one (range 0:-1) when
# `findrow` reports no match, otherwise the start:stop span of the matched
# row's group.
function findrows(gd::RowGroupDict,
                  df::AbstractDataFrame,
                  gd_cols::Tuple{Vararg{AbstractVector}},
                  df_cols::Tuple{Vararg{AbstractVector}},
                  row::Int)
    matched = findrow(gd, df, gd_cols, df_cols, row)
    if matched == 0
        return view(gd.rperm, 0:-1)
    end
    group = gd.groups[matched]
    first_pos = gd.starts[group]
    last_pos = gd.stops[group]
    return view(gd.rperm, first_pos:last_pos)
end

# Look up the group of rows in `gd` whose content matches `dfr`.
# Returns a view into `gd.rperm` covering the matched group and throws
# `KeyError(dfr)` when `findrow` reports no match.
function Base.getindex(gd::RowGroupDict, dfr::DataFrameRow)
    source = parent(dfr)
    gd_cols = ntuple(i -> gd.df[!, i], ncol(gd.df))
    source_cols = ntuple(i -> source[!, i], ncol(source))
    matched = findrow(gd, source, gd_cols, source_cols, row(dfr))
    if matched == 0
        throw(KeyError(dfr))
    end
    group = gd.groups[matched]
    return view(gd.rperm, gd.starts[group]:gd.stops[group])
end
45 changes: 45 additions & 0 deletions src/groupeddataframe/groupeddataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,51 @@ end

# String property access on a GroupKey: convert the name to a Symbol and reuse
# `getproperty` for that Symbol.
function Base.getproperty(key::GroupKey, p::AbstractString)
    return getproperty(key, Symbol(p))
end

# Hashing of a GroupKey is delegated to the shared `_nt_like_hash` helper.
function Base.hash(key::GroupKey, h::UInt)
    return _nt_like_hash(key, h)
end

# Names of a GroupKey: the `cols` field of its parent.
function _getnames(x::GroupKey)
    return parent(x).cols
end

# Generate `isequal` and `==` methods for every listed pairing that involves a
# GroupKey (with another GroupKey, a DataFrameRow or a NamedTuple, in either order).
# Names are checked first with `_equal_names`; only when they agree are the values
# compared pairwise using the same function.
for op in (:isequal, :(==)),
    (lhs, rhs) in ((:GroupKey, :GroupKey),
                   (:DataFrameRow, :GroupKey),
                   (:GroupKey, :DataFrameRow),
                   (:NamedTuple, :GroupKey),
                   (:GroupKey, :NamedTuple))
    @eval function Base.$op(k1::$lhs, k2::$rhs)
        if !_equal_names(k1, k2)
            return false
        end
        return all(((a, b),) -> $op(a, b), zip(k1, k2))
    end
end

# Generate lexicographic ordering methods for every listed pairing that involves a
# GroupKey: `isless` (ties detected with `isequal`) and `<` (ties detected with `==`).
# A generated method throws `ArgumentError` when the names of the two arguments
# differ, returns `missing` when a tie check yields `missing`, returns the
# comparison of the first non-equal pair of values, and `false` when all are equal.
for (tieop, ordop) in ((:isequal, :isless), (:(==), :(<))),
    (lhs, rhs) in ((:GroupKey, :GroupKey),
                   (:DataFrameRow, :GroupKey),
                   (:GroupKey, :DataFrameRow),
                   (:NamedTuple, :GroupKey),
                   (:GroupKey, :NamedTuple))
    @eval function Base.$ordop(k1::$lhs, k2::$rhs)
        if !_equal_names(k1, k2)
            if length(k1) != length(k2)
                throw(ArgumentError("compared objects must have the same number " *
                                    "of columns (got $(length(k1)) and $(length(k2)))"))
            end
            # same length but different names: locate the first differing position
            mismatch = findfirst(i -> _getnames(k1)[i] != _getnames(k2)[i], 1:length(k1))
            throw(ArgumentError("compared objects must have the same column " *
                                "names but they differ in column number $mismatch " *
                                "where the names are :$(_getnames(k1)[mismatch]) and " *
                                ":$(_getnames(k2)[mismatch]) respectively"))
        end
        for (a, b) in zip(k1, k2)
            tie = $tieop(a, b)
            ismissing(tie) && return missing
            tie || return $ordop(a, b)
        end
        # at this point k1 and k2 have equal lengths and all values were equal
        return false
    end
end

function Base.NamedTuple(key::GroupKey)
N = NamedTuple{Tuple(parent(key).cols)}
N(_groupvalues(parent(key), getfield(key, :idx)))
Expand Down
2 changes: 0 additions & 2 deletions src/other/precompile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1608,7 +1608,6 @@ function precompile(all=false)
Base.precompile(Tuple{typeof(DataFrames._unstack),DataFrame,Array{Int,1},Int,GroupedDataFrame{DataFrame},Array{Any,1},GroupedDataFrame{DataFrame},Function,Bool,Bool})
Base.precompile(Tuple{typeof(DataFrames._combine_multicol),String,Function,GroupedDataFrame{DataFrame},Nothing})
Base.precompile(Tuple{DataFrames.Reduce{typeof(min),Nothing,Nothing},Array{Union{Missing, BigFloat},1},GroupedDataFrame{DataFrame}})
Base.precompile(Tuple{typeof(DataFrames.rowhash),Tuple{Array{Symbol,1}},Int,UInt})
Base.precompile(Tuple{typeof(combine),GroupedDataFrame{DataFrame},Pair{InvertedIndex{Symbol},ByRow{typeof(/)}}})
Base.precompile(Tuple{typeof(transform),DataFrame,Any,Any})
Base.precompile(Tuple{typeof(DataFrames._combine_process_pair_symbol),Bool,GroupedDataFrame{DataFrame},Dict{Symbol,Tuple{Bool,Int}},Array{DataFrames.TransformationResult,1},Nothing,Symbol,Bool,Base.RefValue{SubArray{Int,1,Array{Int,1},Tuple{Array{Int,1}},false}},Union{Function, Type},Tuple{Array{Int,1}}})
Expand Down Expand Up @@ -2634,7 +2633,6 @@ function precompile(all=false)
Base.precompile(Tuple{Core.kwftype(typeof(DataFrames.Type)),NamedTuple{(:a,),Tuple{Int}},Type{DataFrame}})
Base.precompile(Tuple{Core.kwftype(typeof(DataFrames.Type)),NamedTuple{(:a, :b, :v1),Tuple{Array{Union{Missing, Symbol},1},Array{Union{Missing, Symbol},1},UnitRange{Int}}},Type{DataFrame}})
Base.precompile(Tuple{typeof(getindex),DataFrame,Int,All{Tuple{}}})
Base.precompile(Tuple{typeof(DataFrames.group_rows),DataFrame})
Base.precompile(Tuple{typeof(DataFrames._combine_process_pair_symbol),Bool,GroupedDataFrame{DataFrame},Dict{Symbol,Tuple{Bool,Int}},Array{DataFrames.TransformationResult,1},Nothing,Symbol,Bool,Complex{Float64},Union{Function, Type},Tuple{Array{Complex{Float64},1}}})
Base.precompile(Tuple{typeof(push!),DataFrame,Tuple{Int,Char}})
Base.precompile(Tuple{Core.kwftype(typeof(DataFrames.leftjoin)),NamedTuple{(:on,),Tuple{Array{Pair{Symbol,String},1}}},typeof(leftjoin),DataFrame,DataFrame})
Expand Down
11 changes: 11 additions & 0 deletions src/other/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,14 @@ function tforeach(f, x::AbstractArray; basesize::Integer)
end
return
end

# Hash `v` from its values and its property names, starting from seed `h`.
#
# `v` must support `length`, integer indexing `v[i]` and `propertynames`.
# An empty `v` hashes exactly like an empty NamedTuple. Otherwise the values are
# folded into the hash from the last one to the first, starting from the hash of
# the empty tuple, and the result is combined with the identity of the tuple of
# property names.
# NOTE(review): this is designed to agree with `hash(NamedTuple(v), h)` as
# implemented in Base — confirm against the Julia versions being supported.
function _nt_like_hash(v, h::UInt)
    length(v) == 0 && return hash(NamedTuple(), h)

    h = hash((), h)
    # last value first, so that the first value is mixed in last
    for i in length(v):-1:1
        h = hash(v[i], h)
    end

    return xor(objectid(Tuple(propertynames(v))), h)
end
117 changes: 90 additions & 27 deletions test/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ end
@test_throws ArgumentError df[1, 1:2] < df[1, 2:3]
end

@testset "hashing" begin
@testset "hashing of DataFrameRow and GroupKey" begin
df = deepcopy(ref_df)

@test hash(DataFrameRow(df, 1, :)) != hash(DataFrameRow(df, 2, :))
Expand All @@ -180,33 +180,17 @@ end
@test hash(DataFrameRow(df, 2, :)) == hash(DataFrameRow(df, 5, :))
@test hash(DataFrameRow(df, 2, :)) != hash(DataFrameRow(df, 6, :))

# check that hashrows() function generates the same hashes as DataFrameRow
df_rowhashes, _ = DataFrames.hashrows(Tuple(eachcol(df)), false)
@test df_rowhashes == [hash(dr) for dr in eachrow(df)]
end

@testset "grouping" begin
# test RowGroupDict
Random.seed!(1234)
df1 = DataFrame(d1=rand(1:2, 1000))
df2 = DataFrame(d1=[2, 3])

# test_group("group_rows")
gd = DataFrames.group_rows(df1)
@test length(unique(gd.groups)) == 2

# getting groups for the rows of the other frames
@test length(gd[DataFrameRow(df2, 1, :)]) > 0
@test_throws KeyError gd[DataFrameRow(df2, 2, :)]
@test isempty(DataFrames.findrows(gd, df2, (gd.df[!, 1],), (df2[!, 1],), 2))
df = DataFrame(reshape(1:24, 6, 4), :auto)
df.x2 = string.(df.x2)
df.x3 = categorical(df.x3)
df.x4 = Float64.(df.x4)
gks = keys(groupby(df, :))

# grouping empty frame
gd = DataFrames.group_rows(DataFrame(x=Int[]))
@test length(unique(gd.groups)) == 0

# grouping single row
gd = DataFrames.group_rows(df1[1:1, :])
@test length(unique(gd.groups)) == 1
for i in axes(df, 1), h in UInt(0):UInt(10)
@test hash(DataFrameRow(df, i, :), h) ==
hash(gks[i], h) ==
hash(NamedTuple(DataFrameRow(df, i, :)), h)
end
end

@testset "getproperty, setproperty! and propertynames" begin
Expand Down Expand Up @@ -587,4 +571,83 @@ end
@test_throws ArgumentError r .+ 1
end

@testset "comparison tests: DataFrameRow, NamedTuple and GroupKey" begin
df = DataFrame(a=[1, 2], b=[missing, 3])
dfr = [df[1, :], df[2, :], df[1, 1:1], df[2, 1:1]]
nt = NamedTuple.(dfr)
gk = [keys(groupby(df, [:a, :b])); keys(groupby(df, :a))]

for l in (dfr[1], nt[1], gk[1]), r in (dfr[1], nt[1], gk[1])
@test ismissing(l == r)
@test ismissing(l == (a=1, b=2))
@test l ≅ r
@test l ≇ (a=1, b=2)
# work around https://github.com/JuliaLang/julia/pull/40147
if !(l isa NamedTuple && r isa NamedTuple)
@test ismissing(l < r)
# (review thread on the line above was marked as resolved by bkamins)
end
@test !isless(l, r)
end

for i in 2:4, l in (dfr, nt, gk), r in (dfr, nt, gk)
@test l[i] == r[i]
@test l[1] != l[i]
@test l[1] != r[i]
@test l[i] ≅ r[i]
@test l[1] ≇ l[i]
@test l[1] ≇ r[i]

@test !(l[i] < r[i])
@test !isless(l[i], r[i])

if i > 2
if l[1] isa NamedTuple && r[i] isa NamedTuple
@test_throws MethodError l[1] < r[i]
@test_throws MethodError isless(l[1], r[i])
else
@test_throws ArgumentError l[1] < r[i]
@test_throws ArgumentError isless(l[1], r[i])
end
end
end

for l in (dfr, nt, gk), r in (dfr, nt, gk)
@test l[1] < r[2]
@test isless(l[1], r[2])
@test !(l[2] < r[1])
@test !isless(l[2], r[1])

@test l[3] < r[4]
@test isless(l[3], r[4])
@test !(l[4] < r[3])
@test !isless(l[4], r[3])
end

@test !(dfr[1] == (x=1, b=missing))
@test !(gk[1] == (x=1, b=missing))
@test !(dfr[1] ≅ (x=1, b=missing))
@test !(gk[1] ≅ (x=1, b=missing))

@test_throws ArgumentError dfr[1] < (x=1, b=missing)
@test_throws ArgumentError gk[1] < (x=1, b=missing)
@test_throws ArgumentError isless(dfr[1], (x=1, b=missing))
@test_throws ArgumentError isless(gk[1], (x=1, b=missing))

df2 = DataFrame(a=1, b=missing)
df3 = DataFrame(a=2, b=1)
dfr2 = df2[1, :]
dfr3 = df3[1, :]
nt2 = NamedTuple(dfr2)
nt3 = NamedTuple(dfr3)
gk2 = keys(groupby(df2, [:a, :b]))[1]
gk3 = keys(groupby(df3, [:a, :b]))[1]

for a in (dfr2, nt2, gk2), b in (dfr3, nt3, gk3)
@test !(a == b)
@test !(a ≅ b)
@test a < b
@test isless(a, b)
end
end

end # module
Loading