From 3d3aa361f49329df514f29a72a2d4c79f1deec1e Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 7 Mar 2017 16:25:33 +0100 Subject: [PATCH] Make hash(::CategoricalValue) faster by pre-computing hashes This optimization only works for the one-argument hash() method: when a hash code is passed, the hash needs to be recomputed anyway. This can still be useful for hashing DataTables columns. --- benchmark/benchmarks.jl | 35 +++++++++++++++++++++++++++++++++++ src/buildfields.jl | 5 +++-- src/pool.jl | 2 ++ src/typedefs.jl | 10 ++++++---- src/value.jl | 2 ++ 5 files changed, 48 insertions(+), 6 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index d5103d6d..1f4b6f3c 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -30,3 +30,38 @@ end @bench "CategoricalArray" sumequals(ca, ca[1]) @bench "NullableCategoricalArray" sumequals(nca, nca[1]) end + +@benchgroup "hash" begin + function f(X) + h = zero(UInt) + index = X.pool.index + @inbounds for i in X.refs + h += hash(index[i]) + end + h + end + + function g(X) + h = zero(UInt) + @inbounds for x in X + h += hash(x) + end + h + end + + function h(X) + h = zero(UInt) + pool = X.pool + @inbounds for i in X.refs + h += CategoricalArrays.hash_level(pool, i) + end + h + end + + X = CategoricalArray(repeat(["ABCDEF", "GHIJKL", "MNOPQR", "STUVWX"], inner=100, outer=100)) + + using BenchmarkTools + @bench "hashing strings" f(X) + @bench "hashing CategoricalValues" g(X) + @bench "using precomputed hashes" h(X) +end diff --git a/src/buildfields.jl b/src/buildfields.jl index 187f4a42..cf6c8634 100644 --- a/src/buildfields.jl +++ b/src/buildfields.jl @@ -18,13 +18,14 @@ function buildinvindex{T, R}(index::Vector{T}, ::Type{R}=DefaultRefType) return invindex end -function buildvalues!{T, R, V}(pool::CategoricalPool{T, R, V}) +function buildcaches!{T, R, V}(pool::CategoricalPool{T, R, V}) n = length(levels(pool)) resize!(pool.valindex, n) + resize!(pool.hashindex, n) for i in 1:n pool.valindex[i] = V(i, pool) + pool.hashindex[i] = hash(pool.index[i]) end - return pool.valindex end function buildorder!{S, R <: Integer}(order::Array{R}, diff --git a/src/pool.jl b/src/pool.jl index d9d75811..5dba295b 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -135,6 +135,8 @@ function Base.delete!{S, R, V}(pool::CategoricalPool{S, R, V}, levels...) return pool end +hash_level(pool::CategoricalPool, i::Integer) = pool.hashindex[i] + function levels!{S, R, V}(pool::CategoricalPool{S, R, V}, newlevels::Vector) if !allunique(newlevels) throw(ArgumentError(string("duplicated levels found in newlevels: ", diff --git a/src/typedefs.jl b/src/typedefs.jl index bbdd368a..85cd52f8 100644 --- a/src/typedefs.jl +++ b/src/typedefs.jl @@ -13,14 +13,15 @@ if VERSION >= v"0.6.0-dev.2643" order::Vector{R} levels::Vector{T} valindex::Vector{V} + hashindex::Vector{UInt} ordered::Bool function CategoricalPool{T, R, V}(index::Vector{T}, invindex::Dict{T, R}, order::Vector{R}, ordered::Bool) where {T, R, V} - pool = new(index, invindex, order, index[order], V[], ordered) - buildvalues!(pool) + pool = new(index, invindex, order, index[order], V[], UInt[], ordered) + buildcaches!(pool) return pool end end @@ -33,14 +34,15 @@ else order::Vector{R} levels::Vector{T} valindex::Vector{V} + hashindex::Vector{UInt} ordered::Bool function CategoricalPool{T, R}(index::Vector{T}, invindex::Dict{T, R}, order::Vector{R}, ordered::Bool) - pool = new(index, invindex, order, index[order], V[], ordered) - buildvalues!(pool) + pool = new(index, invindex, order, index[order], V[], UInt[], ordered) + buildcaches!(pool) return pool end end diff --git a/src/value.jl b/src/value.jl index 1981f300..2159f49e 100644 --- a/src/value.jl +++ b/src/value.jl @@ -63,6 +63,8 @@ Base.isequal(x::CategoricalValue, y::Any) = isequal(index(x.pool)[x.level], y) Base.isequal(x::Any, y::CategoricalValue) = isequal(y, x) Base.hash(x::CategoricalValue, h::UInt) = hash(index(x.pool)[x.level], h) +# When h=zero(UInt) we do not need to recompute the hash +Base.hash(x::CategoricalValue) = x.pool.hashindex[x.level] function Base.isless{S, T}(x::CategoricalValue{S}, y::CategoricalValue{T}) throw(ArgumentError("CategoricalValue objects with different pools cannot be tested for order"))