Skip to content

Commit

Permalink
Simpler array hashing (#26022)
Browse files Browse the repository at this point in the history
    Goal: Hash approximately log(N) entries with a higher density of hashed elements
    weighted towards the end and special consideration for repeated values. Colliding
    hashes will often subsequently be compared by equality -- and equality between arrays
    works elementwise forwards and is short-circuiting. This means that a collision
    between arrays that differ by elements at the beginning is cheaper than one where the
    difference is towards the end. Furthermore, blindly choosing log(N) entries from a
    sparse array will likely only choose the same element repeatedly (zero in this case).

    To achieve this, we work backwards, starting by hashing the last element of the
    array. After hashing each element, we skip the next `fibskip` elements, where
    `fibskip` is pulled from the Fibonacci sequence -- Fibonacci was chosen as a simple
    ~O(log(N)) algorithm that ensures we don't hit a common divisor of a dimension and
    only end up hashing one slice of the array (as might happen with powers of two).
    Finally, we find the next distinct value from the one we just hashed.

Fixes #27865 and fixes #26011.

Fixes #26034
  • Loading branch information
mbauman authored and JeffBezanson committed Aug 2, 2018
1 parent 463513f commit b0bf91e
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 161 deletions.
174 changes: 59 additions & 115 deletions base/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2071,124 +2071,68 @@ push!(A, a, b, c...) = push!(push!(A, a, b), c...)
pushfirst!(A, a, b) = pushfirst!(pushfirst!(A, b), a)
pushfirst!(A, a, b, c...) = pushfirst!(pushfirst!(A, c...), a, b)

## hashing collections ##

# Seed values that the collection hashing code adds into a running hash. Each seed is
# written as a literal of the platform word size, selected by whether `UInt` is `UInt64`.
if UInt === UInt64
    const hashaa_seed = 0x7f53e68ceb575e76
    const hashrle_seed = 0x2aab8909bfea414c
    const hashr_seed = 0x80707b6821b70087
else
    const hashaa_seed = 0xeb575e76
    const hashrle_seed = 0xbfea414c
    const hashr_seed = 0x21b70087
end

# O(1) hash for ranges with a regular step (RangeStepRegular), meant to agree with the
# O(N) AbstractArray fallback. Only the size, the first element, the length and the
# last element of `r` are ever hashed, so the cost does not depend on `length(r)`.
function hash_range(r::AbstractRange, h::UInt)
    len = length(r)

    h += hashaa_seed
    h += hash(size(r))
    len == 0 && return h

    h = hash(first(r), h)
    if len == 1
        return h
    elseif len == 2
        # Two elements: hash both endpoints directly, without the range seed
        return hash(last(r), h)
    end

    # Three or more elements: range seed, then the length, then the last element
    h += hashr_seed
    return hash(last(r), hash(len, h))
end

## hashing AbstractArray ##

# Hash an array by mixing into `h`: the `AbstractArray` type itself, the first and last
# index along every axis, and a sampled subset of `index => element` pairs.
#
# Arguments:
#   A -- the array to hash
#   h -- the running hash value to mix into
# Returns the updated `UInt` hash. An empty array returns right after the axes have
# been hashed. Element types are never hashed, only the elements themselves.
function hash(A::AbstractArray, h::UInt)
    h = hash(AbstractArray, h)
    # Axes are themselves AbstractArrays, so hashing them directly would stack overflow
    # Instead hash the tuple of firsts and lasts along each dimension
    h = hash(map(first, axes(A)), h)
    h = hash(map(last, axes(A)), h)
    isempty(A) && return h

    # Goal: Hash approximately log(N) entries with a higher density of hashed elements
    # weighted towards the end and special consideration for repeated values. Colliding
    # hashes will often subsequently be compared by equality -- and equality between arrays
    # works elementwise forwards and is short-circuiting. This means that a collision
    # between arrays that differ by elements at the beginning is cheaper than one where the
    # difference is towards the end. Furthermore, blindly choosing log(N) entries from a
    # sparse array will likely only choose the same element repeatedly (zero in this case).

    # To achieve this, we work backwards, starting by hashing the last element of the
    # array. After hashing each element, we skip `fibskip` elements, where `fibskip`
    # is pulled from the Fibonacci sequence -- Fibonacci was chosen as a simple
    # ~O(log(N)) algorithm that ensures we don't hit a common divisor of a dimension
    # and only end up hashing one slice of the array (as might happen with powers of
    # two). Finally, we find the next distinct value from the one we just hashed.

    # This is a little tricky since skipping an integer number of values inherently works
    # with linear indices, but `findprev` uses `keys`. Hoist out the conversion "maps":
    ks = keys(A)
    key_to_linear = LinearIndices(ks) # Index into this map to compute the linear index
    linear_to_key = vec(ks)           # And vice-versa

    # Start at the last index
    keyidx = last(ks)
    linidx = key_to_linear[keyidx]
    fibskip = prevfibskip = oneunit(linidx)
    n = 0
    while true
        n += 1
        # Hash the current key-index and its element
        elt = A[keyidx]
        h = hash(keyidx=>elt, h)

        # Skip backwards a Fibonacci number of indices -- this is a linear index operation
        linidx = key_to_linear[keyidx]
        linidx <= fibskip && break
        linidx -= fibskip
        keyidx = linear_to_key[linidx]

        # Only increase the Fibonacci skip once every N iterations. This was chosen
        # to be big enough that all elements of small arrays get hashed while
        # obscenely large arrays are still tractable. With a choice of N=4096, an
        # entirely-distinct 8000-element array will have ~75% of its elements hashed,
        # with every other element hashed in the first half of the array. At the same
        # time, hashing a `typemax(Int64)`-length Float64 range takes about a second.
        if rem(n, 4096) == 0
            fibskip, prevfibskip = fibskip + prevfibskip, fibskip
        end

        # Find a key index with a value distinct from `elt` -- might be `keyidx` itself
        keyidx = findprev(!isequal(elt), A, keyidx)
        # `nothing` means every remaining element is `isequal` to `elt`
        keyidx === nothing && break
    end

    return h
end
4 changes: 4 additions & 0 deletions stdlib/Dates/test/ranges.jl
Original file line number Diff line number Diff line change
Expand Up @@ -578,4 +578,8 @@ a = Dates.Time(23, 1, 1)
@test !in Date(2017, 01, 01):Dates.Day(1):Date(2017, 01, 05))
@test !("a" in Date(2017, 01, 01):Dates.Day(1):Date(2017, 01, 05))

@test hash(Any[Date("2018-1-03"), Date("2018-1-04"), Date("2018-1-05")]) ==
hash([Date("2018-1-03"), Date("2018-1-04"), Date("2018-1-05")]) ==
hash(Date("2018-1-03"):Day(1):Date("2018-1-05"))

end
45 changes: 0 additions & 45 deletions stdlib/SparseArrays/src/sparsematrix.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3477,51 +3477,6 @@ function rotl90(A::SparseMatrixCSC)
return sparse(J, I, V, n, m)
end

## hashing

# Close out a run of `runlength` repetitions of `val` and return the updated hash.
# A run of length zero leaves `h` untouched; a run longer than one first mixes in
# `Base.hashrle_seed` and the run length, and then the value itself is hashed.
@inline function hashrun(val, runlength::Int, h::UInt)
    runlength == 0 && return h
    if runlength > 1
        h += Base.hashrle_seed
        h = hash(runlength, h)
    end
    return hash(val, h)
end

# Hash a sparse matrix by walking its stored entries column by column and folding
# them into `h` as runs via `hashrun`: runs of consecutive `isequal` non-zero values,
# and the runs of zeros lying between them and after the last one.
function hash(A::SparseMatrixCSC{T}, h::UInt) where T
    dims = size(A)
    h += Base.hashaa_seed
    h += hash(dims)

    colptr, rowval, nzval = A.colptr, A.rowval, A.nzval
    prev_idx = 0        # linear index of the most recent non-zero entry
    run_len = 0         # length of the run currently being accumulated
    run_val = zero(T)   # value repeated by that run
    @inbounds for col in 1:size(A, 2)
        for j in colptr[col]:(colptr[col + 1] - 1)
            nz = nzval[j]
            # Stored entries that are `isequal` to zero count as ordinary zeros
            isequal(nz, zero(T)) && continue
            idx = Base._sub2ind(dims, rowval[j], col)
            if idx == prev_idx + 1 && isequal(nz, run_val)
                # Adjacent to the previous entry and same value: the run continues
                run_len += 1
            else
                h = hashrun(run_val, run_len, h)        # Hash previous run
                h = hashrun(0, idx - prev_idx - 1, h)   # Hash intervening zeros
                run_len = 1
                run_val = nz
            end
            prev_idx = idx
        end
    end
    h = hashrun(run_val, run_len, h)             # Hash previous run
    return hashrun(0, length(A) - prev_idx, h)   # Hash zeros at end
end

## Uniform matrix arithmetic

(+)(A::SparseMatrixCSC, J::UniformScaling) = A + sparse(J, size(A)...)
Expand Down
29 changes: 28 additions & 1 deletion test/arrayops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2442,7 +2442,34 @@ end
end

@testset "inference hash array 22740" begin
    # The return type of `hash` must be inferrable for a vector and for a range,
    # and the two must hash alike
    @inferred hash([1,2,3])
    vector_hash = @inferred hash([1,2,3])
    range_hash = @inferred hash(1:3)
    @test vector_hash == range_hash
end

@testset "hashing arrays of arrays" begin
    # Regression coverage for issues #27865 and #26011
    string_hash = hash([["asd"], ["asd"], ["asad"]])
    @test string_hash == hash(Any[["asd"], ["asd"], ["asad"]])
    @test string_hash != hash([["asd"], ["asd"], ["asadq"]])

    # The element type of a nested array must not affect the hash of the outer array
    nested_hash = hash([1,2,[3]])
    @test all(inner -> hash([1,2,inner]) == nested_hash, (Any[3], Int8[3], BigInt[3], [3.0]))
    @test nested_hash != hash([1,2,[3,4]])
end

# A field-less custom struct that compares (`isequal` and `==`) and hashes exactly
# like the integer 5, used to check that such structs hash the same inside arrays
import Base: ==
struct totally_not_five26034 end

Base.hash(::totally_not_five26034, h::UInt) = hash(5, h)

Base.isequal(::totally_not_five26034, x) = isequal(5, x)
Base.isequal(x, ::totally_not_five26034) = isequal(5, x)
Base.isequal(::totally_not_five26034, ::totally_not_five26034) = true

==(::totally_not_five26034, x) = (5 == x)
==(x, ::totally_not_five26034) = (5 == x)
==(::totally_not_five26034, ::totally_not_five26034) = true
@testset "issue #26034" begin
    # `five_alias` stands in for the integer 5 wherever it appears below
    five_alias = totally_not_five26034()
    @test hash(five_alias) == hash(5)
    @test isequal([4,five_alias,6], [4,5,6])
    @test hash([4,five_alias,6]) == hash([4,5,6])
    @test hash(Any[4,five_alias,6]) == hash(Union{Int, totally_not_five26034}[4,5,6])
    @test hash([five_alias,4,five_alias,6]) == hash([five_alias,4,5,6])
end

function f27079()
Expand Down

0 comments on commit b0bf91e

Please sign in to comment.