diff --git a/base/abstractarray.jl b/base/abstractarray.jl
index 43f2617180923..86c1ed4ace8f7 100644
--- a/base/abstractarray.jl
+++ b/base/abstractarray.jl
@@ -2057,124 +2057,68 @@ push!(A, a, b, c...) = push!(push!(A, a, b), c...)
 pushfirst!(A, a, b) = pushfirst!(pushfirst!(A, b), a)
 pushfirst!(A, a, b, c...) = pushfirst!(pushfirst!(A, c...), a, b)
 
-## hashing collections ##
-
-const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
-const hashrle_seed = UInt === UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
-const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087
-
-# Efficient O(1) method equivalent to the O(N) AbstractArray fallback,
-# which works only for ranges with regular step (RangeStepRegular)
-function hash_range(r::AbstractRange, h::UInt)
-    h += hashaa_seed
-    h += hash(size(r))
-
-    length(r) == 0 && return h
-    h = hash(first(r), h)
-    length(r) == 1 && return h
-    length(r) == 2 && return hash(last(r), h)
-
-    h += hashr_seed
-    h = hash(length(r), h)
-    h = hash(last(r), h)
-end
-
-function hash(a::AbstractArray{T}, h::UInt) where T
-    # O(1) hashing for types with regular step
-    if isa(a, AbstractRange) && isa(RangeStepStyle(a), RangeStepRegular)
-        return hash_range(a, h)
-    end
-
-    h += hashaa_seed
-    h += hash(size(a))
-
-    y1 = iterate(a)
-    y1 === nothing && return h
-    y2 = iterate(a, y1[2])
-    y2 === nothing && return hash(y1[1], h)
-    y = iterate(a, y2[2])
-    y === nothing && return hash(y2[1], hash(y1[1], h))
-    x1, x2 = y1[1], y2[1]
-
-    # For the rest of the function, we keep three elements worth of state,
-    # x1, x2, y[1], with `y` potentially being `nothing` if there's only
-    # two elements remaining
-
-    # Check whether the array is equal to a range, and hash the elements
-    # at the beginning of the array as such as long as they match this assumption
-    # This needs to be done even for non-RangeStepRegular types since they may still be equal
-    # to RangeStepRegular values (e.g. 1.0:3.0 == 1:3)
-    if isa(a, AbstractVector) && applicable(-, x2, x1)
-        n = 1
-        local step, laststep, laststate
-
-        h = hash(x1, h)
-        h += hashr_seed
-
-        while true
-            # If overflow happens with entries of the same type, a cannot be equal
-            # to a range with more than two elements because more extreme values
-            # cannot be represented. We must still hash the two first values as a
-            # range since they can always be considered as such (in a wider type)
-            if isconcretetype(T)
-                try
-                    step = x2 - x1
-                catch err
-                    isa(err, OverflowError) || rethrow(err)
-                    break
-                end
-                # If true, wraparound overflow happened
-                sign(step) == cmp(x2, x1) || break
-            else
-                applicable(-, x2, x1) || break
-                # widen() is here to ensure no overflow can happen
-                step = widen(x2) - widen(x1)
-            end
-            n > 1 && !isequal(step, laststep) && break
-            n += 1
-            laststep = step
-            if y === nothing
-                # The array matches a range exactly
-                return hash(x2, hash(n, h))
-            end
-            x1, x2 = x2, y[1]
-            y = iterate(a, y[2])
+## hashing AbstractArray ##
+
+function hash(A::AbstractArray, h::UInt)
+    h = hash(AbstractArray, h)
+    # Axes are themselves AbstractArrays, so hashing them directly would stack overflow
+    # Instead hash the tuple of firsts and lasts along each dimension
+    h = hash(map(first, axes(A)), h)
+    h = hash(map(last, axes(A)), h)
+    isempty(A) && return h
+
+    # Goal: Hash approximately log(N) entries with a higher density of hashed elements
+    # weighted towards the end and special consideration for repeated values. Colliding
+    # hashes will often subsequently be compared by equality -- and equality between arrays
+    # works elementwise forwards and is short-circuiting. This means that a collision
+    # between arrays that differ by elements at the beginning is cheaper than one where the
+    # difference is towards the end. Furthermore, blindly choosing log(N) entries from a
+    # sparse array will likely only choose the same element repeatedly (zero in this case).
+
+    # To achieve this, we work backwards, starting by hashing the last element of the
+    # array. After hashing each element, we skip `fibskip` elements, where `fibskip`
+    # is pulled from the Fibonacci sequence -- Fibonacci was chosen as a simple
+    # ~O(log(N)) algorithm that ensures we don't hit a common divisor of a dimension
+    # and only end up hashing one slice of the array (as might happen with powers of
+    # two). Finally, we find the next distinct value from the one we just hashed.
+
+    # This is a little tricky since skipping an integer number of values inherently works
+    # with linear indices, but `findprev` uses `keys`. Hoist out the conversion "maps":
+    ks = keys(A)
+    key_to_linear = LinearIndices(ks) # Index into this map to compute the linear index
+    linear_to_key = vec(ks)           # And vice-versa
+
+    # Start at the last index
+    keyidx = last(ks)
+    linidx = key_to_linear[keyidx]
+    fibskip = prevfibskip = oneunit(linidx)
+    n = 0
+    while true
+        n += 1
+        # Hash the current key-index and its element
+        elt = A[keyidx]
+        h = hash(keyidx=>elt, h)
+
+        # Skip backwards a Fibonacci number of indices -- this is a linear index operation
+        linidx = key_to_linear[keyidx]
+        linidx - fibskip < firstindex(linear_to_key) && break # first index need not be 1
+        linidx -= fibskip
+        keyidx = linear_to_key[linidx]
+
+        # Only increase the Fibonacci skip once every N iterations. This was chosen
+        # to be big enough that all elements of small arrays get hashed while
+        # obscenely large arrays are still tractable. With a choice of N=4096, an
+        # entirely-distinct 8000-element array will have ~75% of its elements hashed,
+        # with every other element hashed in the first half of the array. At the same
+        # time, hashing a `typemax(Int64)`-length Float64 range takes about a second.
+        if rem(n, 4096) == 0
+            fibskip, prevfibskip = fibskip + prevfibskip, fibskip
         end
 
-        # Always hash at least the two first elements as a range (even in case of overflow)
-        if n < 2
-            h = hash(2, h)
-            h = hash(y2[1], h)
-            @assert y !== nothing
-            x1, x2 = x2, y[1]
-            y = iterate(a, y[2])
-        else
-            h = hash(n, h)
-            h = hash(x1, h)
-        end
+        # Find a key index with a value distinct from `elt` -- might be `keyidx` itself
+        keyidx = findprev(!isequal(elt), A, keyidx)
+        keyidx === nothing && break
     end
 
-    # Hash elements which do not correspond to a range
-    while true
-        if isequal(x2, x1)
-            # For repeated elements, use run length encoding
-            # This allows efficient hashing of sparse arrays
-            runlength = 2
-            while y !== nothing
-                # No need to update x1 (it's isequal x2)
-                x2 = y[1]
-                y = iterate(a, y[2])
-                isequal(x1, x2) || break
-                runlength += 1
-            end
-            h += hashrle_seed
-            h = hash(runlength, h)
-        end
-        h = hash(x1, h)
-        y === nothing && break
-        x1, x2 = x2, y[1]
-        y = iterate(a, y[2])
-    end
-    !isequal(x2, x1) && (h = hash(x2, h))
     return h
 end
 
diff --git a/stdlib/Dates/test/ranges.jl b/stdlib/Dates/test/ranges.jl
index 3d02def61b671..929e78e692260 100644
--- a/stdlib/Dates/test/ranges.jl
+++ b/stdlib/Dates/test/ranges.jl
@@ -578,4 +578,8 @@ a = Dates.Time(23, 1, 1)
 @test !(π in Date(2017, 01, 01):Dates.Day(1):Date(2017, 01, 05))
 @test !("a" in Date(2017, 01, 01):Dates.Day(1):Date(2017, 01, 05))
 
+@test hash(Any[Date("2018-1-03"), Date("2018-1-04"), Date("2018-1-05")]) ==
+    hash([Date("2018-1-03"), Date("2018-1-04"), Date("2018-1-05")]) ==
+    hash(Date("2018-1-03"):Day(1):Date("2018-1-05"))
+
 end
diff --git a/stdlib/SparseArrays/src/sparsematrix.jl b/stdlib/SparseArrays/src/sparsematrix.jl
index 6e4da6fa85cf4..f7293a3bc1b83 100644
--- a/stdlib/SparseArrays/src/sparsematrix.jl
+++ b/stdlib/SparseArrays/src/sparsematrix.jl
@@ -3444,51 +3444,6 @@ function rotl90(A::SparseMatrixCSC)
     return sparse(J, I, V, n, m)
 end
 
-## hashing
-
-# End the run and return the current hash
-@inline function hashrun(val, runlength::Int, h::UInt)
-    if runlength == 0
-        return h
-    elseif runlength > 1
-        h += Base.hashrle_seed
-        h = hash(runlength, h)
-    end
-    hash(val, h)
-end
-
-function hash(A::SparseMatrixCSC{T}, h::UInt) where T
-    h += Base.hashaa_seed
-    sz = size(A)
-    h += hash(sz)
-
-    colptr = A.colptr
-    rowval = A.rowval
-    nzval = A.nzval
-    lastidx = 0
-    runlength = 0
-    lastnz = zero(T)
-    @inbounds for col = 1:size(A, 2)
-        for j = colptr[col]:colptr[col+1]-1
-            nz = nzval[j]
-            isequal(nz, zero(T)) && continue
-            idx = Base._sub2ind(sz, rowval[j], col)
-            if idx != lastidx+1 || !isequal(nz, lastnz)  # Run is over
-                h = hashrun(lastnz, runlength, h)       # Hash previous run
-                h = hashrun(0, idx-lastidx-1, h)        # Hash intervening zeros
-
-                runlength = 1
-                lastnz = nz
-            else
-                runlength += 1
-            end
-            lastidx = idx
-        end
-    end
-    h = hashrun(lastnz, runlength, h) # Hash previous run
-    hashrun(0, length(A)-lastidx, h)  # Hash zeros at end
-end
-
 ## Uniform matrix arithmetic
 
 (+)(A::SparseMatrixCSC, J::UniformScaling) = A + sparse(J, size(A)...)
diff --git a/test/arrayops.jl b/test/arrayops.jl
index 0b4479181cf68..c759689ca05a7 100644
--- a/test/arrayops.jl
+++ b/test/arrayops.jl
@@ -2424,7 +2424,34 @@ end
 end
 
 @testset "inference hash array 22740" begin
-    @inferred hash([1,2,3])
+    @test @inferred(hash([1,2,3])) == @inferred(hash(1:3))
+end
+
+@testset "hashing arrays of arrays" begin
+    # issues #27865 and #26011
+    @test hash([["asd"], ["asd"], ["asad"]]) == hash(Any[["asd"], ["asd"], ["asad"]])
+    @test hash([["asd"], ["asd"], ["asad"]]) != hash([["asd"], ["asd"], ["asadq"]])
+    @test hash([1,2,[3]]) == hash([1,2,Any[3]]) == hash([1,2,Int8[3]]) == hash([1,2,BigInt[3]]) == hash([1,2,[3.0]])
+    @test hash([1,2,[3]]) != hash([1,2,[3,4]])
+end
+
+# Ensure we can hash strange custom structs — and they hash the same in arrays
+struct totally_not_five26034 end
+Base.isequal(::totally_not_five26034, x)=isequal(5,x);
+Base.isequal(x, ::totally_not_five26034)=isequal(5,x);
+Base.isequal(::totally_not_five26034, ::totally_not_five26034)=true;
+Base.hash(::totally_not_five26034, h::UInt)=hash(5, h);
+import Base.==
+==(::totally_not_five26034, x)= (5==x);
+==(x,::totally_not_five26034)= (5==x);
+==(::totally_not_five26034,::totally_not_five26034)=true;
+@testset "issue #26034" begin
+    n5 = totally_not_five26034()
+    @test hash(n5) == hash(5)
+    @test isequal([4,n5,6], [4,5,6])
+    @test isequal(hash([4,n5,6]), hash([4,5,6]))
+    @test isequal(hash(Any[4,n5,6]), hash(Union{Int, totally_not_five26034}[4,5,6]))
+    @test isequal(hash([n5,4,n5,6]), hash([n5,4,5,6]))
 end
 
 function f27079()