Skip to content

Commit

Permalink
Add optimized mapreduce implementation for SkipMissing (#27743)
Browse files Browse the repository at this point in the history
Based on the AbstractArray methods. Allows the compiler to emit SIMD instructions,
for SkipMissing{Vector{Int}}, but also improves performance for large SkipMissing{Vector{Float64}}.
  • Loading branch information
nalimilan authored Jul 6, 2018
1 parent ae8e95f commit a60119b
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 1 deletion.
95 changes: 94 additions & 1 deletion base/missing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ IteratorSize(::Type{<:SkipMissing}) = SizeUnknown()
IteratorEltype(::Type{SkipMissing{T}}) where {T} = IteratorEltype(T)
eltype(::Type{SkipMissing{T}}) where {T} = nonmissingtype(eltype(T))

function Base.iterate(itr::SkipMissing, state...)
function iterate(itr::SkipMissing, state...)
y = iterate(itr.x, state...)
y === nothing && return nothing
item, state = y
Expand All @@ -189,6 +189,99 @@ function Base.iterate(itr::SkipMissing, state...)
item, state
end

# Optimized mapreduce implementation
# The generic method is faster when !(eltype(A) >: Missing) since it does not need
# additional loops to identify the two first non-missing values of each block
mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) =
_mapreduce(f, op, IndexStyle(itr.x), eltype(itr.x) >: Missing ? itr : itr.x)

function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray})
A = itr.x
local ai
inds = LinearIndices(A)
i = first(inds)
ilast = last(inds)
while i <= ilast
@inbounds ai = A[i]
ai === missing || break
i += 1
end
i > ilast && return mapreduce_empty(f, op, eltype(itr))
a1 = ai
i += 1
while i <= ilast
@inbounds ai = A[i]
ai === missing || break
i += 1
end
i > ilast && return mapreduce_first(f, op, a1)
# We know A contains at least two non-missing entries: the result cannot be nothing
something(mapreduce_impl(f, op, itr, first(inds), last(inds)))
end

_mapreduce(f, op, ::IndexCartesian, itr::SkipMissing) = mapfoldl(f, op, itr)

mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) =
mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op))

# Returns nothing when the input contains only missing values, and Some(x) otherwise
@noinline function mapreduce_impl(f, op, itr::SkipMissing{<:AbstractArray},
ifirst::Integer, ilast::Integer, blksize::Int)
A = itr.x
if ifirst == ilast
@inbounds a1 = A[ifirst]
if a1 === missing
return nothing
else
return Some(mapreduce_first(f, op, a1))
end
elseif ifirst + blksize > ilast
# sequential portion
local ai
i = ifirst
while i <= ilast
@inbounds ai = A[i]
ai === missing || break
i += 1
end
i > ilast && return nothing
a1 = ai::eltype(itr)
i += 1
while i <= ilast
@inbounds ai = A[i]
ai === missing || break
i += 1
end
i > ilast && return Some(mapreduce_first(f, op, a1))
a2 = ai::eltype(itr)
# Unexpectedly, the following assertion allows SIMD instructions to be emitted
A[i]::eltype(itr)
i += 1
v = op(f(a1), f(a2))
@simd for i = i:ilast
@inbounds ai = A[i]
if ai !== missing
v = op(v, f(ai))
end
end
return Some(v)
else
# pairwise portion
imid = (ifirst + ilast) >> 1
v1 = mapreduce_impl(f, op, itr, ifirst, imid, blksize)
v2 = mapreduce_impl(f, op, itr, imid+1, ilast, blksize)
if v1 === nothing && v2 === nothing
return nothing
elseif v1 === nothing
return v2
elseif v2 === nothing
return v1
else
return Some(op(something(v1), something(v2)))
end
end
end

"""
coalesce(x, y...)
Expand Down
51 changes: 51 additions & 0 deletions test/missing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,57 @@ end
@test eltype(x) === Any
@test collect(x) == [1, 2, 4]
@test collect(x) isa Vector{Int}

@testset "mapreduce" begin
# Vary size to test splitting blocks with several configurations of missing values
for T in (Int, Float64),
A in (rand(T, 10), rand(T, 1000), rand(T, 10000))
if T === Int
@test sum(A) === sum(skipmissing(A)) ===
reduce(+, skipmissing(A)) === mapreduce(identity, +, skipmissing(A))
else
@test sum(A) sum(skipmissing(A)) ===
reduce(+, skipmissing(A)) === mapreduce(identity, +, skipmissing(A))
end
@test mapreduce(cos, *, A) mapreduce(cos, *, skipmissing(A))

B = Vector{Union{T,Missing}}(A)
replace!(x -> rand(Bool) ? x : missing, B)
if T === Int
@test sum(collect(skipmissing(B))) === sum(skipmissing(B)) ===
reduce(+, skipmissing(B)) === mapreduce(identity, +, skipmissing(B))
else
@test sum(collect(skipmissing(B))) sum(skipmissing(B)) ===
reduce(+, skipmissing(B)) === mapreduce(identity, +, skipmissing(B))
end
@test mapreduce(cos, *, collect(skipmissing(A))) mapreduce(cos, *, skipmissing(A))

# Test block full of missing values
B[1:length(B)÷2] .= missing
if T === Int
@test sum(collect(skipmissing(B))) == sum(skipmissing(B)) ==
reduce(+, skipmissing(B)) == mapreduce(identity, +, skipmissing(B))
else
@test sum(collect(skipmissing(B))) sum(skipmissing(B)) ==
reduce(+, skipmissing(B)) == mapreduce(identity, +, skipmissing(B))
end

@test mapreduce(cos, *, collect(skipmissing(A))) mapreduce(cos, *, skipmissing(A))
end

# Patterns that exercize code paths for inputs with 1 or 2 non-missing values
@test sum(skipmissing([1, missing, missing, missing])) === 1
@test sum(skipmissing([missing, missing, missing, 1])) === 1
@test sum(skipmissing([1, missing, missing, missing, 2])) === 3
@test sum(skipmissing([missing, missing, missing, 1, 2])) === 3

for n in 0:3
itr = skipmissing(Vector{Union{Int,Missing}}(fill(missing, n)))
@test sum(itr) == reduce(+, itr) == mapreduce(identity, +, itr) === 0
@test_throws ArgumentError reduce(x -> x/2, itr)
@test_throws ArgumentError mapreduce(x -> x/2, +, itr)
end
end
end

@testset "coalesce" begin
Expand Down

0 comments on commit a60119b

Please sign in to comment.