Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optimized mapreduce implementation for SkipMissing #27743

Merged
merged 3 commits into from
Jul 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 94 additions & 1 deletion base/missing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ IteratorSize(::Type{<:SkipMissing}) = SizeUnknown()
IteratorEltype(::Type{SkipMissing{T}}) where {T} = IteratorEltype(T)
eltype(::Type{SkipMissing{T}}) where {T} = nonmissingtype(eltype(T))

function Base.iterate(itr::SkipMissing, state...)
function iterate(itr::SkipMissing, state...)
y = iterate(itr.x, state...)
y === nothing && return nothing
item, state = y
Expand All @@ -189,6 +189,99 @@ function Base.iterate(itr::SkipMissing, state...)
item, state
end

# Optimized mapreduce implementation
# The generic method is faster when !(eltype(A) >: Missing) since it does not need
# additional loops to identify the two first non-missing values of each block
mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) =
_mapreduce(f, op, IndexStyle(itr.x), eltype(itr.x) >: Missing ? itr : itr.x)

function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray})
A = itr.x
local ai
inds = LinearIndices(A)
i = first(inds)
ilast = last(inds)
while i <= ilast
@inbounds ai = A[i]
ai === missing || break
i += 1
end
i > ilast && return mapreduce_empty(f, op, eltype(itr))
a1 = ai
i += 1
while i <= ilast
@inbounds ai = A[i]
ai === missing || break
i += 1
end
i > ilast && return mapreduce_first(f, op, a1)
# We know A contains at least two non-missing entries: the result cannot be nothing
something(mapreduce_impl(f, op, itr, first(inds), last(inds)))
end

_mapreduce(f, op, ::IndexCartesian, itr::SkipMissing) = mapfoldl(f, op, itr)

mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) =
mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op))

# Returns nothing when the input contains only missing values, and Some(x) otherwise
@noinline function mapreduce_impl(f, op, itr::SkipMissing{<:AbstractArray},
ifirst::Integer, ilast::Integer, blksize::Int)
A = itr.x
if ifirst == ilast
@inbounds a1 = A[ifirst]
if a1 === missing
return nothing
else
return Some(mapreduce_first(f, op, a1))
end
elseif ifirst + blksize > ilast
# sequential portion
local ai
i = ifirst
while i <= ilast
@inbounds ai = A[i]
ai === missing || break
i += 1
end
i > ilast && return nothing
a1 = ai::eltype(itr)
i += 1
while i <= ilast
@inbounds ai = A[i]
ai === missing || break
i += 1
end
i > ilast && return Some(mapreduce_first(f, op, a1))
a2 = ai::eltype(itr)
# Unexpectedly, the following assertion allows SIMD instructions to be emitted
A[i]::eltype(itr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed by d803472 (missing align attribute on dereferenceable Arguments)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's check this: #28035.

i += 1
v = op(f(a1), f(a2))
@simd for i = i:ilast
@inbounds ai = A[i]
if ai !== missing
v = op(v, f(ai))
end
end
return Some(v)
else
# pairwise portion
imid = (ifirst + ilast) >> 1
v1 = mapreduce_impl(f, op, itr, ifirst, imid, blksize)
v2 = mapreduce_impl(f, op, itr, imid+1, ilast, blksize)
if v1 === nothing && v2 === nothing
return nothing
elseif v1 === nothing
return v2
elseif v2 === nothing
return v1
else
return Some(op(something(v1), something(v2)))
end
end
end

"""
coalesce(x, y...)

Expand Down
51 changes: 51 additions & 0 deletions test/missing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,57 @@ end
@test eltype(x) === Any
@test collect(x) == [1, 2, 4]
@test collect(x) isa Vector{Int}

@testset "mapreduce" begin
# Vary size to test splitting blocks with several configurations of missing values
for T in (Int, Float64),
A in (rand(T, 10), rand(T, 1000), rand(T, 10000))
if T === Int
@test sum(A) === sum(skipmissing(A)) ===
reduce(+, skipmissing(A)) === mapreduce(identity, +, skipmissing(A))
else
@test sum(A) ≈ sum(skipmissing(A)) ===
reduce(+, skipmissing(A)) === mapreduce(identity, +, skipmissing(A))
end
@test mapreduce(cos, *, A) ≈ mapreduce(cos, *, skipmissing(A))

B = Vector{Union{T,Missing}}(A)
replace!(x -> rand(Bool) ? x : missing, B)
if T === Int
@test sum(collect(skipmissing(B))) === sum(skipmissing(B)) ===
reduce(+, skipmissing(B)) === mapreduce(identity, +, skipmissing(B))
else
@test sum(collect(skipmissing(B))) ≈ sum(skipmissing(B)) ===
reduce(+, skipmissing(B)) === mapreduce(identity, +, skipmissing(B))
end
@test mapreduce(cos, *, collect(skipmissing(A))) ≈ mapreduce(cos, *, skipmissing(A))

# Test block full of missing values
B[1:length(B)÷2] .= missing
if T === Int
@test sum(collect(skipmissing(B))) == sum(skipmissing(B)) ==
reduce(+, skipmissing(B)) == mapreduce(identity, +, skipmissing(B))
else
@test sum(collect(skipmissing(B))) ≈ sum(skipmissing(B)) ==
reduce(+, skipmissing(B)) == mapreduce(identity, +, skipmissing(B))
end

@test mapreduce(cos, *, collect(skipmissing(A))) ≈ mapreduce(cos, *, skipmissing(A))
end

# Patterns that exercize code paths for inputs with 1 or 2 non-missing values
@test sum(skipmissing([1, missing, missing, missing])) === 1
@test sum(skipmissing([missing, missing, missing, 1])) === 1
@test sum(skipmissing([1, missing, missing, missing, 2])) === 3
@test sum(skipmissing([missing, missing, missing, 1, 2])) === 3

for n in 0:3
itr = skipmissing(Vector{Union{Int,Missing}}(fill(missing, n)))
@test sum(itr) == reduce(+, itr) == mapreduce(identity, +, itr) === 0
@test_throws ArgumentError reduce(x -> x/2, itr)
@test_throws ArgumentError mapreduce(x -> x/2, +, itr)
end
end
end

@testset "coalesce" begin
Expand Down