Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add skipmissings #111

Merged
merged 8 commits into from
Apr 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ This package provides additional functionality for working with `missing` values
- `allowmissing` and `disallowmissing` to convert between `Vector{T}` and `Vector{Union{T, Missing}}`
- `passmissing` to wrap a function so that it returns `missing` if any of its positional arguments is `missing`
- `levels` to get the unique values in a vector excluding `missing` and in their preferred order
- `skipmissings` to loop through a collection of iterators excluding indi ces where any iterators are `missing`

## Contributing and Questions

Expand Down
260 changes: 259 additions & 1 deletion src/Missings.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
module Missings

export allowmissing, disallowmissing, ismissing, missing, missings,
Missing, MissingException, levels, coalesce, passmissing, nonmissingtype
Missing, MissingException, levels, coalesce, passmissing, nonmissingtype,
skipmissings

using Base: ismissing, missing, Missing, MissingException
using Base: @deprecate
Expand Down Expand Up @@ -207,4 +208,261 @@ missing
"""
passmissing(f) = PassMissing{Core.Typeof(f)}(f)

"""
skipmissings(args...)

Return a tuple of iterators wrapping each of the iterators in `args`, but
skipping elements at positions where at least one of the iterators returns `missing`
(listwise deletion of missing values).

# Examples
```
julia> x = [1, 2, missing, 4]; y = [1, 2, 3, missing];

julia> tx, ty = skipmissings(x, y)
(Missings.SkipMissings{Array{Union{Missing, Int64},1},Tuple{Array{Union{Missing, Int64},1}}}
(Union{Missing, Int64}[1, 2, missing, 4], (Union{Missing, Int64}[1, 2, 3, missing],)), Missi
ngs.SkipMissings{Array{Union{Missing, Int64},1},Tuple{Array{Union{Missing, Int64},1}}}(Union
{Missing, Int64}[1, 2, 3, missing], (Union{Missing, Int64}[1, 2, missing, 4],)))

julia> collect(tx)
2-element Array{Int64,1}:
1
2

```
"""
function skipmissings(args...)
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
if isempty(args)
throw(ArgumentError("Must input one or more arguments"))
end

if args isa Tuple{Vararg{AbstractArray}}
if !all(x -> length(x) == length(args[1]), args)
throw(ArgumentError("All arguments must have the same length"))
end

if !all(x -> eachindex(x) == eachindex(args[1]), args)
throw(ArgumentError("All arguments must have the same indices"))
end
end

ntuple(length(args)) do i
s = setdiff(1:length(args), i)
SkipMissings(args[i], args[s])
end
end

struct SkipMissings{V, T}
x::V
others::T
end

Base.@propagate_inbounds function _anymissingindex(others::Tuple{Vararg{AbstractArray}}, i)
for oth in others
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
oth[i] === missing && return true
end

return false
end

@inline function _anymissingiterate(others::Tuple, state)
for oth in others
y = iterate(oth, state)
y !== nothing && first(y) === missing && return true
end

return false
end

const SkipMissingsofArrays = SkipMissings{V, T} where
{V <: AbstractArray, T <: Tuple{Vararg{AbstractArray}}}

function Base.show(io::IO, mime::MIME"text/plain", itr::SkipMissings{V}) where V
print(io, SkipMissings, '{', V, '}', '(', itr.x, ')', " comprised of " *
"$(length(itr.others) + 1) iterators")
end

Base.IteratorSize(::Type{<:SkipMissings}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{<:SkipMissings{V}}) where {V} = Base.IteratorEltype(V)
Base.eltype(::Type{<:SkipMissings{V}}) where {V} = nonmissingtype(eltype(V))
Base.IndexStyle(itr::SkipMissings) = Base.IndexStyle(itr.x)

function Base.iterate(itr::SkipMissings, state=1)
x_itr = iterate(itr.x, state)
x_itr === nothing && return nothing
x_item, x_state = x_itr
while true
x_item === missing || _anymissingiterate(itr.others, state) || break
x_itr = iterate(itr.x, x_state)
x_itr === nothing && return nothing
state = x_state
x_item, x_state = x_itr
end
return x_item, x_state
end

function Base.iterate(itr::SkipMissingsofArrays, state=0)
eix = eachindex(itr.x)
ind_itr = iterate(eix, state)
ind_itr === nothing && return nothing
ind_item, ind_state = ind_itr
@inbounds x_item = itr.x[ind_item]
@inbounds while true
x_item === missing || _anymissingindex(itr.others, ind_item) || break
ind_itr = iterate(eix, ind_state)
ind_itr === nothing && return nothing
ind_item, ind_state = ind_itr
x_item = itr.x[ind_item]
end
return x_item, ind_state
end

Base.IndexStyle(::Type{<:SkipMissings{V}}) where {V} = Base.IndexStyle(V)

function Base.eachindex(itr::SkipMissingsofArrays)
@inbounds Iterators.filter(eachindex(itr.x)) do i
itr.x[i] !== missing && !_anymissingindex(itr.others, i)
end
end

function Base.keys(itr::SkipMissingsofArrays)
@inbounds Iterators.filter(keys(itr.x)) do i
itr.x[i] !== missing && !_anymissingindex(itr.others, i)
end
end

@inline function Base.getindex(itr::SkipMissingsofArrays, i)
@boundscheck checkbounds(itr.x, i)
@inbounds xi = itr.x[i]
if xi === missing || @inbounds _anymissingindex(itr.others, i)
throw(MissingException("the value at index $i is missing for some element"))
end
return xi
end

Base.mapreduce(f, op, itr::SkipMissingsofArrays) =
Base._mapreduce(f, op, Base.IndexStyle(itr), itr)

function Base._mapreduce(f, op, ::IndexLinear, itr::SkipMissingsofArrays)
A = itr.x
local ai
inds = LinearIndices(A)
i = first(inds)
ilast = last(inds)
@inbounds while i <= ilast
ai = A[i]
ai === missing || _anymissingindex(itr.others, i) || break
i += 1
end
i > ilast && return Base.mapreduce_empty(f, op, Base.eltype(itr))
a1 = ai
i += 1
@inbounds while i <= ilast
ai = A[i]
ai === missing || _anymissingindex(itr.others, i) || break
i += 1
end
i > ilast && return Base.mapreduce_first(f, op, a1)
# We know A contains at least two non-missing entries: the result cannot be nothing
something(Base.mapreduce_impl(f, op, itr, first(inds), last(inds)))
end

Base._mapreduce(f, op, ::IndexCartesian, itr::SkipMissingsofArrays) = mapfoldl(f, op, itr)


Base.mapreduce_impl(f, op, A::SkipMissingsofArrays, ifirst::Integer, ilast::Integer) =
Base.mapreduce_impl(f, op, A, ifirst, ilast, Base.pairwise_blocksize(f, op))

# Returns nothing when the input contains only missing values, and Some(x) otherwise
@noinline function Base.mapreduce_impl(f, op, itr::SkipMissingsofArrays,
ifirst::Integer, ilast::Integer, blksize::Int)
A = itr.x
if ifirst == ilast
@inbounds a1 = A[ifirst]
if a1 === missing
return nothing
elseif _anymissingindex(itr.others, ifirst)
return nothing
else
return Some(Base.mapreduce_first(f, op, a1))
end
elseif ifirst + blksize > ilast
# sequential portion
local ai
i = ifirst
@inbounds while i <= ilast
ai = A[i]
ai === missing || _anymissingindex(itr.others, i) || break
i += 1
end
i > ilast && return nothing
a1 = ai::eltype(itr)
i += 1
@inbounds while i <= ilast
ai = A[i]
ai === missing || _anymissingindex(itr.others, i) || break
i += 1
end
i > ilast && return Some(Base.mapreduce_first(f, op, a1))
a2 = ai::eltype(itr)
i += 1
v = op(f(a1), f(a2))
@simd for i = i:ilast
@inbounds ai = A[i]
ai === missing || @inbounds _anymissingindex(itr.others, i) || (v = op(v, f(ai)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this was incorrect before and tests didn't spot it? Probably there was no test with an input large enough to use this branch (IIRC 16 elements is the threshold). Can you add one?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added a test for this.

end
return Some(v)
else
# pairwise portion
imid = (ifirst + ilast) >> 1
v1 = Base.mapreduce_impl(f, op, itr, ifirst, imid, blksize)
v2 = Base.mapreduce_impl(f, op, itr, imid+1, ilast, blksize)
if v1 === nothing && v2 === nothing
return nothing
elseif v1 === nothing
return v2
elseif v2 === nothing
return v1
else
return Some(op(something(v1), something(v2)))
end
end
end

"""
filter(f, itr::SkipMissings)

Return a vector similar to the array wrapped by the given `SkipMissings` iterator
but skipping all elements with a `missing` value in one of the iterators passed
to `skipmissing` and elements for which `f` returns `false`. This method
only applies when all iterators passed to `skipmissings` are arrays.

# Examples
```
julia> x = [missing; 2:9]; y = [1:9; missing];

julia> mx, my = skipmissings(x, y);

julia> filter(isodd, mx)
4-element Array{Int64,1}:
3
5
7
9

```
"""
function filter(f, itr::SkipMissingsofArrays)
x = itr.x
y = similar(x, eltype(itr), 0)
for i in eachindex(x)
@inbounds xi = x[i]
if xi !== missing && @inbounds !_anymissingindex(itr.others, i) && f(xi)
push!(y, xi)
end
end
y
end

end # module
52 changes: 52 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,58 @@ struct CubeRooter end
@test collect(x) == [1, 2, 4]
@test collect(x) isa Vector{Int}

x = [1, 2, missing, 4]
y = ["a", "b", "c", missing]
z = [missing, missing, 3.1, 4.5]
l = [1, 2, 3, 4, 5]
@test_throws ArgumentError skipmissings(x, l)
mx, my = skipmissings(x, y)
iobuf = IOBuffer()
show(iobuf, MIME("text/plain"), mx)
s = String(take!(iobuf))
@test s == "Missings.SkipMissings{Array{Union{Missing, $Int},1}}(" *
"Union{Missing, $Int}[1, 2, missing, 4]) comprised of 2 iterators"
@test collect(mx) == [1, 2]
@test collect(mx) isa Vector{Int}
@test reduce(+, mx) === reduce(+, collect(mx)) === sum(mx) ===
mapreduce(identity, +, mx) === 3
@test mapreduce(x -> x^2, +, mx) === mapreduce(x -> x^2, +, collect(mx)) === 5
mx, my, mz = skipmissings(x, y, z)
@test eltype(mx) == Int
@test eltype(my) == String
@test eltype(mz) == Float64
@test isempty(collect(mx))
@test sum(mx) === 0
x = [missing 4; 2 5; 3 6]
y = [1 4; missing 5; 3 6]
mx, my = skipmissings(x, y)
@test collect(mx) == [3, 4, 5, 6]
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
@test mx[3] == 3
@test_throws MissingException mx[1]
@test reduce(+, mx) === 18
@test isapprox(mapreduce(cos, *, collect(mx)), mapreduce(cos, *, mx))
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
if VERSION >= v"1.4.0-DEV"
t = quote
@inferred Union{Float64, Missing} mapreduce(cos, *, $mx)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, thinking about it again, shouldn't this be just Float64? The result cannot be missing by definition, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the skipmissing in Base we have

julia> @code_warntype sum(sx)
Variables
  #self#::Core.Compiler.Const(sum, false)
  a::Base.SkipMissing{Array{Union{Missing, Int64},1}}

Body::Union{Missing, Int64}
1 ─ %1 = Base.sum(Base.identity, a)::Union{Missing, Int64}
└──      return %1

I don't know why the output works like that given eltype(sx) == Int but it's a yet-to-be improved aspect of mapreduce with skipmissing. So it's fair that skipmissings would work the same. You have a comment on some issue highlighting that it's not a high priority performance issue but I can't find it.

Thankfully small unions are fast.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I need to investigate this. It's somewhat ironic that the function designed to skip missings isn't inferred as such. :-D

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually I've realized @static if works here instead of @eval. I've committed that.

end
eval(t)
end

x = [missing missing missing]
y = [1, 2, 3]
mx, my = skipmissings(x, y)
@test_throws ArgumentError reduce(x -> x/2, mx)
@test_throws ArgumentError mapreduce(x -> x/2, +, mx)
@test_throws MethodError length(mx)
@test IndexStyle(typeof(mx)) == IndexStyle(typeof(x))
x = [isodd(i) ? missing : i for i in 1:64]
y = [isodd(i) ? missing : i for i in 65:128]
mx, my = skipmissings(x, y)
@test sum(mx) === 1056
@static if VERSION >= v"1.4.0-DEV"
@inferred Union{Missing, Int} sum(mx)
end

pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
@test levels(1:1) == levels([1]) == levels([1, missing]) == levels([missing, 1]) == [1]
@test levels(2:-1:1) == levels([2, 1]) == levels([2, missing, 1]) == [1, 2]
@test levels([missing, "a", "c", missing, "b"]) == ["a", "b", "c"]
Expand Down