diff --git a/README.md b/README.md index b9e0cdc..82c2aa3 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ This package provides additional functionality for working with `missing` values - `allowmissing` and `disallowmissing` to convert between `Vector{T}` and `Vector{Union{T, Missing}}` - `passmissing` to wrap a function so that it returns `missing` if any of its positional arguments is `missing` - `levels` to get the unique values in a vector excluding `missing` and in their preferred order +- `skipmissings` to loop through a collection of iterators excluding indi ces where any iterators are `missing` ## Contributing and Questions diff --git a/src/Missings.jl b/src/Missings.jl index b7c4c50..d903890 100644 --- a/src/Missings.jl +++ b/src/Missings.jl @@ -1,7 +1,8 @@ module Missings export allowmissing, disallowmissing, ismissing, missing, missings, - Missing, MissingException, levels, coalesce, passmissing, nonmissingtype + Missing, MissingException, levels, coalesce, passmissing, nonmissingtype, + skipmissings using Base: ismissing, missing, Missing, MissingException using Base: @deprecate @@ -207,4 +208,261 @@ missing """ passmissing(f) = PassMissing{Core.Typeof(f)}(f) +""" + skipmissings(args...) + +Return a tuple of iterators wrapping each of the iterators in `args`, but +skipping elements at positions where at least one of the iterators returns `missing` +(listwise deletion of missing values). + +# Examples +``` +julia> x = [1, 2, missing, 4]; y = [1, 2, 3, missing]; + +julia> tx, ty = skipmissings(x, y) +(Missings.SkipMissings{Array{Union{Missing, Int64},1},Tuple{Array{Union{Missing, Int64},1}}} +(Union{Missing, Int64}[1, 2, missing, 4], (Union{Missing, Int64}[1, 2, 3, missing],)), Missi +ngs.SkipMissings{Array{Union{Missing, Int64},1},Tuple{Array{Union{Missing, Int64},1}}}(Union +{Missing, Int64}[1, 2, 3, missing], (Union{Missing, Int64}[1, 2, missing, 4],))) + +julia> collect(tx) +2-element Array{Int64,1}: + 1 + 2 + +``` +""" +function skipmissings(args...) + if isempty(args) + throw(ArgumentError("Must input one or more arguments")) + end + + if args isa Tuple{Vararg{AbstractArray}} + if !all(x -> length(x) == length(args[1]), args) + throw(ArgumentError("All arguments must have the same length")) + end + + if !all(x -> eachindex(x) == eachindex(args[1]), args) + throw(ArgumentError("All arguments must have the same indices")) + end + end + + ntuple(length(args)) do i + s = setdiff(1:length(args), i) + SkipMissings(args[i], args[s]) + end +end + +struct SkipMissings{V, T} + x::V + others::T +end + +Base.@propagate_inbounds function _anymissingindex(others::Tuple{Vararg{AbstractArray}}, i) + for oth in others + oth[i] === missing && return true + end + + return false +end + +@inline function _anymissingiterate(others::Tuple, state) + for oth in others + y = iterate(oth, state) + y !== nothing && first(y) === missing && return true + end + + return false +end + +const SkipMissingsofArrays = SkipMissings{V, T} where + {V <: AbstractArray, T <: Tuple{Vararg{AbstractArray}}} + +function Base.show(io::IO, mime::MIME"text/plain", itr::SkipMissings{V}) where V + print(io, SkipMissings, '{', V, '}', '(', itr.x, ')', " comprised of " * + "$(length(itr.others) + 1) iterators") +end + +Base.IteratorSize(::Type{<:SkipMissings}) = Base.SizeUnknown() +Base.IteratorEltype(::Type{<:SkipMissings{V}}) where {V} = Base.IteratorEltype(V) +Base.eltype(::Type{<:SkipMissings{V}}) where {V} = nonmissingtype(eltype(V)) +Base.IndexStyle(itr::SkipMissings) = Base.IndexStyle(itr.x) + +function Base.iterate(itr::SkipMissings, state=1) + x_itr = iterate(itr.x, state) + x_itr === nothing && return nothing + x_item, x_state = x_itr + while true + x_item === missing || _anymissingiterate(itr.others, state) || break + x_itr = iterate(itr.x, x_state) + x_itr === nothing && return nothing + state = x_state + x_item, x_state = x_itr + end + return x_item, x_state +end + +function Base.iterate(itr::SkipMissingsofArrays, state=0) + eix = eachindex(itr.x) + ind_itr = iterate(eix, state) + ind_itr === nothing && return nothing + ind_item, ind_state = ind_itr + @inbounds x_item = itr.x[ind_item] + @inbounds while true + x_item === missing || _anymissingindex(itr.others, ind_item) || break + ind_itr = iterate(eix, ind_state) + ind_itr === nothing && return nothing + ind_item, ind_state = ind_itr + x_item = itr.x[ind_item] + end + return x_item, ind_state +end + +Base.IndexStyle(::Type{<:SkipMissings{V}}) where {V} = Base.IndexStyle(V) + +function Base.eachindex(itr::SkipMissingsofArrays) + @inbounds Iterators.filter(eachindex(itr.x)) do i + itr.x[i] !== missing && !_anymissingindex(itr.others, i) + end +end + +function Base.keys(itr::SkipMissingsofArrays) + @inbounds Iterators.filter(keys(itr.x)) do i + itr.x[i] !== missing && !_anymissingindex(itr.others, i) + end +end + +@inline function Base.getindex(itr::SkipMissingsofArrays, i) + @boundscheck checkbounds(itr.x, i) + @inbounds xi = itr.x[i] + if xi === missing || @inbounds _anymissingindex(itr.others, i) + throw(MissingException("the value at index $i is missing for some element")) + end + return xi +end + +Base.mapreduce(f, op, itr::SkipMissingsofArrays) = + Base._mapreduce(f, op, Base.IndexStyle(itr), itr) + +function Base._mapreduce(f, op, ::IndexLinear, itr::SkipMissingsofArrays) + A = itr.x + local ai + inds = LinearIndices(A) + i = first(inds) + ilast = last(inds) + @inbounds while i <= ilast + ai = A[i] + ai === missing || _anymissingindex(itr.others, i) || break + i += 1 + end + i > ilast && return Base.mapreduce_empty(f, op, Base.eltype(itr)) + a1 = ai + i += 1 + @inbounds while i <= ilast + ai = A[i] + ai === missing || _anymissingindex(itr.others, i) || break + i += 1 + end + i > ilast && return Base.mapreduce_first(f, op, a1) + # We know A contains at least two non-missing entries: the result cannot be nothing + something(Base.mapreduce_impl(f, op, itr, first(inds), last(inds))) +end + +Base._mapreduce(f, op, ::IndexCartesian, itr::SkipMissingsofArrays) = mapfoldl(f, op, itr) + + +Base.mapreduce_impl(f, op, A::SkipMissingsofArrays, ifirst::Integer, ilast::Integer) = + Base.mapreduce_impl(f, op, A, ifirst, ilast, Base.pairwise_blocksize(f, op)) + +# Returns nothing when the input contains only missing values, and Some(x) otherwise +@noinline function Base.mapreduce_impl(f, op, itr::SkipMissingsofArrays, + ifirst::Integer, ilast::Integer, blksize::Int) + A = itr.x + if ifirst == ilast + @inbounds a1 = A[ifirst] + if a1 === missing + return nothing + elseif _anymissingindex(itr.others, ifirst) + return nothing + else + return Some(Base.mapreduce_first(f, op, a1)) + end + elseif ifirst + blksize > ilast + # sequential portion + local ai + i = ifirst + @inbounds while i <= ilast + ai = A[i] + ai === missing || _anymissingindex(itr.others, i) || break + i += 1 + end + i > ilast && return nothing + a1 = ai::eltype(itr) + i += 1 + @inbounds while i <= ilast + ai = A[i] + ai === missing || _anymissingindex(itr.others, i) || break + i += 1 + end + i > ilast && return Some(Base.mapreduce_first(f, op, a1)) + a2 = ai::eltype(itr) + i += 1 + v = op(f(a1), f(a2)) + @simd for i = i:ilast + @inbounds ai = A[i] + ai === missing || @inbounds _anymissingindex(itr.others, i) || (v = op(v, f(ai))) + end + return Some(v) + else + # pairwise portion + imid = (ifirst + ilast) >> 1 + v1 = Base.mapreduce_impl(f, op, itr, ifirst, imid, blksize) + v2 = Base.mapreduce_impl(f, op, itr, imid+1, ilast, blksize) + if v1 === nothing && v2 === nothing + return nothing + elseif v1 === nothing + return v2 + elseif v2 === nothing + return v1 + else + return Some(op(something(v1), something(v2))) + end + end +end + +""" + filter(f, itr::SkipMissings) + +Return a vector similar to the array wrapped by the given `SkipMissings` iterator +but skipping all elements with a `missing` value in one of the iterators passed +to `skipmissing` and elements for which `f` returns `false`. This method +only applies when all iterators passed to `skipmissings` are arrays. + +# Examples +``` +julia> x = [missing; 2:9]; y = [1:9; missing]; + +julia> mx, my = skipmissings(x, y); + +julia> filter(isodd, mx) +4-element Array{Int64,1}: + 3 + 5 + 7 + 9 + +``` +""" +function filter(f, itr::SkipMissingsofArrays) + x = itr.x + y = similar(x, eltype(itr), 0) + for i in eachindex(x) + @inbounds xi = x[i] + if xi !== missing && @inbounds !_anymissingindex(itr.others, i) && f(xi) + push!(y, xi) + end + end + y +end + end # module diff --git a/test/runtests.jl b/test/runtests.jl index 1aedba7..79b259a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -75,6 +75,58 @@ struct CubeRooter end @test collect(x) == [1, 2, 4] @test collect(x) isa Vector{Int} + x = [1, 2, missing, 4] + y = ["a", "b", "c", missing] + z = [missing, missing, 3.1, 4.5] + l = [1, 2, 3, 4, 5] + @test_throws ArgumentError skipmissings(x, l) + mx, my = skipmissings(x, y) + iobuf = IOBuffer() + show(iobuf, MIME("text/plain"), mx) + s = String(take!(iobuf)) + @test s == "Missings.SkipMissings{Array{Union{Missing, $Int},1}}(" * + "Union{Missing, $Int}[1, 2, missing, 4]) comprised of 2 iterators" + @test collect(mx) == [1, 2] + @test collect(mx) isa Vector{Int} + @test reduce(+, mx) === reduce(+, collect(mx)) === sum(mx) === + mapreduce(identity, +, mx) === 3 + @test mapreduce(x -> x^2, +, mx) === mapreduce(x -> x^2, +, collect(mx)) === 5 + mx, my, mz = skipmissings(x, y, z) + @test eltype(mx) == Int + @test eltype(my) == String + @test eltype(mz) == Float64 + @test isempty(collect(mx)) + @test sum(mx) === 0 + x = [missing 4; 2 5; 3 6] + y = [1 4; missing 5; 3 6] + mx, my = skipmissings(x, y) + @test collect(mx) == [3, 4, 5, 6] + @test mx[3] == 3 + @test_throws MissingException mx[1] + @test reduce(+, mx) === 18 + @test isapprox(mapreduce(cos, *, collect(mx)), mapreduce(cos, *, mx)) + if VERSION >= v"1.4.0-DEV" + t = quote + @inferred Union{Float64, Missing} mapreduce(cos, *, $mx) + end + eval(t) + end + + x = [missing missing missing] + y = [1, 2, 3] + mx, my = skipmissings(x, y) + @test_throws ArgumentError reduce(x -> x/2, mx) + @test_throws ArgumentError mapreduce(x -> x/2, +, mx) + @test_throws MethodError length(mx) + @test IndexStyle(typeof(mx)) == IndexStyle(typeof(x)) + x = [isodd(i) ? missing : i for i in 1:64] + y = [isodd(i) ? missing : i for i in 65:128] + mx, my = skipmissings(x, y) + @test sum(mx) === 1056 + @static if VERSION >= v"1.4.0-DEV" + @inferred Union{Missing, Int} sum(mx) + end + @test levels(1:1) == levels([1]) == levels([1, missing]) == levels([missing, 1]) == [1] @test levels(2:-1:1) == levels([2, 1]) == levels([2, missing, 1]) == [1, 2] @test levels([missing, "a", "c", missing, "b"]) == ["a", "b", "c"]