From 858efc0c016afc9472a3da4fbfbb6b670ecdecc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 30 Apr 2021 09:23:10 +0200 Subject: [PATCH 1/3] Always keep missing in pool This is WIP to show the changes so that pools can be shared between `PooledArray`s allowing and disallowing missing values. It seems we can have them without much complications. --- src/PooledArrays.jl | 59 +++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/src/PooledArrays.jl b/src/PooledArrays.jl index 9ba0f67..6e405bc 100644 --- a/src/PooledArrays.jl +++ b/src/PooledArrays.jl @@ -30,13 +30,13 @@ end mutable struct PooledArray{T, R<:Integer, N, RA} <: AbstractArray{T, N} refs::RA - pool::Vector{T} - invpool::Dict{T,R} + pool::Vector{Union{Missing,T}} + invpool::Dict{Union{Missing,T},R} # refcount[] is 1 if only one PooledArray holds a reference to pool and invpool refcount::Threads.Atomic{Int} - function PooledArray{T,R,N,RA}(rs::RefArray{RA}, invpool::Dict{T, R}, - pool::Vector{T}=_invert(invpool), + function PooledArray{T,R,N,RA}(rs::RefArray{RA}, invpool::Dict{Union{Missing,T}, R}, + pool::Vector{Union{Missing,T}}=_invert(invpool), refcount::Threads.Atomic{Int}=Threads.Atomic{Int}(1)) where {T,R,N,RA<:AbstractArray{R, N}} # this is a quick but incomplete consistency check if length(pool) != length(invpool) @@ -45,11 +45,22 @@ mutable struct PooledArray{T, R<:Integer, N, RA} <: AbstractArray{T, N} if length(rs.a) > 0 # 0 indicates #undef # refs mustn't overflow pool - minref, maxref = extrema(rs.a) - if (minref < 0 || maxref > length(invpool)) - throw(ArgumentError("Reference array points beyond the end of the pool")) + + ipl = length(invpool) + for v in rs.a + if (v < 0 || v > ipl) + throw(ArgumentError("Reference array points beyond the end of the pool")) + end + if !(T <: Missing) + if v == 1 + throw(ArgumentError("Missing value in a pool that does not allow it")) + end + end end end + if pool[1] !== missing + throw(ArgumentError("First entry in pool must be a missing value")) + end pa = new{T,R,N,RA}(rs.a, pool, invpool, refcount) finalizer(x -> Threads.atomic_sub!(x.refcount, 1), pa) return pa @@ -76,7 +87,8 @@ const PooledArrOrSub = Union{SubArray{T, N, <:PooledArray{T, R}}, ############################################################################## # Echo inner constructor as an outer constructor -PooledArray(refs::RefArray{RA}, invpool::Dict{T,R}, pool::Vector{T}=_invert(invpool), +PooledArray{T}(refs::RefArray{RA}, invpool::Dict{Union{Missing,T},R}, + pool::Vector{Union{Missing,T}}=_invert(invpool), refcount::Threads.Atomic{Int}=Threads.Atomic{Int}(1)) where {T,R,RA<:AbstractArray{R}} = PooledArray{T,R,ndims(RA),RA}(refs, invpool, pool, refcount) @@ -100,8 +112,8 @@ function _label(xs::AbstractArray, ::Type{I}=DEFAULT_POOLED_REF_TYPE, start = 1, labels = Array{I}(undef, size(xs)), - invpool::Dict{T,I} = Dict{T, I}(), - pool::Vector{T} = T[], + invpool::Dict{Union{Missing,T},I} = Dict{Union{Missing,T}, I}(), + pool::Vector{Union{Missing,T}} = Union{Missing,T}[], nlabels = 0, ) where {T, I<:Integer} @@ -213,7 +225,7 @@ Base.copy(pa::PooledArrOrSub) = PooledArray(pa) # here we do not allow dest to be SubArray as copy! is intended to replace whole arrays # slow path will be used for SubArray -function copy!(dest::PooledArray{T, R, N}, +function copy!(dest::Union{PooledArray{T, R, N}, PooledArray{Union{Missing,T}, R, N}}, src::PooledArrOrSub{T, N, R}) where {T, N, R} copy!(dest.refs, DataAPI.refarray(src)) src_refcount = refcount(src) @@ -232,10 +244,12 @@ function copy!(dest::PooledArray{T, R, N}, end # this is needed as Julia Base uses a special path for this case we want to avoid -Base.copyto!(dest::PooledArrOrSub{T, N, R}, src::PooledArrOrSub{T, N, R}) where {T, N, R} = +Base.copyto!(dest::Union{PooledArrOrSub{T, N, R}, PooledArrOrSub{Union{Missing,T}, N, R}}, + src::PooledArrOrSub{T, N, R}) where {T, N, R} = copyto!(dest, 1, src, 1, length(src)) -function Base.copyto!(dest::PooledArrOrSub{T, N, R}, doffs::Union{Signed, Unsigned}, +function Base.copyto!(dest::Union{PooledArrOrSub{T, N, R}, PooledArrOrSub{Union{Missing,T}, N, R}}, + doffs::Union{Signed, Unsigned}, src::PooledArrOrSub{T, N, R}, soffs::Union{Signed, Unsigned}, n::Union{Signed, Unsigned}) where {T, N, R} n == 0 && return dest @@ -293,7 +307,7 @@ function Base.invpermute!!(x::PooledArray, p::AbstractVector{T}) where T<:Intege end Base.similar(pa::PooledArray{T,R}, S::Type, dims::Dims) where {T,R} = - PooledArray(RefArray(zeros(R, dims)), Dict{S,R}()) + PooledArray{S}(RefArray(zeros(R, dims)), Dict{Union{Missing,S},R}()) Base.findall(pdv::PooledVector{Bool}) = findall(convert(Vector{Bool}, pdv)) @@ -304,6 +318,10 @@ Base.findall(pdv::PooledVector{Bool}) = findall(convert(Vector{Bool}, pdv)) ## ############################################################################## +# TODO ensure proper translation: +# 1. if missing is mapped to something else - re-introduce the missing +# 2. make sure missing ends up as the first entry in the result +# 3. correctly calculate the eltype of the result (so that Missing does not affect it if it was not allowed originally) function Base.map(f, x::PooledArray{T,R}) where {T,R<:Integer} ks = collect(keys(x.invpool)) vs = collect(values(x.invpool)) @@ -392,6 +410,8 @@ Base.sort(pa::PooledArray; kw...) = pa[sortperm(pa; kw...)] ## ############################################################################## +# TODO: correctly handle types in conversions + function Base.convert(::Type{PooledArray{S,R1,N}}, pa::PooledArray{T,R2,N}) where {S,T,R1<:Integer,R2<:Integer,N} invpool_conv = convert(Dict{S,R1}, pa.invpool) @assert invpool_conv !== pa.invpool @@ -448,7 +468,8 @@ Base.convert(::Type{Array}, pa::PooledArray{T, R, N}) where {T, R, N} = convert( Base.@propagate_inbounds function Base.getindex(A::PooledArray, I::Int) idx = DataAPI.refarray(A)[I] - iszero(idx) && throw(UndefRefError()) + # if eltype(A) does not allow Missing then also 1 is an error + idx <= !(eltype(A) <: Missing) && throw(UndefRefError()) return @inbounds DataAPI.refpool(A)[idx] end @@ -466,7 +487,8 @@ Base.@propagate_inbounds function Base.getindex(A::PooledArrOrSub, end Base.@propagate_inbounds function Base.isassigned(pa::PooledArrOrSub, I::Int...) - !iszero(DataAPI.refarray(pa)[I...]) + # if eltype(A) does not allow Missing then also 1 is an error + !(DataAPI.refarray(pa)[I...] <= !(eltype(pa) <: Missing)) end ############################################################################## @@ -511,6 +533,9 @@ end Base.IndexStyle(::Type{<:PooledArray}) = IndexLinear() Base.@propagate_inbounds function Base.setindex!(x::PooledArray, val, ind::Int) + if val === missing && !(eltype(x) <: Missing) + throw(ArgumentError("PooledArray element type does not allow storing missing values")) + end x.refs[ind] = getpoolidx(x, val) return x end @@ -548,6 +573,8 @@ Base.empty!(pv::PooledVector) = (empty!(pv.refs); pv) Base.deleteat!(pv::PooledVector, inds) = (deleteat!(pv.refs, inds); pv) +# TODO: review vcat for possible Missing related issues + function _vcat!(c, a, b) copyto!(c, 1, a, 1, length(a)) return copyto!(c, length(a)+1, b, 1, length(b)) From 5ecc0bd9a9618924c1aaec257f3b4f943e7f5305 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Sat, 22 May 2021 01:43:21 -0600 Subject: [PATCH 2/3] A few fixes --- src/PooledArrays.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/PooledArrays.jl b/src/PooledArrays.jl index 6e405bc..31b95d3 100644 --- a/src/PooledArrays.jl +++ b/src/PooledArrays.jl @@ -51,7 +51,7 @@ mutable struct PooledArray{T, R<:Integer, N, RA} <: AbstractArray{T, N} if (v < 0 || v > ipl) throw(ArgumentError("Reference array points beyond the end of the pool")) end - if !(T <: Missing) + if !(T >: Missing) if v == 1 throw(ArgumentError("Missing value in a pool that does not allow it")) end @@ -101,9 +101,9 @@ function _our_copy(x::SubArray{<:Any, 0}) return y end -function PooledArray(d::PooledArrOrSub) +function PooledArray(d::PooledArrOrSub{T}) where {T} Threads.atomic_add!(refcount(d), 1) - return PooledArray(RefArray(_our_copy(DataAPI.refarray(d))), + return PooledArray{T}(RefArray(_our_copy(DataAPI.refarray(d))), DataAPI.invrefpool(d), DataAPI.refpool(d), refcount(d)) end @@ -469,26 +469,26 @@ Base.convert(::Type{Array}, pa::PooledArray{T, R, N}) where {T, R, N} = convert( Base.@propagate_inbounds function Base.getindex(A::PooledArray, I::Int) idx = DataAPI.refarray(A)[I] # if eltype(A) does not allow Missing then also 1 is an error - idx <= !(eltype(A) <: Missing) && throw(UndefRefError()) + idx <= !(eltype(A) >: Missing) && throw(UndefRefError()) return @inbounds DataAPI.refpool(A)[idx] end # we handle fast only the case when the first index is an abstract vector # this is to make sure other indexing synraxes use standard dispatch from Base # the reason is that creation of DataAPI.refarray(A) is unfortunately slow -Base.@propagate_inbounds function Base.getindex(A::PooledArrOrSub, +Base.@propagate_inbounds function Base.getindex(A::PooledArrOrSub{T}, I1::AbstractVector, - I2::Union{Real, AbstractVector}...) + I2::Union{Real, AbstractVector}...) where {T} # make sure we do not increase A.refcount in case creation of newrefs fails newrefs = DataAPI.refarray(A)[I1, I2...] @assert newrefs isa AbstractArray Threads.atomic_add!(refcount(A), 1) - return PooledArray(RefArray(newrefs), DataAPI.invrefpool(A), DataAPI.refpool(A), refcount(A)) + return PooledArray{T}(RefArray(newrefs), DataAPI.invrefpool(A), DataAPI.refpool(A), refcount(A)) end Base.@propagate_inbounds function Base.isassigned(pa::PooledArrOrSub, I::Int...) # if eltype(A) does not allow Missing then also 1 is an error - !(DataAPI.refarray(pa)[I...] <= !(eltype(pa) <: Missing)) + !(DataAPI.refarray(pa)[I...] <= !(eltype(pa) >: Missing)) end ############################################################################## @@ -533,7 +533,7 @@ end Base.IndexStyle(::Type{<:PooledArray}) = IndexLinear() Base.@propagate_inbounds function Base.setindex!(x::PooledArray, val, ind::Int) - if val === missing && !(eltype(x) <: Missing) + if val === missing && !(eltype(x) >: Missing) throw(ArgumentError("PooledArray element type does not allow storing missing values")) end x.refs[ind] = getpoolidx(x, val) From e979ea6511f88e5cc8611275daecf7d3642e1136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 23 May 2021 00:12:31 +0200 Subject: [PATCH 3/3] partial fix of the PR --- src/PooledArrays.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/PooledArrays.jl b/src/PooledArrays.jl index 31b95d3..4892dfb 100644 --- a/src/PooledArrays.jl +++ b/src/PooledArrays.jl @@ -45,7 +45,7 @@ mutable struct PooledArray{T, R<:Integer, N, RA} <: AbstractArray{T, N} if length(rs.a) > 0 # 0 indicates #undef # refs mustn't overflow pool - + ipl = length(invpool) for v in rs.a if (v < 0 || v > ipl) @@ -112,9 +112,9 @@ function _label(xs::AbstractArray, ::Type{I}=DEFAULT_POOLED_REF_TYPE, start = 1, labels = Array{I}(undef, size(xs)), - invpool::Dict{Union{Missing,T},I} = Dict{Union{Missing,T}, I}(), - pool::Vector{Union{Missing,T}} = Union{Missing,T}[], - nlabels = 0, + invpool::Dict{Union{Missing,T},I} = Dict{Union{Missing,T}, I}(missing => one(I)), + pool::Vector{Union{Missing,T}} = Union{Missing,T}[missing], + nlabels = 1, ) where {T, I<:Integer} @inbounds for i in start:length(xs) @@ -188,7 +188,7 @@ end function PooledArray{T}(d::AbstractArray; signed::Bool=false, compress::Bool=false) where {T} R = signed ? (compress ? Int8 : DEFAULT_SIGNED_REF_TYPE) : (compress ? UInt8 : DEFAULT_POOLED_REF_TYPE) refs, invpool, pool = _label(d, T, R) - return PooledArray(RefArray(refs), invpool, pool) + return PooledArray{T}(RefArray(refs), invpool, pool) end PooledArray(d::AbstractArray{T}, r::Type) where {T} = PooledArray{T}(d, r) @@ -291,9 +291,9 @@ function Base.resize!(pa::PooledArray{T,R,1}, n::Integer) where {T,R} return pa end -function Base.reverse(x::PooledArray) +function Base.reverse(x::PooledArray{T}) where {T} Threads.atomic_add!(x.refcount, 1) - PooledArray(RefArray(reverse(x.refs)), x.invpool, x.pool, x.refcount) + PooledArray{T}(RefArray(reverse(x.refs)), x.invpool, x.pool, x.refcount) end function Base.permute!!(x::PooledArray, p::AbstractVector{T}) where T<:Integer @@ -307,7 +307,7 @@ function Base.invpermute!!(x::PooledArray, p::AbstractVector{T}) where T<:Intege end Base.similar(pa::PooledArray{T,R}, S::Type, dims::Dims) where {T,R} = - PooledArray{S}(RefArray(zeros(R, dims)), Dict{Union{Missing,S},R}()) + PooledArray{S}(RefArray(zeros(R, dims)), Dict{Union{Missing,S},R}(missing=>one(R))) Base.findall(pdv::PooledVector{Bool}) = findall(convert(Vector{Bool}, pdv)) @@ -466,7 +466,7 @@ Base.convert(::Type{Array}, pa::PooledArray{T, R, N}) where {T, R, N} = convert( # We need separate functions due to dispatch ambiguities -Base.@propagate_inbounds function Base.getindex(A::PooledArray, I::Int) +Base.@propagate_inbounds function Base.getindex(A::PooledArray{T}, I::Int)::T where T idx = DataAPI.refarray(A)[I] # if eltype(A) does not allow Missing then also 1 is an error idx <= !(eltype(A) >: Missing) && throw(UndefRefError())