-
Notifications
You must be signed in to change notification settings - Fork 13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Always keep missing in pool #65
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,13 +30,13 @@ end | |
|
||
mutable struct PooledArray{T, R<:Integer, N, RA} <: AbstractArray{T, N} | ||
refs::RA | ||
pool::Vector{T} | ||
invpool::Dict{T,R} | ||
pool::Vector{Union{Missing,T}} | ||
invpool::Dict{Union{Missing,T},R} | ||
# refcount[] is 1 if only one PooledArray holds a reference to pool and invpool | ||
refcount::Threads.Atomic{Int} | ||
|
||
function PooledArray{T,R,N,RA}(rs::RefArray{RA}, invpool::Dict{T, R}, | ||
pool::Vector{T}=_invert(invpool), | ||
function PooledArray{T,R,N,RA}(rs::RefArray{RA}, invpool::Dict{Union{Missing,T}, R}, | ||
pool::Vector{Union{Missing,T}}=_invert(invpool), | ||
refcount::Threads.Atomic{Int}=Threads.Atomic{Int}(1)) where {T,R,N,RA<:AbstractArray{R, N}} | ||
# this is a quick but incomplete consistency check | ||
if length(pool) != length(invpool) | ||
|
@@ -45,11 +45,22 @@ mutable struct PooledArray{T, R<:Integer, N, RA} <: AbstractArray{T, N} | |
if length(rs.a) > 0 | ||
# 0 indicates #undef | ||
# refs mustn't overflow pool | ||
minref, maxref = extrema(rs.a) | ||
if (minref < 0 || maxref > length(invpool)) | ||
throw(ArgumentError("Reference array points beyond the end of the pool")) | ||
|
||
ipl = length(invpool) | ||
for v in rs.a | ||
if (v < 0 || v > ipl) | ||
throw(ArgumentError("Reference array points beyond the end of the pool")) | ||
end | ||
if !(T <: Missing) | ||
if v == 1 | ||
throw(ArgumentError("Missing value in a pool that does not allow it")) | ||
end | ||
end | ||
end | ||
end | ||
if pool[1] !== missing | ||
throw(ArgumentError("First entry in pool must be a missing value")) | ||
end | ||
pa = new{T,R,N,RA}(rs.a, pool, invpool, refcount) | ||
finalizer(x -> Threads.atomic_sub!(x.refcount, 1), pa) | ||
return pa | ||
|
@@ -76,7 +87,8 @@ const PooledArrOrSub = Union{SubArray{T, N, <:PooledArray{T, R}}, | |
############################################################################## | ||
|
||
# Echo inner constructor as an outer constructor | ||
PooledArray(refs::RefArray{RA}, invpool::Dict{T,R}, pool::Vector{T}=_invert(invpool), | ||
PooledArray{T}(refs::RefArray{RA}, invpool::Dict{Union{Missing,T},R}, | ||
pool::Vector{Union{Missing,T}}=_invert(invpool), | ||
refcount::Threads.Atomic{Int}=Threads.Atomic{Int}(1)) where {T,R,RA<:AbstractArray{R}} = | ||
PooledArray{T,R,ndims(RA),RA}(refs, invpool, pool, refcount) | ||
|
||
|
@@ -100,8 +112,8 @@ function _label(xs::AbstractArray, | |
::Type{I}=DEFAULT_POOLED_REF_TYPE, | ||
start = 1, | ||
labels = Array{I}(undef, size(xs)), | ||
invpool::Dict{T,I} = Dict{T, I}(), | ||
pool::Vector{T} = T[], | ||
invpool::Dict{Union{Missing,T},I} = Dict{Union{Missing,T}, I}(), | ||
pool::Vector{Union{Missing,T}} = Union{Missing,T}[], | ||
nlabels = 0, | ||
) where {T, I<:Integer} | ||
|
||
|
@@ -213,7 +225,7 @@ Base.copy(pa::PooledArrOrSub) = PooledArray(pa) | |
|
||
# here we do not allow dest to be SubArray as copy! is intended to replace whole arrays | ||
# slow path will be used for SubArray | ||
function copy!(dest::PooledArray{T, R, N}, | ||
function copy!(dest::Union{PooledArray{T, R, N}, PooledArray{Union{Missing,T}, R, N}}, | ||
src::PooledArrOrSub{T, N, R}) where {T, N, R} | ||
copy!(dest.refs, DataAPI.refarray(src)) | ||
src_refcount = refcount(src) | ||
|
@@ -232,10 +244,12 @@ function copy!(dest::PooledArray{T, R, N}, | |
end | ||
|
||
# this is needed as Julia Base uses a special path for this case we want to avoid | ||
Base.copyto!(dest::PooledArrOrSub{T, N, R}, src::PooledArrOrSub{T, N, R}) where {T, N, R} = | ||
Base.copyto!(dest::Union{PooledArrOrSub{T, N, R}, PooledArrOrSub{Union{Missing,T}, N, R}}, | ||
src::PooledArrOrSub{T, N, R}) where {T, N, R} = | ||
copyto!(dest, 1, src, 1, length(src)) | ||
|
||
function Base.copyto!(dest::PooledArrOrSub{T, N, R}, doffs::Union{Signed, Unsigned}, | ||
function Base.copyto!(dest::Union{PooledArrOrSub{T, N, R}, PooledArrOrSub{Union{Missing,T}, N, R}}, | ||
doffs::Union{Signed, Unsigned}, | ||
src::PooledArrOrSub{T, N, R}, soffs::Union{Signed, Unsigned}, | ||
n::Union{Signed, Unsigned}) where {T, N, R} | ||
n == 0 && return dest | ||
|
@@ -293,7 +307,7 @@ function Base.invpermute!!(x::PooledArray, p::AbstractVector{T}) where T<:Intege | |
end | ||
|
||
Base.similar(pa::PooledArray{T,R}, S::Type, dims::Dims) where {T,R} = | ||
PooledArray(RefArray(zeros(R, dims)), Dict{S,R}()) | ||
PooledArray{S}(RefArray(zeros(R, dims)), Dict{Union{Missing,S},R}()) | ||
|
||
Base.findall(pdv::PooledVector{Bool}) = findall(convert(Vector{Bool}, pdv)) | ||
|
||
|
@@ -304,6 +318,10 @@ Base.findall(pdv::PooledVector{Bool}) = findall(convert(Vector{Bool}, pdv)) | |
## | ||
############################################################################## | ||
|
||
# TODO ensure proper translation: | ||
# 1. if missing is mapped to something else - re-introduce the missing | ||
# 2. make sure missing ends up as the first entry in the result | ||
# 3. correctly calculate the eltype of the result (so that Missing does not affect it if it was not allowed originally) | ||
function Base.map(f, x::PooledArray{T,R}) where {T,R<:Integer} | ||
ks = collect(keys(x.invpool)) | ||
vs = collect(values(x.invpool)) | ||
|
@@ -392,6 +410,8 @@ Base.sort(pa::PooledArray; kw...) = pa[sortperm(pa; kw...)] | |
## | ||
############################################################################## | ||
|
||
# TODO: correctly handle types in conversions | ||
|
||
function Base.convert(::Type{PooledArray{S,R1,N}}, pa::PooledArray{T,R2,N}) where {S,T,R1<:Integer,R2<:Integer,N} | ||
invpool_conv = convert(Dict{S,R1}, pa.invpool) | ||
@assert invpool_conv !== pa.invpool | ||
|
@@ -448,7 +468,8 @@ Base.convert(::Type{Array}, pa::PooledArray{T, R, N}) where {T, R, N} = convert( | |
|
||
Base.@propagate_inbounds function Base.getindex(A::PooledArray, I::Int) | ||
idx = DataAPI.refarray(A)[I] | ||
iszero(idx) && throw(UndefRefError()) | ||
# if eltype(A) does not allow Missing then also 1 is an error | ||
idx <= !(eltype(A) <: Missing) && throw(UndefRefError()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmmm, I think something like the following should be faster (as it avoids the core subtyping algorithm): @assert Base.nonmissingtype(T) === T || !isone(idx) I think I like doing an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could do this also. I wanted to squeeze it into one check (to reduce the cost of However, first we should decide if we go for the design I proposed here or the design similar to what CategoricalArrays.jl does (as the other approach does not have this issue). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The other approach is having There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right. The issue is the following:
so this would be breaking (i.e. it is impossible to have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and then you have:
which means one needs a custom type to handle non 1-based indexing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmmmm, I don't think that's awful behavior, but indeed a bit inconsistent. I mean, not letting users get There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel like we'd have a similar problem with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The option is to have a custom |
||
return @inbounds DataAPI.refpool(A)[idx] | ||
end | ||
|
||
|
@@ -466,7 +487,8 @@ Base.@propagate_inbounds function Base.getindex(A::PooledArrOrSub, | |
end | ||
|
||
Base.@propagate_inbounds function Base.isassigned(pa::PooledArrOrSub, I::Int...) | ||
!iszero(DataAPI.refarray(pa)[I...]) | ||
# if eltype(A) does not allow Missing then also 1 is an error | ||
!(DataAPI.refarray(pa)[I...] <= !(eltype(pa) <: Missing)) | ||
end | ||
|
||
############################################################################## | ||
|
@@ -511,6 +533,9 @@ end | |
Base.IndexStyle(::Type{<:PooledArray}) = IndexLinear() | ||
|
||
Base.@propagate_inbounds function Base.setindex!(x::PooledArray, val, ind::Int) | ||
if val === missing && !(eltype(x) <: Missing) | ||
throw(ArgumentError("PooledArray element type does not allow storing missing values")) | ||
end | ||
x.refs[ind] = getpoolidx(x, val) | ||
return x | ||
end | ||
|
@@ -548,6 +573,8 @@ Base.empty!(pv::PooledVector) = (empty!(pv.refs); pv) | |
|
||
Base.deleteat!(pv::PooledVector, inds) = (deleteat!(pv.refs, inds); pv) | ||
|
||
# TODO: review vcat for possible Missing related issues | ||
|
||
function _vcat!(c, a, b) | ||
copyto!(c, 1, a, 1, length(a)) | ||
return copyto!(c, length(a)+1, b, 1, length(b)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I kind of think we should just keep this
T
; and letUnion{Missing, T}
be provided as theT
. That way if you don't havemissing
values, you're not penalized. It seems like to sosimillar_missing
, we could provide a function that just re-wraps therefs
,pool
,invpool
, etc in a newPooledArray
struct w/Union{T, Missing}
, right?But maybe you're right that it would be too expensive to have to change
pool::Vector{T}
andinvpool::Dict{T, R}
to supportmissing
as well in thesimilar_missing
operation.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am trying to avoid having to pay the following cost:
(it would also reduce the memory burden of making this operation)
With this PR it would be avoided.
My original preference was different, i.e. to never allow
Missing
in therefpool
andinvrefpool
but have0
to be a ref that is interpreted asmissing
without making a lookup, but @nalimilan thought it would be overly complex. That is why I have made a draft here to discuss the design.This would have a consequence similar to what we have in CategoricalArrays.jl:
(so magically
#undef
becomesmissing
which is probably undesirable)There is no rush to make a decision here. Let us think and decide what is best.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah neither solution is simple. Maybe it would be possible to make
convert
faster for dicts when the target eltype is a supertype of the source eltype?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AFAICT this has been an open issue in Julia Base for a long time. Who do you think is best to ask?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No idea. I guess we (you :-p) would have to come up with at least a draft PR to get people's attention. But this may be quite easy to do.
convert
calls this constructor when the origin type is the same as the destination:https://github.com/JuliaLang/julia/blob/be6887fa9ffa3d08315c93f1551c41b44dd3d720/base/dict.jl#L92-L95
I wonder whether what's needed is just to call
convert
on the keys and value when the types don't match. What do you think?Assuming that works, do you think it would be an acceptable replacement for this PR or would it still not be sufficient?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But then we loose COW benefits. However, since we are not fully convinced what is best I would park this PR for now, so that @quinnj can finish CSV.jl update without this functionality (which is more pressing). We can go back to this PR and decide what to do later (especially that if we change the internals probably 2.0 release would be needed).
And maybe in the mean time what @nalimilan proposes to do in Julia Base would be implemented (I cannot do it as I do not know how to do it unfortunately as I know too little about internal design related to JuliaLang/julia#26681)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm afraid the best person to implement no-copy
convert
forArray{Union{T, Missing}}
is also @quinnj! :-DThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@quinnj - if you do not have time for this (which I assume is the case) - can you please point me to the part of the sources that would have to be changed? (I assume it is somewhere in the C codes)
Thank you!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All the relevant code is in https://github.com/JuliaLang/julia/blob/master/src/array.c. In the jl_array_typetagdata function, it returns the "selector bytes" for an isbits-union array. This is where we lengthen the requested array size to account for selectory bytes. We also have to ensure the selector bytes are always initialized to zero, since otherwise you could get selector bytes pointing to something not just garbage, but totally wrong.
So the operation we'd need is
convert(::Type{Vector{Union{T, S}}}, x::Vector{T}) where {T, S}
, right? And we're ok if the two arrays potentially share data? So it seems we would add a check on the julia-side for ifT
isisbitstype
andS
isisbitstype
, and if so, call a new array.c routine.This is where it gets a little hairy though, because we have
array_resize_buffer
, but that takes an existing array to resize it. We'd need to use some of the same logic in that routine and make a newjl_convert_to_isbitsunion
method that:isbitstype
Union
eltype is alsoisbitstype
a->flags.isshared
(means we can't resize it; like an mmap buffer)jl_ptr_to_array_1d
with our realloc'd pointer? that will take care of making the new array and using the realloc'd data pointerThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah - I was afraid it would not be that simple. What we need is only
convert(::Type{Vector{Union{T, Missing}}}, x::Vector{T}) where {T}
butT
may be any type (also non-bits, e.g.String
). So I guess it would require separate paths forT
being bits type and being non-bits type.However, this in general shows the following problem. Assume that we have
x
andy
sharing the same data. Where eltype ofx
isT
and ofy
isUnion{Missing, T}
. What will happen tox
if we doy[1] = missing
? My intuition that this operation is super problematic - right? (i.e. would lead to an undefined state ofx[1]
)So the conclusion is that these arrays may not share data. Therefore we have an easier task of adding an internal constructor in PooledArrays.jl for
Dict
type that instead of:would copy an internal structure of source
Dict
and only do promotion of eltype ofkeys
field in the source dict.So we should have a function like:
and the performance would be:
the benefit would be ~3x speedup.