diff --git a/base/reducedim.jl b/base/reducedim.jl index 44a72a9a25b9d..251c5a83074b7 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -353,9 +353,19 @@ reduce(op, A::AbstractArray; kw...) = mapreduce(identity, op, A; kw...) ##### Specific reduction functions ##### """ - sum(A::AbstractArray; dims) + sum(A::AbstractArray; [dims], [weights::AbstractArray]) -Sum elements of an array over the given dimensions. +Compute the sum of array `A`. +If `dims` is provided, return an array of sums over these dimensions. +If `weights` is provided, return the weighted sum(s). `weights` must be +either an array of the same size as `A` if `dims` is omitted, +or a vector with the same length as `size(A, dims)` if `dims` is provided. + +!!! compat "Julia 1.1" + `mean` for empty arrays requires at least Julia 1.1. + +!!! compat "Julia 1.3" + The `weights` keyword argument requires at least Julia 1.3. # Examples ```jldoctest @@ -372,14 +382,26 @@ julia> sum(A, dims=2) 2×1 Array{Int64,2}: 3 7 + +julia> sum(A, weights=[2 1; 2 1]) +14 + +julia> sum(A, weights=[2, 1], dims=1) +1×2 Array{Int64,2}: + 5 8 ``` """ sum(A::AbstractArray; dims) """ - sum!(r, A) + sum!(r, A; [weights::AbstractVector]) Sum elements of `A` over the singleton dimensions of `r`, and write results to `r`. +If `r` has only one singleton dimension `i`, `weights` can be a vector of length +`size(v, i)` to compute the weighted mean. + +!!! compat "Julia 1.3" + The `weights` keyword argument requires at least Julia 1.3. # Examples ```jldoctest @@ -645,7 +667,7 @@ julia> any!([1 1], A) """ any!(r, A) -for (fname, _fname, op) in [(:sum, :_sum, :add_sum), (:prod, :_prod, :mul_prod), +for (fname, _fname, op) in [(:prod, :_prod, :mul_prod), (:maximum, :_maximum, :max), (:minimum, :_minimum, :min)] @eval begin # User-facing methods with keyword arguments @@ -658,6 +680,14 @@ for (fname, _fname, op) in [(:sum, :_sum, :add_sum), (:prod, :_prod, end end +# Sum is the only reduction which supports weights +sum(a::AbstractArray; dims=:, weights::Union{AbstractArray,Nothing}=nothing) = + _sum(a, dims, weights) +sum(f, a::AbstractArray; dims=:, weights::Union{AbstractArray,Nothing}=nothing) = + _sum(f, a, dims, weights) +sum(a, ::Colon, weights) = _sum(identity, a, :, weights) +sum(f, a, ::Colon, ::Nothing) = mapreduce(f, add_sum, a) + any(a::AbstractArray; dims=:) = _any(a, dims) any(f::Function, a::AbstractArray; dims=:) = _any(f, a, dims) _any(a, ::Colon) = _any(identity, a, :) @@ -665,21 +695,173 @@ all(a::AbstractArray; dims=:) = _all(a, dims) all(f::Function, a::AbstractArray; dims=:) = _all(f, a, dims) _all(a, ::Colon) = _all(identity, a, :) -for (fname, op) in [(:sum, :add_sum), (:prod, :mul_prod), +for (fname, op) in [(:prod, :mul_prod), (:maximum, :max), (:minimum, :min), (:all, :&), (:any, :|)] fname! = Symbol(fname, '!') + _fname! = Symbol('_', fname, '!') _fname = Symbol('_', fname) @eval begin + $(fname!)(r::AbstractArray, A::AbstractArray; init::Bool=true) = + $(fname!)(identity, r, A; init=init) $(fname!)(f::Function, r::AbstractArray, A::AbstractArray; init::Bool=true) = - mapreducedim!(f, $(op), initarray!(r, $(op), init, A), A) - $(fname!)(r::AbstractArray, A::AbstractArray; init::Bool=true) = $(fname!)(identity, r, A; init=init) + $(_fname!)(f, r, A; init=init) - $(_fname)(A, dims) = $(_fname)(identity, A, dims) + # Underlying implementations using dispatch + $(_fname!)(f, r::AbstractArray, A::AbstractArray; init::Bool=true) = + mapreducedim!(f, $(op), initarray!(r, $(op), init, A), A) + $(_fname)(A, dims) = $(_fname)(identity, A, dims) $(_fname)(f, A, dims) = mapreduce(f, $(op), A, dims=dims) end end +# Sum is the only reduction which supports weights +sum!(r::AbstractArray, A::AbstractArray; + init::Bool=true, weights::Union{AbstractArray,Nothing}=nothing) = + sum!(identity, r, A; init=init, weights=weights) +sum!(f::Function, r::AbstractArray, A::AbstractArray; + init::Bool=true, weights::Union{AbstractArray,Nothing}=nothing) = + _sum!(f, r, A, weights; init=init) +_sum!(f, r::AbstractArray, A::AbstractArray, ::Nothing; init::Bool=true) = + mapreducedim!(f, add_sum, initarray!(r, add_sum, init, A), A) +_sum(A::AbstractArray, dims, weights) = _sum(identity, A, dims, weights) +_sum(f, A::AbstractArray, dims, ::Nothing) = mapreduce(f, add_sum, A, dims=dims) +_sum(::typeof(identity), A::AbstractArray, dims, w::AbstractArray) = + _sum!(identity, reducedim_init(t -> t*zero(eltype(w)), add_sum, A, dims), A, w) +_sum(f, A::AbstractArray, dims, w::AbstractArray) = + throw(ArgumentError("Passing a function is not supported with `weights`")) + + +# Weighted sum +function _sum(A::AbstractArray, dims::Colon, w::AbstractArray{<:Real}) + sw = size(w) + sA = size(A) + if sw != sA + throw(DimensionMismatch("weights must have the same dimension as data (got $sw and $sA).")) + end + s0 = zero(eltype(A)) * zero(eltype(w)) + s = add_sum(s0, s0) + @inbounds @simd for i in eachindex(A, w) + s += A[i] * w[i] + end + s +end + +# Weighted sum over dimensions +# +# Brief explanation of the algorithm: +# ------------------------------------ +# +# 1. _wsum! provides the core implementation, which assumes that +# the dimensions of all input arguments are consistent, and no +# dimension checking is performed therein. +# +# wsum and wsum! perform argument checking and call _wsum! +# internally. +# +# 2. _wsum! adopt a Cartesian based implementation for general +# sub types of AbstractArray. Particularly, a faster routine +# that keeps a local accumulator will be used when dim = 1. +# +# The internal function that implements this is _wsum_general! +# +# 3. _wsum! is specialized for following cases: +# (a) A is a vector: we invoke the vector version wsum above. +# The internal function that implements this is _wsum1! +# +# (b) A is a dense matrix with eltype <: BlasReal: we call gemv! +# The internal function that implements this is _wsum2_blas! +# (in LinearAlgebra/src/wsum.jl) +# +# (c) A is a contiguous array with eltype <: BlasReal: +# dim == 1: treat A like a matrix of size (d1, d2 x ... x dN) +# dim == N: treat A like a matrix of size (d1 x ... x d(N-1), dN) +# otherwise: decompose A into multiple pages, and apply _wsum2_blas! +# for each +# The internal function that implements this is _wsumN! +# (in LinearAlgebra/src/wsum.jl) +# +# (d) A is a general dense array with eltype <: BlasReal: +# dim <= 2: delegate to (a) and (b) +# otherwise, decompose A into multiple pages +# The internal function that implements this is _wsumN! +# (in LinearAlgebra/src/wsum.jl) + +function _wsum1!(R::AbstractArray, A::AbstractVector, w::AbstractVector, init::Bool) + r = _sum(A, :, w) + if init + R[1] = r + else + R[1] += r + end + return R +end + +function _wsum_general!(R::AbstractArray{S}, A::AbstractArray, w::AbstractVector, + dim::Int, init::Bool) where {S} + # following the implementation of _mapreducedim! + lsiz = check_reducedims(R,A) + !isempty(R) && init && fill!(R, zero(S)) + isempty(A) && return R + + indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually + keep, Idefault = Broadcast.shapeindexer(indsRt) + if reducedim1(R, A) + i1 = first(axes1(R)) + for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + r = R[i1,IR] + @inbounds @simd for i in axes(A, 1) + r += A[i,IA] * w[dim > 1 ? IA[dim-1] : i] + end + R[i1,IR] = r + end + else + for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + @inbounds @simd for i in axes(A, 1) + R[i,IR] += A[i,IA] * w[dim > 1 ? IA[dim-1] : i] + end + end + end + return R +end + +_wsum!(R::AbstractArray, A::AbstractVector, w::AbstractVector, + dim::Int, init::Bool) = + _wsum1!(R, A, w, init) + +_wsum!(R::AbstractArray, A::AbstractArray, w::AbstractVector, + dim::Int, init::Bool) = + _wsum_general!(R, A, w, dim, init) + +function _sum!(f, R::AbstractArray, A::AbstractArray{T,N}, w::AbstractArray; + init::Bool=true) where {T,N} + f === identity || throw(ArgumentError("Passing a function is not supported with `weights`")) + w isa AbstractVector || throw(ArgumentError("Only vector `weights` are supported")) + + check_reducedims(R,A) + reddims = size(R) .!= size(A) + dim = something(findfirst(reddims), ndims(R)+1) + if dim > N + dim1 = findfirst(==(1), size(A)) + if dim1 !== nothing + dim = dim1 + end + end + if findnext(reddims, dim+1) !== nothing + throw(ArgumentError("reducing over more than one dimension is not supported with weights")) + end + lw = length(w) + ldim = size(A, dim) + if lw != ldim + throw(DimensionMismatch("weights must have the same length as the dimension " * + "over which reduction is performed (got $lw and $ldim).")) + end + _wsum!(R, A, w, dim, init) +end + + ##### findmin & findmax ##### # The initial values of Rval are not used if the corresponding indices in Rind are 0. # diff --git a/stdlib/LinearAlgebra/src/LinearAlgebra.jl b/stdlib/LinearAlgebra/src/LinearAlgebra.jl index dc11793bb085a..bfec521601471 100644 --- a/stdlib/LinearAlgebra/src/LinearAlgebra.jl +++ b/stdlib/LinearAlgebra/src/LinearAlgebra.jl @@ -14,7 +14,7 @@ import Base: USE_BLAS64, abs, acos, acosh, acot, acoth, acsc, acsch, adjoint, as getproperty, imag, inv, isapprox, isone, iszero, IndexStyle, kron, length, log, map, ndims, oneunit, parent, power_by_squaring, print_matrix, promote_rule, real, round, sec, sech, setindex!, show, similar, sin, sincos, sinh, size, sqrt, - strides, stride, tan, tanh, transpose, trunc, typed_hcat, vec + strides, stride, tan, tanh, transpose, trunc, typed_hcat, vec, _wsum! using Base: hvcat_fill, IndexLinear, promote_op, promote_typeof, @propagate_inbounds, @pure, reduce, typed_vcat, require_one_based_indexing using Base.Broadcast: Broadcasted @@ -373,6 +373,7 @@ include("bitarray.jl") include("ldlt.jl") include("schur.jl") include("structuredbroadcast.jl") +include("wsum.jl") include("deprecated.jl") const ⋅ = dot diff --git a/stdlib/LinearAlgebra/src/wsum.jl b/stdlib/LinearAlgebra/src/wsum.jl new file mode 100644 index 0000000000000..8b0ecc1478cf7 --- /dev/null +++ b/stdlib/LinearAlgebra/src/wsum.jl @@ -0,0 +1,94 @@ +# Optimized method for weighted sum with BlasReal +# dot cannot be used for other types as it uses + rather than add_sum for accumulation, +# and therefore does not return the correct type +Base._sum(A::AbstractArray{T}, dims::Colon, w::AbstractArray{T}) where {T<:BlasReal} = + dot(vec(A), vec(w)) + +# Optimized methods for weighted sum over dimensions with BlasReal +# (generic method is defined in base/reducedim.jl) +# +# _wsum! is specialized for following cases: +# (a) A is a dense matrix with eltype <: BlasReal: we call gemv! +# The internal function that implements this is _wsum2_blas! +# +# (b) A is a contiguous array with eltype <: BlasReal: +# dim == 1: treat A like a matrix of size (d1, d2 x ... x dN) +# dim == N: treat A like a matrix of size (d1 x ... x d(N-1), dN) +# otherwise: decompose A into multiple pages, and apply _wsum2_blas! +# for each +# The internal function that implements this is _wsumN! +# +# (c) A is a general dense array with eltype <: BlasReal: +# dim <= 2: delegate to (a) and (b) +# otherwise, decompose A into multiple pages +# The internal function that implements this is _wsumN! + +function _wsum2_blas!(R::StridedVector{T}, A::StridedMatrix{T}, w::StridedVector{T}, + dim::Int, init::Bool) where T<:BlasReal + beta = ifelse(init, zero(T), one(T)) + trans = dim == 1 ? 'T' : 'N' + BLAS.gemv!(trans, one(T), A, w, beta, R) + return R +end + +function _wsumN!(R::StridedArray{T}, A::StridedArray{T,N}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal,N} + if dim == 1 + m = size(A, 1) + n = div(length(A), m) + _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 1, init) + elseif dim == N + n = size(A, N) + m = div(length(A), n) + _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 2, init) + else # 1 < dim < N + m = 1 + for i = 1:dim-1 + m *= size(A, i) + end + n = size(A, dim) + k = 1 + for i = dim+1:N + k *= size(A, i) + end + Av = reshape(A, (m, n, k)) + Rv = reshape(R, (m, k)) + for i = 1:k + _wsum2_blas!(view(Rv,:,i), view(Av,:,:,i), w, 2, init) + end + end + return R +end + +function _wsumN!(R::StridedArray{T}, A::DenseArray{T,N}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal,N} + @assert N >= 3 + if dim <= 2 + m = size(A, 1) + n = size(A, 2) + npages = 1 + for i = 3:N + npages *= size(A, i) + end + rlen = ifelse(dim == 1, n, m) + Rv = reshape(R, (rlen, npages)) + for i = 1:npages + _wsum2_blas!(view(Rv,:,i), view(A,:,:,i), w, dim, init) + end + else + Base._wsum_general!(R, A, w, dim, init) + end + return R +end + +Base._wsum!(R::StridedArray{T}, A::DenseMatrix{T}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = + _wsum2_blas!(view(R,:), A, w, dim, init) + +Base._wsum!(R::StridedArray{T}, A::DenseArray{T}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = + _wsumN!(R, A, w, dim, init) + +Base._wsum!(R::StridedVector{T}, A::DenseArray{T}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = + Base._wsum1!(R, A, w, init) \ No newline at end of file diff --git a/stdlib/Statistics/docs/src/index.md b/stdlib/Statistics/docs/src/index.md index 5a684541f015f..661fcf4053c3d 100644 --- a/stdlib/Statistics/docs/src/index.md +++ b/stdlib/Statistics/docs/src/index.md @@ -4,22 +4,81 @@ DocTestSetup = :(using Statistics) ``` -The Statistics module contains basic statistics functionality. +The Statistics module contains basic statistics functionality: mean, median, quantiles, +standard deviation, variance, skewness, kurtosis, correlation and covariance. +Statistics can be weighted, and several weights types are distinguished to apply appropriate +corrections where necessary. + +## Mean, median and quantiles + +```@docs +Statistics.mean +Statistics.mean! +Statistics.median +Statistics.median! +Statistics.middle +Statistics.quantile +Statistics.quantile! +``` + +## Moments ```@docs Statistics.std Statistics.stdm Statistics.var Statistics.varm +Statistics.skewness +Statistics.kurtosis +``` + +## Correlation and covariance + +```@docs Statistics.cor Statistics.cov -Statistics.mean! -Statistics.mean -Statistics.median! -Statistics.median -Statistics.middle -Statistics.quantile! -Statistics.quantile +``` + +## Weights types + +Four statistical weights types are provided which inherit from the `AbstractWeights` type: + +- `Weights` is a generic type for arbitary weights. Using this type will trigger an error + with functions which rely on assumptions about a particular definition of weights. +- `AnalyticWeights` describe the relative importance for each observation. + These weights may also be referred to as reliability weights, precision weights + or inverse variance weights. These are typically used when the observations + are aggregate values (e.g. averages) with differing variances. +- `FrequencyWeights` describe the number of times (or frequency) each observation + was observed. These weights may also be referred to as case weights or repeat weights. +- `ProbabilityWeights` represent the inverse of the sampling probability + for each observation, providing a correction mechanism for under- or over-sampling + certain population groups. These weights may also be referred to as sampling weights. + +The choice of weights impacts how bias is corrected in several methods. +See the [`var`](@ref), [`std`](@ref), [`cov`](@ref) and [`quantile`](@ref) +docstrings for more details. + +Short-hand constructors `weights`, `aweights`, `fweights` and `pweights` +are provided for convenience. + +!!! note + - The weight vector is a light-weight wrapper of the input vector. + The input vector is NOT copied during construction. + - The weight vector maintains the sum of weights, which is computed upon construction. + If the value of the sum is pre-computed, one can supply it as the second argument + to the constructor and save the time of computing the sum again. + +```@docs +Statistics.AbstractWeights +Statistics.Weights +Statistics.AnalyticWeights +Statistics.FrequencyWeights +Statistics.ProbabilityWeights +Statistics.weights +Statistics.aweights +Statistics.fweights +Statistics.pweights ``` ```@meta diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl index 19e73e6ab13bf..5c4863a258e89 100644 --- a/stdlib/Statistics/src/Statistics.jl +++ b/stdlib/Statistics/src/Statistics.jl @@ -12,7 +12,13 @@ using LinearAlgebra, SparseArrays using Base: has_offset_axes, require_one_based_indexing export cor, cov, std, stdm, var, varm, mean!, mean, - median!, median, middle, quantile!, quantile + median!, median, middle, quantile!, quantile, + skewness, kurtosis, + AbstractWeights, Weights, AnalyticWeights, FrequencyWeights, ProbabilityWeights, + weights, aweights, fweights, pweights + +include("weights.jl") +include("moments.jl") ##### mean ##### @@ -101,9 +107,14 @@ _mean(f, A::AbstractArray, ::Colon) = sum(f, A) / length(A) _mean(f, A::AbstractArray, dims) = sum(f, A, dims=dims) / mapreduce(i -> size(A, i), *, unique(dims); init=1) """ - mean!(r, v) + mean!(r, v; [weights::AbstractVector]) Compute the mean of `v` over the singleton dimensions of `r`, and write results to `r`. +If `r` has only one singleton dimension `i`, `weights` can be a vector of length +`size(v, i)` to compute the weighted mean. + +!!! compat "Julia 1.3" + The `weights` argument requires at least Julia 1.3. # Examples ```jldoctest @@ -122,21 +133,35 @@ julia> mean!([1. 1.], v) 2.0 3.0 ``` """ -function mean!(R::AbstractArray, A::AbstractArray) +mean!(R::AbstractArray, A::AbstractArray; + weights::Union{AbstractArray,Nothing}=nothing) = + _mean!(R, A, weights) + +function _mean!(R::AbstractArray, A::AbstractArray, weights::Nothing) sum!(R, A; init=true) x = max(1, length(R)) // length(A) R .= R .* x return R end +_mean!(R::AbstractArray, A::AbstractArray, w::AbstractArray) = + rmul!(sum!(R, A, weights=w), inv(sum(w))) + """ - mean(A::AbstractArray; dims) + mean(A::AbstractArray; [dims], [weights::AbstractArray]) -Compute the mean of an array over the given dimensions. +Compute the mean of array `A`. +If `dims` is provided, return an array of means over these dimensions. +If `weights` is provided, return the weighted mean(s). `weights` must be +either an array of the same size as `A` if `dims` is omitted, +or a vector with the same length as `size(A, dims)` if `dims` is provided. !!! compat "Julia 1.1" `mean` for empty arrays requires at least Julia 1.1. +!!! compat "Julia 1.3" + The `weights` keyword argument requires at least Julia 1.3. + # Examples ```jldoctest julia> A = [1 2; 3 4] @@ -152,19 +177,32 @@ julia> mean(A, dims=2) 2×1 Array{Float64,2}: 1.5 3.5 + +julia> mean(A, weights=[2 1; 2 1]) +2.3333333333333335 + +julia> mean(A, weights=[2, 1], dims=1) +1×2 Array{Float64,2}: + 1.66667 2.66667 ``` """ -mean(A::AbstractArray; dims=:) = _mean(A, dims) - -_mean(A::AbstractArray{T}, region) where {T} = mean!(Base.reducedim_init(t -> t/2, +, A, region), A) -_mean(A::AbstractArray, ::Colon) = sum(A) / length(A) +mean(A::AbstractArray; dims=:, weights::Union{AbstractArray, Nothing}=nothing) = + _mean(A, dims, weights) -function mean(r::AbstractRange{<:Real}) +function _mean(r::AbstractRange{<:Real}, dims::Colon, weights::Nothing) isempty(r) && return oftype((first(r) + last(r)) / 2, NaN) (first(r) + last(r)) / 2 end -median(r::AbstractRange{<:Real}) = mean(r) +_mean(A::AbstractArray, dims, weights::Nothing) = + _mean!(Base.reducedim_init(t -> t/2, Base.add_sum, A, dims), A, nothing) +_mean(A::AbstractArray, dims::Colon, weights::Nothing) = sum(A) / length(A) + +_mean(A::AbstractArray, dims::Colon, w::AbstractArray) = + sum(A, weights=w) / sum(w) + +_mean(A::AbstractArray, dims, w::AbstractArray) = + _mean!(Base.reducedim_init(t -> (t*zero(eltype(w)))/2, Base.add_sum, A, dims), A, w) ##### variances ##### @@ -216,19 +254,19 @@ function _var(iterable, corrected::Bool, mean) end end -centralizedabs2fun(m) = x -> abs2.(x - m) centralize_sumabs2(A::AbstractArray, m) = - mapreduce(centralizedabs2fun(m), +, A) + mapreduce(x -> abs2.(x - m), +, A) centralize_sumabs2(A::AbstractArray, m, ifirst::Int, ilast::Int) = - Base.mapreduce_impl(centralizedabs2fun(m), +, A, ifirst, ilast) + Base.mapreduce_impl(x -> abs2.(x - m), +, A, ifirst, ilast) -function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::AbstractArray) where S +function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::AbstractArray, + w::Union{AbstractArray, Nothing}=nothing) where S # following the implementation of _mapreducedim! at base/reducedim.jl lsiz = Base.check_reducedims(R,A) isempty(R) || fill!(R, zero(S)) isempty(A) && return R - if Base.has_fast_linear_indexing(A) && lsiz > 16 && !has_offset_axes(R, means) + if w === nothing && Base.has_fast_linear_indexing(A) && lsiz > 16 && !has_offset_axes(R, means) nslices = div(length(A), lsiz) ibase = first(LinearIndices(A))-1 for i = 1:nslices @@ -246,22 +284,34 @@ function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::Abstr r = R[i1,IR] m = means[i1,IR] @simd for i in axes(A, 1) - r += abs2(A[i,IA] - m) + if w === nothing + r += abs2(A[i,IA] - m) + else + r += abs2(A[i,IA] - m) * w[i] + end end R[i1,IR] = r end else @inbounds for IA in CartesianIndices(indsAt) IR = Broadcast.newindex(IA, keep, Idefault) + if w !== nothing + wi = w[IA] + end @simd for i in axes(A, 1) - R[i,IR] += abs2(A[i,IA] - means[i,IR]) + if w === nothing + R[i,IR] += abs2(A[i,IA] - means[i,IR]) + else + R[i,IR] += abs2(A[i,IA] - means[i,IR]) * wi + end end end end return R end -function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray; corrected::Bool=true) where S +function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray, w::Nothing; + corrected::Bool=true) where S if isempty(A) fill!(R, convert(S, NaN)) else @@ -272,6 +322,12 @@ function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray; correcte return R end +function varm!(R::AbstractArray, A::AbstractArray, m::AbstractArray, w::AbstractArray; + corrected::Bool=true) + rmul!(centralize_sumabs2!(R, A, m, values(w)), + varcorrection(w, corrected)) +end + """ varm(itr, m; dims, corrected::Bool=true) @@ -294,22 +350,111 @@ over dimensions, and `m` may contain means for each dimension of `itr`. Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the variance of non-missing values. """ -varm(A::AbstractArray, m::AbstractArray; corrected::Bool=true, dims=:) = _varm(A, m, corrected, dims) +varm(A::AbstractArray, m; corrected::Bool=true, dims=:, + weights::Union{AbstractWeights, Nothing}=nothing) = + _varm(A, m, corrected, dims, weights) + +varm(iterable, m; corrected::Bool=true) = + _var(iterable, corrected, m) -_varm(A::AbstractArray{T}, m, corrected::Bool, region) where {T} = - varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, region), A, m; corrected=corrected) +_varm(A::AbstractArray, m, corrected::Bool, dims, w::Nothing) = + varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, dims), A, m, w, corrected=corrected) -varm(A::AbstractArray, m; corrected::Bool=true) = _varm(A, m, corrected, :) +_varm(A::AbstractArray, m, corrected::Bool, dims, w::AbstractWeights{T}) where {T<:Real} = + varm!(Base.reducedim_init(t -> (abs2(t)*zero(T))/2, +, A, dims), A, m, w, + corrected=corrected) -function _varm(A::AbstractArray{T}, m, corrected::Bool, ::Colon) where T +function _varm(A::AbstractArray{T}, m, corrected::Bool, dims::Colon, w::Nothing) where T n = length(A) n == 0 && return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) return centralize_sumabs2(A, m) / (n - Int(corrected)) end +function _varm(A::AbstractArray{T}, m, corrected::Bool, dims::Colon, + w::AbstractWeights) where T + s = (zero(T) - zero(m))^2 * zero(eltype(w)) + @inbounds @simd for i in eachindex(A, w) + z = A[i] - m + s += (z * z) * w[i] + end + + varcorrection(w, corrected) * s +end + +""" + varcorrection(n::Integer, corrected=false) + +Compute a bias correction factor for calculating `var`, `std` and `cov` with +`n` observations. Returns ``\\frac{1}{n - 1}`` when `corrected=true` +(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), +otherwise returns ``\\frac{1}{n}`` (i.e. no correction). +""" +@inline varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) + +""" + varcorrection(w::Weights, corrected=false) + +Returns ``\\frac{1}{\\sum w}`` when `corrected=false` and throws an `ArgumentError` +if `corrected=true`. +""" +@inline function varcorrection(w::Weights, corrected::Bool=false) + corrected && throw(ArgumentError("Weights type does not support bias correction: " * + "use FrequencyWeights, AnalyticWeights or ProbabilityWeights if applicable.")) + 1 / w.sum +end + +""" + varcorrection(w::AnalyticWeights, corrected=false) + +* `corrected=true`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::AnalyticWeights, corrected::Bool=false) + s = w.sum + + if corrected + sum_sn = sum(x -> (x / s) ^ 2, w) + 1 / (s * (1 - sum_sn)) + else + 1 / s + end +end + +""" + varcorrection(w::FrequencyWeights, corrected=false) + +* `corrected=true`: ``\\frac{1}{\\sum{w} - 1}`` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::FrequencyWeights, corrected::Bool=false) + s = w.sum + + if corrected + 1 / (s - 1) + else + 1 / s + end +end + +""" + varcorrection(w::ProbabilityWeights, corrected=false) + +* `corrected=true`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false) + s = w.sum + + if corrected + n = count(!iszero, w) + n / (s * (n - 1)) + else + 1 / s + end +end """ - var(itr; dims, corrected::Bool=true, mean=nothing) + var(itr; corrected::Bool=true, [weights::AbstractWeights], [mean], [dims]) Compute the sample variance of collection `itr`. @@ -326,21 +471,42 @@ A pre-computed `mean` may be provided. If `itr` is an `AbstractArray`, `dims` can be provided to compute the variance over dimensions, and `mean` may contain means for each dimension of `itr`. +If `itr` is an `AbstractArray`, `weights` can be provided to compute the weighted +variance. `weights` must be either an array of the same size +as `A` if `dims` is omitted, or a vector with the same length as `size(A, dims)` +if `dims` is provided. +The weighted uncorrected (when `corrected=false`) sample variance +is defined as: +```math +\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - μ}\\right)^2 } +``` +where ``n`` is the length of the input and ``μ`` is the mean. +The unbiased estimate (when `corrected=true`) of the population variance is +computed by replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of +weights used: +* [`AnalyticWeights`](@ref): ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* [`FrequencyWeights`](@ref): ``\\frac{1}{\\sum{w} - 1}`` +* [`ProbabilityWeights`](@ref): ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` + equals `count(!iszero, w)` +* [`Weights`](@ref): `ArgumentError` (bias correction not supported) + !!! note If array contains `NaN` or [`missing`](@ref) values, the result is also `NaN` or `missing` (`missing` takes precedence if array contains both). Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the variance of non-missing values. """ -var(A::AbstractArray; corrected::Bool=true, mean=nothing, dims=:) = _var(A, corrected, mean, dims) +var(A::AbstractArray; + corrected::Bool=true, mean=nothing, dims=:, + weights::Union{AbstractWeights, Nothing}=nothing) = + _var(A, corrected, mean, dims, weights) -_var(A::AbstractArray, corrected::Bool, mean, dims) = - varm(A, something(mean, Statistics.mean(A, dims=dims)); corrected=corrected, dims=dims) +_var(A::AbstractArray, corrected::Bool, m, dims, w::Union{AbstractWeights, Nothing}) = + varm(A, m === nothing ? mean(A, dims=dims, weights=w) : m, + corrected=corrected, dims=dims, weights=w) -_var(A::AbstractArray, corrected::Bool, mean, ::Colon) = - real(varm(A, something(mean, Statistics.mean(A)); corrected=corrected)) - -varm(iterable, m; corrected::Bool=true) = _var(iterable, corrected, m) +_var(A::AbstractArray, corrected::Bool, m, ::Colon, w::Union{AbstractWeights, Nothing}) = + real(varm(A, m === nothing ? mean(A, weights=w) : m, corrected=corrected, weights=w)) ## variances over ranges @@ -382,9 +548,7 @@ stdm(A::AbstractArray, m; corrected::Bool=true) = sqrt.(varm(A, m; corrected=corrected)) """ - std(itr; corrected::Bool=true, mean=nothing[, dims]) - -Compute the sample standard deviation of collection `itr`. + std(itr; corrected::Bool=true, mean=nothing, [weights::AbstractWeights], [dims]) The algorithm returns an estimator of the generative distribution's standard deviation under the assumption that each entry of `itr` is an IID drawn from that generative @@ -397,27 +561,55 @@ whereas the sum is scaled with `n` if `corrected` is A pre-computed `mean` may be provided. If `itr` is an `AbstractArray`, `dims` can be provided to compute the standard deviation -over dimensions, and `means` may contain means for each dimension of `itr`. +over dimensions, and `mean` may contain means for each dimension of `itr`. + +If `itr` is an `AbstractArray`, `weights` can be provided to compute the weighted +standard deviation. `weights` must be either an array of the same size +as `A` if `dims` is omitted, or a vector with the same length as `size(A, dims)` +if `dims` is provided. +The weighted uncorrected (when `corrected=false`) sample standard deviation +is defined as: +```math +\\sqrt{\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - μ}\\right)^2 }} +``` +where ``n`` is the length of the input and ``μ`` is the mean. +The unbiased estimate (when `corrected=true`) of the population standard deviation is +computed by replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of +weights used: +* [`AnalyticWeights`](@ref): ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* [`FrequencyWeights`](@ref): ``\\frac{1}{\\sum{w} - 1}`` +* [`ProbabilityWeights`](@ref): ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` + equals `count(!iszero, w)` +* [`Weights`](@ref): `ArgumentError` (bias correction not supported) !!! note If array contains `NaN` or [`missing`](@ref) values, the result is also `NaN` or `missing` (`missing` takes precedence if array contains both). Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the standard deviation of non-missing values. + +!!! compat "Julia 1.3" + The `weights` keyword argument requires at least Julia 1.3. """ -std(A::AbstractArray; corrected::Bool=true, mean=nothing, dims=:) = _std(A, corrected, mean, dims) +std(A::AbstractArray; + corrected::Bool=true, mean=nothing, dims=:, + weights::Union{AbstractWeights, Nothing}=nothing) = + _std(A, corrected, mean, dims, weights) -_std(A::AbstractArray, corrected::Bool, mean, dims) = - sqrt.(var(A; corrected=corrected, mean=mean, dims=dims)) +_std(A::AbstractArray, corrected::Bool, mean, dims, + weights::Union{AbstractWeights, Nothing}) = + sqrt.(var(A; corrected=corrected, mean=mean, dims=dims, weights=weights)) -_std(A::AbstractArray, corrected::Bool, mean, ::Colon) = - sqrt.(var(A; corrected=corrected, mean=mean)) +_std(A::AbstractArray, corrected::Bool, mean, ::Colon, w::Union{AbstractWeights, Nothing}) = + sqrt.(var(A; corrected=corrected, mean=mean, weights=w)) -_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, dims) = - sqrt!(var(A; corrected=corrected, mean=mean, dims=dims)) +_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, dims, + w::Union{AbstractWeights, Nothing}) = + sqrt!(var(A; corrected=corrected, mean=mean, dims=dims, weights=w)) -_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, ::Colon) = - sqrt.(var(A; corrected=corrected, mean=mean)) +_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, ::Colon, + w::Union{AbstractWeights, Nothing}) = + sqrt.(var(A; corrected=corrected, mean=mean, weights=w)) std(iterable; corrected::Bool=true, mean=nothing) = sqrt(var(iterable, corrected=corrected, mean=mean)) @@ -693,7 +885,7 @@ Compute the Pearson correlation between the vectors or matrices `X` and `Y` alon cor(x::AbstractVecOrMat, y::AbstractVecOrMat; dims::Int=1) = corm(x, _vmean(x, dims), y, _vmean(y, dims), dims) -##### median & quantiles ##### +##### middle, median & quantiles ##### """ middle(x) @@ -798,25 +990,44 @@ julia> median(skipmissing([1, 2, missing, 4])) median(itr) = median!(collect(itr)) """ - median(A::AbstractArray; dims) + median(A::AbstractArray; [dims], [weights::AbstractArray]) + +Compute the median of array `A`. +If `dims` is provided, return an array of median over these dimensions. +If `weights` is provided, return the weighted median(s). `weights` must be +either an array of the same size as `A`. `dims` and `weights` cannot be specified +at the same time. -Compute the median of an array along the given dimensions. +See the documentation for [`quantile`](@ref) for more details. + +!!! compat "Julia 1.3" + The `weights` keyword argument requires at least Julia 1.3c. # Examples ```jldoctest julia> median([1 2; 3 4], dims=1) 1×2 Array{Float64,2}: 2.0 3.0 + +julia> median([1 2; 3 4], weights=fweights([1 1; 2 1])) +3.0 ``` """ -median(v::AbstractArray; dims=:) = _median(v, dims) +median(A::AbstractArray; dims=:, weights::Union{AbstractArray, Nothing}=nothing) = + _median(A, dims, weights) + +_median(r::AbstractRange{<:Real}, dims::Colon, w::Nothing) = mean(r) + +_median(A::AbstractArray, dims, w::Nothing) = mapslices(median!, A, dims = dims) + +_median(A::AbstractArray{T}, dims::Colon, w::Nothing) where {T} = + median!(copyto!(Array{T,1}(undef, length(A)), A)) -_median(v::AbstractArray, dims) = mapslices(median!, v, dims = dims) +_median(v::AbstractArray, dims::Colon, w::AbstractArray) = _quantile(v, 0.5, false, w) -_median(v::AbstractArray{T}, ::Colon) where {T} = median!(copyto!(Array{T,1}(undef, length(v)), v)) +_median(A::AbstractArray, dims, w::AbstractArray) = + throw(ArgumentError("weights and dims cannot be specified at the same time")) -# for now, use the R/S definition of quantile; may want variants later -# see ?quantile in R -- this is type 7 """ quantile!([q::AbstractArray, ] v::AbstractVector, p; sorted=false) @@ -930,7 +1141,7 @@ end """ - quantile(itr, p; sorted=false) + quantile(itr, p; sorted=false, [weights::AbstractWeights]) Compute the quantile(s) of a collection `itr` at a specified probability or vector or tuple of probabilities `p` on the interval [0,1]. The keyword argument `sorted` indicates whether @@ -940,6 +1151,18 @@ Quantiles are computed via linear interpolation between the points `((k-1)/(n-1) for `k = 1:n` where `n = length(itr)`. This corresponds to Definition 7 of Hyndman and Fan (1996), and is the same as the R default. +If `itr` is an `AbstractArray`, `weights` can be specified to compute weighted quantiles. +Weights must not be negative and must have the same length as the data. +With [`FrequencyWeights`](@ref), the function returns the same result as +`quantile` for a vector with repeated values. Weights must be integers. +With non `FrequencyWeights`, denote ``N`` the length of the vector, ``w`` the vector of weights, +``h = p (\\sum_{i<= N} w_i - w_1) + w_1`` the cumulative weight corresponding to the +probability ``p`` and ``S_k = \\sum_{i<=k} w_i`` the cumulative weight for each +observation, define ``v_{k+1}`` the smallest element of `v` such that ``S_{k+1}`` +is strictly superior to ``h``. The weighted ``p`` quantile is given by ``v_k + \\gamma (v_{k+1} - v_k)`` +with ``\\gamma = (h - S_k)/(S_{k+1} - S_k)``. In particular, when all weights are equal, +the function returns the same result as the unweighted `quantile`. + !!! note An `ArgumentError` is thrown if `itr` contains `NaN` or [`missing`](@ref) values. Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the @@ -963,11 +1186,87 @@ julia> quantile(skipmissing([1, 10, missing]), 0.5) 5.5 ``` """ -quantile(itr, p; sorted::Bool=false) = quantile!(collect(itr), p, sorted=sorted) +quantile(itr, p; sorted::Bool=false, weights::Union{AbstractArray,Nothing}=nothing) = + _quantile(itr, p, sorted, weights) + +_quantile(itr, p, sorted::Bool, weights::Nothing) = + quantile!(collect(itr), p, sorted=sorted) + +_quantile(itr::AbstractArray, p, sorted::Bool, weights::Nothing) = + quantile!(sorted ? itr : Base.copymutable(itr), p; sorted=sorted) + +function _quantile(v::AbstractArray{V}, p, sorted::Bool, w::AbstractArray{W}) where {V,W} + # checks + isempty(v) && throw(ArgumentError("quantile of an empty array is undefined")) + isempty(p) && throw(ArgumentError("empty quantile array")) + all(x -> 0 <= x <= 1, p) || throw(ArgumentError("input probability out of [0,1] range")) + + wsum = sum(w) + wsum == 0 && throw(ArgumentError("weight vector cannot sum to zero")) + length(v) == length(w) || throw(ArgumentError("data and weight vectors must be the same size," * + "got $(length(v)) and $(length(w))")) + for x in w + isnan(x) && throw(ArgumentError("weight vector cannot contain NaN entries")) + x < 0 && throw(ArgumentError("weight vector cannot contain negative entries")) + end + + isa(w, FrequencyWeights) && !(eltype(w) <: Integer) && any(!isinteger, w) && + throw(ArgumentError("The values of the vector of `FrequencyWeights` must be numerically" * + "equal to integers. Use `ProbabilityWeights` or `AnalyticWeights` instead.")) + + # remove zeros weights and sort + nz = .!iszero.(w) + vw = sort!(collect(zip(view(v, nz), view(w, nz)))) + N = length(vw) + + # prepare percentiles + ppermute = sortperm(p) + p = p[ppermute] + + # prepare out vector + out = Vector{typeof(zero(V)/1)}(undef, length(p)) + fill!(out, vw[end][1]) + + @inbounds for x in v + isnan(x) && return fill!(out, x) + end + + # loop on quantiles + Sk, Skold = zero(W), zero(W) + vk, vkold = zero(V), zero(V) + k = 0 + + w1 = vw[1][2] + for i in 1:length(p) + if isa(w, FrequencyWeights) + h = p[i] * (wsum - 1) + 1 + else + h = p[i] * (wsum - w1) + w1 + end + while Sk <= h + k += 1 + if k > N + # out was initialized with maximum v + return out + end + Skold, vkold = Sk, vk + vk, wk = vw[k] + Sk += wk + end + if isa(w, FrequencyWeights) + out[ppermute[i]] = vkold + min(h - Skold, 1) * (vk - vkold) + else + out[ppermute[i]] = vkold + (h - Skold) / (Sk - Skold) * (vk - vkold) + end + end + return out +end -quantile(v::AbstractVector, p; sorted::Bool=false) = - quantile!(sorted ? v : Base.copymutable(v), p; sorted=sorted) +_quantile(v::AbstractArray, p::Real, sorted::Bool, w::AbstractArray) = + _quantile(v, [p], sorted, w)[1] +_quantile(itr, p, sorted::Bool, weights) = + throw(ArgumentError("weights are only supported with AbstractArrays inputs")) ##### SparseArrays optimizations ##### diff --git a/stdlib/Statistics/src/moments.jl b/stdlib/Statistics/src/moments.jl new file mode 100644 index 0000000000000..a1fd0a8511b24 --- /dev/null +++ b/stdlib/Statistics/src/moments.jl @@ -0,0 +1,151 @@ +##### Skewness and Kurtosis + +# Skewness +# This is Type 1 definition according to Joanes and Gill (1998) +""" + skewness(x; [weights::AbstractArray], [mean::Real]) + +Compute the standardized skewness of collection `x`, optionally +specifying a pre-computed `mean`. +If `x` is an `AbstractArray`, a `weights` array of the same length as `x` +can be specified to compute the weighted skewness. + +!!! compat "Julia 1.3" + This function requires at least Julia 1.3. +""" +skewness(A; mean::Union{Real, Nothing}=nothing) = _skewness(A, nothing, mean) + +skewness(A::AbstractArray; + weights::Union{AbstractArray,Nothing}=nothing, + mean::Union{Real, Nothing}=nothing) = + _skewness(A, weights, mean) + +function _skewness(x, w::Nothing, m::Real) + y = iterate(x) + if y === nothing + T = eltype(x) + # Return the NaN of the type that we would get, had this collection + # contained any elements (this is consistent with var) + z0 = zero(T) - zero(m) + return (z0^3 + z0^3)/sqrt((z0^2+z0^2)^3) + end + + v, s = y + z = v - m + cm2 = z * z # empirical 2nd centered moment (variance) + cm3 = cm2 * z # empirical 3rd centered moment + n = 1 + y = iterate(x, s) + while y !== nothing + v, s = y + n += 1 + + z = v - m + z2 = z * z + cm2 += z2 + cm3 += z2 * z + y = iterate(x, s) + end + cm3 /= n + cm2 /= n + return cm3 / sqrt(cm2^3) +end + +function _skewness(x::AbstractArray{T}, w::AbstractArray{W}, m::Real) where {T, W} + length(x) == length(w) || + throw(ArgumentError("data and weight vectors must be the same size," * + "got $(length(v)) and $(length(w))")) + z0 = zero(T) - zero(m) + cm2 = z0 * zero(W) + z0 * zero(W) # empirical 2nd centered moment (variance) + cm3 = cm2 # empirical 3rd centered moment + + @inbounds @simd for i in eachindex(x, w) + z = x[i] - m + z2w = z * z * w[i] + cm2 += z2w + cm3 += z2w * z + end + sw = sum(w) + cm3 /= sw + cm2 /= sw + return cm3 / sqrt(cm2^3) +end + +_skewness(A::AbstractArray, w::Union{AbstractArray, Nothing}, m::Nothing) = + _skewness(A, w, mean(A, weights=w)) + +# (excessive) Kurtosis +# This is Type 1 definition according to Joanes and Gill (1998) +""" + kurtosis(x; [weights::AbstractArray], [mean::Real]) + +Compute the excess kurtosis of collection `x`, optionally +specifying a pre-computed `mean`. +If `x` is an `AbstractArray`, a `weights` array of the same length as `x` +can be specified to compute the weighted kurtosis. + +!!! compat "Julia 1.3" + This function requires at least Julia 1.3. +""" +kurtosis(A; mean::Union{Real, Nothing}=nothing) = _kurtosis(A, nothing, mean) + +kurtosis(A::AbstractArray; + weights::Union{AbstractArray,Nothing}=nothing, + mean::Union{Real, Nothing}=nothing) = + _kurtosis(A, weights, mean) + +function _kurtosis(x, w::Nothing, m::Real) + y = iterate(x) + if y === nothing + T = eltype(x) + # Return the NaN of the type that we would get, had this collection + # contained any elements (this is consistent with var) + z0 = zero(T) - zero(m) + return (z0^3 + z0^3)/sqrt((z0^2+z0^2)^3) + end + + v, s = y + z = v - m + cm2 = z * z # empirical 2nd centered moment (variance) + cm4 = cm2 * cm2 # empirical 4th centered moment + + n = 1 + y = iterate(x, s) + while y !== nothing + v, s = y + n += 1 + + z = v - m + z2 = z * z + cm2 += z2 + cm4 += z2 * z2 + y = iterate(x, s) + end + cm4 /= n + cm2 /= n + return (cm4 / (cm2 * cm2)) - 3 +end + +function _kurtosis(x::AbstractArray{T}, w::AbstractWeights{W}, m::Real) where {T, W} + length(x) == length(w) || + throw(ArgumentError("data and weight vectors must be the same size," * + "got $(length(v)) and $(length(w))")) + z0 = zero(T) - zero(m) + cm2 = z0 * zero(W) + z0 * zero(W) # empirical 2nd centered moment (variance) + cm4 = cm2 # empirical 4rd centered moment + + @inbounds @simd for i in eachindex(x, w) + z = x[i] - m + z2 = z * z + z2w = z2 * w[i] + cm2 += z2w + cm4 += z2w * z2 + end + sw = sum(w) + cm4 /= sw + cm2 /= sw + return (cm4 / (cm2 * cm2)) - 3 +end + +_kurtosis(A::AbstractArray, w::Union{AbstractWeights, Nothing}, m::Nothing) = + _kurtosis(A, w, mean(A, weights=w)) diff --git a/stdlib/Statistics/src/weights.jl b/stdlib/Statistics/src/weights.jl new file mode 100644 index 0000000000000..7497a410d3206 --- /dev/null +++ b/stdlib/Statistics/src/weights.jl @@ -0,0 +1,170 @@ +###### Weights array ##### + +""" + AbstractWeights <: AbstractVector + +The abstract supertype of all vectors of statistical weights. + +Object of this type behave like other `AbstractVector`s, but +they store the sum of their values internally for efficiency. +Concrete `AbstractWeights` type indicates what correction +has to be applied when computing statistics which depend on the +meaning of weights. + +!!! compat "Julia 1.3" + This type requires at least Julia 1.3. +""" +abstract type AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} end + +""" + @weights name + +Generate a new generic weight type with specified `name`, which subtypes `AbstractWeights` +and stores the `values` (`V<:AbstractVector{<:Real}`) and `sum` (`S<:Real`). +""" +macro weights(name) + return quote + mutable struct $name{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} + values::V + sum::S + end + $(esc(name))(vs) = $(esc(name))(vs, sum(vs)) + end +end + +Base.eltype(wv::AbstractWeights) = eltype(wv.values) +Base.length(wv::AbstractWeights) = length(wv.values) +Base.values(wv::AbstractWeights) = wv.values +Base.sum(wv::AbstractWeights) = wv.sum +Base.isempty(wv::AbstractWeights) = isempty(wv.values) + +Base.getindex(wv::AbstractWeights, i::Int) = getindex(wv.values, i) +Base.size(wv::AbstractWeights) = size(wv.values) + +Base.@propagate_inbounds function Base.setindex!(wv::AbstractWeights, v::Real, i::Int) + s = v - wv[i] + wv.values[i] = v + wv.sum += s + v +end + +@weights Weights + +@doc """ + Weights(vs, wsum=sum(vs)) + +Construct a `Weights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. + +The `Weights` type describes a generic weights vector which does not support +all operations possible for [`FrequencyWeights`](@ref), [`AnalyticWeights`](@ref) +and [`ProbabilityWeights`](@ref). + +!!! compat "Julia 1.3" + This type requires at least Julia 1.3. +""" Weights + +""" + weights(vs) + +Construct a `Weights` vector from array `vs`. +See the documentation for [`Weights`](@ref) for more details. +""" +weights(vs::AbstractVector{<:Real}) = Weights(vs) +weights(vs::AbstractArray{<:Real}) = Weights(vec(vs)) + +@weights AnalyticWeights + +@doc """ + AnalyticWeights(vs, wsum=sum(vs)) + +Construct an `AnalyticWeights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. + +Analytic weights describe a non-random relative importance (usually between 0 and 1) +for each observation. These weights may also be referred to as reliability weights, +precision weights or inverse variance weights. These are typically used when the observations +being weighted are aggregate values (e.g., averages) with differing variances. + +!!! compat "Julia 1.3" + This type requires at least Julia 1.3. +""" AnalyticWeights + +""" + aweights(vs) + +Construct an `AnalyticWeights` vector from array `vs`. +See the documentation for [`AnalyticWeights`](@ref) for more details. + +!!! compat "Julia 1.3" + This function requires at least Julia 1.3. +""" +aweights(vs::AbstractVector{<:Real}) = AnalyticWeights(vs) +aweights(vs::AbstractArray{<:Real}) = AnalyticWeights(vec(vs)) + +@weights FrequencyWeights + +@doc """ + FrequencyWeights(vs, wsum=sum(vs)) + +Construct a `FrequencyWeights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. + +Frequency weights describe the number of times (or frequency) each observation +was observed. These weights may also be referred to as case weights or repeat weights. + +!!! compat "Julia 1.3" + This type requires at least Julia 1.3. +""" FrequencyWeights + +""" + fweights(vs) + +Construct a `FrequencyWeights` vector from a given array. +See the documentation for [`FrequencyWeights`](@ref) for more details. + +!!! compat "Julia 1.3" + This function requires at least Julia 1.3. +""" +fweights(vs::AbstractVector{<:Real}) = FrequencyWeights(vs) +fweights(vs::AbstractArray{<:Real}) = FrequencyWeights(vec(vs)) + +@weights ProbabilityWeights + +@doc """ + ProbabilityWeights(vs, wsum=sum(vs)) + +Construct a `ProbabilityWeights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. + +Probability weights represent the inverse of the sampling probability for each observation, +providing a correction mechanism for under- or over-sampling certain population groups. +These weights may also be referred to as sampling weights. + +!!! compat "Julia 1.3" + This type requires at least Julia 1.3. +""" ProbabilityWeights + +""" + pweights(vs) + +Construct a `ProbabilityWeights` vector from a given array. +See the documentation for [`ProbabilityWeights`](@ref) for more details. + +!!! compat "Julia 1.3" + This function requires at least Julia 1.3. +""" +pweights(vs::AbstractVector{<:Real}) = ProbabilityWeights(vs) +pweights(vs::AbstractArray{<:Real}) = ProbabilityWeights(vec(vs)) + +##### Equality tests ##### + +for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) + @eval begin + Base.isequal(x::$w, y::$w) = isequal(x.sum, y.sum) && isequal(x.values, y.values) + Base.:(==)(x::$w, y::$w) = (x.sum == y.sum) && (x.values == y.values) + end +end + +Base.isequal(x::AbstractWeights, y::AbstractWeights) = false +Base.:(==)(x::AbstractWeights, y::AbstractWeights) = false \ No newline at end of file diff --git a/stdlib/Statistics/test/moments.jl b/stdlib/Statistics/test/moments.jl new file mode 100644 index 0000000000000..b1dbbbe86dd03 --- /dev/null +++ b/stdlib/Statistics/test/moments.jl @@ -0,0 +1,141 @@ +using Statistics +using Test + +@testset "Moments" begin +weight_funcs = (weights, aweights, fweights, pweights) + +##### weighted var & std + +x = [0.57, 0.10, 0.91, 0.72, 0.46, 0.0] +w = [3.84, 2.70, 8.29, 8.91, 9.71, 0.0] + +@testset "Uncorrected with $f" for f in weight_funcs + wv = f(w) + m = mean(x, weights=wv) + + # expected uncorrected output + expected_var = sum(abs2.(x .- m) .* wv) / sum(wv) + expected_std = sqrt.(expected_var) + + @testset "Variance" begin + @test var(x, weights=wv, corrected=false) ≈ expected_var + @test var(x, weights=wv, mean=m, corrected=false) ≈ expected_var + end + + @testset "Standard Deviation" begin + @test std(x, weights=wv, corrected=false) ≈ expected_std + @test std(x, weights=wv, mean=m, corrected=false) ≈ expected_std + end +end + +# expected corrected output for (weights, aweights, fweights, pweights) +expected_var = [NaN, 0.0694434191182236, 0.05466601256158146, 0.06628969012045285] +expected_std = sqrt.(expected_var) + +@testset "Corrected with $(weight_funcs[i])" for i in eachindex(weight_funcs) + wv = weight_funcs[i](w) + m = mean(x, weights=wv) + + @testset "Variance" begin + if isa(wv, Weights) + @test_throws ArgumentError var(x, weights=wv, corrected=true) + else + @test var(x, weights=wv, corrected=true) ≈ expected_var[i] + @test var(x, weights=wv, mean=m, corrected=true) ≈ expected_var[i] + end + end + + @testset "Standard Deviation" begin + if isa(wv, Weights) + @test_throws ArgumentError std(x, weights=wv, corrected=true) + else + @test std(x, weights=wv, corrected=true) ≈ expected_std[i] + @test std(x, weights=wv, mean=m, corrected=true) ≈ expected_std[i] + end + end +end + +x = rand(5, 6) +w1 = rand(5) +w2 = rand(6) + +@testset "Uncorrected with $f" for f in weight_funcs + wv1 = f(w1) + wv2 = f(w2) + m1 = mean(x, weights=wv1, dims=1) + m2 = mean(x, weights=wv2, dims=2) + + expected_var1 = sum(abs2.(x .- m1) .* w1, dims = 1) ./ sum(wv1) + expected_var2 = sum(abs2.(x .- m2) .* w2', dims = 2) ./ sum(wv2) + expected_std1 = sqrt.(expected_var1) + expected_std2 = sqrt.(expected_var2) + + @testset "Variance" begin + @test var(x, weights=wv1, dims=1, corrected=false) ≈ expected_var1 + @test var(x, weights=wv2, dims=2, corrected=false) ≈ expected_var2 + @test var(x, weights=wv1, dims=1, mean=m1, corrected=false) ≈ expected_var1 + @test var(x, weights=wv2, dims=2, mean=m2, corrected=false) ≈ expected_var2 + end + + @testset "Standard Deviation" begin + @test std(x, weights=wv1, dims=1, corrected=false) ≈ expected_std1 + @test std(x, weights=wv2, dims=2, corrected=false) ≈ expected_std2 + @test std(x, weights=wv1, dims=1, mean=m1, corrected=false) ≈ expected_std1 + @test std(x, weights=wv2, dims=2, mean=m2, corrected=false) ≈ expected_std2 + end +end + +@testset "Corrected with $f" for f in weight_funcs + wv1 = f(w1) + wv2 = f(w2) + m1 = mean(x, weights=wv1, dims=1) + m2 = mean(x, weights=wv2, dims=2) + + if !isa(wv1, Weights) + expected_var1 = sum(abs2.(x .- m1) .* w1, dims = 1) .* Statistics.varcorrection(wv1, true) + expected_var2 = sum(abs2.(x .- m2) .* w2', dims = 2) .* Statistics.varcorrection(wv2, true) + expected_std1 = sqrt.(expected_var1) + expected_std2 = sqrt.(expected_var2) + end + + @testset "Variance" begin + if isa(wv1, Weights) + @test_throws ArgumentError var(x, weights=wv1, dims=1, corrected=true) + else + @test var(x, weights=wv1, dims=1, corrected=true) ≈ expected_var1 + @test var(x, weights=wv2, dims=2, corrected=true) ≈ expected_var2 + @test var(x, weights=wv1, dims=1, mean=m1, corrected=true) ≈ expected_var1 + @test var(x, weights=wv2, dims=2, mean=m2, corrected=true) ≈ expected_var2 + end + end + + @testset "Standard Deviation" begin + if isa(wv1, Weights) + @test_throws ArgumentError std(x, weights=wv1, dims=1, corrected=true) + else + @test std(x, weights=wv1, dims=1, corrected=true) ≈ expected_std1 + @test std(x, weights=wv2, dims=2, corrected=true) ≈ expected_std2 + @test std(x, weights=wv1, dims=1, mean=m1, corrected=true) ≈ expected_std1 + @test std(x, weights=wv2, dims=2, mean=m2, corrected=true) ≈ expected_std2 + end + end +end + +@testset "Skewness and Kurtosis with $f" for f in weight_funcs + wv = f(ones(5) * 2.0) + + @test skewness(1:5) ≈ 0.0 + @test skewness([1, 2, 3, 4, 5]) ≈ 0.0 + @test skewness([1, 2, 2, 2, 5]) ≈ 1.1731251294063556 + @test skewness([1, 4, 4, 4, 5]) ≈ -1.1731251294063556 + + @test skewness([1, 2, 2, 2, 5], weights=wv) ≈ 1.1731251294063556 + + @test kurtosis(1:5) ≈ -1.3 + @test kurtosis([1, 2, 3, 4, 5]) ≈ -1.3 + @test kurtosis([1, 2, 3, 3, 2]) ≈ -1.1530612244897953 + + @test kurtosis([1, 2, 3, 4, 5], weights=wv) ≈ -1.3 +end + +end diff --git a/stdlib/Statistics/test/runtests.jl b/stdlib/Statistics/test/runtests.jl index e4849e04d4263..68f0a0aed14dd 100644 --- a/stdlib/Statistics/test/runtests.jl +++ b/stdlib/Statistics/test/runtests.jl @@ -720,3 +720,6 @@ end @test isfinite.(cov_sparse) == isfinite.(cov_dense) end end + +include("weights.jl") +include("moments.jl") \ No newline at end of file diff --git a/stdlib/Statistics/test/weights.jl b/stdlib/Statistics/test/weights.jl new file mode 100644 index 0000000000000..5b7cc110570cd --- /dev/null +++ b/stdlib/Statistics/test/weights.jl @@ -0,0 +1,329 @@ +using Statistics +using LinearAlgebra, Random, SparseArrays, Test + +@testset "Weights" begin +weight_funcs = (weights, aweights, fweights, pweights) + +# Construction +@testset "$f" for f in weight_funcs + @test isa(f([1, 2, 3]), AbstractWeights{Int}) + @test isa(f([1., 2., 3.]), AbstractWeights{Float64}) + @test isa(f([1 2 3; 4 5 6]), AbstractWeights{Int}) + + @test isempty(f(Float64[])) + @test size(f([1, 2, 3])) == (3,) + + w = [1., 2., 3.] + wv = f(w) + @test eltype(wv) === Float64 + @test length(wv) === 3 + @test values(wv) === w + @test sum(wv) === 6.0 + @test !isempty(wv) + + b = trues(3) + bv = f(b) + @test eltype(bv) === Bool + @test length(bv) === 3 + @test values(bv) === b + @test sum(bv) === 3 + @test !isempty(bv) +end + +@testset "$f, setindex!" for f in weight_funcs + w = [1., 2., 3.] + wv = f(w) + + # Check getindex & sum + @test wv[1] === 1. + @test sum(wv) === 6. + @test values(wv) == w + + # Test setindex! success + @test (wv[1] = 4) === 4 # setindex! returns original val + @test wv[1] === 4. # value correctly converted and set + @test sum(wv) === 9. # sum updated + @test values(wv) == [4., 2., 3.] # Test state of all values + + # Test mulivalue setindex! + wv[1:2] = [3., 5.] + @test wv[1] === 3. + @test wv[2] === 5. + @test sum(wv) === 11. + @test values(wv) == [3., 5., 3.] # Test state of all values + + # Test failed setindex! due to conversion error + w = [1, 2, 3] + wv = f(w) + + @test_throws InexactError wv[1] = 1.5 # Returns original value + @test wv[1] === 1 # value not updated + @test sum(wv) === 6 # sum not corrupted + @test values(wv) == [1, 2, 3] # Test state of all values +end + +@testset "$f, isequal and ==" for f in weight_funcs + x = f([1, 2, 3]) + + y = f([1, 2, 3]) # same values, type and parameters + @test isequal(x, y) + @test x == y + + y = f([1.0, 2.0, 3.0]) # same values and type, different parameters + @test isequal(x, y) + @test x == y + + if f != fweights # same values and parameters, different types + y = fweights([1, 2, 3]) + @test !isequal(x, y) + @test x != y + end + + x = f([1, 2, NaN]) # isequal and == treat NaN differently + y = f([1, 2, NaN]) + @test isequal(x, y) + @test x != y + + x = f([1.0, 2.0, 0.0]) # isequal and == treat ±0.0 differently + y = f([1.0, 2.0, -0.0]) + @test !isequal(x, y) + @test x == y +end + +@testset "Mean $f" for f in weight_funcs + @test mean([1:3;], weights=f([1.0, 1.0, 0.5])) ≈ 1.8 + @test mean(1:3, weights=f([1.0, 1.0, 0.5])) ≈ 1.8 + + a = reshape(1.0:27.0, 3, 3, 3) + for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) + @test mean(a, weights=f(wt), dims=1) ≈ + sum(a.*reshape(wt, :, 1, 1), dims=1)/sum(wt) + @test mean(a, weights=f(wt), dims=2) ≈ + sum(a.*reshape(wt, 1, :, 1), dims=2)/sum(wt) + @test mean(a, weights=f(wt), dims=3) ≈ + sum(a.*reshape(wt, 1, 1, :), dims=3)/sum(wt) + @test_throws DimensionMismatch mean(a, weights=f(wt), dims=4) + end +end + + +# Quantile fweights +@testset "Quantile fweights" begin + data = ( + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10, 15], + [1, 2, 4, 7, 10, 15], + [0, 10, 20, 30], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [30, 40, 50, 60, 35], + [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7], + [1, 2, 2], + [3.7, 3.3, 3.5, 2.8], + [100, 125, 123, 60, 45, 56, 66], + [2, 2, 2, 2, 2, 2], + [2.3], + [-2, -3, 1, 2, -10], + [1, 2, 3, 4, 5], + [5, 4, 3, 2, 1], + [-2, 2, -1, 3, 6], + [-10, 1, 1, -10, -10], + ) + wt = ( + [3, 1, 1, 1, 3], + [1, 1, 1, 1, 1], + [3, 1, 1, 1, 3, 3], + [1, 1, 1, 3, 3, 3], + [30, 191, 9, 0], + [10, 1, 1, 1, 9], + [10, 1, 1, 1, 900], + [1, 3, 5, 4, 2], + [2, 2, 5, 0, 2, 2, 1, 6], + [1, 1, 8], + [5, 5, 4, 1], + [30, 56, 144, 24, 55, 43, 67], + [1, 2, 3, 4, 5, 6], + [12], + [7, 1, 1, 1, 6], + [1, 0, 0, 0, 2], + [1, 2, 3, 4, 5], + [1, 2, 3, 2, 1], + [0, 1, 1, 1, 1], + ) + p = [0.0, 0.25, 0.5, 0.75, 1.0] + function _rep(x::AbstractVector, lengths::AbstractVector{Int}) + res = similar(x, sum(lengths)) + i = 1 + for idx in 1:length(x) + tmp = x[idx] + for kdx in 1:lengths[idx] + res[i] = tmp + i += 1 + end + end + return res + end + # quantile with fweights is the same as repeated vectors + for i = 1:length(data) + @test quantile(data[i], p, weights=fweights(wt[i])) ≈ + quantile(_rep(data[i], wt[i]), p) + end + # quantile with fweights = 1 is the same as quantile + for i = 1:length(data) + @test quantile(data[i], p, weights=fweights(fill!(similar(wt[i]), 1))) ≈ quantile(data[i], p) + end + + # Issue JuliaStats/StatsBase#313 + @test quantile([1, 2, 3, 4, 5], p, weights=fweights([0,1,2,1,0])) ≈ + quantile([2, 3, 3, 4], p) + @test quantile([1, 2], 0.25, weights=fweights([1, 1])) ≈ 1.25 + @test quantile([1, 2], 0.25, weights=fweights([2, 2])) ≈ 1.0 + + # test non integer frequency weights + quantile([1, 2], 0.25, weights=fweights([1.0, 2.0])) == + quantile([1, 2], 0.25, weights=fweights([1, 2])) + @test_throws ArgumentError quantile([1, 2], 0.25, weights=fweights([1.5, 2.0])) + + @test_throws ArgumentError quantile([1, 2], nextfloat(1.0), weights=fweights([1, 2])) + @test_throws ArgumentError quantile([1, 2], prevfloat(0.0), weights=fweights([1, 2])) +end + +@testset "Quantile aweights, pweights and weights" for f in (aweights, pweights, weights) + data = ( + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10, 15], + [1, 2, 4, 7, 10, 15], + [0, 10, 20, 30], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [30, 40, 50, 60, 35], + [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7], + [1, 2, 2], + [3.7, 3.3, 3.5, 2.8], + [100, 125, 123, 60, 45, 56, 66], + [2, 2, 2, 2, 2, 2], + [2.3], + [-2, -3, 1, 2, -10], + [1, 2, 3, 4, 5], + [5, 4, 3, 2, 1], + [-2, 2, -1, 3, 6], + [-10, 1, 1, -10, -10], + ) + wt = ( + [1, 1/3, 1/3, 1/3, 1], + [1, 1, 1, 1, 1], + [1, 1/3, 1/3, 1/3, 1, 1], + [1/3, 1/3, 1/3, 1, 1, 1], + [30, 191, 9, 0], + [10, 1, 1, 1, 9], + [10, 1, 1, 1, 900], + [1, 3, 5, 4, 2], + [2, 2, 5, 1, 2, 2, 1, 6], + [0.1, 0.1, 0.8], + [5, 5, 4, 1], + [30, 56, 144, 24, 55, 43, 67], + [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], + [12], + [7, 1, 1, 1, 6], + [1, 0, 0, 0, 2], + [1, 2, 3, 4, 5], + [0.1, 0.2, 0.3, 0.2, 0.1], + [1, 1, 1, 1, 1], + ) + quantile_answers = ( + [1.0, 4.0, 6.0, 8.0, 10.0], + [1.0, 2.0, 4.0, 7.0, 10.0], + [1.0, 4.75, 7.5, 10.4166667, 15.0], + [1.0, 4.75, 7.5, 10.4166667, 15.0], + [0.0, 2.6178010, 5.2356021, 7.8534031, 20.0], + [1.0, 4.0, 4.3333333, 4.6666667, 5.0], + [1.0, 4.2475, 4.4983333, 4.7491667, 5.0], + [30.0, 37.5, 44.0, 51.25, 60.0], + [0.3, 0.7, 1.3, 1.7, 2.0], + [1.0, 2.0, 2.0, 2.0, 2.0], + [2.8, 3.15, 3.4, 3.56, 3.7], + [45.0, 62.149253, 102.875, 117.4097222, 125.0], + [2.0, 2.0, 2.0, 2.0, 2.0], + [2.3, 2.3, 2.3, 2.3, 2.3], + [-10.0, -2.7857143, -2.4285714, -2.0714286, 2.0], + [1.0, 2.0, 3.0, 4.0, 5.0], + [1.0, 1.625, 2.3333333, 3.25, 5.0], + [-2.0, -1.3333333, 0.5, 2.5, 6.0], + [-10.0, -10.0, -10.0, 1.0, 1.0] + ) + p = [0.0, 0.25, 0.5, 0.75, 1.0] + + Random.seed!(10) + for i = 1:length(data) + @test quantile(data[i], p, weights=f(wt[i])) ≈ quantile_answers[i] atol = 1e-5 + for j = 1:10 + # order of p does not matter + reorder = sortperm(rand(length(p))) + @test quantile(data[i], p[reorder], weights=f(wt[i])) ≈ + quantile_answers[i][reorder] atol = 1e-5 + end + for j = 1:10 + # order of w does not matter + reorder = sortperm(rand(length(data[i]))) + @test quantile(data[i][reorder], p, weights=f(wt[i][reorder])) ≈ + quantile_answers[i] atol = 1e-5 + end + end + # All equal weights corresponds to base quantile + for v in (1, 2, 345) + for i = 1:length(data) + w = f(fill(v, length(data[i]))) + @test quantile(data[i], p, weights=w) ≈ quantile(data[i], p) atol = 1e-5 + for j = 1:10 + prandom = rand(4) + @test quantile(data[i], prandom, weights=w) ≈ + quantile(data[i], prandom) atol = 1e-5 + end + end + end + # test zeros are removed + for i = 1:length(data) + @test quantile(vcat(1.0, data[i]), p, weights=f(vcat(0.0, wt[i]))) ≈ + quantile_answers[i] atol = 1e-5 + end + # Syntax + v = [7, 1, 2, 4, 10] + w = [1, 1/3, 1/3, 1/3, 1] + answer = 6.0 + @test quantile(data[1], 0.5, weights=f(w)) ≈ answer atol = 1e-5 +end + + +@testset "Median $f" for f in weight_funcs + data = [4, 3, 2, 1] + wt = [0, 0, 0, 0] + @test_throws ArgumentError median(data, weights=f(wt)) + @test_throws ArgumentError median(Float64[], weights=f(Float64[])) + wt = [1, 2, 3, 4, 5] + @test_throws ArgumentError median(data, weights=f(wt)) + # FIXME: This works now, needs to be covered + # if VERSION >= v"1.0" + # @test_throws MethodError median([4 3 2 1 0], weights=f(wt)) + # @test_throws MethodError median([[1 2] ; [4 5] ; [7 8] ; [10 11] ; [13 14]], + # weights=f(wt)) + # end + data = [1, 3, 2, NaN, 2] + @test isnan(median(data, weights=f(wt))) + wt = [1, 2, NaN, 4, 5] + @test_throws ArgumentError median(data, weights=f(wt)) + data = [1, 3, 2, 1, 2] + @test_throws ArgumentError median(data, weights=f(wt)) + wt = [-1, -1, -1, -1, -1] + @test_throws ArgumentError median(data, weights=f(wt)) + wt = [-1, -1, -1, 0, 0] + @test_throws ArgumentError median(data, weights=f(wt)) + + data = [4, 3, 2, 1] + wt = [1, 2, 3, 4] + @test median(data, weights=f(wt)) ≈ + quantile(data, 0.5, weights=f(wt)) atol = 1e-5 +end + +end # @testset Weights \ No newline at end of file diff --git a/test/reduce.jl b/test/reduce.jl index eb585e8a630f1..9b89e8f6741b6 100644 --- a/test/reduce.jl +++ b/test/reduce.jl @@ -533,3 +533,38 @@ x = [j^2 for j in i] i = Base.Slice(0:0) x = [j+7 for j in i] @test sum(x) == 7 + +@testset "weighted sum" begin + wts = ([1.4, 2.5, 10.1], [1.4f0, 2.5f0, 10.1f0], [0.0, 2.3, 5.6], + [NaN, 2.3, 5.6], [Inf, 2.3, 5.6], + [2, 1, 3], Int8[1, 2, 3], [1, 1, 1]) + for a in (rand(3), rand(Int, 3), rand(Int8, 3)) + for w in wts + res = @inferred sum(a, weights=w) + expected = sum(a.*w) + if isfinite(res) + @test res ≈ expected + else + @test isequal(res, expected) + end + @test typeof(res) == typeof(expected) + end + end + for a in (rand(3, 5), rand(Float32, 3, 5), rand(Int, 3, 5), rand(Int8, 3, 5)) + for w in wts + wr = repeat(w, outer=(1, 5)) + res = @inferred sum(a, weights=wr) + expected = sum(a.*wr) + if isfinite(res) + @test res ≈ expected + else + @test isequal(res, expected) + end + @test typeof(res) == typeof(expected) + end + end + + @test_throws ArgumentError sum(exp, [1], weights=[1]) + @test_throws ArgumentError sum!(exp, [0 0], [1 2], weights=[1, 10]) + @test_throws ArgumentError sum!([0 0], [1 2], weights=[1 10]) +end \ No newline at end of file diff --git a/test/reducedim.jl b/test/reducedim.jl index 2a39b706d2a73..c8c030d3704ce 100644 --- a/test/reducedim.jl +++ b/test/reducedim.jl @@ -398,3 +398,93 @@ end @test_throws DimensionMismatch maximum!(fill(0, 1, 1, 2, 1), B) @test_throws DimensionMismatch minimum!(fill(0, 1, 1, 2, 1), B) end + +@testset "weighted sum over dimensions" begin + wts = ([1.4, 2.5, 10.1], [1.4f0, 2.5f0, 10.1f0], [0.0, 2.3, 5.6], + [NaN, 2.3, 5.6], [Inf, 2.3, 5.6], + [2, 1, 3], Int8[1, 2, 3], [1, 1, 1]) + + ainf = rand(3) + ainf[1] = Inf + anan = rand(3) + anan[1] = NaN + for a in (rand(3), rand(Float32, 3), ainf, anan, + rand(Int, 3), rand(Int8, 3), + view(rand(5), 2:4)) + for w in wts + if all(isfinite, a) && all(isfinite, w) + expected = sum(a.*w, dims=1) + res = @inferred sum(a, weights=w, dims=1) + @test res ≈ expected + @test typeof(res) == typeof(expected) + x = rand!(similar(expected)) + y = copy(x) + @inferred sum!(y, a, weights=w) + @test y ≈ expected + y = copy(x) + @inferred sum!(y, a, weights=w, init=false) + @test y ≈ x + expected + else + expected = sum(a.*w, dims=1) + res = @inferred sum(a, weights=w, dims=1) + @test isfinite.(res) == isfinite.(expected) + @test typeof(res) == typeof(expected) + x = rand!(similar(expected)) + y = copy(x) + @inferred sum!(y, a, weights=w) + @test isfinite.(y) == isfinite.(expected) + y = copy(x) + @inferred sum!(y, a, weights=w, init=false) + @test isfinite.(y) == isfinite.(expected) + end + end + end + + ainf = rand(3, 3, 3) + ainf[1] = Inf + anan = rand(3, 3, 3) + anan[1] = NaN + for a in (rand(3, 3, 3), rand(Float32, 3, 3, 3), ainf, anan, + rand(Int, 3, 3, 3), rand(Int8, 3, 3, 3), + view(rand(3, 3, 5), :, :, 2:4)) + for w in wts + for (d, rw) in ((1, reshape(w, :, 1, 1)), + (2, reshape(w, 1, :, 1)), + (3, reshape(w, 1, 1, :))) + if all(isfinite, a) && all(isfinite, w) + expected = sum(a.*rw, dims=d) + res = @inferred sum(a, weights=w, dims=d) + @test res ≈ expected + @test typeof(res) == typeof(expected) + x = rand!(similar(expected)) + y = copy(x) + @inferred sum!(y, a, weights=w) + @test y ≈ expected + y = copy(x) + @inferred sum!(y, a, weights=w, init=false) + @test y ≈ x + expected + else + expected = sum(a.*rw, dims=d) + res = @inferred sum(a, weights=w, dims=d) + @test isfinite.(res) == isfinite.(expected) + @test typeof(res) == typeof(expected) + x = rand!(similar(expected)) + y = copy(x) + @inferred sum!(y, a, weights=w) + @test isfinite.(y) == isfinite.(expected) + y = copy(x) + @inferred sum!(y, a, weights=w, init=false) + @test isfinite.(y) == isfinite.(expected) + end + end + + @test_throws DimensionMismatch sum(a, weights=w, dims=4) + end + end + + # Corner case with a single row + @test sum([1 2], weights=[2], dims=1) == [2 4] + + @test_throws ArgumentError sum(exp, [1 2], weights=[1, 10], dims=1) + @test_throws ArgumentError sum([1 2], weights=[1 10], dims=1) +end \ No newline at end of file