From 6ca9eb3a8d1342ec41b6057ef7270178422acc40 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 12 Sep 2018 10:34:20 +0200 Subject: [PATCH 01/12] Import weighted stats and moments from StatsBase to Statistics This includes methods for mean, quantile, median, var, std, cov and cor, plus new functions skewness and kurtosis, and weight types. Code is copied from StatsBase with some cleanup where needed, in particular for dispatch, to move from `@nloops`/`@nrefs` to cartesian indexing and to be closer to the mapreducedim code. Weights are now passed via a keyword argument rather than by dispatching on AbstractWeights, so as to support any array where all weights types give the same result. --- stdlib/Statistics/docs/src/index.md | 76 ++++- stdlib/Statistics/src/Statistics.jl | 324 +++++++++++++++++---- stdlib/Statistics/src/moments.jl | 297 +++++++++++++++++++ stdlib/Statistics/src/weights.jl | 434 ++++++++++++++++++++++++++++ stdlib/Statistics/test/moments.jl | 281 ++++++++++++++++++ stdlib/Statistics/test/runtests.jl | 2 + stdlib/Statistics/test/weights.jl | 336 +++++++++++++++++++++ 7 files changed, 1684 insertions(+), 66 deletions(-) create mode 100644 stdlib/Statistics/src/moments.jl create mode 100644 stdlib/Statistics/src/weights.jl create mode 100644 stdlib/Statistics/test/moments.jl create mode 100644 stdlib/Statistics/test/weights.jl diff --git a/stdlib/Statistics/docs/src/index.md b/stdlib/Statistics/docs/src/index.md index 5a684541f015f..ad18491a3e39a 100644 --- a/stdlib/Statistics/docs/src/index.md +++ b/stdlib/Statistics/docs/src/index.md @@ -4,22 +4,82 @@ DocTestSetup = :(using Statistics) ``` -The Statistics module contains basic statistics functionality. +The Statistics module contains basic statistics functionality: mean, median, quantiles, +standard deviation, variance, skewness, kurtosis, correlation and covariance. +Statistics can be weighted, and several weights types are distinguished to apply appropriate +corrections where necessary. + +## Mean, median and quantiles + +```@docs +Statistics.mean +Statistics.mean! +Statistics.median +Statistics.median! +Statistics.middle +Statistics.quantile +Statistics.quantile! +``` + +## Moments ```@docs Statistics.std Statistics.stdm Statistics.var Statistics.varm +Statistics.skewness +Statistics.kurtosis +Statistics.moment +``` + +## Correlation and covariance + +```@docs Statistics.cor Statistics.cov -Statistics.mean! -Statistics.mean -Statistics.median! -Statistics.median -Statistics.middle -Statistics.quantile! -Statistics.quantile +``` + +## Weights types + +Four statistical weights types are provided which inherit from the `AbstractWeights` type: + +- `Weights` is a generic type for arbitary weights. Using this type will trigger an error + with functions which rely on assumptions about a particular definition of weights. +- `AnalyticWeights` describe the relative importance for each observation. + These weights may also be referred to as reliability weights, precision weights + or inverse variance weights. These are typically used when the observations + are aggregate values (e.g. averages) with differing variances. +- `FrequencyWeights` describe the number of times (or frequency) each observation + was observed. These weights may also be referred to as case weights or repeat weights. +- `ProbabilityWeights` represent the inverse of the sampling probability + for each observation, providing a correction mechanism for under- or over-sampling + certain population groups. These weights may also be referred to as sampling weights. + +The choice of weights impacts how bias is corrected in several methods. +See the [`var`](@ref), [`std`](@ref), [`cov`](@ref) and [`quantile`](@ref) +docstrings for more details. + +Short-hand constructors `weights`, `aweights`, `fweights` and `pweights` +are provided for convenience. + +!!! note + - The weight vector is a light-weight wrapper of the input vector. + The input vector is NOT copied during construction. + - The weight vector maintains the sum of weights, which is computed upon construction. + If the value of the sum is pre-computed, one can supply it as the second argument + to the constructor and save the time of computing the sum again. + +```@docs +Statistics.AbstractWeights +Statistics.Weights +Statistics.AnalyticWeights +Statistics.FrequencyWeights +Statistics.ProbabilityWeights +Statistics.weights +Statistics.aweights +Statistics.fweights +Statistics.pweights ``` ```@meta diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl index 19e73e6ab13bf..f2c521b23a5fb 100644 --- a/stdlib/Statistics/src/Statistics.jl +++ b/stdlib/Statistics/src/Statistics.jl @@ -11,8 +11,14 @@ using LinearAlgebra, SparseArrays using Base: has_offset_axes, require_one_based_indexing -export cor, cov, std, stdm, var, varm, mean!, mean, - median!, median, middle, quantile!, quantile +export cor, cov, std, stdm, var, varm, mean!, mean, mean_and_var, mean_and_std, + median!, median, middle, quantile!, quantile, + AbstractWeights, Weights, AnalyticWeights, FrequencyWeights, ProbabilityWeights, + weights, aweights, fweights, pweights, + moment, skewness, kurtosis + +include("weights.jl") +include("moments.jl") ##### mean ##### @@ -101,9 +107,10 @@ _mean(f, A::AbstractArray, ::Colon) = sum(f, A) / length(A) _mean(f, A::AbstractArray, dims) = sum(f, A, dims=dims) / mapreduce(i -> size(A, i), *, unique(dims); init=1) """ - mean!(r, v) + mean!(r, v; weights::AbstractArray) Compute the mean of `v` over the singleton dimensions of `r`, and write results to `r`. +Weights can be provided via an array of the same size as `A`. # Examples ```jldoctest @@ -122,21 +129,35 @@ julia> mean!([1. 1.], v) 2.0 3.0 ``` """ -function mean!(R::AbstractArray, A::AbstractArray) +mean!(R::AbstractArray, A::AbstractArray; + weights::Union{AbstractArray,Nothing}=nothing) = + _mean!(R, A, weights) + +function _mean!(R::AbstractArray, A::AbstractArray, weights::Nothing) sum!(R, A; init=true) x = max(1, length(R)) // length(A) R .= R .* x return R end +_mean!(R::AbstractArray, A::AbstractArray, w::AbstractArray) = + rmul!(wsum!(R, A, w), inv(sum(w))) + """ - mean(A::AbstractArray; dims) + mean(A::AbstractArray; [dims], [weights::AbstractArray]) -Compute the mean of an array over the given dimensions. +Compute the mean of array `A`. +If `dims` is provided, return an array of means over these dimensions. +If `weights` is provided, return the weighted mean(s). `weights` must be +either an array of the same size as `A` if `dims` is omitted, +or a vector with the same length as `size(A, dims)` if `dims` is provided. !!! compat "Julia 1.1" `mean` for empty arrays requires at least Julia 1.1. +!!! compat "Julia 1.2" + The `weights` keyword argument requires at least Julia 1.2. + # Examples ```jldoctest julia> A = [1 2; 3 4] @@ -152,19 +173,32 @@ julia> mean(A, dims=2) 2×1 Array{Float64,2}: 1.5 3.5 + +julia> mean(A, weights=[2 1; 2 1]) +2.3333333333333335 + +julia> mean(A, weights=[2, 1], dims=1) +1×2 Array{Float64,2}: + 1.66667 2.66667 ``` """ -mean(A::AbstractArray; dims=:) = _mean(A, dims) - -_mean(A::AbstractArray{T}, region) where {T} = mean!(Base.reducedim_init(t -> t/2, +, A, region), A) -_mean(A::AbstractArray, ::Colon) = sum(A) / length(A) +mean(A::AbstractArray; dims=:, weights::Union{AbstractArray, Nothing}=nothing) = + _mean(A, dims, weights) -function mean(r::AbstractRange{<:Real}) +function _mean(r::AbstractRange{<:Real}, dims::Colon, weights::Nothing) isempty(r) && return oftype((first(r) + last(r)) / 2, NaN) (first(r) + last(r)) / 2 end -median(r::AbstractRange{<:Real}) = mean(r) +_mean(A::AbstractArray, dims, weights::Nothing) = + _mean!(Base.reducedim_init(t -> t/2, +, A, dims), A, nothing) +_mean(A::AbstractArray, ::Colon, weights::Nothing) = sum(A) / length(A) + +_mean(A::AbstractArray, dims::Colon, w::AbstractArray) = + wsum(A, w) / sum(w) + +_mean(A::AbstractArray, dims, w::AbstractArray) = + _mean!(Base.reducedim_init(t -> (t*zero(eltype(w)))/2, +, A, dims), A, w) ##### variances ##### @@ -216,19 +250,19 @@ function _var(iterable, corrected::Bool, mean) end end -centralizedabs2fun(m) = x -> abs2.(x - m) centralize_sumabs2(A::AbstractArray, m) = - mapreduce(centralizedabs2fun(m), +, A) + mapreduce(x -> abs2.(x - m), +, A) centralize_sumabs2(A::AbstractArray, m, ifirst::Int, ilast::Int) = - Base.mapreduce_impl(centralizedabs2fun(m), +, A, ifirst, ilast) + Base.mapreduce_impl(x -> abs2.(x - m), +, A, ifirst, ilast) -function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::AbstractArray) where S +function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::AbstractArray, + w::Union{AbstractArray, Nothing}=nothing) where S # following the implementation of _mapreducedim! at base/reducedim.jl lsiz = Base.check_reducedims(R,A) isempty(R) || fill!(R, zero(S)) isempty(A) && return R - if Base.has_fast_linear_indexing(A) && lsiz > 16 && !has_offset_axes(R, means) + if w === nothing && Base.has_fast_linear_indexing(A) && lsiz > 16 && !has_offset_axes(R, means) nslices = div(length(A), lsiz) ibase = first(LinearIndices(A))-1 for i = 1:nslices @@ -246,22 +280,34 @@ function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::Abstr r = R[i1,IR] m = means[i1,IR] @simd for i in axes(A, 1) - r += abs2(A[i,IA] - m) + if w === nothing + r += abs2(A[i,IA] - m) + else + r += abs2(A[i,IA] - m) * w[i] + end end R[i1,IR] = r end else @inbounds for IA in CartesianIndices(indsAt) IR = Broadcast.newindex(IA, keep, Idefault) + if w !== nothing + wi = w[IA] + end @simd for i in axes(A, 1) - R[i,IR] += abs2(A[i,IA] - means[i,IR]) + if w === nothing + R[i,IR] += abs2(A[i,IA] - means[i,IR]) + else + R[i,IR] += abs2(A[i,IA] - means[i,IR]) * wi + end end end end return R end -function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray; corrected::Bool=true) where S +function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray, w::Nothing; + corrected::Bool=true) where S if isempty(A) fill!(R, convert(S, NaN)) else @@ -294,22 +340,30 @@ over dimensions, and `m` may contain means for each dimension of `itr`. Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the variance of non-missing values. """ -varm(A::AbstractArray, m::AbstractArray; corrected::Bool=true, dims=:) = _varm(A, m, corrected, dims) +varm(A::AbstractArray, m; corrected::Bool=true, dims=:, + weights::Union{AbstractWeights, Nothing}=nothing) = + _varm(A, m, corrected, dims, weights) -_varm(A::AbstractArray{T}, m, corrected::Bool, region) where {T} = - varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, region), A, m; corrected=corrected) +varm(iterable, m; corrected::Bool=true) = + _var(iterable, corrected, m) -varm(A::AbstractArray, m; corrected::Bool=true) = _varm(A, m, corrected, :) +# TODO: choose element type based on weights type too? +_varm(A::AbstractArray, m, corrected::Bool, dims, w::Union{AbstractWeights, Nothing}) = + varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, dims), A, m, w; corrected=corrected) -function _varm(A::AbstractArray{T}, m, corrected::Bool, ::Colon) where T +function _varm(A::AbstractArray{T}, m, corrected::Bool, dims::Colon, w::Nothing) where T n = length(A) n == 0 && return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) return centralize_sumabs2(A, m) / (n - Int(corrected)) end +function _varm(A::AbstractArray{T}, m::Real, corrected::Bool, dims::Colon, + w::AbstractWeights) where T + return _moment2(A, w, m; corrected=corrected) +end """ - var(itr; dims, corrected::Bool=true, mean=nothing) + var(itr; corrected::Bool=true, [weights::AbstractWeights], [dims]) Compute the sample variance of collection `itr`. @@ -326,21 +380,42 @@ A pre-computed `mean` may be provided. If `itr` is an `AbstractArray`, `dims` can be provided to compute the variance over dimensions, and `mean` may contain means for each dimension of `itr`. +If `itr` is an `AbstractArray`, `weights` can be provided to compute the weighted +variance. `weights` must be either an array of the same size +as `A` if `dims` is omitted, or a vector with the same length as `size(A, dims)` +if `dims` is provided. +The weighted uncorrected (when `corrected=false`) sample variance +is defined as: +```math +\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - μ}\\right)^2 } +``` +where ``n`` is the length of the input and ``μ`` is the mean. +The unbiased estimate (when `corrected=true`) of the population variance is +computed by replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of +weights used: +* [`AnalyticWeights`](@ref): ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* [`FrequencyWeights`](@ref): ``\\frac{1}{\\sum{w} - 1}`` +* [`ProbabilityWeights`](@ref): ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` + equals `count(!iszero, w)` +* [`Weights`](@ref): `ArgumentError` (bias correction not supported) + !!! note If array contains `NaN` or [`missing`](@ref) values, the result is also `NaN` or `missing` (`missing` takes precedence if array contains both). Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the variance of non-missing values. """ -var(A::AbstractArray; corrected::Bool=true, mean=nothing, dims=:) = _var(A, corrected, mean, dims) - -_var(A::AbstractArray, corrected::Bool, mean, dims) = - varm(A, something(mean, Statistics.mean(A, dims=dims)); corrected=corrected, dims=dims) +var(A::AbstractArray; + corrected::Bool=true, mean=nothing, dims=:, + weights::Union{AbstractWeights, Nothing}=nothing) = + _var(A, corrected, mean, dims, weights) -_var(A::AbstractArray, corrected::Bool, mean, ::Colon) = - real(varm(A, something(mean, Statistics.mean(A)); corrected=corrected)) +_var(A::AbstractArray, corrected::Bool, m, dims, w::Union{AbstractWeights, Nothing}) = + varm(A, m === nothing ? mean(A, dims=dims, weights=w) : m, + corrected=corrected, dims=dims, weights=w) -varm(iterable, m; corrected::Bool=true) = _var(iterable, corrected, m) +_var(A::AbstractArray, corrected::Bool, m, ::Colon, w::Union{AbstractWeights, Nothing}) = + real(varm(A, m === nothing ? mean(A, weights=w) : m, corrected=corrected, weights=w)) ## variances over ranges @@ -382,9 +457,7 @@ stdm(A::AbstractArray, m; corrected::Bool=true) = sqrt.(varm(A, m; corrected=corrected)) """ - std(itr; corrected::Bool=true, mean=nothing[, dims]) - -Compute the sample standard deviation of collection `itr`. + std(itr; corrected::Bool=true, mean=nothing, [weights::AbstractWeights], [dims]) The algorithm returns an estimator of the generative distribution's standard deviation under the assumption that each entry of `itr` is an IID drawn from that generative @@ -397,27 +470,55 @@ whereas the sum is scaled with `n` if `corrected` is A pre-computed `mean` may be provided. If `itr` is an `AbstractArray`, `dims` can be provided to compute the standard deviation -over dimensions, and `means` may contain means for each dimension of `itr`. +over dimensions, and `mean` may contain means for each dimension of `itr`. + +If `itr` is an `AbstractArray`, `weights` can be provided to compute the weighted +standard deviation. `weights` must be either an array of the same size +as `A` if `dims` is omitted, or a vector with the same length as `size(A, dims)` +if `dims` is provided. +The weighted uncorrected (when `corrected=false`) sample standard deviation +is defined as: +```math +\\sqrt{\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - μ}\\right)^2 }} +``` +where ``n`` is the length of the input and ``μ`` is the mean. +The unbiased estimate (when `corrected=true`) of the population standard deviation is +computed by replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of +weights used: +* [`AnalyticWeights`](@ref): ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* [`FrequencyWeights`](@ref): ``\\frac{1}{\\sum{w} - 1}`` +* [`ProbabilityWeights`](@ref): ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` + equals `count(!iszero, w)` +* [`Weights`](@ref): `ArgumentError` (bias correction not supported) !!! note If array contains `NaN` or [`missing`](@ref) values, the result is also `NaN` or `missing` (`missing` takes precedence if array contains both). Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the standard deviation of non-missing values. + +!!! compat "Julia 1.2" + The `weights` keyword argument requires at least Julia 1.2. """ -std(A::AbstractArray; corrected::Bool=true, mean=nothing, dims=:) = _std(A, corrected, mean, dims) +std(A::AbstractArray; + corrected::Bool=true, mean=nothing, dims=:, + weights::Union{AbstractWeights, Nothing}=nothing) = + _std(A, corrected, mean, dims, weights) -_std(A::AbstractArray, corrected::Bool, mean, dims) = - sqrt.(var(A; corrected=corrected, mean=mean, dims=dims)) +_std(A::AbstractArray, corrected::Bool, mean, dims, + weights::Union{AbstractWeights, Nothing}) = + sqrt.(var(A; corrected=corrected, mean=mean, dims=dims, weights=weights)) -_std(A::AbstractArray, corrected::Bool, mean, ::Colon) = - sqrt.(var(A; corrected=corrected, mean=mean)) +_std(A::AbstractArray, corrected::Bool, mean, ::Colon, w::Union{AbstractWeights, Nothing}) = + sqrt.(var(A; corrected=corrected, mean=mean, weights=w)) -_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, dims) = - sqrt!(var(A; corrected=corrected, mean=mean, dims=dims)) +_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, dims, + w::Union{AbstractWeights, Nothing}) = + sqrt!(var(A; corrected=corrected, mean=mean, dims=dims, weights=w)) -_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, ::Colon) = - sqrt.(var(A; corrected=corrected, mean=mean)) +_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, ::Colon, + w::Union{AbstractWeights, Nothing}) = + sqrt.(var(A; corrected=corrected, mean=mean, weights=w)) std(iterable; corrected::Bool=true, mean=nothing) = sqrt(var(iterable, corrected=corrected, mean=mean)) @@ -693,7 +794,7 @@ Compute the Pearson correlation between the vectors or matrices `X` and `Y` alon cor(x::AbstractVecOrMat, y::AbstractVecOrMat; dims::Int=1) = corm(x, _vmean(x, dims), y, _vmean(y, dims), dims) -##### median & quantiles ##### +##### middle, median & quantiles ##### """ middle(x) @@ -798,25 +899,44 @@ julia> median(skipmissing([1, 2, missing, 4])) median(itr) = median!(collect(itr)) """ - median(A::AbstractArray; dims) + median(A::AbstractArray; [dims], [weights::AbstractArray]) -Compute the median of an array along the given dimensions. +Compute the median of array `A`. +If `dims` is provided, return an array of median over these dimensions. +If `weights` is provided, return the weighted median(s). `weights` must be +either an array of the same size as `A`. `dims` and `weights` cannot be specified +at the same time. + +See the documentation for [`quantile`](@ref) for more details. + +!!! compat "Julia 1.2" + The `weights` keyword argument requires at least Julia 1.2. # Examples ```jldoctest julia> median([1 2; 3 4], dims=1) 1×2 Array{Float64,2}: 2.0 3.0 + +julia> median([1 2; 3 4], weights=fweights([1 1; 2 1])) +3.0 ``` """ -median(v::AbstractArray; dims=:) = _median(v, dims) +median(A::AbstractArray; dims=:, weights::Union{AbstractArray, Nothing}=nothing) = + _median(A, dims, weights) + +_median(r::AbstractRange{<:Real}, dims::Colon, w::Nothing) = mean(r) + +_median(A::AbstractArray, dims, w::Nothing) = mapslices(median!, A, dims = dims) + +_median(A::AbstractArray{T}, dims::Colon, w::Nothing) where {T} = + median!(copyto!(Array{T,1}(undef, length(A)), A)) -_median(v::AbstractArray, dims) = mapslices(median!, v, dims = dims) +_median(v::AbstractArray, dims::Colon, w::AbstractArray) = _quantile(v, 0.5, false, w) -_median(v::AbstractArray{T}, ::Colon) where {T} = median!(copyto!(Array{T,1}(undef, length(v)), v)) +_median(A::AbstractArray, dims, w::AbstractArray) = + throw(ArgumentError("weights and dims cannot be specified at the same time")) -# for now, use the R/S definition of quantile; may want variants later -# see ?quantile in R -- this is type 7 """ quantile!([q::AbstractArray, ] v::AbstractVector, p; sorted=false) @@ -930,7 +1050,7 @@ end """ - quantile(itr, p; sorted=false) + quantile(itr, p; sorted=false, [weights::AbstractWeights]) Compute the quantile(s) of a collection `itr` at a specified probability or vector or tuple of probabilities `p` on the interval [0,1]. The keyword argument `sorted` indicates whether @@ -940,6 +1060,18 @@ Quantiles are computed via linear interpolation between the points `((k-1)/(n-1) for `k = 1:n` where `n = length(itr)`. This corresponds to Definition 7 of Hyndman and Fan (1996), and is the same as the R default. +If `itr` is an `AbstractArray`, compute the weighted quantiles using weights vector `w`. +Weights must not be negative. The weights and data must have the same length. +With [`FrequencyWeights`](@ref), the function returns the same result as +`quantile` for a vector with repeated values. Weights must be integers. +With non `FrequencyWeights`, denote ``N`` the length of the vector, ``w`` the vector of weights, +``h = p (\\sum_{i<= N} w_i - w_1) + w_1`` the cumulative weight corresponding to the +probability ``p`` and ``S_k = \\sum_{i<=k} w_i`` the cumulative weight for each +observation, define ``v_{k+1}`` the smallest element of `v` such that ``S_{k+1}`` +is strictly superior to ``h``. The weighted ``p`` quantile is given by ``v_k + \\gamma (v_{k+1} - v_k)`` +with ``\\gamma = (h - S_k)/(S_{k+1} - S_k)``. In particular, when all weights are equal, +the function returns the same result as the unweighted `quantile`. + !!! note An `ArgumentError` is thrown if `itr` contains `NaN` or [`missing`](@ref) values. Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the @@ -963,11 +1095,87 @@ julia> quantile(skipmissing([1, 10, missing]), 0.5) 5.5 ``` """ -quantile(itr, p; sorted::Bool=false) = quantile!(collect(itr), p, sorted=sorted) +quantile(itr, p; sorted::Bool=false, weights::Union{AbstractArray,Nothing}=nothing) = + _quantile(itr, p, sorted, weights) + +_quantile(itr, p, sorted::Bool, weights::Nothing) = + quantile!(collect(itr), p, sorted=sorted) + +_quantile(itr::AbstractArray, p, sorted::Bool, weights::Nothing) = + quantile!(sorted ? itr : Base.copymutable(itr), p; sorted=sorted) + +function _quantile(v::AbstractArray{V}, p, sorted::Bool, w::AbstractArray{W}) where {V,W} + # checks + isempty(v) && throw(ArgumentError("quantile of an empty array is undefined")) + isempty(p) && throw(ArgumentError("empty quantile array")) + all(x -> 0 <= x <= 1, p) || throw(ArgumentError("input probability out of [0,1] range")) + + wsum = sum(w) + wsum == 0 && throw(ArgumentError("weight vector cannot sum to zero")) + length(v) == length(w) || throw(ArgumentError("data and weight vectors must be the same size," * + "got $(length(v)) and $(length(w))")) + for x in w + isnan(x) && throw(ArgumentError("weight vector cannot contain NaN entries")) + x < 0 && throw(ArgumentError("weight vector cannot contain negative entries")) + end + + isa(w, FrequencyWeights) && !(eltype(w) <: Integer) && any(!isinteger, w) && + throw(ArgumentError("The values of the vector of `FrequencyWeights` must be numerically" * + "equal to integers. Use `ProbabilityWeights` or `AnalyticWeights` instead.")) + + # remove zeros weights and sort + nz = .!iszero.(w) + vw = sort!(collect(zip(view(v, nz), view(w, nz)))) + N = length(vw) + + # prepare percentiles + ppermute = sortperm(p) + p = p[ppermute] + + # prepare out vector + out = Vector{typeof(zero(V)/1)}(undef, length(p)) + fill!(out, vw[end][1]) + + @inbounds for x in v + isnan(x) && return fill!(out, x) + end + + # loop on quantiles + Sk, Skold = zero(W), zero(W) + vk, vkold = zero(V), zero(V) + k = 0 + + w1 = vw[1][2] + for i in 1:length(p) + if isa(w, FrequencyWeights) + h = p[i] * (wsum - 1) + 1 + else + h = p[i] * (wsum - w1) + w1 + end + while Sk <= h + k += 1 + if k > N + # out was initialized with maximum v + return out + end + Skold, vkold = Sk, vk + vk, wk = vw[k] + Sk += wk + end + if isa(w, FrequencyWeights) + out[ppermute[i]] = vkold + min(h - Skold, 1) * (vk - vkold) + else + out[ppermute[i]] = vkold + (h - Skold) / (Sk - Skold) * (vk - vkold) + end + end + return out +end -quantile(v::AbstractVector, p; sorted::Bool=false) = - quantile!(sorted ? v : Base.copymutable(v), p; sorted=sorted) +_quantile(v::AbstractArray, p::Real, sorted::Bool, w::AbstractArray) = + _quantile(v, [p], sorted, w)[1] +_quantile(itr, p, sorted::Bool, weights) = + throw(ArgumentError("weights are only supported with AbstractArrays inputs")) ##### SparseArrays optimizations ##### diff --git a/stdlib/Statistics/src/moments.jl b/stdlib/Statistics/src/moments.jl new file mode 100644 index 0000000000000..922f35148106d --- /dev/null +++ b/stdlib/Statistics/src/moments.jl @@ -0,0 +1,297 @@ +##### Weighted var & std + +## var + +## var along dim + +function varm!(R::AbstractArray, A::AbstractArray, M::AbstractArray, w::AbstractWeights; + corrected::Bool=true) + rmul!(centralize_sumabs2!(R, A, M, values(w)), + varcorrection(w, corrected)) +end + +function var!(R::AbstractArray, A::AbstractArray, w::AbstractWeights, dims::Int; + mean=nothing, corrected::Bool=true) + if mean == 0 + varm!(R, A, w, Base.reducedim_initarray(A, dims, 0, eltype(R)), dims; + corrected=corrected) + elseif mean == nothing + varm!(R, A, w, Statistics.mean(A, w, dims=dims), dims; corrected=corrected) + else + # check size of mean + for i = 1:ndims(A) + dA = size(A,i) + dM = size(mean,i) + if i == dims + dM == 1 || throw(DimensionMismatch("Incorrect size of mean.")) + else + dM == dA || throw(DimensionMismatch("Incorrect size of mean.")) + end + end + varm!(R, A, w, mean, dims; corrected=corrected) + end +end + +function var(A::AbstractArray, w::AbstractWeights, dim::Int; mean=nothing, + corrected::Bool=true) + corrected = depcheck(:var, corrected) + var!(similar(A, Float64, Base.reduced_indices(axes(A), dim)), A, w, dim; + mean=mean, corrected=corrected) +end + +##### Fused statistics +""" + mean_and_var(x; [weights::AbstractWeights,] [dims,] corrected=false) -> (mean, var) + +Return the mean and variance of a real-valued array `x`, optionally over dimensions +`dims`, as a tuple. Observations in `x` can be weighted using `weights`. +Finally, bias correction is be applied to the variance calculation if `corrected=true`. +See [`var`](@ref) documentation for more details. +""" +function mean_and_var(A::AbstractArray; + weights::Union{AbstractWeights, Nothing}=nothing, + dims=:, + corrected::Bool=true) + m = mean(A, weights=weights, dims=dims) + v = var(A, mean=m, weights=weights, dims=dims, corrected=corrected) + m, v +end + +""" + mean_and_std(x; [weights::AbstractWeights], [dims,] corrected=true) -> (mean, std) + +Return the mean and standard deviation of a real-valued array `x`, +optionally over dimensions `dims`, as a tuple. Observations in `x` +can be weighted using `weights`. Finally, bias correction is be applied to the +standard deviation calculation if `corrected=true`. +See [`std`](@ref) documentation for more details. +""" +function mean_and_std(A::AbstractArray; + weights::Union{AbstractWeights, Nothing}=nothing, + dims=:, + corrected::Bool=true) + m = mean(A, weights=weights, dims=dims) + s = std(A, mean=m, weights=weights, dims=dims, corrected=corrected) + m, s +end + + + +##### General central moment +function _moment2(A::AbstractArray, m::Real; corrected=false) + s = 0.0 + for i in eachindex(A) + @inbounds z = A[i] - m + s += z * z + end + varcorrection(length(A), corrected) * s +end + +function _moment2(A::AbstractArray, w::AbstractArray, m::Real; corrected=false) + s = 0.0 + for i in eachindex(A, w) + @inbounds z = A[i] - m + @inbounds s += (z * z) * w[i] + end + + varcorrection(w, corrected) * s +end + +function _moment3(A::AbstractArray, m::Real) + s = 0.0 + for i in eachindex(A) + @inbounds z = A[i] - m + s += z * z * z + end + s / length(A) +end + +function _moment3(A::AbstractArray, w::AbstractArray, m::Real) + s = 0.0 + for i in eachindex(A, w) + @inbounds z = A[i] - m + @inbounds s += (z * z * z) * w[i] + end + s / sum(w) +end + +function _moment4(A::AbstractArray, m::Real) + s = 0.0 + for i in eachindex(A) + @inbounds z = A[i] - m + s += abs2(z * z) + end + s / length(A) +end + +function _moment4(A::AbstractArray, w::AbstractArray, m::Real) + s = 0.0 + for i in eachindex(A, w) + @inbounds z = A[i] - m + @inbounds s += abs2(z * z) * w[i] + end + s / sum(w) +end + +function _momentk(A::AbstractArray, k::Int, m::Real) + s = 0.0 + for i = eachindex(A) + @inbounds z = A[i] - m + s += (z ^ k) + end + s / length(A) +end + +function _momentk(A::AbstractArray, k::Int, w::AbstractArray, m::Real) + s = 0.0 + for i in eachindex(A, w) + @inbounds z = A[i] - m + @inbounds s += (z ^ k) * w[i] + end + s / sum(w) +end + + +""" + moment(A, k; [weights::AbstractArray], [mean::Real]) + +Return the `k`th order central moment of a real-valued array `A`, optionally +specifying `weights` and `mean`. + +!!! compat "Julia 1.2" + This function requires at least Julia 1.2. +""" +moment(A::AbstractArray, k::Int; + weights::Union{AbstractWeights,Nothing}=nothing, + mean::Union{Real, Nothing}=nothing) = + _moment(A, k, weights, mean) + +function _moment(A::AbstractArray, k::Int, w::Nothing, m::Real) + k == 2 ? _moment2(A, m) : + k == 3 ? _moment3(A, m) : + k == 4 ? _moment4(A, m) : + _momentk(A, k, m) +end +_moment(A::AbstractArray, k::Int, w::Nothing, m::Nothing) = + _moment(A, k, nothing, mean(A)) + +function _moment(A::AbstractArray, k::Int, w::AbstractArray, m::Real) + k == 2 ? _moment2(A, w, m) : + k == 3 ? _moment3(A, w, m) : + k == 4 ? _moment4(A, w, m) : + _momentk(A, k, w, m) +end +_moment(A::AbstractArray, k::Int, w::AbstractArray, m::Nothing) = + _moment(A, k, w, mean(A, weights=w)) + + +##### Skewness and Kurtosis + +# Skewness +# This is Type 1 definition according to Joanes and Gill (1998) +""" + skewness(A; [weights::AbstractArray], [mean::Real]) + +Compute the standardized skewness of a real-valued array `A`, optionally +specifying `weights` and `mean`. + +!!! compat "Julia 1.2" + This function requires at least Julia 1.2. +""" +skewness(A::AbstractArray; + weights::Union{AbstractWeights,Nothing}=nothing, + mean::Union{Real, Nothing}=nothing) = + _skewness(A, weights, mean) + +function _skewness(A::AbstractArray, w::Nothing, m::Real) + n = length(A) + cm2 = 0.0 # empirical 2nd centered moment (variance) + cm3 = 0.0 # empirical 3rd centered moment + for i in eachindex(A) + @inbounds z = A[i] - m + z2 = z * z + + cm2 += z2 + cm3 += z2 * z + end + cm3 /= n + cm2 /= n + return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 +end + +function _skewness(A::AbstractArray, w::AbstractArray, m::Real) + n = length(A) + length(w) == n || throw(DimensionMismatch("Inconsistent array lengths.")) + cm2 = 0.0 # empirical 2nd centered moment (variance) + cm3 = 0.0 # empirical 3rd centered moment + + @inbounds for i in eachindex(A, w) + x_i = A[i] + w_i = w[i] + z = x_i - m + z2w = z * z * w_i + cm2 += z2w + cm3 += z2w * z + end + sw = sum(w) + cm3 /= sw + cm2 /= sw + return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 +end + +_skewness(A::AbstractArray, w::Union{AbstractArray, Nothing}, m::Nothing) = + _skewness(A, w, mean(A, weights=w)) + +# (excessive) Kurtosis +# This is Type 1 definition according to Joanes and Gill (1998) +""" + kurtosis(A; [weights::AbstractArray], [mean::Real]) + +Compute the excess kurtosis of a real-valued array `A`, optionally +specifying `weights` and `mean`. + +!!! compat "Julia 1.2" + This function requires at least Julia 1.2. +""" +kurtosis(A::AbstractArray; + weights::Union{AbstractWeights,Nothing}=nothing, + mean::Union{Real, Nothing}=nothing) = + _kurtosis(A, weights, mean) + +function _kurtosis(A::AbstractArray, w::Nothing, m::Real) + n = length(A) + cm2 = 0.0 # empirical 2nd centered moment (variance) + cm4 = 0.0 # empirical 4th centered moment + for i in eachindex(A) + @inbounds z = A[i] - m + z2 = z * z + cm2 += z2 + cm4 += z2 * z2 + end + cm4 /= n + cm2 /= n + return (cm4 / (cm2 * cm2)) - 3.0 +end + +function _kurtosis(A::AbstractArray, w::AbstractWeights, m::Real) + length(w) == length(A) || throw(DimensionMismatch("Inconsistent array lengths.")) + cm2 = 0.0 # empirical 2nd centered moment (variance) + cm4 = 0.0 # empirical 4th centered moment + + @inbounds for i in eachindex(A, w) + x_i = A[i] + w_i = w[i] + z = x_i - m + z2 = z * z + z2w = z2 * w_i + cm2 += z2w + cm4 += z2w * z2 + end + sw = sum(w) + cm4 /= sw + cm2 /= sw + return (cm4 / (cm2 * cm2)) - 3.0 +end + +_kurtosis(A::AbstractArray, w::Union{AbstractWeights, Nothing}, m::Nothing) = + _kurtosis(A, w, mean(A, weights=w)) diff --git a/stdlib/Statistics/src/weights.jl b/stdlib/Statistics/src/weights.jl new file mode 100644 index 0000000000000..dc26fff56d923 --- /dev/null +++ b/stdlib/Statistics/src/weights.jl @@ -0,0 +1,434 @@ +using LinearAlgebra: BlasReal + +###### Weights array ##### + +""" + AbstractWeights <: AbstractVector + +The abstract supertype of all vectors of statistical weights. + +Object of this type behave like other `AbstractVector`s, but +they store the sum of their values internally for efficiency. +Concrete `AbstractWeights` type indicates what correction +has to be applied when computing statistics which depend on the +meaning of weights. + +!!! compat "Julia 1.2" + This type requires at least Julia 1.2. +""" +abstract type AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} end + +""" + @weights name + +Generate a new generic weight type with specified `name`, which subtypes `AbstractWeights` +and stores the `values` (`V<:AbstractVector{<:Real}`) and `sum` (`S<:Real`). +""" +macro weights(name) + return quote + mutable struct $name{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} + values::V + sum::S + end + $(esc(name))(vs) = $(esc(name))(vs, sum(vs)) + end +end + +Base.eltype(wv::AbstractWeights) = eltype(wv.values) +Base.length(wv::AbstractWeights) = length(wv.values) +Base.values(wv::AbstractWeights) = wv.values +Base.sum(wv::AbstractWeights) = wv.sum +Base.isempty(wv::AbstractWeights) = isempty(wv.values) + +Base.getindex(wv::AbstractWeights, i::Int) = getindex(wv.values, i) +Base.size(wv::AbstractWeights) = size(wv.values) + +Base.@propagate_inbounds function Base.setindex!(wv::AbstractWeights, v::Real, i::Int) + s = v - wv[i] + wv.values[i] = v + wv.sum += s + v +end + +""" + varcorrection(n::Integer, corrected=false) + +Compute a bias correction factor for calculating `var`, `std` and `cov` with +`n` observations. Returns ``\\frac{1}{n - 1}`` when `corrected=true` +(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), +otherwise returns ``\\frac{1}{n}`` (i.e. no correction). +""" +@inline varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) + +@weights Weights + +@doc """ + Weights(vs, wsum=sum(vs)) + +Construct a `Weights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. + +The `Weights` type describes a generic weights vector which does not support +all operations possible for [`FrequencyWeights`](@ref), [`AnalyticWeights`](@ref) +and [`ProbabilityWeights`](@ref). + +!!! compat "Julia 1.2" + This type requires at least Julia 1.2. +""" Weights + +""" + weights(vs) + +Construct a `Weights` vector from array `vs`. +See the documentation for [`Weights`](@ref) for more details. +""" +weights(vs::AbstractVector{<:Real}) = Weights(vs) +weights(vs::AbstractArray{<:Real}) = Weights(vec(vs)) + +""" + varcorrection(w::Weights, corrected=false) + +Returns ``\\frac{1}{\\sum w}`` when `corrected=false` and throws an `ArgumentError` +if `corrected=true`. +""" +@inline function varcorrection(w::Weights, corrected::Bool=false) + corrected && throw(ArgumentError("Weights type does not support bias correction: " * + "use FrequencyWeights, AnalyticWeights or ProbabilityWeights if applicable.")) + 1 / w.sum +end + +@weights AnalyticWeights + +@doc """ + AnalyticWeights(vs, wsum=sum(vs)) + +Construct an `AnalyticWeights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. + +Analytic weights describe a non-random relative importance (usually between 0 and 1) +for each observation. These weights may also be referred to as reliability weights, +precision weights or inverse variance weights. These are typically used when the observations +being weighted are aggregate values (e.g., averages) with differing variances. + +!!! compat "Julia 1.2" + This type requires at least Julia 1.2. +""" AnalyticWeights + +""" + aweights(vs) + +Construct an `AnalyticWeights` vector from array `vs`. +See the documentation for [`AnalyticWeights`](@ref) for more details. + +!!! compat "Julia 1.2" + This function requires at least Julia 1.2. +""" +aweights(vs::AbstractVector{<:Real}) = AnalyticWeights(vs) +aweights(vs::AbstractArray{<:Real}) = AnalyticWeights(vec(vs)) + +""" + varcorrection(w::AnalyticWeights, corrected=false) + +* `corrected=true`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::AnalyticWeights, corrected::Bool=false) + s = w.sum + + if corrected + sum_sn = sum(x -> (x / s) ^ 2, w) + 1 / (s * (1 - sum_sn)) + else + 1 / s + end +end + +@weights FrequencyWeights + +@doc """ + FrequencyWeights(vs, wsum=sum(vs)) + +Construct a `FrequencyWeights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. + +Frequency weights describe the number of times (or frequency) each observation +was observed. These weights may also be referred to as case weights or repeat weights. + +!!! compat "Julia 1.2" + This type requires at least Julia 1.2. +""" FrequencyWeights + +""" + fweights(vs) + +Construct a `FrequencyWeights` vector from a given array. +See the documentation for [`FrequencyWeights`](@ref) for more details. + +!!! compat "Julia 1.2" + This function requires at least Julia 1.2. +""" +fweights(vs::AbstractVector{<:Real}) = FrequencyWeights(vs) +fweights(vs::AbstractArray{<:Real}) = FrequencyWeights(vec(vs)) + +""" + varcorrection(w::FrequencyWeights, corrected=false) + +* `corrected=true`: ``\\frac{1}{\\sum{w} - 1}`` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::FrequencyWeights, corrected::Bool=false) + s = w.sum + + if corrected + 1 / (s - 1) + else + 1 / s + end +end + +@weights ProbabilityWeights + +@doc """ + ProbabilityWeights(vs, wsum=sum(vs)) + +Construct a `ProbabilityWeights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. + +Probability weights represent the inverse of the sampling probability for each observation, +providing a correction mechanism for under- or over-sampling certain population groups. +These weights may also be referred to as sampling weights. + +!!! compat "Julia 1.2" + This type requires at least Julia 1.2. +""" ProbabilityWeights + +""" + pweights(vs) + +Construct a `ProbabilityWeights` vector from a given array. +See the documentation for [`ProbabilityWeights`](@ref) for more details. + +!!! compat "Julia 1.2" + This function requires at least Julia 1.2. +""" +pweights(vs::AbstractVector{<:Real}) = ProbabilityWeights(vs) +pweights(vs::AbstractArray{<:Real}) = ProbabilityWeights(vec(vs)) + +""" + varcorrection(w::ProbabilityWeights, corrected=false) + +* `corrected=true`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false) + s = w.sum + + if corrected + n = count(!iszero, w) + n / (s * (n - 1)) + else + 1 / s + end +end + +##### Equality tests ##### + +for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) + @eval begin + Base.isequal(x::$w, y::$w) = isequal(x.sum, y.sum) && isequal(x.values, y.values) + Base.:(==)(x::$w, y::$w) = (x.sum == y.sum) && (x.values == y.values) + end +end + +Base.isequal(x::AbstractWeights, y::AbstractWeights) = false +Base.:(==)(x::AbstractWeights, y::AbstractWeights) = false + +##### Weighted sum ##### + +## weighted sum over vectors + +""" + wsum(v::AbstractArray, w::AbstractArray, [dim]) + +Compute the weighted sum of an array `v` with weights `w`, optionally over the dimension `dim`. +""" +function wsum(A::AbstractArray, w::AbstractArray) + sw = size(w) + sA = size(A) + if sw != sA + throw(DimensionMismatch("weights must have the same dimension as data (got $sw and $sA).")) + end + _wsum(A, w) +end + +_wsum(A::AbstractVector, w::AbstractVector) = dot(A, w) +_wsum(A::AbstractArray, w::AbstractArray) = dot(vec(A), vec(w)) + +## wsum along dimension +# +# Brief explanation of the algorithm: +# ------------------------------------ +# +# 1. _wsum! provides the core implementation, which assumes that +# the dimensions of all input arguments are consistent, and no +# dimension checking is performed therein. +# +# wsum and wsum! perform argument checking and call _wsum! +# internally. +# +# 2. _wsum! adopt a Cartesian based implementation for general +# sub types of AbstractArray. Particularly, a faster routine +# that keeps a local accumulator will be used when dim = 1. +# +# The internal function that implements this is _wsum_general! +# +# 3. _wsum! is specialized for following cases: +# (a) A is a vector: we invoke the vector version wsum above. +# The internal function that implements this is _wsum1! +# +# (b) A is a dense matrix with eltype <: BlasReal: we call gemv! +# The internal function that implements this is _wsum2_blas! +# +# (c) A is a contiguous array with eltype <: BlasReal: +# dim == 1: treat A like a matrix of size (d1, d2 x ... x dN) +# dim == N: treat A like a matrix of size (d1 x ... x d(N-1), dN) +# otherwise: decompose A into multiple pages, and apply _wsum2! +# for each +# +# (d) A is a general dense array with eltype <: BlasReal: +# dim <= 2: delegate to (a) and (b) +# otherwise, decompose A into multiple pages +# + +function _wsum1!(R::AbstractArray, A::AbstractVector, w::AbstractVector, init::Bool) + r = wsum(A, w) + if init + R[1] = r + else + R[1] += r + end + return R +end + +function _wsum2_blas!(R::StridedVector{T}, A::StridedMatrix{T}, w::StridedVector{T}, + dim::Int, init::Bool) where T<:BlasReal + beta = ifelse(init, zero(T), one(T)) + trans = dim == 1 ? 'T' : 'N' + BLAS.gemv!(trans, one(T), A, w, beta, R) + return R +end + +function _wsumN!(R::StridedArray{T}, A::StridedArray{T,N}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal,N} + if dim == 1 + m = size(A, 1) + n = div(length(A), m) + _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 1, init) + elseif dim == N + n = size(A, N) + m = div(length(A), n) + _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 2, init) + else # 1 < dim < N + m = 1 + for i = 1:dim-1; m *= size(A, i); end + n = size(A, dim) + k = 1 + for i = dim+1:N; k *= size(A, i); end + Av = reshape(A, (m, n, k)) + Rv = reshape(R, (m, k)) + for i = 1:k + _wsum2_blas!(view(Rv,:,i), view(Av,:,:,i), w, 2, init) + end + end + return R +end + +function _wsumN!(R::StridedArray{T}, A::DenseArray{T,N}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal,N} + @assert N >= 3 + if dim <= 2 + m = size(A, 1) + n = size(A, 2) + npages = 1 + for i = 3:N + npages *= size(A, i) + end + rlen = ifelse(dim == 1, n, m) + Rv = reshape(R, (rlen, npages)) + for i = 1:npages + _wsum2_blas!(view(Rv,:,i), view(A,:,:,i), w, dim, init) + end + else + _wsum_general!(R, identity, A, w, dim, init) + end + return R +end + +# General weighted sum across dimensions +function _wsum_general!(R::AbstractArray{S}, A::AbstractArray{T, N}, w::AbstractVector{W}, + dim::Int, init::Bool) where {S, T, N, W} + # following the implementation of _mapreducedim! at base/reducedim.jl + lsiz = Base.check_reducedims(R,A) + isempty(R) || fill!(R, zero(S)) + isempty(A) && return R + + indsAt, indsRt = Base.safe_tail(axes(A)), Base.safe_tail(axes(R)) # handle d=1 manually + keep, Idefault = Broadcast.shapeindexer(indsRt) + if Base.reducedim1(R, A) + i1 = first(Base.axes1(R)) + for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + r = R[i1,IR] + @simd for i in axes(A, 1) + r += A[i,IA] * w[dim > 1 ? IA[dim-1] : i] + end + R[i1,IR] = r + end + else + for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + @simd for i in axes(A, 1) + R[i,IR] += A[i,IA] * w[dim > 1 ? IA[dim-1] : i] + end + end + end + return R +end + +# N = 1 +_wsum!(R::StridedArray{T}, A::DenseArray{T,1}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = + _wsum1!(R, A, w, init) + +# N = 2 +_wsum!(R::StridedArray{T}, A::DenseArray{T,2}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = + _wsum2_blas!(view(R,:), A, w, init) + +# N >= 3 +_wsum!(R::StridedArray{T}, A::DenseArray{T}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = + _wsumN!(R, A, w, init) + +_wsum!(R::AbstractArray, A::AbstractArray, w::AbstractVector, + dim::Int, init::Bool) = + _wsum_general!(R, A, w, dim, init) + +## wsum! and wsum + +function wsum!(R::AbstractArray, A::AbstractArray{T,N}, w::AbstractVector; + init::Bool=true) where {T,N} + Base.check_reducedims(R,A) + reddims = size(R) .!= size(A) + dim = something(findfirst(reddims), ndims(R)+1) + if findnext(reddims, dim+1) !== nothing + throw(ArgumentError("reducing over more than one dimension is not supported with weights")) + end + lw = length(w) + ldim = size(A, dim) + if lw != ldim + throw(DimensionMismatch("weights must have the same length as the dimension " * + "over which reduction is performed (got $lw and $ldim).")) + end + _wsum!(R, A, w, dim, init) +end \ No newline at end of file diff --git a/stdlib/Statistics/test/moments.jl b/stdlib/Statistics/test/moments.jl new file mode 100644 index 0000000000000..b760d818b64dc --- /dev/null +++ b/stdlib/Statistics/test/moments.jl @@ -0,0 +1,281 @@ +using Statistics +using Test + +@testset "Moments" begin +weight_funcs = (weights, aweights, fweights, pweights) + +##### weighted var & std + +x = [0.57, 0.10, 0.91, 0.72, 0.46, 0.0] +w = [3.84, 2.70, 8.29, 8.91, 9.71, 0.0] + +@testset "Uncorrected with $f" for f in weight_funcs + wv = f(w) + m = mean(x, weights=wv) + + # expected uncorrected output + expected_var = sum(abs2.(x .- m) .* wv) / sum(wv) + expected_std = sqrt.(expected_var) + + @testset "Variance" begin + @test var(x, weights=wv, corrected=false) ≈ expected_var + @test var(x, weights=wv, mean=m, corrected=false) ≈ expected_var + end + + @testset "Standard Deviation" begin + @test std(x, weights=wv, corrected=false) ≈ expected_std + @test std(x, weights=wv, mean=m, corrected=false) ≈ expected_std + end + + @testset "Mean and Variance" begin + (m, v) = mean_and_var(x, corrected=false) + @test m == mean(x) + @test v == var(x, corrected=corrected=false) + + (m, v) = mean_and_var(x, weights=wv, corrected=false) + @test m == mean(x, weights=wv) + @test v == var(x, weights=wv, corrected=false) + end + + @testset "Mean and Standard Deviation" begin + (m, s) = mean_and_std(x; corrected=false) + @test m == mean(x) + @test s == std(x; corrected=false) + + (m, s) = mean_and_std(x, weights=wv, corrected=false) + @test m == mean(x, weights=wv) + @test s == std(x, weights=wv, corrected=false) + end +end + +# expected corrected output for (weights, aweights, fweights, pweights) +expected_var = [NaN, 0.0694434191182236, 0.05466601256158146, 0.06628969012045285] +expected_std = sqrt.(expected_var) + +@testset "Corrected with $(weight_funcs[i])" for i in eachindex(weight_funcs) + wv = weight_funcs[i](w) + m = mean(x, weights=wv) + + @testset "Variance" begin + if isa(wv, Weights) + @test_throws ArgumentError var(x, weights=wv, corrected=true) + else + @test var(x, weights=wv, corrected=true) ≈ expected_var[i] + @test var(x, weights=wv, mean=m, corrected=true) ≈ expected_var[i] + end + end + + @testset "Standard Deviation" begin + if isa(wv, Weights) + @test_throws ArgumentError std(x, weights=wv, corrected=true) + else + @test std(x, weights=wv, corrected=true) ≈ expected_std[i] + @test std(x, weights=wv, mean=m, corrected=true) ≈ expected_std[i] + end + end + + @testset "Mean and Variance" begin + (m, v) = mean_and_var(x; corrected=true) + @test m == mean(x) + @test v == var(x, corrected=true) + + if isa(wv, Weights) + @test_throws ArgumentError mean_and_var(x, weights=wv, corrected=true) + else + (m, v) = mean_and_var(x, weights=wv, corrected=true) + @test m == mean(x, weights=wv) + @test v == var(x, weights=wv, corrected=true) + end + end + + @testset "Mean and Standard Deviation" begin + (m, s) = mean_and_std(x, corrected=true) + @test m == mean(x) + @test s == std(x, corrected=true) + + if isa(wv, Weights) + @test_throws ArgumentError mean_and_std(x, weights=wv, corrected=true) + else + (m, s) = mean_and_std(x, weights=wv, corrected=true) + @test m == mean(x, weights=wv) + @test s == std(x, weights=wv, corrected=true) + end + end +end + +x = rand(5, 6) +w1 = rand(5) +w2 = rand(6) + +@testset "Uncorrected with $f" for f in weight_funcs + wv1 = f(w1) + wv2 = f(w2) + m1 = mean(x, weights=wv1, dims=1) + m2 = mean(x, weights=wv2, dims=2) + + expected_var1 = sum(abs2.(x .- m1) .* w1, dims = 1) ./ sum(wv1) + expected_var2 = sum(abs2.(x .- m2) .* w2', dims = 2) ./ sum(wv2) + expected_std1 = sqrt.(expected_var1) + expected_std2 = sqrt.(expected_var2) + + @testset "Variance" begin + @test var(x, weights=wv1, dims=1, corrected=false) ≈ expected_var1 + @test var(x, weights=wv2, dims=2, corrected=false) ≈ expected_var2 + @test var(x, weights=wv1, dims=1, mean=m1, corrected=false) ≈ expected_var1 + @test var(x, weights=wv2, dims=2, mean=m2, corrected=false) ≈ expected_var2 + end + + @testset "Standard Deviation" begin + @test std(x, weights=wv1, dims=1, corrected=false) ≈ expected_std1 + @test std(x, weights=wv2, dims=2, corrected=false) ≈ expected_std2 + @test std(x, weights=wv1, dims=1, mean=m1, corrected=false) ≈ expected_std1 + @test std(x, weights=wv2, dims=2, mean=m2, corrected=false) ≈ expected_std2 + end + + @testset "Mean and Variance" begin + for d in 1:2 + (m, v) = mean_and_var(x, dims=d, corrected=false) + @test m == mean(x, dims=d) + @test v == var(x, dims=d, corrected=false) + end + + (m, v) = mean_and_var(x, weights=wv1, dims=1, corrected=false) + @test m == mean(x, weights=wv1, dims=1) + @test v == var(x, weights=wv1, dims=1, corrected=false) + + (m, v) = mean_and_var(x, weights=wv2, dims=2, corrected=false) + @test m == mean(x, weights=wv2, dims=2) + @test v == var(x, weights=wv2, dims=2, corrected=false) + end + + @testset "Mean and Standard Deviation" begin + for d in 1:2 + (m, s) = mean_and_std(x, dims=d, corrected=false) + @test m == mean(x, dims=d) + @test s == std(x, dims=d, corrected=false) + end + + (m, s) = mean_and_std(x, weights=wv1, dims=1, corrected=false) + @test m == mean(x, weights=wv1, dims=1) + @test s == std(x, weights=wv1, dims=1, corrected=false) + + (m, s) = mean_and_std(x, weights=wv2, dims=2, corrected=false) + @test m == mean(x, weights=wv2, dims=2) + @test s == std(x, weights=wv2, dims=2, corrected=false) + end +end + +@testset "Corrected with $f" for f in weight_funcs + wv1 = f(w1) + wv2 = f(w2) + m1 = mean(x, weights=wv1, dims=1) + m2 = mean(x, weights=wv2, dims=2) + + if !isa(wv1, Weights) + expected_var1 = sum(abs2.(x .- m1) .* w1, dims = 1) .* Statistics.varcorrection(wv1, true) + expected_var2 = sum(abs2.(x .- m2) .* w2', dims = 2) .* Statistics.varcorrection(wv2, true) + expected_std1 = sqrt.(expected_var1) + expected_std2 = sqrt.(expected_var2) + end + + @testset "Variance" begin + if isa(wv1, Weights) + @test_throws ArgumentError var(x, weights=wv1, dims=1, corrected=true) + else + @test var(x, weights=wv1, dims=1, corrected=true) ≈ expected_var1 + @test var(x, weights=wv2, dims=2, corrected=true) ≈ expected_var2 + @test var(x, weights=wv1, dims=1, mean=m1, corrected=true) ≈ expected_var1 + @test var(x, weights=wv2, dims=2, mean=m2, corrected=true) ≈ expected_var2 + end + end + + @testset "Standard Deviation" begin + if isa(wv1, Weights) + @test_throws ArgumentError std(x, weights=wv1, dims=1, corrected=true) + else + @test std(x, weights=wv1, dims=1, corrected=true) ≈ expected_std1 + @test std(x, weights=wv2, dims=2, corrected=true) ≈ expected_std2 + @test std(x, weights=wv1, dims=1, mean=m1, corrected=true) ≈ expected_std1 + @test std(x, weights=wv2, dims=2, mean=m2, corrected=true) ≈ expected_std2 + end + end + + @testset "Mean and Variance" begin + for d in 1:2 + (m, v) = mean_and_var(x, dims=d, corrected=true) + @test m == mean(x, dims=d) + @test v == var(x, dims=d, corrected=true) + end + + if isa(wv1, Weights) + @test_throws ArgumentError mean_and_var(x, weights=wv1, dims=1, corrected=true) + else + (m, v) = mean_and_var(x, weights=wv1, dims=1, corrected=true) + @test m == mean(x, weights=wv1, dims=1) + @test v == var(x, weights=wv1, dims=1, corrected=true) + + (m, v) = mean_and_var(x, weights=wv2, dims=2; corrected=true) + @test m == mean(x, weights=wv2, dims=2) + @test v == var(x, weights=wv2, dims=2; corrected=true) + end + end + + @testset "Mean and Standard Deviation" begin + for d in 1:2 + (m, s) = mean_and_std(x, dims=d, corrected=true) + @test m == mean(x, dims=d) + @test s == std(x, dims=d, corrected=true) + end + + if isa(wv1, Weights) + @test_throws ArgumentError mean_and_std(x, weights=wv1, dims=1, corrected=true) + else + (m, s) = mean_and_std(x, weights=wv1, dims=1, corrected=true) + @test m == mean(x, weights=wv1, dims=1) + @test s == std(x, weights=wv1, dims=1, corrected=true) + + (m, s) = mean_and_std(x, weights=wv2, dims=2, corrected=true) + @test m == mean(x, weights=wv2, dims=2) + @test s == std(x, weights=wv2, dims=2, corrected=true) + end + end +end + +@testset "Skewness and Kurtosis with $f" for f in weight_funcs + wv = f(ones(5) * 2.0) + + @test skewness(1:5) ≈ 0.0 + @test skewness([1, 2, 3, 4, 5]) ≈ 0.0 + @test skewness([1, 2, 2, 2, 5]) ≈ 1.1731251294063556 + @test skewness([1, 4, 4, 4, 5]) ≈ -1.1731251294063556 + + @test skewness([1, 2, 2, 2, 5], weights=wv) ≈ 1.1731251294063556 + + @test kurtosis(1:5) ≈ -1.3 + @test kurtosis([1, 2, 3, 4, 5]) ≈ -1.3 + @test kurtosis([1, 2, 3, 3, 2]) ≈ -1.1530612244897953 + + @test kurtosis([1, 2, 3, 4, 5], weights=wv) ≈ -1.3 +end + +@testset "General Moments with $f" for f in weight_funcs + x = collect(2.0:8.0) + @test moment(x, 2) ≈ sum((x .- 5).^2) / length(x) + @test moment(x, 3) ≈ sum((x .- 5).^3) / length(x) + @test moment(x, 4) ≈ sum((x .- 5).^4) / length(x) + @test moment(x, 5) ≈ sum((x .- 5).^5) / length(x) + + @test moment(x, 2, mean=4.0) ≈ sum((x .- 4).^2) / length(x) + @test moment(x, 3, mean=4.0) ≈ sum((x .- 4).^3) / length(x) + @test moment(x, 4, mean=4.0) ≈ sum((x .- 4).^4) / length(x) + @test moment(x, 5, mean=4.0) ≈ sum((x .- 4).^5) / length(x) + + w = f([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) + x2 = collect(2.0:6.0) + @test moment(x, 2, weights=w) ≈ sum((x2 .- 4).^2) / 5 + @test moment(x, 3, weights=w) ≈ sum((x2 .- 4).^3) / 5 + @test moment(x, 4, weights=w) ≈ sum((x2 .- 4).^4) / 5 + @test moment(x, 5, weights=w) ≈ sum((x2 .- 4).^5) / 5 +end + +end diff --git a/stdlib/Statistics/test/runtests.jl b/stdlib/Statistics/test/runtests.jl index e4849e04d4263..5cb5308a73550 100644 --- a/stdlib/Statistics/test/runtests.jl +++ b/stdlib/Statistics/test/runtests.jl @@ -720,3 +720,5 @@ end @test isfinite.(cov_sparse) == isfinite.(cov_dense) end end + +include("weights.jl") \ No newline at end of file diff --git a/stdlib/Statistics/test/weights.jl b/stdlib/Statistics/test/weights.jl new file mode 100644 index 0000000000000..617ae6d8e4251 --- /dev/null +++ b/stdlib/Statistics/test/weights.jl @@ -0,0 +1,336 @@ +using Statistics +using LinearAlgebra, Random, SparseArrays, Test + +@testset "Weights" begin +weight_funcs = (weights, aweights, fweights, pweights) + +# Construction +@testset "$f" for f in weight_funcs + @test isa(f([1, 2, 3]), AbstractWeights{Int}) + @test isa(f([1., 2., 3.]), AbstractWeights{Float64}) + @test isa(f([1 2 3; 4 5 6]), AbstractWeights{Int}) + + @test isempty(f(Float64[])) + @test size(f([1, 2, 3])) == (3,) + + w = [1., 2., 3.] + wv = f(w) + @test eltype(wv) === Float64 + @test length(wv) === 3 + @test values(wv) === w + @test sum(wv) === 6.0 + @test !isempty(wv) + + b = trues(3) + bv = f(b) + @test eltype(bv) === Bool + @test length(bv) === 3 + @test values(bv) === b + @test sum(bv) === 3 + @test !isempty(bv) + + ba = BitArray([true, false, true]) + sa = sparsevec([1., 0., 2.]) + + # TODO: keep? + #@test sum(ba, wv) === 4.0 + #@test sum(sa, wv) === 7.0 +end + +@testset "$f, setindex!" for f in weight_funcs + w = [1., 2., 3.] + wv = f(w) + + # Check getindex & sum + @test wv[1] === 1. + @test sum(wv) === 6. + @test values(wv) == w + + # Test setindex! success + @test (wv[1] = 4) === 4 # setindex! returns original val + @test wv[1] === 4. # value correctly converted and set + @test sum(wv) === 9. # sum updated + @test values(wv) == [4., 2., 3.] # Test state of all values + + # Test mulivalue setindex! + wv[1:2] = [3., 5.] + @test wv[1] === 3. + @test wv[2] === 5. + @test sum(wv) === 11. + @test values(wv) == [3., 5., 3.] # Test state of all values + + # Test failed setindex! due to conversion error + w = [1, 2, 3] + wv = f(w) + + @test_throws InexactError wv[1] = 1.5 # Returns original value + @test wv[1] === 1 # value not updated + @test sum(wv) === 6 # sum not corrupted + @test values(wv) == [1, 2, 3] # Test state of all values +end + +@testset "$f, isequal and ==" for f in weight_funcs + x = f([1, 2, 3]) + + y = f([1, 2, 3]) # same values, type and parameters + @test isequal(x, y) + @test x == y + + y = f([1.0, 2.0, 3.0]) # same values and type, different parameters + @test isequal(x, y) + @test x == y + + if f != fweights # same values and parameters, different types + y = fweights([1, 2, 3]) + @test !isequal(x, y) + @test x != y + end + + x = f([1, 2, NaN]) # isequal and == treat NaN differently + y = f([1, 2, NaN]) + @test isequal(x, y) + @test x != y + + x = f([1.0, 2.0, 0.0]) # isequal and == treat ±0.0 differently + y = f([1.0, 2.0, -0.0]) + @test !isequal(x, y) + @test x == y +end + +@testset "Mean $f" for f in weight_funcs + @test mean([1:3;], weights=f([1.0, 1.0, 0.5])) ≈ 1.8 + @test mean(1:3, weights=f([1.0, 1.0, 0.5])) ≈ 1.8 + + a = reshape(1.0:27.0, 3, 3, 3) + for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) + @test mean(a, weights=f(wt), dims=1) ≈ + sum(a.*reshape(wt, :, 1, 1), dims=1)/sum(wt) + @test mean(a, weights=f(wt), dims=2) ≈ + sum(a.*reshape(wt, 1, :, 1), dims=2)/sum(wt) + @test mean(a, weights=f(wt), dims=3) ≈ + sum(a.*reshape(wt, 1, 1, :), dims=3)/sum(wt) + @test_throws DimensionMismatch mean(a, weights=f(wt), dims=4) + end +end + + +# Quantile fweights +@testset "Quantile fweights" begin + data = ( + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10, 15], + [1, 2, 4, 7, 10, 15], + [0, 10, 20, 30], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [30, 40, 50, 60, 35], + [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7], + [1, 2, 2], + [3.7, 3.3, 3.5, 2.8], + [100, 125, 123, 60, 45, 56, 66], + [2, 2, 2, 2, 2, 2], + [2.3], + [-2, -3, 1, 2, -10], + [1, 2, 3, 4, 5], + [5, 4, 3, 2, 1], + [-2, 2, -1, 3, 6], + [-10, 1, 1, -10, -10], + ) + wt = ( + [3, 1, 1, 1, 3], + [1, 1, 1, 1, 1], + [3, 1, 1, 1, 3, 3], + [1, 1, 1, 3, 3, 3], + [30, 191, 9, 0], + [10, 1, 1, 1, 9], + [10, 1, 1, 1, 900], + [1, 3, 5, 4, 2], + [2, 2, 5, 0, 2, 2, 1, 6], + [1, 1, 8], + [5, 5, 4, 1], + [30, 56, 144, 24, 55, 43, 67], + [1, 2, 3, 4, 5, 6], + [12], + [7, 1, 1, 1, 6], + [1, 0, 0, 0, 2], + [1, 2, 3, 4, 5], + [1, 2, 3, 2, 1], + [0, 1, 1, 1, 1], + ) + p = [0.0, 0.25, 0.5, 0.75, 1.0] + function _rep(x::AbstractVector, lengths::AbstractVector{Int}) + res = similar(x, sum(lengths)) + i = 1 + for idx in 1:length(x) + tmp = x[idx] + for kdx in 1:lengths[idx] + res[i] = tmp + i += 1 + end + end + return res + end + # quantile with fweights is the same as repeated vectors + for i = 1:length(data) + @test quantile(data[i], p, weights=fweights(wt[i])) ≈ + quantile(_rep(data[i], wt[i]), p) + end + # quantile with fweights = 1 is the same as quantile + for i = 1:length(data) + @test quantile(data[i], p, weights=fweights(fill!(similar(wt[i]), 1))) ≈ quantile(data[i], p) + end + + # Issue JuliaStats/StatsBase#313 + @test quantile([1, 2, 3, 4, 5], p, weights=fweights([0,1,2,1,0])) ≈ + quantile([2, 3, 3, 4], p) + @test quantile([1, 2], 0.25, weights=fweights([1, 1])) ≈ 1.25 + @test quantile([1, 2], 0.25, weights=fweights([2, 2])) ≈ 1.0 + + # test non integer frequency weights + quantile([1, 2], 0.25, weights=fweights([1.0, 2.0])) == + quantile([1, 2], 0.25, weights=fweights([1, 2])) + @test_throws ArgumentError quantile([1, 2], 0.25, weights=fweights([1.5, 2.0])) + + @test_throws ArgumentError quantile([1, 2], nextfloat(1.0), weights=fweights([1, 2])) + @test_throws ArgumentError quantile([1, 2], prevfloat(0.0), weights=fweights([1, 2])) +end + +@testset "Quantile aweights, pweights and weights" for f in (aweights, pweights, weights) + data = ( + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10, 15], + [1, 2, 4, 7, 10, 15], + [0, 10, 20, 30], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [30, 40, 50, 60, 35], + [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7], + [1, 2, 2], + [3.7, 3.3, 3.5, 2.8], + [100, 125, 123, 60, 45, 56, 66], + [2, 2, 2, 2, 2, 2], + [2.3], + [-2, -3, 1, 2, -10], + [1, 2, 3, 4, 5], + [5, 4, 3, 2, 1], + [-2, 2, -1, 3, 6], + [-10, 1, 1, -10, -10], + ) + wt = ( + [1, 1/3, 1/3, 1/3, 1], + [1, 1, 1, 1, 1], + [1, 1/3, 1/3, 1/3, 1, 1], + [1/3, 1/3, 1/3, 1, 1, 1], + [30, 191, 9, 0], + [10, 1, 1, 1, 9], + [10, 1, 1, 1, 900], + [1, 3, 5, 4, 2], + [2, 2, 5, 1, 2, 2, 1, 6], + [0.1, 0.1, 0.8], + [5, 5, 4, 1], + [30, 56, 144, 24, 55, 43, 67], + [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], + [12], + [7, 1, 1, 1, 6], + [1, 0, 0, 0, 2], + [1, 2, 3, 4, 5], + [0.1, 0.2, 0.3, 0.2, 0.1], + [1, 1, 1, 1, 1], + ) + quantile_answers = ( + [1.0, 4.0, 6.0, 8.0, 10.0], + [1.0, 2.0, 4.0, 7.0, 10.0], + [1.0, 4.75, 7.5, 10.4166667, 15.0], + [1.0, 4.75, 7.5, 10.4166667, 15.0], + [0.0, 2.6178010, 5.2356021, 7.8534031, 20.0], + [1.0, 4.0, 4.3333333, 4.6666667, 5.0], + [1.0, 4.2475, 4.4983333, 4.7491667, 5.0], + [30.0, 37.5, 44.0, 51.25, 60.0], + [0.3, 0.7, 1.3, 1.7, 2.0], + [1.0, 2.0, 2.0, 2.0, 2.0], + [2.8, 3.15, 3.4, 3.56, 3.7], + [45.0, 62.149253, 102.875, 117.4097222, 125.0], + [2.0, 2.0, 2.0, 2.0, 2.0], + [2.3, 2.3, 2.3, 2.3, 2.3], + [-10.0, -2.7857143, -2.4285714, -2.0714286, 2.0], + [1.0, 2.0, 3.0, 4.0, 5.0], + [1.0, 1.625, 2.3333333, 3.25, 5.0], + [-2.0, -1.3333333, 0.5, 2.5, 6.0], + [-10.0, -10.0, -10.0, 1.0, 1.0] + ) + p = [0.0, 0.25, 0.5, 0.75, 1.0] + + Random.seed!(10) + for i = 1:length(data) + @test quantile(data[i], p, weights=f(wt[i])) ≈ quantile_answers[i] atol = 1e-5 + for j = 1:10 + # order of p does not matter + reorder = sortperm(rand(length(p))) + @test quantile(data[i], p[reorder], weights=f(wt[i])) ≈ + quantile_answers[i][reorder] atol = 1e-5 + end + for j = 1:10 + # order of w does not matter + reorder = sortperm(rand(length(data[i]))) + @test quantile(data[i][reorder], p, weights=f(wt[i][reorder])) ≈ + quantile_answers[i] atol = 1e-5 + end + end + # All equal weights corresponds to base quantile + for v in (1, 2, 345) + for i = 1:length(data) + w = f(fill(v, length(data[i]))) + @test quantile(data[i], p, weights=w) ≈ quantile(data[i], p) atol = 1e-5 + for j = 1:10 + prandom = rand(4) + @test quantile(data[i], prandom, weights=w) ≈ + quantile(data[i], prandom) atol = 1e-5 + end + end + end + # test zeros are removed + for i = 1:length(data) + @test quantile(vcat(1.0, data[i]), p, weights=f(vcat(0.0, wt[i]))) ≈ + quantile_answers[i] atol = 1e-5 + end + # Syntax + v = [7, 1, 2, 4, 10] + w = [1, 1/3, 1/3, 1/3, 1] + answer = 6.0 + @test quantile(data[1], 0.5, weights=f(w)) ≈ answer atol = 1e-5 +end + + +@testset "Median $f" for f in weight_funcs + data = [4, 3, 2, 1] + wt = [0, 0, 0, 0] + @test_throws ArgumentError median(data, weights=f(wt)) + @test_throws ArgumentError median(Float64[], weights=f(Float64[])) + wt = [1, 2, 3, 4, 5] + @test_throws ArgumentError median(data, weights=f(wt)) + # FIXME: This works now, needs to be covered + # if VERSION >= v"1.0" + # @test_throws MethodError median([4 3 2 1 0], weights=f(wt)) + # @test_throws MethodError median([[1 2] ; [4 5] ; [7 8] ; [10 11] ; [13 14]], + # weights=f(wt)) + # end + data = [1, 3, 2, NaN, 2] + @test isnan(median(data, weights=f(wt))) + wt = [1, 2, NaN, 4, 5] + @test_throws ArgumentError median(data, weights=f(wt)) + data = [1, 3, 2, 1, 2] + @test_throws ArgumentError median(data, weights=f(wt)) + wt = [-1, -1, -1, -1, -1] + @test_throws ArgumentError median(data, weights=f(wt)) + wt = [-1, -1, -1, 0, 0] + @test_throws ArgumentError median(data, weights=f(wt)) + + data = [4, 3, 2, 1] + wt = [1, 2, 3, 4] + @test median(data, weights=f(wt)) ≈ + quantile(data, 0.5, weights=f(wt)) atol = 1e-5 +end + +end # @testset Weights \ No newline at end of file From d5e33de93b0f94cd143335fdf829f40ecdf73b7d Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 24 Mar 2019 17:07:48 +0100 Subject: [PATCH 02/12] Remove moment and combined stats, make other functions accept any iterable --- stdlib/Statistics/docs/src/index.md | 1 - stdlib/Statistics/src/Statistics.jl | 22 ++- stdlib/Statistics/src/moments.jl | 283 +++++++++------------------- stdlib/Statistics/test/moments.jl | 140 -------------- 4 files changed, 103 insertions(+), 343 deletions(-) diff --git a/stdlib/Statistics/docs/src/index.md b/stdlib/Statistics/docs/src/index.md index ad18491a3e39a..661fcf4053c3d 100644 --- a/stdlib/Statistics/docs/src/index.md +++ b/stdlib/Statistics/docs/src/index.md @@ -30,7 +30,6 @@ Statistics.var Statistics.varm Statistics.skewness Statistics.kurtosis -Statistics.moment ``` ## Correlation and covariance diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl index f2c521b23a5fb..7ae3ee6986397 100644 --- a/stdlib/Statistics/src/Statistics.jl +++ b/stdlib/Statistics/src/Statistics.jl @@ -11,11 +11,11 @@ using LinearAlgebra, SparseArrays using Base: has_offset_axes, require_one_based_indexing -export cor, cov, std, stdm, var, varm, mean!, mean, mean_and_var, mean_and_std, +export cor, cov, std, stdm, var, varm, mean!, mean, median!, median, middle, quantile!, quantile, + skewness, kurtosis, AbstractWeights, Weights, AnalyticWeights, FrequencyWeights, ProbabilityWeights, - weights, aweights, fweights, pweights, - moment, skewness, kurtosis + weights, aweights, fweights, pweights include("weights.jl") include("moments.jl") @@ -357,13 +357,19 @@ function _varm(A::AbstractArray{T}, m, corrected::Bool, dims::Colon, w::Nothing) return centralize_sumabs2(A, m) / (n - Int(corrected)) end -function _varm(A::AbstractArray{T}, m::Real, corrected::Bool, dims::Colon, +function _varm(A::AbstractArray{T}, m, corrected::Bool, dims::Colon, w::AbstractWeights) where T - return _moment2(A, w, m; corrected=corrected) + s = (zero(T) - zero(m))^2 * zero(eltype(w)) + @inbounds @simd for i in eachindex(A, w) + z = A[i] - m + s += (z * z) * w[i] + end + + varcorrection(w, corrected) * s end """ - var(itr; corrected::Bool=true, [weights::AbstractWeights], [dims]) + var(itr; corrected::Bool=true, [weights::AbstractWeights], [mean], [dims]) Compute the sample variance of collection `itr`. @@ -1060,8 +1066,8 @@ Quantiles are computed via linear interpolation between the points `((k-1)/(n-1) for `k = 1:n` where `n = length(itr)`. This corresponds to Definition 7 of Hyndman and Fan (1996), and is the same as the R default. -If `itr` is an `AbstractArray`, compute the weighted quantiles using weights vector `w`. -Weights must not be negative. The weights and data must have the same length. +If `itr` is an `AbstractArray`, `weights` can be specified to compute weighted quantiles. +Weights must not be negative and must have the same length as the data. With [`FrequencyWeights`](@ref), the function returns the same result as `quantile` for a vector with repeated values. Weights must be integers. With non `FrequencyWeights`, denote ``N`` the length of the vector, ``w`` the vector of weights, diff --git a/stdlib/Statistics/src/moments.jl b/stdlib/Statistics/src/moments.jl index 922f35148106d..a0f99ea388dd2 100644 --- a/stdlib/Statistics/src/moments.jl +++ b/stdlib/Statistics/src/moments.jl @@ -4,13 +4,13 @@ ## var along dim -function varm!(R::AbstractArray, A::AbstractArray, M::AbstractArray, w::AbstractWeights; +function varm!(R::AbstractArray, A::AbstractArray, M::AbstractArray, w::AbstractArray; corrected::Bool=true) rmul!(centralize_sumabs2!(R, A, M, values(w)), varcorrection(w, corrected)) end -function var!(R::AbstractArray, A::AbstractArray, w::AbstractWeights, dims::Int; +function var!(R::AbstractArray, A::AbstractArray, w::AbstractArray, dims::Int; mean=nothing, corrected::Bool=true) if mean == 0 varm!(R, A, w, Base.reducedim_initarray(A, dims, 0, eltype(R)), dims; @@ -32,211 +32,84 @@ function var!(R::AbstractArray, A::AbstractArray, w::AbstractWeights, dims::Int; end end -function var(A::AbstractArray, w::AbstractWeights, dim::Int; mean=nothing, - corrected::Bool=true) +function var(A::AbstractArray, w::AbstractArray, dim::Int; + mean=nothing, corrected::Bool=true) corrected = depcheck(:var, corrected) var!(similar(A, Float64, Base.reduced_indices(axes(A), dim)), A, w, dim; mean=mean, corrected=corrected) end -##### Fused statistics -""" - mean_and_var(x; [weights::AbstractWeights,] [dims,] corrected=false) -> (mean, var) - -Return the mean and variance of a real-valued array `x`, optionally over dimensions -`dims`, as a tuple. Observations in `x` can be weighted using `weights`. -Finally, bias correction is be applied to the variance calculation if `corrected=true`. -See [`var`](@ref) documentation for more details. -""" -function mean_and_var(A::AbstractArray; - weights::Union{AbstractWeights, Nothing}=nothing, - dims=:, - corrected::Bool=true) - m = mean(A, weights=weights, dims=dims) - v = var(A, mean=m, weights=weights, dims=dims, corrected=corrected) - m, v -end - -""" - mean_and_std(x; [weights::AbstractWeights], [dims,] corrected=true) -> (mean, std) - -Return the mean and standard deviation of a real-valued array `x`, -optionally over dimensions `dims`, as a tuple. Observations in `x` -can be weighted using `weights`. Finally, bias correction is be applied to the -standard deviation calculation if `corrected=true`. -See [`std`](@ref) documentation for more details. -""" -function mean_and_std(A::AbstractArray; - weights::Union{AbstractWeights, Nothing}=nothing, - dims=:, - corrected::Bool=true) - m = mean(A, weights=weights, dims=dims) - s = std(A, mean=m, weights=weights, dims=dims, corrected=corrected) - m, s -end - - - -##### General central moment -function _moment2(A::AbstractArray, m::Real; corrected=false) - s = 0.0 - for i in eachindex(A) - @inbounds z = A[i] - m - s += z * z - end - varcorrection(length(A), corrected) * s -end - -function _moment2(A::AbstractArray, w::AbstractArray, m::Real; corrected=false) - s = 0.0 - for i in eachindex(A, w) - @inbounds z = A[i] - m - @inbounds s += (z * z) * w[i] - end - - varcorrection(w, corrected) * s -end - -function _moment3(A::AbstractArray, m::Real) - s = 0.0 - for i in eachindex(A) - @inbounds z = A[i] - m - s += z * z * z - end - s / length(A) -end - -function _moment3(A::AbstractArray, w::AbstractArray, m::Real) - s = 0.0 - for i in eachindex(A, w) - @inbounds z = A[i] - m - @inbounds s += (z * z * z) * w[i] - end - s / sum(w) -end - -function _moment4(A::AbstractArray, m::Real) - s = 0.0 - for i in eachindex(A) - @inbounds z = A[i] - m - s += abs2(z * z) - end - s / length(A) -end - -function _moment4(A::AbstractArray, w::AbstractArray, m::Real) - s = 0.0 - for i in eachindex(A, w) - @inbounds z = A[i] - m - @inbounds s += abs2(z * z) * w[i] - end - s / sum(w) -end - -function _momentk(A::AbstractArray, k::Int, m::Real) - s = 0.0 - for i = eachindex(A) - @inbounds z = A[i] - m - s += (z ^ k) - end - s / length(A) -end - -function _momentk(A::AbstractArray, k::Int, w::AbstractArray, m::Real) - s = 0.0 - for i in eachindex(A, w) - @inbounds z = A[i] - m - @inbounds s += (z ^ k) * w[i] - end - s / sum(w) -end - - -""" - moment(A, k; [weights::AbstractArray], [mean::Real]) - -Return the `k`th order central moment of a real-valued array `A`, optionally -specifying `weights` and `mean`. - -!!! compat "Julia 1.2" - This function requires at least Julia 1.2. -""" -moment(A::AbstractArray, k::Int; - weights::Union{AbstractWeights,Nothing}=nothing, - mean::Union{Real, Nothing}=nothing) = - _moment(A, k, weights, mean) - -function _moment(A::AbstractArray, k::Int, w::Nothing, m::Real) - k == 2 ? _moment2(A, m) : - k == 3 ? _moment3(A, m) : - k == 4 ? _moment4(A, m) : - _momentk(A, k, m) -end -_moment(A::AbstractArray, k::Int, w::Nothing, m::Nothing) = - _moment(A, k, nothing, mean(A)) - -function _moment(A::AbstractArray, k::Int, w::AbstractArray, m::Real) - k == 2 ? _moment2(A, w, m) : - k == 3 ? _moment3(A, w, m) : - k == 4 ? _moment4(A, w, m) : - _momentk(A, k, w, m) -end -_moment(A::AbstractArray, k::Int, w::AbstractArray, m::Nothing) = - _moment(A, k, w, mean(A, weights=w)) - - ##### Skewness and Kurtosis # Skewness # This is Type 1 definition according to Joanes and Gill (1998) """ - skewness(A; [weights::AbstractArray], [mean::Real]) + skewness(x; [weights::AbstractArray], [mean::Real]) -Compute the standardized skewness of a real-valued array `A`, optionally -specifying `weights` and `mean`. +Compute the standardized skewness of collection `x`, optionally +specifying a pre-computed `mean`. +If `x` is an `AbstractArray`, a `weights` array of the same length as `x` +can be specified to compute the weighted skewness. !!! compat "Julia 1.2" This function requires at least Julia 1.2. """ +skewness(A; mean::Union{Real, Nothing}=nothing) = _skewness(A, nothing, mean) + skewness(A::AbstractArray; - weights::Union{AbstractWeights,Nothing}=nothing, + weights::Union{AbstractArray,Nothing}=nothing, mean::Union{Real, Nothing}=nothing) = _skewness(A, weights, mean) -function _skewness(A::AbstractArray, w::Nothing, m::Real) - n = length(A) - cm2 = 0.0 # empirical 2nd centered moment (variance) - cm3 = 0.0 # empirical 3rd centered moment - for i in eachindex(A) - @inbounds z = A[i] - m - z2 = z * z +function _skewness(x, w::Nothing, m::Real) + y = iterate(x) + if y === nothing + T = eltype(x) + # Return the NaN of the type that we would get, had this collection + # contained any elements (this is consistent with var) + z0 = zero(T) - zero(m) + return (z0^3 + z0^3)/sqrt((z0^2+z0^2)^3) + end + v, s = y + z = v - m + cm2 = z * z # empirical 2nd centered moment (variance) + cm3 = cm2 * z # empirical 3rd centered moment + n = 1 + y = iterate(x, s) + while y !== nothing + v, s = y + n += 1 + + z = v - m + z2 = z * z cm2 += z2 cm3 += z2 * z + y = iterate(x, s) end cm3 /= n cm2 /= n - return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 + return cm3 / sqrt(cm2^3) end -function _skewness(A::AbstractArray, w::AbstractArray, m::Real) - n = length(A) - length(w) == n || throw(DimensionMismatch("Inconsistent array lengths.")) - cm2 = 0.0 # empirical 2nd centered moment (variance) - cm3 = 0.0 # empirical 3rd centered moment +function _skewness(x::AbstractArray{T}, w::AbstractArray{W}, m::Real) where {T, W} + length(x) == length(w) || + throw(ArgumentError("data and weight vectors must be the same size," * + "got $(length(v)) and $(length(w))")) + z0 = zero(T) - zero(m) + cm2 = z0 * zero(W) + z0 * zero(W) # empirical 2nd centered moment (variance) + cm3 = cm2 # empirical 3rd centered moment - @inbounds for i in eachindex(A, w) - x_i = A[i] - w_i = w[i] - z = x_i - m - z2w = z * z * w_i + @inbounds @simd for i in eachindex(x, w) + z = x[i] - m + z2w = z * z * w[i] cm2 += z2w cm3 += z2w * z end sw = sum(w) cm3 /= sw cm2 /= sw - return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 + return cm3 / sqrt(cm2^3) end _skewness(A::AbstractArray, w::Union{AbstractArray, Nothing}, m::Nothing) = @@ -245,52 +118,74 @@ _skewness(A::AbstractArray, w::Union{AbstractArray, Nothing}, m::Nothing) = # (excessive) Kurtosis # This is Type 1 definition according to Joanes and Gill (1998) """ - kurtosis(A; [weights::AbstractArray], [mean::Real]) + kurtosis(x; [weights::AbstractArray], [mean::Real]) -Compute the excess kurtosis of a real-valued array `A`, optionally -specifying `weights` and `mean`. +Compute the excess kurtosis of collection `x`, optionally +specifying a pre-computed `mean`. +If `x` is an `AbstractArray`, a `weights` array of the same length as `x` +can be specified to compute the weighted kurtosis. !!! compat "Julia 1.2" This function requires at least Julia 1.2. """ +kurtosis(A; mean::Union{Real, Nothing}=nothing) = _kurtosis(A, nothing, mean) + kurtosis(A::AbstractArray; - weights::Union{AbstractWeights,Nothing}=nothing, + weights::Union{AbstractArray,Nothing}=nothing, mean::Union{Real, Nothing}=nothing) = _kurtosis(A, weights, mean) -function _kurtosis(A::AbstractArray, w::Nothing, m::Real) - n = length(A) - cm2 = 0.0 # empirical 2nd centered moment (variance) - cm4 = 0.0 # empirical 4th centered moment - for i in eachindex(A) - @inbounds z = A[i] - m +function _kurtosis(x, w::Nothing, m::Real) + y = iterate(x) + if y === nothing + T = eltype(x) + # Return the NaN of the type that we would get, had this collection + # contained any elements (this is consistent with var) + z0 = zero(T) - zero(m) + return (z0^3 + z0^3)/sqrt((z0^2+z0^2)^3) + end + + v, s = y + z = v - m + cm2 = z * z # empirical 2nd centered moment (variance) + cm4 = cm2 * cm2 # empirical 4th centered moment + + n = 1 + y = iterate(x, s) + while y !== nothing + v, s = y + n += 1 + + z = v - m z2 = z * z cm2 += z2 cm4 += z2 * z2 + y = iterate(x, s) end cm4 /= n cm2 /= n - return (cm4 / (cm2 * cm2)) - 3.0 + return (cm4 / (cm2 * cm2)) - 3 end -function _kurtosis(A::AbstractArray, w::AbstractWeights, m::Real) - length(w) == length(A) || throw(DimensionMismatch("Inconsistent array lengths.")) - cm2 = 0.0 # empirical 2nd centered moment (variance) - cm4 = 0.0 # empirical 4th centered moment +function _kurtosis(x::AbstractArray{T}, w::AbstractWeights{W}, m::Real) where {T, W} + length(x) == length(w) || + throw(ArgumentError("data and weight vectors must be the same size," * + "got $(length(v)) and $(length(w))")) + z0 = zero(T) - zero(m) + cm2 = z0 * zero(W) + z0 * zero(W) # empirical 2nd centered moment (variance) + cm4 = cm2 # empirical 4rd centered moment - @inbounds for i in eachindex(A, w) - x_i = A[i] - w_i = w[i] - z = x_i - m + @inbounds @simd for i in eachindex(x, w) + z = x[i] - m z2 = z * z - z2w = z2 * w_i + z2w = z2 * w[i] cm2 += z2w cm4 += z2w * z2 end sw = sum(w) cm4 /= sw cm2 /= sw - return (cm4 / (cm2 * cm2)) - 3.0 + return (cm4 / (cm2 * cm2)) - 3 end _kurtosis(A::AbstractArray, w::Union{AbstractWeights, Nothing}, m::Nothing) = diff --git a/stdlib/Statistics/test/moments.jl b/stdlib/Statistics/test/moments.jl index b760d818b64dc..b1dbbbe86dd03 100644 --- a/stdlib/Statistics/test/moments.jl +++ b/stdlib/Statistics/test/moments.jl @@ -26,26 +26,6 @@ w = [3.84, 2.70, 8.29, 8.91, 9.71, 0.0] @test std(x, weights=wv, corrected=false) ≈ expected_std @test std(x, weights=wv, mean=m, corrected=false) ≈ expected_std end - - @testset "Mean and Variance" begin - (m, v) = mean_and_var(x, corrected=false) - @test m == mean(x) - @test v == var(x, corrected=corrected=false) - - (m, v) = mean_and_var(x, weights=wv, corrected=false) - @test m == mean(x, weights=wv) - @test v == var(x, weights=wv, corrected=false) - end - - @testset "Mean and Standard Deviation" begin - (m, s) = mean_and_std(x; corrected=false) - @test m == mean(x) - @test s == std(x; corrected=false) - - (m, s) = mean_and_std(x, weights=wv, corrected=false) - @test m == mean(x, weights=wv) - @test s == std(x, weights=wv, corrected=false) - end end # expected corrected output for (weights, aweights, fweights, pweights) @@ -73,34 +53,6 @@ expected_std = sqrt.(expected_var) @test std(x, weights=wv, mean=m, corrected=true) ≈ expected_std[i] end end - - @testset "Mean and Variance" begin - (m, v) = mean_and_var(x; corrected=true) - @test m == mean(x) - @test v == var(x, corrected=true) - - if isa(wv, Weights) - @test_throws ArgumentError mean_and_var(x, weights=wv, corrected=true) - else - (m, v) = mean_and_var(x, weights=wv, corrected=true) - @test m == mean(x, weights=wv) - @test v == var(x, weights=wv, corrected=true) - end - end - - @testset "Mean and Standard Deviation" begin - (m, s) = mean_and_std(x, corrected=true) - @test m == mean(x) - @test s == std(x, corrected=true) - - if isa(wv, Weights) - @test_throws ArgumentError mean_and_std(x, weights=wv, corrected=true) - else - (m, s) = mean_and_std(x, weights=wv, corrected=true) - @test m == mean(x, weights=wv) - @test s == std(x, weights=wv, corrected=true) - end - end end x = rand(5, 6) @@ -131,38 +83,6 @@ w2 = rand(6) @test std(x, weights=wv1, dims=1, mean=m1, corrected=false) ≈ expected_std1 @test std(x, weights=wv2, dims=2, mean=m2, corrected=false) ≈ expected_std2 end - - @testset "Mean and Variance" begin - for d in 1:2 - (m, v) = mean_and_var(x, dims=d, corrected=false) - @test m == mean(x, dims=d) - @test v == var(x, dims=d, corrected=false) - end - - (m, v) = mean_and_var(x, weights=wv1, dims=1, corrected=false) - @test m == mean(x, weights=wv1, dims=1) - @test v == var(x, weights=wv1, dims=1, corrected=false) - - (m, v) = mean_and_var(x, weights=wv2, dims=2, corrected=false) - @test m == mean(x, weights=wv2, dims=2) - @test v == var(x, weights=wv2, dims=2, corrected=false) - end - - @testset "Mean and Standard Deviation" begin - for d in 1:2 - (m, s) = mean_and_std(x, dims=d, corrected=false) - @test m == mean(x, dims=d) - @test s == std(x, dims=d, corrected=false) - end - - (m, s) = mean_and_std(x, weights=wv1, dims=1, corrected=false) - @test m == mean(x, weights=wv1, dims=1) - @test s == std(x, weights=wv1, dims=1, corrected=false) - - (m, s) = mean_and_std(x, weights=wv2, dims=2, corrected=false) - @test m == mean(x, weights=wv2, dims=2) - @test s == std(x, weights=wv2, dims=2, corrected=false) - end end @testset "Corrected with $f" for f in weight_funcs @@ -199,46 +119,6 @@ end @test std(x, weights=wv2, dims=2, mean=m2, corrected=true) ≈ expected_std2 end end - - @testset "Mean and Variance" begin - for d in 1:2 - (m, v) = mean_and_var(x, dims=d, corrected=true) - @test m == mean(x, dims=d) - @test v == var(x, dims=d, corrected=true) - end - - if isa(wv1, Weights) - @test_throws ArgumentError mean_and_var(x, weights=wv1, dims=1, corrected=true) - else - (m, v) = mean_and_var(x, weights=wv1, dims=1, corrected=true) - @test m == mean(x, weights=wv1, dims=1) - @test v == var(x, weights=wv1, dims=1, corrected=true) - - (m, v) = mean_and_var(x, weights=wv2, dims=2; corrected=true) - @test m == mean(x, weights=wv2, dims=2) - @test v == var(x, weights=wv2, dims=2; corrected=true) - end - end - - @testset "Mean and Standard Deviation" begin - for d in 1:2 - (m, s) = mean_and_std(x, dims=d, corrected=true) - @test m == mean(x, dims=d) - @test s == std(x, dims=d, corrected=true) - end - - if isa(wv1, Weights) - @test_throws ArgumentError mean_and_std(x, weights=wv1, dims=1, corrected=true) - else - (m, s) = mean_and_std(x, weights=wv1, dims=1, corrected=true) - @test m == mean(x, weights=wv1, dims=1) - @test s == std(x, weights=wv1, dims=1, corrected=true) - - (m, s) = mean_and_std(x, weights=wv2, dims=2, corrected=true) - @test m == mean(x, weights=wv2, dims=2) - @test s == std(x, weights=wv2, dims=2, corrected=true) - end - end end @testset "Skewness and Kurtosis with $f" for f in weight_funcs @@ -258,24 +138,4 @@ end @test kurtosis([1, 2, 3, 4, 5], weights=wv) ≈ -1.3 end -@testset "General Moments with $f" for f in weight_funcs - x = collect(2.0:8.0) - @test moment(x, 2) ≈ sum((x .- 5).^2) / length(x) - @test moment(x, 3) ≈ sum((x .- 5).^3) / length(x) - @test moment(x, 4) ≈ sum((x .- 5).^4) / length(x) - @test moment(x, 5) ≈ sum((x .- 5).^5) / length(x) - - @test moment(x, 2, mean=4.0) ≈ sum((x .- 4).^2) / length(x) - @test moment(x, 3, mean=4.0) ≈ sum((x .- 4).^3) / length(x) - @test moment(x, 4, mean=4.0) ≈ sum((x .- 4).^4) / length(x) - @test moment(x, 5, mean=4.0) ≈ sum((x .- 4).^5) / length(x) - - w = f([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) - x2 = collect(2.0:6.0) - @test moment(x, 2, weights=w) ≈ sum((x2 .- 4).^2) / 5 - @test moment(x, 3, weights=w) ≈ sum((x2 .- 4).^3) / 5 - @test moment(x, 4, weights=w) ≈ sum((x2 .- 4).^4) / 5 - @test moment(x, 5, weights=w) ≈ sum((x2 .- 4).^5) / 5 -end - end From 9a410482f2a1c6da832fb5a92412f765854c748c Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 7 Apr 2019 19:13:27 +0200 Subject: [PATCH 03/12] Implement weighted sum --- base/reducedim.jl | 214 ++++++++++++++++++++++++++-- stdlib/Statistics/src/Statistics.jl | 6 +- stdlib/Statistics/src/weights.jl | 192 +------------------------ 3 files changed, 209 insertions(+), 203 deletions(-) diff --git a/base/reducedim.jl b/base/reducedim.jl index 44a72a9a25b9d..42a1d616d2b8b 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -649,12 +649,16 @@ for (fname, _fname, op) in [(:sum, :_sum, :add_sum), (:prod, :_prod, (:maximum, :_maximum, :max), (:minimum, :_minimum, :min)] @eval begin # User-facing methods with keyword arguments - @inline ($fname)(a::AbstractArray; dims=:) = ($_fname)(a, dims) - @inline ($fname)(f, a::AbstractArray; dims=:) = ($_fname)(f, a, dims) + @inline ($fname)(a::AbstractArray; + dims=:, weights::Union{AbstractArray,Nothing}=nothing) = + ($_fname)(a, dims, weights) + @inline ($fname)(f, a::AbstractArray; + dims=:, weights::Union{AbstractArray,Nothing}=nothing) = + ($_fname)(f, a, dims, weights) # Underlying implementations using dispatch - ($_fname)(a, ::Colon) = ($_fname)(identity, a, :) - ($_fname)(f, a, ::Colon) = mapreduce(f, $op, a) + ($_fname)(a, ::Colon, weights) = ($_fname)(identity, a, :, weights) + ($_fname)(f, a, ::Colon, ::Nothing) = mapreduce(f, $op, a) end end @@ -669,17 +673,209 @@ for (fname, op) in [(:sum, :add_sum), (:prod, :mul_prod), (:maximum, :max), (:minimum, :min), (:all, :&), (:any, :|)] fname! = Symbol(fname, '!') + _fname! = Symbol('_', fname, '!') _fname = Symbol('_', fname) @eval begin - $(fname!)(f::Function, r::AbstractArray, A::AbstractArray; init::Bool=true) = + $(fname!)(r::AbstractArray, A::AbstractArray; + init::Bool=true, weights::Union{AbstractArray,Nothing}=nothing) = + $(fname!)(identity, r, A; init=init, weights=weights) + $(fname!)(f::Function, r::AbstractArray, A::AbstractArray; + init::Bool=true, weights::Union{AbstractArray,Nothing}=nothing) = + $(_fname!)(f, r, A, weights; init=init) + + # Underlying implementations using dispatch + $(_fname!)(f, r::AbstractArray, A::AbstractArray, ::Nothing; init::Bool=true) = mapreducedim!(f, $(op), initarray!(r, $(op), init, A), A) - $(fname!)(r::AbstractArray, A::AbstractArray; init::Bool=true) = $(fname!)(identity, r, A; init=init) + $(_fname)(A, dims, weights) = $(_fname)(identity, A, dims, weights) + $(_fname)(f, A, dims, ::Nothing) = mapreduce(f, $(op), A, dims=dims) + end +end - $(_fname)(A, dims) = $(_fname)(identity, A, dims) - $(_fname)(f, A, dims) = mapreduce(f, $(op), A, dims=dims) + +# Weighted sum + +function _sum(A::AbstractArray, dims::Colon, w::AbstractArray{<:Real}) + sw = size(w) + sA = size(A) + if sw != sA + throw(DimensionMismatch("weights must have the same dimension as data (got $sw and $sA).")) end + s0 = zero(eltype(A)) * zero(eltype(w)) + s = s0 + s0 + @inbounds @simd for i in eachindex(A, w) + s += A[i] * w[i] + end + s +end + +# Weighted sum over dimensions +# +# Brief explanation of the algorithm: +# ------------------------------------ +# +# 1. _wsum! provides the core implementation, which assumes that +# the dimensions of all input arguments are consistent, and no +# dimension checking is performed therein. +# +# wsum and wsum! perform argument checking and call _wsum! +# internally. +# +# 2. _wsum! adopt a Cartesian based implementation for general +# sub types of AbstractArray. Particularly, a faster routine +# that keeps a local accumulator will be used when dim = 1. +# +# The internal function that implements this is _wsum_general! +# +# 3. _wsum! is specialized for following cases: +# (a) A is a vector: we invoke the vector version wsum above. +# The internal function that implements this is _wsum1! +# +# (b) A is a dense matrix with eltype <: Union{Float64,Float32}: we call gemv! +# The internal function that implements this is _wsum2_blas! +# +# (c) A is a contiguous array with eltype <: Union{Float64,Float32}: +# dim == 1: treat A like a matrix of size (d1, d2 x ... x dN) +# dim == N: treat A like a matrix of size (d1 x ... x d(N-1), dN) +# otherwise: decompose A into multiple pages, and apply _wsum2! +# for each +# +# (d) A is a general dense array with eltype <: Union{Float64,Float32}: +# dim <= 2: delegate to (a) and (b) +# otherwise, decompose A into multiple pages +# + +function _wsum1!(R::AbstractArray, A::AbstractVector, w::AbstractVector, init::Bool) + r = wsum(A, w) + if init + R[1] = r + else + R[1] += r + end + return R +end + +function _wsum2_blas!(R::StridedVector{T}, A::StridedMatrix{T}, w::StridedVector{T}, + dim::Int, init::Bool) where T<:Union{Float64,Float32} + beta = ifelse(init, zero(T), one(T)) + trans = dim == 1 ? 'T' : 'N' + BLAS.gemv!(trans, one(T), A, w, beta, R) + return R +end + +function _wsumN!(R::StridedArray{T}, A::StridedArray{T,N}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:Union{Float64,Float32},N} + if dim == 1 + m = size(A, 1) + n = div(length(A), m) + _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 1, init) + elseif dim == N + n = size(A, N) + m = div(length(A), n) + _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 2, init) + else # 1 < dim < N + m = 1 + for i = 1:dim-1; m *= size(A, i); end + n = size(A, dim) + k = 1 + for i = dim+1:N; k *= size(A, i); end + Av = reshape(A, (m, n, k)) + Rv = reshape(R, (m, k)) + for i = 1:k + _wsum2_blas!(view(Rv,:,i), view(Av,:,:,i), w, 2, init) + end + end + return R end +function _wsumN!(R::StridedArray{T}, A::DenseArray{T,N}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:Union{Float64,Float32},N} + @assert N >= 3 + if dim <= 2 + m = size(A, 1) + n = size(A, 2) + npages = 1 + for i = 3:N + npages *= size(A, i) + end + rlen = ifelse(dim == 1, n, m) + Rv = reshape(R, (rlen, npages)) + for i = 1:npages + _wsum2_blas!(view(Rv,:,i), view(A,:,:,i), w, dim, init) + end + else + _wsum_general!(R, identity, A, w, dim, init) + end + return R +end + +# General weighted sum over dimensions +function _wsum_general!(R::AbstractArray{S}, A::AbstractArray{T, N}, w::AbstractVector{W}, + dim::Int, init::Bool) where {S, T, N, W} + # following the implementation of _mapreducedim! + lsiz = Base.check_reducedims(R,A) + isempty(R) || fill!(R, zero(S)) + isempty(A) && return R + + indsAt, indsRt = Base.safe_tail(axes(A)), Base.safe_tail(axes(R)) # handle d=1 manually + keep, Idefault = Broadcast.shapeindexer(indsRt) + if Base.reducedim1(R, A) + i1 = first(Base.axes1(R)) + for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + r = R[i1,IR] + @simd for i in axes(A, 1) + r += A[i,IA] * w[dim > 1 ? IA[dim-1] : i] + end + R[i1,IR] = r + end + else + for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + @simd for i in axes(A, 1) + R[i,IR] += A[i,IA] * w[dim > 1 ? IA[dim-1] : i] + end + end + end + return R +end + +_wsum!(R::StridedArray{T}, A::DenseArray{T,1}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:Union{Float64,Float32}} = + _wsum1!(R, A, w, init) + +_wsum!(R::StridedArray{T}, A::DenseArray{T,2}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:Union{Float64,Float32}} = + _wsum2_blas!(view(R,:), A, w, init) + +_wsum!(R::StridedArray{T}, A::DenseArray{T}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:Union{Float64,Float32}} = + _wsumN!(R, A, w, init) + +_wsum!(R::AbstractArray, A::AbstractArray, w::AbstractVector, + dim::Int, init::Bool) = + _wsum_general!(R, A, w, dim, init) + +function _sum!(::typeof(identity), R::AbstractArray, A::AbstractArray{T,N}, + w::AbstractVector; + init::Bool=true) where {T,N} + Base.check_reducedims(R,A) + reddims = size(R) .!= size(A) + dim = something(findfirst(reddims), ndims(R)+1) + if findnext(reddims, dim+1) !== nothing + throw(ArgumentError("reducing over more than one dimension is not supported with weights")) + end + lw = length(w) + ldim = size(A, dim) + if lw != ldim + throw(DimensionMismatch("weights must have the same length as the dimension " * + "over which reduction is performed (got $lw and $ldim).")) + end + _wsum!(R, A, w, dim, init) +end + +_sum(A::AbstractArray, dims, w::AbstractArray) = + wsum!(Base.reducedim_init(t -> (t*zero(eltype(w)))/2, +, A, dims), A, w) + ##### findmin & findmax ##### # The initial values of Rval are not used if the corresponding indices in Rind are 0. # @@ -879,4 +1075,4 @@ julia> argmax(A, dims=2) CartesianIndex(2, 2) ``` """ -argmax(A::AbstractArray; dims=:) = findmax(A; dims=dims)[2] +argmax(A::AbstractArray; dims=:) = findmax(A; dims=dims)[2] \ No newline at end of file diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl index 7ae3ee6986397..a61228052adf1 100644 --- a/stdlib/Statistics/src/Statistics.jl +++ b/stdlib/Statistics/src/Statistics.jl @@ -141,7 +141,7 @@ function _mean!(R::AbstractArray, A::AbstractArray, weights::Nothing) end _mean!(R::AbstractArray, A::AbstractArray, w::AbstractArray) = - rmul!(wsum!(R, A, w), inv(sum(w))) + rmul!(sum!(R, A, weights=w), inv(sum(w))) """ mean(A::AbstractArray; [dims], [weights::AbstractArray]) @@ -192,10 +192,10 @@ end _mean(A::AbstractArray, dims, weights::Nothing) = _mean!(Base.reducedim_init(t -> t/2, +, A, dims), A, nothing) -_mean(A::AbstractArray, ::Colon, weights::Nothing) = sum(A) / length(A) +_mean(A::AbstractArray, dims::Colon, weights::Nothing) = sum(A) / length(A) _mean(A::AbstractArray, dims::Colon, w::AbstractArray) = - wsum(A, w) / sum(w) + sum(A, weights=w) / sum(w) _mean(A::AbstractArray, dims, w::AbstractArray) = _mean!(Base.reducedim_init(t -> (t*zero(eltype(w)))/2, +, A, dims), A, w) diff --git a/stdlib/Statistics/src/weights.jl b/stdlib/Statistics/src/weights.jl index dc26fff56d923..ac8b01c4a89e1 100644 --- a/stdlib/Statistics/src/weights.jl +++ b/stdlib/Statistics/src/weights.jl @@ -241,194 +241,4 @@ for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) end Base.isequal(x::AbstractWeights, y::AbstractWeights) = false -Base.:(==)(x::AbstractWeights, y::AbstractWeights) = false - -##### Weighted sum ##### - -## weighted sum over vectors - -""" - wsum(v::AbstractArray, w::AbstractArray, [dim]) - -Compute the weighted sum of an array `v` with weights `w`, optionally over the dimension `dim`. -""" -function wsum(A::AbstractArray, w::AbstractArray) - sw = size(w) - sA = size(A) - if sw != sA - throw(DimensionMismatch("weights must have the same dimension as data (got $sw and $sA).")) - end - _wsum(A, w) -end - -_wsum(A::AbstractVector, w::AbstractVector) = dot(A, w) -_wsum(A::AbstractArray, w::AbstractArray) = dot(vec(A), vec(w)) - -## wsum along dimension -# -# Brief explanation of the algorithm: -# ------------------------------------ -# -# 1. _wsum! provides the core implementation, which assumes that -# the dimensions of all input arguments are consistent, and no -# dimension checking is performed therein. -# -# wsum and wsum! perform argument checking and call _wsum! -# internally. -# -# 2. _wsum! adopt a Cartesian based implementation for general -# sub types of AbstractArray. Particularly, a faster routine -# that keeps a local accumulator will be used when dim = 1. -# -# The internal function that implements this is _wsum_general! -# -# 3. _wsum! is specialized for following cases: -# (a) A is a vector: we invoke the vector version wsum above. -# The internal function that implements this is _wsum1! -# -# (b) A is a dense matrix with eltype <: BlasReal: we call gemv! -# The internal function that implements this is _wsum2_blas! -# -# (c) A is a contiguous array with eltype <: BlasReal: -# dim == 1: treat A like a matrix of size (d1, d2 x ... x dN) -# dim == N: treat A like a matrix of size (d1 x ... x d(N-1), dN) -# otherwise: decompose A into multiple pages, and apply _wsum2! -# for each -# -# (d) A is a general dense array with eltype <: BlasReal: -# dim <= 2: delegate to (a) and (b) -# otherwise, decompose A into multiple pages -# - -function _wsum1!(R::AbstractArray, A::AbstractVector, w::AbstractVector, init::Bool) - r = wsum(A, w) - if init - R[1] = r - else - R[1] += r - end - return R -end - -function _wsum2_blas!(R::StridedVector{T}, A::StridedMatrix{T}, w::StridedVector{T}, - dim::Int, init::Bool) where T<:BlasReal - beta = ifelse(init, zero(T), one(T)) - trans = dim == 1 ? 'T' : 'N' - BLAS.gemv!(trans, one(T), A, w, beta, R) - return R -end - -function _wsumN!(R::StridedArray{T}, A::StridedArray{T,N}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:BlasReal,N} - if dim == 1 - m = size(A, 1) - n = div(length(A), m) - _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 1, init) - elseif dim == N - n = size(A, N) - m = div(length(A), n) - _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 2, init) - else # 1 < dim < N - m = 1 - for i = 1:dim-1; m *= size(A, i); end - n = size(A, dim) - k = 1 - for i = dim+1:N; k *= size(A, i); end - Av = reshape(A, (m, n, k)) - Rv = reshape(R, (m, k)) - for i = 1:k - _wsum2_blas!(view(Rv,:,i), view(Av,:,:,i), w, 2, init) - end - end - return R -end - -function _wsumN!(R::StridedArray{T}, A::DenseArray{T,N}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:BlasReal,N} - @assert N >= 3 - if dim <= 2 - m = size(A, 1) - n = size(A, 2) - npages = 1 - for i = 3:N - npages *= size(A, i) - end - rlen = ifelse(dim == 1, n, m) - Rv = reshape(R, (rlen, npages)) - for i = 1:npages - _wsum2_blas!(view(Rv,:,i), view(A,:,:,i), w, dim, init) - end - else - _wsum_general!(R, identity, A, w, dim, init) - end - return R -end - -# General weighted sum across dimensions -function _wsum_general!(R::AbstractArray{S}, A::AbstractArray{T, N}, w::AbstractVector{W}, - dim::Int, init::Bool) where {S, T, N, W} - # following the implementation of _mapreducedim! at base/reducedim.jl - lsiz = Base.check_reducedims(R,A) - isempty(R) || fill!(R, zero(S)) - isempty(A) && return R - - indsAt, indsRt = Base.safe_tail(axes(A)), Base.safe_tail(axes(R)) # handle d=1 manually - keep, Idefault = Broadcast.shapeindexer(indsRt) - if Base.reducedim1(R, A) - i1 = first(Base.axes1(R)) - for IA in CartesianIndices(indsAt) - IR = Broadcast.newindex(IA, keep, Idefault) - r = R[i1,IR] - @simd for i in axes(A, 1) - r += A[i,IA] * w[dim > 1 ? IA[dim-1] : i] - end - R[i1,IR] = r - end - else - for IA in CartesianIndices(indsAt) - IR = Broadcast.newindex(IA, keep, Idefault) - @simd for i in axes(A, 1) - R[i,IR] += A[i,IA] * w[dim > 1 ? IA[dim-1] : i] - end - end - end - return R -end - -# N = 1 -_wsum!(R::StridedArray{T}, A::DenseArray{T,1}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:BlasReal} = - _wsum1!(R, A, w, init) - -# N = 2 -_wsum!(R::StridedArray{T}, A::DenseArray{T,2}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:BlasReal} = - _wsum2_blas!(view(R,:), A, w, init) - -# N >= 3 -_wsum!(R::StridedArray{T}, A::DenseArray{T}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:BlasReal} = - _wsumN!(R, A, w, init) - -_wsum!(R::AbstractArray, A::AbstractArray, w::AbstractVector, - dim::Int, init::Bool) = - _wsum_general!(R, A, w, dim, init) - -## wsum! and wsum - -function wsum!(R::AbstractArray, A::AbstractArray{T,N}, w::AbstractVector; - init::Bool=true) where {T,N} - Base.check_reducedims(R,A) - reddims = size(R) .!= size(A) - dim = something(findfirst(reddims), ndims(R)+1) - if findnext(reddims, dim+1) !== nothing - throw(ArgumentError("reducing over more than one dimension is not supported with weights")) - end - lw = length(w) - ldim = size(A, dim) - if lw != ldim - throw(DimensionMismatch("weights must have the same length as the dimension " * - "over which reduction is performed (got $lw and $ldim).")) - end - _wsum!(R, A, w, dim, init) -end \ No newline at end of file +Base.:(==)(x::AbstractWeights, y::AbstractWeights) = false \ No newline at end of file From 022d0295415a28a874834632941434591fefbaf2 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 4 May 2019 14:45:16 +0200 Subject: [PATCH 04/12] Move optimized weighted sum methods to LinearAlgebra --- base/reducedim.jl | 157 ++++++++-------------- stdlib/LinearAlgebra/src/LinearAlgebra.jl | 3 +- stdlib/LinearAlgebra/src/wsum.jl | 84 ++++++++++++ test/reduce.jl | 20 +++ test/reducedim.jl | 24 ++++ 5 files changed, 185 insertions(+), 103 deletions(-) create mode 100644 stdlib/LinearAlgebra/src/wsum.jl diff --git a/base/reducedim.jl b/base/reducedim.jl index 42a1d616d2b8b..985f2d0737522 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -645,23 +645,27 @@ julia> any!([1 1], A) """ any!(r, A) -for (fname, _fname, op) in [(:sum, :_sum, :add_sum), (:prod, :_prod, :mul_prod), +for (fname, _fname, op) in [(:prod, :_prod, :mul_prod), (:maximum, :_maximum, :max), (:minimum, :_minimum, :min)] @eval begin # User-facing methods with keyword arguments - @inline ($fname)(a::AbstractArray; - dims=:, weights::Union{AbstractArray,Nothing}=nothing) = - ($_fname)(a, dims, weights) - @inline ($fname)(f, a::AbstractArray; - dims=:, weights::Union{AbstractArray,Nothing}=nothing) = - ($_fname)(f, a, dims, weights) + @inline ($fname)(a::AbstractArray; dims=:) = ($_fname)(a, dims) + @inline ($fname)(f, a::AbstractArray; dims=:) = ($_fname)(f, a, dims) # Underlying implementations using dispatch - ($_fname)(a, ::Colon, weights) = ($_fname)(identity, a, :, weights) - ($_fname)(f, a, ::Colon, ::Nothing) = mapreduce(f, $op, a) + ($_fname)(a, ::Colon) = ($_fname)(identity, a, :) + ($_fname)(f, a, ::Colon) = mapreduce(f, $op, a) end end +# Sum is the only reduction which supports weights +sum(a::AbstractArray; dims=:, weights::Union{AbstractArray,Nothing}=nothing) = + _sum(a, dims, weights) +sum(f, a::AbstractArray; dims=:, weights::Union{AbstractArray,Nothing}=nothing) = + _sum(f, a, dims, weights) +sum(a, ::Colon, weights) = _sum(identity, a, :, weights) +sum(f, a, ::Colon, ::Nothing) = mapreduce(f, add_sum, a) + any(a::AbstractArray; dims=:) = _any(a, dims) any(f::Function, a::AbstractArray; dims=:) = _any(f, a, dims) _any(a, ::Colon) = _any(identity, a, :) @@ -669,31 +673,40 @@ all(a::AbstractArray; dims=:) = _all(a, dims) all(f::Function, a::AbstractArray; dims=:) = _all(f, a, dims) _all(a, ::Colon) = _all(identity, a, :) -for (fname, op) in [(:sum, :add_sum), (:prod, :mul_prod), +for (fname, op) in [(:prod, :mul_prod), (:maximum, :max), (:minimum, :min), (:all, :&), (:any, :|)] fname! = Symbol(fname, '!') _fname! = Symbol('_', fname, '!') _fname = Symbol('_', fname) @eval begin - $(fname!)(r::AbstractArray, A::AbstractArray; - init::Bool=true, weights::Union{AbstractArray,Nothing}=nothing) = - $(fname!)(identity, r, A; init=init, weights=weights) - $(fname!)(f::Function, r::AbstractArray, A::AbstractArray; - init::Bool=true, weights::Union{AbstractArray,Nothing}=nothing) = - $(_fname!)(f, r, A, weights; init=init) + $(fname!)(r::AbstractArray, A::AbstractArray; init::Bool=true) = + $(fname!)(identity, r, A; init=init) + $(fname!)(f::Function, r::AbstractArray, A::AbstractArray; init::Bool=true) = + $(_fname!)(f, r, A; init=init) # Underlying implementations using dispatch - $(_fname!)(f, r::AbstractArray, A::AbstractArray, ::Nothing; init::Bool=true) = + $(_fname!)(f, r::AbstractArray, A::AbstractArray; init::Bool=true) = mapreducedim!(f, $(op), initarray!(r, $(op), init, A), A) - $(_fname)(A, dims, weights) = $(_fname)(identity, A, dims, weights) - $(_fname)(f, A, dims, ::Nothing) = mapreduce(f, $(op), A, dims=dims) + $(_fname)(A, dims) = $(_fname)(identity, A, dims) + $(_fname)(f, A, dims) = mapreduce(f, $(op), A, dims=dims) end end +# Sum is the only reduction which supports weights +sum!(r::AbstractArray, A::AbstractArray; + init::Bool=true, weights::Union{AbstractArray,Nothing}=nothing) = + sum!(identity, r, A; init=init, weights=weights) +sum!(f::Function, r::AbstractArray, A::AbstractArray; + init::Bool=true, weights::Union{AbstractArray,Nothing}=nothing) = + _sum!(f, r, A, weights; init=init) +_sum!(f, r::AbstractArray, A::AbstractArray, ::Nothing; init::Bool=true) = + mapreducedim!(f, add_sum, initarray!(r, add_sum, init, A), A) +_sum(A, dims, weights) = _sum(identity, A, dims, weights) +_sum(f, A, dims, ::Nothing) = mapreduce(f, add_sum, A, dims=dims) -# Weighted sum +# Weighted sum function _sum(A::AbstractArray, dims::Colon, w::AbstractArray{<:Real}) sw = size(w) sA = size(A) @@ -701,7 +714,7 @@ function _sum(A::AbstractArray, dims::Colon, w::AbstractArray{<:Real}) throw(DimensionMismatch("weights must have the same dimension as data (got $sw and $sA).")) end s0 = zero(eltype(A)) * zero(eltype(w)) - s = s0 + s0 + s = add_sum(s0, s0) @inbounds @simd for i in eachindex(A, w) s += A[i] * w[i] end @@ -730,19 +743,23 @@ end # (a) A is a vector: we invoke the vector version wsum above. # The internal function that implements this is _wsum1! # -# (b) A is a dense matrix with eltype <: Union{Float64,Float32}: we call gemv! +# (b) A is a dense matrix with eltype <: BlasReal: we call gemv! # The internal function that implements this is _wsum2_blas! +# (in LinearAlgebra/src/wsum.jl) # -# (c) A is a contiguous array with eltype <: Union{Float64,Float32}: +# (c) A is a contiguous array with eltype <: BlasReal: # dim == 1: treat A like a matrix of size (d1, d2 x ... x dN) # dim == N: treat A like a matrix of size (d1 x ... x d(N-1), dN) -# otherwise: decompose A into multiple pages, and apply _wsum2! +# otherwise: decompose A into multiple pages, and apply _wsum2_blas! # for each +# The internal function that implements this is _wsumN! +# (in LinearAlgebra/src/wsum.jl) # -# (d) A is a general dense array with eltype <: Union{Float64,Float32}: +# (d) A is a general dense array with eltype <: BlasReal: # dim <= 2: delegate to (a) and (b) # otherwise, decompose A into multiple pages -# +# The internal function that implements this is _wsumN! +# (in LinearAlgebra/src/wsum.jl) function _wsum1!(R::AbstractArray, A::AbstractVector, w::AbstractVector, init::Bool) r = wsum(A, w) @@ -754,72 +771,17 @@ function _wsum1!(R::AbstractArray, A::AbstractVector, w::AbstractVector, init::B return R end -function _wsum2_blas!(R::StridedVector{T}, A::StridedMatrix{T}, w::StridedVector{T}, - dim::Int, init::Bool) where T<:Union{Float64,Float32} - beta = ifelse(init, zero(T), one(T)) - trans = dim == 1 ? 'T' : 'N' - BLAS.gemv!(trans, one(T), A, w, beta, R) - return R -end - -function _wsumN!(R::StridedArray{T}, A::StridedArray{T,N}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:Union{Float64,Float32},N} - if dim == 1 - m = size(A, 1) - n = div(length(A), m) - _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 1, init) - elseif dim == N - n = size(A, N) - m = div(length(A), n) - _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 2, init) - else # 1 < dim < N - m = 1 - for i = 1:dim-1; m *= size(A, i); end - n = size(A, dim) - k = 1 - for i = dim+1:N; k *= size(A, i); end - Av = reshape(A, (m, n, k)) - Rv = reshape(R, (m, k)) - for i = 1:k - _wsum2_blas!(view(Rv,:,i), view(Av,:,:,i), w, 2, init) - end - end - return R -end - -function _wsumN!(R::StridedArray{T}, A::DenseArray{T,N}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:Union{Float64,Float32},N} - @assert N >= 3 - if dim <= 2 - m = size(A, 1) - n = size(A, 2) - npages = 1 - for i = 3:N - npages *= size(A, i) - end - rlen = ifelse(dim == 1, n, m) - Rv = reshape(R, (rlen, npages)) - for i = 1:npages - _wsum2_blas!(view(Rv,:,i), view(A,:,:,i), w, dim, init) - end - else - _wsum_general!(R, identity, A, w, dim, init) - end - return R -end - -# General weighted sum over dimensions -function _wsum_general!(R::AbstractArray{S}, A::AbstractArray{T, N}, w::AbstractVector{W}, - dim::Int, init::Bool) where {S, T, N, W} +function _wsum_general!(R::AbstractArray{S}, A::AbstractArray, w::AbstractVector, + dim::Int, init::Bool) where {S} # following the implementation of _mapreducedim! - lsiz = Base.check_reducedims(R,A) - isempty(R) || fill!(R, zero(S)) + lsiz = check_reducedims(R,A) + !isempty(R) && init && fill!(R, zero(S)) isempty(A) && return R - indsAt, indsRt = Base.safe_tail(axes(A)), Base.safe_tail(axes(R)) # handle d=1 manually + indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually keep, Idefault = Broadcast.shapeindexer(indsRt) - if Base.reducedim1(R, A) - i1 = first(Base.axes1(R)) + if reducedim1(R, A) + i1 = first(axes1(R)) for IA in CartesianIndices(indsAt) IR = Broadcast.newindex(IA, keep, Idefault) r = R[i1,IR] @@ -839,26 +801,17 @@ function _wsum_general!(R::AbstractArray{S}, A::AbstractArray{T, N}, w::Abstract return R end -_wsum!(R::StridedArray{T}, A::DenseArray{T,1}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:Union{Float64,Float32}} = +_wsum!(R::AbstractArray, A::AbstractArray, w::AbstractVector, + dim::Int, init::Bool) = _wsum1!(R, A, w, init) -_wsum!(R::StridedArray{T}, A::DenseArray{T,2}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:Union{Float64,Float32}} = - _wsum2_blas!(view(R,:), A, w, init) - -_wsum!(R::StridedArray{T}, A::DenseArray{T}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:Union{Float64,Float32}} = - _wsumN!(R, A, w, init) - _wsum!(R::AbstractArray, A::AbstractArray, w::AbstractVector, dim::Int, init::Bool) = _wsum_general!(R, A, w, dim, init) -function _sum!(::typeof(identity), R::AbstractArray, A::AbstractArray{T,N}, - w::AbstractVector; +function _sum!(::typeof(identity), R::AbstractArray, A::AbstractArray{T,N}, w::AbstractVector; init::Bool=true) where {T,N} - Base.check_reducedims(R,A) + check_reducedims(R,A) reddims = size(R) .!= size(A) dim = something(findfirst(reddims), ndims(R)+1) if findnext(reddims, dim+1) !== nothing @@ -874,7 +827,7 @@ function _sum!(::typeof(identity), R::AbstractArray, A::AbstractArray{T,N}, end _sum(A::AbstractArray, dims, w::AbstractArray) = - wsum!(Base.reducedim_init(t -> (t*zero(eltype(w)))/2, +, A, dims), A, w) + _sum!(identity, reducedim_init(t -> t*zero(eltype(w)), add_sum, A, dims), A, w) ##### findmin & findmax ##### # The initial values of Rval are not used if the corresponding indices in Rind are 0. diff --git a/stdlib/LinearAlgebra/src/LinearAlgebra.jl b/stdlib/LinearAlgebra/src/LinearAlgebra.jl index dc11793bb085a..bfec521601471 100644 --- a/stdlib/LinearAlgebra/src/LinearAlgebra.jl +++ b/stdlib/LinearAlgebra/src/LinearAlgebra.jl @@ -14,7 +14,7 @@ import Base: USE_BLAS64, abs, acos, acosh, acot, acoth, acsc, acsch, adjoint, as getproperty, imag, inv, isapprox, isone, iszero, IndexStyle, kron, length, log, map, ndims, oneunit, parent, power_by_squaring, print_matrix, promote_rule, real, round, sec, sech, setindex!, show, similar, sin, sincos, sinh, size, sqrt, - strides, stride, tan, tanh, transpose, trunc, typed_hcat, vec + strides, stride, tan, tanh, transpose, trunc, typed_hcat, vec, _wsum! using Base: hvcat_fill, IndexLinear, promote_op, promote_typeof, @propagate_inbounds, @pure, reduce, typed_vcat, require_one_based_indexing using Base.Broadcast: Broadcasted @@ -373,6 +373,7 @@ include("bitarray.jl") include("ldlt.jl") include("schur.jl") include("structuredbroadcast.jl") +include("wsum.jl") include("deprecated.jl") const ⋅ = dot diff --git a/stdlib/LinearAlgebra/src/wsum.jl b/stdlib/LinearAlgebra/src/wsum.jl new file mode 100644 index 0000000000000..62ac390a85e47 --- /dev/null +++ b/stdlib/LinearAlgebra/src/wsum.jl @@ -0,0 +1,84 @@ +# Optimized methods for weighted sum over dimensions +# (generic method is defined in base/reducedim.jl) +# +# _wsum! is specialized for following cases: +# (a) A is a dense matrix with eltype <: BlasReal: we call gemv! +# The internal function that implements this is _wsum2_blas! +# +# (b) A is a contiguous array with eltype <: BlasReal: +# dim == 1: treat A like a matrix of size (d1, d2 x ... x dN) +# dim == N: treat A like a matrix of size (d1 x ... x d(N-1), dN) +# otherwise: decompose A into multiple pages, and apply _wsum2_blas! +# for each +# The internal function that implements this is _wsumN! +# +# (c) A is a general dense array with eltype <: BlasReal: +# dim <= 2: delegate to (a) and (b) +# otherwise, decompose A into multiple pages +# The internal function that implements this is _wsumN! + +function _wsum2_blas!(R::StridedVector{T}, A::StridedMatrix{T}, w::StridedVector{T}, + dim::Int, init::Bool) where T<:BlasReal + beta = ifelse(init, zero(T), one(T)) + trans = dim == 1 ? 'T' : 'N' + BLAS.gemv!(trans, one(T), A, w, beta, R) + return R +end + +function _wsumN!(R::StridedArray{T}, A::StridedArray{T,N}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal,N} + if dim == 1 + m = size(A, 1) + n = div(length(A), m) + _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 1, init) + elseif dim == N + n = size(A, N) + m = div(length(A), n) + _wsum2_blas!(view(R,:), reshape(A, (m, n)), w, 2, init) + else # 1 < dim < N + m = 1 + for i = 1:dim-1 + m *= size(A, i) + end + n = size(A, dim) + k = 1 + for i = dim+1:N + k *= size(A, i) + end + Av = reshape(A, (m, n, k)) + Rv = reshape(R, (m, k)) + for i = 1:k + _wsum2_blas!(view(Rv,:,i), view(Av,:,:,i), w, 2, init) + end + end + return R +end + +function _wsumN!(R::StridedArray{T}, A::DenseArray{T,N}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal,N} + @assert N >= 3 + if dim <= 2 + m = size(A, 1) + n = size(A, 2) + npages = 1 + for i = 3:N + npages *= size(A, i) + end + rlen = ifelse(dim == 1, n, m) + Rv = reshape(R, (rlen, npages)) + for i = 1:npages + _wsum2_blas!(view(Rv,:,i), view(A,:,:,i), w, dim, init) + end + else + Base._wsum_general!(R, A, w, dim, init) + end + return R +end + +_wsum!(R::StridedArray{T}, A::DenseArray{T,2}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = + _wsum2_blas!(view(R,:), A, w, dim, init) + +_wsum!(R::StridedArray{T}, A::DenseArray{T}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = + _wsumN!(R, A, w, dim, init) \ No newline at end of file diff --git a/test/reduce.jl b/test/reduce.jl index eb585e8a630f1..8dfb21a920ce4 100644 --- a/test/reduce.jl +++ b/test/reduce.jl @@ -533,3 +533,23 @@ x = [j^2 for j in i] i = Base.Slice(0:0) x = [j+7 for j in i] @test sum(x) == 7 + +@testset "weighted sum" begin + for a in (rand(3), rand(Int, 3), rand(Int8, 3)) + for wt in ([1.0, 1.0, 1.0], [2, 1, 3], Int8[1, 2, 3], [1.1, 0.2, 0.0]) + res = @inferred sum(a, weights=wt) + expected = sum(a.*wt) + @test res ≈ expected + @test typeof(res) == typeof(expected) + end + end + for a in (rand(3, 5), rand(Int, 3, 5), rand(Int8, 3, 5)) + for wt in ([1.0, 1.0, 1.0], [2, 1, 3], Int8[1, 2, 3], [1.1, 0.2, 0.0]) + wtr = repeat(wt, outer=(1, 5)) + res = @inferred sum(a, weights=wtr) + expected = sum(a.*wtr) + @test res ≈ expected + @test typeof(res) == typeof(expected) + end + end +end \ No newline at end of file diff --git a/test/reducedim.jl b/test/reducedim.jl index 2a39b706d2a73..a381e7942f716 100644 --- a/test/reducedim.jl +++ b/test/reducedim.jl @@ -398,3 +398,27 @@ end @test_throws DimensionMismatch maximum!(fill(0, 1, 1, 2, 1), B) @test_throws DimensionMismatch minimum!(fill(0, 1, 1, 2, 1), B) end + +@testset "weighted sum over region" begin + for a in (rand(3, 3, 3), rand(Int, 3, 3, 3), rand(Int8, 3, 3, 3)) + for wt in ([1.0, 1.0, 1.0], [2, 1, 3], Int8[1, 2, 3], [1.1, 0.2, 0.0]) + for (d, rw) in ((1, reshape(wt, :, 1, 1)), + (2, reshape(wt, 1, :, 1)), + (3, reshape(wt, 1, 1, :))) + expected = sum(a.*rw, dims=d) + res = @inferred sum(a, weights=wt, dims=d) + @test res ≈ expected + @test typeof(res) == typeof(expected) + x = rand!(similar(expected)) + y = copy(x) + @inferred sum!(y, a, weights=wt) + @test y ≈ expected + y = copy(x) + @inferred sum!(y, a, weights=wt, init=false) + @test y ≈ x + expected + end + + @test_throws DimensionMismatch sum(a, weights=wt, dims=4) + end + end +end \ No newline at end of file From 24f530ac88267768f217dd44bce293610a561065 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 8 May 2019 13:38:06 +0200 Subject: [PATCH 05/12] More tests --- base/reducedim.jl | 4 +- stdlib/LinearAlgebra/src/wsum.jl | 22 ++++++--- test/reduce.jl | 31 ++++++++---- test/reducedim.jl | 82 +++++++++++++++++++++++++++----- 4 files changed, 110 insertions(+), 29 deletions(-) diff --git a/base/reducedim.jl b/base/reducedim.jl index 985f2d0737522..28a0a170eab11 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -762,7 +762,7 @@ end # (in LinearAlgebra/src/wsum.jl) function _wsum1!(R::AbstractArray, A::AbstractVector, w::AbstractVector, init::Bool) - r = wsum(A, w) + r = _sum(A, :, w) if init R[1] = r else @@ -801,7 +801,7 @@ function _wsum_general!(R::AbstractArray{S}, A::AbstractArray, w::AbstractVector return R end -_wsum!(R::AbstractArray, A::AbstractArray, w::AbstractVector, +_wsum!(R::AbstractArray, A::AbstractVector, w::AbstractVector, dim::Int, init::Bool) = _wsum1!(R, A, w, init) diff --git a/stdlib/LinearAlgebra/src/wsum.jl b/stdlib/LinearAlgebra/src/wsum.jl index 62ac390a85e47..8b0ecc1478cf7 100644 --- a/stdlib/LinearAlgebra/src/wsum.jl +++ b/stdlib/LinearAlgebra/src/wsum.jl @@ -1,4 +1,10 @@ -# Optimized methods for weighted sum over dimensions +# Optimized method for weighted sum with BlasReal +# dot cannot be used for other types as it uses + rather than add_sum for accumulation, +# and therefore does not return the correct type +Base._sum(A::AbstractArray{T}, dims::Colon, w::AbstractArray{T}) where {T<:BlasReal} = + dot(vec(A), vec(w)) + +# Optimized methods for weighted sum over dimensions with BlasReal # (generic method is defined in base/reducedim.jl) # # _wsum! is specialized for following cases: @@ -75,10 +81,14 @@ function _wsumN!(R::StridedArray{T}, A::DenseArray{T,N}, w::StridedVector{T}, return R end -_wsum!(R::StridedArray{T}, A::DenseArray{T,2}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:BlasReal} = +Base._wsum!(R::StridedArray{T}, A::DenseMatrix{T}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = _wsum2_blas!(view(R,:), A, w, dim, init) -_wsum!(R::StridedArray{T}, A::DenseArray{T}, w::StridedVector{T}, - dim::Int, init::Bool) where {T<:BlasReal} = - _wsumN!(R, A, w, dim, init) \ No newline at end of file +Base._wsum!(R::StridedArray{T}, A::DenseArray{T}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = + _wsumN!(R, A, w, dim, init) + +Base._wsum!(R::StridedVector{T}, A::DenseArray{T}, w::StridedVector{T}, + dim::Int, init::Bool) where {T<:BlasReal} = + Base._wsum1!(R, A, w, init) \ No newline at end of file diff --git a/test/reduce.jl b/test/reduce.jl index 8dfb21a920ce4..d3ba20c4091ef 100644 --- a/test/reduce.jl +++ b/test/reduce.jl @@ -535,20 +535,31 @@ x = [j+7 for j in i] @test sum(x) == 7 @testset "weighted sum" begin + wts = ([1.4, 2.5, 10.1], [1.4f0, 2.5f0, 10.1f0], [0.0, 2.3, 5.6], + [NaN, 2.3, 5.6], [Inf, 2.3, 5.6], + [2, 1, 3], Int8[1, 2, 3], [1, 1, 1]) for a in (rand(3), rand(Int, 3), rand(Int8, 3)) - for wt in ([1.0, 1.0, 1.0], [2, 1, 3], Int8[1, 2, 3], [1.1, 0.2, 0.0]) - res = @inferred sum(a, weights=wt) - expected = sum(a.*wt) - @test res ≈ expected + for w in wts + res = @inferred sum(a, weights=w) + expected = sum(a.*w) + if isfinite(res) + @test res ≈ expected + else + @test isequal(res, expected) + end @test typeof(res) == typeof(expected) end end - for a in (rand(3, 5), rand(Int, 3, 5), rand(Int8, 3, 5)) - for wt in ([1.0, 1.0, 1.0], [2, 1, 3], Int8[1, 2, 3], [1.1, 0.2, 0.0]) - wtr = repeat(wt, outer=(1, 5)) - res = @inferred sum(a, weights=wtr) - expected = sum(a.*wtr) - @test res ≈ expected + for a in (rand(3, 5), rand(Float32, 3, 5), rand(Int, 3, 5), rand(Int8, 3, 5)) + for w in wts + wr = repeat(w, outer=(1, 5)) + res = @inferred sum(a, weights=wr) + expected = sum(a.*wr) + if isfinite(res) + @test res ≈ expected + else + @test isequal(res, expected) + end @test typeof(res) == typeof(expected) end end diff --git a/test/reducedim.jl b/test/reducedim.jl index a381e7942f716..38d8289051963 100644 --- a/test/reducedim.jl +++ b/test/reducedim.jl @@ -399,26 +399,86 @@ end @test_throws DimensionMismatch minimum!(fill(0, 1, 1, 2, 1), B) end -@testset "weighted sum over region" begin - for a in (rand(3, 3, 3), rand(Int, 3, 3, 3), rand(Int8, 3, 3, 3)) - for wt in ([1.0, 1.0, 1.0], [2, 1, 3], Int8[1, 2, 3], [1.1, 0.2, 0.0]) - for (d, rw) in ((1, reshape(wt, :, 1, 1)), - (2, reshape(wt, 1, :, 1)), - (3, reshape(wt, 1, 1, :))) - expected = sum(a.*rw, dims=d) - res = @inferred sum(a, weights=wt, dims=d) +@testset "weighted sum over dimensions" begin + wts = ([1.4, 2.5, 10.1], [1.4f0, 2.5f0, 10.1f0], [0.0, 2.3, 5.6], + [NaN, 2.3, 5.6], [Inf, 2.3, 5.6], + [2, 1, 3], Int8[1, 2, 3], [1, 1, 1]) + + ainf = rand(3) + ainf[1] = Inf + anan = rand(3) + anan[1] = NaN + for a in (rand(3), rand(Float32, 3), ainf, anan, + rand(Int, 3), rand(Int8, 3), + view(rand(5), 2:4)) + for w in wts + if all(isfinite, a) && all(isfinite, w) + expected = sum(a.*w, dims=1) + res = @inferred sum(a, weights=w, dims=1) @test res ≈ expected @test typeof(res) == typeof(expected) x = rand!(similar(expected)) y = copy(x) - @inferred sum!(y, a, weights=wt) + @inferred sum!(y, a, weights=w) @test y ≈ expected y = copy(x) - @inferred sum!(y, a, weights=wt, init=false) + @inferred sum!(y, a, weights=w, init=false) @test y ≈ x + expected + else + expected = sum(a.*w, dims=1) + res = @inferred sum(a, weights=w, dims=1) + @test isfinite.(res) == isfinite.(expected) + @test typeof(res) == typeof(expected) + x = rand!(similar(expected)) + y = copy(x) + @inferred sum!(y, a, weights=w) + @test isfinite.(y) == isfinite.(expected) + y = copy(x) + @inferred sum!(y, a, weights=w, init=false) + @test isfinite.(y) == isfinite.(expected) + end + end + end + + ainf = rand(3, 3, 3) + ainf[1] = Inf + anan = rand(3, 3, 3) + anan[1] = NaN + for a in (rand(3, 3, 3), rand(Float32, 3, 3, 3), ainf, anan, + rand(Int, 3, 3, 3), rand(Int8, 3, 3, 3), + view(rand(3, 3, 5), :, :, 2:4)) + for w in wts + for (d, rw) in ((1, reshape(w, :, 1, 1)), + (2, reshape(w, 1, :, 1)), + (3, reshape(w, 1, 1, :))) + if all(isfinite, a) && all(isfinite, w) + expected = sum(a.*rw, dims=d) + res = @inferred sum(a, weights=w, dims=d) + @test res ≈ expected + @test typeof(res) == typeof(expected) + x = rand!(similar(expected)) + y = copy(x) + @inferred sum!(y, a, weights=w) + @test y ≈ expected + y = copy(x) + @inferred sum!(y, a, weights=w, init=false) + @test y ≈ x + expected + else + expected = sum(a.*rw, dims=d) + res = @inferred sum(a, weights=w, dims=d) + @test isfinite.(res) == isfinite.(expected) + @test typeof(res) == typeof(expected) + x = rand!(similar(expected)) + y = copy(x) + @inferred sum!(y, a, weights=w) + @test isfinite.(y) == isfinite.(expected) + y = copy(x) + @inferred sum!(y, a, weights=w, init=false) + @test isfinite.(y) == isfinite.(expected) + end end - @test_throws DimensionMismatch sum(a, weights=wt, dims=4) + @test_throws DimensionMismatch sum(a, weights=w, dims=4) end end end \ No newline at end of file From d7ae38bd959db9f9e3e0e58d1af1ae4c906793de Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 8 May 2019 13:43:10 +0200 Subject: [PATCH 06/12] Move varcorrection to Statistics.jl --- stdlib/Statistics/src/Statistics.jl | 72 +++++++++++++++++++++++++++++ stdlib/Statistics/src/weights.jl | 72 ----------------------------- 2 files changed, 72 insertions(+), 72 deletions(-) diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl index a61228052adf1..677498bcfd136 100644 --- a/stdlib/Statistics/src/Statistics.jl +++ b/stdlib/Statistics/src/Statistics.jl @@ -368,6 +368,78 @@ function _varm(A::AbstractArray{T}, m, corrected::Bool, dims::Colon, varcorrection(w, corrected) * s end +""" + varcorrection(n::Integer, corrected=false) + +Compute a bias correction factor for calculating `var`, `std` and `cov` with +`n` observations. Returns ``\\frac{1}{n - 1}`` when `corrected=true` +(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), +otherwise returns ``\\frac{1}{n}`` (i.e. no correction). +""" +@inline varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) + +""" + varcorrection(w::Weights, corrected=false) + +Returns ``\\frac{1}{\\sum w}`` when `corrected=false` and throws an `ArgumentError` +if `corrected=true`. +""" +@inline function varcorrection(w::Weights, corrected::Bool=false) + corrected && throw(ArgumentError("Weights type does not support bias correction: " * + "use FrequencyWeights, AnalyticWeights or ProbabilityWeights if applicable.")) + 1 / w.sum +end + +""" + varcorrection(w::AnalyticWeights, corrected=false) + +* `corrected=true`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::AnalyticWeights, corrected::Bool=false) + s = w.sum + + if corrected + sum_sn = sum(x -> (x / s) ^ 2, w) + 1 / (s * (1 - sum_sn)) + else + 1 / s + end +end + +""" + varcorrection(w::FrequencyWeights, corrected=false) + +* `corrected=true`: ``\\frac{1}{\\sum{w} - 1}`` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::FrequencyWeights, corrected::Bool=false) + s = w.sum + + if corrected + 1 / (s - 1) + else + 1 / s + end +end + +""" + varcorrection(w::ProbabilityWeights, corrected=false) + +* `corrected=true`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false) + s = w.sum + + if corrected + n = count(!iszero, w) + n / (s * (n - 1)) + else + 1 / s + end +end + """ var(itr; corrected::Bool=true, [weights::AbstractWeights], [mean], [dims]) diff --git a/stdlib/Statistics/src/weights.jl b/stdlib/Statistics/src/weights.jl index ac8b01c4a89e1..7ed9d49ea94d7 100644 --- a/stdlib/Statistics/src/weights.jl +++ b/stdlib/Statistics/src/weights.jl @@ -50,16 +50,6 @@ Base.@propagate_inbounds function Base.setindex!(wv::AbstractWeights, v::Real, i v end -""" - varcorrection(n::Integer, corrected=false) - -Compute a bias correction factor for calculating `var`, `std` and `cov` with -`n` observations. Returns ``\\frac{1}{n - 1}`` when `corrected=true` -(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), -otherwise returns ``\\frac{1}{n}`` (i.e. no correction). -""" -@inline varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) - @weights Weights @doc """ @@ -85,18 +75,6 @@ See the documentation for [`Weights`](@ref) for more details. weights(vs::AbstractVector{<:Real}) = Weights(vs) weights(vs::AbstractArray{<:Real}) = Weights(vec(vs)) -""" - varcorrection(w::Weights, corrected=false) - -Returns ``\\frac{1}{\\sum w}`` when `corrected=false` and throws an `ArgumentError` -if `corrected=true`. -""" -@inline function varcorrection(w::Weights, corrected::Bool=false) - corrected && throw(ArgumentError("Weights type does not support bias correction: " * - "use FrequencyWeights, AnalyticWeights or ProbabilityWeights if applicable.")) - 1 / w.sum -end - @weights AnalyticWeights @doc """ @@ -126,23 +104,6 @@ See the documentation for [`AnalyticWeights`](@ref) for more details. aweights(vs::AbstractVector{<:Real}) = AnalyticWeights(vs) aweights(vs::AbstractArray{<:Real}) = AnalyticWeights(vec(vs)) -""" - varcorrection(w::AnalyticWeights, corrected=false) - -* `corrected=true`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` -* `corrected=false`: ``\\frac{1}{\\sum w}`` -""" -@inline function varcorrection(w::AnalyticWeights, corrected::Bool=false) - s = w.sum - - if corrected - sum_sn = sum(x -> (x / s) ^ 2, w) - 1 / (s * (1 - sum_sn)) - else - 1 / s - end -end - @weights FrequencyWeights @doc """ @@ -170,22 +131,6 @@ See the documentation for [`FrequencyWeights`](@ref) for more details. fweights(vs::AbstractVector{<:Real}) = FrequencyWeights(vs) fweights(vs::AbstractArray{<:Real}) = FrequencyWeights(vec(vs)) -""" - varcorrection(w::FrequencyWeights, corrected=false) - -* `corrected=true`: ``\\frac{1}{\\sum{w} - 1}`` -* `corrected=false`: ``\\frac{1}{\\sum w}`` -""" -@inline function varcorrection(w::FrequencyWeights, corrected::Bool=false) - s = w.sum - - if corrected - 1 / (s - 1) - else - 1 / s - end -end - @weights ProbabilityWeights @doc """ @@ -214,23 +159,6 @@ See the documentation for [`ProbabilityWeights`](@ref) for more details. pweights(vs::AbstractVector{<:Real}) = ProbabilityWeights(vs) pweights(vs::AbstractArray{<:Real}) = ProbabilityWeights(vec(vs)) -""" - varcorrection(w::ProbabilityWeights, corrected=false) - -* `corrected=true`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` -* `corrected=false`: ``\\frac{1}{\\sum w}`` -""" -@inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false) - s = w.sum - - if corrected - n = count(!iszero, w) - n / (s * (n - 1)) - else - 1 / s - end -end - ##### Equality tests ##### for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) From 46187235ea7ba8836039009936ba99b2b192b9f0 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 8 May 2019 13:54:15 +0200 Subject: [PATCH 07/12] Docs --- base/reducedim.jl | 28 ++++++++++++++++++++++--- stdlib/Statistics/src/Statistics.jl | 20 ++++++++++-------- stdlib/Statistics/src/moments.jl | 8 ++++---- stdlib/Statistics/src/weights.jl | 32 ++++++++++++++--------------- 4 files changed, 57 insertions(+), 31 deletions(-) diff --git a/base/reducedim.jl b/base/reducedim.jl index 28a0a170eab11..065884e959061 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -353,9 +353,19 @@ reduce(op, A::AbstractArray; kw...) = mapreduce(identity, op, A; kw...) ##### Specific reduction functions ##### """ - sum(A::AbstractArray; dims) + sum(A::AbstractArray; [dims], [weights::AbstractArray]) -Sum elements of an array over the given dimensions. +Compute the sum of array `A`. +If `dims` is provided, return an array of sums over these dimensions. +If `weights` is provided, return the weighted sum(s). `weights` must be +either an array of the same size as `A` if `dims` is omitted, +or a vector with the same length as `size(A, dims)` if `dims` is provided. + +!!! compat "Julia 1.1" + `mean` for empty arrays requires at least Julia 1.1. + +!!! compat "Julia 1.3" + The `weights` keyword argument requires at least Julia 1.3. # Examples ```jldoctest @@ -372,14 +382,26 @@ julia> sum(A, dims=2) 2×1 Array{Int64,2}: 3 7 + +julia> sum(A, weights=[2 1; 2 1]) +14 + +julia> sum(A, weights=[2, 1], dims=1) +1×2 Array{Int64,2}: + 5 8 ``` """ sum(A::AbstractArray; dims) """ - sum!(r, A) + sum!(r, A; [weights::AbstractVector]) Sum elements of `A` over the singleton dimensions of `r`, and write results to `r`. +If `r` has only one singleton dimension `i`, `weights` can be a vector of length +`size(v, i)` to compute the weighted mean. + +!!! compat "Julia 1.3" + The `weights` keyword argument requires at least Julia 1.3. # Examples ```jldoctest diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl index 677498bcfd136..044f5aba67dc2 100644 --- a/stdlib/Statistics/src/Statistics.jl +++ b/stdlib/Statistics/src/Statistics.jl @@ -107,10 +107,14 @@ _mean(f, A::AbstractArray, ::Colon) = sum(f, A) / length(A) _mean(f, A::AbstractArray, dims) = sum(f, A, dims=dims) / mapreduce(i -> size(A, i), *, unique(dims); init=1) """ - mean!(r, v; weights::AbstractArray) + mean!(r, v; [weights::AbstractVector]) Compute the mean of `v` over the singleton dimensions of `r`, and write results to `r`. -Weights can be provided via an array of the same size as `A`. +If `r` has only one singleton dimension `i`, `weights` can be a vector of length +`size(v, i)` to compute the weighted mean. + +!!! compat "Julia 1.3" + The `weights` argument requires at least Julia 1.3. # Examples ```jldoctest @@ -155,8 +159,8 @@ or a vector with the same length as `size(A, dims)` if `dims` is provided. !!! compat "Julia 1.1" `mean` for empty arrays requires at least Julia 1.1. -!!! compat "Julia 1.2" - The `weights` keyword argument requires at least Julia 1.2. +!!! compat "Julia 1.3" + The `weights` keyword argument requires at least Julia 1.3. # Examples ```jldoctest @@ -575,8 +579,8 @@ weights used: Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the standard deviation of non-missing values. -!!! compat "Julia 1.2" - The `weights` keyword argument requires at least Julia 1.2. +!!! compat "Julia 1.3" + The `weights` keyword argument requires at least Julia 1.3. """ std(A::AbstractArray; corrected::Bool=true, mean=nothing, dims=:, @@ -987,8 +991,8 @@ at the same time. See the documentation for [`quantile`](@ref) for more details. -!!! compat "Julia 1.2" - The `weights` keyword argument requires at least Julia 1.2. +!!! compat "Julia 1.3" + The `weights` keyword argument requires at least Julia 1.3c. # Examples ```jldoctest diff --git a/stdlib/Statistics/src/moments.jl b/stdlib/Statistics/src/moments.jl index a0f99ea388dd2..cc5a3a43cb1fa 100644 --- a/stdlib/Statistics/src/moments.jl +++ b/stdlib/Statistics/src/moments.jl @@ -51,8 +51,8 @@ specifying a pre-computed `mean`. If `x` is an `AbstractArray`, a `weights` array of the same length as `x` can be specified to compute the weighted skewness. -!!! compat "Julia 1.2" - This function requires at least Julia 1.2. +!!! compat "Julia 1.3" + This function requires at least Julia 1.3. """ skewness(A; mean::Union{Real, Nothing}=nothing) = _skewness(A, nothing, mean) @@ -125,8 +125,8 @@ specifying a pre-computed `mean`. If `x` is an `AbstractArray`, a `weights` array of the same length as `x` can be specified to compute the weighted kurtosis. -!!! compat "Julia 1.2" - This function requires at least Julia 1.2. +!!! compat "Julia 1.3" + This function requires at least Julia 1.3. """ kurtosis(A; mean::Union{Real, Nothing}=nothing) = _kurtosis(A, nothing, mean) diff --git a/stdlib/Statistics/src/weights.jl b/stdlib/Statistics/src/weights.jl index 7ed9d49ea94d7..8f5f2f0f3e457 100644 --- a/stdlib/Statistics/src/weights.jl +++ b/stdlib/Statistics/src/weights.jl @@ -13,8 +13,8 @@ Concrete `AbstractWeights` type indicates what correction has to be applied when computing statistics which depend on the meaning of weights. -!!! compat "Julia 1.2" - This type requires at least Julia 1.2. +!!! compat "Julia 1.3" + This type requires at least Julia 1.3. """ abstract type AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} end @@ -62,8 +62,8 @@ The `Weights` type describes a generic weights vector which does not support all operations possible for [`FrequencyWeights`](@ref), [`AnalyticWeights`](@ref) and [`ProbabilityWeights`](@ref). -!!! compat "Julia 1.2" - This type requires at least Julia 1.2. +!!! compat "Julia 1.3" + This type requires at least Julia 1.3. """ Weights """ @@ -88,8 +88,8 @@ for each observation. These weights may also be referred to as reliability weigh precision weights or inverse variance weights. These are typically used when the observations being weighted are aggregate values (e.g., averages) with differing variances. -!!! compat "Julia 1.2" - This type requires at least Julia 1.2. +!!! compat "Julia 1.3" + This type requires at least Julia 1.3. """ AnalyticWeights """ @@ -98,8 +98,8 @@ being weighted are aggregate values (e.g., averages) with differing variances. Construct an `AnalyticWeights` vector from array `vs`. See the documentation for [`AnalyticWeights`](@ref) for more details. -!!! compat "Julia 1.2" - This function requires at least Julia 1.2. +!!! compat "Julia 1.3" + This function requires at least Julia 1.3. """ aweights(vs::AbstractVector{<:Real}) = AnalyticWeights(vs) aweights(vs::AbstractArray{<:Real}) = AnalyticWeights(vec(vs)) @@ -115,8 +115,8 @@ A precomputed sum may be provided as `wsum`. Frequency weights describe the number of times (or frequency) each observation was observed. These weights may also be referred to as case weights or repeat weights. -!!! compat "Julia 1.2" - This type requires at least Julia 1.2. +!!! compat "Julia 1.3" + This type requires at least Julia 1.3. """ FrequencyWeights """ @@ -125,8 +125,8 @@ was observed. These weights may also be referred to as case weights or repeat we Construct a `FrequencyWeights` vector from a given array. See the documentation for [`FrequencyWeights`](@ref) for more details. -!!! compat "Julia 1.2" - This function requires at least Julia 1.2. +!!! compat "Julia 1.3" + This function requires at least Julia 1.3. """ fweights(vs::AbstractVector{<:Real}) = FrequencyWeights(vs) fweights(vs::AbstractArray{<:Real}) = FrequencyWeights(vec(vs)) @@ -143,8 +143,8 @@ Probability weights represent the inverse of the sampling probability for each o providing a correction mechanism for under- or over-sampling certain population groups. These weights may also be referred to as sampling weights. -!!! compat "Julia 1.2" - This type requires at least Julia 1.2. +!!! compat "Julia 1.3" + This type requires at least Julia 1.3. """ ProbabilityWeights """ @@ -153,8 +153,8 @@ These weights may also be referred to as sampling weights. Construct a `ProbabilityWeights` vector from a given array. See the documentation for [`ProbabilityWeights`](@ref) for more details. -!!! compat "Julia 1.2" - This function requires at least Julia 1.2. +!!! compat "Julia 1.3" + This function requires at least Julia 1.3. """ pweights(vs::AbstractVector{<:Real}) = ProbabilityWeights(vs) pweights(vs::AbstractArray{<:Real}) = ProbabilityWeights(vec(vs)) From d62739337f7d39adad34e6a096d25d196aa737c8 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 8 May 2019 16:04:14 +0200 Subject: [PATCH 08/12] Performance fix --- base/reducedim.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/base/reducedim.jl b/base/reducedim.jl index 065884e959061..97b70b6d6fcd4 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -724,8 +724,9 @@ sum!(f::Function, r::AbstractArray, A::AbstractArray; _sum!(f, r, A, weights; init=init) _sum!(f, r::AbstractArray, A::AbstractArray, ::Nothing; init::Bool=true) = mapreducedim!(f, add_sum, initarray!(r, add_sum, init, A), A) -_sum(A, dims, weights) = _sum(identity, A, dims, weights) _sum(f, A, dims, ::Nothing) = mapreduce(f, add_sum, A, dims=dims) +_sum(A::AbstractArray, dims, w::AbstractArray) = + _sum!(identity, reducedim_init(t -> t*zero(eltype(w)), add_sum, A, dims), A, w) # Weighted sum @@ -807,7 +808,7 @@ function _wsum_general!(R::AbstractArray{S}, A::AbstractArray, w::AbstractVector for IA in CartesianIndices(indsAt) IR = Broadcast.newindex(IA, keep, Idefault) r = R[i1,IR] - @simd for i in axes(A, 1) + @inbounds @simd for i in axes(A, 1) r += A[i,IA] * w[dim > 1 ? IA[dim-1] : i] end R[i1,IR] = r @@ -815,7 +816,7 @@ function _wsum_general!(R::AbstractArray{S}, A::AbstractArray, w::AbstractVector else for IA in CartesianIndices(indsAt) IR = Broadcast.newindex(IA, keep, Idefault) - @simd for i in axes(A, 1) + @inbounds @simd for i in axes(A, 1) R[i,IR] += A[i,IA] * w[dim > 1 ? IA[dim-1] : i] end end @@ -848,9 +849,6 @@ function _sum!(::typeof(identity), R::AbstractArray, A::AbstractArray{T,N}, w::A _wsum!(R, A, w, dim, init) end -_sum(A::AbstractArray, dims, w::AbstractArray) = - _sum!(identity, reducedim_init(t -> t*zero(eltype(w)), add_sum, A, dims), A, w) - ##### findmin & findmax ##### # The initial values of Rval are not used if the corresponding indices in Rind are 0. # From 7f364f4dcc329e34314199372cbfb9b5d55a0fcf Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 8 May 2019 16:44:09 +0200 Subject: [PATCH 09/12] Cleanup --- base/reducedim.jl | 13 ++++++++++--- stdlib/Statistics/src/Statistics.jl | 4 ++-- test/reduce.jl | 4 ++++ test/reducedim.jl | 3 +++ 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/base/reducedim.jl b/base/reducedim.jl index 97b70b6d6fcd4..27c5e05e40c46 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -724,9 +724,12 @@ sum!(f::Function, r::AbstractArray, A::AbstractArray; _sum!(f, r, A, weights; init=init) _sum!(f, r::AbstractArray, A::AbstractArray, ::Nothing; init::Bool=true) = mapreducedim!(f, add_sum, initarray!(r, add_sum, init, A), A) -_sum(f, A, dims, ::Nothing) = mapreduce(f, add_sum, A, dims=dims) -_sum(A::AbstractArray, dims, w::AbstractArray) = +_sum(A::AbstractArray, dims, weights) = _sum(identity, A, dims, weights) +_sum(f, A::AbstractArray, dims, ::Nothing) = mapreduce(f, add_sum, A, dims=dims) +_sum(::typeof(identity), A::AbstractArray, dims, w::AbstractArray) = _sum!(identity, reducedim_init(t -> t*zero(eltype(w)), add_sum, A, dims), A, w) +_sum(f, A::AbstractArray, dims, w::AbstractArray) = + throw(ArgumentError("Passing a function is not supported with `weights`")) # Weighted sum @@ -832,8 +835,11 @@ _wsum!(R::AbstractArray, A::AbstractArray, w::AbstractVector, dim::Int, init::Bool) = _wsum_general!(R, A, w, dim, init) -function _sum!(::typeof(identity), R::AbstractArray, A::AbstractArray{T,N}, w::AbstractVector; +function _sum!(f, R::AbstractArray, A::AbstractArray{T,N}, w::AbstractArray; init::Bool=true) where {T,N} + f === identity || throw(ArgumentError("Passing a function is not supported with `weights`")) + w isa AbstractVector || throw(ArgumentError("Only vector `weights` are supported")) + check_reducedims(R,A) reddims = size(R) .!= size(A) dim = something(findfirst(reddims), ndims(R)+1) @@ -849,6 +855,7 @@ function _sum!(::typeof(identity), R::AbstractArray, A::AbstractArray{T,N}, w::A _wsum!(R, A, w, dim, init) end + ##### findmin & findmax ##### # The initial values of Rval are not used if the corresponding indices in Rind are 0. # diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl index 044f5aba67dc2..f3756e1b0a01d 100644 --- a/stdlib/Statistics/src/Statistics.jl +++ b/stdlib/Statistics/src/Statistics.jl @@ -195,14 +195,14 @@ function _mean(r::AbstractRange{<:Real}, dims::Colon, weights::Nothing) end _mean(A::AbstractArray, dims, weights::Nothing) = - _mean!(Base.reducedim_init(t -> t/2, +, A, dims), A, nothing) + _mean!(Base.reducedim_init(t -> t/2, Base.add_sum, A, dims), A, nothing) _mean(A::AbstractArray, dims::Colon, weights::Nothing) = sum(A) / length(A) _mean(A::AbstractArray, dims::Colon, w::AbstractArray) = sum(A, weights=w) / sum(w) _mean(A::AbstractArray, dims, w::AbstractArray) = - _mean!(Base.reducedim_init(t -> (t*zero(eltype(w)))/2, +, A, dims), A, w) + _mean!(Base.reducedim_init(t -> (t*zero(eltype(w)))/2, Base.add_sum, A, dims), A, w) ##### variances ##### diff --git a/test/reduce.jl b/test/reduce.jl index d3ba20c4091ef..9b89e8f6741b6 100644 --- a/test/reduce.jl +++ b/test/reduce.jl @@ -563,4 +563,8 @@ x = [j+7 for j in i] @test typeof(res) == typeof(expected) end end + + @test_throws ArgumentError sum(exp, [1], weights=[1]) + @test_throws ArgumentError sum!(exp, [0 0], [1 2], weights=[1, 10]) + @test_throws ArgumentError sum!([0 0], [1 2], weights=[1 10]) end \ No newline at end of file diff --git a/test/reducedim.jl b/test/reducedim.jl index 38d8289051963..a90c2c1f96afb 100644 --- a/test/reducedim.jl +++ b/test/reducedim.jl @@ -481,4 +481,7 @@ end @test_throws DimensionMismatch sum(a, weights=w, dims=4) end end + + @test_throws ArgumentError sum(exp, [1 2], weights=[1, 10], dims=1) + @test_throws ArgumentError sum([1 2], weights=[1 10], dims=1) end \ No newline at end of file From f51d8db2157773bebc3caff52a5b9478de61eec4 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 8 May 2019 16:56:25 +0200 Subject: [PATCH 10/12] Fix bug --- base/reducedim.jl | 6 ++++++ test/reducedim.jl | 3 +++ 2 files changed, 9 insertions(+) diff --git a/base/reducedim.jl b/base/reducedim.jl index 27c5e05e40c46..27c1162e7fedb 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -843,6 +843,12 @@ function _sum!(f, R::AbstractArray, A::AbstractArray{T,N}, w::AbstractArray; check_reducedims(R,A) reddims = size(R) .!= size(A) dim = something(findfirst(reddims), ndims(R)+1) + if dim > N + dim1 = findfirst(==(1), size(A)) + if dim1 !== nothing + dim = dim1 + end + end if findnext(reddims, dim+1) !== nothing throw(ArgumentError("reducing over more than one dimension is not supported with weights")) end diff --git a/test/reducedim.jl b/test/reducedim.jl index a90c2c1f96afb..c8c030d3704ce 100644 --- a/test/reducedim.jl +++ b/test/reducedim.jl @@ -482,6 +482,9 @@ end end end + # Corner case with a single row + @test sum([1 2], weights=[2], dims=1) == [2 4] + @test_throws ArgumentError sum(exp, [1 2], weights=[1, 10], dims=1) @test_throws ArgumentError sum([1 2], weights=[1 10], dims=1) end \ No newline at end of file From 1f7b3d9ee87c2ef9ceee09c2c0284021cb9ceb2b Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 8 May 2019 17:13:42 +0200 Subject: [PATCH 11/12] Fix TODO --- stdlib/Statistics/src/Statistics.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl index f3756e1b0a01d..2ceb105a44d28 100644 --- a/stdlib/Statistics/src/Statistics.jl +++ b/stdlib/Statistics/src/Statistics.jl @@ -351,9 +351,12 @@ varm(A::AbstractArray, m; corrected::Bool=true, dims=:, varm(iterable, m; corrected::Bool=true) = _var(iterable, corrected, m) -# TODO: choose element type based on weights type too? -_varm(A::AbstractArray, m, corrected::Bool, dims, w::Union{AbstractWeights, Nothing}) = - varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, dims), A, m, w; corrected=corrected) +_varm(A::AbstractArray, m, corrected::Bool, dims, w::Nothing) = + varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, dims), A, m, w, corrected=corrected) + +_varm(A::AbstractArray, m, corrected::Bool, dims, w::AbstractWeights{T}) where {T<:Real} = + varm!(Base.reducedim_init(t -> (abs2(t)*zero(T))/2, +, A, dims), A, m, w, + corrected=corrected) function _varm(A::AbstractArray{T}, m, corrected::Bool, dims::Colon, w::Nothing) where T n = length(A) From d5e0135bc95263e1bd00a336a93954039f0e4cc3 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 8 May 2019 17:22:00 +0200 Subject: [PATCH 12/12] Another cleanup, enable more tests --- base/reducedim.jl | 2 +- stdlib/Statistics/src/Statistics.jl | 6 +++++ stdlib/Statistics/src/moments.jl | 41 ----------------------------- stdlib/Statistics/src/weights.jl | 2 -- stdlib/Statistics/test/runtests.jl | 3 ++- stdlib/Statistics/test/weights.jl | 7 ----- 6 files changed, 9 insertions(+), 52 deletions(-) diff --git a/base/reducedim.jl b/base/reducedim.jl index 27c1162e7fedb..251c5a83074b7 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -1061,4 +1061,4 @@ julia> argmax(A, dims=2) CartesianIndex(2, 2) ``` """ -argmax(A::AbstractArray; dims=:) = findmax(A; dims=dims)[2] \ No newline at end of file +argmax(A::AbstractArray; dims=:) = findmax(A; dims=dims)[2] diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl index 2ceb105a44d28..5c4863a258e89 100644 --- a/stdlib/Statistics/src/Statistics.jl +++ b/stdlib/Statistics/src/Statistics.jl @@ -322,6 +322,12 @@ function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray, w::Nothi return R end +function varm!(R::AbstractArray, A::AbstractArray, m::AbstractArray, w::AbstractArray; + corrected::Bool=true) + rmul!(centralize_sumabs2!(R, A, m, values(w)), + varcorrection(w, corrected)) +end + """ varm(itr, m; dims, corrected::Bool=true) diff --git a/stdlib/Statistics/src/moments.jl b/stdlib/Statistics/src/moments.jl index cc5a3a43cb1fa..a1fd0a8511b24 100644 --- a/stdlib/Statistics/src/moments.jl +++ b/stdlib/Statistics/src/moments.jl @@ -1,44 +1,3 @@ -##### Weighted var & std - -## var - -## var along dim - -function varm!(R::AbstractArray, A::AbstractArray, M::AbstractArray, w::AbstractArray; - corrected::Bool=true) - rmul!(centralize_sumabs2!(R, A, M, values(w)), - varcorrection(w, corrected)) -end - -function var!(R::AbstractArray, A::AbstractArray, w::AbstractArray, dims::Int; - mean=nothing, corrected::Bool=true) - if mean == 0 - varm!(R, A, w, Base.reducedim_initarray(A, dims, 0, eltype(R)), dims; - corrected=corrected) - elseif mean == nothing - varm!(R, A, w, Statistics.mean(A, w, dims=dims), dims; corrected=corrected) - else - # check size of mean - for i = 1:ndims(A) - dA = size(A,i) - dM = size(mean,i) - if i == dims - dM == 1 || throw(DimensionMismatch("Incorrect size of mean.")) - else - dM == dA || throw(DimensionMismatch("Incorrect size of mean.")) - end - end - varm!(R, A, w, mean, dims; corrected=corrected) - end -end - -function var(A::AbstractArray, w::AbstractArray, dim::Int; - mean=nothing, corrected::Bool=true) - corrected = depcheck(:var, corrected) - var!(similar(A, Float64, Base.reduced_indices(axes(A), dim)), A, w, dim; - mean=mean, corrected=corrected) -end - ##### Skewness and Kurtosis # Skewness diff --git a/stdlib/Statistics/src/weights.jl b/stdlib/Statistics/src/weights.jl index 8f5f2f0f3e457..7497a410d3206 100644 --- a/stdlib/Statistics/src/weights.jl +++ b/stdlib/Statistics/src/weights.jl @@ -1,5 +1,3 @@ -using LinearAlgebra: BlasReal - ###### Weights array ##### """ diff --git a/stdlib/Statistics/test/runtests.jl b/stdlib/Statistics/test/runtests.jl index 5cb5308a73550..68f0a0aed14dd 100644 --- a/stdlib/Statistics/test/runtests.jl +++ b/stdlib/Statistics/test/runtests.jl @@ -721,4 +721,5 @@ end end end -include("weights.jl") \ No newline at end of file +include("weights.jl") +include("moments.jl") \ No newline at end of file diff --git a/stdlib/Statistics/test/weights.jl b/stdlib/Statistics/test/weights.jl index 617ae6d8e4251..5b7cc110570cd 100644 --- a/stdlib/Statistics/test/weights.jl +++ b/stdlib/Statistics/test/weights.jl @@ -28,13 +28,6 @@ weight_funcs = (weights, aweights, fweights, pweights) @test values(bv) === b @test sum(bv) === 3 @test !isempty(bv) - - ba = BitArray([true, false, true]) - sa = sparsevec([1., 0., 2.]) - - # TODO: keep? - #@test sum(ba, wv) === 4.0 - #@test sum(sa, wv) === 7.0 end @testset "$f, setindex!" for f in weight_funcs