From 4af59129cfb00ec3a0619974d038960f2e42cfb0 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 25 Nov 2019 15:34:28 +0100 Subject: [PATCH] Move Statistics stdlib module to external repository (#33399) --- .../md5 | 1 + .../sha512 | 1 + stdlib/Makefile | 6 +- stdlib/Statistics.version | 2 + stdlib/Statistics/Project.toml | 13 - stdlib/Statistics/docs/src/index.md | 22 - stdlib/Statistics/src/Statistics.jl | 1057 ----------------- stdlib/Statistics/test/runtests.jl | 738 ------------ 8 files changed, 8 insertions(+), 1832 deletions(-) create mode 100644 deps/checksums/Statistics-a2203d3b67f7413701be5de251622cb85c9cc69d.tar.gz/md5 create mode 100644 deps/checksums/Statistics-a2203d3b67f7413701be5de251622cb85c9cc69d.tar.gz/sha512 create mode 100644 stdlib/Statistics.version delete mode 100644 stdlib/Statistics/Project.toml delete mode 100644 stdlib/Statistics/docs/src/index.md delete mode 100644 stdlib/Statistics/src/Statistics.jl delete mode 100644 stdlib/Statistics/test/runtests.jl diff --git a/deps/checksums/Statistics-a2203d3b67f7413701be5de251622cb85c9cc69d.tar.gz/md5 b/deps/checksums/Statistics-a2203d3b67f7413701be5de251622cb85c9cc69d.tar.gz/md5 new file mode 100644 index 00000000000000..be4e3dff40ea9c --- /dev/null +++ b/deps/checksums/Statistics-a2203d3b67f7413701be5de251622cb85c9cc69d.tar.gz/md5 @@ -0,0 +1 @@ +dcb3084255e7072157f8efb820988bb9 diff --git a/deps/checksums/Statistics-a2203d3b67f7413701be5de251622cb85c9cc69d.tar.gz/sha512 b/deps/checksums/Statistics-a2203d3b67f7413701be5de251622cb85c9cc69d.tar.gz/sha512 new file mode 100644 index 00000000000000..d629a20e5e56ef --- /dev/null +++ b/deps/checksums/Statistics-a2203d3b67f7413701be5de251622cb85c9cc69d.tar.gz/sha512 @@ -0,0 +1 @@ +5f4d0a25915f066bed0c78270af61d63ec60514d5a3b9b08a2250fbbe2aea7046c730263e28dd257b9686d930d6dee814c0aa141b3d532a25a78fd7c58c0587f diff --git a/stdlib/Makefile b/stdlib/Makefile index a08a672b22422d..8699c2c1139002 100644 --- a/stdlib/Makefile +++ b/stdlib/Makefile @@ -17,10 +17,12 @@ $(build_datarootdir)/julia/stdlib/$(VERSDIR): STDLIBS = Base64 CRC32c Dates DelimitedFiles Distributed FileWatching \ Future InteractiveUtils Libdl LibGit2 LinearAlgebra Logging \ Markdown Mmap Printf Profile Random REPL Serialization SHA \ - SharedArrays Sockets SparseArrays Statistics SuiteSparse Test Unicode UUIDs -STDLIBS_EXT = Pkg + SharedArrays Sockets SparseArrays SuiteSparse Test Unicode UUIDs +STDLIBS_EXT = Pkg Statistics PKG_GIT_URL := git://github.com/JuliaLang/Pkg.jl.git PKG_TAR_URL = https://api.github.com/repos/JuliaLang/Pkg.jl/tarball/$1 +STATISTICS_GIT_URL := git://github.com/JuliaLang/Statistics.jl.git +STATISTICS_TAR_URL = https://api.github.com/repos/JuliaLang/Statistics.jl/tarball/$1 $(foreach module, $(STDLIBS_EXT), $(eval $(call stdlib-external,$(module),$(shell echo $(module) | tr a-z A-Z)))) diff --git a/stdlib/Statistics.version b/stdlib/Statistics.version new file mode 100644 index 00000000000000..4387cc84784700 --- /dev/null +++ b/stdlib/Statistics.version @@ -0,0 +1,2 @@ +STATISTICS_BRANCH = master +STATISTICS_SHA1 = a2203d3b67f7413701be5de251622cb85c9cc69d diff --git a/stdlib/Statistics/Project.toml b/stdlib/Statistics/Project.toml deleted file mode 100644 index 12c967736bbfbb..00000000000000 --- a/stdlib/Statistics/Project.toml +++ /dev/null @@ -1,13 +0,0 @@ -name = "Statistics" -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - -[deps] -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[extras] -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[targets] -test = ["Random", "Test"] diff --git a/stdlib/Statistics/docs/src/index.md b/stdlib/Statistics/docs/src/index.md deleted file mode 100644 index 25cdf29216a888..00000000000000 --- a/stdlib/Statistics/docs/src/index.md +++ /dev/null @@ -1,22 +0,0 @@ -# Statistics - -The Statistics module contains basic statistics functionality. - -!!! note - To use any of the examples described below, run `using Statistics` and then the code from the example. - -```@docs -Statistics.std -Statistics.stdm -Statistics.var -Statistics.varm -Statistics.cor -Statistics.cov -Statistics.mean! -Statistics.mean -Statistics.median! -Statistics.median -Statistics.middle -Statistics.quantile! -Statistics.quantile -``` diff --git a/stdlib/Statistics/src/Statistics.jl b/stdlib/Statistics/src/Statistics.jl deleted file mode 100644 index 07151e4784561d..00000000000000 --- a/stdlib/Statistics/src/Statistics.jl +++ /dev/null @@ -1,1057 +0,0 @@ -# This file is a part of Julia. License is MIT: https://julialang.org/license - -""" - Statistics - -Standard library module for basic statistics functionality. -""" -module Statistics - -using LinearAlgebra, SparseArrays - -using Base: has_offset_axes, require_one_based_indexing - -export cor, cov, std, stdm, var, varm, mean!, mean, - median!, median, middle, quantile!, quantile - -##### mean ##### - -""" - mean(itr) - -Compute the mean of all elements in a collection. - -!!! note - If `itr` contains `NaN` or [`missing`](@ref) values, the result is also - `NaN` or `missing` (`missing` takes precedence if array contains both). - Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the - mean of non-missing values. - -# Examples -```jldoctest -julia> mean(1:20) -10.5 - -julia> mean([1, missing, 3]) -missing - -julia> mean(skipmissing([1, missing, 3])) -2.0 -``` -""" -mean(itr) = mean(identity, itr) - -""" - mean(f::Function, itr) - -Apply the function `f` to each element of collection `itr` and take the mean. - -```jldoctest -julia> mean(√, [1, 2, 3]) -1.3820881233139908 - -julia> mean([√1, √2, √3]) -1.3820881233139908 -``` -""" -function mean(f, itr) - y = iterate(itr) - if y === nothing - return Base.mapreduce_empty_iter(f, Base.add_sum, itr, - Base.IteratorEltype(itr)) / 0 - end - count = 1 - value, state = y - f_value = f(value) - total = Base.reduce_first(Base.add_sum, f_value) - y = iterate(itr, state) - while y !== nothing - value, state = y - total += f(value) - count += 1 - y = iterate(itr, state) - end - return total/count -end - -""" - mean(f::Function, A::AbstractArray; dims) - -Apply the function `f` to each element of array `A` and take the mean over dimensions `dims`. - -!!! compat "Julia 1.3" - This method requires at least Julia 1.3. - -```jldoctest -julia> mean(√, [1, 2, 3]) -1.3820881233139908 - -julia> mean([√1, √2, √3]) -1.3820881233139908 - -julia> mean(√, [1 2 3; 4 5 6], dims=2) -2×1 Array{Float64,2}: - 1.3820881233139908 - 2.2285192400943226 -``` -""" -mean(f, A::AbstractArray; dims=:) = _mean(f, A, dims) - -_mean(f, A::AbstractArray, ::Colon) = sum(f, A) / length(A) -_mean(f, A::AbstractArray, dims) = sum(f, A, dims=dims) / mapreduce(i -> size(A, i), *, unique(dims); init=1) - -""" - mean!(r, v) - -Compute the mean of `v` over the singleton dimensions of `r`, and write results to `r`. - -# Examples -```jldoctest -julia> v = [1 2; 3 4] -2×2 Array{Int64,2}: - 1 2 - 3 4 - -julia> mean!([1., 1.], v) -2-element Array{Float64,1}: - 1.5 - 3.5 - -julia> mean!([1. 1.], v) -1×2 Array{Float64,2}: - 2.0 3.0 -``` -""" -function mean!(R::AbstractArray, A::AbstractArray) - sum!(R, A; init=true) - x = max(1, length(R)) // length(A) - R .= R .* x - return R -end - -""" - mean(A::AbstractArray; dims) - -Compute the mean of an array over the given dimensions. - -!!! compat "Julia 1.1" - `mean` for empty arrays requires at least Julia 1.1. - -# Examples -```jldoctest -julia> A = [1 2; 3 4] -2×2 Array{Int64,2}: - 1 2 - 3 4 - -julia> mean(A, dims=1) -1×2 Array{Float64,2}: - 2.0 3.0 - -julia> mean(A, dims=2) -2×1 Array{Float64,2}: - 1.5 - 3.5 -``` -""" -mean(A::AbstractArray; dims=:) = _mean(A, dims) - -_mean(A::AbstractArray{T}, region) where {T} = mean!(Base.reducedim_init(t -> t/2, +, A, region), A) -_mean(A::AbstractArray, ::Colon) = sum(A) / length(A) - -function mean(r::AbstractRange{<:Real}) - isempty(r) && return oftype((first(r) + last(r)) / 2, NaN) - (first(r) + last(r)) / 2 -end - -median(r::AbstractRange{<:Real}) = mean(r) - -##### variances ##### - -# faster computation of real(conj(x)*y) -realXcY(x::Real, y::Real) = x*y -realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y) - -var(iterable; corrected::Bool=true, mean=nothing) = _var(iterable, corrected, mean) - -function _var(iterable, corrected::Bool, mean) - y = iterate(iterable) - if y === nothing - T = eltype(iterable) - return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) - end - count = 1 - value, state = y - y = iterate(iterable, state) - if mean === nothing - # Use Welford algorithm as seen in (among other places) - # Knuth's TAOCP, Vol 2, page 232, 3rd edition. - M = value / 1 - S = real(zero(M)) - while y !== nothing - value, state = y - y = iterate(iterable, state) - count += 1 - new_M = M + (value - M) / count - S = S + realXcY(value - M, value - new_M) - M = new_M - end - return S / (count - Int(corrected)) - elseif isa(mean, Number) # mean provided - # Cannot use a compensated version, e.g. the one from - # "Updating Formulae and a Pairwise Algorithm for Computing Sample Variances." - # by Chan, Golub, and LeVeque, Technical Report STAN-CS-79-773, - # Department of Computer Science, Stanford University, - # because user can provide mean value that is different to mean(iterable) - sum2 = abs2(value - mean::Number) - while y !== nothing - value, state = y - y = iterate(iterable, state) - count += 1 - sum2 += abs2(value - mean) - end - return sum2 / (count - Int(corrected)) - else - throw(ArgumentError("invalid value of mean, $(mean)::$(typeof(mean))")) - end -end - -centralizedabs2fun(m) = x -> abs2.(x - m) -centralize_sumabs2(A::AbstractArray, m) = - mapreduce(centralizedabs2fun(m), +, A) -centralize_sumabs2(A::AbstractArray, m, ifirst::Int, ilast::Int) = - Base.mapreduce_impl(centralizedabs2fun(m), +, A, ifirst, ilast) - -function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::AbstractArray) where S - # following the implementation of _mapreducedim! at base/reducedim.jl - lsiz = Base.check_reducedims(R,A) - isempty(R) || fill!(R, zero(S)) - isempty(A) && return R - - if Base.has_fast_linear_indexing(A) && lsiz > 16 && !has_offset_axes(R, means) - nslices = div(length(A), lsiz) - ibase = first(LinearIndices(A))-1 - for i = 1:nslices - @inbounds R[i] = centralize_sumabs2(A, means[i], ibase+1, ibase+lsiz) - ibase += lsiz - end - return R - end - indsAt, indsRt = Base.safe_tail(axes(A)), Base.safe_tail(axes(R)) # handle d=1 manually - keep, Idefault = Broadcast.shapeindexer(indsRt) - if Base.reducedim1(R, A) - i1 = first(Base.axes1(R)) - @inbounds for IA in CartesianIndices(indsAt) - IR = Broadcast.newindex(IA, keep, Idefault) - r = R[i1,IR] - m = means[i1,IR] - @simd for i in axes(A, 1) - r += abs2(A[i,IA] - m) - end - R[i1,IR] = r - end - else - @inbounds for IA in CartesianIndices(indsAt) - IR = Broadcast.newindex(IA, keep, Idefault) - @simd for i in axes(A, 1) - R[i,IR] += abs2(A[i,IA] - means[i,IR]) - end - end - end - return R -end - -function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray; corrected::Bool=true) where S - if isempty(A) - fill!(R, convert(S, NaN)) - else - rn = div(length(A), length(R)) - Int(corrected) - centralize_sumabs2!(R, A, m) - R .= R .* (1 // rn) - end - return R -end - -""" - varm(itr, m; dims, corrected::Bool=true) - -Compute the sample variance of collection `itr`, with known mean(s) `m`. - -The algorithm returns an estimator of the generative distribution's variance -under the assumption that each entry of `itr` is an IID drawn from that generative -distribution. For arrays, this computation is equivalent to calculating -`sum((itr .- mean(itr)).^2) / (length(itr) - 1)`. -If `corrected` is `true`, then the sum is scaled with `n-1`, -whereas the sum is scaled with `n` if `corrected` is -`false` with `n` the number of elements in `itr`. - -If `itr` is an `AbstractArray`, `dims` can be provided to compute the variance -over dimensions, and `m` may contain means for each dimension of `itr`. - -!!! note - If array contains `NaN` or [`missing`](@ref) values, the result is also - `NaN` or `missing` (`missing` takes precedence if array contains both). - Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the - variance of non-missing values. -""" -varm(A::AbstractArray, m::AbstractArray; corrected::Bool=true, dims=:) = _varm(A, m, corrected, dims) - -_varm(A::AbstractArray{T}, m, corrected::Bool, region) where {T} = - varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, region), A, m; corrected=corrected) - -varm(A::AbstractArray, m; corrected::Bool=true) = _varm(A, m, corrected, :) - -function _varm(A::AbstractArray{T}, m, corrected::Bool, ::Colon) where T - n = length(A) - n == 0 && return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN) - return centralize_sumabs2(A, m) / (n - Int(corrected)) -end - - -""" - var(itr; dims, corrected::Bool=true, mean=nothing) - -Compute the sample variance of collection `itr`. - -The algorithm returns an estimator of the generative distribution's variance -under the assumption that each entry of `itr` is an IID drawn from that generative -distribution. For arrays, this computation is equivalent to calculating -`sum((itr .- mean(itr)).^2) / (length(itr) - 1)). -If `corrected` is `true`, then the sum is scaled with `n-1`, -whereas the sum is scaled with `n` if `corrected` is -`false` with `n` the number of elements in `itr`. - -A pre-computed `mean` may be provided. - -If `itr` is an `AbstractArray`, `dims` can be provided to compute the variance -over dimensions, and `mean` may contain means for each dimension of `itr`. - -!!! note - If array contains `NaN` or [`missing`](@ref) values, the result is also - `NaN` or `missing` (`missing` takes precedence if array contains both). - Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the - variance of non-missing values. -""" -var(A::AbstractArray; corrected::Bool=true, mean=nothing, dims=:) = _var(A, corrected, mean, dims) - -_var(A::AbstractArray, corrected::Bool, mean, dims) = - varm(A, something(mean, Statistics.mean(A, dims=dims)); corrected=corrected, dims=dims) - -_var(A::AbstractArray, corrected::Bool, mean, ::Colon) = - real(varm(A, something(mean, Statistics.mean(A)); corrected=corrected)) - -varm(iterable, m; corrected::Bool=true) = _var(iterable, corrected, m) - -## variances over ranges - -varm(v::AbstractRange, m::AbstractArray) = range_varm(v, m) -varm(v::AbstractRange, m) = range_varm(v, m) - -function range_varm(v::AbstractRange, m) - f = first(v) - m - s = step(v) - l = length(v) - vv = f^2 * l / (l - 1) + f * s * l + s^2 * l * (2 * l - 1) / 6 - if l == 0 || l == 1 - return typeof(vv)(NaN) - end - return vv -end - -function var(v::AbstractRange) - s = step(v) - l = length(v) - vv = abs2(s) * (l + 1) * l / 12 - if l == 0 || l == 1 - return typeof(vv)(NaN) - end - return vv -end - - -##### standard deviation ##### - -function sqrt!(A::AbstractArray) - for i in eachindex(A) - @inbounds A[i] = sqrt(A[i]) - end - A -end - -stdm(A::AbstractArray, m; corrected::Bool=true) = - sqrt.(varm(A, m; corrected=corrected)) - -""" - std(itr; corrected::Bool=true, mean=nothing[, dims]) - -Compute the sample standard deviation of collection `itr`. - -The algorithm returns an estimator of the generative distribution's standard -deviation under the assumption that each entry of `itr` is an IID drawn from that generative -distribution. For arrays, this computation is equivalent to calculating -`sqrt(sum((itr .- mean(itr)).^2) / (length(itr) - 1))`. -If `corrected` is `true`, then the sum is scaled with `n-1`, -whereas the sum is scaled with `n` if `corrected` is -`false` with `n` the number of elements in `itr`. - -A pre-computed `mean` may be provided. - -If `itr` is an `AbstractArray`, `dims` can be provided to compute the standard deviation -over dimensions, and `means` may contain means for each dimension of `itr`. - -!!! note - If array contains `NaN` or [`missing`](@ref) values, the result is also - `NaN` or `missing` (`missing` takes precedence if array contains both). - Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the - standard deviation of non-missing values. -""" -std(A::AbstractArray; corrected::Bool=true, mean=nothing, dims=:) = _std(A, corrected, mean, dims) - -_std(A::AbstractArray, corrected::Bool, mean, dims) = - sqrt.(var(A; corrected=corrected, mean=mean, dims=dims)) - -_std(A::AbstractArray, corrected::Bool, mean, ::Colon) = - sqrt.(var(A; corrected=corrected, mean=mean)) - -_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, dims) = - sqrt!(var(A; corrected=corrected, mean=mean, dims=dims)) - -_std(A::AbstractArray{<:AbstractFloat}, corrected::Bool, mean, ::Colon) = - sqrt.(var(A; corrected=corrected, mean=mean)) - -std(iterable; corrected::Bool=true, mean=nothing) = - sqrt(var(iterable, corrected=corrected, mean=mean)) - -""" - stdm(itr, m; corrected::Bool=true) - -Compute the sample standard deviation of collection `itr`, with known mean(s) `m`. - -The algorithm returns an estimator of the generative distribution's standard -deviation under the assumption that each entry of `itr` is an IID drawn from that generative -distribution. For arrays, this computation is equivalent to calculating -`sqrt(sum((itr .- mean(itr)).^2) / (length(itr) - 1))`. -If `corrected` is `true`, then the sum is scaled with `n-1`, -whereas the sum is scaled with `n` if `corrected` is -`false` with `n` the number of elements in `itr`. - -A pre-computed `mean` may be provided. - -If `itr` is an `AbstractArray`, `dims` can be provided to compute the standard deviation -over dimensions, and `m` may contain means for each dimension of `itr`. - -!!! note - If array contains `NaN` or [`missing`](@ref) values, the result is also - `NaN` or `missing` (`missing` takes precedence if array contains both). - Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the - standard deviation of non-missing values. -""" -stdm(iterable, m; corrected::Bool=true) = - std(iterable, corrected=corrected, mean=m) - - -###### covariance ###### - -# auxiliary functions - -_conj(x::AbstractArray{<:Real}) = x -_conj(x::AbstractArray) = conj(x) - -_getnobs(x::AbstractVector, vardim::Int) = length(x) -_getnobs(x::AbstractMatrix, vardim::Int) = size(x, vardim) - -function _getnobs(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int) - n = _getnobs(x, vardim) - _getnobs(y, vardim) == n || throw(DimensionMismatch("dimensions of x and y mismatch")) - return n -end - -_vmean(x::AbstractVector, vardim::Int) = mean(x) -_vmean(x::AbstractMatrix, vardim::Int) = mean(x, dims=vardim) - -# core functions - -unscaled_covzm(x::AbstractVector{<:Number}) = sum(abs2, x) -unscaled_covzm(x::AbstractVector) = sum(t -> t*t', x) -unscaled_covzm(x::AbstractMatrix, vardim::Int) = (vardim == 1 ? _conj(x'x) : x * x') - -unscaled_covzm(x::AbstractVector, y::AbstractVector) = sum(conj(y[i])*x[i] for i in eachindex(y, x)) -unscaled_covzm(x::AbstractVector, y::AbstractMatrix, vardim::Int) = - (vardim == 1 ? *(transpose(x), _conj(y)) : *(transpose(x), transpose(_conj(y)))) -unscaled_covzm(x::AbstractMatrix, y::AbstractVector, vardim::Int) = - (c = vardim == 1 ? *(transpose(x), _conj(y)) : x * _conj(y); reshape(c, length(c), 1)) -unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) = - (vardim == 1 ? *(transpose(x), _conj(y)) : *(x, adjoint(y))) - -# covzm (with centered data) - -covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (length(x) - Int(corrected)) -function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true) - C = unscaled_covzm(x, vardim) - T = promote_type(typeof(first(C) / 1), eltype(C)) - A = convert(AbstractMatrix{T}, C) - b = 1//(size(x, vardim) - corrected) - A .= A .* b - return A -end -covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = - unscaled_covzm(x, y) / (length(x) - Int(corrected)) -function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true) - C = unscaled_covzm(x, y, vardim) - T = promote_type(typeof(first(C) / 1), eltype(C)) - A = convert(AbstractArray{T}, C) - b = 1//(_getnobs(x, y, vardim) - corrected) - A .= A .* b - return A -end - -# covm (with provided mean) -## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector} -## which can't be handled by broadcast -covm(x::AbstractVector, xmean; corrected::Bool=true) = - covzm(map(t -> t - xmean, x); corrected=corrected) -covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) = - covzm(x .- xmean, vardim; corrected=corrected) -covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) = - covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected) -covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) = - covzm(x .- xmean, y .- ymean, vardim; corrected=corrected) - -# cov (API) -""" - cov(x::AbstractVector; corrected::Bool=true) - -Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum -is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`. -""" -cov(x::AbstractVector; corrected::Bool=true) = covm(x, mean(x); corrected=corrected) - -""" - cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) - -Compute the covariance matrix of the matrix `X` along the dimension `dims`. If `corrected` -is `true` (the default) then the sum is scaled with `n-1`, whereas the sum is scaled with `n` -if `corrected` is `false` where `n = size(X, dims)`. -""" -cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) = - covm(X, _vmean(X, dims), dims; corrected=corrected) - -""" - cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) - -Compute the covariance between the vectors `x` and `y`. If `corrected` is `true` (the -default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where -``*`` denotes the complex conjugate and `n = length(x) = length(y)`. If `corrected` is -`false`, computes ``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``. -""" -cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = - covm(x, mean(x), y, mean(y); corrected=corrected) - -""" - cov(X::AbstractVecOrMat, Y::AbstractVecOrMat; dims::Int=1, corrected::Bool=true) - -Compute the covariance between the vectors or matrices `X` and `Y` along the dimension -`dims`. If `corrected` is `true` (the default) then the sum is scaled with `n-1`, whereas -the sum is scaled with `n` if `corrected` is `false` where `n = size(X, dims) = size(Y, dims)`. -""" -cov(X::AbstractVecOrMat, Y::AbstractVecOrMat; dims::Int=1, corrected::Bool=true) = - covm(X, _vmean(X, dims), Y, _vmean(Y, dims), dims; corrected=corrected) - -##### correlation ##### - -""" - clampcor(x) - -Clamp a real correlation to between -1 and 1, leaving complex correlations unchanged -""" -clampcor(x::Real) = clamp(x, -1, 1) -clampcor(x) = x - -# cov2cor! - -function cov2cor!(C::AbstractMatrix{T}, xsd::AbstractArray) where T - require_one_based_indexing(C, xsd) - nx = length(xsd) - size(C) == (nx, nx) || throw(DimensionMismatch("inconsistent dimensions")) - for j = 1:nx - for i = 1:j-1 - C[i,j] = adjoint(C[j,i]) - end - C[j,j] = oneunit(T) - for i = j+1:nx - C[i,j] = clampcor(C[i,j] / (xsd[i] * xsd[j])) - end - end - return C -end -function cov2cor!(C::AbstractMatrix, xsd, ysd::AbstractArray) - require_one_based_indexing(C, ysd) - nx, ny = size(C) - length(ysd) == ny || throw(DimensionMismatch("inconsistent dimensions")) - for (j, y) in enumerate(ysd) # fixme (iter): here and in all `cov2cor!` we assume that `C` is efficiently indexed by integers - for i in 1:nx - C[i,j] = clampcor(C[i, j] / (xsd * y)) - end - end - return C -end -function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd) - require_one_based_indexing(C, xsd) - nx, ny = size(C) - length(xsd) == nx || throw(DimensionMismatch("inconsistent dimensions")) - for j in 1:ny - for (i, x) in enumerate(xsd) - C[i,j] = clampcor(C[i,j] / (x * ysd)) - end - end - return C -end -function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray) - require_one_based_indexing(C, xsd, ysd) - nx, ny = size(C) - (length(xsd) == nx && length(ysd) == ny) || - throw(DimensionMismatch("inconsistent dimensions")) - for (i, x) in enumerate(xsd) - for (j, y) in enumerate(ysd) - C[i,j] = clampcor(C[i,j] / (x * y)) - end - end - return C -end - -# corzm (non-exported, with centered data) - -corzm(x::AbstractVector{T}) where {T} = one(real(T)) -function corzm(x::AbstractMatrix, vardim::Int=1) - c = unscaled_covzm(x, vardim) - return cov2cor!(c, collect(sqrt(c[i,i]) for i in 1:min(size(c)...))) -end -corzm(x::AbstractVector, y::AbstractMatrix, vardim::Int=1) = - cov2cor!(unscaled_covzm(x, y, vardim), sqrt(sum(abs2, x)), sqrt!(sum(abs2, y, dims=vardim))) -corzm(x::AbstractMatrix, y::AbstractVector, vardim::Int=1) = - cov2cor!(unscaled_covzm(x, y, vardim), sqrt!(sum(abs2, x, dims=vardim)), sqrt(sum(abs2, y))) -corzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int=1) = - cov2cor!(unscaled_covzm(x, y, vardim), sqrt!(sum(abs2, x, dims=vardim)), sqrt!(sum(abs2, y, dims=vardim))) - -# corm - -corm(x::AbstractVector{T}, xmean) where {T} = one(real(T)) -corm(x::AbstractMatrix, xmean, vardim::Int=1) = corzm(x .- xmean, vardim) -function corm(x::AbstractVector, mx, y::AbstractVector, my) - require_one_based_indexing(x, y) - n = length(x) - length(y) == n || throw(DimensionMismatch("inconsistent lengths")) - n > 0 || throw(ArgumentError("correlation only defined for non-empty vectors")) - - @inbounds begin - # Initialize the accumulators - xx = zero(sqrt(abs2(one(x[1])))) - yy = zero(sqrt(abs2(one(y[1])))) - xy = zero(x[1] * y[1]') - - @simd for i in eachindex(x, y) - xi = x[i] - mx - yi = y[i] - my - xx += abs2(xi) - yy += abs2(yi) - xy += xi * yi' - end - end - return clampcor(xy / max(xx, yy) / sqrt(min(xx, yy) / max(xx, yy))) -end - -corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) = - corzm(x .- xmean, y .- ymean, vardim) - -# cor -""" - cor(x::AbstractVector) - -Return the number one. -""" -cor(x::AbstractVector) = one(real(eltype(x))) - -""" - cor(X::AbstractMatrix; dims::Int=1) - -Compute the Pearson correlation matrix of the matrix `X` along the dimension `dims`. -""" -cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims) - -""" - cor(x::AbstractVector, y::AbstractVector) - -Compute the Pearson correlation between the vectors `x` and `y`. -""" -cor(x::AbstractVector, y::AbstractVector) = corm(x, mean(x), y, mean(y)) - -""" - cor(X::AbstractVecOrMat, Y::AbstractVecOrMat; dims=1) - -Compute the Pearson correlation between the vectors or matrices `X` and `Y` along the dimension `dims`. -""" -cor(x::AbstractVecOrMat, y::AbstractVecOrMat; dims::Int=1) = - corm(x, _vmean(x, dims), y, _vmean(y, dims), dims) - -##### median & quantiles ##### - -""" - middle(x) - -Compute the middle of a scalar value, which is equivalent to `x` itself, but of the type of `middle(x, x)` for consistency. -""" -middle(x::Union{Bool,Int8,Int16,Int32,Int64,Int128,UInt8,UInt16,UInt32,UInt64,UInt128}) = Float64(x) -# Specialized functions for real types allow for improved performance -middle(x::AbstractFloat) = x -middle(x::Real) = (x + zero(x)) / 1 - -""" - middle(x, y) - -Compute the middle of two reals `x` and `y`, which is -equivalent in both value and type to computing their mean (`(x + y) / 2`). -""" -middle(x::Real, y::Real) = x/2 + y/2 - -""" - middle(range) - -Compute the middle of a range, which consists of computing the mean of its extrema. -Since a range is sorted, the mean is performed with the first and last element. - -```jldoctest -julia> middle(1:10) -5.5 -``` -""" -middle(a::AbstractRange) = middle(a[1], a[end]) - -""" - middle(a) - -Compute the middle of an array `a`, which consists of finding its -extrema and then computing their mean. - -```jldoctest -julia> a = [1,2,3.6,10.9] -4-element Array{Float64,1}: - 1.0 - 2.0 - 3.6 - 10.9 - -julia> middle(a) -5.95 -``` -""" -middle(a::AbstractArray) = ((v1, v2) = extrema(a); middle(v1, v2)) - -""" - median!(v) - -Like [`median`](@ref), but may overwrite the input vector. -""" -function median!(v::AbstractVector) - isempty(v) && throw(ArgumentError("median of an empty array is undefined, $(repr(v))")) - eltype(v)>:Missing && any(ismissing, v) && return missing - (eltype(v)<:AbstractFloat || eltype(v)>:AbstractFloat) && any(isnan, v) && return convert(eltype(v), NaN) - inds = axes(v, 1) - n = length(inds) - mid = div(first(inds)+last(inds),2) - if isodd(n) - return middle(partialsort!(v,mid)) - else - m = partialsort!(v, mid:mid+1) - return middle(m[1], m[2]) - end -end -median!(v::AbstractArray) = median!(vec(v)) - -""" - median(itr) - -Compute the median of all elements in a collection. -For an even number of elements no exact median element exists, so the result is -equivalent to calculating mean of two median elements. - -!!! note - If `itr` contains `NaN` or [`missing`](@ref) values, the result is also - `NaN` or `missing` (`missing` takes precedence if `itr` contains both). - Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the - median of non-missing values. - -# Examples -```jldoctest -julia> median([1, 2, 3]) -2.0 - -julia> median([1, 2, 3, 4]) -2.5 - -julia> median([1, 2, missing, 4]) -missing - -julia> median(skipmissing([1, 2, missing, 4])) -2.0 -``` -""" -median(itr) = median!(collect(itr)) - -""" - median(A::AbstractArray; dims) - -Compute the median of an array along the given dimensions. - -# Examples -```jldoctest -julia> median([1 2; 3 4], dims=1) -1×2 Array{Float64,2}: - 2.0 3.0 -``` -""" -median(v::AbstractArray; dims=:) = _median(v, dims) - -_median(v::AbstractArray, dims) = mapslices(median!, v, dims = dims) - -_median(v::AbstractArray{T}, ::Colon) where {T} = median!(copyto!(Array{T,1}(undef, length(v)), v)) - -# for now, use the R/S definition of quantile; may want variants later -# see ?quantile in R -- this is type 7 -""" - quantile!([q::AbstractArray, ] v::AbstractVector, p; sorted=false) - -Compute the quantile(s) of a vector `v` at a specified probability or vector or tuple of -probabilities `p` on the interval [0,1]. If `p` is a vector, an optional -output array `q` may also be specified. (If not provided, a new output array is created.) -The keyword argument `sorted` indicates whether `v` can be assumed to be sorted; if -`false` (the default), then the elements of `v` will be partially sorted in-place. - -Quantiles are computed via linear interpolation between the points `((k-1)/(n-1), v[k])`, -for `k = 1:n` where `n = length(v)`. This corresponds to Definition 7 of Hyndman and Fan -(1996), and is the same as the R default. - -!!! note - An `ArgumentError` is thrown if `v` contains `NaN` or [`missing`](@ref) values. - -* Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", - *The American Statistician*, Vol. 50, No. 4, pp. 361-365 - -# Examples -```jldoctest -julia> x = [3, 2, 1]; - -julia> quantile!(x, 0.5) -2.0 - -julia> x -3-element Array{Int64,1}: - 1 - 2 - 3 - -julia> y = zeros(3); - -julia> quantile!(y, x, [0.1, 0.5, 0.9]) === y -true - -julia> y -3-element Array{Float64,1}: - 1.2 - 2.0 - 2.8 -``` -""" -function quantile!(q::AbstractArray, v::AbstractVector, p::AbstractArray; - sorted::Bool=false) - require_one_based_indexing(q, v, p) - if size(p) != size(q) - throw(DimensionMismatch("size of p, $(size(p)), must equal size of q, $(size(q))")) - end - isempty(q) && return q - - minp, maxp = extrema(p) - _quantilesort!(v, sorted, minp, maxp) - - for (i, j) in zip(eachindex(p), eachindex(q)) - @inbounds q[j] = _quantile(v,p[i]) - end - return q -end - -function quantile!(v::AbstractVector, p::Union{AbstractArray, Tuple{Vararg{Real}}}; - sorted::Bool=false) - if !isempty(p) - minp, maxp = extrema(p) - _quantilesort!(v, sorted, minp, maxp) - end - return map(x->_quantile(v, x), p) -end - -quantile!(v::AbstractVector, p::Real; sorted::Bool=false) = - _quantile(_quantilesort!(v, sorted, p, p), p) - -# Function to perform partial sort of v for quantiles in given range -function _quantilesort!(v::AbstractArray, sorted::Bool, minp::Real, maxp::Real) - isempty(v) && throw(ArgumentError("empty data vector")) - require_one_based_indexing(v) - - if !sorted - lv = length(v) - lo = floor(Int,1+minp*(lv-1)) - hi = ceil(Int,1+maxp*(lv-1)) - - # only need to perform partial sort - sort!(v, 1, lv, Base.Sort.PartialQuickSort(lo:hi), Base.Sort.Forward) - end - ismissing(v[end]) && throw(ArgumentError("quantiles are undefined in presence of missing values")) - isnan(v[end]) && throw(ArgumentError("quantiles are undefined in presence of NaNs")) - return v -end - -# Core quantile lookup function: assumes `v` sorted -@inline function _quantile(v::AbstractVector, p::Real) - 0 <= p <= 1 || throw(ArgumentError("input probability out of [0,1] range")) - require_one_based_indexing(v) - - lv = length(v) - f0 = (lv - 1)*p # 0-based interpolated index - t0 = trunc(f0) - h = f0 - t0 - i = trunc(Int,t0) + 1 - - a = v[i] - b = v[i + (h > 0)] - if isfinite(a) && isfinite(b) - return a + h*(b-a) - else - return (1-h)*a + h*b - end -end - - -""" - quantile(itr, p; sorted=false) - -Compute the quantile(s) of a collection `itr` at a specified probability or vector or tuple of -probabilities `p` on the interval [0,1]. The keyword argument `sorted` indicates whether -`itr` can be assumed to be sorted. - -Quantiles are computed via linear interpolation between the points `((k-1)/(n-1), v[k])`, -for `k = 1:n` where `n = length(itr)`. This corresponds to Definition 7 of Hyndman and Fan -(1996), and is the same as the R default. - -!!! note - An `ArgumentError` is thrown if `itr` contains `NaN` or [`missing`](@ref) values. - Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the - quantiles of non-missing values. - -- Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", - *The American Statistician*, Vol. 50, No. 4, pp. 361-365 - -# Examples -```jldoctest -julia> quantile(0:20, 0.5) -10.0 - -julia> quantile(0:20, [0.1, 0.5, 0.9]) -3-element Array{Float64,1}: - 2.0 - 10.0 - 18.0 - -julia> quantile(skipmissing([1, 10, missing]), 0.5) -5.5 -``` -""" -quantile(itr, p; sorted::Bool=false) = quantile!(collect(itr), p, sorted=sorted) - -quantile(v::AbstractVector, p; sorted::Bool=false) = - quantile!(sorted ? v : Base.copymutable(v), p; sorted=sorted) - - -##### SparseArrays optimizations ##### - -function cov(X::SparseMatrixCSC; dims::Int=1, corrected::Bool=true) - vardim = dims - a, b = size(X) - n, p = vardim == 1 ? (a, b) : (b, a) - - # The covariance can be decomposed into two terms - # 1/(n - 1) ∑ (x_i - x̄)*(x_i - x̄)' = 1/(n - 1) (∑ x_i*x_i' - n*x̄*x̄') - # which can be evaluated via a sparse matrix-matrix product - - # Compute ∑ x_i*x_i' = X'X using sparse matrix-matrix product - out = Matrix(unscaled_covzm(X, vardim)) - - # Compute x̄ - x̄ᵀ = mean(X, dims=vardim) - - # Subtract n*x̄*x̄' from X'X - @inbounds for j in 1:p, i in 1:p - out[i,j] -= x̄ᵀ[i] * x̄ᵀ[j]' * n - end - - # scale with the sample size n or the corrected sample size n - 1 - return rmul!(out, inv(n - corrected)) -end - -# This is the function that does the reduction underlying var/std -function centralize_sumabs2!(R::AbstractArray{S}, A::SparseMatrixCSC{Tv,Ti}, means::AbstractArray) where {S,Tv,Ti} - require_one_based_indexing(R, A, means) - lsiz = Base.check_reducedims(R,A) - size(means) == size(R) || error("size of means must match size of R") - isempty(R) || fill!(R, zero(S)) - isempty(A) && return R - - rowval = rowvals(A) - nzval = nonzeros(A) - m = size(A, 1) - n = size(A, 2) - - if size(R, 1) == size(R, 2) == 1 - # Reduction along both columns and rows - R[1, 1] = centralize_sumabs2(A, means[1]) - elseif size(R, 1) == 1 - # Reduction along rows - @inbounds for col = 1:n - mu = means[col] - r = convert(S, (m - length(nzrange(A, col)))*abs2(mu)) - @simd for j = nzrange(A, col) - r += abs2(nzval[j] - mu) - end - R[1, col] = r - end - elseif size(R, 2) == 1 - # Reduction along columns - rownz = fill(convert(Ti, n), m) - @inbounds for col = 1:n - @simd for j = nzrange(A, col) - row = rowval[j] - R[row, 1] += abs2(nzval[j] - means[row]) - rownz[row] -= 1 - end - end - for i = 1:m - R[i, 1] += rownz[i]*abs2(means[i]) - end - else - # Reduction along a dimension > 2 - @inbounds for col = 1:n - lastrow = 0 - @simd for j = nzrange(A, col) - row = rowval[j] - for i = lastrow+1:row-1 - R[i, col] = abs2(means[i, col]) - end - R[row, col] = abs2(nzval[j] - means[row, col]) - lastrow = row - end - for i = lastrow+1:m - R[i, col] = abs2(means[i, col]) - end - end - end - return R -end - -end # module diff --git a/stdlib/Statistics/test/runtests.jl b/stdlib/Statistics/test/runtests.jl deleted file mode 100644 index 73ff01fa376425..00000000000000 --- a/stdlib/Statistics/test/runtests.jl +++ /dev/null @@ -1,738 +0,0 @@ -# This file is a part of Julia. License is MIT: https://julialang.org/license - -using Statistics, Test, Random, LinearAlgebra, SparseArrays -using Test: guardseed - -Random.seed!(123) - -@testset "middle" begin - @test middle(3) === 3.0 - @test middle(2, 3) === 2.5 - let x = ((floatmax(1.0)/4)*3) - @test middle(x, x) === x - end - @test middle(1:8) === 4.5 - @test middle([1:8;]) === 4.5 - - # ensure type-correctness - for T in [Bool,Int8,Int16,Int32,Int64,Int128,UInt8,UInt16,UInt32,UInt64,UInt128,Float16,Float32,Float64] - @test middle(one(T)) === middle(one(T), one(T)) - end -end - -@testset "median" begin - @test median([1.]) === 1. - @test median([1.,3]) === 2. - @test median([1.,3,2]) === 2. - - @test median([1,3,2]) === 2.0 - @test median([1,3,2,4]) === 2.5 - - @test median([0.0,Inf]) == Inf - @test median([0.0,-Inf]) == -Inf - @test median([0.,Inf,-Inf]) == 0.0 - @test median([1.,-1.,Inf,-Inf]) == 0.0 - @test isnan(median([-Inf,Inf])) - - X = [2 3 1 -1; 7 4 5 -4] - @test all(median(X, dims=2) .== [1.5, 4.5]) - @test all(median(X, dims=1) .== [4.5 3.5 3.0 -2.5]) - @test X == [2 3 1 -1; 7 4 5 -4] # issue #17153 - - @test_throws ArgumentError median([]) - @test isnan(median([NaN])) - @test isnan(median([0.0,NaN])) - @test isnan(median([NaN,0.0])) - @test isnan(median([NaN,0.0,1.0])) - @test isnan(median(Any[NaN,0.0,1.0])) - @test isequal(median([NaN 0.0; 1.2 4.5], dims=2), reshape([NaN; 2.85], 2, 1)) - - @test ismissing(median([1, missing])) - @test ismissing(median([1, 2, missing])) - @test ismissing(median([NaN, 2.0, missing])) - @test ismissing(median([NaN, missing])) - @test ismissing(median([missing, NaN])) - @test ismissing(median(Any[missing, 2.0, 3.0, 4.0, NaN])) - @test median(skipmissing([1, missing, 2])) === 1.5 - - @test median!([1 2 3 4]) == 2.5 - @test median!([1 2; 3 4]) == 2.5 - - @test invoke(median, Tuple{AbstractVector}, 1:10) == median(1:10) == 5.5 - - @test @inferred(median(Float16[1, 2, NaN])) === Float16(NaN) - @test @inferred(median(Float16[1, 2, 3])) === Float16(2) - @test @inferred(median(Float32[1, 2, NaN])) === NaN32 - @test @inferred(median(Float32[1, 2, 3])) === 2.0f0 -end - -@testset "mean" begin - @test_throws MethodError mean(()) - @test mean((1,2,3)) === 2. - @test mean([0]) === 0. - @test mean([1.]) === 1. - @test mean([1.,3]) == 2. - @test mean([1,2,3]) == 2. - @test mean([0 1 2; 4 5 6], dims=1) == [2. 3. 4.] - @test mean([1 2 3; 4 5 6], dims=1) == [2.5 3.5 4.5] - @test mean(-, [1 2 3 ; 4 5 6], dims=1) == [-2.5 -3.5 -4.5] - @test mean(-, [1 2 3 ; 4 5 6], dims=2) == transpose([-2.0 -5.0]) - @test mean(-, [1 2 3 ; 4 5 6], dims=(1, 2)) == -3.5 .* ones(1, 1) - @test mean(-, [1 2 3 ; 4 5 6], dims=(1, 1)) == [-2.5 -3.5 -4.5] - @test mean(-, [1 2 3 ; 4 5 6], dims=()) == Float64[-1 -2 -3 ; -4 -5 -6] - @test mean(i->i+1, 0:2) === 2. - @test mean(isodd, [3]) === 1. - @test mean(x->3x, (1,1)) === 3. - - # mean of iterables: - n = 10; a = randn(n); b = randn(n) - @test mean(Tuple(a)) ≈ mean(a) - @test mean(Tuple(a + b*im)) ≈ mean(a + b*im) - @test mean(cos, Tuple(a)) ≈ mean(cos, a) - @test mean(x->x/2, a + b*im) ≈ mean(a + b*im) / 2. - @test ismissing(mean(Tuple((1, 2, missing, 4, 5)))) - - @test isnan(mean([NaN])) - @test isnan(mean([0.0,NaN])) - @test isnan(mean([NaN,0.0])) - - @test isnan(mean([0.,Inf,-Inf])) - @test isnan(mean([1.,-1.,Inf,-Inf])) - @test isnan(mean([-Inf,Inf])) - @test isequal(mean([NaN 0.0; 1.2 4.5], dims=2), reshape([NaN; 2.85], 2, 1)) - - @test ismissing(mean([1, missing])) - @test ismissing(mean([NaN, missing])) - @test ismissing(mean([missing, NaN])) - @test isequal(mean([missing 1.0; 2.0 3.0], dims=1), [missing 2.0]) - @test mean(skipmissing([1, missing, 2])) === 1.5 - @test isequal(mean(Complex{Float64}[]), NaN+NaN*im) - @test mean(Complex{Float64}[]) isa Complex{Float64} - @test isequal(mean(skipmissing(Complex{Float64}[])), NaN+NaN*im) - @test mean(skipmissing(Complex{Float64}[])) isa Complex{Float64} - @test isequal(mean(abs, Complex{Float64}[]), NaN) - @test mean(abs, Complex{Float64}[]) isa Float64 - @test isequal(mean(abs, skipmissing(Complex{Float64}[])), NaN) - @test mean(abs, skipmissing(Complex{Float64}[])) isa Float64 - @test isequal(mean(Int[]), NaN) - @test mean(Int[]) isa Float64 - @test isequal(mean(skipmissing(Int[])), NaN) - @test mean(skipmissing(Int[])) isa Float64 - @test_throws MethodError mean([]) - @test_throws MethodError mean(skipmissing([])) - @test_throws ArgumentError mean((1 for i in 2:1)) - - # Check that small types are accumulated using wider type - for T in (Int8, UInt8) - x = [typemax(T) typemax(T)] - g = (v for v in x) - @test mean(x) == mean(g) == typemax(T) - @test mean(identity, x) == mean(identity, g) == typemax(T) - @test mean(x, dims=2) == [typemax(T)]' - end -end - -@testset "mean/median for ranges" begin - for f in (mean, median) - for n = 2:5 - @test f(2:n) == f([2:n;]) - @test f(2:0.1:n) ≈ f([2:0.1:n;]) - end - end - @test mean(2:1) === NaN - @test mean(big(2):1) isa BigFloat -end - -@testset "var & std" begin - # edge case: empty vector - # iterable; this has to throw for type stability - @test_throws MethodError var(()) - @test_throws MethodError var((); corrected=false) - @test_throws MethodError var((); mean=2) - @test_throws MethodError var((); mean=2, corrected=false) - # reduction - @test isnan(var(Int[])) - @test isnan(var(Int[]; corrected=false)) - @test isnan(var(Int[]; mean=2)) - @test isnan(var(Int[]; mean=2, corrected=false)) - # reduction across dimensions - @test isequal(var(Int[], dims=1), [NaN]) - @test isequal(var(Int[], dims=1; corrected=false), [NaN]) - @test isequal(var(Int[], dims=1; mean=[2]), [NaN]) - @test isequal(var(Int[], dims=1; mean=[2], corrected=false), [NaN]) - - # edge case: one-element vector - # iterable - @test isnan(@inferred(var((1,)))) - @test var((1,); corrected=false) === 0.0 - @test var((1,); mean=2) === Inf - @test var((1,); mean=2, corrected=false) === 1.0 - # reduction - @test isnan(@inferred(var([1]))) - @test var([1]; corrected=false) === 0.0 - @test var([1]; mean=2) === Inf - @test var([1]; mean=2, corrected=false) === 1.0 - # reduction across dimensions - @test isequal(@inferred(var([1], dims=1)), [NaN]) - @test var([1], dims=1; corrected=false) ≈ [0.0] - @test var([1], dims=1; mean=[2]) ≈ [Inf] - @test var([1], dims=1; mean=[2], corrected=false) ≈ [1.0] - - @test var(1:8) == 6. - @test varm(1:8,1) == varm(Vector(1:8),1) - @test isnan(varm(1:1,1)) - @test isnan(var(1:1)) - @test isnan(var(1:-1)) - - @test @inferred(var(1.0:8.0)) == 6. - @test varm(1.0:8.0,1.0) == varm(Vector(1.0:8.0),1) - @test isnan(varm(1.0:1.0,1.0)) - @test isnan(var(1.0:1.0)) - @test isnan(var(1.0:-1.0)) - - @test @inferred(var(1.0f0:8.0f0)) === 6.f0 - @test varm(1.0f0:8.0f0,1.0f0) == varm(Vector(1.0f0:8.0f0),1) - @test isnan(varm(1.0f0:1.0f0,1.0f0)) - @test isnan(var(1.0f0:1.0f0)) - @test isnan(var(1.0f0:-1.0f0)) - - @test varm([1,2,3], 2) ≈ 1. - @test var([1,2,3]) ≈ 1. - @test var([1,2,3]; corrected=false) ≈ 2.0/3 - @test var([1,2,3]; mean=0) ≈ 7. - @test var([1,2,3]; mean=0, corrected=false) ≈ 14.0/3 - - @test varm((1,2,3), 2) ≈ 1. - @test var((1,2,3)) ≈ 1. - @test var((1,2,3); corrected=false) ≈ 2.0/3 - @test var((1,2,3); mean=0) ≈ 7. - @test var((1,2,3); mean=0, corrected=false) ≈ 14.0/3 - @test_throws ArgumentError var((1,2,3); mean=()) - - @test var([1 2 3 4 5; 6 7 8 9 10], dims=2) ≈ [2.5 2.5]' - @test var([1 2 3 4 5; 6 7 8 9 10], dims=2; corrected=false) ≈ [2.0 2.0]' - - @test var(collect(1:99), dims=1) ≈ [825] - @test var(Matrix(transpose(collect(1:99))), dims=2) ≈ [825] - - @test stdm([1,2,3], 2) ≈ 1. - @test std([1,2,3]) ≈ 1. - @test std([1,2,3]; corrected=false) ≈ sqrt(2.0/3) - @test std([1,2,3]; mean=0) ≈ sqrt(7.0) - @test std([1,2,3]; mean=0, corrected=false) ≈ sqrt(14.0/3) - - @test stdm([1.0,2,3], 2) ≈ 1. - @test std([1.0,2,3]) ≈ 1. - @test std([1.0,2,3]; corrected=false) ≈ sqrt(2.0/3) - @test std([1.0,2,3]; mean=0) ≈ sqrt(7.0) - @test std([1.0,2,3]; mean=0, corrected=false) ≈ sqrt(14.0/3) - - @test std([1.0,2,3]; dims=1)[] ≈ 1. - @test std([1.0,2,3]; dims=1, corrected=false)[] ≈ sqrt(2.0/3) - @test std([1.0,2,3]; dims=1, mean=[0])[] ≈ sqrt(7.0) - @test std([1.0,2,3]; dims=1, mean=[0], corrected=false)[] ≈ sqrt(14.0/3) - - @test stdm((1,2,3), 2) ≈ 1. - @test std((1,2,3)) ≈ 1. - @test std((1,2,3); corrected=false) ≈ sqrt(2.0/3) - @test std((1,2,3); mean=0) ≈ sqrt(7.0) - @test std((1,2,3); mean=0, corrected=false) ≈ sqrt(14.0/3) - - @test std([1 2 3 4 5; 6 7 8 9 10], dims=2) ≈ sqrt.([2.5 2.5]') - @test std([1 2 3 4 5; 6 7 8 9 10], dims=2; corrected=false) ≈ sqrt.([2.0 2.0]') - - let A = ComplexF64[exp(i*im) for i in 1:10^4] - @test varm(A, 0.) ≈ sum(map(abs2, A)) / (length(A) - 1) - @test varm(A, mean(A)) ≈ var(A) - end - - @test var([1//1, 2//1]) isa Rational{Int} - @test var([1//1, 2//1], dims=1) isa Vector{Rational{Int}} - - @test std([1//1, 2//1]) isa Float64 - @test std([1//1, 2//1], dims=1) isa Vector{Float64} - - @testset "var: empty cases" begin - A = Matrix{Int}(undef, 0,1) - @test var(A) === NaN - - @test isequal(var(A, dims=1), fill(NaN, 1, 1)) - @test isequal(var(A, dims=2), fill(NaN, 0, 1)) - @test isequal(var(A, dims=(1, 2)), fill(NaN, 1, 1)) - @test isequal(var(A, dims=3), fill(NaN, 0, 1)) - end - - # issue #6672 - @test std(AbstractFloat[1,2,3], dims=1) == [1.0] - - for f in (var, std) - @test ismissing(f([1, missing])) - @test ismissing(f([NaN, missing])) - @test ismissing(f([missing, NaN])) - @test isequal(f([missing 1.0; 2.0 3.0], dims=1), [missing f([1.0, 3.0])]) - @test f(skipmissing([1, missing, 2])) === f([1, 2]) - end - for f in (varm, stdm) - @test ismissing(f([1, missing], 0)) - @test ismissing(f([1, 2], missing)) - @test ismissing(f([1, NaN], missing)) - @test ismissing(f([NaN, missing], 0)) - @test ismissing(f([missing, NaN], 0)) - @test ismissing(f([NaN, missing], missing)) - @test ismissing(f([missing, NaN], missing)) - @test f(skipmissing([1, missing, 2]), 0) === f([1, 2], 0) - end - - @test isequal(var(Complex{Float64}[]), NaN) - @test var(Complex{Float64}[]) isa Float64 - @test isequal(var(skipmissing(Complex{Float64}[])), NaN) - @test var(skipmissing(Complex{Float64}[])) isa Float64 - @test_throws MethodError var([]) - @test_throws MethodError var(skipmissing([])) - @test_throws MethodError var((1 for i in 2:1)) - @test isequal(var(Int[]), NaN) - @test var(Int[]) isa Float64 - @test isequal(var(skipmissing(Int[])), NaN) - @test var(skipmissing(Int[])) isa Float64 -end - -function safe_cov(x, y, zm::Bool, cr::Bool) - n = length(x) - if !zm - x = x .- mean(x) - y = y .- mean(y) - end - dot(vec(x), vec(y)) / (n - Int(cr)) -end -X = [1.0 5.0; - 2.0 4.0; - 3.0 6.0; - 4.0 2.0; - 5.0 1.0] -Y = [6.0 2.0; - 1.0 7.0; - 5.0 8.0; - 3.0 4.0; - 2.0 3.0] - -@testset "covariance" begin - for vd in [1, 2], zm in [true, false], cr in [true, false] - # println("vd = $vd: zm = $zm, cr = $cr") - if vd == 1 - k = size(X, 2) - Cxx = zeros(k, k) - Cxy = zeros(k, k) - for i = 1:k, j = 1:k - Cxx[i,j] = safe_cov(X[:,i], X[:,j], zm, cr) - Cxy[i,j] = safe_cov(X[:,i], Y[:,j], zm, cr) - end - x1 = vec(X[:,1]) - y1 = vec(Y[:,1]) - else - k = size(X, 1) - Cxx = zeros(k, k) - Cxy = zeros(k, k) - for i = 1:k, j = 1:k - Cxx[i,j] = safe_cov(X[i,:], X[j,:], zm, cr) - Cxy[i,j] = safe_cov(X[i,:], Y[j,:], zm, cr) - end - x1 = vec(X[1,:]) - y1 = vec(Y[1,:]) - end - - c = zm ? Statistics.covm(x1, 0, corrected=cr) : - cov(x1, corrected=cr) - @test isa(c, Float64) - @test c ≈ Cxx[1,1] - @inferred cov(x1, corrected=cr) - - @test cov(X) == Statistics.covm(X, mean(X, dims=1)) - C = zm ? Statistics.covm(X, 0, vd, corrected=cr) : - cov(X, dims=vd, corrected=cr) - @test size(C) == (k, k) - @test C ≈ Cxx - @inferred cov(X, dims=vd, corrected=cr) - - @test cov(x1, y1) == Statistics.covm(x1, mean(x1), y1, mean(y1)) - c = zm ? Statistics.covm(x1, 0, y1, 0, corrected=cr) : - cov(x1, y1, corrected=cr) - @test isa(c, Float64) - @test c ≈ Cxy[1,1] - @inferred cov(x1, y1, corrected=cr) - - if vd == 1 - @test cov(x1, Y) == Statistics.covm(x1, mean(x1), Y, mean(Y, dims=1)) - end - C = zm ? Statistics.covm(x1, 0, Y, 0, vd, corrected=cr) : - cov(x1, Y, dims=vd, corrected=cr) - @test size(C) == (1, k) - @test vec(C) ≈ Cxy[1,:] - @inferred cov(x1, Y, dims=vd, corrected=cr) - - if vd == 1 - @test cov(X, y1) == Statistics.covm(X, mean(X, dims=1), y1, mean(y1)) - end - C = zm ? Statistics.covm(X, 0, y1, 0, vd, corrected=cr) : - cov(X, y1, dims=vd, corrected=cr) - @test size(C) == (k, 1) - @test vec(C) ≈ Cxy[:,1] - @inferred cov(X, y1, dims=vd, corrected=cr) - - @test cov(X, Y) == Statistics.covm(X, mean(X, dims=1), Y, mean(Y, dims=1)) - C = zm ? Statistics.covm(X, 0, Y, 0, vd, corrected=cr) : - cov(X, Y, dims=vd, corrected=cr) - @test size(C) == (k, k) - @test C ≈ Cxy - @inferred cov(X, Y, dims=vd, corrected=cr) - end - - @testset "floating point accuracy for `cov` of large numbers" begin - A = [4.0, 7.0, 13.0, 16.0] - C = A .+ 1.0e10 - @test cov(A, A) ≈ cov(C, C) - end -end - -function safe_cor(x, y, zm::Bool) - if !zm - x = x .- mean(x) - y = y .- mean(y) - end - x = vec(x) - y = vec(y) - dot(x, y) / (sqrt(dot(x, x)) * sqrt(dot(y, y))) -end -@testset "correlation" begin - for vd in [1, 2], zm in [true, false] - # println("vd = $vd: zm = $zm") - if vd == 1 - k = size(X, 2) - Cxx = zeros(k, k) - Cxy = zeros(k, k) - for i = 1:k, j = 1:k - Cxx[i,j] = safe_cor(X[:,i], X[:,j], zm) - Cxy[i,j] = safe_cor(X[:,i], Y[:,j], zm) - end - x1 = vec(X[:,1]) - y1 = vec(Y[:,1]) - else - k = size(X, 1) - Cxx = zeros(k, k) - Cxy = zeros(k, k) - for i = 1:k, j = 1:k - Cxx[i,j] = safe_cor(X[i,:], X[j,:], zm) - Cxy[i,j] = safe_cor(X[i,:], Y[j,:], zm) - end - x1 = vec(X[1,:]) - y1 = vec(Y[1,:]) - end - - c = zm ? Statistics.corm(x1, 0) : cor(x1) - @test isa(c, Float64) - @test c ≈ Cxx[1,1] - @inferred cor(x1) - - @test cor(X) == Statistics.corm(X, mean(X, dims=1)) - C = zm ? Statistics.corm(X, 0, vd) : cor(X, dims=vd) - @test size(C) == (k, k) - @test C ≈ Cxx - @inferred cor(X, dims=vd) - - @test cor(x1, y1) == Statistics.corm(x1, mean(x1), y1, mean(y1)) - c = zm ? Statistics.corm(x1, 0, y1, 0) : cor(x1, y1) - @test isa(c, Float64) - @test c ≈ Cxy[1,1] - @inferred cor(x1, y1) - - if vd == 1 - @test cor(x1, Y) == Statistics.corm(x1, mean(x1), Y, mean(Y, dims=1)) - end - C = zm ? Statistics.corm(x1, 0, Y, 0, vd) : cor(x1, Y, dims=vd) - @test size(C) == (1, k) - @test vec(C) ≈ Cxy[1,:] - @inferred cor(x1, Y, dims=vd) - - if vd == 1 - @test cor(X, y1) == Statistics.corm(X, mean(X, dims=1), y1, mean(y1)) - end - C = zm ? Statistics.corm(X, 0, y1, 0, vd) : cor(X, y1, dims=vd) - @test size(C) == (k, 1) - @test vec(C) ≈ Cxy[:,1] - @inferred cor(X, y1, dims=vd) - - @test cor(X, Y) == Statistics.corm(X, mean(X, dims=1), Y, mean(Y, dims=1)) - C = zm ? Statistics.corm(X, 0, Y, 0, vd) : cor(X, Y, dims=vd) - @test size(C) == (k, k) - @test C ≈ Cxy - @inferred cor(X, Y, dims=vd) - end - - @test cor(repeat(1:17, 1, 17))[2] <= 1.0 - @test cor(1:17, 1:17) <= 1.0 - @test cor(1:17, 18:34) <= 1.0 - @test cor(Any[1, 2], Any[1, 2]) == 1.0 - @test isnan(cor([0], Int8[81])) - let tmp = range(1, stop=85, length=100) - tmp2 = Vector(tmp) - @test cor(tmp, tmp) <= 1.0 - @test cor(tmp, tmp2) <= 1.0 - end -end - -@testset "quantile" begin - @test quantile([1,2,3,4],0.5) == 2.5 - @test quantile([1,2,3,4],[0.5]) == [2.5] - @test quantile([1., 3],[.25,.5,.75])[2] == median([1., 3]) - @test quantile(100.0:-1.0:0.0, 0.0:0.1:1.0) == 0.0:10.0:100.0 - @test quantile(0.0:100.0, 0.0:0.1:1.0, sorted=true) == 0.0:10.0:100.0 - @test quantile(100f0:-1f0:0.0, 0.0:0.1:1.0) == 0f0:10f0:100f0 - @test quantile([Inf,Inf],0.5) == Inf - @test quantile([-Inf,1],0.5) == -Inf - @test quantile([0,1],1e-18) == 1e-18 - @test quantile([1, 2, 3, 4],[]) == [] - @test quantile([1, 2, 3, 4], (0.5,)) == (2.5,) - @test quantile([4, 9, 1, 5, 7, 8, 2, 3, 5, 17, 11], - (0.1, 0.2, 0.4, 0.9)) == (2.0, 3.0, 5.0, 11.0) - @test quantile(Union{Int, Missing}[4, 9, 1, 5, 7, 8, 2, 3, 5, 17, 11], - [0.1, 0.2, 0.4, 0.9]) == [2.0, 3.0, 5.0, 11.0] - @test quantile(Any[4, 9, 1, 5, 7, 8, 2, 3, 5, 17, 11], - [0.1, 0.2, 0.4, 0.9]) == [2.0, 3.0, 5.0, 11.0] - @test quantile([4, 9, 1, 5, 7, 8, 2, 3, 5, 17, 11], - Any[0.1, 0.2, 0.4, 0.9]) == [2.0, 3.0, 5.0, 11.0] - @test quantile([4, 9, 1, 5, 7, 8, 2, 3, 5, 17, 11], - Any[0.1, 0.2, 0.4, 0.9]) isa Vector{Float64} - @test quantile(Any[4, 9, 1, 5, 7, 8, 2, 3, 5, 17, 11], - Any[0.1, 0.2, 0.4, 0.9]) == [2, 3, 5, 11] - @test quantile(Any[4, 9, 1, 5, 7, 8, 2, 3, 5, 17, 11], - Any[0.1, 0.2, 0.4, 0.9]) isa Vector{Float64} - @test quantile([1, 2, 3, 4], ()) == () - @test isempty(quantile([1, 2, 3, 4], Float64[])) - @test quantile([1, 2, 3, 4], Float64[]) isa Vector{Float64} - @test quantile([1, 2, 3, 4], []) isa Vector{Any} - @test quantile([1, 2, 3, 4], [0, 1]) isa Vector{Int} - - @test quantile(Any[1, 2, 3], 0.5) isa Float64 - @test quantile(Any[1, big(2), 3], 0.5) isa BigFloat - @test quantile(Any[1, 2, 3], Float16(0.5)) isa Float16 - @test quantile(Any[1, Float16(2), 3], Float16(0.5)) isa Float16 - @test quantile(Any[1, big(2), 3], Float16(0.5)) isa BigFloat - - @test_throws ArgumentError quantile([1, missing], 0.5) - @test_throws ArgumentError quantile([1, NaN], 0.5) - @test quantile(skipmissing([1, missing, 2]), 0.5) === 1.5 - - # make sure that type inference works correctly in normal cases - for T in [Int, BigInt, Float64, Float16, BigFloat, Rational{Int}, Rational{BigInt}] - for S in [Float64, Float16, BigFloat, Rational{Int}, Rational{BigInt}] - @inferred quantile(T[1, 2, 3], S(0.5)) - @inferred quantile(T[1, 2, 3], S(0.6)) - @inferred quantile(T[1, 2, 3], S[0.5, 0.6]) - @inferred quantile(T[1, 2, 3], (S(0.5), S(0.6))) - end - end - x = [3; 2; 1] - y = zeros(3) - @test quantile!(y, x, [0.1, 0.5, 0.9]) === y - @test y == [1.2, 2.0, 2.8] -end - -# StatsBase issue 164 -let y = [0.40003674665581906, 0.4085630862624367, 0.41662034698690303, 0.41662034698690303, 0.42189053966652057, 0.42189053966652057, 0.42553514344518345, 0.43985732442991354] - @test issorted(quantile(y, range(0.01, stop=0.99, length=17))) -end - -@testset "variance of complex arrays (#13309)" begin - z = rand(ComplexF64, 10) - @test var(z) ≈ invoke(var, Tuple{Any}, z) ≈ cov(z) ≈ var(z,dims=1)[1] ≈ sum(abs2, z .- mean(z))/9 - @test isa(var(z), Float64) - @test isa(invoke(var, Tuple{Any}, z), Float64) - @test isa(cov(z), Float64) - @test isa(var(z,dims=1), Vector{Float64}) - @test varm(z, 0.0) ≈ invoke(varm, Tuple{Any,Float64}, z, 0.0) ≈ sum(abs2, z)/9 - @test isa(varm(z, 0.0), Float64) - @test isa(invoke(varm, Tuple{Any,Float64}, z, 0.0), Float64) - @test cor(z) === 1.0 - v = varm([1.0+2.0im], 0; corrected = false) - @test v ≈ 5 - @test isa(v, Float64) -end - -@testset "cov and cor of complex arrays (issue #21093)" begin - x = [2.7 - 3.3im, 0.9 + 5.4im, 0.1 + 0.2im, -1.7 - 5.8im, 1.1 + 1.9im] - y = [-1.7 - 1.6im, -0.2 + 6.5im, 0.8 - 10.0im, 9.1 - 3.4im, 2.7 - 5.5im] - @test cov(x, y) ≈ 4.8365 - 12.119im - @test cov(y, x) ≈ 4.8365 + 12.119im - @test cov(x, reshape(y, :, 1)) ≈ reshape([4.8365 - 12.119im], 1, 1) - @test cov(reshape(x, :, 1), y) ≈ reshape([4.8365 - 12.119im], 1, 1) - @test cov(reshape(x, :, 1), reshape(y, :, 1)) ≈ reshape([4.8365 - 12.119im], 1, 1) - @test cov([x y]) ≈ [21.779 4.8365-12.119im; - 4.8365+12.119im 54.548] - @test cor(x, y) ≈ 0.14032104449218274 - 0.35160772008699703im - @test cor(y, x) ≈ 0.14032104449218274 + 0.35160772008699703im - @test cor(x, reshape(y, :, 1)) ≈ reshape([0.14032104449218274 - 0.35160772008699703im], 1, 1) - @test cor(reshape(x, :, 1), y) ≈ reshape([0.14032104449218274 - 0.35160772008699703im], 1, 1) - @test cor(reshape(x, :, 1), reshape(y, :, 1)) ≈ reshape([0.14032104449218274 - 0.35160772008699703im], 1, 1) - @test cor([x y]) ≈ [1.0 0.14032104449218274-0.35160772008699703im - 0.14032104449218274+0.35160772008699703im 1.0] -end - -@testset "Issue #17153 and PR #17154" begin - a = rand(10,10) - b = copy(a) - x = median(a, dims=1) - @test b == a - x = median(a, dims=2) - @test b == a - x = mean(a, dims=1) - @test b == a - x = mean(a, dims=2) - @test b == a - x = var(a, dims=1) - @test b == a - x = var(a, dims=2) - @test b == a - x = std(a, dims=1) - @test b == a - x = std(a, dims=2) - @test b == a -end - -# dimensional correctness -const BASE_TEST_PATH = joinpath(Sys.BINDIR, "..", "share", "julia", "test") -isdefined(Main, :Furlongs) || @eval Main include(joinpath($(BASE_TEST_PATH), "testhelpers", "Furlongs.jl")) -using .Main.Furlongs - -Statistics.middle(x::Furlong{p}) where {p} = Furlong{p}(middle(x.val)) -Statistics.middle(x::Furlong{p}, y::Furlong{p}) where {p} = Furlong{p}(middle(x.val, y.val)) - -@testset "Unitful elements" begin - r = Furlong(1):Furlong(1):Furlong(2) - a = Vector(r) - @test sum(r) == sum(a) == Furlong(3) - @test cumsum(r) == Furlong.([1,3]) - @test mean(r) == mean(a) == median(a) == median(r) == Furlong(1.5) - @test var(r) == var(a) == Furlong{2}(0.5) - @test std(r) == std(a) == Furlong{1}(sqrt(0.5)) - - # Issue #21786 - A = [Furlong{1}(rand(-5:5)) for i in 1:2, j in 1:2] - @test mean(mean(A, dims=1), dims=2)[1] === mean(A) - @test var(A, dims=1)[1] === var(A[:, 1]) - @test std(A, dims=1)[1] === std(A[:, 1]) -end - -# Issue #22901 -@testset "var and quantile of Any arrays" begin - x = Any[1, 2, 4, 10] - y = Any[1, 2, 4, 10//1] - @test var(x) === 16.25 - @test var(y) === 65//4 - @test std(x) === sqrt(16.25) - @test quantile(x, 0.5) === 3.0 - @test quantile(x, 1//2) === 3//1 -end - -@testset "Promotion in covzm. Issue #8080" begin - A = [1 -1 -1; -1 1 1; -1 1 -1; 1 -1 -1; 1 -1 1] - @test Statistics.covzm(A) - mean(A, dims=1)'*mean(A, dims=1)*size(A, 1)/(size(A, 1) - 1) ≈ cov(A) - A = [1//1 -1 -1; -1 1 1; -1 1 -1; 1 -1 -1; 1 -1 1] - @test (A'A - size(A, 1)*mean(A, dims=1)'*mean(A, dims=1))/4 == cov(A) -end - -@testset "Mean along dimension of empty array" begin - a0 = zeros(0) - a00 = zeros(0, 0) - a01 = zeros(0, 1) - a10 = zeros(1, 0) - @test isequal(mean(a0, dims=1) , fill(NaN, 1)) - @test isequal(mean(a00, dims=(1, 2)), fill(NaN, 1, 1)) - @test isequal(mean(a01, dims=1) , fill(NaN, 1, 1)) - @test isequal(mean(a10, dims=2) , fill(NaN, 1, 1)) -end - -@testset "cov/var/std of Vector{Vector}" begin - x = [[2,4,6],[4,6,8]] - @test var(x) ≈ vec(var([x[1] x[2]], dims=2)) - @test std(x) ≈ vec(std([x[1] x[2]], dims=2)) - @test cov(x) ≈ cov([x[1] x[2]], dims=2) -end - -@testset "var of sparse array" begin - se33 = SparseMatrixCSC{Float64}(I, 3, 3) - sA = sprandn(3, 7, 0.5) - pA = sparse(rand(3, 7)) - - for arr in (se33, sA, pA) - farr = Array(arr) - @test var(arr) ≈ var(farr) - @test var(arr, dims=1) ≈ var(farr, dims=1) - @test var(arr, dims=2) ≈ var(farr, dims=2) - @test var(arr, dims=(1, 2)) ≈ [var(farr)] - @test isequal(var(arr, dims=3), var(farr, dims=3)) - end - - @testset "empty cases" begin - @test var(sparse(Int[])) === NaN - @test isequal(var(spzeros(0, 1), dims=1), var(Matrix{Int}(I, 0, 1), dims=1)) - @test isequal(var(spzeros(0, 1), dims=2), var(Matrix{Int}(I, 0, 1), dims=2)) - @test isequal(var(spzeros(0, 1), dims=(1, 2)), var(Matrix{Int}(I, 0, 1), dims=(1, 2))) - @test isequal(var(spzeros(0, 1), dims=3), var(Matrix{Int}(I, 0, 1), dims=3)) - end -end - -# Faster covariance function for sparse matrices -# Prevents densifying the input matrix when subtracting the mean -# Test against dense implementation -# PR https://github.com/JuliaLang/julia/pull/22735 -# Part of this test needed to be hacked due to the treatment -# of Inf in sparse matrix algebra -# https://github.com/JuliaLang/julia/issues/22921 -# The issue will be resolved in -# https://github.com/JuliaLang/julia/issues/22733 -@testset "optimizing sparse $elty covariance" for elty in (Float64, Complex{Float64}) - n = 10 - p = 5 - np2 = div(n*p, 2) - nzvals, x_sparse = guardseed(1) do - if elty <: Real - nzvals = randn(np2) - else - nzvals = complex.(randn(np2), randn(np2)) - end - nzvals, sparse(rand(1:n, np2), rand(1:p, np2), nzvals, n, p) - end - x_dense = convert(Matrix{elty}, x_sparse) - @testset "Test with no Infs and NaNs, vardim=$vardim, corrected=$corrected" for vardim in (1, 2), - corrected in (true, false) - @test cov(x_sparse, dims=vardim, corrected=corrected) ≈ - cov(x_dense , dims=vardim, corrected=corrected) - end - - @testset "Test with $x11, vardim=$vardim, corrected=$corrected" for x11 in (NaN, Inf), - vardim in (1, 2), - corrected in (true, false) - x_sparse[1,1] = x11 - x_dense[1 ,1] = x11 - - cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected) - cov_dense = cov(x_dense , dims=vardim, corrected=corrected) - @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end] - @test isfinite.(cov_sparse) == isfinite.(cov_dense) - @test isfinite.(cov_sparse) == isfinite.(cov_dense) - end - - @testset "Test with NaN and Inf, vardim=$vardim, corrected=$corrected" for vardim in (1, 2), - corrected in (true, false) - x_sparse[1,1] = Inf - x_dense[1 ,1] = Inf - x_sparse[2,1] = NaN - x_dense[2 ,1] = NaN - - cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected) - cov_dense = cov(x_dense , dims=vardim, corrected=corrected) - @test cov_sparse[(1 + vardim):end, (1 + vardim):end] ≈ - cov_dense[ (1 + vardim):end, (1 + vardim):end] - @test isfinite.(cov_sparse) == isfinite.(cov_dense) - @test isfinite.(cov_sparse) == isfinite.(cov_dense) - end -end