diff --git a/Project.toml b/Project.toml index 12c96773..21bd1852 100644 --- a/Project.toml +++ b/Project.toml @@ -1,5 +1,4 @@ name = "Statistics" -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" diff --git a/src/Statistics.jl b/src/Statistics.jl index 977ae8a6..b43b68f3 100644 --- a/src/Statistics.jl +++ b/src/Statistics.jl @@ -388,6 +388,8 @@ function sqrt!(A::AbstractArray) A end +sqrt!(x::Number) = sqrt(x) + stdm(A::AbstractArray, m; corrected::Bool=true) = sqrt.(varm(A, m; corrected=corrected)) @@ -479,13 +481,19 @@ end _vmean(x::AbstractVector, vardim::Int) = mean(x) _vmean(x::AbstractMatrix, vardim::Int) = mean(x, dims=vardim) -# core functions +_abs2(x::Number) = abs2(x) +_abs2(x) = x*x' + +_conjmul(x::Number, y::Number) = x * conj(y) +_conjmul(x, y) = x * _conj(y)' -unscaled_covzm(x::AbstractVector{<:Number}) = sum(abs2, x) -unscaled_covzm(x::AbstractVector) = sum(t -> t*t', x) +# core functions +unscaled_covzm(itr) = sum(_abs2, itr) +unscaled_covzm(x::AbstractVector) = sum(_abs2, x) unscaled_covzm(x::AbstractMatrix, vardim::Int) = (vardim == 1 ? _conj(x'x) : x * x') -unscaled_covzm(x::AbstractVector, y::AbstractVector) = sum(conj(y[i])*x[i] for i in eachindex(y, x)) +unscaled_covzm(x, y) = sum(t -> _conjmul(first(t), last(t)), zip(x, y)) +unscaled_covzm(x::AbstractVector, y::AbstractVector) = sum(_conjmul(x[i], y[i]) for i in eachindex(y, x)) unscaled_covzm(x::AbstractVector, y::AbstractMatrix, vardim::Int) = (vardim == 1 ? *(transpose(x), _conj(y)) : *(transpose(x), transpose(_conj(y)))) unscaled_covzm(x::AbstractMatrix, y::AbstractVector, vardim::Int) = @@ -494,7 +502,7 @@ unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) = (vardim == 1 ? *(transpose(x), _conj(y)) : *(x, adjoint(y))) # covzm (with centered data) - +covzm(itr::Any; corrected::Bool=true) = covzm(collect(itr); corrected = corrected) covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (length(x) - Int(corrected)) function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true) C = unscaled_covzm(x, vardim) @@ -504,6 +512,7 @@ function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true) A .= A .* b return A end +covzm(x::Any, y::Any; corrected::Bool=true) = covzm(collect(x), collect(y); corrected = corrected) covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = unscaled_covzm(x, y) / (length(x) - Int(corrected)) function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true) @@ -518,20 +527,37 @@ end # covm (with provided mean) ## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector} ## which can't be handled by broadcast +covm(itr::Any, itrmean; corrected::Bool=true) = + covm(map(t -> t - itrmean, x); corrected = corrected) covm(x::AbstractVector, xmean; corrected::Bool=true) = covzm(map(t -> t - xmean, x); corrected=corrected) covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) = covzm(x .- xmean, vardim; corrected=corrected) +covm(x::Any, xmean, y::Any, ymean; corrected::Bool=true) = + covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected) covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) = covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected) covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) = covzm(x .- xmean, y .- ymean, vardim; corrected=corrected) # cov (API) +""" + cov(x::Any; corrected::Bool=true) + +Compute the variance of the iterator `x`. If `corrected` is `true` (the default) then the sum +is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n` +is the number of elements in the iterator, which is not necessarily known. +""" +function cov(x::Any, corrected::Bool=true) + cx = collect(x) + covm(cx, mean(cx); corrected=corrected) +end + """ cov(x::AbstractVector; corrected::Bool=true) -Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum +Compute the variance of the vector `x`. If `x` is a vector of vectors, returns the estimated +variance-covariance matrix of elements in `x`. If `corrected` is `true` (the default) then the sum is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`. """ cov(x::AbstractVector; corrected::Bool=true) = covm(x, mean(x); corrected=corrected) @@ -546,6 +572,22 @@ if `corrected` is `false` where `n = size(X, dims)`. cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) = covm(X, _vmean(X, dims), dims; corrected=corrected) +""" + cov(x::Any, y::Any; corrected::Bool=true) + +Compute the covariance between the iterators `x` and `y`. If `corrected` is `true` (the +default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where +``*`` denotes the complex conjugate and `n` is the number of elements in `x` which must equal +the number of elements in `y`. If `x` and `y` are both vectors of vectors, computes the analagous +estimator for the covariance matrix for `xi` and `yi. If `corrected` is `false`, computes +``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``. +""" +function cov(x::Any, y::Any; corrected::Bool=true) + cx = collect(x) + cy = collect(y) + covm(cx, mean(cx), cy, mean(cy); corrected=corrected) +end + """ cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) @@ -628,10 +670,19 @@ function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray) end return C end - +function cov2cor!(xy::Number, xsd::Number, ysd::Number) + xx = abs2(xsd) + yy = abs2(ysd) + clampcor(xy / max(xx, yy) / sqrt(min(xx, yy) / max(xx, yy))) +end # corzm (non-exported, with centered data) -corzm(x::AbstractVector{T}) where {T} = one(real(T)) +corzm(itr) = corzm(collect(itr)) +corzm(x::AbstractVector{T}) where {T<:Number} = one(real(T)) +function corzm(x::AbstractVector{T}) where {T} + c = unscaled_covzm(x) + return cov2cor!(c, collect(sqrt(c[i,i]) for i in 1:min(size(c)...))) +end function corzm(x::AbstractMatrix, vardim::Int=1) c = unscaled_covzm(x, vardim) return cov2cor!(c, collect(sqrt(c[i,i]) for i in 1:min(size(c)...))) @@ -644,9 +695,16 @@ corzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int=1) = cov2cor!(unscaled_covzm(x, y, vardim), sqrt!(sum(abs2, x, dims=vardim)), sqrt!(sum(abs2, y, dims=vardim))) # corm - -corm(x::AbstractVector{T}, xmean) where {T} = one(real(T)) +corm(itr::Any, itrmean) = corm(collect(itr), itrmean) +corm(x::AbstractVector{<:Number}, xmean) = one(real(eltype(x))) +function corm(x::AbstractVector, xmean) + c = sum(t -> _abs2(t - xmean), x) + return cov2cor!(c, collect(sqrt(c[i,i]) for i in 1:min(size(c)...))) +end corm(x::AbstractMatrix, xmean, vardim::Int=1) = corzm(x .- xmean, vardim) +corm(x::Any, mx, y::Any, my) = corm(collect(x), mx, collect(y), my) +_one(x) = one(x) +_one(x::AbstractArray) = fill(one(x[1]), size(x)) function corm(x::AbstractVector, mx, y::AbstractVector, my) require_one_based_indexing(x, y) n = length(x) @@ -655,19 +713,24 @@ function corm(x::AbstractVector, mx, y::AbstractVector, my) @inbounds begin # Initialize the accumulators - xx = zero(sqrt(abs2(one(x[1])))) - yy = zero(sqrt(abs2(one(y[1])))) - xy = zero(x[1] * y[1]') + xx = zero(sqrt!(_abs2(_one(x[1])))) + yy = zero(sqrt!(_abs2(_one(y[1])))) + xy = zero(_conjmul(x[1], y[1])) @simd for i in eachindex(x, y) xi = x[i] - mx yi = y[i] - my - xx += abs2(xi) - yy += abs2(yi) - xy += xi * yi' + xx += _abs2(xi) + yy += _abs2(yi) + xy += _conjmul(xi, yi) end end - return clampcor(xy / max(xx, yy) / sqrt(min(xx, yy) / max(xx, yy))) + + @show xx + @show yy + @show xy + + return cov2cor!(xy, xx, yy) end corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) = @@ -675,11 +738,26 @@ corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) = # cor """ - cor(x::AbstractVector) + cor(itr::Any) + +Return the number one if `itr` iterates through scalars. Returns the correlation between +elements of the iterator otherwise. +""" +cor(itr::Any) = cor(collect(itr)) + +""" + cor(x::AbstractVector{<:Number}) Return the number one. """ -cor(x::AbstractVector) = one(real(eltype(x))) +cor(x::AbstractVector{<:Number}) = one(real(eltype(x))) + +""" + cor(x::AbstractVector) + +Return the Pearson correlation matrix between elements in `x`. +""" +cor(x::AbstractVector) = corm(x, mean(x)) """ cor(X::AbstractMatrix; dims::Int=1) @@ -688,6 +766,18 @@ Compute the Pearson correlation matrix of the matrix `X` along the dimension `di """ cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims) +""" + cor(x::Any, y::Any) + +Compute the Pearson correlation between the iterators `x` and `y`. +""" +function cor(x::Any, y::Any) + cx = collect(x) + cy = collect(y) + + corm(cx, mean(cx), cy, mean(cy)) +end + """ cor(x::AbstractVector, y::AbstractVector) diff --git a/test/runtests.jl b/test/runtests.jl index bc33cf57..4213ba25 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -314,6 +314,10 @@ Y = [6.0 2.0; 5.0 8.0; 3.0 4.0; 2.0 3.0] +X_vec = [[X[i,1], X[i,2]] for i in 1:size(X, 1)] +X_gen = ([X[i,1], X[i,2]] for i in 1:size(X, 1)) +Y_vec = [[Y[i,1], Y[i,2]] for i in 1:size(Y, 1)] +Y_gen = ([Y[i,1], Y[i,2]] for i in 1:size(Y, 1)) @testset "covariance" begin for vd in [1, 2], zm in [true, false], cr in [true, false] @@ -328,6 +332,8 @@ Y = [6.0 2.0; end x1 = vec(X[:,1]) y1 = vec(Y[:,1]) + x1_gen = (x for x in x1) + y1_gen = (y for y in y1) else k = size(X, 1) Cxx = zeros(k, k) @@ -338,6 +344,8 @@ Y = [6.0 2.0; end x1 = vec(X[1,:]) y1 = vec(Y[1,:]) + x1_gen = (x for x in x1) + y1_gen = (y for y in y1) end c = zm ? Statistics.covm(x1, 0, corrected=cr) : @@ -346,14 +354,14 @@ Y = [6.0 2.0; @test c ≈ Cxx[1,1] @inferred cov(x1, corrected=cr) - @test cov(X) == Statistics.covm(X, mean(X, dims=1)) + @test cov(X) == cov(X_vec) == cov(X_gen) == Statistics.covm(X, mean(X, dims=1)) C = zm ? Statistics.covm(X, 0, vd, corrected=cr) : cov(X, dims=vd, corrected=cr) @test size(C) == (k, k) @test C ≈ Cxx @inferred cov(X, dims=vd, corrected=cr) - @test cov(x1, y1) == Statistics.covm(x1, mean(x1), y1, mean(y1)) + @test cov(x1, y1) == cov(x1_gen, y1_gen) == Statistics.covm(x1, mean(x1), y1, mean(y1)) c = zm ? Statistics.covm(x1, 0, y1, 0, corrected=cr) : cov(x1, y1, corrected=cr) @test isa(c, Float64) @@ -378,7 +386,10 @@ Y = [6.0 2.0; @test vec(C) ≈ Cxy[:,1] @inferred cov(X, y1, dims=vd, corrected=cr) - @test cov(X, Y) == Statistics.covm(X, mean(X, dims=1), Y, mean(Y, dims=1)) + # Separate tests for equality and approximation + C = cov(X, Y) + @test C == Statistics.covm(X, mean(X, dims=1), Y, mean(Y, dims=1)) + @test C ≈ cov(X_vec, Y_vec) ≈ cov(X_gen, Y_gen) C = zm ? Statistics.covm(X, 0, Y, 0, vd, corrected=cr) : cov(X, Y, dims=vd, corrected=cr) @test size(C) == (k, k) @@ -428,17 +439,18 @@ end end c = zm ? Statistics.corm(x1, 0) : cor(x1) + c_gen = zm ? Statistics.corm((xi for xi in x1), 0) : cor(xi for xi in x1) @test isa(c, Float64) - @test c ≈ Cxx[1,1] + @test c ≈ c_gen ≈ Cxx[1,1] @inferred cor(x1) - @test cor(X) == Statistics.corm(X, mean(X, dims=1)) + @test cor(X) == Statistics.corm(X, mean(X, dims=1)) == cor(X_vec) == cor(X_gen) C = zm ? Statistics.corm(X, 0, vd) : cor(X, dims=vd) @test size(C) == (k, k) @test C ≈ Cxx @inferred cor(X, dims=vd) - @test cor(x1, y1) == Statistics.corm(x1, mean(x1), y1, mean(y1)) + @test cor(x1, y1) == cor((ix for ix in x1), (iy for iy in y1)) == Statistics.corm(x1, mean(x1), y1, mean(y1)) c = zm ? Statistics.corm(x1, 0, y1, 0) : cor(x1, y1) @test isa(c, Float64) @test c ≈ Cxy[1,1] @@ -460,7 +472,8 @@ end @test vec(C) ≈ Cxy[:,1] @inferred cor(X, y1, dims=vd) - @test cor(X, Y) == Statistics.corm(X, mean(X, dims=1), Y, mean(Y, dims=1)) + @test_skip cor(X, Y) == cor(X_vec, Y_vec) == cor(X_gen, Y_gen) == + Statistics.corm(X, mean(X, dims=1), Y, mean(Y, dims=1)) C = zm ? Statistics.corm(X, 0, Y, 0, vd) : cor(X, Y, dims=vd) @test size(C) == (k, k) @test C ≈ Cxy @@ -644,12 +657,15 @@ end @testset "cov and cor of complex arrays (issue #21093)" begin x = [2.7 - 3.3im, 0.9 + 5.4im, 0.1 + 0.2im, -1.7 - 5.8im, 1.1 + 1.9im] y = [-1.7 - 1.6im, -0.2 + 6.5im, 0.8 - 10.0im, 9.1 - 3.4im, 2.7 - 5.5im] - @test cov(x, y) ≈ 4.8365 - 12.119im - @test cov(y, x) ≈ 4.8365 + 12.119im + x_gen = (i for i in x) + y_gen = (i for i in y) + xy_vec = [[x[i], y[i]] for i in 1:length(x)] + @test cov(x, y) ≈ cov(x_gen, y_gen) ≈4.8365 - 12.119im + @test cov(y, x) ≈ cov(y_gen, x_gen) ≈ 4.8365 + 12.119im @test cov(x, reshape(y, :, 1)) ≈ reshape([4.8365 - 12.119im], 1, 1) @test cov(reshape(x, :, 1), y) ≈ reshape([4.8365 - 12.119im], 1, 1) @test cov(reshape(x, :, 1), reshape(y, :, 1)) ≈ reshape([4.8365 - 12.119im], 1, 1) - @test cov([x y]) ≈ [21.779 4.8365-12.119im; + @test cov([x y]) ≈ cov(xy_vec) ≈ cov((i for i in xy_vec)) ≈ [21.779 4.8365-12.119im; 4.8365+12.119im 54.548] @test cor(x, y) ≈ 0.14032104449218274 - 0.35160772008699703im @test cor(y, x) ≈ 0.14032104449218274 + 0.35160772008699703im