From 91a8de2ad28450e89f8e1f71d0c7af0cae290aa7 Mon Sep 17 00:00:00 2001 From: Fredrik Ekre Date: Wed, 27 Jun 2018 11:03:26 +0200 Subject: [PATCH] move base/statistics.jl to Statistics stdlib fix #27374 --- base/deprecated.jl | 19 +- base/exports.jl | 15 -- base/range.jl | 7 - base/sysimg.jl | 19 +- doc/src/base/math.md | 18 -- doc/src/manual/interfaces.md | 11 +- doc/src/manual/missing.md | 2 +- stdlib/LinearAlgebra/docs/src/index.md | 1 - stdlib/LinearAlgebra/src/LinearAlgebra.jl | 1 - stdlib/LinearAlgebra/src/generic.jl | 40 ---- stdlib/LinearAlgebra/test/generic.jl | 47 ----- stdlib/Markdown/test/runtests.jl | 10 +- stdlib/Pkg/src/GraphType.jl | 2 +- stdlib/REPL/src/docview.jl | 2 +- stdlib/SparseArrays/src/sparsematrix.jl | 61 ------ stdlib/SparseArrays/test/sparse.jl | 64 +------ stdlib/Statistics/Project.toml | 6 + stdlib/Statistics/docs/src/index.md | 27 +++ .../Statistics/src/Statistics.jl | 181 +++++++++++++++--- .../Statistics/test/runtests.jl | 159 ++++++++++++--- test/choosetests.jl | 2 +- test/dimensionful.jl | 7 +- test/docs.jl | 1 - test/error.jl | 4 +- test/offsetarray.jl | 1 + test/precompile.jl | 3 +- test/ranges.jl | 8 - test/reducedim.jl | 6 - 28 files changed, 366 insertions(+), 358 deletions(-) create mode 100644 stdlib/Statistics/Project.toml create mode 100644 stdlib/Statistics/docs/src/index.md rename base/statistics.jl => stdlib/Statistics/src/Statistics.jl (80%) rename test/statistics.jl => stdlib/Statistics/test/runtests.jl (71%) diff --git a/base/deprecated.jl b/base/deprecated.jl index 22608b826528a..2a7fd0ac3726d 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -293,12 +293,6 @@ deprecate(Base, :DSP, 2) using .DSP export conv, conv2, deconv, filt, filt!, xcorr -# PR #21709 -@deprecate cov(x::AbstractVector, corrected::Bool) cov(x, corrected=corrected) -@deprecate cov(x::AbstractMatrix, vardim::Int, corrected::Bool) cov(x, dims=vardim, corrected=corrected) -@deprecate cov(X::AbstractVector, Y::AbstractVector, corrected::Bool) cov(X, Y, corrected=corrected) -@deprecate cov(X::AbstractVecOrMat, Y::AbstractVecOrMat, vardim::Int, corrected::Bool) cov(X, Y, dims=vardim, corrected=corrected) - # PR #22325 # TODO: when this replace is removed from deprecated.jl: # 1) rename the function replace_new from strings/util.jl to replace @@ -1372,16 +1366,6 @@ export readandwrite @deprecate findmax(A::AbstractArray, dims) findmax(A, dims=dims) @deprecate findmin(A::AbstractArray, dims) findmin(A, dims=dims) -@deprecate mean(A::AbstractArray, dims) mean(A, dims=dims) -@deprecate varm(A::AbstractArray, m::AbstractArray, dims; kwargs...) varm(A, m; kwargs..., dims=dims) -@deprecate var(A::AbstractArray, dims; kwargs...) var(A; kwargs..., dims=dims) -@deprecate std(A::AbstractArray, dims; kwargs...) std(A; kwargs..., dims=dims) -@deprecate cov(X::AbstractMatrix, dim::Int; kwargs...) cov(X; kwargs..., dims=dim) -@deprecate cov(x::AbstractVecOrMat, y::AbstractVecOrMat, dim::Int; kwargs...) cov(x, y; kwargs..., dims=dim) -@deprecate cor(X::AbstractMatrix, dim::Int) cor(X, dims=dim) -@deprecate cor(x::AbstractVecOrMat, y::AbstractVecOrMat, dim::Int) cor(x, y, dims=dim) -@deprecate median(A::AbstractArray, dims; kwargs...) 
median(A; kwargs..., dims=dims) - @deprecate mapreducedim(f, op, A::AbstractArray, dims) mapreduce(f, op, A, dims=dims) @deprecate mapreducedim(f, op, A::AbstractArray, dims, v0) mapreduce(f, op, v0, A, dims=dims) @deprecate reducedim(op, A::AbstractArray, dims) reduce(op, A, dims=dims) @@ -1704,6 +1688,9 @@ end # PR #25168 @deprecate ipermute!(a, p::AbstractVector) invpermute!(a, p) +# #27140, #27152 +@deprecate_moved linreg "StatsBase" + # ?? more special functions to SpecialFunctions.jl @deprecate_moved gamma "SpecialFunctions" @deprecate_moved lgamma "SpecialFunctions" diff --git a/base/exports.jl b/base/exports.jl index cdbca8df0f6f5..d87889eb047df 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -626,21 +626,6 @@ export get_zero_subnormals, set_zero_subnormals, -# statistics - cor, - cov, - mean!, - mean, - median!, - median, - middle, - quantile!, - quantile, - std, - stdm, - var, - varm, - # iteration done, next, diff --git a/base/range.jl b/base/range.jl index 1aaa5c64235f8..747148a17eba4 100644 --- a/base/range.jl +++ b/base/range.jl @@ -847,13 +847,6 @@ function sum(r::AbstractRange{<:Real}) : (step(r) * l) * ((l-1)>>1)) end -function mean(r::AbstractRange{<:Real}) - isempty(r) && throw(ArgumentError("mean of an empty range is undefined")) - (first(r) + last(r)) / 2 -end - -median(r::AbstractRange{<:Real}) = mean(r) - function _in_range(x, r::AbstractRange) if step(r) == 0 return !isempty(r) && first(r) == x diff --git a/base/sysimg.jl b/base/sysimg.jl index 65056a7108f2d..bd5f2ff9d515f 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -413,9 +413,6 @@ using .StackTraces include("initdefs.jl") -# statistics -include("statistics.jl") - # worker threads include("threadcall.jl") @@ -525,6 +522,7 @@ let :Pkg, :Test, :REPL, + :Statistics, ] maxlen = maximum(textwidth.(string.(stdlibs))) @@ -746,7 +744,6 @@ end # @deprecate_stdlib kron LinearAlgebra true @deprecate_stdlib ldltfact LinearAlgebra true @deprecate_stdlib ldltfact! LinearAlgebra true - @deprecate_stdlib linreg LinearAlgebra true @deprecate_stdlib logabsdet LinearAlgebra true @deprecate_stdlib logdet LinearAlgebra true @deprecate_stdlib lu LinearAlgebra true @@ -896,6 +893,20 @@ end @deprecate_stdlib send Sockets true @deprecate_stdlib TCPSocket Sockets true @deprecate_stdlib UDPSocket Sockets true + + @deprecate_stdlib cor Statistics true + @deprecate_stdlib cov Statistics true + @deprecate_stdlib std Statistics true + @deprecate_stdlib stdm Statistics true + @deprecate_stdlib var Statistics true + @deprecate_stdlib varm Statistics true + @deprecate_stdlib mean! Statistics true + @deprecate_stdlib mean Statistics true + @deprecate_stdlib median! Statistics true + @deprecate_stdlib median Statistics true + @deprecate_stdlib middle Statistics true + @deprecate_stdlib quantile! Statistics true + @deprecate_stdlib quantile Statistics true end end diff --git a/doc/src/base/math.md b/doc/src/base/math.md index eecbdb808c980..0fa973df3c828 100644 --- a/doc/src/base/math.md +++ b/doc/src/base/math.md @@ -177,21 +177,3 @@ Base.widemul Base.Math.@evalpoly Base.FastMath.@fastmath ``` - -## Statistics - -```@docs -Base.mean -Base.mean! -Base.std -Base.stdm -Base.var -Base.varm -Base.middle -Base.median -Base.median! -Base.quantile -Base.quantile! 
-Base.cov -Base.cor -``` diff --git a/doc/src/manual/interfaces.md b/doc/src/manual/interfaces.md index 176b45a0bb5b6..618f55f79e583 100644 --- a/doc/src/manual/interfaces.md +++ b/doc/src/manual/interfaces.md @@ -65,7 +65,6 @@ julia> struct Squares end julia> Base.iterate(S::Squares, state=1) = state > S.count ? nothing : (state*state, state+1) - ``` With only [`iterate`](@ref) definition, the `Squares` type is already pretty powerful. @@ -84,12 +83,16 @@ julia> for i in Squares(7) 49 ``` -We can use many of the builtin methods that work with iterables, like [`in`](@ref), [`mean`](@ref) and [`std`](@ref): +We can use many of the builtin methods that work with iterables, +like [`in`](@ref), or [`mean`](@ref) and [`std`](@ref) from the +`Statistics` standard library module: ```jldoctest squaretype julia> 25 in Squares(10) true +julia> using Statistics + julia> mean(Squares(100)) 3383.5 @@ -388,8 +391,8 @@ julia> A[SquaresVector(3)] 4.0 9.0 -julia> mean(A) -5.0 +julia> sum(A) +45.0 ``` If you are defining an array type that allows non-traditional indexing (indices that start at diff --git a/doc/src/manual/missing.md b/doc/src/manual/missing.md index 4467f587da162..67b349f57a157 100644 --- a/doc/src/manual/missing.md +++ b/doc/src/manual/missing.md @@ -294,7 +294,7 @@ julia> sum(skipmissing([1, missing])) This convenience function returns an iterator which filters out `missing` values efficiently. It can therefore be used with any function which supports iterators -```jldoctest +```jldoctest; setup = :(using Statistics) julia> maximum(skipmissing([3, missing, 2, 1])) 3 diff --git a/stdlib/LinearAlgebra/docs/src/index.md b/stdlib/LinearAlgebra/docs/src/index.md index 415d3d60cd1d7..6e7d122da83cc 100644 --- a/stdlib/LinearAlgebra/docs/src/index.md +++ b/stdlib/LinearAlgebra/docs/src/index.md @@ -370,7 +370,6 @@ Base.inv(::AbstractMatrix) LinearAlgebra.pinv LinearAlgebra.nullspace Base.kron -LinearAlgebra.linreg LinearAlgebra.exp(::StridedMatrix{<:LinearAlgebra.BlasFloat}) LinearAlgebra.log(::StridedMatrix) LinearAlgebra.sqrt(::StridedMatrix{<:Real}) diff --git a/stdlib/LinearAlgebra/src/LinearAlgebra.jl b/stdlib/LinearAlgebra/src/LinearAlgebra.jl index c0af08ebd9816..d841f2fc8f2e5 100644 --- a/stdlib/LinearAlgebra/src/LinearAlgebra.jl +++ b/stdlib/LinearAlgebra/src/LinearAlgebra.jl @@ -103,7 +103,6 @@ export ldiv!, ldlt!, ldlt, - linreg, logabsdet, logdet, lowrankdowndate, diff --git a/stdlib/LinearAlgebra/src/generic.jl b/stdlib/LinearAlgebra/src/generic.jl index cd2c92a22a549..d0dc58607597a 100644 --- a/stdlib/LinearAlgebra/src/generic.jl +++ b/stdlib/LinearAlgebra/src/generic.jl @@ -1143,46 +1143,6 @@ isdiag(A::AbstractMatrix) = isbanded(A, 0, 0) isdiag(x::Number) = true -""" - linreg(x, y) - -Perform simple linear regression using Ordinary Least Squares. Returns `a` and `b` such -that `a + b*x` is the closest straight line to the given points `(x, y)`, i.e., such that -the squared error between `y` and `a + b*x` is minimized. - -# Examples -```julia -using PyPlot -x = 1.0:12.0 -y = [5.5, 6.3, 7.6, 8.8, 10.9, 11.79, 13.48, 15.02, 17.77, 20.81, 22.0, 22.99] -a, b = linreg(x, y) # Linear regression -plot(x, y, "o") # Plot (x, y) points -plot(x, a + b*x) # Plot line determined by linear regression -``` - -See also: - -`\\`, [`cov`](@ref), [`std`](@ref), [`mean`](@ref). 
- -""" -function linreg(x::AbstractVector, y::AbstractVector) - # Least squares given - # Y = a + b*X - # where - # b = cov(X, Y)/var(X) - # a = mean(Y) - b*mean(X) - if size(x) != size(y) - throw(DimensionMismatch("x has size $(size(x)) and y has size $(size(y)), " * - "but these must be the same size")) - end - mx = mean(x) - my = mean(y) - # don't need to worry about the scaling (n vs n - 1) since they cancel in the ratio - b = Base.covm(x, mx, y, my)/Base.varm(x, mx) - a = my - b*mx - return (a, b) -end - # BLAS-like in-place y = x*α+y function (see also the version in blas.jl # for BlasFloat Arrays) function axpy!(α, x::AbstractArray, y::AbstractArray) diff --git a/stdlib/LinearAlgebra/test/generic.jl b/stdlib/LinearAlgebra/test/generic.jl index 3009d4ce8ab4b..6675e025a7912 100644 --- a/stdlib/LinearAlgebra/test/generic.jl +++ b/stdlib/LinearAlgebra/test/generic.jl @@ -89,53 +89,6 @@ n = 5 # should be odd end end -@testset "linrange" begin - # make sure unequal input arrays throw an error - x = [2; 5; 6] - y = [3; 7; 10; 10] - @test_throws DimensionMismatch linreg(x, y) - x = [2 5 6] - y = [3; 7; 10] - @test_throws MethodError linreg(x, y) - - # check (UnitRange, Array) - x = 1:12 - y = [5.5; 6.3; 7.6; 8.8; 10.9; 11.79; 13.48; 15.02; 17.77; 20.81; 22.0; 22.99] - @test [linreg(x,y)...] ≈ [2.5559090909090867, 1.6960139860139862] - @test [linreg(view(x,1:6),view(y,1:6))...] ≈ [3.8366666666666642,1.3271428571428574] - - # check (LinRange, UnitRange) - x = range(1.0, stop=12.0, length=100) - y = -100:-1 - @test [linreg(x, y)...] ≈ [-109.0, 9.0] - - # check (UnitRange, UnitRange) - x = 1:12 - y = 12:-1:1 - @test [linreg(x, y)...] ≈ [13.0, -1.0] - - # check (LinRange, LinRange) - x = range(-5, stop=10, length=100) - y = range(50, stop=200, length=100) - @test [linreg(x, y)...] ≈ [100.0, 10.0] - - # check (Array, Array) - # Anscombe's quartet (https://en.wikipedia.org/wiki/Anscombe%27s_quartet) - x123 = [10.0; 8.0; 13.0; 9.0; 11.0; 14.0; 6.0; 4.0; 12.0; 7.0; 5.0] - y1 = [8.04; 6.95; 7.58; 8.81; 8.33; 9.96; 7.24; 4.26; 10.84; 4.82; 5.68] - @test [linreg(x123,y1)...] ≈ [3.0,0.5] atol=15e-5 - - y2 = [9.14; 8.14; 8.74; 8.77; 9.26; 8.10; 6.12; 3.10; 9.13; 7.26; 4.74] - @test [linreg(x123,y2)...] ≈ [3.0,0.5] atol=10e-3 - - y3 = [7.46; 6.77; 12.74; 7.11; 7.81; 8.84; 6.08; 5.39; 8.15; 6.42; 5.73] - @test [linreg(x123,y3)...] ≈ [3.0,0.5] atol=10e-3 - - x4 = [8.0; 8.0; 8.0; 8.0; 8.0; 8.0; 8.0; 19.0; 8.0; 8.0; 8.0] - y4 = [6.58; 5.76; 7.71; 8.84; 8.47; 7.04; 5.25; 12.50; 5.56; 7.91; 6.89] - @test [linreg(x4,y4)...] ≈ [3.0,0.5] atol=10e-3 -end - @testset "diag" begin A = Matrix(1.0I, 4, 4) @test diag(A) == fill(1, 4) diff --git a/stdlib/Markdown/test/runtests.jl b/stdlib/Markdown/test/runtests.jl index 7cb5aee14d3ee..e1d03fdffaffd 100644 --- a/stdlib/Markdown/test/runtests.jl +++ b/stdlib/Markdown/test/runtests.jl @@ -427,20 +427,20 @@ end ref(x) = Reference(x) -ref(mean) +ref(sum) show(io::IO, m::MIME"text/plain", r::Reference) = print(io, "$(r.ref) (see Julia docs)") -mean_ref = md"Behaves like $(ref(mean))" -@test plain(mean_ref) == "Behaves like mean (see Julia docs)\n" -@test html(mean_ref) == "
<p>Behaves like mean &#40;see Julia docs&#41;</p>
\n" +sum_ref = md"Behaves like $(ref(sum))" +@test plain(sum_ref) == "Behaves like sum (see Julia docs)\n" +@test html(sum_ref) == "
<p>Behaves like sum &#40;see Julia docs&#41;</p>
\n" show(io::IO, m::MIME"text/html", r::Reference) = Markdown.withtag(io, :a, :href=>"test") do Markdown.htmlesc(io, Markdown.plaininline(r)) end -@test html(mean_ref) == "
<p>Behaves like <a href=\"test\">mean &#40;see Julia docs&#41;</a></p>
\n" +@test html(sum_ref) == "
<p>Behaves like <a href=\"test\">sum &#40;see Julia docs&#41;</a></p>
\n" @test md""" ````julia diff --git a/stdlib/Pkg/src/GraphType.jl b/stdlib/Pkg/src/GraphType.jl index e2af668c96406..6a879f0fa2676 100644 --- a/stdlib/Pkg/src/GraphType.jl +++ b/stdlib/Pkg/src/GraphType.jl @@ -1422,7 +1422,7 @@ function prune_graph!(graph::Graph) # Done - log_event_global!(graph, "pruned graph — stats (n. of packages, mean connectivity): before = ($np,$(mean(spp))) after = ($new_np,$(mean(new_spp)))") + log_event_global!(graph, "pruned graph — stats (n. of packages, mean connectivity): before = ($np,$(sum(spp)/length(spp))) after = ($new_np,$(sum(new_spp)/length(new_spp)))") # Replace old data with new data.pkgs = new_pkgs diff --git a/stdlib/REPL/src/docview.jl b/stdlib/REPL/src/docview.jl index f11aaef8fb26f..1b0d92691f9be 100644 --- a/stdlib/REPL/src/docview.jl +++ b/stdlib/REPL/src/docview.jl @@ -378,7 +378,7 @@ function fuzzyscore(needle, haystack) score += (acro ? 2 : 1)*length(is) # Matched characters score -= 2(length(needle)-length(is)) # Missing characters !acro && (score -= avgdistance(is)/10) # Contiguous - !isempty(is) && (score -= mean(is)/100) # Closer to beginning + !isempty(is) && (score -= sum(is)/length(is)/100) # Closer to beginning return score end diff --git a/stdlib/SparseArrays/src/sparsematrix.jl b/stdlib/SparseArrays/src/sparsematrix.jl index 6986b12cd2cf5..47fe84a7cd194 100644 --- a/stdlib/SparseArrays/src/sparsematrix.jl +++ b/stdlib/SparseArrays/src/sparsematrix.jl @@ -3489,67 +3489,6 @@ function hash(A::SparseMatrixCSC{T}, h::UInt) where T hashrun(0, length(A)-lastidx, h) # Hash zeros at end end -## Statistics - -# This is the function that does the reduction underlying var/std -function Base.centralize_sumabs2!(R::AbstractArray{S}, A::SparseMatrixCSC{Tv,Ti}, means::AbstractArray) where {S,Tv,Ti} - lsiz = Base.check_reducedims(R,A) - size(means) == size(R) || error("size of means must match size of R") - isempty(R) || fill!(R, zero(S)) - isempty(A) && return R - - colptr = A.colptr - rowval = A.rowval - nzval = A.nzval - m = size(A, 1) - n = size(A, 2) - - if size(R, 1) == size(R, 2) == 1 - # Reduction along both columns and rows - R[1, 1] = Base.centralize_sumabs2(A, means[1]) - elseif size(R, 1) == 1 - # Reduction along rows - @inbounds for col = 1:n - mu = means[col] - r = convert(S, (m-colptr[col+1]+colptr[col])*abs2(mu)) - @simd for j = colptr[col]:colptr[col+1]-1 - r += abs2(nzval[j] - mu) - end - R[1, col] = r - end - elseif size(R, 2) == 1 - # Reduction along columns - rownz = fill(convert(Ti, n), m) - @inbounds for col = 1:n - @simd for j = colptr[col]:colptr[col+1]-1 - row = rowval[j] - R[row, 1] += abs2(nzval[j] - means[row]) - rownz[row] -= 1 - end - end - for i = 1:m - R[i, 1] += rownz[i]*abs2(means[i]) - end - else - # Reduction along a dimension > 2 - @inbounds for col = 1:n - lastrow = 0 - @simd for j = colptr[col]:colptr[col+1]-1 - row = rowval[j] - for i = lastrow+1:row-1 - R[i, col] = abs2(means[i, col]) - end - R[row, col] = abs2(nzval[j] - means[row, col]) - lastrow = row - end - for i = lastrow+1:m - R[i, col] = abs2(means[i, col]) - end - end - end - return R -end - ## Uniform matrix arithmetic (+)(A::SparseMatrixCSC, J::UniformScaling) = A + sparse(J, size(A)...) 
diff --git a/stdlib/SparseArrays/test/sparse.jl b/stdlib/SparseArrays/test/sparse.jl index 46651803cc215..ad17fc1d3edb1 100644 --- a/stdlib/SparseArrays/test/sparse.jl +++ b/stdlib/SparseArrays/test/sparse.jl @@ -522,7 +522,7 @@ end pA = sparse(rand(3, 7)) for arr in (se33, sA, pA) - for f in (sum, prod, minimum, maximum, var) + for f in (sum, prod, minimum, maximum) farr = Array(arr) @test f(arr) ≈ f(farr) @test f(arr, dims=1) ≈ f(farr, dims=1) @@ -551,9 +551,8 @@ end @test prod(sparse(Int[])) === 1 @test_throws ArgumentError minimum(sparse(Int[])) @test_throws ArgumentError maximum(sparse(Int[])) - @test var(sparse(Int[])) === NaN - for f in (sum, prod, var) + for f in (sum, prod) @test isequal(f(spzeros(0, 1), dims=1), f(Matrix{Int}(I, 0, 1), dims=1)) @test isequal(f(spzeros(0, 1), dims=2), f(Matrix{Int}(I, 0, 1), dims=2)) @test isequal(f(spzeros(0, 1), dims=(1, 2)), f(Matrix{Int}(I, 0, 1), dims=(1, 2))) @@ -602,7 +601,7 @@ end @test length(sprb45) == 20 sprb45nnzs[i] = sum(sprb45)[1] end - @test 4 <= mean(sprb45nnzs) <= 16 + @test 4 <= sum(sprb45nnzs)/length(sprb45nnzs) <= 16 end @testset "issue #5853, sparse diff" begin @@ -2065,63 +2064,6 @@ end @test issymmetric(B) end -# Faster covariance function for sparse matrices -# Prevents densifying the input matrix when subtracting the mean -# Test against dense implementation -# PR https://github.com/JuliaLang/julia/pull/22735 -# Part of this test needed to be hacked due to the treatment -# of Inf in sparse matrix algebra -# https://github.com/JuliaLang/julia/issues/22921 -# The issue will be resolved in -# https://github.com/JuliaLang/julia/issues/22733 -@testset "optimizing sparse $elty covariance" for elty in (Float64, Complex{Float64}) - n = 10 - p = 5 - np2 = div(n*p, 2) - nzvals, x_sparse = guardsrand(1) do - if elty <: Real - nzvals = randn(np2) - else - nzvals = complex.(randn(np2), randn(np2)) - end - nzvals, sparse(rand(1:n, np2), rand(1:p, np2), nzvals, n, p) - end - x_dense = convert(Matrix{elty}, x_sparse) - @testset "Test with no Infs and NaNs, vardim=$vardim, corrected=$corrected" for vardim in (1, 2), - corrected in (true, false) - @test cov(x_sparse, dims=vardim, corrected=corrected) ≈ - cov(x_dense , dims=vardim, corrected=corrected) - end - - @testset "Test with $x11, vardim=$vardim, corrected=$corrected" for x11 in (NaN, Inf), - vardim in (1, 2), - corrected in (true, false) - x_sparse[1,1] = x11 - x_dense[1 ,1] = x11 - - cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected) - cov_dense = cov(x_dense , dims=vardim, corrected=corrected) - @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end] - @test isfinite.(cov_sparse) == isfinite.(cov_dense) - @test isfinite.(cov_sparse) == isfinite.(cov_dense) - end - - @testset "Test with NaN and Inf, vardim=$vardim, corrected=$corrected" for vardim in (1, 2), - corrected in (true, false) - x_sparse[1,1] = Inf - x_dense[1 ,1] = Inf - x_sparse[2,1] = NaN - x_dense[2 ,1] = NaN - - cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected) - cov_dense = cov(x_dense , dims=vardim, corrected=corrected) - @test cov_sparse[(1 + vardim):end, (1 + vardim):end] ≈ - cov_dense[ (1 + vardim):end, (1 + vardim):end] - @test isfinite.(cov_sparse) == isfinite.(cov_dense) - @test isfinite.(cov_sparse) == isfinite.(cov_dense) - end -end - @testset "similar should not alias the input sparse array" begin a = sparse(rand(3,3) .+ 0.1) b = similar(a, Float32, Int32) diff --git a/stdlib/Statistics/Project.toml b/stdlib/Statistics/Project.toml new file mode 100644 index 
0000000000000..772693d53e56e --- /dev/null +++ b/stdlib/Statistics/Project.toml @@ -0,0 +1,6 @@ +name = "Statistics" +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[deps] +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" diff --git a/stdlib/Statistics/docs/src/index.md b/stdlib/Statistics/docs/src/index.md new file mode 100644 index 0000000000000..5a684541f015f --- /dev/null +++ b/stdlib/Statistics/docs/src/index.md @@ -0,0 +1,27 @@ +# Statistics + +```@meta +DocTestSetup = :(using Statistics) +``` + +The Statistics module contains basic statistics functionality. + +```@docs +Statistics.std +Statistics.stdm +Statistics.var +Statistics.varm +Statistics.cor +Statistics.cov +Statistics.mean! +Statistics.mean +Statistics.median! +Statistics.median +Statistics.middle +Statistics.quantile! +Statistics.quantile +``` + +```@meta +DocTestSetup = nothing +``` diff --git a/base/statistics.jl b/stdlib/Statistics/src/Statistics.jl similarity index 80% rename from base/statistics.jl rename to stdlib/Statistics/src/Statistics.jl index 36698dd26ff50..be227b7db7359 100644 --- a/base/statistics.jl +++ b/stdlib/Statistics/src/Statistics.jl @@ -1,5 +1,17 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license +""" + Statistics + +Standard library module for basic statistics functionality. +""" +module Statistics + +using LinearAlgebra, SparseArrays + +export cor, cov, std, stdm, var, varm, mean!, mean, + median!, median, middle, quantile!, quantile + ##### mean ##### """ @@ -15,7 +27,7 @@ julia> mean([√1, √2, √3]) 1.3820881233139908 ``` """ -function mean(f::Callable, iterable) +function mean(f::Base.Callable, iterable) y = iterate(iterable) if y === nothing throw(ArgumentError("mean of empty collection undefined: $(repr(iterable))")) @@ -23,7 +35,7 @@ function mean(f::Callable, iterable) count = 1 value, state = y f_value = f(value) - total = reduce_first(add_sum, f_value) + total = Base.reduce_first(Base.add_sum, f_value) y = iterate(iterable, state) while y !== nothing value, state = y @@ -34,7 +46,7 @@ function mean(f::Callable, iterable) return total/count end mean(iterable) = mean(identity, iterable) -mean(f::Callable, A::AbstractArray) = sum(f, A) / _length(A) +mean(f::Base.Callable, A::AbstractArray) = sum(f, A) / Base._length(A) """ mean!(r, v) @@ -60,7 +72,7 @@ julia> mean!([1. 1.], v) """ function mean!(R::AbstractArray, A::AbstractArray) sum!(R, A; init=true) - x = max(1, _length(R)) // _length(A) + x = max(1, Base._length(R)) // Base._length(A) R .= R .* x return R end @@ -76,8 +88,15 @@ Compute the mean of whole array `v`, or optionally along the given dimensions. """ mean(A::AbstractArray; dims=:) = _mean(A, dims) -_mean(A::AbstractArray{T}, region) where {T} = mean!(reducedim_init(t -> t/2, +, A, region), A) -_mean(A::AbstractArray, ::Colon) = sum(A) / _length(A) +_mean(A::AbstractArray{T}, region) where {T} = mean!(Base.reducedim_init(t -> t/2, +, A, region), A) +_mean(A::AbstractArray, ::Colon) = sum(A) / Base._length(A) + +function mean(r::AbstractRange{<:Real}) + isempty(r) && throw(ArgumentError("mean of an empty range is undefined")) + (first(r) + last(r)) / 2 +end + +median(r::AbstractRange{<:Real}) = mean(r) ##### variances ##### @@ -136,12 +155,12 @@ centralize_sumabs2(A::AbstractArray, m, ifirst::Int, ilast::Int) = function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::AbstractArray) where S # following the implementation of _mapreducedim! 
at base/reducedim.jl - lsiz = check_reducedims(R,A) + lsiz = Base.check_reducedims(R,A) isempty(R) || fill!(R, zero(S)) isempty(A) && return R - if has_fast_linear_indexing(A) && lsiz > 16 - nslices = div(_length(A), lsiz) + if Base.has_fast_linear_indexing(A) && lsiz > 16 + nslices = div(Base._length(A), lsiz) ibase = first(LinearIndices(A))-1 for i = 1:nslices @inbounds R[i] = centralize_sumabs2(A, means[i], ibase+1, ibase+lsiz) @@ -149,10 +168,10 @@ function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::Abstr end return R end - indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually + indsAt, indsRt = Base.safe_tail(axes(A)), Base.safe_tail(axes(R)) # handle d=1 manually keep, Idefault = Broadcast.shapeindexer(indsRt) - if reducedim1(R, A) - i1 = first(indices1(R)) + if Base.reducedim1(R, A) + i1 = first(Base.indices1(R)) @inbounds for IA in CartesianIndices(indsAt) IR = Broadcast.newindex(IA, keep, Idefault) r = R[i1,IR] @@ -177,7 +196,7 @@ function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray; correcte if isempty(A) fill!(R, convert(S, NaN)) else - rn = div(_length(A), _length(R)) - Int(corrected) + rn = div(Base._length(A), Base._length(R)) - Int(corrected) centralize_sumabs2!(R, A, m) R .= R .* (1 // rn) end @@ -199,12 +218,12 @@ whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x varm(A::AbstractArray, m::AbstractArray; corrected::Bool=true, dims=:) = _varm(A, m, corrected, dims) _varm(A::AbstractArray{T}, m, corrected::Bool, region) where {T} = - varm!(reducedim_init(t -> abs2(t)/2, +, A, region), A, m; corrected=corrected) + varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, region), A, m; corrected=corrected) varm(A::AbstractArray, m; corrected::Bool=true) = _varm(A, m, corrected, :) function _varm(A::AbstractArray{T}, m, corrected::Bool, ::Colon) where T - n = _length(A) + n = Base._length(A) n == 0 && return typeof((abs2(zero(T)) + abs2(zero(T)))/2)(NaN) return centralize_sumabs2(A, m) / (n - Int(corrected)) end @@ -228,10 +247,10 @@ The mean `mean` over the region may be provided. 
var(A::AbstractArray; corrected::Bool=true, mean=nothing, dims=:) = _var(A, corrected, mean, dims) _var(A::AbstractArray, corrected::Bool, mean, dims) = - varm(A, coalesce(mean, Base.mean(A, dims=dims)); corrected=corrected, dims=dims) + varm(A, something(mean, Statistics.mean(A, dims=dims)); corrected=corrected, dims=dims) _var(A::AbstractArray, corrected::Bool, mean, ::Colon) = - real(varm(A, coalesce(mean, Base.mean(A)); corrected=corrected)) + real(varm(A, something(mean, Statistics.mean(A)); corrected=corrected)) varm(iterable, m; corrected::Bool=true) = _var(iterable, corrected, m) @@ -329,7 +348,7 @@ stdm(iterable, m; corrected::Bool=true) = _conj(x::AbstractArray{<:Real}) = x _conj(x::AbstractArray) = conj(x) -_getnobs(x::AbstractVector, vardim::Int) = _length(x) +_getnobs(x::AbstractVector, vardim::Int) = Base._length(x) _getnobs(x::AbstractMatrix, vardim::Int) = size(x, vardim) function _getnobs(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int) @@ -357,7 +376,7 @@ unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) = # covzm (with centered data) -covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (_length(x) - Int(corrected)) +covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (Base._length(x) - Int(corrected)) function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true) C = unscaled_covzm(x, vardim) T = promote_type(typeof(first(C) / 1), eltype(C)) @@ -367,7 +386,7 @@ function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true) return A end covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = - unscaled_covzm(x, y) / (_length(x) - Int(corrected)) + unscaled_covzm(x, y) / (Base._length(x) - Int(corrected)) function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true) C = unscaled_covzm(x, y, vardim) T = promote_type(typeof(first(C) / 1), eltype(C)) @@ -396,7 +415,7 @@ covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corr Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`. """ -cov(x::AbstractVector; corrected::Bool=true) = covm(x, Base.mean(x); corrected=corrected) +cov(x::AbstractVector; corrected::Bool=true) = covm(x, mean(x); corrected=corrected) """ cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) @@ -417,7 +436,7 @@ default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` `false`, computes ``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``. """ cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) = - covm(x, Base.mean(x), y, Base.mean(y); corrected=corrected) + covm(x, mean(x), y, mean(y); corrected=corrected) """ cov(X::AbstractVecOrMat, Y::AbstractVecOrMat; dims::Int=1, corrected::Bool=true) @@ -550,7 +569,7 @@ cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims) Compute the Pearson correlation between the vectors `x` and `y`. 
""" -cor(x::AbstractVector, y::AbstractVector) = corm(x, Base.mean(x), y, Base.mean(y)) +cor(x::AbstractVector, y::AbstractVector) = corm(x, mean(x), y, mean(y)) """ cor(X::AbstractVecOrMat, Y::AbstractVecOrMat; dims=1) @@ -626,7 +645,7 @@ function median!(v::AbstractVector) end end inds = axes(v, 1) - n = _length(inds) + n = Base._length(inds) mid = div(first(inds)+last(inds),2) if isodd(n) return middle(partialsort!(v,mid)) @@ -653,7 +672,7 @@ median(v::AbstractArray; dims=:) = _median(v, dims) _median(v::AbstractArray, dims) = mapslices(median!, v, dims = dims) -_median(v::AbstractArray{T}, ::Colon) where {T} = median!(copyto!(Array{T,1}(undef, _length(v)), v)) +_median(v::AbstractArray{T}, ::Colon) where {T} = median!(copyto!(Array{T,1}(undef, Base._length(v)), v)) # for now, use the R/S definition of quantile; may want variants later # see ?quantile in R -- this is type 7 @@ -721,7 +740,7 @@ function _quantilesort!(v::AbstractArray, sorted::Bool, minp::Real, maxp::Real) hi = ceil(Int,1+maxp*(lv-1)) # only need to perform partial sort - sort!(v, 1, lv, Sort.PartialQuickSort(lo:hi), Base.Sort.Forward) + sort!(v, 1, lv, Base.Sort.PartialQuickSort(lo:hi), Base.Sort.Forward) end isnan(v[end]) && throw(ArgumentError("quantiles are undefined in presence of NaNs")) return v @@ -776,4 +795,112 @@ for `k = 1:n` where `n = length(v)`. This corresponds to Definition 7 of Hyndman *The American Statistician*, Vol. 50, No. 4, pp. 361-365 """ quantile(v::AbstractVector, p; sorted::Bool=false) = - quantile!(sorted ? v : copymutable(v), p; sorted=sorted) + quantile!(sorted ? v : Base.copymutable(v), p; sorted=sorted) + + +##### SparseArrays optimizations ##### + +function cov(X::SparseMatrixCSC; dims::Int=1, corrected::Bool=true) + vardim = dims + a, b = size(X) + n, p = vardim == 1 ? 
(a, b) : (b, a) + + # The covariance can be decomposed into two terms + # 1/(n - 1) ∑ (x_i - x̄)*(x_i - x̄)' = 1/(n - 1) (∑ x_i*x_i' - n*x̄*x̄') + # which can be evaluated via a sparse matrix-matrix product + + # Compute ∑ x_i*x_i' = X'X using sparse matrix-matrix product + out = Matrix(unscaled_covzm(X, vardim)) + + # Compute x̄ + x̄ᵀ = mean(X, dims=vardim) + + # Subtract n*x̄*x̄' from X'X + @inbounds for j in 1:p, i in 1:p + out[i,j] -= x̄ᵀ[i] * x̄ᵀ[j]' * n + end + + # scale with the sample size n or the corrected sample size n - 1 + return rmul!(out, inv(n - corrected)) +end + +# This is the function that does the reduction underlying var/std +function centralize_sumabs2!(R::AbstractArray{S}, A::SparseMatrixCSC{Tv,Ti}, means::AbstractArray) where {S,Tv,Ti} + lsiz = Base.check_reducedims(R,A) + size(means) == size(R) || error("size of means must match size of R") + isempty(R) || fill!(R, zero(S)) + isempty(A) && return R + + colptr = A.colptr + rowval = A.rowval + nzval = A.nzval + m = size(A, 1) + n = size(A, 2) + + if size(R, 1) == size(R, 2) == 1 + # Reduction along both columns and rows + R[1, 1] = centralize_sumabs2(A, means[1]) + elseif size(R, 1) == 1 + # Reduction along rows + @inbounds for col = 1:n + mu = means[col] + r = convert(S, (m-colptr[col+1]+colptr[col])*abs2(mu)) + @simd for j = colptr[col]:colptr[col+1]-1 + r += abs2(nzval[j] - mu) + end + R[1, col] = r + end + elseif size(R, 2) == 1 + # Reduction along columns + rownz = fill(convert(Ti, n), m) + @inbounds for col = 1:n + @simd for j = colptr[col]:colptr[col+1]-1 + row = rowval[j] + R[row, 1] += abs2(nzval[j] - means[row]) + rownz[row] -= 1 + end + end + for i = 1:m + R[i, 1] += rownz[i]*abs2(means[i]) + end + else + # Reduction along a dimension > 2 + @inbounds for col = 1:n + lastrow = 0 + @simd for j = colptr[col]:colptr[col+1]-1 + row = rowval[j] + for i = lastrow+1:row-1 + R[i, col] = abs2(means[i, col]) + end + R[row, col] = abs2(nzval[j] - means[row, col]) + lastrow = row + end + for i = lastrow+1:m + R[i, col] = abs2(means[i, col]) + end + end + end + return R +end + + +##### deprecations ##### + +# PR #21709 +@deprecate cov(x::AbstractVector, corrected::Bool) cov(x, corrected=corrected) +@deprecate cov(x::AbstractMatrix, vardim::Int, corrected::Bool) cov(x, dims=vardim, corrected=corrected) +@deprecate cov(X::AbstractVector, Y::AbstractVector, corrected::Bool) cov(X, Y, corrected=corrected) +@deprecate cov(X::AbstractVecOrMat, Y::AbstractVecOrMat, vardim::Int, corrected::Bool) cov(X, Y, dims=vardim, corrected=corrected) + +# issue #25501 +@deprecate mean(A::AbstractArray, dims) mean(A, dims=dims) +@deprecate varm(A::AbstractArray, m::AbstractArray, dims; kwargs...) varm(A, m; kwargs..., dims=dims) +@deprecate var(A::AbstractArray, dims; kwargs...) var(A; kwargs..., dims=dims) +@deprecate std(A::AbstractArray, dims; kwargs...) std(A; kwargs..., dims=dims) +@deprecate cov(X::AbstractMatrix, dim::Int; kwargs...) cov(X; kwargs..., dims=dim) +@deprecate cov(x::AbstractVecOrMat, y::AbstractVecOrMat, dim::Int; kwargs...) cov(x, y; kwargs..., dims=dim) +@deprecate cor(X::AbstractMatrix, dim::Int) cor(X, dims=dim) +@deprecate cor(x::AbstractVecOrMat, y::AbstractVecOrMat, dim::Int) cor(x, y, dims=dim) +@deprecate median(A::AbstractArray, dims; kwargs...) 
median(A; kwargs..., dims=dims) + +end # module diff --git a/test/statistics.jl b/stdlib/Statistics/test/runtests.jl similarity index 71% rename from test/statistics.jl rename to stdlib/Statistics/test/runtests.jl index 3d7f2c846c8f5..ad0e5076bbe0d 100644 --- a/test/statistics.jl +++ b/stdlib/Statistics/test/runtests.jl @@ -1,6 +1,6 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -using Test, Random, LinearAlgebra +using Statistics, Test, Random, LinearAlgebra, SparseArrays @testset "middle" begin @test middle(3) === 3.0 @@ -80,6 +80,15 @@ end end end +@testset "mean/median for ranges" begin + for f in (mean, median) + for n = 2:5 + @test f(2:n) == f([2:n;]) + @test f(2:0.1:n) ≈ f([2:0.1:n;]) + end + end +end + @testset "var & std" begin # edge case: empty vector # iterable; this has to throw for type stability @@ -185,6 +194,19 @@ end @test std([1//1, 2//1]) isa Float64 @test std([1//1, 2//1], dims=1) isa Vector{Float64} + + @testset "var: empty cases" begin + A = Matrix{Int}(undef, 0,1) + @test var(A) === NaN + + @test isequal(var(A, dims=1), fill(NaN, 1, 1)) + @test isequal(var(A, dims=2), fill(NaN, 0, 1)) + @test isequal(var(A, dims=(1, 2)), fill(NaN, 1, 1)) + @test isequal(var(A, dims=3), fill(NaN, 0, 1)) + end + + # issue #6672 + @test std(AbstractFloat[1,2,3], dims=1) == [1.0] end function safe_cov(x, y, zm::Bool, cr::Bool) @@ -231,46 +253,46 @@ Y = [6.0 2.0; y1 = vec(Y[1,:]) end - c = zm ? Base.covm(x1, 0, corrected=cr) : + c = zm ? Statistics.covm(x1, 0, corrected=cr) : cov(x1, corrected=cr) @test isa(c, Float64) @test c ≈ Cxx[1,1] @inferred cov(x1, corrected=cr) - @test cov(X) == Base.covm(X, mean(X, dims=1)) - C = zm ? Base.covm(X, 0, vd, corrected=cr) : + @test cov(X) == Statistics.covm(X, mean(X, dims=1)) + C = zm ? Statistics.covm(X, 0, vd, corrected=cr) : cov(X, dims=vd, corrected=cr) @test size(C) == (k, k) @test C ≈ Cxx @inferred cov(X, dims=vd, corrected=cr) - @test cov(x1, y1) == Base.covm(x1, mean(x1), y1, mean(y1)) - c = zm ? Base.covm(x1, 0, y1, 0, corrected=cr) : + @test cov(x1, y1) == Statistics.covm(x1, mean(x1), y1, mean(y1)) + c = zm ? Statistics.covm(x1, 0, y1, 0, corrected=cr) : cov(x1, y1, corrected=cr) @test isa(c, Float64) @test c ≈ Cxy[1,1] @inferred cov(x1, y1, corrected=cr) if vd == 1 - @test cov(x1, Y) == Base.covm(x1, mean(x1), Y, mean(Y, dims=1)) + @test cov(x1, Y) == Statistics.covm(x1, mean(x1), Y, mean(Y, dims=1)) end - C = zm ? Base.covm(x1, 0, Y, 0, vd, corrected=cr) : + C = zm ? Statistics.covm(x1, 0, Y, 0, vd, corrected=cr) : cov(x1, Y, dims=vd, corrected=cr) @test size(C) == (1, k) @test vec(C) ≈ Cxy[1,:] @inferred cov(x1, Y, dims=vd, corrected=cr) if vd == 1 - @test cov(X, y1) == Base.covm(X, mean(X, dims=1), y1, mean(y1)) + @test cov(X, y1) == Statistics.covm(X, mean(X, dims=1), y1, mean(y1)) end - C = zm ? Base.covm(X, 0, y1, 0, vd, corrected=cr) : + C = zm ? Statistics.covm(X, 0, y1, 0, vd, corrected=cr) : cov(X, y1, dims=vd, corrected=cr) @test size(C) == (k, 1) @test vec(C) ≈ Cxy[:,1] @inferred cov(X, y1, dims=vd, corrected=cr) - @test cov(X, Y) == Base.covm(X, mean(X, dims=1), Y, mean(Y, dims=1)) - C = zm ? Base.covm(X, 0, Y, 0, vd, corrected=cr) : + @test cov(X, Y) == Statistics.covm(X, mean(X, dims=1), Y, mean(Y, dims=1)) + C = zm ? Statistics.covm(X, 0, Y, 0, vd, corrected=cr) : cov(X, Y, dims=vd, corrected=cr) @test size(C) == (k, k) @test C ≈ Cxy @@ -312,41 +334,41 @@ end y1 = vec(Y[1,:]) end - c = zm ? Base.corm(x1, 0) : cor(x1) + c = zm ? 
Statistics.corm(x1, 0) : cor(x1) @test isa(c, Float64) @test c ≈ Cxx[1,1] @inferred cor(x1) - @test cor(X) == Base.corm(X, mean(X, dims=1)) - C = zm ? Base.corm(X, 0, vd) : cor(X, dims=vd) + @test cor(X) == Statistics.corm(X, mean(X, dims=1)) + C = zm ? Statistics.corm(X, 0, vd) : cor(X, dims=vd) @test size(C) == (k, k) @test C ≈ Cxx @inferred cor(X, dims=vd) - @test cor(x1, y1) == Base.corm(x1, mean(x1), y1, mean(y1)) - c = zm ? Base.corm(x1, 0, y1, 0) : cor(x1, y1) + @test cor(x1, y1) == Statistics.corm(x1, mean(x1), y1, mean(y1)) + c = zm ? Statistics.corm(x1, 0, y1, 0) : cor(x1, y1) @test isa(c, Float64) @test c ≈ Cxy[1,1] @inferred cor(x1, y1) if vd == 1 - @test cor(x1, Y) == Base.corm(x1, mean(x1), Y, mean(Y, dims=1)) + @test cor(x1, Y) == Statistics.corm(x1, mean(x1), Y, mean(Y, dims=1)) end - C = zm ? Base.corm(x1, 0, Y, 0, vd) : cor(x1, Y, dims=vd) + C = zm ? Statistics.corm(x1, 0, Y, 0, vd) : cor(x1, Y, dims=vd) @test size(C) == (1, k) @test vec(C) ≈ Cxy[1,:] @inferred cor(x1, Y, dims=vd) if vd == 1 - @test cor(X, y1) == Base.corm(X, mean(X, dims=1), y1, mean(y1)) + @test cor(X, y1) == Statistics.corm(X, mean(X, dims=1), y1, mean(y1)) end - C = zm ? Base.corm(X, 0, y1, 0, vd) : cor(X, y1, dims=vd) + C = zm ? Statistics.corm(X, 0, y1, 0, vd) : cor(X, y1, dims=vd) @test size(C) == (k, 1) @test vec(C) ≈ Cxy[:,1] @inferred cor(X, y1, dims=vd) - @test cor(X, Y) == Base.corm(X, mean(X, dims=1), Y, mean(Y, dims=1)) - C = zm ? Base.corm(X, 0, Y, 0, vd) : cor(X, Y, dims=vd) + @test cor(X, Y) == Statistics.corm(X, mean(X, dims=1), Y, mean(Y, dims=1)) + C = zm ? Statistics.corm(X, 0, Y, 0, vd) : cor(X, Y, dims=vd) @test size(C) == (k, k) @test C ≈ Cxy @inferred cor(X, Y, dims=vd) @@ -440,8 +462,13 @@ end end # dimensional correctness -isdefined(Main, :TestHelpers) || @eval Main include("TestHelpers.jl") +const BASE_TEST_PATH = joinpath(Sys.BINDIR, "..", "share", "julia", "test") +isdefined(Main, :TestHelpers) || @eval Main include(joinpath($(BASE_TEST_PATH), "TestHelpers.jl")) using .Main.TestHelpers: Furlong + +Statistics.middle(x::Furlong{p}) where {p} = Furlong{p}(middle(x.val)) +Statistics.middle(x::Furlong{p}, y::Furlong{p}) where {p} = Furlong{p}(middle(x.val, y.val)) + @testset "Unitful elements" begin r = Furlong(1):Furlong(1):Furlong(2) a = Vector(r) @@ -471,9 +498,9 @@ end @testset "Promotion in covzm. 
Issue #8080" begin A = [1 -1 -1; -1 1 1; -1 1 -1; 1 -1 -1; 1 -1 1] - @test Base.covzm(A) - mean(A, dims=1)'*mean(A, dims=1)*size(A, 1)/(size(A, 1) - 1) ≈ cov(A) + @test Statistics.covzm(A) - mean(A, dims=1)'*mean(A, dims=1)*size(A, 1)/(size(A, 1) - 1) ≈ cov(A) A = [1//1 -1 -1; -1 1 1; -1 1 -1; 1 -1 -1; 1 -1 1] - @test (A'A - size(A, 1)*Base.mean(A, dims=1)'*Base.mean(A, dims=1))/4 == cov(A) + @test (A'A - size(A, 1)*mean(A, dims=1)'*mean(A, dims=1))/4 == cov(A) end @testset "Mean along dimension of empty array" begin @@ -493,3 +520,83 @@ end @test std(x) ≈ vec(std([x[1] x[2]], dims=2)) @test cov(x) ≈ cov([x[1] x[2]], dims=2) end + +@testset "var of sparse array" begin + se33 = SparseMatrixCSC{Float64}(I, 3, 3) + sA = sprandn(3, 7, 0.5) + pA = sparse(rand(3, 7)) + + for arr in (se33, sA, pA) + farr = Array(arr) + @test var(arr) ≈ var(farr) + @test var(arr, dims=1) ≈ var(farr, dims=1) + @test var(arr, dims=2) ≈ var(farr, dims=2) + @test var(arr, dims=(1, 2)) ≈ [var(farr)] + @test isequal(var(arr, dims=3), var(farr, dims=3)) + end + + @testset "empty cases" begin + @test var(sparse(Int[])) === NaN + @test isequal(var(spzeros(0, 1), dims=1), var(Matrix{Int}(I, 0, 1), dims=1)) + @test isequal(var(spzeros(0, 1), dims=2), var(Matrix{Int}(I, 0, 1), dims=2)) + @test isequal(var(spzeros(0, 1), dims=(1, 2)), var(Matrix{Int}(I, 0, 1), dims=(1, 2))) + @test isequal(var(spzeros(0, 1), dims=3), var(Matrix{Int}(I, 0, 1), dims=3)) + end +end + +# Faster covariance function for sparse matrices +# Prevents densifying the input matrix when subtracting the mean +# Test against dense implementation +# PR https://github.com/JuliaLang/julia/pull/22735 +# Part of this test needed to be hacked due to the treatment +# of Inf in sparse matrix algebra +# https://github.com/JuliaLang/julia/issues/22921 +# The issue will be resolved in +# https://github.com/JuliaLang/julia/issues/22733 +@testset "optimizing sparse $elty covariance" for elty in (Float64, Complex{Float64}) + n = 10 + p = 5 + np2 = div(n*p, 2) + nzvals, x_sparse = guardsrand(1) do + if elty <: Real + nzvals = randn(np2) + else + nzvals = complex.(randn(np2), randn(np2)) + end + nzvals, sparse(rand(1:n, np2), rand(1:p, np2), nzvals, n, p) + end + x_dense = convert(Matrix{elty}, x_sparse) + @testset "Test with no Infs and NaNs, vardim=$vardim, corrected=$corrected" for vardim in (1, 2), + corrected in (true, false) + @test cov(x_sparse, dims=vardim, corrected=corrected) ≈ + cov(x_dense , dims=vardim, corrected=corrected) + end + + @testset "Test with $x11, vardim=$vardim, corrected=$corrected" for x11 in (NaN, Inf), + vardim in (1, 2), + corrected in (true, false) + x_sparse[1,1] = x11 + x_dense[1 ,1] = x11 + + cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected) + cov_dense = cov(x_dense , dims=vardim, corrected=corrected) + @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end] + @test isfinite.(cov_sparse) == isfinite.(cov_dense) + @test isfinite.(cov_sparse) == isfinite.(cov_dense) + end + + @testset "Test with NaN and Inf, vardim=$vardim, corrected=$corrected" for vardim in (1, 2), + corrected in (true, false) + x_sparse[1,1] = Inf + x_dense[1 ,1] = Inf + x_sparse[2,1] = NaN + x_dense[2 ,1] = NaN + + cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected) + cov_dense = cov(x_dense , dims=vardim, corrected=corrected) + @test cov_sparse[(1 + vardim):end, (1 + vardim):end] ≈ + cov_dense[ (1 + vardim):end, (1 + vardim):end] + @test isfinite.(cov_sparse) == isfinite.(cov_dense) + @test isfinite.(cov_sparse) == isfinite.(cov_dense) + end +end 
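As the `Furlong` hunk in the moved test file above illustrates, code that previously extended `Base.middle` for custom number types now extends `Statistics.middle`. A short sketch with a hypothetical unit wrapper (`Meter` is an assumption for illustration, not a type from the patch):

```julia
using Statistics

struct Meter  # hypothetical unit-carrying wrapper, analogous to Furlong above
    val::Float64
end

# Extend the stdlib function, not Base, mirroring the test changes above.
Statistics.middle(x::Meter) = Meter(middle(x.val))
Statistics.middle(x::Meter, y::Meter) = Meter(middle(x.val, y.val))

middle(Meter(1.0), Meter(3.0))  # Meter(2.0)
```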
diff --git a/test/choosetests.jl b/test/choosetests.jl index 2bceb7f35e626..5f2a4880f1926 100644 --- a/test/choosetests.jl +++ b/test/choosetests.jl @@ -41,7 +41,7 @@ function choosetests(choices = []) "intfuncs", "simdloop", "vecelement", "rational", "bitarray", "copy", "math", "fastmath", "functional", "iterators", "operators", "path", "ccall", "parse", "loading", "bigint", - "bigfloat", "sorting", "statistics", "spawn", "backtrace", + "bigfloat", "sorting", "spawn", "backtrace", "file", "read", "version", "namedtuple", "mpfr", "broadcast", "complex", "floatapprox", "stdlib", "reflection", "regex", "float16", diff --git a/test/dimensionful.jl b/test/dimensionful.jl index 21c9747806d54..d02313203103d 100644 --- a/test/dimensionful.jl +++ b/test/dimensionful.jl @@ -38,13 +38,12 @@ sylvester(a::Furlong,b::Furlong,c::Furlong) = -c / (a + b) for f in (:isfinite, :isnan, :isreal) @eval Base.$f(x::Furlong) = $f(x.val) end -for f in (:real,:imag,:complex,:middle,:+,:-) +for f in (:real,:imag,:complex,:+,:-) @eval Base.$f(x::Furlong{p}) where {p} = Furlong{p}($f(x.val)) end -import Base: +, -, ==, !=, <, <=, isless, isequal, *, /, //, div, rem, mod, ^, - middle, hypot -for op in (:+, :-, :middle, :hypot) +import Base: +, -, ==, !=, <, <=, isless, isequal, *, /, //, div, rem, mod, ^, hypot +for op in (:+, :-, :hypot) @eval function $op(x::Furlong{p}, y::Furlong{p}) where {p} v = $op(x.val, y.val) Furlong{p}(v) diff --git a/test/docs.jl b/test/docs.jl index cf20dbd1d5495..3a925d65a4334 100644 --- a/test/docs.jl +++ b/test/docs.jl @@ -669,7 +669,6 @@ end @test (@repl :@r_str) !== nothing # Simple tests for apropos: -@test occursin("cor", sprint(apropos, "pearson")) @test occursin("eachindex", sprint(apropos, r"ind(exes|ices)")) using Profile @test occursin("Profile.print", sprint(apropos, "print")) diff --git a/test/error.jl b/test/error.jl index 64932a8a80422..2d9b1065c036a 100644 --- a/test/error.jl +++ b/test/error.jl @@ -7,7 +7,9 @@ ratio(x) = x[2:end]./x[1:end-1] @test all(x->x ≈ 10.0, ratio(collect(ExponentialBackOff(n=10, max_delay=Inf, factor=10, jitter=0.0)))) guardsrand(12345) do - @test (mean(ratio(collect(ExponentialBackOff(n=100, max_delay=Inf, factor=1, jitter=0.1)))) - 1.0) < 1e-4 + x = ratio(collect(ExponentialBackOff(n=100, max_delay=Inf, factor=1, jitter=0.1))) + xm = sum(x) / length(x) + @test (xm - 1.0) < 1e-4 end end @testset "retrying after errors" begin diff --git a/test/offsetarray.jl b/test/offsetarray.jl index 0aa4248198788..fc88ec7eb49b1 100644 --- a/test/offsetarray.jl +++ b/test/offsetarray.jl @@ -5,6 +5,7 @@ using .Main.TestHelpers.OAs using DelimitedFiles using Random using LinearAlgebra +using Statistics const OAs_name = join(fullname(OAs), ".") diff --git a/test/precompile.jl b/test/precompile.jl index cb3e190dcfa7e..51a5833a31730 100644 --- a/test/precompile.jl +++ b/test/precompile.jl @@ -226,7 +226,8 @@ try [:Base64, :CRC32c, :Dates, :DelimitedFiles, :Distributed, :FileWatching, :Markdown, :Future, :Libdl, :LinearAlgebra, :Logging, :Mmap, :Printf, :Profile, :Random, :Serialization, :SharedArrays, :SparseArrays, :SuiteSparse, :Test, - :Unicode, :REPL, :InteractiveUtils, :OldPkg, :Pkg, :LibGit2, :SHA, :UUIDs, :Sockets])) + :Unicode, :REPL, :InteractiveUtils, :OldPkg, :Pkg, :LibGit2, :SHA, :UUIDs, :Sockets, + :Statistics, ])) @test discard_module.(deps) == deps1 @test current_task()(0x01, 0x4000, 0x30031234) == 2 diff --git a/test/ranges.jl b/test/ranges.jl index aa0d7980de956..dbfbc6609c84b 100644 --- a/test/ranges.jl +++ b/test/ranges.jl @@ -829,14 +829,6 @@ end 
@test length(map(identity, UInt64(1):UInt64(5))) == 5 @test length(map(identity, UInt128(1):UInt128(5))) == 5 end -@testset "mean/median" begin - for f in (mean, median) - for n = 2:5 - @test f(2:n) == f([2:n;]) - @test f(2:0.1:n) ≈ f([2:0.1:n;]) - end - end -end @testset "issue #8531" begin smallint = (Int === Int64 ? (Int8,UInt8,Int16,UInt16,Int32,UInt32) : diff --git a/test/reducedim.jl b/test/reducedim.jl index 8e80dc750a3e6..fcc7df1bf98d7 100644 --- a/test/reducedim.jl +++ b/test/reducedim.jl @@ -127,7 +127,6 @@ end @test prod(A) === 1 @test_throws ArgumentError minimum(A) @test_throws ArgumentError maximum(A) - @test var(A) === NaN @test isequal(sum(A, dims=1), zeros(Int, 1, 1)) @test isequal(sum(A, dims=2), zeros(Int, 0, 1)) @@ -137,10 +136,6 @@ end @test isequal(prod(A, dims=2), fill(1, 0, 1)) @test isequal(prod(A, dims=(1, 2)), fill(1, 1, 1)) @test isequal(prod(A, dims=3), fill(1, 0, 1)) - @test isequal(var(A, dims=1), fill(NaN, 1, 1)) - @test isequal(var(A, dims=2), fill(NaN, 0, 1)) - @test isequal(var(A, dims=(1, 2)), fill(NaN, 1, 1)) - @test isequal(var(A, dims=3), fill(NaN, 0, 1)) for f in (minimum, maximum) @test_throws ArgumentError f(A, dims=1) @@ -320,7 +315,6 @@ end # issue #6672 @test sum(Real[1 2 3; 4 5.3 7.1], dims=2) == reshape([6, 16.4], 2, 1) -@test std(AbstractFloat[1,2,3], dims=1) == [1.0] @test sum(Any[1 2;3 4], dims=1) == [4 6] @test sum(Vector{Int}[[1,2],[4,3]], dims=1)[1] == [5,5]
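Taken together, the user-visible effect of the patch is that basic statistics now require an explicit import; on 0.7 the old `Base` names still resolve through the `@deprecate_stdlib` entries added in `base/sysimg.jl` above. A minimal usage sketch under that assumption:

```julia
using Statistics  # required after this patch; no longer exported from Base

x = [1.0, 2.0, 4.0, 8.0]
mean(x)            # 3.75
median(x)          # 3.0
quantile(x, 0.75)  # 5.0, definition 7 of Hyndman and Fan as documented above
std(x)             # sample standard deviation; corrected=true is the default
```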