Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make covariance and correlation work for any iterators #30

Closed
wants to merge 11 commits into from
1 change: 0 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
name = "Statistics"
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
121 changes: 117 additions & 4 deletions src/Statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -479,10 +479,13 @@ end
_vmean(x::AbstractVector, vardim::Int) = mean(x)
_vmean(x::AbstractMatrix, vardim::Int) = mean(x, dims=vardim)

_abs2(x::Number) = abs2(x)
_abs2(x) = x*x'

# core functions

unscaled_covzm(x::AbstractVector{<:Number}) = sum(abs2, x)
unscaled_covzm(x::AbstractVector) = sum(t -> t*t', x)
unscaled_covzm(x::AbstractVector{<:Number}) = sum(_abs2, x)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method isn't needed if you dispatch in _abs2.

Suggested change
unscaled_covzm(x::AbstractVector{<:Number}) = sum(_abs2, x)

unscaled_covzm(x::AbstractVector) = sum(_abs2, x)
unscaled_covzm(x::AbstractMatrix, vardim::Int) = (vardim == 1 ? _conj(x'x) : x * x')

unscaled_covzm(x::AbstractVector, y::AbstractVector) = sum(conj(y[i])*x[i] for i in eachindex(y, x))
Expand All @@ -494,7 +497,25 @@ unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) =
(vardim == 1 ? *(transpose(x), _conj(y)) : *(x, adjoint(y)))

# covzm (with centered data)

function covzm(itr::Any; corrected::Bool=true)
y = iterate(itr)
if y === nothing
return Base.mapreduce_empty_iter(_abs2, Base.add_sum, itr,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+ is used instead of add_sum:

Suggested change
return Base.mapreduce_empty_iter(_abs2, Base.add_sum, itr,
return Base.mapreduce_empty_iter(_abs2, +, itr,

But maybe it would be clearer to use the same approach for all methods (see suggestion for the two-argument case).

Base.IteratorEltype(itr)) / 0
end
count = 1
value, state = y
f_value = _abs2(value)
total = Base.reduce_first(Base.add_sum, f_value)
y = iterate(itr, state)
while y !== nothing
value, state = y
total += _abs2(value)
count += 1
y = iterate(itr, state)
end
return total / (count - Int(corrected))
end
covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (length(x) - Int(corrected))
function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
C = unscaled_covzm(x, vardim)
Expand All @@ -504,6 +525,28 @@ function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
A .= A .* b
return A
end
function covzm(x::Any, y::Any; corrected::Bool=true)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this could just call covm(x, 0, y, 0)? Check whether the compiler is able to optimize for 0 statically.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you tried this?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just did.

julia> x = randn(10_000); y = randn(10_000);

julia> x = x .- Statistics.mean(x); y = y .- Statistics.mean(y);

julia> gx = (xi for xi in x); gy = (yi for yi in y);

julia> @btime Statistics.covzm($gx, $gy)
  11.820 μs (0 allocations: 0 bytes)
0.009480946018135657

julia> @btime Statistics.covm($gx, 0,  $gy, 0)
  14.768 μs (0 allocations: 0 bytes)
0.009480946018135657

Let me know what you think of that difference. It doesn't seem very important to me. The covzm version takes 80% as long as covm with 0s.

z = zip(x, y)
z_itr = iterate(z)
if z_itr === nothing
# TODO: Understand how to improve this error.
#return Base.mapreduce_empty_iter(t -> _conj(t[2])*t[1]', Base.add_sum, itr,
# Base.IteratorEltype(x)) / 0
return NaN
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think mapreduce_empty_iter is really useful outside standard reductions. Here you just need to find the best way to compute NaN so that it works for all types. Maybe this?

Suggested change
# TODO: Understand how to improve this error.
#return Base.mapreduce_empty_iter(t -> _conj(t[2])*t[1]', Base.add_sum, itr,
# Base.IteratorEltype(x)) / 0
return NaN
v = conj(zero(eltype(y)))*zero(eltype(x))'
return (v + v) / 0

end
count = 1
value, state = z_itr
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like this would be more readable (if you adapt code elsewhere):

Suggested change
value, state = z_itr
(xi, yi), state = z_itr

f_value = conj(value[2])*value[1]'
total = Base.reduce_first(Base.add_sum, f_value)
z_itr = iterate(z, state)
while z_itr !== nothing
value, state = z_itr
total += conj(value[2])*value[1]'
count += 1
z_itr = iterate(z, state)
end
return total ./ (count - Int(corrected))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return total ./ (count - Int(corrected))
return total / (count - Int(corrected))

end
covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) =
unscaled_covzm(x, y) / (length(x) - Int(corrected))
function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true)
Expand All @@ -517,17 +560,75 @@ end

# covm (with provided mean)
## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector}
## which can't be handled by broadcast
## which can't be handled by broadcastz
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
## which can't be handled by broadcastz
## which can't be handled by broadcast

(but this comment just be moved below)

function covm(itr::Any, itrmean; corrected::Bool=true)
y = iterate(itr)
f = let itrmean = itrmean
x -> _abs2(x-itrmean)
end
if y === nothing
return Base.mapreduce_empty_iter(f, Base.add_sum, itr,
Base.IteratorEltype(itr)) / 0
end
count = 1
value, state = y
f_value = f(value)
total = Base.reduce_first(Base.add_sum, f_value)
y = iterate(itr, state)
while y !== nothing
value, state = y
total += f(value)
count += 1
y = iterate(itr, state)
end
return total / (count - Int(corrected))
end
covm(x::AbstractVector, xmean; corrected::Bool=true) =
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method is identical to the previous one so it's no longer needed.

covzm(map(t -> t - xmean, x); corrected=corrected)
covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) =
covzm(x .- xmean, vardim; corrected=corrected)
function covm(x::Any, xmean, y::Any, ymean; corrected::Bool=true)
z = zip(x, y)
z_itr = iterate(z)
if z_itr === nothing
# TODO: Understand how to improve this error.
#return Base.mapreduce_empty_iter(t -> _conj(t[2])*t[1]', Base.add_sum, itr,
# Base.IteratorEltype(x)) / 0
return NaN
end
f = let xmean = xmean, ymean = ymean
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd just have f take xmean and ymean, and pass that explicitly in the two call places.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would have to write (x, y) -> _conj(x - xmean, y - ymean). Wouldn't that cause the closure bug?

t -> conj(t[2]-ymean)*(t[1]-xmean)'
end
count = 1
value, state = z_itr
f_value = f(value)
total = Base.reduce_first(Base.add_sum, f_value)
z_itr = iterate(z, state)
while z_itr !== nothing
value, state = z_itr
total += f(value)
count += 1
z_itr = iterate(z, state)
end
return total ./ (count - Int(corrected))
end
covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) =
covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected)
covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) =
covzm(x .- xmean, y .- ymean, vardim; corrected=corrected)

# cov (API)
"""
cov(x::Any; corrected::Bool=true)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better adapt the existing docstring (and method) to only mention iterators, since vectors are just a special case. Same for others.


Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum
is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n`
is the number of elements in the iterator, which is not necessarily known.
"""
function cov(x::Any, corrected::Bool=true)
covm(x, mean(x); corrected=corrected)
end

"""
cov(x::AbstractVector; corrected::Bool=true)

Expand All @@ -546,6 +647,18 @@ if `corrected` is `false` where `n = size(X, dims)`.
cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) =
covm(X, _vmean(X, dims), dims; corrected=corrected)

"""
cov(x::Any, y::Any; corrected::Bool=true)

Compute the covariance between the iterators `x` and `y`. If `corrected` is `true` (the
default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where
``*`` denotes the complex conjugate and `n` is the number of elements in `x` which must equal
the number of elements in `y`. If `corrected` is `false`, computes ``\\frac{1}{n}\\sum_{i=1}^n
(x_i-\\bar x) (y_i-\\bar y)^*``.
"""
cov(x::Any, y::Any; corrected::Bool=true) =
covm(x, mean(x), y, mean(y); corrected=corrected)

"""
cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true)

Expand Down
9 changes: 7 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@ Y = [6.0 2.0;
5.0 8.0;
3.0 4.0;
2.0 3.0]
X_gen = ([X[i,1], X[i,2]] for i in 1:size(X, 1))

@testset "covariance" begin
for vd in [1, 2], zm in [true, false], cr in [true, false]
Expand All @@ -328,6 +329,8 @@ Y = [6.0 2.0;
end
x1 = vec(X[:,1])
y1 = vec(Y[:,1])
x1_gen = (x for x in x1)
y1_gen = (y for y in y1)
else
k = size(X, 1)
Cxx = zeros(k, k)
Expand All @@ -338,6 +341,8 @@ Y = [6.0 2.0;
end
x1 = vec(X[1,:])
y1 = vec(Y[1,:])
x1_gen = (x for x in x1)
y1_gen = (y for y in y1)
end

c = zm ? Statistics.covm(x1, 0, corrected=cr) :
Expand All @@ -346,14 +351,14 @@ Y = [6.0 2.0;
@test c ≈ Cxx[1,1]
@inferred cov(x1, corrected=cr)

@test cov(X) == Statistics.covm(X, mean(X, dims=1))
@test cov(X) == cov(X_gen) == Statistics.covm(X, mean(X, dims=1))
C = zm ? Statistics.covm(X, 0, vd, corrected=cr) :
cov(X, dims=vd, corrected=cr)
@test size(C) == (k, k)
@test C ≈ Cxx
@inferred cov(X, dims=vd, corrected=cr)

@test cov(x1, y1) == Statistics.covm(x1, mean(x1), y1, mean(y1))
@test cov(x1, y1) == cov(x1_gen, y1_gen) == Statistics.covm(x1, mean(x1), y1, mean(y1))
c = zm ? Statistics.covm(x1, 0, y1, 0, corrected=cr) :
cov(x1, y1, corrected=cr)
@test isa(c, Float64)
Expand Down