From 2877091584e6c618683fb43340b6c465bdce47d4 Mon Sep 17 00:00:00 2001 From: Andreas Noack Jensen Date: Mon, 5 Aug 2013 15:59:55 +0200 Subject: [PATCH] Fix Spearman's rank correlation to address #2. Remove tiedrank method with dimension argument. Add Kendall's rank correlation. Enable tests. --- src/Stats.jl | 2 +- src/corr.jl | 143 ++++++++++++++++++++++++++++++++++++-------- src/scalar_stats.jl | 6 -- test/01.jl | 11 ++-- 4 files changed, 124 insertions(+), 38 deletions(-) diff --git a/src/Stats.jl b/src/Stats.jl index 963a1d691cb5a..f69e72525ffd0 100644 --- a/src/Stats.jl +++ b/src/Stats.jl @@ -32,7 +32,7 @@ module Stats # corr autocor, cor_spearman, - cov_spearman, + cor_kendall, # others rle, diff --git a/src/corr.jl b/src/corr.jl index 61dd1588519c1..807ee540d755c 100644 --- a/src/corr.jl +++ b/src/corr.jl @@ -1,50 +1,141 @@ # Various correlation -# -# spearman covariance functions -# - -# spearman covariance between two vectors -function cov_spearman(x::AbstractVector, y::AbstractVector) - return cov(tiedrank(x), tiedrank(y)) -end - -# spearman covariance over all pairs of columns of two matrices -function cov_spearman(X::AbstractMatrix, Y::AbstractMatrix) - return [cov_spearman(X[:,i], Y[:,j]) for i = 1:size(X, 2), j = 1:size(Y,2)] -end -function cov_spearman(x::AbstractVector, Y::AbstractMatrix) - return [cov_spearman(x, Y[:,i]) for i = 1:size(Y, 2)] -end -function cov_spearman(X::AbstractMatrix, y::AbstractVector) - return [cov_spearman(X[:,i], y) for i = 1:size(X, 2)] -end -# spearman covariance over all pairs of columns of a matrix -cov_spearman(X::AbstractMatrix) = cov(tiedrank(X, 1)) - # # spearman correlation functions # # spearman correlation between two vectors function cor_spearman(x::AbstractVector, y::AbstractVector) + if any(isnan(x)) || any(isnan(y)) return NaN end return cor(tiedrank(x), tiedrank(y)) end # spearman correlation over all pairs of columns of two matrices function cor_spearman(X::AbstractMatrix, Y::AbstractMatrix) - return cor(tiedrank(X, 1), tiedrank(Y, 1)) + return cor(mapslices(tiedrank, X, 1), mapslices(tiedrank, Y, 1)) end function cor_spearman(X::AbstractMatrix, y::AbstractVector) - return cor(tiedrank(X, 1), tiedrank(y)) + return cor(mapslices(tiedrank, X, 1), tiedrank(y)) end function cor_spearman(x::AbstractVector, Y::AbstractMatrix) - return cor(tiedrank(x), tiedrank(Y, 1)) + return cor(tiedrank(x), mapslices(tiedrank, Y, 1)) end # spearman correlation over all pairs of columns of a matrix function cor_spearman(X::AbstractMatrix) - return cor(tiedrank(X, 1)) + csp = cor(mapslices(tiedrank, X, 1)) + nanindex = vec(mapslices(any, isnan(X), 1)) + csp[nanindex, :] = NaN + csp[:, nanindex] = NaN + return csp +end + +# +# Kendall's rank correlation +# + +# Knigh JASA (1966) +function cor_kendall!{T<:Real,S<:Real}(x::AbstractVector{T}, y::AbstractVector{S}) + if any(isnan(x)) || any(isnan(y)) return NaN end + n = length(x) + if n != length(y) error("Vectors must have same length") end + + # Initial sorting + pm = sortperm(y) + x[:] = x[pm] + y[:] = y[pm] + pm[:] = sortperm(x) + x[:] = x[pm] + + # Counting ties in x and y + iT = 1 + nT = 0 + iU = 1 + nU = 0 + for i = 2:n + if x[i] == x[i-1] + iT += 1 + else + nT += iT*(iT - 1) + iT = 1 + end + if y[i] == y[i-1] + iU += 1 + else + nU += iU*(iU - 1) + iU = 1 + end + end + if iT > 1 nT += iT*(iT - 1) end + nT = div(nT,2) + if iU > 1 nU += iU*(iU - 1) end + nU = div(nU,2) + + # Sort y after x + y[:] = y[pm] + + # Calculate double ties + iV = 1 + nV = 0 + jV = 1 + for i = 2:n + if x[i] == x[i-1] && y[i] == y[i-1] + iV += 1 + else + nV += iV*(iV - 1) + iV = 1 + end + end + if iV > 1 nV += iV*(iV - 1) end + nV = div(nV,2) + + nD = div(n*(n - 1),2) + return (nD - nT - nU + nV - 2swaps!(y))/sqrt((nD - nT)*(nD - nU)) +end +cor_kendall(x::AbstractVector, y::AbstractVector) = cor_kendall!(copy(x), copy(y)) +cor_kendall(x::AbstractVector, Y::AbstractMatrix) = [cor_kendall(x, Y[:,i]) for i in 1:size(Y, 2)] +cor_kendall(X::AbstractMatrix, y::AbstractVector) = [cor_kendall(X[:,i], y) for i in 1:size(X, 2)] +cor_kendall(X::AbstractMatrix, Y::AbstractMatrix) = [cor_kendall(X[:,i], Y[:,j]) for i in 1:size(X, 2), j in 1:size(Y, 2)] +function cor_kendall(X::AbstractMatrix) + n = size(X, 2) + C = eye(n) + for j = 2:n + for i = 1:j-1 + C[i,j] = cor_kendall!(X[:,i],X[:,j]) + C[j,i] = C[i,j] + end + end + return C +end + +# Auxilliary functions for Kendall's rank correlation +function swaps!(x::AbstractVector) + n = length(x) + if n == 1 return 0 end + n2 = div(n, 2) + xl = sub(x, 1:n2) + xr = sub(x, n2+1:n) + nsl = swaps!(xl) + nsr = swaps!(xr) + sort!(xl) + sort!(xr) + return nsl + nsr + mswaps(xl,xr) +end + +function mswaps(x::AbstractVector, y::AbstractVector) + i = 1 + j = 1 + nSwaps = 0 + n = length(x) + while i <= n && j <= length(y) + if y[j] < x[i] + nSwaps += n - i + 1 + j += 1 + else + i += 1 + end + end + return nSwaps end # autocorrelation for range diff --git a/src/scalar_stats.jl b/src/scalar_stats.jl index 4cf56606b9d7a..503e61f462a78 100644 --- a/src/scalar_stats.jl +++ b/src/scalar_stats.jl @@ -151,12 +151,6 @@ end tiedrank{T<:Real}(X::AbstractMatrix{T}) = tiedrank(reshape(X, length(X))) -function tiedrank{T<:Real}(X::AbstractMatrix{T}, dim::Int) - retmat = apply(hcat, mapslices(tiedrank, X, 3 - dim)) - return dim == 1 ? retmat : retmat' -end - - # quantile and friends quantile{T<:Real}(v::AbstractVector{T}) = quantile(v, [.0, .25, .5, .75, 1.0]) diff --git a/test/01.jl b/test/01.jl index 13ae8b4f5df55..6f8f7a3f42245 100644 --- a/test/01.jl +++ b/test/01.jl @@ -17,11 +17,12 @@ values, lengths = rle(z) @test lengths == [2, 2, 1, 1, 3, 1] @test inverse_rle(values, lengths) == z -# X = [1 0; 2 1; 3 0; 4 1; 5 10] -# y = [5, 3, 4, 2, 5] -# @assert_approx_eq cov_spearman(X, y)[1] cov_spearman(X[:,1],y) -# @assert_approx_eq cov_spearman(X) cov_spearman(X, X) -# @assert_approx_eq cov_spearman(X, y) [-0.25, -0.1875] +X = [1 0; 2 1; 3 0; 4 1; 5 10] +y = [5, 3, 4, 2, 5] +@test_approx_eq cor_spearman(X, y)[1] cor_spearman(X[:,1],y) +@test_approx_eq cor_spearman(X) cor_spearman(X, X) +@test_approx_eq cor_spearman(X, y) [-0.102597835208515, -0.081110710565381] +@test_approx_eq cor_kendall(X,y) [-0.105409255338946, -0.117851130197758] fnecdf = ecdf(randn(10000000)) @test_approx_eq_eps fnecdf([-1.96, -1.644854, -1.281552, -0.6744898, 0, 0.6744898, 1.281552, 1.644854, 1.96]) [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975] 1e-3