diff --git a/src/bivariate.jl b/src/bivariate.jl index e77cf889..6a9a4ae4 100644 --- a/src/bivariate.jl +++ b/src/bivariate.jl @@ -30,7 +30,7 @@ function default_bandwidth(data::Tuple{RealVector,RealVector}) end # tabulate data for kde -function tabulate(data::Tuple{RealVector, RealVector}, midpoints::Tuple{Range, Range}) +function tabulate(data::Tuple{RealVector, RealVector}, midpoints::Tuple{Range, Range}, weights::Weights = default_weights(data)) xdata, ydata = data ndata = length(xdata) length(ydata) == ndata || error("data vectors must be of same length") @@ -41,17 +41,19 @@ function tabulate(data::Tuple{RealVector, RealVector}, midpoints::Tuple{Range, R # Set up a grid for discretized data grid = zeros(Float64, nx, ny) - ainc = 1.0 / (ndata*(sx*sy)^2) + ainc = 1.0 / (sum(weights)*(sx*sy)^2) # weighted discretization (cf. Jones and Lotwick) - for (x, y) in zip(xdata,ydata) + for i in 1:length(xdata) + x = xdata[i] + y = ydata[i] kx, ky = searchsortedfirst(xmid,x), searchsortedfirst(ymid,y) jx, jy = kx-1, ky-1 if 1 <= jx <= nx-1 && 1 <= jy <= ny-1 - grid[jx,jy] += (xmid[kx]-x)*(ymid[ky]-y)*ainc - grid[kx,jy] += (x-xmid[jx])*(ymid[ky]-y)*ainc - grid[jx,ky] += (xmid[kx]-x)*(y-ymid[jy])*ainc - grid[kx,ky] += (x-xmid[jx])*(y-ymid[jy])*ainc + grid[jx,jy] += (xmid[kx]-x)*(ymid[ky]-y)*ainc*weights[i] + grid[kx,jy] += (x-xmid[jx])*(ymid[ky]-y)*ainc*weights[i] + grid[jx,ky] += (xmid[kx]-x)*(y-ymid[jy])*ainc*weights[i] + grid[kx,ky] += (x-xmid[jx])*(y-ymid[jy])*ainc*weights[i] end end @@ -87,27 +89,30 @@ end const BivariateDistribution = Union{MultivariateDistribution,Tuple{UnivariateDistribution,UnivariateDistribution}} -function kde(data::Tuple{RealVector, RealVector}, midpoints::Tuple{Range, Range}, dist::BivariateDistribution) - k = tabulate(data,midpoints) +default_weights(data::Tuple{RealVector, RealVector}) = UniformWeights(length(data[1])) + +function kde(data::Tuple{RealVector, RealVector}, weights::Weights, midpoints::Tuple{Range, Range}, dist::BivariateDistribution) + k = tabulate(data, midpoints, weights) conv(k,dist) end function kde(data::Tuple{RealVector, RealVector}, dist::BivariateDistribution; boundary::Tuple{Tuple{Real,Real}, Tuple{Real,Real}} = (kde_boundary(data[1],std(dist[1])), kde_boundary(data[2],std(dist[2]))), - npoints::Tuple{Int,Int}=(256,256)) + npoints::Tuple{Int,Int}=(256,256), + weights::Weights = default_weights(data)) xmid = kde_range(boundary[1],npoints[1]) ymid = kde_range(boundary[2],npoints[2]) - kde(data,(xmid,ymid),dist) + kde(data,weights,(xmid,ymid),dist) end function kde(data::Tuple{RealVector, RealVector}, midpoints::Tuple{Range, Range}; - bandwidth=default_bandwidth(data), kernel=Normal) + bandwidth=default_bandwidth(data), kernel=Normal, weights::Weights = default_weights(data)) dist = kernel_dist(kernel,bandwidth) - kde(data,midpoints,dist) + kde(data,weights,midpoints,dist) end function kde(data::Tuple{RealVector, RealVector}; @@ -115,13 +120,14 @@ function kde(data::Tuple{RealVector, RealVector}; kernel=Normal, boundary::Tuple{Tuple{Real,Real}, Tuple{Real,Real}} = (kde_boundary(data[1],bandwidth[1]), kde_boundary(data[2],bandwidth[2])), - npoints::Tuple{Int,Int}=(256,256)) + npoints::Tuple{Int,Int}=(256,256), + weights::Weights = default_weights(data)) dist = kernel_dist(kernel,bandwidth) xmid = kde_range(boundary[1],npoints[1]) ymid = kde_range(boundary[2],npoints[2]) - kde(data,(xmid,ymid),dist) + kde(data,weights,(xmid,ymid),dist) end # matrix data diff --git a/src/univariate.jl b/src/univariate.jl index 97b52008..9bea4ad9 100644 --- a/src/univariate.jl +++ b/src/univariate.jl @@ -37,6 +37,10 @@ function default_bandwidth(data::RealVector, alpha::Float64 = 0.9) return alpha * width * ndata^(-0.2) end +function default_weights(data::RealVector) + UniformWeights(length(data)) +end + # Roughly based on: # B. W. Silverman (1982) "Algorithm AS 176: Kernel Density Estimation Using @@ -66,23 +70,32 @@ function kde_range(boundary::Tuple{Real,Real}, npoints::Int) lo:step:hi end +immutable UniformWeights{N} end + +UniformWeights(n) = UniformWeights{n}() + +Base.sum(x::UniformWeights) = 1. +Base.getindex{N}(x::UniformWeights{N}, i) = 1/N + +typealias Weights Union{UniformWeights, RealVector, WeightVec} + + # tabulate data for kde -function tabulate(data::RealVector, midpoints::Range) - ndata = length(data) +function tabulate(data::RealVector, midpoints::Range, weights::Weights=default_weights(data)) npoints = length(midpoints) s = step(midpoints) # Set up a grid for discretized data grid = zeros(Float64, npoints) - ainc = 1.0 / (ndata*s*s) + ainc = 1.0 / (sum(weights)*s*s) # weighted discretization (cf. Jones and Lotwick) - for x in data + for (i,x) in enumerate(data) k = searchsortedfirst(midpoints,x) j = k-1 if 1 <= j <= npoints-1 - grid[j] += (midpoints[k]-x)*ainc - grid[k] += (x-midpoints[j])*ainc + grid[j] += (midpoints[k]-x)*ainc*weights[i] + grid[k] += (x-midpoints[j])*ainc*weights[i] end end @@ -119,30 +132,30 @@ function conv(k::UnivariateKDE, dist::UnivariateDistribution) end # main kde interface methods -function kde(data::RealVector, midpoints::Range, dist::UnivariateDistribution) - k = tabulate(data, midpoints) +function kde(data::RealVector, weights::Weights, midpoints::Range, dist::UnivariateDistribution) + k = tabulate(data, midpoints, weights) conv(k,dist) end function kde(data::RealVector, dist::UnivariateDistribution; - boundary::Tuple{Real,Real}=kde_boundary(data,std(dist)), npoints::Int=2048) + boundary::Tuple{Real,Real}=kde_boundary(data,std(dist)), npoints::Int=2048, weights=default_weights(data)) midpoints = kde_range(boundary,npoints) - kde(data,midpoints,dist) + kde(data,weights,midpoints,dist) end function kde(data::RealVector, midpoints::Range; - bandwidth=default_bandwidth(data), kernel=Normal) + bandwidth=default_bandwidth(data), kernel=Normal, weights=default_weights(data)) bandwidth > 0.0 || error("Bandwidth must be positive") dist = kernel_dist(kernel,bandwidth) - kde(data,midpoints,dist) + kde(data,weights,midpoints,dist) end function kde(data::RealVector; bandwidth=default_bandwidth(data), kernel=Normal, - npoints::Int=2048, boundary::Tuple{Real,Real}=kde_boundary(data,bandwidth)) + npoints::Int=2048, boundary::Tuple{Real,Real}=kde_boundary(data,bandwidth), weights=default_weights(data)) bandwidth > 0.0 || error("Bandwidth must be positive") dist = kernel_dist(kernel,bandwidth) - kde(data,dist;boundary=boundary,npoints=npoints) + kde(data,dist;boundary=boundary,npoints=npoints,weights=weights) end # Select bandwidth using least-squares cross validation, from: @@ -152,10 +165,11 @@ end function kde_lscv(data::RealVector, midpoints::Range; kernel=Normal, - bandwidth_range::Tuple{Real,Real}=(h=default_bandwidth(data); (0.25*h,1.5*h))) + bandwidth_range::Tuple{Real,Real}=(h=default_bandwidth(data); (0.25*h,1.5*h)), + weights=default_weights(data)) ndata = length(data) - k = tabulate(data, midpoints) + k = tabulate(data, midpoints, weights) # the ft here is K/ba*sqrt(2pi) * u(s), it is K times the Yl in Silverman's book K = length(k.density) @@ -194,8 +208,9 @@ function kde_lscv(data::RealVector; boundary::Tuple{Real,Real}=kde_boundary(data,default_bandwidth(data)), npoints::Int=2048, kernel=Normal, - bandwidth_range::Tuple{Real,Real}=(h=default_bandwidth(data); (0.25*h,1.5*h))) + bandwidth_range::Tuple{Real,Real}=(h=default_bandwidth(data); (0.25*h,1.5*h)), + weights::Weights = default_weights(data)) midpoints = kde_range(boundary,npoints) - kde_lscv(data,midpoints; kernel=kernel, bandwidth_range=bandwidth_range) + kde_lscv(data,midpoints; kernel=kernel, bandwidth_range=bandwidth_range, weights=weights) end diff --git a/test/bivariate.jl b/test/bivariate.jl index 216c3880..1006ceef 100644 --- a/test/bivariate.jl +++ b/test/bivariate.jl @@ -56,5 +56,11 @@ for X in ([0.0], [0.0,0.0], [0.0,0.5], [-0.5:0.1:0.5;]) @test all(k5.density .>= 0.0) @test sum(k5.density)*step(k5.x)*step(k5.y) ≈ 1.0 + k6 = kde([X X],(r,r);kernel=D, weights=ones(X)/length(X)) + @test k4.density ≈ k6.density end end + +k1 = kde([0.0 0.0; 1.0 1.0], (r,r), bandwidth=(1,1), weights=[0,1]) +k2 = kde([1.0 1.0], (r,r), bandwidth=(1,1)) +@test k1.density ≈ k2.density diff --git a/test/univariate.jl b/test/univariate.jl index 4caf929e..dcefd08d 100644 --- a/test/univariate.jl +++ b/test/univariate.jl @@ -53,5 +53,11 @@ for X in ([0.0], [0.0,0.0], [0.0,0.5], [-0.5:0.1:0.5;]) @test all(k5.density .>= 0.0) @test sum(k5.density)*step(k5.x) ≈ 1.0 + k6 = kde(X,r;kernel=D, weights=ones(X)/length(X)) + @test k4.density ≈ k6.density end end + +k1 = kde([0.0, 1.], r, bandwidth=1, weights=[0,1]) +k2 = kde([1.], r, bandwidth=1) +@test k1.density ≈ k2.density