Skip to content

Commit

Permalink
Add weighted KDE
Browse files Browse the repository at this point in the history
  • Loading branch information
Luke Stagner committed Oct 13, 2016
1 parent cf96862 commit b48fb2d
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 28 deletions.
2 changes: 1 addition & 1 deletion src/KernelDensity.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ using Interpolations
using Compat

import Base: conv
import StatsBase: RealVector, RealMatrix
import StatsBase: RealVector, RealMatrix, WeightVec
import Distributions: twoπ, pdf

export kde, kde_lscv, UnivariateKDE, BivariateKDE, InterpKDE, pdf
Expand Down
34 changes: 20 additions & 14 deletions src/bivariate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,13 @@ function default_bandwidth(data::(@compat Tuple{RealVector,RealVector}))
default_bandwidth(data[1]), default_bandwidth(data[2])
end

function default_weights(data::(@compat Tuple{RealVector,RealVector}))
n = length(data[1])
WeightVec(fill(1/n,n))
end

# tabulate data for kde
function tabulate(data::(@compat Tuple{RealVector, RealVector}), midpoints::(@compat Tuple{Range, Range}))
function tabulate(data::(@compat Tuple{RealVector, RealVector}), midpoints::(@compat Tuple{Range, Range}), weights::WeightVec)
xdata, ydata = data
ndata = length(xdata)
length(ydata) == ndata || error("data vectors must be of same length")
Expand All @@ -35,17 +40,17 @@ function tabulate(data::(@compat Tuple{RealVector, RealVector}), midpoints::(@co

# Set up a grid for discretized data
grid = zeros(Float64, nx, ny)
ainc = 1.0 / (ndata*(sx*sy)^2)
ainc = 1.0 / (weights.sum*(sx*sy)^2)

# weighted discretization (cf. Jones and Lotwick)
for (x, y) in zip(xdata,ydata)
for (x, y, w) in zip(xdata,ydata,weights.values)
kx, ky = searchsortedfirst(xmid,x), searchsortedfirst(ymid,y)
jx, jy = kx-1, ky-1
if 1 <= jx <= nx-1 && 1 <= jy <= ny-1
grid[jx,jy] += (xmid[kx]-x)*(ymid[ky]-y)*ainc
grid[kx,jy] += (x-xmid[jx])*(ymid[ky]-y)*ainc
grid[jx,ky] += (xmid[kx]-x)*(y-ymid[jy])*ainc
grid[kx,ky] += (x-xmid[jx])*(y-ymid[jy])*ainc
grid[jx,jy] += (xmid[kx]-x)*(ymid[ky]-y)*ainc*w
grid[kx,jy] += (x-xmid[jx])*(ymid[ky]-y)*ainc*w
grid[jx,ky] += (xmid[kx]-x)*(y-ymid[jy])*ainc*w
grid[kx,ky] += (x-xmid[jx])*(y-ymid[jy])*ainc*w
end
end

Expand Down Expand Up @@ -81,32 +86,33 @@ end

typealias BivariateDistribution @compat(Union{MultivariateDistribution,Tuple{UnivariateDistribution,UnivariateDistribution}})

function kde(data::(@compat Tuple{RealVector, RealVector}), midpoints::(@compat Tuple{Range, Range}), dist::BivariateDistribution)
k = tabulate(data,midpoints)
function kde(data::(@compat Tuple{RealVector, RealVector}), midpoints::(@compat Tuple{Range, Range}), weights::WeightVec, dist::BivariateDistribution)
k = tabulate(data,midpoints,weights)
conv(k,dist)
end

function kde(data::(@compat Tuple{RealVector, RealVector}), dist::BivariateDistribution;
boundary::(@compat Tuple{(@compat Tuple{Real,Real}),(@compat Tuple{Real,Real})}) = (kde_boundary(data[1],std(dist[1])),
kde_boundary(data[2],std(dist[2]))),
npoints::(@compat Tuple{Int,Int})=(256,256))
npoints::(@compat Tuple{Int,Int})=(256,256), weights::WeightVec = default_weights(data))

xmid = kde_range(boundary[1],npoints[1])
ymid = kde_range(boundary[2],npoints[2])

kde(data,(xmid,ymid),dist)
kde(data,(xmid,ymid),weights,dist)
end

function kde(data::(@compat Tuple{RealVector, RealVector}), midpoints::(@compat Tuple{Range, Range});
bandwidth=default_bandwidth(data), kernel=Normal)
bandwidth=default_bandwidth(data), kernel=Normal, weights=default_weights(data))

dist = kernel_dist(kernel,bandwidth)
kde(data,midpoints,dist)
kde(data,midpoints,weights,dist)
end

function kde(data::(@compat Tuple{RealVector, RealVector});
bandwidth=default_bandwidth(data),
kernel=Normal,
weights=default_weights(data),
boundary::(@compat Tuple{(@compat Tuple{Real,Real}),(@compat Tuple{Real,Real})}) = (kde_boundary(data[1],bandwidth[1]),
kde_boundary(data[2],bandwidth[2])),
npoints::(@compat Tuple{Int,Int})=(256,256))
Expand All @@ -115,7 +121,7 @@ function kde(data::(@compat Tuple{RealVector, RealVector});
xmid = kde_range(boundary[1],npoints[1])
ymid = kde_range(boundary[2],npoints[2])

kde(data,(xmid,ymid),dist)
kde(data,(xmid,ymid),weights,dist)
end

# matrix data
Expand Down
33 changes: 20 additions & 13 deletions src/univariate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -66,23 +66,28 @@ function kde_range(boundary::(@compat Tuple{Real,Real}), npoints::Int)
lo:step:hi
end

function default_weights(data::RealVector)
n = length(data)
WeightVec(fill(1/n, n))
end

# tabulate data for kde
function tabulate(data::RealVector, midpoints::Range)
function tabulate(data::RealVector, midpoints::Range, weights::WeightVec)
ndata = length(data)
npoints = length(midpoints)
s = step(midpoints)

# Set up a grid for discretized data
grid = zeros(Float64, npoints)
ainc = 1.0 / (ndata*s*s)
ainc = 1.0 / (weights.sum*s*s)

# weighted discretization (cf. Jones and Lotwick)
for x in data
for (x, w) in zip(data,weights.values)
k = searchsortedfirst(midpoints,x)
j = k-1
if 1 <= j <= npoints-1
grid[j] += (midpoints[k]-x)*ainc
grid[k] += (x-midpoints[j])*ainc
grid[j] += (midpoints[k]-x)*ainc*w
grid[k] += (x-midpoints[j])*ainc*w
end
end

Expand Down Expand Up @@ -119,30 +124,31 @@ function conv(k::UnivariateKDE, dist::UnivariateDistribution)
end

# main kde interface methods
function kde(data::RealVector, midpoints::Range, dist::UnivariateDistribution)
k = tabulate(data, midpoints)
function kde(data::RealVector, midpoints::Range, weights::WeightVec, dist::UnivariateDistribution)
k = tabulate(data, midpoints, weights)
conv(k,dist)
end

function kde(data::RealVector, dist::UnivariateDistribution;
weights::WeightVec = default_weights(data),
boundary::(@compat Tuple{Real,Real})=kde_boundary(data,std(dist)), npoints::Int=2048)

midpoints = kde_range(boundary,npoints)
kde(data,midpoints,dist)
kde(data,midpoints,weights,dist)
end

function kde(data::RealVector, midpoints::Range;
bandwidth=default_bandwidth(data), kernel=Normal)
bandwidth=default_bandwidth(data), weights=default_weights(data),kernel=Normal)
bandwidth > 0.0 || error("Bandwidth must be positive")
dist = kernel_dist(kernel,bandwidth)
kde(data,midpoints,dist)
kde(data,midpoints,weights,dist)
end

function kde(data::RealVector; bandwidth=default_bandwidth(data), kernel=Normal,
function kde(data::RealVector; bandwidth=default_bandwidth(data), weights=default_weights(data), kernel=Normal,
npoints::Int=2048, boundary::(@compat Tuple{Real,Real})=kde_boundary(data,bandwidth))
bandwidth > 0.0 || error("Bandwidth must be positive")
dist = kernel_dist(kernel,bandwidth)
kde(data,dist;boundary=boundary,npoints=npoints)
kde(data,dist;weights=weights,boundary=boundary,npoints=npoints)
end

# Select bandwidth using least-squares cross validation, from:
Expand All @@ -151,11 +157,12 @@ end
# sections 3.4.3 (pp. 48-52) and 3.5 (pp. 61-66)

function kde_lscv(data::RealVector, midpoints::Range;
weights=default_weights(data),
kernel=Normal,
bandwidth_range::(@compat Tuple{Real,Real})=(h=default_bandwidth(data); (0.25*h,1.5*h)))

ndata = length(data)
k = tabulate(data, midpoints)
k = tabulate(data, midpoints, weights)

# the ft here is K/ba*sqrt(2pi) * u(s), it is K times the Yl in Silverman's book
K = length(k.density)
Expand Down

0 comments on commit b48fb2d

Please sign in to comment.