Merge pull request #26 from axsk/weights
Weighted KDE
simonbyrne authored Jun 27, 2017
2 parents d7524ff + 4a3680a commit 4bcdc23
Showing 4 changed files with 66 additions and 33 deletions.
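In short, this PR threads an optional `weights` argument through tabulation and the `kde` entry points, so each observation can contribute a different mass to the estimate; when no weights are given, a `UniformWeights` placeholder reproduces the old unweighted behaviour. A minimal usage sketch of the new keyword follows — the data and variable names are illustrative, not part of the commit:

```julia
using KernelDensity

x = randn(1000)                     # illustrative sample
w = rand(length(x))                 # hypothetical per-observation weights

k_unweighted = kde(x)               # old behaviour, now backed by UniformWeights
k_weighted   = kde(x, weights=w)    # new: density normalised by sum(w)
```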
36 changes: 21 additions & 15 deletions src/bivariate.jl
@@ -30,7 +30,7 @@ function default_bandwidth(data::Tuple{RealVector,RealVector})
end

# tabulate data for kde
function tabulate(data::Tuple{RealVector, RealVector}, midpoints::Tuple{Range, Range})
function tabulate(data::Tuple{RealVector, RealVector}, midpoints::Tuple{Range, Range}, weights::Weights = default_weights(data))
xdata, ydata = data
ndata = length(xdata)
length(ydata) == ndata || error("data vectors must be of same length")
@@ -41,17 +41,19 @@ function tabulate(data::Tuple{RealVector, RealVector}, midpoints::Tuple{Range, R

# Set up a grid for discretized data
grid = zeros(Float64, nx, ny)
ainc = 1.0 / (ndata*(sx*sy)^2)
ainc = 1.0 / (sum(weights)*(sx*sy)^2)

# weighted discretization (cf. Jones and Lotwick)
for (x, y) in zip(xdata,ydata)
for i in 1:length(xdata)
x = xdata[i]
y = ydata[i]
kx, ky = searchsortedfirst(xmid,x), searchsortedfirst(ymid,y)
jx, jy = kx-1, ky-1
if 1 <= jx <= nx-1 && 1 <= jy <= ny-1
grid[jx,jy] += (xmid[kx]-x)*(ymid[ky]-y)*ainc
grid[kx,jy] += (x-xmid[jx])*(ymid[ky]-y)*ainc
grid[jx,ky] += (xmid[kx]-x)*(y-ymid[jy])*ainc
grid[kx,ky] += (x-xmid[jx])*(y-ymid[jy])*ainc
grid[jx,jy] += (xmid[kx]-x)*(ymid[ky]-y)*ainc*weights[i]
grid[kx,jy] += (x-xmid[jx])*(ymid[ky]-y)*ainc*weights[i]
grid[jx,ky] += (xmid[kx]-x)*(y-ymid[jy])*ainc*weights[i]
grid[kx,ky] += (x-xmid[jx])*(y-ymid[jy])*ainc*weights[i]
end
end

@@ -87,41 +89,45 @@ end

const BivariateDistribution = Union{MultivariateDistribution,Tuple{UnivariateDistribution,UnivariateDistribution}}

function kde(data::Tuple{RealVector, RealVector}, midpoints::Tuple{Range, Range}, dist::BivariateDistribution)
k = tabulate(data,midpoints)
default_weights(data::Tuple{RealVector, RealVector}) = UniformWeights(length(data[1]))

function kde(data::Tuple{RealVector, RealVector}, weights::Weights, midpoints::Tuple{Range, Range}, dist::BivariateDistribution)
k = tabulate(data, midpoints, weights)
conv(k,dist)
end

function kde(data::Tuple{RealVector, RealVector}, dist::BivariateDistribution;
boundary::Tuple{Tuple{Real,Real}, Tuple{Real,Real}} = (kde_boundary(data[1],std(dist[1])),
kde_boundary(data[2],std(dist[2]))),
npoints::Tuple{Int,Int}=(256,256))
npoints::Tuple{Int,Int}=(256,256),
weights::Weights = default_weights(data))

xmid = kde_range(boundary[1],npoints[1])
ymid = kde_range(boundary[2],npoints[2])

kde(data,(xmid,ymid),dist)
kde(data,weights,(xmid,ymid),dist)
end

function kde(data::Tuple{RealVector, RealVector}, midpoints::Tuple{Range, Range};
bandwidth=default_bandwidth(data), kernel=Normal)
bandwidth=default_bandwidth(data), kernel=Normal, weights::Weights = default_weights(data))

dist = kernel_dist(kernel,bandwidth)
kde(data,midpoints,dist)
kde(data,weights,midpoints,dist)
end

function kde(data::Tuple{RealVector, RealVector};
bandwidth=default_bandwidth(data),
kernel=Normal,
boundary::Tuple{Tuple{Real,Real}, Tuple{Real,Real}} = (kde_boundary(data[1],bandwidth[1]),
kde_boundary(data[2],bandwidth[2])),
npoints::Tuple{Int,Int}=(256,256))
npoints::Tuple{Int,Int}=(256,256),
weights::Weights = default_weights(data))

dist = kernel_dist(kernel,bandwidth)
xmid = kde_range(boundary[1],npoints[1])
ymid = kde_range(boundary[2],npoints[2])

kde(data,(xmid,ymid),dist)
kde(data,weights,(xmid,ymid),dist)
end

# matrix data
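The bivariate path mirrors the univariate one: `tabulate` now spreads each point's weight across the four neighbouring grid cells and normalises by `sum(weights)`, and the `kde` methods gain a `weights` keyword (or a positional argument in the midpoints/distribution form). A rough sketch of the tuple-of-vectors call with the new keyword, again with illustrative data and names:

```julia
using KernelDensity

x, y = randn(500), randn(500)
w = rand(500)                       # hypothetical weights, one per (x, y) pair

k = kde((x, y), weights=w)          # weighted bivariate estimate on the default grid
```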
51 changes: 33 additions & 18 deletions src/univariate.jl
@@ -37,6 +37,10 @@ function default_bandwidth(data::RealVector, alpha::Float64 = 0.9)
return alpha * width * ndata^(-0.2)
end

function default_weights(data::RealVector)
UniformWeights(length(data))
end


# Roughly based on:
# B. W. Silverman (1982) "Algorithm AS 176: Kernel Density Estimation Using
@@ -66,23 +70,32 @@ function kde_range(boundary::Tuple{Real,Real}, npoints::Int)
lo:step:hi
end

immutable UniformWeights{N} end

UniformWeights(n) = UniformWeights{n}()

Base.sum(x::UniformWeights) = 1.
Base.getindex{N}(x::UniformWeights{N}, i) = 1/N

typealias Weights Union{UniformWeights, RealVector, WeightVec}


# tabulate data for kde
function tabulate(data::RealVector, midpoints::Range)
ndata = length(data)
function tabulate(data::RealVector, midpoints::Range, weights::Weights=default_weights(data))
npoints = length(midpoints)
s = step(midpoints)

# Set up a grid for discretized data
grid = zeros(Float64, npoints)
ainc = 1.0 / (ndata*s*s)
ainc = 1.0 / (sum(weights)*s*s)

# weighted discretization (cf. Jones and Lotwick)
for x in data
for (i,x) in enumerate(data)
k = searchsortedfirst(midpoints,x)
j = k-1
if 1 <= j <= npoints-1
grid[j] += (midpoints[k]-x)*ainc
grid[k] += (x-midpoints[j])*ainc
grid[j] += (midpoints[k]-x)*ainc*weights[i]
grid[k] += (x-midpoints[j])*ainc*weights[i]
end
end

@@ -119,30 +132,30 @@ function conv(k::UnivariateKDE, dist::UnivariateDistribution)
end

# main kde interface methods
function kde(data::RealVector, midpoints::Range, dist::UnivariateDistribution)
k = tabulate(data, midpoints)
function kde(data::RealVector, weights::Weights, midpoints::Range, dist::UnivariateDistribution)
k = tabulate(data, midpoints, weights)
conv(k,dist)
end

function kde(data::RealVector, dist::UnivariateDistribution;
boundary::Tuple{Real,Real}=kde_boundary(data,std(dist)), npoints::Int=2048)
boundary::Tuple{Real,Real}=kde_boundary(data,std(dist)), npoints::Int=2048, weights=default_weights(data))

midpoints = kde_range(boundary,npoints)
kde(data,midpoints,dist)
kde(data,weights,midpoints,dist)
end

function kde(data::RealVector, midpoints::Range;
bandwidth=default_bandwidth(data), kernel=Normal)
bandwidth=default_bandwidth(data), kernel=Normal, weights=default_weights(data))
bandwidth > 0.0 || error("Bandwidth must be positive")
dist = kernel_dist(kernel,bandwidth)
kde(data,midpoints,dist)
kde(data,weights,midpoints,dist)
end

function kde(data::RealVector; bandwidth=default_bandwidth(data), kernel=Normal,
npoints::Int=2048, boundary::Tuple{Real,Real}=kde_boundary(data,bandwidth))
npoints::Int=2048, boundary::Tuple{Real,Real}=kde_boundary(data,bandwidth), weights=default_weights(data))
bandwidth > 0.0 || error("Bandwidth must be positive")
dist = kernel_dist(kernel,bandwidth)
kde(data,dist;boundary=boundary,npoints=npoints)
kde(data,dist;boundary=boundary,npoints=npoints,weights=weights)
end

# Select bandwidth using least-squares cross validation, from:
@@ -152,10 +165,11 @@ end

function kde_lscv(data::RealVector, midpoints::Range;
kernel=Normal,
bandwidth_range::Tuple{Real,Real}=(h=default_bandwidth(data); (0.25*h,1.5*h)))
bandwidth_range::Tuple{Real,Real}=(h=default_bandwidth(data); (0.25*h,1.5*h)),
weights=default_weights(data))

ndata = length(data)
k = tabulate(data, midpoints)
k = tabulate(data, midpoints, weights)

# the ft here is K/ba*sqrt(2pi) * u(s), it is K times the Yl in Silverman's book
K = length(k.density)
@@ -194,8 +208,9 @@ function kde_lscv(data::RealVector;
boundary::Tuple{Real,Real}=kde_boundary(data,default_bandwidth(data)),
npoints::Int=2048,
kernel=Normal,
bandwidth_range::Tuple{Real,Real}=(h=default_bandwidth(data); (0.25*h,1.5*h)))
bandwidth_range::Tuple{Real,Real}=(h=default_bandwidth(data); (0.25*h,1.5*h)),
weights::Weights = default_weights(data))

midpoints = kde_range(boundary,npoints)
kde_lscv(data,midpoints; kernel=kernel, bandwidth_range=bandwidth_range)
kde_lscv(data,midpoints; kernel=kernel, bandwidth_range=bandwidth_range, weights=weights)
end
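A detail worth calling out from the univariate changes above: `UniformWeights{N}` stores the sample size as a type parameter, sums to 1, and indexes to `1/N`, so the default unweighted case reuses the weighted code without allocating a weight vector. Passing an explicit uniform vector should therefore match the default, as in this illustrative check (the names are placeholders, not from the diff):

```julia
using KernelDensity

x = randn(200)
k_default  = kde(x)                              # uses UniformWeights(length(x)) internally
k_explicit = kde(x, weights=ones(x)/length(x))   # explicit uniform weight vector

maximum(abs.(k_default.density .- k_explicit.density))   # expected to be ≈ 0
```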
6 changes: 6 additions & 0 deletions test/bivariate.jl
@@ -56,5 +56,11 @@ for X in ([0.0], [0.0,0.0], [0.0,0.5], [-0.5:0.1:0.5;])
@test all(k5.density .>= 0.0)
@test sum(k5.density)*step(k5.x)*step(k5.y) ≈ 1.0

k6 = kde([X X],(r,r);kernel=D, weights=ones(X)/length(X))
@test k4.density ≈ k6.density
end
end

k1 = kde([0.0 0.0; 1.0 1.0], (r,r), bandwidth=(1,1), weights=[0,1])
k2 = kde([1.0 1.0], (r,r), bandwidth=(1,1))
@test k1.density ≈ k2.density
6 changes: 6 additions & 0 deletions test/univariate.jl
@@ -53,5 +53,11 @@ for X in ([0.0], [0.0,0.0], [0.0,0.5], [-0.5:0.1:0.5;])
@test all(k5.density .>= 0.0)
@test sum(k5.density)*step(k5.x) ≈ 1.0

k6 = kde(X,r;kernel=D, weights=ones(X)/length(X))
@test k4.density ≈ k6.density
end
end

k1 = kde([0.0, 1.], r, bandwidth=1, weights=[0,1])
k2 = kde([1.], r, bandwidth=1)
@test k1.density ≈ k2.density
