Skip to content

Commit

Permalink
Introduce dropobs and dropvars and deprecate Drop.
Browse files Browse the repository at this point in the history
  • Loading branch information
rofinn committed Jul 8, 2019
1 parent 20f084e commit aedd1ab
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 34 deletions.
4 changes: 3 additions & 1 deletion src/Impute.jl
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ include("context.jl")
include("imputors.jl")

const global imputation_methods = Dict{Symbol, Type}(
:drop => Drop,
:drop => DropObs,
:dropobs => DropObs,
:dropvars => DropVars,
:interp => Interpolate,
:fill => Fill,
:locf => LOCF,
Expand Down
1 change: 1 addition & 0 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ end
# Misc Deprecations #
#####################
Base.@deprecate Fill(val; kwargs...) Fill(; value=val, kwargs...)
Base.@deprecate_binding Drop DropObs false

# This function is only used to support legacy behaviour and should be removed in a
# future release when we stop accepting the limit kwarg to impute functions.
Expand Down
118 changes: 105 additions & 13 deletions src/imputors/drop.jl
Original file line number Diff line number Diff line change
@@ -1,39 +1,39 @@
"""
Drop <: Imputor
DropObs <: Imputor
Removes missing values from the `AbstractArray` or `Tables.table` provided.
# Fields
* `context::AbstractContext`: A context which keeps track of missing data
summary information
"""
struct Drop <: Imputor
struct DropObs <: Imputor
context::AbstractContext
end

"""Drop(; context=Context()) -> Drop"""
Drop(; context=Context()) = Drop(context)
"""DropObs(; context=Context()) -> DropObs"""
DropObs(; context=Context()) = DropObs(context)

"""
impute!(imp::Drop, data::AbstractVector)
impute!(imp::DropObs, data::AbstractVector)
Uses `filter!` to remove missing elements from the array.
# Arguments
* `imp::Drop`: this `Imputor` method
* `imp::DropObs`: this `Imputor` method
* `data::AbstractVector`: the data to impute
# Returns
* `AbstractVector`: our data array with missing elements removed
"""
function impute!(imp::Drop, data::AbstractVector)
function impute!(imp::DropObs, data::AbstractVector)
imp.context() do c
filter!(x -> !ismissing(c, x), data)
end
end

"""
impute!(imp::Drop, data::AbstractMatrix)
impute!(imp::DropObs, data::AbstractMatrix)
Finds the missing rows in the matrix and uses a mask (Vector{Bool}) to return the
`data` with those rows removed. Unfortunately, the mask approach requires copying the matrix.
Expand All @@ -46,32 +46,32 @@ NOTES (or premature optimizations):
3. reshaping the data back to the desired shape.
# Arguments
* `imp::Drop`: this `Imputor` method
* `imp::DropObs`: this `Imputor` method
* `data::AbstractMatrix`: the data to impute
# Returns
* `AbstractMatrix`: a new matrix with missing rows removed
"""
function impute!(imp::Drop, data::AbstractMatrix)
function impute!(imp::DropObs, data::AbstractMatrix)
imp.context() do c
mask = map(i -> !ismissing(c, data[i, :]), 1:size(data, 1))
return data[mask, :]
end
end

"""
impute!(imp::Drop, table)
impute!(imp::DropObs, table)
Finds the missing rows in the table and deletes them.
# Arguments
* `imp::Drop`: this `Imputor` method
* `imp::DropObs`: this `Imputor` method
* `table`: a type that implements the Tables API.
# Returns
* our data with the missing rows removed.
"""
function impute!(imp::Drop, table)
function impute!(imp::DropObs, table)
imp.context() do c
@assert istable(table)
rows = Tables.rows(table)
Expand All @@ -85,3 +85,95 @@ function impute!(imp::Drop, table)
return table
end
end


"""
    DropVars <: Imputor

Removes variables (columns) from the `AbstractMatrix` or `Tables.table` provided when
too much of their data is missing according to the `context`.

# Fields
* `context::AbstractContext`: A context which keeps track of missing data
  summary information
"""
struct DropVars <: Imputor
    context::AbstractContext
end

"""
    DropVars(; context=Context()) -> DropVars

Keyword constructor; `context` defaults to a fresh `Context()`.
"""
function DropVars(; context=Context())
    return DropVars(context)
end

"""
    impute!(imp::DropVars, data::AbstractMatrix)

Finds columns in the matrix with too many missing values and uses a mask (Vector{Bool}) to
return the `data` with those columns removed. Unfortunately, the mask approach
requires copying the matrix, so the input `data` itself is not modified.

# Arguments
* `imp::DropVars`: this `Imputor` method
* `data::AbstractMatrix`: the data to impute

# Returns
* `AbstractMatrix`: a new matrix with missing columns removed
"""
function impute!(imp::DropVars, data::AbstractMatrix)
    # Each column is scanned inside its own context call. A column is kept when the
    # call completes and dropped when it raises an `ImputeError`.
    mask = map(axes(data, 2)) do col
        try
            imp.context() do c
                for row in axes(data, 1)
                    ismissing(c, data[row, col])
                end
            end
            return true
        catch e
            # Only `ImputeError` marks a column for removal; anything else is a real
            # failure and is propagated unchanged.
            e isa ImputeError || rethrow()
            return false
        end
    end

    return data[:, mask]
end

"""
    impute!(imp::DropVars, table)

Finds and removes columns in the table with too many missing elements.

# Arguments
* `imp::DropVars`: this `Imputor` method
* `table`: a type that implements the Tables API.

# Returns
* our data with the missing columns removed.
"""
function impute!(imp::DropVars, table)
    @assert istable(table)
    cols = Tables.columns(table)

    # Lazily keep the names of the columns whose scan completes without raising an
    # `ImputeError`. Each column is scanned inside its own context call.
    cnames = Iterators.filter(propertynames(cols)) do cname
        try
            imp.context() do c
                col = getproperty(cols, cname)
                for i in 1:length(col)
                    ismissing(c, col[i])
                end
            end
            return true
        catch e
            # Only `ImputeError` marks a column for removal; anything else is a real
            # failure and is propagated unchanged.
            e isa ImputeError || rethrow()
            return false
        end
    end

    return Tables.select(table, cnames...) |> materializer(table)
end
87 changes: 67 additions & 20 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,17 @@ using RDatasets
using Statistics
using StatsBase

import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, ImputeError
import Impute:
Drop,
DropObs,
DropVars,
Interpolate,
Fill,
LOCF,
NOCB,
Context,
WeightedContext,
ImputeError

@testset "Impute" begin
a = Vector{Union{Float64, Missing}}(1.0:1.0:20.0)
Expand All @@ -15,16 +25,53 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im
ctx = Context(; limit=0.2)

@testset "Drop" begin
result = impute(Drop(; context=ctx), a)
expected = copy(a)
deleteat!(expected, [2, 3, 7])
@testset "DropObs" begin
result = impute(DropObs(; context=ctx), a)
expected = copy(a)
deleteat!(expected, [2, 3, 7])

@test result == expected
@test result == Impute.drop(a; context=ctx)
@test result == expected
@test result == Impute.dropobs(a; context=ctx)

a2 = copy(a)
Impute.drop!(a2; context=ctx)
@test a2 == expected
a2 = copy(a)
Impute.dropobs!(a2; context=ctx)
@test a2 == expected
end
@testset "DropVars" begin
@testset "Matrix" begin
m = reshape(a, 5, 4)

result = impute(DropVars(; context=ctx), m)
expected = copy(m)[:, 2:4]

@test isequal(result, expected)
@test isequal(result, Impute.dropvars(m; context=ctx))

Impute.dropvars!(m; context=ctx)
# The mutating test is broken because we need to make a copy of
# the original matrix
@test_broken isequal(m, expected)
end
@testset "DataFrame" begin
    # Two-column fixture: `:sin` gets 5 of 20 values set to missing and `:cos` gets 2 of 20.
    # The test expects only `:cos` to survive `DropVars` under `ctx`.
    df = DataFrame(
        :sin => Vector{Union{Float64, Missing}}(sin.(1.0:1.0:20.0)),
        :cos => Vector{Union{Float64, Missing}}(cos.(1.0:1.0:20.0)),
    )
    df.sin[[2, 3, 7, 12, 19]] .= missing
    df.cos[[4, 9]] .= missing

    result = impute(DropVars(; context=ctx), df)
    expected = df[[:cos]]

    @test isequal(result, expected)
    @test isequal(result, Impute.dropvars(df; context=ctx))

    Impute.dropvars!(df; context=ctx)
    # The mutating test is broken because we need to make a copy of
    # the original table
    @test_broken isequal(df, expected)
end
end
end

@testset "Interpolate" begin
Expand Down Expand Up @@ -116,9 +163,9 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im
data = Matrix(dataset("boot", "neuro"))

@testset "Drop" begin
result = impute(Drop(; context=ctx), data)
result = impute(DropObs(; context=ctx), data)
@test size(result, 1) == 4
@test result == Impute.drop(data; context=ctx)
@test result == Impute.dropobs(data; context=ctx)
end

@testset "Fill" begin
Expand All @@ -134,8 +181,8 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im

@testset "Not enough data" begin
ctx = Context(; limit=0.1)
@test_throws ImputeError impute(Drop(; context=ctx), a)
@test_throws ImputeError Impute.drop(a; context=ctx)
@test_throws ImputeError impute(DropObs(; context=ctx), a)
@test_throws ImputeError Impute.dropobs(a; context=ctx)
end

@testset "Chain" begin
Expand Down Expand Up @@ -191,35 +238,35 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im
data1 = dataset("boot", "neuro") # Missing values with `missing`
data2 = Impute.fill(data1; value=NaN, context=ctx1) # Missing values with `NaN`

@test Impute.drop(data1; context=ctx1) == dropmissing(data1)
@test Impute.dropobs(data1; context=ctx1) == dropmissing(data1)

result1 = Impute.interp(data1; context=ctx1) |> Impute.drop!()
result2 = Impute.interp(data2; context=ctx2) |> Impute.drop!(; context=ctx2)
result1 = Impute.interp(data1; context=ctx1) |> Impute.dropobs!()
result2 = Impute.interp(data2; context=ctx2) |> Impute.dropobs!(; context=ctx2)

@test result1 == result2
end

@testset "Contexts" begin
@testset "Base" begin
ctx = Context(; limit=0.1)
@test_throws ImputeError Impute.drop(a; context=ctx)
@test_throws ImputeError impute(Drop(; context=ctx), a)
@test_throws ImputeError Impute.dropobs(a; context=ctx)
@test_throws ImputeError impute(DropObs(; context=ctx), a)
end

@testset "Weighted" begin
# If we use an exponentially weighted context then we won't pass the limit
# because missing earlier observations is less important than later ones.
ctx = WeightedContext(eweights(20, 0.3); limit=0.1)
@test isa(ctx, WeightedContext)
result = impute(Drop(), ctx, a)
result = impute(DropObs(), ctx, a)
expected = copy(a)
deleteat!(expected, [2, 3, 7])
@test result == expected

# If we reverse the weights such that earlier observations are more important
# then our previous limit of 0.2 won't be enough to succeed.
ctx = WeightedContext(reverse!(eweights(20, 0.3)); limit=0.2)
@test_throws ImputeError impute(Drop(), ctx, a)
@test_throws ImputeError impute(DropObs(), ctx, a)
end
end

Expand Down

0 comments on commit aedd1ab

Please sign in to comment.