diff --git a/src/Impute.jl b/src/Impute.jl index 7c5c062..e72a9fd 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -52,7 +52,9 @@ include("context.jl") include("imputors.jl") const global imputation_methods = Dict{Symbol, Type}( - :drop => Drop, + :drop => DropObs, + :dropobs => DropObs, + :dropvars => DropVars, :interp => Interpolate, :fill => Fill, :locf => LOCF, diff --git a/src/deprecated.jl b/src/deprecated.jl index 26dab75..a5174df 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -156,6 +156,7 @@ end # Misc Deprecations # ##################### Base.@deprecate Fill(val; kwargs...) Fill(; value=val, kwargs...) +Base.@deprecate_binding Drop DropObs false # This function is just used to support legacy behaviour and should be removed in a # future release when we dropping accepting the limit kwarg to impute functions. diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index 1e44355..8f02b88 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -1,5 +1,5 @@ """ - Drop <: Imputor + DropObs <: Imputor Removes missing values from the `AbstractArray` or `Tables.table` provided. @@ -7,33 +7,33 @@ Removes missing values from the `AbstractArray` or `Tables.table` provided. * `context::AbstractContext`: A context which keeps track of missing data summary information """ -struct Drop <: Imputor +struct DropObs <: Imputor context::AbstractContext end -"""Drop(; context=Context()) -> Drop""" -Drop(; context=Context()) = Drop(context) +"""DropObs(; context=Context()) -> DropObs""" +DropObs(; context=Context()) = DropObs(context) """ - impute!(imp::Drop, data::AbstractVector) + impute!(imp::DropObs, data::AbstractVector) Uses `filter!` to remove missing elements from the array. 
# Arguments -* `imp::Drop`: this `Imputor` method +* `imp::DropObs`: this `Imputor` method * `data::AbstractVector`: the data to impute # Returns * `AbstractVector`: our data array with missing elements removed """ -function impute!(imp::Drop, data::AbstractVector) +function impute!(imp::DropObs, data::AbstractVector) imp.context() do c filter!(x -> !ismissing(c, x), data) end end """ - impute!(imp::Drop, data::AbstractMatrix) + impute!(imp::DropObs, data::AbstractMatrix) Finds the missing rows in the matrix and uses a mask (Vector{Bool}) to return the `data` with those rows removed. Unfortunately, the mask approach requires copying the matrix. @@ -46,13 +46,13 @@ NOTES (or premature optimizations): 3. reshaping the data back to the desired shape. # Arguments -* `imp::Drop`: this `Imputor` method +* `imp::DropObs`: this `Imputor` method * `data::AbstractMatrix`: the data to impute # Returns * `AbstractMatrix`: a new matrix with missing rows removed """ -function impute!(imp::Drop, data::AbstractMatrix) +function impute!(imp::DropObs, data::AbstractMatrix) imp.context() do c mask = map(i -> !ismissing(c, data[i, :]), 1:size(data, 1)) return data[mask, :] @@ -60,18 +60,18 @@ function impute!(imp::Drop, data::AbstractMatrix) end """ - impute!(imp::Drop, table) + impute!(imp::DropObs, table) Finds the missing rows in the table and deletes them. # Arguments -* `imp::Drop`: this `Imputor` method +* `imp::DropObs`: this `Imputor` method * `table`: a type that implements the Tables API. # Returns * our data with the missing rows removed. """ -function impute!(imp::Drop, table) +function impute!(imp::DropObs, table) imp.context() do c @assert istable(table) rows = Tables.rows(table) @@ -85,3 +85,95 @@ function impute!(imp::Drop, table) return table end end + + +""" + DropVars <: Imputor + + +Removes variables (columns) with too many missing values from the matrix or table provided.
+ +# Fields +* `context::AbstractContext`: A context which keeps track of missing data + summary information +""" +struct DropVars <: Imputor + context::AbstractContext +end + +"""DropVars(; context=Context()) -> DropVars""" +DropVars(; context=Context()) = DropVars(context) + +""" + impute!(imp::DropVars, data::AbstractMatrix) + +Finds columns in the matrix with too many missing values and uses a mask (Vector{Bool}) to +return the `data` with those columns removed. Unfortunately, the mask approach +requires copying the matrix. + +# Arguments +* `imp::DropVars`: this `Imputor` method +* `data::AbstractMatrix`: the data to impute + +# Returns +* `AbstractMatrix`: a new matrix with missing columns removed +""" +function impute!(imp::DropVars, data::AbstractMatrix) + mask = map(1:size(data, 2)) do i + try + imp.context() do c + for j in 1:size(data, 1) + ismissing(c, data[j, i]) + end + end + return true + catch e + if isa(e, ImputeError) + return false + else + rethrow(e) + end + end + end + + data = data[:, mask] + return data +end + +""" + impute!(imp::DropVars, table) + +Finds and removes columns in the table with too many missing elements. + +# Arguments +* `imp::DropVars`: this `Imputor` method +* `table`: a type that implements the Tables API. + +# Returns +* our data with the missing columns removed. +""" +function impute!(imp::DropVars, table) + @assert istable(table) + cols = Tables.columns(table) + + cnames = Iterators.filter(propertynames(cols)) do cname + try + imp.context() do c + col = getproperty(cols, cname) + for i in 1:length(col) + ismissing(c, col[i]) + end + end + return true + catch e + if isa(e, ImputeError) + return false + else + rethrow(e) + end + end + end + + table = Tables.select(table, cnames...)
|> materializer(table) + return table +end diff --git a/test/runtests.jl b/test/runtests.jl index 085d643..7d064a6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,7 +6,17 @@ using RDatasets using Statistics using StatsBase -import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, ImputeError +import Impute: + Drop, + DropObs, + DropVars, + Interpolate, + Fill, + LOCF, + NOCB, + Context, + WeightedContext, + ImputeError @testset "Impute" begin a = Vector{Union{Float64, Missing}}(1.0:1.0:20.0) @@ -15,16 +25,53 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im ctx = Context(; limit=0.2) @testset "Drop" begin - result = impute(Drop(; context=ctx), a) - expected = copy(a) - deleteat!(expected, [2, 3, 7]) + @testset "DropObs" begin + result = impute(DropObs(; context=ctx), a) + expected = copy(a) + deleteat!(expected, [2, 3, 7]) - @test result == expected - @test result == Impute.drop(a; context=ctx) + @test result == expected + @test result == Impute.dropobs(a; context=ctx) - a2 = copy(a) - Impute.drop!(a2; context=ctx) - @test a2 == expected + a2 = copy(a) + Impute.dropobs!(a2; context=ctx) + @test a2 == expected + end + @testset "DropVars" begin + @testset "Matrix" begin + m = reshape(a, 5, 4) + + result = impute(DropVars(; context=ctx), m) + expected = copy(m)[:, 2:4] + + @test isequal(result, expected) + @test isequal(result, Impute.dropvars(m; context=ctx)) + + Impute.dropvars!(m; context=ctx) + # The mutating test is broken because we need to make a copy of + # the original matrix + @test_broken isequal(m, expected) + end + @testset "DataFrame" begin + df = DataFrame( + :sin => Vector{Union{Float64, Missing}}(sin.(1.0:1.0:20.0)), + :cos => Vector{Union{Float64, Missing}}(sin.(1.0:1.0:20.0)), + ) + df.sin[[2, 3, 7, 12, 19]] .= missing + df.cos[[4, 9]] .= missing + + result = impute(DropVars(; context=ctx), df) + expected = df[[:cos]] + + @test isequal(result, expected) + @test isequal(result,
Impute.dropvars(df; context=ctx)) + + Impute.dropvars!(df; context=ctx) + # The mutating test is broken because we need to make a copy of + # the original table + @test_broken isequal(df, expected) + end + end end @testset "Interpolate" begin @@ -116,9 +163,9 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im data = Matrix(dataset("boot", "neuro")) @testset "Drop" begin - result = impute(Drop(; context=ctx), data) + result = impute(DropObs(; context=ctx), data) @test size(result, 1) == 4 - @test result == Impute.drop(data; context=ctx) + @test result == Impute.dropobs(data; context=ctx) end @testset "Fill" begin @@ -134,8 +181,8 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im @testset "Not enough data" begin ctx = Context(; limit=0.1) - @test_throws ImputeError impute(Drop(; context=ctx), a) - @test_throws ImputeError Impute.drop(a; context=ctx) + @test_throws ImputeError impute(DropObs(; context=ctx), a) + @test_throws ImputeError Impute.dropobs(a; context=ctx) end @testset "Chain" begin @@ -191,10 +238,10 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im data1 = dataset("boot", "neuro") # Missing values with `missing` data2 = Impute.fill(data1; value=NaN, context=ctx1) # Missing values with `NaN` - @test Impute.drop(data1; context=ctx1) == dropmissing(data1) + @test Impute.dropobs(data1; context=ctx1) == dropmissing(data1) - result1 = Impute.interp(data1; context=ctx1) |> Impute.drop!() - result2 = Impute.interp(data2; context=ctx2) |> Impute.drop!(; context=ctx2) + result1 = Impute.interp(data1; context=ctx1) |> Impute.dropobs!() + result2 = Impute.interp(data2; context=ctx2) |> Impute.dropobs!(; context=ctx2) @test result1 == result2 end @@ -202,8 +249,8 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im @testset "Contexts" begin @testset "Base" begin ctx = Context(; limit=0.1) - @test_throws ImputeError Impute.drop(a;
context=ctx) - @test_throws ImputeError impute(Drop(; context=ctx), a) + @test_throws ImputeError Impute.dropobs(a; context=ctx) + @test_throws ImputeError impute(DropObs(; context=ctx), a) end @testset "Weighted" begin @@ -211,7 +258,7 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im # because missing earlier observations is less important than later ones. ctx = WeightedContext(eweights(20, 0.3); limit=0.1) @test isa(ctx, WeightedContext) - result = impute(Drop(), ctx, a) + result = impute(DropObs(), ctx, a) expected = copy(a) deleteat!(expected, [2, 3, 7]) @test result == expected @@ -219,7 +266,7 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im # If we reverse the weights such that earlier observations are more important # then our previous limit of 0.2 won't be enough to succeed. ctx = WeightedContext(reverse!(eweights(20, 0.3)); limit=0.2) - @test_throws ImputeError impute(Drop(), ctx, a) + @test_throws ImputeError impute(DropObs(), ctx, a) end end