Introduce dropobs and dropvars and deprecate Drop.

invenia · Jul 8, 2019 · aedd1ab · aedd1ab
1 parent 20f084e
commit aedd1ab
Show file tree

Hide file tree

Showing 4 changed files with 176 additions and 34 deletions.
diff --git a/src/Impute.jl b/src/Impute.jl
@@ -52,7 +52,9 @@ include("context.jl")
 include("imputors.jl")
 
 const global imputation_methods = Dict{Symbol, Type}(
-    :drop => Drop,
+    :drop => DropObs,
+    :dropobs => DropObs,
+    :dropvars => DropVars,
     :interp => Interpolate,
     :fill => Fill,
     :locf => LOCF,

diff --git a/src/deprecated.jl b/src/deprecated.jl
@@ -156,6 +156,7 @@ end
 # Misc Deprecations #
 #####################
 Base.@deprecate Fill(val; kwargs...) Fill(; value=val, kwargs...)
+Base.@deprecate_binding Drop DropObs false
 
 # This function is just used to support legacy behaviour and should be removed in a
 # future release when we dropping accepting the limit kwarg to impute functions.

diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl
@@ -1,39 +1,39 @@
 """
-    Drop <: Imputor
+    DropObs <: Imputor
 
 Removes missing values from the `AbstractArray` or `Tables.table` provided.
 
 # Fields
 * `context::AbstractContext`: A context which keeps track of missing data
   summary information
 """
-struct Drop <: Imputor
+struct DropObs <: Imputor
     context::AbstractContext
 end
 
-"""Drop(; context=Context()) -> Drop"""
-Drop(; context=Context()) = Drop(context)
+"""DropObs(; context=Context()) -> DropObs"""
+DropObs(; context=Context()) = DropObs(context)
 
 """
-    impute!(imp::Drop, data::AbstractVector)
+    impute!(imp::DropObs, data::AbstractVector)
 
 Uses `filter!` to remove missing elements from the array.
 
 # Arguments
-* `imp::Drop`: this `Imputor` method
+* `imp::DropObs`: this `Imputor` method
 * `data::AbstractVector`: the data to impute
 
 # Returns
 * `AbstractVector`: our data array with missing elements removed
 """
-function impute!(imp::Drop, data::AbstractVector)
+function impute!(imp::DropObs, data::AbstractVector)
     imp.context() do c
         filter!(x -> !ismissing(c, x), data)
     end
 end
 
 """
-    impute!(imp::Drop, data::AbstractMatrix)
+    impute!(imp::DropObs, data::AbstractMatrix)
 
 Finds the missing rows in the matrix and uses a mask (Vector{Bool}) to return the
 `data` with those rows removed. Unfortunately, the mask approach requires copying the matrix.
@@ -46,32 +46,32 @@ NOTES (or premature optimizations):
     3. reshaping the data back to the desired shape.
 
 # Arguments
-* `imp::Drop`: this `Imputor` method
+* `imp::DropObs`: this `Imputor` method
 * `data::AbstractMatrix`: the data to impute
 
 # Returns
 * `AbstractMatrix`: a new matrix with missing rows removed
 """
-function impute!(imp::Drop, data::AbstractMatrix)
+function impute!(imp::DropObs, data::AbstractMatrix)
     imp.context() do c
         mask = map(i -> !ismissing(c, data[i, :]), 1:size(data, 1))
         return data[mask, :]
     end
 end
 
 """
-    impute!(imp::Drop, table)
+    impute!(imp::DropObs, table)
 
 Finds the missing rows in the table and deletes them.
 
 # Arguments
-* `imp::Drop`: this `Imputor` method
+* `imp::DropObs`: this `Imputor` method
 * `table`: a type that implements the Tables API.
 
 # Returns
 * our data with the missing rows removed.
 """
-function impute!(imp::Drop, table)
+function impute!(imp::DropObs, table)
     imp.context() do c
         @assert istable(table)
         rows = Tables.rows(table)
@@ -85,3 +85,95 @@ function impute!(imp::Drop, table)
         return table
     end
 end
+
+
+"""
+    DropVars <: Imputor
+
+
+Removes variables (columns) with too many missing values from the matrix or table provided.
+
+# Fields
+* `context::AbstractContext`: A context which keeps track of missing data
+  summary information
+"""
+struct DropVars <: Imputor
+    context::AbstractContext
+end
+
+"""DropVars(; context=Context()) -> DropVars"""
+DropVars(; context=Context()) = DropVars(context)
+
+"""
+    impute!(imp::DropVars, data::AbstractMatrix)
+
+Finds columns in the matrix with too many missing values and uses a mask (Vector{Bool}) to
+return the `data` with those columns removed. Unfortunately, the mask approach
+requires copying the matrix.
+
+# Arguments
+* `imp::DropVars`: this `Imputor` method
+* `data::AbstractMatrix`: the data to impute
+
+# Returns
+* `AbstractMatrix`: a new matrix with missing columns removed
+"""
+function impute!(imp::DropVars, data::AbstractMatrix)
+    mask = map(1:size(data, 2)) do i
+        try
+            imp.context() do c
+                for j in 1:size(data, 1)
+                    ismissing(c, data[j, i])
+                end
+            end
+            return true
+        catch e
+            if isa(e, ImputeError)
+                return false
+            else
+                rethrow(e)
+            end
+        end
+    end
+
+    data = data[:, mask]
+    return data
+end
+
+"""
+    impute!(imp::DropVars, table)
+
+Finds and removes columns in the table with too many missing elements.
+
+# Arguments
+* `imp::DropVars`: this `Imputor` method
+* `table`: a type that implements the Tables API.
+
+# Returns
+* our data with the missing columns removed.
+"""
+function impute!(imp::DropVars, table)
+    @assert istable(table)
+    cols = Tables.columns(table)
+
+    cnames = Iterators.filter(propertynames(cols)) do cname
+        try
+            imp.context() do c
+                col = getproperty(cols, cname)
+                for i in 1:length(col)
+                    ismissing(c, col[i])
+                end
+            end
+            return true
+        catch e
+            if isa(e, ImputeError)
+                return false
+            else
+                rethrow(e)
+            end
+        end
+    end
+
+    table = Tables.select(table, cnames...) |> materializer(table)
+    return table
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -6,7 +6,17 @@ using RDatasets
 using Statistics
 using StatsBase
 
-import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, ImputeError
+import Impute:
+    Drop,
+    DropObs,
+    DropVars,
+    Interpolate,
+    Fill,
+    LOCF,
+    NOCB,
+    Context,
+    WeightedContext,
+    ImputeError
 
 @testset "Impute" begin
     a = Vector{Union{Float64, Missing}}(1.0:1.0:20.0)
@@ -15,16 +25,53 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im
     ctx = Context(; limit=0.2)
 
     @testset "Drop" begin
-        result = impute(Drop(; context=ctx), a)
-        expected = copy(a)
-        deleteat!(expected, [2, 3, 7])
+        @testset "DropObs" begin
+            result = impute(DropObs(; context=ctx), a)
+            expected = copy(a)
+            deleteat!(expected, [2, 3, 7])
 
-        @test result == expected
-        @test result == Impute.drop(a; context=ctx)
+            @test result == expected
+            @test result == Impute.dropobs(a; context=ctx)
 
-        a2 = copy(a)
-        Impute.drop!(a2; context=ctx)
-        @test a2 == expected
+            a2 = copy(a)
+            Impute.dropobs!(a2; context=ctx)
+            @test a2 == expected
+        end
+        @testset "DropVars" begin
+            @testset "Matrix" begin
+                m = reshape(a, 5, 4)
+
+                result = impute(DropVars(; context=ctx), m)
+                expected = copy(m)[:, 2:4]
+
+                @test isequal(result, expected)
+                @test isequal(result, Impute.dropvars(m; context=ctx))
+
+                Impute.dropvars!(m; context=ctx)
+                # The mutating test is broken because we need to make a copy of
+                # the original matrix
+                @test_broken isequal(m, expected)
+            end
+            @testset "DataFrame" begin
+                df = DataFrame(
+                    :sin => Vector{Union{Float64, Missing}}(sin.(1.0:1.0:20.0)),
+                    :cos => Vector{Union{Float64, Missing}}(sin.(1.0:1.0:20.0)),
+                )
+                df.sin[[2, 3, 7, 12, 19]] .= missing
+                df.cos[[4, 9]] .= missing
+
+                result = impute(DropVars(; context=ctx), df)
+                expected = df[[:cos]]
+
+                @test isequal(result, expected)
+                @test isequal(result, Impute.dropvars(df; context=ctx))
+
+                Impute.dropvars!(df; context=ctx)
+                # The mutating test is broken because we need to make a copy of
+                # the original table
+                @test_broken isequal(df, expected)
+            end
+        end
     end
 
     @testset "Interpolate" begin
@@ -116,9 +163,9 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im
         data = Matrix(dataset("boot", "neuro"))
 
         @testset "Drop" begin
-            result = impute(Drop(; context=ctx), data)
+            result = impute(DropObs(; context=ctx), data)
             @test size(result, 1) == 4
-            @test result == Impute.drop(data; context=ctx)
+            @test result == Impute.dropobs(data; context=ctx)
         end
 
         @testset "Fill" begin
@@ -134,8 +181,8 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im
 
     @testset "Not enough data" begin
         ctx = Context(; limit=0.1)
-        @test_throws ImputeError impute(Drop(; context=ctx), a)
-        @test_throws ImputeError Impute.drop(a; context=ctx)
+        @test_throws ImputeError impute(DropObs(; context=ctx), a)
+        @test_throws ImputeError Impute.dropobs(a; context=ctx)
     end
 
     @testset "Chain" begin
@@ -191,35 +238,35 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im
         data1 = dataset("boot", "neuro")                    # Missing values with `missing`
         data2 = Impute.fill(data1; value=NaN, context=ctx1)  # Missing values with `NaN`
 
-        @test Impute.drop(data1; context=ctx1) == dropmissing(data1)
+        @test Impute.dropobs(data1; context=ctx1) == dropmissing(data1)
 
-        result1 = Impute.interp(data1; context=ctx1) |> Impute.drop!()
-        result2 = Impute.interp(data2; context=ctx2) |> Impute.drop!(; context=ctx2)
+        result1 = Impute.interp(data1; context=ctx1) |> Impute.dropobs!()
+        result2 = Impute.interp(data2; context=ctx2) |> Impute.dropobs!(; context=ctx2)
 
         @test result1 == result2
     end
 
     @testset "Contexts" begin
         @testset "Base" begin
             ctx = Context(; limit=0.1)
-            @test_throws ImputeError Impute.drop(a; context=ctx)
-            @test_throws ImputeError impute(Drop(; context=ctx), a)
+            @test_throws ImputeError Impute.dropobs(a; context=ctx)
+            @test_throws ImputeError impute(DropObs(; context=ctx), a)
         end
 
         @testset "Weighted" begin
             # If we use an exponentially weighted context then we won't pass the limit
             # because missing earlier observations is less important than later ones.
             ctx = WeightedContext(eweights(20, 0.3); limit=0.1)
             @test isa(ctx, WeightedContext)
-            result = impute(Drop(), ctx, a)
+            result = impute(DropObs(), ctx, a)
             expected = copy(a)
             deleteat!(expected, [2, 3, 7])
             @test result == expected
 
             # If we reverse the weights such that earlier observations are more important
             # then our previous limit of 0.2 won't be enough to succeed.
             ctx = WeightedContext(reverse!(eweights(20, 0.3)); limit=0.2)
-            @test_throws ImputeError impute(Drop(), ctx, a)
+            @test_throws ImputeError impute(DropObs(), ctx, a)
         end
     end