diff --git a/.appveyor.yml b/.appveyor.yml index 2a62834..756f881 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,6 +1,5 @@ environment: matrix: - - julia_version: 0.7 - julia_version: 1.0 - julia_version: nightly diff --git a/.travis.yml b/.travis.yml index 56e488c..3e44a10 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,6 @@ os: - linux - osx julia: - - 0.7 - 1.0 - nightly notifications: @@ -18,18 +17,13 @@ matrix: # - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi # - julia -e 'Pkg.clone(pwd()); Pkg.build("Impute"); Pkg.test("Impute"; coverage=true)' after_success: - - | - julia -e ' - VERSION >= v"0.7.0-DEV.3656" && using Pkg - VERSION >= v"0.7.0-DEV.5183" || cd(Pkg.dir("Impute")) - Pkg.add("Coverage") - using Coverage - Codecov.submit(Codecov.process_folder()) - ' - - | - julia -e ' - VERSION >= v"0.7.0-DEV.3656" && using Pkg - VERSION >= v"0.7.0-DEV.5183" || cd(Pkg.dir("Impute")) - Pkg.add("Documenter") - include(joinpath("docs", "make.jl")) - ' + - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())' +jobs: + include: + - stage: "Documentation" + julia: 1.0 + os: linux + script: + - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' + - julia --project=docs/ docs/make.jl + after_success: skip diff --git a/Project.toml b/Project.toml index 85a029f..b66b346 100644 --- a/Project.toml +++ b/Project.toml @@ -4,16 +4,22 @@ authors = ["Invenia Technical Computing"] version = "0.2.0" [deps] -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -DataFrames = "0.17, 0.18" +DataFrames = ">= 0.16" +IterTools = "1.2" +RDatasets = ">= 0.6.2" +Tables = "0.2" julia = "1" [extras] +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" RDatasets = 
"ce6b1742-4840-55fa-b093-852dadbb1d8b" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["RDatasets", "Test"] +test = ["DataFrames", "RDatasets", "Test"] diff --git a/README.md b/README.md index 5c7f530..6943409 100644 --- a/README.md +++ b/README.md @@ -5,30 +5,110 @@ [![Build status](https://ci.appveyor.com/api/projects/status/github/invenia/Impute.jl?svg=true)](https://ci.appveyor.com/project/invenia/Impute-jl) [![codecov](https://codecov.io/gh/invenia/Impute.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/invenia/Impute.jl) -Impute.jl provides various data imputation methods for `Arrays` and `DataFrames` with various types of missing values. +Impute.jl provides various methods for handling missing data in Vectors, Matrices and [Tables](https://github.com/JuliaData/Tables.jl). ## Installation ```julia -Pkg.clone("https://github.com/invenia/Impute.jl") +julia> using Pkg; Pkg.add("Impute") ``` -## Features -* Operate over Vectors, Matrices or DataFrames -* Chaining of methods +## Quickstart +Let's start by loading our dependencies: +```julia +julia> using DataFrames, RDatasets, Impute +``` + +We'll also want some test data containing missings to work with: + +```julia +julia> df = dataset("boot", "neuro") +469×6 DataFrames.DataFrame +│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ +│ │ Float64⍰ │ Float64⍰ │ Float64 │ Float64⍰ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┼─────────┼──────────┼──────────┼──────────┤ +│ 1 │ missing │ -203.7 │ -84.1 │ 18.5 │ missing │ missing │ +│ 2 │ missing │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ missing │ +│ 3 │ missing │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ missing │ +│ 4 │ missing │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ missing │ +│ 5 │ missing │ missing │ -130.1 │ 25.8 │ 160.0 │ missing │ +│ 6 │ missing │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ missing │ +│ 7 │ missing │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ missing │ +⋮ +│ 462 │ missing │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ +│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ missing │ 
+│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ missing │ +│ 465 │ missing │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ missing │ +│ 466 │ missing │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ +│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ +│ 468 │ missing │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ +│ 469 │ missing │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ +``` -## Methods +Our first instinct might be to drop all observations, but this leaves us too few rows to work with: -* drop - remove missing -* locf - last observation carried forward -* nocb - next observation carried backward -* interp - linear interpolation of values in vector -* fill - replace with a specific value or a function which returns a value given the existing vector with missing values dropped. +```julia +julia> Impute.drop(df) +4×6 DataFrames.DataFrame +│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ +│ │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │ +├─────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤ +│ 1 │ -247.0 │ -132.2 │ -18.8 │ 28.2 │ 81.4 │ 237.9 │ +│ 2 │ -234.0 │ -140.8 │ -56.5 │ 28.0 │ 114.3 │ 222.9 │ +│ 3 │ -215.8 │ -114.8 │ -18.4 │ 65.3 │ 171.6 │ 249.7 │ +│ 4 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ +``` -## TODO +We could try imputing the values with linear interpolation, but that still leaves missing +data at the head and tail of our dataset: + +```julia +julia> Impute.interp(df) +469×6 DataFrames.DataFrame +│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ +│ │ Float64⍰ │ Float64⍰ │ Float64 │ Float64⍰ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┼─────────┼──────────┼──────────┼──────────┤ +│ 1 │ missing │ -203.7 │ -84.1 │ 18.5 │ missing │ missing │ +│ 2 │ missing │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ missing │ +│ 3 │ missing │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ missing │ +│ 4 │ missing │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ missing │ +│ 5 │ missing │ -227.3 │ -130.1 │ 25.8 │ 160.0 │ missing │ +│ 6 │ missing │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ missing │ +│ 7 │ missing │ 
-164.8 │ -12.2 │ 76.8 │ 202.8 │ missing │ +⋮ +│ 462 │ -241.025 │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ +│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ 224.125 │ +│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ 230.25 │ +│ 465 │ -239.8 │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ 236.375 │ +│ 466 │ -243.7 │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ +│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ +│ 468 │ missing │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ +│ 469 │ missing │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ +``` + +Finally, we can chain multiple simple methods together to give a complete dataset: + +```julia +julia> Impute.interp(df) |> Impute.locf() |> Impute.nocb() +469×6 DataFrames.DataFrame +│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ +│ │ Float64⍰ │ Float64⍰ │ Float64 │ Float64⍰ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┼─────────┼──────────┼──────────┼──────────┤ +│ 1 │ -233.6 │ -203.7 │ -84.1 │ 18.5 │ 134.7 │ 222.7 │ +│ 2 │ -233.6 │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ 222.7 │ +│ 3 │ -233.6 │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ 222.7 │ +│ 4 │ -233.6 │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ 222.7 │ +│ 5 │ -233.6 │ -227.3 │ -130.1 │ 25.8 │ 160.0 │ 222.7 │ +│ 6 │ -233.6 │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ 222.7 │ +│ 7 │ -233.6 │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ 222.7 │ +⋮ +│ 462 │ -241.025 │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ +│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ 224.125 │ +│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ 230.25 │ +│ 465 │ -239.8 │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ 236.375 │ +│ 466 │ -243.7 │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ +│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ +│ 468 │ -247.6 │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ +│ 469 │ -247.6 │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ +``` -* Dropping rows in a matrix allocates extra memory (ie: `data[mask, :]` make a copy). -* More sophisticated imputation methods - 1. MICE - 2. EM - 3. kNN - 4. 
Regression +**Warning**: Your approach should depend on the properties of your data (e.g., [MCAR, MAR, MNAR](https://en.wikipedia.org/wiki/Missing_data#Types_of_missing_data)). diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..df82f6a --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,9 @@ +[deps] +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Impute = "f7bf1975-0170-51b9-8c5f-a992d46b9575" +RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" + +[compat] +DataFrames = ">= 0.16" +Documenter = "~0.22" diff --git a/docs/make.jl b/docs/make.jl index 6c28e25..c634dcb 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,4 +1,4 @@ -using Documenter, Impute, RDatasets +using Documenter, Impute makedocs( modules=[Impute], diff --git a/docs/src/index.md b/docs/src/index.md index 3afda96..31a547f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,445 +1,55 @@ # Impute + +```@setup quickstart +using DataFrames, RDatasets, Impute +df = dataset("boot", "neuro") +``` + [![stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://invenia.github.io/Impute.jl/stable/) [![latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://invenia.github.io/Impute.jl/latest/) [![Build Status](https://travis-ci.org/invenia/Impute.jl.svg?branch=master)](https://travis-ci.org/invenia/Impute.jl) [![Build status](https://ci.appveyor.com/api/projects/status/github/invenia/Impute.jl?svg=true)](https://ci.appveyor.com/project/invenia/Impute-jl) [![codecov](https://codecov.io/gh/invenia/Impute.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/invenia/Impute.jl) -Impute.jl provides various data imputation methods for `Arrays` and `DataFrames` with various types of missing values. +Impute.jl provides various methods for handling missing data in Vectors, Matrices and [Tables](https://github.com/JuliaData/Tables.jl).
## Installation ```julia -Pkg.clone("https://github.com/invenia/Impute.jl") +julia> using Pkg; Pkg.add("Impute") ``` -## Features - -* Operating over Vectors, Matrices and DataFrames -* Chaining of methods - -## Methods - -* drop - remove missing -* locf - last observation carried forward -* nocb - next observation carried backward -* interp - linear interpolation of values in vector -* fill - replace with a specific value or a function which returns a value given the existing vector with missing values dropped. - ## Quickstart -We'll start by imputing `NaN` values in 1-dimension vector. -```julia -julia> using Impute - -julia> a = collect(1.0:1.0:20.0) -20-element Array{Float64,1}: - 1.0 - 2.0 - 3.0 - 4.0 - 5.0 - 6.0 - 7.0 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 - -julia> a[[2, 3, 7]] = NaN -NaN -``` +Let's start by loading our dependencies: -The most common approach to missing data is to remove them. -```julia -julia> impute(a, :drop; limit=0.2) -17-element Array{Float64,1}: - 1.0 - 4.0 - 5.0 - 6.0 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 +```@repl +using DataFrames, RDatasets, Impute ``` -But we may want use linear interpolation, filling, etc -```julia -julia> impute(a, :interp; limit=0.2) -20-element Array{Float64,1}: - 1.0 - 2.0 - 3.0 - 4.0 - 5.0 - 6.0 - 7.0 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 - -julia> impute(a, :fill; limit=0.2) -20-element Array{Float64,1}: - 1.0 - 11.6471 - 11.6471 - 4.0 - 5.0 - 6.0 - 11.6471 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 - -julia> impute(a, :locf; limit=0.2) -20-element Array{Float64,1}: - 1.0 - 1.0 - 1.0 - 4.0 - 5.0 - 6.0 - 6.0 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 +We'll also want some test data containing `missing`s to work with: -julia> impute(a, :nocb; limit=0.2) -20-element 
Array{Float64,1}: - 1.0 - 4.0 - 4.0 - 4.0 - 5.0 - 6.0 - 8.0 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 +```@repl quickstart +df = dataset("boot", "neuro") ``` -We can also perform these operations on `DataFrame`s. +Our first instinct might be to drop all observations, but this leaves us too few +rows to work with: -```julia -julia> using DataFrames +```@repl quickstart +Impute.drop(df) +``` -julia> using RDatasets +We could try imputing the values with linear interpolation, but that still leaves missing +data at the head and tail of our dataset: -julia> df = dataset("boot", "neuro") -2814 -Symbol[:V1,:V2,:V3,:V4,:V5,:V6] -6 -469×6 DataFrames.DataFrame -│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ -├─────┼────────┼────────┼────────┼───────┼───────┼───────┤ -│ 1 │ NA │ -203.7 │ -84.1 │ 18.5 │ NA │ NA │ -│ 2 │ NA │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ NA │ -│ 3 │ NA │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ NA │ -│ 4 │ NA │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ NA │ -│ 5 │ NA │ NA │ -130.1 │ 25.8 │ 160.0 │ NA │ -│ 6 │ NA │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ NA │ -│ 7 │ NA │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ NA │ -│ 8 │ NA │ -221.6 │ -81.9 │ 27.5 │ 144.5 │ NA │ -│ 9 │ NA │ -153.7 │ -17.0 │ 76.1 │ 222.4 │ NA │ -│ 10 │ NA │ -184.7 │ -47.3 │ 74.4 │ 208.9 │ NA │ -│ 11 │ NA │ NA │ -148.8 │ 11.4 │ 137.7 │ NA │ -│ 12 │ NA │ -197.6 │ -6.4 │ 137.1 │ NA │ NA │ -│ 13 │ NA │ -247.8 │ -35.4 │ 80.9 │ 229.5 │ NA │ -│ 14 │ NA │ -227.0 │ -104.7 │ 20.2 │ 140.2 │ NA │ -│ 15 │ -233.6 │ -115.9 │ -10.5 │ 70.0 │ 202.6 │ NA │ -│ 16 │ NA │ -232.4 │ -100.6 │ 16.8 │ 145.1 │ NA │ -│ 17 │ NA │ -199.4 │ -58.2 │ 29.1 │ 184.4 │ NA │ -│ 18 │ NA │ -195.7 │ -89.5 │ 26.4 │ 142.7 │ NA │ -│ 19 │ NA │ -180.1 │ -65.0 │ 27.3 │ 171.1 │ NA │ -│ 20 │ NA │ NA │ -85.2 │ 27.1 │ NA │ NA │ -│ 21 │ NA │ -217.3 │ -77.1 │ 27.6 │ 151.5 │ NA │ -│ 22 │ NA │ -139.7 │ -15.8 │ 83.0 │ 215.5 │ NA │ -│ 23 │ -249.6 │ -132.8 │ -14.1 │ 78.1 │ 205.7 │ NA │ -│ 24 │ NA │ -152.7 │ -36.9 │ 29.7 │ 149.8 │ NA │ -│ 25 │ NA │ 
-224.1 │ -81.9 │ 29.1 │ 172.2 │ NA │ -│ 26 │ NA │ NA │ -235.8 │ 6.0 │ 144.4 │ NA │ -│ 27 │ NA │ -202.8 │ -45.1 │ 84.0 │ 227.3 │ NA │ -│ 28 │ -240.9 │ -138.4 │ -21.5 │ 73.4 │ 210.6 │ NA │ -│ 29 │ -247.1 │ -128.2 │ -31.3 │ 29.2 │ 143.1 │ NA │ -│ 30 │ NA │ -185.4 │ -80.3 │ 23.9 │ 115.8 │ 222.7 │ -│ 31 │ NA │ -182.5 │ -75.8 │ 27.5 │ 165.2 │ NA │ -│ 32 │ NA │ -202.2 │ -99.1 │ 23.8 │ 136.3 │ 242.5 │ -│ 33 │ NA │ -193.3 │ -82.6 │ 26.3 │ 160.5 │ NA │ -│ 34 │ NA │ -189.4 │ -63.3 │ 27.6 │ 136.8 │ NA │ -│ 35 │ NA │ -149.0 │ -31.0 │ 73.5 │ 187.8 │ NA │ -│ 36 │ NA │ -162.4 │ -26.5 │ 72.6 │ NA │ NA │ -│ 37 │ NA │ -213.4 │ -107.2 │ 24.7 │ 158.5 │ NA │ -⋮ -│ 432 │ NA │ -156.2 │ -32.9 │ 63.3 │ 182.8 │ NA │ -│ 433 │ NA │ -220.6 │ -114.2 │ 9.7 │ 106.4 │ 227.9 │ -│ 434 │ -219.9 │ -120.9 │ -1.3 │ 99.5 │ 207.6 │ NA │ -│ 435 │ NA │ -240.5 │ -110.3 │ 26.1 │ 142.8 │ NA │ -│ 436 │ NA │ -239.6 │ -121.4 │ 2.9 │ 124.9 │ NA │ -│ 437 │ NA │ -139.8 │ -7.3 │ 121.0 │ NA │ NA │ -│ 438 │ NA │ -212.0 │ -66.2 │ 50.4 │ 178.2 │ NA │ -│ 439 │ NA │ -232.7 │ -109.2 │ 18.4 │ 127.5 │ NA │ -│ 440 │ NA │ -236.3 │ -115.1 │ 5.1 │ 109.0 │ 212.0 │ -│ 441 │ -241.2 │ -107.1 │ -9.1 │ 95.1 │ 198.6 │ NA │ -│ 442 │ -226.7 │ -143.8 │ -30.4 │ 75.8 │ 196.6 │ NA │ -│ 443 │ NA │ -131.8 │ -26.5 │ 64.7 │ 177.2 │ NA │ -│ 444 │ NA │ -144.9 │ -0.9 │ 105.3 │ 230.9 │ NA │ -│ 445 │ NA │ -214.0 │ -81.8 │ 66.1 │ 191.3 │ NA │ -│ 446 │ NA │ -210.6 │ -94.3 │ 16.7 │ 125.5 │ 239.7 │ -│ 447 │ -215.8 │ -114.8 │ -18.4 │ 65.3 │ 171.6 │ 249.7 │ -│ 448 │ NA │ -156.0 │ -14.0 │ 113.7 │ 249.3 │ NA │ -│ 449 │ NA │ -210.5 │ -41.9 │ NA │ NA │ NA │ -│ 450 │ NA │ -189.2 │ -72.0 │ 56.8 │ 133.8 │ 246.7 │ -│ 451 │ NA │ -214.2 │ -102.2 │ 5.5 │ 75.6 │ 154.3 │ -│ 452 │ -219.6 │ -107.9 │ -16.0 │ 101.7 │ 186.0 │ NA │ -│ 453 │ NA │ -153.0 │ -38.0 │ 61.3 │ 144.4 │ 245.9 │ -│ 454 │ NA │ -179.8 │ -63.4 │ 56.0 │ 157.5 │ NA │ -│ 455 │ NA │ -174.5 │ -44.8 │ 73.3 │ 179.7 │ NA │ -│ 456 │ NA │ -206.8 │ -108.9 │ 3.7 │ 102.1 │ 210.3 │ -│ 457 │ NA │ -169.5 │ -79.7 │ 27.9 │ 
129.4 │ 242.8 │ -│ 458 │ -222.2 │ -104.6 │ -2.4 │ 84.3 │ 204.7 │ NA │ -│ 459 │ -236.3 │ -124.0 │ -6.8 │ 95.7 │ 196.0 │ NA │ -│ 460 │ NA │ -216.5 │ -90.2 │ 27.8 │ 138.9 │ NA │ -│ 461 │ NA │ -163.2 │ -43.6 │ 69.5 │ 173.9 │ NA │ -│ 462 │ NA │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ -│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ NA │ -│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ NA │ -│ 465 │ NA │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ NA │ -│ 466 │ NA │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ -│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ -│ 468 │ NA │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ -│ 469 │ NA │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ +```@repl quickstart +Impute.interp(df) +``` -julia> drop(df) -4×6 DataFrames.DataFrame -│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ -├─────┼────────┼────────┼───────┼──────┼───────┼───────┤ -│ 1 │ -247.0 │ -132.2 │ -18.8 │ 28.2 │ 81.4 │ 237.9 │ -│ 2 │ -234.0 │ -140.8 │ -56.5 │ 28.0 │ 114.3 │ 222.9 │ -│ 3 │ -215.8 │ -114.8 │ -18.4 │ 65.3 │ 171.6 │ 249.7 │ -│ 4 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ +Finally, we can chain multiple simple methods together to give a complete dataset: -julia> interp(df) -469×6 DataFrames.DataFrame -│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ -├─────┼──────────┼─────────┼────────┼───────┼────────┼─────────┤ -│ 1 │ NA │ -203.7 │ -84.1 │ 18.5 │ NA │ NA │ -│ 2 │ NA │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ NA │ -│ 3 │ NA │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ NA │ -│ 4 │ NA │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ NA │ -│ 5 │ NA │ -227.3 │ -130.1 │ 25.8 │ 160.0 │ NA │ -│ 6 │ NA │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ NA │ -│ 7 │ NA │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ NA │ -│ 8 │ NA │ -221.6 │ -81.9 │ 27.5 │ 144.5 │ NA │ -│ 9 │ NA │ -153.7 │ -17.0 │ 76.1 │ 222.4 │ NA │ -│ 10 │ NA │ -184.7 │ -47.3 │ 74.4 │ 208.9 │ NA │ -│ 11 │ NA │ -191.15 │ -148.8 │ 11.4 │ 137.7 │ NA │ -│ 12 │ NA │ -197.6 │ -6.4 │ 137.1 │ 183.6 │ NA │ -│ 13 │ NA │ -247.8 │ -35.4 │ 80.9 │ 229.5 │ NA │ -│ 14 │ NA │ -227.0 │ -104.7 │ 20.2 │ 140.2 │ NA │ -│ 
15 │ -233.6 │ -115.9 │ -10.5 │ 70.0 │ 202.6 │ NA │ -│ 16 │ -235.6 │ -232.4 │ -100.6 │ 16.8 │ 145.1 │ NA │ -│ 17 │ -237.6 │ -199.4 │ -58.2 │ 29.1 │ 184.4 │ NA │ -│ 18 │ -239.6 │ -195.7 │ -89.5 │ 26.4 │ 142.7 │ NA │ -│ 19 │ -241.6 │ -180.1 │ -65.0 │ 27.3 │ 171.1 │ NA │ -│ 20 │ -243.6 │ -198.7 │ -85.2 │ 27.1 │ 161.3 │ NA │ -│ 21 │ -245.6 │ -217.3 │ -77.1 │ 27.6 │ 151.5 │ NA │ -│ 22 │ -247.6 │ -139.7 │ -15.8 │ 83.0 │ 215.5 │ NA │ -│ 23 │ -249.6 │ -132.8 │ -14.1 │ 78.1 │ 205.7 │ NA │ -│ 24 │ -247.86 │ -152.7 │ -36.9 │ 29.7 │ 149.8 │ NA │ -│ 25 │ -246.12 │ -224.1 │ -81.9 │ 29.1 │ 172.2 │ NA │ -│ 26 │ -244.38 │ -213.45 │ -235.8 │ 6.0 │ 144.4 │ NA │ -│ 27 │ -242.64 │ -202.8 │ -45.1 │ 84.0 │ 227.3 │ NA │ -│ 28 │ -240.9 │ -138.4 │ -21.5 │ 73.4 │ 210.6 │ NA │ -│ 29 │ -247.1 │ -128.2 │ -31.3 │ 29.2 │ 143.1 │ NA │ -│ 30 │ -247.093 │ -185.4 │ -80.3 │ 23.9 │ 115.8 │ 222.7 │ -│ 31 │ -247.086 │ -182.5 │ -75.8 │ 27.5 │ 165.2 │ 232.6 │ -│ 32 │ -247.079 │ -202.2 │ -99.1 │ 23.8 │ 136.3 │ 242.5 │ -│ 33 │ -247.071 │ -193.3 │ -82.6 │ 26.3 │ 160.5 │ 242.082 │ -│ 34 │ -247.064 │ -189.4 │ -63.3 │ 27.6 │ 136.8 │ 241.664 │ -│ 35 │ -247.057 │ -149.0 │ -31.0 │ 73.5 │ 187.8 │ 241.245 │ -│ 36 │ -247.05 │ -162.4 │ -26.5 │ 72.6 │ 173.15 │ 240.827 │ -│ 37 │ -247.043 │ -213.4 │ -107.2 │ 24.7 │ 158.5 │ 240.409 │ -⋮ -│ 432 │ -219.99 │ -156.2 │ -32.9 │ 63.3 │ 182.8 │ 232.0 │ -│ 433 │ -219.945 │ -220.6 │ -114.2 │ 9.7 │ 106.4 │ 227.9 │ -│ 434 │ -219.9 │ -120.9 │ -1.3 │ 99.5 │ 207.6 │ 225.629 │ -│ 435 │ -222.943 │ -240.5 │ -110.3 │ 26.1 │ 142.8 │ 223.357 │ -│ 436 │ -225.986 │ -239.6 │ -121.4 │ 2.9 │ 124.9 │ 221.086 │ -│ 437 │ -229.029 │ -139.8 │ -7.3 │ 121.0 │ 151.55 │ 218.814 │ -│ 438 │ -232.071 │ -212.0 │ -66.2 │ 50.4 │ 178.2 │ 216.543 │ -│ 439 │ -235.114 │ -232.7 │ -109.2 │ 18.4 │ 127.5 │ 214.271 │ -│ 440 │ -238.157 │ -236.3 │ -115.1 │ 5.1 │ 109.0 │ 212.0 │ -│ 441 │ -241.2 │ -107.1 │ -9.1 │ 95.1 │ 198.6 │ 216.617 │ -│ 442 │ -226.7 │ -143.8 │ -30.4 │ 75.8 │ 196.6 │ 221.233 │ -│ 443 │ -224.52 │ -131.8 │ 
-26.5 │ 64.7 │ 177.2 │ 225.85 │ -│ 444 │ -222.34 │ -144.9 │ -0.9 │ 105.3 │ 230.9 │ 230.467 │ -│ 445 │ -220.16 │ -214.0 │ -81.8 │ 66.1 │ 191.3 │ 235.083 │ -│ 446 │ -217.98 │ -210.6 │ -94.3 │ 16.7 │ 125.5 │ 239.7 │ -│ 447 │ -215.8 │ -114.8 │ -18.4 │ 65.3 │ 171.6 │ 249.7 │ -│ 448 │ -216.56 │ -156.0 │ -14.0 │ 113.7 │ 249.3 │ 248.7 │ -│ 449 │ -217.32 │ -210.5 │ -41.9 │ 85.25 │ 191.55 │ 247.7 │ -│ 450 │ -218.08 │ -189.2 │ -72.0 │ 56.8 │ 133.8 │ 246.7 │ -│ 451 │ -218.84 │ -214.2 │ -102.2 │ 5.5 │ 75.6 │ 154.3 │ -│ 452 │ -219.6 │ -107.9 │ -16.0 │ 101.7 │ 186.0 │ 200.1 │ -│ 453 │ -220.033 │ -153.0 │ -38.0 │ 61.3 │ 144.4 │ 245.9 │ -│ 454 │ -220.467 │ -179.8 │ -63.4 │ 56.0 │ 157.5 │ 234.033 │ -│ 455 │ -220.9 │ -174.5 │ -44.8 │ 73.3 │ 179.7 │ 222.167 │ -│ 456 │ -221.333 │ -206.8 │ -108.9 │ 3.7 │ 102.1 │ 210.3 │ -│ 457 │ -221.767 │ -169.5 │ -79.7 │ 27.9 │ 129.4 │ 242.8 │ -│ 458 │ -222.2 │ -104.6 │ -2.4 │ 84.3 │ 204.7 │ 237.84 │ -│ 459 │ -236.3 │ -124.0 │ -6.8 │ 95.7 │ 196.0 │ 232.88 │ -│ 460 │ -237.875 │ -216.5 │ -90.2 │ 27.8 │ 138.9 │ 227.92 │ -│ 461 │ -239.45 │ -163.2 │ -43.6 │ 69.5 │ 173.9 │ 222.96 │ -│ 462 │ -241.025 │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ -│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ 224.125 │ -│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ 230.25 │ -│ 465 │ -239.8 │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ 236.375 │ -│ 466 │ -243.7 │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ -│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ -│ 468 │ NA │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ -│ 469 │ NA │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ +```@repl quickstart +Impute.interp(df) |> Impute.locf() |> Impute.nocb() ``` -Finally, we can also chain imputation methods together. -As we saw in the last example linear interpolation can interpolate missing values -at the head or tail of the array (or column). 
-```julia -julia> chain(df, Impute.Interpolate(), Impute.LOCF(), Impute.NOCB(); limit=1.0) -469×6 DataFrames.DataFrame -│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ -├─────┼──────────┼─────────┼────────┼───────┼────────┼─────────┤ -│ 1 │ -233.6 │ -203.7 │ -84.1 │ 18.5 │ 134.7 │ 222.7 │ -│ 2 │ -233.6 │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ 222.7 │ -│ 3 │ -233.6 │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ 222.7 │ -│ 4 │ -233.6 │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ 222.7 │ -│ 5 │ -233.6 │ -227.3 │ -130.1 │ 25.8 │ 160.0 │ 222.7 │ -│ 6 │ -233.6 │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ 222.7 │ -│ 7 │ -233.6 │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ 222.7 │ -│ 8 │ -233.6 │ -221.6 │ -81.9 │ 27.5 │ 144.5 │ 222.7 │ -│ 9 │ -233.6 │ -153.7 │ -17.0 │ 76.1 │ 222.4 │ 222.7 │ -│ 10 │ -233.6 │ -184.7 │ -47.3 │ 74.4 │ 208.9 │ 222.7 │ -│ 11 │ -233.6 │ -191.15 │ -148.8 │ 11.4 │ 137.7 │ 222.7 │ -│ 12 │ -233.6 │ -197.6 │ -6.4 │ 137.1 │ 183.6 │ 222.7 │ -│ 13 │ -233.6 │ -247.8 │ -35.4 │ 80.9 │ 229.5 │ 222.7 │ -│ 14 │ -233.6 │ -227.0 │ -104.7 │ 20.2 │ 140.2 │ 222.7 │ -│ 15 │ -233.6 │ -115.9 │ -10.5 │ 70.0 │ 202.6 │ 222.7 │ -│ 16 │ -235.6 │ -232.4 │ -100.6 │ 16.8 │ 145.1 │ 222.7 │ -│ 17 │ -237.6 │ -199.4 │ -58.2 │ 29.1 │ 184.4 │ 222.7 │ -│ 18 │ -239.6 │ -195.7 │ -89.5 │ 26.4 │ 142.7 │ 222.7 │ -│ 19 │ -241.6 │ -180.1 │ -65.0 │ 27.3 │ 171.1 │ 222.7 │ -│ 20 │ -243.6 │ -198.7 │ -85.2 │ 27.1 │ 161.3 │ 222.7 │ -│ 21 │ -245.6 │ -217.3 │ -77.1 │ 27.6 │ 151.5 │ 222.7 │ -│ 22 │ -247.6 │ -139.7 │ -15.8 │ 83.0 │ 215.5 │ 222.7 │ -│ 23 │ -249.6 │ -132.8 │ -14.1 │ 78.1 │ 205.7 │ 222.7 │ -│ 24 │ -247.86 │ -152.7 │ -36.9 │ 29.7 │ 149.8 │ 222.7 │ -│ 25 │ -246.12 │ -224.1 │ -81.9 │ 29.1 │ 172.2 │ 222.7 │ -│ 26 │ -244.38 │ -213.45 │ -235.8 │ 6.0 │ 144.4 │ 222.7 │ -│ 27 │ -242.64 │ -202.8 │ -45.1 │ 84.0 │ 227.3 │ 222.7 │ -│ 28 │ -240.9 │ -138.4 │ -21.5 │ 73.4 │ 210.6 │ 222.7 │ -│ 29 │ -247.1 │ -128.2 │ -31.3 │ 29.2 │ 143.1 │ 222.7 │ -│ 30 │ -247.093 │ -185.4 │ -80.3 │ 23.9 │ 115.8 │ 222.7 │ -│ 31 │ -247.086 │ -182.5 │ -75.8 │ 27.5 │ 165.2 │ 232.6 │ -│ 
32 │ -247.079 │ -202.2 │ -99.1 │ 23.8 │ 136.3 │ 242.5 │ -│ 33 │ -247.071 │ -193.3 │ -82.6 │ 26.3 │ 160.5 │ 242.082 │ -│ 34 │ -247.064 │ -189.4 │ -63.3 │ 27.6 │ 136.8 │ 241.664 │ -│ 35 │ -247.057 │ -149.0 │ -31.0 │ 73.5 │ 187.8 │ 241.245 │ -│ 36 │ -247.05 │ -162.4 │ -26.5 │ 72.6 │ 173.15 │ 240.827 │ -│ 37 │ -247.043 │ -213.4 │ -107.2 │ 24.7 │ 158.5 │ 240.409 │ -⋮ -│ 432 │ -219.99 │ -156.2 │ -32.9 │ 63.3 │ 182.8 │ 232.0 │ -│ 433 │ -219.945 │ -220.6 │ -114.2 │ 9.7 │ 106.4 │ 227.9 │ -│ 434 │ -219.9 │ -120.9 │ -1.3 │ 99.5 │ 207.6 │ 225.629 │ -│ 435 │ -222.943 │ -240.5 │ -110.3 │ 26.1 │ 142.8 │ 223.357 │ -│ 436 │ -225.986 │ -239.6 │ -121.4 │ 2.9 │ 124.9 │ 221.086 │ -│ 437 │ -229.029 │ -139.8 │ -7.3 │ 121.0 │ 151.55 │ 218.814 │ -│ 438 │ -232.071 │ -212.0 │ -66.2 │ 50.4 │ 178.2 │ 216.543 │ -│ 439 │ -235.114 │ -232.7 │ -109.2 │ 18.4 │ 127.5 │ 214.271 │ -│ 440 │ -238.157 │ -236.3 │ -115.1 │ 5.1 │ 109.0 │ 212.0 │ -│ 441 │ -241.2 │ -107.1 │ -9.1 │ 95.1 │ 198.6 │ 216.617 │ -│ 442 │ -226.7 │ -143.8 │ -30.4 │ 75.8 │ 196.6 │ 221.233 │ -│ 443 │ -224.52 │ -131.8 │ -26.5 │ 64.7 │ 177.2 │ 225.85 │ -│ 444 │ -222.34 │ -144.9 │ -0.9 │ 105.3 │ 230.9 │ 230.467 │ -│ 445 │ -220.16 │ -214.0 │ -81.8 │ 66.1 │ 191.3 │ 235.083 │ -│ 446 │ -217.98 │ -210.6 │ -94.3 │ 16.7 │ 125.5 │ 239.7 │ -│ 447 │ -215.8 │ -114.8 │ -18.4 │ 65.3 │ 171.6 │ 249.7 │ -│ 448 │ -216.56 │ -156.0 │ -14.0 │ 113.7 │ 249.3 │ 248.7 │ -│ 449 │ -217.32 │ -210.5 │ -41.9 │ 85.25 │ 191.55 │ 247.7 │ -│ 450 │ -218.08 │ -189.2 │ -72.0 │ 56.8 │ 133.8 │ 246.7 │ -│ 451 │ -218.84 │ -214.2 │ -102.2 │ 5.5 │ 75.6 │ 154.3 │ -│ 452 │ -219.6 │ -107.9 │ -16.0 │ 101.7 │ 186.0 │ 200.1 │ -│ 453 │ -220.033 │ -153.0 │ -38.0 │ 61.3 │ 144.4 │ 245.9 │ -│ 454 │ -220.467 │ -179.8 │ -63.4 │ 56.0 │ 157.5 │ 234.033 │ -│ 455 │ -220.9 │ -174.5 │ -44.8 │ 73.3 │ 179.7 │ 222.167 │ -│ 456 │ -221.333 │ -206.8 │ -108.9 │ 3.7 │ 102.1 │ 210.3 │ -│ 457 │ -221.767 │ -169.5 │ -79.7 │ 27.9 │ 129.4 │ 242.8 │ -│ 458 │ -222.2 │ -104.6 │ -2.4 │ 84.3 │ 204.7 │ 237.84 │ -│ 459 │ 
-236.3 │ -124.0 │ -6.8 │ 95.7 │ 196.0 │ 232.88 │ -│ 460 │ -237.875 │ -216.5 │ -90.2 │ 27.8 │ 138.9 │ 227.92 │ -│ 461 │ -239.45 │ -163.2 │ -43.6 │ 69.5 │ 173.9 │ 222.96 │ -│ 462 │ -241.025 │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ -│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ 224.125 │ -│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ 230.25 │ -│ 465 │ -239.8 │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ 236.375 │ -│ 466 │ -243.7 │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ -│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ -│ 468 │ -247.6 │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ -│ 469 │ -247.6 │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ -``` +Warning: Your approach should depend on the properties of your data (e.g., [MCAR, MAR, MNAR](https://en.wikipedia.org/wiki/Missing_data#Types_of_missing_data)). diff --git a/src/Impute.jl b/src/Impute.jl index 96941a5..79c97e1 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -1,14 +1,31 @@ module Impute -using DataFrames +using IterTools using Statistics +using StatsBase +using Tables: Tables, materializer, istable -import DataFrames: DataFrameRow -import Base.Iterators +import Base.Iterators: drop export impute, impute!, chain, chain!, drop, drop!, interp, interp!, ImputeError -const Dataset = Union{AbstractArray, DataFrame} +function __init__() + sym = join(["chain", "chain!", "drop", "drop!", "interp", "interp!"], ", ", " and ") + + @warn( + """ + The following symbols will not be exported in future releases: $sym. + Please qualify your calls with `Impute.(...)` or explicitly import the symbol. + """ + ) + + @warn( + """ + The default limit for all impute functions will be 1.0 going forward. + If you depend on a specific threshold please pass in an appropriate `AbstractContext`.
+ """ + ) +end """ ImputeError{T} <: Exception @@ -27,119 +44,230 @@ Base.showerror(io::IO, err::ImputeError) = println(io, "ImputeError: $(err.msg)" include("context.jl") include("imputors.jl") -const global imputation_methods = Dict{Symbol, Type}( - :drop => Drop, - :interp => Interpolate, - :fill => Fill, - :locf => LOCF, - :nocb => NOCB, +const global imputation_methods = ( + drop = DropObs, + dropobs = DropObs, + dropvars = DropVars, + interp = Interpolate, + fill = Fill, + locf = LOCF, + nocb = NOCB, ) -""" - impute!(data::Dataset, method::Symbol=:interp, args...; limit::Float64=0.1) +include("deprecated.jl") -Looks up the `Imputor` type for the `method`, creates it and calls -`impute!(imputor::Imputor, data::Dataset, limit::Float64)` with it. +for (f, v) in pairs(imputation_methods) + typename = nameof(v) + f! = Symbol(f, :!) -# Arguments -* `data::Dataset`: the datset containing missing elements we should impute. -* `method::Symbol`: the imputation method to use - (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) -* `args::Any...`: any arguments you should pass to the `Imputor` constructor. -* `limit::Float64`: missing data ratio limit/threshold (default: 0.1) -""" -function impute!(data::Dataset, method::Symbol, args...; limit::Float64=0.1) - imputor_type = imputation_methods[method] - imputor = length(args) > 0 ? imputor_type(args...) : imputor_type() - return impute!(imputor, data, limit) + @eval begin + $f(data; kwargs...) = impute(data, $typename(; _extract_context_kwargs(kwargs...)...)) + $f!(data; kwargs...) = impute!(data, $typename(; _extract_context_kwargs(kwargs...)...)) + $f(; kwargs...) = data -> impute(data, $typename(; _extract_context_kwargs(kwargs...)...)) + $f!(; kwargs...) 
= data -> impute!(data, $typename(; _extract_context_kwargs(kwargs...)...)) + end end -""" - impute!(data::Dataset, missing::Function, method::Symbol=:interp, args...; limit::Float64=0.1) - -Creates the appropriate `Imputor` type and `Context` (using `missing` function) in order to call -`impute!(imputor::Imputor, ctx::Context, data::Dataset)` with them. - -# Arguments -* `data::Dataset`: the datset containing missing elements we should impute. -* `missing::Function`: the missing data function to use -* `method::Symbol`: the imputation method to use - (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) -* `args::Any...`: any arguments you should pass to the `Imputor` constructor. -* `limit::Float64`: missing data ratio limit/threshold (default: 0.1) -""" -function impute!(data::Dataset, missing::Function, method::Symbol, args...; limit::Float64=0.1) - imputor_type = imputation_methods[method] - imputor = length(args) > 0 ? imputor_type(args...) : imputor_type() - ctx = Context(*(size(data)...), 0, limit, missing) - return impute!(imputor, ctx, data) -end - -""" - impute(data::Dataset, args...; kwargs...) - -Copies the `data` before calling `impute!(new_data, args...; kwargs...)` -""" -function impute(data::Dataset, args...; kwargs...) - return impute!(deepcopy(data), args...; kwargs...) -end - -""" - chain!(data::Dataset, missing::Function, imputors::Imputor...; kwargs...) - -Creates a `Chain` with `imputors` and calls `impute!(imputor, missing, data; kwargs...)` -""" -function chain!(data::Dataset, missing::Function, imputors::Imputor...; kwargs...) - imputor = Chain(imputors...) - return impute!(imputor, missing, data; kwargs...) -end - -""" - chain!(data::Dataset, imputors::Imputor...; kwargs...) - -Creates a `Chain` with `imputors` and calls `impute!(imputor, data; kwargs...)` -""" -function chain!(data::Dataset, imputors::Imputor...; kwargs...) - imputor = Chain(imputors...) - return impute!(imputor, data; kwargs...) 
-end +@doc """ + Impute.dropobs(data; vardim=2, context=Context()) + +Removes missing observations from the `AbstractArray` or `Tables.table` provided. +See [DropObs](@ref) for details. + +# Example +``` +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.dropobs(df; vardim=1, context=Context(; limit=1.0)) +3×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64 │ Float64 │ +├─────┼─────────┼─────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 5.0 │ 5.5 │ +``` +""" dropobs + +@doc """ + Impute.dropvars(data; vardim=2, context=Context()) + +Finds variables with too many missing values in an `AbstractMatrix` or `Tables.table` and +removes them from the input data. See [DropVars](@ref) for details. + +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.dropvars(df; vardim=1, context=Context(; limit=0.2)) +5×1 DataFrames.DataFrame +│ Row │ b │ +│ │ Float64⍰ │ +├─────┼──────────┤ +│ 1 │ 1.1 │ +│ 2 │ 2.2 │ +│ 3 │ 3.3 │ +│ 4 │ missing │ +│ 5 │ 5.5 │ +``` +""" dropvars + +@doc """ + Impute.interp(data; vardim=2, context=Context()) + +Performs linear interpolation between the nearest values in a vector. +See [Interpolate](@ref) for details.
+ +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.interp(df; vardim=1, context=Context(; limit=1.0)) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 3.0 │ 3.3 │ +│ 4 │ 4.0 │ 4.4 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" interp + +@doc """ + Impute.fill(data; value=mean, vardim=2, context=Context()) + +Fills in the missing data with a specific value. See [Fill](@ref) for details. + +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.fill(df; value=-1.0, vardim=1, context=Context(; limit=1.0)) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ -1.0 │ 3.3 │ +│ 4 │ -1.0 │ -1.0 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" fill + +@doc """ + Impute.locf(data; vardim=2, context=Context()) + +Iterates forwards through the `data` and fills missing data with the last existing +observation. See [LOCF](@ref) for details. 
+ +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.locf(df; vardim=1, context=Context(; limit=1.0)) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 2.0 │ 3.3 │ +│ 4 │ 2.0 │ 3.3 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" locf + +@doc """ + Impute.nocb(data; vardim=2, context=Context()) + +Iterates backwards through the `data` and fills missing data with the next existing +observation. See [NOCB](@ref) for details. + +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.nocb(df; vardim=1, context=Context(; limit=1.0)) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 5.0 │ 3.3 │ +│ 4 │ 5.0 │ 5.5 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" nocb -""" - chain(data::Dataset, args...; kwargs...) - -Copies the `data` before calling `chain!(data, args...; kwargs...)` -""" -function chain(data::Dataset, args...; kwargs...) - result = deepcopy(data) - return chain!(data, args...; kwargs...) 
-end - -""" - drop!(data::Dataset; limit=1.0) - -Utility method for `impute!(data, :drop; limit=limit)` -""" -drop!(data::Dataset; limit=1.0) = impute!(data, :drop; limit=limit) - -""" - drop(data::Dataset; limit=1.0) - -Utility method for `impute(data, :drop; limit=limit)` -""" -Iterators.drop(data::Dataset; limit=1.0) = impute(data, :drop; limit=limit) - -""" - interp!(data::Dataset; limit=1.0) - -Utility method for `impute!(data, :interp; limit=limit)` -""" -interp!(data::Dataset; limit=1.0) = impute!(data, :interp; limit=limit) - -""" - interp(data::Dataset; limit=1.0) - -Utility method for `impute(data, :interp; limit=limit)` -""" -interp(data::Dataset; limit=1.0) = impute(data, :interp; limit=limit) end # module diff --git a/src/context.jl b/src/context.jl index 201965b..306acdd 100644 --- a/src/context.jl +++ b/src/context.jl @@ -1,30 +1,30 @@ """ - Context + AbstractContext -Stores common summary information for all Imputor types. +An imputation context records summary information about missing data for an imputation algorithm. +All `AbstractContext`s are callable with a function, which allows us to write code like: -# Fields -* `num::Int`: number of observations -* `count::Int`: number of missing values found -* `limit::Float64`: allowable limit for missing values to impute -* `missing::Function`: returns a Bool if the value counts as missing. -""" -mutable struct Context - num::Int - count::Int - limit::Float64 - missing::Function +```julia +context() do c + # My imputation code using a clean context end +``` -Context(limit::Float64, missing::Function=ismissing) = Context(0, 0, limit, missing) +This do-block will pass a fresh context to your code and apply the `on_complete` function on +the resulting data and context state. By default, `on_complete` will throw an +[ImputeError](@ref) if we have too many missing values. 
+""" +abstract type AbstractContext end -Base.copy(x::Context) = Context(x.num, x.count, x.limit, x.missing) +# We implement a version of copy for all contexts which reconstructs the context from the +# raw fields. +Base.copy(ctx::T) where {T <: AbstractContext} = T(fieldvalues(ctx)...) """ - ismissing(ctx::Context, x) -> Bool + ismissing(ctx::AbstractContext, x) -> Bool -Uses `ctx.missing` to determine if x is missing. If x is a data row or an abstract array -then `ismissing` will return true if `ctx.missing` returns true for any element. +Uses `ctx.is_missing` to determine if x is missing. If x is a `NamedTuple` or an `AbstractArray` +then `ismissing` will return true if `ctx.is_missing` returns true for any element. The ctx.count is increased whenever whenever we return true and if `ctx.count / ctx.num` exceeds our `ctx.limit` we throw an `ImputeError` @@ -32,74 +32,199 @@ exceeds our `ctx.limit` we throw an `ImputeError` * `ctx::Context`: the contextual information about missing information. * `x`: the value to check (may be an single values, abstract array or row) """ -function Base.ismissing(ctx::Context, x) - missing = if isa(x, DataFrameRow) - any(entry -> ctx.missing(entry[2]), pairs(x)) +function Base.ismissing(ctx::AbstractContext, x) + was_missing = if isa(x, NamedTuple) + any(ctx.is_missing, Tuple(x)) elseif isa(x, AbstractArray) - any(ctx.missing, x) + any(ctx.is_missing, x) else - ctx.missing(x) + ctx.is_missing(x) end - if missing - ctx.count += 1 - - if (ctx.count / ctx.num) > ctx.limit - throw(ImputeError( - "More than $(ctx.limit * 100)% of values were missing ()." - )) - end + missing_update!(ctx, was_missing) - return true - else - return false - end + return was_missing end """ - findfirst(ctx::Context, data::AbstractVector) -> Int + findfirst(ctx::AbstractContext, data::AbstractVector) -> Int -Returns the first not missing index in `data`. +Returns the first non-missing index in `data`. 
# Arguments -* `ctx::Context`: the context to pass into `ismissing` +* `ctx::AbstractContext`: the context to pass into `ismissing` * `data::AbstractVector`: the data array to search # Returns * `Int`: the first index in `data` that isn't missing """ -function Base.findfirst(ctx::Context, data::AbstractVector) +function Base.findfirst(ctx::AbstractContext, data::AbstractVector) return findfirst(x -> !ismissing(ctx, x), data) end """ - findlast(ctx::Context, data::AbstractVector) -> Int + findlast(ctx::AbstractContext, data::AbstractVector) -> Int -Returns the last not missing index in `data`. +Returns the last non-missing index in `data`. # Arguments -* `ctx::Context`: the context to pass into `ismissing` +* `ctx::AbstractContext`: the context to pass into `ismissing` * `data::AbstractVector`: the data array to search # Returns * `Int`: the last index in `data` that isn't missing """ -function Base.findlast(ctx::Context, data::AbstractVector) +function Base.findlast(ctx::AbstractContext, data::AbstractVector) return findlast(x -> !ismissing(ctx, x), data) end """ - findnext(ctx::Context, data::AbstractVector) -> Int + findnext(ctx::AbstractContext, data::AbstractVector) -> Int -Returns the next not missing index in `data`. +Returns the next non-missing index in `data`. 
# Arguments -* `ctx::Context`: the context to pass into `ismissing` +* `ctx::AbstractContext`: the context to pass into `ismissing` * `data::AbstractVector`: the data array to search # Returns * `Int`: the next index in `data` that isn't missing """ -function Base.findnext(ctx::Context, data::AbstractVector, idx::Int) +function Base.findnext(ctx::AbstractContext, data::AbstractVector, idx::Int) return findnext(x -> !ismissing(ctx, x), data, idx) end + +mutable struct Context <: AbstractContext + num::Int + count::Int + limit::Float64 + is_missing::Function + on_complete::Function +end + +""" + Context + +Records base information about the missing data and assume all observations are equally +weighted. + +# Keyword Arguments +* `n::Int`: number of observations +* `count::Int`: number of missing values found +* `limit::Float64`: portion of total values allowed to be imputed (should be between 0.0 and 1.0). +* `is_missing::Function`: must return a Bool indicating if the value counts as missing +* `on_complete::Function`: a function to run when imputation is complete +""" +function Context(; + limit::Float64=0.1, + is_missing::Function=ismissing, + on_complete::Function=complete +) + return Context(0, 0, limit, is_missing, on_complete) +end + +function Base.empty(ctx::Context) + _ctx = copy(ctx) + _ctx.num = 0 + _ctx.count = 0 + + return _ctx +end + +function missing_update!(ctx::Context, was_missing) + ctx.num += 1 + + if was_missing + ctx.count += 1 + end +end + +function complete(ctx::Context, data) + missing_ratio = ctx.count / ctx.num + + if missing_ratio > ctx.limit + throw(ImputeError( + "More than $(ctx.limit * 100)% of values were missing ($missing_ratio)." 
+ )) + end + + return data +end + + +mutable struct WeightedContext <: AbstractContext + num::Int + s::Float64 + limit::Float64 + is_missing::Function + on_complete::Function + wv::AbstractWeights +end + +""" + WeightedContext(wv; limit=1.0, is_missing=ismissing, on_complete=complete) + +Records information about the missing data relative to a set of weights. +This context type can be useful if some missing observation are more important than others +(e.g., more recent observations in time series datasets) + +# Arguments +* `wv::AbstractWeights`: a set of statistical weights to use when evaluating the importance + of each observation. Will be accumulated during imputation. + +# Keyword Arguments +* `num::Int`: number of observations +* `s::Float64`: sum of the weights of missing values +* `limit::Float64`: portion of total values allowed to be imputed (should be between 0.0 and 1.0). +* `is_missing::Function`: returns a Bool if the value counts as missing +* `on_complete::Function`: a function to run when imputation is complete +""" +function WeightedContext( + wv::AbstractWeights; + limit::Real=1.0, + is_missing::Function=ismissing, + on_complete::Function=complete +) + return WeightedContext(0, 0.0, limit, is_missing, on_complete, wv) +end + +function Base.empty(ctx::WeightedContext) + _ctx = copy(ctx) + _ctx.num = 0 + _ctx.s = 0.0 + + return _ctx +end + +function missing_update!(ctx::WeightedContext, was_missing) + ctx.num += 1 + + if was_missing + ctx.s += ctx.wv[ctx.num] + end +end + +function complete(ctx::WeightedContext, data) + missing_ratio = ctx.s / sum(ctx.wv) + + if missing_ratio > ctx.limit + throw(ImputeError( + "More than $(ctx.limit * 100)% of weighted values were missing ($missing_ratio)." + )) + end + + return data +end + +#= +Define our callable methods for each context. Once we drop 1.0 we should be able to just +define this on the `AbstractContext`. 
+=# +for T in (Context, WeightedContext) + @eval begin + function (ctx::$T)(f::Function) + _ctx = empty(ctx) + return ctx.on_complete(_ctx, f(_ctx)) + end + end +end diff --git a/src/deprecated.jl b/src/deprecated.jl new file mode 100644 index 0000000..105c3b4 --- /dev/null +++ b/src/deprecated.jl @@ -0,0 +1,186 @@ +############################################################################### +# Deprecations for calling impute on an Imputor with a custom AbstractContext # +############################################################################### +Base.@deprecate( + impute(imp::Imputor, context::AbstractContext, data; kwargs...), + impute(data, typeof(imp)(; context=context, kwargs...)) +) + +Base.@deprecate( + impute!(imp::Imputor, context::AbstractContext, data; kwargs...), + impute!(data, typeof(imp)(; context=context, kwargs...)) +) + +Base.@deprecate impute(imp::Imputor, data) impute(data, imp) +Base.@deprecate impute!(imp::Imputor, data) impute!(data, imp) + +##################################################################### +# Deprecate all impute calls where the first argument is an Imputor # +##################################################################### +""" + impute!(data, method::Symbol=:interp, args...; limit::Float64=0.1) + +Looks up the `Imputor` type for the `method`, creates it and calls +`impute!(data, imputor::Imputor)` with it. + +# Arguments +* `data`: the dataset containing missing elements we should impute. +* `method::Symbol`: the imputation method to use + (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) +* `args::Any...`: any arguments you should pass to the `Imputor` constructor. +* `limit::Float64`: missing data ratio limit/threshold (default: 0.1) +""" +function impute!(data, method::Symbol, args...; limit::Float64=0.1) + Base.depwarn( + """ + impute!(data, method) is deprecated. + Please use Impute.method!(data) or impute!(data, imputor::Imputor). + """, + :impute! 
+ ) + imputor_type = imputation_methods[method] + imputor = if length(args) > 0 + imputor_type(args...; context=Context(; limit=limit)) + else + imputor_type(; context=Context(; limit=limit)) + end + + return impute!(data, imputor) +end + +""" + impute!(data, missing::Function, method::Symbol=:interp, args...; limit::Float64=0.1) + +Creates the appropriate `Imputor` type and `Context` (using `missing` function) in order to call +`impute!(data, imputor::Imputor)` with them. + +# Arguments +* `data`: the dataset containing missing elements we should impute. +* `missing::Function`: the missing data function to use +* `method::Symbol`: the imputation method to use + (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) +* `args::Any...`: any arguments you should pass to the `Imputor` constructor. +* `limit::Float64`: missing data ratio limit/threshold (default: 0.1) +""" +function impute!(data, missing::Function, method::Symbol, args...; limit::Float64=0.1) + Base.depwarn( + """ + impute!(data, missing, method) is deprecated. Please use impute!(data, imputor::Imputor). + """, + :impute! + ) + imputor_type = imputation_methods[method] + imputor = if length(args) > 0 + imputor_type(args...; context=Context(; is_missing=missing, limit=limit)) + else + imputor_type(; context=Context(; is_missing=missing, limit=limit)) + end + + return impute!(data, imputor) +end + +""" + impute(data, args...; kwargs...) + +Copies the `data` before calling `impute!(new_data, args...; kwargs...)` +""" +function impute(data, args...; kwargs...) + Base.depwarn( + """ + impute(data, args...; kwargs...) is deprecated. + Please use Impute.method(data) or impute(data, imputor::Imputor). + """, + :impute + ) + # Call `deepcopy` because we can trust that it's available for all types. + return impute!(deepcopy(data), args...; kwargs...) 
+end + +################################# +# Deprecate the chain functions # +################################# +""" + chain!(data, missing::Function, imputors::Imputor...; kwargs...) + +Creates a `Chain` with `imputors` and calls `impute!(imputor, missing, data; kwargs...)` +""" +function chain!(data, missing::Function, imputors::Imputor...; kwargs...) + Base.depwarn( + """ + chain!(data, missing, imputors...) is deprecated. + Please use data = imp1(data) |> imp2 |> imp3 + """, + :chain! + ) + return chain!(data, imputors...; is_missing=missing, kwargs...) +end + +""" + chain!(data, imputors::Imputor...; kwargs...) + +Creates a `Chain` with `imputors` and calls `impute!(data, imputor)` +""" +function chain!(data, imputors::Imputor...; kwargs...) + Base.depwarn( + """ + chain!(data, imputors...) is deprecated. + Please use data = imp1(data) |> imp2 |> imp3 + """, + :chain! + ) + ctx = Context(; kwargs...) + + for imputor in imputors + imp = typeof(imputor)( + (isa(x, AbstractContext) ? ctx : x for x in fieldvalues(imputor))... + ) + data = impute!(data, imp) + end + + return data +end + +""" + chain(data, args...; kwargs...) + +Copies the `data` before calling `chain!(data, args...; kwargs...)` +""" +function chain(data, args...; kwargs...) + Base.depwarn( + """ + chain(data, args...) is deprecated. + Please use result = imp1(data) |> imp2 |> imp3 + """, + :chain + ) + # Call `deepcopy` because we can trust that it's available for all types. + return chain!(deepcopy(data), args...; kwargs...) +end + +##################### +# Misc Deprecations # +##################### +Base.@deprecate Fill(val; kwargs...) Fill(; value=val, kwargs...) +Base.@deprecate_binding Drop DropObs false + +# This function is just used to support legacy behaviour and should be removed in a +# future release when we dropping accepting the limit kwarg to impute functions. +function _extract_context_kwargs(kwargs...) + d = Dict{Symbol, Any}(kwargs...) 
+ limit = 1.0 + + if haskey(d, :limit) + limit = d[:limit] + @warn( + "Passing `limit` directly to impute functions is deprecated. " * + "Please pass `context=Context(; limit=$limit)` in the future." + ) + delete!(d, :limit) + end + + if !haskey(d, :context) + d[:context] = Context(; limit=limit) + end + + return d +end diff --git a/src/imputors.jl b/src/imputors.jl index 9837750..4d4d471 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -1,77 +1,129 @@ """ Imputor -An imputor stores information about imputing values in `AbstractArray`s and `DataFrame`s. +An imputor stores information about imputing values in `AbstractArray`s and `Tables.table`s. New imputation methods are expected to sutype `Imputor` and, at minimum, -implement the `impute!{T<:Any}(imp::, ctx::Context, data::AbstractArray{T, 1})` -method. +implement the `impute!(imp::, data::AbstractVector)` method. """ - abstract type Imputor end -""" - impute!(imp::Imputor, data::Dataset, limit::Float64=0.1) +# A couple utility methods to avoid messing up var and obs dimensions +obsdim(imp::Imputor) = imp.vardim == 1 ? 2 : 1 +vardim(imp::Imputor) = imp.vardim -Creates a `Context` using information about `data`. These include +function obswise(imp::Imputor, data::AbstractMatrix) + return (selectdim(data, obsdim(imp), i) for i in axes(data, obsdim(imp))) +end -1. missing data function which defaults to `missing` +function varwise(imp::Imputor, data::AbstractMatrix) + return (selectdim(data, vardim(imp), i) for i in axes(data, vardim(imp))) +end -2. number of elements: `*(size(data)...)` +function filterobs(f::Function, imp::Imputor, data::AbstractMatrix) + mask = [f(x) for x in obswise(imp, data)] + return imp.vardim == 1 ? 
data[:, mask] : data[mask, :] +end -# Arguments -* `imp::Imputor`: the Imputor method to use -* `data::Dataset`: the data to impute -* `limit::Float64: missing data ratio limit/threshold (default: 0.1)` +function filtervars(f::Function, imp::Imputor, data::AbstractMatrix) + mask = [f(x) for x in varwise(imp, data)] + return imp.vardim == 1 ? data[mask, :] : data[:, mask] +end + +""" + impute(data, imp::Imputor) -# Return -* `Dataset`: the input `data` with values imputed. +Returns a new copy of the `data` with the missing data imputed by the imputor `imp`. """ -function impute!(imp::Imputor, data::Dataset, limit::Float64=0.1) - ctx = Context(*(size(data)...), 0, limit, ismissing) - return impute!(imp, ctx, data) +function impute(data, imp::Imputor) + # Call `deepcopy` because we can trust that it's available for all types. + return impute!(deepcopy(data), imp) end """ - impute!(imp::Imputor, ctx::Context, data::AbstractMatrix) + impute!(data::AbstractMatrix, imp::Imputor) -Imputes the data in a matrix by imputing the values 1 column at a time; +Impute the data in a matrix by imputing the values one variable at a time; if this is not the desired behaviour custom imputor methods should overload this method. 
# Arguments -* `imp::Imputor`: the Imputor method to use -* `ctx::Context`: the contextual information for missing data * `data::AbstractMatrix`: the data to impute +* `imp::Imputor`: the Imputor method to use # Returns * `AbstractMatrix`: the input `data` with values imputed + +# Example +```jldoctest +julia> using Impute: Interpolate, Context, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, Interpolate(; vardim=1, context=Context(; limit=1.0))) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 3.0 4.0 5.0 + 1.1 2.2 3.3 4.4 5.5 +``` """ -function impute!(imp::Imputor, ctx::Context, data::AbstractMatrix) - for i in 1:size(data, 2) - impute!(imp, ctx, view(data, :, i)) +function impute!(data::AbstractMatrix, imp::Imputor) + for var in varwise(imp, data) + impute!(var, imp) end return data end """ - impute!(imp::Imputor, ctx::Context, data::DataFrame) + impute!(table, imp::Imputor) -Imputes the data in a DataFrame by imputing the values 1 column at a time; +Imputes the data in a table by imputing the values 1 column at a time; if this is not the desired behaviour custom imputor methods should overload this method. 
# Arguments * `imp::Imputor`: the Imputor method to use -* `ctx::Context`: the contextual information for missing data -* `data::DataFrame`: the data to impute +* `table`: the data to impute # Returns -* `DataFrame`: the input `data` with values imputed +* the input `data` with values imputed + +# Example +```jldoctest +julia> using DataFrames; using Impute: Interpolate, Context, impute +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> impute(df, Interpolate(; vardim=1, context=Context(; limit=1.0))) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 3.0 │ 3.3 │ +│ 4 │ 4.0 │ 4.4 │ +│ 5 │ 5.0 │ 5.5 │ +``` """ -function impute!(imp::Imputor, ctx::Context, data::DataFrame) - colwise(data) do c - impute!(imp, ctx, c) +function impute!(table, imp::Imputor) + istable(table) || throw(MethodError(impute!, (table, imp))) + + # Extract a columns iterator that we should be able to use to mutate the data. + # NOTE: Mutation is not guaranteed for all table types, but it avoids copying the data + columntable = Tables.columns(table) + + for cname in propertynames(columntable) + impute!(getproperty(columntable, cname), imp) end - return data + return table end diff --git a/src/imputors/chain.jl b/src/imputors/chain.jl index c416b6b..25c458d 100644 --- a/src/imputors/chain.jl +++ b/src/imputors/chain.jl @@ -18,44 +18,21 @@ Creates a Chain using the `Imputor`s provided (ordering matters). Chain(imputors::Imputor...) = Chain(collect(imputors)) """ - impute!(imp::Chain, missing::Function, data::Dataset; limit::Float64=0.1) + impute!(data, imp::Chain) -Creates a `Context` and runs the `Imputor`s on the supplied data. 
+Runs the `Imputor`s on the supplied data. # Arguments * `imp::Chain`: the chain to run -* `missing::Function`: the missing function to use in the `Context` to pass to the `Imputor`s -* `data::Dataset`: our data to impute -* `limit::Float64`: the missing data ration limit/threshold +* `data`: our data to impute # Returns -* `Dataset`: our imputed data +* our imputed data """ -function impute!(imp::Chain, missing::Function, data::Dataset; limit::Float64=0.1) - ctx = Context(*(size(data)...), 0, limit, missing) - +function impute!(data, imp::Chain) for imputor in imp.imputors - impute!(imputor, copy(ctx), data) + data = impute!(data, imputor) end return data end - -""" - impute!(imp::Chain, data::Dataset; limit::Float64=0.1) - - -Infers the missing data function from the `data` and passes that to -`impute!(imp::Chain, missing::Function, data::Dataset; limit::Float64=0.1)`. - -# Arguments -* `imp::Chain`: the chain to run -* `data::Dataset`: our data to impute -* `limit::Float64`: the missing data ration limit/threshold - -# Returns -* `Dataset`: our imputed data -""" -function impute!(imp::Chain, data::Dataset; limit::Float64=0.1) - impute!(imp, ismissing, data; limit=limit) -end diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index 0c6c0cb..ad396e4 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -1,73 +1,145 @@ -""" - Drop <: Imputor +struct DropObs <: Imputor + vardim::Int + context::AbstractContext +end -Removes missing values from the `AbstractArray` or `DataFrame` provided. """ -struct Drop <: Imputor end + DropObs(; vardim=2, context=Context()) -""" - impute!(imp::Drop, ctx::Context, data::AbstractVector) +Removes missing observations from the `AbstractArray` or `Tables.table` provided. -Uses `filter!` to remove missing elements from the array. 
+# Keyword Arguments +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext=Context()`: A context which keeps track of missing data + summary information -# Arguments -* `imp::Drop`: this `Imputor` method -* `ctx::Context`: contextual information for missing data -* `data::AbstractVector`: the data to impute +# Example +```jldoctest +julia> using Impute: DropObs, Context, impute -# Returns -* `AbstractVector`: our data array with missing elements removed +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, DropObs(; vardim=1, context=Context(; limit=1.0))) +2×3 Array{Union{Missing, Float64},2}: + 1.0 2.0 5.0 + 1.1 2.2 5.5 +``` """ -function impute!(imp::Drop, ctx::Context, data::AbstractVector) - return filter!(x -> !ismissing(ctx, x), data) +DropObs(; vardim=2, context=Context()) = DropObs(vardim, context) + +function impute!(data::AbstractVector, imp::DropObs) + imp.context() do c + filter!(x -> !ismissing(c, x), data) + end end -""" - impute!(imp::Drop, ctx::Context, data::AbstractMatrix) +function impute!(data::AbstractMatrix, imp::DropObs) + imp.context() do c + return filterobs(imp, data) do obs + !ismissing(c, obs) + end + end +end -Finds the missing rows in the matrix and uses a mask (Vector{Bool}) to return the -`data` with those rows removed. Unfortunately, the mask approach requires copying the matrix. +# Deleting elements from subarrays doesn't work so we need to collect that data into +# a separate array. +impute!(data::SubArray, imp::DropObs) = impute!(collect(data), imp::DropObs) -NOTES (or premature optimizations): -* We use `view`, but this will change the type of the `data` by returning a `SubArray` -* We might be able to do something clever by: - 1. reshaping the data to a vector - 2. running `deleteat!` for the appropriate indices and - 3. 
reshaping the data back to the desired shape. +function impute!(table, imp::DropObs) + imp.context() do c + @assert istable(table) + rows = Tables.rows(table) -# Arguments -* `imp::Drop`: this `Imputor` method -* `ctx::Context`: contextual information for missing data -* `data::AbstractMatrix`: the data to impute + # Unfortunately, we'll need to construct a new table + # since Tables.rows is just an iterator + filtered = Iterators.filter(rows) do r + !any(x -> ismissing(c, x), propertyvalues(r)) + end -# Returns -* `AbstractMatrix`: a new matrix with missing rows removed -""" -function impute!(imp::Drop, ctx::Context, data::AbstractMatrix) - ctx.num = size(data, 1) - mask = map(i -> !ismissing(ctx, data[i, :]), 1:size(data, 1)) - return data[mask, :] + table = materializer(table)(filtered) + return table + end +end + + +struct DropVars <: Imputor + vardim::Int + context::AbstractContext end """ - impute!(imp::Drop, ctx::Context, data::DataFrame) + DropVars(; vardim=2, context=Context()) + -Finds the missing rows in the `DataFrame` and deletes them. +Finds variables with too many missing values in a `AbstractMatrix` or `Tables.table` and +removes them from the input data. -NOTE: this isn't quite as fast as `dropnull` in `DataFrames`s as we're using an arbitrary -`missing` function rather than using the precomputed `dt.isnull` vector of bools. +# Keyword Arguments +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext`: A context which keeps track of missing data + summary information -# Arguments -* `imp::Drop`: this `Imputor` method -* `ctx::Context`: contextual information for missing data -* `data::DataFrame`: the data to impute +# Examples +```jldoctest +julia> using Impute: DropVars, Context, impute -# Returns -* `DataFrame`: our data with the missing rows removed. 
+julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, DropVars(; vardim=1, context=Context(; limit=0.2))) +1×5 Array{Union{Missing, Float64},2}: + 1.1 2.2 3.3 missing 5.5 +``` """ -function impute!(imp::Drop, ctx::Context, data::DataFrame) - ctx.num = size(data, 1) - m = typeof(data).name.module - m.deleterows!(data, findall(r -> ismissing(ctx, r), m.eachrow(data))) - return data +DropVars(; vardim=2, context=Context()) = DropVars(vardim, context) + +function impute!(data::AbstractMatrix, imp::DropVars) + return filtervars(imp, data) do var + try + imp.context() do c + for x in var + ismissing(c, x) + end + end + return true + catch e + if isa(e, ImputeError) + return false + else + rethrow(e) + end + end + end +end + +function impute!(table, imp::DropVars) + istable(table) || throw(MethodError(impute!, (table, imp))) + cols = Tables.columns(table) + + cnames = Iterators.filter(propertynames(cols)) do cname + try + imp.context() do c + col = getproperty(cols, cname) + for i in eachindex(col) + ismissing(c, col[i]) + end + end + return true + catch e + if isa(e, ImputeError) + return false + else + rethrow(e) + end + end + end + + selected = Tables.select(table, cnames...) + table = materializer(table)(selected) + return table end diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl index c7deec9..1aad47b 100644 --- a/src/imputors/fill.jl +++ b/src/imputors/fill.jl @@ -1,41 +1,55 @@ -""" - Fill <: Imputor - -Fills in the missing data with a specific value. - -# Fields -* `value::Any`: A scalar missing value or a function that returns the a scalar if - passed the data with missing data removed (e.g, `mean`) -""" struct Fill{T} <: Imputor value::T + vardim::Int + context::AbstractContext end """ - Fill() -> Fill - -By default `Fill()` will use the mean of the existing values as the fill value. 
-""" -Fill() = Fill(mean) + Fill(; value=mean, vardim=2, context=Context()) +Fills in the missing data with a specific value. +The current implementation is univariate, so each variable in a table or matrix will +be handled independently. + +# Keyword Arguments +* `value::Any`: A scalar or a function that returns a scalar if + passed the data with missing data removed (e.g, `mean`) +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext`: A context which keeps track of missing data + summary information + +# Example +```jldoctest +julia> using Impute: Fill, Context, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, Fill(; vardim=1, context=Context(; limit=1.0))) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 2.66667 2.66667 5.0 + 1.1 2.2 3.3 3.025 5.5 +``` """ - impute!(imp::Fill, ctx::Context, data::AbstractVector) - -Computes the fill value if `imp.value` is a `Function` (i.e., `imp.value(drop(copy(data)))`) -and replaces all missing values in the `data` with that value. -""" -function impute!(imp::Fill, ctx::Context, data::AbstractVector) - fill_val = if isa(imp.value, Function) - imp.value(Iterators.drop(copy(data))) - else - imp.value - end +Fill(; value=mean, vardim=2, context=Context()) = Fill(value, vardim, context) + +function impute!(data::AbstractVector, imp::Fill) + imp.context() do c + fill_val = if isa(imp.value, Function) + # Call `deepcopy` because we can trust that it's available for all types. 
+ imp.value(Impute.drop(deepcopy(data); context=c)) + else + imp.value + end - for i in 1:length(data) - if ismissing(ctx, data[i]) - data[i] = fill_val + for i in eachindex(data) + if ismissing(c, data[i]) + data[i] = fill_val + end end - end - return data + return data + end end diff --git a/src/imputors/interp.jl b/src/imputors/interp.jl index a2a8c92..6e41398 100644 --- a/src/imputors/interp.jl +++ b/src/imputors/interp.jl @@ -1,47 +1,71 @@ -""" - Interpolate <: Imputor - -Performs linear interpolation between the nearest values in an vector. -""" -struct Interpolate <: Imputor end +struct Interpolate <: Imputor + vardim::Int + context::AbstractContext +end """ - impute!(imp::Interpolate, ctx::Context, data::AbstractVector) + Interpolate(; vardim=2, context=Context()) -Uses linear interpolation between existing elements of a vector to fill in missing data. +Performs linear interpolation between the nearest values in a vector. +The current implementation is univariate, so each variable in a table or matrix will +be handled independently. WARNING: Missing values at the head or tail of the array cannot be interpolated if there are no existing values on both sides. As a result, this method does not guarantee that all missing values will be imputed.
+ +# Keyword Arguments +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext`: A context which keeps track of missing data + summary information + +# Example +```jldoctest +julia> using Impute: Interpolate, Context, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, Interpolate(; vardim=1, context=Context(; limit=1.0))) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 3.0 4.0 5.0 + 1.1 2.2 3.3 4.4 5.5 +``` """ -function impute!(imp::Interpolate, ctx::Context, data::AbstractVector{<:Union{T, Missing}}) where T - i = findfirst(ctx, data) + 1 +Interpolate(; vardim=2, context=Context()) = Interpolate(vardim, context) - while i < length(data) - if ismissing(ctx, data[i]) - prev_idx = i - 1 - next_idx = findnext(ctx, data, i + 1) +function impute!(data::AbstractVector{<:Union{T, Missing}}, imp::Interpolate) where T + imp.context() do c + i = findfirst(c, data) + 1 - if next_idx !== nothing - gap_sz = (next_idx - prev_idx) - 1 + while i < lastindex(data) + if ismissing(c, data[i]) + prev_idx = i - 1 + next_idx = findnext(c, data, i + 1) - diff = data[next_idx] - data[prev_idx] - incr = diff / T(gap_sz + 1) - val = data[prev_idx] + incr + if next_idx !== nothing + gap_sz = (next_idx - prev_idx) - 1 - # Iteratively fill in the values - for j in i:(next_idx - 1) - data[j] = val - val += incr - end + diff = data[next_idx] - data[prev_idx] + incr = diff / T(gap_sz + 1) + val = data[prev_idx] + incr + + # Iteratively fill in the values + for j in i:(next_idx - 1) + data[j] = val + val += incr + end - i = next_idx - else - break + i = next_idx + else + break + end end + i += 1 end - i += 1 - end - return data + return data + end end diff --git a/src/imputors/locf.jl b/src/imputors/locf.jl index f911abf..02452ef 100644 --- a/src/imputors/locf.jl +++ b/src/imputors/locf.jl @@ -1,27 +1,53 @@ 
-struct LOCF <: Imputor end +struct LOCF <: Imputor + vardim::Int + context::AbstractContext +end """ - impute!(imp::LOCF, ctx::Context, data::AbstractVector) + LOCF(; vardim=2, context=Context()) + +Last observation carried forward (LOCF) iterates forwards through the `data` and fills +missing data with the last existing observation. The current implementation is univariate, +so each variable in a table or matrix will be handled independently. -Iterates forwards through the `data` and fills missing data with the last -existing observation. +See also: +- [NOCB](@ref): Next Observation Carried Backward WARNING: missing elements at the head of the array may not be imputed if there is no existing observation to carry forward. As a result, this method does not guarantee that all missing values will be imputed. -# Usage -``` +# Keyword Arguments +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext`: A context which keeps track of missing data + summary information + +# Example +```jldoctest +julia> using Impute: LOCF, Context, impute +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, LOCF(; vardim=1, context=Context(; limit=1.0))) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 2.0 2.0 5.0 + 1.1 2.2 3.3 3.3 5.5 ``` """ -function impute!(imp::LOCF, ctx::Context, data::AbstractVector) - start_idx = findfirst(ctx, data) + 1 - for i in start_idx:length(data) - if ismissing(ctx, data[i]) - data[i] = data[i-1] +LOCF(; vardim=2, context=Context()) = LOCF(vardim, context) + +function impute!(data::AbstractVector, imp::LOCF) + imp.context() do c + start_idx = findfirst(c, data) + 1 + for i in start_idx:lastindex(data) + if ismissing(c, data[i]) + data[i] = data[i-1] + end end - end - return data + return data + end end diff --git a/src/imputors/nocb.jl b/src/imputors/nocb.jl index dd7c914..aca5798 
100644 --- a/src/imputors/nocb.jl +++ b/src/imputors/nocb.jl @@ -1,32 +1,52 @@ -""" - NOCB <: Imputor +struct NOCB <: Imputor + vardim::Int + context::AbstractContext +end -Fills in missing data using the Next Observation Carried Backward (NOCB) approach. """ -struct NOCB <: Imputor end + NOCB(; vardim=2, context=Context()) -""" - impute!(imp::NOCB, ctx::Context, data::AbstractVector) +Next observation carried backward (NOCB) iterates backwards through the `data` and fills +missing data with the next existing observation. -Iterates backwards through the `data` and fills missing data with the next -existing observation. +See also: +- [LOCF](@ref): Last Observation Carried Forward WARNING: missing elements at the tail of the array may not be imputed if there is no existing observation to carry backward. As a result, this method does not guarantee that all missing values will be imputed. -# Usage -``` +# Keyword Arguments +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext`: A context which keeps track of missing data + summary information + +# Example +```jldoctest +julia> using Impute: NOCB, Context, impute +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(M, NOCB(; vardim=1, context=Context(; limit=1.0))) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 5.0 5.0 5.0 + 1.1 2.2 3.3 5.5 5.5 ``` """ -function impute!(imp::NOCB, ctx::Context, data::AbstractVector) - end_idx = findlast(ctx, data) - 1 - for i in end_idx:-1:1 - if ismissing(ctx, data[i]) - data[i] = data[i+1] +NOCB(; vardim=2, context=Context()) = NOCB(vardim, context) + +function impute!(data::AbstractVector, imp::NOCB) + imp.context() do c + end_idx = findlast(c, data) - 1 + for i in end_idx:-1:firstindex(data) + if ismissing(c, data[i]) + data[i] = data[i+1] + end end - end - return data + return data + end end diff --git 
a/test/REQUIRE b/test/REQUIRE deleted file mode 100644 index b163b18..0000000 --- a/test/REQUIRE +++ /dev/null @@ -1 +0,0 @@ -RDatasets 0.5 diff --git a/test/deprecated.jl b/test/deprecated.jl new file mode 100644 index 0000000..961d86e --- /dev/null +++ b/test/deprecated.jl @@ -0,0 +1,174 @@ +@testset "deprecated" begin + a = allowmissing(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + mask = map(!ismissing, a) + + @testset "Drop" begin + result = impute(a, :drop; limit=0.2) + expected = copy(a) + deleteat!(expected, [2, 3, 7]) + + @test result == expected + + # Mutating method + a2 = copy(a) + Impute.drop!(a2; limit=0.2) + @test a2 == expected + end + + @testset "Interpolate" begin + result = impute(a, :interp; limit=0.2) + @test result == collect(1.0:1.0:20) + @test result == interp(a) + + # Test in-place method + a2 = copy(a) + Impute.interp!(a2; limit=0.2) + @test a2 == result + + # Test interpolation between identical points + b = ones(Union{Float64, Missing}, 20) + b[[2, 3, 7]] .= missing + @test interp(b) == ones(Union{Float64, Missing}, 20) + + # Test interpolation at endpoints + b = ones(Union{Float64, Missing}, 20) + b[[1, 3, 20]] .= missing + result = interp(b) + @test ismissing(result[1]) + @test ismissing(result[20]) + end + + @testset "Fill" begin + @testset "Value" begin + fill_val = -1.0 + result = impute(a, :fill, fill_val; limit=0.2) + expected = copy(a) + expected[[2, 3, 7]] .= fill_val + + @test result == expected + end + + @testset "Mean" begin + result = impute(a, :fill; limit=0.2) + expected = copy(a) + expected[[2, 3, 7]] .= mean(a[mask]) + + @test result == expected + + a2 = copy(a) + Impute.fill!(a2; limit=0.2) + @test a2 == result + end + end + + @testset "LOCF" begin + result = impute(a, :locf; limit=0.2) + expected = copy(a) + expected[2] = 1.0 + expected[3] = 1.0 + expected[7] = 6.0 + + @test result == expected + a2 = copy(a) + impute!(a2, :locf; limit=0.2) + @test a2 == result + end + + @testset "NOCB" begin + result = impute(a, :nocb; 
limit=0.2) + expected = copy(a) + expected[2] = 4.0 + expected[3] = 4.0 + expected[7] = 8.0 + + @test result == expected + a2 = copy(a) + Impute.nocb!(a2; limit=0.2) + @test a2 == result + end + + @testset "DataFrame" begin + data = dataset("boot", "neuro") + df = impute(data, :interp; limit=1.0) + end + + @testset "Matrix" begin + data = Matrix(dataset("boot", "neuro")) + + @testset "Drop" begin + result = Iterators.drop(data) + @test size(result, 1) == 4 + end + + @testset "Fill" begin + result = impute(data, :fill, 0.0; limit=1.0) + @test size(result) == size(data) + end + end + + @testset "Not enough data" begin + @test_throws ImputeError impute(a, :drop) + end + + @testset "Chain" begin + orig = dataset("boot", "neuro") + + @testset "DataFrame" begin + result = chain( + orig, + Impute.Interpolate(), + Impute.LOCF(), + Impute.NOCB(); + limit=1.0 + ) + + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test all(!ismissing, Matrix(result)) + end + + @testset "Column Table" begin + data = Tables.columntable(orig) + result = chain( + data, + Impute.Interpolate(), + Impute.LOCF(), + Impute.NOCB(); + limit=1.0 + ) |> Tables.matrix + + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end + + @testset "Matrix" begin + data = Matrix(orig) + result = chain( + data, + Impute.Interpolate(), + Impute.LOCF(), + Impute.NOCB(); + limit=1.0 + ) + + @test size(result) == size(data) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end + end + + @testset "Alternate missing functions" begin + data1 = dataset("boot", "neuro") # Missing values with `missing` + data2 = impute(data1, :fill, NaN; limit=1.0) # Missing values with `NaN` + + @test impute(data1, :drop; limit=1.0) == dropmissing(data1) + + result1 = chain(data1, Impute.Interpolate(), Impute.Drop(); limit=1.0) + result2 = chain(data2, isnan, Impute.Interpolate(), 
Impute.Drop(); limit=1.0) + @test result1 == result2 + + @test Impute.drop(data1; limit=1.0) == impute(data2, isnan, :drop; limit=1.0) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 1a268e5..05e3002 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,36 +1,103 @@ using Impute +using Tables using Test using DataFrames using RDatasets using Statistics +using StatsBase + +import Impute: + Drop, + DropObs, + DropVars, + Interpolate, + Fill, + LOCF, + NOCB, + Context, + WeightedContext, + ImputeError @testset "Impute" begin - a = Vector{Union{Float64, Missing}}(1.0:1.0:20.0) + a = allowmissing(1.0:1.0:20.0) a[[2, 3, 7]] .= missing mask = map(!ismissing, a) + ctx = Context(; limit=0.2) @testset "Drop" begin - result = impute(a, :drop; limit=0.2) - expected = copy(a) - deleteat!(expected, [2, 3, 7]) + @testset "DropObs" begin + result = impute(a, DropObs(; context=ctx)) + expected = copy(a) + deleteat!(expected, [2, 3, 7]) - @test result == expected + @test result == expected + @test result == Impute.dropobs(a; context=ctx) + + a2 = copy(a) + Impute.dropobs!(a2; context=ctx) + @test a2 == expected + end + @testset "DropVars" begin + @testset "Vector" begin + @test_throws MethodError Impute.dropvars(a) + end + + @testset "Matrix" begin + m = reshape(a, 5, 4) + + result = impute(m, DropVars(; context=ctx)) + expected = copy(m)[:, 2:4] + + @test isequal(result, expected) + @test isequal(result, Impute.dropvars(m; context=ctx)) + @test isequal(result', Impute.dropvars(m'; vardim=1, context=ctx)) + + Impute.dropvars!(m; context=ctx) + # The mutating test is broken because we need to make a copy of + # the original matrix + @test_broken isequal(m, expected) + end + @testset "DataFrame" begin + df = DataFrame( + :sin => allowmissing(sin.(1.0:1.0:20.0)), + :cos => allowmissing(sin.(1.0:1.0:20.0)), + ) + df.sin[[2, 3, 7, 12, 19]] .= missing + df.cos[[4, 9]] .= missing + + result = impute(df, DropVars(; context=ctx)) + expected = select(df, :cos) + +
@test isequal(result, expected) + @test isequal(result, Impute.dropvars(df; context=ctx)) + + Impute.dropvars!(df; context=ctx) + # The mutating test is broken because we need to make a copy of + # the original table + @test_broken isequal(df, expected) + end + end end @testset "Interpolate" begin - result = impute(a, :interp; limit=0.2) + result = impute(a, Interpolate(; context=ctx)) @test result == collect(1.0:1.0:20) - @test result == interp(a) + @test result == interp(a; context=ctx) + + # Test in-place method + a2 = copy(a) + Impute.interp!(a2; context=ctx) + @test a2 == result # Test interpolation between identical points b = ones(Union{Float64, Missing}, 20) b[[2, 3, 7]] .= missing - @test interp(b) == ones(Union{Float64, Missing}, 20) + @test interp(b; context=ctx) == ones(Union{Float64, Missing}, 20) # Test interpolation at endpoints b = ones(Union{Float64, Missing}, 20) b[[1, 3, 20]] .= missing - result = interp(b) + result = interp(b; context=ctx) @test ismissing(result[1]) @test ismissing(result[20]) end @@ -38,88 +105,236 @@ using Statistics @testset "Fill" begin @testset "Value" begin fill_val = -1.0 - result = impute(a, :fill, fill_val; limit=0.2) + result = impute(a, Fill(; value=fill_val, context=ctx)) expected = copy(a) expected[[2, 3, 7]] .= fill_val @test result == expected + @test result == Impute.fill(a; value=fill_val, context=ctx) end @testset "Mean" begin - result = impute(a, :fill; limit=0.2) + result = impute(a, Fill(; value=mean, context=ctx)) expected = copy(a) expected[[2, 3, 7]] .= mean(a[mask]) @test result == expected + @test result == Impute.fill(a; value=mean, context=ctx) + + a2 = copy(a) + Impute.fill!(a2; context=ctx) + @test a2 == result end end @testset "LOCF" begin - result = impute(a, :locf; limit=0.2) + result = impute(a, LOCF(; context=ctx)) expected = copy(a) expected[2] = 1.0 expected[3] = 1.0 expected[7] = 6.0 @test result == expected + @test result == Impute.locf(a; context=ctx) + + a2 = copy(a) + Impute.locf!(a2;
context=ctx) + @test a2 == result end @testset "NOCB" begin - result = impute(a, :nocb; limit=0.2) + result = impute(a, NOCB(; context=ctx)) expected = copy(a) expected[2] = 4.0 expected[3] = 4.0 expected[7] = 8.0 @test result == expected + @test result == Impute.nocb(a; context=ctx) + + a2 = copy(a) + Impute.nocb!(a2; context=ctx) + @test a2 == result end @testset "DataFrame" begin - data = dataset("boot", "neuro") - df = impute(data, :interp; limit=1.0) + ctx = Context(; limit=1.0) + @testset "Single DataFrame" begin + data = dataset("boot", "neuro") + df = impute(data, Interpolate(; context=ctx)) + @test isequal(df, Impute.interp(data; context=ctx)) + end + @testset "GroupedDataFrame" begin + hod = repeat(1:24, 12 * 10) + obj = repeat(1:12, 24 * 10) + n = length(hod) + + df = DataFrame( + :hod => hod, + :obj => obj, + :val => allowmissing( + [sin(x) * cos(y) for (x, y) in zip(hod, obj)] + ), + ) + + df.val[rand(1:n, 20)] .= missing + gdf1 = groupby(deepcopy(df), [:hod, :obj]) + gdf2 = groupby(df, [:hod, :obj]) + + f1 = Impute.interp(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() + f2 = Impute.interp!(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() + + result = vcat(f1.(gdf1)...) 
+ @test df != result + @test size(result) == (24 * 12 * 10, 3) + @test all(!ismissing, Tables.matrix(result)) + + # Test that we can also mutate the dataframe directly + f2.(gdf2) + @test result == sort(df, (:hod, :obj)) + end end @testset "Matrix" begin + ctx = Context(; limit=1.0) + expected = Matrix(Impute.dropobs(dataset("boot", "neuro"); context=ctx)) data = Matrix(dataset("boot", "neuro")) @testset "Drop" begin - result = Iterators.drop(data) + result = impute(data, DropObs(; context=ctx)) @test size(result, 1) == 4 + @test result == Impute.dropobs(data; context=ctx) + + @test result == expected + @test Impute.dropobs(data'; vardim=1, context=ctx) == expected' end @testset "Fill" begin - result = impute(data, :fill, 0.0; limit=1.0) + result = impute(data, Fill(; value=0.0, context=ctx)) @test size(result) == size(data) + @test result == Impute.fill(data; value=0.0, context=ctx) + + data2 = copy(data) + Impute.fill!(data2; value=0.0, context=ctx) + @test data2 == result end end @testset "Not enough data" begin - @test_throws ImputeError impute(a, :drop) + ctx = Context(; limit=0.1) + @test_throws ImputeError impute(a, DropObs(; context=ctx)) + @test_throws ImputeError Impute.dropobs(a; context=ctx) end @testset "Chain" begin - data = Matrix(dataset("boot", "neuro")) - result = chain( - data, - Impute.Interpolate(), - Impute.LOCF(), - Impute.NOCB(); - limit=1.0 - ) - - @test size(result) == size(data) - # Confirm that we don't have any more missing values - @test !any(ismissing, result) + orig = dataset("boot", "neuro") + ctx = Context(; limit=1.0) + + @testset "DataFrame" begin + result = Impute.interp(orig; context=ctx) |> Impute.locf!() |> Impute.nocb!() + + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test all(!ismissing, Matrix(result)) + + # We can also use the Chain type with explicit Imputor types + result2 = impute( + orig, + Impute.Chain( + Impute.Interpolate(; context=ctx), + Impute.LOCF(), + 
Impute.NOCB() + ), + ) + + @test result == result2 + end + + @testset "Column Table" begin + result = Tables.columntable(orig) |> + Impute.interp!(; context=ctx) |> + Impute.locf!() |> + Impute.nocb!() |> + Tables.matrix + + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end + + @testset "Matrix" begin + data = Matrix(orig) + result = Impute.interp(data; context=ctx) |> Impute.locf!() |> Impute.nocb!() + + @test size(result) == size(data) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end end @testset "Alternate missing functions" begin - data1 = dataset("boot", "neuro") # Missing values with `missing` - data2 = impute(data1, :fill, NaN; limit=1.0) # Missing values with `NaN` + ctx1 = Context(; limit=1.0) + ctx2 = Context(; limit=1.0, is_missing=isnan) + data1 = dataset("boot", "neuro") # Missing values with `missing` + data2 = Impute.fill(data1; value=NaN, context=ctx1) # Missing values with `NaN` + + @test Impute.dropobs(data1; context=ctx1) == dropmissing(data1) - @test impute(data1, :drop; limit=1.0) == dropmissing(data1) + result1 = Impute.interp(data1; context=ctx1) |> Impute.dropobs!() + result2 = Impute.interp(data2; context=ctx2) |> Impute.dropobs!(; context=ctx2) - result1 = chain(data1, Impute.Interpolate(), Impute.Drop(); limit=1.0) - result2 = chain(data2, isnan, Impute.Interpolate(), Impute.Drop(); limit=1.0) @test result1 == result2 end + + @testset "Contexts" begin + @testset "Base" begin + ctx = Context(; limit=0.1) + @test_throws ImputeError Impute.dropobs(a; context=ctx) + @test_throws ImputeError impute(a, DropObs(; context=ctx)) + end + + @testset "Weighted" begin + # If we use an exponentially weighted context then we won't pass the limit + # because missing earlier observations is less important than later ones. 
+ ctx = WeightedContext(eweights(20, 0.3); limit=0.1) + @test isa(ctx, WeightedContext) + result = impute(a, DropObs(; context=ctx)) + expected = copy(a) + deleteat!(expected, [2, 3, 7]) + @test result == expected + + # If we reverse the weights such that earlier observations are more important + # then our previous limit of 0.2 won't be enough to succeed. + ctx = WeightedContext(reverse!(eweights(20, 0.3)); limit=0.2) + @test_throws ImputeError impute(a, DropObs(; context=ctx)) + end + end + + @testset "Utils" begin + drop_dim1 = DropObs(; vardim=1) + drop_dim2 = DropObs(; vardim=2) + M = [1.0 2.0 3.0 4.0 5.0; 1.1 2.2 3.3 4.4 5.5] + + @testset "obswise" begin + @test map(sum, Impute.obswise(drop_dim1, M)) == [2.1, 4.2, 6.3, 8.4, 10.5] + @test map(sum, Impute.obswise(drop_dim2, M)) == [15, 16.5] + end + + @testset "varwise" begin + @test map(sum, Impute.varwise(drop_dim1, M)) == [15, 16.5] + @test map(sum, Impute.varwise(drop_dim2, M)) == [2.1, 4.2, 6.3, 8.4, 10.5] + end + + @testset "filterobs" begin + @test Impute.filterobs(x -> sum(x) > 5.0, drop_dim1, M) == M[:, 3:5] + @test Impute.filterobs(x -> sum(x) > 15.0, drop_dim2, M) == M[[false, true], :] + end + + @testset "filtervars" begin + @test Impute.filtervars(x -> sum(x) > 15.0, drop_dim1, M) == M[[false, true], :] + @test Impute.filtervars(x -> sum(x) > 5.0, drop_dim2, M) == M[:, 3:5] + end + end + + include("deprecated.jl") end