Merge pull request #20 from invenia/rf/tables

Switch to Tables.jl API
invenia · Jul 15, 2019 · 77b9fa3 · 77b9fa3
2 parents 255ac57 + 051d6ce
commit 77b9fa3
Show file tree

Hide file tree

Showing 20 changed files with 1,565 additions and 855 deletions.
diff --git a/.appveyor.yml b/.appveyor.yml
@@ -1,6 +1,5 @@
 environment:
   matrix:
-  - julia_version: 0.7
   - julia_version: 1.0
   - julia_version: nightly
 

diff --git a/.travis.yml b/.travis.yml
@@ -4,7 +4,6 @@ os:
   - linux
   - osx
 julia:
-  - 0.7
   - 1.0
   - nightly
 notifications:
@@ -18,18 +17,13 @@ matrix:
 #  - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
 #  - julia -e 'Pkg.clone(pwd()); Pkg.build("Impute"); Pkg.test("Impute"; coverage=true)'
 after_success:
-  - |
-      julia -e '
-        VERSION >= v"0.7.0-DEV.3656" && using Pkg
-        VERSION >= v"0.7.0-DEV.5183" || cd(Pkg.dir("Impute"))
-        Pkg.add("Coverage")
-        using Coverage
-        Codecov.submit(Codecov.process_folder())
-      '
-  - |
-      julia -e '
-        VERSION >= v"0.7.0-DEV.3656" && using Pkg
-        VERSION >= v"0.7.0-DEV.5183" || cd(Pkg.dir("Impute"))
-        Pkg.add("Documenter")
-        include(joinpath("docs", "make.jl"))
-      '
+  - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())'
+jobs:
+  include:
+    - stage: "Documentation"
+      julia: 1.0
+      os: linux
+      script:
+        - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
+        - julia --project=docs/ docs/make.jl
+      after_success: skip
diff --git a/Project.toml b/Project.toml
@@ -4,16 +4,22 @@ authors = ["Invenia Technical Computing"]
 version = "0.2.0"
 
 [deps]
-DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-DataFrames = "0.17, 0.18"
+DataFrames = ">= 0.16"
+IterTools = "1.2"
+RDatasets = ">= 0.6.2"
+Tables = "0.2"
 julia = "1"
 
 [extras]
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["RDatasets", "Test"]
+test = ["DataFrames", "RDatasets", "Test"]
diff --git a/README.md b/README.md
@@ -5,30 +5,110 @@
 [![Build status](https://ci.appveyor.com/api/projects/status/github/invenia/Impute.jl?svg=true)](https://ci.appveyor.com/project/invenia/Impute-jl)
 [![codecov](https://codecov.io/gh/invenia/Impute.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/invenia/Impute.jl)
 
-Impute.jl provides various data imputation methods for `Arrays` and `DataFrames` with various types of missing values.
+Impute.jl provides various methods for handling missing data in Vectors, Matrices and [Tables](https://github.com/JuliaData/Tables.jl).
 
 ## Installation
 ```julia
-Pkg.clone("https://github.com/invenia/Impute.jl")
+julia> using Pkg; Pkg.add("Impute")
 ```
 
-## Features
-* Operate over Vectors, Matrices or DataFrames
-* Chaining of methods
+## Quickstart
+Let's start by loading our dependencies:
+```julia
+julia> using DataFrames, RDatasets, Impute
+```
+
+We'll also want some test data containing missings to work with:
+
+```julia
+julia> df = dataset("boot", "neuro")
+469×6 DataFrames.DataFrame
+│ Row │ V1       │ V2       │ V3      │ V4       │ V5       │ V6       │
+│     │ Float64⍰ │ Float64⍰ │ Float64 │ Float64⍰ │ Float64⍰ │ Float64⍰ │
+├─────┼──────────┼──────────┼─────────┼──────────┼──────────┼──────────┤
+│ 1   │ missing  │ -203.7   │ -84.1   │ 18.5     │ missing  │ missing  │
+│ 2   │ missing  │ -203.0   │ -97.8   │ 25.8     │ 134.7    │ missing  │
+│ 3   │ missing  │ -249.0   │ -92.1   │ 27.8     │ 177.1    │ missing  │
+│ 4   │ missing  │ -231.5   │ -97.5   │ 27.0     │ 150.3    │ missing  │
+│ 5   │ missing  │ missing  │ -130.1  │ 25.8     │ 160.0    │ missing  │
+│ 6   │ missing  │ -223.1   │ -70.7   │ 62.1     │ 197.5    │ missing  │
+│ 7   │ missing  │ -164.8   │ -12.2   │ 76.8     │ 202.8    │ missing  │
+⋮
+│ 462 │ missing  │ -207.3   │ -88.3   │ 9.6      │ 104.1    │ 218.0    │
+│ 463 │ -242.6   │ -142.0   │ -21.8   │ 69.8     │ 148.7    │ missing  │
+│ 464 │ -235.9   │ -128.8   │ -33.1   │ 68.8     │ 177.1    │ missing  │
+│ 465 │ missing  │ -140.8   │ -38.7   │ 58.1     │ 186.3    │ missing  │
+│ 466 │ missing  │ -149.5   │ -40.3   │ 62.8     │ 139.7    │ 242.5    │
+│ 467 │ -247.6   │ -157.8   │ -53.3   │ 28.3     │ 122.9    │ 227.6    │
+│ 468 │ missing  │ -154.9   │ -50.8   │ 28.1     │ 119.9    │ 201.1    │
+│ 469 │ missing  │ -180.7   │ -70.9   │ 33.7     │ 114.8    │ 222.5    │
+```
 
-## Methods
+Our first instinct might be to drop all observations containing missing values, but this leaves us too few rows to work with:
 
-* drop - remove missing
-* locf - last observation carried forward
-* nocb - next observation carried backward
-* interp - linear interpolation of values in vector
-* fill - replace with a specific value or a function which returns a value given the existing vector with missing values dropped.
+```julia
+julia> Impute.drop(df)
+4×6 DataFrames.DataFrame
+│ Row │ V1      │ V2      │ V3      │ V4      │ V5      │ V6      │
+│     │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │
+├─────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤
+│ 1   │ -247.0  │ -132.2  │ -18.8   │ 28.2    │ 81.4    │ 237.9   │
+│ 2   │ -234.0  │ -140.8  │ -56.5   │ 28.0    │ 114.3   │ 222.9   │
+│ 3   │ -215.8  │ -114.8  │ -18.4   │ 65.3    │ 171.6   │ 249.7   │
+│ 4   │ -247.6  │ -157.8  │ -53.3   │ 28.3    │ 122.9   │ 227.6   │
+```
 
-## TODO
+We could try imputing the values with linear interpolation, but that still leaves missing
+data at the head and tail of our dataset:
+
+```julia
+julia> Impute.interp(df)
+469×6 DataFrames.DataFrame
+│ Row │ V1       │ V2       │ V3      │ V4       │ V5       │ V6       │
+│     │ Float64⍰ │ Float64⍰ │ Float64 │ Float64⍰ │ Float64⍰ │ Float64⍰ │
+├─────┼──────────┼──────────┼─────────┼──────────┼──────────┼──────────┤
+│ 1   │ missing  │ -203.7   │ -84.1   │ 18.5     │ missing  │ missing  │
+│ 2   │ missing  │ -203.0   │ -97.8   │ 25.8     │ 134.7    │ missing  │
+│ 3   │ missing  │ -249.0   │ -92.1   │ 27.8     │ 177.1    │ missing  │
+│ 4   │ missing  │ -231.5   │ -97.5   │ 27.0     │ 150.3    │ missing  │
+│ 5   │ missing  │ -227.3   │ -130.1  │ 25.8     │ 160.0    │ missing  │
+│ 6   │ missing  │ -223.1   │ -70.7   │ 62.1     │ 197.5    │ missing  │
+│ 7   │ missing  │ -164.8   │ -12.2   │ 76.8     │ 202.8    │ missing  │
+⋮
+│ 462 │ -241.025 │ -207.3   │ -88.3   │ 9.6      │ 104.1    │ 218.0    │
+│ 463 │ -242.6   │ -142.0   │ -21.8   │ 69.8     │ 148.7    │ 224.125  │
+│ 464 │ -235.9   │ -128.8   │ -33.1   │ 68.8     │ 177.1    │ 230.25   │
+│ 465 │ -239.8   │ -140.8   │ -38.7   │ 58.1     │ 186.3    │ 236.375  │
+│ 466 │ -243.7   │ -149.5   │ -40.3   │ 62.8     │ 139.7    │ 242.5    │
+│ 467 │ -247.6   │ -157.8   │ -53.3   │ 28.3     │ 122.9    │ 227.6    │
+│ 468 │ missing  │ -154.9   │ -50.8   │ 28.1     │ 119.9    │ 201.1    │
+│ 469 │ missing  │ -180.7   │ -70.9   │ 33.7     │ 114.8    │ 222.5    │
+```
+
+Finally, we can chain multiple simple methods together to give a complete dataset:
+
+```julia
+julia> Impute.interp(df) |> Impute.locf() |> Impute.nocb()
+469×6 DataFrames.DataFrame
+│ Row │ V1       │ V2       │ V3      │ V4       │ V5       │ V6       │
+│     │ Float64⍰ │ Float64⍰ │ Float64 │ Float64⍰ │ Float64⍰ │ Float64⍰ │
+├─────┼──────────┼──────────┼─────────┼──────────┼──────────┼──────────┤
+│ 1   │ -233.6   │ -203.7   │ -84.1   │ 18.5     │ 134.7    │ 222.7    │
+│ 2   │ -233.6   │ -203.0   │ -97.8   │ 25.8     │ 134.7    │ 222.7    │
+│ 3   │ -233.6   │ -249.0   │ -92.1   │ 27.8     │ 177.1    │ 222.7    │
+│ 4   │ -233.6   │ -231.5   │ -97.5   │ 27.0     │ 150.3    │ 222.7    │
+│ 5   │ -233.6   │ -227.3   │ -130.1  │ 25.8     │ 160.0    │ 222.7    │
+│ 6   │ -233.6   │ -223.1   │ -70.7   │ 62.1     │ 197.5    │ 222.7    │
+│ 7   │ -233.6   │ -164.8   │ -12.2   │ 76.8     │ 202.8    │ 222.7    │
+⋮
+│ 462 │ -241.025 │ -207.3   │ -88.3   │ 9.6      │ 104.1    │ 218.0    │
+│ 463 │ -242.6   │ -142.0   │ -21.8   │ 69.8     │ 148.7    │ 224.125  │
+│ 464 │ -235.9   │ -128.8   │ -33.1   │ 68.8     │ 177.1    │ 230.25   │
+│ 465 │ -239.8   │ -140.8   │ -38.7   │ 58.1     │ 186.3    │ 236.375  │
+│ 466 │ -243.7   │ -149.5   │ -40.3   │ 62.8     │ 139.7    │ 242.5    │
+│ 467 │ -247.6   │ -157.8   │ -53.3   │ 28.3     │ 122.9    │ 227.6    │
+│ 468 │ -247.6   │ -154.9   │ -50.8   │ 28.1     │ 119.9    │ 201.1    │
+│ 469 │ -247.6   │ -180.7   │ -70.9   │ 33.7     │ 114.8    │ 222.5    │
+```
 
-* Dropping rows in a matrix allocates extra memory (ie: `data[mask, :]` make a copy).
-* More sophisticated imputation methods
-    1. MICE
-    2. EM
-    3. kNN
-    4. Regression
+**Warning**: Your approach should depend on the properties of your data (e.g., [MCAR, MAR, MNAR](https://en.wikipedia.org/wiki/Missing_data#Types_of_missing_data)).
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -0,0 +1,9 @@
+[deps]
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+Impute = "f7bf1975-0170-51b9-8c5f-a992d46b9575"
+RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
+
+[compat]
+DataFrames = ">= 0.16"
+Documenter = "~0.22"
diff --git a/docs/make.jl b/docs/make.jl
@@ -1,4 +1,4 @@
-using Documenter, Impute, RDatasets
+using Documenter, Impute
 
 makedocs(
     modules=[Impute],