Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove n miss cumsum #187

Merged
merged 2 commits into from
Jul 30, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: naniar
Type: Package
Title: Data Structures, Summaries, and Visualisations for Missing Data
Version: 0.3.3.9000
Version: 0.3.3.9100
Authors@R: c(
person("Nicholas", "Tierney",
role = c("aut", "cre"),
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# naniar 0.3.3.9100 (2018/07/31)

## New feature

* `miss_var_summary` and `miss_case_summary` now no longer provide the
cumulative sum of missingness in the summaries - this summary can be added back
to the data with the option `add_cumsum = TRUE`. #186

# naniar 0.3.3.9000 (2018/07/30)

## Breaking Changes
Expand Down
12 changes: 10 additions & 2 deletions R/add-cols.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ add_shadow_shift <- function(data, ..., suffix = "shift"){
# change names
names(shadow_shifted_df) <- paste0(names(shadow_shifted_df), "_", suffix)

tibble::as_tibble(dplyr::bind_cols(data, shadow_shifted_df))
return(
tibble::as_tibble(dplyr::bind_cols(data, shadow_shifted_df))
)

}

Expand All @@ -81,7 +83,9 @@ add_shadow_shift <- function(data, ..., suffix = "shift"){
# change names
names(shadow_shifted_df) <- paste0(names(shadow_shifted_df),"_",suffix)

tibble::as_tibble(dplyr::bind_cols(data, shadow_shifted_df))
return(
tibble::as_tibble(dplyr::bind_cols(data, shadow_shifted_df))
)

}

Expand Down Expand Up @@ -141,7 +145,9 @@ add_any_miss <- function(data, ..., label = "any_miss"){

names(stub_data_label) <- paste0(label,"_all")

return(
dplyr::bind_cols(data, stub_data_label) %>% tibble::as_tibble()
)

}

Expand All @@ -159,7 +165,9 @@ add_any_miss <- function(data, ..., label = "any_miss"){

names(stub_data_label) <- paste0(label,"_vars")

return(
dplyr::bind_cols(data, stub_data_label) %>% tibble::as_tibble()
)

}

Expand Down
9 changes: 7 additions & 2 deletions R/add-n-prop-miss.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ add_n_miss <- function(data, ..., label = "n_miss"){

if (missing(...)) {
data[[paste0(label, "_all")]] <- n_miss_row(data)

return(data)
}

quo_vars <- rlang::quos(...)
Expand All @@ -38,7 +40,7 @@ add_n_miss <- function(data, ..., label = "n_miss"){

data[[paste0(label, "_vars")]] <- n_miss_row(selected_data)

data
return(data)
}

#' Add column containing proportion of missing data values
Expand Down Expand Up @@ -90,6 +92,8 @@ add_prop_miss <- function(data, ..., label = "prop_miss"){

if (missing(...)) {
data[[paste0(label, "_all")]] <- prop_miss_row(data)

return(data)
}

quo_vars <- rlang::quos(...)
Expand All @@ -98,5 +102,6 @@ add_prop_miss <- function(data, ..., label = "prop_miss"){

data[[paste0(label, "_vars")]] <- prop_miss_row(selected_data)

data
return(data)

}
93 changes: 69 additions & 24 deletions R/miss-x-summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#' @param data a data.frame
#' @param order a logical indicating whether to order the result by `n_miss`.
#' Defaults to TRUE. If FALSE, order of variables is the order input.
#' @param add_cumsum logical indicating whether or not to add the cumulative
#' sum of missings to the data. This can be useful when exploring patterns
#' of nonresponse. These are calculated as the cumulative sum of the missings
#' in the variables as they are first presented to the function.
#' @param ... extra arguments
#'
#' @note `n_miss_cumsum` is calculated as the cumulative sum of missings in the
Expand All @@ -31,7 +35,10 @@
#' miss_var_summary()
#'
#' @export
miss_var_summary <- function(data, order = FALSE, ...) {
miss_var_summary <- function(data,
order = FALSE,
add_cumsum = FALSE,
...) {

test_if_null(data)

Expand All @@ -41,11 +48,19 @@ miss_var_summary <- function(data, order = FALSE, ...) {
}

#' @export
miss_var_summary.default <- function(data, order = TRUE, ...) {
miss_var_summary.default <- function(data,
order = TRUE,
add_cumsum = FALSE,
...) {

res <- purrr::map_df(data, n_miss) %>%
tidyr::gather(key = "variable", value = "n_miss") %>%
dplyr::mutate(pct_miss = (n_miss / nrow(data) * 100),
n_miss_cumsum = cumsum(n_miss))
dplyr::mutate(pct_miss = (n_miss / nrow(data) * 100))

if (add_cumsum) {
res <- res %>% dplyr::mutate(n_miss_cumsum = cumsum(n_miss))
}

if (order) {
return(dplyr::arrange(res, -n_miss))
}
Expand All @@ -56,9 +71,15 @@ miss_var_summary.default <- function(data, order = TRUE, ...) {
}

#' @export
miss_var_summary.grouped_df <- function(data, order = TRUE, ...) {
miss_var_summary.grouped_df <- function(data,
order = TRUE,
add_cumsum = FALSE,
...) {

group_by_fun(data, .fun = miss_var_summary, order = order)
group_by_fun(data,
.fun = miss_var_summary,
order = order,
add_cumsum = add_cumsum)

}

Expand All @@ -72,10 +93,10 @@ miss_var_summary.grouped_df <- function(data, order = TRUE, ...) {
#' @param order a logical indicating whether or not to order the result by
#' n_miss. Defaults to TRUE. If FALSE, order of cases is the order input.
#' @param ... extra arguments
#'
#' @note `n_miss_cumsum` is calculated as the cumulative sum of missings in the
#' variables in the order that they are given in the data when entering
#' the function
#' @param add_cumsum logical indicating whether or not to add the cumulative
#' sum of missings to the data. This can be useful when exploring patterns
#' of nonresponse. These are calculated as the cumulative sum of the missings
#' in the variables as they are first presented to the function.
#'
#' @return a tibble of the percent of missing data in each case.
#'
Expand All @@ -93,7 +114,10 @@ miss_var_summary.grouped_df <- function(data, order = TRUE, ...) {
#'
#' miss_case_summary(airquality)
#'
miss_case_summary <- function(data, order = TRUE, ...){
miss_case_summary <- function(data,
order = TRUE,
add_cumsum = FALSE,
...){

test_if_null(data)

Expand All @@ -103,35 +127,56 @@ miss_case_summary <- function(data, order = TRUE, ...){
}

#' @export
miss_case_summary.default <- function(data, order = TRUE, ...){
miss_case_summary.default <- function(data,
order = TRUE,
add_cumsum = FALSE,
...){

res <- data

res[["pct_miss"]] <- rowMeans(is.na(res))*100
res[["n_miss"]] <- as.integer(rowSums(is.na(res)))
res[["case"]] <- seq_len(nrow(res))
res[["n_miss_cumsum"]] <- cumsum(res[["n_miss"]])

res <- dplyr::as_tibble(res)
if (add_cumsum) {
res[["n_miss_cumsum"]] <- cumsum(res[["n_miss"]])
res <- dplyr::as_tibble(res)
res <- dplyr::select(res,
case,
n_miss,
pct_miss,
n_miss_cumsum)
}

if (!add_cumsum) {
res <- dplyr::as_tibble(res)

res <- dplyr::select(res,
case,
n_miss,
pct_miss,
n_miss_cumsum)
res <- dplyr::select(res,
case,
n_miss,
pct_miss)

}

if (order) {
return(dplyr::arrange(res, -n_miss))
}

return(res)

if (!order) {
return(res)
}
}

#' @export
miss_case_summary.grouped_df <- function(data, order = TRUE, ...){

group_by_fun(data, .fun = miss_case_summary, order = order)
miss_case_summary.grouped_df <- function(data,
order = TRUE,
add_cumsum = FALSE,
...){

group_by_fun(data,
.fun = miss_case_summary,
order = order,
add_cumsum = add_cumsum)

}

Expand Down
6 changes: 5 additions & 1 deletion R/shadows.R
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,9 @@ bind_shadow <- function(data, only_miss = FALSE){

shadow_vars <- dplyr::select(data, !!!miss_vars) %>% as_shadow()

tibble::as_tibble(dplyr::bind_cols(data, shadow_vars))
return(
tibble::as_tibble(dplyr::bind_cols(data, shadow_vars))
)

# if you want All the values to be added (the default behaviour)
}
Expand All @@ -136,7 +138,9 @@ bind_shadow <- function(data, only_miss = FALSE){

bound_shadow <- dplyr::bind_cols(data, data_shadow)

return(
tibble::as_tibble(bound_shadow)
)

}

Expand Down
12 changes: 6 additions & 6 deletions man/miss_case_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion man/miss_var_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 24 additions & 3 deletions tests/testthat/test-miss-case-summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,39 @@ test_that("miss_case_summary on grouped_df returns a tibble", {
expect_is(miss_case_summary(aq_group), "tbl_df")
})

test_that("grouped_df returns 1 more column than regular miss_case_summary", {
test_that("miss_case_summary produces the right number of columns", {
expect_equal(ncol(miss_case_summary(airquality)),
3)
})

test_that("miss_case_summary grouped produces the right number of columns", {
expect_equal(ncol(miss_case_summary(aq_group)),
4)
})

test_that("grouped_df returns the same number of columns as regular miss_case_summary", {
expect_equal(ncol(miss_case_summary(aq_group)),
ncol(miss_case_summary(airquality))+1)
ncol(miss_case_summary(airquality)) + 1)
})

test_that("grouped_df returns a column named 'Month'", {
expect_identical(names(miss_case_summary(aq_group)),
c("Month", "case", "n_miss","pct_miss", "n_miss_cumsum"))
c("Month", "case", "n_miss","pct_miss"))
})

test_that("grouped_df returns a column named 'Month' with the right levels", {
expect_identical(unique(miss_case_summary(aq_group)$Month),
5:9)
})

# add testing for cumulative sum ----------------------------------------------

test_that("miss_case_summary adds cumsum when add_cumsum = TRUE", {
expect_equal(names(miss_case_summary(airquality, add_cumsum = TRUE)),
c("case", "n_miss", "pct_miss", "n_miss_cumsum"))
})

test_that("miss_case_summary grouped adds cumsum when add_cumsum = TRUE", {
expect_equal(names(miss_case_summary(aq_group, add_cumsum = TRUE)),
c("Month", "case", "n_miss", "pct_miss", "n_miss_cumsum"))
})
Loading