diff --git a/R/summarise.r b/R/summarise.r index fce6df8..e9f1712 100644 --- a/R/summarise.r +++ b/R/summarise.r @@ -84,7 +84,7 @@ summarise_.grouped_svy <- function(.data, ..., .dots) { #' #' #' @param .data tbl A \code{tbl_svy} object -#' @param ... Name-value pairs of summary functions +#' @param ... Name-value pairs of summarizing expressions, see details #' @param .groups Defaults to "drop_last" in srvyr meaning that the last group is peeled #' off, but if there are more groups they will be preserved. Other options are "drop", which #' drops all groups, "keep" which keeps all of them and "rowwise" which converts the object @@ -130,6 +130,13 @@ summarise_.grouped_svy <- function(.data, ..., .dots) { #' Calculate an unweighted estimate as you would on a regular \code{tbl_df}. #' Based on dplyr's \code{\link[dplyr]{summarise}}.} #'} +#' +#' You can use expressions both in the \code{...} of \code{summarize} and also +#' in the arguments to the summarizing functions. Though this is valid syntactically +#' it can also allow you to calculate incorrect results (for example if you multiply +#' the mean by 100, the standard error is also multiplied by 100, but the variance +#' is not). +#' #' @examples #' data(api, package = "survey") #' @@ -149,6 +156,17 @@ summarise_.grouped_svy <- function(.data, ..., .dots) { #' api00_mn = survey_mean(api00), #' api_diff = survey_mean(api00 - api99)) #' +#' # Expressions are allowed in summarize arguments & inside functions +#' # Here we can calculate a binary variable on the fly and also multiply by 100 to +#' # get percentages +#' dstrata %>% +#' summarize(api99_over_700_pct = 100 * survey_mean(api99 > 700)) +#' +#' # But be careful, the variance doesn't scale the same way, so this is wrong! +#' dstrata %>% +#' summarize(api99_over_700_pct = 100 * survey_mean(api99 > 700, vartype = "var")) +#' # Wrong variance!
+#' #' @name summarise #' @export #' @importFrom dplyr summarise diff --git a/man/summarise.Rd b/man/summarise.Rd index 44c5008..a9c6ad1 100644 --- a/man/summarise.Rd +++ b/man/summarise.Rd @@ -7,7 +7,7 @@ \arguments{ \item{.data}{tbl A \code{tbl_svy} object} -\item{...}{Name-value pairs of summary functions} +\item{...}{Name-value pairs of summarizing expressions, see details} \item{.groups}{Defaults to "drop_last" in srvyr meaning that the last group is peeled off, but if there are more groups they will be preserved. Other options are "drop", which @@ -58,6 +58,12 @@ The available functions from srvyr are: Calculate an unweighted estimate as you would on a regular \code{tbl_df}. Based on dplyr's \code{\link[dplyr]{summarise}}.} } + +You can use expressions both in the \code{...} of \code{summarize} and also +in the arguments to the summarizing functions. Though this is valid syntactically +it can also allow you to calculate incorrect results (for example if you multiply +the mean by 100, the standard error is also multiplied by 100, but the variance +is not). } \examples{ data(api, package = "survey") @@ -78,4 +84,15 @@ dstrata_grp \%>\% api00_mn = survey_mean(api00), api_diff = survey_mean(api00 - api99)) +# Expressions are allowed in summarize arguments & inside functions +# Here we can calculate a binary variable on the fly and also multiply by 100 to +# get percentages +dstrata \%>\% + summarize(api99_over_700_pct = 100 * survey_mean(api99 > 700)) + +# But be careful, the variance doesn't scale the same way, so this is wrong! +dstrata \%>\% + summarize(api99_over_700_pct = 100 * survey_mean(api99 > 700, vartype = "var")) +# Wrong variance! + } diff --git a/vignettes/srvyr-vs-survey.Rmd b/vignettes/srvyr-vs-survey.Rmd index 088017d..2757ab5 100644 --- a/vignettes/srvyr-vs-survey.Rmd +++ b/vignettes/srvyr-vs-survey.Rmd @@ -209,12 +209,6 @@ ggplot(data = out, aes(x = stype, y = api_diff, group = hs_grad_pct, fill = hs_g ``` - -# CAUTION!
Difference in default degrees of freedom -Though it should be possible to get the exact same results using `srvyr` as you would -if you used the `survey` package, s - - # Grab Bag ## Using `survey` functions on `srvyr` objects @@ -229,11 +223,49 @@ summary(glm) ``` +## Using expressions to create variables on the fly +Like `dplyr`, `srvyr` allows you to use expressions in the arguments, +allowing you to create variables in a single step. For example, you can +use expressions: + +1) as the arguments inside the survey statistic functions like `survey_mean` +```{r, message = FALSE} +strat_design %>% + summarize(prop_api99_over_700 = survey_mean(api99 > 700)) +``` + +2) as an argument to `summarize` +```{r, message = FALSE} +strat_design %>% + group_by(awards) %>% + summarize(percentage = 100 * survey_mean()) +``` + +3) and you can even create variables inside of `group_by` +```{r, message = FALSE} +strat_design %>% + group_by(api99_above_700 = api99 > 700) %>% + summarize(api00_mn = survey_mean(api00)) +``` + +Though on-the-fly expressions are syntactically valid, it is possible to make statistically +invalid numbers from them. For example, though the standard error and confidence intervals +can be multiplied by a scalar (like 100), the variance does not scale the same way, so the +following is invalid: + +```{r, message = FALSE} +# BAD DON'T DO THIS! +strat_design %>% + group_by(awards) %>% + summarize(percentage = 100 * survey_mean(vartype = "var")) +# VARIANCE IS WRONG +``` + ## Non-Standard evaluation Srvyr supports the non-standard evaluation conventions that dplyr uses. If you'd like to use a function programmatically, you can use the functions from -rlang like the `{{` operator (aka "curly curly") from `rlang`. +rlang like the `{{` operator (aka "curly curly") from `rlang`. Here's a quick example, but please see the dplyr vignette [`vignette("programming", package = "dplyr")`](https://dplyr.tidyverse.org/articles/programming.html)