Andrew Gelman, Jennifer Hill, Aki Vehtari 2021-04-20
Tidyverse version by Bill Behrman.
Predict weight from height. See Chapters 9 and 10 in Regression and Other Stories.
# Packages
library(tidyverse)
library(rstanarm)
# Parameters
# Earnings data
file_earnings <- here::here("Earnings/data/earnings.csv")
# Common code
file_common <- here::here("_common.R")
#===============================================================================
# Run common code
source(file_common)
Data
earnings <-
file_earnings %>%
read_csv() %>%
mutate(
sex =
case_when(
male == 0 ~ "Female",
male == 1 ~ "Male",
TRUE ~ NA_character_
)
)
earnings %>%
select(height, weight, sex, ethnicity)
#> # A tibble: 1,816 x 4
#> height weight sex ethnicity
#> <dbl> <dbl> <chr> <chr>
#> 1 74 210 Male White
#> 2 66 125 Female White
#> 3 64 126 Female White
#> 4 65 200 Female White
#> 5 63 110 Female Other
#> 6 68 165 Female Black
#> 7 63 190 Female White
#> 8 64 125 Female White
#> 9 62 200 Female White
#> 10 73 230 Male White
#> # … with 1,806 more rows
mean(earnings$height)
#> [1] 66.6
Fit linear regression of weight against height.
The option refresh = 0
suppresses the default Stan sampling progress
output. This is useful for small data with fast computation. For more
complex models and bigger data, it can be useful to see the progress.
set.seed(733)
fit_1 <- stan_glm(weight ~ height, data = earnings, refresh = 0)
fit_1
#> stan_glm
#> family: gaussian [identity]
#> formula: weight ~ height
#> observations: 1789
#> predictors: 2
#> ------
#> Median MAD_SD
#> (Intercept) -173.4 12.3
#> height 4.9 0.2
#>
#> Auxiliary parameter(s):
#> Median MAD_SD
#> sigma 29.0 0.5
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg
Center heights.
earnings <-
earnings %>%
mutate(height_c = height - 66)
Fit using centered heights.
set.seed(733)
fit_2 <- stan_glm(weight ~ height_c, data = earnings, refresh = 0)
fit_2
#> stan_glm
#> family: gaussian [identity]
#> formula: weight ~ height_c
#> observations: 1789
#> predictors: 2
#> ------
#> Median MAD_SD
#> (Intercept) 153.4 0.7
#> height_c 4.9 0.2
#>
#> Auxiliary parameter(s):
#> Median MAD_SD
#> sigma 29.0 0.5
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg
New data for a 70-inch-tall person.
new <- tibble(height_c = 4)
Point prediction.
point_pred <- predict(fit_2, newdata = new)
as.double(point_pred)
#> [1] 173
Posterior simulations where variation is from posterior uncertainty in the coefficients.
linpred <- posterior_linpred(fit_2, newdata = new)
Posterior simulations where variation is from posterior uncertainty in the coefficients and predictive uncertainty.
set.seed(733)
pred <- posterior_predict(fit_2, newdata = new)
preds <- tibble(linpred = as.double(linpred), pred = as.double(pred))
preds %>%
summarize(across(everything(), list(median = median, mad = mad)))
#> # A tibble: 1 x 4
#> linpred_median linpred_mad pred_median pred_mad
#> <dbl> <dbl> <dbl> <dbl>
#> 1 173. 0.915 173. 28.3
The medians of the two sets of posterior simulations are similar, but the median absolute deviations are very different.
Distributions of posterior predictions.
preds %>%
ggplot(aes(y = stat(density))) +
geom_freqpoly(aes(pred, color = "pred"), binwidth = 10) +
geom_freqpoly(aes(linpred, color = "linpred"), binwidth = 0.2) +
scale_x_continuous(breaks = scales::breaks_width(50)) +
scale_y_continuous(breaks = 0) +
scale_color_discrete(
breaks = c("linpred", "pred"),
labels = c("Linear predictor", "Predicted value")
) +
theme(legend.position = "bottom") +
labs(
title = "Distributions of posterior predictions",
x = "Weight (pounds)",
y = NULL,
color = NULL
)
Data
earnings %>%
select(height, weight, sex, ethnicity)
#> # A tibble: 1,816 x 4
#> height weight sex ethnicity
#> <dbl> <dbl> <chr> <chr>
#> 1 74 210 Male White
#> 2 66 125 Female White
#> 3 64 126 Female White
#> 4 65 200 Female White
#> 5 63 110 Female Other
#> 6 68 165 Female Black
#> 7 63 190 Female White
#> 8 64 125 Female White
#> 9 62 200 Female White
#> 10 73 230 Male White
#> # … with 1,806 more rows
Fit linear regression of weight against height.
set.seed(733)
fit_1 <- stan_glm(weight ~ height, data = earnings, refresh = 0)
fit_1
#> stan_glm
#> family: gaussian [identity]
#> formula: weight ~ height
#> observations: 1789
#> predictors: 2
#> ------
#> Median MAD_SD
#> (Intercept) -173.4 12.3
#> height 4.9 0.2
#>
#> Auxiliary parameter(s):
#> Median MAD_SD
#> sigma 29.0 0.5
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg
Predict the weight for a 66-inch-tall person.
set.seed(733)
new <- tibble(height = 66)
pred <- posterior_predict(fit_1, newdata = new)
cat(
str_glue(
"The predicted weight for 66-inch-tall person is ",
"{format(mean(pred), digits = 1, nsmall = 1)} pounds with a sd of ",
"{format(sd(pred), digits = 1, nsmall = 1)}."
)
)
The predicted weight for 66-inch-tall person is 153.2 pounds with a sd of 29.0.
Center heights.
earnings <-
earnings %>%
mutate(height_c = height - 66)
Fit using centered heights.
set.seed(733)
fit_2 <- stan_glm(weight ~ height_c, data = earnings, refresh = 0)
fit_2
#> stan_glm
#> family: gaussian [identity]
#> formula: weight ~ height_c
#> observations: 1789
#> predictors: 2
#> ------
#> Median MAD_SD
#> (Intercept) 153.4 0.7
#> height_c 4.9 0.2
#>
#> Auxiliary parameter(s):
#> Median MAD_SD
#> sigma 29.0 0.5
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg
Fit linear regression of weight against height and sex.
set.seed(733)
fit_3 <- stan_glm(weight ~ height_c + sex, data = earnings, refresh = 0)
fit_3
#> stan_glm
#> family: gaussian [identity]
#> formula: weight ~ height_c + sex
#> observations: 1789
#> predictors: 3
#> ------
#> Median MAD_SD
#> (Intercept) 149.5 0.9
#> height_c 3.9 0.3
#> sexMale 11.8 2.0
#>
#> Auxiliary parameter(s):
#> Median MAD_SD
#> sigma 28.7 0.5
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg
Predict the weight for a 70-inch-tall woman.
set.seed(733)
new <- tibble(height_c = 4, sex = "Female")
pred <- posterior_predict(fit_3, newdata = new)
cat(
str_glue(
"The predicted weight for 70-inch-tall woman is ",
"{format(mean(pred), digits = 1, nsmall = 1)} pounds with a sd of ",
"{format(sd(pred), digits = 1, nsmall = 1)}."
)
)
The predicted weight for 70-inch-tall woman is 165.0 pounds with a sd of 28.8.
Predict the weight for a 70-inch-tall man.
set.seed(733)
new <- tibble(height_c = 4, sex = "Male")
pred <- posterior_predict(fit_3, newdata = new)
cat(
str_glue(
"The predicted weight for 70-inch-tall man is ",
"{format(mean(pred), digits = 1, nsmall = 1)} pounds with a sd of ",
"{format(sd(pred), digits = 1, nsmall = 1)}."
)
)
The predicted weight for 70-inch-tall man is 176.8 pounds with a sd of 28.7.
earnings %>%
count(ethnicity)
#> # A tibble: 4 x 2
#> ethnicity n
#> * <chr> <int>
#> 1 Black 180
#> 2 Hispanic 104
#> 3 Other 38
#> 4 White 1494
Fit linear regression of weight against height, sex, and ethnicity.
set.seed(733)
fit_4 <-
stan_glm(weight ~ height_c + sex + ethnicity, data = earnings, refresh = 0)
fit_4
#> stan_glm
#> family: gaussian [identity]
#> formula: weight ~ height_c + sex + ethnicity
#> observations: 1789
#> predictors: 6
#> ------
#> Median MAD_SD
#> (Intercept) 154.3 2.2
#> height_c 3.9 0.3
#> sexMale 12.2 2.0
#> ethnicityHispanic -6.2 3.5
#> ethnicityOther -12.3 5.1
#> ethnicityWhite -5.2 2.3
#>
#> Auxiliary parameter(s):
#> Median MAD_SD
#> sigma 28.6 0.5
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg
Make "White"
the baseline factor level for ethnicity
.
earnings <-
earnings %>%
mutate(
ethnicity =
factor(ethnicity, levels = c("White", "Black", "Hispanic", "Other"))
)
Fit linear regression of weight against height, sex, and ethnicity with new baseline factor level.
set.seed(733)
fit_5 <-
stan_glm(weight ~ height_c + sex + ethnicity, data = earnings, refresh = 0)
fit_5
#> stan_glm
#> family: gaussian [identity]
#> formula: weight ~ height_c + sex + ethnicity
#> observations: 1789
#> predictors: 6
#> ------
#> Median MAD_SD
#> (Intercept) 149.1 0.9
#> height_c 3.8 0.3
#> sexMale 12.1 1.9
#> ethnicityBlack 5.2 2.2
#> ethnicityHispanic -1.0 2.9
#> ethnicityOther -7.1 4.6
#>
#> Auxiliary parameter(s):
#> Median MAD_SD
#> sigma 28.6 0.5
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg
Equivalently, create indicator variables for the four ethnic groups.
earnings <-
earnings %>%
mutate(
ethnicity_white = if_else(ethnicity == "White", 1, 0),
ethnicity_black = if_else(ethnicity == "Black", 1, 0),
ethnicity_hispanic = if_else(ethnicity == "Hispanic", 1, 0),
ethnicity_other = if_else(ethnicity == "Other", 1, 0)
)
Fit linear regression of weight against height, sex, and ethnicity expressed as indicator variables.
set.seed(733)
fit_6 <-
stan_glm(
weight ~
height_c + sex + ethnicity_black + ethnicity_hispanic + ethnicity_other,
data = earnings,
refresh = 0
)
fit_6
#> stan_glm
#> family: gaussian [identity]
#> formula: weight ~ height_c + sex + ethnicity_black + ethnicity_hispanic +
#> ethnicity_other
#> observations: 1789
#> predictors: 6
#> ------
#> Median MAD_SD
#> (Intercept) 149.1 0.9
#> height_c 3.8 0.3
#> sexMale 12.1 1.9
#> ethnicity_black 5.2 2.2
#> ethnicity_hispanic -1.0 2.9
#> ethnicity_other -7.1 4.6
#>
#> Auxiliary parameter(s):
#> Median MAD_SD
#> sigma 28.6 0.5
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg