Andrew Gelman, Jennifer Hill, Aki Vehtari 2021-07-21
Tidyverse version by Bill Behrman.
Golf putting accuracy: Fitting a nonlinear model using Stan. See Chapter 22 in Regression and Other Stories.
# Packages
library(tidyverse)
library(rstan)
library(rstanarm)
# Parameters
# Seed
SEED <- 660
# Golf data
file_golf <- here::here("Golf/data/golf.txt")
# Common code
file_common <- here::here("_common.R")
#===============================================================================
# Run common code
source(file_common)
Data
golf <-
file_golf %>%
read_table(skip = 2) %>%
mutate(prob_se = sqrt((y / n) * (1 - y / n) / n))
golf
#> # A tibble: 19 x 4
#> x n y prob_se
#> <dbl> <dbl> <dbl> <dbl>
#> 1 2 1443 1346 0.00659
#> 2 3 694 577 0.0142
#> 3 4 455 337 0.0205
#> 4 5 353 208 0.0262
#> 5 6 272 149 0.0302
#> 6 7 256 136 0.0312
#> 7 8 240 111 0.0322
#> 8 9 217 69 0.0316
#> 9 10 200 67 0.0334
#> 10 11 237 75 0.0302
#> # … with 9 more rows
The variables are:

x
: Putt distance (feet)

n
: Number of attempts

y
: Number of successful putts

prob_se
: The standard error of y / n
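As a quick check of the standard error formula, here is the calculation for the first row (an illustrative computation, not part of the original analysis):

# Illustrative check of prob_se for the first row (y = 1346, n = 1443)
p <- 1346 / 1443
sqrt(p * (1 - p) / 1443)  # approximately 0.0066, matching prob_se above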
Data on putts in pro golf.
golf <-
golf %>%
mutate(
prob = y / n,
lower = prob - prob_se,
upper = prob + prob_se,
label = str_c(y, " / ", n)
)
plot <-
golf %>%
ggplot(aes(x, prob)) +
geom_linerange(aes(ymin = lower, ymax = upper)) +
geom_point() +
coord_cartesian(xlim = c(0, max(golf$x) + 1), ylim = 0:1) +
scale_y_continuous(
breaks = scales::breaks_width(0.2),
labels = scales::label_percent(accuracy = 1)
) +
labs(
x = "Distance from hole (feet)",
y = "Probabily of success"
)
plot +
geom_text(
aes(y = upper, label = label),
size = 2,
hjust = 0.15,
vjust = -0.5
) +
labs(
title = "Data on putts in pro golf",
subtitle = "Successful putts / Attempts"
)
Fit logistic regression using stan_glm().
set.seed(SEED)
fit_1_stan_glm <-
stan_glm(
cbind(y, n - y) ~ x,
family = binomial(link = "logit"),
data = golf,
refresh = 0
)
print(fit_1_stan_glm, digits = 2)
#> stan_glm
#> family: binomial [logit]
#> formula: cbind(y, n - y) ~ x
#> observations: 19
#> predictors: 2
#> ------
#> Median MAD_SD
#> (Intercept) 2.23 0.06
#> x -0.26 0.01
#>
#> ------
#> * For help interpreting the printed output see ?print.stanreg
#> * For info on the priors used see ?prior_summary.stanreg
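On the probability scale, the model's predicted chance of success at distance x is the inverse logit of a + b x. A rough illustrative check at 2 feet, using the rounded medians above:

# Rough check with rounded coefficients (a = 2.23, b = -0.26); illustrative only
plogis(2.23 - 0.26 * 2)  # about 0.85, versus the observed 1346 / 1443 = 0.93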
Fitted logistic regression.
line <-
tibble(
model = 1,
x = seq_range(c(0, max(golf$x) + 1)),
prob = predict(fit_1_stan_glm, type = "response", newdata = tibble(x))
)
label <-
tribble(
~x, ~prob, ~label,
8, 0.7,
str_glue(
"Logistic regression\n",
"a = {format(coef(fit_1_stan_glm)[['(Intercept)']], digits = 2, nsmall = 2)}, ",
"b = {format(coef(fit_1_stan_glm)[['x']], digits = 2, nsmall = 2)}"
)
)
plot +
geom_line(data = line) +
geom_text(aes(label = label), data = label, hjust = 0) +
labs(title = "Fitted logistic regression")
Here is the above logistic regression model expressed in the Stan language:
model_1 <-
"
data {
int N;
vector [N] x;
int n[N];
int y[N];
}
parameters {
real a;
real b;
}
model {
y ~ binomial_logit(n, a + b * x);
}
"
Data in form required by stan().
data_1 <- c(list(N = nrow(golf)), as.list(golf))
Fit logistic regression using stan().
set.seed(SEED)
fit_1_stan <- stan(model_code = model_1, data = data_1, refresh = 0)
Here is the result:
summary(fit_1_stan, pars = c("a", "b"), probs = c(0.25, 0.5, 0.75))$summary
#> mean se_mean sd 25% 50% 75% n_eff Rhat
#> a 2.233 0.001673 0.05503 2.20 2.232 2.269 1081 1
#> b -0.256 0.000194 0.00633 -0.26 -0.256 -0.252 1068 1
Let’s compare the coefficients and their standard errors from both methods.
coef(fit_1_stan_glm)[["(Intercept)"]] - summary(fit_1_stan)$summary["a", "mean"]
#> [1] -0.00168
coef(fit_1_stan_glm)[["x"]] - summary(fit_1_stan)$summary["b", "mean"]
#> [1] 0.000262
se(fit_1_stan_glm)[["(Intercept)"]] - summary(fit_1_stan)$summary["a", "sd"]
#> [1] 0.00501
se(fit_1_stan_glm)[["x"]] - summary(fit_1_stan)$summary["b", "sd"]
#> [1] 0.000485
In all cases, the coefficients and standard errors from both methods are close.
As an alternative to logistic regression, we build a model from first principles and fit it to the data.
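The idea, following the book's derivation: a putt from distance x goes in if the angle of the shot is within asin((R - r) / x) of perfectly straight, where R is the radius of the hole and r the radius of the ball. If the golfer's angular error is normal with mean 0 and standard deviation σ, the probability of success is 2 * Φ(asin((R - r) / x) / σ) - 1, which is the expression used in the Stan model below. A small helper in R (the function and its name are added here for illustration only):

# Sketch of the geometry-based success probability (not part of the original code)
prob_success <- function(x, sigma, radius_diff) {
  2 * pnorm(asin(radius_diff / x) / sigma) - 1
}
# With sigma near the estimate found below, the prediction at 2 feet is
# roughly 0.95 (observed: 0.93)
prob_success(x = 2, sigma = 0.027, radius_diff = 0.107)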
model_2 <-
"
data {
real RADIUS_DIFF;
int N;
vector [N] x;
int n[N];
int y[N];
}
parameters {
real <lower = 0> sigma;
}
model {
vector [N] p = 2 * Phi(asin(RADIUS_DIFF ./ x) / sigma) - 1;
y ~ binomial(n, p);
}
"
# Hole and ball radii in feet (diameters of 4.25 and 1.68 inches)
RADIUS_HOLE <- (4.25 / 2) / 12
RADIUS_BALL <- (1.68 / 2) / 12
RADIUS_DIFF <- RADIUS_HOLE - RADIUS_BALL
data_2 <- c(list(RADIUS_DIFF = RADIUS_DIFF, N = nrow(golf)), as.list(golf))
Fit nonlinear model using stan().
set.seed(SEED)
fit_2_stan <- stan(model_code = model_2, data = data_2, refresh = 0)
Here is the result:
summary(fit_2_stan, pars = "sigma", probs = c(0.25, 0.5, 0.75))$summary
#> mean se_mean sd 25% 50% 75% n_eff Rhat
#> sigma 0.0267 1.06e-05 0.000403 0.0264 0.0267 0.0269 1438 1
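σ here is the standard deviation of the golfer's angle in radians; converting the posterior mean to degrees gives a sense of the implied precision (illustrative arithmetic):

# Posterior mean of sigma expressed in degrees
0.0267 * 180 / pi  # roughly 1.5 degrees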
Two models fit to the golf putting data.
sigma <-
as_tibble(fit_2_stan) %>%
pull(sigma) %>%
median()
lines <-
line %>%
bind_rows(
tibble(
model = 2,
x = seq_range(c(RADIUS_DIFF, max(golf$x) + 1)),
prob = 2 * pnorm(asin(RADIUS_DIFF / x ) / sigma) - 1
)
)
labels <-
tribble(
~x, ~prob, ~label,
7.5, 0.65, "Logistic regression",
15.5, 0.25, "Geometry-based model"
)
plot +
geom_line(aes(group = model), data = lines) +
geom_text(aes(label = label), data = labels, hjust = 0) +
labs(title = "Two models fit to the golf putting data")
The plot above shows the data and the fitted geometry-based model (here using the posterior median of σ; in this case the posterior is so narrow that any reasonable summary would give essentially the same curve), along with the logistic regression fitted earlier. The custom nonlinear model fits the data much better.
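As a quick check of that claim about the narrow posterior, here is a sketch (reusing objects defined above) comparing the curves implied by the posterior mean and median of σ:

# Curves implied by the posterior mean and median of sigma; with such a
# narrow posterior, the two are essentially indistinguishable
sigma_draws <- as_tibble(fit_2_stan) %>% pull(sigma)
dists <- seq_range(c(RADIUS_DIFF, max(golf$x) + 1))
curve_mean <- 2 * pnorm(asin(RADIUS_DIFF / dists) / mean(sigma_draws)) - 1
curve_median <- 2 * pnorm(asin(RADIUS_DIFF / dists) / median(sigma_draws)) - 1
max(abs(curve_mean - curve_median))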