static.Rmd

# Static branching {#static}

```{r, message = FALSE, warning = FALSE, echo = FALSE}
knitr::opts_knit$set(root.dir = fs::dir_create(tempfile()))
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
options(
  drake_make_menu = FALSE,
  drake_clean_menu = FALSE,
  warnPartialMatchArgs = FALSE,
  crayon.enabled = FALSE,
  readr.show_progress = FALSE,
  tidyverse.quiet = TRUE
)
```

```{r, message = FALSE, warning = FALSE, echo = FALSE}
library(drake)
library(glue)
library(purrr)
library(rlang)
library(tidyverse)
invisible(drake_example("main", overwrite = TRUE))
invisible(file.copy("main/raw_data.xlsx", ".", overwrite = TRUE))
invisible(file.copy("main/report.Rmd", ".", overwrite = TRUE))
tmp <- suppressWarnings(drake_plan(x = 1, y = 2))
```

## Why static branching?

Static branching helps us write large plans compactly. Instead of typing out every single target by hand, we use a special shorthand to declare entire batches of similar targets. To practice static branching in a controlled setting, try the interactive exercises at <https://wlandau.shinyapps.io/learndrakeplans> (from the workshop at <https://github.com/wlandau/learndrake>).

Without static branching, plans like this one become too cumbersome to type by hand.

```{r, eval = FALSE}
# Without static branching:

drake_plan(
  data = get_data(),
  analysis_fast_1_main = main(data, mean = 1, tuning = "fast"),
  analysis_slow_1_main = main(data, mean = 1, tuning = "slow"),
  analysis_fast_2_main = main(data, mean = 2, tuning = "fast"),
  analysis_slow_2_main = main(data, mean = 2, tuning = "slow"),
  analysis_fast_3_main = main(data, mean = 3, tuning = "fast"),
  analysis_slow_3_main = main(data, mean = 3, tuning = "slow"),
  analysis_fast_4_main = main(data, mean = 4, tuning = "fast"),
  analysis_slow_4_main = main(data, mean = 4, tuning = "slow"),
  analysis_fast_1_altv = altv(data, mean = 1, tuning = "fast"),
  analysis_slow_1_altv = altv(data, mean = 1, tuning = "slow"),
  analysis_fast_2_altv = altv(data, mean = 2, tuning = "fast"),
  analysis_slow_2_altv = altv(data, mean = 2, tuning = "slow"),
  analysis_fast_3_altv = altv(data, mean = 3, tuning = "fast"),
  analysis_slow_3_altv = altv(data, mean = 3, tuning = "slow"),
  analysis_fast_4_altv = altv(data, mean = 4, tuning = "fast"),
  analysis_slow_4_altv = altv(data, mean = 4, tuning = "slow"),
  summary_analysis_fast_1_main = summarize_model(analysis_fast_1_main),
  summary_analysis_slow_1_main = summarize_model(analysis_slow_1_main),
  summary_analysis_fast_2_main = summarize_model(analysis_fast_2_main),
  summary_analysis_slow_2_main = summarize_model(analysis_slow_2_main),
  summary_analysis_fast_3_main = summarize_model(analysis_fast_3_main),
  summary_analysis_slow_3_main = summarize_model(analysis_slow_3_main),
  summary_analysis_fast_4_main = summarize_model(analysis_fast_4_main),
  summary_analysis_slow_4_main = summarize_model(analysis_slow_4_main),
  summary_analysis_fast_1_altv = summarize_model(analysis_fast_1_altv),
  summary_analysis_slow_1_altv = summarize_model(analysis_slow_1_altv),
  summary_analysis_fast_2_altv = summarize_model(analysis_fast_2_altv),
  summary_analysis_slow_2_altv = summarize_model(analysis_slow_2_altv),
  summary_analysis_fast_3_altv = summarize_model(analysis_fast_3_altv),
  summary_analysis_slow_3_altv = summarize_model(analysis_slow_3_altv),
  summary_analysis_fast_4_altv = summarize_model(analysis_fast_4_altv),
  summary_analysis_slow_4_altv = summarize_model(analysis_slow_4_altv),
  model_summary_altv = dplyr::bind_rows(
    summary_analysis_fast_1_altv,
    summary_analysis_slow_1_altv,
    summary_analysis_fast_2_altv,
    summary_analysis_slow_2_altv,
    summary_analysis_fast_3_altv,
    summary_analysis_slow_3_altv,
    summary_analysis_fast_4_altv,
    summary_analysis_slow_4_altv
  ),
  model_summary_main = dplyr::bind_rows(
    summary_analysis_fast_1_main,
    summary_analysis_slow_1_main,
    summary_analysis_fast_2_main,
    summary_analysis_slow_2_main,
    summary_analysis_fast_3_main,
    summary_analysis_slow_3_main,
    summary_analysis_fast_4_main,
    summary_analysis_slow_4_main
  )
)
```

Static branching makes it easier to write and understand plans. To activate static branching, use the `transform` argument of `target()`.

```{r}
# With static branching:

model_functions <- rlang::syms(c("main", "altv")) # We need symbols.

model_functions # List of symbols.

plan <- drake_plan(
  data = get_data(),
  analysis = target(
    model_function(data, mean = mean_value, tuning = tuning_setting),
    # Define an analysis target for each combination of
    # tuning_setting, mean_value, and model_function.
    transform = cross( 
      tuning_setting = c("fast", "slow"),
      mean_value = !!(1:4), # Why `!!`? See "Tidy Evaluation" below.
      model_function = !!model_functions # Why `!!`? See "Tidy Evaluation" below.
    )
  ),
  # Define a new summary target for each analysis target defined previously.
  summary = target(
    summarize_model(analysis),
    transform = map(analysis)
  ),
  # Group together the summary targets by the corresponding value
  # of model_function.
  model_summary = target(
    dplyr::bind_rows(summary),
    transform = combine(summary, .by = model_function) 
  )
)

plan
```

*Always* check the graph to make sure the plan makes sense.

```{r}
plot(plan) # a quick and dirty alternative to vis_drake_graph()
```


If the graph is too complicated to look at or too slow to load, downsize the plan with `max_expand`. Then, when you are done debugging and testing, remove `max_expand` to scale back up to the full plan.

```{r}
model_functions <- rlang::syms(c("main", "altv"))

plan <- drake_plan(
  max_expand = 2,
  data = get_data(),
  analysis = target(
    model_function(data, mean = mean_value, tuning = tuning_setting),
    transform = cross(
      tuning_setting = c("fast", "slow"),
      mean_value = !!(1:4), # Why `!!`? See "Tidy Evaluation" below.
      model_function = !!model_functions # Why `!!`? See "Tidy Evaluation" below.
    )
  ),
  summary = target(
    summarize_model(analysis),
    transform = map(analysis)
  ),
  model_summary = target(
    dplyr::bind_rows(summary),
    transform = combine(summary, .by = model_function) # defined in "analysis" 
  )
)

# Click and drag the nodes in the graph to improve the view.
plot(plan)
```

## Grouping variables

A *grouping variable* contains iterated values for a single instance of `map()` or `cross()`. `mean_value` and `tuning_par` are grouping variables below. Notice how they are defined inside `cross()`. Grouping variables are not targets, and they must be declared inside static transformations.

```{r}
drake_plan(
  data = get_data(),
  model = target(
    fit_model(data, mean_value, tuning_par),
    transform = cross(
      mean_value = c(1, 2),
      tuning_par = c("fast", "slow")
    )
  )
)
```

Each model has its own `mean_value` and `tuning_par`. To see this correspondence, set `trace = TRUE`.

```{r}
drake_plan(
  trace = TRUE,
  data = get_data(),
  model = target(
    fit_model(data, mean_value, tuning_par),
    transform = cross(
      mean_value = c(1, 2),
      tuning_par = c("fast", "slow")
    )
  )
)
```

If we summarize those models, each *summary* has its own `mean_value` and `tuning_par`. In other words, grouping variables have a natural nesting, and they propagate forward so we can use them in downstream targets. Notice how `mean_value` and `tuning_par` appear in `summarize_model()` and `combine()` below.

```{r}
plan <- drake_plan(
  trace = TRUE,
  data = get_data(),
  model = target(
    fit_model(data, mean_value, tuning_par),
    transform = cross(
      mean_value = c(1, 2),
      tuning_par = c("fast", "slow")
    )
  ),
  summary = target(
    # mean_value and tuning_par are old grouping variables from the models
    summarize_model(model, mean_value, tuning_par),
    transform = map(model)
  ),
  summary_by_tuning = target(
    dplyr::bind_rows(summary),
    # tuning_par is an old grouping variable from the models.
    transform = combine(summary, .by = tuning_par)
  )
)

plot(plan)
```


### Limitations of grouping variables

Each grouping variable should be defined only once. In the plan below, there are multiple conflicting definitions of `a1`, `a2`, and `a3` in the dependencies of `c1`, so `drake` does not know which definitions to use.

```{r, error = TRUE}
drake_plan(
  b1 = target(1, transform = map(a1 = 1, a2 = 1, .id = FALSE)),
  b2 = target(1, transform = map(a1 = 1, a3 = 1, .id = FALSE)),
  b3 = target(1, transform = map(a2 = 1, a3 = 1, .id = FALSE)),
  c1 = target(1, transform = map(a1, a2, a3, .id = FALSE)),
  trace = TRUE
)
```

Other workarounds include `bind_plans()` (on separate sub-plans) and [dynamic branching](#dynamic). Always check your plans before you run them (`vis_drake_graph()` etc.).

## Tidy evaluation

In earlier plans, we used "bang-bang" operator `!!` from [tidy evaluation](https://tidyeval.tidyverse.org/), e.g. `model_function = !!model_functions` in `cross()`. But why? Why not just type `model_function = model_functions`? Consider the following incorrect plan. 

```{r}
model_functions <- rlang::syms(c("main", "altv"))

plan <- drake_plan(
  data = get_data(),
  analysis = target(
    model_function(data, mean = mean_value, tuning = tuning_setting),
    transform = cross(
      tuning_setting = c("fast", "slow"),
      mean_value = 1:4, # without !!
      model_function = model_functions # without !!
    )
  )
)

drake_plan_source(plan)
```

Because we omit `!!`, we create two problems:

1. The commands use `model_functions()` instead of the desired `main()` and `altv()`.
2. We are missing the targets with `mean = 2` and `mean = 3`.

Why? To make static branching work properly, `drake` does not actually evaluate the arguments to `cross()`. It just uses the raw symbols and expressions. To force `drake` to use the *values* instead, we need `!!`.


```{r}
model_functions <- rlang::syms(c("main", "altv"))

plan <- drake_plan(
  data = get_data(),
  analysis = target(
    model_function(data, mean = mean_value, tuning = tuning_setting),
    transform = cross(
      tuning_setting = c("fast", "slow"),
      mean_value = !!(1:4), # with !!
      model_function = !!model_functions # with !!
    )
  )
)

drake_plan_source(plan)
```

## Static transformations

There are four transformations in static branching: `map()`, `cross()`, `split()`, and `combine()`. They are not actual functions, just special language to supply to the `transform` argument of `target()` in `drake_plan()`. Each transformation is similar to a function from the [Tidyverse](https://www.tidyverse.org/).

| `drake`     | Tidyverse analogue          |
|-------------|-----------------------------|
| `map()`     | `pmap()` from `purrr`       |
| `cross()`   | `crossing()` from `tidyr`   |
| `split()`   | `group_map()`  from `dplyr` |
| `combine()` | `summarize()` from `dplyr`  |

### `map()`

`map()` creates a new target for each row in a grid.

```{r}
drake_plan(
  x = target(
    simulate_data(center, scale),
    transform = map(center = c(2, 1, 0), scale = c(3, 2, 1))
  )
)
```

You can supply the grid directly with the `.data` argument. Note the use of `!!` below. (See the tidy evaluation section.)

```{r}
my_grid <- tibble(
  sim_function = c("rnorm", "rt", "rcauchy"),
  title = c("Normal", "Student t", "Cauchy")
)
my_grid$sim_function <- rlang::syms(my_grid$sim_function)

drake_plan(
  x = target(
    simulate_data(sim_function, title, center, scale),
    transform = map(
      center = c(2, 1, 0),
      scale = c(3, 2, 1),
      .data = !!my_grid,
      # In `.id`, you can select one or more grouping variables
      # for pretty target names.
      # Set to FALSE to use short numeric suffixes.
      .id = sim_function # Try `.id = c(sim_function, center)` yourself.
    )
  )
)
```

### `cross()`

`cross()` creates a new target for each combination of argument values.

```{r}
drake_plan(
  x = target(
    simulate_data(nrow, ncol),
    transform = cross(nrow = c(1, 2, 3), ncol = c(4, 5))
  )
)
```

### `split()`

The `split()` transformation distributes a dataset as uniformly as possible across multiple targets.

```{r, split1}
plan <- drake_plan(
  large_data = get_data(),
  slice_analysis = target(
    large_data %>%
      analyze(),
    transform = split(large_data, slices = 4)
  ),
  results = target(
    dplyr::bind_rows(slice_analysis),
    transform = combine(slice_analysis)
  )
)

plan
```

```{r}
plot(plan)
```

At runtime, `drake_slice()` takes a single subset of the data. It supports data frames, matrices, and arbitrary arrays.

```{r}
drake_slice(mtcars, slices = 32, index = 1)

drake_slice(mtcars, slices = 32, index = 2)
```


### `combine()`

`combine()` aggregates targets. The closest comparison is the unquote-splice operator `!!!` from tidy evaluation.

```{r}
plan <- drake_plan(
  data_group1 = target(
    sim_data(mean = x, sd = y),
    transform = map(x = c(1, 2), y = c(3, 4))
  ),
  data_group2 = target(
    pull_data(url),
    transform = map(url = c("example1.com", "example2.com"))
  ),
  larger = target(
    bind_rows(data_group1, data_group2, .id = "id") %>%
      arrange(sd) %>%
      head(n = 400),
    transform = combine(data_group1, data_group2)
  )
)

drake_plan_source(plan)
```

To create multiple combined groups, use the `.by` argument.

```{r}
plan <- drake_plan(
  data = target(
    sim_data(mean = x, sd = y, skew = z),
    transform = cross(x = c(1, 2), y = c(3, 4), z = c(5, 6))
  ),
  combined = target(
    bind_rows(data, .id = "id") %>%
      arrange(sd) %>%
      head(n = 400),
    transform = combine(data, .by = c(x, y))
  )
)

drake_plan_source(plan)
```

## Target names

`drake` releases after 7.12.0 let you define your own custom names with the optional `.names` argument of transformations.  

```{r}
analysis_names <- c("experimental", "thorough", "minimal", "naive")

plan <- drake_plan(
  dataset = target(
    get_dataset(data_index),
    transform = map(data_index = !!seq_len(2), .names = c("new", "old"))
  ),
  analysis = target(
    apply_method(method_name, dataset),
    transform = cross(
      method_name = c("method1", "method2"),
      dataset,
      .names = !!analysis_names
    )
  ),
  summary = target(
    summarize(analysis),
    transform = combine(analysis, .by = dataset, .names = c("table1", "table2"))
  )
)

plan

plot(plan)
```

The disadvantage of `.names` is you need to know in advance the number of targets a transformation will generate. As an alternative, all transformations have an optional `.id` argument to control the names of targets. Use it to select the grouping variables that go into the names, as well as the order they appear in the suffixes. 

```{r}
drake_plan(
  data = target(
    get_data(param1, param2),
    transform = map(
      param1 = c(123, 456),
      param2 = c(7, 9),
      param2 = c("abc", "xyz"),
      .id = param2
    )
  )
)
```

```{r}
drake_plan(
  data = target(
    get_data(param1, param2),
    transform = map(
      param1 = c(123, 456),
      param2 = c(7, 9),
      param2 = c("abc", "xyz"),
      .id = c(param2, param1)
    )
  )
)
```

```{r}
drake_plan(
  data = target(
    get_data(param1, param2),
    transform = map(
      param1 = c(123, 456),
      param2 = c(7, 9),
      param2 = c("abc", "xyz"),
      .id = c(param1, param2)
    )
  )
)
```

Set `.id` to `FALSE` to ignore the grouping variables altogether.

```{r}
drake_plan(
  data = target(
    get_data(param1, param2),
    transform = map(
      param1 = c(123, 456),
      param2 = c(7, 9),
      param2 = c("abc", "xyz"),
      .id = FALSE
    )
  )
)
```

Finally, `drake` supports a special `.id_chr` symbol in commands to let you refer to the name of the current target as a character string.

```{r}
as_chr <- function(x) {
  deparse(substitute(x))
}
plan <- drake_plan(
  data = target(
    get_data(param),
    transform = map(param = c(123, 456))
  ),
  keras_model = target(
    save_model_hdf5(fit_model(data), file_out(!!sprintf("%s.h5", .id_chr))),
    transform = map(data, .id = param)
  ),
  result = target(
    predict(load_model_hdf5(file_in(!!sprintf("%s.h5", as_chr(keras_model))))),
    transform = map(keras_model, .id = param)
  )
)

plan
```

```{r}
drake_plan_source(plan)
```

## Tags

A tag is a custom grouping variable for a transformation. There are two kinds of tags:

1. In-tags, which contain the target name you start with, and
2. Out-tags, which contain the target names generated by the transformations.

```{r}
drake_plan(
  x = target(
    command,
    transform = map(y = c(1, 2), .tag_in = from, .tag_out = c(to, out))
  ),
  trace = TRUE
)
```

Subsequent transformations can use tags as grouping variables and add to existing tags.

```{r}
plan <- drake_plan(
  prep_work = do_prep_work(),
  local = target(
    get_local_data(n, prep_work),
    transform = map(n = c(1, 2), .tag_in = data_source, .tag_out = data)
  ),
  online = target(
    get_online_data(n, prep_work, port = "8080"),
    transform = map(n = c(1, 2), .tag_in = data_source, .tag_out = data)
  ),
  summary = target(
    summarize(bind_rows(data, .id = "data")),
    transform = combine(data, .by = data_source)
  ),
  munged = target(
    munge(bind_rows(data, .id = "data")),
    transform = combine(data, .by = n)
  )
)

drake_plan_source(plan)

plot(plan)
```