From fe279e66d182752073f15539b8cae17e7beed331 Mon Sep 17 00:00:00 2001 From: wlandau-lilly Date: Wed, 26 Jun 2019 08:54:17 -0400 Subject: [PATCH] Enable history by default --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 3 +- R/api-history.R | 1 - R/api-make.R | 12 ++------ R/api-package.R | 1 + R/exec-memory.R | 12 ++++++++ R/preprocess-config.R | 16 +++------- README.Rmd | 29 ++++++++---------- README.md | 69 +++++++++++++++++++++++++++++-------------- man/drake_config.Rd | 7 ++--- man/make.Rd | 7 ++--- 12 files changed, 89 insertions(+), 71 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 18d089fed..f4244eb69 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -87,6 +87,7 @@ Imports: methods, rlang (>= 0.2.0), storr (>= 1.1.0), + txtq (>= 0.1.3), utils Suggests: abind, @@ -115,7 +116,6 @@ Suggests: testthat (>= 2.1.0), tibble, tidyselect (>= 0.2.4), - txtq (>= 0.1.3), txtplot, usethis, visNetwork, diff --git a/NAMESPACE b/NAMESPACE index dc847ca9c..b93c6f715 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -228,6 +228,7 @@ importFrom(rlang,quo_squash) importFrom(rlang,quos) importFrom(storr,storr_environment) importFrom(storr,storr_rds) +importFrom(txtq,txtq) importFrom(utils,compareVersion) importFrom(utils,flush.console) importFrom(utils,head) diff --git a/NEWS.md b/NEWS.md index cc6e2d0ac..2651fd9ac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,9 +2,10 @@ ## New features +- Track history and provenance of targets, viewable with `drake_history()`. Powered by [`txtq`](https://github.com/wlandau/txtq) (#918, #920). - Export `transform_plan()`. - Add a new `no_deps()` function, similar to `ignore()`. `no_deps()` suppresses dependency detection but still tracks changes to the literal code ([#910](https://github.com/ropensci/drake/issues/910)). -- Add a new "autoclean" memory strategy (#917) +- Add a new "autoclean" memory strategy (#917). ## Enhancements diff --git a/R/api-history.R b/R/api-history.R index 4a7b6205a..c0566d1ca 100644 --- a/R/api-history.R +++ b/R/api-history.R @@ -174,7 +174,6 @@ history_analyze_value <- function(name, value, ht) { } default_history_queue <- function(cache_path) { - assert_pkg("txtq", version = "0.1.2") cache_dir <- dirname(cache_path) history_path <- file.path(cache_dir, ".drake_history") txtq::txtq(history_path) diff --git a/R/api-make.R b/R/api-make.R index 85fbbff8c..02a1290d3 100644 --- a/R/api-make.R +++ b/R/api-make.R @@ -156,18 +156,10 @@ make <- function( template = list(), sleep = function(i) 0.01, hasty_build = NULL, - memory_strategy = c( - "speed", - "autoclean", - "preclean", - "lookahead", - "unload", - "none", - "memory" # deprecated on 2019-06-22 - ), + memory_strategy = "speed", layout = NULL, lock_envir = TRUE, - history = FALSE + history = TRUE ) { log_msg( "begin make()", diff --git a/R/api-package.R b/R/api-package.R index 2d9469e6c..0133db111 100644 --- a/R/api-package.R +++ b/R/api-package.R @@ -49,6 +49,7 @@ #' @importFrom methods new setRefClass #' @importFrom rlang dots_list enquo eval_tidy expr quo_squash quos #' @importFrom storr storr_environment storr_rds +#' @importFrom txtq txtq #' @importFrom utils compareVersion flush.console head menu packageVersion #' read.csv sessionInfo stack type.convert unzip write.table NULL diff --git a/R/exec-memory.R b/R/exec-memory.R index 30e0e9424..7aaf3f086 100644 --- a/R/exec-memory.R +++ b/R/exec-memory.R @@ -1,3 +1,15 @@ +memory_strategies <- function() { + c( + "speed", + "autoclean", + "preclean", + "lookahead", + "unload", + "none", + "memory" # deprecated on 2019-06-22 + ) +} + assign_to_envir <- function(target, value, config) { memory_strategy <- config$layout[[target]]$memory_strategy %||NA% config$memory_strategy diff --git a/R/preprocess-config.R b/R/preprocess-config.R index e4c5d9432..cdf5a3dff 100644 --- a/R/preprocess-config.R +++ b/R/preprocess-config.R @@ -399,7 +399,7 @@ #' of your targets. You can also supply a #' [`txtq`](https://github.com/wlandau/txtq), which is #' how `drake` records history. -#' Required for [drake_history()]. +#' Must be `TRUE` for [drake_history()] to work later. #' #' @examples #' \dontrun{ @@ -464,18 +464,10 @@ drake_config <- function( template = list(), sleep = function(i) 0.01, hasty_build = NULL, - memory_strategy = c( - "speed", - "autoclean", - "preclean", - "lookahead", - "unload", - "none", - "memory" # deprecated on 2019-06-22 - ), + memory_strategy = "speed", layout = NULL, lock_envir = TRUE, - history = FALSE + history = TRUE ) { log_msg( "begin drake_config()", @@ -560,7 +552,7 @@ drake_config <- function( # 2019-01-03 # nolint ) } - memory_strategy <- match.arg(memory_strategy) + memory_strategy <- match.arg(memory_strategy, choices = memory_strategies()) if (memory_strategy == "memory") { memory_strategy <- "preclean" warning( diff --git a/README.Rmd b/README.Rmd index 48890cce1..843f4fd97 100644 --- a/README.Rmd +++ b/README.Rmd @@ -7,6 +7,9 @@ output: ```{r knitrsetup, echo = FALSE} +dir <- tempfile() +dir.create(dir) +knitr::opts_knit$set(root.dir = dir) knitr::opts_chunk$set( collapse = TRUE, comment = "#>", @@ -188,7 +191,7 @@ plan So far, we have just been setting the stage. Use `make()` to do the real work. Targets are built in the correct order regardless of the row order of `plan`. ```{r make1} -make(plan, history = TRUE) # History is new in drake 7.5.0. +make(plan) ``` Except for files like `report.html`, your output is stored in a hidden `.drake/` folder. Reading it back is easy. @@ -229,7 +232,7 @@ vis_drake_graph(config) # Interactive graph: zoom, drag, etc. The next `make()` just builds `hist` and `report.html`. No point in wasting time on the data or model. ```{r justhistetc} -make(plan, history = TRUE) +make(plan) ``` ```{r hist2, eval = FALSE} @@ -267,12 +270,12 @@ make(plan) # Independently re-create the results from the code and input data. ## History and provenance -As of version 7.5.0, `drake` can track the history and provenance of your targets: +As of version 7.5.0, `drake` tracks the history and provenance of your targets: what you built, when you built it, how you built it, the arguments you -used in your function calls, and how to get the data back. +used in your function calls, and how to get the data back. (Disable with `make(history = FALSE)`) ```{r history} -history <- drake_history(analyze = TRUE) # Requires make(history = TRUE) +history <- drake_history(analyze = TRUE) history ``` @@ -280,13 +283,15 @@ Remarks: - The `quiet` column appears above because one of the `drake_plan()` commands has `knit(quiet = TRUE)`. - The `hash` column identifies all the previous the versions of your targets. As long as `exists` is `TRUE`, you can recover old data. +- Advanced: if you use `make(cache_log_file = TRUE)` and put the cache log file under version control, you can match the hashes from `drake_history()` with the `git` commit history of your code. -Let's use the history to recover the old histogram. +Let's use the history to recover the oldest histogram. ```{r, eval = FALSE} hash <- history %>% - filter(target == "hist" & !latest) %>% # Get the old histogram. - pull(hash) + filter(target == "hist") %>% + pull(hash) %>% + head(n = 1) cache <- drake_cache() cache$get_value(hash) #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. @@ -476,11 +481,3 @@ Many thanks to [Julia Lowndes](https://github.com/jules32), [Ben Marwick](https: Credit for images is [attributed here](https://ropensci.github.io/drake/figures/image-credit.md). [![ropensci_footer](https://ropensci.org/public_images/github_footer.png)](https://ropensci.org) - -```{r cleanupfooter, echo = FALSE} -clean(destroy = TRUE) -unlink( - c(".drake_history", "main", "raw_data.xlsx", "report.Rmd"), - recursive = TRUE -) -``` diff --git a/README.md b/README.md index 367e5ec64..8a796b617 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ work. Targets are built in the correct order regardless of the row order of `plan`. ``` r -make(plan, history = TRUE) # History is new in drake 7.5.0. +make(plan) #> target raw_data #> target data #> target fit @@ -260,7 +260,7 @@ The next `make()` just builds `hist` and `report.html`. No point in wasting time on the data or model. ``` r -make(plan, history = TRUE) +make(plan) #> target hist #> target report ``` @@ -330,24 +330,29 @@ make(plan) # Independently re-create the results from the code and input data. ## History and provenance -As of version 7.5.0, `drake` can track the history and provenance of -your targets: what you built, when you built it, how you built it, the +As of version 7.5.0, `drake` tracks the history and provenance of your +targets: what you built, when you built it, how you built it, the arguments you used in your function calls, and how to get the data back. +(Disable with `make(history = FALSE)`) ``` r -history <- drake_history(analyze = TRUE) # Requires make(history = TRUE) +history <- drake_history(analyze = TRUE) history -#> # A tibble: 7 x 9 -#> target time hash exists command runtime latest quiet -#> -#> 1 data 2019-06-25 14:32:05 e580… TRUE raw_da… 0.004 TRUE NA -#> 2 fit 2019-06-25 14:32:05 62a1… TRUE lm(Sep… 0.007 TRUE NA -#> 3 hist 2019-06-25 14:32:05 10bc… TRUE create… 0.008 FALSE NA -#> 4 hist 2019-06-25 14:32:06 5252… TRUE create… 0.00400 TRUE NA -#> 5 raw_d… 2019-06-25 14:32:04 6317… TRUE "readx… 0.012 TRUE NA -#> 6 report 2019-06-25 14:32:06 9946… TRUE "rmark… 1.18 FALSE TRUE -#> 7 report 2019-06-25 14:32:07 9946… TRUE "rmark… 0.489 TRUE TRUE -#> # … with 1 more variable: output_file +#> # A tibble: 12 x 9 +#> target time hash exists command runtime latest quiet output_file +#> +#> 1 data 2019-0… e580… TRUE raw_data… 0.002 FALSE NA +#> 2 data 2019-0… e580… TRUE raw_data… 0 TRUE NA +#> 3 fit 2019-0… 62a1… TRUE lm(Sepal… 0.003 FALSE NA +#> 4 fit 2019-0… 62a1… TRUE lm(Sepal… 0.001000 TRUE NA +#> 5 hist 2019-0… 10bc… TRUE create_p… 0.006 FALSE NA +#> 6 hist 2019-0… 5252… TRUE create_p… 0.004 FALSE NA +#> 7 hist 2019-0… 00fa… TRUE create_p… 0.00600 TRUE NA +#> 8 raw_da… 2019-0… 6317… TRUE "readxl:… 0.01 FALSE NA +#> 9 raw_da… 2019-0… 6317… TRUE "readxl:… 0.007 TRUE NA +#> 10 report 2019-0… 0064… TRUE "rmarkdo… 0.647 FALSE TRUE report.html +#> 11 report 2019-0… 0064… TRUE "rmarkdo… 0.45 FALSE TRUE report.html +#> 12 report 2019-0… 0064… TRUE "rmarkdo… 0.456 TRUE TRUE report.html ``` Remarks: @@ -356,13 +361,17 @@ Remarks: commands has `knit(quiet = TRUE)`. - The `hash` column identifies all the previous the versions of your targets. As long as `exists` is `TRUE`, you can recover old data. + - Advanced: if you use `make(cache_log_file = TRUE)` and put the cache + log file under version control, you can match the hashes from + `drake_history()` with the `git` commit history of your code. -Let’s use the history to recover the old histogram. +Let’s use the history to recover the oldest histogram. ``` r hash <- history %>% - filter(target == "hist" & !latest) %>% # Get the old histogram. - pull(hash) + filter(target == "hist") %>% + pull(hash) %>% + head(n = 1) cache <- drake_cache() cache$get_value(hash) #> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. @@ -679,13 +688,29 @@ here](https://github.com/wlandau/drake-examples/tree/master/main). ### Version control -`drake` is not a version control tool. However, it is fully compatible with [`git`](https://git-scm.com/), [`svn`](https://en.wikipedia.org/wiki/Apache_Subversion), and similar software. In fact, it is good practice to use [`git`](https://git-scm.com/) alongside `drake` for reproducible workflows. +`drake` is not a version control tool. However, it is fully compatible +with [`git`](https://git-scm.com/), +[`svn`](https://en.wikipedia.org/wiki/Apache_Subversion), and similar +software. In fact, it is good practice to use +[`git`](https://git-scm.com/) alongside `drake` for reproducible +workflows. -However, data poses a challenge. The datasets created by `make()` can get large and numerous, and it is not recommended to put the `.drake/` cache or the `.drake_history/` logs under version control. Instead, it is recommended to use a data storage solution such as [DropBox](https://www.dropbox.com/) or [OSF](https://osf.io/ka7jv/wiki/home/). +However, data poses a challenge. The datasets created by `make()` can +get large and numerous, and it is not recommended to put the `.drake/` +cache or the `.drake_history/` logs under version control. Instead, it +is recommended to use a data storage solution such as +[DropBox](https://www.dropbox.com/) or +[OSF](https://osf.io/ka7jv/wiki/home/). ### Containerization and R package environments -`drake` does not track R packages or system dependencies for changes. Instead, it defers to tools like [Docker](https://www.docker.com), [Singularity](https://sylabs.io/singularity/), [`renv`](https://github.com/rstudio/renv), and [`packrat`](https://github.com/rstudio/packrat), which create self-contained portable environments to reproducibly isolate and ship data analysis projects. `drake` is fully compatible with these tools. +`drake` does not track R packages or system dependencies for changes. +Instead, it defers to tools like [Docker](https://www.docker.com), +[Singularity](https://sylabs.io/singularity/), +[`renv`](https://github.com/rstudio/renv), and +[`packrat`](https://github.com/rstudio/packrat), which create +self-contained portable environments to reproducibly isolate and ship +data analysis projects. `drake` is fully compatible with these tools. ### workflowr diff --git a/man/drake_config.Rd b/man/drake_config.Rd index 1110f7eb1..d9e4ba873 100644 --- a/man/drake_config.Rd +++ b/man/drake_config.Rd @@ -20,9 +20,8 @@ drake_config(plan, targets = NULL, envir = parent.frame(), session = NULL, pruning_strategy = NULL, makefile_path = NULL, console_log_file = NULL, ensure_workers = TRUE, garbage_collection = FALSE, template = list(), sleep = function(i) - 0.01, hasty_build = NULL, memory_strategy = c("speed", "autoclean", - "preclean", "lookahead", "unload", "none", "memory"), layout = NULL, - lock_envir = TRUE, history = FALSE) + 0.01, hasty_build = NULL, memory_strategy = "speed", layout = NULL, + lock_envir = TRUE, history = TRUE) } \arguments{ \item{plan}{Workflow plan data frame. @@ -421,7 +420,7 @@ and the reproducibility/credibility/trust you can place in your project. of your targets. You can also supply a \href{https://github.com/wlandau/txtq}{txtq}, which is how \code{drake} records history. -Required for \code{\link[=drake_history]{drake_history()}}.} +Must be \code{TRUE} for \code{\link[=drake_history]{drake_history()}} to work later.} } \value{ The master internal configuration list of a project. diff --git a/man/make.Rd b/man/make.Rd index f09b93931..4be248312 100644 --- a/man/make.Rd +++ b/man/make.Rd @@ -20,9 +20,8 @@ make(plan, targets = NULL, envir = parent.frame(), verbose = 1L, pruning_strategy = NULL, makefile_path = NULL, console_log_file = NULL, ensure_workers = TRUE, garbage_collection = FALSE, template = list(), sleep = function(i) - 0.01, hasty_build = NULL, memory_strategy = c("speed", "autoclean", - "preclean", "lookahead", "unload", "none", "memory"), layout = NULL, - lock_envir = TRUE, history = FALSE) + 0.01, hasty_build = NULL, memory_strategy = "speed", layout = NULL, + lock_envir = TRUE, history = TRUE) } \arguments{ \item{plan}{Workflow plan data frame. @@ -434,7 +433,7 @@ and the reproducibility/credibility/trust you can place in your project. of your targets. You can also supply a \href{https://github.com/wlandau/txtq}{txtq}, which is how \code{drake} records history. -Required for \code{\link[=drake_history]{drake_history()}}.} +Must be \code{TRUE} for \code{\link[=drake_history]{drake_history()}} to work later.} } \value{ nothing