From 65aca429a864223f19d92230efdbbb1adb55bf55 Mon Sep 17 00:00:00 2001 From: Max Kuhn Date: Thu, 28 Jan 2021 15:54:01 -0500 Subject: [PATCH 1/4] changes for #212 --- NAMESPACE | 5 +- NEWS.md | 2 +- R/misc.R | 82 ++++++++++++++++------------ R/rset.R | 6 ++ man/fingerprint.Rd | 38 ------------- man/get_fingerprint.Rd | 44 +++++++++++++++ man/new_rset.Rd | 5 ++ tests/testthat/test-print-groups.txt | 8 +-- tests/testthat/test_fingerprint.R | 23 +------- tests/testthat/test_rset.R | 4 +- 10 files changed, 115 insertions(+), 102 deletions(-) delete mode 100644 man/fingerprint.Rd create mode 100644 man/get_fingerprint.Rd diff --git a/NAMESPACE b/NAMESPACE index 1ca099cb..e8202e04 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,9 @@ S3method("[",rset) S3method("names<-",rset) +S3method(.get_fingerprint,default) +S3method(.get_fingerprint,rset) +S3method(.get_fingerprint,tune_results) S3method(as.data.frame,rsplit) S3method(as.integer,rsplit) S3method(complement,apparent_split) @@ -202,6 +205,7 @@ S3method(vec_restore,sliding_period) S3method(vec_restore,sliding_window) S3method(vec_restore,validation_split) S3method(vec_restore,vfold_cv) +export(.get_fingerprint) export(add_resample_id) export(all_of) export(analysis) @@ -215,7 +219,6 @@ export(contains) export(default_complement) export(ends_with) export(everything) -export(fingerprint) export(form_pred) export(gather) export(gather.rset) diff --git a/NEWS.md b/NEWS.md index 3ec2ae03..26115bac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,7 @@ * Fixed an issue where empty assessment sets couldn't be created by `make_splits()` (#188). -* A `fingerprint()` function was added to create a hash value that can be used to compare `rset` objects. +* `rset` objects now contain a "fingerprint" attribute that can be used to check to see if the same object uses the same resamples. * The `reg_intervals()` function is a convenience function for `lm()`, `glm()`, `survreg()`, and `coxph()` models. diff --git a/R/misc.R b/R/misc.R index ab478236..8a015ac8 100644 --- a/R/misc.R +++ b/R/misc.R @@ -58,53 +58,63 @@ split_unnamed <- function(x, f) { unname(out) } - -## ----------------------------------------------------------------------------- - -#' Create a cryptographical hash value for `rset` objects. +#' Obtain a identifier for the resamples #' -#' This function uses the distinct rows in the data set and the column(s) for the -#' resample identifier and the splits to produce a character string that can be -#' used to determine if another object shares the same splits. -#' -#' The comparison is based on the unique contents of the `id` and `splits` -#' columns. Attributes are not used in the comparison. -#' @param x An `rset` object. +#' This function returns a hash (or NA) for an attribute that is created when +#' the `rset` was initially constructed. This can be used to compare with other +#' resampling objects to see if they are the same. +#' @param x An `rset` or `tune_results` object. #' @param ... Not currently used. -#' @return A character string. +#' @return A character value or `NA_character_` if the object was created prior +#' to `rsample` version 0.1.0. +#' @rdname get_fingerprint +#' @aliases .get_fingerprint #' @examples #' set.seed(1) -#' fingerprint(vfold_cv(mtcars)) +#' .get_fingerprint(vfold_cv(mtcars)) #' #' set.seed(1) -#' fingerprint(vfold_cv(mtcars)) +#' .get_fingerprint(vfold_cv(mtcars)) #' #' set.seed(2) -#' fingerprint(vfold_cv(mtcars)) +#' .get_fingerprint(vfold_cv(mtcars)) #' #' set.seed(1) -#' fingerprint(vfold_cv(mtcars, repeats = 2)) +#' .get_fingerprint(vfold_cv(mtcars, repeats = 2)) #' @export -fingerprint <- function(x, ...) { - # For iterative models, the splits are replicated multiple times. Get the - # unique id values and has those rows - is_id_var <- col_starts_with_id(names(x)) - id_vars <- names(x)[is_id_var] - if (length(id_vars) == 0) { - rlang::abort("No ID columns were found.") - } - if (!any(names(x) == "splits")) { - rlang::abort("The 'split' column was not found.") - } +.get_fingerprint <- function(x, ...) { + UseMethod(".get_fingerprint") +} + +#' @export +#' @rdname get_fingerprint +.get_fingerprint.default <- function(x, ...) { + cls <- paste("'", class(x), "'", sep = ", ") + rlang::abort( + paste("No `.get_fingerprint()` method for this class(es)", cls) + ) +} - x <- - dplyr::select(x, splits, dplyr::all_of(id_vars)) %>% - dplyr::distinct() %>% - dplyr::arrange(!!!id_vars) %>% - tibble::as_tibble() - attrib <- attributes(x) - attrib <- attrib[names(attrib) %in% c("row.names", "names", "class")] - attributes(x) <- attrib - rlang::hash(x) +#' @export +#' @rdname get_fingerprint +.get_fingerprint.rset <- function(x, ...) { + att <- attributes(x) + if (any(names(att) == "fingerprint")) { + res <- att$fingerprint + } else { + res <- NA_character_ + } + res } +#' @export +#' @rdname get_fingerprint +.get_fingerprint.tune_results <- function(x, ...) { + att <- attributes(x)$rset_info$att + if (any(names(att) == "fingerprint")) { + res <- att$fingerprint + } else { + res <- NA_character_ + } + res +} diff --git a/R/rset.R b/R/rset.R index 0f08aab7..19abb474 100644 --- a/R/rset.R +++ b/R/rset.R @@ -6,6 +6,9 @@ #' @param attrib An optional named list of attributes to add to the object. #' @param subclass A character vector of subclasses to add. #' @return An `rset` object. +#' @details Once the new `rset` is constructed, an additional attribute called +#' "fingerprint" is added that is a hash of the `rset`. This can be used to +#' make sure other objects have the exact same resamples. #' @keywords internal #' @export new_rset <- function(splits, ids, attrib = NULL, @@ -71,6 +74,9 @@ new_rset <- function(splits, ids, attrib = NULL, res <- add_class(res, cls = subclass, at_end = FALSE) } + fingerprint <- rlang::hash(res) + attr(res, "fingerprint") <- fingerprint + res } diff --git a/man/fingerprint.Rd b/man/fingerprint.Rd deleted file mode 100644 index 1672a22d..00000000 --- a/man/fingerprint.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/misc.R -\name{fingerprint} -\alias{fingerprint} -\title{Create a cryptographical hash value for \code{rset} objects.} -\usage{ -fingerprint(x, ...) -} -\arguments{ -\item{x}{An \code{rset} object.} - -\item{...}{Not currently used.} -} -\value{ -A character string. -} -\description{ -This function uses the distinct rows in the data set and the column(s) for the -resample identifier and the splits to produce a character string that can be -used to determine if another object shares the same splits. -} -\details{ -The comparison is based on the unique contents of the \code{id} and \code{splits} -columns. Attributes are not used in the comparison. -} -\examples{ -set.seed(1) -fingerprint(vfold_cv(mtcars)) - -set.seed(1) -fingerprint(vfold_cv(mtcars)) - -set.seed(2) -fingerprint(vfold_cv(mtcars)) - -set.seed(1) -fingerprint(vfold_cv(mtcars, repeats = 2)) -} diff --git a/man/get_fingerprint.Rd b/man/get_fingerprint.Rd new file mode 100644 index 00000000..ff37860f --- /dev/null +++ b/man/get_fingerprint.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/misc.R +\name{.get_fingerprint} +\alias{.get_fingerprint} +\alias{.get_fingerprint.default} +\alias{.get_fingerprint.rset} +\alias{.get_fingerprint.tune_results} +\title{Obtain a identifier for the resamples} +\usage{ +.get_fingerprint(x, ...) + +\method{.get_fingerprint}{default}(x, ...) + +\method{.get_fingerprint}{rset}(x, ...) + +\method{.get_fingerprint}{tune_results}(x, ...) +} +\arguments{ +\item{x}{An \code{rset} or \code{tune_results} object.} + +\item{...}{Not currently used.} +} +\value{ +A character value or \code{NA_character_} if the object was created prior +to \code{rsample} version 0.1.0. +} +\description{ +This function returns a hash (or NA) for an attribute that is created when +the \code{rset} was initially constructed. This can be used to compare with other +resampling objects to see if they are the same. +} +\examples{ +set.seed(1) +.get_fingerprint(vfold_cv(mtcars)) + +set.seed(1) +.get_fingerprint(vfold_cv(mtcars)) + +set.seed(2) +.get_fingerprint(vfold_cv(mtcars)) + +set.seed(1) +.get_fingerprint(vfold_cv(mtcars, repeats = 2)) +} diff --git a/man/new_rset.Rd b/man/new_rset.Rd index 2217c675..a416b2d2 100644 --- a/man/new_rset.Rd +++ b/man/new_rset.Rd @@ -23,4 +23,9 @@ An \code{rset} object. \description{ Constructor for new rset objects } +\details{ +Once the new \code{rset} is constructed, an additional attribute called +"fingerprint" is added that is a hash of the \code{rset}. This can be used to +make sure other objects have the exact same resamples. +} \keyword{internal} diff --git a/tests/testthat/test-print-groups.txt b/tests/testthat/test-print-groups.txt index aebcb9c1..50b84983 100644 --- a/tests/testthat/test-print-groups.txt +++ b/tests/testthat/test-print-groups.txt @@ -1,9 +1,9 @@ > print(group_vfold_cv(warpbreaks, "tension"), n = 2) # Group 3-fold cross-validation # A tibble: 3 x 2 - splits id - -1 Resample1 -2 Resample2 + splits id + +1 Resample1 +2 Resample2 # ... with 1 more row diff --git a/tests/testthat/test_fingerprint.R b/tests/testthat/test_fingerprint.R index d4b0fef7..b7d80c1c 100644 --- a/tests/testthat/test_fingerprint.R +++ b/tests/testthat/test_fingerprint.R @@ -2,34 +2,17 @@ test_that("fingerprinting", { set.seed(1) rs_1 <- vfold_cv(mtcars) - fp_1 <- fingerprint(rs_1) + fp_1 <- .get_fingerprint(rs_1) set.seed(1) - fp_2 <- fingerprint(vfold_cv(mtcars)) + fp_2 <- .get_fingerprint(vfold_cv(mtcars)) set.seed(1) - fp_3 <- fingerprint(vfold_cv(mtcars, repeats = 2)) + fp_3 <- .get_fingerprint(vfold_cv(mtcars, repeats = 2)) expect_true(class(fp_1) == "character") expect_true(class(fp_2) == "character") expect_true(class(fp_3) == "character") expect_equal(fp_1, fp_2) expect_false(fp_1 == fp_3) - - expect_error( - fingerprint(vfold_cv(mtcars) %>% dplyr::select(-id)), - "No ID columns were found" - ) - expect_error( - fingerprint(vfold_cv(mtcars) %>% dplyr::select(-splits)), - "The 'split' column was not found" - ) - - # test cases where the rows of the rset are expaned (e.g. in tune_bayes()) - set.seed(1) - rs_2 <- vfold_cv(mtcars) - rs_3 <- rs_2[rep(1:10, 3), ] - fp_4 <- fingerprint(rs_3) - expect_equal(fp_1, fp_4) - }) diff --git a/tests/testthat/test_rset.R b/tests/testthat/test_rset.R index 7bab4727..179ad772 100644 --- a/tests/testthat/test_rset.R +++ b/tests/testthat/test_rset.R @@ -1,4 +1,4 @@ -context("Rset constructor") +context("rset constructor") library(testthat) library(rsample) @@ -37,7 +37,7 @@ test_that('rset with attributes', { attrib = args ) expect_equal(sort(names(attributes(res3))), - c("class", "names", "row.names", "value")) + c("class", "fingerprint", "names", "row.names", "value")) expect_equal(attr(res3, "value"), "potato") }) From bb184be932df090d0393f0f09e65439f879a4b48 Mon Sep 17 00:00:00 2001 From: Max Kuhn Date: Thu, 28 Jan 2021 15:56:51 -0500 Subject: [PATCH 2/4] version bump for better remotes --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4c666862..42529406 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rsample Title: General Resampling Infrastructure -Version: 0.0.8.9000 +Version: 0.0.8.9001 Authors@R: c( person(given = "Max", family = "Kuhn", email = "max@rstudio.com", role = c("aut", "cre")), person(given = "Fanny", family = "Chow", email = "fannybchow@gmail.com", role = c("aut")), From df858116c0bcdc9261b1e1489c78b15b4eb177f8 Mon Sep 17 00:00:00 2001 From: Max Kuhn Date: Thu, 28 Jan 2021 17:00:01 -0500 Subject: [PATCH 3/4] Fix test case with CRAN pillar. See r-lib/pillar#240 --- tests/testthat/test-print-groups.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-print-groups.txt b/tests/testthat/test-print-groups.txt index 50b84983..aebcb9c1 100644 --- a/tests/testthat/test-print-groups.txt +++ b/tests/testthat/test-print-groups.txt @@ -1,9 +1,9 @@ > print(group_vfold_cv(warpbreaks, "tension"), n = 2) # Group 3-fold cross-validation # A tibble: 3 x 2 - splits id - -1 Resample1 -2 Resample2 + splits id + +1 Resample1 +2 Resample2 # ... with 1 more row From 8b20a0962233715158ef8281d60397cb20fcb9b8 Mon Sep 17 00:00:00 2001 From: Max Kuhn Date: Mon, 1 Feb 2021 14:51:04 -0500 Subject: [PATCH 4/4] removed S3 method for tune objects --- NAMESPACE | 1 - R/misc.R | 12 ------------ man/get_fingerprint.Rd | 3 --- 3 files changed, 16 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e8202e04..42b3a661 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,7 +4,6 @@ S3method("[",rset) S3method("names<-",rset) S3method(.get_fingerprint,default) S3method(.get_fingerprint,rset) -S3method(.get_fingerprint,tune_results) S3method(as.data.frame,rsplit) S3method(as.integer,rsplit) S3method(complement,apparent_split) diff --git a/R/misc.R b/R/misc.R index 8a015ac8..699f18ed 100644 --- a/R/misc.R +++ b/R/misc.R @@ -106,15 +106,3 @@ split_unnamed <- function(x, f) { } res } - -#' @export -#' @rdname get_fingerprint -.get_fingerprint.tune_results <- function(x, ...) { - att <- attributes(x)$rset_info$att - if (any(names(att) == "fingerprint")) { - res <- att$fingerprint - } else { - res <- NA_character_ - } - res -} diff --git a/man/get_fingerprint.Rd b/man/get_fingerprint.Rd index ff37860f..14492dd3 100644 --- a/man/get_fingerprint.Rd +++ b/man/get_fingerprint.Rd @@ -4,7 +4,6 @@ \alias{.get_fingerprint} \alias{.get_fingerprint.default} \alias{.get_fingerprint.rset} -\alias{.get_fingerprint.tune_results} \title{Obtain a identifier for the resamples} \usage{ .get_fingerprint(x, ...) @@ -12,8 +11,6 @@ \method{.get_fingerprint}{default}(x, ...) \method{.get_fingerprint}{rset}(x, ...) - -\method{.get_fingerprint}{tune_results}(x, ...) } \arguments{ \item{x}{An \code{rset} or \code{tune_results} object.}