diff --git a/.Rbuildignore b/.Rbuildignore index 2d553de..019764b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,3 +2,5 @@ ^\.Rproj\.user$ \tests ^data-raw$ +^doc$ +^Meta$ diff --git a/.Rprofile b/.Rprofile new file mode 100644 index 0000000..992953d --- /dev/null +++ b/.Rprofile @@ -0,0 +1 @@ +reticulate::use_condaenv("bigsimr") diff --git a/.gitignore b/.gitignore index 7c0a8d1..04db965 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,9 @@ vignettes/*.pdf .httr-oauth # knitr and R markdown default cache directories -/*_cache/ +notebooks/*_cache/ +notebooks/*_files/ +notebooks/*.html /cache/ # Temporary files created by R markdown diff --git a/DESCRIPTION b/DESCRIPTION index 42e242b..032a01e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,25 +1,27 @@ Package: bigsimr -Title: Simulate Multivariate Negative Binomial Data -Version: 0.2.0 +Title: An R Package for Generating High-Dimensional Random Vectors +Version: 0.3.0 Authors@R: person(given = "Alexander", family = "Knudson", role = c("aut", "cre"), - email = "aknudson@nevada.unr.edu", - comment = c(ORCID = "YOUR-ORCID-ID")) + email = "aknudson@nevada.unr.edu") Description: This package simulates data from a negative binomial distribution using a direct copula approach. Depends: R (>= 3.4.0) Imports: + fastrank, foreach, - mvnfast, Matrix, doParallel, parallel, - purrr + purrr, + coop +Remotes: + douglasgscofield/fastrank License: GNU GPLv3 Encoding: UTF-8 LazyData: true -RoxygenNote: 6.1.1 +RoxygenNote: 7.1.0 Suggests: knitr, rmarkdown diff --git a/NAMESPACE b/NAMESPACE index d286f54..5037471 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,7 @@ # Generated by roxygen2: do not edit by hand -export(adjustForDiscrete) -export(adjustInputR) -export(convertKendallPearson) -export(convertPearsonKendall) -export(convertPearsonSpearman) -export(convertSpearmanPearson) +export(computeCorBounds) +export(convertCor) export(rcor) export(rnbinom_params) export(rvec) diff --git a/R/correlation_utils.R b/R/correlation_utils.R new file mode 100644 index 0000000..8bc4b95 --- /dev/null +++ b/R/correlation_utils.R @@ -0,0 +1,245 @@ +#' Convert between different types of correlation matrices +#' +#' @param rho A a square symmetric correlation matrix +#' @param from The type of the input correlation matrix +#' @param to The type of the output correlation matrix +#' @export +convertCor <- function(rho, + from = c("pearson", "spearman", "kendall"), + to = c("pearson", "spearman", "kendall")) { + + key <- paste(from, to) + A <- list( + "pearson pearson" = function(r) {r}, + "pearson spearman" = function(r) {(6/pi)*asin(r/2)}, + "pearson kendall" = function(r) {(2/pi)*asin(r)}, + "spearman pearson" = function(r) {2*sin(r*(pi/6))}, + "spearman spearman" = function(r) {r}, + "spearman kendall" = function(r) {(2/pi)*asin(2*sin(r*(pi/6)))}, + "kendall pearson" = function(r) {sin(r*(pi/2))}, + "kendal spearman" = function(r) {(6/pi)*asin(sin(r*(pi/2))/2)}, + "kendall kendall" = function(r) {r} + ) + + tmp <- as.matrix(Matrix::nearPD(A[[key]](rho))$mat) + rownames(tmp) <- colnames(tmp) <- colnames(rho) + tmp +} + + +#' Adjust the correlation matrix when there are discrete distributions present +#' +#' @param rho The input correlation matrix +#' @param params The parameters of the marginals. +#' @param nSigmas The number of standard deviations from the mean +adjustForDiscrete <- function(rho, params, nSigmas) { + upper_bound <- lapply(params, function(param) { + prob <- param[["prob"]] + size <- param[["size"]] + + mu <- size * (1 - prob) / prob + sigma2 <- size^2 * (1 - prob) / prob + + ceiling(mu + nSigmas * sqrt(sigma2)) + }) + upper_bound <- max(unlist(upper_bound)) + + adj <- lapply(params, function(param) { + if (param[[1]] %in% discrete_dists) { + pmf_x <- do.call(paste0("d", param[[1]]), + c(list(x = 0:upper_bound), param[-1])) + return(sqrt(1 - sum(pmf_x^3))) + } else { + return(1) + } + }) + adj <- unlist(adj) + + idx_mat <- combn(x = length(params), m = 2) + + rho_adj <- apply(idx_mat, 2, function(pair) { + adj_factor <- adj[pair[1]] * adj[pair[2]] + rho_tmp <- rho[pair[1], pair[2]] + min(rho_tmp / adj_factor, 1) + }) + + rho[lower.tri(rho)] <- rho_adj + rho[upper.tri(rho)] <- t(rho)[upper.tri(rho)] + rho +} + + +#' Computes the theoretical upper and lower bounds of possible correlations +#' given a set of marginals +#' +#' @param params The parameters of the marginals. +#' @param cores The number of cores to utilize. +#' @param type The type of correlation matrix that is being passed. +#' @return A list containing the theoretical upper and lower bounds +#' @export +computeCorBounds <- function(params, + cores = 1, + type = c("pearson", "kendall", "spearman"), + reps = 1e5){ + + d <- length(params) + + # Generate random samples for each margin and sort the vectors + # The sorted vectors are used for computing the bounds + if (cores == 1) { + sim_data <- sapply(1:d, function(i) { + sort(do.call(what = paste0("r", unlist(params[[i]][1])), + args = c(list(n = reps), params[[i]][-1]))) + }) + } else { + `%dopar%` <- foreach::`%dopar%` + cl <- parallel::makeCluster(cores, type = "FORK") + doParallel::registerDoParallel(cl) + sim_data <- foreach::foreach(i = 1:d, .combine = "cbind") %dopar% { + sort(do.call(what = paste0("r", unlist(params[[i]][1])), + args = c(list(n = reps), params[[i]][-1]))) + } + } + + unname(sim_data) + index_mat <- combn(x = d, m = 2) + + + if (cores == 1) { + # Upper bounds + rho_upper <- cor(sim_data, method = type) + + # Lower bounds + rho_lower_values <- apply(index_mat, 2, function(index, data, ...){ + cor(data[, index[1]], + rev(data[, index[2]]), + method = type) + }, data = sim_data, method = type) + + rho_lower <- diag(x = 1, nrow = d, ncol = d) + rho_lower[lower.tri(rho_lower)] <- rho_lower_values + rho_lower <- t(rho_lower) + rho_lower[lower.tri(rho_lower)] <- rho_lower_values + + } else { + # Upper bounds + rho_upper_values <- parallel::parApply( + cl = cl, + X = index_mat, + MARGIN = 2, + FUN = function(index, data, ...){ + cor(x = data[, index[1]], + y = data[, index[2]], + method = type) + }, data = sim_data, method = type) + + # Ensure that the result is a valid correlation matrix + rho_upper <- diag(x = 1, nrow = d, ncol = d) + rho_upper[lower.tri(rho_upper)] <- rho_upper_values + rho_upper <- t(rho_upper) + rho_upper[lower.tri(rho_upper)] <- rho_upper_values + + # Lower bounds + rho_lower_values <- parallel::parApply( + cl = cl, + X = index_mat, + MARGIN = 2, + FUN = function(index, data, ...){ + cor(x = data[, index[1]], + y = rev(data[, index[2]]), + method=type) + }, data = sim_data, method = type) + parallel::stopCluster(cl) + + # Ensure that the result is a valid correlation matrix + rho_lower <- diag(x = 1, nrow = d, ncol = d) + rho_lower[lower.tri(rho_lower)] <- rho_lower_values + rho_lower <- t(rho_lower) + rho_lower[lower.tri(rho_lower)] <- rho_lower_values + } + + list(upper = rho_upper, + lower = rho_lower) +} + + +#' Constrains a correlation matrix to the theoretical upper and lower bounds +#' +#' @param rho The input correlation matrix. +#' @param rho_bounds A list containing the theoretical upper and lower bounds +#' @return The constrained correlation matrix +constrainRho <- function(rho, rho_bounds){ + rho_tmp <- pmin(rho_bounds[["upper"]], rho) + pmax(rho_bounds[["lower"]], rho_tmp) +} + + +#' Returns a logical matrix of which correlations are in the feasible region +#' +#' @param rho The input correlation matrix. +#' @param rho_bounds A list containing the theoretical upper and lower bounds +#' @param negate Should the logical values be negated in order to identify +#' values that are outside the feasible region. +which_corInBounds <- function(rho, rho_bounds, negate = FALSE){ + + tooSmall <- rho < rho_bounds[["lower"]] + tooLarge <- rho > rho_bounds[["upper"]] + + outOfBounds <- tooSmall | tooLarge + inBounds <- !outOfBounds + + # Return either the in bounds or out of bounds matrix + if (negate) { + outOfBounds + } else { + inBounds + } +} + + +#' Returns TRUE if all correlations pairs are within the theoretical bounds +#' +#' @param rho The input correlation matrix. +#' @param params The parameters of the marginals. +#' @param cores The number of cores to utilize. +#' @param type The type of correlation matrix that is being passed. +#' @param rho_bounds Pre-computed upper and lower correlation bounds +#' ... Other arguments passed to `computeCoreBounds()` +#' @return Logical. TRUE if all correlations pairs are within the theoretical +#' bounds, and false otherwise. +all_corInBounds <- function(rho, + params, + cores = 1, + type = c("pearson", "spearman", "kendall"), + rho_bounds = NULL, ...){ + + if (is.null(rho_bounds)){ + rho_bounds <- computeCorBounds(params = params, + cores = cores, + type = type, + ...) + } + + outOfBounds <- which_corInBounds(rho, rho_bounds, negate = TRUE) + + if (any(outOfBounds)) { + warning(paste0("At least one of the specified correlations are either ", + "too large or too small. Please use 'which_corInBounds() ", + "to see which correlations are valid.")) + return(FALSE) + } + TRUE +} + + +#' Estimate the Spearman rank correlation +#' +#' For a Nx2 matrix when N=1e5, fast is about 2x faster, and when N=1e6, +#' fast is about 3x faster. +estimateSpearmanRho <- function(x, fast = TRUE){ + if (fast) { + coop::pcor(apply(x, 2, fastrank::fastrank_num_avg)) + } else { + cor(apply(x, 2, rank)) + } +} diff --git a/R/helper_funcs.R b/R/helper_funcs.R deleted file mode 100644 index b4fc715..0000000 --- a/R/helper_funcs.R +++ /dev/null @@ -1,140 +0,0 @@ -#' Converts Pearson correlation to Spearman. Used for normal random variables. -#' -#' @param rho A a square symmetric Pearson correlation matrix. -#' @return A Spearman correlation matrix. -#' @export -convertPearsonSpearman <- function(rho) { - tmp <- (6 / pi) * asin(rho / 2) - tmp <- as.matrix(Matrix::nearPD(tmp)$mat) - rownames(tmp) <- colnames(tmp) <- colnames(rho) - return(tmp) -} - - -#' Converts Spearman correlation to Pearson. Used for normal random variables. -#' -#' @param rho A a square symmetric Spearman correlation matrix. -#' @return A Pearson correlation matrix. -#' @export -convertSpearmanPearson <- function(rho) { - tmp <- 2* sin( rho * (pi / 6)) - tmp <- as.matrix(Matrix::nearPD(tmp)$mat) - rownames(tmp) <- colnames(tmp) <- colnames(rho) - return(tmp) -} - - -#' Converts Pearson correlation to Kendall. Used for normal random variables. -#' -#' @param rho A a square symmetric Pearson correlation matrix. -#' @return A Kendall correlation matrix. -#' @export -convertPearsonKendall <- function(rho) { - tmp <- (2 / pi) * asin(rho) - tmp <- as.matrix(Matrix::nearPD(tmp)$mat) - rownames(tmp) <- colnames(tmp) <- colnames(rho) - return(tmp) -} - - -#' Converts Kendall correlation to Pearson. Used for normal random variables. -#' -#' @param rho A a square symmetric Kendall correlation matrix. -#' @return A Pearson correlation matrix. -#' @export -convertKendallPearson <- function(rho) { - tmp <- sin( rho * (pi / 2) ) - tmp <- as.matrix(Matrix::nearPD(tmp)$mat) - rownames(tmp) <- colnames(tmp) <- colnames(rho) - return(tmp) -} - - -#' Adjust an input correlation matrix for our method to match -#' -#' @param rho A a square symmetric correlation matrix. -#' @param params The parameters of the marginals. -#' @param cores The number of cores to utilize. -#' @export -adjustInputR <- function(rho, params, cores = 1){ - - ## 1. find the pairs - d <- NROW(rho) - index_mat <- combn(x = 1:d, 2) - - ## 2. compute the first order approximation - if (cores == 1) { - input_rho_vector <- apply(index_mat, 2, function(tmp_index){ - ## extract and compute simple quantities - alpha1 <- params["alpha", tmp_index[1]] - alpha2 <- params["alpha", tmp_index[2]] - lambda_nb1 <- params["lambda", tmp_index[1]] - lambda_nb2 <- params["lambda", tmp_index[2]] - sd_nb1 <- sqrt(params["nb_var", tmp_index[1]]) - sd_nb2 <- sqrt(params["nb_var", tmp_index[2]]) - ## precompute scale factor - scale_factor <- sqrt(alpha1) * sqrt(alpha2) - ## extract target nb corr - tmp_nb_rho <- rho[tmp_index[1], tmp_index[2]] - ## convert to rho for gamma (exact) - target_gamma_rho <- tmp_nb_rho * ( (sd_nb1 * sd_nb2) / (scale_factor * lambda_nb1 * lambda_nb2) ) - ## now approximate rho for input corr for MVN - return(min(1, target_gamma_rho)) - }) - } else { - cl <- parallel::makeCluster(cores, type = "FORK") - - ## parallel version of the above code - doParallel::registerDoParallel(cl) - `%dopar%` <- foreach::`%dopar%` - input_rho_vector <- foreach::foreach(i = 1:ncol(index_mat), .combine = 'c') %dopar% { - ## extract and compute simple quantities - tmp_index <- index_mat[,i] - ## extract and compute simple quantities - alpha1 <- params["alpha", tmp_index[1]] - alpha2 <- params["alpha", tmp_index[2]] - lambda_nb1 <- params["lambda", tmp_index[1]] - lambda_nb2 <- params["lambda", tmp_index[2]] - sd_nb1 <- sqrt(params["nb_var", tmp_index[1]]) - sd_nb2 <- sqrt(params["nb_var", tmp_index[2]]) - ## precompute scale factor - scale_factor <- sqrt(alpha1) * sqrt(alpha2) - ## extract target nb corr - tmp_nb_rho <- rho[tmp_index[1], tmp_index[2]] - ## convert to rho for gamma (exact) - target_gamma_rho <- tmp_nb_rho * ((sd_nb1 * sd_nb2) / (scale_factor * lambda_nb1 * lambda_nb2)) - - ## now approximate rho for input corr for MVN - return(min(1, target_gamma_rho)) - } - ## close cluster - parallel::stopCluster(cl) - } - - ## Put input_rho_vector into a matrix - to_return <- diag(d) - to_return[lower.tri(to_return)] <- input_rho_vector - to_return <- to_return + t(to_return) - diag(diag(to_return)) - - ## Ensure positive semi-definiteness - eigen_obj <- eigen(to_return) - if (any(eigen_obj$values < 0)) { - to_return <- as.matrix(Matrix::nearPD(to_return)$mat) - } - - ## Label and return - rownames(to_return) <- colnames(to_return) <- rownames(rho) - return(to_return) -} - - -#' Transforms a [multivariate]normal vector to a different marginal via a -#' uniform intermediate transformation. -#' -#' @param x A normal random vector. -#' @param param A list ccontaining the marginal and its parameters. -normal2marginal <- function(x, param) { - do.call(what = paste0("q", param[[1]]), - args = c(list(p = pnorm(x)), param[-1])) -} - diff --git a/R/normal2margin.R b/R/normal2margin.R new file mode 100644 index 0000000..9d9f84b --- /dev/null +++ b/R/normal2margin.R @@ -0,0 +1,9 @@ +#' Transforms a [multivariate]normal vector to a different marginal via a +#' uniform intermediate transformation. +#' +#' @param x A normal random vector. +#' @param param A list ccontaining the marginal and its parameters. +normal2marginal <- function(x, param) { + do.call(what = paste0("q", param[[1]]), + args = c(list(p = pnorm(x)), param[-1])) +} diff --git a/R/crude_helper_funcs.R b/R/rand_utils.R similarity index 53% rename from R/crude_helper_funcs.R rename to R/rand_utils.R index 79e3c9e..4fa10d1 100644 --- a/R/crude_helper_funcs.R +++ b/R/rand_utils.R @@ -25,8 +25,10 @@ rcor <- function(d, constant_rho = FALSE) { #' Generate negative binomial marginal parameters #' #' @param d A positive integer. -#' @param id_margins A boolean value specifying if the margins should be identical. -#' @return A matrix with columns containing (in order) lambda, alpha, mean, and variance. +#' @param id_margins A boolean value specifying if the margins should be +#' identical. +#' @return A matrix with columns containing (in order) lambda, alpha, mean, +#' and variance. #' @examples #' rnbinom_params(4) #' rnbinom_params(10) @@ -42,42 +44,3 @@ rnbinom_params <- function(d, id_margins = FALSE) { } purrr::transpose(list(rep("nbinom", d), size = sizes, prob = probs)) } - - -#' @param rho The input correlation matrix -#' @param params The parameters of the marginals. -#' @param sigma The number of standard deviations from the mean -#' @export -adjustForDiscrete <- function(rho, params, sigma = 5) { - upper_bound <- lapply(params, function(param) { - prob <- param[["prob"]] - size <- param[["size"]] - - mu <- size * (1 - prob) / prob - sigma2 <- size^2 * (1 - prob) / prob - - ceiling(mu + sigma * sqrt(sigma2)) - }) - upper_bound <- max(unlist(upper_bound)) - - adj <- lapply(params, function(param) { - if (param[[1]] %in% discrete_dists) { - pmf_x <- do.call(paste0("d", param[[1]]), - c(list(x = 0:upper_bound), param[-1])) - return(sqrt(1 - sum(pmf_x^3))) - } else { - return(1) - } - }) - adj <- unlist(adj) - - idx_mat <- combn(x = length(params), m = 2) - - rho_adj <- apply(idx_mat, 2, function(pair) { - adj_factor <- adj[pair[1]] * adj[pair[2]] - rho_tmp <- rho[pair[1], pair[2]] - rho_tmp / adj_factor - }) - rho[lower.tri(rho)] <- rho[upper.tri(rho)] <- rho_adj - rho -} diff --git a/R/rmvn.R b/R/rmvn.R new file mode 100644 index 0000000..525deb5 --- /dev/null +++ b/R/rmvn.R @@ -0,0 +1,22 @@ +.rmvn_jax <- function(seed, mu, Sigma, n) { + + # Any RNG in jax requires a key. If none is supplied by the user, then it + # should be generated at random. + if (is.null(seed)) { + set.seed(seed) + seed <- as.integer(sample(1:1000, 1)) + } + + rmvn <- jax$random$multivariate_normal + + Sigma = numpy$array(Sigma, dtype=numpy$float32) + Sigma = jax$device_put(Sigma) + + mu = numpy$array(mu, dtype=numpy$float32) + mu = jax$device_put(mu) + + key = jax$random$PRNGKey(as.integer(seed)) + x = rmvn(key, mu, Sigma, reticulate::tuple(list(n)))$block_until_ready() + x = jax$device_get(x) + reticulate::py_to_r(x) +} diff --git a/R/rvec.R b/R/rvec.R index 9179c30..79c4905 100644 --- a/R/rvec.R +++ b/R/rvec.R @@ -6,7 +6,12 @@ #' @param params The parameters of the marginals. #' @param cores The number of cores to utilize. #' @param type The type of correlation matrix that is being passed. -#' @return A matrix of random vectors generated from the specified marginals and parameters. +#' @param adjustForDiscrete A boolean for whether to adjust the correlation +#' matrix when in the presence of discrete distributions +#' @param nSigmas The number of standard deviations above the mean to set the +#' upper bound when adjusting for discrete distributions +#' @return A matrix of random vectors generated from the specified marginals +#' and parameters. #' @examples #' d <- 5 #' rho <- rcor(d) @@ -29,45 +34,53 @@ #' rvec(10, rho, margins, cores = 1, type = "pearson") #' rvec(10, rho, margins2, cores = 1, type = "pearson") #' @export -rvec <- function(n, rho, params, cores = 1, +rvec <- function(n, + rho, + params, + cores = 1, type = c("pearson", "kendall", "spearman"), - adjustForDiscrete = TRUE){ + adjustForDiscrete = TRUE, + nSigmas = 10){ + # Handle different types of dependencies - if (type == "spearman") { + if (type == "spearman" && + adjustForDiscrete && + any(my_dists %in% discrete_dists)) { my_dists <- unlist(lapply(params, '[[', 1)) - if (adjustForDiscrete && any(my_dists %in% discrete_dists)) { - rho <- adjustForDiscrete(rho, params, sigma = 5) - } - rho <- convertSpearmanPearson(rho) + rho <- adjustForDiscrete(rho, params, nSigmas) } - if (type == "kendall") { - rho <- convertKendallPearson(rho) - } + # Correlation matrix must be a Pearson correlation + rho <- convertCor(rho, from = type, to = "pearson") - # determine the dimension d d <- NROW(rho) # generate MVN sample - mvn_sim <- mvnfast::rmvn(n = n, - mu = rep(0, d), - sigma = rho, - ncores = cores, - isChol = FALSE) + mvn_sim <- .rmvn_jax(NULL, rep(0, d), rho, as.integer(n)) + # Apply the NORTA algorithm if (cores == 1) { + mv_sim <- sapply(1:d, function(i){ normal2marginal(mvn_sim[,i], params[[i]]) }) + } else { + `%dopar%` <- foreach::`%dopar%` + cl <- parallel::makeCluster(cores, type = "FORK") doParallel::registerDoParallel(cl) + mv_sim <- foreach::foreach(i = 1:d, .combine = 'cbind') %dopar% { normal2marginal(mvn_sim[,i], params[[i]]) } + parallel::stopCluster(cl) + } + colnames(mv_sim) <- rownames(rho) return(mv_sim) + } diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..1e49b36 --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,8 @@ +numpy <- NULL +jax <- NULL + +.onLoad <- function(...) { + # Check that a python environment exists and is available + numpy <<- reticulate::import("numpy", delay_load = TRUE, convert = FALSE) + jax <<- reticulate::import("jax", delay_load = TRUE, convert = FALSE) +} diff --git a/doc/using-rvec.R b/doc/using-rvec.R new file mode 100644 index 0000000..a2618f3 --- /dev/null +++ b/doc/using-rvec.R @@ -0,0 +1,78 @@ +## ---- include = FALSE--------------------------------------------------------- +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) + +reticulate::use_condaenv("py37") +devtools::load_all() + +## ----setup, eval=FALSE-------------------------------------------------------- +# # Optional step if your python binary is in a miniconda environment +# reticulate::use_condaenv("py37") +# library(bigsimr) + +## ----------------------------------------------------------------------------- +# Generating samples from a normal distribution +# We need three things: +# 1) The number of samples to generate +# 2) The location of the normal distribution (mean) +# 3) The scale of the normal distribution (standard deviation) + +rnorm(n = 16, mean = 5, sd = 1.5) + +## ----------------------------------------------------------------------------- +margins = list( + list("norm", mean = 3.14, sd = 0.1), + list("beta", shape1 = 1, shape2 = 4), + list("nbinom", size = 10, prob = 0.75) +) + +## ----------------------------------------------------------------------------- +rho <- matrix(0.5, nrow = 3, ncol = 3) +diag(rho) <- 1.0 +rho + +## ----------------------------------------------------------------------------- +x = rvec(10, rho = rho, params = margins, type = "pearson") + +## ---- echo=FALSE-------------------------------------------------------------- +warning("warning.warn('No GPU/TPU found, falling back to CPU.')") + +## ----------------------------------------------------------------------------- +x + +## ---- fig.width=7------------------------------------------------------------- +x = rvec(10000, rho = rho, params = margins, type = "pearson") + +par(mfrow=c(1,3)) +hist(x[,1], breaks = 30, xlab = "", main = "Normal") +hist(x[,2], breaks = 30, xlab = "", main = "Beta") +hist(x[,3], breaks = 30, xlab = "", main = "Negative Binomial") + +## ----------------------------------------------------------------------------- +cor(x) + +## ---- eval=FALSE-------------------------------------------------------------- +# all_dists <- list( +# list(dist = "beta", shape1, shape2), +# list(dist = "binom", size, prob), +# list(dist = "cauchy", location, scale), +# list(dist = "chisq", df), +# list(dist = "exp", rate), +# list(dist = "f", df1, df2), +# list(dist = "gamma", shape, rate), +# list(dist = "geom", prob), +# list(dist = "hyper", m, n, k), +# list(dist = "logis", location, scale), +# list(dist = "lnorm", meanlog, sdlog), +# list(dist = "nbinom", size, prob), +# list(dist = "norm", mean, sd), +# list(dist = "pois", lambda), +# list(dist = "t", df), +# list(dist = "unif", min, max), +# list(dist = "weibull", shape, scale), +# list(dist = "wilcox", m, n), +# list(dist = "signrank", n) +# ) + diff --git a/doc/using-rvec.Rmd b/doc/using-rvec.Rmd new file mode 100644 index 0000000..a3beb20 --- /dev/null +++ b/doc/using-rvec.Rmd @@ -0,0 +1,131 @@ +--- +title: "Using bigsimr::rvec" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{using-rvec} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) + +reticulate::use_condaenv("py37") +devtools::load_all() +``` + +```{r setup, eval=FALSE} + # Optional step if your python binary is in a miniconda environment +reticulate::use_condaenv("py37") +library(bigsimr) +``` + +# Installation + +This package requires a working python distribution (typically via a conda environment). As of right now, we cannot automatically create the necessary environment for you, but you can do so by installing Anaconda or Miniconda, and then installing `jax` and `jaxlib` ([see here](https://github.com/google/jax#installation)). There is an option for install `jax` with GPU support. This is recommended if possible, otherwise `bigsimr` will simply throw a warning stating that it has fallen back onto the CPU. + +# Basics + +This package operates on the principle of Gaussian copulas, for which you will need a list of target marginal distributions and a correlation structure between the margins. + +The main function is `bigsimr::rvec` which mirrors some of the conventions found in the base `stats` package like `stats::rnorm`. I.e. the common convention for generating a random vector is to first specify the number of samples followed by the parameters for the distribution. + +```{r} +# Generating samples from a normal distribution +# We need three things: +# 1) The number of samples to generate +# 2) The location of the normal distribution (mean) +# 3) The scale of the normal distribution (standard deviation) + +rnorm(n = 16, mean = 5, sd = 1.5) +``` + +The 'r' in front of 'norm' is standard for 'random', as in generate random samples from the normal distribution. Other examples are `rexp` (exponential), `rnbinom` (negative binomial), and `rt` (Student t). + +One thing to familiarize yourself with is the specific parameterization of certain R distributions. `stats::rexp` uses a rate parameter while others may use the scale parameter, $1 / rate$. Our package uses the default parameterizations found in R, so be sure to check that you have the correct parameterization beforehand. + +# Building a Multivariate Distribution + +As stated earlier, to generate multivariate data, we need a list of marginals (and their parameters), and a correlation structure (matrix). The marginal distributions can be built up as a list of lists, where each sublist contains the information for the target distribution. + +```{r} +margins = list( + list("norm", mean = 3.14, sd = 0.1), + list("beta", shape1 = 1, shape2 = 4), + list("nbinom", size = 10, prob = 0.75) +) +``` + +The things to point out here are that in each sublist (marginal), the first item is an unnamed character string with the R name of the distribution *without a letter prefix*. E.g. instead of `rnorm`, we pass in just `"norm"`. The second thing to note is that the remaining items are *named* arguments that go along with the distribution. A full list of built-in distributions is found in the appendix. + +The next step is to define a correlation structure for the multivariate distribution. This correlation matrix can either come from observed data, or we can set it ourselves, or we can generate a random correlation matrix via `bigsimr::rcor`. Let's create a simple correlation matrix where all off-diagonal elements are 0.5. Since we have 3 marginals, we need a $3\times 3$ matrix. + +```{r} +rho <- matrix(0.5, nrow = 3, ncol = 3) +diag(rho) <- 1.0 +rho +``` + +Finally we can generate a random vector with our specified marginals and correlation structure. The last argument, `type`, is looking to know what kind of correlation matrix it is receiving. Right now it can handle Pearson, Spearman, or Kendal. + +```{r} +x = rvec(10, rho = rho, params = margins, type = "pearson") +``` + +On my machine, there is no dedicated GPU, so I would see the following warning message once per session. + +```{r, echo=FALSE} +warning("warning.warn('No GPU/TPU found, falling back to CPU.')") +``` + +Taking a look at our random vector, we see that it hs 10 rows and 3 columns, one column for each marginal. + +```{r} +x +``` + +We can simulate many more samples and then check the histogram of each margin, as well as the estimated correlation between the columns. + +```{r, fig.width=7} +x = rvec(10000, rho = rho, params = margins, type = "pearson") + +par(mfrow=c(1,3)) +hist(x[,1], breaks = 30, xlab = "", main = "Normal") +hist(x[,2], breaks = 30, xlab = "", main = "Beta") +hist(x[,3], breaks = 30, xlab = "", main = "Negative Binomial") +``` + +```{r} +cor(x) +``` + +We can see that even wih 10,000 samples, the estimated correlation of the simulated data is not exactly the same as the target correlation. This can be explained by the fact that some correlations are simply not possible due to the discrete nature of certain distributions. Another possibility is that the copula algorith is biased and needs correction. + +# Appendix + +```{r, eval=FALSE} +all_dists <- list( + list(dist = "beta", shape1, shape2), + list(dist = "binom", size, prob), + list(dist = "cauchy", location, scale), + list(dist = "chisq", df), + list(dist = "exp", rate), + list(dist = "f", df1, df2), + list(dist = "gamma", shape, rate), + list(dist = "geom", prob), + list(dist = "hyper", m, n, k), + list(dist = "logis", location, scale), + list(dist = "lnorm", meanlog, sdlog), + list(dist = "nbinom", size, prob), + list(dist = "norm", mean, sd), + list(dist = "pois", lambda), + list(dist = "t", df), + list(dist = "unif", min, max), + list(dist = "weibull", shape, scale), + list(dist = "wilcox", m, n), + list(dist = "signrank", n) +) +``` diff --git a/doc/using-rvec.html b/doc/using-rvec.html new file mode 100644 index 0000000..25027ab --- /dev/null +++ b/doc/using-rvec.html @@ -0,0 +1,417 @@ + + + + + + + + + + + + + + +Using bigsimr::rvec + + + + + + + + + + + + + + + + + + + + + +

Using bigsimr::rvec

+ + + +
 # Optional step if your python binary is in a miniconda environment
+reticulate::use_condaenv("py37")
+library(bigsimr)
+
+

Installation

+

This package requires a working python distribution (typically via a conda environment). As of right now, we cannot automatically create the necessary environment for you, but you can do so by installing Anaconda or Miniconda, and then installing jax and jaxlib (see here). There is an option for install jax with GPU support. This is recommended if possible, otherwise bigsimr will simply throw a warning stating that it has fallen back onto the CPU.

+
+
+

Basics

+

This package operates on the principle of Gaussian copulas, for which you will need a list of target marginal distributions and a correlation structure between the margins.

+

The main function is bigsimr::rvec which mirrors some of the conventions found in the base stats package like stats::rnorm. I.e. the common convention for generating a random vector is to first specify the number of samples followed by the parameters for the distribution.

+
# Generating samples from a normal distribution
+# We need three things: 
+#   1) The number of samples to generate
+#   2) The location of the normal distribution (mean)
+#   3) The scale of the normal distribution (standard deviation)
+
+rnorm(n = 16, mean = 5, sd = 1.5)
+#>  [1] 4.727757 4.002868 7.133968 6.431139 4.124616 4.310464 6.241137 6.005251
+#>  [9] 4.345459 4.200211 7.250144 6.428607 5.765199 4.909169 4.765040 4.431531
+

The ‘r’ in front of ‘norm’ is standard for ‘random’, as in generate random samples from the normal distribution. Other examples are rexp (exponential), rnbinom (negative binomial), and rt (Student t).

+

One thing to familiarize yourself with is the specific parameterization of certain R distributions. stats::rexp uses a rate parameter while others may use the scale parameter, \(1 / rate\). Our package uses the default parameterizations found in R, so be sure to check that you have the correct parameterization beforehand.

+
+
+

Building a Multivariate Distribution

+

As stated earlier, to generate multivariate data, we need a list of marginals (and their parameters), and a correlation structure (matrix). The marginal distributions can be built up as a list of lists, where each sublist contains the information for the target distribution.

+
margins = list(
+  list("norm", mean = 3.14, sd = 0.1),
+  list("beta", shape1 = 1, shape2 = 4),
+  list("nbinom", size = 10, prob = 0.75)
+)
+

The things to point out here are that in each sublist (marginal), the first item is an unnamed character string with the R name of the distribution without a letter prefix. E.g. instead of rnorm, we pass in just "norm". The second thing to note is that the remaining items are named arguments that go along with the distribution. A full list of built-in distributions is found in the appendix.

+

The next step is to define a correlation structure for the multivariate distribution. This correlation matrix can either come from observed data, or we can set it ourselves, or we can generate a random correlation matrix via bigsimr::rcor. Let’s create a simple correlation matrix where all off-diagonal elements are 0.5. Since we have 3 marginals, we need a \(3\times 3\) matrix.

+
rho <- matrix(0.5, nrow = 3, ncol = 3)
+diag(rho) <- 1.0
+rho
+#>      [,1] [,2] [,3]
+#> [1,]  1.0  0.5  0.5
+#> [2,]  0.5  1.0  0.5
+#> [3,]  0.5  0.5  1.0
+

Finally we can generate a random vector with our specified marginals and correlation structure. The last argument, type, is looking to know what kind of correlation matrix it is receiving. Right now it can handle Pearson, Spearman, or Kendal.

+
x = rvec(10, rho = rho, params = margins, type = "pearson")
+

On my machine, there is no dedicated GPU, so I would see the following warning message once per session.

+
#> Warning: warning.warn('No GPU/TPU found, falling back to CPU.')
+

Taking a look at our random vector, we see that it hs 10 rows and 3 columns, one column for each marginal.

+
x
+#>           [,1]       [,2] [,3]
+#>  [1,] 3.150112 0.04986300    1
+#>  [2,] 3.198523 0.12602385    4
+#>  [3,] 3.092609 0.12625169    0
+#>  [4,] 2.914857 0.02925966    1
+#>  [5,] 3.213697 0.29187930    5
+#>  [6,] 3.056378 0.22101881    2
+#>  [7,] 3.131806 0.31518815    7
+#>  [8,] 3.245636 0.14441806    3
+#>  [9,] 3.358278 0.20572350    3
+#> [10,] 3.142494 0.39363244    3
+

We can simulate many more samples and then check the histogram of each margin, as well as the estimated correlation between the columns.

+
x = rvec(10000, rho = rho, params = margins, type = "pearson")
+
+par(mfrow=c(1,3))
+hist(x[,1], breaks = 30, xlab = "", main = "Normal")
+hist(x[,2], breaks = 30, xlab = "", main = "Beta")
+hist(x[,3], breaks = 30, xlab = "", main = "Negative Binomial")
+

+
cor(x)
+#>           [,1]      [,2]      [,3]
+#> [1,] 1.0000000 0.4847622 0.4880023
+#> [2,] 0.4847622 1.0000000 0.4892170
+#> [3,] 0.4880023 0.4892170 1.0000000
+

We can see that even wih 10,000 samples, the estimated correlation of the simulated data is not exactly the same as the target correlation. This can be explained by the fact that some correlations are simply not possible due to the discrete nature of certain distributions. Another possibility is that the copula algorith is biased and needs correction.

+
+
+

Appendix

+
all_dists <- list(
+  list(dist = "beta", shape1, shape2),
+  list(dist = "binom", size, prob),
+  list(dist = "cauchy", location, scale),
+  list(dist = "chisq", df),
+  list(dist = "exp", rate),
+  list(dist = "f", df1, df2),
+  list(dist = "gamma", shape, rate),
+  list(dist = "geom", prob),
+  list(dist = "hyper", m, n, k),
+  list(dist = "logis", location, scale),
+  list(dist = "lnorm", meanlog, sdlog),
+  list(dist = "nbinom", size, prob),
+  list(dist = "norm", mean, sd),
+  list(dist = "pois", lambda),
+  list(dist = "t", df),
+  list(dist = "unif", min, max),
+  list(dist = "weibull", shape, scale),
+  list(dist = "wilcox", m, n),
+  list(dist = "signrank", n)
+)
+
+ + + + + + + + + + + diff --git a/man/adjustForDiscrete.Rd b/man/adjustForDiscrete.Rd new file mode 100644 index 0000000..17db7d2 --- /dev/null +++ b/man/adjustForDiscrete.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correlation_utils.R +\name{adjustForDiscrete} +\alias{adjustForDiscrete} +\title{Adjust the correlation matrix when there are discrete distributions present} +\usage{ +adjustForDiscrete(rho, params, nSigmas) +} +\arguments{ +\item{rho}{The input correlation matrix} + +\item{params}{The parameters of the marginals.} + +\item{nSigmas}{The number of standard deviations from the mean} +} +\description{ +Adjust the correlation matrix when there are discrete distributions present +} diff --git a/man/adjustInputR.Rd b/man/adjustInputR.Rd deleted file mode 100644 index 0f50da4..0000000 --- a/man/adjustInputR.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper_funcs.R -\name{adjustInputR} -\alias{adjustInputR} -\title{Adjust an input correlation matrix for our method to match} -\usage{ -adjustInputR(rho, params, cores = 1) -} -\arguments{ -\item{rho}{A a square symmetric correlation matrix.} - -\item{params}{The parameters of the marginals.} - -\item{cores}{The number of cores to utilize.} -} -\description{ -Adjust an input correlation matrix for our method to match -} diff --git a/man/all_corInBounds.Rd b/man/all_corInBounds.Rd new file mode 100644 index 0000000..0ae6352 --- /dev/null +++ b/man/all_corInBounds.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correlation_utils.R +\name{all_corInBounds} +\alias{all_corInBounds} +\title{Returns TRUE if all correlations pairs are within the theoretical bounds} +\usage{ +all_corInBounds( + rho, + params, + cores = 1, + type = c("pearson", "spearman", "kendall"), + rho_bounds = NULL, + ... +) +} +\arguments{ +\item{rho}{The input correlation matrix.} + +\item{params}{The parameters of the marginals.} + +\item{cores}{The number of cores to utilize.} + +\item{type}{The type of correlation matrix that is being passed.} + +\item{rho_bounds}{Pre-computed upper and lower correlation bounds +... Other arguments passed to `computeCoreBounds()`} +} +\value{ +Logical. TRUE if all correlations pairs are within the theoretical + bounds, and false otherwise. +} +\description{ +Returns TRUE if all correlations pairs are within the theoretical bounds +} diff --git a/man/computeCorBounds.Rd b/man/computeCorBounds.Rd new file mode 100644 index 0000000..773dd60 --- /dev/null +++ b/man/computeCorBounds.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correlation_utils.R +\name{computeCorBounds} +\alias{computeCorBounds} +\title{Computes the theoretical upper and lower bounds of possible correlations + given a set of marginals} +\usage{ +computeCorBounds( + params, + cores = 1, + type = c("pearson", "kendall", "spearman"), + reps = 1e+05 +) +} +\arguments{ +\item{params}{The parameters of the marginals.} + +\item{cores}{The number of cores to utilize.} + +\item{type}{The type of correlation matrix that is being passed.} +} +\value{ +A list containing the theoretical upper and lower bounds +} +\description{ +Computes the theoretical upper and lower bounds of possible correlations + given a set of marginals +} diff --git a/man/constrainRho.Rd b/man/constrainRho.Rd new file mode 100644 index 0000000..8a3f4dd --- /dev/null +++ b/man/constrainRho.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correlation_utils.R +\name{constrainRho} +\alias{constrainRho} +\title{Constrains a correlation matrix to the theoretical upper and lower bounds} +\usage{ +constrainRho(rho, rho_bounds) +} +\arguments{ +\item{rho}{The input correlation matrix.} + +\item{rho_bounds}{A list containing the theoretical upper and lower bounds} +} +\value{ +The constrained correlation matrix +} +\description{ +Constrains a correlation matrix to the theoretical upper and lower bounds +} diff --git a/man/convertCor.Rd b/man/convertCor.Rd new file mode 100644 index 0000000..3fbcb0a --- /dev/null +++ b/man/convertCor.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correlation_utils.R +\name{convertCor} +\alias{convertCor} +\title{Convert between different types of correlation matrices} +\usage{ +convertCor( + rho, + from = c("pearson", "spearman", "kendall"), + to = c("pearson", "spearman", "kendall") +) +} +\arguments{ +\item{rho}{A a square symmetric correlation matrix} + +\item{from}{The type of the input correlation matrix} + +\item{to}{The type of the output correlation matrix} +} +\description{ +Convert between different types of correlation matrices +} diff --git a/man/convertKendallPearson.Rd b/man/convertKendallPearson.Rd deleted file mode 100644 index bc5e7bd..0000000 --- a/man/convertKendallPearson.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper_funcs.R -\name{convertKendallPearson} -\alias{convertKendallPearson} -\title{Converts Kendall correlation to Pearson. Used for normal random variables.} -\usage{ -convertKendallPearson(rho) -} -\arguments{ -\item{rho}{A a square symmetric Kendall correlation matrix.} -} -\value{ -A Pearson correlation matrix. -} -\description{ -Converts Kendall correlation to Pearson. Used for normal random variables. -} diff --git a/man/convertPearsonKendall.Rd b/man/convertPearsonKendall.Rd deleted file mode 100644 index 128acc3..0000000 --- a/man/convertPearsonKendall.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper_funcs.R -\name{convertPearsonKendall} -\alias{convertPearsonKendall} -\title{Converts Pearson correlation to Kendall. Used for normal random variables.} -\usage{ -convertPearsonKendall(rho) -} -\arguments{ -\item{rho}{A a square symmetric Pearson correlation matrix.} -} -\value{ -A Kendall correlation matrix. -} -\description{ -Converts Pearson correlation to Kendall. Used for normal random variables. -} diff --git a/man/convertPearsonSpearman.Rd b/man/convertPearsonSpearman.Rd deleted file mode 100644 index dc575a7..0000000 --- a/man/convertPearsonSpearman.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper_funcs.R -\name{convertPearsonSpearman} -\alias{convertPearsonSpearman} -\title{Converts Pearson correlation to Spearman. Used for normal random variables.} -\usage{ -convertPearsonSpearman(rho) -} -\arguments{ -\item{rho}{A a square symmetric Pearson correlation matrix.} -} -\value{ -A Spearman correlation matrix. -} -\description{ -Converts Pearson correlation to Spearman. Used for normal random variables. -} diff --git a/man/convertSpearmanPearson.Rd b/man/convertSpearmanPearson.Rd deleted file mode 100644 index 24f96a2..0000000 --- a/man/convertSpearmanPearson.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper_funcs.R -\name{convertSpearmanPearson} -\alias{convertSpearmanPearson} -\title{Converts Spearman correlation to Pearson. Used for normal random variables.} -\usage{ -convertSpearmanPearson(rho) -} -\arguments{ -\item{rho}{A a square symmetric Spearman correlation matrix.} -} -\value{ -A Pearson correlation matrix. -} -\description{ -Converts Spearman correlation to Pearson. Used for normal random variables. -} diff --git a/man/estimateSpearmanRho.Rd b/man/estimateSpearmanRho.Rd new file mode 100644 index 0000000..2cff60e --- /dev/null +++ b/man/estimateSpearmanRho.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correlation_utils.R +\name{estimateSpearmanRho} +\alias{estimateSpearmanRho} +\title{Estimate the Spearman rank correlation} +\usage{ +estimateSpearmanRho(x, fast = TRUE) +} +\description{ +For a Nx2 matrix when N=1e5, fast is about 2x faster, and when N=1e6, + fast is about 3x faster. +} diff --git a/man/normal2marginal.Rd b/man/normal2marginal.Rd index 8ef4361..3be55df 100644 --- a/man/normal2marginal.Rd +++ b/man/normal2marginal.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper_funcs.R +% Please edit documentation in R/normal2margin.R \name{normal2marginal} \alias{normal2marginal} \title{Transforms a [multivariate]normal vector to a different marginal via a diff --git a/man/rcor.Rd b/man/rcor.Rd index 3ff13c7..d07ebaa 100644 --- a/man/rcor.Rd +++ b/man/rcor.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/crude_helper_funcs.R +% Please edit documentation in R/rand_utils.R \name{rcor} \alias{rcor} \title{Generate a random Pearson correlation matrix} diff --git a/man/rnbinom_params.Rd b/man/rnbinom_params.Rd index 96fec02..901ef81 100644 --- a/man/rnbinom_params.Rd +++ b/man/rnbinom_params.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/crude_helper_funcs.R +% Please edit documentation in R/rand_utils.R \name{rnbinom_params} \alias{rnbinom_params} \title{Generate negative binomial marginal parameters} @@ -9,10 +9,12 @@ rnbinom_params(d, id_margins = FALSE) \arguments{ \item{d}{A positive integer.} -\item{id_margins}{A boolean value specifying if the margins should be identical.} +\item{id_margins}{A boolean value specifying if the margins should be +identical.} } \value{ -A matrix with columns containing (in order) lambda, alpha, mean, and variance. +A matrix with columns containing (in order) lambda, alpha, mean, + and variance. } \description{ Generate negative binomial marginal parameters diff --git a/man/rvec.Rd b/man/rvec.Rd index 69bc26e..7003769 100644 --- a/man/rvec.Rd +++ b/man/rvec.Rd @@ -5,8 +5,15 @@ \title{Creates \code{n} observations from a multivariate distribution with the given marginals and correlation matrix.} \usage{ -rvec(n, rho, params, cores = 1, type = c("pearson", "kendall", - "spearman"), adjustForDiscrete = TRUE) +rvec( + n, + rho, + params, + cores = 1, + type = c("pearson", "kendall", "spearman"), + adjustForDiscrete = TRUE, + nSigmas = 10 +) } \arguments{ \item{n}{The number random vectors to generate.} @@ -18,9 +25,16 @@ rvec(n, rho, params, cores = 1, type = c("pearson", "kendall", \item{cores}{The number of cores to utilize.} \item{type}{The type of correlation matrix that is being passed.} + +\item{adjustForDiscrete}{A boolean for whether to adjust the correlation +matrix when in the presence of discrete distributions} + +\item{nSigmas}{The number of standard deviations above the mean to set the +upper bound when adjusting for discrete distributions} } \value{ -A matrix of random vectors generated from the specified marginals and parameters. +A matrix of random vectors generated from the specified marginals + and parameters. } \description{ Creates \code{n} observations from a multivariate distribution with the diff --git a/man/which_corInBounds.Rd b/man/which_corInBounds.Rd new file mode 100644 index 0000000..e093cab --- /dev/null +++ b/man/which_corInBounds.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correlation_utils.R +\name{which_corInBounds} +\alias{which_corInBounds} +\title{Returns a logical matrix of which correlations are in the feasible region} +\usage{ +which_corInBounds(rho, rho_bounds, negate = FALSE) +} +\arguments{ +\item{rho}{The input correlation matrix.} + +\item{rho_bounds}{A list containing the theoretical upper and lower bounds} + +\item{negate}{Should the logical values be negated in order to identify +values that are outside the feasible region.} +} +\description{ +Returns a logical matrix of which correlations are in the feasible region +} diff --git a/notebooks/01-test-discrete-rescale.Rmd b/notebooks/01-test-discrete-rescale.Rmd new file mode 100644 index 0000000..f9504b6 --- /dev/null +++ b/notebooks/01-test-discrete-rescale.Rmd @@ -0,0 +1,29 @@ +--- +title: "Test Discrete Rescale" +author: "Alex Knudson" +date: "4/1/2020" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) + +devtools::load_all() +``` + +## Problem + +```{r} +r <- 0.5 +d <- 2 +rho <- matrix(r, d, d) +diag(rho) <- 1.0 + +size <- 12 +prob <- 0.75 +margins <- rep(list(list("nbinom", size = size, prob = prob)), d) + +rho +adjustForDiscrete(rho, margins, nSigmas = 6) +``` + diff --git a/notebooks/02-profile-mvn.Rmd b/notebooks/02-profile-mvn.Rmd new file mode 100644 index 0000000..21f2e81 --- /dev/null +++ b/notebooks/02-profile-mvn.Rmd @@ -0,0 +1,95 @@ +--- +title: "Profile Multivariate Normal" +author: "Alex Knudson" +date: "4/7/2020" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, + comment = NA, + cache=TRUE) + +library(tictoc) +library(mvnfast) + +library(reticulate) +use_python("~/miniconda3/envs/pygantic/bin/python") +``` + + +```{python, echo=FALSE} +import time +import numpy as onp +import jax.numpy as np +from jax import random +from jax import device_put, device_get + +d = 5000 +n = 2000 +mu = onp.zeros((d, ), dtype=onp.float32) +rho = 0.5 +Rho = onp.full((d, d), rho, dtype=onp.float32) +Rho[onp.diag_indices(d)] = 1.0 + +key = random.PRNGKey(0) +mu_dev = device_put(mu) +Rho_dev = device_put(Rho) +``` + + +# Basic Wall Clock Benchmark + +```{r} +d <- 5000L +n <- 2000L +rho <- 0.5 +Rho <- matrix(rho, d, d) +diag(Rho) <- 1.0 +cores <- 10L +``` + + +## `mvnfast` + +```{r} +tic() +mvn_sim <- rmvn(n = n, + mu = rep(0, d), + sigma = Rho, + ncores = cores, + isChol = FALSE) +toc() +``` + +## `numpy.random` + +```{python} +t = time.time() +x = onp.random.multivariate_normal(mean=mu, cov=Rho, size=(n,)) +elapsed = time.time() - t +print(f"{round(elapsed, 3)} sec elapsed") +``` + +```{python} +x.shape +``` + + +## `jax.random` + +```{python} +t = time.time() +x_gpu = random.multivariate_normal(key, + mean=mu_dev, + cov=Rho_dev, + shape=(n,)).block_until_ready() +elapsed = time.time() - t +print(f"{round(elapsed, 3)} sec elapsed") +``` + +```{python} +x_cpu = device_get(x_gpu) +x_cpu.shape +``` + diff --git a/notebooks/03-profile-rank.Rmd b/notebooks/03-profile-rank.Rmd new file mode 100644 index 0000000..0e5e7f5 --- /dev/null +++ b/notebooks/03-profile-rank.Rmd @@ -0,0 +1,196 @@ +--- +title: "Profile Rank" +author: "Alex Knudson" +date: "5/29/2020" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE, + comment = NA, + cache = TRUE, + warning = FALSE, + message = FALSE) + +library(ggplot2) + +library(microbenchmark) +library(fastrank) +library(data.table) + +library(reticulate) +use_python("/home/alex/miniconda3/envs/bigsimr/bin/python") +``` + + +# The functions + +```{r, echo=TRUE} +rank_dc <- fastrank::fastrank_num_avg +rank_dt <- function(x) data.table::frankv(x, ties.method = "average") +rank_new <- function(x) .Internal(rank(x, length(x), "average")) + +scipy <- import("scipy") +rank_py <- function(x) scipy$stats$rankdata(x, method='average') +``` + +# The Benchmarks + +**Small Vector** + +```{r} +n <- 1e2 +xx <- as.numeric(sample(n, n, replace=TRUE)) +yy <- rnorm(n) + +mbx <- microbenchmark(rank(xx), + rank_new(xx), + rank_dc(xx), + rank_dt(xx), + rank_py(xx), + times = 10000) + +mby <- microbenchmark(rank(yy), + rank_new(yy), + rank_dc(yy), + rank_dt(yy), + rank_py(yy), + times = 10000) +``` + + +```{r} +autoplot(mbx) + + labs(title = "Integer Samples", subtitle = paste("N =", n)) + +autoplot(mby) + + labs(title = "Normal Samples", subtitle = paste("N =", n)) +``` + + + + +**Medium Vector** + +```{r} +n <- 1e3 +xx <- as.numeric(sample(n, n, replace=TRUE)) +yy <- rnorm(n) + +mbx <- microbenchmark(rank(xx), + rank_new(xx), + rank_dc(xx), + rank_dt(xx), + rank_py(xx), + times = 10000) + +mby <- microbenchmark(rank(yy), + rank_new(yy), + rank_dc(yy), + rank_dt(yy), + rank_py(yy), + times = 10000) +``` + + +```{r} +autoplot(mbx) + + labs(title = "Integer Samples", subtitle = paste("N =", n)) + +autoplot(mby) + + labs(title = "Normal Samples", subtitle = paste("N =", n)) +``` + +**Big Vector** + +```{r} +n <- 1e4 +xx <- as.numeric(sample(n, n, replace=TRUE)) +yy <- rnorm(n) + +mbx <- microbenchmark(rank(xx), + rank_new(xx), + rank_dc(xx), + rank_dt(xx), + rank_py(xx), + times = 10000) + +mby <- microbenchmark(rank(yy), + rank_new(yy), + rank_dc(yy), + rank_dt(yy), + rank_py(yy), + times = 10000) +``` + + +```{r} +autoplot(mbx) + + labs(title = "Integer Samples", subtitle = paste("N =", n)) + +autoplot(mby) + + labs(title = "Normal Samples", subtitle = paste("N =", n)) +``` + + +**Huge Vector** + +```{r} +n <- 1e5 +xx <- as.numeric(sample(n, n, replace=TRUE)) +yy <- rnorm(n) + +mbx <- microbenchmark(rank(xx), + rank_new(xx), + rank_dc(xx), + rank_dt(xx), + rank_py(xx), + times = 100) + +mby <- microbenchmark(rank(yy), + rank_new(yy), + rank_dc(yy), + rank_dt(yy), + rank_py(yy), + times = 100) +``` + + +```{r} +autoplot(mbx) + + labs(title = "Integer Samples", subtitle = paste("N =", n)) + +autoplot(mby) + + labs(title = "Normal Samples", subtitle = paste("N =", n)) +``` + +**Ludicrous Vector** + +```{r} +n <- 1e6 +xx <- as.numeric(sample(n, n, replace=TRUE)) +yy <- rnorm(n) + +mbx <- microbenchmark(rank(xx), + rank_new(xx), + rank_dc(xx), + rank_dt(xx), + rank_py(xx), + times = 10) + +mby <- microbenchmark(rank(yy), + rank_new(yy), + rank_dc(yy), + rank_dt(yy), + rank_py(yy), + times = 10) +``` + + +```{r} +autoplot(mbx) + + labs(title = "Integer Samples", subtitle = paste("N =", n)) + +autoplot(mby) + + labs(title = "Normal Samples", subtitle = paste("N =", n)) +``` diff --git a/tests/benchmark_simInvNB.R b/tests/benchmark_simInvNB.R index 62aa0c5..f824e23 100644 --- a/tests/benchmark_simInvNB.R +++ b/tests/benchmark_simInvNB.R @@ -1,102 +1,104 @@ -source("R/crude_helper_funcs.R") -source("R/helper_funcs.R") -source("R/simInvNB.R") +stop("This function is now deprecated and will be removed in the next release. A new benchmarking script must be written.") -simInvNB_old <- function(n, R, params, cores = 1, type = c("pearson", "kendall", "spearman")){ - - ## Handle different types of dependencies - if (type == "spearman") { - R <- convertSpearmanPearson(R) - } - if (type == "kendall") { - R <- convertKendallPearson(R) - } - - ## determine the dimension d - d <- NROW(R) - - ## 1. generate MVN sample - mu <- rep(0, d) - mvn_sim <- mvnfast::rmvn(n = n, mu = mu, sigma = R, ncores = cores, isChol = FALSE) - ## cor(mvn_sim) - - ## single threaded - if ( cores == 1 ) { - ## 2. Apply normal cdf to obtain copula - unif_sim <- apply(mvn_sim, 2, pnorm) - remove(mvn_sim) - ## cor(unif_sim) - - ## 3. now DIRECTLY TO negative binomial inverse transform to gamma - nb_sim <- sapply(1:d, function(i){ - my_nb_prob <- (1 + params["lambda",i])^(-1) - qnbinom(p = unif_sim[,i], size = params["alpha",i] , prob = my_nb_prob) - }) - remove(unif_sim) - ## cor(nb_sim); R - - } else { - ## mulit-core --- parallelize - ## 1. Start cluster - cl <- parallel::makeCluster(cores, type = "FORK") - - ## 2. Apply normal cdf to obtain copula - unif_sim <- parallel::parApply(cl = cl, X = mvn_sim, 2, pnorm) - remove(mvn_sim) - - ## 3. now DIRECTLY TO transform to negative binomial - doParallel::registerDoParallel(cl) - nb_sim <- foreach::foreach(i = 1:d, .combine = 'cbind') %dopar% { - my_nb_prob <- (1 + params["lambda",i])^(-1) - qnbinom(p = unif_sim[,i], size = params["alpha",i] , prob = my_nb_prob) - } - remove(unif_sim) - - ## 4. close cluster - parallel::stopCluster(cl) - } - - ## 5. Return the simulated data set - colnames(nb_sim) <- rownames(R) - return(nb_sim) -} - -library(ggplot2) -library(microbenchmark) -library(foreach) - -#= D=10, N=1e5, Cores=24 -d <- 10 -set.seed(12) -rho <- rcorr(d = d) -params <- rnegbin_params(d = d) -n <- 1e5 -cores <- 10 - -mbm <- microbenchmark( - "0simInvNB" = simInvNB_old(n, rho, params, 1, "pearson"), - "1simInvNB_new" = simInvNB(n, rho, params, 1, "pearson"), - "2simInvNB_24core" = simInvNB_old(n, rho, params, cores, "pearson"), - "3simInvNB_new_24core" = simInvNB(n, rho, params, cores, "pearson") -) - -(p <- autoplot(mbm) + labs(title = "dims=10, n=1e5, cores=1, 24")) -ggsave("benchmark_simInvNB_d10n1e5c24.png", p, device = "png") - - -#= D=100, N=1e6, Cores=24 -d <- 100 -set.seed(12) -rho <- rcorr(d = d) -params <- rnegbin_params(d = d) -n <- 1e6 -cores <- 24 - -mbm <- microbenchmark( - "0simInvNB" = simInvNB_old(n, rho, params, 1, "pearson"), - "1simInvNB_new" = simInvNB(n, rho, params, 1, "pearson"), - "2simInvNB_24core" = simInvNB_old(n, rho, params, cores, "pearson"), - "3simInvNB_new_24core" = simInvNB(n, rho, params, cores, "pearson") -) -(p <- autoplot(mbm) + labs(title = "dims=100, n=1e6, cores=1, 24")) -ggsave("benchmark_simInvNB_d100n1e6c24.png", p, device = "png") \ No newline at end of file +# source("R/crude_helper_funcs.R") +# source("R/helper_funcs.R") +# source("R/simInvNB.R") +# +# simInvNB_old <- function(n, R, params, cores = 1, type = c("pearson", "kendall", "spearman")){ +# +# ## Handle different types of dependencies +# if (type == "spearman") { +# R <- convertSpearmanPearson(R) +# } +# if (type == "kendall") { +# R <- convertKendallPearson(R) +# } +# +# ## determine the dimension d +# d <- NROW(R) +# +# ## 1. generate MVN sample +# mu <- rep(0, d) +# mvn_sim <- mvnfast::rmvn(n = n, mu = mu, sigma = R, ncores = cores, isChol = FALSE) +# ## cor(mvn_sim) +# +# ## single threaded +# if ( cores == 1 ) { +# ## 2. Apply normal cdf to obtain copula +# unif_sim <- apply(mvn_sim, 2, pnorm) +# remove(mvn_sim) +# ## cor(unif_sim) +# +# ## 3. now DIRECTLY TO negative binomial inverse transform to gamma +# nb_sim <- sapply(1:d, function(i){ +# my_nb_prob <- (1 + params["lambda",i])^(-1) +# qnbinom(p = unif_sim[,i], size = params["alpha",i] , prob = my_nb_prob) +# }) +# remove(unif_sim) +# ## cor(nb_sim); R +# +# } else { +# ## mulit-core --- parallelize +# ## 1. Start cluster +# cl <- parallel::makeCluster(cores, type = "FORK") +# +# ## 2. Apply normal cdf to obtain copula +# unif_sim <- parallel::parApply(cl = cl, X = mvn_sim, 2, pnorm) +# remove(mvn_sim) +# +# ## 3. now DIRECTLY TO transform to negative binomial +# doParallel::registerDoParallel(cl) +# nb_sim <- foreach::foreach(i = 1:d, .combine = 'cbind') %dopar% { +# my_nb_prob <- (1 + params["lambda",i])^(-1) +# qnbinom(p = unif_sim[,i], size = params["alpha",i] , prob = my_nb_prob) +# } +# remove(unif_sim) +# +# ## 4. close cluster +# parallel::stopCluster(cl) +# } +# +# ## 5. Return the simulated data set +# colnames(nb_sim) <- rownames(R) +# return(nb_sim) +# } +# +# library(ggplot2) +# library(microbenchmark) +# library(foreach) +# +# #= D=10, N=1e5, Cores=24 +# d <- 10 +# set.seed(12) +# rho <- rcorr(d = d) +# params <- rnegbin_params(d = d) +# n <- 1e5 +# cores <- 10 +# +# mbm <- microbenchmark( +# "0simInvNB" = simInvNB_old(n, rho, params, 1, "pearson"), +# "1simInvNB_new" = simInvNB(n, rho, params, 1, "pearson"), +# "2simInvNB_24core" = simInvNB_old(n, rho, params, cores, "pearson"), +# "3simInvNB_new_24core" = simInvNB(n, rho, params, cores, "pearson") +# ) +# +# (p <- autoplot(mbm) + labs(title = "dims=10, n=1e5, cores=1, 24")) +# ggsave("benchmark_simInvNB_d10n1e5c24.png", p, device = "png") +# +# +# #= D=100, N=1e6, Cores=24 +# d <- 100 +# set.seed(12) +# rho <- rcorr(d = d) +# params <- rnegbin_params(d = d) +# n <- 1e6 +# cores <- 24 +# +# mbm <- microbenchmark( +# "0simInvNB" = simInvNB_old(n, rho, params, 1, "pearson"), +# "1simInvNB_new" = simInvNB(n, rho, params, 1, "pearson"), +# "2simInvNB_24core" = simInvNB_old(n, rho, params, cores, "pearson"), +# "3simInvNB_new_24core" = simInvNB(n, rho, params, cores, "pearson") +# ) +# (p <- autoplot(mbm) + labs(title = "dims=100, n=1e6, cores=1, 24")) +# ggsave("benchmark_simInvNB_d100n1e6c24.png", p, device = "png") diff --git a/tests/test_adjust_discrete.R b/tests/test_adjust_discrete.R deleted file mode 100644 index 1a4052f..0000000 --- a/tests/test_adjust_discrete.R +++ /dev/null @@ -1,11 +0,0 @@ -devtools::load_all() - -#= D=10, N=1e5, Cores=24 -d <- 10 -set.seed(12) -rho <- rcor(d = d) -params <- rnbinom_params(d, shape = 100, id_margins = FALSE) -n <- 10 -cores <- 1 - - diff --git a/tests/test_cor_adjustments.R b/tests/test_cor_adjustments.R new file mode 100644 index 0000000..8b41718 --- /dev/null +++ b/tests/test_cor_adjustments.R @@ -0,0 +1,44 @@ +devtools::load_all() + +d <- 5 +rho <- rcor(d) +margins <- list( + list("nbinom", size = 5, prob = 0.3), + list("exp", rate = 4), + list("binom", size = 5, prob = 0.7), + list("norm", mean = 10, sd = 3), + list("pois", lambda = 10) +) + +margins2 <- list( + list("nbinom", 5, 0.3), + list("exp", 4), + list("binom", 5, 0.7), + list("norm", 10, 3), + list("pois", 10) +) + + +cores = parallel::detectCores() - 1 +reps = 1e3 +type = "spearman" + +rho_bounds <- computeCorBounds(margins, + cores = cores, + type = type, + reps = reps) + +pm1_rho <- matrix(sample(c(-1, 1), d*d, TRUE), d, d) +diag(pm1_rho) <- 1 + +constrainRho(rho, rho_bounds) +constrainRho(pm1_rho, rho_bounds) + +all.corInBounds(rho, margins, cores, type, rho_bounds) +all.corInBounds(pm1_rho, margins, cores, type, rho_bounds) + +# Get coordinate of offending correlations +cols <- (which(which.corInBounds(pm1_rho, rho_bounds, TRUE)) - 1) %/% 5 + 1 +rows <- (which(which.corInBounds(pm1_rho, rho_bounds, TRUE)) - 1) %% 5 + 1 +coords <- cbind(rows, cols) + diff --git a/vignettes/using-rvec.Rmd b/vignettes/using-rvec.Rmd index 2e80214..a3beb20 100644 --- a/vignettes/using-rvec.Rmd +++ b/vignettes/using-rvec.Rmd @@ -1,5 +1,5 @@ --- -title: "using-rvec" +title: "Using bigsimr::rvec" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{using-rvec} @@ -12,9 +12,120 @@ knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) + +reticulate::use_condaenv("py37") +devtools::load_all() ``` -```{r setup} +```{r setup, eval=FALSE} + # Optional step if your python binary is in a miniconda environment +reticulate::use_condaenv("py37") library(bigsimr) ``` +# Installation + +This package requires a working python distribution (typically via a conda environment). As of right now, we cannot automatically create the necessary environment for you, but you can do so by installing Anaconda or Miniconda, and then installing `jax` and `jaxlib` ([see here](https://github.com/google/jax#installation)). There is an option for install `jax` with GPU support. This is recommended if possible, otherwise `bigsimr` will simply throw a warning stating that it has fallen back onto the CPU. + +# Basics + +This package operates on the principle of Gaussian copulas, for which you will need a list of target marginal distributions and a correlation structure between the margins. + +The main function is `bigsimr::rvec` which mirrors some of the conventions found in the base `stats` package like `stats::rnorm`. I.e. the common convention for generating a random vector is to first specify the number of samples followed by the parameters for the distribution. + +```{r} +# Generating samples from a normal distribution +# We need three things: +# 1) The number of samples to generate +# 2) The location of the normal distribution (mean) +# 3) The scale of the normal distribution (standard deviation) + +rnorm(n = 16, mean = 5, sd = 1.5) +``` + +The 'r' in front of 'norm' is standard for 'random', as in generate random samples from the normal distribution. Other examples are `rexp` (exponential), `rnbinom` (negative binomial), and `rt` (Student t). + +One thing to familiarize yourself with is the specific parameterization of certain R distributions. `stats::rexp` uses a rate parameter while others may use the scale parameter, $1 / rate$. Our package uses the default parameterizations found in R, so be sure to check that you have the correct parameterization beforehand. + +# Building a Multivariate Distribution + +As stated earlier, to generate multivariate data, we need a list of marginals (and their parameters), and a correlation structure (matrix). The marginal distributions can be built up as a list of lists, where each sublist contains the information for the target distribution. + +```{r} +margins = list( + list("norm", mean = 3.14, sd = 0.1), + list("beta", shape1 = 1, shape2 = 4), + list("nbinom", size = 10, prob = 0.75) +) +``` + +The things to point out here are that in each sublist (marginal), the first item is an unnamed character string with the R name of the distribution *without a letter prefix*. E.g. instead of `rnorm`, we pass in just `"norm"`. The second thing to note is that the remaining items are *named* arguments that go along with the distribution. A full list of built-in distributions is found in the appendix. + +The next step is to define a correlation structure for the multivariate distribution. This correlation matrix can either come from observed data, or we can set it ourselves, or we can generate a random correlation matrix via `bigsimr::rcor`. Let's create a simple correlation matrix where all off-diagonal elements are 0.5. Since we have 3 marginals, we need a $3\times 3$ matrix. + +```{r} +rho <- matrix(0.5, nrow = 3, ncol = 3) +diag(rho) <- 1.0 +rho +``` + +Finally we can generate a random vector with our specified marginals and correlation structure. The last argument, `type`, is looking to know what kind of correlation matrix it is receiving. Right now it can handle Pearson, Spearman, or Kendal. + +```{r} +x = rvec(10, rho = rho, params = margins, type = "pearson") +``` + +On my machine, there is no dedicated GPU, so I would see the following warning message once per session. + +```{r, echo=FALSE} +warning("warning.warn('No GPU/TPU found, falling back to CPU.')") +``` + +Taking a look at our random vector, we see that it hs 10 rows and 3 columns, one column for each marginal. + +```{r} +x +``` + +We can simulate many more samples and then check the histogram of each margin, as well as the estimated correlation between the columns. + +```{r, fig.width=7} +x = rvec(10000, rho = rho, params = margins, type = "pearson") + +par(mfrow=c(1,3)) +hist(x[,1], breaks = 30, xlab = "", main = "Normal") +hist(x[,2], breaks = 30, xlab = "", main = "Beta") +hist(x[,3], breaks = 30, xlab = "", main = "Negative Binomial") +``` + +```{r} +cor(x) +``` + +We can see that even wih 10,000 samples, the estimated correlation of the simulated data is not exactly the same as the target correlation. This can be explained by the fact that some correlations are simply not possible due to the discrete nature of certain distributions. Another possibility is that the copula algorith is biased and needs correction. + +# Appendix + +```{r, eval=FALSE} +all_dists <- list( + list(dist = "beta", shape1, shape2), + list(dist = "binom", size, prob), + list(dist = "cauchy", location, scale), + list(dist = "chisq", df), + list(dist = "exp", rate), + list(dist = "f", df1, df2), + list(dist = "gamma", shape, rate), + list(dist = "geom", prob), + list(dist = "hyper", m, n, k), + list(dist = "logis", location, scale), + list(dist = "lnorm", meanlog, sdlog), + list(dist = "nbinom", size, prob), + list(dist = "norm", mean, sd), + list(dist = "pois", lambda), + list(dist = "t", df), + list(dist = "unif", min, max), + list(dist = "weibull", shape, scale), + list(dist = "wilcox", m, n), + list(dist = "signrank", n) +) +```