diff --git a/.Rbuildignore b/.Rbuildignore
index 2d553de..019764b 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -2,3 +2,5 @@
 ^\.Rproj\.user$
 \tests
 ^data-raw$
+^doc$
+^Meta$
diff --git a/.Rprofile b/.Rprofile
new file mode 100644
index 0000000..992953d
--- /dev/null
+++ b/.Rprofile
@@ -0,0 +1 @@
+reticulate::use_condaenv("bigsimr")
diff --git a/.gitignore b/.gitignore
index 7c0a8d1..04db965 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,9 @@
 vignettes/*.pdf
 .httr-oauth
 
 # knitr and R markdown default cache directories
-/*_cache/
+notebooks/*_cache/
+notebooks/*_files/
+notebooks/*.html
 /cache/
 # Temporary files created by R markdown
diff --git a/DESCRIPTION b/DESCRIPTION
index 42e242b..032a01e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,25 +1,27 @@
 Package: bigsimr
-Title: Simulate Multivariate Negative Binomial Data
-Version: 0.2.0
+Title: An R Package for Generating High-Dimensional Random Vectors
+Version: 0.3.0
 Authors@R:
     person(given = "Alexander",
            family = "Knudson",
            role = c("aut", "cre"),
-           email = "aknudson@nevada.unr.edu",
-           comment = c(ORCID = "YOUR-ORCID-ID"))
+           email = "aknudson@nevada.unr.edu")
 Description: This package simulates data from a negative binomial
     distribution using a direct copula approach.
 Depends: R (>= 3.4.0)
 Imports:
+    fastrank,
     foreach,
-    mvnfast,
     Matrix,
     doParallel,
     parallel,
-    purrr
+    purrr,
+    coop
+Remotes:
+    douglasgscofield/fastrank
 License: GNU GPLv3
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.0
 Suggests:
     knitr,
     rmarkdown
diff --git a/NAMESPACE b/NAMESPACE
index d286f54..5037471 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,11 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
-export(adjustForDiscrete)
-export(adjustInputR)
-export(convertKendallPearson)
-export(convertPearsonKendall)
-export(convertPearsonSpearman)
-export(convertSpearmanPearson)
+export(computeCorBounds)
+export(convertCor)
 export(rcor)
 export(rnbinom_params)
 export(rvec)
diff --git a/R/correlation_utils.R b/R/correlation_utils.R
new file mode 100644
index 0000000..8bc4b95
--- /dev/null
+++ b/R/correlation_utils.R
@@ -0,0 +1,245 @@
+#' Convert between different types of correlation matrices
+#'
+#' @param rho A square symmetric correlation matrix
+#' @param from The type of the input correlation matrix
+#' @param to The type of the output correlation matrix
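+#' @examples
+#' # A minimal sketch: map a 2x2 Pearson correlation matrix to Spearman
+#' rho_p <- matrix(c(1, 0.5, 0.5, 1), nrow = 2)
+#' convertCor(rho_p, from = "pearson", to = "spearman")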
+#' @export
+convertCor <- function(rho,
+                       from = c("pearson", "spearman", "kendall"),
+                       to = c("pearson", "spearman", "kendall")) {
+
+  from <- match.arg(from)
+  to <- match.arg(to)
+
+  key <- paste(from, to)
+  A <- list(
+    "pearson pearson"   = function(r) {r},
+    "pearson spearman"  = function(r) {(6/pi)*asin(r/2)},
+    "pearson kendall"   = function(r) {(2/pi)*asin(r)},
+    "spearman pearson"  = function(r) {2*sin(r*(pi/6))},
+    "spearman spearman" = function(r) {r},
+    "spearman kendall"  = function(r) {(2/pi)*asin(2*sin(r*(pi/6)))},
+    "kendall pearson"   = function(r) {sin(r*(pi/2))},
+    "kendall spearman"  = function(r) {(6/pi)*asin(sin(r*(pi/2))/2)},
+    "kendall kendall"   = function(r) {r}
+  )
+
+  tmp <- as.matrix(Matrix::nearPD(A[[key]](rho))$mat)
+  rownames(tmp) <- colnames(tmp) <- colnames(rho)
+  tmp
+}
+
+
+#' Adjust the correlation matrix when there are discrete distributions present
+#'
+#' @param rho The input correlation matrix
+#' @param params The parameters of the marginals.
+#' @param nSigmas The number of standard deviations from the mean
+adjustForDiscrete <- function(rho, params, nSigmas) {
+  upper_bound <- lapply(params, function(param) {
+    prob <- param[["prob"]]
+    size <- param[["size"]]
+
+    # Mean and variance of the negative binomial in R's size/prob
+    # parameterization
+    mu <- size * (1 - prob) / prob
+    sigma2 <- size * (1 - prob) / prob^2
+
+    ceiling(mu + nSigmas * sqrt(sigma2))
+  })
+  upper_bound <- max(unlist(upper_bound))
+
+  adj <- lapply(params, function(param) {
+    if (param[[1]] %in% discrete_dists) {
+      pmf_x <- do.call(paste0("d", param[[1]]),
+                       c(list(x = 0:upper_bound), param[-1]))
+      return(sqrt(1 - sum(pmf_x^3)))
+    } else {
+      return(1)
+    }
+  })
+  adj <- unlist(adj)
+
+  idx_mat <- combn(x = length(params), m = 2)
+
+  rho_adj <- apply(idx_mat, 2, function(pair) {
+    adj_factor <- adj[pair[1]] * adj[pair[2]]
+    rho_tmp <- rho[pair[1], pair[2]]
+    min(rho_tmp / adj_factor, 1)
+  })
+
+  rho[lower.tri(rho)] <- rho_adj
+  rho[upper.tri(rho)] <- t(rho)[upper.tri(rho)]
+  rho
+}
+
+
+#' Computes the theoretical upper and lower bounds of possible correlations
+#' given a set of marginals
+#'
+#' @param params The parameters of the marginals.
+#' @param cores The number of cores to utilize.
+#' @param type The type of correlation matrix that is being passed.
+#' @param reps The number of random samples drawn per margin when estimating
+#'   the bounds.
+#' @return A list containing the theoretical upper and lower bounds
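+#' @examples
+#' # A small sketch (reps kept low for speed): bounds for a normal and a
+#' # negative binomial margin
+#' margins <- list(list("norm", mean = 0, sd = 1),
+#'                 list("nbinom", size = 10, prob = 0.75))
+#' computeCorBounds(margins, type = "pearson", reps = 1e4)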
+#' @export
+computeCorBounds <- function(params,
+                             cores = 1,
+                             type = c("pearson", "kendall", "spearman"),
+                             reps = 1e5){
+
+  type <- match.arg(type)
+  d <- length(params)
+
+  # Generate random samples for each margin and sort the vectors
+  # The sorted vectors are used for computing the bounds
+  if (cores == 1) {
+    sim_data <- sapply(1:d, function(i) {
+      sort(do.call(what = paste0("r", unlist(params[[i]][1])),
+                   args = c(list(n = reps), params[[i]][-1])))
+    })
+  } else {
+    `%dopar%` <- foreach::`%dopar%`
+    cl <- parallel::makeCluster(cores, type = "FORK")
+    doParallel::registerDoParallel(cl)
+    sim_data <- foreach::foreach(i = 1:d, .combine = "cbind") %dopar% {
+      sort(do.call(what = paste0("r", unlist(params[[i]][1])),
+                   args = c(list(n = reps), params[[i]][-1])))
+    }
+  }
+
+  sim_data <- unname(sim_data)
+  index_mat <- combn(x = d, m = 2)
+
+  if (cores == 1) {
+    # Upper bounds
+    rho_upper <- cor(sim_data, method = type)
+
+    # Lower bounds
+    rho_lower_values <- apply(index_mat, 2, function(index, data, ...){
+      cor(data[, index[1]],
+          rev(data[, index[2]]),
+          method = type)
+    }, data = sim_data, method = type)
+
+    rho_lower <- diag(x = 1, nrow = d, ncol = d)
+    rho_lower[lower.tri(rho_lower)] <- rho_lower_values
+    rho_lower <- t(rho_lower)
+    rho_lower[lower.tri(rho_lower)] <- rho_lower_values
+
+  } else {
+    # Upper bounds
+    rho_upper_values <- parallel::parApply(
+      cl = cl,
+      X = index_mat,
+      MARGIN = 2,
+      FUN = function(index, data, ...){
+        cor(x = data[, index[1]],
+            y = data[, index[2]],
+            method = type)
+      }, data = sim_data, method = type)
+
+    # Ensure that the result is a valid correlation matrix
+    rho_upper <- diag(x = 1, nrow = d, ncol = d)
+    rho_upper[lower.tri(rho_upper)] <- rho_upper_values
+    rho_upper <- t(rho_upper)
+    rho_upper[lower.tri(rho_upper)] <- rho_upper_values
+
+    # Lower bounds
+    rho_lower_values <- parallel::parApply(
+      cl = cl,
+      X = index_mat,
+      MARGIN = 2,
+      FUN = function(index, data, ...){
+        cor(x = data[, index[1]],
+            y = rev(data[, index[2]]),
+            method = type)
+      }, data = sim_data, method = type)
+    parallel::stopCluster(cl)
+
+    # Ensure that the result is a valid correlation matrix
+    rho_lower <- diag(x = 1, nrow = d, ncol = d)
+    rho_lower[lower.tri(rho_lower)] <- rho_lower_values
+    rho_lower <- t(rho_lower)
+    rho_lower[lower.tri(rho_lower)] <- rho_lower_values
+  }
+
+  list(upper = rho_upper,
+       lower = rho_lower)
+}
+
+
+#' Constrains a correlation matrix to the theoretical upper and lower bounds
+#'
+#' @param rho The input correlation matrix.
+#' @param rho_bounds A list containing the theoretical upper and lower bounds
+#' @return The constrained correlation matrix
+constrainRho <- function(rho, rho_bounds){
+  rho_tmp <- pmin(rho_bounds[["upper"]], rho)
+  pmax(rho_bounds[["lower"]], rho_tmp)
+}
+
+
+#' Returns a logical matrix of which correlations are in the feasible region
+#'
+#' @param rho The input correlation matrix.
+#' @param rho_bounds A list containing the theoretical upper and lower bounds
+#' @param negate Should the logical values be negated in order to identify
+#'   values that are outside the feasible region?
+which_corInBounds <- function(rho, rho_bounds, negate = FALSE){
+
+  tooSmall <- rho < rho_bounds[["lower"]]
+  tooLarge <- rho > rho_bounds[["upper"]]
+
+  outOfBounds <- tooSmall | tooLarge
+  inBounds <- !outOfBounds
+
+  # Return either the in bounds or out of bounds matrix
+  if (negate) {
+    outOfBounds
+  } else {
+    inBounds
+  }
+}
+
+
+#' Returns TRUE if all correlation pairs are within the theoretical bounds
+#'
+#' @param rho The input correlation matrix.
+#' @param params The parameters of the marginals.
+#' @param cores The number of cores to utilize.
+#' @param type The type of correlation matrix that is being passed.
+#' @param rho_bounds Pre-computed upper and lower correlation bounds
+#' @param ... Other arguments passed to `computeCorBounds()`
+#' @return Logical. TRUE if all correlation pairs are within the theoretical
+#'   bounds, and FALSE otherwise.
+all_corInBounds <- function(rho,
+                            params,
+                            cores = 1,
+                            type = c("pearson", "spearman", "kendall"),
+                            rho_bounds = NULL, ...){
+
+  if (is.null(rho_bounds)){
+    rho_bounds <- computeCorBounds(params = params,
+                                   cores = cores,
+                                   type = type,
+                                   ...)
+  }
+
+  outOfBounds <- which_corInBounds(rho, rho_bounds, negate = TRUE)
+
+  if (any(outOfBounds)) {
+    warning(paste0("At least one of the specified correlations is either ",
+                   "too large or too small. Use which_corInBounds() ",
+                   "to see which correlations are out of bounds."))
+    return(FALSE)
+  }
+  TRUE
+}
+
+
+#' Estimate the Spearman rank correlation
+#'
+#' For an Nx2 matrix, the fast method is about 2x faster when N = 1e5 and
+#' about 3x faster when N = 1e6.
+#'
+#' @param x A numeric matrix whose column-wise Spearman correlations are
+#'   estimated.
+#' @param fast Logical. If TRUE, rank with fastrank and correlate with coop.
+estimateSpearmanRho <- function(x, fast = TRUE){
+  if (fast) {
+    coop::pcor(apply(x, 2, fastrank::fastrank_num_avg))
+  } else {
+    cor(apply(x, 2, rank))
+  }
+}
diff --git a/R/helper_funcs.R b/R/helper_funcs.R
deleted file mode 100644
index b4fc715..0000000
--- a/R/helper_funcs.R
+++ /dev/null
@@ -1,140 +0,0 @@
-#' Converts Pearson correlation to Spearman. Used for normal random variables.
-#'
-#' @param rho A a square symmetric Pearson correlation matrix.
-#' @return A Spearman correlation matrix.
-#' @export
-convertPearsonSpearman <- function(rho) {
-  tmp <- (6 / pi) * asin(rho / 2)
-  tmp <- as.matrix(Matrix::nearPD(tmp)$mat)
-  rownames(tmp) <- colnames(tmp) <- colnames(rho)
-  return(tmp)
-}
-
-
-#' Converts Spearman correlation to Pearson. Used for normal random variables.
-#'
-#' @param rho A a square symmetric Spearman correlation matrix.
-#' @return A Pearson correlation matrix.
-#' @export
-convertSpearmanPearson <- function(rho) {
-  tmp <- 2* sin( rho * (pi / 6))
-  tmp <- as.matrix(Matrix::nearPD(tmp)$mat)
-  rownames(tmp) <- colnames(tmp) <- colnames(rho)
-  return(tmp)
-}
-
-
-#' Converts Pearson correlation to Kendall. Used for normal random variables.
-#'
-#' @param rho A a square symmetric Pearson correlation matrix.
-#' @return A Kendall correlation matrix.
-#' @export
-convertPearsonKendall <- function(rho) {
-  tmp <- (2 / pi) * asin(rho)
-  tmp <- as.matrix(Matrix::nearPD(tmp)$mat)
-  rownames(tmp) <- colnames(tmp) <- colnames(rho)
-  return(tmp)
-}
-
-
-#' Converts Kendall correlation to Pearson. Used for normal random variables.
-#'
-#' @param rho A a square symmetric Kendall correlation matrix.
-#' @return A Pearson correlation matrix.
-#' @export
-convertKendallPearson <- function(rho) {
-  tmp <- sin( rho * (pi / 2) )
-  tmp <- as.matrix(Matrix::nearPD(tmp)$mat)
-  rownames(tmp) <- colnames(tmp) <- colnames(rho)
-  return(tmp)
-}
-
-
-#' Adjust an input correlation matrix for our method to match
-#'
-#' @param rho A a square symmetric correlation matrix.
-#' @param params The parameters of the marginals.
-#' @param cores The number of cores to utilize.
-#' @export
-adjustInputR <- function(rho, params, cores = 1){
-
-  ## 1. find the pairs
-  d <- NROW(rho)
-  index_mat <- combn(x = 1:d, 2)
-
-  ## 2. compute the first order approximation
-  if (cores == 1) {
-    input_rho_vector <- apply(index_mat, 2, function(tmp_index){
-      ## extract and compute simple quantities
-      alpha1 <- params["alpha", tmp_index[1]]
-      alpha2 <- params["alpha", tmp_index[2]]
-      lambda_nb1 <- params["lambda", tmp_index[1]]
-      lambda_nb2 <- params["lambda", tmp_index[2]]
-      sd_nb1 <- sqrt(params["nb_var", tmp_index[1]])
-      sd_nb2 <- sqrt(params["nb_var", tmp_index[2]])
-      ## precompute scale factor
-      scale_factor <- sqrt(alpha1) * sqrt(alpha2)
-      ## extract target nb corr
-      tmp_nb_rho <- rho[tmp_index[1], tmp_index[2]]
-      ## convert to rho for gamma (exact)
-      target_gamma_rho <- tmp_nb_rho * ( (sd_nb1 * sd_nb2) / (scale_factor * lambda_nb1 * lambda_nb2) )
-      ## now approximate rho for input corr for MVN
-      return(min(1, target_gamma_rho))
-    })
-  } else {
-    cl <- parallel::makeCluster(cores, type = "FORK")
-
-    ## parallel version of the above code
-    doParallel::registerDoParallel(cl)
-    `%dopar%` <- foreach::`%dopar%`
-    input_rho_vector <- foreach::foreach(i = 1:ncol(index_mat), .combine = 'c') %dopar% {
-      ## extract and compute simple quantities
-      tmp_index <- index_mat[,i]
-      ## extract and compute simple quantities
-      alpha1 <- params["alpha", tmp_index[1]]
-      alpha2 <- params["alpha", tmp_index[2]]
-      lambda_nb1 <- params["lambda", tmp_index[1]]
-      lambda_nb2 <- params["lambda", tmp_index[2]]
-      sd_nb1 <- sqrt(params["nb_var", tmp_index[1]])
-      sd_nb2 <- sqrt(params["nb_var", tmp_index[2]])
-      ## precompute scale factor
-      scale_factor <- sqrt(alpha1) * sqrt(alpha2)
-      ## extract target nb corr
-      tmp_nb_rho <- rho[tmp_index[1], tmp_index[2]]
-      ## convert to rho for gamma (exact)
-      target_gamma_rho <- tmp_nb_rho * ((sd_nb1 * sd_nb2) / (scale_factor * lambda_nb1 * lambda_nb2))
-
-      ## now approximate rho for input corr for MVN
-      return(min(1, target_gamma_rho))
-    }
-    ## close cluster
-    parallel::stopCluster(cl)
-  }
-
-  ## Put input_rho_vector into a matrix
-  to_return <- diag(d)
-  to_return[lower.tri(to_return)] <- input_rho_vector
-  to_return <- to_return + t(to_return) - diag(diag(to_return))
-
-  ## Ensure positive semi-definiteness
-  eigen_obj <- eigen(to_return)
-  if (any(eigen_obj$values < 0)) {
-    to_return <- as.matrix(Matrix::nearPD(to_return)$mat)
-  }
-
-  ## Label and return
-  rownames(to_return) <- colnames(to_return) <- rownames(rho)
-  return(to_return)
-}
-
-
-#' Transforms a [multivariate]normal vector to a different marginal via a
-#' uniform intermediate transformation.
-#'
-#' @param x A normal random vector.
-#' @param param A list ccontaining the marginal and its parameters.
-normal2marginal <- function(x, param) {
-  do.call(what = paste0("q", param[[1]]),
-          args = c(list(p = pnorm(x)), param[-1]))
-}
-
diff --git a/R/normal2margin.R b/R/normal2margin.R
new file mode 100644
index 0000000..9d9f84b
--- /dev/null
+++ b/R/normal2margin.R
@@ -0,0 +1,9 @@
+#' Transforms a [multivariate] normal vector to a different marginal via a
+#' uniform intermediate transformation.
+#'
+#' @param x A normal random vector.
+#' @param param A list containing the marginal and its parameters.
+normal2marginal <- function(x, param) {
+  do.call(what = paste0("q", param[[1]]),
+          args = c(list(p = pnorm(x)), param[-1]))
+}
diff --git a/R/crude_helper_funcs.R b/R/rand_utils.R
similarity index 53%
rename from R/crude_helper_funcs.R
rename to R/rand_utils.R
index 79e3c9e..4fa10d1 100644
--- a/R/crude_helper_funcs.R
+++ b/R/rand_utils.R
@@ -25,8 +25,10 @@ rcor <- function(d, constant_rho = FALSE) {
 #' Generate negative binomial marginal parameters
 #'
 #' @param d A positive integer.
-#' @param id_margins A boolean value specifying if the margins should be identical.
-#' @return A matrix with columns containing (in order) lambda, alpha, mean, and variance.
+#' @param id_margins A boolean value specifying if the margins should be
+#' identical.
+#' @return A matrix with columns containing (in order) lambda, alpha, mean,
+#' and variance.
 #' @examples
 #' rnbinom_params(4)
 #' rnbinom_params(10)
@@ -42,42 +44,3 @@ rnbinom_params <- function(d, id_margins = FALSE) {
   }
   purrr::transpose(list(rep("nbinom", d), size = sizes, prob = probs))
 }
-
-
-#' @param rho The input correlation matrix
-#' @param params The parameters of the marginals.
-#' @param sigma The number of standard deviations from the mean
-#' @export
-adjustForDiscrete <- function(rho, params, sigma = 5) {
-  upper_bound <- lapply(params, function(param) {
-    prob <- param[["prob"]]
-    size <- param[["size"]]
-
-    mu <- size * (1 - prob) / prob
-    sigma2 <- size^2 * (1 - prob) / prob
-
-    ceiling(mu + sigma * sqrt(sigma2))
-  })
-  upper_bound <- max(unlist(upper_bound))
-
-  adj <- lapply(params, function(param) {
-    if (param[[1]] %in% discrete_dists) {
-      pmf_x <- do.call(paste0("d", param[[1]]),
-                       c(list(x = 0:upper_bound), param[-1]))
-      return(sqrt(1 - sum(pmf_x^3)))
-    } else {
-      return(1)
-    }
-  })
-  adj <- unlist(adj)
-
-  idx_mat <- combn(x = length(params), m = 2)
-
-  rho_adj <- apply(idx_mat, 2, function(pair) {
-    adj_factor <- adj[pair[1]] * adj[pair[2]]
-    rho_tmp <- rho[pair[1], pair[2]]
-    rho_tmp / adj_factor
-  })
-  rho[lower.tri(rho)] <- rho[upper.tri(rho)] <- rho_adj
-  rho
-}
diff --git a/R/rmvn.R b/R/rmvn.R
new file mode 100644
index 0000000..525deb5
--- /dev/null
+++ b/R/rmvn.R
@@ -0,0 +1,22 @@
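+#' Draw multivariate normal samples via jax (internal)
+#'
+#' jax's RNG is stateless: each draw is a pure function of an explicit PRNG
+#' key, so the same seed always reproduces the same sample.
+#'
+#' @param seed An integer used to build the jax PRNG key, or NULL to pick
+#'   one at random.
+#' @param mu The mean vector.
+#' @param Sigma The covariance matrix.
+#' @param n The number of samples to generate.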
+.rmvn_jax <- function(seed, mu, Sigma, n) {
+
+  # Any RNG in jax requires a key. If none is supplied by the user, then one
+  # is generated at random.
+  if (is.null(seed)) {
+    set.seed(seed)
+    seed <- as.integer(sample(1:1000, 1))
+  }
+
+  rmvn <- jax$random$multivariate_normal
+
+  Sigma = numpy$array(Sigma, dtype=numpy$float32)
+  Sigma = jax$device_put(Sigma)
+
+  mu = numpy$array(mu, dtype=numpy$float32)
+  mu = jax$device_put(mu)
+
+  key = jax$random$PRNGKey(as.integer(seed))
+  x = rmvn(key, mu, Sigma, reticulate::tuple(list(n)))$block_until_ready()
+  x = jax$device_get(x)
+  reticulate::py_to_r(x)
+}
diff --git a/R/rvec.R b/R/rvec.R
index 9179c30..79c4905 100644
--- a/R/rvec.R
+++ b/R/rvec.R
@@ -6,7 +6,12 @@
 #' @param params The parameters of the marginals.
 #' @param cores The number of cores to utilize.
 #' @param type The type of correlation matrix that is being passed.
-#' @return A matrix of random vectors generated from the specified marginals and parameters.
+#' @param adjustForDiscrete A boolean for whether to adjust the correlation
+#' matrix when in the presence of discrete distributions
+#' @param nSigmas The number of standard deviations above the mean to set the
+#' upper bound when adjusting for discrete distributions
+#' @return A matrix of random vectors generated from the specified marginals
+#' and parameters.
 #' @examples
 #' d <- 5
 #' rho <- rcor(d)
@@ -29,45 +34,53 @@
 #' rvec(10, rho, margins, cores = 1, type = "pearson")
 #' rvec(10, rho, margins2, cores = 1, type = "pearson")
 #' @export
-rvec <- function(n, rho, params, cores = 1,
+rvec <- function(n,
+                 rho,
+                 params,
+                 cores = 1,
                  type = c("pearson", "kendall", "spearman"),
-                 adjustForDiscrete = TRUE){
+                 adjustForDiscrete = TRUE,
+                 nSigmas = 10){
+
+  type <- match.arg(type)
+  my_dists <- unlist(lapply(params, '[[', 1))
 
   # Handle different types of dependencies
-  if (type == "spearman") {
-    my_dists <- unlist(lapply(params, '[[', 1))
-    if (adjustForDiscrete && any(my_dists %in% discrete_dists)) {
-      rho <- adjustForDiscrete(rho, params, sigma = 5)
-    }
-    rho <- convertSpearmanPearson(rho)
+  if (type == "spearman" &&
+      adjustForDiscrete &&
+      any(my_dists %in% discrete_dists)) {
+    rho <- adjustForDiscrete(rho, params, nSigmas)
   }
 
-  if (type == "kendall") {
-    rho <- convertKendallPearson(rho)
-  }
+  # Correlation matrix must be a Pearson correlation
+  rho <- convertCor(rho, from = type, to = "pearson")
 
-  # determine the dimension d
   d <- NROW(rho)
 
   # generate MVN sample
-  mvn_sim <- mvnfast::rmvn(n = n,
-                           mu = rep(0, d),
-                           sigma = rho,
-                           ncores = cores,
-                           isChol = FALSE)
+  mvn_sim <- .rmvn_jax(NULL, rep(0, d), rho, as.integer(n))
 
+  # Apply the NORTA step: push each normal margin through its target
+  # quantile function
   if (cores == 1) {
+
     mv_sim <- sapply(1:d, function(i){
       normal2marginal(mvn_sim[,i], params[[i]])
     })
+
   } else {
+
+    `%dopar%` <- foreach::`%dopar%`
+    cl <- parallel::makeCluster(cores, type = "FORK")
     doParallel::registerDoParallel(cl)
+
     mv_sim <- foreach::foreach(i = 1:d, .combine = 'cbind') %dopar% {
       normal2marginal(mvn_sim[,i], params[[i]])
    }
+
+    parallel::stopCluster(cl)
+
   }
+
   colnames(mv_sim) <- rownames(rho)
   return(mv_sim)
+
 }
diff --git a/R/zzz.R b/R/zzz.R
new file mode 100644
index 0000000..1e49b36
--- /dev/null
+++ b/R/zzz.R
@@ -0,0 +1,8 @@
+numpy <- NULL
+jax <- NULL
+
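+# These placeholders are filled in by .onLoad(); deferring the imports this
+# way is the pattern the reticulate docs recommend for R packages that wrap
+# python modules.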
+.onLoad <- function(...) {
+  # Imports are deferred (delay_load = TRUE) until the modules are first used
+  numpy <<- reticulate::import("numpy", delay_load = TRUE, convert = FALSE)
+  jax <<- reticulate::import("jax", delay_load = TRUE, convert = FALSE)
+}
diff --git a/doc/using-rvec.R b/doc/using-rvec.R
new file mode 100644
index 0000000..a2618f3
--- /dev/null
+++ b/doc/using-rvec.R
@@ -0,0 +1,78 @@
+## ---- include = FALSE---------------------------------------------------------
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+
+reticulate::use_condaenv("py37")
+devtools::load_all()
+
+## ----setup, eval=FALSE----------------------------------------------------------
+# # Optional step if your python binary is in a miniconda environment
+# reticulate::use_condaenv("py37")
+# library(bigsimr)
+
+## -------------------------------------------------------------------------------
+# Generating samples from a normal distribution
+# We need three things:
+# 1) The number of samples to generate
+# 2) The location of the normal distribution (mean)
+# 3) The scale of the normal distribution (standard deviation)
+
+rnorm(n = 16, mean = 5, sd = 1.5)
+
+## -------------------------------------------------------------------------------
+margins = list(
+  list("norm", mean = 3.14, sd = 0.1),
+  list("beta", shape1 = 1, shape2 = 4),
+  list("nbinom", size = 10, prob = 0.75)
+)
+
+## -------------------------------------------------------------------------------
+rho <- matrix(0.5, nrow = 3, ncol = 3)
+diag(rho) <- 1.0
+rho
+
+## -------------------------------------------------------------------------------
+x = rvec(10, rho = rho, params = margins, type = "pearson")
+
+## ---- echo=FALSE------------------------------------------------------------------
+warning("warning.warn('No GPU/TPU found, falling back to CPU.')")
+
+## -------------------------------------------------------------------------------
+x
+
+## ---- fig.width=7-----------------------------------------------------------------
+x = rvec(10000, rho = rho, params = margins, type = "pearson")
+
+par(mfrow=c(1,3))
+hist(x[,1], breaks = 30, xlab = "", main = "Normal")
+hist(x[,2], breaks = 30, xlab = "", main = "Beta")
+hist(x[,3], breaks = 30, xlab = "", main = "Negative Binomial")
+
+## -------------------------------------------------------------------------------
+cor(x)
+
+## ---- eval=FALSE------------------------------------------------------------------
+# all_dists <- list(
+#   list(dist = "beta", shape1, shape2),
+#   list(dist = "binom", size, prob),
+#   list(dist = "cauchy", location, scale),
+#   list(dist = "chisq", df),
+#   list(dist = "exp", rate),
+#   list(dist = "f", df1, df2),
+#   list(dist = "gamma", shape, rate),
+#   list(dist = "geom", prob),
+#   list(dist = "hyper", m, n, k),
+#   list(dist = "logis", location, scale),
+#   list(dist = "lnorm", meanlog, sdlog),
+#   list(dist = "nbinom", size, prob),
+#   list(dist = "norm", mean, sd),
+#   list(dist = "pois", lambda),
+#   list(dist = "t", df),
+#   list(dist = "unif", min, max),
+#   list(dist = "weibull", shape, scale),
+#   list(dist = "wilcox", m, n),
+#   list(dist = "signrank", n)
+# )
+
diff --git a/doc/using-rvec.Rmd b/doc/using-rvec.Rmd
new file mode 100644
index 0000000..a3beb20
--- /dev/null
+++ b/doc/using-rvec.Rmd
@@ -0,0 +1,131 @@
+---
+title: "Using bigsimr::rvec"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{using-rvec}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+
+reticulate::use_condaenv("py37")
+devtools::load_all()
+```
+
+```{r setup, eval=FALSE}
+# Optional step if your python binary is in a miniconda environment
+reticulate::use_condaenv("py37")
+library(bigsimr)
+```
+
+# Installation
+
+This package requires a working python distribution (typically via a conda environment). As of right now, we cannot automatically create the necessary environment for you, but you can do so by installing Anaconda or Miniconda and then installing `jax` and `jaxlib` ([see here](https://github.com/google/jax#installation)). There is an option to install `jax` with GPU support. This is recommended if possible; otherwise `bigsimr` will simply throw a warning stating that it has fallen back onto the CPU.
+
+# Basics
+
+This package operates on the principle of Gaussian copulas, for which you will need a list of target marginal distributions and a correlation structure between the margins.
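+
+To make the mechanics concrete, here is a minimal sketch of the Gaussian copula idea for two margins (the exponential and negative binomial targets are arbitrary stand-ins): draw correlated normals, map them to correlated uniforms with `pnorm`, then push each uniform through a target quantile function.
+
+```{r}
+sigma <- matrix(c(1, 0.5, 0.5, 1), nrow = 2)
+z <- matrix(rnorm(2000), ncol = 2) %*% chol(sigma)  # correlated normals
+u <- pnorm(z)                                       # correlated uniforms
+x <- cbind(qexp(u[, 1], rate = 2),                  # target margins
+           qnbinom(u[, 2], size = 10, prob = 0.75))
+cor(x)
+```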
+
+The main function is `bigsimr::rvec`, which mirrors some of the conventions found in the base `stats` package, like `stats::rnorm`. That is, the common convention for generating a random vector is to first specify the number of samples, followed by the parameters for the distribution.
+
+```{r}
+# Generating samples from a normal distribution
+# We need three things:
+# 1) The number of samples to generate
+# 2) The location of the normal distribution (mean)
+# 3) The scale of the normal distribution (standard deviation)
+
+rnorm(n = 16, mean = 5, sd = 1.5)
+```
+
+The 'r' in front of 'norm' is short for 'random', as in generating random samples from the normal distribution. Other examples are `rexp` (exponential), `rnbinom` (negative binomial), and `rt` (Student t).
+
+One thing to familiarize yourself with is the specific parameterization of certain R distributions. `stats::rexp` uses a rate parameter, while others may use the scale parameter, $1 / rate$. Our package uses the default parameterizations found in R, so be sure to check that you have the correct parameterization beforehand.
+
+# Building a Multivariate Distribution
+
+As stated earlier, to generate multivariate data we need a list of marginals (and their parameters) and a correlation structure (matrix). The marginal distributions can be built up as a list of lists, where each sublist contains the information for the target distribution.
+
+```{r}
+margins = list(
+  list("norm", mean = 3.14, sd = 0.1),
+  list("beta", shape1 = 1, shape2 = 4),
+  list("nbinom", size = 10, prob = 0.75)
+)
+```
+
+There are two things to point out here. First, in each sublist (marginal), the first item is an unnamed character string with the R name of the distribution *without a letter prefix*; e.g., instead of `rnorm`, we pass in just `"norm"`. Second, the remaining items are *named* arguments that go along with the distribution. A full list of built-in distributions is found in the appendix.
+
+The next step is to define a correlation structure for the multivariate distribution. This correlation matrix can come from observed data, be set by hand, or be generated at random via `bigsimr::rcor`. Let's create a simple correlation matrix where all off-diagonal elements are 0.5. Since we have 3 marginals, we need a $3\times 3$ matrix.
+
+```{r}
+rho <- matrix(0.5, nrow = 3, ncol = 3)
+diag(rho) <- 1.0
+rho
+```
+
+Finally we can generate a random vector with our specified marginals and correlation structure. The last argument, `type`, tells `rvec` what kind of correlation matrix it is receiving. Right now it can handle Pearson, Spearman, or Kendall.
+
+```{r}
+x = rvec(10, rho = rho, params = margins, type = "pearson")
+```
+
+On my machine, there is no dedicated GPU, so I would see the following warning message once per session.
+
+```{r, echo=FALSE}
+warning("warning.warn('No GPU/TPU found, falling back to CPU.')")
+```
+
+Taking a look at our random vector, we see that it has 10 rows and 3 columns, one column for each marginal.
+
+```{r}
+x
+```
+
+We can simulate many more samples and then check the histogram of each margin, as well as the estimated correlation between the columns.
+
+```{r, fig.width=7}
+x = rvec(10000, rho = rho, params = margins, type = "pearson")
+
+par(mfrow=c(1,3))
+hist(x[,1], breaks = 30, xlab = "", main = "Normal")
+hist(x[,2], breaks = 30, xlab = "", main = "Beta")
+hist(x[,3], breaks = 30, xlab = "", main = "Negative Binomial")
+```
+
+```{r}
+cor(x)
+```
+
+We can see that even with 10,000 samples, the estimated correlation of the simulated data is not exactly the same as the target correlation. This can be explained by the fact that some correlations are simply not possible due to the discrete nature of certain distributions. Another possibility is that the copula algorithm is biased and needs correction.
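+
+One way to quantify the first explanation is `computeCorBounds`, which estimates the range of attainable correlations for each pair of margins by simulation. A quick sketch (with `reps` lowered for speed):
+
+```{r}
+bounds <- computeCorBounds(margins, type = "pearson", reps = 1e4)
+bounds$upper
+bounds$lower
+```
+
+Any target correlation that falls outside these bounds cannot be matched exactly.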
+
+# Appendix
+
+```{r, eval=FALSE}
+all_dists <- list(
+  list(dist = "beta", shape1, shape2),
+  list(dist = "binom", size, prob),
+  list(dist = "cauchy", location, scale),
+  list(dist = "chisq", df),
+  list(dist = "exp", rate),
+  list(dist = "f", df1, df2),
+  list(dist = "gamma", shape, rate),
+  list(dist = "geom", prob),
+  list(dist = "hyper", m, n, k),
+  list(dist = "logis", location, scale),
+  list(dist = "lnorm", meanlog, sdlog),
+  list(dist = "nbinom", size, prob),
+  list(dist = "norm", mean, sd),
+  list(dist = "pois", lambda),
+  list(dist = "t", df),
+  list(dist = "unif", min, max),
+  list(dist = "weibull", shape, scale),
+  list(dist = "wilcox", m, n),
+  list(dist = "signrank", n)
+)
+```