Merge branch 'develop'

SchisslerGroup · May 29, 2020 · 6b3173b · 6b3173b
2 parents a635d00 + f3cf43c
commit 6b3173b
Show file tree

Hide file tree

Showing 38 changed files with 1,717 additions and 420 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -2,3 +2,5 @@
 ^\.Rproj\.user$
 \tests
 ^data-raw$
+^doc$
+^Meta$
diff --git a/.Rprofile b/.Rprofile
@@ -0,0 +1 @@
+reticulate::use_condaenv("bigsimr")
diff --git a/.gitignore b/.gitignore
@@ -25,7 +25,9 @@ vignettes/*.pdf
 .httr-oauth
 
 # knitr and R markdown default cache directories
-/*_cache/
+notebooks/*_cache/
+notebooks/*_files/
+notebooks/*.html
 /cache/
 
 # Temporary files created by R markdown

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,25 +1,27 @@
 Package: bigsimr
-Title: Simulate Multivariate Negative Binomial Data
-Version: 0.2.0
+Title: An R Package for Generating High-Dimensional Random Vectors
+Version: 0.3.0
 Authors@R: 
     person(given = "Alexander",
            family = "Knudson",
            role = c("aut", "cre"),
-           email = "aknudson@nevada.unr.edu",
-           comment = c(ORCID = "YOUR-ORCID-ID"))
+           email = "aknudson@nevada.unr.edu")
 Description: This package simulates data from a negative binomial distribution using a direct copula approach.
 Depends: R (>= 3.4.0)
 Imports:
+    fastrank,
     foreach,
-    mvnfast,
     Matrix,
     doParallel,
     parallel,
-    purrr
+    purrr,
+    coop
+Remotes:
+    douglasgscofield/fastrank
 License: GNU GPLv3
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.0
 Suggests: 
     knitr,
     rmarkdown

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,11 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
-export(adjustForDiscrete)
-export(adjustInputR)
-export(convertKendallPearson)
-export(convertPearsonKendall)
-export(convertPearsonSpearman)
-export(convertSpearmanPearson)
+export(computeCorBounds)
+export(convertCor)
 export(rcor)
 export(rnbinom_params)
 export(rvec)
diff --git a/R/correlation_utils.R b/R/correlation_utils.R
@@ -0,0 +1,245 @@
+#' Convert between different types of correlation matrices
+#'
+#' @param rho A a square symmetric correlation matrix
+#' @param from The type of the input correlation matrix
+#' @param to The type of the output correlation matrix
+#' @export
+convertCor <- function(rho,
+                       from = c("pearson", "spearman", "kendall"),
+                       to = c("pearson", "spearman", "kendall")) {
+
+  key <- paste(from, to)
+  A <- list(
+    "pearson pearson"   = function(r) {r},
+    "pearson spearman"  = function(r) {(6/pi)*asin(r/2)},
+    "pearson kendall"   = function(r) {(2/pi)*asin(r)},
+    "spearman pearson"  = function(r) {2*sin(r*(pi/6))},
+    "spearman spearman" = function(r) {r},
+    "spearman kendall"  = function(r) {(2/pi)*asin(2*sin(r*(pi/6)))},
+    "kendall pearson"   = function(r) {sin(r*(pi/2))},
+    "kendal spearman"   = function(r) {(6/pi)*asin(sin(r*(pi/2))/2)},
+    "kendall kendall"   = function(r) {r}
+  )
+
+  tmp <- as.matrix(Matrix::nearPD(A[[key]](rho))$mat)
+  rownames(tmp) <- colnames(tmp) <- colnames(rho)
+  tmp
+}
+
+
+#' Adjust the correlation matrix when there are discrete distributions present
+#'
+#' @param rho The input correlation matrix
+#' @param params The parameters of the marginals.
+#' @param nSigmas The number of standard deviations from the mean
+adjustForDiscrete <- function(rho, params, nSigmas) {
+  upper_bound <- lapply(params, function(param) {
+    prob <- param[["prob"]]
+    size <- param[["size"]]
+
+    mu <- size * (1 - prob) / prob
+    sigma2 <- size^2 * (1 - prob) / prob
+
+    ceiling(mu + nSigmas * sqrt(sigma2))
+  })
+  upper_bound <- max(unlist(upper_bound))
+
+  adj <- lapply(params, function(param) {
+    if (param[[1]] %in% discrete_dists) {
+      pmf_x <- do.call(paste0("d", param[[1]]),
+                       c(list(x = 0:upper_bound), param[-1]))
+      return(sqrt(1 - sum(pmf_x^3)))
+    } else {
+      return(1)
+    }
+  })
+  adj <- unlist(adj)
+
+  idx_mat <- combn(x = length(params), m = 2)
+
+  rho_adj <- apply(idx_mat, 2, function(pair) {
+    adj_factor <- adj[pair[1]] * adj[pair[2]]
+    rho_tmp <- rho[pair[1], pair[2]]
+    min(rho_tmp / adj_factor, 1)
+  })
+
+  rho[lower.tri(rho)] <- rho_adj
+  rho[upper.tri(rho)] <- t(rho)[upper.tri(rho)]
+  rho
+}
+
+
+#' Computes the theoretical upper and lower bounds of possible correlations
+#'   given a set of marginals
+#'
+#' @param params The parameters of the marginals.
+#' @param cores The number of cores to utilize.
+#' @param type The type of correlation matrix that is being passed.
+#' @return A list containing the theoretical upper and lower bounds
+#' @export
+computeCorBounds <- function(params,
+                             cores = 1,
+                             type = c("pearson", "kendall", "spearman"),
+                             reps = 1e5){
+
+  d <- length(params)
+
+  # Generate random samples for each margin and sort the vectors
+  # The sorted vectors are used for computing the bounds
+  if (cores == 1) {
+    sim_data <- sapply(1:d, function(i) {
+      sort(do.call(what = paste0("r", unlist(params[[i]][1])),
+                   args = c(list(n = reps), params[[i]][-1])))
+    })
+  } else {
+    `%dopar%` <- foreach::`%dopar%`
+    cl <- parallel::makeCluster(cores, type = "FORK")
+    doParallel::registerDoParallel(cl)
+    sim_data <- foreach::foreach(i = 1:d, .combine = "cbind") %dopar% {
+        sort(do.call(what = paste0("r", unlist(params[[i]][1])),
+                     args = c(list(n = reps), params[[i]][-1])))
+      }
+  }
+
+  unname(sim_data)
+  index_mat <- combn(x = d, m = 2)
+
+
+  if (cores == 1) {
+    # Upper bounds
+    rho_upper <- cor(sim_data, method = type)
+
+    # Lower bounds
+    rho_lower_values <- apply(index_mat, 2, function(index, data, ...){
+      cor(data[, index[1]],
+          rev(data[, index[2]]),
+          method = type)
+    }, data = sim_data, method = type)
+
+    rho_lower <- diag(x = 1, nrow = d, ncol = d)
+    rho_lower[lower.tri(rho_lower)] <- rho_lower_values
+    rho_lower <- t(rho_lower)
+    rho_lower[lower.tri(rho_lower)] <- rho_lower_values
+
+  } else {
+    # Upper bounds
+    rho_upper_values <- parallel::parApply(
+      cl = cl,
+      X = index_mat,
+      MARGIN = 2,
+      FUN = function(index, data, ...){
+        cor(x = data[, index[1]],
+            y = data[, index[2]],
+            method = type)
+    }, data = sim_data, method = type)
+
+    # Ensure that the result is a valid correlation matrix
+    rho_upper <- diag(x = 1, nrow = d, ncol = d)
+    rho_upper[lower.tri(rho_upper)] <- rho_upper_values
+    rho_upper <- t(rho_upper)
+    rho_upper[lower.tri(rho_upper)] <- rho_upper_values
+
+    # Lower bounds
+    rho_lower_values <- parallel::parApply(
+      cl = cl,
+      X = index_mat,
+      MARGIN = 2,
+      FUN = function(index, data, ...){
+        cor(x = data[, index[1]],
+            y = rev(data[, index[2]]),
+            method=type)
+      }, data = sim_data, method = type)
+    parallel::stopCluster(cl)
+
+    # Ensure that the result is a valid correlation matrix
+    rho_lower <- diag(x = 1, nrow = d, ncol = d)
+    rho_lower[lower.tri(rho_lower)] <- rho_lower_values
+    rho_lower <- t(rho_lower)
+    rho_lower[lower.tri(rho_lower)] <- rho_lower_values
+  }
+
+  list(upper = rho_upper,
+       lower = rho_lower)
+}
+
+
+#' Constrains a correlation matrix to the theoretical upper and lower bounds
+#'
+#' @param rho The input correlation matrix.
+#' @param rho_bounds A list containing the theoretical upper and lower bounds
+#' @return The constrained correlation matrix
+constrainRho <- function(rho, rho_bounds){
+  rho_tmp <- pmin(rho_bounds[["upper"]], rho)
+  pmax(rho_bounds[["lower"]], rho_tmp)
+}
+
+
+#' Returns a logical matrix of which correlations are in the feasible region
+#'
+#' @param rho The input correlation matrix.
+#' @param rho_bounds A list containing the theoretical upper and lower bounds
+#' @param negate Should the logical values be negated in order to identify
+#'   values that are outside the feasible region.
+which_corInBounds <- function(rho, rho_bounds, negate = FALSE){
+
+  tooSmall <- rho < rho_bounds[["lower"]]
+  tooLarge <- rho > rho_bounds[["upper"]]
+
+  outOfBounds <- tooSmall | tooLarge
+  inBounds <- !outOfBounds
+
+  # Return either the in bounds or out of bounds matrix
+  if (negate) {
+    outOfBounds
+  } else {
+    inBounds
+  }
+}
+
+
+#' Returns TRUE if all correlations pairs are within the theoretical bounds
+#'
+#' @param rho The input correlation matrix.
+#' @param params The parameters of the marginals.
+#' @param cores The number of cores to utilize.
+#' @param type The type of correlation matrix that is being passed.
+#' @param rho_bounds Pre-computed upper and lower correlation bounds
+#' ... Other arguments passed to `computeCoreBounds()`
+#' @return Logical. TRUE if all correlations pairs are within the theoretical
+#'   bounds, and false otherwise.
+all_corInBounds <- function(rho,
+                          params,
+                          cores = 1,
+                          type = c("pearson", "spearman", "kendall"),
+                          rho_bounds = NULL, ...){
+
+  if (is.null(rho_bounds)){
+    rho_bounds <- computeCorBounds(params = params,
+                                   cores = cores,
+                                   type = type,
+                                   ...)
+  }
+
+  outOfBounds <- which_corInBounds(rho, rho_bounds, negate = TRUE)
+
+  if (any(outOfBounds)) {
+    warning(paste0("At least one of the specified correlations are either ",
+                   "too large or too small. Please use 'which_corInBounds() ",
+                   "to see which correlations are valid."))
+    return(FALSE)
+  }
+  TRUE
+}
+
+
+#' Estimate the Spearman rank correlation
+#'
+#' For a Nx2 matrix when N=1e5, fast is about 2x faster, and when N=1e6,
+#'   fast is about 3x faster.
+estimateSpearmanRho <- function(x, fast = TRUE){
+  if (fast) {
+    coop::pcor(apply(x, 2, fastrank::fastrank_num_avg))
+  } else {
+    cor(apply(x, 2, rank))
+  }
+}