Added dbr.per1k argument, updated dbr-related documentation as per #110
plger committed Sep 19, 2024
1 parent ba4e32e commit 3bbfa3f
Showing 8 changed files with 71 additions and 43 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: scDblFinder
Type: Package
Title: scDblFinder
-Version: 1.19.4
+Version: 1.19.5
Authors@R: c(
person("Pierre-Luc", "Germain", email="pierre-luc.germain@hest.ethz.ch", role=c("cre","aut"), comment=c(ORCID="0000-0003-3418-4218")),
person("Aaron", "Lun", email="infinite.monkeys.with.keyboards@gmail.com", role="ctb"))
8 changes: 5 additions & 3 deletions R/doubletThresholding.R
@@ -10,6 +10,7 @@
#' doublet rate will be adjusted for homotypic doublets.
#' @param dbr.sd The standard deviation of the doublet rate, representing the
#' uncertainty in the estimate. Ignored if `method!="optim"`.
+#' @param dbr.per1k The expected proportion of doublets per 1000 cells.
#' @param stringency A numeric value >0 and <1 which controls the relative weight of false
#' positives (i.e. real cells) and false negatives (artificial doublets) in setting the
#' threshold. A value of 0.5 gives equal weight to both; a higher value (e.g. 0.7) gives
@@ -38,7 +39,8 @@
#'
#' @importFrom stats mad qnorm setNames
#' @export
-doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, stringency=0.5, p=0.1,
+doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, dbr.per1k=0.008,
+stringency=0.5, p=0.1,
method=c("auto","optim","dbr","griffiths"),
perSample=TRUE, returnType=c("threshold","call")){
method <- match.arg(method)
@@ -192,15 +194,15 @@ doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, stringency=0.5, p=0.1
d
}

-.getDoubletStats <- function( d, th, dbr=NULL, dbr.sd=0.015 ){
+.getDoubletStats <- function( d, th, dbr=NULL, dbr.sd=0.015, dbr.per1k=0.008 ){
# check that we have all necessary fields:
fields <- c("cluster","src","type","score","mostLikelyOrigin", "originAmbiguous","difficulty")
if(!all(fields %in% colnames(d))) stop("Input misses some columns.")
if(!is.null(d$sample))
return(dplyr::bind_rows(lapply(split(seq_len(nrow(d)), d$sample), FUN=function(i){
.getDoubletStats(d[i,fields], th, dbr=dbr, dbr.sd=dbr.sd)
}), .id="sample"))
-if(is.null(dbr)) dbr <- 0.01*sum(d$src=="real",na.rm=TRUE)/1000
+if(is.null(dbr)) dbr <- dbr.per1k*sum(d$src=="real",na.rm=TRUE)/1000
o <- d$mostLikelyOrigin[d$type=="real" & d$score>=th]
expected <- getExpectedDoublets(d$cluster[d$src=="real"], dbr=dbr)
stats <- .compareToExpectedDoublets(o, dbr=dbr, expected=expected)
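For orientation, a hedged usage sketch of the updated thresholding function. It follows the package's own documented example (mockDoubletSCE() ships with scDblFinder); the numbers are purely illustrative:

library(scDblFinder)
set.seed(123)
sce <- mockDoubletSCE()                    # small simulated dataset with known doublets
d <- scDblFinder(sce, returnType="table")  # per-cell table, incl. artificial doublets
# When dbr is NULL, the expected rate is derived from dbr.per1k and the number of
# real cells (dbr = dbr.per1k * ncells / 1000, e.g. 0.008 * 5000 / 1000 = 4% for 5000 cells),
# as in the modified code above.
th <- doubletThresholding(d, dbr=NULL, dbr.per1k=0.008)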
10 changes: 6 additions & 4 deletions R/misc.R
@@ -35,6 +35,7 @@
#' @param dbr The expected doublet rate.
#' @param only.heterotypic Logical; whether to return expectations only for
#' heterotypic doublets
+#' @param dbr.per1k The expected proportion of doublets per 1000 cells.
#'
#' @return The expected number of doublets of each combination of clusters
#'
@@ -43,7 +44,8 @@
#' cl <- sample(head(LETTERS,4), size=2000, prob=c(.4,.2,.2,.2), replace=TRUE)
#' getExpectedDoublets(cl)
#' @export
-getExpectedDoublets <- function(x, dbr=NULL, only.heterotypic=TRUE){
+getExpectedDoublets <- function(x, dbr=NULL, only.heterotypic=TRUE,
+dbr.per1k=0.008){
if(is(x,"SingleCellExperiment")){
clusters <- x$scDblFinder.clusters
}else{
@@ -54,7 +56,7 @@ getExpectedDoublets <- function(x, dbr=NULL, only.heterotypic=TRUE){
if(all(grepl("^[0-9]*$",lvls))) lvls <- as.integer(lvls)
clusters <- as.integer(clusters)
ncells <- length(clusters)
-if(is.null(dbr)) dbr <- (0.01*ncells/1000)
+if(is.null(dbr)) dbr <- (dbr.per1k*ncells/1000)
if(length(unique(clusters))==1) return(ncells*dbr)

cs <- table(clusters)/ncells
@@ -371,7 +373,7 @@ cxds2 <- function(x, whichDbls=c(), ntop=500, binThresh=NULL){
}

# gets a global doublet rate from samples' doublet rates
-.gdbr <- function(d, dbr=NULL){
+.gdbr <- function(d, dbr=NULL, dbr.per1k=0.008){
if(!is.null(dbr)){
if(length(dbr)==1) return(dbr)
stopifnot(!is.null(d$sample))
@@ -384,7 +386,7 @@ cxds2 <- function(x, whichDbls=c(), ntop=500, binThresh=NULL){
## estimate a global doublet rate
sl <- as.numeric(table(d$sample, d$src=="real")[,2])
}
-dbr <- (0.01*sl/1000)
+dbr <- (dbr.per1k*sl/1000)
sum(dbr*sl)/sum(sl)
}
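A brief sketch of the exported helper with the new argument, extending the roxygen example already shown above; the 0.004 value mirrors the HT-chip guidance added to the scDblFinder documentation further down:

library(scDblFinder)
set.seed(42)
cl <- sample(head(LETTERS, 4), size=2000, prob=c(.4, .2, .2, .2), replace=TRUE)
# Default: dbr is derived as dbr.per1k * ncells / 1000, i.e. 0.008 * 2000 / 1000 = 1.6%
getExpectedDoublets(cl)
# Halved per-1k rate, e.g. for a 10x HT chip:
getExpectedDoublets(cl, dbr.per1k=0.004)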

57 changes: 32 additions & 25 deletions R/scDblFinder.R
@@ -42,17 +42,21 @@
#' @param nfeatures The number of top features to use. Alternatively, a
#' character vectors of feature names (e.g. highly-variable genes) to use.
#' @param dims The number of dimensions used.
-#' @param dbr The expected doublet rate. By default this is assumed to be 1\%
-#' per thousand cells captured (so 4\% among 4000 cells), which is
-#' appropriate for 10x datasets. Corrections for homotypic doublets will be
-#' performed on the given rate.
+#' @param dbr The expected doublet rate, i.e. the proportion of the cells
+#' expected to be doublets. If omitted, will be calculated automatically based
+#' on the `dbr.per1k` argument and the number of cells.
#' @param dbr.sd The uncertainty range in the doublet rate, interpreted as
#' a +/- around `dbr`. During thresholding, deviation from the expected doublet
#' rate will be calculated from these boundaries, and will be considered null
#' within these boundaries. If NULL, will be 40\% of `dbr`. Set to `dbr.sd=0` to
#' disable the uncertainty around the doublet rate, or to `dbr.sd=1` to disable
#' any expectation of the number of doublets (thus letting the thresholding be
#' entirely driven by the misclassification of artificial doublets).
+#' @param dbr.per1k This is an alternative way of providing the expected doublet
+#' rate, expressed as a rate per thousand cells captured. The default, 0.008
+#' (e.g. 3.2\% doublets among 4000 cells), is appropriate for standard 10x
+#' chips. For High Throughput (HT) 10x chips, use half of that, i.e. 0.004.
+#' (Some more recent chips may have an even lower rate.)
#' @param k Number of nearest neighbors (for KNN graph). If more than one value
#' is given, the doublet density will be calculated at each k (and other values
#' at the highest k), and all the information will be used by the classifier.
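To make the interplay of `dbr`, `dbr.sd` and `dbr.per1k` concrete, a hedged sketch of typical calls (mockDoubletSCE() is the package's toy dataset; the specific values are illustrative, not recommendations):

library(scDblFinder)
sce <- mockDoubletSCE()
# Standard 10x chip: default dbr.per1k=0.008, so dbr becomes 0.008 * ncol(sce) / 1000
sce <- scDblFinder(sce)
# High Throughput (HT) 10x chip: halve the per-1k rate, as documented above
sce <- scDblFinder(sce, dbr.per1k=0.004)
# Or give the overall expected rate directly and disable the uncertainty around it
sce <- scDblFinder(sce, dbr=0.05, dbr.sd=0)
table(sce$scDblFinder.class)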
@@ -195,8 +199,8 @@
scDblFinder <- function(
sce, clusters=NULL, samples=NULL, clustCor=NULL, artificialDoublets=NULL,
knownDoublets=NULL, knownUse=c("discard","positive"), dbr=NULL, dbr.sd=NULL,
-nfeatures=1352, dims=20, k=NULL, removeUnidentifiable=TRUE, includePCs=19,
-propRandom=0, propMarkers=0, aggregateFeatures=FALSE,
+dbr.per1k=0.008, nfeatures=1352, dims=20, k=NULL, removeUnidentifiable=TRUE,
+includePCs=19, propRandom=0, propMarkers=0, aggregateFeatures=FALSE,
returnType=c("sce","table","full","counts"),
score=c("xgb","weighted","ratio"), processing="default", metric="logloss",
nrounds=0.25, max_depth=4, iter=3, trainingFeatures=NULL, unident.th=NULL,
@@ -235,6 +239,7 @@ scDblFinder <- function(
.checkPropArg(propMarkers)
.checkPropArg(propRandom)
.checkPropArg(dbr.sd)
+.checkPropArg(dbr.per1k)
.checkPropArg(dbr, acceptNull=TRUE)
processing <- .checkProcArg(processing)

@@ -288,10 +293,10 @@
}
out <- tryCatch(
scDblFinder(sce[sel_features,x], clusters=clusters, dims=dims, dbr=dbr,
-dbr.sd=dbr.sd, clustCor=clustCor, unident.th=unident.th,
-knownDoublets=knownDoublets, knownUse=knownUse,
-artificialDoublets=artificialDoublets, k=k,
-processing=processing, nfeatures=nfeatures,
+dbr.sd=dbr.sd, dbr.per1k=dbr.per1k, clustCor=clustCor,
+unident.th=unident.th, knownDoublets=knownDoublets,
+knownUse=knownUse, artificialDoublets=artificialDoublets,
+k=k, processing=processing, nfeatures=nfeatures,
propRandom=propRandom, includePCs=includePCs,
propMarkers=propMarkers, trainingFeatures=trainingFeatures,
returnType=ifelse(returnType=="counts","counts","table"),
@@ -311,12 +316,12 @@
if(multiSampleMode!="split"){
## score and thresholding
d <- .scDblscore(d, scoreType=score, threshold=threshold, dbr=dbr,
-dbr.sd=dbr.sd, max_depth=max_depth, nrounds=nrounds,
-iter=iter, BPPARAM=BPPARAM, verbose=verbose,
+dbr.sd=dbr.sd, dbr.per1k=dbr.per1k, max_depth=max_depth,
+nrounds=nrounds, iter=iter, BPPARAM=BPPARAM,
features=trainingFeatures, unident.th=unident.th,
metric=metric, filterUnidentifiable=removeUnidentifiable,
perSample=multiSampleMode=="singleModelSplitThres",
-includeSamples=TRUE)
+includeSamples=TRUE, verbose=verbose)
}
if(returnType=="table") return(d)
return(.scDblAddCD(sce, d))
@@ -465,7 +470,8 @@
}

ex <- NULL
-if(!is.null(clusters)) ex <- getExpectedDoublets(clusters, dbr)
+if(!is.null(clusters)) ex <- getExpectedDoublets(clusters, dbr,
+dbr.per1k=dbr.per1k)

if(verbose) message("Evaluating kNN...")
d <- .evaluateKNN(pca, ctype, ado2, expected=ex, k=k)
@@ -492,10 +498,10 @@
includePCs <- includePCs[includePCs<ncol(pca)]
d <- .scDblscore(d, scoreType=score, addVals=pca[,includePCs,drop=FALSE],
threshold=threshold, dbr=dbr, dbr.sd=dbr.sd, nrounds=nrounds,
-max_depth=max_depth, iter=iter, BPPARAM=BPPARAM,
+dbr.per1k=dbr.per1k, max_depth=max_depth, iter=iter,
features=trainingFeatures, verbose=verbose, metric=metric,
filterUnidentifiable=removeUnidentifiable,
-unident.th=unident.th)
+unident.th=unident.th, BPPARAM=BPPARAM)

#if(characterize) d <- .callDblType(d, pca, knn=knn, origins=ado2)
if(returnType=="table") return(d)
@@ -590,10 +596,11 @@
#' @importFrom stats predict quantile
.scDblscore <- function(d, scoreType="xgb", nrounds=NULL, max_depth=5, iter=2,
threshold=TRUE, verbose=TRUE, dbr=NULL, dbr.sd=NULL,
-features=NULL, filterUnidentifiable=TRUE, addVals=NULL,
-metric="logloss", eta=0.3, BPPARAM=SerialParam(),
-includeSamples=FALSE, perSample=TRUE, unident.th=0.1, ...){
-gdbr <- .gdbr(d, dbr)
+dbr.per1k=dbr.per1k, features=NULL, addVals=NULL,
+filterUnidentifiable=TRUE, metric="logloss", eta=0.3,
+BPPARAM=SerialParam(), includeSamples=FALSE,
+perSample=TRUE, unident.th=0.1, ...){
+gdbr <- .gdbr(d, dbr, dbr.per1k=dbr.per1k)
if(!is.null(d$sample) && length(unique(d$sample))==1) d$sample <- NULL
if(is.null(dbr.sd)) dbr.sd <- 0.3*gdbr+0.025
if(scoreType=="xgb"){
@@ -650,8 +657,8 @@ scDblFinder <- function(
# as well as unidentifiable artificial doublets
w1 <- which(d$type=="real" &
doubletThresholding(d, dbr=dbr, dbr.sd=dbr.sd, stringency=0.7,
-perSample=perSample,
-returnType="call")=="doublet")
+dbr.per1k=dbr.per1k, perSample=perSample,
+returnType="call")=="doublet")
if(length(w1) > sum(d$type=="real")/3){
# enforce max prop excluded
w1 <- head(order(d$type!="real", -d$score),
@@ -700,8 +707,8 @@ scDblFinder <- function(
}
d <- DataFrame(d)
if(threshold){
-th <- doubletThresholding( d, dbr=dbr, dbr.sd=dbr.sd, perSample=perSample,
-... )
+th <- doubletThresholding( d, dbr=dbr, dbr.sd=dbr.sd, dbr.per1k=dbr.per1k,
+perSample=perSample, ... )
if(!is.null(d$sample) && length(th)>1){
d$class <- ifelse(d$score >= th[d$sample], "doublet", "singlet")
}else{
Expand All @@ -711,7 +718,7 @@ scDblFinder <- function(
## set class of known (i.e. inputted) doublets:
d$class[d$src=="real" & d$type=="doublet"] <- "doublet"
if(!is.null(d$mostLikelyOrigin)){
-th.stats <- .getDoubletStats(d, th, dbr, dbr.sd)
+th.stats <- .getDoubletStats(d, th, dbr, dbr.sd, dbr.per1k=dbr.per1k)
metadata(d)$scDblFinder.stats <- th.stats
}
metadata(d)$scDblFinder.threshold <- th
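For reference, a sketch of where the threshold and statistics set above can be retrieved afterwards, assuming the 'table' return type; metadata() comes from S4Vectors, and the field names are those assigned in the code above:

library(scDblFinder)
library(S4Vectors)
d <- scDblFinder(mockDoubletSCE(), returnType="table")
metadata(d)$scDblFinder.threshold   # score threshold used for the doublet calls
metadata(d)$scDblFinder.stats       # output of .getDoubletStats(); may be NULL if doublet origins were not tracked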
3 changes: 3 additions & 0 deletions man/doubletThresholding.Rd

Some generated files are not rendered by default.

4 changes: 3 additions & 1 deletion man/getExpectedDoublets.Rd

Some generated files are not rendered by default.

14 changes: 10 additions & 4 deletions man/scDblFinder.Rd

Some generated files are not rendered by default.

