diff --git a/DESCRIPTION b/DESCRIPTION index 6c2c30f..f6892ca 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: scDblFinder Type: Package Title: scDblFinder -Version: 1.19.4 +Version: 1.19.5 Authors@R: c( person("Pierre-Luc", "Germain", email="pierre-luc.germain@hest.ethz.ch", role=c("cre","aut"), comment=c(ORCID="0000-0003-3418-4218")), person("Aaron", "Lun", email="infinite.monkeys.with.keyboards@gmail.com", role="ctb")) diff --git a/R/doubletThresholding.R b/R/doubletThresholding.R index dd5644c..b6a1e94 100644 --- a/R/doubletThresholding.R +++ b/R/doubletThresholding.R @@ -10,6 +10,7 @@ #' doublet rate will be adjusted for homotypic doublets. #' @param dbr.sd The standard deviation of the doublet rate, representing the #' uncertainty in the estimate. Ignored if `method!="optim"`. +#' @param dbr.per1k The expected proportion of doublets per 1000 cells. #' @param stringency A numeric value >0 and <1 which controls the relative weight of false #' positives (i.e. real cells) and false negatives (artificial doublets) in setting the #' threshold. A value of 0.5 gives equal weight to both; a higher value (e.g. 0.7) gives @@ -38,7 +39,8 @@ #' #' @importFrom stats mad qnorm setNames #' @export -doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, stringency=0.5, p=0.1, +doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, dbr.per1k=0.008, + stringency=0.5, p=0.1, method=c("auto","optim","dbr","griffiths"), perSample=TRUE, returnType=c("threshold","call")){ method <- match.arg(method) @@ -192,7 +194,7 @@ doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, stringency=0.5, p=0.1 d } -.getDoubletStats <- function( d, th, dbr=NULL, dbr.sd=0.015 ){ +.getDoubletStats <- function( d, th, dbr=NULL, dbr.sd=0.015, dbr.per1k=0.008 ){ # check that we have all necessary fields: fields <- c("cluster","src","type","score","mostLikelyOrigin", "originAmbiguous","difficulty") if(!all(fields %in% colnames(d))) stop("Input misses some columns.") @@ -200,7 +202,7 @@ doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, stringency=0.5, p=0.1 return(dplyr::bind_rows(lapply(split(seq_len(nrow(d)), d$sample), FUN=function(i){ .getDoubletStats(d[i,fields], th, dbr=dbr, dbr.sd=dbr.sd) }), .id="sample")) - if(is.null(dbr)) dbr <- 0.01*sum(d$src=="real",na.rm=TRUE)/1000 + if(is.null(dbr)) dbr <- dbr.per1k*sum(d$src=="real",na.rm=TRUE)/1000 o <- d$mostLikelyOrigin[d$type=="real" & d$score>=th] expected <- getExpectedDoublets(d$cluster[d$src=="real"], dbr=dbr) stats <- .compareToExpectedDoublets(o, dbr=dbr, expected=expected) diff --git a/R/misc.R b/R/misc.R index 350694a..630abe0 100644 --- a/R/misc.R +++ b/R/misc.R @@ -35,6 +35,7 @@ #' @param dbr The expected doublet rate. #' @param only.heterotypic Logical; whether to return expectations only for #' heterotypic doublets +#' @param dbr.per1k The expected proportion of doublets per 1000 cells. #' #' @return The expected number of doublets of each combination of clusters #' @@ -43,7 +44,8 @@ #' cl <- sample(head(LETTERS,4), size=2000, prob=c(.4,.2,.2,.2), replace=TRUE) #' getExpectedDoublets(cl) #' @export -getExpectedDoublets <- function(x, dbr=NULL, only.heterotypic=TRUE){ +getExpectedDoublets <- function(x, dbr=NULL, only.heterotypic=TRUE, + dbr.per1k=0.008){ if(is(x,"SingleCellExperiment")){ clusters <- x$scDblFinder.clusters }else{ @@ -54,7 +56,7 @@ getExpectedDoublets <- function(x, dbr=NULL, only.heterotypic=TRUE){ if(all(grepl("^[0-9]*$",lvls))) lvls <- as.integer(lvls) clusters <- as.integer(clusters) ncells <- length(clusters) - if(is.null(dbr)) dbr <- (0.01*ncells/1000) + if(is.null(dbr)) dbr <- (dbr.per1k*ncells/1000) if(length(unique(clusters))==1) return(ncells*dbr) cs <- table(clusters)/ncells @@ -371,7 +373,7 @@ cxds2 <- function(x, whichDbls=c(), ntop=500, binThresh=NULL){ } # gets a global doublet rate from samples' doublet rates -.gdbr <- function(d, dbr=NULL){ +.gdbr <- function(d, dbr=NULL, dbr.per1k=0.008){ if(!is.null(dbr)){ if(length(dbr)==1) return(dbr) stopifnot(!is.null(d$sample)) @@ -384,7 +386,7 @@ cxds2 <- function(x, whichDbls=c(), ntop=500, binThresh=NULL){ ## estimate a global doublet rate sl <- as.numeric(table(d$sample, d$src=="real")[,2]) } - dbr <- (0.01*sl/1000) + dbr <- (dbr.per1k*sl/1000) sum(dbr*sl)/sum(sl) } diff --git a/R/scDblFinder.R b/R/scDblFinder.R index 7674c51..f77e35a 100644 --- a/R/scDblFinder.R +++ b/R/scDblFinder.R @@ -42,10 +42,9 @@ #' @param nfeatures The number of top features to use. Alternatively, a #' character vectors of feature names (e.g. highly-variable genes) to use. #' @param dims The number of dimensions used. -#' @param dbr The expected doublet rate. By default this is assumed to be 1\% -#' per thousand cells captured (so 4\% among 4000 thousand cells), which is -#' appropriate for 10x datasets. Corrections for homeotypic doublets will be -#' performed on the given rate. +#' @param dbr The expected doublet rate, i.e. the proportion of the cells +#' expected to be doublets. If omitted, will be calculated automatically based +#' on the `dbr.per1k` argument and the number of cells. #' @param dbr.sd The uncertainty range in the doublet rate, interpreted as #' a +/- around `dbr`. During thresholding, deviation from the expected doublet #' rate will be calculated from these boundaries, and will be considered null @@ -53,6 +52,11 @@ #' disable the uncertainty around the doublet rate, or to `dbr.sd=1` to disable #' any expectation of the number of doublets (thus letting the thresholding be #' entirely driven by the misclassification of artificial doublets). +#' @param dbr.per1k This is an alternative way of providing the expected doublet +#' rate as a fraction of the number of (the thousands of) cells captured. The +#' default, 0.008 (e.g. 3.2\% doublets among 4000 cells), is appropriate for +#' standard 10X chips. For High Throughput (HT) 10X chips, use half, i.e. +#' 0.004. (Some more recent chips might have this rate even lower). #' @param k Number of nearest neighbors (for KNN graph). If more than one value #' is given, the doublet density will be calculated at each k (and other values #' at the highest k), and all the information will be used by the classifier. @@ -195,8 +199,8 @@ scDblFinder <- function( sce, clusters=NULL, samples=NULL, clustCor=NULL, artificialDoublets=NULL, knownDoublets=NULL, knownUse=c("discard","positive"), dbr=NULL, dbr.sd=NULL, - nfeatures=1352, dims=20, k=NULL, removeUnidentifiable=TRUE, includePCs=19, - propRandom=0, propMarkers=0, aggregateFeatures=FALSE, + dbr.per1k=0.08, nfeatures=1352, dims=20, k=NULL, removeUnidentifiable=TRUE, + includePCs=19, propRandom=0, propMarkers=0, aggregateFeatures=FALSE, returnType=c("sce","table","full","counts"), score=c("xgb","weighted","ratio"), processing="default", metric="logloss", nrounds=0.25, max_depth=4, iter=3, trainingFeatures=NULL, unident.th=NULL, @@ -235,6 +239,7 @@ scDblFinder <- function( .checkPropArg(propMarkers) .checkPropArg(propRandom) .checkPropArg(dbr.sd) + .checkPropArg(dbr.per1k) .checkPropArg(dbr, acceptNull=TRUE) processing <- .checkProcArg(processing) @@ -288,10 +293,10 @@ scDblFinder <- function( } out <- tryCatch( scDblFinder(sce[sel_features,x], clusters=clusters, dims=dims, dbr=dbr, - dbr.sd=dbr.sd, clustCor=clustCor, unident.th=unident.th, - knownDoublets=knownDoublets, knownUse=knownUse, - artificialDoublets=artificialDoublets, k=k, - processing=processing, nfeatures=nfeatures, + dbr.sd=dbr.sd, dbr.per1k=dbr.per1k, clustCor=clustCor, + unident.th=unident.th, knownDoublets=knownDoublets, + knownUse=knownUse, artificialDoublets=artificialDoublets, + k=k, processing=processing, nfeatures=nfeatures, propRandom=propRandom, includePCs=includePCs, propMarkers=propMarkers, trainingFeatures=trainingFeatures, returnType=ifelse(returnType=="counts","counts","table"), @@ -311,12 +316,12 @@ scDblFinder <- function( if(multiSampleMode!="split"){ ## score and thresholding d <- .scDblscore(d, scoreType=score, threshold=threshold, dbr=dbr, - dbr.sd=dbr.sd, max_depth=max_depth, nrounds=nrounds, - iter=iter, BPPARAM=BPPARAM, verbose=verbose, + dbr.sd=dbr.sd, dbr.per1k=dbr.per1k, max_depth=max_depth, + nrounds=nrounds, iter=iter, BPPARAM=BPPARAM, features=trainingFeatures, unident.th=unident.th, metric=metric, filterUnidentifiable=removeUnidentifiable, perSample=multiSampleMode=="singleModelSplitThres", - includeSamples=TRUE) + includeSamples=TRUE, verbose=verbose) } if(returnType=="table") return(d) return(.scDblAddCD(sce, d)) @@ -465,7 +470,8 @@ scDblFinder <- function( } ex <- NULL - if(!is.null(clusters)) ex <- getExpectedDoublets(clusters, dbr) + if(!is.null(clusters)) ex <- getExpectedDoublets(clusters, dbr, + dbr.per1k=dbr.per1k) if(verbose) message("Evaluating kNN...") d <- .evaluateKNN(pca, ctype, ado2, expected=ex, k=k) @@ -492,10 +498,10 @@ scDblFinder <- function( includePCs <- includePCs[includePCs sum(d$type=="real")/3){ # enforce max prop excluded w1 <- head(order(d$type!="real", -d$score), @@ -700,8 +707,8 @@ scDblFinder <- function( } d <- DataFrame(d) if(threshold){ - th <- doubletThresholding( d, dbr=dbr, dbr.sd=dbr.sd, perSample=perSample, - ... ) + th <- doubletThresholding( d, dbr=dbr, dbr.sd=dbr.sd, dbr.per1k=dbr.per1k, + perSample=perSample, ... ) if(!is.null(d$sample) && length(th)>1){ d$class <- ifelse(d$score >= th[d$sample], "doublet", "singlet") }else{ @@ -711,7 +718,7 @@ scDblFinder <- function( ## set class of known (i.e. inputted) doublets: d$class[d$src=="real" & d$type=="doublet"] <- "doublet" if(!is.null(d$mostLikelyOrigin)){ - th.stats <- .getDoubletStats(d, th, dbr, dbr.sd) + th.stats <- .getDoubletStats(d, th, dbr, dbr.sd, dbr.per1k=dbr.per1k) metadata(d)$scDblFinder.stats <- th.stats } metadata(d)$scDblFinder.threshold <- th diff --git a/man/doubletThresholding.Rd b/man/doubletThresholding.Rd index 433605e..41a0f8b 100644 --- a/man/doubletThresholding.Rd +++ b/man/doubletThresholding.Rd @@ -8,6 +8,7 @@ doubletThresholding( d, dbr = NULL, dbr.sd = NULL, + dbr.per1k = 0.008, stringency = 0.5, p = 0.1, method = c("auto", "optim", "dbr", "griffiths"), @@ -26,6 +27,8 @@ doublet rate will be adjusted for homotypic doublets.} \item{dbr.sd}{The standard deviation of the doublet rate, representing the uncertainty in the estimate. Ignored if `method!="optim"`.} +\item{dbr.per1k}{The expected proportion of doublets per 1000 cells.} + \item{stringency}{A numeric value >0 and <1 which controls the relative weight of false positives (i.e. real cells) and false negatives (artificial doublets) in setting the threshold. A value of 0.5 gives equal weight to both; a higher value (e.g. 0.7) gives diff --git a/man/getExpectedDoublets.Rd b/man/getExpectedDoublets.Rd index be91ed8..8b8b7d6 100644 --- a/man/getExpectedDoublets.Rd +++ b/man/getExpectedDoublets.Rd @@ -4,7 +4,7 @@ \alias{getExpectedDoublets} \title{getExpectedDoublets} \usage{ -getExpectedDoublets(x, dbr = NULL, only.heterotypic = TRUE) +getExpectedDoublets(x, dbr = NULL, only.heterotypic = TRUE, dbr.per1k = 0.008) } \arguments{ \item{x}{A vector of cluster labels for each cell} @@ -13,6 +13,8 @@ getExpectedDoublets(x, dbr = NULL, only.heterotypic = TRUE) \item{only.heterotypic}{Logical; whether to return expectations only for heterotypic doublets} + +\item{dbr.per1k}{The expected proportion of doublets per 1000 cells.} } \value{ The expected number of doublets of each combination of clusters diff --git a/man/scDblFinder.Rd b/man/scDblFinder.Rd index 95cf321..b64cd66 100644 --- a/man/scDblFinder.Rd +++ b/man/scDblFinder.Rd @@ -14,6 +14,7 @@ scDblFinder( knownUse = c("discard", "positive"), dbr = NULL, dbr.sd = NULL, + dbr.per1k = 0.08, nfeatures = 1352, dims = 20, k = NULL, @@ -86,10 +87,9 @@ that `scDblFinder` does *not* enforce that the knownDoublets be necessarily called as doublets in the final classification, if they are not predicted as such.} -\item{dbr}{The expected doublet rate. By default this is assumed to be 1\% -per thousand cells captured (so 4\% among 4000 thousand cells), which is -appropriate for 10x datasets. Corrections for homeotypic doublets will be -performed on the given rate.} +\item{dbr}{The expected doublet rate, i.e. the proportion of the cells +expected to be doublets. If omitted, will be calculated automatically based +on the `dbr.per1k` argument and the number of cells.} \item{dbr.sd}{The uncertainty range in the doublet rate, interpreted as a +/- around `dbr`. During thresholding, deviation from the expected doublet @@ -99,6 +99,12 @@ within these boundaries. If NULL, will be 40\% of `dbr`. Set to `dbr.sd=0` to any expectation of the number of doublets (thus letting the thresholding be entirely driven by the misclassification of artificial doublets).} +\item{dbr.per1k}{This is an alternative way of providing the expected doublet +rate as a fraction of the number of (the thousands of) cells captured. The +default, 0.008 (e.g. 3.2\% doublets among 4000 cells), is appropriate for +standard 10X chips. For High Throughput (HT) 10X chips, use half, i.e. +0.004. (Some more recent chips might have this rate even lower).} + \item{nfeatures}{The number of top features to use. Alternatively, a character vectors of feature names (e.g. highly-variable genes) to use.} diff --git a/vignettes/scDblFinder.Rmd b/vignettes/scDblFinder.Rmd index 043258a..ceef64a 100644 --- a/vignettes/scDblFinder.Rmd +++ b/vignettes/scDblFinder.Rmd @@ -55,6 +55,7 @@ sce <- scDblFinder(sce, dbr=0.1) ``` For 10x data, it is usually safe to leave the `dbr` empty, and it will be automatically estimated. +(If using a chip other than the standard 10X, you might have to adjust it or the related `dbr.per1k` argument. `scDblFinder` will add a number of columns to the colData of `sce` prefixed with 'scDblFinder', the most important of which are: @@ -141,7 +142,7 @@ This score is available in the output, in the `scDblFinder.score` colData column ### Thresholding -Rather than thresholding on some arbitrary cutoff of the score, `scDblFinder` uses the expected number of doublets in combination to the misclassification rate to establish a threshold. Unless it is manually given through the `dbr` argument, the expected doublet rate is first estimated using the empirical rule of thumb applicable to 10X data, namely roughly 1\% per 1000 cells captured (so with 5000 cells, (0.01\*5)\*5000 = 250 doublets, and the more cells you capture the higher the chance of creating a doublet). If samples were specified, and if the `dbr` is automatically calculated, thresholding is performed separately across samples. +Rather than thresholding on some arbitrary cutoff of the score, `scDblFinder` uses the expected number of doublets in combination to the misclassification rate to establish a threshold. Unless it is manually given through the `dbr` argument, the expected doublet rate is first estimated (see below). If samples were specified, and if the `dbr` is automatically calculated, thresholding is performed separately across samples. Thresholding then tries to simultaneously minimize: 1) the classification error (in terms of the proportion of known doublets below the threshold) and 2) the deviation from the expected number of doublets among real cells (as a ratio of the total number of expected doublets within the range determined by `dbr.sd`, and adjusted for homotypic doublets). This means that, if you have no idea about the doublet rate, setting `dbr.sd=1` will make the threshold depend entirely on the misclassification rate. @@ -165,7 +166,14 @@ In addition, two frameworks are offered for testing the significance of enrichme ### Expected proportion of doublets -The expected proportion of doublets has no impact on the density of artificial doublets in the neighborhood, but impacts the classifier's score and, especially, where the cutoff will be placed. It is specified through the `dbr` parameter and the `dbr.sd` parameter (the latter specifies a +/- range around `dbr` within which the deviation from `dbr` will be considered null). For 10x data, the more cells you capture the higher the chance of creating a doublet, and Chromium documentation indicates a doublet rate of roughly 1\% per 1000 cells captures (so with 5000 cells, (0.01\*5)\*5000 = 250 doublets), and the default expected doublet rate will be set to this value (with a default standard deviation of 0.015). Note however that different protocols may create considerably more doublets, and that this should be updated accordingly. If you have unsure about the doublet rate, you might consider increasing `dbr.sd`, so that it is estimated mostl/purely from the misclassification error. +The expected proportion of doublets has no impact on the density of artificial doublets in the neighborhood, but impacts the classifier's score and, especially, where the cutoff will be placed. It is specified through the `dbr` parameter, as well as the `dbr.per1k` (specifies, if `dbr` is omitted, the rate per thousands cells from which to estimate it) and `dbr.sd` parameters (the latter specifies a +/- range around `dbr` within which the deviation from `dbr` will be considered null). + +For most platforms, the more cells you capture the higher the chance of creating a doublet. +For standard 10X data, the 10X documentation indicates a doublet rate of roughly 0.8\% per 1000 cells captured, which is the default value of `dbr.per1k`. This means that unless `dbr` is manually set, with 5000 cells, (0.008\*5)\*5000 = 200 doublets are expected, and the default expected doublet rate will be set to this value (with a default standard deviation of 0.015). Note however that different protocols may vary in the expected proportion of doublets. For example, the high-throughput (HT) 10X kit has an expected doublet rate of half the standard, i.e. 0.4\% per 1000 cells, so if using that kit, set `dbr.per1k=0.004`. + +Also note that strictly speaking, the proportion of doublets depends more on the number of cells inputted than that recovered. If your recovery rate was lower than expected, you might observe a higher doublet rate (see the [too-many doublets](#toomany) section below). + +The impact of the expected doublet rate on the thresholding will depend on how hard the classification task is: if it is easy, the called doublets will not depend much on the expected rate. If you have unsure about the doublet rate, you might consider increasing `dbr.sd`, so that it is estimated mostly (or even purely) from the misclassification error. ### Number of artificial doublets @@ -177,7 +185,7 @@ By default, `scDblFinder` will generate roughly as many artificial doublets as t ## Frequently-asked questions -### I'm getting way too many doublets called - what's going on? +### I'm getting way too many doublets called - what's going on? {#toomany} Then you most likely have a wrong doublet rate. If you did not provide it (`dbr` argument), the doublet rate will be calculated automatically using expected doublet rates from 10x, meaning that the more cells captured, the higher the doublet rates. If you have reasons to think that this is not applicable to your data, set the `dbr` manually. @@ -201,8 +209,6 @@ You will get this error if you have some cells that have zero reads (or a very l ### Identifying homotypic doublets - - Like other similar tools, scDblFinder focuses on identifying heterotypic doublets (formed by different cell types), and has only a low performance in identifying homotypic doublets (see [this preprint](https://doi.org/10.1101/2023.08.04.552078)). This can lead to disagreements with doublets called using cell hashes or SNPs in multiplexed samples, which capture both types of doublets similarly (and can miss intra-sample heterotypic doublets, especially if the multiplexing is low). This is why we treat these approaches as complementary. However, should you for some reason try to identify also homotypic doublets with scDblFinder, be sure to not to use the cluster-based approach, and to set `removeUnidentifiable=FALSE`. Otherwise, scDblFinder removes artificial doublets likely to be homotypic from training, therefore focusing the task on heterotypic doublets, but at the expense ot homotypic ones (which are typically deemed relatively harmless).