Multiome keeper cell metrics (#1303)

Added keeper cell metrics, an expected_cells input, and new documentation for library-level metrics.
broadinstitute · Jul 1, 2024 · 4c62783 · 4c62783
1 parent 3ccfbde
commit 4c62783
Show file tree

Hide file tree

Showing 13 changed files with 88 additions and 11 deletions.
diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md
@@ -1,3 +1,9 @@
+# 5.1.0
+2024-06-28 (Date of Last Commit)
+
+* Updated the STARsolo parameters for estimating cells to EmptyDrops_CR
+* Added an optional input for expected cells which is used for metric calculation
+
 # 5.0.0
 2024-05-20 (Date of Last Commit)
 

diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl
@@ -7,7 +7,7 @@ import "https://raw.githubusercontent.com/broadinstitute/CellBender/v0.3.0/wdl/c
 
 workflow Multiome {
 
-    String pipeline_version = "5.0.0"
+    String pipeline_version = "5.1.0"
 
     input {
         String input_id

diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md
@@ -1,3 +1,9 @@
+# 7.2.0
+2024-06-28 (Date of Last Commit)
+
+* Updated the STARsolo parameters for estimating cells to EmptyDrops_CR
+* Added an optional input for expected cells which is used for metric calculation
+
 # 7.1.0
 2024-05-20 (Date of Last Commit)
 

diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl
@@ -31,6 +31,7 @@ workflow Optimus {
     File annotations_gtf
     File? mt_genes
     String? soloMultiMappers = "Uniform"
+    Int? expected_cells
 
     # Chemistry options include: 2 or 3
     Int tenx_chemistry_version
@@ -65,7 +66,7 @@ workflow Optimus {
   # version of this pipeline
 
 
-  String pipeline_version = "7.1.0"
+  String pipeline_version = "7.2.0"
 
 
   # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays
@@ -168,7 +169,8 @@ workflow Optimus {
       align_features = STARsoloFastq.align_features,
       umipercell = STARsoloFastq.umipercell,
       input_id = input_id,
-      counting_mode = counting_mode
+      counting_mode = counting_mode,
+      expected_cells = expected_cells
   }
   if (counting_mode == "sc_rna"){
     call RunEmptyDrops.RunEmptyDrops {

diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md
@@ -1,3 +1,9 @@
+# 1.1.0
+2024-06-28 (Date of Last Commit)
+
+* Updated the STARsolo parameters for estimating cells to EmptyDrops_CR
+* Added an optional input for expected cells which is used for metric calculation
+
 # 1.0.0
 2024-06-26
 

diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl
@@ -5,7 +5,7 @@ import "../../../pipelines/skylab/optimus/Optimus.wdl" as optimus
 import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils
 import "../../../tasks/skylab/PairedTagUtils.wdl" as Demultiplexing
 workflow PairedTag {
-    String pipeline_version = "1.0.0"
+    String pipeline_version = "1.1.0"
 
     input {
         String input_id

diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md
@@ -1,3 +1,8 @@
+# 3.1.7
+2024-06-28 (Date of Last Commit)
+
+* Updated the STARsolo parameters for estimating cells to EmptyDrops_CR; this does not affect the slideseq pipeline
+
 # 3.1.6
 2024-05-20 (Date of Last Commit)
 

diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl
@@ -23,7 +23,7 @@ import "../../../tasks/skylab/MergeSortBam.wdl" as Merge
 
 workflow SlideSeq {
 
-    String pipeline_version = "3.1.6"
+    String pipeline_version = "3.1.7"
 
     input {
         Array[File] r1_fastq

diff --git a/...tseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/...tseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md
@@ -1,3 +1,8 @@
+# 1.3.5
+2024-06-28 (Date of Last Commit)
+
+* Updated the STARsolo parameters for estimating cells to EmptyDrops_CR; this does not impact the snSS2 pipeline
+
 # 1.3.4
 2024-04-12 (Date of Last Commit)
 

diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl
@@ -40,7 +40,7 @@ workflow MultiSampleSmartSeq2SingleNucleus {
       String? input_id_metadata_field
   }
   # Version of this pipeline
-  String pipeline_version = "1.3.4"
+  String pipeline_version = "1.3.5"
 
   if (false) {
      String? none = "None"

diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl
@@ -327,7 +327,8 @@ task STARsoloFastq {
         --soloBarcodeReadLength 0 \
         --soloCellReadStats Standard \
         ~{"--soloMultiMappers " + soloMultiMappers} \
-        --soloUMIfiltering MultiGeneUMI_CR
+        --soloUMIfiltering MultiGeneUMI_CR \
+        --soloCellFilter EmptyDrops_CR
 
     echo "UMI LEN " $UMILen
 
@@ -442,11 +443,12 @@ task MergeStarOutput {
     String? counting_mode
 
     String input_id
+    Int expected_cells = 3000
     File barcodes_single = barcodes[0]
     File features_single = features[0]
 
     #runtime values
-    String docker = "us.gcr.io/broad-gotc-prod/star-merge-npz:1.1"
+    String docker = "us.gcr.io/broad-gotc-prod/star-merge-npz:1.2"
     Int machine_mem_gb = 20
     Int cpu = 1
     Int disk = ceil(size(matrix, "Gi") * 2) + 10
@@ -491,7 +493,7 @@ task MergeStarOutput {
 
     # Running star for combined cell matrix
     # outputs will be called outputbarcodes.tsv. outputmatrix.mtx, and outputfeatures.tsv
-    STAR --runMode soloCellFiltering ./matrix ./output --soloCellFilter CellRanger2.2
+    STAR --runMode soloCellFiltering ./matrix ./output --soloCellFilter EmptyDrops_CR
 
     #list files
     echo "listing files"
@@ -567,7 +569,8 @@ task MergeStarOutput {
       ~{counting_mode} \
       ~{input_id} \
       outputbarcodes.tsv \
-      outputmatrix.mtx 
+      outputmatrix.mtx \
+      ~{expected_cells}
       tar -zcvf ~{input_id}.star_metrics.tar *.txt
     else
       echo "No text files found in the folder."

diff --git a/website/docs/Pipelines/Optimus_Pipeline/Library-metrics.md b/website/docs/Pipelines/Optimus_Pipeline/Library-metrics.md
@@ -0,0 +1,43 @@
+---
+sidebar_position: 5
+---
+
+# Optimus Library-level metrics
+
+The following table describes the library-level metrics produced by the Optimus workflow. These are calculated using custom Python scripts available in the warp-tools repository. The Optimus workflow aligns files in shards to parallelize computationally intensive steps. This results in multiple matrix market files and shard-level library metrics.
+
+To produce the library-level metrics here, the [combined_mtx.py script](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/star-merge-npz/scripts/combined_mtx.py) combines all the shard-level matrix market files into one raw mtx file. Then, STARsolo is run to filter this matrix to only those barcodes that meet STARsolo's criteria for cells (using the EmptyDrops_CR parameter). Lastly, the [combine_shard_metrics.py script](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/star-merge-npz/scripts/combine_shard_metrics.py) uses the filtered matrix and all of the shard-level metrics files produced by STARsolo to calculate the metrics below. Each of the scripts is called from the [MergeStarOutput task](https://github.com/broadinstitute/warp/blob/develop/tasks/skylab/StarAlign.wdl) of the Optimus workflow.
+
+
+| Metric | Description |
+| ---| --- |
+| number_of_reads | Total number of reads.|
+| sequencing_saturation | Proportion of unique molecular identifiers (UMIs) observed relative to the total number of possible UMIs. |
+| fraction_of_unique_reads_mapped_to_genome | Fraction of unique reads that map to the genome. |
+| fraction_of_unique_and_multiple_reads_mapped_to_genome| Fraction of both unique and multiple reads that map to the genome. |
+| fraction_of_reads_with_Q30_bases_in_rna | Fraction of reads with base quality score ≥ Q30 in RNA sequences. |
+| fraction_of_reads_with_Q30_bases_in_cb_and_umi | Fraction of reads with base quality score ≥ Q30 in cell barcode (CB) and unique molecular identifier (UMI). |
+| fraction_of_reads_with_valid_barcodes | Fraction of reads with valid cell barcodes.                                                                   |
+| reads_mapped_antisense_to_gene | Number of reads mapped antisense to gene regions.  |
+| reads_mapped_confidently_exonic | Number of reads mapped confidently to exonic regions. |
+| reads_mapped_confidently_to_genome | Number of reads mapped confidently to the genome. |
+| reads_mapped_confidently_to_intronic_regions | Number of reads mapped confidently to intronic regions. |
+| reads_mapped_confidently_to_transcriptome | Number of reads mapped confidently to the transcriptome. |
+| estimated_cells | Estimated number of cells from STARsolo using the EmptyDrops_CR parameter. |
+| umis_in_cells | Total number of unique molecular identifiers (UMIs) in cells. |
+| mean_umi_per_cell | Average number of UMIs per cell. |
+| median_umi_per_cell | Median number of UMIs per cell. |
+| unique_reads_in_cells_mapped_to_gene | Number of unique reads in cells mapped to genes. |
+| fraction_of_unique_reads_in_cells  | Fraction of unique reads in cells. |
+| mean_reads_per_cell | Average number of reads per cell. |
+| median_reads_per_cell | Median number of reads per cell. |
+| mean_gene_per_cell | Average number of genes per cell. |
+| median_gene_per_cell  | Median number of genes per cell. |
+| total_genes_unique_detected | Total number of unique genes detected.  |
+| percent_target | Percentage of target cells. Calculated as: estimated_cells / expected_cells |
+| percent_intronic_reads | Percentage of intronic reads. Calculated as: reads_mapped_confidently_to_intronic_regions / number_of_reads |
+| keeper_mean_reads_per_cell | Mean reads per cell for cells with >1500 genes or nuclei with >1000 genes. |
+| keeper_median_genes | Median genes per cell for cells with >1500 genes or nuclei with >1000 genes.  |
+| keeper_cells | Number of cells with >1500 genes or nuclei with >1000 genes.|
+| percent_keeper | Percentage of keeper cells. Calculated as: keeper_cells / estimated_cells |
+| percent_usable | Percentage of usable cells. Calculated as: keeper_cells / expected_cells |
diff --git a/website/docs/Pipelines/Optimus_Pipeline/README.md b/website/docs/Pipelines/Optimus_Pipeline/README.md
@@ -102,6 +102,7 @@ The example configuration files also contain metadata for the reference files, d
 | ignore_r1_read_length | Boolean that overrides a check on the 10x chemistry. Default is set to false. If true, the workflow will not ensure that the 10x_chemistry_version input matches the chemistry in the read 1 FASTQ. | "true" or "false" (default) | 
 | emptydrops_lower | UMI threshold for emptyDrops detection; default is 100. | N/A |
 | count_exons | Boolean indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**. If true, this option will output an additional layer for the h5ad file. By default, it is set to "false". If the parameter is true and used with sc_rna mode, the workflow will return an error. | "true" or "false" (default) |
+| expected_cells | Optional integer input for the expected number of cells, which is used to calculate library-level metrics. The default is set to 3,000. | N/A |
 
 #### Pseudogene handling
 The example Optimus reference files are downloaded directly from GENCODE (see Quickstart table) and are not modified to remove pseudogenes. This is in contrast to the [references created for Cell Ranger](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/release-notes/references#header) which remove pseudogenes and small RNAs.
@@ -255,7 +256,7 @@ The following table lists the output files produced from the pipeline. For sampl
 | cell_metrics | `<input_id>.cell-metrics.csv.gz` | Matrix of metrics by cells. | Compressed CSV |
 | gene_metrics | `<input_id>.gene-metrics.csv.gz` |  Matrix of metrics by genes. | Compressed CSV |
 | aligner_metrics | `<input_id>.star_metrics.tar` | Tarred metrics files produced by the STARsolo aligner; contains align features, cell reads, summary, and UMI per cell metrics files. | TXT |
-| library_metrics | `<input_id>_library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | CSV |
+| library_metrics | `<input_id>_library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. See [Library-level metrics](./Library-metrics.md) for how the metrics are calculated. | CSV |
 | multimappers_EM_matrix | `UniqueAndMult-EM.mtx` | Optional output produced when `soloMultiMappers` is "EM"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | MTX |
 | multimappers_Uniform_matrix | `UniqueAndMult-Uniform.mtx` | Optional output produced when `soloMultiMappers` is "Uniform"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | MTX |
 | multimappers_Rescue_matrix | `UniqueAndMult-Rescue.mtx` | Optional output produced when `soloMultiMappers` is "Rescue"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | MTX |