Merge pull request #402 from AlexsLemonade/allyhawkins/cellassign-ass…

…ignments Add CellAssign assignments to SCE object
AlexsLemonade · Aug 14, 2023 · 9efd773 · 9efd773
2 parents 2a78a01 + 9c97aee
commit 9efd773
Show file tree

Hide file tree

Showing 4 changed files with 137 additions and 18 deletions.
diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R
@@ -56,6 +56,11 @@ if(!file.exists(opt$input_sce_file)){
   stop("Missing input SCE file")
 }
 
+# check that output file ends in rds
+if (!(stringr::str_ends(opt$output_sce_file, ".rds"))){
+  stop("output sce file name must end in .rds")
+}
+
 # check that references all exist
 singler_model_file <- opt$singler_model_file
 if(!file.exists(singler_model_file)) {

diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R
@@ -0,0 +1,80 @@
+#!/usr/bin/env Rscript
+
+# This script is used to read in the predictions from CellAssign and assign cell types in the annotated RDS file
+
+# import libraries
+suppressPackageStartupMessages({
+  library(optparse)
+  library(SingleCellExperiment)
+})
+
+# set up arguments
+option_list <- list(
+  make_option(
+    opt_str = c("-i", "--input_sce_file"),
+    type = "character",
+    help = "path to rds file with input sce object"
+  ),
+  make_option(
+    opt_str = c("-o", "--output_sce_file"),
+    type = "character",
+    help = "path to output rds file to store annotated sce object. Must end in .rds"
+  ),
+  make_option(
+    opt_str = c("--cellassign_predictions"),
+    type = "character",
+    help = "path to tsv file containing the prediction matrix returned by running CellAssign"
+  ),
+  make_option(
+    opt_str = c("--reference_name"),
+    type = "character",
+    help = "name referring to the marker gene reference used for CellAssign"
+  )
+)
+
+opt <- parse_args(OptionParser(option_list = option_list))
+
+# check that input file exists
+if (!file.exists(opt$input_sce_file)){
+  stop("Missing input SCE file")
+}
+
+# check that cellassign predictions file was provided
+if (!file.exists(opt$cellassign_predictions)){
+  stop("Missing CellAssign predictions file")
+}
+# check that reference_name was provided
+if (is.null(opt$reference_name)) {
+  stop("Missing reference name")
+}
+
+# check that output file ends in rds
+if(!(stringr::str_ends(opt$output_sce_file, ".rds"))){
+  stop("output sce file name must end in .rds")
+}
+# read in input files
+sce <- readr::read_rds(opt$input_sce_file)
+predictions <- readr::read_tsv(opt$cellassign_predictions)
+
+celltype_assignments <- predictions |>
+  tidyr::pivot_longer(!barcode,
+                      names_to = "celltype",
+                      values_to = "prediction") |>
+  dplyr::group_by(barcode) |>
+  dplyr::slice_max(prediction, n = 1) |>
+  dplyr::ungroup()
+
+# join by barcode to make sure assignments are in the right order
+celltype_assignments <- data.frame(barcode = sce$barcodes) |>
+  dplyr::left_join(celltype_assignments, by = "barcode")
+
+# add cell type and prediction to colData
+sce$cellassign_celltype_annotation <- celltype_assignments
+sce$cellassign_max_prediction <- celltype_assignments$prediction
+
+# add entire predictions matrix and ref name to metadata
+metadata(sce)$cellassign_predictions <- predictions
+metadata(sce)$cellassign_reference <- opt$reference_name
+
+# export annotated object with cellassign assignments 
+readr::write_rds(sce, opt$output_sce_file)
diff --git a/bin/classify_cellassign.py → bin/predict_cellassign.py b/bin/classify_cellassign.py → bin/predict_cellassign.py
diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
@@ -3,7 +3,6 @@ process classify_singleR {
     container params.SCPCATOOLS_CONTAINER
     label 'mem_8'
     label 'cpus_4'
-    publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy'
     input:
         tuple val(meta), path(processed_rds), path(singler_model_file)
     output:
@@ -28,16 +27,17 @@ process classify_singleR {
 
 process predict_cellassign {
   container params.SCPCATOOLS_CONTAINER
+  publishDir "${params.checkpoints_dir}/celltype/${meta.library_id}", mode: 'copy'
   label 'mem_32'
   label 'cpus_12'
   input:
-    tuple val(meta), path(processed_hdf5), path(cellassign_reference_mtx)
+    tuple val(meta), path(processed_hdf5), path(cellassign_reference_mtx), val(ref_name)
   output:
-    tuple val(meta), path(cellassign_predictions)
+    tuple val(meta), path(cellassign_predictions), val(ref_name)
   script:
     cellassign_predictions = "${meta.library_id}_predictions.tsv"
     """
-    classify_cellassign.py \
+    predict_cellassign.py \
       --input_hdf5_file ${processed_hdf5} \
       --output_predictions ${cellassign_predictions} \
       --reference ${cellassign_reference_mtx} \
@@ -51,6 +51,27 @@ process predict_cellassign {
     """
 }
 
+process classify_cellassign {
+  container params.SCPCATOOLS_CONTAINER
+  publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy'
+  label 'mem_4'
+  label 'cpus_2'
+  input:
+    tuple val(meta), path(input_rds), path(cellassign_predictions), val(ref_name)
+  output:
+    tuple val(meta), path(annotated_rds)
+  script:
+    annotated_rds = "${meta.library_id}_annotated.rds"
+    """
+    classify_cellassign.R \
+      --input_sce_file ${input_rds} \
+      --output_sce_file ${annotated_rds} \
+      --cellassign_predictions ${cellassign_predictions} \
+      --reference_name ${ref_name}
+
+    """
+}
+
 workflow annotate_celltypes {
     take: processed_sce_channel
     main:
@@ -60,7 +81,10 @@ workflow annotate_celltypes {
         .map{[
           project_id = it.scpca_project_id,
           singler_model_file = "${params.singler_models_dir}/${it.singler_ref_file}",
-          cellassign_ref_file = "${params.cellassign_ref_dir}/${it.cellassign_ref_file}"
+          cellassign_ref_file = "${params.cellassign_ref_dir}/${it.cellassign_ref_file}",
+          // add ref name for cellassign since we cannot store it in the cellassign output
+          // singler ref name does not need to be added because it is stored in the singler model
+          cellassign_ref_name = it.cellassign_ref_name
         ]}
 
       // create channel grouped_celltype_ch as: [meta, processed sce object, SingleR reference model]
@@ -70,26 +94,36 @@ workflow annotate_celltypes {
         .combine(celltype_ch, by: 0)
         .map{it.drop(1)} // remove extra project ID
 
-      // creates [meta, processed, SingleR reference model]
+      // creates input for singleR [meta, processed, SingleR reference model]
       singler_input_ch = grouped_celltype_ch
-        .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file -> tuple(meta,
-                                                                                                  processed_rds,
-                                                                                                  singler_model_file
-                                                                                                  )}
+        .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta,
+                                                                                                                        processed_rds,
+                                                                                                                        singler_model_file
+                                                                                                                        )}
 
+      // creates input for cellassign [meta, processed hdf5, cellassign ref file, cell assign ref name]
       cellassign_input_ch = grouped_celltype_ch
-        .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file -> tuple(meta,
-                                                                                                  processed_hdf5,
-                                                                                                  cellassign_ref_file
-                                                                                                  )}
-
+        .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta,
+                                                                                                                        processed_hdf5,
+                                                                                                                        cellassign_ref_file,
+                                                                                                                        cellassign_ref_name
+                                                                                                                        )}
 
+      // get SingleR cell type assignments and add them to SCE
       classify_singleR(singler_input_ch)
 
+      // get cellassign predictions file
       predict_cellassign(cellassign_input_ch)
 
-    emit:
-      classify_singleR.out
-      predict_cellassign.out
+      // add cellassign annotations to the object with singleR results
+      all_celltype_assignments_ch = classify_singleR.out
+        // combines using meta from both singleR and cellassign, they should be the same
+        // resulting tuple should be [meta, singleR annotated rds, cellassign predictions]
+        .combine(predict_cellassign.out, by: 0)
+
+      // get CellAssign cell type predictions and add them to SCE
+      classify_cellassign(all_celltype_assignments_ch)
+
+    emit: classify_cellassign.out
 
 }