From 8841d77a99139b933105f4a3a464e42e410cee28 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Thu, 10 Aug 2023 12:10:05 -0500
Subject: [PATCH 1/9] initial cell type assignments script and process

---
 bin/classify_cellassign.R                     | 61 +++++++++++++++++
 ...fy_cellassign.py => predict_cellassign.py} |  0
 modules/classify-celltypes.nf                 | 65 ++++++++++++++-----
 3 files changed, 109 insertions(+), 17 deletions(-)
 create mode 100644 bin/classify_cellassign.R
 rename bin/{classify_cellassign.py => predict_cellassign.py} (100%)

diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R
new file mode 100644
index 00000000..e1a981ad
--- /dev/null
+++ b/bin/classify_cellassign.R
@@ -0,0 +1,61 @@
+#!/usr/bin/env Rscript
+
+# This script is used to read in the predictions from CellAssign and assign cell types
+
+# import libraries
+suppressPackageStartupMessages({
+  library(optparse)
+  library(SingleCellExperiment)
+})
+
+# set up arguments
+option_list <- list(
+  make_option(
+    opt_str = c("-i", "--input_sce_file"),
+    type = "character",
+    help = "path to rds file with input sce object"
+  ),
+  make_option(
+    opt_str = c("-o", "--output_sce_file"),
+    type = "character",
+    help = "path to output rds file to store annotated sce object. Must end in .rds"
+  ),
+  make_option(
+    opt_str = c("--cellassign_predictions"),
+    type = "character",
+    help = "path to tsv file containing the prediction matrix returned by running CellAssign"
+  ),
+  make_option(
+    opt_str = c("--reference_name"),
+    type = "character",
+    help = "name referring to the marker gene reference used for CellAssign"
+  )
+)
+
+opt <- parse_args(OptionParser(option_list = option_list))
+
+# check that input file file exists
+if(!file.exists(opt$input_sce_file)){
+  stop("Missing input SCE file")
+}
+
+if(!file.exists(opt$cellassign_predictions)){
+  stop("Missing CellAssign predictions file")
+}
+
+# read in input files 
+sce <- readr::read_rds(opt$input_sce_file)
+predictions <- readr::read_tsv(opt$cellassign_predictions) 
+
+celltype_assignments <- predictions |>
+  tidyr::pivot_longer(!barcode,
+                      names_to = "celltype",
+                      values_to = "prediction") |>
+  dplyr::group_by(barcode) |>
+  dplyr::slice_max(prediction, n = 1) |>
+  dplyr::ungroup()
+
+sce$cellassign_celltype_annotation <- celltype_assignments
+
+metadata(sce)$cellassign_predictions <- predictions
+metadata(sce)$cellassign_reference <- opt$reference_name
diff --git a/bin/classify_cellassign.py b/bin/predict_cellassign.py
similarity index 100%
rename from bin/classify_cellassign.py
rename to bin/predict_cellassign.py
diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
index e5999d18..a86db090 100644
--- a/modules/classify-celltypes.nf
+++ b/modules/classify-celltypes.nf
@@ -3,7 +3,6 @@ process classify_singleR {
     container params.SCPCATOOLS_CONTAINER
     label 'mem_8'
     label 'cpus_4'
-    publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy'
     input:
         tuple val(meta), path(processed_rds), path(singler_model_file)
     output:
@@ -28,16 +27,17 @@ process classify_singleR {
 
 process predict_cellassign {
   container params.SCPCATOOLS_CONTAINER
+  publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy'
   label 'mem_32'
   label 'cpus_12'
   input:
-    tuple val(meta), path(processed_hdf5), path(cellassign_reference_mtx)
+    tuple val(meta), path(processed_hdf5), path(cellassign_reference_mtx), val(ref_name)
   output:
-    tuple val(meta), path(cellassign_predictions)
+    tuple val(meta), path(cellassign_predictions), val(ref_name)
   script:
     cellassign_predictions = "${meta.library_id}_predictions.tsv"
     """
-    classify_cellassign.py \
+    predict_cellassign.py \
       --input_hdf5_file ${processed_hdf5} \
       --output_predictions ${cellassign_predictions} \
       --reference ${cellassign_reference_mtx} \
@@ -51,6 +51,27 @@ process predict_cellassign {
     """
 }
 
+process classify_cellassign {
+  container params.SCPCATOOLS_CONTAINER
+  publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy'
+  label 'mem_4'
+  label 'cpus_2'
+  input:
+    tuple val(meta), path(cellassign_predictions), path(input_rds), val(ref_name)
+  output:
+    tuple val(meta), path(annotated_rds)
+  script:
+    annotated_rds = "${meta.library_id}_annotated.rds"
+    """
+    classify_cellassign.R \
+      --input_sce_file ${input_rds} \
+      --output_sce_file ${annotated_rds} \
+      --cellassign_predictions ${cellassign_predictions} \
+      --reference_name ${ref_name}
+
+    """
+}
+
 workflow annotate_celltypes {
     take: processed_sce_channel
     main:
@@ -60,7 +81,8 @@ workflow annotate_celltypes {
         .map{[
           project_id = it.scpca_project_id,
           singler_model_file = "${params.singler_models_dir}/${it.singler_ref_file}",
-          cellassign_ref_file = "${params.cellassign_ref_dir}/${it.cellassign_ref_file}"
+          cellassign_ref_file = "${params.cellassign_ref_dir}/${it.cellassign_ref_file}",
+          cellassign_ref_name = it.cellassign_ref_name
         ]}
 
       // create channel grouped_celltype_ch as: [meta, processed sce object, SingleR reference model]
@@ -72,24 +94,33 @@ workflow annotate_celltypes {
 
       // creates [meta, processed, SingleR reference model]
       singler_input_ch = grouped_celltype_ch
-        .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file -> tuple(meta,
-                                                                                                  processed_rds,
-                                                                                                  singler_model_file
-                                                                                                  )}
+        .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta,
+                                                                                                                        processed_rds,
+                                                                                                                        singler_model_file
+                                                                                                                        )}
 
+      // creates [meta, processed hdf5, cellassign ref file, cell assign ref name]
       cellassign_input_ch = grouped_celltype_ch
-        .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file -> tuple(meta,
-                                                                                                  processed_hdf5,
-                                                                                                  cellassign_ref_file
-                                                                                                  )}
-
+        .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta,
+                                                                                                                        processed_hdf5,
+                                                                                                                        cellassign_ref_file,
+                                                                                                                        cellassign_ref_name
+                                                                                                                        )}
 
+      // get SingleR cell type assignments and add them to SCE
       classify_singleR(singler_input_ch)
 
+      // get cellassign predictions file
       predict_cellassign(cellassign_input_ch)
 
-    emit:
-      classify_singleR.out
-      predict_cellassign.out
+      // add cellassign annotations to the object with singleR results
+      all_celltype_assignments_ch = classify_singleR.out
+        // combines using meta from both singleR and cellassign, they should be the same
+        // resulting tuple should be [meta, singleR annotated rds, cellassign predictions]
+        .combine(predict_cellassign.out, by: 0)
+
+      classify_cellassign(all_celltype_assignments_ch)
+
+    emit: classify_cellassign.out
 
 }

From e65169e37558f5f2fd796f08e2e67a985e54ee99 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Fri, 11 Aug 2023 10:31:34 -0500
Subject: [PATCH 2/9] switch order of inputs

---
 modules/classify-celltypes.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
index a86db090..8737ecd6 100644
--- a/modules/classify-celltypes.nf
+++ b/modules/classify-celltypes.nf
@@ -57,7 +57,7 @@ process classify_cellassign {
   label 'mem_4'
   label 'cpus_2'
   input:
-    tuple val(meta), path(cellassign_predictions), path(input_rds), val(ref_name)
+    tuple val(meta), path(input_rds), path(cellassign_predictions), val(ref_name)
   output:
     tuple val(meta), path(annotated_rds)
   script:

From 91715783868306db5cf044b2b83c445d8355f4e9 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Fri, 11 Aug 2023 11:43:20 -0500
Subject: [PATCH 3/9] make sure cell type assignments are in the right order

---
 bin/classify_cellassign.R | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R
index e1a981ad..5a7ad862 100644
--- a/bin/classify_cellassign.R
+++ b/bin/classify_cellassign.R
@@ -43,9 +43,9 @@ if(!file.exists(opt$cellassign_predictions)){
   stop("Missing CellAssign predictions file")
 }
 
-# read in input files 
+# read in input files
 sce <- readr::read_rds(opt$input_sce_file)
-predictions <- readr::read_tsv(opt$cellassign_predictions) 
+predictions <- readr::read_tsv(opt$cellassign_predictions)
 
 celltype_assignments <- predictions |>
   tidyr::pivot_longer(!barcode,
@@ -55,7 +55,13 @@ celltype_assignments <- predictions |>
   dplyr::slice_max(prediction, n = 1) |>
   dplyr::ungroup()
 
-sce$cellassign_celltype_annotation <- celltype_assignments
+# join by barcode to make sure assignments are in the right order
+celltype_assignments <- data.frame(barcode = sce$barcodes) |>
+  dplyr::left_join(celltype_assignments, by = "barcode")
+
+# add into sce
+sce$cellassign_celltype_annotation <- celltype_assignments$celltype
+sce$cellassign_max_prediction <- celltype_assignments$prediction
 
 metadata(sce)$cellassign_predictions <- predictions
 metadata(sce)$cellassign_reference <- opt$reference_name

From d48ec60b31ab2d40935255ec788001be85738ad2 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Fri, 11 Aug 2023 11:43:37 -0500
Subject: [PATCH 4/9] publish predictions to checkpoints directory

---
 modules/classify-celltypes.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
index 8737ecd6..c13f291a 100644
--- a/modules/classify-celltypes.nf
+++ b/modules/classify-celltypes.nf
@@ -27,7 +27,7 @@ process classify_singleR {
 
 process predict_cellassign {
   container params.SCPCATOOLS_CONTAINER
-  publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy'
+  publishDir "${params.checkpoints_dir}/celltype/${meta.library_id}", mode: 'copy'
   label 'mem_32'
   label 'cpus_12'
   input:

From a8289788f502d7d17d5183fd16232a6b5b3838e4 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com>
Date: Fri, 11 Aug 2023 14:50:46 -0500
Subject: [PATCH 5/9] Apply suggestions from code review

Co-authored-by: Stephanie <stephanie.spielman@gmail.com>
---
 bin/classify_cellassign.R     | 13 +++++++++++--
 modules/classify-celltypes.nf |  1 +
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R
index 5a7ad862..70e0f6bd 100644
--- a/bin/classify_cellassign.R
+++ b/bin/classify_cellassign.R
@@ -1,6 +1,6 @@
 #!/usr/bin/env Rscript
 
-# This script is used to read in the predictions from CellAssign and assign cell types
+# This script is used to read in the predictions from CellAssign and assign cell types in the annotated RDS file
 
 # import libraries
 suppressPackageStartupMessages({
@@ -34,15 +34,24 @@ option_list <- list(
 
 opt <- parse_args(OptionParser(option_list = option_list))
 
-# check that input file file exists
+# check that input file exists
 if(!file.exists(opt$input_sce_file)){
   stop("Missing input SCE file")
 }
 
+# check that cellassign predictions file was provided
 if(!file.exists(opt$cellassign_predictions)){
   stop("Missing CellAssign predictions file")
 }
+# check that reference_name was provided
+if (is.null(opt$reference_name)) {
+  stop("Missing reference name")
+}
 
+# check that output file ends in rds
+if(!(stringr::str_ends(opt$output_sce_file, ".rds"))){
+  stop("output sce file name must end in .rds")
+}
 # read in input files
 sce <- readr::read_rds(opt$input_sce_file)
 predictions <- readr::read_tsv(opt$cellassign_predictions)
diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
index c13f291a..f26f83b5 100644
--- a/modules/classify-celltypes.nf
+++ b/modules/classify-celltypes.nf
@@ -119,6 +119,7 @@ workflow annotate_celltypes {
         // resulting tuple should be [meta, singleR annotated rds, cellassign predictions]
         .combine(predict_cellassign.out, by: 0)
 
+      // get CellAssign cell type predictions and add them to SCE
       classify_cellassign(all_celltype_assignments_ch)
 
     emit: classify_cellassign.out

From 9719001ba9a281b3cd992af5ff5036aaeb063283 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Fri, 11 Aug 2023 14:54:14 -0500
Subject: [PATCH 6/9] export rds

---
 bin/classify_cellassign.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R
index 70e0f6bd..4b096295 100644
--- a/bin/classify_cellassign.R
+++ b/bin/classify_cellassign.R
@@ -68,9 +68,13 @@ celltype_assignments <- predictions |>
 celltype_assignments <- data.frame(barcode = sce$barcodes) |>
   dplyr::left_join(celltype_assignments, by = "barcode")
 
-# add into sce
-sce$cellassign_celltype_annotation <- celltype_assignments$celltype
+# add cell type and prediction to colData
+sce$cellassign_celltype_annotation <- celltype_assignments
 sce$cellassign_max_prediction <- celltype_assignments$prediction
 
+# add entire predictions matrix and ref name to metadata
 metadata(sce)$cellassign_predictions <- predictions
 metadata(sce)$cellassign_reference <- opt$reference_name
+
+# export annotated object with cellassign assignments 
+readr::write_rds(sce, opt$output_sce_file)

From a3a8d5234d9f39e8f46c4eb69e851a849facf9b3 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Fri, 11 Aug 2023 14:55:12 -0500
Subject: [PATCH 7/9] add output file check to classify SingleR

---
 bin/classify_SingleR.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R
index 870081c8..fc46e2ed 100644
--- a/bin/classify_SingleR.R
+++ b/bin/classify_SingleR.R
@@ -56,6 +56,11 @@ if(!file.exists(opt$input_sce_file)){
   stop("Missing input SCE file")
 }
 
+# check that output file ends in rds
+if(!(stringr::str_ends(opt$output_sce_file, ".rds"))){
+  stop("output sce file name must end in .rds")
+}
+
 # check that references all exist
 singler_model_file <- opt$singler_model_file
 if(!file.exists(singler_model_file)) {

From 6449ca11f6a9307b33abdb5323fce1f8ef89086b Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Fri, 11 Aug 2023 14:57:20 -0500
Subject: [PATCH 8/9] update comments

---
 modules/classify-celltypes.nf | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
index f26f83b5..07c757db 100644
--- a/modules/classify-celltypes.nf
+++ b/modules/classify-celltypes.nf
@@ -82,6 +82,8 @@ workflow annotate_celltypes {
           project_id = it.scpca_project_id,
           singler_model_file = "${params.singler_models_dir}/${it.singler_ref_file}",
           cellassign_ref_file = "${params.cellassign_ref_dir}/${it.cellassign_ref_file}",
+          // add ref name for cellassign since we cannot store it in the cellassign output
+          // singler ref name does not need to be added because it is stored in the singler model
           cellassign_ref_name = it.cellassign_ref_name
         ]}
 
@@ -92,14 +94,14 @@ workflow annotate_celltypes {
         .combine(celltype_ch, by: 0)
         .map{it.drop(1)} // remove extra project ID
 
-      // creates [meta, processed, SingleR reference model]
+      // creates input for singleR [meta, processed, SingleR reference model]
       singler_input_ch = grouped_celltype_ch
         .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta,
                                                                                                                         processed_rds,
                                                                                                                         singler_model_file
                                                                                                                         )}
 
-      // creates [meta, processed hdf5, cellassign ref file, cell assign ref name]
+      // creates input for cellassign [meta, processed hdf5, cellassign ref file, cell assign ref name]
       cellassign_input_ch = grouped_celltype_ch
         .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta,
                                                                                                                         processed_hdf5,

From 9c97aeea74e49f61f02adea55e051a66c5748d7a Mon Sep 17 00:00:00 2001
From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com>
Date: Mon, 14 Aug 2023 11:38:22 -0500
Subject: [PATCH 9/9] Apply suggestions from code review

Co-authored-by: Stephanie <stephanie.spielman@gmail.com>
---
 bin/classify_SingleR.R    | 2 +-
 bin/classify_cellassign.R | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R
index fc46e2ed..2365753d 100644
--- a/bin/classify_SingleR.R
+++ b/bin/classify_SingleR.R
@@ -57,7 +57,7 @@ if(!file.exists(opt$input_sce_file)){
 }
 
 # check that output file ends in rds
-if(!(stringr::str_ends(opt$output_sce_file, ".rds"))){
+if (!(stringr::str_ends(opt$output_sce_file, ".rds"))){
   stop("output sce file name must end in .rds")
 }
 
diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R
index 4b096295..02ba3f99 100644
--- a/bin/classify_cellassign.R
+++ b/bin/classify_cellassign.R
@@ -35,12 +35,12 @@ option_list <- list(
 opt <- parse_args(OptionParser(option_list = option_list))
 
 # check that input file exists
-if(!file.exists(opt$input_sce_file)){
+if (!file.exists(opt$input_sce_file)){
   stop("Missing input SCE file")
 }
 
 # check that cellassign predictions file was provided
-if(!file.exists(opt$cellassign_predictions)){
+if (!file.exists(opt$cellassign_predictions)){
   stop("Missing CellAssign predictions file")
 }
 # check that reference_name was provided