From 8841d77a99139b933105f4a3a464e42e410cee28 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Thu, 10 Aug 2023 12:10:05 -0500 Subject: [PATCH 1/9] initial cell type assignments script and process --- bin/classify_cellassign.R | 61 +++++++++++++++++ ...fy_cellassign.py => predict_cellassign.py} | 0 modules/classify-celltypes.nf | 65 ++++++++++++++----- 3 files changed, 109 insertions(+), 17 deletions(-) create mode 100644 bin/classify_cellassign.R rename bin/{classify_cellassign.py => predict_cellassign.py} (100%) diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R new file mode 100644 index 00000000..e1a981ad --- /dev/null +++ b/bin/classify_cellassign.R @@ -0,0 +1,61 @@ +#!/usr/bin/env Rscript + +# This script is used to read in the predictions from CellAssign and assign cell types + +# import libraries +suppressPackageStartupMessages({ + library(optparse) + library(SingleCellExperiment) +}) + +# set up arguments +option_list <- list( + make_option( + opt_str = c("-i", "--input_sce_file"), + type = "character", + help = "path to rds file with input sce object" + ), + make_option( + opt_str = c("-o", "--output_sce_file"), + type = "character", + help = "path to output rds file to store annotated sce object. Must end in .rds" + ), + make_option( + opt_str = c("--cellassign_predictions"), + type = "character", + help = "path to tsv file containing the prediction matrix returned by running CellAssign" + ), + make_option( + opt_str = c("--reference_name"), + type = "character", + help = "name referring to the marker gene reference used for CellAssign" + ) +) + +opt <- parse_args(OptionParser(option_list = option_list)) + +# check that input file file exists +if(!file.exists(opt$input_sce_file)){ + stop("Missing input SCE file") +} + +if(!file.exists(opt$cellassign_predictions)){ + stop("Missing CellAssign predictions file") +} + +# read in input files +sce <- readr::read_rds(opt$input_sce_file) +predictions <- readr::read_tsv(opt$cellassign_predictions) + +celltype_assignments <- predictions |> + tidyr::pivot_longer(!barcode, + names_to = "celltype", + values_to = "prediction") |> + dplyr::group_by(barcode) |> + dplyr::slice_max(prediction, n = 1) |> + dplyr::ungroup() + +sce$cellassign_celltype_annotation <- celltype_assignments + +metadata(sce)$cellassign_predictions <- predictions +metadata(sce)$cellassign_reference <- opt$reference_name diff --git a/bin/classify_cellassign.py b/bin/predict_cellassign.py similarity index 100% rename from bin/classify_cellassign.py rename to bin/predict_cellassign.py diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf index e5999d18..a86db090 100644 --- a/modules/classify-celltypes.nf +++ b/modules/classify-celltypes.nf @@ -3,7 +3,6 @@ process classify_singleR { container params.SCPCATOOLS_CONTAINER label 'mem_8' label 'cpus_4' - publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy' input: tuple val(meta), path(processed_rds), path(singler_model_file) output: @@ -28,16 +27,17 @@ process classify_singleR { process predict_cellassign { container params.SCPCATOOLS_CONTAINER + publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy' label 'mem_32' label 'cpus_12' input: - tuple val(meta), path(processed_hdf5), path(cellassign_reference_mtx) + tuple val(meta), path(processed_hdf5), path(cellassign_reference_mtx), val(ref_name) output: - tuple val(meta), path(cellassign_predictions) + tuple val(meta), path(cellassign_predictions), val(ref_name) script: cellassign_predictions = "${meta.library_id}_predictions.tsv" """ - classify_cellassign.py \ + predict_cellassign.py \ --input_hdf5_file ${processed_hdf5} \ --output_predictions ${cellassign_predictions} \ --reference ${cellassign_reference_mtx} \ @@ -51,6 +51,27 @@ process predict_cellassign { """ } +process classify_cellassign { + container params.SCPCATOOLS_CONTAINER + publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy' + label 'mem_4' + label 'cpus_2' + input: + tuple val(meta), path(cellassign_predictions), path(input_rds), val(ref_name) + output: + tuple val(meta), path(annotated_rds) + script: + annotated_rds = "${meta.library_id}_annotated.rds" + """ + classify_cellassign.R \ + --input_sce_file ${input_rds} \ + --output_sce_file ${annotated_rds} \ + --cellassign_predictions ${cellassign_predictions} \ + --reference_name ${ref_name} + + """ +} + workflow annotate_celltypes { take: processed_sce_channel main: @@ -60,7 +81,8 @@ workflow annotate_celltypes { .map{[ project_id = it.scpca_project_id, singler_model_file = "${params.singler_models_dir}/${it.singler_ref_file}", - cellassign_ref_file = "${params.cellassign_ref_dir}/${it.cellassign_ref_file}" + cellassign_ref_file = "${params.cellassign_ref_dir}/${it.cellassign_ref_file}", + cellassign_ref_name = it.cellassign_ref_name ]} // create channel grouped_celltype_ch as: [meta, processed sce object, SingleR reference model] @@ -72,24 +94,33 @@ workflow annotate_celltypes { // creates [meta, processed, SingleR reference model] singler_input_ch = grouped_celltype_ch - .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file -> tuple(meta, - processed_rds, - singler_model_file - )} + .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta, + processed_rds, + singler_model_file + )} + // creates [meta, processed hdf5, cellassign ref file, cell assign ref name] cellassign_input_ch = grouped_celltype_ch - .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file -> tuple(meta, - processed_hdf5, - cellassign_ref_file - )} - + .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta, + processed_hdf5, + cellassign_ref_file, + cellassign_ref_name + )} + // get SingleR cell type assignments and add them to SCE classify_singleR(singler_input_ch) + // get cellassign predictions file predict_cellassign(cellassign_input_ch) - emit: - classify_singleR.out - predict_cellassign.out + // add cellassign annotations to the object with singleR results + all_celltype_assignments_ch = classify_singleR.out + // combines using meta from both singleR and cellassign, they should be the same + // resulting tuple should be [meta, singleR annotated rds, cellassign predictions] + .combine(predict_cellassign.out, by: 0) + + classify_cellassign(all_celltype_assignments_ch) + + emit: classify_cellassign.out } From e65169e37558f5f2fd796f08e2e67a985e54ee99 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 11 Aug 2023 10:31:34 -0500 Subject: [PATCH 2/9] switch order of inputs --- modules/classify-celltypes.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf index a86db090..8737ecd6 100644 --- a/modules/classify-celltypes.nf +++ b/modules/classify-celltypes.nf @@ -57,7 +57,7 @@ process classify_cellassign { label 'mem_4' label 'cpus_2' input: - tuple val(meta), path(cellassign_predictions), path(input_rds), val(ref_name) + tuple val(meta), path(input_rds), path(cellassign_predictions), val(ref_name) output: tuple val(meta), path(annotated_rds) script: From 91715783868306db5cf044b2b83c445d8355f4e9 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 11 Aug 2023 11:43:20 -0500 Subject: [PATCH 3/9] make sure cell type assignments are in the right order --- bin/classify_cellassign.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R index e1a981ad..5a7ad862 100644 --- a/bin/classify_cellassign.R +++ b/bin/classify_cellassign.R @@ -43,9 +43,9 @@ if(!file.exists(opt$cellassign_predictions)){ stop("Missing CellAssign predictions file") } -# read in input files +# read in input files sce <- readr::read_rds(opt$input_sce_file) -predictions <- readr::read_tsv(opt$cellassign_predictions) +predictions <- readr::read_tsv(opt$cellassign_predictions) celltype_assignments <- predictions |> tidyr::pivot_longer(!barcode, @@ -55,7 +55,13 @@ celltype_assignments <- predictions |> dplyr::slice_max(prediction, n = 1) |> dplyr::ungroup() -sce$cellassign_celltype_annotation <- celltype_assignments +# join by barcode to make sure assignments are in the right order +celltype_assignments <- data.frame(barcode = sce$barcodes) |> + dplyr::left_join(celltype_assignments, by = "barcode") + +# add into sce +sce$cellassign_celltype_annotation <- celltype_assignments$celltype +sce$cellassign_max_prediction <- celltype_assignments$prediction metadata(sce)$cellassign_predictions <- predictions metadata(sce)$cellassign_reference <- opt$reference_name From d48ec60b31ab2d40935255ec788001be85738ad2 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 11 Aug 2023 11:43:37 -0500 Subject: [PATCH 4/9] publish predictions to checkpoints directory --- modules/classify-celltypes.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf index 8737ecd6..c13f291a 100644 --- a/modules/classify-celltypes.nf +++ b/modules/classify-celltypes.nf @@ -27,7 +27,7 @@ process classify_singleR { process predict_cellassign { container params.SCPCATOOLS_CONTAINER - publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy' + publishDir "${params.checkpoints_dir}/celltype/${meta.library_id}", mode: 'copy' label 'mem_32' label 'cpus_12' input: From a8289788f502d7d17d5183fd16232a6b5b3838e4 Mon Sep 17 00:00:00 2001 From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:50:46 -0500 Subject: [PATCH 5/9] Apply suggestions from code review Co-authored-by: Stephanie --- bin/classify_cellassign.R | 13 +++++++++++-- modules/classify-celltypes.nf | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R index 5a7ad862..70e0f6bd 100644 --- a/bin/classify_cellassign.R +++ b/bin/classify_cellassign.R @@ -1,6 +1,6 @@ #!/usr/bin/env Rscript -# This script is used to read in the predictions from CellAssign and assign cell types +# This script is used to read in the predictions from CellAssign and assign cell types in the annotated RDS file # import libraries suppressPackageStartupMessages({ @@ -34,15 +34,24 @@ option_list <- list( opt <- parse_args(OptionParser(option_list = option_list)) -# check that input file file exists +# check that input file exists if(!file.exists(opt$input_sce_file)){ stop("Missing input SCE file") } +# check that cellassign predictions file was provided if(!file.exists(opt$cellassign_predictions)){ stop("Missing CellAssign predictions file") } +# check that reference_name was provided +if (is.null(opt$reference_name)) { + stop("Missing reference name") +} +# check that output file ends in rds +if(!(stringr::str_ends(opt$output_sce_file, ".rds"))){ + stop("output sce file name must end in .rds") +} # read in input files sce <- readr::read_rds(opt$input_sce_file) predictions <- readr::read_tsv(opt$cellassign_predictions) diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf index c13f291a..f26f83b5 100644 --- a/modules/classify-celltypes.nf +++ b/modules/classify-celltypes.nf @@ -119,6 +119,7 @@ workflow annotate_celltypes { // resulting tuple should be [meta, singleR annotated rds, cellassign predictions] .combine(predict_cellassign.out, by: 0) + // get CellAssign cell type predictions and add them to SCE classify_cellassign(all_celltype_assignments_ch) emit: classify_cellassign.out From 9719001ba9a281b3cd992af5ff5036aaeb063283 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 11 Aug 2023 14:54:14 -0500 Subject: [PATCH 6/9] export rds --- bin/classify_cellassign.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R index 70e0f6bd..4b096295 100644 --- a/bin/classify_cellassign.R +++ b/bin/classify_cellassign.R @@ -68,9 +68,13 @@ celltype_assignments <- predictions |> celltype_assignments <- data.frame(barcode = sce$barcodes) |> dplyr::left_join(celltype_assignments, by = "barcode") -# add into sce -sce$cellassign_celltype_annotation <- celltype_assignments$celltype +# add cell type and prediction to colData +sce$cellassign_celltype_annotation <- celltype_assignments sce$cellassign_max_prediction <- celltype_assignments$prediction +# add entire predictions matrix and ref name to metadata metadata(sce)$cellassign_predictions <- predictions metadata(sce)$cellassign_reference <- opt$reference_name + +# export annotated object with cellassign assignments +readr::write_rds(sce, opt$output_sce_file) From a3a8d5234d9f39e8f46c4eb69e851a849facf9b3 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 11 Aug 2023 14:55:12 -0500 Subject: [PATCH 7/9] add output file check to classify SingleR --- bin/classify_SingleR.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R index 870081c8..fc46e2ed 100644 --- a/bin/classify_SingleR.R +++ b/bin/classify_SingleR.R @@ -56,6 +56,11 @@ if(!file.exists(opt$input_sce_file)){ stop("Missing input SCE file") } +# check that output file ends in rds +if(!(stringr::str_ends(opt$output_sce_file, ".rds"))){ + stop("output sce file name must end in .rds") +} + # check that references all exist singler_model_file <- opt$singler_model_file if(!file.exists(singler_model_file)) { From 6449ca11f6a9307b33abdb5323fce1f8ef89086b Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 11 Aug 2023 14:57:20 -0500 Subject: [PATCH 8/9] update comments --- modules/classify-celltypes.nf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf index f26f83b5..07c757db 100644 --- a/modules/classify-celltypes.nf +++ b/modules/classify-celltypes.nf @@ -82,6 +82,8 @@ workflow annotate_celltypes { project_id = it.scpca_project_id, singler_model_file = "${params.singler_models_dir}/${it.singler_ref_file}", cellassign_ref_file = "${params.cellassign_ref_dir}/${it.cellassign_ref_file}", + // add ref name for cellassign since we cannot store it in the cellassign output + // singler ref name does not need to be added because it is stored in the singler model cellassign_ref_name = it.cellassign_ref_name ]} @@ -92,14 +94,14 @@ workflow annotate_celltypes { .combine(celltype_ch, by: 0) .map{it.drop(1)} // remove extra project ID - // creates [meta, processed, SingleR reference model] + // creates input for singleR [meta, processed, SingleR reference model] singler_input_ch = grouped_celltype_ch .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta, processed_rds, singler_model_file )} - // creates [meta, processed hdf5, cellassign ref file, cell assign ref name] + // creates input for cellassign [meta, processed hdf5, cellassign ref file, cell assign ref name] cellassign_input_ch = grouped_celltype_ch .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta, processed_hdf5, From 9c97aeea74e49f61f02adea55e051a66c5748d7a Mon Sep 17 00:00:00 2001 From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com> Date: Mon, 14 Aug 2023 11:38:22 -0500 Subject: [PATCH 9/9] Apply suggestions from code review Co-authored-by: Stephanie --- bin/classify_SingleR.R | 2 +- bin/classify_cellassign.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R index fc46e2ed..2365753d 100644 --- a/bin/classify_SingleR.R +++ b/bin/classify_SingleR.R @@ -57,7 +57,7 @@ if(!file.exists(opt$input_sce_file)){ } # check that output file ends in rds -if(!(stringr::str_ends(opt$output_sce_file, ".rds"))){ +if (!(stringr::str_ends(opt$output_sce_file, ".rds"))){ stop("output sce file name must end in .rds") } diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R index 4b096295..02ba3f99 100644 --- a/bin/classify_cellassign.R +++ b/bin/classify_cellassign.R @@ -35,12 +35,12 @@ option_list <- list( opt <- parse_args(OptionParser(option_list = option_list)) # check that input file exists -if(!file.exists(opt$input_sce_file)){ +if (!file.exists(opt$input_sce_file)){ stop("Missing input SCE file") } # check that cellassign predictions file was provided -if(!file.exists(opt$cellassign_predictions)){ +if (!file.exists(opt$cellassign_predictions)){ stop("Missing CellAssign predictions file") } # check that reference_name was provided