diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R index 870081c8..2365753d 100644 --- a/bin/classify_SingleR.R +++ b/bin/classify_SingleR.R @@ -56,6 +56,11 @@ if(!file.exists(opt$input_sce_file)){ stop("Missing input SCE file") } +# check that output file ends in rds +if (!(stringr::str_ends(opt$output_sce_file, ".rds"))){ + stop("output sce file name must end in .rds") +} + # check that references all exist singler_model_file <- opt$singler_model_file if(!file.exists(singler_model_file)) { diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R new file mode 100644 index 00000000..02ba3f99 --- /dev/null +++ b/bin/classify_cellassign.R @@ -0,0 +1,80 @@ +#!/usr/bin/env Rscript + +# This script is used to read in the predictions from CellAssign and assign cell types in the annotated RDS file + +# import libraries +suppressPackageStartupMessages({ + library(optparse) + library(SingleCellExperiment) +}) + +# set up arguments +option_list <- list( + make_option( + opt_str = c("-i", "--input_sce_file"), + type = "character", + help = "path to rds file with input sce object" + ), + make_option( + opt_str = c("-o", "--output_sce_file"), + type = "character", + help = "path to output rds file to store annotated sce object. Must end in .rds" + ), + make_option( + opt_str = c("--cellassign_predictions"), + type = "character", + help = "path to tsv file containing the prediction matrix returned by running CellAssign" + ), + make_option( + opt_str = c("--reference_name"), + type = "character", + help = "name referring to the marker gene reference used for CellAssign" + ) +) + +opt <- parse_args(OptionParser(option_list = option_list)) + +# check that input file exists +if (!file.exists(opt$input_sce_file)){ + stop("Missing input SCE file") +} + +# check that cellassign predictions file was provided +if (!file.exists(opt$cellassign_predictions)){ + stop("Missing CellAssign predictions file") +} +# check that reference_name was provided +if (is.null(opt$reference_name)) { + stop("Missing reference name") +} + +# check that output file ends in rds +if(!(stringr::str_ends(opt$output_sce_file, ".rds"))){ + stop("output sce file name must end in .rds") +} +# read in input files +sce <- readr::read_rds(opt$input_sce_file) +predictions <- readr::read_tsv(opt$cellassign_predictions) + +celltype_assignments <- predictions |> + tidyr::pivot_longer(!barcode, + names_to = "celltype", + values_to = "prediction") |> + dplyr::group_by(barcode) |> + dplyr::slice_max(prediction, n = 1) |> + dplyr::ungroup() + +# join by barcode to make sure assignments are in the right order +celltype_assignments <- data.frame(barcode = sce$barcodes) |> + dplyr::left_join(celltype_assignments, by = "barcode") + +# add cell type and prediction to colData +sce$cellassign_celltype_annotation <- celltype_assignments +sce$cellassign_max_prediction <- celltype_assignments$prediction + +# add entire predictions matrix and ref name to metadata +metadata(sce)$cellassign_predictions <- predictions +metadata(sce)$cellassign_reference <- opt$reference_name + +# export annotated object with cellassign assignments +readr::write_rds(sce, opt$output_sce_file) diff --git a/bin/classify_cellassign.py b/bin/predict_cellassign.py similarity index 100% rename from bin/classify_cellassign.py rename to bin/predict_cellassign.py diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf index e5999d18..07c757db 100644 --- a/modules/classify-celltypes.nf +++ b/modules/classify-celltypes.nf @@ -3,7 +3,6 @@ process classify_singleR { container params.SCPCATOOLS_CONTAINER label 'mem_8' label 'cpus_4' - publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy' input: tuple val(meta), path(processed_rds), path(singler_model_file) output: @@ -28,16 +27,17 @@ process classify_singleR { process predict_cellassign { container params.SCPCATOOLS_CONTAINER + publishDir "${params.checkpoints_dir}/celltype/${meta.library_id}", mode: 'copy' label 'mem_32' label 'cpus_12' input: - tuple val(meta), path(processed_hdf5), path(cellassign_reference_mtx) + tuple val(meta), path(processed_hdf5), path(cellassign_reference_mtx), val(ref_name) output: - tuple val(meta), path(cellassign_predictions) + tuple val(meta), path(cellassign_predictions), val(ref_name) script: cellassign_predictions = "${meta.library_id}_predictions.tsv" """ - classify_cellassign.py \ + predict_cellassign.py \ --input_hdf5_file ${processed_hdf5} \ --output_predictions ${cellassign_predictions} \ --reference ${cellassign_reference_mtx} \ @@ -51,6 +51,27 @@ process predict_cellassign { """ } +process classify_cellassign { + container params.SCPCATOOLS_CONTAINER + publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy' + label 'mem_4' + label 'cpus_2' + input: + tuple val(meta), path(input_rds), path(cellassign_predictions), val(ref_name) + output: + tuple val(meta), path(annotated_rds) + script: + annotated_rds = "${meta.library_id}_annotated.rds" + """ + classify_cellassign.R \ + --input_sce_file ${input_rds} \ + --output_sce_file ${annotated_rds} \ + --cellassign_predictions ${cellassign_predictions} \ + --reference_name ${ref_name} + + """ +} + workflow annotate_celltypes { take: processed_sce_channel main: @@ -60,7 +81,10 @@ workflow annotate_celltypes { .map{[ project_id = it.scpca_project_id, singler_model_file = "${params.singler_models_dir}/${it.singler_ref_file}", - cellassign_ref_file = "${params.cellassign_ref_dir}/${it.cellassign_ref_file}" + cellassign_ref_file = "${params.cellassign_ref_dir}/${it.cellassign_ref_file}", + // add ref name for cellassign since we cannot store it in the cellassign output + // singler ref name does not need to be added because it is stored in the singler model + cellassign_ref_name = it.cellassign_ref_name ]} // create channel grouped_celltype_ch as: [meta, processed sce object, SingleR reference model] @@ -70,26 +94,36 @@ workflow annotate_celltypes { .combine(celltype_ch, by: 0) .map{it.drop(1)} // remove extra project ID - // creates [meta, processed, SingleR reference model] + // creates input for singleR [meta, processed, SingleR reference model] singler_input_ch = grouped_celltype_ch - .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file -> tuple(meta, - processed_rds, - singler_model_file - )} + .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta, + processed_rds, + singler_model_file + )} + // creates input for cellassign [meta, processed hdf5, cellassign ref file, cell assign ref name] cellassign_input_ch = grouped_celltype_ch - .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file -> tuple(meta, - processed_hdf5, - cellassign_ref_file - )} - + .map{meta, processed_rds, processed_hdf5, singler_model_file, cellassign_ref_file, cellassign_ref_name -> tuple(meta, + processed_hdf5, + cellassign_ref_file, + cellassign_ref_name + )} + // get SingleR cell type assignments and add them to SCE classify_singleR(singler_input_ch) + // get cellassign predictions file predict_cellassign(cellassign_input_ch) - emit: - classify_singleR.out - predict_cellassign.out + // add cellassign annotations to the object with singleR results + all_celltype_assignments_ch = classify_singleR.out + // combines using meta from both singleR and cellassign, they should be the same + // resulting tuple should be [meta, singleR annotated rds, cellassign predictions] + .combine(predict_cellassign.out, by: 0) + + // get CellAssign cell type predictions and add them to SCE + classify_cellassign(all_celltype_assignments_ch) + + emit: classify_cellassign.out }