Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CellAssign process #479

Merged
merged 12 commits into from
Oct 3, 2023
21 changes: 18 additions & 3 deletions bin/classify_SingleR.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ option_list <- list(
make_option(
opt_str = c("--singler_model_file"),
type = "character",
help = "path to file containing a single model generated for SingleR annotation"
help = "path to file containing a single model generated for SingleR annotation.
File name is expected to be in form: <model name>_model.rds."
),
make_option(
opt_str = c("--output_singler_annotations_file"),
Expand Down Expand Up @@ -55,19 +56,28 @@ if (!file.exists(opt$sce_file)) {
stop("Missing SCE file")
}

# check that output files have the righr extensions
# check that output files have the right extensions
if (!(stringr::str_ends(opt$output_singler_results_file, ".rds"))) {
stop("output SingleR result file name must end in .rds")
}
if (!(stringr::str_ends(opt$output_singler_annotations_file, ".tsv"))) {
stop("output SingleR annotations file name must end in .tsv")
}

# check that references all exist
# check that reference exists and filename is properly formatted
singler_model_file <- opt$singler_model_file
if (!file.exists(singler_model_file)) {
stop(glue::glue("Provided model file {singler_model_file} is missing."))
}
if (!(stringr::str_ends(singler_model_file, "_model.rds"))) {
stop(glue::glue("Provided model file {singler_model_file} must end in .rds."))
sjspielman marked this conversation as resolved.
Show resolved Hide resolved
}

# get & check reference name
reference_name <- stringr::str_remove(singler_model_file, "_model.rds")
if (reference_name == "") {
stop(glue::glue("Provided model file name must be formatted as `<model_name>_model.rds`"))
}
sjspielman marked this conversation as resolved.
Show resolved Hide resolved

# set up multiprocessing params
if (opt$threads > 1) {
Expand All @@ -90,6 +100,10 @@ singler_results <- SingleR::classifySingleR(
BPPARAM = bp_param
)

# add reference name as metadata in singler_results DataFrame
metadata(singler_results)$reference_name <- reference_name


# export results

# first, a stand-alone tsv of annotations with both pruned and full labels
Expand All @@ -102,6 +116,7 @@ readr::write_tsv(
)

# next, the full result to a compressed rds

sjspielman marked this conversation as resolved.
Show resolved Hide resolved
readr::write_rds(
singler_results,
opt$output_singler_results_file,
Expand Down
91 changes: 67 additions & 24 deletions modules/classify-celltypes.nf
Original file line number Diff line number Diff line change
Expand Up @@ -38,33 +38,52 @@ process classify_singler {
}


process predict_cellassign {
process classify_cellassign {
container params.SCPCATOOLS_CONTAINER
publishDir "${params.checkpoints_dir}/celltype/${meta.library_id}", mode: 'copy'
publishDir (
path: "${meta.celltype_publish_dir}",
mode: 'copy',
pattern: "${cellassign_dir}"
)
label 'mem_32'
label 'cpus_12'
input:
tuple val(meta), path(processed_hdf5), path(cellassign_reference_mtx), val(ref_name)
tuple val(meta), path(processed_rds), path(cellassign_reference_file)
output:
tuple val(meta), path(cellassign_predictions), val(ref_name)
tuple val(meta.library_id), path(cellassign_dir)
script:
cellassign_predictions = "${meta.library_id}_predictions.tsv"
cellassign_dir = file(meta.cellassign_dir).name

"""
# create output directory
mkdir "${cellassign_dir}"

# Convert SCE to AnnData
sce_to_anndata.R \
--input_sce_file "${processed_rds}" \
--output_rna_h5 processed.hdf5

# Run CellAssign
predict_cellassign.py \
--input_hdf5_file ${processed_hdf5} \
--output_predictions ${cellassign_predictions} \
--reference ${cellassign_reference_mtx} \
--input_hdf5_file processed.hdf5
--output_predictions "${cellassign_dir}/cellassign_predictions.tsv" \
--reference "${cellassign_reference_file}" \
--seed ${params.seed} \
--threads ${task.cpus}

# write out meta file
echo "${Utils.makeJson(meta)}" > "${cellassign_dir}/scpca-meta.json"
"""
stub:
cellassign_predictions = "${meta.library_id}_predictions.tsv"
cellassign_dir = file(meta.cellassign_dir).name
"""
touch "${cellassign_predictions}"
mkdir "${cellassign_dir}"
echo "${Utils.makeJson(meta)}" > "${cellassign_dir}/scpca-meta.json"
"""
}

process classify_cellassign {
// TODO: overhaul this process next
process add_celltypes_to_sce {
container params.SCPCATOOLS_CONTAINER
publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy'
label 'mem_4'
Expand Down Expand Up @@ -101,30 +120,27 @@ workflow annotate_celltypes {
// singler model file
Utils.parseNA(it.singler_ref_file) ? "${params.singler_models_dir}/${it.singler_ref_file}" : null,
// cellassign reference file
Utils.parseNA(it.cellassign_ref_file) ? "${params.cellassign_ref_dir}/${it.cellassign_ref_file}" : null,
// add ref name for cellassign since we cannot store it in the cellassign output
// singler ref name does not need to be added because it is stored in the singler model
Utils.parseNA(it.cellassign_ref_name)
Utils.parseNA(it.cellassign_ref_file) ? "${params.cellassign_ref_dir}/${it.cellassign_ref_file}" : null
]}

// create input for typing: [augmented meta, processed_sce]
celltype_input_ch = processed_sce_channel
.map{[it[0]["project_id"]] + it}
.combine(celltype_ch, by: 0)
// current contents: [project_id, meta, processed_sce, singler_model_file, cellassign_reference_file, cellassign_reference_name]
// current contents: [project_id, meta, processed_sce, singler_model_file, cellassign_reference_file]
// add values to meta for later use
.map{ project_id, meta, processed_sce, singler_model_file, cellassign_reference_file, cellassign_reference_name ->
.map{ project_id, meta, processed_sce, singler_model_file, cellassign_reference_file ->
meta.celltype_publish_dir = "${params.checkpoints_dir}/celltype/${meta.library_id}";
meta.singler_dir = "${meta.celltype_publish_dir}/${meta.library_id}_singler";
meta.cellassign_dir = "${meta.celltype_publish_dir}/${meta.library_id}_cellassign";
meta.singler_model_file = singler_model_file;
meta.singler_model_file = singler_model_file;
sjspielman marked this conversation as resolved.
Show resolved Hide resolved
meta.cellassign_reference_file = cellassign_reference_file;
meta.cellassign_reference_name = cellassign_reference_name;
Comment on lines 137 to -122
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume that by removing this here, you plan to add some code to add_celltypes_to_sce to pull out the cellassign reference name?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that was the idea!

// return simplified input:
[meta, processed_sce]
}



// creates [meta, processed sce, singler model file]
singler_input_ch = celltype_input_ch
// add in singler model or empty file
.map{it.toList() + [file(it[0].singler_model_file ?: empty_file)]}
Expand All @@ -133,23 +149,50 @@ workflow annotate_celltypes {
missing_ref: it[2].name == "NO_FILE"
do_singler: true
}


// perform singleR celltyping and export results
classify_singler(singler_input_ch.do_singler)

sjspielman marked this conversation as resolved.
Show resolved Hide resolved
// singleR output channel: [library_id, singler_results]
singler_output_ch = singler_input_ch.missing_ref
.map{[it[0]["library_id"], file(empty_file)]}
// add in channel outputs
.mix(classify_singler.out)


// input for rds assignment step

// create cellassign input channel: [meta, processed sce, cellassign reference file]
cellassign_input_ch = celltype_input_ch
// add in cellassign reference
.map{it.toList() + [file(it[0].cellassign_reference_file ?: empty_file)]}
// skip if no cellassign reference file or reference name is not defined
.branch{
missing_ref: it[2].name == "NO_FILE"
do_cellassign: true
}


// perform CellAssign celltyping and export results
classify_cellassign(cellassign_input_ch.do_cellassign)

// cellassign output channel: [library_id, cellassign_dir]
cellassign_output_ch = cellassign_input_ch.missing_ref
.map{[it[0]["library_id"], file(empty_file)]}
// add in channel outputs
.mix(classify_cellassign.out)

// prepare input for process to add celltypes to the processed SCE
assignment_input_ch = processed_sce_channel
.map{[it[0]["library_id"]] + it}
// add in singler results
.join(singler_output_ch, by: 0, failOnMismatch: true, failOnDuplicate: true)
// add in cell assign results
.join(cellassign_output_ch, by: 0, failOnMismatch: true, failOnDuplicate: true)
.map{it.drop(1)} // remove library_id


// Next PR:
//add_celltypes_to_sce(assignment_input_ch)

// add back in the unchanged sce files
// TODO update below with output channel results:
// export_channel = processed_sce_channel
Expand Down