AlexsLemonade · sjspielman · Oct 3, 2023 · Oct 2, 2023 · Oct 2, 2023 · Oct 2, 2023
diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R
@@ -18,7 +18,8 @@ option_list <- list(
   make_option(
     opt_str = c("--singler_model_file"),
     type = "character",
-    help = "path to file containing a single model generated for SingleR annotation"
+    help = "path to file containing a single model generated for SingleR annotation. 
+            File name is expected to be in form: <model name>_model.rds."
   ),
   make_option(
     opt_str = c("--output_singler_annotations_file"),
@@ -55,19 +56,28 @@ if (!file.exists(opt$sce_file)) {
   stop("Missing SCE file")
 }
 
-# check that output files have the righr extensions
+# check that output files have the right extensions
 if (!(stringr::str_ends(opt$output_singler_results_file, ".rds"))) {
   stop("output SingleR result file name must end in .rds")
 }
 if (!(stringr::str_ends(opt$output_singler_annotations_file, ".tsv"))) {
   stop("output SingleR annotations file name must end in .tsv")
 }
 
-# check that references all exist
+# check that reference exists and filename is properly formatted
 singler_model_file <- opt$singler_model_file
 if (!file.exists(singler_model_file)) {
   stop(glue::glue("Provided model file {singler_model_file} is missing."))
 }
+if (!(stringr::str_ends(singler_model_file, "_model.rds"))) {
+  stop(glue::glue("Provided model file {singler_model_file} must end in .rds."))
+}
+
+# get & check reference name
+reference_name <- stringr::str_remove(singler_model_file, "_model.rds")
+if (reference_name == "") {
+  stop(glue::glue("Provided model file name must be formatted as `<model_name>_model.rds`"))
+}
 
 # set up multiprocessing params
 if (opt$threads > 1) {
@@ -90,6 +100,10 @@ singler_results <- SingleR::classifySingleR(
   BPPARAM = bp_param
 )
 
+# add reference name as metadata in singler_results DataFrame
+metadata(singler_results)$reference_name <- reference_name
+
+
 # export results
 
 # first, a stand-alone tsv of annotations with both pruned and full labels
@@ -102,6 +116,7 @@ readr::write_tsv(
 )
 
 # next, the full result to a compressed rds
+
 readr::write_rds(
   singler_results,
   opt$output_singler_results_file,

diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
@@ -38,33 +38,52 @@ process classify_singler {
 }
 
 
-process predict_cellassign {
+process classify_cellassign {
   container params.SCPCATOOLS_CONTAINER
-  publishDir "${params.checkpoints_dir}/celltype/${meta.library_id}", mode: 'copy'
+    publishDir (
+      path: "${meta.celltype_publish_dir}",
+      mode: 'copy',
+      pattern: "${cellassign_dir}"
+    )
   label 'mem_32'
   label 'cpus_12'
   input:
-    tuple val(meta), path(processed_hdf5), path(cellassign_reference_mtx), val(ref_name)
+    tuple val(meta), path(processed_rds), path(cellassign_reference_file)
   output:
-    tuple val(meta), path(cellassign_predictions), val(ref_name)
+    tuple val(meta.library_id), path(cellassign_dir)
   script:
-    cellassign_predictions = "${meta.library_id}_predictions.tsv"
+    cellassign_dir = file(meta.cellassign_dir).name
+
     """
+    # create output directory
+    mkdir "${cellassign_dir}"
+
+    # Convert SCE to AnnData
+    sce_to_anndata.R \
+        --input_sce_file "${processed_rds}" \
+        --output_rna_h5 processed.hdf5 
+
+    # Run CellAssign
     predict_cellassign.py \
-      --input_hdf5_file ${processed_hdf5} \
-      --output_predictions ${cellassign_predictions} \
-      --reference ${cellassign_reference_mtx} \
+      --input_hdf5_file processed.hdf5 
+      --output_predictions "${cellassign_dir}/cellassign_predictions.tsv" \
+      --reference "${cellassign_reference_file}" \
       --seed ${params.seed} \
       --threads ${task.cpus}
+
+    # write out meta file
+    echo "${Utils.makeJson(meta)}" > "${cellassign_dir}/scpca-meta.json"
     """
   stub:
-    cellassign_predictions = "${meta.library_id}_predictions.tsv"
+    cellassign_dir = file(meta.cellassign_dir).name
     """
-    touch "${cellassign_predictions}"
+    mkdir "${cellassign_dir}"
+    echo "${Utils.makeJson(meta)}" > "${cellassign_dir}/scpca-meta.json"
     """
 }
 
-process classify_cellassign {
+// TODO: overhaul this process next
+process add_celltypes_to_sce {
   container params.SCPCATOOLS_CONTAINER
   publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy'
   label 'mem_4'
@@ -101,30 +120,27 @@ workflow annotate_celltypes {
          // singler model file
          Utils.parseNA(it.singler_ref_file) ? "${params.singler_models_dir}/${it.singler_ref_file}" : null,
          // cellassign reference file
-         Utils.parseNA(it.cellassign_ref_file) ? "${params.cellassign_ref_dir}/${it.cellassign_ref_file}" : null,
-         // add ref name for cellassign since we cannot store it in the cellassign output
-         // singler ref name does not need to be added because it is stored in the singler model
-         Utils.parseNA(it.cellassign_ref_name)
+         Utils.parseNA(it.cellassign_ref_file) ? "${params.cellassign_ref_dir}/${it.cellassign_ref_file}" : null
         ]}
 
       // create input for typing: [augmented meta, processed_sce]
       celltype_input_ch = processed_sce_channel
         .map{[it[0]["project_id"]] + it}
         .combine(celltype_ch, by: 0)
-        // current contents: [project_id, meta, processed_sce, singler_model_file, cellassign_reference_file, cellassign_reference_name]
+        // current contents: [project_id, meta, processed_sce, singler_model_file, cellassign_reference_file]
         // add values to meta for later use
-        .map{ project_id, meta, processed_sce, singler_model_file, cellassign_reference_file, cellassign_reference_name ->
+        .map{ project_id, meta, processed_sce, singler_model_file, cellassign_reference_file ->
           meta.celltype_publish_dir = "${params.checkpoints_dir}/celltype/${meta.library_id}";
           meta.singler_dir = "${meta.celltype_publish_dir}/${meta.library_id}_singler";
           meta.cellassign_dir = "${meta.celltype_publish_dir}/${meta.library_id}_cellassign";
-          meta.singler_model_file = singler_model_file;
+          meta.singler_model_file = singler_model_file; 
           meta.cellassign_reference_file = cellassign_reference_file;
-          meta.cellassign_reference_name = cellassign_reference_name;
           // return simplified input:
           [meta, processed_sce]
         }
 
-
+
+      // creates [meta, processed sce, singler model file]
       singler_input_ch = celltype_input_ch
         // add in singler model or empty file
         .map{it.toList() + [file(it[0].singler_model_file ?: empty_file)]}
@@ -133,23 +149,50 @@ workflow annotate_celltypes {
           missing_ref: it[2].name == "NO_FILE"
           do_singler: true
         }
+
 
       // perform singleR celltyping and export results
       classify_singler(singler_input_ch.do_singler)
-
+      
       // singleR output channel: [library_id, singler_results]
       singler_output_ch = singler_input_ch.missing_ref
         .map{[it[0]["library_id"], file(empty_file)]}
         // add in channel outputs
         .mix(classify_singler.out)
-
-
-      // input for rds assignment step
+
+      // create cellassign input channel: [meta, processed sce, cellassign reference file]
+       cellassign_input_ch = celltype_input_ch
+        // add in cellassign reference
+        .map{it.toList() + [file(it[0].cellassign_reference_file ?: empty_file)]}
+        // skip if no cellassign reference file or reference name is not defined
+        .branch{
+          missing_ref: it[2].name == "NO_FILE"
+          do_cellassign: true
+        }     
+
+
+      // perform CellAssign celltyping and export results
+      classify_cellassign(cellassign_input_ch.do_cellassign)
+
+      // cellassign output channel: [library_id, cellassign_dir]
+      cellassign_output_ch = cellassign_input_ch.missing_ref
+        .map{[it[0]["library_id"], file(empty_file)]}
+        // add in channel outputs
+        .mix(classify_cellassign.out) 
+
+      // prepare input for process to add celltypes to the processed SCE
       assignment_input_ch = processed_sce_channel
         .map{[it[0]["library_id"]] + it}
         // add in singler results
         .join(singler_output_ch, by: 0, failOnMismatch: true, failOnDuplicate: true)
+        // add in cell assign results
+        .join(cellassign_output_ch, by: 0, failOnMismatch: true, failOnDuplicate: true)
+        .map{it.drop(1)} // remove library_id
+
 
+      // Next PR:
+      //add_celltypes_to_sce(assignment_input_ch)
+
       // add back in the unchanged sce files
       // TODO update below with output channel results:
       // export_channel = processed_sce_channel