Merge pull request #487 from AlexsLemonade/jashapiro/try-no-null2

Try to solve intermittent workflow failures
AlexsLemonade · Oct 5, 2023 · 3c21045 · 3c21045
2 parents 1e3ab54 + e5ee352
commit 3c21045
Show file tree

Hide file tree

Showing 20 changed files with 76 additions and 41 deletions.
diff --git a/bin/add_submitter_annotations.R b/bin/add_submitter_annotations.R
diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R
diff --git a/bin/classify_cellassign.R b/bin/classify_cellassign.R
diff --git a/bin/cluster_sce.R b/bin/cluster_sce.R
diff --git a/bin/filter_sce_rds.R → bin/filter_sce.R b/bin/filter_sce_rds.R → bin/filter_sce.R
diff --git a/bin/generate_cellassign_refs.R b/bin/generate_cellassign_refs.R
diff --git a/bin/integrate_sce.R b/bin/integrate_sce.R
diff --git a/bin/merge_sces.R b/bin/merge_sces.R
diff --git a/bin/predict_cellassign.py b/bin/predict_cellassign.py
diff --git a/bin/train_SingleR.R b/bin/train_SingleR.R
diff --git a/lib/Utils.groovy b/lib/Utils.groovy
@@ -38,21 +38,23 @@ class Utils {
    */
   static def getMetaVal(file, key){
     def obj = new JsonSlurper().parse(file)
+
     return(obj[key])
   }
 
 
   /**
-   * Replace a string with an NA value with null
+   * Replace a string with an NA value with ""
+   * (which evaluates as false in boolean contexts)
    *
    * @param str A string
-   * @return The input string unless it was NA or a variant thereof, in which case returns null
+   * @return The input string unless it was NA or a variant thereof, in which case returns ""
    */
   static def parseNA(str) {
     if (str){
-      str.toLowerCase() in ["na","n/a","nan"]? null : str
+      str.toLowerCase() in ['na','n/a','nan']? '' : str // case-insensitive match against the NA spellings
     } else {
-      null
+      '' // input that is falsy in Groovy (null or empty string) also yields ""
     }
    }
 }
diff --git a/main.nf b/main.nf
@@ -93,7 +93,7 @@ workflow {
       library_id: it.scpca_library_id,
       sample_id: it.scpca_sample_id.split(";").sort().join(","),
       project_id: Utils.parseNA(it.scpca_project_id)?: "no_project",
-      submitter: it.submitter,
+      submitter: Utils.parseNA(it.submitter),
       technology: it.technology,
       assay_ontology_term_id: Utils.parseNA(it.assay_ontology_term_id),
       seq_unit: it.seq_unit,

diff --git a/modules/af-features.nf b/modules/af-features.nf
@@ -2,6 +2,7 @@
 //index a feature barcode file
 process index_feature{
   container params.SALMON_CONTAINER
+  tag "${id}"
 
   input:
     tuple val(id), path(feature_file)

diff --git a/modules/bulk-salmon.nf b/modules/bulk-salmon.nf
@@ -70,6 +70,7 @@ process merge_bulk_quants {
     container params.SCPCATOOLS_CONTAINER
     label 'mem_8'
     publishDir "${params.results_dir}/${meta.project_id}", mode: 'copy'
+    tag "${meta.project_id}"
     input:
         tuple val(meta), path(salmon_directories), path(t2g_bulk)
         path(library_metadata)

diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
@@ -8,6 +8,7 @@ process classify_singler {
     )
     label 'mem_8'
     label 'cpus_4'
+    tag "${meta.library_id}"
     input:
       tuple val(meta), path(processed_rds), path(singler_model_file)
     output:
@@ -27,13 +28,13 @@ process classify_singler {
         --threads ${task.cpus}
 
       # write out meta file
-      echo "${Utils.makeJson(meta)}" > "${singler_dir}/scpca-meta.json"
+      echo '${Utils.makeJson(meta)}' > "${singler_dir}/scpca-meta.json"
       """
     stub:
       singler_dir = file(meta.singler_dir).name
       """
       mkdir "${singler_dir}"
-      echo "${Utils.makeJson(meta)}" > "${singler_dir}/scpca-meta.json"
+      echo '${Utils.makeJson(meta)}' > "${singler_dir}/scpca-meta.json"
       """
 }
 
@@ -47,6 +48,7 @@ process classify_cellassign {
     )
   label 'mem_32'
   label 'cpus_12'
+  tag "${meta.library_id}"
   input:
     tuple val(meta), path(processed_rds), path(cellassign_reference_file)
   output:
@@ -57,28 +59,28 @@ process classify_cellassign {
     """
     # create output directory
     mkdir "${cellassign_dir}"
-    
+
     # Convert SCE to AnnData
     sce_to_anndata.R \
         --input_sce_file "${processed_rds}" \
-        --output_rna_h5 processed.hdf5 
-        
+        --output_rna_h5 processed.hdf5
+
     # Run CellAssign
     predict_cellassign.py \
-      --input_hdf5_file processed.hdf5 
+      --input_hdf5_file processed.hdf5 \
       --output_predictions "${cellassign_dir}/cellassign_predictions.tsv" \
       --reference "${cellassign_reference_file}" \
       --seed ${params.seed} \
       --threads ${task.cpus}
-    
+
     # write out meta file
-    echo "${Utils.makeJson(meta)}" > "${cellassign_dir}/scpca-meta.json"
+    echo '${Utils.makeJson(meta)}' > "${cellassign_dir}/scpca-meta.json"
     """
   stub:
     cellassign_dir = file(meta.cellassign_dir).name
     """
     mkdir "${cellassign_dir}"
-    echo "${Utils.makeJson(meta)}" > "${cellassign_dir}/scpca-meta.json"
+    echo '${Utils.makeJson(meta)}' > "${cellassign_dir}/scpca-meta.json"
     """
 }
 
@@ -88,6 +90,7 @@ process add_celltypes_to_sce {
   publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy'
   label 'mem_4'
   label 'cpus_2'
+  tag "${meta.library_id}"
   input:
     tuple val(meta), path(input_rds), path(cellassign_predictions), val(ref_name)
   output:
@@ -118,9 +121,9 @@ workflow annotate_celltypes {
          // project id
          it.scpca_project_id,
          // singler model file
-         Utils.parseNA(it.singler_ref_file) ? "${params.singler_models_dir}/${it.singler_ref_file}" : null,
+         Utils.parseNA(it.singler_ref_file) ? "${params.singler_models_dir}/${it.singler_ref_file}" : '',
          // cellassign reference file
-         Utils.parseNA(it.cellassign_ref_file) ? "${params.cellassign_ref_dir}/${it.cellassign_ref_file}" : null
+         Utils.parseNA(it.cellassign_ref_file) ? "${params.cellassign_ref_dir}/${it.cellassign_ref_file}" : ''
         ]}
 
       // create input for typing: [augmented meta, processed_sce]
@@ -129,7 +132,8 @@ workflow annotate_celltypes {
         .combine(celltype_ch, by: 0)
         // current contents: [project_id, meta, processed_sce, singler_model_file, cellassign_reference_file]
         // add values to meta for later use
-        .map{ project_id, meta, processed_sce, singler_model_file, cellassign_reference_file ->
+        .map{ project_id, meta_in, processed_sce, singler_model_file, cellassign_reference_file ->
+          def meta = meta_in.clone(); // local copy for safe modification
           meta.celltype_publish_dir = "${params.checkpoints_dir}/celltype/${meta.library_id}";
           meta.singler_dir = "${meta.celltype_publish_dir}/${meta.library_id}_singler";
           meta.cellassign_dir = "${meta.celltype_publish_dir}/${meta.library_id}_cellassign";
@@ -139,7 +143,7 @@ workflow annotate_celltypes {
           [meta, processed_sce]
         }
 
-      
+
       // creates [meta, processed sce, singler model file]
       singler_input_ch = celltype_input_ch
         // add in singler model or empty file
@@ -149,7 +153,7 @@ workflow annotate_celltypes {
           missing_ref: it[2].name == "NO_FILE"
           do_singler: true
         }
-      
+
 
       // perform singleR celltyping and export results
       classify_singler(singler_input_ch.do_singler)
@@ -158,7 +162,7 @@ workflow annotate_celltypes {
         .map{[it[0]["library_id"], file(empty_file)]}
         // add in channel outputs
         .mix(classify_singler.out)
-      
+
       // create cellassign input channel: [meta, processed sce, cellassign reference file]
        cellassign_input_ch = celltype_input_ch
         // add in cellassign reference
@@ -167,18 +171,18 @@ workflow annotate_celltypes {
         .branch{
           missing_ref: it[2].name == "NO_FILE"
           do_cellassign: true
-        }     
+        }
+
 
-
       // perform CellAssign celltyping and export results
       classify_cellassign(cellassign_input_ch.do_cellassign)
-  
+
       // cellassign output channel: [library_id, cellassign_dir]
       cellassign_output_ch = cellassign_input_ch.missing_ref
         .map{[it[0]["library_id"], file(empty_file)]}
         // add in channel outputs
-        .mix(classify_cellassign.out) 
-      
+        .mix(classify_cellassign.out)
+
       // prepare input for process to add celltypes to the processed SCE
       assignment_input_ch = processed_sce_channel
         .map{[it[0]["library_id"]] + it}
@@ -191,7 +195,7 @@ workflow annotate_celltypes {
 
       // Next PR:
       //add_celltypes_to_sce(assignment_input_ch)
-    
+
       // add back in the unchanged sce files
       // TODO update below with output channel results:
       // export_channel = processed_sce_channel

diff --git a/modules/export-anndata.nf b/modules/export-anndata.nf
@@ -66,7 +66,10 @@ workflow sce_to_anndata{
                  ]}
         // remove any sce files that don't have enough cells in the sce object
         // number of cells are stored in each metadata.json file
-        .filter{ Utils.getMetaVal(file(it[3]), "${it[2]}_cells") > 1 }
+        .filter{
+          def cells = Utils.getMetaVal(file(it[3]), "${it[2]}_cells") // `def` keeps the count local to this closure call
+          cells == null ? true : cells > 1 // keep when no count is recorded (for testing); a recorded 0 or 1 is removed
+        }
         // remove metadata.json file from tuple
         .map{it.dropRight(1)}
 

diff --git a/modules/qc-report.nf b/modules/qc-report.nf
@@ -10,22 +10,38 @@ process sce_qc_report{
         tuple val(meta), path(unfiltered_rds), path(filtered_rds), path(processed_rds)
         tuple path(template_dir), val(template_file)
     output:
-        tuple val(meta), path(unfiltered_rds), path(filtered_rds), path(processed_rds), path(metadata_json), emit: data
+        tuple val(meta), path(unfiltered_out), path(filtered_out), path(processed_out), path(metadata_json), emit: data
         path qc_report, emit: report
     script:
         qc_report = "${meta.library_id}_qc.html"
         template_path = "${template_dir}/${template_file}"
         metadata_json = "${meta.library_id}_metadata.json"
         workflow_url = workflow.repository ?: workflow.manifest.homePage
         workflow_version = workflow.revision ?: workflow.manifest.version
+        // names for final output files
+        unfiltered_out = "${meta.library_id}_unfiltered.rds"
+        filtered_out = "${meta.library_id}_filtered.rds"
+        processed_out = "${meta.library_id}_processed.rds"
         """
+        # move files for output
+        if [ "${unfiltered_rds}" != "${unfiltered_out}" ]; then
+            mv "${unfiltered_rds}" "${unfiltered_out}"
+        fi
+        if [ "${filtered_rds}" != "${filtered_out}" ]; then
+            mv "${filtered_rds}" "${filtered_out}"
+        fi
+        if [ "${processed_rds}" != "${processed_out}" ]; then
+            mv "${processed_rds}" "${processed_out}"
+        fi
+
+        # generate report
         sce_qc_report.R \
           --report_template "${template_path}" \
           --library_id "${meta.library_id}" \
           --sample_id "${meta.sample_id}" \
-          --unfiltered_sce ${unfiltered_rds} \
-          --filtered_sce ${filtered_rds} \
-          --processed_sce ${processed_rds} \
+          --unfiltered_sce ${unfiltered_out} \
+          --filtered_sce ${filtered_out} \
+          --processed_sce ${processed_out} \
           --qc_report_file ${qc_report} \
           --metadata_json ${metadata_json} \
           --technology "${meta.technology}" \
@@ -37,10 +53,16 @@ process sce_qc_report{
           --seed "${params.seed}"
         """
     stub:
+        unfiltered_out = "${meta.library_id}_unfiltered.rds"
+        filtered_out = "${meta.library_id}_filtered.rds"
+        processed_out = "${meta.library_id}_processed.rds"
         qc_report = "${meta.library_id}_qc.html"
         metadata_json = "${meta.library_id}_metadata.json"
         """
+        touch ${unfiltered_out}
+        touch ${filtered_out}
+        touch ${processed_out}
         touch ${qc_report}
-        echo '{}' > ${metadata_json}
+        echo '{"unfiltered_cells": 10, "filtered_cells": 10, "processed_cells": 10}' > ${metadata_json}
         """
 }
diff --git a/modules/samtools.nf b/modules/samtools.nf
@@ -1,6 +1,7 @@
 
 process index_bam{
   container params.SAMTOOLS_CONTAINER
+  tag "${meta.run_id}"
   input:
     tuple val(meta), path(bamfile)
   output:

diff --git a/modules/sce-processing.nf b/modules/sce-processing.nf
@@ -12,7 +12,6 @@ process make_unfiltered_sce{
         tuple val(meta), path(unfiltered_rds)
     script:
         unfiltered_rds = "${meta.library_id}_unfiltered.rds"
-
         """
         generate_unfiltered_sce.R \
           --alevin_dir ${alevin_dir} \
@@ -48,18 +47,18 @@ process make_unfiltered_sce{
 // channels with RNA and feature data
 process make_merged_unfiltered_sce{
     label 'mem_8'
-    tag "${meta.library_id}"
+    tag "${rna_meta.library_id}"
     container params.SCPCATOOLS_CONTAINER
     input:
         tuple val(feature_meta), path(feature_alevin_dir),
-              val (meta), path(alevin_dir),
+              val(rna_meta), path(alevin_dir),
               path(mito_file), path(ref_gtf), path(submitter_cell_types_file)
         path sample_metafile
     output:
         tuple val(meta), path(unfiltered_rds)
     script:
-        unfiltered_rds = "${meta.library_id}_unfiltered.rds"
-        // add feature metadata as an element of the main meta object
+        // add feature metadata as elements of the main meta object
+        meta = rna_meta.clone()
         meta['feature_type'] = feature_meta.technology.split('_')[0]
         meta['feature_meta'] = feature_meta
 
@@ -68,6 +67,7 @@ process make_merged_unfiltered_sce{
           meta['feature_type'] = "adt"
         }
 
+        unfiltered_rds = "${meta.library_id}_unfiltered.rds"
         """
         generate_unfiltered_sce.R \
           --alevin_dir ${alevin_dir} \
@@ -92,12 +92,13 @@ process make_merged_unfiltered_sce{
             --library_id "${meta.library_id}" \
             --submitter_cell_types_file "${submitter_cell_types_file}"
         fi
-
         """
     stub:
-        unfiltered_rds = "${meta.library_id}_unfiltered.rds"
+        meta = rna_meta.clone()
         meta['feature_type'] = feature_meta.technology.split('_')[0]
         meta['feature_meta'] = feature_meta
+
+        unfiltered_rds = "${meta.library_id}_unfiltered.rds"
         """
         touch "${meta.library_id}_unfiltered.rds"
         """
@@ -121,7 +122,7 @@ process filter_sce{
           feature_barcode_file.name != "NO_FILE"
 
         """
-        filter_sce_rds.R \
+        filter_sce.R \
           --unfiltered_file ${unfiltered_rds} \
           --filtered_file ${filtered_rds} \
           ${adt_present ? "--adt_name ${meta.feature_type}":""} \
@@ -200,7 +201,6 @@ process post_process_sce{
         tuple val(meta), path(unfiltered_rds), path(filtered_rds), path(processed_rds)
     script:
         processed_rds = "${meta.library_id}_processed.rds"
-
         """
         post_process_sce.R \
           --filtered_sce_file ${filtered_rds} \

diff --git a/modules/spaceranger.nf b/modules/spaceranger.nf
@@ -44,6 +44,7 @@ process spaceranger{
 
 process spaceranger_publish{
   container params.SCPCATOOLS_CONTAINER
+  tag "${meta.library_id}"
   publishDir "${params.results_dir}/${meta.project_id}/${meta.sample_id}", mode: 'copy'
   input:
     tuple val(meta), path(spatial_out)