Merge pull request #91 from sanger-bentley-group/dev

Release Version 1.0.0-rc4
sanger-bentley-group · Feb 23, 2024 · 745f064 · 745f064
2 parents 54ab60a + 356b0f4
commit 745f064
Show file tree

Hide file tree

Showing 7 changed files with 94 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # GPS Pipeline <!-- omit in toc -->
 
-[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.10.0-23aa62.svg)](https://www.nextflow.io/)
+[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.10.1-23aa62.svg)](https://www.nextflow.io/)
 [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
 [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/singularity/)
 [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/sanger-bentley-group/gps-pipeline)
@@ -281,6 +281,8 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la
     <!-- -->
   > ℹ️ For virulence genes: POS = Positive; NEG = Negative
   <!-- -->
+  > ⚠️ If the result of `Overall_QC` of a sample is `READ_ONE_CORRUPTED`, `READ_TWO_CORRUPTED` or both, the specific read file is found to be corrupted (i.e. incomplete/damaged Gzip file, mis-match(s) in read length and quality-score length). You might want to reacquire the read file from its source, or discard the sample if the source file is corrupted as well. 
+  <!-- -->
   > ⚠️ If the result of `Overall_QC` of a sample is `ASSEMBLER FAILURE`, the assembler has crashed when trying to assembly the reads. You might want to re-run the sample with [another assembler](#assembly), or discard the sample if it is a low quality one.
   <!-- -->
   > ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample.

diff --git a/bin/get_overall_qc.sh b/bin/get_overall_qc.sh
@@ -1,7 +1,10 @@
-# Determine overall QC result based on Assembly QC, Mapping QC and Taxonomy QC
-# In case of assembler failure, there will be no Assembly QC input, hence output result as ASSEMBLER FAILURE
+# Determine overall QC result based on File Validity, Read QC, Assembly QC, Mapping QC and Taxonomy QC
+# In case File Validity is not PASS, save its value (i.e. description of the issue) to Overall QC
+# In case of assembler failure, there will be no Assembly QC input, save ASSEMBLER FAILURE to Overall QC
 
-if [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then
+if [[ ! "$FILE_VALIDITY" == "PASS" ]]; then
+    OVERALL_QC="$FILE_VALIDITY"
+elif [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then
     OVERALL_QC="PASS"
 elif [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "null" ]]; then
     OVERALL_QC="ASSEMBLER FAILURE"

diff --git a/bin/validate_file.sh b/bin/validate_file.sh
@@ -0,0 +1,38 @@
+# Basic file validation of input files
+# Check if read length is the same as quality-score length on all entries; if input is Gzipped, check file integrity first
+# Report if any read is corrupted
+
+FILE_VALIDITY=""
+
+case "$READ_ONE" in
+    *.gz)
+        { 
+            gzip -t "$READ_ONE" && zcat "$READ_ONE" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ;
+        } || { 
+            FILE_VALIDITY+="READ_ONE_CORRUPTED;"; 
+        }
+        { 
+            gzip -t "$READ_TWO" && zcat "$READ_TWO" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ;
+        } || { 
+            FILE_VALIDITY+="READ_TWO_CORRUPTED;"; 
+        }
+        ;;
+    *) 
+        { 
+            cat "$READ_ONE" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ;
+        } || { 
+            FILE_VALIDITY+="READ_ONE_CORRUPTED;"; 
+        }
+        {
+            cat "$READ_TWO" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ;
+        } || { 
+            FILE_VALIDITY+="READ_TWO_CORRUPTED;"; 
+        }
+        ;;
+esac
+
+if [[ "$FILE_VALIDITY" == "" ]]; then
+    FILE_VALIDITY="PASS"
+else
+    FILE_VALIDITY="${FILE_VALIDITY%;}"
+fi
diff --git a/main.nf b/main.nf
@@ -1,7 +1,7 @@
 #!/usr/bin/env nextflow
 
 // Version of this release
-pipelineVersion = '1.0.0-rc3'
+pipelineVersion = '1.0.0-rc4'
 
 // Import workflow modules
 include { PIPELINE } from "$projectDir/workflows/pipeline"

diff --git a/modules/overall_qc.nf b/modules/overall_qc.nf
@@ -6,7 +6,7 @@ process OVERALL_QC {
     tag "$sample_id"
 
     input:
-    tuple val(sample_id), val(read_qc), val(assembly_qc), val(mapping_qc), val(taxonomy_qc)
+    tuple val(sample_id), val(file_validity), val(read_qc), val(assembly_qc), val(mapping_qc), val(taxonomy_qc)
 
     output:
     tuple val(sample_id), env(OVERALL_QC), emit: result
@@ -15,6 +15,7 @@ process OVERALL_QC {
     script:
     overall_qc_report='overall_qc_report.csv'
     """
+    FILE_VALIDITY="$file_validity"
     READ_QC="$read_qc"
     ASSEMBLY_QC="$assembly_qc"
     MAPPING_QC="$mapping_qc"

diff --git a/modules/preprocess.nf b/modules/preprocess.nf
@@ -1,3 +1,28 @@
+// Basic file validation of input files
+process FILE_VALIDATION {
+    label 'bash_container'
+    label 'farm_low'
+
+    tag "$sample_id"
+
+    input:
+    tuple val(sample_id), path(reads)
+
+    output:
+    tuple val(sample_id), env(FILE_VALIDITY), emit: result
+
+    script:
+    read_one="${reads[0]}"
+    read_two="${reads[1]}"
+    """
+    READ_ONE="$read_one"
+    READ_TWO="$read_two"
+
+    source validate_file.sh
+    """
+}
+
+
 // Run fastp to preprocess the FASTQs
 process PREPROCESS {
     label 'fastp_container'

diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf
@@ -1,5 +1,5 @@
 // Import process modules
-include { PREPROCESS; READ_QC } from "$projectDir/modules/preprocess"
+include { FILE_VALIDATION; PREPROCESS; READ_QC } from "$projectDir/modules/preprocess"
 include { ASSEMBLY_UNICYCLER; ASSEMBLY_SHOVILL; ASSEMBLY_ASSESS; ASSEMBLY_QC } from "$projectDir/modules/assembly"
 include { GET_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping"
 include { GET_KRAKEN2_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy"
@@ -32,9 +32,18 @@ workflow PIPELINE {
     // Get read pairs into Channel raw_read_pairs_ch
     raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true)
 
-    // Preprocess read pairs
+    // Basic input files validation
+    // Output into Channel FILE_VALIDATION.out.result
+    FILE_VALIDATION(raw_read_pairs_ch)
+
+    // From Channel raw_read_pairs_ch, only output valid reads of samples based on Channel FILE_VALIDATION.out.result
+    VALID_READS_ch = FILE_VALIDATION.out.result.join(raw_read_pairs_ch, failOnDuplicate: true, failOnMismatch: true)
+                        .filter { it[1] == 'PASS' }
+                        .map { it[0, 2..-1] }
+
+    // Preprocess valid read pairs
     // Output into Channels PREPROCESS.out.processed_reads & PREPROCESS.out.json
-    PREPROCESS(raw_read_pairs_ch)
+    PREPROCESS(VALID_READS_ch)
 
     // From Channel PREPROCESS.out.json, provide Read QC status
     // Output into Channels READ_QC.out.bases, READ_QC.out.result, READ_QC.out.report
@@ -102,10 +111,12 @@ workflow PIPELINE {
     // Output into Channels TAXONOMY_QC.out.result & TAXONOMY_QC.out.report
     TAXONOMY_QC(TAXONOMY.out.report, params.spneumo_percentage, params.non_strep_percentage)
 
-    // Merge Channels AREAD_QC.out.result & SSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status
+    // Merge Channels FILE_VALIDATION.out.result & READ_QC.out.result & ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status
     // Output into Channel OVERALL_QC.out.result & OVERALL_QC.out.report
     OVERALL_QC(
-        READ_QC.out.result
+        raw_read_pairs_ch.map{ it[0] }
+        .join(FILE_VALIDATION.out.result, failOnDuplicate: true, failOnMismatch: true)
+        .join(READ_QC.out.result, failOnDuplicate: true, remainder: true)
         .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true)
         .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true)
         .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true)
@@ -150,11 +161,12 @@ workflow PIPELINE {
 
     // Generate sample reports by merging outputs from all result-generating modules
     GENERATE_SAMPLE_REPORT(
-        READ_QC.out.report
+        raw_read_pairs_ch.map{ it[0] }
+        .join(READ_QC.out.report, failOnDuplicate: true, remainder: true)
         .join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true)
         .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true)
         .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true)
-        .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true)
+        .join(OVERALL_QC.out.report, failOnDuplicate: true, failOnMismatch: true)
         .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true)
         .join(MLST.out.report, failOnDuplicate: true, remainder: true)
         .join(PARSE_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true)