diff --git a/README.md b/README.md index 2dd94b3..d6dcb2a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # GPS Pipeline -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.10.0-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.10.1-23aa62.svg)](https://www.nextflow.io/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/singularity/) [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/sanger-bentley-group/gps-pipeline) @@ -281,6 +281,8 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la > ℹ️ For virulence genes: POS = Positive; NEG = Negative + > ⚠️ If the result of `Overall_QC` of a sample is `READ_ONE_CORRUPTED`, `READ_TWO_CORRUPTED` or both, the specific read file is found to be corrupted (i.e. incomplete/damaged Gzip file, mis-match(s) in read length and quality-score length). You might want to reacquire the read file from its source, or discard the sample if the source file is corrupted as well. + > ⚠️ If the result of `Overall_QC` of a sample is `ASSEMBLER FAILURE`, the assembler has crashed when trying to assembly the reads. You might want to re-run the sample with [another assembler](#assembly), or discard the sample if it is a low quality one. > ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample. diff --git a/bin/get_overall_qc.sh b/bin/get_overall_qc.sh index a0fc3d3..232da0f 100755 --- a/bin/get_overall_qc.sh +++ b/bin/get_overall_qc.sh @@ -1,7 +1,10 @@ -# Determine overall QC result based on Assembly QC, Mapping QC and Taxonomy QC -# In case of assembler failure, there will be no Assembly QC input, hence output result as ASSEMBLER FAILURE +# Determine overall QC result based on File Validity, Read QC, Assembly QC, Mapping QC and Taxonomy QC +# In case File Validity is not PASS, save its value (i.e. description of the issue) to Overall QC +# In case of assembler failure, there will be no Assembly QC input, save ASSEMBLER FAILURE to Overall QC -if [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then +if [[ ! "$FILE_VALIDITY" == "PASS" ]]; then + OVERALL_QC="$FILE_VALIDITY" +elif [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then OVERALL_QC="PASS" elif [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "null" ]]; then OVERALL_QC="ASSEMBLER FAILURE" diff --git a/bin/validate_file.sh b/bin/validate_file.sh new file mode 100755 index 0000000..8fda3e5 --- /dev/null +++ b/bin/validate_file.sh @@ -0,0 +1,38 @@ +# Basic file validation of input files +# Check if read length is the same as quality-score length on all entries; if input is Gzipped, check file integrity first +# Report if any read is corrupted + +FILE_VALIDITY="" + +case "$READ_ONE" in + *.gz) + { + gzip -t "$READ_ONE" && zcat "$READ_ONE" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ; + } || { + FILE_VALIDITY+="READ_ONE_CORRUPTED;"; + } + { + gzip -t "$READ_TWO" && zcat "$READ_TWO" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ; + } || { + FILE_VALIDITY+="READ_TWO_CORRUPTED;"; + } + ;; + *) + { + cat "$READ_ONE" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ; + } || { + FILE_VALIDITY+="READ_ONE_CORRUPTED;"; + } + { + cat "$READ_TWO" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ; + } || { + FILE_VALIDITY+="READ_TWO_CORRUPTED;"; + } + ;; +esac + +if [[ "$FILE_VALIDITY" == "" ]]; then + FILE_VALIDITY="PASS" +else + FILE_VALIDITY="${FILE_VALIDITY%;}" +fi \ No newline at end of file diff --git a/main.nf b/main.nf index b041b5f..fb6f0b0 100644 --- a/main.nf +++ b/main.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow // Version of this release -pipelineVersion = '1.0.0-rc3' +pipelineVersion = '1.0.0-rc4' // Import workflow modules include { PIPELINE } from "$projectDir/workflows/pipeline" diff --git a/modules/overall_qc.nf b/modules/overall_qc.nf index 31c8cca..15c1f56 100644 --- a/modules/overall_qc.nf +++ b/modules/overall_qc.nf @@ -6,7 +6,7 @@ process OVERALL_QC { tag "$sample_id" input: - tuple val(sample_id), val(read_qc), val(assembly_qc), val(mapping_qc), val(taxonomy_qc) + tuple val(sample_id), val(file_validity), val(read_qc), val(assembly_qc), val(mapping_qc), val(taxonomy_qc) output: tuple val(sample_id), env(OVERALL_QC), emit: result @@ -15,6 +15,7 @@ process OVERALL_QC { script: overall_qc_report='overall_qc_report.csv' """ + FILE_VALIDITY="$file_validity" READ_QC="$read_qc" ASSEMBLY_QC="$assembly_qc" MAPPING_QC="$mapping_qc" diff --git a/modules/preprocess.nf b/modules/preprocess.nf index 1e89da7..c429589 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -1,3 +1,28 @@ +// Basic file validation of input files +process FILE_VALIDATION { + label 'bash_container' + label 'farm_low' + + tag "$sample_id" + + input: + tuple val(sample_id), path(reads) + + output: + tuple val(sample_id), env(FILE_VALIDITY), emit: result + + script: + read_one="${reads[0]}" + read_two="${reads[1]}" + """ + READ_ONE="$read_one" + READ_TWO="$read_two" + + source validate_file.sh + """ +} + + // Run fastp to preprocess the FASTQs process PREPROCESS { label 'fastp_container' diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 2e9ef27..468e570 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -1,5 +1,5 @@ // Import process modules -include { PREPROCESS; READ_QC } from "$projectDir/modules/preprocess" +include { FILE_VALIDATION; PREPROCESS; READ_QC } from "$projectDir/modules/preprocess" include { ASSEMBLY_UNICYCLER; ASSEMBLY_SHOVILL; ASSEMBLY_ASSESS; ASSEMBLY_QC } from "$projectDir/modules/assembly" include { GET_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" include { GET_KRAKEN2_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" @@ -32,9 +32,18 @@ workflow PIPELINE { // Get read pairs into Channel raw_read_pairs_ch raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) - // Preprocess read pairs + // Basic input files validation + // Output into Channel FILE_VALIDATION.out.result + FILE_VALIDATION(raw_read_pairs_ch) + + // From Channel raw_read_pairs_ch, only output valid reads of samples based on Channel FILE_VALIDATION.out.result + VALID_READS_ch = FILE_VALIDATION.out.result.join(raw_read_pairs_ch, failOnDuplicate: true, failOnMismatch: true) + .filter { it[1] == 'PASS' } + .map { it[0, 2..-1] } + + // Preprocess valid read pairs // Output into Channels PREPROCESS.out.processed_reads & PREPROCESS.out.json - PREPROCESS(raw_read_pairs_ch) + PREPROCESS(VALID_READS_ch) // From Channel PREPROCESS.out.json, provide Read QC status // Output into Channels READ_QC.out.bases, READ_QC.out.result, READ_QC.out.report @@ -102,10 +111,12 @@ workflow PIPELINE { // Output into Channels TAXONOMY_QC.out.result & TAXONOMY_QC.out.report TAXONOMY_QC(TAXONOMY.out.report, params.spneumo_percentage, params.non_strep_percentage) - // Merge Channels AREAD_QC.out.result & SSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status + // Merge Channels FILE_VALIDATION.out.result & READ_QC.out.result & ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status // Output into Channel OVERALL_QC.out.result & OVERALL_QC.out.report OVERALL_QC( - READ_QC.out.result + raw_read_pairs_ch.map{ it[0] } + .join(FILE_VALIDATION.out.result, failOnDuplicate: true, failOnMismatch: true) + .join(READ_QC.out.result, failOnDuplicate: true, remainder: true) .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) @@ -150,11 +161,12 @@ workflow PIPELINE { // Generate sample reports by merging outputs from all result-generating modules GENERATE_SAMPLE_REPORT( - READ_QC.out.report + raw_read_pairs_ch.map{ it[0] } + .join(READ_QC.out.report, failOnDuplicate: true, remainder: true) .join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true) .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) - .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) + .join(OVERALL_QC.out.report, failOnDuplicate: true, failOnMismatch: true) .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(MLST.out.report, failOnDuplicate: true, remainder: true) .join(PARSE_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true)