Skip to content

Commit

Permalink
Merge pull request #91 from sanger-bentley-group/dev
Browse files Browse the repository at this point in the history
Release Version 1.0.0-rc4
  • Loading branch information
HarryHung authored Feb 23, 2024
2 parents 54ab60a + 356b0f4 commit 745f064
Show file tree
Hide file tree
Showing 7 changed files with 94 additions and 13 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# GPS Pipeline <!-- omit in toc -->

[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.10.0-23aa62.svg)](https://www.nextflow.io/)
[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.10.1-23aa62.svg)](https://www.nextflow.io/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/singularity/)
[![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/sanger-bentley-group/gps-pipeline)
Expand Down Expand Up @@ -281,6 +281,8 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la
<!-- -->
> ℹ️ For virulence genes: POS = Positive; NEG = Negative
<!-- -->
> ⚠️ If the result of `Overall_QC` of a sample is `READ_ONE_CORRUPTED`, `READ_TWO_CORRUPTED` or both, the specific read file is found to be corrupted (i.e. incomplete/damaged Gzip file, mis-match(s) in read length and quality-score length). You might want to reacquire the read file from its source, or discard the sample if the source file is corrupted as well.
<!-- -->
> ⚠️ If the result of `Overall_QC` of a sample is `ASSEMBLER FAILURE`, the assembler has crashed when trying to assembly the reads. You might want to re-run the sample with [another assembler](#assembly), or discard the sample if it is a low quality one.
<!-- -->
> ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample.
Expand Down
9 changes: 6 additions & 3 deletions bin/get_overall_qc.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# Determine overall QC result based on Assembly QC, Mapping QC and Taxonomy QC
# In case of assembler failure, there will be no Assembly QC input, hence output result as ASSEMBLER FAILURE
# Determine overall QC result based on File Validity, Read QC, Assembly QC, Mapping QC and Taxonomy QC
# In case File Validity is not PASS, save its value (i.e. description of the issue) to Overall QC
# In case of assembler failure, there will be no Assembly QC input, save ASSEMBLER FAILURE to Overall QC

if [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then
if [[ ! "$FILE_VALIDITY" == "PASS" ]]; then
OVERALL_QC="$FILE_VALIDITY"
elif [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then
OVERALL_QC="PASS"
elif [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "null" ]]; then
OVERALL_QC="ASSEMBLER FAILURE"
Expand Down
38 changes: 38 additions & 0 deletions bin/validate_file.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Basic file validation of input files
# Check if read length is the same as quality-score length on all entries; if input is Gzipped, check file integrity first
# Report if any read is corrupted

FILE_VALIDITY=""

case "$READ_ONE" in
*.gz)
{
gzip -t "$READ_ONE" && zcat "$READ_ONE" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ;
} || {
FILE_VALIDITY+="READ_ONE_CORRUPTED;";
}
{
gzip -t "$READ_TWO" && zcat "$READ_TWO" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ;
} || {
FILE_VALIDITY+="READ_TWO_CORRUPTED;";
}
;;
*)
{
cat "$READ_ONE" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ;
} || {
FILE_VALIDITY+="READ_ONE_CORRUPTED;";
}
{
cat "$READ_TWO" | paste - - - - | awk -F"\t" '{ if (length($2) != length($4)) exit 1 }' ;
} || {
FILE_VALIDITY+="READ_TWO_CORRUPTED;";
}
;;
esac

if [[ "$FILE_VALIDITY" == "" ]]; then
FILE_VALIDITY="PASS"
else
FILE_VALIDITY="${FILE_VALIDITY%;}"
fi
2 changes: 1 addition & 1 deletion main.nf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env nextflow

// Version of this release
pipelineVersion = '1.0.0-rc3'
pipelineVersion = '1.0.0-rc4'

// Import workflow modules
include { PIPELINE } from "$projectDir/workflows/pipeline"
Expand Down
3 changes: 2 additions & 1 deletion modules/overall_qc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ process OVERALL_QC {
tag "$sample_id"

input:
tuple val(sample_id), val(read_qc), val(assembly_qc), val(mapping_qc), val(taxonomy_qc)
tuple val(sample_id), val(file_validity), val(read_qc), val(assembly_qc), val(mapping_qc), val(taxonomy_qc)

output:
tuple val(sample_id), env(OVERALL_QC), emit: result
Expand All @@ -15,6 +15,7 @@ process OVERALL_QC {
script:
overall_qc_report='overall_qc_report.csv'
"""
FILE_VALIDITY="$file_validity"
READ_QC="$read_qc"
ASSEMBLY_QC="$assembly_qc"
MAPPING_QC="$mapping_qc"
Expand Down
25 changes: 25 additions & 0 deletions modules/preprocess.nf
Original file line number Diff line number Diff line change
@@ -1,3 +1,28 @@
// Basic file validation of input files
process FILE_VALIDATION {
label 'bash_container'
label 'farm_low'

tag "$sample_id"

input:
tuple val(sample_id), path(reads)

output:
tuple val(sample_id), env(FILE_VALIDITY), emit: result

script:
read_one="${reads[0]}"
read_two="${reads[1]}"
"""
READ_ONE="$read_one"
READ_TWO="$read_two"
source validate_file.sh
"""
}


// Run fastp to preprocess the FASTQs
process PREPROCESS {
label 'fastp_container'
Expand Down
26 changes: 19 additions & 7 deletions workflows/pipeline.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// Import process modules
include { PREPROCESS; READ_QC } from "$projectDir/modules/preprocess"
include { FILE_VALIDATION; PREPROCESS; READ_QC } from "$projectDir/modules/preprocess"
include { ASSEMBLY_UNICYCLER; ASSEMBLY_SHOVILL; ASSEMBLY_ASSESS; ASSEMBLY_QC } from "$projectDir/modules/assembly"
include { GET_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping"
include { GET_KRAKEN2_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy"
Expand Down Expand Up @@ -32,9 +32,18 @@ workflow PIPELINE {
// Get read pairs into Channel raw_read_pairs_ch
raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true)

// Preprocess read pairs
// Basic input files validation
// Output into Channel FILE_VALIDATION.out.result
FILE_VALIDATION(raw_read_pairs_ch)

// From Channel raw_read_pairs_ch, only output valid reads of samples based on Channel FILE_VALIDATION.out.result
VALID_READS_ch = FILE_VALIDATION.out.result.join(raw_read_pairs_ch, failOnDuplicate: true, failOnMismatch: true)
.filter { it[1] == 'PASS' }
.map { it[0, 2..-1] }

// Preprocess valid read pairs
// Output into Channels PREPROCESS.out.processed_reads & PREPROCESS.out.json
PREPROCESS(raw_read_pairs_ch)
PREPROCESS(VALID_READS_ch)

// From Channel PREPROCESS.out.json, provide Read QC status
// Output into Channels READ_QC.out.bases, READ_QC.out.result, READ_QC.out.report
Expand Down Expand Up @@ -102,10 +111,12 @@ workflow PIPELINE {
// Output into Channels TAXONOMY_QC.out.result & TAXONOMY_QC.out.report
TAXONOMY_QC(TAXONOMY.out.report, params.spneumo_percentage, params.non_strep_percentage)

// Merge Channels AREAD_QC.out.result & SSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status
// Merge Channels FILE_VALIDATION.out.result & READ_QC.out.result & ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status
// Output into Channel OVERALL_QC.out.result & OVERALL_QC.out.report
OVERALL_QC(
READ_QC.out.result
raw_read_pairs_ch.map{ it[0] }
.join(FILE_VALIDATION.out.result, failOnDuplicate: true, failOnMismatch: true)
.join(READ_QC.out.result, failOnDuplicate: true, remainder: true)
.join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true)
.join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true)
.join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true)
Expand Down Expand Up @@ -150,11 +161,12 @@ workflow PIPELINE {

// Generate sample reports by merging outputs from all result-generating modules
GENERATE_SAMPLE_REPORT(
READ_QC.out.report
raw_read_pairs_ch.map{ it[0] }
.join(READ_QC.out.report, failOnDuplicate: true, remainder: true)
.join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true)
.join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true)
.join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true)
.join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true)
.join(OVERALL_QC.out.report, failOnDuplicate: true, failOnMismatch: true)
.join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true)
.join(MLST.out.report, failOnDuplicate: true, remainder: true)
.join(PARSE_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true)
Expand Down

0 comments on commit 745f064

Please sign in to comment.