From e8a2905c8a533ea2535f861815071d1008667eee Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:17:17 +0000 Subject: [PATCH 01/41] Ensure version of Python 3 is captured --- modules/info.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/info.nf b/modules/info.nf index 4cabccb..90cbc5b 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -445,7 +445,7 @@ process PYTHON_VERSION { shell: $/ - VERSION=$(python --version | sed -r "s/.*\s(.+)/\1/") + VERSION=$(python3 --version | sed -r "s/.*\s(.+)/\1/") /$ } From dc1064e8b6eac5887a968bef99a2b3d117d18532 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:20:26 +0000 Subject: [PATCH 02/41] Remove unnecessary versioning of Het-SNP Counter --- modules/info.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/info.nf b/modules/info.nf index 90cbc5b..cde8662 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -245,7 +245,6 @@ process PARSE { |${toolTextRow('BWA', 'bwa')} |${toolTextRow('SAMtools', 'samtools')} |${toolTextRow('BCFtools', 'bcftools')} - |${toolTextRow('Het-SNP Counter', 'het_snp_count')} |${toolTextRow('PopPUNK', 'poppunk')} |${toolTextRow('CDC PBP AMR Predictor', 'spn_pbp_amr')} |${toolTextRow('ARIBA', 'ariba')} From 47bb796fbbb8fcb79e0d78a79ebf303940f49cd5 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:20:51 +0000 Subject: [PATCH 03/41] Change default Python image to include Pandas --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 1ab6cb0..1a322f1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,7 +66,7 @@ process { container = 'bitnami/git:2.39.0' } withLabel: python_container { - container = 'python:3.11.1-bullseye' + container = 'amancevice/pandas:2.0.2-slim' } withLabel: fastp_container { container = 'staphb/fastp:0.23.2' From 231b7b1ae78c1fde380b7056da22c3202b8940a8 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:21:36 +0000 Subject: [PATCH 04/41] Improve Docker Image capturing --- bin/get_images_info.sh | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/bin/get_images_info.sh b/bin/get_images_info.sh index ba5428f..95dd83f 100755 --- a/bin/get_images_info.sh +++ b/bin/get_images_info.sh @@ -1,25 +1,25 @@ # Extract containers information from nextflow.config and save into a JSON file -IMAGES=$(grep -E "container\s?=" $NEXTFLOW_CONFIG \ - | sort -u \ - | sed -r "s/\s+container\s?=\s?'(.+)'/\1/") +find_image () { + grep -E "container\s?=" -B 1 $NEXTFLOW_CONFIG | grep -v -- "^--$" | paste - - | sort -u | grep $1 | sed -r "s/.+container\s?=\s?'(.+)'/\1/" +} -BASH=$(grep network-multitool <<< $IMAGES) -GIT=$(grep git <<< $IMAGES) -PYTHON=$(grep python <<< $IMAGES) -FASTP=$(grep fastp <<< $IMAGES) -UNICYCLER=$(grep unicycler <<< $IMAGES) -SHOVILL=$(grep shovill <<< $IMAGES) -QUAST=$(grep quast <<< $IMAGES) -BWA=$(grep bwa <<< $IMAGES) -SAMTOOLS=$(grep samtools <<< $IMAGES) -BCFTOOLS=$(grep bcftools <<< $IMAGES) -POPPUNK=$(grep poppunk <<< $IMAGES) -SPN_PBP_AMR=$(grep spn-pbp-amr <<< $IMAGES) -ARIBA=$(grep ariba <<< $IMAGES) -MLST=$(grep mlst <<< $IMAGES) -KRAKEN2=$(grep kraken2 <<< $IMAGES) -SEROBA=$(grep seroba <<< $IMAGES) +BASH=$(find_image bash) +GIT=$(find_image git) +PYTHON=$(find_image python) 
+FASTP=$(find_image fastp) +UNICYCLER=$(find_image unicycler) +SHOVILL=$(find_image shovill) +QUAST=$(find_image quast) +BWA=$(find_image bwa) +SAMTOOLS=$(find_image samtools) +BCFTOOLS=$(find_image bcftools) +POPPUNK=$(find_image poppunk) +SPN_PBP_AMR=$(find_image spn-pbp-amr) +ARIBA=$(find_image ariba) +MLST=$(find_image mlst) +KRAKEN2=$(find_image kraken2) +SEROBA=$(find_image seroba) add_container () { jq -n --arg container $1 '.container = $container' From 5d072252de429525558fb157ee858317c566b71d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 20 Jul 2023 16:16:27 +0000 Subject: [PATCH 05/41] Save reports as .csv --- bin/assembly_qc.sh | 3 +++ bin/mapping_qc.sh | 3 +++ bin/overall_qc.sh | 7 ++++++- bin/read_qc.sh | 3 +++ bin/taxonomy_qc.sh | 3 +++ modules/assembly.nf | 4 +++- modules/mapping.nf | 4 +++- modules/overall_qc.nf | 6 +++++- modules/preprocess.nf | 3 +++ modules/taxonomy.nf | 4 +++- 10 files changed, 35 insertions(+), 5 deletions(-) diff --git a/bin/assembly_qc.sh b/bin/assembly_qc.sh index 160ed72..9d399fc 100755 --- a/bin/assembly_qc.sh +++ b/bin/assembly_qc.sh @@ -9,3 +9,6 @@ if (( $CONTIGS < $QC_CONTIGS )) && (( $LENGTH >= $QC_LENGTH_LOW )) && (( $LENGTH else ASSEMBLY_QC="FAIL" fi + +echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > $ASSEMBLY_QC_REPORT +echo \"$ASSEMBLY_QC\",\"$CONTIGS\",\"$LENGTH\",\"$DEPTH\" >> $ASSEMBLY_QC_REPORT \ No newline at end of file diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh index 450e871..75b18a0 100755 --- a/bin/mapping_qc.sh +++ b/bin/mapping_qc.sh @@ -7,3 +7,6 @@ if (( $(echo "$COVERAGE > $QC_REF_COVERAGE" | bc -l) )) && (( $HET_SNP < $QC_HET else MAPPING_QC="FAIL" fi + +echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > $MAPPING_QC_REPORT +echo \"$MAPPING_QC\",\"$COVERAGE\",\"$QC_HET_SNP_SITE\" >> $MAPPING_QC_REPORT \ No newline at end of file diff --git a/bin/overall_qc.sh b/bin/overall_qc.sh index ccdb803..de7e116 100755 --- a/bin/overall_qc.sh +++ b/bin/overall_qc.sh @@ -1,10 +1,15 @@ # Determine overall QC result based on Assembly QC, Mapping QC and Taxonomy QC # In case of assembler failure, there will be no Assembly QC input, hence output result as ASSEMBLER FAILURE -if [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then +if [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then OVERALL_QC="PASS" +elif [[ "$READ_QC" == "FAIL" ]]; then + OVERALL_QC="FAIL" elif [[ "$ASSEMBLY_QC" == "null" ]]; then OVERALL_QC="ASSEMBLER FAILURE" else OVERALL_QC="FAIL" fi + +echo \"Overall_QC\" > $OVERALL_QC_REPORT +echo \"$OVERALL_QC\" >> $OVERALL_QC_REPORT \ No newline at end of file diff --git a/bin/read_qc.sh b/bin/read_qc.sh index 040c6ec..6ce8382 100755 --- a/bin/read_qc.sh +++ b/bin/read_qc.sh @@ -7,3 +7,6 @@ if (( $(echo "$BASES >= ($QC_LENGTH_LOW*$QC_DEPTH)" | bc -l) )); then else READ_QC="FAIL" fi + +echo \"Read_QC\",\"Bases\" > $READ_QC_REPORT +echo \"$READ_QC\",\"$BASES\" >> $READ_QC_REPORT \ No newline at end of file diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh index c468b14..23254b1 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/taxonomy_qc.sh @@ -11,3 +11,6 @@ if (( $(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l) )); then else TAXONOMY_QC="FAIL" fi + +echo \"Taxonomy_QC\",\"S.Pneumo_%\" > $TAXONOMY_QC_REPORT +echo \"$TAXONOMY_QC\",\"$PERCENTAGE\" >> $TAXONOMY_QC_REPORT \ No newline at end of file diff --git 
a/modules/assembly.nf b/modules/assembly.nf index 4699289..fd84c76 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -85,10 +85,11 @@ process ASSEMBLY_QC { val(qc_depth) output: - tuple val(sample_id), env(CONTIGS), env(LENGTH), env(DEPTH), emit: info tuple val(sample_id), env(ASSEMBLY_QC), emit: result + tuple val(sample_id), path(assembly_qc_report), emit: report script: + assembly_qc_report='assembly_qc_report.csv' """ REPORT="$report" BASES="$bases" @@ -96,6 +97,7 @@ process ASSEMBLY_QC { QC_LENGTH_LOW="$qc_length_low" QC_LENGTH_HIGH="$qc_length_high" QC_DEPTH="$qc_depth" + ASSEMBLY_QC_REPORT="$assembly_qc_report" source assembly_qc.sh """ diff --git a/modules/mapping.nf b/modules/mapping.nf index f0d1e0e..0a37628 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -138,15 +138,17 @@ process MAPPING_QC { val(qc_het_snp_site) output: - tuple val(sample_id), env(COVERAGE), env(HET_SNP), emit: info tuple val(sample_id), env(MAPPING_QC), emit: result + tuple val(sample_id), path(mapping_qc_report), emit: report script: + mapping_qc_report='mapping_qc_report.csv' """ COVERAGE="$ref_coverage" HET_SNP="$het_snp_count" QC_REF_COVERAGE="$qc_ref_coverage" QC_HET_SNP_SITE="$qc_het_snp_site" + MAPPING_QC_REPORT="$mapping_qc_report" source mapping_qc.sh """ diff --git a/modules/overall_qc.nf b/modules/overall_qc.nf index b212595..fa639d9 100644 --- a/modules/overall_qc.nf +++ b/modules/overall_qc.nf @@ -6,16 +6,20 @@ process OVERALL_QC { tag "$sample_id" input: - tuple val(sample_id), val(assembly_qc), val(mapping_qc), val(taxonomy_qc) + tuple val(sample_id), val(read_qc), val(assembly_qc), val(mapping_qc), val(taxonomy_qc) output: tuple val(sample_id), env(OVERALL_QC), emit: result + tuple val(sample_id), path(overall_qc_report), emit: report script: + overall_qc_report='overall_qc_report.csv' """ + READ_QC="$read_qc" ASSEMBLY_QC="$assembly_qc" MAPPING_QC="$mapping_qc" TAXONOMY_QC="$taxonomy_qc" + OVERALL_QC_REPORT="$overall_qc_report" source overall_qc.sh """ diff --git a/modules/preprocess.nf b/modules/preprocess.nf index 4e8e18c..e04b756 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -38,12 +38,15 @@ process READ_QC { output: tuple val(sample_id), env(BASES), emit: bases tuple val(sample_id), env(READ_QC), emit: result + tuple val(sample_id), path(read_qc_report), emit: report script: + read_qc_report='read_qc_report.csv' """ JSON="$json" QC_LENGTH_LOW="$qc_length_low" QC_DEPTH="$qc_depth" + READ_QC_REPORT="$read_qc_report" source read_qc.sh """ diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index af6266d..b4d1e62 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -63,13 +63,15 @@ process TAXONOMY_QC { val(qc_spneumo_percentage) output: - tuple val(sample_id), env(PERCENTAGE), emit: percentage tuple val(sample_id), env(TAXONOMY_QC), emit: result + tuple val(sample_id), path(taxonomy_qc_report), emit: report script: + taxonomy_qc_report='taxonomy_qc_report.csv' """ KRAKEN2_REPORT="$kraken2_report" QC_SPNEUMO_PERCENTAGE="$qc_spneumo_percentage" + TAXONOMY_QC_REPORT="$taxonomy_qc_report" source taxonomy_qc.sh """ From ea1b1bab32ded4e1ba187b693acfed0e91bc733f Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 20 Jul 2023 16:17:14 +0000 Subject: [PATCH 06/41] Combining reports to generate sample report --- bin/generate_sample_report.sh | 3 +++ modules/output.nf | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100755 bin/generate_sample_report.sh create mode 100644 
modules/output.nf diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh new file mode 100755 index 0000000..ec769f3 --- /dev/null +++ b/bin/generate_sample_report.sh @@ -0,0 +1,3 @@ +paste -d , *.csv \ +| sed '1 s/^/\"Sample_ID\",/' \ +| sed "2 s/^/\"${SAMPLE_ID}\",/" > $SAMPLE_REPORT \ No newline at end of file diff --git a/modules/output.nf b/modules/output.nf new file mode 100644 index 0000000..a711425 --- /dev/null +++ b/modules/output.nf @@ -0,0 +1,21 @@ +process GENERATE_SAMPLE_REPORT { + label 'bash_container' + label 'farm_low' + + tag "$sample_id" + + input: + tuple val(sample_id), path ('report*.csv') + + output: + path sample_report + + script: + sample_report="${sample_id}_report.csv" + """ + SAMPLE_ID=$sample_id + SAMPLE_REPORT=$sample_report + + source generate_sample_report.sh + """ +} \ No newline at end of file From 01b22b87a70de4c9077b879c74a69aeeb03fedbd Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 20 Jul 2023 16:17:23 +0000 Subject: [PATCH 07/41] Initial work on output revamp (WIP) --- workflows/pipeline.nf | 110 ++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 48 deletions(-) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 6dc59fe..ef8f650 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -8,6 +8,7 @@ include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/ include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" +include { GENERATE_SAMPLE_REPORT } from "$projectDir/modules/output" // Main pipeline workflow workflow PIPELINE { @@ -103,9 +104,10 @@ workflow PIPELINE { // Merge Channels ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status // Output into Channel OVERALL_QC.out.result OVERALL_QC( - ASSEMBLY_QC.out.result + READ_QC.out.result + .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) - .join(TAXONOMY_QC.out.result, failOnDuplicate: true) + .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) ) // From Channel READ_QC_PASSED_READS_ch, only output reads of samples passed overall QC based on Channel OVERALL_QC.out.result @@ -155,52 +157,64 @@ workflow PIPELINE { // GET_OTHER_RESISTANCE.out.result // // Replace null with approiate amount of "_" items when sample_id does not exist in that output (i.e. QC rejected) - READ_QC.out.result - .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(OVERALL_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['FAIL'] : it } - .join(READ_QC.out.bases, failOnDuplicate: true, failOnMismatch: true) - .join(ASSEMBLY_QC.out.info, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 3 : it } - .join(MAPPING_QC.out.info, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? 
it[0..-2] + ['_'] * 2 : it } - .join(TAXONOMY_QC.out.percentage, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(LINEAGE.out.csv.splitCsv(skip: 1), failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(SEROTYPE.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(MLST.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } - .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } - .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 24 : it } - .map { it.collect {"\"$it\""}.join',' } - .collectFile( - name: 'results.csv', - storeDir: "$params.output", - seed: [ - 'Sample_ID', - 'Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC', - 'Bases', - 'Contigs#' , 'Assembly_Length', 'Seq_Depth', - 'Ref_Cov_%', 'Het-SNP#' , - 'S.Pneumo_%', - 'GPSC', - 'Serotype', - 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', - 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' - ].join(','), - sort: { it.split(',')[0] }, - newLine: true - ) + + GENERATE_SAMPLE_REPORT( + READ_QC.out.report + .join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true) + .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) + .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) + .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) + .map { [it[0], it[1..-1].minus(null)] } + ).view() + + // GENERATE_OVERALL_REPORT + + // READ_QC.out.result + // .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(OVERALL_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['FAIL'] : it } + // .join(READ_QC.out.bases, failOnDuplicate: true, failOnMismatch: true) + // .join(ASSEMBLY_QC.out.info, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 3 : it } + // .join(MAPPING_QC.out.info, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 2 : it } + // .join(TAXONOMY_QC.out.percentage, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? 
it[0..-2] + ['_'] : it } + // .join(LINEAGE.out.csv.splitCsv(skip: 1), failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(SEROTYPE.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(MLST.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } + // .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } + // .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 24 : it } + // .map { it.collect {"\"$it\""}.join',' } + // .collectFile( + // name: 'results.csv', + // storeDir: "$params.output", + // seed: [ + // 'Sample_ID', + // 'Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC', + // 'Bases', + // 'Contigs#' , 'Assembly_Length', 'Seq_Depth', + // 'Ref_Cov_%', 'Het-SNP#' , + // 'S.Pneumo_%', + // 'GPSC', + // 'Serotype', + // 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', + // 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', + // 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' + // ].join(','), + // sort: { it.split(',')[0] }, + // newLine: true + // ) // Pass to SAVE_INFO sub-workflow DATABASES_INFO = CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } From d0c5123b34773a7832dd7ff7ea3740c078c29dda Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 11:40:03 +0000 Subject: [PATCH 08/41] Save Lineage report per sample as .csv --- bin/get_lineage.sh | 13 +++++++++++++ modules/lineage.nf | 15 ++++++++------- workflows/pipeline.nf | 1 + 3 files changed, 22 insertions(+), 7 deletions(-) create mode 100755 bin/get_lineage.sh diff --git a/bin/get_lineage.sh b/bin/get_lineage.sh new file mode 100755 index 0000000..70e4f08 --- /dev/null +++ b/bin/get_lineage.sh @@ -0,0 +1,13 @@ +# Run PopPUNK to assign GPSCs to samples + +# Add "prefix_" to all sample names in qfile to avoid poppunk_assign crashing due to sample name already exists in database +# Remove "prefix_" from all sample names in the result + +# Save results of individual sample into .csv with its name as filename + +sed 's/^/prefix_/' "$QFILE" > safe_qfile.txt +poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads $(nproc) +sed 's/^prefix_//' output/output_external_clusters.csv > result.txt + + +awk -F , 'NR!=1 { print "GPSC\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file diff --git a/modules/lineage.nf b/modules/lineage.nf index 6e13fab..68edae3 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -46,8 +46,7 @@ process 
GET_POPPUNK_EXT_CLUSTERS { } // Run PopPUNK to assign GPSCs to samples -// Add "prefix_" to all sample names in qfile to avoid poppunk_assign crashing due to sample name already exists in database -// Remove "prefix_" from all sample names in the output +// Save results of individual sample into .csv with its name as filename process LINEAGE { label 'poppunk_container' label 'farm_high' @@ -63,13 +62,15 @@ process LINEAGE { path qfile output: - path(result), emit: csv + path '*.csv', emit: reports script: - result='result.csv' """ - sed 's/^/prefix_/' "$qfile" > safe_qfile.txt - poppunk_assign --db "${poppunk_dir}/${db_name}" --external-clustering "${poppunk_dir}/${ext_clusters_file}" --query safe_qfile.txt --output output --threads `nproc` - sed 's/^prefix_//' output/output_external_clusters.csv > "$result" + QFILE="$qfile" + POPPUNK_DIR="$poppunk_dir" + DB_NAME="$db_name" + EXT_CLUSTERS_FILE="$ext_clusters_file" + + source get_lineage.sh """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index ef8f650..688e928 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -164,6 +164,7 @@ workflow PIPELINE { .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) + .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From bfcca3163f37354fda219cfab3c2feec9ddd01f6 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 14:04:07 +0000 Subject: [PATCH 09/41] Add quote to csv header --- bin/get_lineage.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get_lineage.sh b/bin/get_lineage.sh index 70e4f08..63b6ec0 100755 --- a/bin/get_lineage.sh +++ b/bin/get_lineage.sh @@ -10,4 +10,4 @@ poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK sed 's/^prefix_//' output/output_external_clusters.csv > result.txt -awk -F , 'NR!=1 { print "GPSC\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file +awk -F , 'NR!=1 { print "\"GPSC\"\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file From 897aa091a42ef4372a0476bea63114de9c51eef8 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 14:05:13 +0000 Subject: [PATCH 10/41] Save Serotype report as .csv --- bin/get_serotype.sh | 3 +++ modules/serotype.nf | 6 +++++- workflows/pipeline.nf | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/bin/get_serotype.sh b/bin/get_serotype.sh index b17c2de..80bfcc7 100755 --- a/bin/get_serotype.sh +++ b/bin/get_serotype.sh @@ -4,3 +4,6 @@ } || { SEROTYPE="SEROBA FAILURE" } + +echo \"Serotype\" > $SEROTYPE_REPORT +echo \"$SEROTYPE\" >> $SEROTYPE_REPORT \ No newline at end of file diff --git a/modules/serotype.nf b/modules/serotype.nf index 5c268fc..0c69bad 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -66,9 +66,10 @@ process SEROTYPE { tuple val(sample_id), path(read1), path(read2), path(unpaired) output: - tuple val(sample_id), env(SEROTYPE), emit: result + tuple val(sample_id), path(serotype_report), emit: report script: + serotype_report='serotype_report.csv' // When using Singularity as container engine, SeroBA sometimes gives 
incorrect result or critical error // Uncertain root cause, happen randomly when input are located directly in a Nextflow process work directory // Workaround: create and use a subdirectory to alter the path @@ -79,6 +80,7 @@ process SEROTYPE { READ1="$read1" READ2="$read2" SAMPLE_ID="$sample_id" + SEROTYPE_REPORT="$serotype_report" source get_serotype.sh """ @@ -89,12 +91,14 @@ process SEROTYPE { READ1="$read1" READ2="$read2" SAMPLE_ID="$sample_id" + SEROTYPE_REPORT="$serotype_report" mkdir SEROBA_WORKDIR && mv $seroba_dir $read1 $read2 SEROBA_WORKDIR && cd SEROBA_WORKDIR source get_serotype.sh cd ../ + mv SEROBA_WORKDIR/$serotype_report ./ """ else error "The process must be run with Docker or Singularity as container engine." diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 688e928..257d4dd 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -164,6 +164,7 @@ workflow PIPELINE { .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) + .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From 58a549f948e3c9581779f1b753448bdc8e7acfc6 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 14:18:04 +0000 Subject: [PATCH 11/41] Save MLST report as .csv --- bin/get_mlst.sh | 3 +++ modules/mlst.nf | 4 +++- workflows/pipeline.nf | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/get_mlst.sh b/bin/get_mlst.sh index 7e5c61a..ab7c8e9 100755 --- a/bin/get_mlst.sh +++ b/bin/get_mlst.sh @@ -12,3 +12,6 @@ recP=$(awk -F'\t' 'FNR == 2 {print $7}' $OUTPUT) spi=$(awk -F'\t' 'FNR == 2 {print $8}' $OUTPUT) xpt=$(awk -F'\t' 'FNR == 2 {print $9}' $OUTPUT) ddl=$(awk -F'\t' 'FNR == 2 {print $10}' $OUTPUT) + +echo \"ST\",\"aroE\",\"gdh\",\"gki\",\"recP\",\"spi\",\"xpt\",\"ddl\" > $MLST_REPORT +echo \"$ST\",\"$aroE\",\"$gdh\",\"$gki\",\"$recP\",\"$spi\",\"$xpt\",\"$ddl\" >> $MLST_REPORT \ No newline at end of file diff --git a/modules/mlst.nf b/modules/mlst.nf index cc766d4..c8d12e4 100644 --- a/modules/mlst.nf +++ b/modules/mlst.nf @@ -9,11 +9,13 @@ process MLST { tuple val(sample_id), path(assembly) output: - tuple val(sample_id), env(ST), env(aroE), env(gdh), env(gki), env(recP), env(spi), env(xpt), env(ddl), emit: result + tuple val(sample_id), path(mlst_report), emit: report script: + mlst_report='mlst_report.csv' """ ASSEMBLY="$assembly" + MLST_REPORT="$mlst_report" source get_mlst.sh """ diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 257d4dd..32089a9 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -165,6 +165,7 @@ workflow PIPELINE { .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) + .join(MLST.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] 
} ).view() From 51aebdc959fab1e66639e7567ac369175169f5a8 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 15:21:49 +0000 Subject: [PATCH 12/41] Save PBP AMR report as .csv --- bin/get_pbp_resistance.sh | 3 +++ modules/amr.nf | 4 +++- workflows/pipeline.nf | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/get_pbp_resistance.sh b/bin/get_pbp_resistance.sh index d7082eb..5e833c3 100755 --- a/bin/get_pbp_resistance.sh +++ b/bin/get_pbp_resistance.sh @@ -30,3 +30,6 @@ MER=$(GET_RES "mem") PEN_MIC=$(GET_VALUE "penMic") PEN_NONMENINGITIS=$(GET_RES "penNonMeningitis") PEN_MENINGITIS=$(GET_RES "penMeningitis") + +echo \"pbp1a\",\"pbp2b\",\"pbp2x\",\"AMO_MIC\",\"AMO_Res\",\"CFT_MIC\",\"CFT_Res\(Meningital\)\",\"CFT_Res\(Non-meningital\)\",\"TAX_MIC\",\"TAX_Res\(Meningital\)\",\"TAX_Res\(Non-meningital\)\",\"CFX_MIC\",\"CFX_Res\",\"MER_MIC\",\"MER_Res\",\"PEN_MIC\",\"PEN_Res\(Meningital\)\",\"PEN_Res\(Non-meningital\)\" > $PBP_AMR_REPORT +echo \"$pbp1a\",\"$pbp2b\",\"$pbp2x\",\"$AMO_MIC\",\"$AMO\",\"$CFT_MIC\",\"$CFT_MENINGITIS\",\"$CFT_NONMENINGITIS\",\"$TAX_MIC\",\"$TAX_MENINGITIS\",\"$TAX_NONMENINGITIS\",\"$CFX_MIC\",\"$CFX\",\"$MER_MIC\",\"$MER\",\"$PEN_MIC\",\"$PEN_MENINGITIS\",\"$PEN_NONMENINGITIS\" >> $PBP_AMR_REPORT \ No newline at end of file diff --git a/modules/amr.nf b/modules/amr.nf index 6a2a0bf..3aabed7 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -29,11 +29,13 @@ process GET_PBP_RESISTANCE { tuple val(sample_id), path(json) output: - tuple val(sample_id), env(pbp1a), env(pbp2b), env(pbp2x), env(AMO_MIC), env(AMO), env(CFT_MIC), env(CFT_MENINGITIS), env(CFT_NONMENINGITIS), env(TAX_MIC), env(TAX_MENINGITIS), env(TAX_NONMENINGITIS), env(CFX_MIC), env(CFX), env(MER_MIC), env(MER), env(PEN_MIC), env(PEN_MENINGITIS), env(PEN_NONMENINGITIS), emit: result + tuple val(sample_id), path(pbp_amr_report), emit: report script: + pbp_amr_report='pbp_amr_report.csv' """ JSON_FILE="$json" + PBP_AMR_REPORT="$pbp_amr_report" source get_pbp_resistance.sh """ diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 32089a9..c78f5fa 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -166,6 +166,7 @@ workflow PIPELINE { .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(MLST.out.report, failOnDuplicate: true, remainder: true) + .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From 2bb7ad12dd1b2555da6e52cd4432525929925680 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 15:39:52 +0000 Subject: [PATCH 13/41] Save other AMR report as .csv --- bin/get_other_resistance.py | 7 +++++-- bin/get_other_resistance.sh | 40 ------------------------------------- modules/amr.nf | 9 +++------ workflows/pipeline.nf | 1 + 4 files changed, 9 insertions(+), 48 deletions(-) delete mode 100755 bin/get_other_resistance.sh diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index 4f71294..d902f59 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -5,11 +5,13 @@ import sys from itertools import chain from collections import defaultdict 
-import json +import pandas as pd +import csv report_path = sys.argv[1] debug_report_path = sys.argv[2] metadata_path = sys.argv[3] +output_file = sys.argv[4] with open(report_path) as report, open(debug_report_path) as debug_report, open(metadata_path) as metadata: # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata @@ -127,4 +129,5 @@ output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict and len(target_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict and len(target_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] - print(json.dumps(output, indent=4)) \ No newline at end of file + # Save output dict as csv + pd.DataFrame([output]).to_csv(output_file, index=False, quoting=csv.QUOTE_ALL) \ No newline at end of file diff --git a/bin/get_other_resistance.sh b/bin/get_other_resistance.sh deleted file mode 100755 index befd4a4..0000000 --- a/bin/get_other_resistance.sh +++ /dev/null @@ -1,40 +0,0 @@ -# Run get_other_resistance.py to infer AMR from ARIBA reports, then capture individual AMR from the output for Nextflow - -function GET_VALUE { - echo $(grep \"$1\" <<< $OUTPUT | sed -r 's/.+: "(.*)",?/\1/') -} - -OUTPUT=$(get_other_resistance.py "$REPORT" "$REPORT_DEBUG" "$METADATA") - -CHL_Res=$(GET_VALUE "CHL_Res") -CHL_Determinant=$(GET_VALUE "CHL_Determinant") -ERY_Res=$(GET_VALUE "ERY_Res") -ERY_Determinant=$(GET_VALUE "ERY_Determinant") -CLI_Res=$(GET_VALUE "CLI_Res") -CLI_Determinant=$(GET_VALUE "CLI_Determinant") -ERY_CLI_Res=$(GET_VALUE "ERY_CLI_Res") -ERY_CLI_Determinant=$(GET_VALUE "ERY_CLI_Determinant") -FQ_Res=$(GET_VALUE "FQ_Res") -FQ_Determinant=$(GET_VALUE "FQ_Determinant") -LFX_Res=$(GET_VALUE "LFX_Res") -LFX_Determinant=$(GET_VALUE "LFX_Determinant") -KAN_Res=$(GET_VALUE "KAN_Res") -KAN_Determinant=$(GET_VALUE "KAN_Determinant") -TET_Res=$(GET_VALUE "TET_Res") -TET_Determinant=$(GET_VALUE "TET_Determinant") -DOX_Res=$(GET_VALUE "DOX_Res") -DOX_Determinant=$(GET_VALUE "DOX_Determinant") -TMP_Res=$(GET_VALUE "TMP_Res") -TMP_Determinant=$(GET_VALUE "TMP_Determinant") -SMX_Res=$(GET_VALUE "SMX_Res") -SMX_Determinant=$(GET_VALUE "SMX_Determinant") -COT_Res=$(GET_VALUE "COT_Res") -COT_Determinant=$(GET_VALUE "COT_Determinant") -RIF_Res=$(GET_VALUE "RIF_Res") -RIF_Determinant=$(GET_VALUE "RIF_Determinant") -VAN_Res=$(GET_VALUE "VAN_Res") -VAN_Determinant=$(GET_VALUE "VAN_Determinant") -PILI1=$(GET_VALUE "PILI1") -PILI1_Determinant=$(GET_VALUE "PILI1_Determinant") -PILI2=$(GET_VALUE "PILI2") -PILI2_Determinant=$(GET_VALUE "PILI2_Determinant") \ No newline at end of file diff --git a/modules/amr.nf b/modules/amr.nf index 3aabed7..1fd57f4 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -104,14 +104,11 @@ process GET_OTHER_RESISTANCE { path metadata output: - tuple val(sample_id), env(CHL_Res), env(CHL_Determinant), env(ERY_Res), env(ERY_Determinant), env(CLI_Res), env(CLI_Determinant), env(ERY_CLI_Res), env(ERY_CLI_Determinant), env(FQ_Res), env(FQ_Determinant), env(LFX_Res), env(LFX_Determinant), env(KAN_Res), env(KAN_Determinant), env(TET_Res), env(TET_Determinant), env(DOX_Res), env(DOX_Determinant), env(TMP_Res), env(TMP_Determinant), env(SMX_Res), env(SMX_Determinant), env(COT_Res), env(COT_Determinant), env(RIF_Res), env(RIF_Determinant), env(VAN_Res), env(VAN_Determinant), env(PILI1), env(PILI1_Determinant), env(PILI2), 
env(PILI2_Determinant), emit: result + tuple val(sample_id), path(output_file), emit: report script: + output_file="other_amr_report.csv" """ - REPORT="$report" - REPORT_DEBUG="$report_debug" - METADATA="$metadata" - - source get_other_resistance.sh + get_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file" """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index c78f5fa..f8fc630 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -167,6 +167,7 @@ workflow PIPELINE { .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(MLST.out.report, failOnDuplicate: true, remainder: true) .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) + .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From e94cc56378de3fef6c4ff3debfcec9296dc31ff8 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 13:58:13 +0000 Subject: [PATCH 14/41] Improve clarity of Read QC module --- README.md | 2 +- doc/workflow.drawio.svg | 100 +++++++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 8fdc0d6..149928e 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `--assembly_publish` | `"link"` or `"symlink"` or `"copy"`
(Default: `"link"`)| Method used by Nextflow to publish the generated assemblies.
(The default setting `"link"` means hard link, therefore will fail if the output directory is set to outside of the working file system) | ## QC Parameters -> ℹ️ Read QC does not have directly accessible parameters. The minimum base count in reads of Read QC is based on the multiplication of `--length_low` and `--depth` of Assembly QC. +> ℹ️ Read QC does not have directly accessible parameters. The minimum base count in reads of Read QC is based on the multiplication of `--length_low` and `--depth` of Assembly QC (i.e. default value is `38000000`). | Option | Values | Description | | --- | ---| --- | diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg index f2e08ab..00766b6 100644 --- a/doc/workflow.drawio.svg +++ b/doc/workflow.drawio.svg [SVG markup of the workflow diagram diff is not recoverable from this extraction; the change updates the diagram so the Read QC step shows the default threshold "Bases: ≥ 38 Mb" in place of "Bases: ≥ Min Length x Depth", and adds the note "QC values shown in the diagram are the default values".] From fe079b96442e9b534703488a885f74f90584e94 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 15:47:35 +0000 Subject: [PATCH 15/41] Correct output column names --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 149928e..aa68b3e 100644 --- a/README.md +++ b/README.md @@ -351,10 +351,10 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance | | `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) | | `VAN_Determinant` | Other AMR | Known determinants that inferred the VAN resistance | - | `PILI-1` | Other AMR | Expression of PILI-1 | - | `PILI-1_Determinant` | Other AMR | Known determinants that inferred the PILI-1 expression | - | `PILI-2` | Other AMR | Expression of PILI-2 | - | `PILI-2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression | + | `PILI1` | Other AMR | Expression of PILI-1 | + | `PILI1_Determinant` | Other AMR | Known determinants that inferred the PILI-1 expression | + | `PILI2` | Other AMR | Expression of PILI-2 | + | `PILI2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression |   # Credits From 19312f9574d53d1cfd3f5b49e27044d264865c37 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 15:53:21 +0000 Subject: [PATCH 16/41] Initial implementation of overall report revamp --- bin/generate_overall_report.py | 22 ++++++++++++++++++++++ modules/output.nf | 25 +++++++++++++++++++++++-- workflows/pipeline.nf | 6 +++--- 3 files changed, 48 insertions(+), 5 deletions(-) create mode 100755 bin/generate_overall_report.py diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py new file mode 100755 index 0000000..55287d2 --- /dev/null +++ b/bin/generate_overall_report.py @@ -0,0 +1,22 @@ +#!
/usr/bin/env python3 + +import sys +import glob +import pandas as pd + +workdir_path = sys.argv[1] +ariba_metadata = sys.argv[2] +output_file = sys.argv[3] + +output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)' , 'CHL_Res' , 'CHL_Determinant' , 'ERY_Res' , 'ERY_Determinant' , 'CLI_Res' , 'CLI_Determinant' , 'ERY_CLI_Res' , 'ERY_CLI_Determinant' , 'FQ_Res' , 'FQ_Determinant' , 'LFX_Res' , 'LFX_Determinant' , 'KAN_Res' , 'KAN_Determinant' , 'TET_Res' , 'TET_Determinant' , 'DOX_Res' , 'DOX_Determinant' , 'TMP_Res' , 'TMP_Determinant' , 'SMX_Res' , 'SMX_Determinant' , 'COT_Res' , 'COT_Determinant' , 'RIF_Res' , 'RIF_Determinant' , 'VAN_Res' , 'VAN_Determinant' , 'PILI1' , 'PILI1_Determinant' , 'PILI2' , 'PILI2_Determinant'] +df_manifest = pd.DataFrame(columns=output_columns) + +dfs = [df_manifest] + +reports = glob.glob(workdir_path +'/*.csv') +for report in reports: + df = pd.read_csv(report) + dfs.append(df) + +df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) +df_output.to_csv(output_file, index=False, na_rep='_') diff --git a/modules/output.nf b/modules/output.nf index a711425..6bc6a27 100644 --- a/modules/output.nf +++ b/modules/output.nf @@ -8,7 +8,7 @@ process GENERATE_SAMPLE_REPORT { tuple val(sample_id), path ('report*.csv') output: - path sample_report + path sample_report, emit: report script: sample_report="${sample_id}_report.csv" @@ -18,4 +18,25 @@ process GENERATE_SAMPLE_REPORT { source generate_sample_report.sh """ -} \ No newline at end of file +} + +process GENERATE_OVERALL_REPORT { + label 'python_container' + label 'farm_low' + + publishDir "${params.output}", mode: "copy" + + input: + path 'report*.csv' + path "$ariba_metadata" + + output: + path "$overall_report", emit: report + + script: + overall_report='results.csv' + ariba_metadata='ariba_metadata.tsv' + """ + generate_overall_report.py `pwd` $ariba_metadata $overall_report + """ +} diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index f8fc630..e962742 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -8,7 +8,7 @@ include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/ include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" -include { GENERATE_SAMPLE_REPORT } from "$projectDir/modules/output" +include { GENERATE_SAMPLE_REPORT; GENERATE_OVERALL_REPORT } from "$projectDir/modules/output" // Main pipeline workflow workflow PIPELINE { @@ -170,9 +170,9 @@ workflow PIPELINE { .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } - 
).view() + ) - // GENERATE_OVERALL_REPORT + GENERATE_OVERALL_REPORT(GENERATE_SAMPLE_REPORT.out.report.collect(), params.ariba_metadata) // READ_QC.out.result // .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) From 4c7fc286c0a754820b2490a9cb9d08d486810e2c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 16:01:39 +0000 Subject: [PATCH 17/41] Improve comments; remove obsolete code --- workflows/pipeline.nf | 64 +++---------------------------------------- 1 file changed, 4 insertions(+), 60 deletions(-) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index e962742..fd288c9 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -147,17 +147,7 @@ workflow PIPELINE { OTHER_RESISTANCE(CREATE_ARIBA_DB.out.path, CREATE_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) - // Generate results.csv by sorted sample_id based on merged Channels - // READ_QC.out.result, ASSEMBLY_QC.out.result, MAPPING_QC.out.result, TAXONOMY_QC.out.result, OVERALL_QC.out.result, - // READ_QC.out.bases, ASSEMBLY_QC.out.info, MAPPING_QC.out.info, TAXONOMY_QC.out.percentage - // LINEAGE.out.csv, - // SEROTYPE.out.result, - // MLST.out.result, - // GET_PBP_RESISTANCE.out.result, - // GET_OTHER_RESISTANCE.out.result - // - // Replace null with approiate amount of "_" items when sample_id does not exist in that output (i.e. QC rejected) - + // Generate sample reports by merging outputs from all result-generating modules GENERATE_SAMPLE_REPORT( READ_QC.out.report .join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true) @@ -169,59 +159,13 @@ workflow PIPELINE { .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name - .map { [it[0], it[1..-1].minus(null)] } + .map { [it[0], it[1..-1].minus(null)] } // Map Sample_ID to index 0 and all reports (with null entries removed) as a list to index 1 ) + // Generate overall report by concatenating sample reports GENERATE_OVERALL_REPORT(GENERATE_SAMPLE_REPORT.out.report.collect(), params.ariba_metadata) - // READ_QC.out.result - // .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(OVERALL_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['FAIL'] : it } - // .join(READ_QC.out.bases, failOnDuplicate: true, failOnMismatch: true) - // .join(ASSEMBLY_QC.out.info, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 3 : it } - // .join(MAPPING_QC.out.info, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 2 : it } - // .join(TAXONOMY_QC.out.percentage, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? 
it[0..-2] + ['_'] : it } - // .join(LINEAGE.out.csv.splitCsv(skip: 1), failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(SEROTYPE.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(MLST.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } - // .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } - // .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 24 : it } - // .map { it.collect {"\"$it\""}.join',' } - // .collectFile( - // name: 'results.csv', - // storeDir: "$params.output", - // seed: [ - // 'Sample_ID', - // 'Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC', - // 'Bases', - // 'Contigs#' , 'Assembly_Length', 'Seq_Depth', - // 'Ref_Cov_%', 'Het-SNP#' , - // 'S.Pneumo_%', - // 'GPSC', - // 'Serotype', - // 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', - // 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - // 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' - // ].join(','), - // sort: { it.split(',')[0] }, - // newLine: true - // ) - - // Pass to SAVE_INFO sub-workflow + // Pass databases information to SAVE_INFO sub-workflow DATABASES_INFO = CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } .merge(CREATE_ARIBA_DB.out.path.map { [["ariba_db_path", it]] }) .merge(GET_KRAKEN2_DB.out.path.map { [["kraken2_db_path", it]] }) From 073ddd3b1bdd090026017eb34c57f1538e7b90e2 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 17:18:24 +0000 Subject: [PATCH 18/41] Generate results.csv based on ARIBA metadata --- bin/generate_overall_report.py | 33 ++++++++++++++++++++++++++++++++- modules/output.nf | 3 +-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index 55287d2..e48fb53 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -8,7 +8,35 @@ ariba_metadata = sys.argv[2] output_file = sys.argv[3] -output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)' , 'CHL_Res' , 'CHL_Determinant' , 
'ERY_Res' , 'ERY_Determinant' , 'CLI_Res' , 'CLI_Determinant' , 'ERY_CLI_Res' , 'ERY_CLI_Determinant' , 'FQ_Res' , 'FQ_Determinant' , 'LFX_Res' , 'LFX_Determinant' , 'KAN_Res' , 'KAN_Determinant' , 'TET_Res' , 'TET_Determinant' , 'DOX_Res' , 'DOX_Determinant' , 'TMP_Res' , 'TMP_Determinant' , 'SMX_Res' , 'SMX_Determinant' , 'COT_Res' , 'COT_Determinant' , 'RIF_Res' , 'RIF_Determinant' , 'VAN_Res' , 'VAN_Determinant' , 'PILI1' , 'PILI1_Determinant' , 'PILI2' , 'PILI2_Determinant'] +output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)'] + +ariba_targets = set(pd.read_csv(ariba_metadata, sep='\t')['target'].unique()) + +if 'TET' in ariba_targets: + ariba_targets.add('DOX') + +if 'FQ' in ariba_targets: + ariba_targets.add('LFX') + +if 'TMP' in ariba_targets and 'SMX' in ariba_targets: + ariba_targets.add('COT') + +if 'ERY_CLI' in ariba_targets: + ariba_targets.update(['ERY', 'CLI']) + +ariba_targets = sorted(ariba_targets) + +pilis = [] + +for target in ariba_targets: + if target.lower().startswith('pili'): + pilis.append(target) + else: + output_columns.extend([f'{target}_Res', f'{target}_Determinant']) + +for pili in pilis: + output_columns.extend([f'{pili}', f'{pili}_Determinant']) + df_manifest = pd.DataFrame(columns=output_columns) dfs = [df_manifest] @@ -19,4 +47,7 @@ dfs.append(df) df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) + +df_output = df_output[output_columns] + df_output.to_csv(output_file, index=False, na_rep='_') diff --git a/modules/output.nf b/modules/output.nf index 6bc6a27..0fdbfc6 100644 --- a/modules/output.nf +++ b/modules/output.nf @@ -28,14 +28,13 @@ process GENERATE_OVERALL_REPORT { input: path 'report*.csv' - path "$ariba_metadata" + path 'ariba_metadata' output: path "$overall_report", emit: report script: overall_report='results.csv' - ariba_metadata='ariba_metadata.tsv' """ generate_overall_report.py `pwd` $ariba_metadata $overall_report """ From fd905cfa221f04810dbbd64b64147854bc0ffef3 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 25 Jul 2023 20:06:43 +0000 Subject: [PATCH 19/41] Improve shell scripts style --- bin/assembly_qc.sh | 12 ++++----- bin/combine_info.sh | 6 ++--- bin/create_ariba_db.sh | 48 ++++++++++++++++----------------- bin/create_ref_genome_bwa_db.sh | 26 +++++++++--------- bin/create_seroba_db.sh | 6 ++--- 5 files changed, 49 insertions(+), 49 deletions(-) diff --git a/bin/assembly_qc.sh b/bin/assembly_qc.sh index 9d399fc..f23ff7f 100755 --- a/bin/assembly_qc.sh +++ b/bin/assembly_qc.sh @@ -1,14 +1,14 @@ # Extract assembly QC information and determine QC result based on report.tsv from Quast, total base count -CONTIGS=$(awk -F'\t' '$1 == "# contigs (>= 0 bp)" { print $2 }' $REPORT) -LENGTH=$(awk -F'\t' '$1 == "Total length" { print $2 }' $REPORT) -DEPTH=$(printf %.2f $(echo "$BASES / $LENGTH" | bc -l) ) +CONTIGS=$(awk -F'\t' '$1 == "# contigs (>= 0 bp)" { print $2 }' "$REPORT") +LENGTH=$(awk -F'\t' '$1 == "Total length" { print $2 }' "$REPORT") 
+DEPTH=$(echo "scale=2; $BASES / $LENGTH" | bc -l) -if (( $CONTIGS < $QC_CONTIGS )) && (( $LENGTH >= $QC_LENGTH_LOW )) && (( $LENGTH <= $QC_LENGTH_HIGH )) && (( $(echo "$DEPTH >= $QC_DEPTH" | bc -l) )); then +if [[ $CONTIGS -lt $QC_CONTIGS ]] && [[ $LENGTH -ge $QC_LENGTH_LOW ]] && [[ $LENGTH -le $QC_LENGTH_HIGH ]] && [[ "$(echo "$DEPTH >= $QC_DEPTH" | bc -l)" == 1 ]]; then ASSEMBLY_QC="PASS" else ASSEMBLY_QC="FAIL" fi -echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > $ASSEMBLY_QC_REPORT -echo \"$ASSEMBLY_QC\",\"$CONTIGS\",\"$LENGTH\",\"$DEPTH\" >> $ASSEMBLY_QC_REPORT \ No newline at end of file +echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > "$ASSEMBLY_QC_REPORT" +echo \""$ASSEMBLY_QC"\",\""$CONTIGS"\",\""$LENGTH"\",\""$DEPTH"\" >> "$ASSEMBLY_QC_REPORT" diff --git a/bin/combine_info.sh b/bin/combine_info.sh index 7409046..5275baa 100755 --- a/bin/combine_info.sh +++ b/bin/combine_info.sh @@ -1,12 +1,12 @@ # Combine pipeline version, Nextflow version, databases information, container images, tools version JSON files into the a single JSON file -jq -s '.[0] * .[1] * .[2]' $DATABASE $IMAGES $TOOLS > working.json +jq -s '.[0] * .[1] * .[2]' "$DATABASE" "$IMAGES" "$TOOLS" > working.json add_version () { - jq --arg entry $1 --arg version "$2" '.[$entry] += {"version": $version}' working.json > tmp.json && mv tmp.json working.json + jq --arg entry "$1" --arg version "$2" '.[$entry] += {"version": $version}' working.json > tmp.json && mv tmp.json working.json } add_version pipeline "$PIPELINE_VERSION" add_version nextflow "$NEXTFLOW_VERSION" -mv working.json $JSON_FILE +mv working.json "$JSON_FILE" diff --git a/bin/create_ariba_db.sh b/bin/create_ariba_db.sh index 289fff4..fb2b657 100755 --- a/bin/create_ariba_db.sh +++ b/bin/create_ariba_db.sh @@ -1,32 +1,32 @@ # Check if CREATE_ARIBA_DB has run successfully on the specific reference sequences and metadata. # If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to JSON -REF_SEQUENCES_MD5=$(md5sum $REF_SEQUENCES | awk '{ print $1 }') -METADATA_MD5=$(md5sum $METADATA | awk '{ print $1 }') +REF_SEQUENCES_MD5=$(md5sum "$REF_SEQUENCES" | awk '{ print $1 }') +METADATA_MD5=$(md5sum "$METADATA" | awk '{ print $1 }') -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES_MD5" ] || \ - [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA" ] || \ - [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA_MD5" ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/00.info.txt ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/00.version_info.txt ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.log ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.tsv ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_noncoding.log ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.all.fa ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.pickle ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.tsv ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.fa ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.varonly.fa ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa ] || \ - [ ! 
-f ${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa ] ; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$(grep '"reference"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES" ] || \ + [ ! "$(grep '"reference_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES_MD5" ] || \ + [ ! "$(grep '"metadata"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA" ] || \ + [ ! "$(grep '"metadata_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA_MD5" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/00.info.txt" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/00.version_info.txt" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.log" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.tsv" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_noncoding.log" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.all.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.pickle" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.tsv" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.varonly.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa" ] ; then - rm -rf "$DB_LOCAL/$OUTPUT" + rm -rf "${DB_LOCAL:?}/${OUTPUT}" - ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "$DB_LOCAL/$OUTPUT" + ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "${DB_LOCAL}/${OUTPUT}" - echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} + echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" -fi \ No newline at end of file +fi diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/create_ref_genome_bwa_db.sh index 5bd277a..385b609 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -1,23 +1,23 @@ # Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. # If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to JSON -REFERENCE_MD5=$(md5sum $REFERENCE | awk '{ print $1 }') +REFERENCE_MD5=$(md5sum "$REFERENCE" | awk '{ print $1 }') -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE_MD5" ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.amb ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.ann ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.bwt ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.pac ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.sa ] ; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$(grep '"reference"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE" ] || \ + [ ! "$(grep '"reference_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE_MD5" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.amb" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.ann" ] || \ + [ ! 
-f "${DB_LOCAL}/${PREFIX}.bwt" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.pac" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.sa" ] ; then - rm -rf ${DB_LOCAL}/{,.[!.],..?}* + rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* - bwa index -p $PREFIX $REFERENCE + bwa index -p "$PREFIX" "$REFERENCE" - mv ${PREFIX}.amb ${PREFIX}.ann ${PREFIX}.bwt ${PREFIX}.pac ${PREFIX}.sa -t $DB_LOCAL + mv "${PREFIX}.amb" "${PREFIX}.ann" "${PREFIX}.bwt" "${PREFIX}.pac" "${PREFIX}.sa" -t "$DB_LOCAL" - echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} + echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" fi diff --git a/bin/create_seroba_db.sh b/bin/create_seroba_db.sh index 21a058f..3ff36b2 100755 --- a/bin/create_seroba_db.sh +++ b/bin/create_seroba_db.sh @@ -1,9 +1,9 @@ # If create_db is true: re-create KMC and ARIBA databases, also save metadata to JSON -if [ $CREATE_DB = true ]; then +if [ "$CREATE_DB" = true ]; then - seroba createDBs ${DB_LOCAL}/${DATABASE}/ ${KMER} + seroba createDBs "${DB_LOCAL}/${DATABASE}/" "${KMER}" - echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} + echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" fi From 29caffdda1fbb8858cb80aeba3ef93c699a86f0d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 13:40:52 +0000 Subject: [PATCH 20/41] Refactor to improve maintainability & readability --- bin/generate_overall_report.py | 136 +++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 47 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index e48fb53..602e1d0 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -1,53 +1,95 @@ #! 
/usr/bin/env python3 +# Generate overall report based on sample reports and columns specified by COLUMNS_BY_CATEGORY and ARIBA metadata + import sys +from itertools import chain +import pandas as pd import glob -import pandas as pd - -workdir_path = sys.argv[1] -ariba_metadata = sys.argv[2] -output_file = sys.argv[3] - -output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)'] - -ariba_targets = set(pd.read_csv(ariba_metadata, sep='\t')['target'].unique()) - -if 'TET' in ariba_targets: - ariba_targets.add('DOX') - -if 'FQ' in ariba_targets: - ariba_targets.add('LFX') - -if 'TMP' in ariba_targets and 'SMX' in ariba_targets: - ariba_targets.add('COT') - -if 'ERY_CLI' in ariba_targets: - ariba_targets.update(['ERY', 'CLI']) - -ariba_targets = sorted(ariba_targets) - -pilis = [] - -for target in ariba_targets: - if target.lower().startswith('pili'): - pilis.append(target) - else: - output_columns.extend([f'{target}_Res', f'{target}_Determinant']) - -for pili in pilis: - output_columns.extend([f'{pili}', f'{pili}_Determinant']) - -df_manifest = pd.DataFrame(columns=output_columns) - -dfs = [df_manifest] - -reports = glob.glob(workdir_path +'/*.csv') -for report in reports: - df = pd.read_csv(report) - dfs.append(df) - -df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) -df_output = df_output[output_columns] -df_output.to_csv(output_file, index=False, na_rep='_') +# Specify columns need to be included in the output file and their orders (except those based on ARIBA metadata) +COLUMNS_BY_CATEGORY = { + 'IDENTIFICATION': ['Sample_ID'], + 'QC': ['Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC'] , + 'READ': ['Bases'], + 'ASSEMBLY': ['Contigs#' , 'Assembly_Length' , 'Seq_Depth'], + 'MAPPING': ['Ref_Cov_%' , 'Het-SNP#'], + 'TAXONOMY': ['S.Pneumo_%'], + 'LINEAGE': ['GPSC'], + 'SEROTYPE': ['Serotype'], + 'MLST': ['ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl'], + 'PBP': ['pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)'] +} + + +# Check argv and save the global variables +if len(sys.argv) != 4: + sys.exit('Usage: generate_overall_report.py WORKDIR_PATH ARIBA_METADATA OUTPUT_FILE') +WORKDIR_PATH = sys.argv[1] +ARIBA_METADATA = sys.argv[2] +OUTPUT_FILE = sys.argv[3] + + +def main(): + output_columns = get_output_columns() + df_output = get_df_output(output_columns) + + # Saving df_output to output_file in csv format + df_output.to_csv(OUTPUT_FILE, index=False, na_rep='_') + + +# Get output columns based on COLUMNS_BY_CATEGORY and ARIBA metadata +def get_output_columns(): + output_columns = list(chain.from_iterable(COLUMNS_BY_CATEGORY.values())) + add_ariba_columns(output_columns) + return output_columns + + +# Based on ARIBA metadata, add additional output columns +def 
add_ariba_columns(output_columns): + # Get all targets in ARIBA metadata + ariba_targets = set(pd.read_csv(ARIBA_METADATA, sep='\t')['target'].unique()) + + # Adding special cases if certain targets exist + if 'TET' in ariba_targets: + ariba_targets.add('DOX') + if 'FQ' in ariba_targets: + ariba_targets.add('LFX') + if 'TMP' in ariba_targets and 'SMX' in ariba_targets: + ariba_targets.add('COT') + if 'ERY_CLI' in ariba_targets: + ariba_targets.update(['ERY', 'CLI']) + + # Add all targets alphabetically, except always adding PILI at the end + pilis = [] + for target in sorted(ariba_targets): + if target.lower().startswith('pili'): + pilis.append(target) + else: + output_columns.extend([f'{target}_Res', f'{target}_Determinant']) + for pili in pilis: + output_columns.extend([f'{pili}', f'{pili}_Determinant']) + + +# Generating df_output based on all sample reports with columns in the order of output_columns +def get_df_output(output_columns): + # Generate an empty dataframe as df_manifest based on output_columns + df_manifest = pd.DataFrame(columns=output_columns) + + # Generate a dataframe for each sample report and then concat df_manifest and all dataframes into df_output + dfs = [df_manifest] + reports = glob.glob(WORKDIR_PATH +'/*.csv') + for report in reports: + df = pd.read_csv(report) + dfs.append(df) + df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) + + # Ensure column order in df_output is the same as output_columns + df_output = df_output[output_columns] + + return df_output + + +if __name__ == "__main__": + main() From 60614c8ef117a5641b4fbbf92343e2721e6d27db Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 14:57:36 +0000 Subject: [PATCH 21/41] Improve shell scripts style --- bin/generate_sample_report.sh | 2 +- bin/get_databases_info.sh | 28 +++++++++++++------------- bin/get_docker_compose.sh | 8 ++++---- bin/get_images_info.sh | 38 +++++++++++++++++------------------ bin/get_kraken2_db.sh | 19 +++++++++--------- bin/get_lineage.sh | 4 ++-- bin/get_mlst.sh | 6 +++--- 7 files changed, 52 insertions(+), 53 deletions(-) diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh index ec769f3..cb7ab52 100755 --- a/bin/generate_sample_report.sh +++ b/bin/generate_sample_report.sh @@ -1,3 +1,3 @@ paste -d , *.csv \ | sed '1 s/^/\"Sample_ID\",/' \ -| sed "2 s/^/\"${SAMPLE_ID}\",/" > $SAMPLE_REPORT \ No newline at end of file +| sed "2 s/^/\"${SAMPLE_ID}\",/" > "$SAMPLE_REPORT" diff --git a/bin/get_databases_info.sh b/bin/get_databases_info.sh index c87d56f..3d9dd98 100755 --- a/bin/get_databases_info.sh +++ b/bin/get_databases_info.sh @@ -3,9 +3,9 @@ add_bwa_db () { BWA_DB_JSON=${BWA_DB_PATH}/${BWA_JSON} if [ -f "$BWA_DB_JSON" ]; then - REFERENCE=$(jq -r .reference $BWA_DB_JSON) - REFERENCE_MD5=$(jq -r .reference_md5 $BWA_DB_JSON) - CREATE_TIME=$(jq -r .create_time $BWA_DB_JSON) + REFERENCE=$(jq -r .reference "$BWA_DB_JSON") + REFERENCE_MD5=$(jq -r .reference_md5 "$BWA_DB_JSON") + CREATE_TIME=$(jq -r .create_time "$BWA_DB_JSON") else REFERENCE="Not yet created" REFERENCE_MD5="Not yet created" @@ -17,11 +17,11 @@ add_bwa_db () { add_ariba_db () { ARIBA_DB_JSON=${ARIBA_DB_PATH}/${ARIBA_JSON} if [ -f "$ARIBA_DB_JSON" ]; then - REFERENCE=$(jq -r .reference $ARIBA_DB_JSON) - REFERENCE_MD5=$(jq -r .reference_md5 $ARIBA_DB_JSON) - METADATA=$(jq -r .metadata $ARIBA_DB_JSON) - METADATA_MD5=$(jq -r .metadata_md5 $ARIBA_DB_JSON) - CREATE_TIME=$(jq -r .create_time $ARIBA_DB_JSON) + 
REFERENCE=$(jq -r .reference "$ARIBA_DB_JSON") + REFERENCE_MD5=$(jq -r .reference_md5 "$ARIBA_DB_JSON") + METADATA=$(jq -r .metadata "$ARIBA_DB_JSON") + METADATA_MD5=$(jq -r .metadata_md5 "$ARIBA_DB_JSON") + CREATE_TIME=$(jq -r .create_time "$ARIBA_DB_JSON") else REFERENCE="Not yet created" REFERENCE_MD5="Not yet created" @@ -35,9 +35,9 @@ add_ariba_db () { add_seroba_db () { SEROBA_DB_JSON=${SEROBA_DB_PATH}/${SEROBA_JSON} if [ -f "$SEROBA_DB_JSON" ]; then - GIT=$(jq -r .git $SEROBA_DB_JSON) - KMER=$(jq -r .kmer $SEROBA_DB_JSON) - CREATE_TIME=$(jq -r .create_time $SEROBA_DB_JSON) + GIT=$(jq -r .git "$SEROBA_DB_JSON") + KMER=$(jq -r .kmer "$SEROBA_DB_JSON") + CREATE_TIME=$(jq -r .create_time "$SEROBA_DB_JSON") else GIT="Not yet created" KMER="Not yet created" @@ -49,8 +49,8 @@ add_seroba_db () { add_url_db () { DB_JSON=$1 if [ -f "$DB_JSON" ]; then - URL=$(jq -r .url $DB_JSON) - SAVE_TIME=$(jq -r .save_time $DB_JSON) + URL=$(jq -r .url "$DB_JSON") + SAVE_TIME=$(jq -r .save_time "$DB_JSON") else URL="Not yet downloaded" SAVE_TIME="Not yet downloaded" @@ -65,4 +65,4 @@ jq -n \ --argjson kraken2_db "$(add_url_db "${KRAKEN2_DB_PATH}/${KRAKEN2_JSON}")" \ --argjson poppunnk_db "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_JSON}")" \ --argjson poppunk_ext "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_EXT_JSON}")" \ - '$ARGS.named' > $JSON_FILE + '$ARGS.named' > "$JSON_FILE" diff --git a/bin/get_docker_compose.sh b/bin/get_docker_compose.sh index e581e54..5f8ff8b 100755 --- a/bin/get_docker_compose.sh +++ b/bin/get_docker_compose.sh @@ -2,13 +2,13 @@ COUNT=0 -echo "services:" >> $COMPOSE +echo "services:" >> "$COMPOSE" -grep -E "container\s?=" $NEXTFLOW_CONFIG \ +grep -E "container\s?=" "$NEXTFLOW_CONFIG" \ | sort -u \ | sed -r "s/\s+container\s?=\s?'(.+)'/\1/" \ | while read -r IMAGE ; do COUNT=$((COUNT+1)) - echo " SERVICE${COUNT}:" >> $COMPOSE - echo " image: $IMAGE" >> $COMPOSE + echo " SERVICE${COUNT}:" >> "$COMPOSE" + echo " image: $IMAGE" >> "$COMPOSE" done diff --git a/bin/get_images_info.sh b/bin/get_images_info.sh index 95dd83f..51b20aa 100755 --- a/bin/get_images_info.sh +++ b/bin/get_images_info.sh @@ -1,7 +1,7 @@ # Extract containers information from nextflow.config and save into a JSON file find_image () { - grep -E "container\s?=" -B 1 $NEXTFLOW_CONFIG | grep -v -- "^--$" | paste - - | sort -u | grep $1 | sed -r "s/.+container\s?=\s?'(.+)'/\1/" + grep -E "container\s?=" -B 1 "$NEXTFLOW_CONFIG" | grep -v -- "^--$" | paste - - | sort -u | grep "$1" | sed -r "s/.+container\s?=\s?'(.+)'/\1/" } BASH=$(find_image bash) @@ -22,24 +22,24 @@ KRAKEN2=$(find_image kraken2) SEROBA=$(find_image seroba) add_container () { - jq -n --arg container $1 '.container = $container' + jq -n --arg container "$1" '.container = $container' } jq -n \ - --argjson bash "$(add_container $BASH)" \ - --argjson git "$(add_container $GIT)" \ - --argjson python "$(add_container $PYTHON)" \ - --argjson fastp "$(add_container $FASTP)" \ - --argjson unicycler "$(add_container $UNICYCLER)" \ - --argjson shovill "$(add_container $SHOVILL)" \ - --argjson quast "$(add_container $QUAST)" \ - --argjson bwa "$(add_container $BWA)" \ - --argjson samtools "$(add_container $SAMTOOLS)" \ - --argjson bcftools "$(add_container $BCFTOOLS)" \ - --argjson poppunk "$(add_container $POPPUNK)" \ - --argjson spn_pbp_amr "$(add_container $SPN_PBP_AMR)" \ - --argjson ariba "$(add_container $ARIBA)" \ - --argjson mlst "$(add_container $MLST)" \ - --argjson kraken2 "$(add_container $KRAKEN2)" \ - --argjson seroba "$(add_container $SEROBA)" \ - 
'$ARGS.named' > $JSON_FILE + --argjson bash "$(add_container "$BASH")" \ + --argjson git "$(add_container "$GIT")" \ + --argjson python "$(add_container "$PYTHON")" \ + --argjson fastp "$(add_container "$FASTP")" \ + --argjson unicycler "$(add_container "$UNICYCLER")" \ + --argjson shovill "$(add_container "$SHOVILL")" \ + --argjson quast "$(add_container "$QUAST")" \ + --argjson bwa "$(add_container "$BWA")" \ + --argjson samtools "$(add_container "$SAMTOOLS")" \ + --argjson bcftools "$(add_container "$BCFTOOLS")" \ + --argjson poppunk "$(add_container "$POPPUNK")" \ + --argjson spn_pbp_amr "$(add_container "$SPN_PBP_AMR")" \ + --argjson ariba "$(add_container "$ARIBA")" \ + --argjson mlst "$(add_container "$MLST")" \ + --argjson kraken2 "$(add_container "$KRAKEN2")" \ + --argjson seroba "$(add_container "$SEROBA")" \ + '$ARGS.named' > "$JSON_FILE" diff --git a/bin/get_kraken2_db.sh b/bin/get_kraken2_db.sh index c53cc52..8632bc8 100755 --- a/bin/get_kraken2_db.sh +++ b/bin/get_kraken2_db.sh @@ -1,29 +1,28 @@ # Check if all file exists and were obtained from the database at the specific link. # If not: remove files in database directory, download, and unzip to database directory, also save metadata to JSON -DB_NAME=$(basename $DB_REMOTE) ZIPPED_DB='kraken2_db.tar.gz' -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/${JSON_FILE})" ] || \ - [ ! -f ${DB_LOCAL}/hash.k2d ] || \ - [ ! -f ${DB_LOCAL}/opts.k2d ] || \ - [ ! -f ${DB_LOCAL}/taxo.k2d ]; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$DB_REMOTE" == "$(jq -r .url "${DB_LOCAL}/${JSON_FILE}")" ] || \ + [ ! -f "${DB_LOCAL}/hash.k2d" ] || \ + [ ! -f "${DB_LOCAL}/opts.k2d" ] || \ + [ ! -f "${DB_LOCAL}/taxo.k2d" ]; then - rm -rf ${DB_LOCAL}/{,.[!.],..?}* + rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* - wget ${DB_REMOTE} -O $ZIPPED_DB + wget "${DB_REMOTE}" -O $ZIPPED_DB # Use tmp dir and find to ensure files are saved directly at $DB_LOCAL regardless of archive directory structure mkdir tmp tar -xzf $ZIPPED_DB -C tmp - find tmp -type f -exec mv {} $DB_LOCAL \; + find tmp -type f -exec mv {} "$DB_LOCAL" \; rm -f $ZIPPED_DB jq -n \ --arg url "${DB_REMOTE}" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/${JSON_FILE} + '{"url" : $url, "save_time": $save_time}' > "${DB_LOCAL}/${JSON_FILE}" fi diff --git a/bin/get_lineage.sh b/bin/get_lineage.sh index 63b6ec0..cd57737 100755 --- a/bin/get_lineage.sh +++ b/bin/get_lineage.sh @@ -6,8 +6,8 @@ # Save results of individual sample into .csv with its name as filename sed 's/^/prefix_/' "$QFILE" > safe_qfile.txt -poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads $(nproc) +poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads "$(nproc)" sed 's/^prefix_//' output/output_external_clusters.csv > result.txt -awk -F , 'NR!=1 { print "\"GPSC\"\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file +awk -F , 'NR!=1 { print "\"GPSC\"\n" "\"" $2 "\"" > $1 ".csv" }' result.txt diff --git a/bin/get_mlst.sh b/bin/get_mlst.sh index ab7c8e9..72e0400 100755 --- a/bin/get_mlst.sh +++ b/bin/get_mlst.sh @@ -2,7 +2,7 @@ OUTPUT='output.tsv' -mlst --legacy --scheme spneumoniae "$ASSEMBLY" > $OUTPUT +mlst --legacy --scheme spneumoniae "$ASSEMBLY" > "$OUTPUT" ST=$(awk -F'\t' 'FNR == 2 {print $3}' 
$OUTPUT) aroE=$(awk -F'\t' 'FNR == 2 {print $4}' $OUTPUT) @@ -13,5 +13,5 @@ spi=$(awk -F'\t' 'FNR == 2 {print $8}' $OUTPUT) xpt=$(awk -F'\t' 'FNR == 2 {print $9}' $OUTPUT) ddl=$(awk -F'\t' 'FNR == 2 {print $10}' $OUTPUT) -echo \"ST\",\"aroE\",\"gdh\",\"gki\",\"recP\",\"spi\",\"xpt\",\"ddl\" > $MLST_REPORT -echo \"$ST\",\"$aroE\",\"$gdh\",\"$gki\",\"$recP\",\"$spi\",\"$xpt\",\"$ddl\" >> $MLST_REPORT \ No newline at end of file +echo \"ST\",\"aroE\",\"gdh\",\"gki\",\"recP\",\"spi\",\"xpt\",\"ddl\" > "$MLST_REPORT" +echo \""$ST"\",\""$aroE"\",\""$gdh"\",\""$gki"\",\""$recP"\",\""$spi"\",\""$xpt"\",\""$ddl"\" >> "$MLST_REPORT" From 00e3de37a77c7ee13aa179509a78d69e635870b0 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 17:04:47 +0000 Subject: [PATCH 22/41] Fix comment --- bin/generate_overall_report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index 602e1d0..f238979 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -23,7 +23,7 @@ } -# Check argv and save the global variables +# Check argv and save to global variables if len(sys.argv) != 4: sys.exit('Usage: generate_overall_report.py WORKDIR_PATH ARIBA_METADATA OUTPUT_FILE') WORKDIR_PATH = sys.argv[1] @@ -35,7 +35,7 @@ def main(): output_columns = get_output_columns() df_output = get_df_output(output_columns) - # Saving df_output to output_file in csv format + # Saving df_output to OUTPUT_FILE in csv format df_output.to_csv(OUTPUT_FILE, index=False, na_rep='_') From b813ad0a8aba3f2e252b5d00d659e96e64d024cc Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 17:10:04 +0000 Subject: [PATCH 23/41] Refactor to improve maintainability & readability --- bin/get_other_resistance.py | 169 +++++++++++++++++++++--------------- 1 file changed, 100 insertions(+), 69 deletions(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index d902f59..8743ff7 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -8,72 +8,97 @@ import pandas as pd import csv -report_path = sys.argv[1] -debug_report_path = sys.argv[2] -metadata_path = sys.argv[3] -output_file = sys.argv[4] -with open(report_path) as report, open(debug_report_path) as debug_report, open(metadata_path) as metadata: - # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata - gene_dict = defaultdict(dict) - - # For saving targets found in metadata as key and their determinants (add to a set) as value - target_dict = {} - - # Skip the header in metadata - next(metadata) - # Go through lines in metadata and save findings to gene_dict and target_dict - for line in (line.strip() for line in metadata): - # Extract useful fields - fields = [str(field) for field in line.split("\t")] - ref_name, gene, var_only, var_change, _, target = fields - - # Populating gene_dict - gene_dict[(ref_name, gene, var_only)].update({var_change: target}) - # Populating target_dict - target_dict.update({target: set()}) - - # Skip the header in report and debug report - next(report) - next(debug_report) - # Go through lines in both report and debug report to detect targets - for line in (line.strip() for line in chain(report, debug_report)): - # Extract useful fields - fields = [str(field) for field in line.split("\t")] - ref_name, gene, var_only, ref_len, 
ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21] - - # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line - if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: - continue - - # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line - gene_dict_key = (ref_name, gene, var_only) - try: - target = gene_dict[gene_dict_key][known_var_change] - except KeyError: - continue - - # Logic for gene detection. Found means hit. - if var_only == "0": - target_dict[target].add(f'{ref_name}') - - # Logic for variant detection, further criteria required - if var_only == "1": - # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201 - if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): - pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' - target_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') - # Common criteria: the assembly has that variant - elif has_known_var == "1": - target_dict[target].add(f'{ref_name} Variant {known_var_change}') +# Check argv and save to global variables +if len(sys.argv) != 5: + sys.exit('Usage: get_other_resistance.py REPORT_PATH DEBUG_REPORT_PATH METADATA_PATH OUTPUT_FILE') + +REPORT_PATH = sys.argv[1] +DEBUG_REPORT_PATH = sys.argv[2] +METADATA_PATH = sys.argv[3] +OUTPUT_FILE = sys.argv[4] + + +def main(): + targets_dict, hits_dict = prepare_dicts() + find_hits(targets_dict, hits_dict) + output = get_output(hits_dict) + # Save output to OUTPUT_FILE in csv format + pd.DataFrame([output]).to_csv(OUTPUT_FILE, index=False, quoting=csv.QUOTE_ALL) + + +def prepare_dicts(): + # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata + # Used to search whether there is a hit in the ARIBA result + targets_dict = defaultdict(dict) + + # For saving targets found in metadata as key and their determinants (i.e. 
hits) found in ARIBA result as values in set + hits_dict = {} + + with open(METADATA_PATH) as metadata: + # Skip the header in metadata + next(metadata) + + # Go through lines in metadata and save findings to targets_dict and hits_dict + for line in (line.strip() for line in metadata): + # Extract useful fields + fields = [str(field) for field in line.split("\t")] + ref_name, gene, var_only, var_change, _, target = fields + + # Populating targets_dict + targets_dict[(ref_name, gene, var_only)].update({var_change: target}) + # Populating hits_dict + hits_dict.update({target: set()}) + + return targets_dict, hits_dict + + +def find_hits(targets_dict, hits_dict): + with open(REPORT_PATH) as report, open(DEBUG_REPORT_PATH) as debug_report: + # Skip the header in report and debug report + next(report) + next(debug_report) + + # Go through lines in both report and debug report to detect targets + for line in (line.strip() for line in chain(report, debug_report)): + # Extract useful fields + fields = [str(field) for field in line.split("\t")] + ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21] + + # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line + if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: + continue + + # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line + try: + target = targets_dict[(ref_name, gene, var_only)][known_var_change] + except KeyError: + continue + + # Logic for gene detection. Found means hit. 
+ if var_only == "0": + hits_dict[target].add(f'{ref_name}') + + # Logic for variant detection, further criteria required + if var_only == "1": + # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201 + if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): + pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' + hits_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') + # Common criteria: the assembly has that variant + elif has_known_var == "1": + hits_dict[target].add(f'{ref_name} Variant {known_var_change}') + + +def get_output(hits_dict): # For saving final output, where information is saved per-target output = {} - # Go through targets in metadata - for target in target_dict: + # Go through targets in hits_dict + for target in hits_dict: # If the target has no hit, set output as S or NEG (only for PILI-1/2), and determinant as _ - if len(target_dict[target]) == 0: + if len(hits_dict[target]) == 0: if target.lower().startswith('pili'): output[target] = 'NEG' else: @@ -87,10 +112,15 @@ else: output[f'{target}_Res'] = 'R' - output[f'{target}_Determinant'] = '; '.join(target_dict[target]) + output[f'{target}_Determinant'] = '; '.join(sorted(hits_dict[target])) - # Special cases to add to output + add_output_special_cases(output, hits_dict) + return output + + +# Special cases to add to output +def add_output_special_cases(output, hits_dict): # If TET exists and DOX does not: add DOX to output; directly copy output and determinant if 'TET_Res' in output and 'DOX_Res' not in output: output['DOX_Res'] = output['TET_Res'] @@ -107,15 +137,15 @@ if 'TMP_Res' in output and 'SMX_Res' in output and 'COT_Res' not in output: if output['TMP_Res'] == 'R' and output['SMX_Res'] == 'R': output['COT_Res'] = 'R' - output['COT_Determinant'] = '; '.join(target_dict['TMP'].union(target_dict['SMX'])) + output['COT_Determinant'] = '; '.join(sorted(hits_dict['TMP'].union(hits_dict['SMX']))) elif (output['TMP_Res'] == 'R') ^ (output['SMX_Res'] == 'R'): output['COT_Res'] = 'I' - output['COT_Determinant'] = '; '.join(target_dict['TMP'].union(target_dict['SMX'])) + output['COT_Determinant'] = '; '.join(sorted(hits_dict['TMP'].union(hits_dict['SMX']))) elif output['TMP_Res'] == 'S' and output['SMX_Res'] == 'S': output['COT_Res'] = 'S' output['COT_Determinant'] = '_' - # If ERY_CLI exists, add ERY and CLI to output. + # If ERY_CLI exists: add ERY and CLI to output. 
# If ERY_CLI is R, ERY and CLI are R, and add ERY_CLI determinant to their determinants # If ERY_CLI is S, ERY and CLI are S if they do not already exist, otherwise leave them unchanged if 'ERY_CLI_Res' in output: @@ -126,8 +156,9 @@ output['ERY_Res'] = output['ERY_Res'] if 'ERY_Res' in output else 'S' output['CLI_Res'] = output['CLI_Res'] if 'CLI_Res' in output else 'S' - output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict and len(target_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] - output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict and len(target_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] + output['ERY_Determinant'] = '; '.join(sorted(hits_dict['ERY_CLI'].union(hits_dict['ERY']))) if 'ERY' in hits_dict and len(hits_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] + output['CLI_Determinant'] = '; '.join(sorted(hits_dict['ERY_CLI'].union(hits_dict['CLI']))) if 'CLI' in hits_dict and len(hits_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] + - # Save output dict as csv - pd.DataFrame([output]).to_csv(output_file, index=False, quoting=csv.QUOTE_ALL) \ No newline at end of file +if __name__ == "__main__": + main() From 6002529434876cfc7e91d35f6782fc043e8c327c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 09:31:21 +0000 Subject: [PATCH 24/41] Improve code comments --- bin/get_other_resistance.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index 8743ff7..a77eab6 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -28,12 +28,13 @@ def main(): pd.DataFrame([output]).to_csv(OUTPUT_FILE, index=False, quoting=csv.QUOTE_ALL) +# Prepare targets_dict for searching hits and hits_dict for saving hits def prepare_dicts(): - # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata + # For saving (reference, gene, var_only) combinations as keys and their information found in metadata as values in dict format (i.e. {var_change: target}) # Used to search whether there is a hit in the ARIBA result targets_dict = defaultdict(dict) - # For saving targets found in metadata as key and their determinants (i.e. hits) found in ARIBA result as values in set + # For saving targets found in metadata as key and their determinants (i.e. 
hits) found in ARIBA result as values in set format hits_dict = {} with open(METADATA_PATH) as metadata: @@ -54,6 +55,7 @@ def prepare_dicts(): return targets_dict, hits_dict +# Finding hits in ARIBA results based on targets_dict and save hits to hits_dict def find_hits(targets_dict, hits_dict): with open(REPORT_PATH) as report, open(DEBUG_REPORT_PATH) as debug_report: # Skip the header in report and debug report @@ -91,6 +93,7 @@ def find_hits(targets_dict, hits_dict): hits_dict[target].add(f'{ref_name} Variant {known_var_change}') +# Generating final output dataframe based on hits_dict def get_output(hits_dict): # For saving final output, where information is saved per-target output = {} From 63ad10beee0e35b3578e30c985a4e9fe732a4637 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 11:14:12 +0000 Subject: [PATCH 25/41] Improve shell scripts style --- bin/generate_sample_report.sh | 2 ++ bin/get_pbp_resistance.sh | 12 ++++++------ bin/get_poppunk_db.sh | 30 +++++++++++++++--------------- bin/get_poppunk_ext_clusters.sh | 15 +++++++-------- bin/get_seroba_db.sh | 12 ++++++------ bin/get_serotype.sh | 7 ++++--- bin/get_tools_info.sh | 4 ++-- bin/mapping_qc.sh | 8 ++++---- bin/overall_qc.sh | 4 ++-- bin/read_qc.sh | 8 ++++---- bin/taxonomy_qc.sh | 8 ++++---- 11 files changed, 56 insertions(+), 54 deletions(-) diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh index cb7ab52..2a82172 100755 --- a/bin/generate_sample_report.sh +++ b/bin/generate_sample_report.sh @@ -1,3 +1,5 @@ +# Combine all csv reports into a single csv, then add Sample_ID as the first field + paste -d , *.csv \ | sed '1 s/^/\"Sample_ID\",/' \ | sed "2 s/^/\"${SAMPLE_ID}\",/" > "$SAMPLE_REPORT" diff --git a/bin/get_pbp_resistance.sh b/bin/get_pbp_resistance.sh index 5e833c3..0c9c943 100755 --- a/bin/get_pbp_resistance.sh +++ b/bin/get_pbp_resistance.sh @@ -3,13 +3,13 @@ # For all, replace null or space-only string with empty string function GET_VALUE { - echo $( < $JSON_FILE jq -r --arg target "$1" '.[$target]' \ - | sed 's/^null$//g;s/^\s+$//g' ) + < "$JSON_FILE" jq -r --arg target "$1" '.[$target]' \ + | sed 's/^null$//g;s/^\s+$//g' } function GET_RES { - echo $( < $JSON_FILE jq -r --arg target "$1" '.[$target]' \ - | sed 's/^null$//g;s/^\s+$//g' ) + < "$JSON_FILE" jq -r --arg target "$1" '.[$target]' \ + | sed 's/^null$//g;s/^\s+$//g' } pbp1a=$(GET_VALUE "pbp1a") @@ -31,5 +31,5 @@ PEN_MIC=$(GET_VALUE "penMic") PEN_NONMENINGITIS=$(GET_RES "penNonMeningitis") PEN_MENINGITIS=$(GET_RES "penMeningitis") -echo \"pbp1a\",\"pbp2b\",\"pbp2x\",\"AMO_MIC\",\"AMO_Res\",\"CFT_MIC\",\"CFT_Res\(Meningital\)\",\"CFT_Res\(Non-meningital\)\",\"TAX_MIC\",\"TAX_Res\(Meningital\)\",\"TAX_Res\(Non-meningital\)\",\"CFX_MIC\",\"CFX_Res\",\"MER_MIC\",\"MER_Res\",\"PEN_MIC\",\"PEN_Res\(Meningital\)\",\"PEN_Res\(Non-meningital\)\" > $PBP_AMR_REPORT -echo \"$pbp1a\",\"$pbp2b\",\"$pbp2x\",\"$AMO_MIC\",\"$AMO\",\"$CFT_MIC\",\"$CFT_MENINGITIS\",\"$CFT_NONMENINGITIS\",\"$TAX_MIC\",\"$TAX_MENINGITIS\",\"$TAX_NONMENINGITIS\",\"$CFX_MIC\",\"$CFX\",\"$MER_MIC\",\"$MER\",\"$PEN_MIC\",\"$PEN_MENINGITIS\",\"$PEN_NONMENINGITIS\" >> $PBP_AMR_REPORT \ No newline at end of file +echo 
\"pbp1a\",\"pbp2b\",\"pbp2x\",\"AMO_MIC\",\"AMO_Res\",\"CFT_MIC\",\"CFT_Res\(Meningital\)\",\"CFT_Res\(Non-meningital\)\",\"TAX_MIC\",\"TAX_Res\(Meningital\)\",\"TAX_Res\(Non-meningital\)\",\"CFX_MIC\",\"CFX_Res\",\"MER_MIC\",\"MER_Res\",\"PEN_MIC\",\"PEN_Res\(Meningital\)\",\"PEN_Res\(Non-meningital\)\" > "$PBP_AMR_REPORT" +echo \""$pbp1a"\",\""$pbp2b"\",\""$pbp2x"\",\""$AMO_MIC"\",\""$AMO"\",\""$CFT_MIC"\",\""$CFT_MENINGITIS"\",\""$CFT_NONMENINGITIS"\",\""$TAX_MIC"\",\""$TAX_MENINGITIS"\",\""$TAX_NONMENINGITIS"\",\""$CFX_MIC"\",\""$CFX"\",\""$MER_MIC"\",\""$MER"\",\""$PEN_MIC"\",\""$PEN_MENINGITIS"\",\""$PEN_NONMENINGITIS"\" >> "$PBP_AMR_REPORT" diff --git a/bin/get_poppunk_db.sh b/bin/get_poppunk_db.sh index d4e705a..48a0198 100755 --- a/bin/get_poppunk_db.sh +++ b/bin/get_poppunk_db.sh @@ -6,27 +6,27 @@ DB_NAME=$(basename "$DB_REMOTE" .tar.gz) DB_PATH=${DB_LOCAL}/${DB_NAME} -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/${JSON_FILE})" ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.h5 ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.dists.npy ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.dists.pkl ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_fit.npz ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_fit.pkl ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_graph.gt ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_clusters.csv ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.refs ]; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$DB_REMOTE" == "$(jq -r .url "${DB_LOCAL}/${JSON_FILE}")" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.h5" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.dists.npy" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.dists.pkl" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_fit.npz" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_fit.pkl" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_graph.gt" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_clusters.csv" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.refs" ]; then - rm -rf ${DB_LOCAL}/${JSON_FILE} - rm -rf ${DB_LOCAL}/*/ + rm -rf "${DB_LOCAL:?}/${JSON_FILE}" + rm -rf "${DB_LOCAL:?}"/*/ - wget $DB_REMOTE -O poppunk_db.tar.gz - tar -xzf poppunk_db.tar.gz -C $DB_LOCAL + wget "$DB_REMOTE" -O poppunk_db.tar.gz + tar -xzf poppunk_db.tar.gz -C "$DB_LOCAL" rm poppunk_db.tar.gz jq -n \ --arg url "$DB_REMOTE" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/${JSON_FILE} + '{"url" : $url, "save_time": $save_time}' > "${DB_LOCAL}/${JSON_FILE}" fi diff --git a/bin/get_poppunk_ext_clusters.sh b/bin/get_poppunk_ext_clusters.sh index e330968..273ccbb 100755 --- a/bin/get_poppunk_ext_clusters.sh +++ b/bin/get_poppunk_ext_clusters.sh @@ -4,20 +4,19 @@ # If not: remove all csv files, and download to database directory, also save metadata to JSON EXT_CLUSTERS_CSV=$(basename "$EXT_CLUSTERS_REMOTE") -EXT_CLUSTERS_NAME=$(basename "$EXT_CLUSTERS_REMOTE" .csv) -if [ ! -f ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} ] || \ - [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url ${EXT_CLUSTERS_LOCAL}/${JSON_FILE})" ] || \ - [ ! -f ${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV} ]; then +if [ ! -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}")" ] || \ + [ ! 
-f "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" ]; then - rm -f ${EXT_CLUSTERS_LOCAL}/*.csv - rm -f ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} + rm -f "${EXT_CLUSTERS_LOCAL}"/*.csv + rm -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" - wget $EXT_CLUSTERS_REMOTE -O ${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV} + wget "$EXT_CLUSTERS_REMOTE" -O "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" jq -n \ --arg url "$EXT_CLUSTERS_REMOTE" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} + '{"url" : $url, "save_time": $save_time}' > "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" fi diff --git a/bin/get_seroba_db.sh b/bin/get_seroba_db.sh index a3e1d3c..0cda2fc 100755 --- a/bin/get_seroba_db.sh +++ b/bin/get_seroba_db.sh @@ -5,13 +5,13 @@ # Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep 'git' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "${DB_REMOTE}" ] || \ - [ ! "$(grep 'kmer' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "${KMER}" ] || \ - !((git -C ${DB_LOCAL} pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date'); then +if [ ! -f "${DB_LOCAL}"/"${JSON_FILE}" ] || \ + [ ! "$(grep 'git' "${DB_LOCAL}"/"${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "${DB_REMOTE}" ] || \ + [ ! "$(grep 'kmer' "${DB_LOCAL}"/"${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "${KMER}" ] || \ + ! ( (git -C "${DB_LOCAL}" pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date' ); then - rm -rf ${DB_LOCAL}/{,.[!.],..?}* - git clone ${DB_REMOTE} ${DB_LOCAL} + rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* + git clone "${DB_REMOTE}" "${DB_LOCAL}" CREATE_DB=true diff --git a/bin/get_serotype.sh b/bin/get_serotype.sh index 80bfcc7..560cdd7 100755 --- a/bin/get_serotype.sh +++ b/bin/get_serotype.sh @@ -1,9 +1,10 @@ # Run SeroBA to serotype samples + { - seroba runSerotyping "$SEROBA_DIR"/"$DATABASE" "$READ1" "$READ2" "$SAMPLE_ID" && SEROTYPE=$(awk -F'\t' '{ print $2 }' ${SAMPLE_ID}/pred.tsv) + seroba runSerotyping "${SEROBA_DIR}/${DATABASE}" "$READ1" "$READ2" "$SAMPLE_ID" && SEROTYPE=$(awk -F'\t' '{ print $2 }' "${SAMPLE_ID}/pred.tsv") } || { SEROTYPE="SEROBA FAILURE" } -echo \"Serotype\" > $SEROTYPE_REPORT -echo \"$SEROTYPE\" >> $SEROTYPE_REPORT \ No newline at end of file +echo \"Serotype\" > "$SEROTYPE_REPORT" +echo \""$SEROTYPE"\" >> "$SEROTYPE_REPORT" diff --git a/bin/get_tools_info.sh b/bin/get_tools_info.sh index 23d9520..9d20e16 100755 --- a/bin/get_tools_info.sh +++ b/bin/get_tools_info.sh @@ -1,7 +1,7 @@ # Save received tools versions into a JSON file add_version () { - jq -n --arg version $1 '.version = $version' + jq -n --arg version "$1" '.version = $version' } jq -n \ @@ -19,4 +19,4 @@ jq -n \ --argjson kraken2 "$(add_version "$KRAKEN2_VERSION")" \ --argjson seroba "$(add_version "$SEROBA_VERSION")" \ --argjson ariba "$(add_version "$ARIBA_VERSION")" \ - '$ARGS.named' > $JSON_FILE + '$ARGS.named' > "$JSON_FILE" diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh index 75b18a0..f6eb48e 100755 --- a/bin/mapping_qc.sh +++ b/bin/mapping_qc.sh @@ -1,12 +1,12 @@ # Extract mapping QC information and determine QC result based on reference coverage and count of Het-SNP sites -COVERAGE=$(printf %.2f $COVERAGE) +COVERAGE=$(printf %.2f "$COVERAGE") -if (( $(echo "$COVERAGE > $QC_REF_COVERAGE" | bc -l) )) && (( $HET_SNP < $QC_HET_SNP_SITE )); then +if [[ "$(echo "$COVERAGE > $QC_REF_COVERAGE" | bc -l)" == 1 ]] && [[ 
$HET_SNP -lt $QC_HET_SNP_SITE ]]; then MAPPING_QC="PASS" else MAPPING_QC="FAIL" fi -echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > $MAPPING_QC_REPORT -echo \"$MAPPING_QC\",\"$COVERAGE\",\"$QC_HET_SNP_SITE\" >> $MAPPING_QC_REPORT \ No newline at end of file +echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > "$MAPPING_QC_REPORT" +echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$QC_HET_SNP_SITE"\" >> "$MAPPING_QC_REPORT" diff --git a/bin/overall_qc.sh b/bin/overall_qc.sh index de7e116..d83e52a 100755 --- a/bin/overall_qc.sh +++ b/bin/overall_qc.sh @@ -11,5 +11,5 @@ else OVERALL_QC="FAIL" fi -echo \"Overall_QC\" > $OVERALL_QC_REPORT -echo \"$OVERALL_QC\" >> $OVERALL_QC_REPORT \ No newline at end of file +echo \"Overall_QC\" > "$OVERALL_QC_REPORT" +echo \""$OVERALL_QC"\" >> "$OVERALL_QC_REPORT" diff --git a/bin/read_qc.sh b/bin/read_qc.sh index 6ce8382..14d7519 100755 --- a/bin/read_qc.sh +++ b/bin/read_qc.sh @@ -1,12 +1,12 @@ # Extract total base count and determine QC result based on output JSON file of fastp -BASES=$(< $JSON jq -r .summary.after_filtering.total_bases) +BASES=$(< "$JSON" jq -r .summary.after_filtering.total_bases) -if (( $(echo "$BASES >= ($QC_LENGTH_LOW*$QC_DEPTH)" | bc -l) )); then +if [[ "$(echo "$BASES >= ($QC_LENGTH_LOW*$QC_DEPTH)" | bc -l)" == 1 ]]; then READ_QC="PASS" else READ_QC="FAIL" fi -echo \"Read_QC\",\"Bases\" > $READ_QC_REPORT -echo \"$READ_QC\",\"$BASES\" >> $READ_QC_REPORT \ No newline at end of file +echo \"Read_QC\",\"Bases\" > "$READ_QC_REPORT" +echo \"$READ_QC\",\""$BASES"\" >> "$READ_QC_REPORT" diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh index 23254b1..7528b1d 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/taxonomy_qc.sh @@ -1,16 +1,16 @@ # Extract taxonomy QC information and determine QC result based on kraken2_report.txt -PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' $KRAKEN2_REPORT) +PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' "$KRAKEN2_REPORT") if [ -z "$PERCENTAGE" ]; then PERCENTAGE="0.00" fi -if (( $(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l) )); then +if [[ "$(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l)" == 1 ]]; then TAXONOMY_QC="PASS" else TAXONOMY_QC="FAIL" fi -echo \"Taxonomy_QC\",\"S.Pneumo_%\" > $TAXONOMY_QC_REPORT -echo \"$TAXONOMY_QC\",\"$PERCENTAGE\" >> $TAXONOMY_QC_REPORT \ No newline at end of file +echo \"Taxonomy_QC\",\"S.Pneumo_%\" > "$TAXONOMY_QC_REPORT" +echo \"$TAXONOMY_QC\",\""$PERCENTAGE"\" >> "$TAXONOMY_QC_REPORT" From 2e6707ad4889d1018856400a405d6b5a35003e93 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 14:49:25 +0000 Subject: [PATCH 26/41] Fix outputing incorrect variable for Het-SNP# --- bin/mapping_qc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh index f6eb48e..a737d09 100755 --- a/bin/mapping_qc.sh +++ b/bin/mapping_qc.sh @@ -9,4 +9,4 @@ else fi echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > "$MAPPING_QC_REPORT" -echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$QC_HET_SNP_SITE"\" >> "$MAPPING_QC_REPORT" +echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$HET_SNP"\" >> "$MAPPING_QC_REPORT" From 8d352582bcbd82a34797e22c2f0433d588e5d6a7 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 15:42:41 +0000 Subject: [PATCH 27/41] Avoid numbers output as float --- bin/generate_overall_report.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index f238979..59a1737 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -81,7 +81,7 @@ def get_df_output(output_columns): dfs = [df_manifest] reports = glob.glob(WORKDIR_PATH +'/*.csv') for report in reports: - df = pd.read_csv(report) + df = pd.read_csv(report, dtype=str) dfs.append(df) df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) From 1327f764db5a22a74da889a101104c99cbbaeb22 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 15:44:33 +0000 Subject: [PATCH 28/41] Refactor to improve maintainability & readability --- bin/het_snp_count.py | 171 ++++++++++++++++++++++++++----------------- modules/mapping.nf | 4 +- 2 files changed, 107 insertions(+), 68 deletions(-) diff --git a/bin/het_snp_count.py b/bin/het_snp_count.py index 74d45ee..cf92dc6 100755 --- a/bin/het_snp_count.py +++ b/bin/het_snp_count.py @@ -5,72 +5,109 @@ import re import sys -# Input VCF path -vcf = sys.argv[1] -# Minimum distance between SNPs to not consider as part of cluster -min_snp_distance = int(sys.argv[2]) - - -with open(vcf) as f: - lines = [line.strip() for line in f] - - # List of positions of non-cluster Het-SNPs - het_noncluster_pos = [] - # Previous Het-SNP position. Initialise with the negative of min_snp_distance for calculation of the sites in starting positions - prev_het_pos = -min_snp_distance - - for line in lines: - # Skip lines of header and INDEL calls - if line.startswith("#") or "INDEL" in line: - continue - - # Get fields from the call - chrom, pos, id, ref, alt, qual, filter, info, format, sample = line.split("\t") - - # Get DP (The number of reads covering or bridging POS) from the INFO field - dp = re.search(r'DP=([0-9]+)', info).group(1) - # Get DP4 (Number of forward ref alleles; reverse ref; forward non-ref; reverse non-ref alleles, used in variant calling) from the INFO field - reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref = re.search(r'DP4=([0-9,]+)', info).group(1).split(",") - # Get MQ (Root-Mean-Square mapping quality of covering reads) from the INFO field - mq = re.search(r'MQ=([0-9]+)', info).group(1) - - # Get PV4 (P-values for strand bias; baseQ bias; mapQ bias; tail distance bias) from the INFO field; set to None if it is not found - try: - pv4 = re.search(r'PV4=([0-9,.]+)', info).group(1) - except AttributeError: - pv4 = None - - # Ensure qual is float - qual = float(qual) - # Ensure pos, dp, mq, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref are int - pos, dp, mq, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref = map(int, [pos, dp, mq, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref]) - - # Basic quality filter, skip this call if fails - if not(qual > 50 and dp > 5 and mq > 30 and reads_for_non_ref > 2 and reads_rev_non_ref > 2): - continue - - # Further quality filter if PV4 exists, skip this call if fails - if pv4 is not None: - pv_strand, pv_baseq, pv_mapq, pv_tail_distance = map(float, pv4.split(",")) - if not (pv_strand > 0.001 and pv_mapq > 0.001 and pv_tail_distance > 0.001): + +# Check argv and save to global variables +if len(sys.argv) != 4: + sys.exit('Usage: het_snp_count.py VCF MIN_SNP_DISTANCE OUTPUT_FILE') +VCF = sys.argv[1] +MIN_SNP_DISTANCE = int(sys.argv[2]) # Minimum distance between SNPs to not consider as part of cluster 
+OUTPUT_FILE=sys.argv[3] + + +def main(): + with open(VCF) as vcf, open(OUTPUT_FILE, 'w') as output_file: + lines = [line.strip() for line in vcf] + + # List of positions of non-cluster Het-SNPs + het_noncluster_pos = [] + # Previous Het-SNP position + prev_het_pos = None + + for line in lines: + # Skip lines of header and INDEL calls + if line.startswith("#") or "INDEL" in line: continue + + pos, qual, info = extract_vcf_fields(line) + + dp, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref, mq, pv4 = extract_info(info) + + if not quality_check(qual, dp, mq, reads_for_non_ref, reads_rev_non_ref, pv4): + continue + + if is_het_snp(het_noncluster_pos, pos, prev_het_pos, reads_for_non_ref, reads_for_ref, reads_rev_non_ref, reads_rev_ref): + # Mark current pos as previous Het-SNP pos for the next Het-SNP + prev_het_pos = pos + + # Save amount of non-cluster Het-SNP sites to OUTPUT_FILE + output_file.write(f'{len(het_noncluster_pos)}') + + +# Extract relevant fields from the call +def extract_vcf_fields(line): + fields = line.split("\t") + pos, qual, info = fields[1], fields[5], fields[7] + + # Ensure pos is int and qual is float + return int(pos), float(qual), info + + +# Extract information from the INFO field +def extract_info(info): + # Get DP (The number of reads covering or bridging POS) + dp = re.search(r'DP=([0-9]+)', info).group(1) + + # Get DP4 (Number of forward ref alleles; reverse ref; forward non-ref; reverse non-ref alleles, used in variant calling) + reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref = re.search(r'DP4=([0-9,]+)', info).group(1).split(",") + + # Get MQ (Root-Mean-Square mapping quality of covering reads) + mq = re.search(r'MQ=([0-9]+)', info).group(1) + + # Get PV4 (P-values for strand bias; baseQ bias; mapQ bias; tail distance bias); set to None if it is not found + try: + pv4 = re.search(r'PV4=([0-9,.]+)', info).group(1) + except AttributeError: + pv4 = None + + # Ensure dp, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref, mq are int + return *map(int, [dp, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref, mq]), pv4 + + +# Quality check for call +def quality_check(qual, dp, mq, reads_for_non_ref, reads_rev_non_ref, pv4): + # Basic quality check, skip this call if fails + if not(qual > 50 and dp > 5 and mq > 30 and reads_for_non_ref > 2 and reads_rev_non_ref > 2): + return False + + # Further quality check if PV4 exists, skip this call if fails + if pv4 is not None: + pv_strand, pv_baseq, pv_mapq, pv_tail_distance = map(float, pv4.split(",")) + if not (pv_strand > 0.001 and pv_mapq > 0.001 and pv_tail_distance > 0.001): + return False + + return True + + +# Check if this call is a Het-SNP and add/remove Het-SNP to/from het_noncluster_pos +def is_het_snp(het_noncluster_pos, pos, prev_het_pos, reads_for_non_ref, reads_for_ref, reads_rev_non_ref, reads_rev_ref): + # Calculate forward and reverse non-reference reads ratios (variant allele frequencies) + forward_non_ref_ratio = reads_for_non_ref / (reads_for_non_ref + reads_for_ref) + reverse_non_ref_ratio = reads_rev_non_ref / (reads_rev_non_ref + reads_rev_ref) + + # Consider as Het-SNP when both forward and reverse non-reference reads ratios are below 0.90 + if forward_non_ref_ratio < 0.90 and reverse_non_ref_ratio < 0.90: + # If the distance between current and previous Het-SNP position is >= the minimum non-cluster SNP distance or there is no previous Het-SNP, + # add the position to the list of non-cluster Het-SNP positions + if prev_het_pos 
is None or pos - prev_het_pos >= MIN_SNP_DISTANCE: + het_noncluster_pos.append(pos) + # If the last Het-SNP in the list of non-cluster Het-SNP positions is part of the current cluster, remove it + elif het_noncluster_pos and pos - het_noncluster_pos[-1] < MIN_SNP_DISTANCE: + het_noncluster_pos.pop() + + return True + + return False + - # Calculate forward and reverse non-reference reads ratios (variant allele frequencies) - forward_non_ref_ratio = reads_for_non_ref / (reads_for_non_ref + reads_for_ref) - reverse_non_ref_ratio = reads_rev_non_ref / (reads_rev_non_ref + reads_rev_ref) - - # Consider as Het-SNP when both forward and reverse non-reference reads ratios are below 0.90 - if forward_non_ref_ratio < 0.90 and reverse_non_ref_ratio < 0.90: - # If the distance between current and previous Het-SNP position is >= the minimum non-cluster SNP distance, - # add the position to the list of non-cluster Het-SNP positions - if pos - prev_het_pos >= min_snp_distance: - het_noncluster_pos.append(pos) - # If the last Het-SNP in the list of non-cluster Het-SNP positions is part of the current cluster, remove it - elif het_noncluster_pos and pos - het_noncluster_pos[-1] < min_snp_distance: - het_noncluster_pos.pop() - # Mark current pos as previous Het-SNP pos for the next Het-SNP - prev_het_pos = pos - - # Amount of non-cluster Het-SNP sites, print to be captured by Nextflow - het_noncluster_sites = len(het_noncluster_pos) - print(het_noncluster_sites, end="") +if __name__ == "__main__": + main() diff --git a/modules/mapping.nf b/modules/mapping.nf index 0a37628..d545607 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -120,8 +120,10 @@ process HET_SNP_COUNT { tuple val(sample_id), env(OUTPUT), emit: result script: + het_snp_count_output='output.txt' """ - OUTPUT=`het_snp_count.py "$vcf" 50` + het_snp_count.py "$vcf" 50 "$het_snp_count_output" + OUTPUT=`cat $het_snp_count_output` """ } From 8f15a7de7d3a77d34a95d8ebf1ac166af9bb8a96 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 13:45:59 +0000 Subject: [PATCH 29/41] Improve shell scripts style --- bin/mapping_qc.sh | 2 +- bin/read_qc.sh | 2 +- bin/taxonomy_qc.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh index a737d09..9ed580d 100755 --- a/bin/mapping_qc.sh +++ b/bin/mapping_qc.sh @@ -9,4 +9,4 @@ else fi echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > "$MAPPING_QC_REPORT" -echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$HET_SNP"\" >> "$MAPPING_QC_REPORT" +echo \""$MAPPING_QC"\",\""$COVERAGE"\",\""$HET_SNP"\" >> "$MAPPING_QC_REPORT" diff --git a/bin/read_qc.sh b/bin/read_qc.sh index 14d7519..a72c1d7 100755 --- a/bin/read_qc.sh +++ b/bin/read_qc.sh @@ -9,4 +9,4 @@ else fi echo \"Read_QC\",\"Bases\" > "$READ_QC_REPORT" -echo \"$READ_QC\",\""$BASES"\" >> "$READ_QC_REPORT" +echo \""$READ_QC"\",\""$BASES"\" >> "$READ_QC_REPORT" diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh index 7528b1d..a867804 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/taxonomy_qc.sh @@ -13,4 +13,4 @@ else fi echo \"Taxonomy_QC\",\"S.Pneumo_%\" > "$TAXONOMY_QC_REPORT" -echo \"$TAXONOMY_QC\",\""$PERCENTAGE"\" >> "$TAXONOMY_QC_REPORT" +echo \""$TAXONOMY_QC"\",\""$PERCENTAGE"\" >> "$TAXONOMY_QC_REPORT" From 9d6ead6f62d07b58ca5e304f4fc45702d4a25b81 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:06:38 +0000 Subject: [PATCH 30/41] Improve chart --- doc/workflow.drawio.svg | 238 
++++++++++++++++++++--------------------
 1 file changed, 119 insertions(+), 119 deletions(-)

diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg
index 00766b6..873d1b7 100644
--- a/doc/workflow.drawio.svg
+++ b/doc/workflow.drawio.svg
[SVG markup not reproduced: the body of this diff is drawio-generated SVG that does not survive as plain text. Recoverable labels from the updated workflow chart: Input, Output, FASTQ (Reads), FASTA (Assemblies), SAM, Results; default QC thresholds Bases: ≥ 38 Mb, S. Pneumo: > 60%, Contigs: < 500, Length: 1.9 - 2.3 Mb, Depth: ≥ 20x, Ref Coverage: > 60%, Het-SNP site: < 220; Go / No-go decision points; and the note "QC values shown in the diagram are the default values".]
@@ -537,11 +531,17 @@ - + QC values shown in the diagram are the default values + + + + Go / No-go + + From d899ec958206749174162c4e5e82a1511980bb17 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:37:56 +0000 Subject: [PATCH 31/41] Update Nextflow executable to 23.04.2 --- nextflow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow b/nextflow index a6ece4c..b7725ce 100755 --- a/nextflow +++ b/nextflow @@ -15,7 +15,7 @@ # limitations under the License. [[ "$NXF_DEBUG" == 'x' ]] && set -x -NXF_VER=${NXF_VER:-'23.04.1'} +NXF_VER=${NXF_VER:-'23.04.2'} NXF_ORG=${NXF_ORG:-'nextflow-io'} NXF_HOME=${NXF_HOME:-$HOME/.nextflow} NXF_PROT=${NXF_PROT:-'https'} From a98510ee41c53e924b7fdd46949c1104d62b6bef Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:47:49 +0000 Subject: [PATCH 32/41] Improve wording of messages. --- modules/messages.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/messages.nf b/modules/messages.nf index 8e13998..7a9d189 100644 --- a/modules/messages.nf +++ b/modules/messages.nf @@ -46,7 +46,7 @@ void workflowSelectMessage(String selectedWorkflow) { switch (selectedWorkflow) { case 'pipeline': message = """ - |The main pipeline workflow was selected. + |The main pipeline workflow has been selected. | |Input Directory: ${readsDir.canonicalPath} |Output Directory: ${outputDir.canonicalPath} @@ -54,12 +54,12 @@ void workflowSelectMessage(String selectedWorkflow) { break case 'init': message = ''' - |The alternative workflow for initialisation was selected. + |The alternative workflow for initialisation has been selected. '''.stripMargin() break case 'version': message = ''' - |The alternative workflow for getting versions of pipeline, tools and databases was selected. + |The alternative workflow for getting versions of pipeline, tools and databases has been selected. 
'''.stripMargin() break } From fa3f58f26c148d373780e12886af10517c8335be Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:53:48 +0000 Subject: [PATCH 33/41] Improve shell scripts style --- bin/get_databases_info.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/get_databases_info.sh b/bin/get_databases_info.sh index 3d9dd98..10f2174 100755 --- a/bin/get_databases_info.sh +++ b/bin/get_databases_info.sh @@ -1,7 +1,7 @@ # Save received databases information into a JSON file add_bwa_db () { - BWA_DB_JSON=${BWA_DB_PATH}/${BWA_JSON} + BWA_DB_JSON="${BWA_DB_PATH}/${BWA_JSON}" if [ -f "$BWA_DB_JSON" ]; then REFERENCE=$(jq -r .reference "$BWA_DB_JSON") REFERENCE_MD5=$(jq -r .reference_md5 "$BWA_DB_JSON") @@ -15,7 +15,7 @@ add_bwa_db () { } add_ariba_db () { - ARIBA_DB_JSON=${ARIBA_DB_PATH}/${ARIBA_JSON} + ARIBA_DB_JSON="${ARIBA_DB_PATH}/${ARIBA_JSON}" if [ -f "$ARIBA_DB_JSON" ]; then REFERENCE=$(jq -r .reference "$ARIBA_DB_JSON") REFERENCE_MD5=$(jq -r .reference_md5 "$ARIBA_DB_JSON") @@ -33,7 +33,7 @@ add_ariba_db () { } add_seroba_db () { - SEROBA_DB_JSON=${SEROBA_DB_PATH}/${SEROBA_JSON} + SEROBA_DB_JSON="${SEROBA_DB_PATH}/${SEROBA_JSON}" if [ -f "$SEROBA_DB_JSON" ]; then GIT=$(jq -r .git "$SEROBA_DB_JSON") KMER=$(jq -r .kmer "$SEROBA_DB_JSON") @@ -47,7 +47,7 @@ add_seroba_db () { } add_url_db () { - DB_JSON=$1 + DB_JSON="$1" if [ -f "$DB_JSON" ]; then URL=$(jq -r .url "$DB_JSON") SAVE_TIME=$(jq -r .save_time "$DB_JSON") From 6c54b3e998ca66a1ba47bda6d8845edaed8e62ce Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 17:29:22 +0000 Subject: [PATCH 34/41] Improve names & comments of processes & scripts --- ...e_ariba_db.sh => check-create_ariba_db.sh} | 2 +- ...b.sh => check-create_ref_genome_bwa_db.sh} | 2 +- ...en2_db.sh => check-download_kraken2_db.sh} | 0 ...unk_db.sh => check-download_poppunk_db.sh} | 0 ...=> check-download_poppunk_ext_clusters.sh} | 0 bin/{get_seroba_db.sh => check_seroba_db.sh} | 6 +- ...er_compose.sh => create_docker_compose.sh} | 2 +- bin/{assembly_qc.sh => get_assembly_qc.sh} | 0 bin/{mapping_qc.sh => get_mapping_qc.sh} | 0 bin/{overall_qc.sh => get_overall_qc.sh} | 0 bin/{read_qc.sh => get_read_qc.sh} | 0 bin/{taxonomy_qc.sh => get_taxonomy_qc.sh} | 2 +- ...esistance.py => parse_other_resistance.py} | 0 ..._resistance.sh => parse_pbp_resistance.sh} | 0 ...{combine_info.sh => save_combined_info.sh} | 0 ...tabases_info.sh => save_databases_info.sh} | 0 ...get_images_info.sh => save_images_info.sh} | 0 bin/{get_tools_info.sh => save_tools_info.sh} | 0 modules/amr.nf | 16 ++--- modules/assembly.nf | 6 +- modules/docker.nf | 2 +- modules/info.nf | 8 +-- modules/lineage.nf | 4 +- modules/mapping.nf | 6 +- modules/overall_qc.nf | 2 +- modules/preprocess.nf | 2 +- modules/serotype.nf | 8 +-- modules/taxonomy.nf | 4 +- workflows/init.nf | 14 ++-- workflows/pipeline.nf | 66 ++++++++++--------- 30 files changed, 76 insertions(+), 76 deletions(-) rename bin/{create_ariba_db.sh => check-create_ariba_db.sh} (95%) rename bin/{create_ref_genome_bwa_db.sh => check-create_ref_genome_bwa_db.sh} (92%) rename bin/{get_kraken2_db.sh => check-download_kraken2_db.sh} (100%) rename bin/{get_poppunk_db.sh => check-download_poppunk_db.sh} (100%) rename bin/{get_poppunk_ext_clusters.sh => check-download_poppunk_ext_clusters.sh} (100%) rename bin/{get_seroba_db.sh => check_seroba_db.sh} (63%) rename bin/{get_docker_compose.sh => 
create_docker_compose.sh} (95%) rename bin/{assembly_qc.sh => get_assembly_qc.sh} (100%) rename bin/{mapping_qc.sh => get_mapping_qc.sh} (100%) rename bin/{overall_qc.sh => get_overall_qc.sh} (100%) rename bin/{read_qc.sh => get_read_qc.sh} (100%) rename bin/{taxonomy_qc.sh => get_taxonomy_qc.sh} (95%) rename bin/{get_other_resistance.py => parse_other_resistance.py} (100%) rename bin/{get_pbp_resistance.sh => parse_pbp_resistance.sh} (100%) rename bin/{combine_info.sh => save_combined_info.sh} (100%) rename bin/{get_databases_info.sh => save_databases_info.sh} (100%) rename bin/{get_images_info.sh => save_images_info.sh} (100%) rename bin/{get_tools_info.sh => save_tools_info.sh} (100%) diff --git a/bin/create_ariba_db.sh b/bin/check-create_ariba_db.sh similarity index 95% rename from bin/create_ariba_db.sh rename to bin/check-create_ariba_db.sh index fb2b657..32ff767 100755 --- a/bin/create_ariba_db.sh +++ b/bin/check-create_ariba_db.sh @@ -1,4 +1,4 @@ -# Check if CREATE_ARIBA_DB has run successfully on the specific reference sequences and metadata. +# Check if ARIBA database was prepared from the specific reference sequences and metadata. # If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to JSON REF_SEQUENCES_MD5=$(md5sum "$REF_SEQUENCES" | awk '{ print $1 }') diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/check-create_ref_genome_bwa_db.sh similarity index 92% rename from bin/create_ref_genome_bwa_db.sh rename to bin/check-create_ref_genome_bwa_db.sh index 385b609..65a7da8 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/check-create_ref_genome_bwa_db.sh @@ -1,4 +1,4 @@ -# Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. +# Check if BWA database was prepared from the specific reference. # If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to JSON REFERENCE_MD5=$(md5sum "$REFERENCE" | awk '{ print $1 }') diff --git a/bin/get_kraken2_db.sh b/bin/check-download_kraken2_db.sh similarity index 100% rename from bin/get_kraken2_db.sh rename to bin/check-download_kraken2_db.sh diff --git a/bin/get_poppunk_db.sh b/bin/check-download_poppunk_db.sh similarity index 100% rename from bin/get_poppunk_db.sh rename to bin/check-download_poppunk_db.sh diff --git a/bin/get_poppunk_ext_clusters.sh b/bin/check-download_poppunk_ext_clusters.sh similarity index 100% rename from bin/get_poppunk_ext_clusters.sh rename to bin/check-download_poppunk_ext_clusters.sh diff --git a/bin/get_seroba_db.sh b/bin/check_seroba_db.sh similarity index 63% rename from bin/get_seroba_db.sh rename to bin/check_seroba_db.sh index 0cda2fc..2e6ff2d 100755 --- a/bin/get_seroba_db.sh +++ b/bin/check_seroba_db.sh @@ -1,7 +1,5 @@ -# Return boolean of CREATE_DB, download if necessary - -# Check if GET_SEROBA_DB and CREATE_SEROBA_DB has run successfully on the database at the specific link, CREATE_SEROBA_DB used the specific Kmerm and pull to check if SeroBA database is up-to-date. 
-# If outdated or does not exist: remove files in database directory and clone, set CREATE_DB to true +# Check if database was cloned from specific link and is up-to-date, also prepared by the specific Kmer +# If not: remove files in database directory and clone, set CREATE_DB to true # Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage diff --git a/bin/get_docker_compose.sh b/bin/create_docker_compose.sh similarity index 95% rename from bin/get_docker_compose.sh rename to bin/create_docker_compose.sh index 5f8ff8b..d6fc3ba 100755 --- a/bin/get_docker_compose.sh +++ b/bin/create_docker_compose.sh @@ -1,4 +1,4 @@ -# Generate a Docker compose file that includes all images used in nextflow.config +# Generate a Docker compose file that includes all images used in $NEXTFLOW_CONFIG COUNT=0 diff --git a/bin/assembly_qc.sh b/bin/get_assembly_qc.sh similarity index 100% rename from bin/assembly_qc.sh rename to bin/get_assembly_qc.sh diff --git a/bin/mapping_qc.sh b/bin/get_mapping_qc.sh similarity index 100% rename from bin/mapping_qc.sh rename to bin/get_mapping_qc.sh diff --git a/bin/overall_qc.sh b/bin/get_overall_qc.sh similarity index 100% rename from bin/overall_qc.sh rename to bin/get_overall_qc.sh diff --git a/bin/read_qc.sh b/bin/get_read_qc.sh similarity index 100% rename from bin/read_qc.sh rename to bin/get_read_qc.sh diff --git a/bin/taxonomy_qc.sh b/bin/get_taxonomy_qc.sh similarity index 95% rename from bin/taxonomy_qc.sh rename to bin/get_taxonomy_qc.sh index a867804..cb1e382 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/get_taxonomy_qc.sh @@ -1,4 +1,4 @@ -# Extract taxonomy QC information and determine QC result based on kraken2_report.txt +# Extract taxonomy QC information and determine QC result based on $KRAKEN2_REPORT PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' "$KRAKEN2_REPORT") diff --git a/bin/get_other_resistance.py b/bin/parse_other_resistance.py similarity index 100% rename from bin/get_other_resistance.py rename to bin/parse_other_resistance.py diff --git a/bin/get_pbp_resistance.sh b/bin/parse_pbp_resistance.sh similarity index 100% rename from bin/get_pbp_resistance.sh rename to bin/parse_pbp_resistance.sh diff --git a/bin/combine_info.sh b/bin/save_combined_info.sh similarity index 100% rename from bin/combine_info.sh rename to bin/save_combined_info.sh diff --git a/bin/get_databases_info.sh b/bin/save_databases_info.sh similarity index 100% rename from bin/get_databases_info.sh rename to bin/save_databases_info.sh diff --git a/bin/get_images_info.sh b/bin/save_images_info.sh similarity index 100% rename from bin/get_images_info.sh rename to bin/save_images_info.sh diff --git a/bin/get_tools_info.sh b/bin/save_tools_info.sh similarity index 100% rename from bin/get_tools_info.sh rename to bin/save_tools_info.sh diff --git a/modules/amr.nf b/modules/amr.nf index 1fd57f4..d7a7206 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -19,7 +19,7 @@ process PBP_RESISTANCE { } // Extract the results from the output file of the PBP AMR predictor -process GET_PBP_RESISTANCE { +process PARSE_PBP_RESISTANCE { label 'bash_container' label 'farm_low' @@ -37,12 +37,12 @@ process GET_PBP_RESISTANCE { JSON_FILE="$json" PBP_AMR_REPORT="$pbp_amr_report" - source get_pbp_resistance.sh + source parse_pbp_resistance.sh """ } -// Create ARIBA database and return database path -process CREATE_ARIBA_DB { +// Return database path, create if necessary +process GET_ARIBA_DB { 
label 'ariba_container' label 'farm_low' @@ -65,7 +65,7 @@ process CREATE_ARIBA_DB { OUTPUT="$output" JSON_FILE="$json" - source create_ariba_db.sh + source check-create_ariba_db.sh """ } @@ -88,12 +88,12 @@ process OTHER_RESISTANCE { report='result/report.tsv' report_debug='result/debug.report.tsv' """ - ariba run --nucmer_min_id 80 --assembled_threshold 0.80 $ariba_database/$database $read1 $read2 result + ariba run --nucmer_min_id 80 --assembled_threshold 0.80 "$ariba_database/$database" "$read1" "$read2" result """ } // Extracting resistance information from ARIBA report -process GET_OTHER_RESISTANCE { +process PARSE_OTHER_RESISTANCE { label 'python_container' label 'farm_low' @@ -109,6 +109,6 @@ process GET_OTHER_RESISTANCE { script: output_file="other_amr_report.csv" """ - get_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file" + parse_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file" """ } diff --git a/modules/assembly.nf b/modules/assembly.nf index fd84c76..ab66b6e 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -1,5 +1,5 @@ // Run Unicycler to get assembly -// Return sample_id and assembly, and hardlink the assembly to ${params.output}/assemblies directory +// Return sample_id and assembly, and publish the assembly to ${params.output}/assemblies directory based on ${params.assembly_publish} process ASSEMBLY_UNICYCLER { label 'unicycler_container' label 'farm_high_fallible' @@ -26,7 +26,7 @@ process ASSEMBLY_UNICYCLER { } // Run Shovill to get assembly -// Return sample_id and assembly, and hardlink the assembly to ${params.output}/assemblies directory +// Return sample_id and assembly, and publish the assembly to ${params.output}/assemblies directory based on ${params.assembly_publish} process ASSEMBLY_SHOVILL { label 'shovill_container' label 'farm_high_fallible' @@ -99,6 +99,6 @@ process ASSEMBLY_QC { QC_DEPTH="$qc_depth" ASSEMBLY_QC_REPORT="$assembly_qc_report" - source assembly_qc.sh + source get_assembly_qc.sh """ } diff --git a/modules/docker.nf b/modules/docker.nf index 4090957..ef0236b 100644 --- a/modules/docker.nf +++ b/modules/docker.nf @@ -15,7 +15,7 @@ process GET_DOCKER_COMPOSE { NEXTFLOW_CONFIG="$nextflowConfig" COMPOSE="$compose" - source get_docker_compose.sh + source create_docker_compose.sh """ } diff --git a/modules/info.nf b/modules/info.nf index cde8662..a317a3b 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -18,7 +18,7 @@ process IMAGES { NEXTFLOW_CONFIG="$nextflowConfig" JSON_FILE="$json" - source get_images_info.sh + source save_images_info.sh """ } @@ -59,7 +59,7 @@ process DATABASES { POPPUNK_EXT_JSON="$poppunk_ext_json" JSON_FILE="$json" - source get_databases_info.sh + source save_databases_info.sh """ } @@ -106,7 +106,7 @@ process TOOLS { ARIBA_VERSION="$ariba_version" JSON_FILE="$json" - source get_tools_info.sh + source save_tools_info.sh """ } @@ -135,7 +135,7 @@ process COMBINE_INFO { TOOLS="$tools" JSON_FILE="$json" - source combine_info.sh + source save_combined_info.sh """ } diff --git a/modules/lineage.nf b/modules/lineage.nf index 68edae3..9090f02 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -18,7 +18,7 @@ process GET_POPPUNK_DB { DB_LOCAL="$local" JSON_FILE="$json" - source get_poppunk_db.sh + source check-download_poppunk_db.sh """ } @@ -41,7 +41,7 @@ process GET_POPPUNK_EXT_CLUSTERS { EXT_CLUSTERS_LOCAL="$local" JSON_FILE="$json" - source get_poppunk_ext_clusters.sh + source check-download_poppunk_ext_clusters.sh """ } diff --git a/modules/mapping.nf 
b/modules/mapping.nf index d545607..e5d7c4e 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -1,5 +1,5 @@ // Return database path and prefix, construct if necessary -process CREATE_REF_GENOME_BWA_DB { +process GET_REF_GENOME_BWA_DB { label 'bwa_container' label 'farm_mid' @@ -20,7 +20,7 @@ process CREATE_REF_GENOME_BWA_DB { PREFIX="$prefix" JSON_FILE="$json" - source create_ref_genome_bwa_db.sh + source check-create_ref_genome_bwa_db.sh """ } @@ -152,6 +152,6 @@ process MAPPING_QC { QC_HET_SNP_SITE="$qc_het_snp_site" MAPPING_QC_REPORT="$mapping_qc_report" - source mapping_qc.sh + source get_mapping_qc.sh """ } diff --git a/modules/overall_qc.nf b/modules/overall_qc.nf index fa639d9..31c8cca 100644 --- a/modules/overall_qc.nf +++ b/modules/overall_qc.nf @@ -21,6 +21,6 @@ process OVERALL_QC { TAXONOMY_QC="$taxonomy_qc" OVERALL_QC_REPORT="$overall_qc_report" - source overall_qc.sh + source get_overall_qc.sh """ } diff --git a/modules/preprocess.nf b/modules/preprocess.nf index e04b756..e87ef99 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -48,6 +48,6 @@ process READ_QC { QC_DEPTH="$qc_depth" READ_QC_REPORT="$read_qc_report" - source read_qc.sh + source get_read_qc.sh """ } diff --git a/modules/serotype.nf b/modules/serotype.nf index 0c69bad..02327dd 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -1,5 +1,5 @@ -// Return boolean of CREATE_DB, download if necessary -process GET_SEROBA_DB { +// Return boolean of CREATE_DB, remove and clone if necessary +process CHECK_SEROBA_DB { label 'git_container' label 'farm_low' @@ -19,12 +19,12 @@ process GET_SEROBA_DB { KMER="$kmer" JSON_FILE="$json" - source get_seroba_db.sh + source check_seroba_db.sh """ } // Return SeroBA databases path, create databases if necessary -process CREATE_SEROBA_DB { +process GET_SEROBA_DB { label 'seroba_container' label 'farm_low' diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index b4d1e62..34ebeab 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -17,7 +17,7 @@ process GET_KRAKEN2_DB { DB_LOCAL="$local" JSON_FILE="$json" - source get_kraken2_db.sh + source check-download_kraken2_db.sh """ } @@ -73,6 +73,6 @@ process TAXONOMY_QC { QC_SPNEUMO_PERCENTAGE="$qc_spneumo_percentage" TAXONOMY_QC_REPORT="$taxonomy_qc_report" - source taxonomy_qc.sh + source get_taxonomy_qc.sh """ } diff --git a/workflows/init.nf b/workflows/init.nf index 64a748f..20eff25 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -1,25 +1,25 @@ // Import process modules -include { CREATE_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" +include { GET_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" include { GET_KRAKEN2_DB } from "$projectDir/modules/taxonomy" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS } from "$projectDir/modules/lineage" -include { GET_SEROBA_DB; CREATE_SEROBA_DB } from "$projectDir/modules/serotype" +include { CHECK_SEROBA_DB; GET_SEROBA_DB } from "$projectDir/modules/serotype" include { GET_DOCKER_COMPOSE; PULL_IMAGES } from "$projectDir/modules/docker" -include { CREATE_ARIBA_DB } from "$projectDir/modules/amr" +include { GET_ARIBA_DB } from "$projectDir/modules/amr" // Alternative workflow for initialisation only workflow INIT { // Check Reference Genome BWA Database, generate from assembly if necessary - CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + GET_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Check ARIBA database, generate from reference sequences and metadata if ncessary - 
CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) + GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) // Check Kraken2 Database, download if necessary GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Check SeroBA Databases, clone and rebuild if necessary - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - CREATE_SEROBA_DB(params.seroba_remote, params.seroba_local, GET_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Check to PopPUNK Database and External Clusters, download if necessary GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index fd288c9..01f3172 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -1,34 +1,34 @@ // Import process modules include { PREPROCESS; READ_QC } from "$projectDir/modules/preprocess" include { ASSEMBLY_UNICYCLER; ASSEMBLY_SHOVILL; ASSEMBLY_ASSESS; ASSEMBLY_QC } from "$projectDir/modules/assembly" -include { CREATE_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" +include { GET_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" include { GET_KRAKEN2_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" include { OVERALL_QC } from "$projectDir/modules/overall_qc" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/modules/lineage" -include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" +include { CHECK_SEROBA_DB; GET_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" -include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" +include { PBP_RESISTANCE; PARSE_PBP_RESISTANCE; GET_ARIBA_DB; OTHER_RESISTANCE; PARSE_OTHER_RESISTANCE } from "$projectDir/modules/amr" include { GENERATE_SAMPLE_REPORT; GENERATE_OVERALL_REPORT } from "$projectDir/modules/output" // Main pipeline workflow workflow PIPELINE { main: // Get path and prefix of Reference Genome BWA Database, generate from assembly if necessary - CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + GET_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Get path to Kraken2 Database, download if necessary GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Get path to SeroBA Databases, clone and rebuild if necessary - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - CREATE_SEROBA_DB(params.seroba_remote, params.seroba_local, GET_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Get paths to PopPUNK Database and External Clusters, download if necessary GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) // Get path to ARIBA database, generate from reference sequences and metadata if ncessary - CREATE_ARIBA_DB(params.ariba_ref, 
params.ariba_metadata, params.ariba_db_local) + GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) // Get read pairs into Channel raw_read_pairs_ch raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) @@ -38,7 +38,7 @@ workflow PIPELINE { PREPROCESS(raw_read_pairs_ch) // From Channel PREPROCESS.out.json, provide Read QC status - // Output into Channel READ_QC_PASSED_READS_ch + // Output into Channels READ_QC.out.bases, READ_QC.out.result, READ_QC.out.report READ_QC(PREPROCESS.out.json, params.length_low, params.depth) // From Channel PREPROCESS.out.processed_reads, only output reads of samples passed Read QC based on Channel READ_QC.out.result @@ -47,7 +47,7 @@ workflow PIPELINE { .map { it[0, 2..-1] } // From Channel READ_QC_PASSED_READS_ch, assemble the preprocess read pairs - // Output into Channel ASSEMBLY_ch, and hardlink the assemblies to $params.output directory + // Output into Channel ASSEMBLY_ch, and hardlink (default) the assemblies to $params.output directory switch (params.assembler) { case 'shovill': ASSEMBLY_ch = ASSEMBLY_SHOVILL(READ_QC_PASSED_READS_ch, params.min_contig_length) @@ -59,10 +59,11 @@ workflow PIPELINE { } // From Channel ASSEMBLY_ch, assess assembly quality + // Output into Channel ASSEMBLY_ASSESS.out.report ASSEMBLY_ASSESS(ASSEMBLY_ch) // From Channel ASSEMBLY_ASSESS.out.report and Channel READ_QC.out.bases, provide Assembly QC status - // Output into Channels ASSEMBLY_QC.out.detailed_result & ASSEMBLY_QC.out.result + // Output into Channels ASSEMBLY_QC.out.result & ASSEMBLY_QC.out.report ASSEMBLY_QC( ASSEMBLY_ASSESS.out.report .join(READ_QC.out.bases, failOnDuplicate: true), @@ -74,7 +75,7 @@ workflow PIPELINE { // From Channel READ_QC_PASSED_READS_ch map reads to reference // Output into Channel MAPPING.out.sam - MAPPING(CREATE_REF_GENOME_BWA_DB.out.path, CREATE_REF_GENOME_BWA_DB.out.prefix, READ_QC_PASSED_READS_ch) + MAPPING(GET_REF_GENOME_BWA_DB.out.path, GET_REF_GENOME_BWA_DB.out.prefix, READ_QC_PASSED_READS_ch) // From Channel MAPPING.out.sam, Convert SAM into sorted BAM and calculate reference coverage // Output into Channels SAM_TO_SORTED_BAM.out.bam and SAM_TO_SORTED_BAM.out.ref_coverage @@ -82,10 +83,11 @@ workflow PIPELINE { // From Channel SAM_TO_SORTED_BAM.out.bam calculates non-cluster Het-SNP site count // Output into Channel HET_SNP_COUNT.out.result - SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.bam, params.lite) | HET_SNP_COUNT + SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.bam, params.lite) + HET_SNP_COUNT(SNP_CALL.out.vcf) // Merge Channels SAM_TO_SORTED_BAM.out.ref_coverage & HET_SNP_COUNT.out.result to provide Mapping QC Status - // Output into Channels MAPPING_QC.out.detailed_result & MAPPING_QC.out.result + // Output into Channels MAPPING_QC.out.result & MAPPING_QC.out.report MAPPING_QC( SAM_TO_SORTED_BAM.out.ref_coverage .join(HET_SNP_COUNT.out.result, failOnDuplicate: true, failOnMismatch: true), @@ -94,15 +96,15 @@ workflow PIPELINE { ) // From Channel READ_QC_PASSED_READS_ch assess Streptococcus pneumoniae percentage in reads - // Output into Channels TAXONOMY.out.detailed_result & TAXONOMY.out.result report + // Output into Channel TAXONOMY.out.report TAXONOMY(GET_KRAKEN2_DB.out.path, params.kraken2_memory_mapping, READ_QC_PASSED_READS_ch) // From Channel TAXONOMY.out.report, provide taxonomy QC status - // Output into Channels TAXONOMY_QC.out.detailed_result & TAXONOMY_QC.out.result report + // Output into Channels 
TAXONOMY_QC.out.result & TAXONOMY_QC.out.report TAXONOMY_QC(TAXONOMY.out.report, params.spneumo_percentage) - // Merge Channels ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status - // Output into Channel OVERALL_QC.out.result + // Merge Channels READ_QC.out.result & ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status + // Output into Channel OVERALL_QC.out.result & OVERALL_QC.out.report OVERALL_QC( READ_QC.out.result .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) @@ -121,31 +123,31 @@ workflow PIPELINE { .map { it[0, 2..-1] } // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, generate PopPUNK query file containing assemblies of samples passed overall QC - // Output into POPPUNK_QFILE POPPUNK_QFILE = OVERALL_QC_PASSED_ASSEMBLIES_ch .map { it.join'\t' } .collectFile(name: 'qfile.txt', newLine: true) // From generated POPPUNK_QFILE, assign GPSC to samples passed overall QC + // Output into Channel LINEAGE.out.reports (multiple reports from a single process) LINEAGE(GET_POPPUNK_DB.out.path, GET_POPPUNK_DB.out.database, GET_POPPUNK_EXT_CLUSTERS.out.file, POPPUNK_QFILE) // From Channel OVERALL_QC_PASSED_READS_ch, serotype the preprocess reads of samples passed overall QC - // Output into Channel SEROTYPE.out.result - SEROTYPE(CREATE_SEROBA_DB.out.path, CREATE_SEROBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) + // Output into Channel SEROTYPE.out.report + SEROTYPE(GET_SEROBA_DB.out.path, GET_SEROBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, PubMLST typing the assemblies of samples passed overall QC - // Output into Channel MLST.out.result + // Output into Channel MLST.out.report MLST(OVERALL_QC_PASSED_ASSEMBLIES_ch) // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, assign PBP genes and estimate MIC (minimum inhibitory concentration) for 6 Beta-lactam antibiotics - // Output into Channel GET_PBP_RESISTANCE.out.result + // Output into Channel PARSE_PBP_RESISTANCE.out.report PBP_RESISTANCE(OVERALL_QC_PASSED_ASSEMBLIES_ch) - GET_PBP_RESISTANCE(PBP_RESISTANCE.out.json) + PARSE_PBP_RESISTANCE(PBP_RESISTANCE.out.json) - // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials - // Output into Channel GET_OTHER_RESISTANCE.out.result - OTHER_RESISTANCE(CREATE_ARIBA_DB.out.path, CREATE_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) - GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) + // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance and determinants of other antimicrobials + // Output into Channel PARSE_OTHER_RESISTANCE.out.result + OTHER_RESISTANCE(GET_ARIBA_DB.out.path, GET_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) + PARSE_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) // Generate sample reports by merging outputs from all result-generating modules GENERATE_SAMPLE_REPORT( @@ -156,8 +158,8 @@ workflow PIPELINE { .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(MLST.out.report, failOnDuplicate: true, remainder: true) - .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) - .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) + .join(PARSE_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) + .join(PARSE_OTHER_RESISTANCE.out.report, failOnDuplicate:
true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } // Map Sample_ID to index 0 and all reports (with null entries removed) as a list to index 1 ) @@ -166,10 +168,10 @@ workflow PIPELINE { GENERATE_OVERALL_REPORT(GENERATE_SAMPLE_REPORT.out.report.collect(), params.ariba_metadata) // Pass databases information to SAVE_INFO sub-workflow - DATABASES_INFO = CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } - .merge(CREATE_ARIBA_DB.out.path.map { [["ariba_db_path", it]] }) + DATABASES_INFO = GET_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } + .merge(GET_ARIBA_DB.out.path.map { [["ariba_db_path", it]] }) .merge(GET_KRAKEN2_DB.out.path.map { [["kraken2_db_path", it]] }) - .merge(CREATE_SEROBA_DB.out.path.map { [["seroba_db_path", it]] }) + .merge(GET_SEROBA_DB.out.path.map { [["seroba_db_path", it]] }) .merge(GET_POPPUNK_DB.out.path.map { [["poppunk_db_path", it]] }) .merge(GET_POPPUNK_EXT_CLUSTERS.out.file.map { [["poppunk_ext_file", it]] }) // Save key-value tuples into a map From 76afb5731cc9171b4528393a9155998a7dde0e3d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:10:13 +0000 Subject: [PATCH 35/41] Extract scripts to separated files --- bin/call_snp.sh | 8 ++++++++ bin/convert_sam_to_sorted_bam.sh | 11 +++++++++++ modules/mapping.nf | 31 ++++++++++++++----------------- workflows/pipeline.nf | 6 +++--- 4 files changed, 36 insertions(+), 20 deletions(-) create mode 100755 bin/call_snp.sh create mode 100755 bin/convert_sam_to_sorted_bam.sh diff --git a/bin/call_snp.sh b/bin/call_snp.sh new file mode 100755 index 0000000..d3fba63 --- /dev/null +++ b/bin/call_snp.sh @@ -0,0 +1,8 @@ +# Call SNPs and save to .vcf +# Remove source sorted BAM file if $LITE is true + +bcftools mpileup --threads "$(nproc)" -f "$REFERENCE" "$SORTED_BAM" | bcftools call --threads "$(nproc)" -mv -O v -o "$VCF" + +if [ "$LITE" = true ]; then + rm "$(readlink -f "$SORTED_BAM")" +fi diff --git a/bin/convert_sam_to_sorted_bam.sh b/bin/convert_sam_to_sorted_bam.sh new file mode 100755 index 0000000..c730a5f --- /dev/null +++ b/bin/convert_sam_to_sorted_bam.sh @@ -0,0 +1,11 @@ +# Convet SAM to sorted BAM file +# Remove source SAM file if $LITE is true + +samtools view -@ "$(nproc)" -b "$SAM" > "$BAM" + +samtools sort -@ "$(nproc)" -o "$SORTED_BAM" "$BAM" +rm "$BAM" + +if [ "$LITE" = true ]; then + rm "$(readlink -f "$SAM")" +fi diff --git a/modules/mapping.nf b/modules/mapping.nf index e5d7c4e..964a415 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -43,7 +43,7 @@ process MAPPING { script: sam="${sample_id}_mapped.sam" """ - bwa mem -t `nproc` "${bwa_ref_db_dir}/${prefix}" <(zcat -f -- < "$read1") <(zcat -f -- < "$read2") > "$sam" + bwa mem -t "`nproc`" "${bwa_ref_db_dir}/${prefix}" <(zcat -f -- < "$read1") <(zcat -f -- < "$read2") > "$sam" """ } @@ -60,22 +60,18 @@ process SAM_TO_SORTED_BAM { val lite output: - tuple val(sample_id), path(bam), emit: bam + tuple val(sample_id), path(sorted_bam), emit: sorted_bam tuple val(sample_id), env(COVERAGE), emit: ref_coverage script: - bam="${sample_id}_mapped_sorted.bam" + sorted_bam="${sample_id}_mapped_sorted.bam" """ - samtools view -@ `nproc` -b "$sam" > mapped.bam + SAM="$sam" + BAM="mapped.bam" + SORTED_BAM="$sorted_bam" + LITE="$lite" - samtools sort -@ 
`nproc` -o "$bam" mapped.bam - rm mapped.bam - - if [ $lite = true ]; then - rm `readlink -f "$sam"` - fi - - BAM="$bam" + source convert_sam_to_sorted_bam.sh source get_ref_coverage.sh """ } @@ -89,7 +85,7 @@ process SNP_CALL { input: path reference - tuple val(sample_id), path(bam) + tuple val(sample_id), path(sorted_bam) val lite output: @@ -98,11 +94,12 @@ process SNP_CALL { script: vcf="${sample_id}.vcf" """ - bcftools mpileup --threads `nproc` -f "$reference" "$bam" | bcftools call --threads `nproc` -mv -O v -o "$vcf" + REFERENCE="$reference" + SORTED_BAM="$sorted_bam" + VCF="$vcf" + LITE="$lite" - if [ $lite = true ]; then - rm `readlink -f "$bam"` - fi + source call_snp.sh """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 01f3172..51a93f9 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -78,12 +78,12 @@ workflow PIPELINE { MAPPING(GET_REF_GENOME_BWA_DB.out.path, GET_REF_GENOME_BWA_DB.out.prefix, READ_QC_PASSED_READS_ch) // From Channel MAPPING.out.sam, Convert SAM into sorted BAM and calculate reference coverage - // Output into Channels SAM_TO_SORTED_BAM.out.bam and SAM_TO_SORTED_BAM.out.ref_coverage + // Output into Channels SAM_TO_SORTED_BAM.out.sorted_bam and SAM_TO_SORTED_BAM.out.ref_coverage SAM_TO_SORTED_BAM(MAPPING.out.sam, params.lite) - // From Channel SAM_TO_SORTED_BAM.out.bam calculates non-cluster Het-SNP site count + // From Channel SAM_TO_SORTED_BAM.out.sorted_bam calculates non-cluster Het-SNP site count // Output into Channel HET_SNP_COUNT.out.result - SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.bam, params.lite) + SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.sorted_bam, params.lite) HET_SNP_COUNT(SNP_CALL.out.vcf) // Merge Channels SAM_TO_SORTED_BAM.out.ref_coverage & HET_SNP_COUNT.out.result to provide Mapping QC Status From 2e625f5e2ee984c2dbd4966f9eedb538d3f27b96 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:11:11 +0000 Subject: [PATCH 36/41] Improve shell scripts style --- bin/get_ref_coverage.sh | 4 ++-- modules/assembly.nf | 4 ++-- modules/preprocess.nf | 2 +- modules/taxonomy.nf | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/get_ref_coverage.sh b/bin/get_ref_coverage.sh index 69a131b..199c481 100755 --- a/bin/get_ref_coverage.sh +++ b/bin/get_ref_coverage.sh @@ -1,4 +1,4 @@ # Return reference coverage percentage by the reads -samtools index -@ `nproc` "$BAM" -COVERAGE=$(samtools coverage "$BAM" | awk -F'\t' 'FNR==2 {print $6}') +samtools index -@ "$(nproc)" "$SORTED_BAM" +COVERAGE=$(samtools coverage "$SORTED_BAM" | awk -F'\t' 'FNR==2 {print $6}') diff --git a/modules/assembly.nf b/modules/assembly.nf index ab66b6e..6bf7b79 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -20,7 +20,7 @@ process ASSEMBLY_UNICYCLER { script: fasta="${sample_id}.contigs.fasta" """ - unicycler -1 "$read1" -2 "$read2" -s "$unpaired" -o results -t `nproc` --min_fasta_length "$min_contig_length" + unicycler -1 "$read1" -2 "$read2" -s "$unpaired" -o results -t "`nproc`" --min_fasta_length "$min_contig_length" mv results/assembly.fasta "${fasta}" """ } @@ -47,7 +47,7 @@ process ASSEMBLY_SHOVILL { script: fasta="${sample_id}.contigs.fasta" """ - shovill --R1 "$read1" --R2 "$read2" --outdir results --cpus `nproc` --minlen "$min_contig_length" --force + shovill --R1 "$read1" --R2 "$read2" --outdir results --cpus "`nproc`" --minlen "$min_contig_length" --force mv results/contigs.fa "${fasta}" """ } diff --git a/modules/preprocess.nf 
b/modules/preprocess.nf index e87ef99..1e89da7 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -19,7 +19,7 @@ process PREPROCESS { processed_two="processed-${sample_id}_2.fastq.gz" processed_unpaired="processed-${sample_id}_unpaired.fastq.gz" """ - fastp --thread `nproc` --in1 "$read_one" --in2 "$read_two" --out1 "$processed_one" --out2 "$processed_two" --unpaired1 "$processed_unpaired" --unpaired2 "$processed_unpaired" + fastp --thread "`nproc`" --in1 "$read_one" --in2 "$read_two" --out1 "$processed_one" --out2 "$processed_two" --unpaired1 "$processed_unpaired" --unpaired2 "$processed_unpaired" """ } diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index 34ebeab..735b59d 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -41,11 +41,11 @@ process TAXONOMY { if (kraken2_memory_mapping === true) """ - kraken2 --threads `nproc` --use-names --memory-mapping --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - + kraken2 --threads "`nproc`" --use-names --memory-mapping --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - """ else if (kraken2_memory_mapping === false) """ - kraken2 --threads `nproc` --use-names --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - + kraken2 --threads "`nproc`" --use-names --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - """ else error "The value for --kraken2_memory_mapping is not valid." From 0c70390e8657999edfb717a9a94df8fbfb8806dc Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:02:31 +0000 Subject: [PATCH 37/41] Improve option names consistency --- modules/validate.nf | 6 +++--- nextflow.config | 6 +++--- workflows/info_and_version.nf | 4 ++-- workflows/init.nf | 8 ++++---- workflows/pipeline.nf | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/modules/validate.nf b/modules/validate.nf index 4cf3438..f2bceb0 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -8,8 +8,8 @@ validParams = [ assembler: 'assembler', min_contig_length: 'int', assembly_publish: 'publish_mode', - seroba_remote: 'url_git', - seroba_local: 'path', + seroba_db_remote: 'url_git', + seroba_db_local: 'path', seroba_kmer: 'int', kraken2_db_remote: 'url_targz', kraken2_db_local: 'path', @@ -18,7 +18,7 @@ validParams = [ ref_genome_bwa_db_local: 'path', poppunk_db_remote: 'url_targz', poppunk_ext_remote: 'url_csv', - poppunk_local: 'path', + poppunk_db_local: 'path', spneumo_percentage: 'int_float', ref_coverage: 'int_float', het_snp_site: 'int', diff --git a/nextflow.config b/nextflow.config index 1a322f1..6048efb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -20,8 +20,8 @@ params { assembly_publish = "link" // Default git repository and local directory, and KMC kmer size for SeroBA - seroba_remote = "https://github.com/sanger-pathogens/seroba.git" - seroba_local = "$projectDir/databases/seroba" + seroba_db_remote = "https://github.com/sanger-pathogens/seroba.git" + seroba_db_local = "$projectDir/databases/seroba" seroba_kmer = 71 // Default link and local directory for Kraken2 Database, and usage of memory mapping @@ -36,7 +36,7 @@ params { // Default links for PopPUNK Database and External Clusters, and local directory for both poppunk_db_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz" poppunk_ext_remote = "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv" - poppunk_local = "$projectDir/databases/poppunk" + poppunk_db_local = 
"$projectDir/databases/poppunk" // Default values for QC spneumo_percentage = 60.00 diff --git a/workflows/info_and_version.nf b/workflows/info_and_version.nf index bb5ce37..186dadf 100644 --- a/workflows/info_and_version.nf +++ b/workflows/info_and_version.nf @@ -10,8 +10,8 @@ workflow PRINT_VERSION { params.ref_genome_bwa_db_local, params.ariba_db_local, params.kraken2_db_local, - params.seroba_local, - params.poppunk_local, + params.seroba_db_local, + params.poppunk_db_local, pipeline_version ) \ | PARSE \ diff --git a/workflows/init.nf b/workflows/init.nf index 20eff25..7d1ed77 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -18,12 +18,12 @@ workflow INIT { GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Check SeroBA Databases, clone and rebuild if necessary - CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Check to PopPUNK Database and External Clusters, download if necessary - GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) - GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) + GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_db_local) + GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_db_local) // Pull all Docker images mentioned in nextflow.config if using Docker if (workflow.containerEngine === 'docker') { diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 51a93f9..e39599f 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -20,12 +20,12 @@ workflow PIPELINE { GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Get path to SeroBA Databases, clone and rebuild if necessary - CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Get paths to PopPUNK Database and External Clusters, download if necessary - GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) - GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) + GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_db_local) + GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_db_local) // Get path to ARIBA database, generate from reference sequences and metadata if ncessary GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) From d76351c9528a7ad8e2b0a551e63a0ca361a0b80a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:02:46 +0000 Subject: [PATCH 38/41] Improve help message --- modules/messages.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/messages.nf b/modules/messages.nf index 7a9d189..78e9cd3 100644 --- a/modules/messages.nf +++ b/modules/messages.nf @@ -30,7 +30,7 @@ void helpMessage() { |--reads [PATH] Path to the input directory that contains the reads to be processed |--output [PATH] Path to the output directory that save the results |--init 
Alternative workflow for initialisation - |--version Alternative workflow for getting versions of pipeline, tools and databases + |--version Alternative workflow for getting versions of pipeline, container images, tools and databases | |For all available options, please refer to README.md '''.stripMargin() From 7935001f2b822e7d845d43cdf4ff091b0c46ead0 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:03:39 +0000 Subject: [PATCH 39/41] Improve content and update to reflect changes --- README.md | 127 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 65 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index aa68b3e..cb10fa9 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # GPS Unified Pipeline -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.04.1-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.04.2-23aa62.svg)](https://www.nextflow.io/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/singularity/) The GPS Unified Pipeline is a Nextflow pipeline designed for processing raw reads (FASTQ files) of *Streptococcus pneumoniae* samples. After preprocessing, the pipeline performs initial assessment based on the total bases in reads. Passed samples will be further assess based on assembly, mapping, and taxonomy. If the sample passes all quality controls (QC), the pipeline also provides the sample's serotype, multi-locus sequence typing (MLST), lineage (based on the [Global Pneumococcal Sequence Cluster (GPSC)](https://www.pneumogen.net/gps/GPSC_lineages.html)), and antimicrobial resistance (AMR) against multiple antimicrobials. -The pipeline is designed to be easy to set up and use, and is suitable for use on local machines. It is also offline-capable, making it an ideal option for cases where the FASTQ files being analysed should not leave the local machine. Additionally, the pipeline only downloads essential files to enable the analysis, and no data is uploaded from the local machine. After initialisation or the first successful complete run, the pipeline can be used offline unless you have changed the selection of any database or container image. +The pipeline is designed to be easy to set up and use, and is suitable for use on local machines and high-performance computing (HPC) clusters alike. Additionally, the pipeline only downloads essential files to enable the analysis, and no data is uploaded from the local environment, making it an ideal option for cases where the FASTQ files being analysed is confidential. After initialisation or the first successful complete run, the pipeline can be used offline unless you have changed the selection of any database or container image. The development of this pipeline is part of the GPS Project ([Global Pneumococcal Sequencing Project](https://www.pneumogen.net/gps/)). @@ -57,7 +57,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca > - The pipeline generates ~1.8GB intermediate files for each sample on average
(These files can be removed when the pipeline run is completed, please refer to [Clean Up](#clean-up))
(To further reduce storage requirement by sacrificing the ability to resume the pipeline, please refer to [Experimental](#experimental)) ## Accepted Inputs -- Currently, only Illumina paired-end short reads are supported +- Only Illumina paired-end short reads are supported - Each sample is expected to be a pair of raw reads following this file name pattern: - `*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}` - example 1: `SampleName_R1_001.fastq.gz`, `SampleName_R2_001.fastq.gz` @@ -70,18 +70,18 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ``` or - Download and unzip the [repository](https://github.com/HarryHung/gps-unified-pipeline/archive/refs/heads/master.zip) -2. Go into the local copy of the repository + Download and unzip the [latest release](https://github.com/HarryHung/gps-unified-pipeline/releases) +2. Go into the local copy of the repository and the pipeline is ready to use without installation ``` cd gps-unified-pipeline ``` 3. (Optional) You could perform an initialisation to download all required additional files and container images, so the pipeline can be used at any time with or without the Internet afterwards. > ⚠️ Docker or Singularity must be running, and an Internet connection is required. - - For those using Docker as the container engine + - Using Docker as the container engine ``` ./run_pipeline --init ``` - - For those using Singularity as the container engine + - Using Singularity as the container engine ``` ./run_pipeline --init -profile singularity ``` @@ -91,8 +91,8 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca > ⚠️ If this is the first run and initialisation was not performed, an Internet connection is required. -> ℹ️ By default, Docker is used as the container engine and all the processes are executed by the local machine. See [Profile](#profile) for details on running the pipeline with Singularity or on a server farm. -- You can run the pipeline without options. It will attempt to get the raw reads from the default location (`input` directory inside the `gps-unified-pipeline` local repository) +> ℹ️ By default, Docker is used as the container engine and all the processes are executed by the local machine. See [Profile](#profile) for details on running the pipeline with Singularity or on a HPC cluster. +- You can run the pipeline without options. It will attempt to get the raw reads from the default location (i.e. `input` directory inside the `gps-unified-pipeline` local repository) ``` ./run_pipeline ``` @@ -113,29 +113,30 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ``` ./run_pipeline -profile [profile name] ``` -- Currently, the following profiles are available +- Available profiles: | Profile Name | Details | | --- | --- | | `standard`
(Default) | Docker is used as the container engine.
Processes are executed locally. | | `singularity` | Singularity is used as the container engine.
Processes are executed locally. | - | `lsf` | **The pipeline should be launched from a LSF cluster head node with this profile.**
Singularity is used as the container engine.
Processes are submitted to your LSF cluster via `bsub`.
(Tested on Sanger farm5) | + | `lsf` | **The pipeline should be launched from a LSF cluster head node with this profile.**
Singularity is used as the container engine.
Processes are submitted to your LSF cluster via `bsub` by the pipeline.
(Tested on Sanger farm5 cluster only) | ## Resume - If the pipeline is interrupted mid-run, Nextflow's built-in `-resume` option can be used to resume the pipeline execution instead of starting from scratch again - You should use the same command of the original run, only add `-resume` at the end (i.e. all pipeline options should be identical) > ℹ️ `-resume` is a built-in Nextflow option, it only has one leading `-` - ``` - # original command - ./run_pipeline --reads /path/to/raw-reads-directory - - # command to resume the pipeline execution - ./run_pipeline --reads /path/to/raw-reads-directory -resume - ``` + - If the original command is + ``` + ./run_pipeline --reads /path/to/raw-reads-directory + ``` + - The command to resume the pipeline execution should be + ``` + ./run_pipeline --reads /path/to/raw-reads-directory -resume + ``` ## Clean Up - During the run of the pipeline, Nextflow generates a considerable amount of intermediate files -- If the run has been completed and you do not intend to use the `-resume` option, you can remove the intermediate files using one of the following ways: - - Run `clean_pipeline` script +- If the run has been completed and you do not intend to use the `-resume` option or those intermediate files, you can remove the intermediate files using one of the following ways: + - Run the included `clean_pipeline` script - It runs the commands in manual removal for you - It removes the `work` directory and log files within the `gps-unified-pipeline` local repository ``` @@ -167,13 +168,13 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca > ℹ️ `$projectDir` is a [Nextflow built-in implicit variables](https://www.nextflow.io/docs/latest/script.html?highlight=projectdir#implicit-variables), it is defined as the directory where the `gps-unified-pipeline` local repository is stored. -> ℹ️ They are not built-in Nextflow options, hence lead with `--` instead of `-` +> ℹ️ Pipeline options are not built-in Nextflow options, they are lead with `--` instead of `-` ## Alternative Workflows | Option | Values | Description | | --- | ---| --- | - | `--init` | `true` or `false`
(Default: `false`) | Use alternative workflow for initialisation, which means downloading all required additional files and container images.
Can be enabled by including `--init` without value. | - | `--version` | `true` or `false`
(Default: `false`)| Use alternative workflow for getting versions of pipeline, tools and databases.
Can be enabled by including `--version` without value.
(This workflow pulls the required container images if they are not yet available locally) | + | `--init` | `true` or `false`
(Default: `false`) | Use alternative workflow for initialisation, which means downloading all required additional files and container images, and creating databases.
Can be enabled by including `--init` without value. | + | `--version` | `true` or `false`
(Default: `false`)| Use alternative workflow for showing versions of pipeline, container images, tools and databases.
Can be enabled by including `--version` without value.
(This workflow pulls the required container images if they are not yet available locally) | | `--help` | `true` or `false`
(Default: `false`)| Show help message.
Can be enabled by including `--help` without value. | ## Input and Output @@ -199,7 +200,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ## Assembly | Option | Values | Description | | --- | ---| --- | - | `--assembler` | `"shovill"` or `"unicycler"`
(Default: `"shovill"`)| SPAdes Assembler to assemble the reads. | + | `--assembler` | `"shovill"` or `"unicycler"`
(Default: `"shovill"`)| Using which SPAdes-based assembler to assemble the reads. | | `--min_contig_length` | Any integer value
(Default: `500`) | Minimum legnth of contig to be included in the assembly | ## Mapping @@ -220,22 +221,22 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `--kraken2_memory_mapping` | `true` or `false`
(Default: `true`) | Using the memory mapping option of Kraken2 or not.
`true` means not loading the database into RAM, suitable for memory-limited or fast storage environments. | ## Serotype - > ⚠️ `--seroba_local` does not accept user provided local database, directory content will be overwritten + > ⚠️ `--seroba_db_local` does not accept user provided local database, directory content will be overwritten | Option | Values | Description | | --- | ---| --- | - | `--seroba_remote` | Any valid URL to a Git remote repository
(Default: [SeroBA GitHub Repo](https://github.com/sanger-pathogens/seroba.git))| URL to a SeroBA Git remote repository. | - | `--seroba_local` | Any valid path
(Default: `"$projectDir/databases/seroba"`) | Path to the directory where SeroBA local repository should be saved to. | + | `--seroba_db_remote` | Any valid URL to a Git remote repository
(Default: [SeroBA GitHub Repo](https://github.com/sanger-pathogens/seroba.git))| URL to a SeroBA Git remote repository. | + | `--seroba_db_local` | Any valid path
(Default: `"$projectDir/databases/seroba"`) | Path to the directory where the SeroBA local repository should be saved to. | | `--seroba_kmer` | Any integer value<br>
(Default: `71`) | Kmer size for creating the KMC database of SeroBA. | ## Lineage - > ⚠️ `--poppunk_local` does not accept user provided local database, directory content will be overwritten + > ⚠️ `--poppunk_db_local` does not accept user provided local database, directory content will be overwritten | Option | Values | Description | | --- | ---| --- | | `--poppunk_db_remote` | Any valid URL to a PopPUNK database in `.tar.gz` or `.tgz` format
(Default: [GPS v6](https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz)) | URL to a PopPUNK database. | | `--poppunk_ext_remote` | Any valid URL to a PopPUNK external clusters file in `.csv` format
(Default: [GPS v6 GPSC Designation](https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv)) | URL to a PopPUNK external clusters file. | - | `--poppunk_local` | Any valid path
(Default: `"$projectDir/databases/poppunk"`) | Path to the directory where the remote PopPUNK database and external clusters file should be saved to. | + | `--poppunk_db_local` | Any valid path
(Default: `"$projectDir/databases/poppunk"`) | Path to the directory where the remote PopPUNK database and external clusters file should be saved to. | ## Other AMR > ⚠️ `--ariba_db_local` does not accept user provided local database, directory content will be overwritten @@ -276,9 +277,11 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca - The following fields can be found in the output `results.csv` > ℹ️ For resistance phenotypes: S = Sensitive/Susceptible; I = Intermediate; R = Resistant + > ℹ️ * The exact output fields of Other AMR depends on the provided ARIBA database, the below table is based on the default ARIBA database + > ⚠️ If the result of `Overall_QC` of a sample is `ASSEMBLER FAILURE`, the assembler has crashed when trying to assembly the reads. You might want to re-run the sample with [another assembler](#assembly), or discard the sample if it is a low quality one. - > ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample. Please report the issue. + > ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample. | Field | Type | Description | | --- | --- | --- | @@ -323,38 +326,38 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `PEN_MIC` | PBP AMR | Estimated MIC of penicillin (PEN) | | `PEN_Res(Meningital)` | PBP AMR | Resistance phenotype against PEN in meningital form | | `PEN_Res(Non-meningital)` | PBP AMR | Resistance phenotype against PEN in non-meningital form | - | `CHL_Res` | Other AMR | Resistance phenotype against Chloramphenicol (CHL) | - | `CHL_Determinant` | Other AMR | Known determinants that inferred the CHL resistance | - | `ERY_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) | - | `ERY_Determinant` | Other AMR | Known determinants that inferred the ERY resistance | - | `CLI_Res` | Other AMR | Resistance phenotype against Clindamycin (CLI) | - | `CLI_Determinant` | Other AMR | Known determinants that inferred the CLI resistance | - | `ERY_CLI_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | - | `ERY_CLI_Determinant` | Other AMR | Known determinants that inferred the ERY and CLI resistance | - | `FQ_Res` | Other AMR | Resistance phenotype against Fluoroquinolones (FQ) | - | `FQ_Determinant` | Other AMR | Known determinants that inferred the FQ resistance | - | `LFX_Res` | Other AMR | Resistance phenotype against Levofloxacin (LFX) | - | `LFX_Determinant` | Other AMR | Known determinants that inferred the LFX resistance | - | `KAN_Res` | Other AMR | Resistance phenotype against Kanamycin (KAN) | - | `KAN_Determinant` | Other AMR | Known determinants that inferred the KAN resistance | - | `TET_Res` | Other AMR | Resistance phenotype against Tetracycline (TET) | - | `TET_Determinant` | Other AMR | Known determinants that inferred the TET resistance | - | `DOX_Res` | Other AMR | Resistance phenotype against Doxycycline (DOX) | - | `DOX_Determinant` | Other AMR | Known determinants that inferred the DOX resistance | - | `TMP_Res` | Other AMR | Resistance phenotype against Trimethoprim (TMP) | - | `TMP_Determinant` | Other AMR | Known determinants that inferred the TMP resistance | - | `SMX_Res` | Other AMR | Resistance phenotype against Sulfamethoxazole (SMX) | - | `SMX_Determinant` | Other AMR | Known determinants that inferred the SMX resistance | - | `COT_Res` | Other AMR | Resistance phenotype against 
Co-Trimoxazole (COT) | - | `COT_Determinant` | Other AMR | Known determinants that inferred the COT resistance | - | `RIF_Res` | Other AMR | Resistance phenotype against Rifampin (RIF) | - | `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance | - | `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) | - | `VAN_Determinant` | Other AMR | Known determinants that inferred the VAN resistance | - | `PILI1` | Other AMR | Expression of PILI-1 | - | `PILI1_Determinant` | Other AMR | Known determinants that inferred the PILI-1 expression | - | `PILI2` | Other AMR | Expression of PILI-2 | - | `PILI2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression | + | `CHL_Res` | Other AMR* | Resistance phenotype against Chloramphenicol (CHL) | + | `CHL_Determinant` | Other AMR* | Known determinants that inferred the CHL resistance | + | `CLI_Res` | Other AMR* | Resistance phenotype against Clindamycin (CLI) | + | `CLI_Determinant` | Other AMR* | Known determinants that inferred the CLI resistance | + | `COT_Res` | Other AMR* | Resistance phenotype against Co-Trimoxazole (COT) | + | `COT_Determinant` | Other AMR* | Known determinants that inferred the COT resistance | + | `DOX_Res` | Other AMR* | Resistance phenotype against Doxycycline (DOX) | + | `DOX_Determinant` | Other AMR* | Known determinants that inferred the DOX resistance | + | `ERY_Res` | Other AMR* | Resistance phenotype against Erythromycin (ERY) | + | `ERY_Determinant` | Other AMR* | Known determinants that inferred the ERY resistance | + | `ERY_CLI_Res` | Other AMR* | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | + | `ERY_CLI_Determinant` | Other AMR* | Known determinants that inferred the ERY and CLI resistance | + | `FQ_Res` | Other AMR* | Resistance phenotype against Fluoroquinolones (FQ) | + | `FQ_Determinant` | Other AMR* | Known determinants that inferred the FQ resistance | + | `KAN_Res` | Other AMR* | Resistance phenotype against Kanamycin (KAN) | + | `KAN_Determinant` | Other AMR* | Known determinants that inferred the KAN resistance | + | `LFX_Res` | Other AMR* | Resistance phenotype against Levofloxacin (LFX) | + | `LFX_Determinant` | Other AMR* | Known determinants that inferred the LFX resistance | + | `RIF_Res` | Other AMR* | Resistance phenotype against Rifampin (RIF) | + | `RIF_Determinant` | Other AMR* | Known determinants that inferred the RIF resistance | + | `SMX_Res` | Other AMR* | Resistance phenotype against Sulfamethoxazole (SMX) | + | `SMX_Determinant` | Other AMR* | Known determinants that inferred the SMX resistance | + | `TET_Res` | Other AMR* | Resistance phenotype against Tetracycline (TET) | + | `TET_Determinant` | Other AMR* | Known determinants that inferred the TET resistance | + | `TMP_Res` | Other AMR* | Resistance phenotype against Trimethoprim (TMP) | + | `TMP_Determinant` | Other AMR* | Known determinants that inferred the TMP resistance | + | `VAN_Res` | Other AMR* | Resistance phenotype against Vancomycin (VAN) | + | `VAN_Determinant` | Other AMR* | Known determinants that inferred the VAN resistance | + | `PILI1` | Other AMR* | Expression of PILI-1 | + | `PILI1_Determinant` | Other AMR* | Known determinants that inferred the PILI-1 expression | + | `PILI2` | Other AMR* | Expression of PILI-2 | + | `PILI2_Determinant` | Other AMR* | Known determinants that inferred the PILI-2 expression |   # Credits From 457b99629e7fc812230497d9f04cbe93b3cbea12 Mon Sep 17 00:00:00 2001 From: Harry Hung 
<4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 09:54:56 +0000 Subject: [PATCH 40/41] Update version of ARIBA container --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 6048efb..268f05a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -96,7 +96,7 @@ process { container = 'harryhungch/spn-pbp-amr:23.01.16' } withLabel: ariba_container { - container = 'staphb/ariba:2.14.4' + container = 'staphb/ariba:2.14.6' } withLabel: mlst_container { container = 'staphb/mlst:2.23.0' From ebc7ce5dd388af8c8a3f821d5f02842d331d799a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 09:55:21 +0000 Subject: [PATCH 41/41] Update credits section --- README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index cb10fa9..bda84c2 100644 --- a/README.md +++ b/README.md @@ -366,39 +366,39 @@ This project uses open-source components. You can find the homepage or source co [ARIBA](https://sanger-pathogens.github.io/ariba/) - ARIBA: rapid antimicrobial resistance genotyping directly from sequencing reads Hunt M, Mather AE, Sánchez-Busó L, Page AJ, Parkhill J, Keane JA, Harris SR. Microbial Genomics 2017. doi: [10.1099/mgen.0.000131](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000131) - License (GPL-3.0): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE -- This tool is used in `CREATE_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module +- This tool is used in `GET_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module [BCFtools](https://samtools.github.io/bcftools/) and [SAMtools](https://www.htslib.org/) - Twelve years of SAMtools and BCFtools. Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. **GigaScience**, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008 - Licenses - BCFtools (MIT/Expat or GPL-3.0): https://github.com/samtools/bcftools/blob/develop/LICENSE - SAMtools (MIT/Expat): https://github.com/samtools/samtools/blob/develop/LICENSE -- These tools are used in `SAM_TO_SORTED_BAM`, `REF_COVERAGE` and `SNP_CALL` processes of the `mapping.nf` module +- These tools are used in `SAM_TO_SORTED_BAM` and `SNP_CALL` processes of the `mapping.nf` module [BWA](https://github.com/lh3/bwa) - Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 
[arXiv:1303.3997v2](http://arxiv.org/abs/1303.3997) [q-bio.GN] - License (GPL-3.0): https://github.com/lh3/bwa/blob/master/COPYING -- This tool is used in `GET_REF_GENOME_BWA_DB_PREFIX` and `MAPPING` processes of the `mapping.nf` module +- This tool is used in `GET_REF_GENOME_BWA_DB` and `MAPPING` processes of the `mapping.nf` module -[Docker Images](https://hub.docker.com/u/staphb) of [BCFtools](https://hub.docker.com/r/staphb/bcftools), [BWA](https://hub.docker.com/r/staphb/bwa), [fastp](https://hub.docker.com/r/staphb/fastp), [Kraken 2](https://hub.docker.com/r/staphb/kraken2), [mlst](https://hub.docker.com/r/staphb/mlst), [PopPUNK](https://hub.docker.com/r/staphb/poppunk), [QUAST](https://hub.docker.com/r/staphb/quast), [SAMtools](https://hub.docker.com/r/staphb/samtools), [Shovill](https://hub.docker.com/r/staphb/shovill), [Unicycler](https://hub.docker.com/r/staphb/unicycler) +[Docker Images](https://hub.docker.com/u/staphb) of [ARIBA](https://hub.docker.com/r/staphb/ariba), [BCFtools](https://hub.docker.com/r/staphb/bcftools), [BWA](https://hub.docker.com/r/staphb/bwa), [fastp](https://hub.docker.com/r/staphb/fastp), [Kraken 2](https://hub.docker.com/r/staphb/kraken2), [mlst](https://hub.docker.com/r/staphb/mlst), [PopPUNK](https://hub.docker.com/r/staphb/poppunk), [QUAST](https://hub.docker.com/r/staphb/quast), [SAMtools](https://hub.docker.com/r/staphb/samtools), [Shovill](https://hub.docker.com/r/staphb/shovill), [Unicycler](https://hub.docker.com/r/staphb/unicycler) - [State Public Health Bioinformatics Workgroup](https://staphb.org/) ([@StaPH-B](https://github.com/StaPH-B)) - License (GPL-3.0): https://github.com/StaPH-B/docker-builds/blob/master/LICENSE -- These Docker images provide containerised environments for processes of multiple modules +- These Docker images provide containerised environments with different bioinformatics tools for processes of multiple modules [Docker Image of Git](https://hub.docker.com/r/bitnami/git) - [Bitnami](https://bitnami.com/) ([@Bitnami](https://github.com/bitnami)) - License (Apache 2.0): https://github.com/bitnami/containers/blob/main/LICENSE.md -- This Docker image provides the containerised environment for `GET_SEROBA_DB` process of the `serotype.nf` module +- This Docker image provides the containerised environment with Git for `CHECK_SEROBA_DB` process of the `serotype.nf` module [Docker Image of network-multitool](https://hub.docker.com/r/wbitt/network-multitool) - [Wbitt - We Bring In Tomorrow's Technolgies](https://wbitt.com/) ([@WBITT](https://github.com/wbitt)) - License (MIT): https://github.com/wbitt/Network-MultiTool/blob/master/LICENSE -- This Docker image provides the containerised environment for processes of multiple modules +- This Docker image provides the containerised environment with Bash tools for processes of multiple modules -[Docker Image of Python](https://hub.docker.com/_/python) -- The Docker Community ([@docker-library](https://github.com/docker-library)) -- License (MIT): https://github.com/docker-library/python/blob/master/LICENSE -- This Docker image provides the containerised environment for `HET_SNP_COUNT` process of the `mapping.nf` module and `GET_OTHER_RESISTANCE` process of the `amr.nf` module +[Docker Image of Pandas](https://hub.docker.com/r/amancevice/pandas) +- Alexander Mancevice ([@amancevice](https://github.com/amancevice)) +- License (MIT): https://github.com/amancevice/docker-pandas/blob/main/LICENSE +- This Docker image provides the containerised environment with Python and Pandas for 
`GENERATE_OVERALL_REPORT` process of the `output.nf` module, `HET_SNP_COUNT` process of the `mapping.nf` module and `PARSE_OTHER_RESISTANCE` process of the `amr.nf` module [fastp](https://github.com/OpenGene/fastp) - Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, https://doi.org/10.1093/bioinformatics/bty560 @@ -406,9 +406,9 @@ This project uses open-source components. You can find the homepage or source co - This tool is used in `PREPROCESS` process of the `preprocess.nf` module [GPSC_pipeline_nf](https://github.com/sanger-bentley-group/GPSC_pipeline_nf) -- Victoria Carr ([@blue-moon22](https://github.com/blue-moon22)) +- Victoria Dyster ([@blue-moon22](https://github.com/blue-moon22)) - License (GPL-3.0): https://github.com/sanger-bentley-group/GPSC_pipeline_nf/blob/master/LICENSE -- Code adapted into `LINEAGE` process of the `lineage.nf` module +- Code adapted into the `get_lineage.sh` script [Kraken 2](https://ccb.jhu.edu/software/kraken2/) - Wood, D.E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). https://doi.org/10.1186/s13059-019-1891-0 @@ -418,7 +418,7 @@ This project uses open-source components. You can find the homepage or source co [mecA-HetSites-calculator](https://github.com/kumarnaren/mecA-HetSites-calculator) - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) - License (GPL-3.0): https://github.com/kumarnaren/mecA-HetSites-calculator/blob/master/LICENSE -- Code was rewritten into the `het_snp_count.py` script used by `HET_SNP_COUNT` process of the `mapping.nf` module +- Code was rewritten into the `het_snp_count.py` script [mlst](https://github.com/tseemann/mlst) - Torsten Seemann ([@tseemann](https://github.com/tseemann)) @@ -446,14 +446,14 @@ This project uses open-source components. You can find the homepage or source co - License (GPL-3.0): https://github.com/sanger-pathogens/seroba/blob/master/LICENSE - This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/seroba) - The fork includes critical bug fixes for SeroBA as the original repository is no longer maintained - - The Docker image provides the containerised environment for `CREATE_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module + - The Docker image provides the containerised environment with SeroBA for `GET_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module [resistanceDatabase](https://github.com/kumarnaren/resistanceDatabase) - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) - License (GPL-3.0): https://github.com/kumarnaren/resistanceDatabase/blob/main/LICENSE - `sequences.fasta` is renamed to `ariba_ref_sequences-*.fasta` and used as-is - `metadata.tsv` is renamed to `ariba_metadata-*.tsv` and modified -- The files are used as the default inputs of `CREATE_ARIBA_DB` process of the `amr.nf` module +- The files are used as the default inputs of `GET_ARIBA_DB` process of the `amr.nf` module [Shovill](https://github.com/tseemann/shovill) - Torsten Seemann ([@tseemann](https://github.com/tseemann)) @@ -466,7 +466,7 @@ This project uses open-source components. 
You can find the homepage or source co - This is a modified version of [AMR predictor](https://github.com/BenJamesMetcalf/Spn_Scripts_Reference) by Ben Metcalf ([@BenJamesMetcalf](https://github.com/BenJamesMetcalf)) at the Centers for Disease Control and Prevention (CDC) - This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/spn-resistance-pbp) - The fork changes the Docker image from a Docker executable image to a Docker environment for Nextflow integration - - The Docker image provides the containerised environment for `PBP_RESISTANCE` process of the `amr.nf` module + - The Docker image provides the containerised environment with SPN-PBP-AMR for `PBP_RESISTANCE` process of the `amr.nf` module [Unicycler](https://github.com/rrwick/Unicycler) - **Wick RR, Judd LM, Gorrie CL, Holt KE**. Unicycler: resolving bacterial genome assemblies from short and long sequencing reads. *PLoS Comput Biol* 2017.
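
For reference, a minimal sketch of how the pipeline options documented in the patched README above might be combined in practice. This is an illustrative example rather than a command taken from any patch: the reads path is a placeholder, and only option names and values already listed in the option tables (`--init`, `--reads`, `--assembler`, `--min_contig_length`, `--kraken2_memory_mapping`, `-resume`) are used.

```
# One-off initialisation: download the required additional files, container images and databases
./run_pipeline --init

# A typical run that overrides some documented defaults:
# use the Unicycler assembler and disable Kraken2 memory mapping
./run_pipeline \
    --reads /path/to/raw-reads-directory \
    --assembler "unicycler" \
    --min_contig_length 500 \
    --kraken2_memory_mapping false

# If the run is interrupted, repeat the identical command and append the
# built-in Nextflow option -resume (note the single leading dash)
```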