From 33bdad08989c27f6a77ecd59a34125758a7f877c Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Wed, 21 Jun 2023 13:32:29 +0000
Subject: [PATCH 001/157] Initial implementation of ARIBA

Former-commit-id: 98d9289b72bcf6f111b9f56de90d71c3df70c8fc
---
 data/ariba_metadata.tsv    |  73 ++++
 data/ariba_sequences.fasta | 715 +++++++++++++++++++++++++++++++++++++
 modules/amr.nf             |  43 ++-
 nextflow.config            |   4 +-
 workflows/pipeline.nf      |  15 +-
 5 files changed, 828 insertions(+), 22 deletions(-)
 create mode 100644 data/ariba_metadata.tsv
 create mode 100644 data/ariba_sequences.fasta

diff --git a/data/ariba_metadata.tsv b/data/ariba_metadata.tsv
new file mode 100644
index 0000000..2a73517
--- /dev/null
+++ b/data/ariba_metadata.tsv
@@ -0,0 +1,73 @@
+reference coding_yes(1)_no(0) pr/ab(0)_var(1) description of the variant Group FreeText_Drug
+aph_3prime_III_1_M26832 1 0 . . Kanamycin resistance
+ermB_1_JN899585 1 0 . . Erythromycin and Clindamycin resistance
+ermB_10_U86375 1 0 . . Erythromycin and Clindamycin resistance
+ermB_16_X82819 1 0 . . Erythromycin and Clindamycin resistance
+ermB_20_AF109075 1 0 . . Erythromycin and Clindamycin resistance
+ermC_13_M13761 1 0 . . Erythromycin and Clindamycin resistance
+cat_5_U35036 1 0 . . Chloramphenicol resistance
+catpC194_1_NC_002013 1 0 . . Chloramphenicol resistance
+catpC233_1_AY355285 1 0 . . Chloramphenicol resistance
+catQ_1_M55620 1 0 . . Chloramphenicol resistance
+msrD_2_AF274302 1 0 . . Erythromycin resistance
+msrD_3_AF227520 1 0 . . Erythromycin resistance
+mefA_10_AF376746 1 0 . . Erythromycin resistance
+mefE_AE007317 1 0 . . Erythromycin resistance
+tetM_1_X92947 1 0 . . Tetracycline resistance
+tetM_12_FR671418 1 0 . . Tetracycline resistance
+tetK_4_U38428 1 0 . . Tetracycline resistance
+tetM_13_AM990992 1 0 . . Tetracycline resistance
+tetM_2_X90939 1 0 . . Tetracycline resistance
+tetM_4_X75073 1 0 . . Tetracycline resistance
+tetM_5_U58985 1 0 . . Tetracycline resistance
+tetM_8_X04388 1 0 . . Tetracycline resistance
+tetS_M 1 0 . . Tetracycline resistance
+tetS_M_MH283012 1 0 . . Tetracycline resistance
+tetAp_L20800 1 0 . . Tetracycline resistance
+tetBp_L20800 1 0 . . Tetracycline resistance
+tetAQ2_Z21523 1 0 . . Tetracycline resistance
+tetS_FN555436 1 0 . . Tetracycline resistance
+tetT_L42544 1 0 . . Tetracycline resistance
+tetW_AJ222769 1 0 . . Tetracycline resistance
+tet32_AJ295238 1 0 . . Tetracycline resistance
+tet36_AJ514254 1 0 . . Tetracycline resistance
+tet44_FN594949 1 0 . . Tetracycline resistance
+tet58_KY887560 1 0 . . Tetracycline resistance
+tet_M74049 1 0 . . Tetracycline resistance
+tetS_M_HM367711 1 0 . . Tetracycline resistance
+tetS_M_AY534326 1 0 . . Tetracycline resistance
+tetM_M85225 1 0 . . Tetracycline resistance
+tetS_FN555436 1 0 . . Tetracycline resistance
+tetM_MH283017 1 0 . . Tetracycline resistance
+folA_AE007317 1 1 I100L . Trimethoprim
+folP_AE007317 1 1 . . Sulfamethoxazole resistance only if insertions in amino acids 56-67
+gyrA_AE007317 1 1 S81F . Fluoroquinolone
+gyrA_AE007317 1 1 S81Y . Fluoroquinolone
+gyrA_AE007317 1 1 S81C . Fluoroquinolone
+gyrA_AE007317 1 1 S81I . Fluoroquinolone
+gyrA_AE007317 1 1 E85K . Fluoroquinolone
+gyrA_AE007317 1 1 Q118A . Fluoroquinolone
+gyrB_AE007317 1 1 E474K . Fluoroquinolone
+parC_AE007317 1 1 A63T . Fluoroquinolone
+parC_AE007317 1 1 S79F . Fluoroquinolone
+parC_AE007317 1 1 S79Y . Fluoroquinolone
+parC_AE007317 1 1 S79L . Fluoroquinolone
+parC_AE007317 1 1 S79F . Fluoroquinolone
+parC_AE007317 1 1 D83G . Fluoroquinolone
+parC_AE007317 1 1 D83N . Fluoroquinolone
+parE_AE007317 1 1 E474K . Fluoroquinolone
+parE_AE007317 1 1 D435N . Fluoroquinolone
+parE_AE007317 1 1 D435H . Fluoroquinolone
+parE_AE007317 1 1 P454S . Fluoroquinolone
+tetO_Y07780 1 0 . . Tetracycline resistance
+ermBups_HG799494 0 0 . . Erythromycin and Clindamycin resistance
+ermbTr_CP002121 0 0 . . Erythromycin and Clindamycin resistance
+rplD_AE007317 1 1 . . Linezolid resistance (deletion within the L4 region of the gene PMID:24492357)
+rpoB_AE007317 1 1 D489E . Rifampicin resistance PMID:10508007-D415E
+rpoB_AE007317 1 1 H499N . Rifampicin resistance PMID:10508007-H425N
+rpoB_AE007317 1 1 D489N . Rifampicin resistance PMID:10508007-D415N
+vanB_KC489787 1 0 . . Vancomycin resistance
+vanD_EU999036 1 0 . . Vancomycin resistance
+vanE_FJ872411 1 0 . . Vancomycin resistance
+vanG_KF704242 1 0 . . Vancomycin resistance
+otrA_X53401 1 0 . . Tetracycline resistance
\ No newline at end of file
diff --git a/data/ariba_sequences.fasta b/data/ariba_sequences.fasta
new file mode 100644
index 0000000..8da1617
--- /dev/null
+++ b/data/ariba_sequences.fasta
@@ -0,0 +1,715 @@
+>aph_3prime_III_1_M26832
+ATGGCTAAAATGAGAATATCACCGGAATTGAAAAAACTGATCGAAAAATACCGCTGCGTAAAAGATACGGAAGGAATGTCTCCTGCTAAGGTATATAAGCTGGTGGGAGAAAATGAAAACCTATATTTAAAAATGACGGACAGCCGGTATAAAGGGACCACCTATGATGTGGAACGGGAAAAGGACATGATGCTATGGCTGGAAGGAAAGCTGCCTGTTCCAAAGGTCCTGCACTTTGAACGGCATGATGGCTGGAGCAATCTGCTCATGAGTGAGGCCGATGGCGTCCTTTGCTCGGAAGAGTATGAAGATGAACAAAGCCCTGAAAAGATTATCGAGCTGTATGCGGAGTGCATCAGGCTCTTTCACTCCATCGACATATCGGATTGTCCCTATACGAATAGCTTAGACAGCCGCTTAGCCGAATTGGATTACTTACTGAATAACGATCTGGCCGATGTGGATTGCGAAAACTGGGAAGAAGACACTCCATTTAAAGATCCGCGCGAGCTGTATGATTTTTTAAAGACGGAAAAGCCCGAAGAGGAACTTGTCTTTTCCCACGGCGACCTGGGAGACAGCAACATCTTTGTGAAAGATGGCAAAGTAAGTGGCTTTATTGATCTTGGGAGAAGCGGCAGGGCGGACAAGTGGTATGACATTGCCTTCTGCGTCCGGTCGATCAGGGAGGATATCGGGGAAGAACAGTATGTCGAGCTATTTTTTGACTTACTGGGGATCAAGCCTGATTGGGAGAAAATAAAATATTATATTTTACTGGATGAATTGTTTTAG
+>ermB_1_JN899585
+ATGAACAAAAATATAAAATATTCTCAAAACTTTTTAACGAGTGAAAAAGTACTCAACCAAATAATAAAACAATTGAATTTAAAAGAAACCGATACCGTTTACGAAATTGGAACAGGTAAAGGGCATTTAACGACGAAACTGGCTAAAATAAGTAAACAGGTAACGTCTATTGAATTAGACAGTCATCTATTCAACTTATCGTCAGAAAAATTAAAACTGAATACTCGTGTCACTTTAATTCACCAAGATATTCTACAGTTTCAATTCCCTAACAAACAGAGGTATAAAATTGTTGGGAATATTCCTTACCATTTAAGCACACAAATTATTAAAAAAGTGGTTTTTGAAAGCCATGCGTCTGACATCTATCTGATTGTTGAAGAAGGATTCTACAAGCGTACCTTGGATATTCACCGAACACTAGGGTTGCTCTTGCACACTCAAGTCTCGATTCAGCAATTGCTTAAGCTGCCAGCGGAATGCTTTCATCCTAAACCAAAAGTAAACAGTGTCTTAATAAAACTTACCCGCCATACCACAGATGTTCCAGATAAATATTGGAAGCTATATACGTACTTTGTTTCAAAATGGGTCAATCGAGAATATCGTCAACTGTTTACTAAAAATCAGTTTCATCAAGCAATGAAACACGCCAAAGTAAACAATTTAAGTACCGTTACTTATGAGCAAGTATTGTCTATTTTTAATAGTTATCTATTATTTAACGGGAGGAAATAA
+>ermB_10_U86375
+ATGAACAAAAATATAAAATATTCTCAAAACTTTTTAACGAGTGAAAAAGTACTCAACCAAATAATAAAACAATTGAATTTAAAAGAAACCGATACCGTTTACGAAATTGGAACAGGTAAAGGGCATTTAACGACGAAACTGGCTAAAATAAGTAAACAGGTAACGTCTATTGAATTAGACAGTCATCTATTCAACTTATCGTCAGAAAAATTAAAACTGAACATTCGTGTCACTTTAATTCACCAAGATATTCTACAGTTTCAATTCCCTAACAAACAGAGGTATAAAATTGTTGGGAATATTCCTTACCATTTAAGCACACAAATTATTAAAAAAGTGGTTTTTGAAAGCCATGCGTCTGACATCTATCTGATTGTTGAAGAAGGATTCTACAAGCGTACCTTGGATATTCACCGAACACTAGGGTTGCTCTTGCACACTCAAGTCTCGATTCAGCAATTGCTTAAGCTGCCAGCGGAATGCTTTCATCCTAAACCAAAAGTAAACAGTGTCTTAATAAAACTTACCCGCCATACCACAGATGTTCCAGATAAATATTGGAAGCTATATACGTACTTTGTTTCAAAATGGGTCAATCGAGAATATCGTCAACTGTTTACTAAAAATCAGTTTCATCAAGCAATGAAACACGCCAAAGTAAACAATTTAAGTACCGTTACTTATGAGCAAGTATTGTCTATTTTTAATAGTTATCTATTATTTAACGGGAGGAAATAA
+>ermB_16_X82819
+ATGAACAAAAATATAAAATATTCTCAAAACTTTTTAACGAGTGAAAAAGTACTCAACCAAATAATAAAACAATTGAATTTAAAAGAAACCGATACCGTTTACGAAATTGGAACAGGTAAAGGGCATTTAACGACGAAACTGGCTAAAATAAGTAAACAGGTAACGTCTATTGAATTAGACAGTCATCTATTCAACTTATCGTCAGAAAAATTAAAACTGAATACTCGTGTCACTTTAATTCACCAAGATATTCTACAGTTTCAATTCCCTAACAAACAGAGGTATAAAATTGTTGGGAATATTCCTTACCATTTAAGCACACAAATTATTAAAAAAGTGGTTTTTGAAAGCCGTGCGTCTGACATCTATCTGATTGTTGAAGAAGGATTCTACAAGCGTACCTTGGATATTCACCGAACACTAGGGTTGCTCTTGCACACTCAAGTCTCGATTCAGCAATTGCTTAAGCTGCCAGCGGAATGCTTTCATCCTAAACCAAAAGTAAACAGTGTCTTAATAAAACTTACCCGCCATACCACAGATGTTCCAGATAAATATTGGAAGCTATATACGTACTTTGTTTCAAAATGGGTCAATCGAGAATATCGTCAACTGTTTACTAAAAATCAGTTTCATCAAGCAATGAAACACGCCAAAGTAAACAATTTAAGTACCATTACTTATGAGCAAGTATTGTCTATTTTTAATAGTTATCTATTATTTAACGGGAGGAAATAA +>ermB_20_AF109075 +ATGAACAAAAATATAAAATATTCTCAAAACTTTTTAACGAGTGAAAAAGTACTCAACCAAATAATAAAACAATTGAATTTAAAAGAAACCGATACCGTTTACGAAATTGGAACAGGTAAAGGGCATTTAACGACGAAACTGGCTAAAATAAGTAAACAGGTAACGTCTATTGAATTAGACAGTCATCTATTCAACTTATCGTCAGAAAAATTAAAACTGAATACTCGTGTCACTTTAATTCACCAAGATATTCTACAGTTTCAATTCCCTAACAAACAGAGGTATAAAATTGTTGGGAGTATTCCTTACCATTTAAGCACACAAATTATTAAAAAAGTGGTTTTTGAAAGCCGTGCGTCTGACATCTATCTGATTGTTGAAGAAGGATTCTACAAGCGTACCTTGGATATTCACCGAACACTAGGGTTGCTCTTGCACACTCAAGTCTCGATTAAGCAATTGCTTAAGCTGCCAGCTGAATGCTTTCATCCTAAACCAAAAGTAAACAGTGCCTTAATAAAACTTACCCGCCATACCACAGATGTTCCAGATAAATATTGGAAACTATATACGTACTTTGTTTCAAAATGGGTCAATCGAGAATATCGTCAACTGTTTACTAAAAATCAGTTTCATCAAGCAATGAAATACGCCAAAGTAAACGATTTAAGTACCGTTACTTATGAGCAAGTATTGTCTATTTTTAATAGTTATCTATTATTTAACGGGAGGAAATAA +>ermC_13_M13761 +ATGAACGAGAAAAATATAAAACACAGTCAAAACTTTATTACTTCAAAACATAATATAGATAAAATAATGACAAATATAAGATTAAATGAACATGATAATATCTTTGAAATCGGCTCAGGAAAAGGGCATTTTACCCTTGAATTAGTACAGAGGTGTAATTTCGTAACTGCCATTGAAATAGACCATAAATTATGCAAAACTACAGAAAATAAACTTGTTGATCACGATAATTTCCAAGTTTTAAACAAGGATATATTGCAGTTTAAATTTCCTAAAAACCAATCCTATAAAATATTTGGTAATATACCTTATAACATAAGTACGGATATAATACGCAAAATTGTTTTTGATAGTATAGCTGATGAGATTTATTTAATCGTGGAATACGGGTTTGCTAAAAGATTATTAAATACAAAACGCTCATTGGCATTATTTTTAATGGCAGAAGTTGATATTTCTATATTAAGTATGGTTCCAAGAGAATATTTTCATCCTAAACCTAAAGTGAATAGCTCACTTATCAGATTAAATAGAAAAAAATCAAGAATATCACACAAAGATAAACAGAAGTATAATTATTTCGTTATGAAATGGGTTAACAAAGAATACAAGAAAATATTTACAAAAAATCAATTTAACAATTCCTTAAAACATGCAGGAATTGACGATTTAAACAATATTAGCTTTGAACAATTCTTATCTCTTTTCAATAGCTATAAATTATTTAATAAGTAA +>cat_5_U35036 +ATGACTTTTAATATTATTAATTTGGAAACTTGGGATAGAAAAGAATATTTTAATCATTATTTCAATCAACAAACAACTTACAGTGTTACTAAAGAATTTGATATCACTTTACTTAAAAGTATGATAAAAAATAAAGGATATGAACTGTATCCTGCTTTGATTTATACAATTGTAAATATTATAAATCAAAATAAAGTATTTAGAACAGGAATTAATAGTGAGGGAAATTTGGGTTATTGGGATAAATTAAACCCTTTATATACAGTCTTTAATAAAGAAACTGAAAAATTTTCTAACATTTGGACAGAATCAAATGTTAGTTTTAATTCTTTTTATAATAGTTATAAGAGTGACTTACTTGAATATAAAGATAAAAATGAAATGTTTCCTAAAAAACCAATACCTGAAAACACAGTTCCTATTTCGATGATTCCTTGGATTGATTTTAGTTCATTTAATTTAAATATTGGTAATAATAGTAGATTCCTATTGCCAATTATTACAATAGGTAAATTTTATAGTAAGAATAATAAGATCTATTTACCAGTCTCATTGCAAGTTCATCATGCGGTATGTGATGGTTACCATGTTTCATTATTTATGAGTGAATTTCAAAATATAGTTGATAGTGTAAATGAATGGATTTAA +>catpC194_1_NC_002013 
+ATGAACTTTAATAAAATTGATTTAGACAATTGGAAGAGAAAAGAGATATTTAATCATTATTTGAACCAACAAACGACTTTTAGTATAACCACAGAAATTGATATTAGTGTTTTATACCGAAACATAAAACAAGAAGGATATAAATTTTACCCTGCATTTATTTTCTTAGTGACAAGGGTGATAAACTCAAATACAGCTTTTAGAACTGGTTACAATAGCGACGGAGAGTTAGGTTATTGGGATAAGTTAGAGCCACTTTATACAATTTTTGATGGTGTATCTAAAACATTCTCTGGTATTTGGACTCCTGTAAAGAATGACTTCAAAGAGTTTTATGATTTATACCTTTCTGATGTAGAGAAATATAATGGTTCGGGGAAATTGTTTCCCAAAACACCTATACCTGAAAATGCTTTTTCTCTTTCTATTATTCCATGGACTTCATTTACTGGGTTTAACTTAAATATCAATAATAATAGTAATTACCTTCTACCCATTATTACAGCAGGAAAATTCATTAATAAAGGTAATTCAATATATTTACCGCTATCTTTACAGGTACATCATTCTGTTTGTGATGGTTATCATGCAGGATTGTTTATGAACTCTATTCAGGAATTGTCAGATAGGCCTAATGACTGGCTTTTATAA +>catpC233_1_AY355285 +ATGACTTTTAATATTATTAATTTAGAAACTTGGGATAGAAAAGAATATTTCAATCATTATTTTAATCAACAAACAACTTATAGTGTTACTAAAGAATTAGATATTACCTTGTTAAAAAGTATGATAAAAGATAAAGGATATGAACTGTATCCTGCTTTGATTCATGCAATTGTAAGTGTTATAAATCGAAATAAAGTATTTAGAACAGGGATTAATAGTGAGGGGAATTTGGGTTATTGGGATAAATTAGAACCTTTATATACAGTCTTTAATAAAGAAACTGAAAAATTTTCTAATATTTGGACAGAATCAAATGCTAGTTTTAACTCTTTTTATAATAGTTATAAGAATGATTTATTTAAATATAAAGATAAAAATGAAATGTTTCCTAAAAAGCCGATACCTGAAAACACAGTTCCTATCTCGATGATTCCTTGGATTGATTTTAGTTCATTTAATTTAAATATTGGTAATAATAGTAGATTTTTATTGCCAATTATTACAATAGGTAAATTTTATAGTAAGGATGATAAGATCTATTTACCATTTTCATTGCAAGTTCATCATGCAGTATGTGATGGTTACCATGTTTCATTATTTATGAATGAATTTCAAAATATAATTGATAATGTAAATGAATGGATTTAA +>catQ_1_M55620 +ATGAAATTTAATTTGATAGATATTGAGGATTGGAATAGAAAGCCATACTTTGAGCATTATTTAAATGCGGTTAGGTGCACTTACAGTATGACTGCAAATATAGAGATAACTGGTTTACTGCGTGAAATTAAACTTAAGGGCCTGAAACTGTACCCTACGCTTATTTATATCATCACAACTGTGGTTAACCGTCACAAGGAGTTCCGCACCTGTTTTGATCAAAAAGGTAAGTTAGGATACTGGGATAGTATGAACCCAAGTTATACTGTCTTTCATAAGGATAACGAAACTTTTTCAAGTATTTGGACAGAGTATGACGAGAACTTCCCACGTTTTTACTATAATTACCTTGAGGATATTAGAAACTATAGCGACGTTTTGAATTTCATGCCTAAGACAGGTGAACCTGCTAATACAATTAATGTGTCCAGCATTCCTTGGGTGAATTTTACCGGATTCAACCTGAATATATACAATGATGCAACATATCTAATCCCTATTTTTACTTTGGGTAAGTATTTTCAGCAGGATAATAAAATTTTATTACCTATGTCTGTACAGGTGCATCATGCGGTTTGCGACGGTTATCATATAAGCAGATTTTTTAATGAGGCACAGGAATTAGCGTCAAATTATGAGACATGGTTAGGAGAAAAATAA +>msrD_2_AF274302 +ATGGAATTAATATTAAAAGCAAAAGACATTCGTGTGGAATTCAAAGGACGCGATGTTTTAGATATAAATGAATTAGAAGTATATGATTATGACCGTATTGGTTTAGTAGGAGCAAATGGTGCTGGAAAAAGCACTTTACTCAGGGTACTTTTAGGAGAATTAACTCCCCCAGGATGTAAAATGAATCGTCTGGGTGAACTTGCCTATATTCCCCAGTTGGACGAAGTAACTCTGCAGGAGGAAAAAGATTTTGCACTTGTAGGCAAGCTAGGTGTTGAGCAATTAAATATACAGACTATGAGCGGTGGTGAAGAAACAAGGCTTAAAATAGCACAGGCCTTATCGGCACAGGTTCATGGTATTTTAGCGGATGAACCTACGAGCCATTTAGACCGTGAAGGAATTGATTTTCTAATAGGACAGCTAAAATATTTTACAGGTGCACTGTTAGTTATTAGCCATGACCGCTATTTTCTTGATGAAATAGTAGATAAAATATGGGAACTGAAAGATGGCAAAATCACTGAGTATTGGGGAAACTATTCTGATTATCTTCGTCAGAAAGAGGAAGAACGTAAGAGCCAAGCTGCAGAATACGAACAATTTATTGCGGAACGTGCCCGATTGGAAAGGGCTGCGGAGGAAAAGCGAAAACAGGCTCGTAAAATAGAACAGAAGGCAAAAGGTTCTTCAAAGAAAAAAAGTACTGAAGACGGAGGGCGTTTAGCTCATCAAAAATCAATAGGAAGTAAGGAAAAAAAGATGTATAATGCTGCTAAAACCCTAGAGCACAGGATTGCGGCCTTAGGAAAAGTAGAAGCTCCGGAAGGCATTCGCAGAATTCGTTTCAGGCAAAGTAAAGCATTGGAGCTCCATAATCCATACCCTATAGTCGGTGCAGAAATTAATAAAGTATTTGGGGATAAGGCTCTGTTTGAAAATGCATCTTTTCAAATTCCGTTAGGAGCAAAAGTGGCGTTAACTGGTGGTAATGGAATCGGAAAAACAACTTTAATCCAAATGATCTTAAACCATGAAGAAGGAATTTCTATTTCGCCTAAGGCAAAAATAGGTTACTTTGCACAGAATGGTTACAAGTACAACAGTAATCAGAATGTTATGGAGTTTATGCAGAAGGATTGTGACTACAATATATCAGAAATTCGTTCAGTGCTAGCATCTATGGGGTTCAAACAGAACGATATTGGAAAAAGTTTATCTGTTTTAAGCGGTGGAGAAATTATAAAATTGTTGCTTGCTAAAATGCTCATGGGTAGATATAACATCCTAATAATGGATGAACCCAGTAACTTCCTTGACATACCAAGTTTAGAGGCTTTGGAAATACTAATGAAGGAGTACACCGGAACTATCGTGTTTATCACCCACGATAAACGATTACTCGAAAATGTAGCAGATGTAGTTTATGAAATTAGAGATAAGAAAATAAATCTGAAACATTAA +>msrD_3_AF227520 
+ATGGAATTAATATTAAAAGCAAAAGACATTAGTGTGGAATTCAAAGGACACGATGTTTTAGATATAAATGAATTAGAAGTATATGATTATGACCGTATTGGTTTAGTAGGAGCAAATGGTGCAGGAAAAAGCACTTTATTCAAGGTACTTTTAGGAGAATTAATTCCCCCAGGATGTAAAATGAATCATCTGGGTGAACTTGCCTATATTCCCCAGTTGGACGAAGTAACTCTGCAGGAGGAAAAAGATTTTGCGCTTGTAGGCAAGCTAGGTGTTGAGCAATTAAATATACAGACCATGAGCGGTGGTGAAGAAACAAGGCTTAAAATAGCACAGGCCTTATCGGCACAGGTTCATGGTATTTTAGCGGATGAACCTACGAGCCATTTAGACCGTGAAGGAATTGATTTTCTAATAGGACAGCTAAAATATTTTACAGGTGCACTGTTAGTTATTAGCCATGACCGCTATTTTCTTGATGAAATAGTAGATAAAATATGGGAACTGAAAGATGGCAAAATCACTGAGTATTGGGGAAACTATTCTGATTATCTTCGTCAGAAAGAGGAAGAACGTAAGAGACAAGCTGCAGAATACGAACAATTTATTGCGGAACGTGCTCGATTGGAAAGGGCTGCGGAGGAAAAGCGAAAACAGGCTCGTAAAATAGAACAGAAGGCAAAAGGTTCTTCAAAGAAAAAAAGTACTGAAGGCGGAGGGCGTTTAGCTCATCAAAAATCAATAGGAAGTAAGGAAAAAAAGATGCATAATGCCGCTAAATCCCTAGAGAACAGGATTGCGGCATTAGGAAAAGTAGAAGCTCCGGAAGGCATTCGCAGAATTCGTTTCAGGCAAAGTAAAGCATTGGAGCTCCATAATCCATACCCTATAGTCGGTGCGGAAATTAATAAAGTATTTGGGGATAAGGCACTGTTTGAAAATGCATCTTTTCAAATTCCGCTAGGAGCAAAAGTGGCATTAACGGGTGGTAATGGAACCGGAAAAACAACTTTAATCCAAATGATCTTAAACCATGAAGAAGGAATTTCTATTTCACCTAAGGCAAAAATAGGTTACTTTGCACAGAATGGTTACAAGTACAACAGTAATCAGAATGTTATGGAGTTTATGCAGAAGGATTGTGATTACAATATATCAGAAATTCGTTCTGTGCTAGCATCTATGGGGTTCAAACAGAACGATATTGGAAAAAGCTTATCTGTTTTAAGCGGTGGAGAAATTATAAAATTGTTGCTTGCTAAAATGCTCATGGGTAGATATAACATCCTAATAATGGATGAACCCAGTAACTTCCTTGACATACCAAGTTTAGAGGCTTTGGAAATACTAATGAAGGAGTACACCGGAACTATCGTGTTTATCACCCACGATAAACGATTACTCGAAAATGTAGCTGATGTAGTTTATGAAATTAGAGATAAGAAAATTAAGCTGAAACATTAA +>mefA_10_AF376746 +ATGGAAAAATACAACAATTGGAAACGAAAATTTTATGCAATATGGGCAGGGCAAGCAGTATCATTAATCACTAGTGCCATCCTGCAAATGGCGATTATTTTTTACCTTACAGAAAAAACAGGATCTGCGATGGTCTTGTCTATGGCTTCATTAGTAGGTTTTTTACCCTATGCGATTTTGGGACCTGCCATTGGTGTGCTAGTGGATCGTCATGATAGGAAGAAGATAATGATTGGTGCCGATTTAATTATCGCAGCAGCTGGTGCAGTGCTTGCTATTGTTGCATTCTGTATGGAGCTACCTGTCTGGATGATTATGATAGTATTGTTTATCCGTAGCATTGGAACAGCTTTTCATACCCCAGCACTCAATGCGGTTACACCACTTTTAGTACCAGAAGAACAGCTAACGAAATGCGCAGGCTATAGTCAGTCTTTGCAGTCTATAAGCTATATTGTTAGTCCGGCAGTTGCAGCACTCTTATACTCCGTTTGGGATTTAAATGCTATTATTGCCATCGACGTATTGGGTGCTGTGATTGCATCTATTACGGTAGCAATTGTACGTATACCTAAGCTGGGTAATCAAGTGCAAAGTTTAGAACCAAATTTCATAAGGGAGATGAAAGAAGGAGTTGTGGTTCTGAGACAAAACAAAGGATTGTTTGCCTTATTACTCTTAGGAACACTATATACTTTTGTTTATATGCCAATCAATGCACTATTTCCTTTAATAAGCATGGAACACTTTAATGGAACGCCTGTGCATATTTCTATTACGGAAATTTCCTTTGCATTTGGGATGCTAGCAGGAGGCTTATTATTAGGAAGATTAGGGGGCTTCGAAAAGCATGTATTACTAATAACAAGTTCATTTTTTATAATGGGGACCAGTTTAGCCGTTTCGGGAATACTTCCTCCAAATGGATTTGTAATATTCGTAGTTTGCTGTGCAATAATGGGGCTTTCGGTGCCATTTTATAGCGGTGTGCAAACAGCTCTTTTTCAGGAGAAAATTAAGCCTGAATATTTAGGACGTGTATTTTCTTTGATCGGAAGTATCATGTCACTTGCTATGCCAATTGGGTTAATTCTTTCTGGATTCTTTGCTGATAAAATCGGTGTAAATCATTGGTTTTTACTATCAGGTATTTTAATTATTGGCATTGCTATAGTTTGCCAAATGATAACTGAGGTTAGAAAATTAGATTTAAAATAA +>mefE_AE007317 
+TTGAAAATAGATAAAAAAAACGAGGCTTTCCTTATTGTAAGTAGAGGCATATCTCGAATTGGAGATATTATGTTTGACTTTGCGAATAATACCTTTCTTGCAGGATTAAATCCAACATCTTTATCATTGGTTGCAGTATATCAGTCACTAGAAAGTGTGATAGGTGTTCTTTTTAATTTATTTGGTGGAGTCATTGCAGATAGTTTCAAGCGGAAAAAAATTATTATTGTTGCAAATATCTTATGTGGTATTGCTTGTATAATTCTTTCATTCATATCACAAGAGCAGTGGATGGTCTTTGCAATTGTCATCACTAATATTATCTTGGCATTTATGAGTGCTTTTTCTGGACCGTCCTATAAAGCATTTACAAAAGAAATTGTAAAAAAGGATAGTATATCACAACTTAATTCATTGCTAGAGATAACAAGTACTATAATTAAAGTAACAATACCAATGGTAGCAATTTTATTATATAAGCTACTTGGGATACATGGTGTTTTACTATTGGATGGATTCTCATTTCTAATTGCTGCATCACTGATTTCCTTTATTGTACCCGTTAATGACGAAGTGGTCACAAAGGATAAAATGACAATAGGAGGAGTTTTAAATGACTTAAAAATAGGGTTTAAGTATATTTATAGTCATAAGACAATATTTATGATTATTATTCTCTCTGCTTTTGTTAATTTTTTTCTAGCAGCTTATAATTTATTGTTACCTTATAGTAATCAAATGTTTGGAGAAATTTCAGATGGGCTTTATGGTGTTTTTCTAACTGCGGAAGCAATTGGAGGATTTATTGGAGCGATATTAAGTGGTGTTATAAATAAAACCTTGTCAAGCAAACGTTTAATGGTCTTCTTATCATGTTCAGGATTGATGTTAATGCTATCAACGCCACTCTATTTTTTGTTTCAAAACTTCATTATTCTAGCCTTTTCTCCGGCATTATTTAGTCTATTTATTTCTATTTTTAATATTCAATTTTTCTCTATTGTTCAAAGAGAAGTTGATACTGAGTTTCTCGGTAGAGTCTTTGGAATCATCTTTACGGTAGCTATTCTTTTTATGCCAGTTGGGTCTGGATTTTTCTCAGTAGTTTTAAATCCTAACAATACTTTTAATCTTTTTATTATTGGTGTATCTATTACGATATTATCGCTAATATTCAGCACGCTATTGAAGAGGTATGATAAAAATAGCTGA +>tetM_1_X92947 +ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATGTGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAATTGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTTATTACTAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGAATATACAAAAAAAAGACAACGTCTTGCATATATACGCCTTTATAGTGGAGTACTACATTTACGAGATTCGGTTAGAGTATCAGAAAAAGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAGATTGATAGAGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAACTATTGCCACAGAGAAAAAAGATTGAAAATCCGCACCCTCTACTACAAACAACTGTTGAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGCTTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTATATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGTTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetM_12_FR671418 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATATGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAATTGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTGATTACGAATAAATTTTATTCATCAACACATCGAGGTCAGTCTGAACTTTGCGGAAAAGTTTTCAAAATTGAGTATTCGGAAAAAAGACAGCGTCTTGCATATATACGTCTTTATAGTGGCGTACTGCATTTGCGAGATTCGGTTAGAATATCGGAAAAGGAAAAAATAAAAATTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAAATCGATAAGGCTTATTCCGGGGAAATTGTTATTTTGCAGAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAGCTATTGCCACAGAGAGAGAGAATTGAAAATCCCCTCCCTCTGCTGCAAACGACTGTTGAACCGAGCAAACCTCAACAAAGGGAAATGTTACTTGATGCACTTTTAGAAATCTCCGACAGTGACCCGCTTCTGCGATATTATGTGGATTCTGCGACACATGAAATCATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGACTTGTGCTCTGCTGCAAGAAAAGTATCATGTGGAGATAGAAATAAAAGAGCCTACAGTCATTTATATGGAAAGACCGTTAAAAAAAGCAGAGTATACCATTCACATCGAAGTTCCACCGAATCCTTTCTGGGCTTCCATTGGTCTATCTGTAGCACAGCTTCCATTAGGGAGCGGAGTACAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCGTTTCAAAATGCAGTTATGGAGGGGATACGCTATGGCTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetK_4_U38428 +TTGTTTAGTTTATATAAAAAATTTAAAGGTTTGTTTTATAGCGTTTTATTTTGGCTTTGTATTCTTTCATTTTTTAGTGTATTAAATGAAATGGTTTTAAATGTTTCTTTACCTGATATTGCAAATCATTTTAATACTACTCCTGGAATTACAAACTGGGTAAACACTGCATATATGTTAACTTTTTCGATAGGAACAGCAGTATATGGAAAATTATCTGATTATATAAATATAAAAAAATTGTTAATTATTGGTATTAGTTTGAGCTGTCTTGGTTCATTGATTGCTTTTATTGGTCACAATCACTTTTTTATTTTGATTTTTGGTAGGTTAGTACAAGGAGTAGGATCTGCTGCATTCCCTTCACTGATTATGGTGGTTGTAGCTAGAAATATTACAAGAAAAAAACAAGGCAAAGCCTTTGGTTTTATAGGATCAATTGTAGCTTTAGGTGAAGGGTTAGGTCCTTCAATAGGGGGAATAATAGCACATTATATTCATTGGTCTTACCTACTTATACTTCCTATGATTACAATAGTAACTATACCTTTTCTTATTAAAGTAATGGTACCTGGTAAATCAACAAAAAATACATTAGATATCGTAGGTATTGTTTTAATGTCTATAAGTATTATATGTTTTATGTTATTTACGACAAATTATAATTGGACTTTTTTAATACTCTTCACAATCTTTTTTGTGATTTTTATTAAACATATTTCAAGAGTTTCTAACCCTTTTATTAATCCTAAACTAGGGAAAAACATTCCGTTTATGCTTGGTTTGTTTTCTGGTGGGCTAATATTTTCTATAGTAGCTGGTTTTATATCAATGGTGCCTTATATGATGAAAACTATTTATCATGTAAATGTAGCGACAATAGGTAATAGTGTTATTTTTCCTGGAACCATGAGTGTTATTGTTTTTGGTTATTTTGGTGGTTTTTTAGTGGATAGAAAAGGATCATTATTTGTTTTTATTTTAGGATCATTGTCTATCTCTATAAGTTTTTTAACTATTGCATTTTTTGTTGAGTTTAGTATGTGGTTGACTACTTTTATGTTTATATTTGTTATGGGCGGATTATCTTTTACTAAAACAGTTATATCAAAAATAGTATCAAGTAGTCTTTCTGAAGAAGAAGTTGCTTCTGGAATGAGTTTGCTAAATTTCACAAGTTTTTTATCAGAGGGAACAGGTATAGCAATTGTAGGAGGTTTATTGTCACTACAATTGATTAATCGTAAACTAGTTCTGGAATTTATAAATTATTCTTCTGGAGTGTATAGTAATATTCTTGTAGCCATGGCTATCCTTATTATTTTATGTTGTCTTTTGACGATTATTGTATTTAAACGTTCTGAAAAGCAGTTTGAATAG +>tetM_13_AM990992 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATATGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAATTGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTTATTACTAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGAATATACAAAAAAAAGACAACGTCTTGCATATATACGCCTTTATAGTGGAGTACTACATTTACGAGATTCGGTTAGAGTATCAGAAAAAGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAGATTGATAGAGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAACTATTGCCACAGAGAAAAAAGATTGAAAATCCGCACCCTCTACTACAAACAACTGTTGAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGCTTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTATATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCGTTTCAAAATGCAGTTATGGAGGGGATACGCTATGGCTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetM_2_X90939 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCGGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATATGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCATAATTGTTCCCTGTTCCCTGTTTATCACGGAAGTGCAAAAAACAATATAGGGATTGATAACCTTATAGAAGTGATTACGAATAAATTTTATTCATCAACACATCGAGGTCAGTCTGAACTTTGCGGAAAAGTTTTCAAAATTGAGTATTCGGAAAAAAGACAGCGTCTTGCATATATACGTCTTTATAGTGGCGTACTGCATTTGCGAGATTCGGTTAGAATATCGGAAAAGGAAAAAATAAAAATTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAAATCGATAAGGCTTATTCCGGGGAAATTGTTATTTTGCAGAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAGCTATTGCCACAGAGAGAGAGAATTGAAAATCCCCTCCCTCTGCTGCAAACGACTGTTGAACCGAGCAAACCTCAACAAAGGGAAATGTTACTTGATGCACTTTTAGAAATCTCCGACAGTGACCCGCTTCTGCGATATTATGTGGATTCTGCGACACATGAAATCATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGACTTGTGCTCTGCTGCAAGAAAAGTATCATGTGGAGATAGAAATAAAAGAGCCTACAGTCATTTATATGGAAAGACCGTTAAAAAAAGCAGAGTATACCATTCACATCGAAGTTCCACCGAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGCTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACACCGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetM_4_X75073 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATATGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCATAATTGTTCCCTGTTCCCTGTTTATCACGGAAGTGCAAAAAACAATATAGGGATTGATAACCTTATAGAAGTGATTACGAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGAATATACAAAAAAAAGACAACGTCTTGCATATATACGCCTTTATAGTGGAGTACTACATTTACGAGATTCGGTTAGAGTATCAGAAAAAGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAGATTGATAGAGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAACTATTGCCACAGAGAAAAAAGATTGAAAATCCGCACCCTCTACTACGAACAACTGTTGAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGCTTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTATATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGTTGCGAACAAGGATTATATGGTTGGAATGTGACGGATTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATACGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetM_5_U58985 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATATGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAATTGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTTATTACTAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGAATATACAAAAAAAAGACAACGTCTTGCATATATACGTCTTTATAGTGGAGTACTACATTTACGAGATTCGGTTAGAATATCGGAAAAGGAAAAAATAAAAATTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAAATCGATAAGGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAACTATTGCCACAGAGAAAAAGAATTGAAAATCCGCACCCTCTACTACAAACAACTGTTGAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGCTTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTATATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGTTGCGAACAAGGATTATATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetM_8_X04388 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCGGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAGAGGTACAACGAAAACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACGGCGATAACCTCTTTTCAGTGGAAAAATACTAAGGTGAACATCATAGACACGCCAGGACATATGGATTTTTTAGCAGAAGTATATCGTTCATTATCAGTATTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTGTTTCATGCACTTAGGAAAATAGGTATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCGGAAATTGTAATCAAACAGAAGGTAGAACTGCATCCTAATATGCGTGTAATGAACTTTACCGAATCTGAACAATGGGATATGGTAATAGAAGGAAATGATTACCTTTTGGAGAAATATACGTCTGGGAAATTATTGGAAGCATTAGAACTCGAACAAGAGGAAAGCATAAGATTTCATAATTGTTCCCTGTTCCCTGTTTATCACGGAAGTGCAAAAAACAATATAGGGATTGATAACCTTATAGAAGTGATTACGAATAAATTTTATTCATCAACACATCGAGGTCAGTCTGAACTTTGCGGAAAAGTTTTCAAAATTGAGTATTCGGAAAAAAGACAGCGTCTTGCATATATACGTCTTTATAGTGGCGTACTGCATTTGCGAGATCCGGTTAGAATATCGGAAAAGGAAAAAATAAAAATTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAAATCGATAAGGCTTATTCCGGGGAAATTGTTATTTTGCAGAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAGCTATTGCCACAGAGAGAGAGAATTGAAAATCCCCTCCCTCTGCTGCAAACGACTGTTGAACCGAGCAAACCTCAACAAAGGGAAATGTTACTTGATGCACTTTTAGAAATCTCCGACAGTGACCCGCTTCTGCGATATTATGTGGATTCTGCGACACATGAAATCATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGACTTGTGCTCTGCTGCAAGAAAAGTATCATGTGGAGATAGAAATAAAAGAGCCTACAGTCATTTATATGGAAAGACCGTTAAAAAAAGCAGAGTATACCATTCACATCGAAGTTCCACCGAATCCTTTCTGGGCTTCCATTGGTCTATCTGTAGCACCGCTTCCATTAGGGAGCGGAGTACAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCGTTTCAAAATGCAGTTATGGAGGGGATACGCTATGGCTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetS_M 
+ATGGAGGAAATAAAATTGAAAATTATTAATATCGGTATCTTAGCACATGTTGATGCAGGAAAAACTACTTTGACAGAAAGCTTACTATACAGTAGCGGAGCAATTAAAGAGTTAGGAAGTGTAGATAGCGGTACAACGAAAACGGATACTATGTTTTTGGAACGCCAGAGAGGTATTACTATTCAGACCGCAATAACATCTTTTCAACGGGAAAATGTTAAAGTAAATATTGTAGATACTCCTGGACACATGGATTTTTTGGCAGATGTATACCGTTCATTATCTGTTTTGGATGGAGCTATTTTGCTAATCTCTGCAAAAGATGGAGTACAGTCACAAACTCGTATACTATTCCATGCACTTAGAAAGATGAACATACCTATAATATTTTTTATTAACAAAATTGATCAAAATGGAATAAATTTGCCAGATGTTTATCAAGATATTAAGGACAAACTTTCTGACGACATCATAATTAAGCAGACTGTGAATCTAAATTTGAAACCTTATGTAATAGATTATACTGAACCAGAACAATGGGAGACAGTAATTGTGGGAAATGATTATTTATTAGAAAAATATACCATTGGGAAAACATTGAATATTGCAGAACTTGAAAAGGAGGAAAACGAAAGAATTCAAAGTTGCTCCTTATATCCTGTTTATCACGGAAGTGCAAAGAATAATATTGGAATTAAACAACTTATAGAGGTAATTACTAGCAAATTATTTTCACCCACACAACTCAATTCAGATAAACTTTGTGGAAATGTTTTTAAAGTAGAATATTCAGATGATGGTCAACGGCTTGTCTATGTACGTCTTTATAGTGGAACGCTACATTTGCGAGACTCAGTCAATATATCAGAAAAGGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGAGAATTACGCCAGATAGATAAGGCAGAGCCTGGTGAGATTATTATTTTAAAAAATGAGCTTTTAAAACTAAATAACGTACTTGGAGATAAAAAAAGATTACCACATAGAGAAATTCTTGAGAATCCTCTTCCTATGTTACAAACAACAATTGAACCATGTAAATCAGTACAAAGAGAAAAGTTACTAGATGCACTTTTTGAAATATCCGATAGTGATCCCCTTCTACAATATTATGTAGATACAGTAACTCACGAAATTGTGCTATCTTTTTTAGGTGAGGTCCAAATGGAGGTAACTTGTACTCTGATTCAAGAAAAATATCATATTGAGATAGAAACAAGAAAACCAACTGTCATTTATATGGAAAGACCATTAAAAAAATCTGAATTTACCATTGATATCGAAGTACCTCCAAATCCTTTCTGGGCTTCTATTGGTTTATCTGTAACACCACTTCCTTTGGGTAGTGGCATTCAGTATGAGAGCCTGGTTTCTCTAGGTTATTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGTATACGCTATGGGTGTGAACAAGGATTGTACGGTTGGAAATTAACAGACTGTAAGATCTGTTTTAAGTATGGTCTATATTACAGCCCTGTCAGTACGCCAGCAGATTTCCGAATGCTTGCGCCTATTGTACTAGAGCAGGCTTTTAGAAAGAGTGGTACAGAGTTATTAGAGCCATATCTTAGCTTCGAAATTTATGTACCACAAGAATATCTTTCGAGAGCATATAATGATGCTTCCAAATATTGTGCAAATATTTTAAATACTAAGTTAAAAGGTAACGAGGTCATTCTCATTGGTGAAATTCCAGCCCGTTGTATTCAAGAGTATCAAAACAGTTTAACTTTCTTTACAAATGGACGCAGTGTCTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetO_Y07780 +ATGAAAATAATTAACTTAGGCATTCTGGCTCACGTTGACGCAGGAAAGACAACATTAACGGAGAGTTTAT +TGTATACCAGTGGTGCAATTGCAGAACCAGGGAGCGTAGATAAAGGCACAACAAGGACAGATACAATGAA +TTTGGAGCGTCAAAGGGGAATCACTATCCAGACAGCAGTGACATCTTTTCAGTGGGAGGATGTAAAAGTC +AACATTATAGATACGCCAGGCCATATGGATTTTTTGGCGGAAGTATACCGTTCTTTATCCGTATTAGACG +GAGCAGTATTATTAGTTTCTGCAAAGGATGGCATACAGGCACAGACCCGTATACTGTTTCATGCACTACA +GACAATGAAGATTCCGACAATTTTTTTCATCAATAAAATTGACCAAGAGGGGATTGATTTGCCAATGGTA +TATCAAGAAATGAAAGCAAAGCTTTCTTCGGAAATTATAGTGAAGCAAAAGGTTGGGCAGCATCCCCATA +TAAATGTAACGGACAATGACGATATGGAACAGTGGGATGCGGTAATTATGGGAAACGATGAACTATTAGA +GAAATATATGTCAGGGAAACCGTTTAAAATGTCAGAACTGGAACAGGAAGAAAACAGGAGATTCCAAAAC +GGAACGTTATTTCCCGTTTATCACGGAAGTGCTAAAAACAATCTGGGGATTCGGCAGCTTATAGAAGTGA +TTGCCAGTAAGTTTTATTCATCAACGCCTGAAGGTCAATCTGAACTATGCGGGCAGGTTTTTAAGATTGA +ATATTCAGAGAAAAGGCGGCGTTTTGTTTATGTGCGTATATATAGCGGAACATTGCATTTGAGGGATGTT +ATTAAAATATCTGAAAAAGAGAAAATAAAAATCACAGAGATGTGTGTTCCGACAAACGGTGAATTATATT +CATCCGATACAGCCTGCTCTGGTGATATTGTAATTTTACCAAATGATGTTTTGCAGCTAAACAGTATTTT +GGGGAACGAAATGCTGTTGCCGCAGAGAAAATTTATTGAAAATCCTCTCCCTATGCTCCAAACAACGATT +GCAGTAAAGAAATCTGAACAGCGGGAAATATTGCTTGGGGCACTTACAGAAATTTCAGATGGCGACCCTC +TTTTAAAATATTATGTGGATACTACAACGCATGAGATTATACTTTCTTTTTTGGGGAATGTGCAGATGGA +AGTCATTTGTGCCATCCTTGAGGAAAAATACCATGTGGAGGCAGAAATAAAAGAGCCTACTGTTATATAT +ATGGAAAGACCGCTTAGAAAAGCAGAATATACCATCCACATAGAAGTCCCGCCAAATCCTTTCTGGGCTT +CTGTCGGGTTGTCCATAGAGCCGCTCCCTATTGGAAGCGGAGTGCAGTATGAAAGCAGAGTTTCACTTGG +ATATTTAAACCAATCGTTCCAAAATGCGGTTATGGAGGGGGTTCTTTATGGCTGCGAGCAGGGGCTGTAT +GGATGGAAAGTGACAGACTGTAAAATCTGTTTTGAATATGGATTGTATTATAGTCCTGTAAGTACCCCCG 
+CAGACTTTCGGCTGCTTTCCCCTATCGTATTGGAGCAGGCTTTAAAAAAAGCAGGGACAGAACTATTAGA +GCCATATCTCCACTTTGAAATTTATGCACCGCAGGAATATCTCTCACGGGCGTATCATGATGCCCCAAGG +TATTGTGCAGATATTGTAAGTACTCAGGTAAAGAATGACGAGGTCATTCTGAAAGGAGAAATCCCTGCCA +GATGTATTCAAGAATACAGGAACGATTTAACTTATTTCACAAATGGGCAGGGAGTCTGCTTGACAGAGTT +AAAAGGATACCAGCCAGCTATTGGTAAATTTATTTGCCAACCCCGCCGCCCGAATAGCCGTATAGATAAG +GTTCGGCATATGTTCCACAAGTTAGCTTAA +>folA_AE007317 +ATGACTAAGAAAATCGTAGCTATTTGGGCCCAGGATGAAGAGGGTTTGATTGGTAAGGAAAATCGTCTGCCTTGGCATTTGCCAGCAGAATTGCAGCACTTTAAAGAAACAACTCTGAATCATGCTATCTTGATGGGGCGTGTGACCTTTGATGGGATGGGGCGTCGCTTGCTTCCAAAACGGGAAACCCTGATTTTGACGCGTAATCCGGAAGAAAAGATAGATGGGGTTGCTACTTTTCAGGACGTCCAGTCTGTTCTTGACTGGTATCAGGATCAAGAAAAGAATCTCTACATTATCGGTGGGAAGCAAATTTTTCAGGCTTTTGAACCTTACCTTGATGAAGTGATTGTCACTCACATTCATGCTCGGGTGGAAGGAGATACCTATTTCCCTGAAGAGCTTGACTTGTCTCTTTTTGAGACAGTTTCAAGCAAATTTTACGCCAAAGATGAGAAGAATCCTTATGATTTTACCATCCAATACCGCAAGAGAAAGGAAGTCTAA +>gyrA_AE007317 +ATGCAGGATAAAAATTTAGTGAATGTCAATCTGACAAAGGAGATGAAGGCAAGTTTTATCGACTACGCCATGAGTGTTATCGTAGCGCGAGCTCTTCCTGATGTTCGAGATGGCTTAAAACCTGTTCACCGTCGCATTCTCTACGGAATGAATGAATTGGGTGTGACCCCAGACAAACCCCATAAAAAATCTGCTCGTATTACAGGGGATGTCATGGGTAAATACCACCCACACGGGGATTCCTCTATTTATGAAGCCATGGTCCGTATGGCTCAATGGTGGAGCTACCGTTACATGCTTGTAGATGGTCATGGGAATTTTGGTTCCATGGATGGAGATAGTGCTGCCGCTCAACGTTATACCGAGGCACGTATGAGCAAGATTGCTCTGGAAATGCTTCGTGATATCAACAAAAATACAGTTGATTTCGTTGATAACTATGATGCCAATGAACGGGAACCCTTGGTCTTGCCAGCGCGTTTTCCAAACCTTTTGGTTAATGGAGCAACTGGTATCGCGGTTGGGATGGCAACCAATATTCCACCTCATAATCTGGGTGAAACCATTGATGCAGTGAAGTTGGTCATGGATAATCCTGAAGTGACTACCAAGGACTTGATGGAAGTCTTGCCTGGACCAGATTTTCCAACTGGTGCTCTTGTCATGGGGAAATCAGGTATCCATAAGGCTTATGAAACAGGTAAAGGTTCGATTGTCCTACGTTCTCGTACAGAGATTGAAACGACTAAGACTGGTCGTGAGCGTATCGTTGTAACAGAATTTCCTTACATGGTCAATAAAACCAAGGTGCATGAGCATATTGTTCGCTTGGTTCAGGAAAAACGCATTGAGGGTATCACAGCAGTACGTGATGAGTCAAACCGTGAAGGTGTTCGATTTGTTATTGAAGTCAAGCGCGACGCCTCAGCCAATGTTATTCTCAATAACCTCTTCAAAATGACCCAAATGCAAACCAATTTTGGTTTCAATATGCTCGCTATCCAAAATGGTATACCGAAAATTTTGTCTCTTCGTCAGATTTTGGATGCTTATATTGAGCACCAAAAAGAAGTGGTTGTTCGTCGTACACGTTTTGATAAGGAAAAAGCGGAAGCGCGCGCTCATATCTTAGAAGGTCTCTTGATTGCGCTAGACCATATCGACGAAGTGATTCGTATCATCCGTGCTAGTGAAACGGATGCGGAAGCTCAAGCTGAGTTGATGAGCAAGTTTAAGCTTTCTGAACGTCAAAGTCAAGCTATCCTTGATATGCGTCTTCGTCGTTTGACAGGTTTGGAACGCGATAAGATTCAATCTGAGTATGATGACCTCTTGGCTCTGATTGCGGATTTAGCAGATATTCTTGCTAAGCCTGAACGTGTTTCTCAAATTATCAAAGACGAATTGGATGAAGTTAAACGTAAATTTTCTGATAAACGCCGTACAGAGTTGATGGTTGGACAGGTCTTGAGTCTCGAGGATGAGGACTTGATTGAAGAATCGGATGTCTTGATTACCCTTTCTAACAGAGGCTACATTAAGCGTTTGGATCAGGACGAGTTCACTGCTCAAAAACGTGGGGGTCGTGGTGTCCAAGGAACGGGAGTGAAAGATGATGACTTTGTTCGTGAGTTAGTGTCAACTAGTACCCATGATCATCTGCTCTTCTTCACAAACAAGGGACGTGTCTATCGTCTTAAAGGTTATGAAATTCCTGAGTATGGTCGGACTGCCAAAGGGCTACCAGTAGTCAATCTCTTGAAATTGGATGAAGACGAAAGTATTCAGACGGTTATCAATGTTGAGTCTGATCGCAGTGATGATGCTTATCTCTTCTTTACAACCCGTCACGGTATTGTGAAGAGAACCAGTGTTAAGGAGTTTGCCAATATTCGTCAAAATGGTCTCAAAGCGCTGAATTTAAAGGATGAAGATGAGTTAATCAATGTCTTGTTGGCAGAAGGAGATATGGATATTATCATTGGTACCAAGTTTGGTTATGCAGTTCGCTTTAATCAATCAGCCGTTCGTGGTATGAGCCGTATCGCCACTGGTGTGAAAGGTGTTAACCTTCGTGAAGGAGACACAGTTGTTGGTGCCAGCTTGATTACTGATCAAGATGAGGTTCTTATTATCACAGAAAAAGGATATGGTAAGCGTACAGTCGCTACTGAATACCCAACAAAAGGTCGTGGTGGTAAGGGAATGCAGACAGCTAAAATTACCGAAAAAAATGGCTTGCTGGCCGGTCTTATGACTGTTCAAGGGGATGAGGATTTGATGATTATCACTGATACAGGTGTCATGATTCGAACCAATCTTGCCAATATTTCACAAACAGGACGTGCAACTATGGGAGTTAAAGTAATGCGCCTGGATCAAGATGCTCAGATAGTGACTTTCACAACGGTTGCGGTGGCAGAAAAAGAAGAAGTTGGGACAGAAAACGAAACAGAAGGTGAAGCATAA +>gyrB_AE007317 
+ATGACAGAAGAAATCAAAAATCTGCAGGCACAGGATTATGATGCCAGTCAAATTCAAGTTTTAGAGGGCTTAGAGGCTGTTCGTATGCGTCCAGGGATGTACATTGGATCAACCTCAAAAGAAGGTCTTCACCATCTAGTCTGGGAAATTGTTGATAACTCAATTGACGAGGCCTTGGCAGGATTTGCCAGCCATATTCAAGTTTTTATTGAGCCAGATGATTCGATTACTGTTGTGGATGATGGGCGTGGTATCCCAGTCGATATTCAGGAAAAAACAGGTCGTCCTGCTGTTGAGACCGTCTTTACAGTCCTTCACGCTGGAGGAAAGTTCGGCGGTGGTGGATACAAGGTTTCAGGTGGTCTTCACGGGGTGGGGTCGTCAGTTGTTAATGCCCTTTCCACTCAATTAGACGTTCATGTCCATAAAAACGGTAAGATTCATTACCAAGAATACCGTCGTGGTCATGTTGTCGCAGATCTTGAAATAGTTGGAGATACGGATAAAACAGGAACAACTGTTCACTTCACACCGGACCCAAAAATCTTCACTGAAACAACAATCTTTGATTTTGATAAATTAAATAAACGGATTCAAGAGTTGGCCTTTCTAAATCGCGGTCTTCAAATTTCTATCACTGATAAGCGCCAAGGTTTGGAACAAACCAAGCATTATCATTATGAAGGTGGGATTGCTAGTTACGTTGAATATATCAACGAGAACAAGGATGTAATCTTTGATACACCAATCTATACAGACGGTGAGATGGATGATATCACAGTTGAGGTAGCCATGCAGTACACAACGGGTTACCATGAAAATGTCATGAGTTTCGCCAATAATATTCATACACATGAAGGTGGAACGCATGAACAAGGTTTCCGTACAGCCTTGACACGTGTTATCAACGATTATGCTCGTAAGAATAAGTTACTGAAAGACAATGAAGACAATCTAACAGGGGAAGATGTTCGCGAAGGCTTAACTGCAGTTATCTCAGTTAAACACCCAAATCCACAGTTTGAAGGACAAACGAAGACCAAATTGGGAAATAGCGAAGTGGTCAAGATTACCAATCGCCTCTTCAGTGAAGCCTTCTCCGATTTCCTCATGGAAAATCCACAGATTGCCAAACGTATCGTAGAAAAAGGAATTTTGGCTGCCAAGGCTCGTGTGGCTGCCAAGCGTGCGCGTGAAGTCACACGTAAAAAATCTGGTTTGGAAATTTCCAACCTTCCAGGGAAACTAGCAGACTGTTCTTCTAATAACCCTGCTGAAACAGAACTCTTCATCGTCGAAGGAGACTCAGCTGGTGGATCAGCCAAATCTGGTCGTAACCGTGAGTTTCAGGCTATCCTTCCAATTCGCGGTAAGATTTTGAACGTTGAAAAAGCAAGTATGGATAAGATTCTAGCTAACGAAGAAATTCGTAGTCTTTTCACAGCCATGGGAACAGGATTTGGCGCAGAATTTGATGTTTCGAAAGCCCGTTACCAAAAACTCGTTTTGATGACCGATGCCGATGTCGATGGAGCCCACATTCGTACCCTTCTTTTAACCTTGATTTATCGTTATATGAAACCAATCCTAGAAGCTGGCTATGTTTATATTGCCCAACCACCAATCTATGGTGTCAAGGTTGGAAGCGAGATTAAAGAATATATCCAGCCGGGTGCAGATCAAGAAATCAAACTCCAAGAAGCTTTAGCCCGTTATAGTGAAGGTCGTACCAAACCGACTATTCAGCGTTATAAGGGGCTAGGTGAAATGGACGATCATCAGCTGTGGGAAACAACCATGGATCCCGAACATCGCTTGATGGCTAGAGTTTCTGTAGATGATGCTGCAGAAGCAGATAAAATCTTTGATATGTTGATGGGGGATCGAGTAGAGCCTCGTCGTGAGTTTATCGAAGAAAATGCTGTCTATAGTACACTTGATGTCTAA +>parC_AE007317 
+ATGTCTAACATTCAAAACATGTCCCTGGAGGACATCATGGGAGAGCGCTTTGGTCGCTACTCCAAGTACATTATTCAAGACCGGGCTTTGCCAGATATTCGTGATGGGTTGAAGCCGGTTCAACGCCGTATTCTTTATTCTATGAATAAGGATAGCAATACTTTTGACAAGAGCTACCGTAAGTCGGCCAAGTCAGTCGGGAACATCATGGGGAATTTCCACCCACACGGGGATTCTTCTATCTATGATGCCATGGTTCGTATGTCACAGAACTGGAAAAATCGTGAGATTCTAGTTGAAATGCACGGTAATAACGGTTCTATGGACGGAGATCCTCCTGCGGCTATGCGTTATACTGAGGCACGTTTGTCTGAAATTGCAGGCTACCTTCTTCAGGATATCGAGAAAAAGACAGTTCCTTTTGCATGGAACTTTGACGATACGGAGAAAGAACCAACGGTCTTGCCAGCAGCCTTTCCAAACCTCTTGGTCAATGGTTCGACTGGGATTTCGGCTGGTTATGCCACAGACATTCCTCCCCATAATTTAGCTGAGGTCATAGATGCTGCAGTTTACATGATTGACCACCCAACTGCAAAGATTGATAAACTCATGGAATTCTTACCTGGACCAGACTTCCCTACAGGGGCTATTATTCAGGGTCGTGATGAAATCAAGAAAGCCTATGAGACTGGGAAAGGGCGCGTGGTTGTTCGTTCCAAGACTGAAATTGAAAAGCTAAAAGGTGGTAAGGAACAAATCGTTATTACTGAGATTCCTTATGAAATCAATAAGGCCAATCTAGTCAAGAAAATCGATGATGTTCGTGTTAATAACAAGGTAGCTGGGATTGCTGAGGTTCGTGATGAGTCTGACCGTGATGGTCTTCGTATCGCTATTGAACTTAAGAAAGACGCTAATACTGAGCTTGTTCTCAACTACTTATTTAAGTACACCGACCTACAAATCAACTACAACTTTAATATGGTGGCGATTGACAATTTCACACCTCGTCAGGTTGGGATTGTTCCAATCCTGTCTAGCTACATCGCTCACCGTCGAGAAGTGATTTTGGCGCGTTCACGCTTTGACAAAGAAAAGGCTGAGAAACGTCTCCGTATCGTCGAAGGTTTGATTCGTGTGATTTCGATTTTGGATGAAGTCATTGCTCTTATCCGTGCTTCTGAGAATAAGGCGGACGCCAAGGAAAACCTCAAAGTTAGCTATGATTTTACGGAAGAACAGGCTGAGGCTATCGTAACTTTGCAACTGTACCGTTTGACCAATACCGATGTGGTTGTCTTGCAGGAAGAAGAAGCAGAGCTTCGTGAGAAGATTGCTATGCTGGCGGCTATTATCGGTGATGAAAGGACTATGTACAATCTCATGAAGAAAGAACTTCGTGAGGTCAAGAAGAACTTTGCAACTCCTCGTTTGAGTTCTTTAGAAGACACTGCGAAAGCAATTGAGATTGATACAGCTAGTCTTATCGCTGAGGAAGATACCTACGTCAGCGTGACCAAGGCAGGTTACATCAAGCGTACCAGTCCACGTTCCTTTGCGGCTTCCACCTTGGAAGAAATTGGCAAGCGTGATGATGACCGTTTGATTTTTGTTCAATCTGCCAAGACAACCCAGCACCTCTTGATGTTCACAAGTCTTGGAAATGTCATCTACAGACCAATCCATGAGTTGGCAGATATTCGTTGGAAGGACATCGGAGAGCATCTGAGCCAAACCATCACAAACTTTGAAACGAATGAAGCAATCCTTTATGTGGAAGTACTGGATCAGTTTGACGATGCGACAACCTACTTTGCAGCGACTCGCCTTGGTCAAATCAAACGGGTAGAGCGAAAAGAATTCACTCCATGGCGGACCTATAGATCTAAGTCTGTCAAGTATGCTAAGCTCAAAGACGATACAGATCAGATTGTAGCAGTGGCTCCGATTAAACTAGATGATGTTGTCTTGGTTAGTCAAAATGGTTATGCCCTGCGTTTCAATATCGAAGAGGTTCCGGTTGTCGGTGCTAAGGCAGCAGGTGTCAAGGCTATGAATTTGAAAGAAGATGATGTCCTCCAATCTGGCTTTATCTGTAATACTTCGTCCTTCTACCTCTTGACCCAGCGTGGAAGCTTGAAACGTGTTTCCATTGAGGAAATTCTAGCAACCAGCCGTGCCAAACGAGGATTACAAGTCTTGCGTGAGTTGAAAAACAAACCGCATCGTGTCTTCTTGGCAGGAGCAGTTGCAGAGCAAGGATTTGTTGGCGATTTCTTCAGTACGGAAGTGGATGTGAACGACCAAACTCTGCTTGTCCAATCCAATAAAGGAACAATCTATGAAAGCCGATTGCAAGACTTGAACTTGTCAGAACGCACTAGCAATGGAAGCTTCATTTCTGACACGATTTCAGATGAAGAAGTTTTTGACGCTTATCTTCAGGAAGTAGTTACTGAAGATAAATAA +>parE_AE007317 
+GTGTCAAAAAAGGAAATCAATATTAACAATTATAATGATGATGCTATTCAGGTGCTAGAAGGGTTGGATGCGGTCCGAAAACGTCCAGGGATGTATATTGGATCGACCGATGGCGCTGGTCTTCATCACCTAGTTTGGGAAATCGTTGATAATGCAGTCGATGAAGCCTTGTCTGGGTTTGGTGATCGTATTGATGTAACTATCAATAAAGACGGTAGTCTAACGGTTCAAGACCATGGACGTGGGATGCCGACAGGTATGCACGCTATGGGAATTCCAACTGTTGAGGTTATCTTTACCATTCTTCATGCCGGAGGGAAATTCGGTCAAGGTGGCTATAAGACATCAGGTGGACTTCACGGAGTGGGTTCTTCCGTTGTTAACGCCCTTTCTAGCTGGTTAGAAGTTGAAATTACCCGTGATGGCGCAGTTTACAAGCAACGTTTTGAAAATGGTGGAAAACCTGTCACGACTTTGAAGAAAATCGGTACAGCACTCAAGTCTAAAACAGGCACCAAAGTTACTTTTATGCCTGACGCGACTATCTTTTCTACGACAGATTTCAAGTACAATACCATTTCAGAGCGCCTTAATGAATCAGCCTTTCTCTTGAAAAATGTGACCTTGTCTTTAACGGACAAGCGAACAGATGAAGCGATTGAGTTCCACTATGAGAATGGAGTACAAGATTTTGTTTCTTATCTCAACGAAGATAAGGAAATCTTGACGCCAGTTCTTTACTTTGAAGGGGAAGACAATGGTTTTCAAGTGGAAGTAGCCCTCCAGTACAATGACGGATTCTCAGATAACATTCTATCCTTTGTCAATAACGTTCGCACCAAGGACGGTGGAACGCACGAGACAGGACTCAAGTCTGCCATTACCAAGGTCATGAATGACTATGCACGTAAAACAGGTCTTCTCAAGGAAAAAGATAAAAACCTTGAAGGTTCAGACTATCGTGAGGGACTAGCGGCCGTTCTTTCTATCTTAGTTCCTGAAGAACACTTGCAGTTTGAAGGACAGACCAAGGATAAACTAGGAAGCCCCCTAGCTCGCCCAGTTGTGGATGGAATAGTGGCTGATAAGTTGACCTTTTTCCTTATGGAAAATGGGGAATTAGCTTCTAACCTCATCCGCAAGGCTATCAAGGCCCGTGATGCTCGTGAAGCAGCACGTAAGGCGCGTGATGAGAGCCGAAATGGGAAGAAAAACAAGAAAGATAAGGGCTTGTTGTCTGGGAAATTGACCCCAGCCCAATCTAAGAATCCTGCTAAGAATGAACTCTATCTAGTTGAGGGGGACTCTGCCGGTGGTTCTGCCAAACAAGGTCGTGACCGCAAGTTCCAGGCTATTCTACCTCTTCGTGGTAAGGTTATCAATACAGCCAAGGCCAAGATGGCGGATATCCTCAAAAATGAAGAGATCAATACCATGATTTATACCATTGGTGCGGGTGTTGGAGCAGACTTCTCTATTGAAGATGCCAACTATGATAAGATCATTATCATGACCGATGCGGATACCGACGGTGCCCATATCCAGACCTTGCTCTTGACATTTTTCTACCGTTACATGCGTCCGCTAGTCGAGGCAGGTCATGTCTATATTGCCCTCCCACCTCTTTACAAGATGTCCAAAGGTAAAGGCAAGAAAGAAGAAGTGGCCTACGCTTGGACGGACGGAGAACTAGAAGAACTCCGTAAACAGTTCGGTAAAGGCGCTACCCTCCAACGATACAAAGGACTTGGTGAGATGAATGCGGACCAGCTCTGGGAAACAACCATGAACCCAGAAACACGTACCCTCATCCGTGTCACAATTGAAGATTTAGCGCGCGCCGAACGCCGCGTCAATGTTCTCATGGGAGATAAGGTAGAACCACGCCGTAAATGGATTGAAGATAATGTCAAGTTTACGCTAGAAGAAGCGACAGTGTTTTAA +>folP_AE007317 +ATGTCAAGTAAAGCCAATCATGCAAAGACAGTTATTTGCGGAATTATCAATGTAACCCCAGACTCCTTTT +CGGACGGTGGTCAATTTTTTGCTCTTGAGCAGGCACTCCAGCAGGCTCGTAAATTGATAGCAGAAGGAGC +CAGTATGCTCGATATCGGCGGAGAATCGACTCGGCCGGGCAGTAGCTATGTTGAGATAGAAGAGGAAATC +CAGCGTGTTGTTCCAGTGATCAAAGCGATTCGCAAGGAAAGTGATGTCCTCATCTCTATTGATACTTGGA +AGAGCCAAGTAGCAGAGGCTGCTTTGGCTGCTGGTGCCGATCTAGTCAATGATATCACTGGTCTTATGGG +TGATGAGAAAATGCCTCATGTGGTAGCTGAAGCGAGAGCGCAAGTGGTCATCATGTTTAATCCAGTTATG +GCGCGACCTCAGCACCCTAGCTCGCTCATCTTCCCTCATTTTGGTTTTGGTCAAGCTTTTACAGAGGAAG +AGTTAGCTGACTTTGAAACATTGCCAATCGAAGAATTGATGGAGGCTTTCTTTGAACGAGCACTAGCGAG +AGCGGCAGAAGCTGGTATTGCACCAGAAAATATCCTGTTGGATCCAGGAATTGGCTTTGGTCTGACCAAG +AAAGAAAATCTGCTTCTTTTACGGGACCTGGATAAACTACATCAGAAGGGCTATCCAATCTTTCTCGGAG +TGTCGCGCAAGCGATTTGTCATCAATATCCTAGAGGAGAATGGTTTTGAAGTCAATCCTGAGACAGAGCT +TGGTTTCCGCAATCGGGACACGGCTTCGGCTCATGTAACCAGTATCGCTGCAAGACAGGGTGTAGAAGTG +GTGCGCGTGCATGACGTAGCTAGTCACAGGATGGCAGTTGAAATTGCCTCTGCCATTCGTCTGGCTGATG +AAGCGGAAAATTTAGATTTAAAACAATATAAATAA +>ermBups_HG799494 +ATGCGTAATGTAGATAAAACATCTACTGTTTTGAAACAGACTAAAAACAGTGATTACGCA +GATAAATAA +>ermbTr_CP002121 +GCTTTTGATAGTCAAGCGAAATATAGCTACCTTATTGTAGAGAGGGGATTTGCTAAAAGG +TTGCAAAA +>rplD_AE007317 +ATGGCAAACGTAACATTATTTGACCAAACTGGTAAAGAAGCTGGCCAAGTTGTTCTTAGCGATGCAGTAT +TTGGTATCGAACCAAATGAATCAGTTGTGTTTGATGTAATCATCAGCCAACGCGCAAGCCTTCGTCAAGG +AACACACGCTGTTAAAAACCGCTCTGCAGTATCAGGTGGTGGACGCAAACCATGGCGTCAAAAAGGAACT +GGACGTGCTCGTCAAGGTTCTATCCGCTCACCACAATGGCGTGGTGGTGGTGTTGTCTTCGGACCAACTC +CACGTTCATACGGCTACAAACTTCCACAAAAAGTTCGTCGCCTAGCTCTTAAATCAGTTTACTCTGAAAA 
+AGTTGCTGAAAACAAATTCGTAGCTGTAGACGCTCTTTCATTTACAGCTCCAAAAACTGCTGAATTTGCA +AAAGTTCTTGCAGCATTGAGCATCGATTCTAAAGTTCTTGTTATCCTTGAAGAAGGAAATGAATTCGCAG +CTCTTTCAGCTCGTAACCTTCCAAACGTGAAAGTTGCAACTGCTACAACTGCAAGTGTTCTTGACATCGC +AAATAGCGACAAACTTCTTGTCACACAAGCAGCTATCTCTAAAATCGAGGAGGTTCTTGCATAA +>rpoB_AE007317 +TTGACAAGGCTTGGAACTTATTTACAAAGGAGAATCATCTTGGCAGGACATGACGTTCAATACGGGAAAC +ATCGTACCCGTCGTAGTTTTTCAAGAATCAAAGAAGTTCTTGACTTACCAAATTTGATTGAAATTCAAAC +TGACTCATTCAAAGCTTTCCTAGACCACGGTCTTAAGGAAGTGTTTGAAGATGTATTGCCAATTTCAAAC +TTCACAGACACAATGGAGTTGGAATTTGTTGGATATGAAATCAAGGAACCAAAATACACGCTAGAAGAAG +CTCGTATCCACGATGCTAGCTACTCAGCACCAATTTTTGTAACCTTCCGTTTGATCAATAAAGAAACAGG +CGAAATCAAGACCCAAGAAGTTTTCTTTGGTGATTTCCCAATCATGACAGAAATGGGTACTTTCATCATC +AATGGTGGTGAACGTATTATCGTTTCTCAGTTGGTCCGCTCACCAGGTGTTTACTTTAACGACAAAGTAG +ACAAAAATGGTAAGGTGGGCTATGGTTCAACTGTTATCCCTAACCGTGGAGCTTGGTTGGAACTTGAAAG +CGACTCAAAAGATATCACCTACACTCGTATCGACCGTACTCGTAAGATTCCATTTACAACCTTGGTTCGT +GCTCTTGGTTTCTCAGGTGATGATGAAATCTTTGATATCTTTGGTGACAGCGAATTGGTTCGCAACACTG +TTGAAAAAGATATCCACAAGAATCCAATGGACTCTCGTACAGACGAAGCCTTGAAAGAAATTTACGAACG +CCTTCGTCCAGGTGAGCCTAAGACAGCTGAAAGCTCACGTAGCTTGCTTGTGGCTCGCTTCTTTGACCCA +CGTCGCTATGACTTGGCAGCAGTTGGTCGTTACAAAATCAATAAAAAACTCAATGTTAAAACACGTTTGC +TCAACCAAACCATTGCAGAGCCATTGGTAGACCCTGAAACTGGAGAAATCTTGGTAGAAGCTGGTACGAT +TATGACTCGTAGCGTGATTGAAAGCATTGAAAGCCATTTGGATGGCGACTTGAACAAGATTGTCTACATC +CCAAACGATGCAGCCGTTGTGACTGAGCCTGTTGTTCTTCAAAAATTCAAGGTTGTTGCTCCAACTGATC +CAGATCGCGTCGTAACGATCATTGGTAATGCTAACCCAGATGACAAGGTTCGTACGGTGACTCCTGCAGA +TATCCTTGCTGAGATGAGCTACTTCCTCAACTTGGCTGAAGGACTTGGCCGTGTAGATGATATCGACCAC +CTTGGAAATCGTCGTATCCGTGCGGTTGGTGAATTGCTTGCCAACCAAGTACGTTTGGGACTTTCTCGTA +TGGAACGTAATGTCCGTGAACGTATGTCTGTTCAGGACAATGAAGTCTTGACACCACAACAAATTATCAA +TATCCGTCCTGTAACAGCTGCAGTTAAAGAATTCTTTGGTTCATCACAGTTGTCACAGTTCATGGACCAA +CACAACCCGCTTTCTGAGTTGTCTCACAAACGCCGTTTGTCAGCCTTAGGACCTGGTGGTTTGACTCGTG +ACCGTGCCGGATATGAAGTGCGTGACGTGCACTACACTCACTATGGTCGTATGTGTCCAATCGAAACACC +TGAAGGACCTAACATCGGTTTGATCAATAACTTGTCATCTTACGGACACTTGAACAAATATGGTTTTGTT +CAAACACCATACCGTAAGGTTGACCGTGAAACAGGTGTTGTCACGAACGAAATTGTTTGGTTGACAGCTG +ATGAAGAAGATGAATATACTGTAGCTCAGGCTAACTCTCGTCTGAATGAAGATGGAACCTTTGCTGAGAA +GATTGTCATGGGACGTCACCAAGGGGTCAACCAAGAGTATCCAGCTAATATTGTTGACTACATGGACGTT +TCACCAAAACAGGTAGTTGCCGTTGCGACAGCATGTATTCCTTTCTTGGAAAACGATGACTCCAACCGTG +CCCTCATGGGAGCCAATATGCAACGTCAGGCTGTGCCATTGATTAATCCTCAGGCACCTTACGTTGGTAC +TGGTATGGAATACCAAGCAGCCCACGATTCTGGTGCGGCTGTGATTGCTCAGTATGATGGTAAAGTTACT +TACGCAGATGCTGACAAGGTAGAAGTTCGTCGTGAAGATGGTTCATTGGATGTTTACCACATCCAAAAAT +TCCGTCGTTCAAACTCAGGTACTGCTTACAACCAACGCACTCTCGTAAAAGTTGGTGATGTCGTTGAAAA +AGGCGATTTCATCGCTGACGGACCTTCTATGGAAAATGGAGAAATGGCGCTTGGACAAAACCCAATCGTT +GCCTACATGACTTGGGAAGGTTACAACTTCGAGGATGCCGTTATCATGAGCGAACGCTTGGTGAAGGACG +ATGTCTACACATCTGTTCACCTTGAAGAATACGAATCAGAAACGCGCGATACAAAGCTTGGGCCTGAAGA +AATCACTCGCGAAATTCCAAACGTTGGTGAAGATGCCCTCAAAGACCTTGACGAAATGGGGATTATCCGT +ATTGGTGCTGAGGTTAAAGAAGGTGATATTCTTGTAGGTAAAGTAACACCTAAGGGTGAGAAAGATCTTT +CAGCTGAAGAACGTCTCTTGCACGCTATCTTTGGAGACAAGTCTCGTGAAGTGCGTGATACTTCTCTTCG +TGTACCACACGGTGCCGATGGTGTCGTTCGTGATGTTAAGATCTTTACACGTGTAAATGGAGATGAGTTG +CAATCAGGTGTTAACATGTTGGTTCGTGTTTACATCGCTCAAAAACGTAAGATTAAGGTCGGAGATAAAA +TGGCCGGACGTCACGGAAACAAAGGGGTTGTCTCTCGTATCGTTCCTGTAGAAGACATGCCTTACCTTCC +AGACGGAACTCCAGTCGACATCATGTTGAACCCACTTGGGGTGCCATCACGTATGAATATCGGTCAGGTT +ATGGAGCTTCACCTTGGTATGGCAGCTCGTACTCTTGGTATTCACATTGCGACACCAGTCTTTGATGGAG +CAAGTTCTGAAGATCTTTGGTCAACTGTTAAAGAAGCAGGTATGGATAGCGATGCCAAGACAATCCTTTA +CGATGGACGTACAGGTGAACCATTTGATAACCGTGTTTCTGTTGGAGTCATGTACATGATCAAACTCCAC 
+CACATGGTTGACGATAAATTGCACGCGCGTTCAGTCGGACCTTACTCAACTGTTACCCAACAACCACTCG +GAGGTAAAGCTCAGTTTGGTGGACAACGTTTCGGTGAGATGGAGGTTTGGGCTCTTGAAGCCTACGGTGC +GTCAAATGTCCTTCAAGAAATCTTGACTTACAAGTCGGACGATATCAACGGACGTTTGAAAGCCTATGAA +GCTATTACAAAAGGCAAACCAATTCCAAAACCAGGTGTTCCAGAATCCTTCCGAGTTCTTGTCAAAGAAT +TGCAATCTCTTGGTCTTGACATGCGTGTCCTAGACGAAGATGACCAAGAAGTGGAACTTCGCGACTTGGA +TGAAGGAATGGACGAAGATGTCATCCACGTAGATGACCTTGAAAAAGCCCGCGAAAAAGCAGCCCAAGAG +GCTAAAGCAGCCTTTGAAGCTGAAGAAGCTGAGAAAGCAACAAAAGCGGAAGCAACAGAAGAAGCTGCTG +AACAAGAATAA +>vanB_KC489787 +ATGAATAAAATAAAAGTCGCAATTATCTTCGGCGGTTGCTCGGAGGAACATGATGTATCGGTAAAATCCG +CAATAGAAATTGCTGCGAACATTAATACTGAAAAATTCGATCCGCACTACATCGGAATTACAAAAAACGG +CGTATGGAAGCTATGCAAGAAGCCATGTACGGAATGGGAAGCCGATAGTCTCCCCGCCATATTCTCCCCG +GATAGGAAAACGCATGGTCTGCTTGTCATGAAAGAAAGAGAATACGAAACTCGGCGTATTGACGTGGCTT +TCCCGGTTTTGCATGGCAAATGCGGGGAGGATGGTGCGATACAGGGTCTGTTTGAATTGTCTGGTATCCC +CTATGTAGGCTGCGATATTCAAAGCTCCGCAGCTTGCATGGACAAATCACTGGCCTACATTCTTACAAAA +AATGCGGGCATCGCCGTCCCCGAATTTCAAGTGATTGAAAAAGGTGGCAAACCGGAGGCGAGGACGCTTA +CCTACCCTGTCTTTGTGAAGCCGGCACGGTCAGGTTCGTCCTTTGGCGTAACCAAAGTAAACAGTACGGA +AGAACTAAACGCTGCGATAGAAGCAGCAGGACAATATGATGGAAAAATCTTAATTGAGCAAGCGATTTCG +GGCTGTGAGGTCGGCTGCGCGGTCATGGGAAACGAGGATGATTTGATTGTCGGCGAAGTGGATCAAATCC +GGTTGAGCCACGGTATCTTCCGCATCCATCAGGAAAACGAGCCGGAAAAAGGCTCAGAGAATGCGATGAT +TATCGTTCCAGCAGACATTCCGGTCGAGGAACGAAATCGGGTGCAAGAAACGGCAAAGAAAGTATATCGG +GTGCTTGGATGCAGAGGGCTTGCTCGTGTTGATCTTTTTTTGCAGGAGGATGGCGGCATCGTTCTAAACG +AGGTCAATACCCTGCCCGGTTTTACATCGTACAGCCGCTATCCACGCATGGCGGCTGCCACAGGAATCAC +GCTTCCCGCACTAATTGACAGCCTGATTACATTGGCGATAGAGAGGTGA +>vanD_EU999036 +ATGTTTAAGATTAAAGTTGCAGTTCTGTTTGGGGGCTGTTCAGAGGAACATAATGTTTCGATAAAATCTG +CGATGGAGATTGCCGCAAACATAGATACAAAAAAATATCAGCCTTATTATATTGGAATCACAAAATCCGG +CGTTTGGAAGATGTGTGAAAAACCTTGTTTGGGGTGGGAACAATATGCGGGGGATCCGGTTGTTTTTTCG +CCGGACAGAAGTACGCATGGTCTGCTGATACAAAAAGACACTGGGTATGAAATCCAGCCTGTAGATGTGG +TATTTCCGATGATTCATGGCAAGTTTGGCGAAGATGGATCCATACAAGGCTTGCTTGAATTGTCAGGCAT +TCCGTATGTGGGATGCGATATTCAAAGCTCCGTGATCTGCATGGATAAGGCGCTTGCATATACCGTTGTG +AAAAATGCGGGTATCGCTGTGCCTGGGTTCCGGATCCTTCAGGAGGGGGATCGCCTGGAAACGGAGGATT +TAGTATATCCCGTCTTTGTAAAGCCTGCCCGTTCTGGCTCATCCTTTGGCGTAAACAAGGTATGCAAGGC +AGAAGAACTGCAGGCAGCAATCAGAGAAGCAAGAAAATATGATAGCAAGATTTTGATTGAAGAGGCCGTT +ACCGGGAGTGAGGTAGGCTGCGCCATACTGGGAAACGAAAATGATCTCATGGCTGGCGAGGTGGATCAGA +TTGAGCTGAGACACGGCTTTTTTAAGATTCATCAGGAAGCACAGCCGGAGAAGGGATCTGAAAATGCAGT +TATCAGAGTTCCAGCCGCCTTACCGGATGAGGTAAGAGAACGGATTCGGAAAACAGCAATGAAGATTTAC +CGGATACTTGGCTGCCGAGGATTGGCCCGTATTGATCTGTTTTTGCGGGAGGACGGCTGCATTGTGCTGA +ATGAAGTGAATACCATGCCGGGTTTTACTTCCTACAGCCGTTATCCCCGCATGATGACAGCAGCCGGTTT +TACGCTTTCTGAAATACTGGATCGCTTGATTGAATTTTCACTTAGGAGGTAA +>vanE_FJ872411 +ATGAAGACAGTTGCGATTATCTTTGGCGGAGTTTCTTCTGAATATGAAGTTTCACTGAAA +TCTGCTGTAGCGATTATTAAAAATATGGAATCTATTGATTATAACGTAATGAAAATAGGG +ATCACCGAAGAAGGTCATTGGTATCTATTTGAAGGAACGACAGACAAAATAAAGAAAGAT +CGTTGGTTTTTAGATGAAAGCTGTGAAGAAATCGTAGTTGATTTCGCAAAAAAAAGCTTT +GTATTGAAAAACAGTAAAAAAATAATCAAGCCTGATATTTTATTCCCAGTTTTACATGGA +GGTTATGGTGAGAATGGTGCTATGCAGGGAGTATTTGAGTTATTAGATATTCCATATGTA +GGTTGTGGTATCGGAGCTGCAGCAATCTCTATGAATAAAATAATGCTCCATCAATTTGCT +GAAGCAATTGGTGTAAAAAGCACCCCTAGTATGATTATAGAAAAGGGACAAGACCTACAA +AAAGTCGATGCGTTTGCGAAAATACATGGATTTCCTTTATATATTAAACCGAATGAGGCA +GGCTCATCAAAAGGAATTAGCAAGGTAGAACGAAAAAGTGATTTATATAAAGCAATAGAC +GAAGCTTCAAAATATGATAGTCGTATTTTAATTCAAAAGGAAGTGAAAGGGGTAGAAATT +GGTTGTGGTATTTTAGGAAATGAACAATTGGTCGTTGGAGAATGTGACCAAATCAGTCTT +GTGGATGGCTTTTTCGATTATGAAGAGAAATACAATTTAGTAACAGCAGAAATTTTGTTA +CCAGCTAAACTATCAATAGACAAAAAAGAAGATATTCAGATGAAAGCAAAAAAACTATAC 
+AGACTATTAGGATGCAAAGGATTAGCGAGAATCGACTTTTTCTTAACTGATGACGGAGAA +ATTTTATTAAATGAAATCAATACAATGCCTGGTTTTACAGAGCATTCGAGATTTCCAATG +ATGATGAATGAGATTGGGATGGACTACAAAGAGATTATAGAAAACCTATTAGTATTGGCG +GTGGAAAATCATGAAAAAAAATTATCTACGATTGATTAA +>vanG_KF704242 +ATGCAGAAGAAAAAAATAGCTATTATTTTTGGCGGCAATTCAACAGAGTATGAGGTGTCATTACAATCGG +CATTTTCTGTTTTTGAAAATATCAATAAAGAAAAATTCGACATAGTTCCAATCGGAATTACCAGAAATGG +CGACTGGTATCATTACACAGGCAAAAAAGAAAAGATTGCAAATAATACTTGGTTTGAGGATAACGAAAAC +CTGTATTCTGTTGCGGTATCGCAAAACCGTTCTGTAAAAGGCTTTATAGAATTTAAGGAAGAAAAATTCT +ACATCATTAAGGTTGACTTGATATTTCCTGTATTGCACGGCAAGAACGGCGAGGACGGTACTTTGCAGGG +ATTATTTGAATTGGCAGGAATACCTGTTGTTGGGTGTGATACACTCTCGTCTGCTCTTTGTATGGACAAA +GATAAAGCACATAAACTTGTTAGCCTTGCGGGTATCTCTGTTCCAAAATCAGTAACATTCAAACGCTTTA +ACAAAGAAGCAGCGATGAAAGAGATTGAAGCGAATTTAACTTATCCGCTGTTTATTAAACCTGTTCGTGC +AGGCTCTTCCTTTGGAATAACAAAAGTAATTGAAGAGCAAGAGCTTGATGCTGCCATAGAGTTGGCATTT +GAACACGATACAGAAGTCATCGTTGAAGAAACAATAAACGGCTTTGAAGTCGGTTGTGCCGTACTTGGCA +TAGATGAGCTGATTGTTGGCAGAGTTGATGAAATCGAACTGTCAAGCGGCTTTTTTGATTATACAGAGAA +ATATACACTTAAATCTTCAAAGATATATATGCCTGCAAGGATTGATGCTGAAGCAGAAAAACGGATACAA +GAAACGGCTGTAACTATATATAAAGCTCTGGGCTGTTCGGGTTTTTCCAGAGTGGATATGTTTTATACAC +CGTCTGGCGAAATTGTATTTAATGAGGTAAACACAATACCAGGCTTTACCTCGCACAGTCGCTATCCAAA +TATGATGAAAGGCATTGGTCTATCGTTCGCCCAAATGTTGGATAAGCTGATAGGTCTGTATGTGGAATGA +>tetS_M_MH283012 +TTGAAAATTATTAATATCGGTATCTTAGCACATGTTGATGCAGGAAAAACTACTTTGACAGAAAGCTTAC +TATACAGTAGCGGAGCAATTAAAGAGTTAGGAAGTGTAGATAGCGGTACAACGAAAACGGATACTATGTT +TTTGGAACGCCAGAGAGGTATTACTATTCAGACCGCAATAACATCTTTTCAACGGGAAAATGTTAAAGTA +AATATTGTAGATACTCCTGGACACATGGATTTTTTGGCAGATGTATACCGTTCATTATCTGTTTTGGATG +GAGCTATTTTGCTAATCTCTGCAAAAGATGGAGTACAGTCACAAACTCGTATACTATTCCATGCACTTAG +AAAGATGAACATACCTATAATATTTTTTATTAACAAAATTGATCAAAATGGAATAAATTTGCCAGATGTT +TATCAAGATATTAAGGACAAACTTTCTGACGACATCATAATTAAGCAGACTGTGAATCTAAATTTGAAAC +CTTATGTAATAGATTATACTGAACCAGAACAATGGGAGACAGTAATTGTGGGAAATGATTATTTATTAGA +AAAATATACCATTGGGAAAACATTGAATATTGCAGAACTTGAAAAGGAGGAAAACGAAAGAATTCAAAGT +TGCTCCTTATATCCTGTTTATCACGGAAGTGCAAAGAATAATATTGGAATTAAACAACTTATAGAGGTAA +TTACTAGCAAATTATTTTCACCCACACAACTCAATTCAGATAAACTTTGTGGAAATGTTTTTAAAGTAGA +ATATTCAGATGATGGTCAACGGCTTGTCTATGTACGTCTTTATAGTGGAACGCTACATTTGCGAGACTCA +GTCAATATATCAGAAAAGGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGAGAATTACGCC +AGATAGATAAGGCAGAGCCTGGTGAGATTATTATTTTAAAAAATGAGCTTTTAAAACTAAATAACGTACT +TGGAGATAAAAAAAGATTACCACATAGAGAAATTCTTGAGAATCCTCTTCCTATGTTACAAACAACAATT +GAACCATGTAAATCAGTACAAAGAGAAAAGTTACTAGATGCACTTTTTGAAATATCCGATAGTGATCCCC +TTCTACAATATTATGTAGATACAGTAACTCACGAAATTGTGCTATCTTTTTTAGGTGAGGTCCAAATGGA +GGTAACTTGTACTCTGATTCAAGAAAAATATCATATTGAGATAGAAACAAGAAAACCAACTGTCATTTAT +ATGGAAAGACCATTAAAAAAATCTGAATTTACCATTGATATCGAAGTACCTCCAAATCCTTTCTGGGCTT +CTATTGGTTTATCTGTAACACCACTTCCTTTGGGTAGTGGCATTCAGTATGAGAGCCTGGTTTCTCTAGG +TTATTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGTATACGCTATGGGTGTGAACAAGGATTGTAC +GGTTGGAAATTAACAGACTGTAAGATCTGTTTTAAGTATGGTCTATATTACAGCCCTGTCAGTACGCCAG +CAGATTTCCGAATGCTTGCGCCTATTGTACTAGAGCAGGCTTTTAGAAAGAGTGGTACAGAGTTATTAGA +GCCATATCTTAGCTTCGAAATTTATGTACCACAAGAATATCTTTCGAGAGCATATAATGATGCTTCCAAA +TATTGTGCAAATATTTTAAATACTAAGTTAAAAGGTAACGAGGTCATTCTCATTGGTGAAATTCCAGCCC +GTTGTATTCAAGAGTATCGAAACAGTTTAACTTTCTTTACAAATGGACGCAGTGTCTGTTTAACAGAGTT +AAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAA +GTACGATATATGTTCAATAAAATAACTTAG +>tetM_MH283017 +ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCGGGAAAAACTACCTTAACAGAAAGCTTAT +TATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAGAGGTACAACGAAAACGGATAATACGCT +TTTAGAACGTCAGAGAGGAATTACAATTCAGACGGCGATAACCTCTTTTCAGTGGAAAAATACTAAGATA 
+AACATCATAGACACGCCAGGACATATGGATTTTTTAGCAGAAGTATATCGTTCATTATCAGTATTAGATG +GGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTGTTTCATGCACTTAG +GAAAATAGGTATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTT +TATCAGGATATTAAAGAGAAACTTTCTGCGGAAATTGTAATCAAACAGAAGGTAGAACTGCATCCTAATA +TGCGTGTAATGAACTTTACCGAATCTGAACAATGGGATATGGTAATAGAAGGAAATGATTACCTTTTGGA +GAAATATACGTCTGGGAAATTATTGGAAGCATTAGAACTCGAACAAGAGGAAAGCATAAGATTTCATAAT +TGTTCCCTGTTCCCTGTTTATCACGGAAGTGCAAAAAACAATATAGGGATTGATAACCTTATAGAAGTGA +TTACGAATAAATTTTATTCATCAACACATCGAGGTCAGTCTGAACTTTGCGGAAAAGTTTTCAAAATTGA +GTATTCGGAAAAAAGACAGCGTCTTGCATATATACGTCTTTATAGTGGCGTACTGCATTTGCGAGATTCG +GTTAGAATATCGGAAAAGGAAAAAATAAAAATTACAGAAATGTATACTTCAATAAATGGTGAATTATGTA +AAATCGATAAGGCTTATTCCGGGGAAATTGTTATTTTGCAGAATGAGTTTTTGAAGTTAAATAGTGTTCT +TGGAGATACAAAGCTATTGCCACAGAGAGAGAGAATTGAAAATCCCCTCCCTCTGCTGCAAACGACTGTT +GAACCGAGCAAACCTCAACAAAGGGAAATGTTACTTGATGCACTTTTAGAAATCTCCGACAGTGACCCGC +TTCTGCGATATTATGTGGATTCTGCGACACATGAAATCATACTTTCTTTCTTAGGGAAAGTACAAATGGA +AGTGACTTGTGCTCTGCTGCAAGAAAAGTATCATGTGGAGATAGAAATAAAAGAGCCTACAGTCATTTAT +ATGGAAAGACCGTTAAAAAAAGCAGAGTATACCATTCACATCGAAGTTCCACCGAATCCTTTCTGGGCTT +CCATTGGTCTATCTGTAGCACCGCTTCCATTAGGGAGCGGAGTACAGTATGAGAGCTCGGTTTCTCTTGG +ATACTTAAATCAATCGTTTCAAAATGCAGTTATGGAGGGGATACGCTATGGCTGTGAACAAGGATTGTAT +GGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAG +CAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGA +GCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAA +TATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTC +GGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTT +AAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAA +GTACGATATATGTTCAATAAAATAACTTAG +>tetAp_L20800 +ATGGTTAATAAACTTTCAGCATATAAAACTTATTTATTATTTTCAGCTATTACAGCAATGTGTTTTTCGT +TAGTAGCTACAGTTATGATGGTGTATCACATTGAAATAGTTCATTTAAATCCACTTCAGCTTATACTTGT +TGGAACTACTTTGGAATTAGCATGCTTTATATTTGAAATTCCTACAGCTATAGTTGCAGATGTGTATAGT +CGTAAACTATCTATTGTTATTGGGGGAGTTTTAACAGGAGTGGGATTTATTTTAGAAGGTTCTATTTCTA +GTTTTGTTTTCGTACTTGTAGCACAGATTGTATGGGGATTAGGGTCTACTTTTATCAGTGGCTCGCTTGA +AGCTTGGATTGCGGAAGAAGAGAAGAATAAAGATTTAGATGAAATTTATATAAAGGGAGCACAAGCAGGG +CAGATAGGAGCATTTATTGGAATAGTACTAAGCACTGTAATAGCTAATTTCTCTGTAAGGCTTCCTATTA +TAGTTAGTGGAGTTTTATTTATAATTCTTGCATTATTTTTATGGTTATATATGCCAGAAAATAATTTTAA +ACCATCTGCTCCTGGGGATTTAAATACATTCAAAAAGATGGTATATACATTTAAATCTGGTCTTAAATTT +GTAAAAAGTAAATCTATAATTATGATTTTACTTGCAGTAACTTTATTTTATGGATTATCAAGTGAAGGTT +ATGATAGACTTTCTAATGCGCATTTTTTACAAGATACTACACTTCCTAAACTTGGAAACCTTAGTTCAGT +GACTTGGTTTGGAATTTTTGGAATTTTAGGAATGATATTGAGCTTCATAGTAATGCATTTTATGGCAAAG +AATCTTAAGAATGAGGATAATAGGAAAAATGGAAAACTATTATTATGCATAAATATACTTTATATATCGT +CTATGTTGATATTTGCTCTTACAAGAAACTTTAGTTTAATGTTAATAGCTTATTTGGCAACAAATACCTT +TAGAATTATAAATAAACCTATATTCAGTGCGTGGTTAAATGGGCATATAGATGATAATTCTAGAGCTACT +GTGCTTTCTATAAATGGACAAATGAATTCCTTAGGTCAAATTTTAGGTGGACCGATTATAGGAATCATAG +CTACAAATATTTCAGTAAGTATTGGTATAGTATGTACTTCGTTATTAGTAACACCGGTATTAGTGTTATA +TATTGTTGCTATGATAATTGATAAAAAGGTGGATGATAGAGTTGGAGGTATTGATTATGAAGAAAATAAT +TAA +>tetBp_L20800 +ATGAAGAAAATAATTAATATAGGAATCGTAGCACACGTGGATGCAGGAAAAACAACTATAACAGAAAACT +TATTATATTATAGTGGAGCTATAAAATCAGTTGGAAGAGTTGATTTAGGCAATACACAGACGGATTCTAT +GGAGCTTGAGCGTAAGAGAGGAATTACCATTAAATCGTCAACCATATCTTTTAATTGGAATAATGTTAAG +GTGAATATTATTGATACTCCAGGACATGTGGATTTTATTTCGGAAGTTGAACGTTCATTAAATAGCTTAG +ATGGAGCAATACTAGTTATATCAGGAGTAGAGGGGATTCAGTCACAAACAAGAATATTATTTGACACATT +AAAGGAGTTAAATATTCCAACAATAATTTTTGTAAATAAGCTAGATAGAATTGGGGCAAATTTCAACAAA 
+GTATTTGAAGAAATAAAGAAGAATATGTCCAATAAAGTAGTTAGATTACAAGAAGTATATGATGTAGGAA +GCAAAGCTGTTTATATAAAAAAACTATTTGATACATGCATAATAAATGATGATGCTATTAATGTTTTATC +AGACTTAGACGAAGCATTTTTAGAAAGATATATTGGTGGAATAGAACCTGATAAAGAAGAAATACAAGAA +AAGCTTTCATTATATGCAAGAGAAGGAAGTCTATATCCAGTATTTTGTGGTGCTGCAGCAATTGGACTTG +GAATTGAAGATTTATTAGATGGAATTTGTAGTTATTTTCCATTTGCAAGTAATGATTGTGAAAGTGATTT +ATCTGGGGTAGTATTTAAAATCGAAAGAACAAGTAAAAATGAAAAGAAGGTTTATGTAAGATTATTTGGA +GGAAAAATATCTGTAAGAGATAAAATTCAAGTACCTAATAAGGAGATAGCAGAAAAAGTAAAGAAAATTA +ATAGGTTAGAAAATGGGGGAGTTGTTGAAGCACAGAGGATAGAAGCAGGGGATATAGGTATTTTATATGG +ACTTACAAGTTTCCAAGTGGGAGATGTTATTGGAATTTCAAATGATAAAATTAAAAATATATCTATAGCT +AAACCAGCATTAAAAACAACAATTTCTGCAATTGATAAAGAAAAAAATCCAGAGCTATTTAAAGCATTAA +CATTACTTGCAGAGGAAGATCCACTACTCGCCTTCGCGATGAATGACATAGATAAAGAAATTTATGTCAA +CTTATTCGGTGAAGTTCAAATGGAAATACTAAGTTCCATGTTAGATGATTTATATGGAATAAAAGTAGAG +TTTTCGAATATTGAGACTATCTATAAGGAAACACCTAAAGGTTTTGGAGCGTCAATAATGCATATGCAGG +AAGACTTAAATCCATTTTGGGCGACAGTAGGCTTAGAAATAGAACCAGCAGGGAGAGGCGAAGGTCTTAG +GTATATTTCTAATGTTTCAGTAGGGTCATTGCCAAAATCTTTTCAAAATGCAATTGAAGAAGCAGTTATT +AAGACAAGTAAACAAGGATTATTTGGATGGGAGGTTACAGATGTAAAAGTCACTCTTAGCTGTGGTGAAT +TTTTTAGTCCAGCCAGCACTCCAGCAGATTTTAGAAATGTGACACCTATGGTATTCATGGAAGCATTATA +TAAAGCACAAACTGTTTTATTAGAGCCATTACATGAGTTTGAGTTAAAGATTCCTCAAAATGCTTTAAGC +AAAGCGGTATGGGATTTAGAAACTATGAGGGCAACCTTTGATAATCCTATTGTTATAGGGGATGAATTCT +CAATAAAGGGATTAATTCCAGTAGAAAATTCAAAAGAATATAAAATGAAAATAGCTTCATATACAGAAGG +TAGAGGAATGTTTGTGACAAAATTTTATGGGTATAAGGAAGCTTCAGCTGAATTTTCAAAAGCACGCAAA +AAAACAACGTATGATCCATTGAATAAAAAAGAGTATTTGCTTCATAAACTAAACGCAATTAGAGATTAA +>tetAQ2_Z21523 +GTGCGTTTCGACAATGCATCTATTGTAGTATATTATTGCTTAATCCAAATGAATATTATAAATTTAGGAA +TTCTTGCTCACATTGATGCAGGAAAAACTTCCGTAACCGAGAATCTGCTGTTTGCCAGTGGAGCAACGGA +AAAGTGCGGCCGTGTGGATAATGGTGACACCATAACAGACTCTATGGATATAGAGAAACGTAGAGGAATT +ACTGTTCGGGCTTCTACGACATCTATTATCTGGAATGGAGTGAAATGCAATATCATTGACACTCCGGGAC +ACATGGATTTTATTGCGGAAGTGGAGCGGACATTCAAAATGCTTGATGGAGCAGTCCTCATCTTATCCGC +AAAGGAAGGCATACAAGCGCAAACAAAGTTGCTGTTCAATACTTTACAAAAACTGCAAATCCCGACAATT +ATATTTATCAATAAAATTGACCGTGACGGTGTGAATTTAGAGCGTTTGTATCTGGATATAAAAACAAATC +TGTCTCAAGATGTCCTGTTTATGCAAACTGTTGTCGATGGATTGGTTTATCCGATTTGCTCCCAAACATA +TATAAAGGAAGAATACAAAGAATTTGTATGCAACCATGACGACAATATATTAGAACGATATTTGGCGGAT +AGCGAAATTTCACCGGCTGATTATTGGAATACGATAATCGATCTTGTGGCAAAAGCCAAAGTCTATCCGG +TACTACATGGATCAGCAATGTTCAATATCGGTATCAATGAGTTGTTGGACGCCATCTCTTCTTTTATACT +TCCTCCAGAATCAGTCTCAAACAGACTTTCAGCTTATCTCTATAAGATAGAGCATGACCCCAAAGGACAT +AAAAGAAGTTTTCTAAAAATAATTGACGGAAGTCTGAGACTTCGAGACATTGTAAGAATCAACGATTCGG +AAAAATTCATCAAGATTAAAAATCTAAAGACTATTTATCAGGGCAGAGAGATAAATGTTGATGAAGTGGG +GGCCAATGATATCGCGATTGTAGAAGATATGGAAGATTTTCGAATCGGAGATTATTTAGGTACTAAACCT +TGTTTGATTCAAGGGTTATCTCATCAGCATCCCGCTCTCAAATCCTCCGTCCGGCCAGACAGGTCCGAAG +AGAGAAGCAAGGTGATATCCGCTCTGAATACATTGTGGATTGAAGACCCGTCTTTGTCCTTTTCCATAAA +CTCATATAGTGATGAATTGGAAATCTCGTTATATGGTTTGACACAAAAGGAAATCATACAGACATTGCTG +GAAGAACGATTTTCCGTAAAGGTCCATTTTGATGAGATCAAGACTATCTACAAAGAACGACCTGTAAAAA +AGGTCAATAAGATTATTCAGATCGAAGTGCCACCCAACCCTTACTGGGCCACAATAGGGCTGACGCTTGA +ACCCTTGCCGTTAGGGACAGGGTTGCAAATCGAAAGTGACATCTCCTATGGTTATCTGAACCATTCTTTT +CAAAATGCCGTTTTTGAAGGGATTCGTATGTCTTGCCAATCTGGTTTACATGGATGGGAAGTGACTGATC +TGAAAGTAACTTTTACTCAAGCCGAGTATTATAGCCCGGTAAGTACACCTGCTGATTTCAGACAGCTGAC +CCCTTATGTCTTCAGGCTGGCCTTGCAACAGTCAGGTGTGGACATTCTCGAACCGATGCTCTATTTTGAG +TTGCAGATACCCCAAGCGGCAAGTTCCAAAGCTATTACAGATTTGCAAAAAATGATGTCTGAGATTGAAG +ACATCAGTTGCAATAATGAGTGGTGTCATATTAAAGGGAAAGTTCCATTAAATACAAGTAAAGACTACGC +CTCAGAAGTAAGTTCATACACTAAGGGCTTAGGCGTTTTTATGGTCAAGCCATGCGGGTATCAAATAACA 
+AAAGGCGATTATTCTGATAATATCCGCATGAACGAAAAAGATAAACTTTTATTCATGTTCCAAAAATCAA +TGTCATCAAAATAA +>tetS_FN555436 +TTGAAAATTATTAATATCGGTATCTTAGCACATGTTGATGCAGGAAAAACTACTTTGACAGAAAGCTTAC +TATACAGTAGCGGAGCAATTAAAGAGTTAGGAAGTGTAGATAGCGGTACAACGAAAACGGATACTATGTT +TTTGGAACGCCAGAGAGGTATTACTATTCAGACCGCAATAACATCTTTTCAACGGGAAAATGTTAAAGTA +AATATTGTAGATACTCCTGGACACATGGATTTTTTGGCAGATGTATACCGTTCATTATCTGTTTTGGATG +GAGCTATTTTGCTAATCTCTGCAAAAGATGGAGTACAGTCACAAACTCGTATACTATTCCATGCACTTAG +AAAGATGAACATACCTATAATATTTTTTATTAACAAAATTGATCAAAATGGAATAAATTTGCCAGATGTT +TATCAAGATATTAAGGACAAACTTTCTGACGACATCATAATTAAGCAGACTGTGAATCTAAATTTGAAAC +CTTATGTAATAGATTATACTGAACCAGAACAATGGGAGACAGTAATTGTGGGAAATGATTATTTATTAGA +AAAATATACCATTGGGAAAACATTGAATATTGCAGAACTTGAAAAGGAGGAAAACGAAAGAATTCAAAGT +TGCTCCTTATATCCTGTTTATCACGGAAGTGCAAAGAATAATATTGGAATTAAACAACTTATAGAGGTAA +TTACTAGCAAATTATTTTCACCCACACAACTCAATTCAGATAAACTTTGTGGAAATGTTTTTAAAGTAGA +ATATTCAGATGATGGTCAACGGCTTGTCTATGTACGTCTTTATAGTGGAACGCTACATTTGCGAGACTCA +GTCAATATATCAGAAAAGGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGAGAATTACGCC +AGATAGATAAGGCAGAGCCTGGTGAGATTATTATTTTAAAAAATGAGCTTTTAAAACTAAATAACGTACT +TGGAGATAAAAAAAGATTACCACATAGAGAAATTCTTGAGAATCCTCTTCCTATGTTACAAACAACAATT +GAACCATGTAAATCAGTACAAAGAGAAAAGTTACTAGATGCACTTTTTGAAATATCCGATAGTGATCCCC +TTCTACAATATTATGTAGATACAGTAACTCACGAAATTGTGCTATCTTTTTTAGGTGAGGTCCAAATGGA +GGTAACTTGTACTCTGATTCAAGAAAAATATCATATTGAGATAGAAACAAGAAAACCAACTGTCATTTAT +ATGGAAAGACCATTAAAAAAATCTGAATTTACCATTGATATCGAAGTACCTCCAAATCCTTTCTGGGCTT +CTATTGGTTTATCTGTAACACCACTTCCTTTGGGTAGTGGCATTCAGTATGAGAGCCTGGTTTCTCTAGG +TTATTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGTATACGCTATGGGTGTGAACAAGGATTGTAC +GGTTGGAAATTAACAGACTGTAAGATCTGTTTTAAGTATGGTCTATATTACAGCCCTGTCAGTACGCCAG +CAGATTTCCGAATGCTTGCGCCTATTGTACTAGAGCAGGCTTTTAGAAAGAGTGGTACAGAGTTATTAGA +GCCATATCTTAGCTTCGAAATTTATGTACCACAAGAATATCTTTCGAGAGCATATAATGATGCTTCCAAA +TATTGTGCAAATATTTTAAATACTAAGTTAAAAGGTAACGAGGTCATTCTCATTGGTGAAATTCCAGCCC +GTTGTATTCAAGAGTATCGAAACAGTTTAACTTTCTTTACAAATGGACGCAGTGTCTGTTTAACAGAGTT +AAAAGGTTATCAGGTTACTAACATTAAGTCTGCTTTCCAACCACGTCGTCCAAATAATAGAATAGACAAA +GTAAGGCATATGTTTAATAAAATCAACTTACATTGA +>tetT_L42544 +ATGAAAATTATTAATATAGGAATATTAGCACATGTTGATGCAGGTAAAACAACTGTTACAGAAGGTTTAT +TATATAAAAGTGGGGCGATTAATAAAATTGGAAGAGTTGATAATGCTACAACGACAACGGATTCGATGGA +ACTTGAAAGAGATAGGGGAATAACTATACGGGCGTCTACAGTTTCATTTAATTACAATGATACAAAGGTA +AATATCATAGATACACCTGGGCACATGGATTTCATAGCCGAAGTTGAGCGAACTCTGAAAGTGTTAGATG +GAGCTATTTTAGTAATTTCAGCAAAAGAAGGAATTCAAGTCCAAACTAAAGTGATTTTTAATACTTTAGT +GAAATTAAATATACCAACACTTATATTTGTGAATAAAATAGATCGAAAGGGAGTATGTTTGGATGAGATA +TACACTCAAATACAGGAGAAATTAACTTCTAATCTTGCAATAATGCAATCAGTTAAAATAAAAGATAAAG +GTGATTTTGAATTGACAAATGTAAGGGATGATAAAGTAATTCAAAGTCAAATAATAGAGAAGTTACTGGA +TATAAATGATTATCTAGCAGAAAAATATATAAATGGCGATGTCATTGCAGAAAAAGAATATAATGATGTA +TTTTTGGATGAGATTAATAACTGCAATCTTTATCCTGTATTTCATGGTTCGGCTTTAAAAAATATTGGAA +TTGACGAGCTATTATTTGCCATTACTAAATATCTTCCTACCAAGAGCTATAATACTGAAGACCTTTTATC +AGCGTATGTTTATAAGATTGATAGGGATGAAAAATCTAGAAAGATGACTTTCTTAAGAGTATTCAGTGGG +AATATAAGGACACGTCAAGATGTTTATATAAATGGCACAGAAGAAACTTTCAAGATAAAAAGTCTGGAAT +CAATTATGAATGGTGAAATTGTGAAGGTAGGTCAGGTTAATAGTGGGGATATTGCTATTATTTCTAATGC +TAATTCTCTGAAGATAGGTGATTATATTGGTAAGAAATATGACGGGATTTTAGATATAAAGATAGCCCAA +CCGGCATTGAGAGCATCAATTAAACCTTGTGATTTAAGCAAAAGAAGCAAACTGATAGAAGCACTATTTG +AATTAACTGAAGAAGACCCATTTCTCGATTGTGAAATTAACGGAGATACTGGAGAAATCATATTGAGGCT +ATTTGGAAATATTCAAATGGAAGTAATAGAATCACTACTTAAAAGCCGATACAAAATAGATGCTAGATTT +GGTGAATTGAAAACAATATATAAAGAACGACCTAAGAGAAACTCTAAAGCAGTAATCCATATAGAGGTTC +CACCAAATCCTTATTGGGCATCTATTGGACTGTCAATAGAACCACTACCAATAGGGTCAGGATTATTATA 
+TAAGACAGAGGTGTCCTATGGATATTTAAATAATTCATTTCAAAATGCAGTAAAAGATGCTGTAGAGAAG +GCTTGTAAAGAAGGGCTTTATGGATGGGAAGTTACAGACTTAAAGGTAACTTTTGACTACGGATTATACT +ATAGCCCGGTAAGTACCCCCTCTGACTTTAGGAATTTAACACCATATGTATTTTGGGAAGCTCTTCGAAA +AGCAGGAACTGAAATATTAGAACCTTATTTAAAATATACAGTTCAAGTTCCAAATGATTTCTGCGGAAGG +GTTATGAGTGATCTTAGAAAGATGAGGGCTTCTATTGAAGATATAATAGCCAAGGGAGAGGAGACAACTT +TAAGTGGAAAGATACCTGTTGATACATCGAAGTCCTATCAGTCAGAATTACTTTCTTATTCAAATGGAAA +GGGTATATTTATTACTGAGCCTTATGGGTATGATATATATAATGATAAGCCTATAATTAATGATATTGGG +AACGACAATAATGATAGCAACAAGGAAGGGTTAAGATATTTATTTCAAAAACAGGATGAAAATTGA +>tetW_AJ222769 +ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAAAGACGACCTTGACGGAGAGCCTGC +TATATGCCAGCGGAGCCATTTCAGAACCGGGGAGCGTCGAAAAAGGGACAACGAGGACGGACACCATGTT +TTTGGAGCGGCAGCGTGGGATTACCATTCAAGCGGCAGTCACTTCCTTCCAGTGGCACAGATGTAAAGTT +AACATTGTGGATACGCCCGGCCACATGGATTTTTTGGCGGAGGTGTACCGCTCTTTGGCTGTTTTAGATG +GGGCCATCTTGGTGATCTCCGCTAAAGATGGCGTGCAGGCCCAGACCCGTATTCTGTTCCATGCCCTGCG +GAAAATGAACATTCCCACCGTTATCTTTATCAACAAGATCGACCAGGCTGGCGTTGATTTGCAGAGCGTG +GTTCAGTCTGTTCGGGATAAGCTCTCCGCCGATATTATCATCAAGCAGACGGTGTCGCTGTCCCCGGAAA +TAGTCCTGGAGGAAAATACCGACATAGAAGCATGGGATGCGGTCATCGAAAATAACGATGAATTATTGGA +AAAGTATATCGCAGGAGAACCAATCAGCCGGGAAAAACTTGCGCGGGAGGAACAGCAGCGGGTTCAAGAC +GCCTCCCTGTTCCCAGTCTATCATGGCAGCGCCAAAAATGGCCTTGGCATTCAACCGTTGATGGATGCGG +TGACAGGGCTGTTCCAACCGATTGGGGAACAGGGGGGCGCCGCCCTATGCGGCAGCGTTTTCAAGGTTGA +GTACACCGATTGCGGCCAGCGGCGTGTCTATCTACGGTTATACAGCGGAACGCTGCGCCTGCGGGATACG +GTGGCCCTGGCCGGGAGAGAAAAGCTGAAAATCACAGAGATGCGTATTCCATCCAAAGGGGAAATTGTTC +GGACAGACACCGCTTATCAGGGTGAAATTGTTATCCTTCCCAGCGACAGCGTGAGGTTAAACGATGTATT +AGGGGACCAAACCCGGCTCCCTCGTAAAAGGTGGCGCGAGGACCCCCTCCCCATGCTGCGGACGACGATT +GCGCCGAAAACGGCAGCGCAAAGAGAACGGCTGCTGGACGCTCTTACGCAACTTGCGGATACTGACCCGC +TTTTGCGTTGCGAAGTGGATTCCATCACCCATGAGATCATTCTTTCTTTTTTGGGCCGGGTGCAGTTGGA +GGTTGTTTCCGCTTTGCTGTCGGAAAAATACAAGCTTGAAACAGTGGTAAAGGAACCCTCCGTCATTTAT +ATGGAGCGGCCGCTCAAAGCAGCCAGCCACACCATCCATATCGAGGTGCCGCCCAACCCGTTTTGGGCAT +CCATAGGACTGTCTGTTACACCACTCTCGCTTGGCTCCGGTGTACAATACGAGAGCCGGGTTTCGCTGGG +ATACTTGAACCAGAGTTTTCAAAACGCTGTCAGGGATGGTATCCGTTACGGGCTGGAGCAGGGCTTGTTC +GGCTGGAACGTAACGGACTGTAAGATTTGCTTTGAATACGGGCTTTATTACAGTCCGGTCAGCACGCCGG +CGGACTTCCGCTCATTGGCCCCGATTGTATTGGAACAGGCATTGAAGGAATCGGGGACGCAGCTGCTGGA +ACCTTATCTCTCCTTCATCCTCTATGCGCCCCAGGAATACCTTTCCAGGGCTTATCATGATGCACCGAAA +TACTGTGCCACCATCGAAACGGCCCAGGTAAAAAAGGATGAAGTTGTCTTTACTGGCGAGATTCCCGCCC +GCTGTATACAGGCATACCGTACTGATCTGGCCTTTTACACCAACGGGCGGAGCGTATGCCTTACAGAGCT +GAAAGGATATCAGGCCGCTGTCGGTCAGCCGGTCATCCAGCCCCGCCGTCCAAACAGCCGCCTGGACAAG +GTGCGCCATATGTTTCAGAAGGTAATGTAA +>tet32_AJ295238 +ATGAAAATAATTAACTTAGGCATTCTGGCTCACGTTGACGCAGGAAAGACAACATTAACGGAAAGTTTAT +TGTATACCAGTGGTGCAATTGCAGAACTAGGGAGCGTAGATGAAGGCACAACAAGGACAGATACAATGAA +TTTGGAGCGTCAAAGGGGAATCACTATCCAGACAGCAGTGACATCTTTTCAGTGGGAGGATGTAAAAGTC +AACATTATAGATACGCCAGGCCATATGGATTTTTTAACCGAAGCATACCGCTCTTTATCTGTCCTTGACG +GAGCTGTTTTAGTCATTTCGGCAAAAGACGGCGTACAGGCACAGACGCGTATATTATTCCATGCGCTTCA +GAAAATGAACATTCCGACAATTATCTTTATAAATAAGATAGACCAAAATGGAATCGACCTACAGCGTGTT +TACCAAAGCATTAAAGACAAACTTACCAGTGATATGATTGTCATGCAGGAGGTTTCCCTGTCGCCAAAGA +TAACCATGACCGATATTTCTGATTTGGACAAATGGGATATGATTATTTCCGGAAGCGATGAACTATTAGA +ACGATATGTTGCAGAGGATTCTTTGGATATACAGGAATTACAATATGAAAAGTGCAAAAGAACCAGATGC +TGCTCTTTGTTTCCTGTTTATCATGGGAGTGCAAAAGACAATTTAGGAACAGAAAAACTGATTGAAGCGA +TTACAGAAACTTTCATTACAGAAACAGACGATATTCAGTCTGAATTATGTGGATATGTTTTTAAGGTTGA +GTATACAGAGCGGAAAAAACGGCTTTCTTATTTACGCCTGTATCATGGGACGCTCCATTTACGGGATACC +CTGCTGCTGTCAAAAAAGGAAAAAATAAAGATTACAGAAATGTGTATTCCGTCAAATGGTGAAATCGTCC 
+CGGTTGACCATGCCTGTCCGGGAGAAATTGTTATTTTAGCTGATGATACTTTGAAACTGAACGACATTCT +GGGAAATGAAAAACTCCTGCCTCACAAAACACGGATTGATAATCCCATGCCATTACTTCGGACAACGGTA +GAGCCGCAAAAGCCGGAGCAAAGGGAAGCCCTGTTAAATGCCCTCACAGAGATTGCTGATACAGACCCTC +TTTTGCATTTTGACATTGATACTGTTACACATGAGATTATATTATCTTTTTTGGGAAAAGTACAGTTAGA +AGTTATTTGTTCGCTATTAGAAGAAAAATATCATGTGGGCGTGGCTATGAAAGAGCCTTCGGTTATTTAT +CTGGAAAGACCGCTTAGAAAAGCAGAATATACCATCCACATAGAAGTCCCGCCAAATCCTTTCTGGGCTT +CTGTCGGGTTGTCCATAGAGCCGCTCCCTATTGGAAGCGGAGTGCAGTATGAAAGCAGAGTTTCACTTGG +ATATTTAAATCAATCGTTCCAAAATGCGGTTATGGAGGGGGTTCTTTATGGCTGCGAGCAGGGGCTGTAT +GGATGGAAAGTGACAGACTGTAAAATCTGTTTTGAATATGGATTGTATTATAGTCCTGTAAGTACCCCCG +CAGACTTTCGGCTGCTTTCCCCTATCGTATTGGAGCAGGCTTTAAAAAAAGCAGGGACAGAACTATTAGA +GCCATATCTCCACTTTGAAATTTATGCACCGCAGGAATATCTCTCACGGGCGTATCATGATGCTCCAAGG +TATTGTGCAGATATTGTAAGTACTCAGATAAAGAATGACGAGGTCATTCTGAAAGGAGAAATCCCTGCTA +GATGTATTCAAGAATACAGGAACGATTTAACTTATTTCACAAATGGGCAGGGAGTCTGCTTGACAGAGTT +AAAAGGATACCAGCCAGCTATTGGTAAATTTATTTGCCAACCCCGCCGCCCGAATAGCCGTATAGATAAG +GTTCGGCATATGTTCCACAAGTTAGCTTAA +>tet36_AJ514254 +ATGAGAACTATAAATATAGGTATTCTTGCACATATTGATGCAGGAAAGACCTCCATTACAGAGAACTTGC +TATTTGCGAGTGGAGCAACCATAGTACGTGGAAGTGTGGACAAAGGAAACACTACAACCGATTCGATGGA +TATCGAAAAACGAAGAGGTATCACAGTTAGAGCGTCTACAACATCTATTCAATGGAATGATACAAAGATT +AATATCATCGACACTCCTGGACACATGGACTTTCTGGCAGAGGTAGAACGCACTTTTAGGATGCTAGATG +GTGCTATACTTGTGGTGTCTGCCAAAGAGGGCATTCAAGCTCAAACAAGGTTGTTGTTCAATGTCCTGCA +ACAACTAGAAATACCTACAATTCTATTCGTCAACAAAATAGACAGAGAGGGAGTCAATCTAAATCAGCTT +TATTTAGAGATACAAAATAGCCTTTCTAAAGATATTATCTTTATGCAATCCGTTGAAGGCAAGGAATTAA +CATCTAGCTGTACAATACACTACATATCAGAAAAGAACAGAGAAACAATTTTAGAGAAAGATGATCTCTT +GCTTGAAAAATACTTGAGTGATACACAGCTTTCTAATTTAGATTATTGGAATTCAATGGTTCGTCTTGTT +CAAGCTGCTAAATTACATCCTATCTATCATGGTTCAGCAATGTATGGCATTGGTATTGAAGATTTGCTAA +ACTCAATCACTACTTTTATCGAAACATCTCTACCTCAAGAGAACGCTTTGTCTGCCTATGTTTATAAAAT +TGAGCATAATAAGAAGGAACAGAAACGAGCCTATCTAAAGATTATAGGTGGAACCCTTAAATCTCGAAAA +TTATATAGCCTCAATGGCTCAGATGAGAATCTGAAGATAAGAGGTTTAAAGACCTTTTACTCAGGAGACG +AAATAGATGTAGACGAAGTTTTTACAAATGATATTGCAATTGCAGATCATGCTGATAACTTAATGGTAGG +AGATTATCTAGGAATAATGCCAAACTTATTCGACAAATTGAATATTCCTAGTCCTGCTCTCAAATCGTCT +ATACATCCTGCAAAAGTAGAGAATAGGAGTAAATTGATTTCTGCTATGAATGTATTATCAGTAGAAGATC +CATCTTTGGCCTTTAGCATTAATGCTGATAATAATGAATTGGAGGTTTCGCTTTATGGAGCAACTCAACG +GGAGGTGATTTTGACTTTATTGGAAGAGAGATTTTCGGTAGATGCTTACTTTGAAGAGGTGAAAACTATC +TATAAAGAACGTCTTAAAACAAAATCGGAATACACCATTCATATCGAAGTGCCACCTAATCCGTATTGGG +CATCTATTGGCTTGATTATAGAGCCTTTGCCAATTGGGGCGGGACTTGTAATGGAGAGTGAAATATCATT +GGGATATTTGAATCGATCCTTTCAGAATGCAGTATTCGATGGAGTCAAGAAAGCCTGTGAATCGGGTTTG +TACGGTTGGGAAGTAACTGACCTTAAAGTCACTTTTTCTCACGGAATCTATTATAGCCCAGTGAGTACAC +CTGCCGACTTTAGAAGTTTAGCACCTTATGTTTTTCGATTAGCTTTGCAACAAGCTGATGTTGAGTTATT +GGAGCCAATCTTAGATTTTAAATTGCAAATTCCACTAGCTGTGAATGCTAGAGCTATTACAGACATCAAC +AAGATGCAAGGCGAAATATCTACTATTACTTCAGATGGTGATTGGACTACTATTTTGGGTAATATTCCTT +TAGATACTAGTAAAGAATACTCAGCAGAGGTCAGTTCCTACACACAAGGCTTGGGCGTTTTTGTTACTCG +ATTTTCGGGTTATCGACCTACCAACAAAAAGGTAAGCAGAAGTGTAGAACTGAATGAAAAAGATAAGCTG +ATGTATATGTTTGAGAAGGAAAGTATCAAATAA +>tet44_FN594949 +ATGAAAATAATCAACATTGGTATTCTTGCTCATGTAGATGCAGGAAAGACGACCTTAACGGAAAGTCTGC +TTTATACAAGTGGAGCAATTTTAGAATTAGGCAGTGTAGATAAGGGAACAACAAGGACAGATACTATGTT +TTTAGAACGTCAGCGTGGAATCACAATTCAGGCAGCAGTTACTTCTTTTAATTGGAATGACTACAAAATC +AATATTGTAGATACTCCTGGACATACAGATTTTATAACAGAAGTGTATCGTTCCTTATCTGTTCTTGATG +GAGCAATTTTAGTAATTTCTGCTAAAGATGGTGTACAAGCACAAACCCGAATACTATTCCATGCACTTCA +AAAAATGAATATACCAACAATTATTTTTATAAATAAAATAGATCAGGATGGAATTAACTTAAATAATATT 
+TATCAAAATATCAAAGAAAAACTTTCAAATGATATTATTGTTATGCAAAATGTAACATTAACTCCAGAAA +TATCAATTAAAAATATCATTGATTTAGATGATTGGGATCCTGTAATTTCCAAAAATGATAAACTTTTAGA +AAAATATATTGTAGGAGAAAAATTGACTATACAAGAATTAATGTATGAAGAATATAGGTGTGTTAAAAAA +GGTTCGTTGTTTCCTATATACCATGGAAGTGCTAGAAATAATATAGGGACTCAACAACTTATCGAAGCTA +TTTCAAATCTTTTTTGTTCTGAAATGAATGAGAATGATTCAGAACTATGTGGAAGAGTTTTTAAAATTGA +ATATACAGACCATAAGCAAAGATTAGTTTATTTGCGTCTTTATAGTGGAACATTACACTTACGAGATACA +ATTATATTGCCAGAAAAAAAGAAAGTGAAACTTACAGAAATATATATTCCTTCAAATGGAGAAATGATAC +AGACAAAAACAGTTTGTTCTGGAGATATTTTTATTATACCTAACAATACATTAAGATTGAACGATATTAT +AGGAAATGAAAAGCTTTTGCCATGCAATGTATGGAATGACAAGACTGTACCAATACTACGAACAAGAATT +GAACCGATAAAAATAGAAGAGAGAGAAAAATTATTGGATGCTCTTACAGAAATTGCAGATACTGATCCTC +TTTTACGTTATTATGTTGATACGATAACACATGAAATCATCATTTCTTTTTTAGGAACAGTGCAGTTAGA +AGTTATCTGTTCTCTGTTGATTGAAAAATATCACATAAACATAAGAATCGAAGATCCAACCGTAATTTAT +TTGGAAAAACCATTACAAAAGGCAGATTATACTATTCATATTGAAGTACCACCAAATCCATTTTGGGCAT +CGATTGGATTATCAATAACTCCACTTCCAATTGGCAGTGGAATACAGTACGAAAGCAAAGTTTCACTCGG +TTATTTAAATCAAAGTTTCCAAAATGCAGTAAGAGAAGGTATTAATTATGGACTGGAGCAAGGTTTGTAT +GGTTGGGAAGTAACAGATTGTAAAATATGTTTTGAATATGGTGTTTATTATAGCCCTGTTAGTACTCCCT +CGGATTTTCGCTTTCTTGCCCCAATTGTACTTGAACAAACATTGAAAAAAGCGGGAACGCAATTATTAGA +GCCATATCTTTCGTTTATACTTTTTACGCCACAGGGATACTTTTCTCGTGCATATAAAGATGCACAAAAA +CATTGTGCAATAATTGAAACAAGTCAATCAAAAAATGATGAAGTTATTTTTACAGGACATATTCCTGTAC +GTTGTATTAATGAATATCGTAATACTTTAACTCTATATACAAATGGGCAAGCAGTTTTTTTGACAGAATT +AAAAGATTATCAAATTGCTACTTGTGAACCAGTTATTCAATCACGTAGACCAAATAATCGAATAGATAAA +GTACGCCATATGTTTAATAAAAAAGAAAATTAA +>tet58_KY887560 +ATGAATTCTAATTCGTCAAACCATAAATCACAATACAACAAATTATTACTTTGGCTTTGCTTTTTATCTT +TCTTTAGTGTACTAAATGAAATGGTTCTAAATGTATCTTTTCCTGATGTAGCGAATTACTTTGGAAAAGC +TCCTGCAAGTATAAATTGGATTAATACATCGTTCATGTTAAGTTTTTCTGTAGGAACAGCCATATATGGC +AAAGTTTCTGATTATGTTGGTATTAAGAAACTGTTATTAACAGGAATTTTATTAAATTGTATAGGTTCAA +TTATGGGTTTTATAGGTCATACATCATTTCCTGTATTATTATTATCACGATTCATTCAAGGTACAGGAGC +TGCAGCTTTTCCTGCGTTAATTATGGTGGTTGTTGCTAAATATATTCCAAGGGAGAGCCAGGGAAAGGCT +TTTGGACTTATTGGTTCCATTGTTGCAATGGGTGAAGCTCTAGGTCCATCTATCGGTGGAATGATAGCTG +AATATATTCATTGGTCTTATCTATTGATATTACCTTTAGGAACTTTAATATCAGTTCCTTTTCTTATCAA +AATGCTTGATCATGAACCGATTAAAAAGGGAAGTTTTGATTTTATAGGATTAGTATTAATGTCGTTAAGC +ATAGTAACTTTTATGGTGTTTACCACATCATATAAATTATATTTCTTAGGAATAAGTTTCGTTATTTTCA +TTATTTTTATTAAGCACATAAAAAAAGTGGATGAGCCGTTTATTGAGCCTAAATTAGGTGAAAACCGATC +TTTTATGGTTGGTATTGTTTGTGGAGGTCTTTTTTTTGGAACGGTGGCAGGATTTATTTCAATGGTTCCT +TATATGATGAGAGATTTATATCAGTTAAGTACACTTGCCATTGGTAACGGGATTATCTTTCCAGGAGCTG +TTAGTGTCATTATTTTTGGTTACTTTGGTGGAATACTAGTAGATAAAAAAGGACCAATATTCGTGTTAAC +TATAGGAGCTATGTTGTTATCAATTAGTTTTCTATTGGCTGCACTGTTTGTTGAAACGACACCTTTTTTA +ATTACTATATTAATTATTTTTATATTTGGAGGTCTGTCCTTTACAAAAACGGTTATATCTACAATTGTTT +CAAGTAGTTTAACTACAAAAGAAAGTGGTTCAGGAATGAGTTTACTTAATTTTACAAGTTTCTTATCTGA +AGGACTAGGAATCGCAGTTGTAGGAGGATTACTGTCTGTAGACATACTAAATAAAAAAATTATTCCTATA +AATGTTTCTTCTCAATCATATTTATATAGCAATATGTTACTTATTTTTTCTATAATAATTATTTTTAGTT +GGTTAATTACTATCAAAGTGTACTCTGAGCCAAAAATAAAATAG +>otrA_X53401 +ATGAACAAGCTGAATCTGGGCATCCTGGCCCACGTTGACGCCGGCAAGACCAGCCTCACCGAGCGCCTGC +TGCACCGCACCGGTGTGATCGACGAGGTCGGCAGCGTGGACGCCGGCACCACGACGACCGACTCGATGGA +GCTGGAGCGGCAGCGCGGCATCACCATCCGGTCCGCCGTGGCCACGTTCGTCCTGGACGATCTCAAGGTC +AACCTCATCGACACCCCGGGCCACTCCGACTTCATCTCCGAGGTCGAGCGGGCGCTCGGGGTGCTCGACG +GCGCGGTCCTGGTGGTCTCGGCCGTCGAGGGCGTCCAGCCGCAGACCCGCATCCTGATGCGGACCCTGCG +CAGGCTGGGCATTCCCACGCTGGTCTTCGTCAACAAGATCGACCGGGGCGGCGCGCGTCCCGACGGTGTG +CTGCGGGAGATCCGCGACCGGCTCACCCCCGCCGCGGTGGCACTGTCCGCCGTGGCGGACGCCGGCACGC 
+CGCGGGCCCGCGCGATCGCGCTCGGCCCGGACACCGACCCGGACTTCGCCGTCCGGGTCGGTGAGCTGCT +GGCCGACCACGACGACGCGTTCCTCACCGCCTACCTGGACGAGGAACACGTACTGACCGAGAAGGAGTAC +GCGGAGGAACTGGCCGCGCAGACCGCGCGCGGTCTGGTGCACCCGGTGTACTTCGGGTCCGCGCTGACCG +GCGAGGGCCTGGACCATCTGGTGCACGGCATCCGGGAGTTGCTGCCGTCCGTGCACGCGTCGCAGGACGC +GCCGCTGCGGGCCACCGTGTTCAAGGTGGACCGTGGCGCGCGCGGCGAGGCCGTCGCGTACCTGCGGCTG +GTCTCCGGCACGCTGGGCACCCGCGATTCGGTGACGCTGCACCGCGTCGACCACACCGGCCGGGTCACCG +AGCACGCCGGACGCATCACCGCGCTGCGGGTCTTCGAGCACGGGTCGGCCACCAGCGAGACCCGGGCGAC +CGCCGGGGACATCGCGCAGGCGTGGGGCCTGAAGGACGTACGGGTCGGTGACCGGGCCGGGCACCTCGAC +GGTCCCCCGCCGCGCAACTTCTTCGCGCCGCCCAGCCTGGAGACCGTGATCAGGCCGGAGCGCCCGGAGG +AAGCGGGACGGCTGCACGCCGCGCTGCGCATGCTGGACGAGCAGGACCCCTCGATCGACCTGCGGCAGGA +CGAGGAGAACGCGGCCGGCGCGGTGGTCCGCCTCTACGGGGAGGTGCAGAAGGAGATCCTCGGCAGCACG +CTCGCGGAGTCCTTCGGCGTACGGGTGCGCTTCGACCCGACCCGTACGGTCTGCATCGAAAAGCCCGTGG +GGACCGGCGAGGCGCTGATCGAGCTGGACACGCGGACGCACAACTACTTCTGGGGCGCACCGTGGGTCTG +CGCGTCGGACCGGCCGAGCCCGGCGCGGGCGATCACGTTCCGTTTGGCGGTGGAACTGGGCTCGCTCCCC +CTGGCCTTCCACAAGGCCATCGAGGAGACGGTGCACACCACCCTGCGGCACGGTCTGTACGGCTGGCAGG +TCACCGACTGCGCCGTCACCCTGACCCGTACCGGCGTTCGCAGTCCGGTCAGCGCGGCCGACGACTTCCG +CAAGGCCAACGCGCGCTTGGTCCTGATGGACGCGCTCGGCAGGGCCGGTACGGAGGTGCACGAGCCGGTC +AGCTCCTTCGAACTGGAGGTGCCCGCCGCCCGGCTCAGCCCGGTACTTGCGAAACTCGCGGAACTGGGCG +CGACGCCCGGTGTGCCCACGGCCGAGGGGGACGTCTTCCGCCTGGAGGGCACGATGCCGACCAGCCTCGT +GCACGACTTCAACCAGCGGGTTCCCGGACTGACCCAGGGCGAGGGCGTGTTCCTGGCCGAGCACCGGGGC +TACCGGCCCGCCGTCGGACAGCCGCCCGTGCGGCCGCGGCCCGAGGGGCCCAACCCGCTCAACCGCGACG +AGTACATCCTGCACGTGCTCAAGCGCGTGTGA +>tet_M74049 +ATGCGCACCCTGAACATCGGCATTCTGGCCCACGTCGACGCGGGTAAGACCAGCCTGACCGAACGGCTCC +TGTTCGACCACGGCGCCGTCGACCGGCTCGGCAGCGTCGACGCCGGCGACACCCGTACGGTCGACGGCGG +TATCGAGCGCCGCCGCGGCATCACCATCCGCTCCGCCGTCGCCGCCTTCACCGTCGGCGACACGCGCGTC +AACCTGATCGACACCCCGGGACACTCCGACTTCGTCGCGGAGGTCGAGCGGGCCCTGGAAGTGCTCGACG +GGGCGGTGCTGCTGCTGTCCGCCGTCGAGGGCGTCCAGGCGCGGACCCGCGTCCTGATGCGCGCGCTGCG +GCGGCTGCGGCTGCCCACGATCGTGTTCGTCAACAAGATCGACCGGGCCGGCGCGCGCACCGACGGCCTC +CTCGGTGACGTCCGGCGCCTGCTGACGCCGCACGTCGCGCCGCTGACCGAGGTGGCGGACGCCGGTACCC +CGCGCGCCCGGGTCACCCGCCGCCCGCCGGACGGGCGGACCGCGGAGGCCCTCGCCGAGGTCGACACGGA +GGTCCTGGCCGCGCTGGTCGACGGCCCCGAGCCGACCGGGGAGGACGTGGCCCGCGCCCTCGCCGCCCGT +ACCGCCGACGGCTCGTTCCACCCGCTGTACCACGGCTCCGCGCTCGGCGGACAGGGCGTCGCGGAGCTGG +TCGAGGGCCTGCTCGGCCTGATCCCGGCCGCCACGCCGGGCACGTCCGGCGGCACGTCCGGCGGCACGGA +ACCGCGCGGCACGGTCTTCGCCGTGCGCCCCGGACCCGCCGGCGAGCGCACCGCGTACCTCAGGCTGTAC +GGCGGCGAGGTGCACCCGCGCCGGCGGCTCACCTTCCTGCGGCGCGAGTCCGACGGGCGGACCACCGAGG +TCTCCGGCCGGGTGACCCGCCTCGACGTCGTCGGCGGCGACGCCACGCTCACCGCCGGGAACATCGCCGC +GCTCACCGTTCCCGGGGGCCTGCGCGTCGGCGACCGGCTCGGCGGACCGACCGACCGTGCACCGCAGTTC +GCGCCACCGACCCTGCAGACGCTGGTCCGGGCCCGGCACCCGGAGCAGGCGGCGCCGCTGCGCTCCGCCC +TGCTGGCGCTGGCCGACCAGGACCCGCTGCTGCACGCCCGACCGGCGGCGTCCGGCGCCACCGCCCTGCT +CCTGTACGGCGAGGTCCAGATGGAGGTGCTCGCGGCGACACTGGCCGAGGACTTCGGGATCGAGGCGGAG +TTCACGCCGGGCCGCGTCCGGTTCCTGGAGCGTCCGGCGGGCACCGACGAGGCCGCGGAGGAGATGCCGT +GGCTCGACCGCACCCGGTACTTCGCGACGATCGGGCTGCGCGTCGAACCGGGTCCGCGCGGCTCCGGCGG +GGCCTTCGGGTACGAGACGGAGCTCGGCGCGCTCCCCCGGGCCTTCCACCAGGCCGTCGAGGAGACCGTC +CACGACACGCTGCGGACCGGGCTCACCGGTGCGGCGGTCACCGACTACCGGGTCACGCTGATCCGCTCCG +GCTTCAGCTCGCCGCTCAGCACGGCCGCCGACTTCCGCGGGCTGACACCGCTCGTGCTGCGCCGTGCCCT +CGCCCGCGCGGGGACCGTGCTCCACGAGCCGTACCAGGCCTTCGAGGCGGAGGTCCCGGCGGACACGCTG +GCCGCCGTGACGGCCCTGCTGGCCTCGCTGGGCGCGGACTTCACCGGAACGACGGGGGGCGACCCGGCCT +GGATCGTCACCGGCGAGCTGCCGGCCCGGCGGGTGCGGGAGGCCGAGCTGCGGCTGCCGGGGCTGACGCA +CGGGGAGGCGGTCTGGTCCTCCCGCCCTTGCGAGGACCGACCGCTGAAGGCCGGAAACTCTGGGCCTGGC +ACGGGAGTTGGCGGGCATTCGGGTGAGTAG +>tetS_M_AY534326 
+ATGGAGGAAATAAAATTGAAAATTATTAATATCGGTATCTTAGCACATGTTGATGCAGGAAAAACTACTT +TGACAGAAAGCTTACTATACAGTAGCGGAGCAATTAAAGAGTTAGGAAGTGTAGATAGCGGTACAACGAA +AACGGATACTATGTTTTTGGAACGCCAGAGAGGTATTACTATTCAGACCGCAATAACATCTTTTCAACGG +GAAAATGTTAAAGTAAATATTGTAGATACTCCTGGACACATGGATTTTTTGGCAGATGTATACCGTTCAT +TATCTGTTTTGGATGGAGCTATTTTGCTAATCTCTGCAAAAGATGGAGTACAGTCACAAACTCGTATACT +ATTCCATGCACTTAGAAAGATGAACATACCTATAATATTTTTTATTAACAAAATTGATCAAAATGGAATA +AATTTGCCAGATGTTTATCAAGATATTAAGGACAAACTTTCTGACGACATCATAATTAAGCAGACTGTGA +ATCTAAATTTGAAACCTTATGTAATAGATTATACTGAACCAGAACAATGGGAGACAGTAATTGTGGGAAA +TGATTATTTATTAGAAAAATATACCATTGGGAAAACATTGAATATTGCAGAACTTGAAAAGGAGGAAAAC +GAAAGAATTCAAAGTTGCTCCTTATATCCTGTTTATCACGGAAGTGCAAAGAATAATATTGGAATTAAAC +AACTTATAGAGGTAATTACTAGCAAATTATTTTCACCCACACAACTCAATTCAGATAAACTTTGTGGAAA +TGTTTTTAAAGTAGAATATTCAGATGATGGTCAACGGCTTGTCTATGTACGTCTTTATAGTGGAACGCTA +CATTTGCGAGACTCAGTCAATATATCAGAAAAGGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAA +ATGGAGAATTACGCCAGATAGATAAGGCAGAGCCTGGTGAGATTATTATTTTAAAAAATGAGCTTTTAAA +ACTAAATAACGTACTTGGAGATAAAAAAAGATTACCACATAGAGAAATTCTTGAGAATCCTCTTCCTATG +TTACAAACAACAATTGAACCATGTAAATCAGTACAAAGAGAAAAGTTACTAGATGCACTTTTTGAAATAT +CCGATAGTGATCCCCTTCTACAATATTATGTAGATACAGTAACTCACGAAATTGTGCTATCTTTTTTAGG +TGAGGTCCAAATGGAGGTAACTTGTACTCTGATTCAAGAAAAATATCATATTGAGATAGAAACAAGAAAA +CCAACTGTCATTTATATGGAAAGACCATTAAAAAAATCTGAATTTACCATTGATATCGAAGTACCTCCAA +ATCCTTTCTGGGCTTCTATTGGTTTATCTGTAACACCACTTCCTTTGGGTAGTGGCATTCAGTATGAGAG +CCTGGTTTCTCTAGGTTATTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGTATACGCTATGGGTGT +GAACAAGGATTGTACGGTTGGAAATTAACAGACTGTAAGATCTGTTTTAAGTATGGTCTATATTACAGCC +CTGTCAGTACGCCAGCAGATTTCCGAATGCTTGCGCCTATTGTACTAGAGCAGGCTTTTAGAAAGAGTGG +TACAGAGTTATTAGAGCCATATCTTAGCTTCGAAATTTATGTACCACAAGAATATCTTTCGAGAGCATAT +AATGATGCTTCCAAATATTGTGCAAATATTTTAAATACTAAGTTAAAAGGTAACGAGGTCATTCTCATTG +GTGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGT +TTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAAT +AGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetS_M_HM367711 +CTAAGTTATTTTATTGAACATATATCGTACTTTATCTATCCGACTATTTGGACGACGGGGCTGGCAAACA +GGTTCACCGGTAGTAACATGGTACCCTTTTAACTCTGTTAAACAAACACTACGTCCATTTGTAAAGAAAG +TTAAATCACTACGATATTCTTGAATACACCGATCAGGGATTTCTCCACTAAGAATGACCTCATTATTTTT +CAATTGAGTGTCTACGATGTTCGCACAATATTTAGGAGCATCGTTGTATGCTCGTGAAAGATATTCCTGT +GGCGCATAAATTTTAAAACTAAGATATGGCTCTAACAATTCTGTTCCAGCTTTTTTTAAGACTTGTTCCA +ATACAATAGGAGCAAGCATCCGAAAATCTGCTGGGGTACTAACAGGGCTATAGTATAAGCCATACTTAAA +ACAGATTTTACAGTCCGTCACATTCCAACCATACAATCCTTGTTCACAGCCATAGCGTATCCCTTCCATA +ACTGCATTTTGAAATGATTGATTTAAGTATCCAAGAGAAACCGAGCTCTCATACTGCATTCCACTTCCCA +ACGGAAGCGGTGATACAGATAAACCAATGGAAGCCCAGAAAGGATTTGGCGGCACTTCGATGTGAATGGT +ATATTCTGCATTTTTTAACGGTCTCTCCATATAAATGACTGTAGGCTCTTTTAGTTCTATCTCCACATGA +TACTTTTCTTGCAACAGTGCACTAATCACTTCCATTTGTACTTTCCCTAAGAAAGAAAGTATAATTTCAT +GTGTCGTAGAATCCACGTAATATCGTAGAAGCGGATCACTATCTGAGATTTCCAAAAGGGCATCAAGCAA +CATTTCTCTCTGTTCAGGTTTACTCGGTTCAACAGTTGTTTGTAGTAGAGGGTGCGGATTTTCAATCTTT +TTTCTCTGTGGCAATAGTTTTGTATCTCCAAGAACACTATTTAACTTCAAAAACTCATTTTGCAAAATAA +CAATTTCTCCAGAATAAGCTCTATCAATCTTACATAATTCACCATTTATTGAAGTATACATTTCTGTAAC +TTTTATTTTTTCTTTTTCTGATACTCTAACCGAATCTCGTAAATGTAGTACTCCACTATAAAGGCGTATA +TATGCAAGACGTTGTCTTTTTTTTGTATATTCAATTTTGAAAACATTTCCGCAAAGTTCAGACGGACCTC +GATGTGTTGATGAATAAAATTTATTAGTAATAACTTCTATAAGGTTATCAATCCCTATATTACTTTTTGC +ACTTCCATGATAAAGAGGGAACAGAGAACAATTCTGAAATCTTATGCTTTCCTCTTGTTCGAGTTCCAAT +GCTTCTAATGATTTACCGGACATATATTTCTCTAAAAGGTCATCGTTTCCCTCTATTACCGTATCCCATT +GTTCAGATTCGGTAAAGTTCGTCACACACATATTAGGATACAGTTCTACCTTCTGTTTGATTACAATTTC 
+CGCAGAAAGTTTCTCTTTAATATCCTGATAAACCGTTGATAAATCAATTCCATTTTGGTCAATCTTATTG +ATAAAAAAGATTGTGGGAATACCTATTTTCCTAAGTGCATGAAACAATATACGAGTTTGTGCTTGTACGC +CATCTTTTGCAGAAATCAGTAGAATTGCCCCATCTAAAACTGATAATGAACGATATACTTCTGCTAAGAA +ATCCATATGTCCTGGCGTGTCTATGATGTTCACCTTCGTATTTTCCCACTGAAAAGAGGTTATTCCTGTC +TGAATTGTAATTCCTCTCTGACGTTCTAAAAGCGTATTATCCGTCCTCGTTGTACCTTTGTCCACGCTTC +CTAATTCTGTAATCGCTCCGCTACTGTATAGTAAGCTTTCTGTCAAAGTAGTTTTTCCTGCATCAACATG +TGCTAAGATACCGATATTAATAATTTTCAATTTTATTTCCTCCAT +>tetM_M85225 +ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTAT +TATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCT +TTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTG +AACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATG +GGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAG +GAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTT +TATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATG +TGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGA +GAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAAT +TGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTTA +TTACTAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGA +ATATACAAAAAAAAGACAACGTCTTGCATATATACGCCTTTATAGTGGAGTACTACATTTACGAGATTCG +GTTAGAGTATCAGAAAAAGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGTGAATTATGTA +AGATTGATAGAGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCT +TGGAGATACAAAACTATTGCCACAGAGAAAAAAGATTGAAAATCCGCACCCTCTACTACAAACAACTGTT +GAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGC +TTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGA +AGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTAT +ATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTT +CCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGG +ATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGTTGTGAACAAGGATTGTAT +GGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAG +CAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGA +GCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAA +TATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTC +GGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTT +AAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAA +GTACGATATATGTTCAATAAAATAACTTAG diff --git a/modules/amr.nf b/modules/amr.nf index 176ea37..94c3ddb 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -39,27 +39,47 @@ process GET_PBP_RESISTANCE { """ } -// Run AMRsearch to infer resistance (also determinants if any) of other antimicrobials +// Create ARIBA database and return database path +process CREATE_ARIBA_DB { + label 'ariba_container' + label 'farm_low' + + input: + path(ref_genome) + path(metadata) + + output: + path ariba_database + + script: + """ + ariba prepareref -f "$ref_genome" -m "$metadata" ariba_database + """ +} + +// Run ARIBA to identify AMR process OTHER_RESISTANCE { - label 'amrsearch_container' + label 'ariba_container' label 'farm_low' tag "$sample_id" input: - tuple val(sample_id), path(assembly) + path ariba_database + tuple val(sample_id), path(read1), path(read2), path(unpaired) output: - tuple val(sample_id), path(json), emit: json + tuple val(sample_id), path(tsv), emit: tsv script: - json='result.json' + tsv='report.tsv' """ - java -jar /paarsnp/paarsnp.jar -i "$assembly" -s 1313 -o > $json + ariba run 
--nucmer_min_id 80 --assembled_threshold 0.80 --assembly_cov 10 $ariba_database $read1 $read2 result + mv result/report.tsv "${tsv}" """ } -// Extract the results from the output file of the AMRsearch +// WIP, for extracting information from ARIBA report process GET_OTHER_RESISTANCE { label 'bash_container' label 'farm_low' @@ -67,15 +87,10 @@ process GET_OTHER_RESISTANCE { tag "$sample_id" input: - tuple val(sample_id), path(json) - - output: - tuple val(sample_id), env(CHL_RES), env(CHL_DETERMINANTS), env(CLI_RES), env(CLI_DETERMINANTS), env(ERY_RES), env(ERY_DETERMINANTS), env(FQ_RES), env(FQ_DETERMINANTS), env(KAN_RES), env(KAN_DETERMINANTS), env(LZO_RES), env(LZO_DETERMINANTS), env(TET_RES), env(TET_DETERMINANTS), env(TMP_RES), env(TMP_DETERMINANTS), env(SMX_RES), env(SMX_DETERMINANTS), env(COT_RES), env(COT_DETERMINANTS), emit: result + tuple val(sample_id), path(tsv) script: """ - JSON_FILE="$json" - - source get_other_resistance.sh + # TBC """ } diff --git a/nextflow.config b/nextflow.config index 7687a2f..ee0459d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -90,8 +90,8 @@ process { withLabel: spn_pbp_amr_container { container = 'harryhungch/spn-pbp-amr:23.01.16' } - withLabel: amrsearch_container { - container = 'harryhungch/amrsearch:23.02.23' + withLabel: ariba_container { + container = 'staphb/ariba:2.14.4' } withLabel: mlst_container { container = 'staphb/mlst:2.23.0' diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 27bb589..4e020ce 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -7,7 +7,7 @@ include { OVERALL_QC } from "$projectDir/modules/overall_qc" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/modules/lineage" include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" -include { PBP_RESISTANCE; GET_PBP_RESISTANCE; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" +include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" // Main pipeline workflow workflow PIPELINE { @@ -26,6 +26,9 @@ workflow PIPELINE { poppunk_db = GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) poppunk_ext_clusters = GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) + // Get path to ARIBA database, create from reference and metadata + ariba_db = CREATE_ARIBA_DB("$projectDir/data/ariba_sequences.fasta", "$projectDir/data/ariba_metadata.tsv") + // Get read pairs into Channel raw_read_pairs_ch raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) @@ -139,8 +142,8 @@ workflow PIPELINE { // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials // Output into Channel GET_OTHER_RESISTANCE.out.result - OTHER_RESISTANCE(OVERALL_QC_PASSED_ASSEMBLIES_ch) - GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.json) + OTHER_RESISTANCE(ariba_db, OVERALL_QC_PASSED_READS_ch) + GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.tsv) // Generate results.csv by sorted sample_id based on merged Channels // READ_QC.out.result, ASSEMBLY_QC.out.result, MAPPING_QC.out.result, TAXONOMY_QC.out.result, OVERALL_QC.out.result, @@ -176,8 +179,8 @@ workflow PIPELINE { .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) .map { (it[-1] == null) ? 
it[0..-2] + ['_'] * 18 : it } - .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 20 : it } + // .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 20 : it } .map { it.collect {"\"$it\""}.join',' } .collectFile( name: 'results.csv', @@ -193,7 +196,7 @@ workflow PIPELINE { 'Serotype', 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - 'CHL_Res', 'CHL_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_Res', 'ERY_Determinant', 'FQ_Res', 'FQ_Determinant', 'KAN_Res', 'KAN_Determinant', 'LZO_Res', 'LZO_Determinant', 'TET_Res', 'TET_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant' + // 'CHL_Res', 'CHL_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_Res', 'ERY_Determinant', 'FQ_Res', 'FQ_Determinant', 'KAN_Res', 'KAN_Determinant', 'LZO_Res', 'LZO_Determinant', 'TET_Res', 'TET_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant' ].join(','), sort: { it.split(',')[0] }, newLine: true From 4510697ee539318b7c8722b0d8bb6bf24180f11c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 21 Jun 2023 13:32:55 +0000 Subject: [PATCH 002/157] Include ARIBA info Former-commit-id: c1fd7c2234289bcf7dfcc760580ce9701f70ce6d --- bin/get_images_info.sh | 4 ++-- bin/get_tools_info.sh | 1 + modules/info.nf | 19 +++++++++++++++++-- workflows/info_and_version.nf | 6 ++++-- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/bin/get_images_info.sh b/bin/get_images_info.sh index c4474db..ba5428f 100755 --- a/bin/get_images_info.sh +++ b/bin/get_images_info.sh @@ -16,7 +16,7 @@ SAMTOOLS=$(grep samtools <<< $IMAGES) BCFTOOLS=$(grep bcftools <<< $IMAGES) POPPUNK=$(grep poppunk <<< $IMAGES) SPN_PBP_AMR=$(grep spn-pbp-amr <<< $IMAGES) -AMRSEARCH=$(grep amrsearch <<< $IMAGES) +ARIBA=$(grep ariba <<< $IMAGES) MLST=$(grep mlst <<< $IMAGES) KRAKEN2=$(grep kraken2 <<< $IMAGES) SEROBA=$(grep seroba <<< $IMAGES) @@ -38,7 +38,7 @@ jq -n \ --argjson bcftools "$(add_container $BCFTOOLS)" \ --argjson poppunk "$(add_container $POPPUNK)" \ --argjson spn_pbp_amr "$(add_container $SPN_PBP_AMR)" \ - --argjson amrsearch "$(add_container $AMRSEARCH)" \ + --argjson ariba "$(add_container $ARIBA)" \ --argjson mlst "$(add_container $MLST)" \ --argjson kraken2 "$(add_container $KRAKEN2)" \ --argjson seroba "$(add_container $SEROBA)" \ diff --git a/bin/get_tools_info.sh b/bin/get_tools_info.sh index c2cc99c..23d9520 100755 --- a/bin/get_tools_info.sh +++ b/bin/get_tools_info.sh @@ -18,4 +18,5 @@ jq -n \ --argjson mlst "$(add_version "$MLST_VERSION")" \ --argjson kraken2 "$(add_version "$KRAKEN2_VERSION")" \ --argjson seroba "$(add_version "$SEROBA_VERSION")" \ + --argjson ariba "$(add_version "$ARIBA_VERSION")" \ '$ARGS.named' > $JSON_FILE diff --git a/modules/info.nf b/modules/info.nf index 3f70fee..5dadd65 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -68,6 +68,7 @@ process TOOLS { val mlst_version val kraken2_version val seroba_version + val ariba_version output: path(json), emit: json @@ -88,6 +89,7 @@ process TOOLS { MLST_VERSION="$mlst_version" 
KRAKEN2_VERSION="$kraken2_version" SEROBA_VERSION="$seroba_version" + ARIBA_VERSION="$ariba_version" JSON_FILE="$json" source get_tools_info.sh @@ -223,7 +225,7 @@ process PARSE { |${toolTextRow('Het-SNP Counter', 'het_snp_count')} |${toolTextRow('PopPUNK', 'poppunk')} |${toolTextRow('CDC PBP AMR Predictor', 'spn_pbp_amr')} - |${toolTextRow('AMRsearch', 'amrsearch')} + |${toolTextRow('ARIBA', 'ariba')} |${toolTextRow('mlst', 'mlst')} |${toolTextRow('Kraken 2', 'kraken2')} |${toolTextRow('SeroBA', 'seroba')} @@ -259,7 +261,7 @@ process PARSE { |${imageTextRow('BCFtools', 'bcftools')} |${imageTextRow('PopPUNK', 'poppunk')} |${imageTextRow('CDC PBP AMR Predictor', 'spn_pbp_amr')} - |${imageTextRow('AMRsearch', 'amrsearch')} + |${imageTextRow('ARIBA', 'ariba')} |${imageTextRow('mlst', 'mlst')} |${imageTextRow('Kraken 2', 'kraken2')} |${imageTextRow('SeroBA', 'seroba')} @@ -566,3 +568,16 @@ process SEROBA_VERSION { VERSION=$(seroba version) /$ } + +process ARIBA_VERSION { + label 'ariba_container' + label 'farm_low' + + output: + env VERSION + + shell: + $/ + VERSION=$(ariba version | grep ARIBA | sed -r "s/.*:\s(.+)/\1/") + /$ +} diff --git a/workflows/info_and_version.nf b/workflows/info_and_version.nf index d6f0d50..3808a57 100644 --- a/workflows/info_and_version.nf +++ b/workflows/info_and_version.nf @@ -1,4 +1,4 @@ -include { IMAGES; DATABASES; TOOLS; COMBINE_INFO; PARSE; PRINT; SAVE; GIT_VERSION; PYTHON_VERSION; FASTP_VERSION; UNICYCLER_VERSION; SHOVILL_VERSION; QUAST_VERSION; BWA_VERSION; SAMTOOLS_VERSION; BCFTOOLS_VERSION; POPPUNK_VERSION; MLST_VERSION; KRAKEN2_VERSION; SEROBA_VERSION } from "$projectDir/modules/info" +include { IMAGES; DATABASES; TOOLS; COMBINE_INFO; PARSE; PRINT; SAVE; GIT_VERSION; PYTHON_VERSION; FASTP_VERSION; UNICYCLER_VERSION; SHOVILL_VERSION; QUAST_VERSION; BWA_VERSION; SAMTOOLS_VERSION; BCFTOOLS_VERSION; POPPUNK_VERSION; MLST_VERSION; KRAKEN2_VERSION; SEROBA_VERSION; ARIBA_VERSION } from "$projectDir/modules/info" // Alternative workflow that prints versions of pipeline and tools workflow PRINT_VERSION { @@ -69,6 +69,7 @@ workflow GET_VERSION { MLST_VERSION() KRAKEN2_VERSION() SEROBA_VERSION() + ARIBA_VERSION() TOOLS( GIT_VERSION.out, @@ -83,7 +84,8 @@ workflow GET_VERSION { POPPUNK_VERSION.out, MLST_VERSION.out, KRAKEN2_VERSION.out, - SEROBA_VERSION.out + SEROBA_VERSION.out, + ARIBA_VERSION.out ) COMBINE_INFO( From 648d3807705610b7ea4f5d485312abbcdad0e32c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 21 Jun 2023 15:39:40 +0000 Subject: [PATCH 003/157] Update ARIBA reference and metadata Former-commit-id: 0106a59bac2e9a2916947a0617168d44d58cbd7f --- data/ariba_metadata.tsv | 21 ++++++++----- data/ariba_sequences.fasta | 64 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/data/ariba_metadata.tsv b/data/ariba_metadata.tsv index 2a73517..9d35ca5 100644 --- a/data/ariba_metadata.tsv +++ b/data/ariba_metadata.tsv @@ -39,8 +39,8 @@ tetS_M_AY534326 1 0 . . Tetracycline resistance tetM_M85225 1 0 . . Tetracycline resistance tetS_FN555436 1 0 . . Tetracycline resistance tetM_MH283017 1 0 . . tetracycline resistance -folA_AE007317 1 1 I100L . Trimethoprim -folP_AE007317 1 1 . . Sulfamethoxazole resistance on if insertions in 56-67 amino acids +folA_AE007317 1 1 I100L . "proteinID-AAL00232.1, Trimethoprim" +folP_AE007317 1 1 . . "proteinID-AAK99071.1, Sulfamethoxazole resistance on if insertions in 56-67 amino acids" gyrA_AE007317 1 1 S81F . 
Fluoroquinolone gyrA_AE007317 1 1 S81Y . Fluoroquinolone gyrA_AE007317 1 1 S81C . Fluoroquinolone @@ -62,12 +62,19 @@ parE_AE007317 1 1 P454S . Fluoroquinolone tetO_Y07780 1 0 . . Tetracycline resistance ermBups_HG799494 0 0 . . Erythromycin and Clindamycin resistance ermbTr_CP002121 0 0 . . Erythromycin and Clindamycin resistance -rplD_AE007317 1 1 . . Linezolid resistance (deletion within the L4 region of the gene PMID:24492357) -rpoB_AE007317 1 1 D489E . rifampicin resistance PMID:10508007-D415E -rpoB_AE007317 1 1 H499N . rifampicin resistance PMID:10508007-H425N -rpoB_AE007317 1 1 D489N . rifampicin resistance PMID:10508007-H415N +rplD_AE007317 1 1 . . Linezolid resistance (deletion within the L4 region of the gene ) +rpoB_AE007317 1 1 D489E . rifampicin resistance -D415E +rpoB_AE007317 1 1 H499N . rifampicin resistance -H425N +rpoB_AE007317 1 1 D489N . rifampicin resistance -H415N vanB_KC489787 1 0 . . Vacomycin resistance vanD_EU999036 1 0 . . Vacomycin resistance vanE_FJ872411 1 0 . . Vacomycin resistance vanG_KF704242 1 0 . . Vacomycin resistance -otrA_X53401 1 0 . . Tetracycline resistance \ No newline at end of file +otrA_X53401 1 0 . . Tetracycline resistance +vanA_M97297 1 0 . . Vacomycin resistance (E.faecium) +vanC_AF162694 1 0 . . Vacomycin resistance (E.gallinarum) +23S_NZ_CP018347 0 1 A2114G . Macrolide:32347-35250 +23S_NZ_CP018347 0 1 A2115G . Macrolide:32347-35250 +23S_NZ_CP018347 0 1 A2118G . Macrolide/Streptogramin:32347-35250 +23S_NZ_CP018347 0 1 C2630A . Macrolide:32347-35250 +23S_NZ_CP018347 0 1 C2630G . Macrolide:32347-35250 \ No newline at end of file diff --git a/data/ariba_sequences.fasta b/data/ariba_sequences.fasta index 8da1617..4509177 100644 --- a/data/ariba_sequences.fasta +++ b/data/ariba_sequences.fasta @@ -713,3 +713,67 @@ TATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTC GGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTT AAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAA GTACGATATATGTTCAATAAAATAACTTAG +>vanA_M97297 +ATGAATAGAATAAAAGTTGCAATACTGTTTGGGGGTTGCTCAGAGGAGCATGACGTATCGGTAAAATCTGCAATAGAGATAGCCGCTAAC +ATTAATAAAGAAAAATACGAGCCGTTATACATTGGAATTACGAAATCTGGTGTATGGAAAATGTGCGAAAAACCTTGCGCGGAATGGGAA +AACGACAATTGCTATTCAGCTGTACTCTCGCCGGATAAAAAAATGCACGGATTACTTGTTAAAAAGAACCATGAATATGAAATCAACCAT +GTTGATGTAGCATTTTCAGCTTTGCATGGCAAGTCAGGTGAAGATGGATCCATACAAGGTCTGTTTGAATTGTCCGGTATCCCTTTTGTA +GGCTGCGATATTCAAAGCTCAGCAATTTGTATGGACAAATCGTTGACATACATCGTTGCGAAAAATGCTGGGATAGCTACTCCCGCCTTT +TGGGTTATTAATAAAGATGATAGGCCGGTGGCAGCTACGTTTACCTATCCTGTTTTTGTTAAGCCGGCGCGTTCAGGCTCATCCTTCGGT +GTGAAAAAAGTCAATAGCGCGGACGAATTGGACTACGCAATTGAATCGGCAAGACAATATGACAGCAAAATCTTAATTGAGCAGGCTGTT +TCGGGCTGTGAGGTCGGTTGTGCGGTATTGGGAAACAGTGCCGCGTTAGTTGTTGGCGAGGTGGACCAAATCAGGCTGCAGTACGGAATC +TTTCGTATTCATCAGGAAGTCGAGCCGGAAAAAGGCTCTGAAAACGCAGTTATAACCGTTCCCGCAGACCTTTCAGCAGAGGAGCGAGGA +CGGATACAGGAAACGGCAAAAAAAATATATAAAGCGCTCGGCTGTAGAGGTCTAGCCCGTGTGGATATGTTTTTACAAGATAACGGCCGC +ATTGTACTGAACGAAGTCAATACTCTGCCCGGTTTCACGTCATACAGTCGTTATCCCCGTATGATGGCCGCTGCAGGTATTGCACTTCCC +GAACTGATTGACCGCTTGATCGTATTAGCGTTAAAGGGGTGA +>vanC_AF162694 +ATGAAAAAAATTGCCGTTTTATTTGGAGGGAATTCTCCAGAATACTCAGTGTCACTAACCTCAGCAGCAAGTGTGATCCAAGCTATTGAC +CCGCTGAAATATGAAGTAATGACCATTGGCATCGCACCAACAATGGATTGGTATTGGTATCAAGGAAACCTCGCGAATGTTCGCAATGAT +ACTTGGCTAGAAGATCACAAAAACTGTCACCAGCTGACTTTTTCTAGCCAAGGATTTATATTAGGAGAAAAACGAATCGTCCCTGATGTC +CTCTTTCCAGTCTTGCATGGGAAGTATGGCGAGGATGGCTGTATCCAAGGACTGCTTGAACTAATGAACCTGCCTTATGTTGGTTGCCAT 
+GTCGCTGCCTCCGCATTATGTATGAACAAATGGCTCTTGCATCAACTTGCTGATACCATGGGAATCGCTAGTGCTCCCACTTTGCTTTTA +TCCCGCTATGAAAACGATCCTGCCACAATCGATCGTTTTATTCAAGACCATGGATTCCCGATCTTTATCAAGCCGAATGAAGCCGGTTCT +TCAAAAGGGATCACAAAAGTAACTGACAAAACAGCGCTCCAATCTGCATTAACGACTGCTTTTGCTTACGGTTCTACTGTGTTGATCCAA +AAGGCGATAGCGGGTATTGAAATTGGCTGCGGCATCTTAGGAAATGAGCAATTGACGATTGGTGCTTGTGATGCGATTTCTCTTGTCGAC +GGTTTTTTTGATTTTGAAGAGAAATACCAATTAATCAGCGCCACGATCACTGTCCCAGCACCATTGCCTCTCGCGCTTGAATCACAGATC +AAGGAGCAGGCACAGCTGCTTTATCGAAACTTGGGATTGACGGGTCTGGCTCGAATCGATTTTTTCGTCACCAATCAAGGAGCGATTTAT +TTAAACGAAATCAACACCATGCCGGGATTTACTGGGCACTCCCGCTACCCAGCTATGATGGCGGAAGTCGGGTTATCCTACGAAATATTA +GTAGAGCAATTGATTGCACTGGCAGAGGAGGACAAACGATGA +>23S_NZ_CP018347 +tttggataagtcctcgagctattagtattagtccgctacatgtgtcgccacacttccacttctaacctatctacctgatc +atctctcagggctcttactgatatataatcatgggaaatctcatcttgaggtgggtttcacacttagatgctttcagcgt +ttatcccttccctacatagctacccagcgatgcctttggcaagacaactggtacaccagcggtaagtccactctggtcct +ctcgtactaggagcagatcctctcaaatttcctacgcccgcgacggatagggaccgaactgtctcacgacgttctgaacc +cagctcgcgtgccgctttaatgggcgaacagcccaacccttgggaccgactacagccccaggatgcgacgagccgacatc +gaggtgccaaacctccccgtcgatgtgaactcttgggggagataagcctgttatccccagggtagcttttatccgttgag +cgatggcccttccatacggaaccaccggatcactaagcccgactttcgtccctgctcgagttgtagctctcgcagtcaag +ctcccttatacctttacactctgcgaatgatttccaaccattctgagggaacctttgggcgcctccgttaccttttagga +ggcgaccgccccagtcaaactgcccgtcagacactgtctccgatagggatcacctatctgggttagagtggccataacac +aagggtagtatcccaacagcgtctccttcgaaactggcgtcccgatctcttagactcctacctatcctgtacatgtggta +cagacactcaatatcaaactgcagtaaagctccatggggtctttccgtcctgtcgcgggtaacctgcatcttcacaggta +ctaaaatttcaccgagtctctcgttgagacagtgcccaaatcattacgcctttcgtgcgggtcggaacttacccgacaag +gaatttcgctaccttaggaccgttatagttacggccgccgtttactggggcttcaattcataccttcgcttacgctaagc +actcctcttaaccttccagcaccgggcaggcgtcaccccctatacatcatcttacgatttagcagagagctgtgtttttg +ataaacagttgcttgggcctattcactgcggctgacctaaagtcagcaccccttctcccgaagttacggggtcattttgc +cgagttccttaacgagagttctctcgctcacctgaggctactcgcctcgactacctgtgtcggtttgcggtacgggtaga +gtatgtttaaacgctagaagcttttcttggcagtgtgacgtcactaacttcgctactaaacttcgctccccatcacagct +caatgttatagaattaagcatttgactcaattcacacctcactgcttagacagactcttccaatcgtctgctttagttag +cctactgcgtccctccatcactacatactctagtacaggaatatcaacctgttgtccatcggatacacctttcggtctct +ccttaggtcccgactaacccagggcggacgagccttcccctggaaaccttagtcttacggtggacaggattctcacctgt +ctttcgctactcataccggcattctcacttctatgcgttccagcactcctcacggtataccttcatcacacatagaacgc +tctcctaccatacctataaaggtatccacagcttcggtaaattgttttagccccggtacattttcggcgcagggtcactc +gactagtgagctattacgcactctttgaatgaatagctgcttctaagctaacatcctagttgtctgtgcaaccccacatc +cttttccacttaacaattattttgggaccttagctggtggtctgggctgtttccctttcgactacggatcttagcactcg +cagtctgactgccgaccataattcattggcattcggagtttatctgagattggtaatccgggatggacccctcacccaaa +cagtgctctacctccaagaatctctaatgtcgacgctagccctaaagctatttcggagagaaccagctatctccaagttc +gtttggaatttctccgctacccacaagtcatccaagcacttttcaacgtgccctggttcggtcctccagtgcgtcttacc +gcaccttcaacctgctcatgggtaggtcacatggtttcgggtctacgtcatgatactaattcgccctgttcagactcggt +ttccctacggctccgtctcttcaacttaacctcgcatcataacgtaactcgccggttcattctacaaaaggcacgctctc +acccattaacgggctcgaacttgttgtaggcacacggtttcaggttctatttcactcccctcccggggtgcttttcacct +ttccctcacggtactggttcactatcggtcactagggagtatttagggttgggagatggtcctcccagattccgacggga +tttcacgtgtcccgccgtactcaggatactgctaggtacaaagactattttaaatacgaggctattactctctttggctg +atcttcccaaatcattcttctataatctttgagtccacattgcagtcctacaaccccgaagagtaaactcttcggtttgc +ccttctgccgtttcgctcgccgctactaaggcaatcgcttttgctttctcttcctgcagctacttagatgtttcagttca 
+ctgcgtcttcctcctcacatccttaacagatgtgggtaacaggtattacctgttgggttcccccattcggaaatccctgg +atcatcgcttacttacagctacccaaggtatatcgtcgtttgtcacgtccttcgtcggctcctagtgccaaggcatccac +cgtgcgcccttattaacttaacct From c7c22b3271dfd4ea49de74f8325aea955d6079ee Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 28 Jun 2023 14:11:58 +0000 Subject: [PATCH 004/157] Rename included ARIBA references Former-commit-id: ede4f8469abbed6593b5925268e33005c18a02cc --- data/{ariba_metadata.tsv => ariba_metadata-20230628.tsv} | 0 .../{ariba_sequences.fasta => ariba_ref_sequences-20230628.fasta} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename data/{ariba_metadata.tsv => ariba_metadata-20230628.tsv} (100%) rename data/{ariba_sequences.fasta => ariba_ref_sequences-20230628.fasta} (100%) diff --git a/data/ariba_metadata.tsv b/data/ariba_metadata-20230628.tsv similarity index 100% rename from data/ariba_metadata.tsv rename to data/ariba_metadata-20230628.tsv diff --git a/data/ariba_sequences.fasta b/data/ariba_ref_sequences-20230628.fasta similarity index 100% rename from data/ariba_sequences.fasta rename to data/ariba_ref_sequences-20230628.fasta From 94e3d8e7d18741aceaf8a8d6fc10f05a6607badf Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 28 Jun 2023 14:12:40 +0000 Subject: [PATCH 005/157] Allow custom ARIBA references and save database Former-commit-id: 1233fdecf005871f57fc669c9350bb7d93832141 --- modules/amr.nf | 20 +++++++++++--------- modules/validate.nf | 12 ++++++++++++ nextflow.config | 5 +++++ workflows/pipeline.nf | 7 ++++--- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/modules/amr.nf b/modules/amr.nf index 94c3ddb..b6e3e24 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -45,15 +45,17 @@ process CREATE_ARIBA_DB { label 'farm_low' input: - path(ref_genome) - path(metadata) + path ref_sequences + path metadata + path local output: - path ariba_database + path "${local}/database" script: """ - ariba prepareref -f "$ref_genome" -m "$metadata" ariba_database + rm -rf "$local/database" + ariba prepareref -f "$ref_sequences" -m "$metadata" "$local/database" """ } @@ -69,13 +71,13 @@ process OTHER_RESISTANCE { tuple val(sample_id), path(read1), path(read2), path(unpaired) output: - tuple val(sample_id), path(tsv), emit: tsv + tuple val(sample_id), path(report), path(report_debug), emit: reports script: - tsv='report.tsv' + report='result/report.tsv' + report_debug='result/debug.report.tsv' """ - ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembly_cov 10 $ariba_database $read1 $read2 result - mv result/report.tsv "${tsv}" + ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembler spades $ariba_database $read1 $read2 result """ } @@ -87,7 +89,7 @@ process GET_OTHER_RESISTANCE { tag "$sample_id" input: - tuple val(sample_id), path(tsv) + tuple val(sample_id), path(report), path(report_debug) script: """ diff --git a/modules/validate.nf b/modules/validate.nf index b8fb6be..4cf3438 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -26,6 +26,9 @@ validParams = [ length_low: 'int', length_high: 'int', depth: 'int_float', + ariba_ref: 'path_fasta', + ariba_metadata: 'path_tsv', + ariba_db_local: 'path', lite: 'boolean' ] @@ -141,6 +144,15 @@ void validate(Map params) { invalidValues[key] = [value, 'path to a fasta file (file does not have an filename extension of .fasta or .fa)'] } break + + case 'path_tsv': + File tsv = new File(value) + if (!tsv.exists()) { + 
invalidValues[key] = [value, 'path to a TSV file (file does not exist)']
+            } else if (!(value ==~ /.+\.tsv$/)) {
+                invalidValues[key] = [value, 'path to a TSV file (file does not have a filename extension of .tsv)']
+            }
+            break
 
         case 'url_git':
             if (!(value ==~ /^(https?:\/\/)?(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)\.git$/)) {
diff --git a/nextflow.config b/nextflow.config
index ee0459d..191a4e5 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -47,6 +47,11 @@ params {
     length_high = 2300000
     depth = 20.00
 
+    // Default ARIBA reference sequences and metadata paths, and local directory for its generated database
+    ariba_ref = "$projectDir/data/ariba_ref_sequences-20230628.fasta"
+    ariba_metadata = "$projectDir/data/ariba_metadata-20230628.tsv"
+    ariba_db_local = "$projectDir/databases/ariba"
+
     // Toggle for removing .bam and .sam files mid-run to reduce storage requirement
     // Warning: This will break the -resume function of Nextflow
     lite = false
diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf
index 4e020ce..46e6a6f 100644
--- a/workflows/pipeline.nf
+++ b/workflows/pipeline.nf
@@ -26,8 +26,8 @@ workflow PIPELINE {
     poppunk_db = GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local)
    poppunk_ext_clusters = GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local)
 
-    // Get path to ARIBA database, create from reference and metadata
-    ariba_db = CREATE_ARIBA_DB("$projectDir/data/ariba_sequences.fasta", "$projectDir/data/ariba_metadata.tsv")
+    // Get path to ARIBA database, generate from reference sequences and metadata if necessary
+    ariba_db = CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local)
 
     // Get read pairs into Channel raw_read_pairs_ch
     raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true)
@@ -143,7 +143,8 @@ workflow PIPELINE {
     // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials
     // Output into Channel GET_OTHER_RESISTANCE.out.result
     OTHER_RESISTANCE(ariba_db, OVERALL_QC_PASSED_READS_ch)
-    GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.tsv)
+    OTHER_RESISTANCE.out.reports.view()
+    GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports)

From e1c91e702d9d80b1c96036eb0e39d9e79194dc84 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Fri, 30 Jun 2023 11:31:23 +0000
Subject: [PATCH 006/157] Implement validation of existing ARIBA database

Former-commit-id: de24962f929e06658774ba006d188f612db064e1
---
 bin/create_ariba_db.sh | 34 ++++++++++++++++++++++++++++++++++
 modules/amr.nf         | 11 ++++++++---
 2 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100755 bin/create_ariba_db.sh

diff --git a/bin/create_ariba_db.sh b/bin/create_ariba_db.sh
new file mode 100755
index 0000000..03d9997
--- /dev/null
+++ b/bin/create_ariba_db.sh
@@ -0,0 +1,34 @@
+# Check if CREATE_ARIBA_DB has run successfully on the specific reference sequences and metadata.
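+#
+# For orientation, the rebuild branch below reduces to the following manual steps
+# (a sketch only -- the paths shown are the pipeline defaults from nextflow.config,
+# not values hard-coded in this script):
+#
+#   rm -rf databases/ariba/database
+#   ariba prepareref -f data/ariba_ref_sequences-20230628.fasta \
+#       -m data/ariba_metadata-20230628.tsv databases/ariba/database
+#
+# A successful build is then recorded in done_ariba_db.json together with the MD5
+# checksums of both inputs, which the guard below re-checks on the next run.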
+# If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to done_ariba_db.json + +JSON="done_ariba_db.json" + +REF_SEQUENCES_MD5=$(md5sum $REF_SEQUENCES | awk '{ print $1 }') +METADATA_MD5=$(md5sum $METADATA | awk '{ print $1 }') + +if [ ! -f ${DB_LOCAL}/${JSON} ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES_MD5" ] || \ + [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA" ] || \ + [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA_MD5" ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/00.info.txt ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/00.version_info.txt ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.log ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.tsv ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_noncoding.log ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.all.fa ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.pickle ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.tsv ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.fa ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.varonly.fa ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa ] ; then + + rm -rf "$DB_LOCAL/$OUTPUT" + + ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "$DB_LOCAL/$OUTPUT" + + echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON} + +fi \ No newline at end of file diff --git a/modules/amr.nf b/modules/amr.nf index b6e3e24..bafff49 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -50,12 +50,17 @@ process CREATE_ARIBA_DB { path local output: - path "${local}/database" + path "${local}/${output}" script: + output='database' """ - rm -rf "$local/database" - ariba prepareref -f "$ref_sequences" -m "$metadata" "$local/database" + REF_SEQUENCES="$ref_sequences" + METADATA="$metadata" + DB_LOCAL="$local" + OUTPUT="$output" + + source create_ariba_db.sh """ } From de2e2a3f29019d82828a736466cd9040a1feefd2 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 11:53:03 +0000 Subject: [PATCH 007/157] Improve naming of BWA database module and script Former-commit-id: 93a926df184a9b607cf7b64d30eccc554bff8b9c --- ...me_bwa_db_prefix.sh => create_ref_genome_bwa_db.sh} | 2 +- modules/mapping.nf | 6 +++--- workflows/init.nf | 4 ++-- workflows/pipeline.nf | 10 +++++----- 4 files changed, 11 insertions(+), 11 deletions(-) rename bin/{get_ref_genome_bwa_db_prefix.sh => create_ref_genome_bwa_db.sh} (90%) diff --git a/bin/get_ref_genome_bwa_db_prefix.sh b/bin/create_ref_genome_bwa_db.sh similarity index 90% rename from bin/get_ref_genome_bwa_db_prefix.sh rename to bin/create_ref_genome_bwa_db.sh index 7a155a1..e9f5225 100755 --- a/bin/get_ref_genome_bwa_db_prefix.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -1,4 +1,4 @@ -# Check if GET_REF_GENOME_BWA_DB_PREFIX has run successfully on the specific reference. +# Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. 
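+# For context, the FM-index this guard protects would be built by hand with the
+# standard BWA call (a sketch; the exact invocation lives in the unchanged body
+# of this script, not in this hunk):
+#
+#   bwa index -p "$PREFIX" "$REFERENCE"
+#
+# which emits the five files checked below: $PREFIX.amb, .ann, .bwt, .pac and .sa.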
# If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to done_bwa_db.json if [ ! -f ${DB_LOCAL}/done_bwa_db.json ] || \ diff --git a/modules/mapping.nf b/modules/mapping.nf index 46b04c8..c058954 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -1,5 +1,5 @@ -// Return database prefix with path, construct if necessary -process GET_REF_GENOME_BWA_DB_PREFIX { +// Return database path and prefix, construct if necessary +process CREATE_REF_GENOME_BWA_DB { label 'bwa_container' label 'farm_mid' @@ -17,7 +17,7 @@ process GET_REF_GENOME_BWA_DB_PREFIX { DB_LOCAL="$local" PREFIX="$prefix" - source get_ref_genome_bwa_db_prefix.sh + source create_ref_genome_bwa_db.sh """ } diff --git a/workflows/init.nf b/workflows/init.nf index 24e1deb..e92c2da 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -1,5 +1,5 @@ // Import process modules -include { GET_REF_GENOME_BWA_DB_PREFIX } from "$projectDir/modules/mapping" +include { CREATE_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" include { GET_KRAKEN_DB } from "$projectDir/modules/taxonomy" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS } from "$projectDir/modules/lineage" include { GET_SEROBA_DB; CREATE_SEROBA_DB } from "$projectDir/modules/serotype" @@ -8,7 +8,7 @@ include { GET_DOCKER_COMPOSE; PULL_IMAGES } from "$projectDir/modules/docker" // Alternative workflow for initialisation only workflow INIT { // Check Reference Genome BWA Database, generate from assembly if necessary - GET_REF_GENOME_BWA_DB_PREFIX(params.ref_genome, params.ref_genome_bwa_db_local) + CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Check Kraken2 Database, download if necessary kraken2_db = GET_KRAKEN_DB(params.kraken2_db_remote, params.kraken2_db_local) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 46e6a6f..f08f357 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -1,7 +1,7 @@ // Import process modules include { PREPROCESS; READ_QC } from "$projectDir/modules/preprocess" include { ASSEMBLY_UNICYCLER; ASSEMBLY_SHOVILL; ASSEMBLY_ASSESS; ASSEMBLY_QC } from "$projectDir/modules/assembly" -include { GET_REF_GENOME_BWA_DB_PREFIX; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" +include { CREATE_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" include { GET_KRAKEN_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" include { OVERALL_QC } from "$projectDir/modules/overall_qc" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/modules/lineage" @@ -12,8 +12,8 @@ include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; // Main pipeline workflow workflow PIPELINE { main: - // Get path to prefix of Reference Genome BWA Database, generate from assembly if necessary - ref_genome_bwa_db_prefix = GET_REF_GENOME_BWA_DB_PREFIX(params.ref_genome, params.ref_genome_bwa_db_local) + // Get path and prefix of Reference Genome BWA Database, generate from assembly if necessary + ref_genome_bwa_db = CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Get path to Kraken2 Database, download if necessary kraken2_db = GET_KRAKEN_DB(params.kraken2_db_remote, params.kraken2_db_local) @@ -73,7 +73,7 @@ workflow PIPELINE { // From Channel READ_QC_PASSED_READS_ch map reads to reference // Output into Channel MAPPING.out.sam - 
MAPPING(ref_genome_bwa_db_prefix, READ_QC_PASSED_READS_ch) + MAPPING(ref_genome_bwa_db, READ_QC_PASSED_READS_ch) // From Channel MAPPING.out.sam, Convert SAM into sorted BAM and calculate reference coverage // Output into Channels SAM_TO_SORTED_BAM.out.bam and SAM_TO_SORTED_BAM.out.ref_coverage @@ -204,7 +204,7 @@ workflow PIPELINE { ) // Pass to SAVE_INFO sub-workflow - DATABASES_INFO = ref_genome_bwa_db_prefix.map { it[0] } + DATABASES_INFO = ref_genome_bwa_db.map { it[0] } .merge(kraken2_db) .merge(seroba_db.map { it[0] }) .merge(poppunk_db.map { it[0] }) From 1b9851633627debc9836142e841a5da940df57de Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 12:30:39 +0000 Subject: [PATCH 008/157] Add MD5 check to BWA database Former-commit-id: 199e21324c6a793a16ad7ea4ec079df5aa18bbde --- bin/create_ref_genome_bwa_db.sh | 11 ++++++++--- bin/get_databases_info.sh | 3 ++- modules/info.nf | 23 ++++++++++++----------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/create_ref_genome_bwa_db.sh index e9f5225..aae6537 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -1,8 +1,13 @@ # Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. # If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to done_bwa_db.json -if [ ! -f ${DB_LOCAL}/done_bwa_db.json ] || \ - [ ! "$(grep 'reference' ${DB_LOCAL}/done_bwa_db.json | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE" ] || \ +JSON="done_bwa_db.json" + +REFERENCE_MD5=$(md5sum $REFERENCE | awk '{ print $1 }') + +if [ ! -f ${DB_LOCAL}/${JSON} ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE_MD5" ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.amb ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.ann ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.bwt ] || \ @@ -15,6 +20,6 @@ if [ ! -f ${DB_LOCAL}/done_bwa_db.json ] || \ mv ${PREFIX}.amb ${PREFIX}.ann ${PREFIX}.bwt ${PREFIX}.pac ${PREFIX}.sa -t $DB_LOCAL - echo -e "{\n \"reference\": \"$REFERENCE\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/done_bwa_db.json + echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON} fi diff --git a/bin/get_databases_info.sh b/bin/get_databases_info.sh index acc9258..45d126b 100755 --- a/bin/get_databases_info.sh +++ b/bin/get_databases_info.sh @@ -4,12 +4,13 @@ add_bwa_db () { BWA_DB_JSON=${BWA_DB_PATH}/done_bwa_db.json if [ -f "$BWA_DB_JSON" ]; then REFERENCE=$(jq -r .reference $BWA_DB_JSON) + REFERENCE_MD5=$(jq -r .reference_md5 $BWA_DB_JSON) CREATE_TIME=$(jq -r .create_time $BWA_DB_JSON) else REFERENCE="Not yet created" CREATE_TIME="Not yet created" fi - jq -n --arg ref "$REFERENCE" --arg create_time "$CREATE_TIME" '. = {"reference": $ref, "create_time": $create_time}' + jq -n --arg ref "$REFERENCE" --arg ref_md5 "$REFERENCE_MD5" --arg create_time "$CREATE_TIME" '. 
= {"reference": $ref, "reference_md5": $ref_md5, "create_time": $create_time}' } add_seroba_db () { diff --git a/modules/info.nf b/modules/info.nf index 5dadd65..dd456ed 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -162,38 +162,39 @@ process PARSE { |""".stripMargin() def dbTextRow = { leftContent, rightContent -> - textRow(9, 77, leftContent, rightContent) + textRow(13, 73, leftContent, rightContent) } dbText = """\ |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Databases Information ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ |╔═══════════════════════════════════════════════════════════════════════════════════════════╗ |║ BWA reference genome FM-index database ║ - |╟───────────┬───────────────────────────────────────────────────────────────────────────────╢ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Reference', json.bwa_db.reference)} + |${dbTextRow('Reference MD5', json.bwa_db.reference_md5)} |${dbTextRow('Created', json.bwa_db.create_time)} - |╠═══════════╧═══════════════════════════════════════════════════════════════════════════════╣ + |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ |║ Kraken 2 database ║ - |╟───────────┬───────────────────────────────────────────────────────────────────────────────╢ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.kraken2_db.url)} |${dbTextRow('Saved', json.kraken2_db.save_time)} - |╠═══════════╧═══════════════════════════════════════════════════════════════════════════════╣ + |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ |║ PopPUNK database ║ - |╟───────────┬───────────────────────────────────────────────────────────────────────────────╢ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.poppunnk_db.url)} |${dbTextRow('Saved', json.poppunnk_db.save_time)} - |╠═══════════╧═══════════════════════════════════════════════════════════════════════════════╣ + |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ |║ PopPUNK external clusters file ║ - |╟───────────┬───────────────────────────────────────────────────────────────────────────────╢ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.poppunk_ext.url)} |${dbTextRow('Saved', json.poppunk_ext.save_time)} - |╠═══════════╧═══════════════════════════════════════════════════════════════════════════════╣ + |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ |║ SeroBA database ║ - |╟───────────┬───────────────────────────────────────────────────────────────────────────────╢ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.seroba_db.git)} |${dbTextRow('Kmer size', json.seroba_db.kmer)} |${dbTextRow('Created', json.seroba_db.create_time)} - |╚═══════════╧═══════════════════════════════════════════════════════════════════════════════╝ + |╚═══════════════╧═══════════════════════════════════════════════════════════════════════════╝ |""".stripMargin() def getVersion = { tool -> From 7e8cfad4d5505b9ad44d58e06e46e39dba9d9762 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 13:41:38 +0000 Subject: [PATCH 009/157] Output ARIBA 
database info Former-commit-id: 92797594e9261dc669247a89b60bda709eac6eee --- bin/get_databases_info.sh | 30 +++++++++++++++++++++++++----- modules/amr.nf | 6 +++--- modules/info.nf | 22 ++++++++++++++++++++++ workflows/info_and_version.nf | 4 ++++ workflows/pipeline.nf | 8 +++++--- 5 files changed, 59 insertions(+), 11 deletions(-) diff --git a/bin/get_databases_info.sh b/bin/get_databases_info.sh index 45d126b..c87d56f 100755 --- a/bin/get_databases_info.sh +++ b/bin/get_databases_info.sh @@ -1,20 +1,39 @@ # Save received databases information into a JSON file add_bwa_db () { - BWA_DB_JSON=${BWA_DB_PATH}/done_bwa_db.json + BWA_DB_JSON=${BWA_DB_PATH}/${BWA_JSON} if [ -f "$BWA_DB_JSON" ]; then REFERENCE=$(jq -r .reference $BWA_DB_JSON) REFERENCE_MD5=$(jq -r .reference_md5 $BWA_DB_JSON) CREATE_TIME=$(jq -r .create_time $BWA_DB_JSON) else REFERENCE="Not yet created" + REFERENCE_MD5="Not yet created" CREATE_TIME="Not yet created" fi jq -n --arg ref "$REFERENCE" --arg ref_md5 "$REFERENCE_MD5" --arg create_time "$CREATE_TIME" '. = {"reference": $ref, "reference_md5": $ref_md5, "create_time": $create_time}' } +add_ariba_db () { + ARIBA_DB_JSON=${ARIBA_DB_PATH}/${ARIBA_JSON} + if [ -f "$ARIBA_DB_JSON" ]; then + REFERENCE=$(jq -r .reference $ARIBA_DB_JSON) + REFERENCE_MD5=$(jq -r .reference_md5 $ARIBA_DB_JSON) + METADATA=$(jq -r .metadata $ARIBA_DB_JSON) + METADATA_MD5=$(jq -r .metadata_md5 $ARIBA_DB_JSON) + CREATE_TIME=$(jq -r .create_time $ARIBA_DB_JSON) + else + REFERENCE="Not yet created" + REFERENCE_MD5="Not yet created" + METADATA="Not yet created" + METADATA_MD5="Not yet created" + CREATE_TIME="Not yet created" + fi + jq -n --arg ref "$REFERENCE" --arg ref_md5 "$REFERENCE_MD5" --arg meta "$METADATA" --arg meta_md5 "$METADATA_MD5" --arg create_time "$CREATE_TIME" '. 
= {"reference": $ref, "reference_md5": $ref_md5, "metadata": $meta, "metadata_md5": $meta_md5, "create_time": $create_time}' +} + add_seroba_db () { - SEROBA_DB_JSON=${SEROBA_DB_PATH}/done_seroba.json + SEROBA_DB_JSON=${SEROBA_DB_PATH}/${SEROBA_JSON} if [ -f "$SEROBA_DB_JSON" ]; then GIT=$(jq -r .git $SEROBA_DB_JSON) KMER=$(jq -r .kmer $SEROBA_DB_JSON) @@ -41,8 +60,9 @@ add_url_db () { jq -n \ --argjson bwa_db "$(add_bwa_db)" \ + --argjson ariba_db "$(add_ariba_db)" \ --argjson seroba_db "$(add_seroba_db)" \ - --argjson kraken2_db "$(add_url_db "${KRAKEN2_DB_PATH}/done_kraken.json")" \ - --argjson poppunnk_db "$(add_url_db "${POPPUNK_DB_PATH}/done_poppunk.json")" \ - --argjson poppunk_ext "$(add_url_db "${POPPUNK_DB_PATH}/done_poppunk_ext.json")" \ + --argjson kraken2_db "$(add_url_db "${KRAKEN2_DB_PATH}/${KRAKEN2_JSON}")" \ + --argjson poppunnk_db "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_JSON}")" \ + --argjson poppunk_ext "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_EXT_JSON}")" \ '$ARGS.named' > $JSON_FILE diff --git a/modules/amr.nf b/modules/amr.nf index bafff49..6507c41 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -50,7 +50,7 @@ process CREATE_ARIBA_DB { path local output: - path "${local}/${output}" + tuple path(local), val(output) script: output='database' @@ -72,7 +72,7 @@ process OTHER_RESISTANCE { tag "$sample_id" input: - path ariba_database + tuple path(ariba_database), val(database) tuple val(sample_id), path(read1), path(read2), path(unpaired) output: @@ -82,7 +82,7 @@ process OTHER_RESISTANCE { report='result/report.tsv' report_debug='result/debug.report.tsv' """ - ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembler spades $ariba_database $read1 $read2 result + ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembler spades $ariba_database/$database $read1 $read2 result """ } diff --git a/modules/info.nf b/modules/info.nf index dd456ed..4cabccb 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -29,6 +29,7 @@ process DATABASES { input: val bwa_db_path + val ariba_db_path val kraken2_db_path val seroba_db_path val poppunk_db_path @@ -38,11 +39,24 @@ process DATABASES { script: json='databases.json' + bwa_json='done_bwa_db.json' + ariba_json='done_ariba_db.json' + seroba_json='done_seroba.json' + kraken2_json='done_kraken.json' + poppunk_json='done_poppunk.json' + poppunk_ext_json='done_poppunk_ext.json' """ BWA_DB_PATH="$bwa_db_path" + BWA_JSON="$bwa_json" + ARIBA_DB_PATH="$ariba_db_path" + ARIBA_JSON="$ariba_json" KRAKEN2_DB_PATH="$kraken2_db_path" + KRAKEN2_JSON="$kraken2_json" SEROBA_DB_PATH="$seroba_db_path" + SEROBA_JSON="$seroba_json" POPPUNK_DB_PATH="$poppunk_db_path" + POPPUNK_JSON="$poppunk_json" + POPPUNK_EXT_JSON="$poppunk_ext_json" JSON_FILE="$json" source get_databases_info.sh @@ -194,6 +208,14 @@ process PARSE { |${dbTextRow('Source', json.seroba_db.git)} |${dbTextRow('Kmer size', json.seroba_db.kmer)} |${dbTextRow('Created', json.seroba_db.create_time)} + |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ + |║ ARIBA database ║ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ + |${dbTextRow('Reference', json.ariba_db.reference)} + |${dbTextRow('Reference MD5', json.ariba_db.reference_md5)} + |${dbTextRow('Metadata', json.ariba_db.metadata)} + |${dbTextRow('Metadata MD5', json.ariba_db.metadata_md5)} + |${dbTextRow('Created', json.ariba_db.create_time)} 
|╚═══════════════╧═══════════════════════════════════════════════════════════════════════════╝ |""".stripMargin() diff --git a/workflows/info_and_version.nf b/workflows/info_and_version.nf index 3808a57..bb5ce37 100644 --- a/workflows/info_and_version.nf +++ b/workflows/info_and_version.nf @@ -8,6 +8,7 @@ workflow PRINT_VERSION { main: GET_VERSION( params.ref_genome_bwa_db_local, + params.ariba_db_local, params.kraken2_db_local, params.seroba_local, params.poppunk_local, @@ -26,6 +27,7 @@ workflow SAVE_INFO { main: GET_VERSION( databases_info.bwa_db_path, + databases_info.ariba_db_path, databases_info.kraken2_db_path, databases_info.seroba_db_path, databases_info.poppunk_db_path, @@ -39,6 +41,7 @@ workflow SAVE_INFO { workflow GET_VERSION { take: bwa_db_path + ariba_db_path kraken2_db_path seroba_db_path poppunk_db_path @@ -49,6 +52,7 @@ workflow GET_VERSION { DATABASES( bwa_db_path, + ariba_db_path, kraken2_db_path, seroba_db_path, poppunk_db_path diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index f08f357..21a0593 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -205,6 +205,7 @@ workflow PIPELINE { // Pass to SAVE_INFO sub-workflow DATABASES_INFO = ref_genome_bwa_db.map { it[0] } + .merge(ariba_db.map { it[0] }) .merge(kraken2_db) .merge(seroba_db.map { it[0] }) .merge(poppunk_db.map { it[0] }) @@ -212,9 +213,10 @@ workflow PIPELINE { .map { [ bwa_db_path: it[0], - kraken2_db_path: it[1], - seroba_db_path: it[2], - poppunk_db_path: it[3] + ariba_db_path: it[1], + kraken2_db_path: it[2], + seroba_db_path: it[3], + poppunk_db_path: it[4] ] } From fd31c316cd6a5bbebf334cbb2c778d3e04abe4de Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 15:09:59 +0000 Subject: [PATCH 010/157] Improve DATABASES_INFO maintainability Former-commit-id: 9ebff00173b08cc4f05a0b45d54477566500a8d6 --- workflows/pipeline.nf | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 21a0593..d11726d 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -143,7 +143,6 @@ workflow PIPELINE { // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials // Output into Channel GET_OTHER_RESISTANCE.out.result OTHER_RESISTANCE(ariba_db, OVERALL_QC_PASSED_READS_ch) - OTHER_RESISTANCE.out.reports.view() GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports) // Generate results.csv by sorted sample_id based on merged Channels @@ -204,21 +203,14 @@ workflow PIPELINE { ) // Pass to SAVE_INFO sub-workflow - DATABASES_INFO = ref_genome_bwa_db.map { it[0] } - .merge(ariba_db.map { it[0] }) - .merge(kraken2_db) - .merge(seroba_db.map { it[0] }) - .merge(poppunk_db.map { it[0] }) - .merge(poppunk_ext_clusters) - .map { - [ - bwa_db_path: it[0], - ariba_db_path: it[1], - kraken2_db_path: it[2], - seroba_db_path: it[3], - poppunk_db_path: it[4] - ] - } + DATABASES_INFO = ref_genome_bwa_db.map { [["bwa_db_path", it[0]]] } + .merge(ariba_db.map { [["ariba_db_path", it[0]]] }) + .merge(kraken2_db.map { [["kraken2_db_path", it]] }) + .merge(seroba_db.map { [["seroba_db_path", it[0]]] }) + .merge(poppunk_db.map { [["poppunk_db_path", it[0]]] }) + .merge(poppunk_ext_clusters.map { [["poppunk_ext_path", it]] }) + // Save key-value tuples into a map + .map { it.collectEntries() } emit: databases_info = DATABASES_INFO From 671fce649e5dc6a988dfc005a56bb455b64b3c64 Mon Sep 17 00:00:00 2001 From: Harry Hung 
<4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 15:48:05 +0000 Subject: [PATCH 011/157] Improve code maintainability Former-commit-id: f8b857a959f6517e5a7807ba17f39c04593abfa6 --- bin/{get_kraken_db.sh => get_kraken2_db.sh} | 7 ++-- bin/taxonomy_qc.sh | 4 +-- modules/amr.nf | 6 ++-- modules/lineage.nf | 8 +++-- modules/mapping.nf | 6 ++-- modules/serotype.nf | 6 ++-- modules/taxonomy.nf | 20 ++++++------ workflows/init.nf | 4 +-- workflows/pipeline.nf | 36 ++++++++++----------- 9 files changed, 53 insertions(+), 44 deletions(-) rename bin/{get_kraken_db.sh => get_kraken2_db.sh} (88%) diff --git a/bin/get_kraken_db.sh b/bin/get_kraken2_db.sh similarity index 88% rename from bin/get_kraken_db.sh rename to bin/get_kraken2_db.sh index 862e868..cb42d7c 100755 --- a/bin/get_kraken_db.sh +++ b/bin/get_kraken2_db.sh @@ -2,6 +2,7 @@ # If not: remove files in database directory, download, and unzip to database directory, also save metadata to done_kraken.json DB_NAME=$(basename $DB_REMOTE) +ZIPPED_DB='kraken2_db.tar.gz' if [ ! -f ${DB_LOCAL}/done_kraken.json ] || \ [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/done_kraken.json)" ] || \ @@ -11,14 +12,14 @@ if [ ! -f ${DB_LOCAL}/done_kraken.json ] || \ rm -rf ${DB_LOCAL}/{,.[!.],..?}* - wget ${DB_REMOTE} -O kraken_db.tar.gz + wget ${DB_REMOTE} -O $ZIPPED_DB # Use tmp dir and find to ensure files are saved directly at $DB_LOCAL regardless of archive directory structure mkdir tmp - tar -xzf kraken_db.tar.gz -C tmp + tar -xzf $ZIPPED_DB -C tmp find tmp -type f -exec mv {} $DB_LOCAL \; - rm -f kraken_db.tar.gz + rm -f $ZIPPED_DB jq -n \ --arg url "${DB_REMOTE}" \ diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh index 232e61b..c468b14 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/taxonomy_qc.sh @@ -1,6 +1,6 @@ -# Extract taxonomy QC information and determine QC result based on kraken_report.txt +# Extract taxonomy QC information and determine QC result based on kraken2_report.txt -PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' $KRAKEN_REPORT) +PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' $KRAKEN2_REPORT) if [ -z "$PERCENTAGE" ]; then PERCENTAGE="0.00" diff --git a/modules/amr.nf b/modules/amr.nf index 6507c41..8d3fd53 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -50,7 +50,8 @@ process CREATE_ARIBA_DB { path local output: - tuple path(local), val(output) + path local, emit: path + val output, emit: database script: output='database' @@ -72,7 +73,8 @@ process OTHER_RESISTANCE { tag "$sample_id" input: - tuple path(ariba_database), val(database) + path ariba_database + val database tuple val(sample_id), path(read1), path(read2), path(unpaired) output: diff --git a/modules/lineage.nf b/modules/lineage.nf index 646077b..40156a3 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -8,7 +8,8 @@ process GET_POPPUNK_DB { path local output: - tuple path(local), env(DB_NAME) + path local, emit: path + env DB_NAME, emit: database script: """ @@ -29,7 +30,7 @@ process GET_POPPUNK_EXT_CLUSTERS { path local output: - env EXT_CLUSTERS_CSV + env EXT_CLUSTERS_CSV, emit: file script: """ @@ -52,7 +53,8 @@ process LINEAGE { tag 'All samples' input: - tuple path(poppunk_dir), val(db_name) + path poppunk_dir + val db_name val ext_clusters_file path qfile diff --git a/modules/mapping.nf b/modules/mapping.nf index c058954..c640544 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -8,7 +8,8 
@@ process CREATE_REF_GENOME_BWA_DB { path local output: - tuple path(local), val(prefix) + path(local), emit: path + val(prefix), emit: prefix script: prefix='reference' @@ -30,7 +31,8 @@ process MAPPING { tag "$sample_id" input: - tuple path(bwa_ref_db_dir), val(prefix) + path bwa_ref_db_dir + val prefix tuple val(sample_id), path(read1), path(read2), path(unpaired) output: diff --git a/modules/serotype.nf b/modules/serotype.nf index 2e3ff3d..764fceb 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -33,7 +33,8 @@ process CREATE_SEROBA_DB { val kmer output: - tuple path(local), val(database) + path local, emit: path + val database, emit: database script: database='database' @@ -56,7 +57,8 @@ process SEROTYPE { tag "$sample_id" input: - tuple path(seroba_dir), val(database) + path seroba_dir + val database tuple val(sample_id), path(read1), path(read2), path(unpaired) output: diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index aefa4ad..c76771c 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -1,5 +1,5 @@ // Return Kraken 2 database path, download if necessary -process GET_KRAKEN_DB { +process GET_KRAKEN2_DB { label 'bash_container' label 'farm_low' @@ -8,14 +8,14 @@ process GET_KRAKEN_DB { path local output: - path local + path local, emit: path script: """ DB_REMOTE="$remote" DB_LOCAL="$local" - source get_kraken_db.sh + source get_kraken2_db.sh """ } @@ -27,7 +27,7 @@ process TAXONOMY { tag "$sample_id" input: - path kraken_db + path kraken2_db val kraken2_memory_mapping tuple val(sample_id), path(read1), path(read2), path(unpaired) @@ -35,21 +35,21 @@ process TAXONOMY { tuple val(sample_id), path(report), emit: report script: - report='kraken_report.txt' + report='kraken2_report.txt' if (kraken2_memory_mapping === true) """ - kraken2 --threads `nproc` --use-names --memory-mapping --db "$kraken_db" --paired "$read1" "$read2" --report "$report" --output - + kraken2 --threads `nproc` --use-names --memory-mapping --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - """ else if (kraken2_memory_mapping === false) """ - kraken2 --threads `nproc` --use-names --db "$kraken_db" --paired "$read1" "$read2" --report "$report" --output - + kraken2 --threads `nproc` --use-names --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - """ else error "The value for --kraken2_memory_mapping is not valid." 
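     // Both branches above run the same command; --memory-mapping only changes how Kraken 2 reads
     // the database (mmap instead of loading it fully into RAM), trading speed for a lower memory footprint.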
} -// Extract taxonomy QC information and determine QC result based on kraken_report.txt +// Extract taxonomy QC information and determine QC result based on kraken2_report.txt process TAXONOMY_QC { label 'bash_container' label 'farm_low' @@ -57,7 +57,7 @@ process TAXONOMY_QC { tag "$sample_id" input: - tuple val(sample_id), path(kraken_report) + tuple val(sample_id), path(kraken2_report) val(qc_spneumo_percentage) output: @@ -66,7 +66,7 @@ process TAXONOMY_QC { script: """ - KRAKEN_REPORT="$kraken_report" + KRAKEN2_REPORT="$kraken2_report" QC_SPNEUMO_PERCENTAGE="$qc_spneumo_percentage" source taxonomy_qc.sh diff --git a/workflows/init.nf b/workflows/init.nf index e92c2da..0650922 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -1,6 +1,6 @@ // Import process modules include { CREATE_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" -include { GET_KRAKEN_DB } from "$projectDir/modules/taxonomy" +include { GET_KRAKEN2_DB } from "$projectDir/modules/taxonomy" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS } from "$projectDir/modules/lineage" include { GET_SEROBA_DB; CREATE_SEROBA_DB } from "$projectDir/modules/serotype" include { GET_DOCKER_COMPOSE; PULL_IMAGES } from "$projectDir/modules/docker" @@ -11,7 +11,7 @@ workflow INIT { CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Check Kraken2 Database, download if necessary - kraken2_db = GET_KRAKEN_DB(params.kraken2_db_remote, params.kraken2_db_local) + GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Check SeroBA Databases, clone and rebuild if necessary GET_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index d11726d..1aea8d7 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -2,7 +2,7 @@ include { PREPROCESS; READ_QC } from "$projectDir/modules/preprocess" include { ASSEMBLY_UNICYCLER; ASSEMBLY_SHOVILL; ASSEMBLY_ASSESS; ASSEMBLY_QC } from "$projectDir/modules/assembly" include { CREATE_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" -include { GET_KRAKEN_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" +include { GET_KRAKEN2_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" include { OVERALL_QC } from "$projectDir/modules/overall_qc" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/modules/lineage" include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" @@ -13,21 +13,21 @@ include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; workflow PIPELINE { main: // Get path and prefix of Reference Genome BWA Database, generate from assembly if necessary - ref_genome_bwa_db = CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Get path to Kraken2 Database, download if necessary - kraken2_db = GET_KRAKEN_DB(params.kraken2_db_remote, params.kraken2_db_local) + GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Get path to SeroBA Databases, clone and rebuild if necessary GET_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - seroba_db = CREATE_SEROBA_DB(params.seroba_remote, params.seroba_local, GET_SEROBA_DB.out.create_db, params.seroba_kmer) + CREATE_SEROBA_DB(params.seroba_remote, params.seroba_local, GET_SEROBA_DB.out.create_db, params.seroba_kmer) // 
Get paths to PopPUNK Database and External Clusters, download if necessary
-    poppunk_db = GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local)
-    poppunk_ext_clusters = GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local)
+    GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local)
+    GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local)
 
     // Get path to ARIBA database, generate from reference sequences and metadata if necessary
-    ariba_db = CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local)
+    CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local)
 
     // Get read pairs into Channel raw_read_pairs_ch
     raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true)
@@ -73,7 +73,7 @@ workflow PIPELINE {
 
     // From Channel READ_QC_PASSED_READS_ch map reads to reference
     // Output into Channel MAPPING.out.sam
-    MAPPING(ref_genome_bwa_db, READ_QC_PASSED_READS_ch)
+    MAPPING(CREATE_REF_GENOME_BWA_DB.out.path, CREATE_REF_GENOME_BWA_DB.out.prefix, READ_QC_PASSED_READS_ch)
 
     // From Channel MAPPING.out.sam, Convert SAM into sorted BAM and calculate reference coverage
     // Output into Channels SAM_TO_SORTED_BAM.out.bam and SAM_TO_SORTED_BAM.out.ref_coverage
@@ -94,7 +94,7 @@ workflow PIPELINE {
     // From Channel READ_QC_PASSED_READS_ch assess Streptococcus pneumoniae percentage in reads
     // Output into Channels TAXONOMY.out.detailed_result & TAXONOMY.out.result report
-    TAXONOMY(kraken2_db, params.kraken2_memory_mapping, READ_QC_PASSED_READS_ch)
+    TAXONOMY(GET_KRAKEN2_DB.out.path, params.kraken2_memory_mapping, READ_QC_PASSED_READS_ch)
 
     // From Channel TAXONOMY.out.report, provide taxonomy QC status
     // Output into Channels TAXONOMY_QC.out.detailed_result & TAXONOMY_QC.out.result report
@@ -125,11 +125,11 @@ workflow PIPELINE {
         .collectFile(name: 'qfile.txt', newLine: true)
 
     // From generated POPPUNK_QFILE, assign GPSC to samples passed overall QC
-    LINEAGE(poppunk_db, poppunk_ext_clusters, POPPUNK_QFILE)
+    LINEAGE(GET_POPPUNK_DB.out.path, GET_POPPUNK_DB.out.database, GET_POPPUNK_EXT_CLUSTERS.out.file, POPPUNK_QFILE)
 
     // From Channel OVERALL_QC_PASSED_READS_ch, serotype the preprocess reads of samples passed overall QC
     // Output into Channel SEROTYPE.out.result
-    SEROTYPE(seroba_db, OVERALL_QC_PASSED_READS_ch)
+    SEROTYPE(CREATE_SEROBA_DB.out.path, CREATE_SEROBA_DB.out.database, OVERALL_QC_PASSED_READS_ch)
 
     // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, PubMLST typing the assemblies of samples passed overall QC
     // Output into Channel MLST.out.result
@@ -142,7 +142,7 @@ workflow PIPELINE {
     // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials
     // Output into Channel GET_OTHER_RESISTANCE.out.result
-    OTHER_RESISTANCE(ariba_db, OVERALL_QC_PASSED_READS_ch)
+    OTHER_RESISTANCE(CREATE_ARIBA_DB.out.path, CREATE_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch)
     GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports)
 
     // Generate results.csv by sorted sample_id based on merged Channels
@@ -203,12 +203,12 @@
DATABASES_INFO = CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] }
+                        .merge(CREATE_ARIBA_DB.out.path.map { [["ariba_db_path", it]] })
+                        .merge(GET_KRAKEN2_DB.out.path.map { [["kraken2_db_path", it]] })
+                        .merge(CREATE_SEROBA_DB.out.path.map { [["seroba_db_path", it]] })
+                        .merge(GET_POPPUNK_DB.out.path.map { [["poppunk_db_path", it]] })
+                        .merge(GET_POPPUNK_EXT_CLUSTERS.out.file.map { [["poppunk_ext_file", it]] })
                         // Save key-value tuples into a map
                         .map { it.collectEntries() }
 
     emit:
         databases_info = DATABASES_INFO

From 81c0fc3072d9ff04557a89e8f3f960d553484732 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Fri, 30 Jun 2023 16:32:17 +0000
Subject: [PATCH 012/157] Include ARIBA database generation

Former-commit-id: d6ab8c63ce7367dd6c9af1b7ee4bbb5ae7484f67
---
 workflows/init.nf | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/workflows/init.nf b/workflows/init.nf
index 0650922..64a748f 100644
--- a/workflows/init.nf
+++ b/workflows/init.nf
@@ -4,12 +4,16 @@ include { GET_KRAKEN2_DB } from "$projectDir/modules/taxonomy"
 include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS } from "$projectDir/modules/lineage"
 include { GET_SEROBA_DB; CREATE_SEROBA_DB } from "$projectDir/modules/serotype"
 include { GET_DOCKER_COMPOSE; PULL_IMAGES } from "$projectDir/modules/docker"
+include { CREATE_ARIBA_DB } from "$projectDir/modules/amr"
 
 // Alternative workflow for initialisation only
 workflow INIT {
     // Check Reference Genome BWA Database, generate from assembly if necessary
     CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local)
 
+    // Check ARIBA database, generate from reference sequences and metadata if necessary
+    CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local)
+
     // Check Kraken2 Database, download if necessary
     GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local)

From 9e5eb5b9ca7f37241ebb5b2674c589f857c02d26 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Fri, 30 Jun 2023 16:40:03 +0000
Subject: [PATCH 013/157] Improve code maintainability

Former-commit-id: 4140943a45e80f5e2fc3235c8db3ca122aec8926
---
 bin/create_ariba_db.sh          | 16 +++++++---------
 bin/create_ref_genome_bwa_db.sh | 12 +++++-------
 bin/create_seroba_db.sh         |  4 ++--
 bin/get_kraken2_db.sh           |  8 ++++----
 bin/get_poppunk_db.sh           | 10 +++++-----
 bin/get_poppunk_ext_clusters.sh | 10 +++++-----
 bin/get_seroba_db.sh            |  8 ++++----
 modules/amr.nf                  |  2 ++
 modules/lineage.nf              |  4 ++++
 modules/mapping.nf              |  2 ++
 modules/serotype.nf             |  4 ++++
 modules/taxonomy.nf             |  2 ++
 12 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/bin/create_ariba_db.sh b/bin/create_ariba_db.sh
index 03d9997..073028e 100755
--- a/bin/create_ariba_db.sh
+++ b/bin/create_ariba_db.sh
@@ -1,16 +1,14 @@
 # Check if CREATE_ARIBA_DB has run successfully on the specific reference sequences and metadata.
-# If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to done_ariba_db.json
-
-JSON="done_ariba_db.json"
+# If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to JSON
 
 REF_SEQUENCES_MD5=$(md5sum $REF_SEQUENCES | awk '{ print $1 }')
 METADATA_MD5=$(md5sum $METADATA | awk '{ print $1 }')
 
-if [ ! -f ${DB_LOCAL}/${JSON} ] || \
-    [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES" ] || \
-    [ !
"$(grep '"reference_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES_MD5" ] || \ - [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA" ] || \ - [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA_MD5" ] || \ +if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES_MD5" ] || \ + [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA" ] || \ + [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA_MD5" ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/00.info.txt ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/00.version_info.txt ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log ] || \ @@ -29,6 +27,6 @@ if [ ! -f ${DB_LOCAL}/${JSON} ] || \ ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "$DB_LOCAL/$OUTPUT" - echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON} + echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} fi \ No newline at end of file diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/create_ref_genome_bwa_db.sh index aae6537..6cee335 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -1,13 +1,11 @@ # Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. -# If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to done_bwa_db.json - -JSON="done_bwa_db.json" +# If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to JSON REFERENCE_MD5=$(md5sum $REFERENCE | awk '{ print $1 }') -if [ ! -f ${DB_LOCAL}/${JSON} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE_MD5" ] || \ +if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE_MD5" ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.amb ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.ann ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.bwt ] || \ @@ -20,6 +18,6 @@ if [ ! 
-f ${DB_LOCAL}/${JSON} ] || \ mv ${PREFIX}.amb ${PREFIX}.ann ${PREFIX}.bwt ${PREFIX}.pac ${PREFIX}.sa -t $DB_LOCAL - echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON} + echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} fi diff --git a/bin/create_seroba_db.sh b/bin/create_seroba_db.sh index 44b3be4..21a058f 100755 --- a/bin/create_seroba_db.sh +++ b/bin/create_seroba_db.sh @@ -1,9 +1,9 @@ -# If create_db is true: re-create KMC and ARIBA databases, also save metadata to done_seroba.json +# If create_db is true: re-create KMC and ARIBA databases, also save metadata to JSON if [ $CREATE_DB = true ]; then seroba createDBs ${DB_LOCAL}/${DATABASE}/ ${KMER} - echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/done_seroba.json + echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} fi diff --git a/bin/get_kraken2_db.sh b/bin/get_kraken2_db.sh index cb42d7c..c53cc52 100755 --- a/bin/get_kraken2_db.sh +++ b/bin/get_kraken2_db.sh @@ -1,11 +1,11 @@ # Check if all file exists and were obtained from the database at the specific link. -# If not: remove files in database directory, download, and unzip to database directory, also save metadata to done_kraken.json +# If not: remove files in database directory, download, and unzip to database directory, also save metadata to JSON DB_NAME=$(basename $DB_REMOTE) ZIPPED_DB='kraken2_db.tar.gz' -if [ ! -f ${DB_LOCAL}/done_kraken.json ] || \ - [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/done_kraken.json)" ] || \ +if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ + [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/${JSON_FILE})" ] || \ [ ! -f ${DB_LOCAL}/hash.k2d ] || \ [ ! -f ${DB_LOCAL}/opts.k2d ] || \ [ ! -f ${DB_LOCAL}/taxo.k2d ]; then @@ -24,6 +24,6 @@ if [ ! -f ${DB_LOCAL}/done_kraken.json ] || \ jq -n \ --arg url "${DB_REMOTE}" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/done_kraken.json + '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/${JSON_FILE} fi diff --git a/bin/get_poppunk_db.sh b/bin/get_poppunk_db.sh index 33420b5..d4e705a 100755 --- a/bin/get_poppunk_db.sh +++ b/bin/get_poppunk_db.sh @@ -1,13 +1,13 @@ # Return PopPUNK database name # Check if all files exist and were obtained from the database at the specific link. -# If not: remove all sub-directories, download, and unzip to database directory, also save metadata to done_poppunk.json +# If not: remove all sub-directories, download, and unzip to database directory, also save metadata to JSON DB_NAME=$(basename "$DB_REMOTE" .tar.gz) DB_PATH=${DB_LOCAL}/${DB_NAME} -if [ ! -f ${DB_LOCAL}/done_poppunk.json ] || \ - [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/done_poppunk.json)" ] || \ +if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ + [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/${JSON_FILE})" ] || \ [ ! -f ${DB_PATH}/${DB_NAME}.h5 ] || \ [ ! -f ${DB_PATH}/${DB_NAME}.dists.npy ] || \ [ ! -f ${DB_PATH}/${DB_NAME}.dists.pkl ] || \ @@ -17,7 +17,7 @@ if [ ! -f ${DB_LOCAL}/done_poppunk.json ] || \ [ ! -f ${DB_PATH}/${DB_NAME}_clusters.csv ] || \ [ ! 
-f ${DB_PATH}/${DB_NAME}.refs ]; then
 
-    rm -rf ${DB_LOCAL}/done_poppunk.json
+    rm -rf ${DB_LOCAL}/${JSON_FILE}
     rm -rf ${DB_LOCAL}/*/
 
     wget $DB_REMOTE -O poppunk_db.tar.gz
@@ -27,6 +27,6 @@ if [ ! -f ${DB_LOCAL}/done_poppunk.json ] || \
     jq -n \
         --arg url "$DB_REMOTE" \
         --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \
-        '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/done_poppunk.json
+        '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/${JSON_FILE}
 
 fi
diff --git a/bin/get_poppunk_ext_clusters.sh b/bin/get_poppunk_ext_clusters.sh
index c971567..e330968 100755
--- a/bin/get_poppunk_ext_clusters.sh
+++ b/bin/get_poppunk_ext_clusters.sh
@@ -1,23 +1,23 @@
 # Return PopPUNK External Clusters file name
 
 # Check if specific external clusters file exists and was obtained from the specific link.
-# If not: remove all csv files, and download to database directory, also save metadata to done_poppunk_ext.json
+# If not: remove all csv files, and download to database directory, also save metadata to JSON
 
 EXT_CLUSTERS_CSV=$(basename "$EXT_CLUSTERS_REMOTE")
 EXT_CLUSTERS_NAME=$(basename "$EXT_CLUSTERS_REMOTE" .csv)
 
-if [ ! -f ${EXT_CLUSTERS_LOCAL}/done_poppunk_ext.json ] || \
-    [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url ${EXT_CLUSTERS_LOCAL}/done_poppunk_ext.json)" ] || \
+if [ ! -f ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} ] || \
+    [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url ${EXT_CLUSTERS_LOCAL}/${JSON_FILE})" ] || \
    [ ! -f ${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV} ]; then
 
     rm -f ${EXT_CLUSTERS_LOCAL}/*.csv
-    rm -f ${EXT_CLUSTERS_LOCAL}/done_${EXT_CLUSTERS_NAME}.json
+    rm -f ${EXT_CLUSTERS_LOCAL}/${JSON_FILE}
 
     wget $EXT_CLUSTERS_REMOTE -O ${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}
 
     jq -n \
         --arg url "$EXT_CLUSTERS_REMOTE" \
         --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \
         '{"url" : $url, "save_time": $save_time}' > ${EXT_CLUSTERS_LOCAL}/${JSON_FILE}
 
 fi
diff --git a/bin/get_seroba_db.sh b/bin/get_seroba_db.sh
index 1b2235e..736a99b 100755
--- a/bin/get_seroba_db.sh
+++ b/bin/get_seroba_db.sh
@@ -3,11 +3,11 @@
 # Check if GET_SEROBA_DB and CREATE_SEROBA_DB have run successfully on the database at the specific link and CREATE_SEROBA_DB used the specific Kmer, and pull to check if the SeroBA database is up-to-date.
 # If outdated or does not exist: remove files in database directory and clone, set CREATE_DB to true
-# Assume up-to-date if done_seroba.json passes checks and the host cannot be resolved to allow offline usage
+# Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage
 
-if [ ! -f ${DB_LOCAL}/done_seroba.json ] || \
-    [ ! "$(grep 'git' ${DB_LOCAL}/done_seroba.json | sed -r 's/.+: "(.*)",/\1/')" == "${DB_REMOTE}" ] || \
-    [ ! "$(grep 'kmer' ${DB_LOCAL}/done_seroba.json | sed -r 's/.+: "(.*)",/\1/')" == "${KMER}" ] || \
+if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \
+    [ ! "$(grep 'git' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "${DB_REMOTE}" ] || \
+    [ !
"$(grep 'kmer' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "${KMER}" ] || \ !((git -C ${DB_LOCAL} pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date'); then rm -rf ${DB_LOCAL}/{,.[!.],..?}* diff --git a/modules/amr.nf b/modules/amr.nf index 8d3fd53..feaa3ad 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -55,11 +55,13 @@ process CREATE_ARIBA_DB { script: output='database' + json='done_ariba_db.json' """ REF_SEQUENCES="$ref_sequences" METADATA="$metadata" DB_LOCAL="$local" OUTPUT="$output" + JSON_FILE="$json" source create_ariba_db.sh """ diff --git a/modules/lineage.nf b/modules/lineage.nf index 40156a3..6e13fab 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -12,9 +12,11 @@ process GET_POPPUNK_DB { env DB_NAME, emit: database script: + json='done_poppunk.json' """ DB_REMOTE="$db_remote" DB_LOCAL="$local" + JSON_FILE="$json" source get_poppunk_db.sh """ @@ -33,9 +35,11 @@ process GET_POPPUNK_EXT_CLUSTERS { env EXT_CLUSTERS_CSV, emit: file script: + json='done_poppunk_ext.json' """ EXT_CLUSTERS_REMOTE="$ext_clusters_remote" EXT_CLUSTERS_LOCAL="$local" + JSON_FILE="$json" source get_poppunk_ext_clusters.sh """ diff --git a/modules/mapping.nf b/modules/mapping.nf index c640544..f0d1e0e 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -13,10 +13,12 @@ process CREATE_REF_GENOME_BWA_DB { script: prefix='reference' + json='done_bwa_db.json' """ REFERENCE="$reference" DB_LOCAL="$local" PREFIX="$prefix" + JSON_FILE="$json" source create_ref_genome_bwa_db.sh """ diff --git a/modules/serotype.nf b/modules/serotype.nf index 764fceb..5c268fc 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -12,10 +12,12 @@ process GET_SEROBA_DB { env CREATE_DB, emit: create_db script: + json='done_seroba.json' """ DB_REMOTE="$remote" DB_LOCAL="$local" KMER="$kmer" + JSON_FILE="$json" source get_seroba_db.sh """ @@ -38,12 +40,14 @@ process CREATE_SEROBA_DB { script: database='database' + json='done_seroba.json' """ DATABASE="$database" DB_REMOTE="$remote" DB_LOCAL="$local" KMER="$kmer" CREATE_DB="$create_db" + JSON_FILE="$json" source create_seroba_db.sh """ diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index c76771c..af6266d 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -11,9 +11,11 @@ process GET_KRAKEN2_DB { path local, emit: path script: + json='done_kraken.json' """ DB_REMOTE="$remote" DB_LOCAL="$local" + JSON_FILE="$json" source get_kraken2_db.sh """ From ee1a725837386591f41ea30f5968af33ac95f2ed Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 4 Jul 2023 16:16:26 +0000 Subject: [PATCH 014/157] Update ARIBA database Former-commit-id: 3bbde9b138528817562ba23699797da2cd3a5972 --- ...230628.tsv => ariba_metadata-20230629.tsv} | 4 +- ...sta => ariba_ref_sequences-20230629.fasta} | 59 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) rename data/{ariba_metadata-20230628.tsv => ariba_metadata-20230629.tsv} (94%) rename data/{ariba_ref_sequences-20230628.fasta => ariba_ref_sequences-20230629.fasta} (95%) diff --git a/data/ariba_metadata-20230628.tsv b/data/ariba_metadata-20230629.tsv similarity index 94% rename from data/ariba_metadata-20230628.tsv rename to data/ariba_metadata-20230629.tsv index 9d35ca5..89ce2e8 100644 --- a/data/ariba_metadata-20230628.tsv +++ b/data/ariba_metadata-20230629.tsv @@ -77,4 +77,6 @@ vanC_AF162694 1 0 . . Vacomycin resistance (E.gallinarum) 23S_NZ_CP018347 0 1 A2115G . Macrolide:32347-35250 23S_NZ_CP018347 0 1 A2118G . 
Macrolide/Streptogramin:32347-35250 23S_NZ_CP018347 0 1 C2630A . Macrolide:32347-35250 -23S_NZ_CP018347 0 1 C2630G . Macrolide:32347-35250 \ No newline at end of file +23S_NZ_CP018347 0 1 C2630G . Macrolide:32347-35250 +rrgA_EF560637 1 0 . . Pili-1-(RlrA pilus-1-2279-4939) +pitB_GU256423 1 0 . . Pili-2-(pitB pilus-3504-4736) diff --git a/data/ariba_ref_sequences-20230628.fasta b/data/ariba_ref_sequences-20230629.fasta similarity index 95% rename from data/ariba_ref_sequences-20230628.fasta rename to data/ariba_ref_sequences-20230629.fasta index 4509177..5dffbd5 100644 --- a/data/ariba_ref_sequences-20230628.fasta +++ b/data/ariba_ref_sequences-20230629.fasta @@ -777,3 +777,62 @@ ccttctgccgtttcgctcgccgctactaaggcaatcgcttttgctttctcttcctgcagctacttagatgtttcagttca ctgcgtcttcctcctcacatccttaacagatgtgggtaacaggtattacctgttgggttcccccattcggaaatccctgg atcatcgcttacttacagctacccaaggtatatcgtcgtttgtcacgtccttcgtcggctcctagtgccaaggcatccac cgtgcgcccttattaacttaacct +>rrgA_EF560637 +ATGAAAAAAGTAAGAAAGATATTTCAGAAGGCAGTTGCAGGACTGTGCTGTATATCTCAGTTGACAGCTT +TTTCTTCGATAGTTGCTTTAGCAGAAACGCCTGAAACCAGTCCAGCGATAGGAAAAGTAGTGATTAAGGA +GACAGGCGAAGGAGGAGCGCTTCTAGGAGATGCCGTCTTTGAGTTGAAAAACAATACGAATGGCACAACT +GTTTCGCAAAGGACAGAGGCGCAAACAGGAGAAGCGATATTTTCAAACATAAAACCTGGGACATACACCT +TGACAGAAGCCCAACCTCCAGTTGGTTATAAACCCTCTACTAAACAACGGACTGTTGAAGTTGAGAAGAA +TGGTCGGACGACTGTCCAAGGTGAACAGGTAGAAAATCGAGAAGAGGCTCTATCTGACCAGTATCCACAA +ACAGGGACTTATCCAGATGTTCAAACACCTTATCAGATTATTAAGGTAGATGGTTCGGAAAAAAACGGAC +AGCACAAGGCGTTGAATCCGAATCCATATGAACGTGTGATTCCAGAAGGTACACTTTCAAAGAGAATTTA +TCAAGTGAATAATTTGGATGATAACCAATATGGAATCGAATTGACGGTTAGTGGGAAAACAGTGTATGAA +CGAAAAGATAAGTCTGTGCCGCTGGATGTCGTTATCTTGCTCGATAACTCAAATAGTATGAGTAACATTC +GAAACAAGAATGCTCGACGTGCGGAAAGAGCTGGTGAGGCGACACGTTCTCTTATTGATAAAATTACATC +TGATCCAGAAAATAGGGTAGCGCTTGTGACTTATGCTTCCACTATCTTTGATGGGACCGAGTTTACAGTA +GAAAAAGGGGTAGCAGATAAAAACGGAAAACGATTGAATGATTCTCTTTTTTGGAATTATGATCAGACGA +GTTTTACAACCAATACCAAAGATTATAGTTATTTAAAGCTGACTAATGATAAGAATGACATTGTAGAATT +AAAAAATAAGGTACCTACCGAGGCAGAAGACCATGATGGAAATAGATTGATGTACCAATTCGGTGCCACT +TTTACTCAGAAAGCTTTGATGAAGGCCGATGAGATTTTGACACAACAAGCGAGACAAAATAGTCAAAAAG +TCATTTTCCATATTACGGATGGTGTCCCAACTATGTCGTATCCGATTAATTTTAATCATGCTACGTTTGC +TCCATCATATCAAAATCAACTAAATGTATTTTTTAGTAAATCTCCTAATAAAGATGGAATACTATTAAGT +GATTTTATTACGCAAGCAACTAGTGGAGAACATACAATTGTACGCGGAGATGGGCAAAGTTACCAGATGT +TTACAGATAAGACAGTTTATGAAAAAGGTGCTCCTGCAGCTTTCCCAGTTAAACCTGAAAAATATTCTGA +AATGAAGGCGGTTGGTTATGCAGTTATAGGCGATCCAATTAATGGTGGATATATTTGGCTTAATTGGAGA +GAGAGTATTCTGGCTTATCCGTTTAATTCTAATACTGCTAAAATTACCAATCATGGTGACCCTACAAGAT +GGTACTATAACGGGAATATTGCTCCTGATGGGTATGATGTCTTTACGGTAGGTATTGGTATTAACGGAGA +TCCTGGTACGGATGAAGCAACGGCTACTAGTTTTATGCAAAGTATTTCTAGTAAACCTGAAAACTATACC +AATGTTACTGACACGACAAAAATATTGGAACAGTTGAATCGTTATTTCCACACCATCGTAACTGAAAAGA +AATCAATTGAGAATGGTACGATTACAGATCCGATGGGTGAGTTAATTGATTTGCAATTGGGCACAGATGG +AAGATTTGATCCAGCAGATTACACTTTAACTGCAAACGATGGTAGTCGCTTGGAGAATGGACAAGCTGTA +GGTGGTCCACAAAATGATGGTGGCTTGCTAAAAAATGCAAAAGTGTTCTATGATACGACTGAGAAAAGGA +TTCGTGTAACAGGTTTGTACCTTGGAACGGGTGAAAAAGTTACATTGACTTATAATGTTCGCTTGAATGA +CCAATTTGTAAGCAATAAATTCTATGACACGAATGGTCGAACAACCCTACACCCTAAGGAAGTAGAAAAG +AACACAGTGCGCGACTTCCCGATTCCTAAGATTCGTGATGTGCGAAAATATCCAGCAATTACGATTGCAA +AAGAGAAAAAACTTGGTGAAATTGAGTTTATTAAGATCAATAAGAATGATAAAAAACCACTGAGAGATGC +GGTCTTTAGTCTTCAAAAACAACATCCGGATTATCCAGATATTTATGGAGCTATTGATCAAAATGGCACT +TATCAAAATGTGAGAACAGGTGAAGATGGTAAGTTGACCTTTAAAAATCTGTCAGATGGGAAATATCGAT +TATTTGAAAATTCTGAACCAGCTGGTTATAAACCCGTTCAAAATAAGCCTATCGTTGCCTTCCAAATAGT 
+AAATGGAGAAGTCAGAGATGTGACTTCAATCGTTCCACAAGATATACCAGCGGGTTACGAGTTTACGAAT +GATAAGCACTATATTACCAATGAACCTATTCCTCCAAAGAGAGAATATCCTCGAACTGGTGGTATCGGAA +TGTTGCTATTCTATCTGATAGGTTGCATGATGATGGGAGGAGTTCTATTATACACACGGAAACATCCGTA +A +>pitB_GU256423 +ATGAAAAAAGAAAATAAAAAAACAAAAGAAATAATCATGAAAAAAACATTCTTTAAAAAGCTATTCACTG +CAAGCATTGCAGCTATAACCGCTTTGTCCGTATTCAGAGGTGTCCCGACTTTTGCGGATGATAATTCAGC +AATAACCAAAGCAAATGGTGAAAATAATGCTGTTGTGAAGATTAATAAAACGTTGAATATTGCAGAGGGA +ATAACAACACCAACAGCGACATTTACATTTAAGTTTACAGAAAAAACAGGACAATCTTCTAACGGTGCGC +CATATCAAACCGGAGTTGCAATTCCAGATAGAAATGTAGAATACAATAAAAATGATCACCCAACTGCTGA +TAAGATTCAAAAAGCAACAGAAGACATTTTTTCGGGAGTTGCTTATGGCCATGCTGGTGAATACGTTTAT +GATGTAGCGGAAGCAAAAACTGGATGGCAGGCGATTACCAAAAATGGTAAAACAATTGATGCCATGAGAT +ACGACAAACGTACATATGAAATGCACGTTATTGTTAAGAATAAAGTAAATGGTGGTGTCTATATTTCATC +AGTATACTTTAAGGAAAATAATAAATCTAACGCCCCTAAAGTAGAACCAAGTGAACAAGGCGTTTATAAT +TTATTTGATAACACATATACCAAAGACGCAAGTAAGGAGCCTAATCCTGATGATCCGAGTCAAGTAGACC +CCAATGCGAAAGCATTAACAATTACTAAAAAAGTTGATGGAGCTTCAGGGGATAAAACAAGAGATTTCCA +ATTCCATATCAAGATTCAACTTCCAAGTACAAATAAAACAGCAGAAACCCCTGTTACGAATATTATAGTA +AAACATGGATCTAAGTCAGAGGTGTTGGCAGTAGTGACCCCAGCAGATACAGTTGAGTACAATTTTACTC +TTAAAGATGGTGAAACATTTACAGTTGAACAACTACCAGCAGGTTCTAAATATACAGTAACTGAAACTGG +AGTAGCAGGTTATACAGATTCATCAATTTATACTACAAATGGTGCAGAACAAACATCTCAAGGACAAAAA +AATGTAGATTTTACATTAACAGATATCCTCATAGGTGAAAAGAAAAACGACAACAAAGTTACTAACAAAA +TCGACGACGTTACTCCTACTGGTCTCTTGATTGATAACCTTCCATTCATTTTGATGATTGGTCTTGGTTT +GGCTGGATTTGTTGTCTTGTCTAAAAAACGTAGAGAAGCCTAA From dec7f680fc0bba6ed3e6478b3ee7e95d35e2c630 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 5 Jul 2023 10:45:46 +0000 Subject: [PATCH 015/157] Update default ARIBA reference files Former-commit-id: bb3c15889772d749bd2cae42d11ebd229513efc3 --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 191a4e5..ba6ff1a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -48,8 +48,8 @@ params { depth = 20.00 // Default ARIBA referece sequences and metadata paths, and local directory for its generated database - ariba_ref = "$projectDir/data/ariba_ref_sequences-20230628.fasta" - ariba_metadata = "$projectDir/data/ariba_metadata-20230628.tsv" + ariba_ref = "$projectDir/data/ariba_ref_sequences-20230629.fasta" + ariba_metadata = "$projectDir/data/ariba_metadata-20230629.tsv" ariba_db_local = "$projectDir/databases/ariba" // Toggle for removing .bam and .sam files mid-run to reduce storage requirement From e5ca3e629e392dcc3b0703c1b3c86d6bc330b0a8 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:26:14 +0000 Subject: [PATCH 016/157] Remove unused script Former-commit-id: 0a898f8946925b38fc27b4c9ea6cf34e27b0b9aa --- bin/get_other_resistance.sh | 63 ------------------------------------- 1 file changed, 63 deletions(-) delete mode 100755 bin/get_other_resistance.sh diff --git a/bin/get_other_resistance.sh b/bin/get_other_resistance.sh deleted file mode 100755 index 610059d..0000000 --- a/bin/get_other_resistance.sh +++ /dev/null @@ -1,63 +0,0 @@ -# Extract the results from the output file of the AMRsearch - -# For resistances, change NOT_FOUND to S, lower cases to upper cases, SENSITIVE to S, INTERMEDIATE to I, RESISTANT to R, null or space-only string to empty string -# For determinants, determinants are sorted and separated by "; ", and no determinant is output as "_". 
Each acquired gene is output as "*gene*", each variant is output as "*gene*_*variant*" - -function GET_RES { - echo $( < $JSON_FILE jq -r --arg target "$1" '.resistanceProfile[] | select( .agent.key == $target ) | .state' \ - | tr '[:lower:]' '[:upper:]' \ - | sed 's/^NOT_FOUND$/S/g;s/^SENSITIVE$/S/g;s/^INTERMEDIATE$/I/g;s/^RESISTANT$/R/g;s/^null$//g;s/^\s+$//g' ) -} - -function GET_DETERMINANTS { - DETERMINANTS=() - - ACQUIRED=( $(< $JSON_FILE jq -r --arg target "$1" '.resistanceProfile[] | select( .agent.key == $target ) | .determinants | .acquired | map(.gene)[]') ) - VARIANTS=( $(< $JSON_FILE jq -r --arg target "$1" '.resistanceProfile[] | select( .agent.key == $target ) | .determinants | .variants | map(.gene + "_" +.variant)[]') ) - - if (( ${#ACQUIRED[@]} != 0 )); then - DETERMINANTS+=( "${ACQUIRED[@]}" ) - fi - - if (( ${#VARIANTS[@]} != 0 )); then - DETERMINANTS+=( "${VARIANTS[@]}" ) - fi - - if (( ${#DETERMINANTS[@]} == 0 )); then - DETERMINANTS+=("_") - fi - - IFS=$'\n' SORTED_DETERMINANTS=($(sort -f <<<"${DETERMINANTS[*]}")); unset IFS - printf -v JOINED_DETERMINANTS '; %s' "${SORTED_DETERMINANTS[@]}" - echo ${JOINED_DETERMINANTS:2} -} - -CHL_RES=$(GET_RES "CHL") -CHL_DETERMINANTS=$(GET_DETERMINANTS "CHL") - -CLI_RES=$(GET_RES "CLI") -CLI_DETERMINANTS=$(GET_DETERMINANTS "CLI") - -ERY_RES=$(GET_RES "ERY") -ERY_DETERMINANTS=$(GET_DETERMINANTS "ERY") - -FQ_RES=$(GET_RES "FLQ") -FQ_DETERMINANTS=$(GET_DETERMINANTS "FLQ") - -KAN_RES=$(GET_RES "KAN") -KAN_DETERMINANTS=$(GET_DETERMINANTS "KAN") - -LZO_RES=$(GET_RES "LNZ") -LZO_DETERMINANTS=$(GET_DETERMINANTS "LNZ") - -TET_RES=$(GET_RES "TCY") -TET_DETERMINANTS=$(GET_DETERMINANTS "TCY") - -TMP_RES=$(GET_RES "TMP") -TMP_DETERMINANTS=$(GET_DETERMINANTS "TMP") - -SMX_RES=$(GET_RES "SSS") -SMX_DETERMINANTS=$(GET_DETERMINANTS "SSS") - -COT_RES=$(GET_RES "SXT") -COT_DETERMINANTS=$(GET_DETERMINANTS "SXT") From 3feab10b3985a1cc85bad0cc13ddc8c7d437a8c0 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:30:55 +0000 Subject: [PATCH 017/157] Initial work on extracting AMR from ARIBA report Former-commit-id: 3ec708917092711711520e4498d4775d00b51ba4 --- bin/get_other_resistance.py | 43 +++++++++++++++++++++++++++++++++++++ modules/amr.nf | 5 +++-- workflows/pipeline.nf | 2 +- 3 files changed, 47 insertions(+), 3 deletions(-) create mode 100755 bin/get_other_resistance.py diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py new file mode 100755 index 0000000..df0899e --- /dev/null +++ b/bin/get_other_resistance.py @@ -0,0 +1,43 @@ +#! 
/usr/bin/env python3 + +import sys + +report_path = sys.argv[1] +metadata_path = sys.argv[2] + +with open(report_path) as report, open(metadata_path) as metadata: + # Save (reference, gene, var_only) combination found in metadata + gene_dict = {} + # Save drug found in metadata + drug_set = set() + + # Skip the header in metadata + next(metadata) + # Go through lines and save findings to gene_dict and drug_set + lines = [line.strip() for line in metadata] + for line in lines: + fields = line.split("\t") + reference, gene, var_only, var_change, _, drug = fields + gene_dict[(reference, gene, var_only)] = {"var_change": var_change, "drug": drug} + drug_set.add(drug) + + # Skip the header in report + next(report) + # Go through lines to detect targets + lines = [line.strip() for line in report] + for line in lines: + # Extract useful fields + fields = line.split("\t") + ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17] + + # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line + if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: + continue + + # WIP + gene_dict_key = (ref_name, gene, var_only) + if gene_dict_key in gene_dict: + if var_only == 0: + print(gene_dict[gene_dict_key]) + if var_only == 1 and gene_dict[gene_dict_key]['var_change'] == known_var_change and has_known_var == 1: + print(gene_dict[gene_dict_key]) diff --git a/modules/amr.nf b/modules/amr.nf index feaa3ad..905fd2b 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -92,16 +92,17 @@ process OTHER_RESISTANCE { // WIP, for extracting information from ARIBA report process GET_OTHER_RESISTANCE { - label 'bash_container' + label 'python_container' label 'farm_low' tag "$sample_id" input: tuple val(sample_id), path(report), path(report_debug) + path metadata script: """ - # TBC + get_other_resistance.py "$report_debug" "$metadata" """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 1aea8d7..811da5a 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -143,7 +143,7 @@ workflow PIPELINE { // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials // Output into Channel GET_OTHER_RESISTANCE.out.result OTHER_RESISTANCE(CREATE_ARIBA_DB.out.path, CREATE_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) - GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports) + GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) // Generate results.csv by sorted sample_id based on merged Channels // READ_QC.out.result, ASSEMBLY_QC.out.result, MAPPING_QC.out.result, TAXONOMY_QC.out.result, OVERALL_QC.out.result, From fe57162c1a2a4032c6288105d5483e7f6d46b04d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:31:29 +0000 Subject: [PATCH 018/157] Save AMR info to FreeText_Drug column Former-commit-id: 67277d0022fbb2ef53e8d492d61bb5cd47718e02 --- data/ariba_metadata-20230629.tsv | 162 +++++++++++++++---------------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/data/ariba_metadata-20230629.tsv b/data/ariba_metadata-20230629.tsv index 89ce2e8..88c3a4f 100644 --- a/data/ariba_metadata-20230629.tsv +++ b/data/ariba_metadata-20230629.tsv @@ -1,82 +1,82 @@ reference coding_yes(1)_no(0) pr/ab(0)_var(1) description of the variant Group FreeText_Drug 
-aph_3prime_III_1_M26832 1 0 . . Kanamycin resistance -ermB_1_JN899585 1 0 . . Erythromycin and Clindamycin resistance -ermB_10_U86375 1 0 . . Erythromycin and Clindamycin resistance -ermB_16_X82819 1 0 . . Erythromycin and Clindamycin resistance -ermB_20_AF109075 1 0 . . Erythromycin and Clindamycin resistance -ermC_13_M13761 1 0 . . Erythromycin and Clindamycin resistance -cat_5_U35036 1 0 . . Chloramphenicol resistance -catpC194_1_NC_002013 1 0 . . Chloramphenicol resistance -catpC233_1_AY355285 1 0 . . Chloramphenicol resistance -catQ_1_M55620 1 0 . . Chloramphenicol resistance -msrD_2_AF274302 1 0 . . Erythromycin resistance -msrD_3_AF227520 1 0 . . Erythromycin resistance -mefA_10_AF376746 1 0 . . Erythromycin resistance -mefE_AE007317 1 0 . . Erythromycin resistance -tetM_1_X92947 1 0 . . Tetracycline resistance -tetM_12_FR671418 1 0 . . Tetracycline resistance -tetK_4_U38428 1 0 . . Tetracycline resistance -tetM_13_AM990992 1 0 . . Tetracycline resistance -tetM_2_X90939 1 0 . . Tetracycline resistance -tetM_4_X75073 1 0 . . Tetracycline resistance -tetM_5_U58985 1 0 . . Tetracycline resistance -tetM_8_X04388 1 0 . . Tetracycline resistance -tetS_M 1 0 . . Tetracycline resistance -tetS_M_MH283012 1 0 . . Tetracycline resistance -tetAp_L20800 1 0 . . Tetracycline resistance -tetBp_L20800 1 0 . . Tetracycline resistance -tetAQ2_Z21523 1 0 . . Tetracycline resistance -tetS_FN555436 1 0 . . Tetracycline resistance -tetT_L42544 1 0 . . Tetracycline resistance -tetW_AJ222769 1 0 . . Tetracycline resistance -tet32_AJ295238 1 0 . . Tetracycline resistance -tet36_AJ514254 1 0 . . Tetracycline resistance -tet44_FN594949 1 0 . . Tetracycline resistance -tet58_KY887560 1 0 . . Tetracycline resistance -tet_M74049 1 0 . . Tetracycline resistance -tetS_M_HM367711 1 0 . . Tetracycline resistance -tetS_M_AY534326 1 0 . . Tetracycline resistance -tetM_M85225 1 0 . . Tetracycline resistance -tetS_FN555436 1 0 . . Tetracycline resistance -tetM_MH283017 1 0 . . tetracycline resistance -folA_AE007317 1 1 I100L . "proteinID-AAL00232.1, Trimethoprim" -folP_AE007317 1 1 . . "proteinID-AAK99071.1, Sulfamethoxazole resistance on if insertions in 56-67 amino acids" -gyrA_AE007317 1 1 S81F . Fluoroquinolone -gyrA_AE007317 1 1 S81Y . Fluoroquinolone -gyrA_AE007317 1 1 S81C . Fluoroquinolone -gyrA_AE007317 1 1 S81I . Fluoroquinolone -gyrA_AE007317 1 1 E85K . Fluoroquinolone -gyrA_AE007317 1 1 Q118A . Fluoroquinolone -gyrB_AE007317 1 1 E474K . Fluoroquinolone -parC_AE007317 1 1 A63T . Fluoroquinolone -parC_AE007317 1 1 S79F . Fluoroquinolone -parC_AE007317 1 1 S79Y . Fluoroquinolone -parC_AE007317 1 1 S79L . Fluoroquinolone -parC_AE007317 1 1 S79F . Fluoroquinolone -parC_AE007317 1 1 D83G . Fluoroquinolone -parC_AE007317 1 1 D83N . Fluoroquinolone -parE_AE007317 1 1 E474K . Fluoroquinolone -parE_AE007317 1 1 D435N . Fluoroquinolone -parE_AE007317 1 1 D435H . Fluoroquinolone -parE_AE007317 1 1 P454S . Fluoroquinolone -tetO_Y07780 1 0 . . Tetracycline resistance -ermBups_HG799494 0 0 . . Erythromycin and Clindamycin resistance -ermbTr_CP002121 0 0 . . Erythromycin and Clindamycin resistance -rplD_AE007317 1 1 . . Linezolid resistance (deletion within the L4 region of the gene ) -rpoB_AE007317 1 1 D489E . rifampicin resistance -D415E -rpoB_AE007317 1 1 H499N . rifampicin resistance -H425N -rpoB_AE007317 1 1 D489N . rifampicin resistance -H415N -vanB_KC489787 1 0 . . Vacomycin resistance -vanD_EU999036 1 0 . . Vacomycin resistance -vanE_FJ872411 1 0 . . Vacomycin resistance -vanG_KF704242 1 0 . . 
Vacomycin resistance -otrA_X53401 1 0 . . Tetracycline resistance -vanA_M97297 1 0 . . Vacomycin resistance (E.faecium) -vanC_AF162694 1 0 . . Vacomycin resistance (E.gallinarum) -23S_NZ_CP018347 0 1 A2114G . Macrolide:32347-35250 -23S_NZ_CP018347 0 1 A2115G . Macrolide:32347-35250 -23S_NZ_CP018347 0 1 A2118G . Macrolide/Streptogramin:32347-35250 -23S_NZ_CP018347 0 1 C2630A . Macrolide:32347-35250 -23S_NZ_CP018347 0 1 C2630G . Macrolide:32347-35250 -rrgA_EF560637 1 0 . . Pili-1-(RlrA pilus-1-2279-4939) -pitB_GU256423 1 0 . . Pili-2-(pitB pilus-3504-4736) +aph_3prime_III_1_M26832 1 0 . . KAN +ermB_1_JN899585 1 0 . . ERY CLI +ermB_10_U86375 1 0 . . ERY CLI +ermB_16_X82819 1 0 . . ERY CLI +ermB_20_AF109075 1 0 . . ERY CLI +ermC_13_M13761 1 0 . . ERY CLI +cat_5_U35036 1 0 . . CHL +catpC194_1_NC_002013 1 0 . . CHL +catpC233_1_AY355285 1 0 . . CHL +catQ_1_M55620 1 0 . . CHL +msrD_2_AF274302 1 0 . . ERY +msrD_3_AF227520 1 0 . . ERY +mefA_10_AF376746 1 0 . . ERY +mefE_AE007317 1 0 . . ERY +tetM_1_X92947 1 0 . . TET +tetM_12_FR671418 1 0 . . TET +tetK_4_U38428 1 0 . . TET +tetM_13_AM990992 1 0 . . TET +tetM_2_X90939 1 0 . . TET +tetM_4_X75073 1 0 . . TET +tetM_5_U58985 1 0 . . TET +tetM_8_X04388 1 0 . . TET +tetS_M 1 0 . . TET +tetS_M_MH283012 1 0 . . TET +tetAp_L20800 1 0 . . TET +tetBp_L20800 1 0 . . TET +tetAQ2_Z21523 1 0 . . TET +tetS_FN555436 1 0 . . TET +tetT_L42544 1 0 . . TET +tetW_AJ222769 1 0 . . TET +tet32_AJ295238 1 0 . . TET +tet36_AJ514254 1 0 . . TET +tet44_FN594949 1 0 . . TET +tet58_KY887560 1 0 . . TET +tet_M74049 1 0 . . TET +tetS_M_HM367711 1 0 . . TET +tetS_M_AY534326 1 0 . . TET +tetM_M85225 1 0 . . TET +tetS_FN555436 1 0 . . TET +tetM_MH283017 1 0 . . TET +folA_AE007317 1 1 I100L . TMP +folP_AE007317 1 1 . . SMX +gyrA_AE007317 1 1 S81F . FLQ +gyrA_AE007317 1 1 S81Y . FLQ +gyrA_AE007317 1 1 S81C . FLQ +gyrA_AE007317 1 1 S81I . FLQ +gyrA_AE007317 1 1 E85K . FLQ +gyrA_AE007317 1 1 Q118A . FLQ +gyrB_AE007317 1 1 E474K . FLQ +parC_AE007317 1 1 A63T . FLQ +parC_AE007317 1 1 S79F . FLQ +parC_AE007317 1 1 S79Y . FLQ +parC_AE007317 1 1 S79L . FLQ +parC_AE007317 1 1 S79F . FLQ +parC_AE007317 1 1 D83G . FLQ +parC_AE007317 1 1 D83N . FLQ +parE_AE007317 1 1 E474K . FLQ +parE_AE007317 1 1 D435N . FLQ +parE_AE007317 1 1 D435H . FLQ +parE_AE007317 1 1 P454S . FLQ +tetO_Y07780 1 0 . . TET +ermBups_HG799494 0 0 . . ERY +ermbTr_CP002121 0 0 . . ERY +rplD_AE007317 1 1 . . LNZ +rpoB_AE007317 1 1 D489E . RIF +rpoB_AE007317 1 1 H499N . RIF +rpoB_AE007317 1 1 D489N . RIF +vanB_KC489787 1 0 . . VAN +vanD_EU999036 1 0 . . VAN +vanE_FJ872411 1 0 . . VAN +vanG_KF704242 1 0 . . VAN +otrA_X53401 1 0 . . TET +vanA_M97297 1 0 . . TET +vanC_AF162694 1 0 . . TET +23S_NZ_CP018347 0 1 A2114G . ERY +23S_NZ_CP018347 0 1 A2115G . ERY +23S_NZ_CP018347 0 1 A2118G . ERY +23S_NZ_CP018347 0 1 C2630A . ERY +23S_NZ_CP018347 0 1 C2630G . ERY +rrgA_EF560637 1 0 . . PILI-1 +pitB_GU256423 1 0 . . 
PILI-2

From aa3d09b8e2d0f50ddf6d286d1c234ffddc31e212 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Thu, 6 Jul 2023 12:13:30 +0000
Subject: [PATCH 019/157] Revert ARIBA assembler back to default

Former-commit-id: ad3cdc824697aecb1b0704a0cb489d5279e97083
---
 modules/amr.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/amr.nf b/modules/amr.nf
index 905fd2b..0079b1e 100644
--- a/modules/amr.nf
+++ b/modules/amr.nf
@@ -86,7 +86,7 @@ process OTHER_RESISTANCE {
     report='result/report.tsv'
     report_debug='result/debug.report.tsv'
     """
-    ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembler spades $ariba_database/$database $read1 $read2 result
+    ariba run --nucmer_min_id 80 --assembled_threshold 0.80 $ariba_database/$database $read1 $read2 result
     """
 }

From 840b26208fcafe0c7d93b788207b2397971d8278 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Fri, 7 Jul 2023 09:09:53 +0000
Subject: [PATCH 020/157] Ensure type matching of variable comparison

Former-commit-id: 2f8c6dc502140522f3583fbe75895d353aa24f1c
---
 bin/get_other_resistance.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py
index df0899e..56ae244 100755
--- a/bin/get_other_resistance.py
+++ b/bin/get_other_resistance.py
@@ -13,21 +13,21 @@

     # Skip the header in metadata
     next(metadata)
-    # Go through lines and save findings to gene_dict and drug_set
+    # Go through lines in metadata and save findings to gene_dict and drug_set
     lines = [line.strip() for line in metadata]
     for line in lines:
-        fields = line.split("\t")
+        fields = [str(field) for field in line.split("\t")]
         reference, gene, var_only, var_change, _, drug = fields
         gene_dict[(reference, gene, var_only)] = {"var_change": var_change, "drug": drug}
         drug_set.add(drug)

     # Skip the header in report
     next(report)
-    # Go through lines to detect targets
+    # Go through lines in report to detect targets
     lines = [line.strip() for line in report]
     for line in lines:
         # Extract useful fields
-        fields = line.split("\t")
+        fields = [str(field) for field in line.split("\t")]
         ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17]

         # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line
@@ -37,7 +37,7 @@
         # WIP
         gene_dict_key = (ref_name, gene, var_only)
         if gene_dict_key in gene_dict:
-            if var_only == 0:
-                print(gene_dict[gene_dict_key])
-            if var_only == 1 and gene_dict[gene_dict_key]['var_change'] == known_var_change and has_known_var == 1:
-                print(gene_dict[gene_dict_key])
+            if var_only == "0":
+                print(ref_name, gene_dict[gene_dict_key])
+            if var_only == "1" and gene_dict[gene_dict_key]['var_change'] == known_var_change and has_known_var == "1":
+                print(ref_name, gene_dict[gene_dict_key])

From 82eb910b04f43e7bb5b8a6f40eb4108ef7485a44 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Thu, 13 Jul 2023 09:18:13 +0000
Subject: [PATCH 021/157] Remove LNZ from ARIBA database

Former-commit-id: 7bf53626f236254d8683b31eb6f64fa933dc1582
---
 ...tadata-20230629.tsv => ariba_metadata-20230712.tsv} |  1 -
 ...230629.fasta => ariba_ref_sequences-20230712.fasta} | 10 ----------
 nextflow.config                                        |  6 +++---
 3 files changed, 3 insertions(+), 14 deletions(-)
 rename data/{ariba_metadata-20230629.tsv =>
ariba_metadata-20230712.tsv} (95%) rename data/{ariba_ref_sequences-20230629.fasta => ariba_ref_sequences-20230712.fasta} (99%) diff --git a/data/ariba_metadata-20230629.tsv b/data/ariba_metadata-20230712.tsv similarity index 95% rename from data/ariba_metadata-20230629.tsv rename to data/ariba_metadata-20230712.tsv index 88c3a4f..de44254 100644 --- a/data/ariba_metadata-20230629.tsv +++ b/data/ariba_metadata-20230712.tsv @@ -62,7 +62,6 @@ parE_AE007317 1 1 P454S . FLQ tetO_Y07780 1 0 . . TET ermBups_HG799494 0 0 . . ERY ermbTr_CP002121 0 0 . . ERY -rplD_AE007317 1 1 . . LNZ rpoB_AE007317 1 1 D489E . RIF rpoB_AE007317 1 1 H499N . RIF rpoB_AE007317 1 1 D489N . RIF diff --git a/data/ariba_ref_sequences-20230629.fasta b/data/ariba_ref_sequences-20230712.fasta similarity index 99% rename from data/ariba_ref_sequences-20230629.fasta rename to data/ariba_ref_sequences-20230712.fasta index 5dffbd5..aac7dd0 100644 --- a/data/ariba_ref_sequences-20230629.fasta +++ b/data/ariba_ref_sequences-20230712.fasta @@ -104,16 +104,6 @@ GATAAATAA >ermbTr_CP002121 GCTTTTGATAGTCAAGCGAAATATAGCTACCTTATTGTAGAGAGGGGATTTGCTAAAAGG TTGCAAAA ->rplD_AE007317 -ATGGCAAACGTAACATTATTTGACCAAACTGGTAAAGAAGCTGGCCAAGTTGTTCTTAGCGATGCAGTAT -TTGGTATCGAACCAAATGAATCAGTTGTGTTTGATGTAATCATCAGCCAACGCGCAAGCCTTCGTCAAGG -AACACACGCTGTTAAAAACCGCTCTGCAGTATCAGGTGGTGGACGCAAACCATGGCGTCAAAAAGGAACT -GGACGTGCTCGTCAAGGTTCTATCCGCTCACCACAATGGCGTGGTGGTGGTGTTGTCTTCGGACCAACTC -CACGTTCATACGGCTACAAACTTCCACAAAAAGTTCGTCGCCTAGCTCTTAAATCAGTTTACTCTGAAAA -AGTTGCTGAAAACAAATTCGTAGCTGTAGACGCTCTTTCATTTACAGCTCCAAAAACTGCTGAATTTGCA -AAAGTTCTTGCAGCATTGAGCATCGATTCTAAAGTTCTTGTTATCCTTGAAGAAGGAAATGAATTCGCAG -CTCTTTCAGCTCGTAACCTTCCAAACGTGAAAGTTGCAACTGCTACAACTGCAAGTGTTCTTGACATCGC -AAATAGCGACAAACTTCTTGTCACACAAGCAGCTATCTCTAAAATCGAGGAGGTTCTTGCATAA >rpoB_AE007317 TTGACAAGGCTTGGAACTTATTTACAAAGGAGAATCATCTTGGCAGGACATGACGTTCAATACGGGAAAC ATCGTACCCGTCGTAGTTTTTCAAGAATCAAAGAAGTTCTTGACTTACCAAATTTGATTGAAATTCAAAC diff --git a/nextflow.config b/nextflow.config index ba6ff1a..fd30ded 100644 --- a/nextflow.config +++ b/nextflow.config @@ -48,8 +48,8 @@ params { depth = 20.00 // Default ARIBA referece sequences and metadata paths, and local directory for its generated database - ariba_ref = "$projectDir/data/ariba_ref_sequences-20230629.fasta" - ariba_metadata = "$projectDir/data/ariba_metadata-20230629.tsv" + ariba_ref = "$projectDir/data/ariba_ref_sequences-20230712.fasta" + ariba_metadata = "$projectDir/data/ariba_metadata-20230712.tsv" ariba_db_local = "$projectDir/databases/ariba" // Toggle for removing .bam and .sam files mid-run to reduce storage requirement @@ -105,7 +105,7 @@ process { container = 'staphb/kraken2:2.1.2-no-db' } withLabel: seroba_container { - container = 'staphb/seroba:1.0.2' + container = 'harryhungch/seroba:test' } } From 3495e5caa1046ed4e2e217d61ff1704fb7180338 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 13 Jul 2023 13:24:45 +0000 Subject: [PATCH 022/157] Update SeroBA image Former-commit-id: 619a2e97ab11d93c59adbf4a9420ed9d6820aca0 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index fd30ded..1ab6cb0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -105,7 +105,7 @@ process { container = 'staphb/kraken2:2.1.2-no-db' } withLabel: seroba_container { - container = 'harryhungch/seroba:test' + container = 'harryhungch/seroba:1.0.3' } } From dae98893b173f81c71016c16e509fc6b2363cac4 Mon Sep 17 00:00:00 2001 From: Harry Hung 
<4848896+HarryHung@users.noreply.github.com> Date: Thu, 13 Jul 2023 16:33:10 +0000 Subject: [PATCH 023/157] Use both normal and debug reports of ARIBA Former-commit-id: 20af1fd7bbf6bfe374af4b8960aaafe0bf06ec18 --- modules/amr.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/amr.nf b/modules/amr.nf index 0079b1e..982e348 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -103,6 +103,6 @@ process GET_OTHER_RESISTANCE { script: """ - get_other_resistance.py "$report_debug" "$metadata" + get_other_resistance.py "$report" "$report_debug" "$metadata" """ } From df95121c2f13ea19e93d1c757113d4b37550cc58 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 13 Jul 2023 16:33:30 +0000 Subject: [PATCH 024/157] Improve header of ARIBA metadata Former-commit-id: 9c747b31a229a1d7885a4852c5fcbd3402622a4b --- data/ariba_metadata-20230712.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/ariba_metadata-20230712.tsv b/data/ariba_metadata-20230712.tsv index de44254..9afa30a 100644 --- a/data/ariba_metadata-20230712.tsv +++ b/data/ariba_metadata-20230712.tsv @@ -1,4 +1,4 @@ -reference coding_yes(1)_no(0) pr/ab(0)_var(1) description of the variant Group FreeText_Drug +ref_name gene var_only var_change group target aph_3prime_III_1_M26832 1 0 . . KAN ermB_1_JN899585 1 0 . . ERY CLI ermB_10_U86375 1 0 . . ERY CLI From 4f889b70b76bc3e8cb0e59f14f11f04dd556e72b Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 13 Jul 2023 17:03:44 +0000 Subject: [PATCH 025/157] Further work on extracting info from ARIBA reports Former-commit-id: 9f649c3940a3e20738c88cda529bf93f4019743d --- bin/get_other_resistance.py | 68 ++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index 56ae244..ae6d154 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -1,43 +1,65 @@ #! 
/usr/bin/env python3 import sys +from itertools import chain +from collections import defaultdict report_path = sys.argv[1] -metadata_path = sys.argv[2] +debug_report_path = sys.argv[2] +metadata_path = sys.argv[3] -with open(report_path) as report, open(metadata_path) as metadata: - # Save (reference, gene, var_only) combination found in metadata - gene_dict = {} - # Save drug found in metadata - drug_set = set() +with open(report_path) as report, open(debug_report_path) as debug_report, open(metadata_path) as metadata: + # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata + gene_dict = defaultdict(dict) + + # For saving targets found in metadata as key and their determinants (add to a set) as value + target_dict = {} # Skip the header in metadata next(metadata) - # Go through lines in metadata and save findings to gene_dict and drug_set - lines = [line.strip() for line in metadata] - for line in lines: + # Go through lines in metadata and save findings to gene_dict and target_dict + for line in (line.strip() for line in metadata): + # Extract useful fields fields = [str(field) for field in line.split("\t")] - reference, gene, var_only, var_change, _, drug = fields - gene_dict[(reference, gene, var_only)] = {"var_change": var_change, "drug": drug} - drug_set.add(drug) + ref_name, gene, var_only, var_change, _, target = fields - # Skip the header in report + # Populating gene_dict + gene_dict[(ref_name, gene, var_only)].update({var_change: target}) + # Populating target_dict + target_dict.update({target: set()}) + + # Skip the header in report and debug report next(report) - # Go through lines in report to detect targets - lines = [line.strip() for line in report] - for line in lines: + next(debug_report) + # Go through lines in both report and debug report to detect targets + for line in (line.strip() for line in chain(report, debug_report)): # Extract useful fields fields = [str(field) for field in line.split("\t")] - ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17] + ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21] # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: continue - # WIP + # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line gene_dict_key = (ref_name, gene, var_only) - if gene_dict_key in gene_dict: - if var_only == "0": - print(ref_name, gene_dict[gene_dict_key]) - if var_only == "1" and gene_dict[gene_dict_key]['var_change'] == known_var_change and has_known_var == "1": - print(ref_name, gene_dict[gene_dict_key]) + try: + target = gene_dict[gene_dict_key][known_var_change] + except KeyError: + continue + + # Logic for gene detection. Found means hit. 
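+        # (Editor's aside, added for clarity and not part of the original patch:
+        #  a "hit" at this point means the reference was assembled at >= 90%
+        #  coverage and its (ref_name, gene, var_only) key plus known_var_change
+        #  matched a metadata row; for a presence/absence gene such as
+        #  tetM_1_X92947, the reference name itself is recorded as the
+        #  determinant under its target, e.g. TET.)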
+ if var_only == "0": + target_dict[target].add(f'Found {ref_name}') + + # Logic for variant detection, further criteria required + if var_only == "1": + # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201 + if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): + pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' + target_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') + # Common criteria: the assembly has that variant + elif has_known_var == "1": + target_dict[target].add(f'{ref_name} {known_var_change}') + + print(target_dict) \ No newline at end of file From 38241206ef4b27d63a066a01c33aeb03d02eac7d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 14 Jul 2023 15:05:35 +0000 Subject: [PATCH 026/157] Improve robustness of JSON capture Former-commit-id: 44b133139bc6df54d85d4afcf3869e67d92fa236 --- bin/create_ariba_db.sh | 8 ++++---- bin/create_ref_genome_bwa_db.sh | 4 ++-- bin/get_seroba_db.sh | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/create_ariba_db.sh b/bin/create_ariba_db.sh index 073028e..289fff4 100755 --- a/bin/create_ariba_db.sh +++ b/bin/create_ariba_db.sh @@ -5,10 +5,10 @@ REF_SEQUENCES_MD5=$(md5sum $REF_SEQUENCES | awk '{ print $1 }') METADATA_MD5=$(md5sum $METADATA | awk '{ print $1 }') if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES_MD5" ] || \ - [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA" ] || \ - [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA_MD5" ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES_MD5" ] || \ + [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA" ] || \ + [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA_MD5" ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/00.info.txt ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/00.version_info.txt ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log ] || \ diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/create_ref_genome_bwa_db.sh index 6cee335..5bd277a 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -4,8 +4,8 @@ REFERENCE_MD5=$(md5sum $REFERENCE | awk '{ print $1 }') if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE_MD5" ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE_MD5" ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.amb ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.ann ] || \ [ ! 
-f ${DB_LOCAL}/${PREFIX}.bwt ] || \ diff --git a/bin/get_seroba_db.sh b/bin/get_seroba_db.sh index 736a99b..a3e1d3c 100755 --- a/bin/get_seroba_db.sh +++ b/bin/get_seroba_db.sh @@ -6,8 +6,8 @@ # Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep 'git' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "${DB_REMOTE}" ] || \ - [ ! "$(grep 'kmer' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "${KMER}" ] || \ + [ ! "$(grep 'git' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "${DB_REMOTE}" ] || \ + [ ! "$(grep 'kmer' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "${KMER}" ] || \ !((git -C ${DB_LOCAL} pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date'); then rm -rf ${DB_LOCAL}/{,.[!.],..?}* From 5784155f2141a6f075e74d997e340c96cfd46aa6 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 14 Jul 2023 15:06:03 +0000 Subject: [PATCH 027/157] Improve target names Former-commit-id: a0938d18db7980c3c6dc049d01610c98462a0fd8 --- data/ariba_metadata-20230712.tsv | 50 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/data/ariba_metadata-20230712.tsv b/data/ariba_metadata-20230712.tsv index 9afa30a..1a0a6ea 100644 --- a/data/ariba_metadata-20230712.tsv +++ b/data/ariba_metadata-20230712.tsv @@ -1,10 +1,10 @@ ref_name gene var_only var_change group target aph_3prime_III_1_M26832 1 0 . . KAN -ermB_1_JN899585 1 0 . . ERY CLI -ermB_10_U86375 1 0 . . ERY CLI -ermB_16_X82819 1 0 . . ERY CLI -ermB_20_AF109075 1 0 . . ERY CLI -ermC_13_M13761 1 0 . . ERY CLI +ermB_1_JN899585 1 0 . . ERY_CLI +ermB_10_U86375 1 0 . . ERY_CLI +ermB_16_X82819 1 0 . . ERY_CLI +ermB_20_AF109075 1 0 . . ERY_CLI +ermC_13_M13761 1 0 . . ERY_CLI cat_5_U35036 1 0 . . CHL catpC194_1_NC_002013 1 0 . . CHL catpC233_1_AY355285 1 0 . . CHL @@ -41,24 +41,24 @@ tetS_FN555436 1 0 . . TET tetM_MH283017 1 0 . . TET folA_AE007317 1 1 I100L . TMP folP_AE007317 1 1 . . SMX -gyrA_AE007317 1 1 S81F . FLQ -gyrA_AE007317 1 1 S81Y . FLQ -gyrA_AE007317 1 1 S81C . FLQ -gyrA_AE007317 1 1 S81I . FLQ -gyrA_AE007317 1 1 E85K . FLQ -gyrA_AE007317 1 1 Q118A . FLQ -gyrB_AE007317 1 1 E474K . FLQ -parC_AE007317 1 1 A63T . FLQ -parC_AE007317 1 1 S79F . FLQ -parC_AE007317 1 1 S79Y . FLQ -parC_AE007317 1 1 S79L . FLQ -parC_AE007317 1 1 S79F . FLQ -parC_AE007317 1 1 D83G . FLQ -parC_AE007317 1 1 D83N . FLQ -parE_AE007317 1 1 E474K . FLQ -parE_AE007317 1 1 D435N . FLQ -parE_AE007317 1 1 D435H . FLQ -parE_AE007317 1 1 P454S . FLQ +gyrA_AE007317 1 1 S81F . FQ +gyrA_AE007317 1 1 S81Y . FQ +gyrA_AE007317 1 1 S81C . FQ +gyrA_AE007317 1 1 S81I . FQ +gyrA_AE007317 1 1 E85K . FQ +gyrA_AE007317 1 1 Q118A . FQ +gyrB_AE007317 1 1 E474K . FQ +parC_AE007317 1 1 A63T . FQ +parC_AE007317 1 1 S79F . FQ +parC_AE007317 1 1 S79Y . FQ +parC_AE007317 1 1 S79L . FQ +parC_AE007317 1 1 S79F . FQ +parC_AE007317 1 1 D83G . FQ +parC_AE007317 1 1 D83N . FQ +parE_AE007317 1 1 E474K . FQ +parE_AE007317 1 1 D435N . FQ +parE_AE007317 1 1 D435H . FQ +parE_AE007317 1 1 P454S . FQ tetO_Y07780 1 0 . . TET ermBups_HG799494 0 0 . . ERY ermbTr_CP002121 0 0 . . ERY @@ -77,5 +77,5 @@ vanC_AF162694 1 0 . . TET 23S_NZ_CP018347 0 1 A2118G . ERY 23S_NZ_CP018347 0 1 C2630A . ERY 23S_NZ_CP018347 0 1 C2630G . ERY -rrgA_EF560637 1 0 . . PILI-1 -pitB_GU256423 1 0 . . PILI-2 +rrgA_EF560637 1 0 . . PILI1 +pitB_GU256423 1 0 . . 
PILI2 From 9c95da62fe5f03f222eabf2fa0deaca29ed53d4c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 14 Jul 2023 15:06:50 +0000 Subject: [PATCH 028/157] ARIBA-based AMR detection prototype Former-commit-id: a4d225166c6978a5ed780ec25311a4801aa0b516 --- bin/get_other_resistance.py | 30 +++++++++++++++++++++++++++--- bin/get_other_resistance.sh | 32 ++++++++++++++++++++++++++++++++ modules/amr.nf | 11 +++++++++-- workflows/pipeline.nf | 6 +++--- 4 files changed, 71 insertions(+), 8 deletions(-) create mode 100755 bin/get_other_resistance.sh diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index ae6d154..29f8234 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -1,8 +1,11 @@ #! /usr/bin/env python3 +# Output AMR of a sample based on its ARIBA report and ARIBA metadata + import sys from itertools import chain from collections import defaultdict +import json report_path = sys.argv[1] debug_report_path = sys.argv[2] @@ -50,7 +53,7 @@ # Logic for gene detection. Found means hit. if var_only == "0": - target_dict[target].add(f'Found {ref_name}') + target_dict[target].add(f'{ref_name}') # Logic for variant detection, further criteria required if var_only == "1": @@ -60,6 +63,27 @@ target_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') # Common criteria: the assembly has that variant elif has_known_var == "1": - target_dict[target].add(f'{ref_name} {known_var_change}') + target_dict[target].add(f'{ref_name} Variant {known_var_change}') + + # For saving final output, where information is saved per-target + output = {} + + # Go through targets in metadata + for target in target_dict: + # + if len(target_dict[target]) == 0: + if target.lower().startswith('pili'): + output[target] = 'NEG' + else: + output[f'{target}_Res'] = 'S' - print(target_dict) \ No newline at end of file + output[f'{target}_Determinant'] = '_' + else: + if target.lower().startswith('pili'): + output[target] = 'POS' + else: + output[f'{target}_Res'] = 'R' + + output[f'{target}_Determinant'] = '; '.join(target_dict[target]) + + print(json.dumps(output, indent=4)) \ No newline at end of file diff --git a/bin/get_other_resistance.sh b/bin/get_other_resistance.sh new file mode 100755 index 0000000..fdd5f58 --- /dev/null +++ b/bin/get_other_resistance.sh @@ -0,0 +1,32 @@ +# Run get_other_resistance.py to infer AMR from ARIBA reports, then capture individual AMR from the output for Nextflow + +function GET_VALUE { + echo $(grep \"$1\" <<< $OUTPUT | sed -r 's/.+: "(.*)",?/\1/') +} + +OUTPUT=$(get_other_resistance.py "$REPORT" "$REPORT_DEBUG" "$METADATA") + +CHL_Res=$(GET_VALUE "CHL_Res") +CHL_Determinant=$(GET_VALUE "CHL_Determinant") +ERY_Res=$(GET_VALUE "ERY_Res") +ERY_Determinant=$(GET_VALUE "ERY_Determinant") +FQ_Res=$(GET_VALUE "FQ_Res") +FQ_Determinant=$(GET_VALUE "FQ_Determinant") +KAN_Res=$(GET_VALUE "KAN_Res") +KAN_Determinant=$(GET_VALUE "KAN_Determinant") +TET_Res=$(GET_VALUE "TET_Res") +TET_Determinant=$(GET_VALUE "TET_Determinant") +TMP_Res=$(GET_VALUE "TMP_Res") +TMP_Determinant=$(GET_VALUE "TMP_Determinant") +SMX_Res=$(GET_VALUE "SMX_Res") +SMX_Determinant=$(GET_VALUE "SMX_Determinant") +ERY_CLI_Res=$(GET_VALUE "ERY_CLI_Res") +ERY_CLI_Determinant=$(GET_VALUE "ERY_CLI_Determinant") +RIF_Res=$(GET_VALUE "RIF_Res") +RIF_Determinant=$(GET_VALUE "RIF_Determinant") +VAN_Res=$(GET_VALUE "VAN_Res") +VAN_Determinant=$(GET_VALUE "VAN_Determinant") +PILI1=$(GET_VALUE "PILI1") +PILI1_Determinant=$(GET_VALUE "PILI1_Determinant") 
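+# (Editor's aside, not part of the original patch: each GET_VALUE call greps one
+#  key in the JSON printed by get_other_resistance.py and strips the key name and
+#  quoting with sed, so a line such as `"TET_Res": "R",` yields the bare value
+#  `R`, which the GET_OTHER_RESISTANCE process then captures through its
+#  matching env() output.)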
+PILI2=$(GET_VALUE "PILI2") +PILI2_Determinant=$(GET_VALUE "PILI2_Determinant") \ No newline at end of file diff --git a/modules/amr.nf b/modules/amr.nf index 982e348..19429f7 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -90,7 +90,7 @@ process OTHER_RESISTANCE { """ } -// WIP, for extracting information from ARIBA report +// Extracting resistance information from ARIBA report process GET_OTHER_RESISTANCE { label 'python_container' label 'farm_low' @@ -101,8 +101,15 @@ process GET_OTHER_RESISTANCE { tuple val(sample_id), path(report), path(report_debug) path metadata + output: + tuple val(sample_id), env(CHL_Res), env(CHL_Determinant), env(ERY_Res), env(ERY_Determinant), env(FQ_Res), env(FQ_Determinant), env(KAN_Res), env(KAN_Determinant), env(TET_Res), env(TET_Determinant), env(TMP_Res), env(TMP_Determinant), env(SMX_Res), env(SMX_Determinant), env(ERY_CLI_Res), env(ERY_CLI_Determinant), env(RIF_Res), env(RIF_Determinant), env(VAN_Res), env(VAN_Determinant), env(PILI1), env(PILI1_Determinant), env(PILI2), env(PILI2_Determinant), emit: result + script: """ - get_other_resistance.py "$report" "$report_debug" "$metadata" + REPORT="$report" + REPORT_DEBUG="$report_debug" + METADATA="$metadata" + + source get_other_resistance.sh """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 811da5a..bdead12 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -179,8 +179,8 @@ workflow PIPELINE { .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } - // .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 20 : it } + .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) + .map { (it[-1] == null) ? 
it[0..-2] + ['_'] * 24 : it } .map { it.collect {"\"$it\""}.join',' } .collectFile( name: 'results.csv', @@ -196,7 +196,7 @@ workflow PIPELINE { 'Serotype', 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - // 'CHL_Res', 'CHL_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_Res', 'ERY_Determinant', 'FQ_Res', 'FQ_Determinant', 'KAN_Res', 'KAN_Determinant', 'LZO_Res', 'LZO_Determinant', 'TET_Res', 'TET_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant' + 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'FQ_Res', 'FQ_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI-1', 'PILI-1_Determinant', 'PILI-2', 'PILI-2_Determinant' ].join(','), sort: { it.split(',')[0] }, newLine: true From 864644128e7cf6dfbe1c21e0cbde9e465a650a47 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 17 Jul 2023 17:50:58 +0000 Subject: [PATCH 029/157] Update to reflect change from AMRsearch to ARIBA Former-commit-id: 7a38ec57cf10d8dc62d1687f1fbef4a9de52ae41 --- doc/workflow.drawio.svg | 542 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 538 insertions(+), 4 deletions(-) diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg index c5d85f5..f2e08ab 100644 --- a/doc/workflow.drawio.svg +++ b/doc/workflow.drawio.svg @@ -1,4 +1,538 @@ - - - -OutputInput
[Raw SVG markup of doc/workflow.drawio.svg omitted for readability: the patch replaces the old single-line SVG with a 538-line reformatted version of the same workflow diagram. The diagram content is unchanged except for two node labels: "Other AMR" is now backed by "ARIBA, custom script" instead of "AMRsearch", and "Mapping QC" is now backed by "SAMtools, BCFtools, custom script" instead of "SAMtools, BCFtools, Het-SNP Counter". All other labels carry over: the Input/Output legends; the data types Raw Reads (*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}), FASTQ (Reads), FASTA (Assemblies) and SAM; the QC criteria (S. Pneumo > 60%; Contigs < 500; Length 1.9 - 2.3 Mb; Depth ≥ 20x; Ref Coverage > 60%; Het-SNP site < 220; Bases ≥ Min Length x Depth); the process nodes Preprocess (fastp), Assembly (SPAdes, Default: Shovill), Mapping (BWA MEM), Taxonomy & Taxonomy QC (Kraken 2), Assembly QC (QUAST), Mapping QC, PBP AMR (CDC PBP AMR Predictor), MLST (mlst), Overall QC, Lineage (PopPUNK), Serotype (SeroBA), Other AMR and Information (info.txt); the Read QC Go / No-go gates; and the outputs Assemblies (assemblies/*.contigs.fasta) and Results (results.csv).]
\ No newline at end of file From 6b8def4e564d3f8d296a990c5e48b0ab106f77dc Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 17 Jul 2023 17:51:45 +0000 Subject: [PATCH 030/157] Add ARIBA-related options, and update credits Former-commit-id: 5905964b46720da1877b76c2f22f3dd99c00a682 --- README.md | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index df7cb44..aab4261 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca - [Taxonomy](#taxonomy) - [Serotype](#serotype) - [Lineage](#lineage) + - [Other AMR](#other-amr) - [Singularity](#singularity) - [Experimental](#experimental) - [Output](#output) @@ -236,6 +237,15 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `--poppunk_ext_remote` | Any valid URL to a PopPUNK external clusters file in `.csv` format
(Default: [GPS v6 GPSC Designation](https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv)) | URL to a PopPUNK external clusters file. | | `--poppunk_local` | Any valid path
(Default: `"$projectDir/databases/poppunk"`) | Path to the directory where the remote PopPUNK database and external clusters file should be saved to. | +## Other AMR + > ⚠️ `--ariba_db_local` does not accept user provided local database, directory content will be overwritten + + | Option | Values | Description | + | --- | ---| --- | + | `--ariba_ref` | Any valid path to a `.fa` or `.fasta` file
(Default: `"$projectDir/data/ariba_ref_sequences-20230712.fasta"`) | Path to the reference sequences for ARIBA. | + | `--ariba_metadata` | Any valid path to a `tsv` file
(Default: `"$projectDir/data/ariba_metadata-20230712.tsv"`) | Path to the metadata file for ARIBA. | + | `--ariba_db_local` | Any valid path
(Default: `"$projectDir/databases/ariba"`) | Path to the directory where ARIBA reference database should be saved to. | + ## Singularity > ℹ️ This section is only valid when Singularity is used as the container engine @@ -336,12 +346,10 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca # Credits This project uses open-source components. You can find the homepage or source code of their open-source projects along with license information below. I acknowledge and am grateful to these developers for their contributions to open source. -[AMRsearch](https://github.com/pathogenwatch-oss/amr-search) -- [Pathogenwatch](https://pathogen.watch/) ([@pathogenwatch-oss](https://github.com/pathogenwatch-oss)) -- License (MIT): https://github.com/pathogenwatch-oss/amr-search/blob/main/LICENSE -- This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/amr-search) - - The fork changes the Docker image from a Docker executable image to a Docker environment for Nextflow integration - - The Docker image provides the containerised environment for `OTHER_RESISTANCE` process of the `amr.nf` module +[ARIBA](https://sanger-pathogens.github.io/ariba/) +- ARIBA: rapid antimicrobial resistance genotyping directly from sequencing reads Hunt M, Mather AE, Sánchez-Busó L, Page AJ, Parkhill J , Keane JA, Harris SR. Microbial Genomics 2017. doi: [110.1099/mgen.0.000131](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000131) +- License (GNU): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE +- This tool is used in `CREATE_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module [BCFtools](https://samtools.github.io/bcftools/) and [SAMtools](https://www.htslib.org/) - Twelve years of SAMtools and BCFtools. Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. **GigaScience**, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008 @@ -355,7 +363,7 @@ This project uses open-source components. 
You can find the homepage or source co - License (GPL-3.0): https://github.com/lh3/bwa/blob/master/COPYING - This tool is used in `GET_REF_GENOME_BWA_DB_PREFIX` and `MAPPING` processes of the `mapping.nf` module -[Docker Images](https://hub.docker.com/u/staphb) of [BCFtools](https://hub.docker.com/r/staphb/bcftools), [BWA](https://hub.docker.com/r/staphb/bwa), [fastp](https://hub.docker.com/r/staphb/fastp), [Kraken 2](https://hub.docker.com/r/staphb/kraken2), [mlst](https://hub.docker.com/r/staphb/mlst), [PopPUNK](https://hub.docker.com/r/staphb/poppunk), [QUAST](https://hub.docker.com/r/staphb/quast), [SAMtools](https://hub.docker.com/r/staphb/samtools), [SeroBA](https://hub.docker.com/r/staphb/seroba), [Shovill](https://hub.docker.com/r/staphb/shovill), [Unicycler](https://hub.docker.com/r/staphb/unicycler) +[Docker Images](https://hub.docker.com/u/staphb) of [BCFtools](https://hub.docker.com/r/staphb/bcftools), [BWA](https://hub.docker.com/r/staphb/bwa), [fastp](https://hub.docker.com/r/staphb/fastp), [Kraken 2](https://hub.docker.com/r/staphb/kraken2), [mlst](https://hub.docker.com/r/staphb/mlst), [PopPUNK](https://hub.docker.com/r/staphb/poppunk), [QUAST](https://hub.docker.com/r/staphb/quast), [SAMtools](https://hub.docker.com/r/staphb/samtools), [Shovill](https://hub.docker.com/r/staphb/shovill), [Unicycler](https://hub.docker.com/r/staphb/unicycler) - [State Public Health Bioinformatics Workgroup](https://staphb.org/) ([@StaPH-B](https://github.com/StaPH-B)) - License (GPL-3.0): https://github.com/StaPH-B/docker-builds/blob/master/LICENSE - These Docker images provide containerised environments for processes of multiple modules @@ -373,7 +381,7 @@ This project uses open-source components. You can find the homepage or source co [Docker Image of Python](https://hub.docker.com/_/python) - The Docker Community ([@docker-library](https://github.com/docker-library)) - License (MIT): https://github.com/docker-library/python/blob/master/LICENSE -- This Docker image provides the containerised environment for `HET_SNP_COUNT` process of the `mapping.nf` module +- This Docker image provides the containerised environment for `HET_SNP_COUNT` process of the `mapping.nf` module and `GET_OTHER_RESISTANCE` process of the `amr.nf` module [fastp](https://github.com/OpenGene/fastp) - Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, https://doi.org/10.1093/bioinformatics/bty560 @@ -393,7 +401,7 @@ This project uses open-source components. You can find the homepage or source co [mecA-HetSites-calculator](https://github.com/kumarnaren/mecA-HetSites-calculator) - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) - License (GPL-3.0): https://github.com/kumarnaren/mecA-HetSites-calculator/blob/master/LICENSE -- Code was rewritten into `HET_SNP_COUNT` process of the `mapping.nf` module +- Code was rewritten into the `het_snp_count.py` script used by `HET_SNP_COUNT` process of the `mapping.nf` module [mlst](https://github.com/tseemann/mlst) - Torsten Seemann ([@tseemann](https://github.com/tseemann)) @@ -419,7 +427,15 @@ This project uses open-source components. You can find the homepage or source co [SeroBA](https://sanger-pathogens.github.io/seroba/) - **SeroBA: rapid high-throughput serotyping of Streptococcus pneumoniae from whole genome sequence data**. 
Epping L, van Tonder, AJ, Gladstone RA, GPS Consortium, Bentley SD, Page AJ, Keane JA, Microbial Genomics 2018, doi: [10.1099/mgen.0.000186](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000186) - License (GPL-3.0): https://github.com/sanger-pathogens/seroba/blob/master/LICENSE -- This tool is used in `CREATE_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module +- This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/seroba) + - The fork integrates bug fixes + - The Docker image provides the containerised environment for `CREATE_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module + +[resistanceDatabase](https://github.com/kumarnaren/resistanceDatabase) +- Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) +- `sequences.fasta` is renamed to `ariba_ref_sequences-*.fasta` and used as-is +- `metadata.tsv` is renamed to `ariba_metadata-*.tsv` and modified +- The files are used as the default inputs of `CREATE_ARIBA_DB` process of the `amr.nf` module [Shovill](https://github.com/tseemann/shovill) - Torsten Seemann ([@tseemann](https://github.com/tseemann)) From 06c16d0078e92080da312b53eb94fab7b4b2a015 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 17 Jul 2023 19:31:40 +0000 Subject: [PATCH 031/157] Update License of ARIBA and resistanceDatabase Former-commit-id: 69009e2417f1106d201320ae733977035ce80e3c --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index aab4261..0bbc36f 100644 --- a/README.md +++ b/README.md @@ -348,7 +348,7 @@ This project uses open-source components. You can find the homepage or source co [ARIBA](https://sanger-pathogens.github.io/ariba/) - ARIBA: rapid antimicrobial resistance genotyping directly from sequencing reads Hunt M, Mather AE, Sánchez-Busó L, Page AJ, Parkhill J , Keane JA, Harris SR. Microbial Genomics 2017. doi: [110.1099/mgen.0.000131](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000131) -- License (GNU): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE +- License (GPL-3.0): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE - This tool is used in `CREATE_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module [BCFtools](https://samtools.github.io/bcftools/) and [SAMtools](https://www.htslib.org/) @@ -433,6 +433,7 @@ This project uses open-source components. 
You can find the homepage or source co
 [ARIBA](https://sanger-pathogens.github.io/ariba/)
 - ARIBA: rapid antimicrobial resistance genotyping directly from sequencing reads. Hunt M, Mather AE, Sánchez-Busó L, Page AJ, Parkhill J, Keane JA, Harris SR. Microbial Genomics 2017. doi: [10.1099/mgen.0.000131](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000131)
-- License (GNU): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE
+- License (GPL-3.0): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE
 - This tool is used in `CREATE_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module
@@ -433,6 +433,7 @@ This project uses open-source components. You can find the homepage or source co
 [resistanceDatabase](https://github.com/kumarnaren/resistanceDatabase)
 - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren))
+- License (GPL-3.0): https://github.com/kumarnaren/resistanceDatabase/blob/main/LICENSE
 - `sequences.fasta` is renamed to `ariba_ref_sequences-*.fasta` and used as-is
 - `metadata.tsv` is renamed to `ariba_metadata-*.tsv` and modified
 - The files are used as the default inputs of `CREATE_ARIBA_DB` process of the `amr.nf` module

From a980fd78c0dc6edbd62d61554a8f33eca1da2e1e Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Tue, 18 Jul 2023 08:56:59 +0000
Subject: [PATCH 032/157] Update Output section based on ARIBA-based AMR

Former-commit-id: d0ea4fc4d79134df1ca48eaa5a87155541e8abe2
---
 README.md | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 0bbc36f..99535cb 100644
--- a/README.md
+++ b/README.md
@@ -323,24 +323,28 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca
 | `PEN_Res(Non-meningital)` | PBP AMR | Resistance phenotype against PEN in non-meningital form |
 | `CHL_Res` | Other AMR | Resistance phenotype against Chloramphenicol (CHL) |
 | `CHL_Determinant` | Other AMR | Known determinants that inferred the CHL resistance |
-| `CLI_Res` | Other AMR | Resistance phenotype against Clindamycin (CLI) |
-| `CLI_Determinant` | Other AMR | Known determinants that inferred the CLI resistance |
 | `ERY_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) |
 | `ERY_Determinant` | Other AMR | Known determinants that inferred the ERY resistance |
 | `FQ_Res` | Other AMR | Resistance phenotype against Fluoroquinolones (FQ) |
 | `FQ_Determinant` | Other AMR | Known determinants that inferred the FQ resistance |
 | `KAN_Res` | Other AMR | Resistance phenotype against Kanamycin (KAN) |
 | `KAN_Determinant` | Other AMR | Known determinants that inferred the KAN resistance |
-| `LZO_Res` | Other AMR | Resistance phenotype against Linezolid (LZO) |
-| `LZO_Determinant` | Other AMR | Known determinants that inferred the LZO resistance |
 | `TET_Res` | Other AMR | Resistance phenotype against Tetracycline (TET) |
 | `TET_Determinant` | Other AMR | Known determinants that inferred the TET resistance |
 | `TMP_Res` | Other AMR | Resistance phenotype against Trimethoprim (TMP) |
 | `TMP_Determinant` | Other AMR | Known determinants that inferred the TMP resistance |
 | `SMX_Res` | Other AMR | Resistance phenotype against Sulfamethoxazole (SMX) |
 | `SMX_Determinant` | Other AMR | Known determinants that inferred the SMX resistance |
-| `COT_Res` | Other AMR | Resistance phenotype against Co-Trimoxazole (COT) |
-| `COT_Determinant` | Other AMR | Known determinants that inferred the COT resistance |
+| `ERY_CLI_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) |
+| `ERY_CLI_Determinant` | Other AMR | Known determinants that inferred the ERY and CLI resistance |
+| `RIF_Res` | Other AMR | Resistance phenotype against Rifampin (RIF) |
+| `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance |
+| `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) |
+| `VAN_Determinant` | Other AMR | Known determinants that inferred the VAN resistance |
+| `PILI-1` | Other AMR | Expression of PILI-1 |
+| `PILI-1_Determinant` | Other AMR | Known determinants that inferred the PILI-1 expression |
+| `PILI-2` | Other AMR |
Expression of PILI-2 | + | `PILI-2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression |   # Credits From c9f27009998d652ff74f3f9d38ab931cb57dbc4d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 18 Jul 2023 15:13:51 +0000 Subject: [PATCH 033/157] Add AMR inference based on other AMR Former-commit-id: 08ba429a8b7cada1d5cd1ee0ec37ea3f520fe400 --- bin/get_other_resistance.py | 43 ++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index 29f8234..a4e42d1 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -70,7 +70,7 @@ # Go through targets in metadata for target in target_dict: - # + # If the target has no hit, set output as S or NEG (only for PILI-1/2), and determinant as _ if len(target_dict[target]) == 0: if target.lower().startswith('pili'): output[target] = 'NEG' @@ -78,6 +78,7 @@ output[f'{target}_Res'] = 'S' output[f'{target}_Determinant'] = '_' + # If the target has hit, set output as R or POS (only for PILI-1/2), and join all hits as determinant else: if target.lower().startswith('pili'): output[target] = 'POS' @@ -85,5 +86,45 @@ output[f'{target}_Res'] = 'R' output[f'{target}_Determinant'] = '; '.join(target_dict[target]) + + # Special cases to add to output + + # If TET exists and DOX does not: add DOX to output; directly copy output and determinant + if 'TET_Res' in output and 'DOX_Res' not in output: + output['DOX_Res'] = output['TET_Res'] + output['DOX_Determinant'] = output['TET_Determinant'] + + # If FQ exists and LFX does not: add LFX to output; directly copy output and determinant + if 'FQ_Res' in output and 'LFX_Res' not in output: + output['LFX_Res'] = output['FQ_Res'] + output['LFX_Determinant'] = output['FQ_Determinant'] + + # If both TMP and SMX exists, and COT does not: add COT to output. + # If R in both, COT is R; if R in one of them, COT is I; if S in both, COT is S + # Copy TMP_Determinant and SMX_Determinant to COT_Determinant + if 'TMP_Res' in output and 'SMX_Res' in output and 'COT_Res' not in output: + if output['TMP_Res'] == 'R' and output['SMX_Res'] == 'R': + output['COT_Res'] = 'R' + output['COT_Determinant'] = '; '.join(target_dict['TMP'].union(target_dict['SMX'])) + elif (output['TMP_Res'] == 'R') ^ (output['SMX_Res'] == 'R'): + output['COT_Res'] = 'I' + output['COT_Determinant'] = '; '.join(target_dict['TMP'].union(target_dict['SMX'])) + elif output['TMP_Res'] == 'S' and output['SMX_Res'] == 'S': + output['COT_Res'] = 'S' + output['COT_Determinant'] = '_' + + # If ERY_CLI exists, add ERY and CLI to output. 
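+    # (Editor's aside, not part of the original patch: ERY_CLI is the target of
+    #  the erm determinants in the metadata, which confer co-resistance to both
+    #  macrolides and lincosamides, hence the cascade into the separate ERY and
+    #  CLI calls described next; e.g. an ermB hit alone makes ERY_CLI_Res,
+    #  ERY_Res and CLI_Res all "R".)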
+ # If ERY_CLI is R, ERY and CLI are R, and add ERY_CLI determinant to their determinants + # If ERY_CLI is S, ERY and CLI are S if they do not already exist, otherwise leave them unchanged + if 'ERY_CLI_Res' in output: + if output['ERY_CLI_Res'] == 'R': + output['ERY_Res'] = 'R' + output['CLI_Res'] = 'R' + elif output['ERY_CLI_Res'] == 'S': + output['ERY_Res'] = output['ERY_Res'] if 'ERY_Res' in output else 'S' + output['CLI_Res'] = output['CLI_Res'] if 'CLI_Res' in output else 'S' + output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict else output['ERY_CLI_Determinant'] + output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict else output['ERY_CLI_Determinant'] + print(json.dumps(output, indent=4)) \ No newline at end of file From baaacb7132dad659984c3f7971df0bc7a5fb4b52 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 18 Jul 2023 15:31:34 +0000 Subject: [PATCH 034/157] Include new AMR in results.csv Former-commit-id: 82c6fa948e5c99d92651479f385d6027e4d9d792 --- bin/get_other_resistance.sh | 12 ++++++++++-- modules/amr.nf | 2 +- workflows/pipeline.nf | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/bin/get_other_resistance.sh b/bin/get_other_resistance.sh index fdd5f58..befd4a4 100755 --- a/bin/get_other_resistance.sh +++ b/bin/get_other_resistance.sh @@ -10,18 +10,26 @@ CHL_Res=$(GET_VALUE "CHL_Res") CHL_Determinant=$(GET_VALUE "CHL_Determinant") ERY_Res=$(GET_VALUE "ERY_Res") ERY_Determinant=$(GET_VALUE "ERY_Determinant") +CLI_Res=$(GET_VALUE "CLI_Res") +CLI_Determinant=$(GET_VALUE "CLI_Determinant") +ERY_CLI_Res=$(GET_VALUE "ERY_CLI_Res") +ERY_CLI_Determinant=$(GET_VALUE "ERY_CLI_Determinant") FQ_Res=$(GET_VALUE "FQ_Res") FQ_Determinant=$(GET_VALUE "FQ_Determinant") +LFX_Res=$(GET_VALUE "LFX_Res") +LFX_Determinant=$(GET_VALUE "LFX_Determinant") KAN_Res=$(GET_VALUE "KAN_Res") KAN_Determinant=$(GET_VALUE "KAN_Determinant") TET_Res=$(GET_VALUE "TET_Res") TET_Determinant=$(GET_VALUE "TET_Determinant") +DOX_Res=$(GET_VALUE "DOX_Res") +DOX_Determinant=$(GET_VALUE "DOX_Determinant") TMP_Res=$(GET_VALUE "TMP_Res") TMP_Determinant=$(GET_VALUE "TMP_Determinant") SMX_Res=$(GET_VALUE "SMX_Res") SMX_Determinant=$(GET_VALUE "SMX_Determinant") -ERY_CLI_Res=$(GET_VALUE "ERY_CLI_Res") -ERY_CLI_Determinant=$(GET_VALUE "ERY_CLI_Determinant") +COT_Res=$(GET_VALUE "COT_Res") +COT_Determinant=$(GET_VALUE "COT_Determinant") RIF_Res=$(GET_VALUE "RIF_Res") RIF_Determinant=$(GET_VALUE "RIF_Determinant") VAN_Res=$(GET_VALUE "VAN_Res") diff --git a/modules/amr.nf b/modules/amr.nf index 19429f7..6a2a0bf 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -102,7 +102,7 @@ process GET_OTHER_RESISTANCE { path metadata output: - tuple val(sample_id), env(CHL_Res), env(CHL_Determinant), env(ERY_Res), env(ERY_Determinant), env(FQ_Res), env(FQ_Determinant), env(KAN_Res), env(KAN_Determinant), env(TET_Res), env(TET_Determinant), env(TMP_Res), env(TMP_Determinant), env(SMX_Res), env(SMX_Determinant), env(ERY_CLI_Res), env(ERY_CLI_Determinant), env(RIF_Res), env(RIF_Determinant), env(VAN_Res), env(VAN_Determinant), env(PILI1), env(PILI1_Determinant), env(PILI2), env(PILI2_Determinant), emit: result + tuple val(sample_id), env(CHL_Res), env(CHL_Determinant), env(ERY_Res), env(ERY_Determinant), env(CLI_Res), env(CLI_Determinant), env(ERY_CLI_Res), env(ERY_CLI_Determinant), env(FQ_Res), env(FQ_Determinant), env(LFX_Res), env(LFX_Determinant), 
env(KAN_Res), env(KAN_Determinant), env(TET_Res), env(TET_Determinant), env(DOX_Res), env(DOX_Determinant), env(TMP_Res), env(TMP_Determinant), env(SMX_Res), env(SMX_Determinant), env(COT_Res), env(COT_Determinant), env(RIF_Res), env(RIF_Determinant), env(VAN_Res), env(VAN_Determinant), env(PILI1), env(PILI1_Determinant), env(PILI2), env(PILI2_Determinant), emit: result script: """ diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index bdead12..6dc59fe 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -196,7 +196,7 @@ workflow PIPELINE { 'Serotype', 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'FQ_Res', 'FQ_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI-1', 'PILI-1_Determinant', 'PILI-2', 'PILI-2_Determinant' + 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' ].join(','), sort: { it.split(',')[0] }, newLine: true From e2ff36e522930058a06f981fe936ee79219f7a6a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 18 Jul 2023 15:40:14 +0000 Subject: [PATCH 035/157] Fixing ERY and CLI determinant output when empty Former-commit-id: c2d321ce266856f17b8bc09f80ce52313c53c0dc --- bin/get_other_resistance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index a4e42d1..4f71294 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -124,7 +124,7 @@ output['ERY_Res'] = output['ERY_Res'] if 'ERY_Res' in output else 'S' output['CLI_Res'] = output['CLI_Res'] if 'CLI_Res' in output else 'S' - output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict else output['ERY_CLI_Determinant'] - output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict else output['ERY_CLI_Determinant'] + output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict and len(target_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] + output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict and len(target_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] print(json.dumps(output, indent=4)) \ No newline at end of file From f217c516fe5e0f99cb6b2ebfb28960aae4787884 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 18 Jul 2023 15:59:33 +0000 Subject: [PATCH 036/157] Add information on new AMR Former-commit-id: 
7de88411bad5ac33ac7f9729377aeaa16678678f --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 99535cb..e0a2220 100644 --- a/README.md +++ b/README.md @@ -325,18 +325,26 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `CHL_Determinant` | Other AMR | Known determinants that inferred the CHL resistance | | `ERY_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) | | `ERY_Determinant` | Other AMR | Known determinants that inferred the ERY resistance | + | `CLI_Res` | Other AMR | Resistance phenotype against Clindamycin (CLI) | + | `CLI_Determinant` | Other AMR | Known determinants that inferred the CLI resistance | + | `ERY_CLI_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | + | `ERY_CLI_Determinant` | Other AMR | Known determinants that inferred the ERY and CLI resistance | | `FQ_Res` | Other AMR | Resistance phenotype against Fluoroquinolones (FQ) | | `FQ_Determinant` | Other AMR | Known determinants that inferred the FQ resistance | + | `LFX_Res` | Other AMR | Resistance phenotype against Levofloxacin (LFX) | + | `LFX_Determinant` | Other AMR | Known determinants that inferred the LFX resistance | | `KAN_Res` | Other AMR | Resistance phenotype against Kanamycin (KAN) | | `KAN_Determinant` | Other AMR | Known determinants that inferred the KAN resistance | | `TET_Res` | Other AMR | Resistance phenotype against Tetracycline (TET) | | `TET_Determinant` | Other AMR | Known determinants that inferred the TET resistance | + | `DOX_Res` | Other AMR | Resistance phenotype against Doxycycline (DOX) | + | `DOX_Determinant` | Other AMR | Known determinants that inferred the DOX resistance | | `TMP_Res` | Other AMR | Resistance phenotype against Trimethoprim (TMP) | | `TMP_Determinant` | Other AMR | Known determinants that inferred the TMP resistance | | `SMX_Res` | Other AMR | Resistance phenotype against Sulfamethoxazole (SMX) | | `SMX_Determinant` | Other AMR | Known determinants that inferred the SMX resistance | - | `ERY_CLI_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | - | `ERY_CLI_Determinant` | Other AMR | Known determinants that inferred the ERY and CLI resistance | + | `COT_Res` | Other AMR | Resistance phenotype against Co-Trimoxazole (COT) | + | `COT_Determinant` | Other AMR | Known determinants that inferred the COT resistance | | `RIF_Res` | Other AMR | Resistance phenotype against Rifampin (RIF) | | `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance | | `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) | From 5ebf6047cdf77b95dfc18bbd7899e335cbdc5866 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:17:17 +0000 Subject: [PATCH 037/157] Ensure version of Python 3 is captured Former-commit-id: e8a2905c8a533ea2535f861815071d1008667eee --- modules/info.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/info.nf b/modules/info.nf index 4cabccb..90cbc5b 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -445,7 +445,7 @@ process PYTHON_VERSION { shell: $/ - VERSION=$(python --version | sed -r "s/.*\s(.+)/\1/") + VERSION=$(python3 --version | sed -r "s/.*\s(.+)/\1/") /$ } From a291074aa544664df09a6b9af74a90426b9cd2ee Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:20:26 +0000 Subject: 
[PATCH 038/157] Remove unnecessary versioning of Het-SNP Counter Former-commit-id: dc1064e8b6eac5887a968bef99a2b3d117d18532 --- modules/info.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/info.nf b/modules/info.nf index 90cbc5b..cde8662 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -245,7 +245,6 @@ process PARSE { |${toolTextRow('BWA', 'bwa')} |${toolTextRow('SAMtools', 'samtools')} |${toolTextRow('BCFtools', 'bcftools')} - |${toolTextRow('Het-SNP Counter', 'het_snp_count')} |${toolTextRow('PopPUNK', 'poppunk')} |${toolTextRow('CDC PBP AMR Predictor', 'spn_pbp_amr')} |${toolTextRow('ARIBA', 'ariba')} From ebd212ea124576beb8a15425292c5858347e433c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:20:51 +0000 Subject: [PATCH 039/157] Change default Python image to include Pandas Former-commit-id: 47bb796fbbb8fcb79e0d78a79ebf303940f49cd5 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 1ab6cb0..1a322f1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,7 +66,7 @@ process { container = 'bitnami/git:2.39.0' } withLabel: python_container { - container = 'python:3.11.1-bullseye' + container = 'amancevice/pandas:2.0.2-slim' } withLabel: fastp_container { container = 'staphb/fastp:0.23.2' From af43d4f077c3dd15e00fa18ab8e19395bade2d3c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:21:36 +0000 Subject: [PATCH 040/157] Improve Docker Image capturing Former-commit-id: 231b7b1ae78c1fde380b7056da22c3202b8940a8 --- bin/get_images_info.sh | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/bin/get_images_info.sh b/bin/get_images_info.sh index ba5428f..95dd83f 100755 --- a/bin/get_images_info.sh +++ b/bin/get_images_info.sh @@ -1,25 +1,25 @@ # Extract containers information from nextflow.config and save into a JSON file -IMAGES=$(grep -E "container\s?=" $NEXTFLOW_CONFIG \ - | sort -u \ - | sed -r "s/\s+container\s?=\s?'(.+)'/\1/") +find_image () { + grep -E "container\s?=" -B 1 $NEXTFLOW_CONFIG | grep -v -- "^--$" | paste - - | sort -u | grep $1 | sed -r "s/.+container\s?=\s?'(.+)'/\1/" +} -BASH=$(grep network-multitool <<< $IMAGES) -GIT=$(grep git <<< $IMAGES) -PYTHON=$(grep python <<< $IMAGES) -FASTP=$(grep fastp <<< $IMAGES) -UNICYCLER=$(grep unicycler <<< $IMAGES) -SHOVILL=$(grep shovill <<< $IMAGES) -QUAST=$(grep quast <<< $IMAGES) -BWA=$(grep bwa <<< $IMAGES) -SAMTOOLS=$(grep samtools <<< $IMAGES) -BCFTOOLS=$(grep bcftools <<< $IMAGES) -POPPUNK=$(grep poppunk <<< $IMAGES) -SPN_PBP_AMR=$(grep spn-pbp-amr <<< $IMAGES) -ARIBA=$(grep ariba <<< $IMAGES) -MLST=$(grep mlst <<< $IMAGES) -KRAKEN2=$(grep kraken2 <<< $IMAGES) -SEROBA=$(grep seroba <<< $IMAGES) +BASH=$(find_image bash) +GIT=$(find_image git) +PYTHON=$(find_image python) +FASTP=$(find_image fastp) +UNICYCLER=$(find_image unicycler) +SHOVILL=$(find_image shovill) +QUAST=$(find_image quast) +BWA=$(find_image bwa) +SAMTOOLS=$(find_image samtools) +BCFTOOLS=$(find_image bcftools) +POPPUNK=$(find_image poppunk) +SPN_PBP_AMR=$(find_image spn-pbp-amr) +ARIBA=$(find_image ariba) +MLST=$(find_image mlst) +KRAKEN2=$(find_image kraken2) +SEROBA=$(find_image seroba) add_container () { jq -n --arg container $1 '.container = $container' From a0877b052c29f3d4f290fdbf54083647a311f627 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 
20 Jul 2023 16:16:27 +0000 Subject: [PATCH 041/157] Save reports as .csv Former-commit-id: 5d072252de429525558fb157ee858317c566b71d --- bin/assembly_qc.sh | 3 +++ bin/mapping_qc.sh | 3 +++ bin/overall_qc.sh | 7 ++++++- bin/read_qc.sh | 3 +++ bin/taxonomy_qc.sh | 3 +++ modules/assembly.nf | 4 +++- modules/mapping.nf | 4 +++- modules/overall_qc.nf | 6 +++++- modules/preprocess.nf | 3 +++ modules/taxonomy.nf | 4 +++- 10 files changed, 35 insertions(+), 5 deletions(-) diff --git a/bin/assembly_qc.sh b/bin/assembly_qc.sh index 160ed72..9d399fc 100755 --- a/bin/assembly_qc.sh +++ b/bin/assembly_qc.sh @@ -9,3 +9,6 @@ if (( $CONTIGS < $QC_CONTIGS )) && (( $LENGTH >= $QC_LENGTH_LOW )) && (( $LENGTH else ASSEMBLY_QC="FAIL" fi + +echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > $ASSEMBLY_QC_REPORT +echo \"$ASSEMBLY_QC\",\"$CONTIGS\",\"$LENGTH\",\"$DEPTH\" >> $ASSEMBLY_QC_REPORT \ No newline at end of file diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh index 450e871..75b18a0 100755 --- a/bin/mapping_qc.sh +++ b/bin/mapping_qc.sh @@ -7,3 +7,6 @@ if (( $(echo "$COVERAGE > $QC_REF_COVERAGE" | bc -l) )) && (( $HET_SNP < $QC_HET else MAPPING_QC="FAIL" fi + +echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > $MAPPING_QC_REPORT +echo \"$MAPPING_QC\",\"$COVERAGE\",\"$QC_HET_SNP_SITE\" >> $MAPPING_QC_REPORT \ No newline at end of file diff --git a/bin/overall_qc.sh b/bin/overall_qc.sh index ccdb803..de7e116 100755 --- a/bin/overall_qc.sh +++ b/bin/overall_qc.sh @@ -1,10 +1,15 @@ # Determine overall QC result based on Assembly QC, Mapping QC and Taxonomy QC # In case of assembler failure, there will be no Assembly QC input, hence output result as ASSEMBLER FAILURE -if [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then +if [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then OVERALL_QC="PASS" +elif [[ "$READ_QC" == "FAIL" ]]; then + OVERALL_QC="FAIL" elif [[ "$ASSEMBLY_QC" == "null" ]]; then OVERALL_QC="ASSEMBLER FAILURE" else OVERALL_QC="FAIL" fi + +echo \"Overall_QC\" > $OVERALL_QC_REPORT +echo \"$OVERALL_QC\" >> $OVERALL_QC_REPORT \ No newline at end of file diff --git a/bin/read_qc.sh b/bin/read_qc.sh index 040c6ec..6ce8382 100755 --- a/bin/read_qc.sh +++ b/bin/read_qc.sh @@ -7,3 +7,6 @@ if (( $(echo "$BASES >= ($QC_LENGTH_LOW*$QC_DEPTH)" | bc -l) )); then else READ_QC="FAIL" fi + +echo \"Read_QC\",\"Bases\" > $READ_QC_REPORT +echo \"$READ_QC\",\"$BASES\" >> $READ_QC_REPORT \ No newline at end of file diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh index c468b14..23254b1 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/taxonomy_qc.sh @@ -11,3 +11,6 @@ if (( $(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l) )); then else TAXONOMY_QC="FAIL" fi + +echo \"Taxonomy_QC\",\"S.Pneumo_%\" > $TAXONOMY_QC_REPORT +echo \"$TAXONOMY_QC\",\"$PERCENTAGE\" >> $TAXONOMY_QC_REPORT \ No newline at end of file diff --git a/modules/assembly.nf b/modules/assembly.nf index 4699289..fd84c76 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -85,10 +85,11 @@ process ASSEMBLY_QC { val(qc_depth) output: - tuple val(sample_id), env(CONTIGS), env(LENGTH), env(DEPTH), emit: info tuple val(sample_id), env(ASSEMBLY_QC), emit: result + tuple val(sample_id), path(assembly_qc_report), emit: report script: + assembly_qc_report='assembly_qc_report.csv' """ REPORT="$report" BASES="$bases" @@ -96,6 +97,7 @@ process ASSEMBLY_QC { QC_LENGTH_LOW="$qc_length_low" 
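Each QC script above now appends a two-line, fully quoted CSV fragment (one header row, one value row). A rough Python equivalent of what assembly_qc.sh emits, with made-up values:

    import csv

    # Placeholder values; the real script derives them from the Quast
    # report and the total base count.
    row = {'Assembly_QC': 'PASS', 'Contigs#': '42',
           'Assembly_Length': '2100000', 'Seq_Depth': '55.21'}

    with open('assembly_qc_report.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=row, quoting=csv.QUOTE_ALL)
        writer.writeheader()   # "Assembly_QC","Contigs#","Assembly_Length","Seq_Depth"
        writer.writerow(row)   # "PASS","42","2100000","55.21"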
QC_LENGTH_HIGH="$qc_length_high" QC_DEPTH="$qc_depth" + ASSEMBLY_QC_REPORT="$assembly_qc_report" source assembly_qc.sh """ diff --git a/modules/mapping.nf b/modules/mapping.nf index f0d1e0e..0a37628 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -138,15 +138,17 @@ process MAPPING_QC { val(qc_het_snp_site) output: - tuple val(sample_id), env(COVERAGE), env(HET_SNP), emit: info tuple val(sample_id), env(MAPPING_QC), emit: result + tuple val(sample_id), path(mapping_qc_report), emit: report script: + mapping_qc_report='mapping_qc_report.csv' """ COVERAGE="$ref_coverage" HET_SNP="$het_snp_count" QC_REF_COVERAGE="$qc_ref_coverage" QC_HET_SNP_SITE="$qc_het_snp_site" + MAPPING_QC_REPORT="$mapping_qc_report" source mapping_qc.sh """ diff --git a/modules/overall_qc.nf b/modules/overall_qc.nf index b212595..fa639d9 100644 --- a/modules/overall_qc.nf +++ b/modules/overall_qc.nf @@ -6,16 +6,20 @@ process OVERALL_QC { tag "$sample_id" input: - tuple val(sample_id), val(assembly_qc), val(mapping_qc), val(taxonomy_qc) + tuple val(sample_id), val(read_qc), val(assembly_qc), val(mapping_qc), val(taxonomy_qc) output: tuple val(sample_id), env(OVERALL_QC), emit: result + tuple val(sample_id), path(overall_qc_report), emit: report script: + overall_qc_report='overall_qc_report.csv' """ + READ_QC="$read_qc" ASSEMBLY_QC="$assembly_qc" MAPPING_QC="$mapping_qc" TAXONOMY_QC="$taxonomy_qc" + OVERALL_QC_REPORT="$overall_qc_report" source overall_qc.sh """ diff --git a/modules/preprocess.nf b/modules/preprocess.nf index 4e8e18c..e04b756 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -38,12 +38,15 @@ process READ_QC { output: tuple val(sample_id), env(BASES), emit: bases tuple val(sample_id), env(READ_QC), emit: result + tuple val(sample_id), path(read_qc_report), emit: report script: + read_qc_report='read_qc_report.csv' """ JSON="$json" QC_LENGTH_LOW="$qc_length_low" QC_DEPTH="$qc_depth" + READ_QC_REPORT="$read_qc_report" source read_qc.sh """ diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index af6266d..b4d1e62 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -63,13 +63,15 @@ process TAXONOMY_QC { val(qc_spneumo_percentage) output: - tuple val(sample_id), env(PERCENTAGE), emit: percentage tuple val(sample_id), env(TAXONOMY_QC), emit: result + tuple val(sample_id), path(taxonomy_qc_report), emit: report script: + taxonomy_qc_report='taxonomy_qc_report.csv' """ KRAKEN2_REPORT="$kraken2_report" QC_SPNEUMO_PERCENTAGE="$qc_spneumo_percentage" + TAXONOMY_QC_REPORT="$taxonomy_qc_report" source taxonomy_qc.sh """ From d3d28be81945b08be19ea3be79a8a3eea20bf8c6 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 20 Jul 2023 16:17:14 +0000 Subject: [PATCH 042/157] Combining reports to generate sample report Former-commit-id: ea1b1bab32ded4e1ba187b693acfed0e91bc733f --- bin/generate_sample_report.sh | 3 +++ modules/output.nf | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100755 bin/generate_sample_report.sh create mode 100644 modules/output.nf diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh new file mode 100755 index 0000000..ec769f3 --- /dev/null +++ b/bin/generate_sample_report.sh @@ -0,0 +1,3 @@ +paste -d , *.csv \ +| sed '1 s/^/\"Sample_ID\",/' \ +| sed "2 s/^/\"${SAMPLE_ID}\",/" > $SAMPLE_REPORT \ No newline at end of file diff --git a/modules/output.nf b/modules/output.nf new file mode 100644 index 0000000..a711425 --- /dev/null +++ b/modules/output.nf @@ -0,0 +1,21 @@ 
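generate_sample_report.sh above pastes the per-step fragments column-wise and prepends the sample ID to both rows. A hedged Python equivalent of that paste/sed pipeline (sample ID and glob pattern are placeholders):

    import csv, glob

    sample_id = 'sample1'  # placeholder
    header, values = ['Sample_ID'], [sample_id]
    for path in sorted(glob.glob('report*.csv')):
        with open(path, newline='') as f:
            fragment = list(csv.reader(f))
        header += fragment[0]   # first line of each fragment: column names
        values += fragment[1]   # second line: the corresponding values
    with open(f'{sample_id}_report.csv', 'w', newline='') as out:
        csv.writer(out, quoting=csv.QUOTE_ALL).writerows([header, values])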
+process GENERATE_SAMPLE_REPORT { + label 'bash_container' + label 'farm_low' + + tag "$sample_id" + + input: + tuple val(sample_id), path ('report*.csv') + + output: + path sample_report + + script: + sample_report="${sample_id}_report.csv" + """ + SAMPLE_ID=$sample_id + SAMPLE_REPORT=$sample_report + + source generate_sample_report.sh + """ +} \ No newline at end of file From 9c5c5be5c22629f023ea755f04ab1ea5206d77c2 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 20 Jul 2023 16:17:23 +0000 Subject: [PATCH 043/157] Initial work on output revamp (WIP) Former-commit-id: 01b22b87a70de4c9077b879c74a69aeeb03fedbd --- workflows/pipeline.nf | 110 ++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 48 deletions(-) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 6dc59fe..ef8f650 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -8,6 +8,7 @@ include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/ include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" +include { GENERATE_SAMPLE_REPORT } from "$projectDir/modules/output" // Main pipeline workflow workflow PIPELINE { @@ -103,9 +104,10 @@ workflow PIPELINE { // Merge Channels ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status // Output into Channel OVERALL_QC.out.result OVERALL_QC( - ASSEMBLY_QC.out.result + READ_QC.out.result + .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) - .join(TAXONOMY_QC.out.result, failOnDuplicate: true) + .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) ) // From Channel READ_QC_PASSED_READS_ch, only output reads of samples passed overall QC based on Channel OVERALL_QC.out.result @@ -155,52 +157,64 @@ workflow PIPELINE { // GET_OTHER_RESISTANCE.out.result // // Replace null with approiate amount of "_" items when sample_id does not exist in that output (i.e. QC rejected) - READ_QC.out.result - .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(OVERALL_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['FAIL'] : it } - .join(READ_QC.out.bases, failOnDuplicate: true, failOnMismatch: true) - .join(ASSEMBLY_QC.out.info, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 3 : it } - .join(MAPPING_QC.out.info, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 2 : it } - .join(TAXONOMY_QC.out.percentage, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(LINEAGE.out.csv.splitCsv(skip: 1), failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(SEROTYPE.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? 
it[0..-2] + ['_'] : it } - .join(MLST.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } - .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } - .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 24 : it } - .map { it.collect {"\"$it\""}.join',' } - .collectFile( - name: 'results.csv', - storeDir: "$params.output", - seed: [ - 'Sample_ID', - 'Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC', - 'Bases', - 'Contigs#' , 'Assembly_Length', 'Seq_Depth', - 'Ref_Cov_%', 'Het-SNP#' , - 'S.Pneumo_%', - 'GPSC', - 'Serotype', - 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', - 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' - ].join(','), - sort: { it.split(',')[0] }, - newLine: true - ) + + GENERATE_SAMPLE_REPORT( + READ_QC.out.report + .join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true) + .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) + .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) + .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) + .map { [it[0], it[1..-1].minus(null)] } + ).view() + + // GENERATE_OVERALL_REPORT + + // READ_QC.out.result + // .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(OVERALL_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['FAIL'] : it } + // .join(READ_QC.out.bases, failOnDuplicate: true, failOnMismatch: true) + // .join(ASSEMBLY_QC.out.info, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 3 : it } + // .join(MAPPING_QC.out.info, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 2 : it } + // .join(TAXONOMY_QC.out.percentage, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(LINEAGE.out.csv.splitCsv(skip: 1), failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(SEROTYPE.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(MLST.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? 
it[0..-2] + ['_'] * 8 : it } + // .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } + // .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 24 : it } + // .map { it.collect {"\"$it\""}.join',' } + // .collectFile( + // name: 'results.csv', + // storeDir: "$params.output", + // seed: [ + // 'Sample_ID', + // 'Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC', + // 'Bases', + // 'Contigs#' , 'Assembly_Length', 'Seq_Depth', + // 'Ref_Cov_%', 'Het-SNP#' , + // 'S.Pneumo_%', + // 'GPSC', + // 'Serotype', + // 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', + // 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', + // 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' + // ].join(','), + // sort: { it.split(',')[0] }, + // newLine: true + // ) // Pass to SAVE_INFO sub-workflow DATABASES_INFO = CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } From 897d4a297fb1ca149a0f6198f2ebdcc2047d8b1f Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 11:40:03 +0000 Subject: [PATCH 044/157] Save Lineage report per sample as .csv Former-commit-id: d0c5123b34773a7832dd7ff7ea3740c078c29dda --- bin/get_lineage.sh | 13 +++++++++++++ modules/lineage.nf | 15 ++++++++------- workflows/pipeline.nf | 1 + 3 files changed, 22 insertions(+), 7 deletions(-) create mode 100755 bin/get_lineage.sh diff --git a/bin/get_lineage.sh b/bin/get_lineage.sh new file mode 100755 index 0000000..70e4f08 --- /dev/null +++ b/bin/get_lineage.sh @@ -0,0 +1,13 @@ +# Run PopPUNK to assign GPSCs to samples + +# Add "prefix_" to all sample names in qfile to avoid poppunk_assign crashing due to sample name already exists in database +# Remove "prefix_" from all sample names in the result + +# Save results of individual sample into .csv with its name as filename + +sed 's/^/prefix_/' "$QFILE" > safe_qfile.txt +poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads $(nproc) +sed 's/^prefix_//' output/output_external_clusters.csv > result.txt + + +awk -F , 'NR!=1 { print "GPSC\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file diff --git a/modules/lineage.nf b/modules/lineage.nf index 6e13fab..68edae3 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -46,8 +46,7 @@ process GET_POPPUNK_EXT_CLUSTERS { } // Run PopPUNK to assign GPSCs to samples -// Add "prefix_" to all sample names in qfile to avoid poppunk_assign crashing due to sample name already exists in database -// Remove "prefix_" from all sample names in the output +// Save results of individual sample into .csv with its name as filename process LINEAGE { label 
'poppunk_container' label 'farm_high' @@ -63,13 +62,15 @@ process LINEAGE { path qfile output: - path(result), emit: csv + path '*.csv', emit: reports script: - result='result.csv' """ - sed 's/^/prefix_/' "$qfile" > safe_qfile.txt - poppunk_assign --db "${poppunk_dir}/${db_name}" --external-clustering "${poppunk_dir}/${ext_clusters_file}" --query safe_qfile.txt --output output --threads `nproc` - sed 's/^prefix_//' output/output_external_clusters.csv > "$result" + QFILE="$qfile" + POPPUNK_DIR="$poppunk_dir" + DB_NAME="$db_name" + EXT_CLUSTERS_FILE="$ext_clusters_file" + + source get_lineage.sh """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index ef8f650..688e928 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -164,6 +164,7 @@ workflow PIPELINE { .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) + .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From b19ffa403b9d72a249c7d66a668f05ee1336ef39 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 14:04:07 +0000 Subject: [PATCH 045/157] Add quote to csv header Former-commit-id: bfcca3163f37354fda219cfab3c2feec9ddd01f6 --- bin/get_lineage.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get_lineage.sh b/bin/get_lineage.sh index 70e4f08..63b6ec0 100755 --- a/bin/get_lineage.sh +++ b/bin/get_lineage.sh @@ -10,4 +10,4 @@ poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK sed 's/^prefix_//' output/output_external_clusters.csv > result.txt -awk -F , 'NR!=1 { print "GPSC\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file +awk -F , 'NR!=1 { print "\"GPSC\"\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file From 3811c8be69d36a384b6dca1b190a7860cced052d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 14:05:13 +0000 Subject: [PATCH 046/157] Save Serotype report as .csv Former-commit-id: 897aa091a42ef4372a0476bea63114de9c51eef8 --- bin/get_serotype.sh | 3 +++ modules/serotype.nf | 6 +++++- workflows/pipeline.nf | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/bin/get_serotype.sh b/bin/get_serotype.sh index b17c2de..80bfcc7 100755 --- a/bin/get_serotype.sh +++ b/bin/get_serotype.sh @@ -4,3 +4,6 @@ } || { SEROTYPE="SEROBA FAILURE" } + +echo \"Serotype\" > $SEROTYPE_REPORT +echo \"$SEROTYPE\" >> $SEROTYPE_REPORT \ No newline at end of file diff --git a/modules/serotype.nf b/modules/serotype.nf index 5c268fc..0c69bad 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -66,9 +66,10 @@ process SEROTYPE { tuple val(sample_id), path(read1), path(read2), path(unpaired) output: - tuple val(sample_id), env(SEROTYPE), emit: result + tuple val(sample_id), path(serotype_report), emit: report script: + serotype_report='serotype_report.csv' // When using Singularity as container engine, SeroBA sometimes gives incorrect result or critical error // Uncertain root cause, happen randomly when input are located directly in a Nextflow process work directory // Workaround: create and use a subdirectory to alter the path @@ -79,6 +80,7 @@ 
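The awk one-liner in get_lineage.sh above fans PopPUNK's merged result out into one quoted, two-line CSV per sample, so each lineage call can later be joined back by file name. Roughly equivalent Python, assuming result.txt holds a header row followed by two columns (sample name, GPSC):

    import csv

    with open('result.txt', newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row (NR!=1 in the awk version)
        for sample, gpsc in reader:
            with open(f'{sample}.csv', 'w', newline='') as out:
                csv.writer(out, quoting=csv.QUOTE_ALL).writerows([['GPSC'], [gpsc]])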
process SEROTYPE { READ1="$read1" READ2="$read2" SAMPLE_ID="$sample_id" + SEROTYPE_REPORT="$serotype_report" source get_serotype.sh """ @@ -89,12 +91,14 @@ process SEROTYPE { READ1="$read1" READ2="$read2" SAMPLE_ID="$sample_id" + SEROTYPE_REPORT="$serotype_report" mkdir SEROBA_WORKDIR && mv $seroba_dir $read1 $read2 SEROBA_WORKDIR && cd SEROBA_WORKDIR source get_serotype.sh cd ../ + mv SEROBA_WORKDIR/$serotype_report ./ """ else error "The process must be run with Docker or Singularity as container engine." diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 688e928..257d4dd 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -164,6 +164,7 @@ workflow PIPELINE { .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) + .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From 41287de8e37d2dd86a27fa0cc0c2f5ab62e13c74 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 14:18:04 +0000 Subject: [PATCH 047/157] Save MLST report as .csv Former-commit-id: 58a549f948e3c9581779f1b753448bdc8e7acfc6 --- bin/get_mlst.sh | 3 +++ modules/mlst.nf | 4 +++- workflows/pipeline.nf | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/get_mlst.sh b/bin/get_mlst.sh index 7e5c61a..ab7c8e9 100755 --- a/bin/get_mlst.sh +++ b/bin/get_mlst.sh @@ -12,3 +12,6 @@ recP=$(awk -F'\t' 'FNR == 2 {print $7}' $OUTPUT) spi=$(awk -F'\t' 'FNR == 2 {print $8}' $OUTPUT) xpt=$(awk -F'\t' 'FNR == 2 {print $9}' $OUTPUT) ddl=$(awk -F'\t' 'FNR == 2 {print $10}' $OUTPUT) + +echo \"ST\",\"aroE\",\"gdh\",\"gki\",\"recP\",\"spi\",\"xpt\",\"ddl\" > $MLST_REPORT +echo \"$ST\",\"$aroE\",\"$gdh\",\"$gki\",\"$recP\",\"$spi\",\"$xpt\",\"$ddl\" >> $MLST_REPORT \ No newline at end of file diff --git a/modules/mlst.nf b/modules/mlst.nf index cc766d4..c8d12e4 100644 --- a/modules/mlst.nf +++ b/modules/mlst.nf @@ -9,11 +9,13 @@ process MLST { tuple val(sample_id), path(assembly) output: - tuple val(sample_id), env(ST), env(aroE), env(gdh), env(gki), env(recP), env(spi), env(xpt), env(ddl), emit: result + tuple val(sample_id), path(mlst_report), emit: report script: + mlst_report='mlst_report.csv' """ ASSEMBLY="$assembly" + MLST_REPORT="$mlst_report" source get_mlst.sh """ diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 257d4dd..32089a9 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -165,6 +165,7 @@ workflow PIPELINE { .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) + .join(MLST.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From 6faf88f4ddab167c23fcde04efe2ecd14e65bd60 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 
2023 15:21:49 +0000 Subject: [PATCH 048/157] Save PBP AMR report as .csv Former-commit-id: 51aebdc959fab1e66639e7567ac369175169f5a8 --- bin/get_pbp_resistance.sh | 3 +++ modules/amr.nf | 4 +++- workflows/pipeline.nf | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/get_pbp_resistance.sh b/bin/get_pbp_resistance.sh index d7082eb..5e833c3 100755 --- a/bin/get_pbp_resistance.sh +++ b/bin/get_pbp_resistance.sh @@ -30,3 +30,6 @@ MER=$(GET_RES "mem") PEN_MIC=$(GET_VALUE "penMic") PEN_NONMENINGITIS=$(GET_RES "penNonMeningitis") PEN_MENINGITIS=$(GET_RES "penMeningitis") + +echo \"pbp1a\",\"pbp2b\",\"pbp2x\",\"AMO_MIC\",\"AMO_Res\",\"CFT_MIC\",\"CFT_Res\(Meningital\)\",\"CFT_Res\(Non-meningital\)\",\"TAX_MIC\",\"TAX_Res\(Meningital\)\",\"TAX_Res\(Non-meningital\)\",\"CFX_MIC\",\"CFX_Res\",\"MER_MIC\",\"MER_Res\",\"PEN_MIC\",\"PEN_Res\(Meningital\)\",\"PEN_Res\(Non-meningital\)\" > $PBP_AMR_REPORT +echo \"$pbp1a\",\"$pbp2b\",\"$pbp2x\",\"$AMO_MIC\",\"$AMO\",\"$CFT_MIC\",\"$CFT_MENINGITIS\",\"$CFT_NONMENINGITIS\",\"$TAX_MIC\",\"$TAX_MENINGITIS\",\"$TAX_NONMENINGITIS\",\"$CFX_MIC\",\"$CFX\",\"$MER_MIC\",\"$MER\",\"$PEN_MIC\",\"$PEN_MENINGITIS\",\"$PEN_NONMENINGITIS\" >> $PBP_AMR_REPORT \ No newline at end of file diff --git a/modules/amr.nf b/modules/amr.nf index 6a2a0bf..3aabed7 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -29,11 +29,13 @@ process GET_PBP_RESISTANCE { tuple val(sample_id), path(json) output: - tuple val(sample_id), env(pbp1a), env(pbp2b), env(pbp2x), env(AMO_MIC), env(AMO), env(CFT_MIC), env(CFT_MENINGITIS), env(CFT_NONMENINGITIS), env(TAX_MIC), env(TAX_MENINGITIS), env(TAX_NONMENINGITIS), env(CFX_MIC), env(CFX), env(MER_MIC), env(MER), env(PEN_MIC), env(PEN_MENINGITIS), env(PEN_NONMENINGITIS), emit: result + tuple val(sample_id), path(pbp_amr_report), emit: report script: + pbp_amr_report='pbp_amr_report.csv' """ JSON_FILE="$json" + PBP_AMR_REPORT="$pbp_amr_report" source get_pbp_resistance.sh """ diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 32089a9..c78f5fa 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -166,6 +166,7 @@ workflow PIPELINE { .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(MLST.out.report, failOnDuplicate: true, remainder: true) + .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From 505152f09c98a5758bb952ea4795f2f1f120a158 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 15:39:52 +0000 Subject: [PATCH 049/157] Save other AMR report as .csv Former-commit-id: 2bb7ad12dd1b2555da6e52cd4432525929925680 --- bin/get_other_resistance.py | 7 +++++-- bin/get_other_resistance.sh | 40 ------------------------------------- modules/amr.nf | 9 +++------ workflows/pipeline.nf | 1 + 4 files changed, 9 insertions(+), 48 deletions(-) delete mode 100755 bin/get_other_resistance.sh diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index 4f71294..d902f59 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -5,11 +5,13 @@ import sys from itertools import chain from collections import defaultdict -import json +import pandas as pd +import csv 
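From this patch on, the Python script writes its AMR calls straight to a quoted CSV instead of printing JSON for a shell wrapper to grep apart. A minimal sketch of the pandas pattern adopted at the end of the script, using an illustrative two-field dict:

    import csv
    import pandas as pd

    output = {'CHL_Res': 'S', 'CHL_Determinant': '_'}  # illustrative subset
    # One-row DataFrame -> a single header line plus value line, all quoted
    pd.DataFrame([output]).to_csv('other_amr_report.csv', index=False,
                                  quoting=csv.QUOTE_ALL)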
report_path = sys.argv[1] debug_report_path = sys.argv[2] metadata_path = sys.argv[3] +output_file = sys.argv[4] with open(report_path) as report, open(debug_report_path) as debug_report, open(metadata_path) as metadata: # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata @@ -127,4 +129,5 @@ output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict and len(target_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict and len(target_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] - print(json.dumps(output, indent=4)) \ No newline at end of file + # Save output dict as csv + pd.DataFrame([output]).to_csv(output_file, index=False, quoting=csv.QUOTE_ALL) \ No newline at end of file diff --git a/bin/get_other_resistance.sh b/bin/get_other_resistance.sh deleted file mode 100755 index befd4a4..0000000 --- a/bin/get_other_resistance.sh +++ /dev/null @@ -1,40 +0,0 @@ -# Run get_other_resistance.py to infer AMR from ARIBA reports, then capture individual AMR from the output for Nextflow - -function GET_VALUE { - echo $(grep \"$1\" <<< $OUTPUT | sed -r 's/.+: "(.*)",?/\1/') -} - -OUTPUT=$(get_other_resistance.py "$REPORT" "$REPORT_DEBUG" "$METADATA") - -CHL_Res=$(GET_VALUE "CHL_Res") -CHL_Determinant=$(GET_VALUE "CHL_Determinant") -ERY_Res=$(GET_VALUE "ERY_Res") -ERY_Determinant=$(GET_VALUE "ERY_Determinant") -CLI_Res=$(GET_VALUE "CLI_Res") -CLI_Determinant=$(GET_VALUE "CLI_Determinant") -ERY_CLI_Res=$(GET_VALUE "ERY_CLI_Res") -ERY_CLI_Determinant=$(GET_VALUE "ERY_CLI_Determinant") -FQ_Res=$(GET_VALUE "FQ_Res") -FQ_Determinant=$(GET_VALUE "FQ_Determinant") -LFX_Res=$(GET_VALUE "LFX_Res") -LFX_Determinant=$(GET_VALUE "LFX_Determinant") -KAN_Res=$(GET_VALUE "KAN_Res") -KAN_Determinant=$(GET_VALUE "KAN_Determinant") -TET_Res=$(GET_VALUE "TET_Res") -TET_Determinant=$(GET_VALUE "TET_Determinant") -DOX_Res=$(GET_VALUE "DOX_Res") -DOX_Determinant=$(GET_VALUE "DOX_Determinant") -TMP_Res=$(GET_VALUE "TMP_Res") -TMP_Determinant=$(GET_VALUE "TMP_Determinant") -SMX_Res=$(GET_VALUE "SMX_Res") -SMX_Determinant=$(GET_VALUE "SMX_Determinant") -COT_Res=$(GET_VALUE "COT_Res") -COT_Determinant=$(GET_VALUE "COT_Determinant") -RIF_Res=$(GET_VALUE "RIF_Res") -RIF_Determinant=$(GET_VALUE "RIF_Determinant") -VAN_Res=$(GET_VALUE "VAN_Res") -VAN_Determinant=$(GET_VALUE "VAN_Determinant") -PILI1=$(GET_VALUE "PILI1") -PILI1_Determinant=$(GET_VALUE "PILI1_Determinant") -PILI2=$(GET_VALUE "PILI2") -PILI2_Determinant=$(GET_VALUE "PILI2_Determinant") \ No newline at end of file diff --git a/modules/amr.nf b/modules/amr.nf index 3aabed7..1fd57f4 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -104,14 +104,11 @@ process GET_OTHER_RESISTANCE { path metadata output: - tuple val(sample_id), env(CHL_Res), env(CHL_Determinant), env(ERY_Res), env(ERY_Determinant), env(CLI_Res), env(CLI_Determinant), env(ERY_CLI_Res), env(ERY_CLI_Determinant), env(FQ_Res), env(FQ_Determinant), env(LFX_Res), env(LFX_Determinant), env(KAN_Res), env(KAN_Determinant), env(TET_Res), env(TET_Determinant), env(DOX_Res), env(DOX_Determinant), env(TMP_Res), env(TMP_Determinant), env(SMX_Res), env(SMX_Determinant), env(COT_Res), env(COT_Determinant), env(RIF_Res), env(RIF_Determinant), env(VAN_Res), env(VAN_Determinant), env(PILI1), env(PILI1_Determinant), env(PILI2), env(PILI2_Determinant), emit: result + tuple 
val(sample_id), path(output_file), emit: report script: + output_file="other_amr_report.csv" """ - REPORT="$report" - REPORT_DEBUG="$report_debug" - METADATA="$metadata" - - source get_other_resistance.sh + get_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file" """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index c78f5fa..f8fc630 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -167,6 +167,7 @@ workflow PIPELINE { .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(MLST.out.report, failOnDuplicate: true, remainder: true) .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) + .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From 6eff23c3a2555ada827883b9b906c4dab7e5c691 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 13:58:13 +0000 Subject: [PATCH 050/157] Improve clarity of Read QC module Former-commit-id: e94cc56378de3fef6c4ff3debfcec9296dc31ff8 --- README.md | 2 +- doc/workflow.drawio.svg | 100 +++++++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 8fdc0d6..149928e 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `--assembly_publish` | `"link"` or `"symlink"` or `"copy"`
(Default: `"link"`)| Method used by Nextflow to publish the generated assemblies.
(The default setting `"link"` means hard link, therefore will fail if the output directory is set to outside of the working file system) | ## QC Parameters -> ℹ️ Read QC does not have directly accessible parameters. The minimum base count in reads of Read QC is based on the multiplication of `--length_low` and `--depth` of Assembly QC. +> ℹ️ Read QC does not have directly accessible parameters. The minimum base count in reads of Read QC is based on the multiplication of `--length_low` and `--depth` of Assembly QC (i.e. default value is `38000000`). | Option | Values | Description | | --- | ---| --- | diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg index f2e08ab..00766b6 100644 --- a/doc/workflow.drawio.svg +++ b/doc/workflow.drawio.svg @@ -1,4 +1,4 @@ - + @@ -157,10 +157,10 @@ - - - - + + + + @@ -279,8 +279,8 @@ - - + + @@ -302,13 +302,13 @@ - - - + + + -
[SVG diff omitted: doc/workflow.drawio.svg is a rendered workflow diagram and its markup does not read as text. The recoverable changes give Read QC its own go/no-go box ("Bases: ≥ 38 Mb", previously "Bases: ≥ Min Length x Depth") feeding the PBP, MLST, Lineage, Serotype and Other AMR steps, plus a footnote that the QC values shown in the diagram are the default values.]
+ + + QC values shown in the diagram are the default values + + From f1c30b345354f83c101e7be3e25a38db99320d6a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 15:47:35 +0000 Subject: [PATCH 051/157] Correct output column names Former-commit-id: fe079b96442e9ba534703488a885f74f90584e94 --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 149928e..aa68b3e 100644 --- a/README.md +++ b/README.md @@ -351,10 +351,10 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance | | `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) | | `VAN_Determinant` | Other AMR | Known determinants that inferred the VAN resistance | - | `PILI-1` | Other AMR | Expression of PILI-1 | - | `PILI-1_Determinant` | Other AMR | Known determinants that inferred the PILI-1 expression | - | `PILI-2` | Other AMR | Expression of PILI-2 | - | `PILI-2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression | + | `PILI1` | Other AMR | Expression of PILI-1 | + | `PILI1_Determinant` | Other AMR | Known determinants that inferred the PILI-1 expression | + | `PILI2` | Other AMR | Expression of PILI-2 | + | `PILI2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression |   # Credits From 5b59a0bc31c4730342280b6d09e17284de07169d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 15:53:21 +0000 Subject: [PATCH 052/157] Initial implementation of overall report revamp Former-commit-id: 19312f9574d53d1cfd3f5b49e27044d264865c37 --- bin/generate_overall_report.py | 22 ++++++++++++++++++++++ modules/output.nf | 25 +++++++++++++++++++++++-- workflows/pipeline.nf | 6 +++--- 3 files changed, 48 insertions(+), 5 deletions(-) create mode 100755 bin/generate_overall_report.py diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py new file mode 100755 index 0000000..55287d2 --- /dev/null +++ b/bin/generate_overall_report.py @@ -0,0 +1,22 @@ +#! 
/usr/bin/env python3 + +import sys +import glob +import pandas as pd + +workdir_path = sys.argv[1] +ariba_metadata = sys.argv[2] +output_file = sys.argv[3] + +output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)' , 'CHL_Res' , 'CHL_Determinant' , 'ERY_Res' , 'ERY_Determinant' , 'CLI_Res' , 'CLI_Determinant' , 'ERY_CLI_Res' , 'ERY_CLI_Determinant' , 'FQ_Res' , 'FQ_Determinant' , 'LFX_Res' , 'LFX_Determinant' , 'KAN_Res' , 'KAN_Determinant' , 'TET_Res' , 'TET_Determinant' , 'DOX_Res' , 'DOX_Determinant' , 'TMP_Res' , 'TMP_Determinant' , 'SMX_Res' , 'SMX_Determinant' , 'COT_Res' , 'COT_Determinant' , 'RIF_Res' , 'RIF_Determinant' , 'VAN_Res' , 'VAN_Determinant' , 'PILI1' , 'PILI1_Determinant' , 'PILI2' , 'PILI2_Determinant'] +df_manifest = pd.DataFrame(columns=output_columns) + +dfs = [df_manifest] + +reports = glob.glob(workdir_path +'/*.csv') +for report in reports: + df = pd.read_csv(report) + dfs.append(df) + +df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) +df_output.to_csv(output_file, index=False, na_rep='_') diff --git a/modules/output.nf b/modules/output.nf index a711425..6bc6a27 100644 --- a/modules/output.nf +++ b/modules/output.nf @@ -8,7 +8,7 @@ process GENERATE_SAMPLE_REPORT { tuple val(sample_id), path ('report*.csv') output: - path sample_report + path sample_report, emit: report script: sample_report="${sample_id}_report.csv" @@ -18,4 +18,25 @@ process GENERATE_SAMPLE_REPORT { source generate_sample_report.sh """ -} \ No newline at end of file +} + +process GENERATE_OVERALL_REPORT { + label 'python_container' + label 'farm_low' + + publishDir "${params.output}", mode: "copy" + + input: + path 'report*.csv' + path "$ariba_metadata" + + output: + path "$overall_report", emit: report + + script: + overall_report='results.csv' + ariba_metadata='ariba_metadata.tsv' + """ + generate_overall_report.py `pwd` $ariba_metadata $overall_report + """ +} diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index f8fc630..e962742 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -8,7 +8,7 @@ include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/ include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" -include { GENERATE_SAMPLE_REPORT } from "$projectDir/modules/output" +include { GENERATE_SAMPLE_REPORT; GENERATE_OVERALL_REPORT } from "$projectDir/modules/output" // Main pipeline workflow workflow PIPELINE { @@ -170,9 +170,9 @@ workflow PIPELINE { .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } - 
).view() + ) - // GENERATE_OVERALL_REPORT + GENERATE_OVERALL_REPORT(GENERATE_SAMPLE_REPORT.out.report.collect(), params.ariba_metadata) // READ_QC.out.result // .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) From 8949b84bed03b0be56da14999c98e97b6029405f Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 16:01:39 +0000 Subject: [PATCH 053/157] Improve comments; remove obsolete code Former-commit-id: 4c7fc286c0a754820b2490a9cb9d08d486810e2c --- workflows/pipeline.nf | 64 +++---------------------------------------- 1 file changed, 4 insertions(+), 60 deletions(-) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index e962742..fd288c9 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -147,17 +147,7 @@ workflow PIPELINE { OTHER_RESISTANCE(CREATE_ARIBA_DB.out.path, CREATE_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) - // Generate results.csv by sorted sample_id based on merged Channels - // READ_QC.out.result, ASSEMBLY_QC.out.result, MAPPING_QC.out.result, TAXONOMY_QC.out.result, OVERALL_QC.out.result, - // READ_QC.out.bases, ASSEMBLY_QC.out.info, MAPPING_QC.out.info, TAXONOMY_QC.out.percentage - // LINEAGE.out.csv, - // SEROTYPE.out.result, - // MLST.out.result, - // GET_PBP_RESISTANCE.out.result, - // GET_OTHER_RESISTANCE.out.result - // - // Replace null with approiate amount of "_" items when sample_id does not exist in that output (i.e. QC rejected) - + // Generate sample reports by merging outputs from all result-generating modules GENERATE_SAMPLE_REPORT( READ_QC.out.report .join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true) @@ -169,59 +159,13 @@ workflow PIPELINE { .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name - .map { [it[0], it[1..-1].minus(null)] } + .map { [it[0], it[1..-1].minus(null)] } // Map Sample_ID to index 0 and all reports (with null entries removed) as a list to index 1 ) + // Generate overall report by concatenating sample reports GENERATE_OVERALL_REPORT(GENERATE_SAMPLE_REPORT.out.report.collect(), params.ariba_metadata) - // READ_QC.out.result - // .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(OVERALL_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['FAIL'] : it } - // .join(READ_QC.out.bases, failOnDuplicate: true, failOnMismatch: true) - // .join(ASSEMBLY_QC.out.info, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 3 : it } - // .join(MAPPING_QC.out.info, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 2 : it } - // .join(TAXONOMY_QC.out.percentage, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? 
it[0..-2] + ['_'] : it } - // .join(LINEAGE.out.csv.splitCsv(skip: 1), failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(SEROTYPE.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(MLST.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } - // .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } - // .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 24 : it } - // .map { it.collect {"\"$it\""}.join',' } - // .collectFile( - // name: 'results.csv', - // storeDir: "$params.output", - // seed: [ - // 'Sample_ID', - // 'Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC', - // 'Bases', - // 'Contigs#' , 'Assembly_Length', 'Seq_Depth', - // 'Ref_Cov_%', 'Het-SNP#' , - // 'S.Pneumo_%', - // 'GPSC', - // 'Serotype', - // 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', - // 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - // 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' - // ].join(','), - // sort: { it.split(',')[0] }, - // newLine: true - // ) - - // Pass to SAVE_INFO sub-workflow + // Pass databases information to SAVE_INFO sub-workflow DATABASES_INFO = CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } .merge(CREATE_ARIBA_DB.out.path.map { [["ariba_db_path", it]] }) .merge(GET_KRAKEN2_DB.out.path.map { [["kraken2_db_path", it]] }) From 4c86c59e0d066a47b0898d687c3bf59287e7677a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 17:18:24 +0000 Subject: [PATCH 054/157] Generate results.csv based on ARIBA metadata Former-commit-id: 073ddd3b1bdd090026017eb34c57f1538e7b90e2 --- bin/generate_overall_report.py | 33 ++++++++++++++++++++++++++++++++- modules/output.nf | 3 +-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index 55287d2..e48fb53 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -8,7 +8,35 @@ ariba_metadata = sys.argv[2] output_file = sys.argv[3] -output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' 
, 'PEN_Res(Non-meningital)' , 'CHL_Res' , 'CHL_Determinant' , 'ERY_Res' , 'ERY_Determinant' , 'CLI_Res' , 'CLI_Determinant' , 'ERY_CLI_Res' , 'ERY_CLI_Determinant' , 'FQ_Res' , 'FQ_Determinant' , 'LFX_Res' , 'LFX_Determinant' , 'KAN_Res' , 'KAN_Determinant' , 'TET_Res' , 'TET_Determinant' , 'DOX_Res' , 'DOX_Determinant' , 'TMP_Res' , 'TMP_Determinant' , 'SMX_Res' , 'SMX_Determinant' , 'COT_Res' , 'COT_Determinant' , 'RIF_Res' , 'RIF_Determinant' , 'VAN_Res' , 'VAN_Determinant' , 'PILI1' , 'PILI1_Determinant' , 'PILI2' , 'PILI2_Determinant'] +output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)'] + +ariba_targets = set(pd.read_csv(ariba_metadata, sep='\t')['target'].unique()) + +if 'TET' in ariba_targets: + ariba_targets.add('DOX') + +if 'FQ' in ariba_targets: + ariba_targets.add('LFX') + +if 'TMP' in ariba_targets and 'SMX' in ariba_targets: + ariba_targets.add('COT') + +if 'ERY_CLI' in ariba_targets: + ariba_targets.update(['ERY', 'CLI']) + +ariba_targets = sorted(ariba_targets) + +pilis = [] + +for target in ariba_targets: + if target.lower().startswith('pili'): + pilis.append(target) + else: + output_columns.extend([f'{target}_Res', f'{target}_Determinant']) + +for pili in pilis: + output_columns.extend([f'{pili}', f'{pili}_Determinant']) + df_manifest = pd.DataFrame(columns=output_columns) dfs = [df_manifest] @@ -19,4 +47,7 @@ dfs.append(df) df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) + +df_output = df_output[output_columns] + df_output.to_csv(output_file, index=False, na_rep='_') diff --git a/modules/output.nf b/modules/output.nf index 6bc6a27..0fdbfc6 100644 --- a/modules/output.nf +++ b/modules/output.nf @@ -28,14 +28,13 @@ process GENERATE_OVERALL_REPORT { input: path 'report*.csv' - path "$ariba_metadata" + path 'ariba_metadata' output: path "$overall_report", emit: report script: overall_report='results.csv' - ariba_metadata='ariba_metadata.tsv' """ generate_overall_report.py `pwd` $ariba_metadata $overall_report """ From 7d9fb4a39521c3816fcf9a14cfbbcef478544dfb Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 25 Jul 2023 20:06:43 +0000 Subject: [PATCH 055/157] Improve shell scripts style Former-commit-id: fd905cfa221f04810dbbd64b64147854bc0ffef3 --- bin/assembly_qc.sh | 12 ++++----- bin/combine_info.sh | 6 ++--- bin/create_ariba_db.sh | 48 ++++++++++++++++----------------- bin/create_ref_genome_bwa_db.sh | 26 +++++++++--------- bin/create_seroba_db.sh | 6 ++--- 5 files changed, 49 insertions(+), 49 deletions(-) diff --git a/bin/assembly_qc.sh b/bin/assembly_qc.sh index 9d399fc..f23ff7f 100755 --- a/bin/assembly_qc.sh +++ b/bin/assembly_qc.sh @@ -1,14 +1,14 @@ # Extract assembly QC information and determine QC result based on report.tsv from Quast, total base count -CONTIGS=$(awk -F'\t' '$1 == "# contigs (>= 0 bp)" { print $2 }' $REPORT) -LENGTH=$(awk -F'\t' '$1 == "Total length" { print $2 }' $REPORT) -DEPTH=$(printf %.2f $(echo "$BASES / $LENGTH" | bc -l) ) +CONTIGS=$(awk -F'\t' 
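
A minimal sketch of the column derivation that generate_overall_report.py performs in the patch above, run against a hypothetical two-target metadata table; the real ariba_metadata.tsv carries many more fields and targets, so this is an illustration rather than the pipeline's own code:

import io
import pandas as pd

def derive_ariba_columns(metadata):
    targets = set(pd.read_csv(metadata, sep='\t')['target'].unique())
    if 'TET' in targets:                        # DOX is inferred from TET
        targets.add('DOX')
    if 'FQ' in targets:                         # LFX is inferred from FQ
        targets.add('LFX')
    if 'TMP' in targets and 'SMX' in targets:   # COT requires both TMP and SMX
        targets.add('COT')
    if 'ERY_CLI' in targets:                    # ERY_CLI implies both drugs
        targets.update(['ERY', 'CLI'])

    columns, pilis = [], []
    for target in sorted(targets):              # alphabetical order, PILI last
        if target.lower().startswith('pili'):
            pilis.append(target)
        else:
            columns.extend([f'{target}_Res', f'{target}_Determinant'])
    for pili in pilis:
        columns.extend([pili, f'{pili}_Determinant'])
    return columns

# Hypothetical metadata with a 'target' column, as read by the script above
print(derive_ariba_columns(io.StringIO('target\nTET\nPILI1\n')))
# -> ['DOX_Res', 'DOX_Determinant', 'TET_Res', 'TET_Determinant', 'PILI1', 'PILI1_Determinant']

Keeping the PILI columns at the end mirrors the fixed layout of the previously hard-coded header.
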
'$1 == "# contigs (>= 0 bp)" { print $2 }' "$REPORT") +LENGTH=$(awk -F'\t' '$1 == "Total length" { print $2 }' "$REPORT") +DEPTH=$(echo "scale=2; $BASES / $LENGTH" | bc -l) -if (( $CONTIGS < $QC_CONTIGS )) && (( $LENGTH >= $QC_LENGTH_LOW )) && (( $LENGTH <= $QC_LENGTH_HIGH )) && (( $(echo "$DEPTH >= $QC_DEPTH" | bc -l) )); then +if [[ $CONTIGS -lt $QC_CONTIGS ]] && [[ $LENGTH -ge $QC_LENGTH_LOW ]] && [[ $LENGTH -le $QC_LENGTH_HIGH ]] && [[ "$(echo "$DEPTH >= $QC_DEPTH" | bc -l)" == 1 ]]; then ASSEMBLY_QC="PASS" else ASSEMBLY_QC="FAIL" fi -echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > $ASSEMBLY_QC_REPORT -echo \"$ASSEMBLY_QC\",\"$CONTIGS\",\"$LENGTH\",\"$DEPTH\" >> $ASSEMBLY_QC_REPORT \ No newline at end of file +echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > "$ASSEMBLY_QC_REPORT" +echo \""$ASSEMBLY_QC"\",\""$CONTIGS"\",\""$LENGTH"\",\""$DEPTH"\" >> "$ASSEMBLY_QC_REPORT" diff --git a/bin/combine_info.sh b/bin/combine_info.sh index 7409046..5275baa 100755 --- a/bin/combine_info.sh +++ b/bin/combine_info.sh @@ -1,12 +1,12 @@ # Combine pipeline version, Nextflow version, databases information, container images, tools version JSON files into the a single JSON file -jq -s '.[0] * .[1] * .[2]' $DATABASE $IMAGES $TOOLS > working.json +jq -s '.[0] * .[1] * .[2]' "$DATABASE" "$IMAGES" "$TOOLS" > working.json add_version () { - jq --arg entry $1 --arg version "$2" '.[$entry] += {"version": $version}' working.json > tmp.json && mv tmp.json working.json + jq --arg entry "$1" --arg version "$2" '.[$entry] += {"version": $version}' working.json > tmp.json && mv tmp.json working.json } add_version pipeline "$PIPELINE_VERSION" add_version nextflow "$NEXTFLOW_VERSION" -mv working.json $JSON_FILE +mv working.json "$JSON_FILE" diff --git a/bin/create_ariba_db.sh b/bin/create_ariba_db.sh index 289fff4..fb2b657 100755 --- a/bin/create_ariba_db.sh +++ b/bin/create_ariba_db.sh @@ -1,32 +1,32 @@ # Check if CREATE_ARIBA_DB has run successfully on the specific reference sequences and metadata. # If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to JSON -REF_SEQUENCES_MD5=$(md5sum $REF_SEQUENCES | awk '{ print $1 }') -METADATA_MD5=$(md5sum $METADATA | awk '{ print $1 }') +REF_SEQUENCES_MD5=$(md5sum "$REF_SEQUENCES" | awk '{ print $1 }') +METADATA_MD5=$(md5sum "$METADATA" | awk '{ print $1 }') -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES_MD5" ] || \ - [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA" ] || \ - [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA_MD5" ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/00.info.txt ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/00.version_info.txt ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.log ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.tsv ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_noncoding.log ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.all.fa ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.pickle ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.tsv ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.fa ] || \ - [ ! 
-f ${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.varonly.fa ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa ] ; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$(grep '"reference"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES" ] || \ + [ ! "$(grep '"reference_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES_MD5" ] || \ + [ ! "$(grep '"metadata"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA" ] || \ + [ ! "$(grep '"metadata_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA_MD5" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/00.info.txt" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/00.version_info.txt" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.log" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.tsv" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_noncoding.log" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.all.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.pickle" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.tsv" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.varonly.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa" ] ; then - rm -rf "$DB_LOCAL/$OUTPUT" + rm -rf "${DB_LOCAL:?}/${OUTPUT}" - ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "$DB_LOCAL/$OUTPUT" + ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "${DB_LOCAL}/${OUTPUT}" - echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} + echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" -fi \ No newline at end of file +fi diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/create_ref_genome_bwa_db.sh index 5bd277a..385b609 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -1,23 +1,23 @@ # Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. # If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to JSON -REFERENCE_MD5=$(md5sum $REFERENCE | awk '{ print $1 }') +REFERENCE_MD5=$(md5sum "$REFERENCE" | awk '{ print $1 }') -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE_MD5" ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.amb ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.ann ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.bwt ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.pac ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.sa ] ; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$(grep '"reference"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE" ] || \ + [ ! 
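
create_ariba_db.sh and create_ref_genome_bwa_db.sh share one idempotency pattern: record the input path and its MD5 checksum in a sidecar JSON at build time, then on later runs rebuild only when that record is missing or stale. A hedged Python restatement of the check, with illustrative path and key names rather than the pipeline's own:

import hashlib
import json
from pathlib import Path

def needs_rebuild(reference: Path, state_json: Path) -> bool:
    # Rebuild if no state was ever recorded
    if not state_json.is_file():
        return True
    state = json.loads(state_json.read_text())
    current_md5 = hashlib.md5(reference.read_bytes()).hexdigest()
    # Rebuild if either the input path or its checksum has changed
    return state.get('reference') != str(reference) \
        or state.get('reference_md5') != current_md5

The shell versions additionally test for every expected output file, so an interrupted build is retried as well.
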
"$(grep '"reference_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE_MD5" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.amb" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.ann" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.bwt" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.pac" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.sa" ] ; then - rm -rf ${DB_LOCAL}/{,.[!.],..?}* + rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* - bwa index -p $PREFIX $REFERENCE + bwa index -p "$PREFIX" "$REFERENCE" - mv ${PREFIX}.amb ${PREFIX}.ann ${PREFIX}.bwt ${PREFIX}.pac ${PREFIX}.sa -t $DB_LOCAL + mv "${PREFIX}.amb" "${PREFIX}.ann" "${PREFIX}.bwt" "${PREFIX}.pac" "${PREFIX}.sa" -t "$DB_LOCAL" - echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} + echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" fi diff --git a/bin/create_seroba_db.sh b/bin/create_seroba_db.sh index 21a058f..3ff36b2 100755 --- a/bin/create_seroba_db.sh +++ b/bin/create_seroba_db.sh @@ -1,9 +1,9 @@ # If create_db is true: re-create KMC and ARIBA databases, also save metadata to JSON -if [ $CREATE_DB = true ]; then +if [ "$CREATE_DB" = true ]; then - seroba createDBs ${DB_LOCAL}/${DATABASE}/ ${KMER} + seroba createDBs "${DB_LOCAL}/${DATABASE}/" "${KMER}" - echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} + echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" fi From 69290926d97146d7ee1ee6e5bbd43d67ded61759 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 13:40:52 +0000 Subject: [PATCH 056/157] Refactor to improve maintainability & readability Former-commit-id: 29caffdda1fbb8858cb80aeba3ef93c699a86f0d --- bin/generate_overall_report.py | 136 +++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 47 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index e48fb53..602e1d0 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -1,53 +1,95 @@ #! 
/usr/bin/env python3 +# Generate overall report based on sample reports and columns specified by COLUMNS_BY_CATEGORY and ARIBA metadata + import sys +from itertools import chain +import pandas as pd import glob -import pandas as pd - -workdir_path = sys.argv[1] -ariba_metadata = sys.argv[2] -output_file = sys.argv[3] - -output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)'] - -ariba_targets = set(pd.read_csv(ariba_metadata, sep='\t')['target'].unique()) - -if 'TET' in ariba_targets: - ariba_targets.add('DOX') - -if 'FQ' in ariba_targets: - ariba_targets.add('LFX') - -if 'TMP' in ariba_targets and 'SMX' in ariba_targets: - ariba_targets.add('COT') - -if 'ERY_CLI' in ariba_targets: - ariba_targets.update(['ERY', 'CLI']) - -ariba_targets = sorted(ariba_targets) - -pilis = [] - -for target in ariba_targets: - if target.lower().startswith('pili'): - pilis.append(target) - else: - output_columns.extend([f'{target}_Res', f'{target}_Determinant']) - -for pili in pilis: - output_columns.extend([f'{pili}', f'{pili}_Determinant']) - -df_manifest = pd.DataFrame(columns=output_columns) - -dfs = [df_manifest] - -reports = glob.glob(workdir_path +'/*.csv') -for report in reports: - df = pd.read_csv(report) - dfs.append(df) - -df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) -df_output = df_output[output_columns] -df_output.to_csv(output_file, index=False, na_rep='_') +# Specify columns need to be included in the output file and their orders (except those based on ARIBA metadata) +COLUMNS_BY_CATEGORY = { + 'IDENTIFICATION': ['Sample_ID'], + 'QC': ['Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC'] , + 'READ': ['Bases'], + 'ASSEMBLY': ['Contigs#' , 'Assembly_Length' , 'Seq_Depth'], + 'MAPPING': ['Ref_Cov_%' , 'Het-SNP#'], + 'TAXONOMY': ['S.Pneumo_%'], + 'LINEAGE': ['GPSC'], + 'SEROTYPE': ['Serotype'], + 'MLST': ['ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl'], + 'PBP': ['pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)'] +} + + +# Check argv and save the global variables +if len(sys.argv) != 4: + sys.exit('Usage: generate_overall_report.py WORKDIR_PATH ARIBA_METADATA OUTPUT_FILE') +WORKDIR_PATH = sys.argv[1] +ARIBA_METADATA = sys.argv[2] +OUTPUT_FILE = sys.argv[3] + + +def main(): + output_columns = get_output_columns() + df_output = get_df_output(output_columns) + + # Saving df_output to output_file in csv format + df_output.to_csv(OUTPUT_FILE, index=False, na_rep='_') + + +# Get output columns based on COLUMNS_BY_CATEGORY and ARIBA metadata +def get_output_columns(): + output_columns = list(chain.from_iterable(COLUMNS_BY_CATEGORY.values())) + add_ariba_columns(output_columns) + return output_columns + + +# Based on ARIBA metadata, add additional output columns +def 
add_ariba_columns(output_columns): + # Get all targets in ARIBA metadata + ariba_targets = set(pd.read_csv(ARIBA_METADATA, sep='\t')['target'].unique()) + + # Adding special cases if certain targets exist + if 'TET' in ariba_targets: + ariba_targets.add('DOX') + if 'FQ' in ariba_targets: + ariba_targets.add('LFX') + if 'TMP' in ariba_targets and 'SMX' in ariba_targets: + ariba_targets.add('COT') + if 'ERY_CLI' in ariba_targets: + ariba_targets.update(['ERY', 'CLI']) + + # Add all targets alphabetically, except always adding PILI at the end + pilis = [] + for target in sorted(ariba_targets): + if target.lower().startswith('pili'): + pilis.append(target) + else: + output_columns.extend([f'{target}_Res', f'{target}_Determinant']) + for pili in pilis: + output_columns.extend([f'{pili}', f'{pili}_Determinant']) + + +# Generating df_output based on all sample reports with columns in the order of output_columns +def get_df_output(output_columns): + # Generate an empty dataframe as df_manifest based on output_columns + df_manifest = pd.DataFrame(columns=output_columns) + + # Generate a dataframe for each sample report and then concat df_manifest and all dataframes into df_output + dfs = [df_manifest] + reports = glob.glob(WORKDIR_PATH +'/*.csv') + for report in reports: + df = pd.read_csv(report) + dfs.append(df) + df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) + + # Ensure column order in df_output is the same as output_columns + df_output = df_output[output_columns] + + return df_output + + +if __name__ == "__main__": + main() From ae3cce669b98644ec3ca8e96c9586bf648ba0b39 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 14:57:36 +0000 Subject: [PATCH 057/157] Improve shell scripts style Former-commit-id: 60614c8ef117a5641b4fbbf92343e2721e6d27db --- bin/generate_sample_report.sh | 2 +- bin/get_databases_info.sh | 28 +++++++++++++------------- bin/get_docker_compose.sh | 8 ++++---- bin/get_images_info.sh | 38 +++++++++++++++++------------------ bin/get_kraken2_db.sh | 19 +++++++++--------- bin/get_lineage.sh | 4 ++-- bin/get_mlst.sh | 6 +++--- 7 files changed, 52 insertions(+), 53 deletions(-) diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh index ec769f3..cb7ab52 100755 --- a/bin/generate_sample_report.sh +++ b/bin/generate_sample_report.sh @@ -1,3 +1,3 @@ paste -d , *.csv \ | sed '1 s/^/\"Sample_ID\",/' \ -| sed "2 s/^/\"${SAMPLE_ID}\",/" > $SAMPLE_REPORT \ No newline at end of file +| sed "2 s/^/\"${SAMPLE_ID}\",/" > "$SAMPLE_REPORT" diff --git a/bin/get_databases_info.sh b/bin/get_databases_info.sh index c87d56f..3d9dd98 100755 --- a/bin/get_databases_info.sh +++ b/bin/get_databases_info.sh @@ -3,9 +3,9 @@ add_bwa_db () { BWA_DB_JSON=${BWA_DB_PATH}/${BWA_JSON} if [ -f "$BWA_DB_JSON" ]; then - REFERENCE=$(jq -r .reference $BWA_DB_JSON) - REFERENCE_MD5=$(jq -r .reference_md5 $BWA_DB_JSON) - CREATE_TIME=$(jq -r .create_time $BWA_DB_JSON) + REFERENCE=$(jq -r .reference "$BWA_DB_JSON") + REFERENCE_MD5=$(jq -r .reference_md5 "$BWA_DB_JSON") + CREATE_TIME=$(jq -r .create_time "$BWA_DB_JSON") else REFERENCE="Not yet created" REFERENCE_MD5="Not yet created" @@ -17,11 +17,11 @@ add_bwa_db () { add_ariba_db () { ARIBA_DB_JSON=${ARIBA_DB_PATH}/${ARIBA_JSON} if [ -f "$ARIBA_DB_JSON" ]; then - REFERENCE=$(jq -r .reference $ARIBA_DB_JSON) - REFERENCE_MD5=$(jq -r .reference_md5 $ARIBA_DB_JSON) - METADATA=$(jq -r .metadata $ARIBA_DB_JSON) - METADATA_MD5=$(jq -r .metadata_md5 $ARIBA_DB_JSON) - 
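
get_df_output() above relies on an empty manifest DataFrame to pin the full column set before concatenation, so a sample report that lacks a column still aligns and receives the '_' placeholder. A small self-contained illustration with made-up sample values and a stand-in column list:

import pandas as pd

columns = ['Sample_ID', 'GPSC', 'Serotype']                 # stand-in column list
manifest = pd.DataFrame(columns=columns)                    # empty frame pins the columns

reports = [
    pd.DataFrame([{'Sample_ID': 'S2', 'GPSC': '2'}]),       # Serotype missing
    pd.DataFrame([{'Sample_ID': 'S1', 'Serotype': '19F'}]), # GPSC missing
]
combined = pd.concat([manifest, *reports], ignore_index=True)
combined = combined.sort_values(by=['Sample_ID'])[columns]  # enforce column order
print(combined.to_csv(index=False, na_rep='_'))
# Sample_ID,GPSC,Serotype
# S1,_,19F
# S2,2,_
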
CREATE_TIME=$(jq -r .create_time $ARIBA_DB_JSON) + REFERENCE=$(jq -r .reference "$ARIBA_DB_JSON") + REFERENCE_MD5=$(jq -r .reference_md5 "$ARIBA_DB_JSON") + METADATA=$(jq -r .metadata "$ARIBA_DB_JSON") + METADATA_MD5=$(jq -r .metadata_md5 "$ARIBA_DB_JSON") + CREATE_TIME=$(jq -r .create_time "$ARIBA_DB_JSON") else REFERENCE="Not yet created" REFERENCE_MD5="Not yet created" @@ -35,9 +35,9 @@ add_ariba_db () { add_seroba_db () { SEROBA_DB_JSON=${SEROBA_DB_PATH}/${SEROBA_JSON} if [ -f "$SEROBA_DB_JSON" ]; then - GIT=$(jq -r .git $SEROBA_DB_JSON) - KMER=$(jq -r .kmer $SEROBA_DB_JSON) - CREATE_TIME=$(jq -r .create_time $SEROBA_DB_JSON) + GIT=$(jq -r .git "$SEROBA_DB_JSON") + KMER=$(jq -r .kmer "$SEROBA_DB_JSON") + CREATE_TIME=$(jq -r .create_time "$SEROBA_DB_JSON") else GIT="Not yet created" KMER="Not yet created" @@ -49,8 +49,8 @@ add_seroba_db () { add_url_db () { DB_JSON=$1 if [ -f "$DB_JSON" ]; then - URL=$(jq -r .url $DB_JSON) - SAVE_TIME=$(jq -r .save_time $DB_JSON) + URL=$(jq -r .url "$DB_JSON") + SAVE_TIME=$(jq -r .save_time "$DB_JSON") else URL="Not yet downloaded" SAVE_TIME="Not yet downloaded" @@ -65,4 +65,4 @@ jq -n \ --argjson kraken2_db "$(add_url_db "${KRAKEN2_DB_PATH}/${KRAKEN2_JSON}")" \ --argjson poppunnk_db "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_JSON}")" \ --argjson poppunk_ext "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_EXT_JSON}")" \ - '$ARGS.named' > $JSON_FILE + '$ARGS.named' > "$JSON_FILE" diff --git a/bin/get_docker_compose.sh b/bin/get_docker_compose.sh index e581e54..5f8ff8b 100755 --- a/bin/get_docker_compose.sh +++ b/bin/get_docker_compose.sh @@ -2,13 +2,13 @@ COUNT=0 -echo "services:" >> $COMPOSE +echo "services:" >> "$COMPOSE" -grep -E "container\s?=" $NEXTFLOW_CONFIG \ +grep -E "container\s?=" "$NEXTFLOW_CONFIG" \ | sort -u \ | sed -r "s/\s+container\s?=\s?'(.+)'/\1/" \ | while read -r IMAGE ; do COUNT=$((COUNT+1)) - echo " SERVICE${COUNT}:" >> $COMPOSE - echo " image: $IMAGE" >> $COMPOSE + echo " SERVICE${COUNT}:" >> "$COMPOSE" + echo " image: $IMAGE" >> "$COMPOSE" done diff --git a/bin/get_images_info.sh b/bin/get_images_info.sh index 95dd83f..51b20aa 100755 --- a/bin/get_images_info.sh +++ b/bin/get_images_info.sh @@ -1,7 +1,7 @@ # Extract containers information from nextflow.config and save into a JSON file find_image () { - grep -E "container\s?=" -B 1 $NEXTFLOW_CONFIG | grep -v -- "^--$" | paste - - | sort -u | grep $1 | sed -r "s/.+container\s?=\s?'(.+)'/\1/" + grep -E "container\s?=" -B 1 "$NEXTFLOW_CONFIG" | grep -v -- "^--$" | paste - - | sort -u | grep "$1" | sed -r "s/.+container\s?=\s?'(.+)'/\1/" } BASH=$(find_image bash) @@ -22,24 +22,24 @@ KRAKEN2=$(find_image kraken2) SEROBA=$(find_image seroba) add_container () { - jq -n --arg container $1 '.container = $container' + jq -n --arg container "$1" '.container = $container' } jq -n \ - --argjson bash "$(add_container $BASH)" \ - --argjson git "$(add_container $GIT)" \ - --argjson python "$(add_container $PYTHON)" \ - --argjson fastp "$(add_container $FASTP)" \ - --argjson unicycler "$(add_container $UNICYCLER)" \ - --argjson shovill "$(add_container $SHOVILL)" \ - --argjson quast "$(add_container $QUAST)" \ - --argjson bwa "$(add_container $BWA)" \ - --argjson samtools "$(add_container $SAMTOOLS)" \ - --argjson bcftools "$(add_container $BCFTOOLS)" \ - --argjson poppunk "$(add_container $POPPUNK)" \ - --argjson spn_pbp_amr "$(add_container $SPN_PBP_AMR)" \ - --argjson ariba "$(add_container $ARIBA)" \ - --argjson mlst "$(add_container $MLST)" \ - --argjson kraken2 "$(add_container $KRAKEN2)" \ - 
--argjson seroba "$(add_container $SEROBA)" \ - '$ARGS.named' > $JSON_FILE + --argjson bash "$(add_container "$BASH")" \ + --argjson git "$(add_container "$GIT")" \ + --argjson python "$(add_container "$PYTHON")" \ + --argjson fastp "$(add_container "$FASTP")" \ + --argjson unicycler "$(add_container "$UNICYCLER")" \ + --argjson shovill "$(add_container "$SHOVILL")" \ + --argjson quast "$(add_container "$QUAST")" \ + --argjson bwa "$(add_container "$BWA")" \ + --argjson samtools "$(add_container "$SAMTOOLS")" \ + --argjson bcftools "$(add_container "$BCFTOOLS")" \ + --argjson poppunk "$(add_container "$POPPUNK")" \ + --argjson spn_pbp_amr "$(add_container "$SPN_PBP_AMR")" \ + --argjson ariba "$(add_container "$ARIBA")" \ + --argjson mlst "$(add_container "$MLST")" \ + --argjson kraken2 "$(add_container "$KRAKEN2")" \ + --argjson seroba "$(add_container "$SEROBA")" \ + '$ARGS.named' > "$JSON_FILE" diff --git a/bin/get_kraken2_db.sh b/bin/get_kraken2_db.sh index c53cc52..8632bc8 100755 --- a/bin/get_kraken2_db.sh +++ b/bin/get_kraken2_db.sh @@ -1,29 +1,28 @@ # Check if all file exists and were obtained from the database at the specific link. # If not: remove files in database directory, download, and unzip to database directory, also save metadata to JSON -DB_NAME=$(basename $DB_REMOTE) ZIPPED_DB='kraken2_db.tar.gz' -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/${JSON_FILE})" ] || \ - [ ! -f ${DB_LOCAL}/hash.k2d ] || \ - [ ! -f ${DB_LOCAL}/opts.k2d ] || \ - [ ! -f ${DB_LOCAL}/taxo.k2d ]; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$DB_REMOTE" == "$(jq -r .url "${DB_LOCAL}/${JSON_FILE}")" ] || \ + [ ! -f "${DB_LOCAL}/hash.k2d" ] || \ + [ ! -f "${DB_LOCAL}/opts.k2d" ] || \ + [ ! -f "${DB_LOCAL}/taxo.k2d" ]; then - rm -rf ${DB_LOCAL}/{,.[!.],..?}* + rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* - wget ${DB_REMOTE} -O $ZIPPED_DB + wget "${DB_REMOTE}" -O $ZIPPED_DB # Use tmp dir and find to ensure files are saved directly at $DB_LOCAL regardless of archive directory structure mkdir tmp tar -xzf $ZIPPED_DB -C tmp - find tmp -type f -exec mv {} $DB_LOCAL \; + find tmp -type f -exec mv {} "$DB_LOCAL" \; rm -f $ZIPPED_DB jq -n \ --arg url "${DB_REMOTE}" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/${JSON_FILE} + '{"url" : $url, "save_time": $save_time}' > "${DB_LOCAL}/${JSON_FILE}" fi diff --git a/bin/get_lineage.sh b/bin/get_lineage.sh index 63b6ec0..cd57737 100755 --- a/bin/get_lineage.sh +++ b/bin/get_lineage.sh @@ -6,8 +6,8 @@ # Save results of individual sample into .csv with its name as filename sed 's/^/prefix_/' "$QFILE" > safe_qfile.txt -poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads $(nproc) +poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads "$(nproc)" sed 's/^prefix_//' output/output_external_clusters.csv > result.txt -awk -F , 'NR!=1 { print "\"GPSC\"\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file +awk -F , 'NR!=1 { print "\"GPSC\"\n" "\"" $2 "\"" > $1 ".csv" }' result.txt diff --git a/bin/get_mlst.sh b/bin/get_mlst.sh index ab7c8e9..72e0400 100755 --- a/bin/get_mlst.sh +++ b/bin/get_mlst.sh @@ -2,7 +2,7 @@ OUTPUT='output.tsv' -mlst --legacy --scheme spneumoniae "$ASSEMBLY" > $OUTPUT +mlst --legacy --scheme spneumoniae "$ASSEMBLY" > 
"$OUTPUT" ST=$(awk -F'\t' 'FNR == 2 {print $3}' $OUTPUT) aroE=$(awk -F'\t' 'FNR == 2 {print $4}' $OUTPUT) @@ -13,5 +13,5 @@ spi=$(awk -F'\t' 'FNR == 2 {print $8}' $OUTPUT) xpt=$(awk -F'\t' 'FNR == 2 {print $9}' $OUTPUT) ddl=$(awk -F'\t' 'FNR == 2 {print $10}' $OUTPUT) -echo \"ST\",\"aroE\",\"gdh\",\"gki\",\"recP\",\"spi\",\"xpt\",\"ddl\" > $MLST_REPORT -echo \"$ST\",\"$aroE\",\"$gdh\",\"$gki\",\"$recP\",\"$spi\",\"$xpt\",\"$ddl\" >> $MLST_REPORT \ No newline at end of file +echo \"ST\",\"aroE\",\"gdh\",\"gki\",\"recP\",\"spi\",\"xpt\",\"ddl\" > "$MLST_REPORT" +echo \""$ST"\",\""$aroE"\",\""$gdh"\",\""$gki"\",\""$recP"\",\""$spi"\",\""$xpt"\",\""$ddl"\" >> "$MLST_REPORT" From 2449fdaf9e26ec61bf606682d8067edf6bf28656 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 17:04:47 +0000 Subject: [PATCH 058/157] Fix comment Former-commit-id: 00e3de37a77c7ee13aa179509a78d69e635870b0 --- bin/generate_overall_report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index 602e1d0..f238979 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -23,7 +23,7 @@ } -# Check argv and save the global variables +# Check argv and save to global variables if len(sys.argv) != 4: sys.exit('Usage: generate_overall_report.py WORKDIR_PATH ARIBA_METADATA OUTPUT_FILE') WORKDIR_PATH = sys.argv[1] @@ -35,7 +35,7 @@ def main(): output_columns = get_output_columns() df_output = get_df_output(output_columns) - # Saving df_output to output_file in csv format + # Saving df_output to OUTPUT_FILE in csv format df_output.to_csv(OUTPUT_FILE, index=False, na_rep='_') From e0d2010b02cb46362e05f815e3e1e3552a5affba Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 17:10:04 +0000 Subject: [PATCH 059/157] Refactor to improve maintainability & readability Former-commit-id: b813ad0a8aba3f2e252b5d00d659e96e64d024cc --- bin/get_other_resistance.py | 169 +++++++++++++++++++++--------------- 1 file changed, 100 insertions(+), 69 deletions(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index d902f59..8743ff7 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -8,72 +8,97 @@ import pandas as pd import csv -report_path = sys.argv[1] -debug_report_path = sys.argv[2] -metadata_path = sys.argv[3] -output_file = sys.argv[4] -with open(report_path) as report, open(debug_report_path) as debug_report, open(metadata_path) as metadata: - # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata - gene_dict = defaultdict(dict) - - # For saving targets found in metadata as key and their determinants (add to a set) as value - target_dict = {} - - # Skip the header in metadata - next(metadata) - # Go through lines in metadata and save findings to gene_dict and target_dict - for line in (line.strip() for line in metadata): - # Extract useful fields - fields = [str(field) for field in line.split("\t")] - ref_name, gene, var_only, var_change, _, target = fields - - # Populating gene_dict - gene_dict[(ref_name, gene, var_only)].update({var_change: target}) - # Populating target_dict - target_dict.update({target: set()}) - - # Skip the header in report and debug report - next(report) - next(debug_report) - # Go through lines in both report and debug report to detect targets - for line in (line.strip() for 
line in chain(report, debug_report)): - # Extract useful fields - fields = [str(field) for field in line.split("\t")] - ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21] - - # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line - if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: - continue - - # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line - gene_dict_key = (ref_name, gene, var_only) - try: - target = gene_dict[gene_dict_key][known_var_change] - except KeyError: - continue - - # Logic for gene detection. Found means hit. - if var_only == "0": - target_dict[target].add(f'{ref_name}') - - # Logic for variant detection, further criteria required - if var_only == "1": - # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201 - if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): - pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' - target_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') - # Common criteria: the assembly has that variant - elif has_known_var == "1": - target_dict[target].add(f'{ref_name} Variant {known_var_change}') +# Check argv and save to global variables +if len(sys.argv) != 5: + sys.exit('Usage: get_other_resistance.py REPORT_PATH DEBUG_REPORT_PATH METADATA_PATH OUTPUT_FILE') + +REPORT_PATH = sys.argv[1] +DEBUG_REPORT_PATH = sys.argv[2] +METADATA_PATH = sys.argv[3] +OUTPUT_FILE = sys.argv[4] + + +def main(): + targets_dict, hits_dict = prepare_dicts() + find_hits(targets_dict, hits_dict) + output = get_output(hits_dict) + # Save output to OUTPUT_FILE in csv format + pd.DataFrame([output]).to_csv(OUTPUT_FILE, index=False, quoting=csv.QUOTE_ALL) + + +def prepare_dicts(): + # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata + # Used to search whether there is a hit in the ARIBA result + targets_dict = defaultdict(dict) + + # For saving targets found in metadata as key and their determinants (i.e. 
hits) found in ARIBA result as values in set + hits_dict = {} + + with open(METADATA_PATH) as metadata: + # Skip the header in metadata + next(metadata) + + # Go through lines in metadata and save findings to targets_dict and hits_dict + for line in (line.strip() for line in metadata): + # Extract useful fields + fields = [str(field) for field in line.split("\t")] + ref_name, gene, var_only, var_change, _, target = fields + + # Populating targets_dict + targets_dict[(ref_name, gene, var_only)].update({var_change: target}) + # Populating hits_dict + hits_dict.update({target: set()}) + + return targets_dict, hits_dict + + +def find_hits(targets_dict, hits_dict): + with open(REPORT_PATH) as report, open(DEBUG_REPORT_PATH) as debug_report: + # Skip the header in report and debug report + next(report) + next(debug_report) + + # Go through lines in both report and debug report to detect targets + for line in (line.strip() for line in chain(report, debug_report)): + # Extract useful fields + fields = [str(field) for field in line.split("\t")] + ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21] + + # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line + if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: + continue + + # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line + try: + target = targets_dict[(ref_name, gene, var_only)][known_var_change] + except KeyError: + continue + + # Logic for gene detection. Found means hit. 
+ if var_only == "0": + hits_dict[target].add(f'{ref_name}') + + # Logic for variant detection, further criteria required + if var_only == "1": + # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201 + if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): + pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' + hits_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') + # Common criteria: the assembly has that variant + elif has_known_var == "1": + hits_dict[target].add(f'{ref_name} Variant {known_var_change}') + + +def get_output(hits_dict): # For saving final output, where information is saved per-target output = {} - # Go through targets in metadata - for target in target_dict: + # Go through targets in hits_dict + for target in hits_dict: # If the target has no hit, set output as S or NEG (only for PILI-1/2), and determinant as _ - if len(target_dict[target]) == 0: + if len(hits_dict[target]) == 0: if target.lower().startswith('pili'): output[target] = 'NEG' else: @@ -87,10 +112,15 @@ else: output[f'{target}_Res'] = 'R' - output[f'{target}_Determinant'] = '; '.join(target_dict[target]) + output[f'{target}_Determinant'] = '; '.join(sorted(hits_dict[target])) - # Special cases to add to output + add_output_special_cases(output, hits_dict) + return output + + +# Special cases to add to output +def add_output_special_cases(output, hits_dict): # If TET exists and DOX does not: add DOX to output; directly copy output and determinant if 'TET_Res' in output and 'DOX_Res' not in output: output['DOX_Res'] = output['TET_Res'] @@ -107,15 +137,15 @@ if 'TMP_Res' in output and 'SMX_Res' in output and 'COT_Res' not in output: if output['TMP_Res'] == 'R' and output['SMX_Res'] == 'R': output['COT_Res'] = 'R' - output['COT_Determinant'] = '; '.join(target_dict['TMP'].union(target_dict['SMX'])) + output['COT_Determinant'] = '; '.join(sorted(hits_dict['TMP'].union(hits_dict['SMX']))) elif (output['TMP_Res'] == 'R') ^ (output['SMX_Res'] == 'R'): output['COT_Res'] = 'I' - output['COT_Determinant'] = '; '.join(target_dict['TMP'].union(target_dict['SMX'])) + output['COT_Determinant'] = '; '.join(sorted(hits_dict['TMP'].union(hits_dict['SMX']))) elif output['TMP_Res'] == 'S' and output['SMX_Res'] == 'S': output['COT_Res'] = 'S' output['COT_Determinant'] = '_' - # If ERY_CLI exists, add ERY and CLI to output. + # If ERY_CLI exists: add ERY and CLI to output. 
# If ERY_CLI is R, ERY and CLI are R, and add ERY_CLI determinant to their determinants # If ERY_CLI is S, ERY and CLI are S if they do not already exist, otherwise leave them unchanged if 'ERY_CLI_Res' in output: @@ -126,8 +156,9 @@ output['ERY_Res'] = output['ERY_Res'] if 'ERY_Res' in output else 'S' output['CLI_Res'] = output['CLI_Res'] if 'CLI_Res' in output else 'S' - output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict and len(target_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] - output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict and len(target_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] + output['ERY_Determinant'] = '; '.join(sorted(hits_dict['ERY_CLI'].union(hits_dict['ERY']))) if 'ERY' in hits_dict and len(hits_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] + output['CLI_Determinant'] = '; '.join(sorted(hits_dict['ERY_CLI'].union(hits_dict['CLI']))) if 'CLI' in hits_dict and len(hits_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] + - # Save output dict as csv - pd.DataFrame([output]).to_csv(output_file, index=False, quoting=csv.QUOTE_ALL) \ No newline at end of file +if __name__ == "__main__": + main() From d377c2928068a9e9238b467c80dd752f86705644 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 09:31:21 +0000 Subject: [PATCH 060/157] Improve code comments Former-commit-id: 6002529434876cfc7e91d35f6782fc043e8c327c --- bin/get_other_resistance.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index 8743ff7..a77eab6 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -28,12 +28,13 @@ def main(): pd.DataFrame([output]).to_csv(OUTPUT_FILE, index=False, quoting=csv.QUOTE_ALL) +# Prepare targets_dict for searching hits and hits_dict for saving hits def prepare_dicts(): - # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata + # For saving (reference, gene, var_only) combinations as keys and their information found in metadata as values in dict format (i.e. {var_change: target}) # Used to search whether there is a hit in the ARIBA result targets_dict = defaultdict(dict) - # For saving targets found in metadata as key and their determinants (i.e. hits) found in ARIBA result as values in set + # For saving targets found in metadata as key and their determinants (i.e. 
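
The co-trimoxazole (COT) inference in add_output_special_cases reduces to a three-way rule. Restated as a standalone function with quick checks; collapsing the final both-susceptible branch into a default is a simplification of the original elif chain:

def infer_cot(tmp_res, smx_res):
    if tmp_res == 'R' and smx_res == 'R':      # resistant to both components
        return 'R'
    if (tmp_res == 'R') ^ (smx_res == 'R'):    # resistant to exactly one
        return 'I'
    return 'S'                                 # susceptible to both

assert infer_cot('R', 'R') == 'R'
assert infer_cot('R', 'S') == 'I'
assert infer_cot('S', 'R') == 'I'
assert infer_cot('S', 'S') == 'S'
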
hits) found in ARIBA result as values in set format hits_dict = {} with open(METADATA_PATH) as metadata: @@ -54,6 +55,7 @@ def prepare_dicts(): return targets_dict, hits_dict +# Finding hits in ARIBA results based on targets_dict and save hits to hits_dict def find_hits(targets_dict, hits_dict): with open(REPORT_PATH) as report, open(DEBUG_REPORT_PATH) as debug_report: # Skip the header in report and debug report @@ -91,6 +93,7 @@ def find_hits(targets_dict, hits_dict): hits_dict[target].add(f'{ref_name} Variant {known_var_change}') +# Generating final output dataframe based on hits_dict def get_output(hits_dict): # For saving final output, where information is saved per-target output = {} From 40613bbb22e12e670a1cf12eff6eb764a87d8059 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 11:14:12 +0000 Subject: [PATCH 061/157] Improve shell scripts style Former-commit-id: 63ad10beee0e35b3578e30c985a4e9fe732a4637 --- bin/generate_sample_report.sh | 2 ++ bin/get_pbp_resistance.sh | 12 ++++++------ bin/get_poppunk_db.sh | 30 +++++++++++++++--------------- bin/get_poppunk_ext_clusters.sh | 15 +++++++-------- bin/get_seroba_db.sh | 12 ++++++------ bin/get_serotype.sh | 7 ++++--- bin/get_tools_info.sh | 4 ++-- bin/mapping_qc.sh | 8 ++++---- bin/overall_qc.sh | 4 ++-- bin/read_qc.sh | 8 ++++---- bin/taxonomy_qc.sh | 8 ++++---- 11 files changed, 56 insertions(+), 54 deletions(-) diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh index cb7ab52..2a82172 100755 --- a/bin/generate_sample_report.sh +++ b/bin/generate_sample_report.sh @@ -1,3 +1,5 @@ +# Combine all csv reports into a single csv, then add Sample_ID as the first field + paste -d , *.csv \ | sed '1 s/^/\"Sample_ID\",/' \ | sed "2 s/^/\"${SAMPLE_ID}\",/" > "$SAMPLE_REPORT" diff --git a/bin/get_pbp_resistance.sh b/bin/get_pbp_resistance.sh index 5e833c3..0c9c943 100755 --- a/bin/get_pbp_resistance.sh +++ b/bin/get_pbp_resistance.sh @@ -3,13 +3,13 @@ # For all, replace null or space-only string with empty string function GET_VALUE { - echo $( < $JSON_FILE jq -r --arg target "$1" '.[$target]' \ - | sed 's/^null$//g;s/^\s+$//g' ) + < "$JSON_FILE" jq -r --arg target "$1" '.[$target]' \ + | sed 's/^null$//g;s/^\s+$//g' } function GET_RES { - echo $( < $JSON_FILE jq -r --arg target "$1" '.[$target]' \ - | sed 's/^null$//g;s/^\s+$//g' ) + < "$JSON_FILE" jq -r --arg target "$1" '.[$target]' \ + | sed 's/^null$//g;s/^\s+$//g' } pbp1a=$(GET_VALUE "pbp1a") @@ -31,5 +31,5 @@ PEN_MIC=$(GET_VALUE "penMic") PEN_NONMENINGITIS=$(GET_RES "penNonMeningitis") PEN_MENINGITIS=$(GET_RES "penMeningitis") -echo \"pbp1a\",\"pbp2b\",\"pbp2x\",\"AMO_MIC\",\"AMO_Res\",\"CFT_MIC\",\"CFT_Res\(Meningital\)\",\"CFT_Res\(Non-meningital\)\",\"TAX_MIC\",\"TAX_Res\(Meningital\)\",\"TAX_Res\(Non-meningital\)\",\"CFX_MIC\",\"CFX_Res\",\"MER_MIC\",\"MER_Res\",\"PEN_MIC\",\"PEN_Res\(Meningital\)\",\"PEN_Res\(Non-meningital\)\" > $PBP_AMR_REPORT -echo \"$pbp1a\",\"$pbp2b\",\"$pbp2x\",\"$AMO_MIC\",\"$AMO\",\"$CFT_MIC\",\"$CFT_MENINGITIS\",\"$CFT_NONMENINGITIS\",\"$TAX_MIC\",\"$TAX_MENINGITIS\",\"$TAX_NONMENINGITIS\",\"$CFX_MIC\",\"$CFX\",\"$MER_MIC\",\"$MER\",\"$PEN_MIC\",\"$PEN_MENINGITIS\",\"$PEN_NONMENINGITIS\" >> $PBP_AMR_REPORT \ No newline at end of file +echo 
\"pbp1a\",\"pbp2b\",\"pbp2x\",\"AMO_MIC\",\"AMO_Res\",\"CFT_MIC\",\"CFT_Res\(Meningital\)\",\"CFT_Res\(Non-meningital\)\",\"TAX_MIC\",\"TAX_Res\(Meningital\)\",\"TAX_Res\(Non-meningital\)\",\"CFX_MIC\",\"CFX_Res\",\"MER_MIC\",\"MER_Res\",\"PEN_MIC\",\"PEN_Res\(Meningital\)\",\"PEN_Res\(Non-meningital\)\" > "$PBP_AMR_REPORT" +echo \""$pbp1a"\",\""$pbp2b"\",\""$pbp2x"\",\""$AMO_MIC"\",\""$AMO"\",\""$CFT_MIC"\",\""$CFT_MENINGITIS"\",\""$CFT_NONMENINGITIS"\",\""$TAX_MIC"\",\""$TAX_MENINGITIS"\",\""$TAX_NONMENINGITIS"\",\""$CFX_MIC"\",\""$CFX"\",\""$MER_MIC"\",\""$MER"\",\""$PEN_MIC"\",\""$PEN_MENINGITIS"\",\""$PEN_NONMENINGITIS"\" >> "$PBP_AMR_REPORT" diff --git a/bin/get_poppunk_db.sh b/bin/get_poppunk_db.sh index d4e705a..48a0198 100755 --- a/bin/get_poppunk_db.sh +++ b/bin/get_poppunk_db.sh @@ -6,27 +6,27 @@ DB_NAME=$(basename "$DB_REMOTE" .tar.gz) DB_PATH=${DB_LOCAL}/${DB_NAME} -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/${JSON_FILE})" ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.h5 ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.dists.npy ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.dists.pkl ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_fit.npz ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_fit.pkl ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_graph.gt ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_clusters.csv ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.refs ]; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$DB_REMOTE" == "$(jq -r .url "${DB_LOCAL}/${JSON_FILE}")" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.h5" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.dists.npy" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.dists.pkl" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_fit.npz" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_fit.pkl" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_graph.gt" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_clusters.csv" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.refs" ]; then - rm -rf ${DB_LOCAL}/${JSON_FILE} - rm -rf ${DB_LOCAL}/*/ + rm -rf "${DB_LOCAL:?}/${JSON_FILE}" + rm -rf "${DB_LOCAL:?}"/*/ - wget $DB_REMOTE -O poppunk_db.tar.gz - tar -xzf poppunk_db.tar.gz -C $DB_LOCAL + wget "$DB_REMOTE" -O poppunk_db.tar.gz + tar -xzf poppunk_db.tar.gz -C "$DB_LOCAL" rm poppunk_db.tar.gz jq -n \ --arg url "$DB_REMOTE" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/${JSON_FILE} + '{"url" : $url, "save_time": $save_time}' > "${DB_LOCAL}/${JSON_FILE}" fi diff --git a/bin/get_poppunk_ext_clusters.sh b/bin/get_poppunk_ext_clusters.sh index e330968..273ccbb 100755 --- a/bin/get_poppunk_ext_clusters.sh +++ b/bin/get_poppunk_ext_clusters.sh @@ -4,20 +4,19 @@ # If not: remove all csv files, and download to database directory, also save metadata to JSON EXT_CLUSTERS_CSV=$(basename "$EXT_CLUSTERS_REMOTE") -EXT_CLUSTERS_NAME=$(basename "$EXT_CLUSTERS_REMOTE" .csv) -if [ ! -f ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} ] || \ - [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url ${EXT_CLUSTERS_LOCAL}/${JSON_FILE})" ] || \ - [ ! -f ${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV} ]; then +if [ ! -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}")" ] || \ + [ ! 
-f "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" ]; then - rm -f ${EXT_CLUSTERS_LOCAL}/*.csv - rm -f ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} + rm -f "${EXT_CLUSTERS_LOCAL}"/*.csv + rm -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" - wget $EXT_CLUSTERS_REMOTE -O ${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV} + wget "$EXT_CLUSTERS_REMOTE" -O "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" jq -n \ --arg url "$EXT_CLUSTERS_REMOTE" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} + '{"url" : $url, "save_time": $save_time}' > "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" fi diff --git a/bin/get_seroba_db.sh b/bin/get_seroba_db.sh index a3e1d3c..0cda2fc 100755 --- a/bin/get_seroba_db.sh +++ b/bin/get_seroba_db.sh @@ -5,13 +5,13 @@ # Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep 'git' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "${DB_REMOTE}" ] || \ - [ ! "$(grep 'kmer' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "${KMER}" ] || \ - !((git -C ${DB_LOCAL} pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date'); then +if [ ! -f "${DB_LOCAL}"/"${JSON_FILE}" ] || \ + [ ! "$(grep 'git' "${DB_LOCAL}"/"${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "${DB_REMOTE}" ] || \ + [ ! "$(grep 'kmer' "${DB_LOCAL}"/"${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "${KMER}" ] || \ + ! ( (git -C "${DB_LOCAL}" pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date' ); then - rm -rf ${DB_LOCAL}/{,.[!.],..?}* - git clone ${DB_REMOTE} ${DB_LOCAL} + rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* + git clone "${DB_REMOTE}" "${DB_LOCAL}" CREATE_DB=true diff --git a/bin/get_serotype.sh b/bin/get_serotype.sh index 80bfcc7..560cdd7 100755 --- a/bin/get_serotype.sh +++ b/bin/get_serotype.sh @@ -1,9 +1,10 @@ # Run SeroBA to serotype samples + { - seroba runSerotyping "$SEROBA_DIR"/"$DATABASE" "$READ1" "$READ2" "$SAMPLE_ID" && SEROTYPE=$(awk -F'\t' '{ print $2 }' ${SAMPLE_ID}/pred.tsv) + seroba runSerotyping "${SEROBA_DIR}/${DATABASE}" "$READ1" "$READ2" "$SAMPLE_ID" && SEROTYPE=$(awk -F'\t' '{ print $2 }' "${SAMPLE_ID}/pred.tsv") } || { SEROTYPE="SEROBA FAILURE" } -echo \"Serotype\" > $SEROTYPE_REPORT -echo \"$SEROTYPE\" >> $SEROTYPE_REPORT \ No newline at end of file +echo \"Serotype\" > "$SEROTYPE_REPORT" +echo \""$SEROTYPE"\" >> "$SEROTYPE_REPORT" diff --git a/bin/get_tools_info.sh b/bin/get_tools_info.sh index 23d9520..9d20e16 100755 --- a/bin/get_tools_info.sh +++ b/bin/get_tools_info.sh @@ -1,7 +1,7 @@ # Save received tools versions into a JSON file add_version () { - jq -n --arg version $1 '.version = $version' + jq -n --arg version "$1" '.version = $version' } jq -n \ @@ -19,4 +19,4 @@ jq -n \ --argjson kraken2 "$(add_version "$KRAKEN2_VERSION")" \ --argjson seroba "$(add_version "$SEROBA_VERSION")" \ --argjson ariba "$(add_version "$ARIBA_VERSION")" \ - '$ARGS.named' > $JSON_FILE + '$ARGS.named' > "$JSON_FILE" diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh index 75b18a0..f6eb48e 100755 --- a/bin/mapping_qc.sh +++ b/bin/mapping_qc.sh @@ -1,12 +1,12 @@ # Extract mapping QC information and determine QC result based on reference coverage and count of Het-SNP sites -COVERAGE=$(printf %.2f $COVERAGE) +COVERAGE=$(printf %.2f "$COVERAGE") -if (( $(echo "$COVERAGE > $QC_REF_COVERAGE" | bc -l) )) && (( $HET_SNP < $QC_HET_SNP_SITE )); then +if [[ "$(echo "$COVERAGE > $QC_REF_COVERAGE" | bc -l)" == 1 ]] && [[ 
$HET_SNP -lt $QC_HET_SNP_SITE ]]; then
     MAPPING_QC="PASS"
 else
     MAPPING_QC="FAIL"
 fi
 
-echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > $MAPPING_QC_REPORT
-echo \"$MAPPING_QC\",\"$COVERAGE\",\"$QC_HET_SNP_SITE\" >> $MAPPING_QC_REPORT
\ No newline at end of file
+echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > "$MAPPING_QC_REPORT"
+echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$QC_HET_SNP_SITE"\" >> "$MAPPING_QC_REPORT"
diff --git a/bin/overall_qc.sh b/bin/overall_qc.sh
index de7e116..d83e52a 100755
--- a/bin/overall_qc.sh
+++ b/bin/overall_qc.sh
@@ -11,5 +11,5 @@ else
     OVERALL_QC="FAIL"
 fi
 
-echo \"Overall_QC\" > $OVERALL_QC_REPORT
-echo \"$OVERALL_QC\" >> $OVERALL_QC_REPORT
\ No newline at end of file
+echo \"Overall_QC\" > "$OVERALL_QC_REPORT"
+echo \""$OVERALL_QC"\" >> "$OVERALL_QC_REPORT"
diff --git a/bin/read_qc.sh b/bin/read_qc.sh
index 6ce8382..14d7519 100755
--- a/bin/read_qc.sh
+++ b/bin/read_qc.sh
@@ -1,12 +1,12 @@
 # Extract total base count and determine QC result based on output JSON file of fastp
 
-BASES=$(< $JSON jq -r .summary.after_filtering.total_bases)
+BASES=$(< "$JSON" jq -r .summary.after_filtering.total_bases)
 
-if (( $(echo "$BASES >= ($QC_LENGTH_LOW*$QC_DEPTH)" | bc -l) )); then
+if [[ "$(echo "$BASES >= ($QC_LENGTH_LOW*$QC_DEPTH)" | bc -l)" == 1 ]]; then
     READ_QC="PASS"
 else
     READ_QC="FAIL"
 fi
 
-echo \"Read_QC\",\"Bases\" > $READ_QC_REPORT
-echo \"$READ_QC\",\"$BASES\" >> $READ_QC_REPORT
\ No newline at end of file
+echo \"Read_QC\",\"Bases\" > "$READ_QC_REPORT"
+echo \"$READ_QC\",\""$BASES"\" >> "$READ_QC_REPORT"
diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh
index 23254b1..7528b1d 100755
--- a/bin/taxonomy_qc.sh
+++ b/bin/taxonomy_qc.sh
@@ -1,16 +1,16 @@
 # Extract taxonomy QC information and determine QC result based on kraken2_report.txt
 
-PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' $KRAKEN2_REPORT)
+PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' "$KRAKEN2_REPORT")
 
 if [ -z "$PERCENTAGE" ]; then
     PERCENTAGE="0.00"
 fi
 
-if (( $(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l) )); then
+if [[ "$(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l)" == 1 ]]; then
    TAXONOMY_QC="PASS"
 else
    TAXONOMY_QC="FAIL"
 fi
 
-echo \"Taxonomy_QC\",\"S.Pneumo_%\" > $TAXONOMY_QC_REPORT
-echo \"$TAXONOMY_QC\",\"$PERCENTAGE\" >> $TAXONOMY_QC_REPORT
\ No newline at end of file
+echo \"Taxonomy_QC\",\"S.Pneumo_%\" > "$TAXONOMY_QC_REPORT"
+echo \"$TAXONOMY_QC\",\""$PERCENTAGE"\" >> "$TAXONOMY_QC_REPORT"

From c47ca63a1c6e04dc9c2f755c8fd5e3147b508207 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Fri, 28 Jul 2023 14:49:25 +0000
Subject: [PATCH 062/157] Fix outputting incorrect variable for Het-SNP#

Former-commit-id: 2e6707ad4889d1018856400a405d6b5a35003e93
---
 bin/mapping_qc.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh
index f6eb48e..a737d09 100755
--- a/bin/mapping_qc.sh
+++ b/bin/mapping_qc.sh
@@ -9,4 +9,4 @@ else
 fi
 
 echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > "$MAPPING_QC_REPORT"
-echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$QC_HET_SNP_SITE"\" >> "$MAPPING_QC_REPORT"
+echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$HET_SNP"\" >> "$MAPPING_QC_REPORT"

From 346e6be5e8c12f379a20f617f892dd1235e77729 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Fri, 28 Jul 2023 15:42:41 +0000
Subject: [PATCH 063/157] Avoid 
numbers output as float Former-commit-id: 8d352582bcbd82a34797e22c2f0433d588e5d6a7 --- bin/generate_overall_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index f238979..59a1737 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -81,7 +81,7 @@ def get_df_output(output_columns): dfs = [df_manifest] reports = glob.glob(WORKDIR_PATH +'/*.csv') for report in reports: - df = pd.read_csv(report) + df = pd.read_csv(report, dtype=str) dfs.append(df) df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) From 4e60552ec4a66f7c661477d8ef096c0418f16fac Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 15:44:33 +0000 Subject: [PATCH 064/157] Refactor to improve maintainability & readability Former-commit-id: 1327f764db5a22a74da889a101104c99cbbaeb22 --- bin/het_snp_count.py | 171 ++++++++++++++++++++++++++----------------- modules/mapping.nf | 4 +- 2 files changed, 107 insertions(+), 68 deletions(-) diff --git a/bin/het_snp_count.py b/bin/het_snp_count.py index 74d45ee..cf92dc6 100755 --- a/bin/het_snp_count.py +++ b/bin/het_snp_count.py @@ -5,72 +5,109 @@ import re import sys -# Input VCF path -vcf = sys.argv[1] -# Minimum distance between SNPs to not consider as part of cluster -min_snp_distance = int(sys.argv[2]) - - -with open(vcf) as f: - lines = [line.strip() for line in f] - - # List of positions of non-cluster Het-SNPs - het_noncluster_pos = [] - # Previous Het-SNP position. Initialise with the negative of min_snp_distance for calculation of the sites in starting positions - prev_het_pos = -min_snp_distance - - for line in lines: - # Skip lines of header and INDEL calls - if line.startswith("#") or "INDEL" in line: - continue - - # Get fields from the call - chrom, pos, id, ref, alt, qual, filter, info, format, sample = line.split("\t") - - # Get DP (The number of reads covering or bridging POS) from the INFO field - dp = re.search(r'DP=([0-9]+)', info).group(1) - # Get DP4 (Number of forward ref alleles; reverse ref; forward non-ref; reverse non-ref alleles, used in variant calling) from the INFO field - reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref = re.search(r'DP4=([0-9,]+)', info).group(1).split(",") - # Get MQ (Root-Mean-Square mapping quality of covering reads) from the INFO field - mq = re.search(r'MQ=([0-9]+)', info).group(1) - - # Get PV4 (P-values for strand bias; baseQ bias; mapQ bias; tail distance bias) from the INFO field; set to None if it is not found - try: - pv4 = re.search(r'PV4=([0-9,.]+)', info).group(1) - except AttributeError: - pv4 = None - - # Ensure qual is float - qual = float(qual) - # Ensure pos, dp, mq, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref are int - pos, dp, mq, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref = map(int, [pos, dp, mq, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref]) - - # Basic quality filter, skip this call if fails - if not(qual > 50 and dp > 5 and mq > 30 and reads_for_non_ref > 2 and reads_rev_non_ref > 2): - continue - - # Further quality filter if PV4 exists, skip this call if fails - if pv4 is not None: - pv_strand, pv_baseq, pv_mapq, pv_tail_distance = map(float, pv4.split(",")) - if not (pv_strand > 0.001 and pv_mapq > 0.001 and pv_tail_distance > 0.001): + +# Check argv and save to global variables +if len(sys.argv) != 4: + sys.exit('Usage: 
het_snp_count.py VCF MIN_SNP_DISTANCE OUTPUT_FILE') +VCF = sys.argv[1] +MIN_SNP_DISTANCE = int(sys.argv[2]) # Minimum distance between SNPs to not consider as part of cluster +OUTPUT_FILE=sys.argv[3] + + +def main(): + with open(VCF) as vcf, open(OUTPUT_FILE, 'w') as output_file: + lines = [line.strip() for line in vcf] + + # List of positions of non-cluster Het-SNPs + het_noncluster_pos = [] + # Previous Het-SNP position + prev_het_pos = None + + for line in lines: + # Skip lines of header and INDEL calls + if line.startswith("#") or "INDEL" in line: continue + + pos, qual, info = extract_vcf_fields(line) + + dp, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref, mq, pv4 = extract_info(info) + + if not quality_check(qual, dp, mq, reads_for_non_ref, reads_rev_non_ref, pv4): + continue + + if is_het_snp(het_noncluster_pos, pos, prev_het_pos, reads_for_non_ref, reads_for_ref, reads_rev_non_ref, reads_rev_ref): + # Mark current pos as previous Het-SNP pos for the next Het-SNP + prev_het_pos = pos + + # Save amount of non-cluster Het-SNP sites to OUTPUT_FILE + output_file.write(f'{len(het_noncluster_pos)}') + + +# Extract relevant fields from the call +def extract_vcf_fields(line): + fields = line.split("\t") + pos, qual, info = fields[1], fields[5], fields[7] + + # Ensure pos is int and qual is float + return int(pos), float(qual), info + + +# Extract information from the INFO field +def extract_info(info): + # Get DP (The number of reads covering or bridging POS) + dp = re.search(r'DP=([0-9]+)', info).group(1) + + # Get DP4 (Number of forward ref alleles; reverse ref; forward non-ref; reverse non-ref alleles, used in variant calling) + reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref = re.search(r'DP4=([0-9,]+)', info).group(1).split(",") + + # Get MQ (Root-Mean-Square mapping quality of covering reads) + mq = re.search(r'MQ=([0-9]+)', info).group(1) + + # Get PV4 (P-values for strand bias; baseQ bias; mapQ bias; tail distance bias); set to None if it is not found + try: + pv4 = re.search(r'PV4=([0-9,.]+)', info).group(1) + except AttributeError: + pv4 = None + + # Ensure dp, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref, mq are int + return *map(int, [dp, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref, mq]), pv4 + + +# Quality check for call +def quality_check(qual, dp, mq, reads_for_non_ref, reads_rev_non_ref, pv4): + # Basic quality check, skip this call if fails + if not(qual > 50 and dp > 5 and mq > 30 and reads_for_non_ref > 2 and reads_rev_non_ref > 2): + return False + + # Further quality check if PV4 exists, skip this call if fails + if pv4 is not None: + pv_strand, pv_baseq, pv_mapq, pv_tail_distance = map(float, pv4.split(",")) + if not (pv_strand > 0.001 and pv_mapq > 0.001 and pv_tail_distance > 0.001): + return False + + return True + + +# Check if this call is a Het-SNP and add/remove Het-SNP to/from het_noncluster_pos +def is_het_snp(het_noncluster_pos, pos, prev_het_pos, reads_for_non_ref, reads_for_ref, reads_rev_non_ref, reads_rev_ref): + # Calculate forward and reverse non-reference reads ratios (variant allele frequencies) + forward_non_ref_ratio = reads_for_non_ref / (reads_for_non_ref + reads_for_ref) + reverse_non_ref_ratio = reads_rev_non_ref / (reads_rev_non_ref + reads_rev_ref) + + # Consider as Het-SNP when both forward and reverse non-reference reads ratios are below 0.90 + if forward_non_ref_ratio < 0.90 and reverse_non_ref_ratio < 0.90: + # If the distance between current and previous 
Het-SNP position is >= the minimum non-cluster SNP distance or there is no previous Het-SNP,
+        # add the position to the list of non-cluster Het-SNP positions
+        if prev_het_pos is None or pos - prev_het_pos >= MIN_SNP_DISTANCE:
+            het_noncluster_pos.append(pos)
+        # If the last Het-SNP in the list of non-cluster Het-SNP positions is part of the current cluster, remove it
+        elif het_noncluster_pos and pos - het_noncluster_pos[-1] < MIN_SNP_DISTANCE:
+            het_noncluster_pos.pop()
+
+        return True
+
+    return False
+
-        # Calculate forward and reverse non-reference reads ratios (variant allele frequencies)
-        forward_non_ref_ratio = reads_for_non_ref / (reads_for_non_ref + reads_for_ref)
-        reverse_non_ref_ratio = reads_rev_non_ref / (reads_rev_non_ref + reads_rev_ref)
-
-        # Consider as Het-SNP when both forward and reverse non-reference reads ratios are below 0.90
-        if forward_non_ref_ratio < 0.90 and reverse_non_ref_ratio < 0.90:
-            # If the distance between current and previous Het-SNP position is >= the minimum non-cluster SNP distance,
-            # add the position to the list of non-cluster Het-SNP positions
-            if pos - prev_het_pos >= min_snp_distance:
-                het_noncluster_pos.append(pos)
-            # If the last Het-SNP in the list of non-cluster Het-SNP positions is part of the current cluster, remove it
-            elif het_noncluster_pos and pos - het_noncluster_pos[-1] < min_snp_distance:
-                het_noncluster_pos.pop()
-            # Mark current pos as previous Het-SNP pos for the next Het-SNP
-            prev_het_pos = pos
-
-    # Amount of non-cluster Het-SNP sites, print to be captured by Nextflow
-    het_noncluster_sites = len(het_noncluster_pos)
-    print(het_noncluster_sites, end="")
+
+if __name__ == "__main__":
+    main()
diff --git a/modules/mapping.nf b/modules/mapping.nf
index 0a37628..d545607 100644
--- a/modules/mapping.nf
+++ b/modules/mapping.nf
@@ -120,8 +120,10 @@ process HET_SNP_COUNT {
     tuple val(sample_id), env(OUTPUT), emit: result

     script:
+    het_snp_count_output='output.txt'
     """
-    OUTPUT=`het_snp_count.py "$vcf" 50`
+    het_snp_count.py "$vcf" 50 "$het_snp_count_output"
+    OUTPUT=`cat $het_snp_count_output`
     """
 }

From fb375e084fa90e0a3f076d8b2c8cfb6a2194b6c1 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Mon, 31 Jul 2023 13:45:59 +0000
Subject: [PATCH 065/157] Improve shell scripts style

Former-commit-id: 8f15a7de7d3a77d34a95d8ebf1ac166af9bb8a96
---
 bin/mapping_qc.sh  | 2 +-
 bin/read_qc.sh     | 2 +-
 bin/taxonomy_qc.sh | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh
index a737d09..9ed580d 100755
--- a/bin/mapping_qc.sh
+++ b/bin/mapping_qc.sh
@@ -9,4 +9,4 @@ else
 fi

 echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > "$MAPPING_QC_REPORT"
-echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$HET_SNP"\" >> "$MAPPING_QC_REPORT"
+echo \""$MAPPING_QC"\",\""$COVERAGE"\",\""$HET_SNP"\" >> "$MAPPING_QC_REPORT"
diff --git a/bin/read_qc.sh b/bin/read_qc.sh
index 14d7519..a72c1d7 100755
--- a/bin/read_qc.sh
+++ b/bin/read_qc.sh
@@ -9,4 +9,4 @@ else
 fi

 echo \"Read_QC\",\"Bases\" > "$READ_QC_REPORT"
-echo \"$READ_QC\",\""$BASES"\" >> "$READ_QC_REPORT"
+echo \""$READ_QC"\",\""$BASES"\" >> "$READ_QC_REPORT"
diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh
index 7528b1d..a867804 100755
--- a/bin/taxonomy_qc.sh
+++ b/bin/taxonomy_qc.sh
@@ -13,4 +13,4 @@ else
 fi

 echo \"Taxonomy_QC\",\"S.Pneumo_%\" > "$TAXONOMY_QC_REPORT"
-echo \"$TAXONOMY_QC\",\""$PERCENTAGE"\" >> "$TAXONOMY_QC_REPORT"
+echo \""$TAXONOMY_QC"\",\""$PERCENTAGE"\" >> "$TAXONOMY_QC_REPORT"
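The quoting pattern introduced above, `\""$VAR"\"`, writes a literal double-quoted CSV field while keeping the expansion of the variable itself inside shell double quotes. A minimal sketch of the behaviour, using made-up values rather than the pipeline's real QC outputs:

```bash
#!/usr/bin/env bash
# Sketch of the CSV quoting pattern used by the QC report scripts above;
# the MAPPING_QC and COVERAGE values are made up for illustration.
MAPPING_QC="PASS"
COVERAGE="99.9"

# \" emits a literal double quote, while "$VAR" keeps the expansion itself quoted
echo \""$MAPPING_QC"\",\""$COVERAGE"\" > mapping_qc_report.csv

cat mapping_qc_report.csv   # prints: "PASS","99.9"
```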
From 532509dc62daf5c1705a24e2d86dc971171df6e1 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Mon, 31 Jul 2023 14:06:38 +0000
Subject: [PATCH 066/157] Improve chart

Former-commit-id: 9d6ead6f62d07b58ca5e304f4fc45702d4a25b81
---
 doc/workflow.drawio.svg | 238 ++++++++++++++++++++--------------------
 1 file changed, 119 insertions(+), 119 deletions(-)

diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg
index 00766b6..873d1b7 100644
--- a/doc/workflow.drawio.svg
+++ b/doc/workflow.drawio.svg
[SVG diff omitted: the diagram markup was garbled in extraction. Recoverable labels: Input, Output, FASTQ (Reads), FASTA (Assemblies), SAM, Results; default QC thresholds: S. Pneumo > 60%, Contigs < 500, Length 1.9 - 2.3 Mb, Depth ≥ 20x, Ref Coverage > 60%, Het-SNP site < 220, Bases ≥ 38 Mb; Go / No-go decision points; footnote: "QC values shown in the diagram are the default values".]

From 119622d167e569163ef47083324e3c3d83984501 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Mon, 31 Jul 2023 14:37:56 +0000
Subject: [PATCH 067/157] Update Nextflow executable to 23.04.2

Former-commit-id: d899ec958206749174162c4e5e82a1511980bb17
---
 nextflow | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow b/nextflow
index a6ece4c..b7725ce 100755
--- a/nextflow
+++ b/nextflow
@@ -15,7 +15,7 @@
 # limitations under the License.

 [[ "$NXF_DEBUG" == 'x' ]] && set -x
-NXF_VER=${NXF_VER:-'23.04.1'}
+NXF_VER=${NXF_VER:-'23.04.2'}
 NXF_ORG=${NXF_ORG:-'nextflow-io'}
 NXF_HOME=${NXF_HOME:-$HOME/.nextflow}
 NXF_PROT=${NXF_PROT:-'https'}

From 5f0e3011e2ef2d74fc56a9dc02b0e81197e59a25 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Mon, 31 Jul 2023 14:47:49 +0000
Subject: [PATCH 068/157] Improve wording of messages.

Former-commit-id: a98510ee41c53e924b7fdd46949c1104d62b6bef
---
 modules/messages.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/messages.nf b/modules/messages.nf
index 8e13998..7a9d189 100644
--- a/modules/messages.nf
+++ b/modules/messages.nf
@@ -46,7 +46,7 @@ void workflowSelectMessage(String selectedWorkflow) {
     switch (selectedWorkflow) {
         case 'pipeline':
             message = """
-            |The main pipeline workflow was selected.
+            |The main pipeline workflow has been selected.
             |
             |Input Directory: ${readsDir.canonicalPath}
             |Output Directory: ${outputDir.canonicalPath}
             '''.stripMargin()
             break
         case 'init':
             message = '''
-            |The alternative workflow for initialisation was selected.
+            |The alternative workflow for initialisation has been selected.
             '''.stripMargin()
             break
         case 'version':
             message = '''
-            |The alternative workflow for getting versions of pipeline, tools and databases was selected.
+            |The alternative workflow for getting versions of pipeline, tools and databases has been selected.
'''.stripMargin() break } From 873ca1c426ab33303abc8b0a546b4ee9ab768c90 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:53:48 +0000 Subject: [PATCH 069/157] Improve shell scripts style Former-commit-id: fa3f58f26c148d373780e12886af10517c8335be --- bin/get_databases_info.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/get_databases_info.sh b/bin/get_databases_info.sh index 3d9dd98..10f2174 100755 --- a/bin/get_databases_info.sh +++ b/bin/get_databases_info.sh @@ -1,7 +1,7 @@ # Save received databases information into a JSON file add_bwa_db () { - BWA_DB_JSON=${BWA_DB_PATH}/${BWA_JSON} + BWA_DB_JSON="${BWA_DB_PATH}/${BWA_JSON}" if [ -f "$BWA_DB_JSON" ]; then REFERENCE=$(jq -r .reference "$BWA_DB_JSON") REFERENCE_MD5=$(jq -r .reference_md5 "$BWA_DB_JSON") @@ -15,7 +15,7 @@ add_bwa_db () { } add_ariba_db () { - ARIBA_DB_JSON=${ARIBA_DB_PATH}/${ARIBA_JSON} + ARIBA_DB_JSON="${ARIBA_DB_PATH}/${ARIBA_JSON}" if [ -f "$ARIBA_DB_JSON" ]; then REFERENCE=$(jq -r .reference "$ARIBA_DB_JSON") REFERENCE_MD5=$(jq -r .reference_md5 "$ARIBA_DB_JSON") @@ -33,7 +33,7 @@ add_ariba_db () { } add_seroba_db () { - SEROBA_DB_JSON=${SEROBA_DB_PATH}/${SEROBA_JSON} + SEROBA_DB_JSON="${SEROBA_DB_PATH}/${SEROBA_JSON}" if [ -f "$SEROBA_DB_JSON" ]; then GIT=$(jq -r .git "$SEROBA_DB_JSON") KMER=$(jq -r .kmer "$SEROBA_DB_JSON") @@ -47,7 +47,7 @@ add_seroba_db () { } add_url_db () { - DB_JSON=$1 + DB_JSON="$1" if [ -f "$DB_JSON" ]; then URL=$(jq -r .url "$DB_JSON") SAVE_TIME=$(jq -r .save_time "$DB_JSON") From ab20087a04365363751a15ed80f4811c5afba1a1 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 17:29:22 +0000 Subject: [PATCH 070/157] Improve names & comments of processes & scripts Former-commit-id: 6c54b3e998ca66a1ba47bda6d8845edaed8e62ce --- ...e_ariba_db.sh => check-create_ariba_db.sh} | 2 +- ...b.sh => check-create_ref_genome_bwa_db.sh} | 2 +- ...en2_db.sh => check-download_kraken2_db.sh} | 0 ...unk_db.sh => check-download_poppunk_db.sh} | 0 ...=> check-download_poppunk_ext_clusters.sh} | 0 bin/{get_seroba_db.sh => check_seroba_db.sh} | 6 +- ...er_compose.sh => create_docker_compose.sh} | 2 +- bin/{assembly_qc.sh => get_assembly_qc.sh} | 0 bin/{mapping_qc.sh => get_mapping_qc.sh} | 0 bin/{overall_qc.sh => get_overall_qc.sh} | 0 bin/{read_qc.sh => get_read_qc.sh} | 0 bin/{taxonomy_qc.sh => get_taxonomy_qc.sh} | 2 +- ...esistance.py => parse_other_resistance.py} | 0 ..._resistance.sh => parse_pbp_resistance.sh} | 0 ...{combine_info.sh => save_combined_info.sh} | 0 ...tabases_info.sh => save_databases_info.sh} | 0 ...get_images_info.sh => save_images_info.sh} | 0 bin/{get_tools_info.sh => save_tools_info.sh} | 0 modules/amr.nf | 16 ++--- modules/assembly.nf | 6 +- modules/docker.nf | 2 +- modules/info.nf | 8 +-- modules/lineage.nf | 4 +- modules/mapping.nf | 6 +- modules/overall_qc.nf | 2 +- modules/preprocess.nf | 2 +- modules/serotype.nf | 8 +-- modules/taxonomy.nf | 4 +- workflows/init.nf | 14 ++-- workflows/pipeline.nf | 66 ++++++++++--------- 30 files changed, 76 insertions(+), 76 deletions(-) rename bin/{create_ariba_db.sh => check-create_ariba_db.sh} (95%) rename bin/{create_ref_genome_bwa_db.sh => check-create_ref_genome_bwa_db.sh} (92%) rename bin/{get_kraken2_db.sh => check-download_kraken2_db.sh} (100%) rename bin/{get_poppunk_db.sh => check-download_poppunk_db.sh} (100%) rename bin/{get_poppunk_ext_clusters.sh => check-download_poppunk_ext_clusters.sh} 
(100%) rename bin/{get_seroba_db.sh => check_seroba_db.sh} (63%) rename bin/{get_docker_compose.sh => create_docker_compose.sh} (95%) rename bin/{assembly_qc.sh => get_assembly_qc.sh} (100%) rename bin/{mapping_qc.sh => get_mapping_qc.sh} (100%) rename bin/{overall_qc.sh => get_overall_qc.sh} (100%) rename bin/{read_qc.sh => get_read_qc.sh} (100%) rename bin/{taxonomy_qc.sh => get_taxonomy_qc.sh} (95%) rename bin/{get_other_resistance.py => parse_other_resistance.py} (100%) rename bin/{get_pbp_resistance.sh => parse_pbp_resistance.sh} (100%) rename bin/{combine_info.sh => save_combined_info.sh} (100%) rename bin/{get_databases_info.sh => save_databases_info.sh} (100%) rename bin/{get_images_info.sh => save_images_info.sh} (100%) rename bin/{get_tools_info.sh => save_tools_info.sh} (100%) diff --git a/bin/create_ariba_db.sh b/bin/check-create_ariba_db.sh similarity index 95% rename from bin/create_ariba_db.sh rename to bin/check-create_ariba_db.sh index fb2b657..32ff767 100755 --- a/bin/create_ariba_db.sh +++ b/bin/check-create_ariba_db.sh @@ -1,4 +1,4 @@ -# Check if CREATE_ARIBA_DB has run successfully on the specific reference sequences and metadata. +# Check if ARIBA database was prepared from the specific reference sequences and metadata. # If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to JSON REF_SEQUENCES_MD5=$(md5sum "$REF_SEQUENCES" | awk '{ print $1 }') diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/check-create_ref_genome_bwa_db.sh similarity index 92% rename from bin/create_ref_genome_bwa_db.sh rename to bin/check-create_ref_genome_bwa_db.sh index 385b609..65a7da8 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/check-create_ref_genome_bwa_db.sh @@ -1,4 +1,4 @@ -# Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. +# Check if BWA database was prepared from the specific reference. # If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to JSON REFERENCE_MD5=$(md5sum "$REFERENCE" | awk '{ print $1 }') diff --git a/bin/get_kraken2_db.sh b/bin/check-download_kraken2_db.sh similarity index 100% rename from bin/get_kraken2_db.sh rename to bin/check-download_kraken2_db.sh diff --git a/bin/get_poppunk_db.sh b/bin/check-download_poppunk_db.sh similarity index 100% rename from bin/get_poppunk_db.sh rename to bin/check-download_poppunk_db.sh diff --git a/bin/get_poppunk_ext_clusters.sh b/bin/check-download_poppunk_ext_clusters.sh similarity index 100% rename from bin/get_poppunk_ext_clusters.sh rename to bin/check-download_poppunk_ext_clusters.sh diff --git a/bin/get_seroba_db.sh b/bin/check_seroba_db.sh similarity index 63% rename from bin/get_seroba_db.sh rename to bin/check_seroba_db.sh index 0cda2fc..2e6ff2d 100755 --- a/bin/get_seroba_db.sh +++ b/bin/check_seroba_db.sh @@ -1,7 +1,5 @@ -# Return boolean of CREATE_DB, download if necessary - -# Check if GET_SEROBA_DB and CREATE_SEROBA_DB has run successfully on the database at the specific link, CREATE_SEROBA_DB used the specific Kmerm and pull to check if SeroBA database is up-to-date. 
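Each of the check-* scripts renamed above follows the same caching convention: a small JSON file stored alongside the database records how it was built, and the database is rebuilt only when the recorded values no longer match the current inputs. A minimal sketch of that convention, assuming a JSON with a single `reference_md5` field and a hypothetical `build_database` command standing in for the tool-specific build step:

```bash
#!/usr/bin/env bash
# Sketch of the check-and-rebuild convention shared by the check-* scripts;
# REFERENCE, DB_LOCAL and JSON_FILE are assumed to be set by the caller,
# and build_database is a hypothetical stand-in for the real build command.
REFERENCE_MD5=$(md5sum "$REFERENCE" | awk '{ print $1 }')

# Rebuild only when the JSON is missing or records a different source file
if [ ! -f "$JSON_FILE" ] || [ "$(jq -r .reference_md5 "$JSON_FILE")" != "$REFERENCE_MD5" ]; then
    rm -rf "${DB_LOCAL:?}"/*
    build_database "$REFERENCE" "$DB_LOCAL"
    jq -n --arg md5 "$REFERENCE_MD5" '{reference_md5: $md5}' > "$JSON_FILE"
fi
```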
-# If outdated or does not exist: remove files in database directory and clone, set CREATE_DB to true +# Check if database was cloned from specific link and is up-to-date, also prepared by the specific Kmer +# If not: remove files in database directory and clone, set CREATE_DB to true # Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage diff --git a/bin/get_docker_compose.sh b/bin/create_docker_compose.sh similarity index 95% rename from bin/get_docker_compose.sh rename to bin/create_docker_compose.sh index 5f8ff8b..d6fc3ba 100755 --- a/bin/get_docker_compose.sh +++ b/bin/create_docker_compose.sh @@ -1,4 +1,4 @@ -# Generate a Docker compose file that includes all images used in nextflow.config +# Generate a Docker compose file that includes all images used in $NEXTFLOW_CONFIG COUNT=0 diff --git a/bin/assembly_qc.sh b/bin/get_assembly_qc.sh similarity index 100% rename from bin/assembly_qc.sh rename to bin/get_assembly_qc.sh diff --git a/bin/mapping_qc.sh b/bin/get_mapping_qc.sh similarity index 100% rename from bin/mapping_qc.sh rename to bin/get_mapping_qc.sh diff --git a/bin/overall_qc.sh b/bin/get_overall_qc.sh similarity index 100% rename from bin/overall_qc.sh rename to bin/get_overall_qc.sh diff --git a/bin/read_qc.sh b/bin/get_read_qc.sh similarity index 100% rename from bin/read_qc.sh rename to bin/get_read_qc.sh diff --git a/bin/taxonomy_qc.sh b/bin/get_taxonomy_qc.sh similarity index 95% rename from bin/taxonomy_qc.sh rename to bin/get_taxonomy_qc.sh index a867804..cb1e382 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/get_taxonomy_qc.sh @@ -1,4 +1,4 @@ -# Extract taxonomy QC information and determine QC result based on kraken2_report.txt +# Extract taxonomy QC information and determine QC result based on $KRAKEN2_REPORT PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' "$KRAKEN2_REPORT") diff --git a/bin/get_other_resistance.py b/bin/parse_other_resistance.py similarity index 100% rename from bin/get_other_resistance.py rename to bin/parse_other_resistance.py diff --git a/bin/get_pbp_resistance.sh b/bin/parse_pbp_resistance.sh similarity index 100% rename from bin/get_pbp_resistance.sh rename to bin/parse_pbp_resistance.sh diff --git a/bin/combine_info.sh b/bin/save_combined_info.sh similarity index 100% rename from bin/combine_info.sh rename to bin/save_combined_info.sh diff --git a/bin/get_databases_info.sh b/bin/save_databases_info.sh similarity index 100% rename from bin/get_databases_info.sh rename to bin/save_databases_info.sh diff --git a/bin/get_images_info.sh b/bin/save_images_info.sh similarity index 100% rename from bin/get_images_info.sh rename to bin/save_images_info.sh diff --git a/bin/get_tools_info.sh b/bin/save_tools_info.sh similarity index 100% rename from bin/get_tools_info.sh rename to bin/save_tools_info.sh diff --git a/modules/amr.nf b/modules/amr.nf index 1fd57f4..d7a7206 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -19,7 +19,7 @@ process PBP_RESISTANCE { } // Extract the results from the output file of the PBP AMR predictor -process GET_PBP_RESISTANCE { +process PARSE_PBP_RESISTANCE { label 'bash_container' label 'farm_low' @@ -37,12 +37,12 @@ process GET_PBP_RESISTANCE { JSON_FILE="$json" PBP_AMR_REPORT="$pbp_amr_report" - source get_pbp_resistance.sh + source parse_pbp_resistance.sh """ } -// Create ARIBA database and return database path -process CREATE_ARIBA_DB { +// Return database path, create if necessary +process GET_ARIBA_DB { 
label 'ariba_container' label 'farm_low' @@ -65,7 +65,7 @@ process CREATE_ARIBA_DB { OUTPUT="$output" JSON_FILE="$json" - source create_ariba_db.sh + source check-create_ariba_db.sh """ } @@ -88,12 +88,12 @@ process OTHER_RESISTANCE { report='result/report.tsv' report_debug='result/debug.report.tsv' """ - ariba run --nucmer_min_id 80 --assembled_threshold 0.80 $ariba_database/$database $read1 $read2 result + ariba run --nucmer_min_id 80 --assembled_threshold 0.80 "$ariba_database/$database" "$read1" "$read2" result """ } // Extracting resistance information from ARIBA report -process GET_OTHER_RESISTANCE { +process PARSE_OTHER_RESISTANCE { label 'python_container' label 'farm_low' @@ -109,6 +109,6 @@ process GET_OTHER_RESISTANCE { script: output_file="other_amr_report.csv" """ - get_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file" + parse_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file" """ } diff --git a/modules/assembly.nf b/modules/assembly.nf index fd84c76..ab66b6e 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -1,5 +1,5 @@ // Run Unicycler to get assembly -// Return sample_id and assembly, and hardlink the assembly to ${params.output}/assemblies directory +// Return sample_id and assembly, and publish the assembly to ${params.output}/assemblies directory based on ${params.assembly_publish} process ASSEMBLY_UNICYCLER { label 'unicycler_container' label 'farm_high_fallible' @@ -26,7 +26,7 @@ process ASSEMBLY_UNICYCLER { } // Run Shovill to get assembly -// Return sample_id and assembly, and hardlink the assembly to ${params.output}/assemblies directory +// Return sample_id and assembly, and publish the assembly to ${params.output}/assemblies directory based on ${params.assembly_publish} process ASSEMBLY_SHOVILL { label 'shovill_container' label 'farm_high_fallible' @@ -99,6 +99,6 @@ process ASSEMBLY_QC { QC_DEPTH="$qc_depth" ASSEMBLY_QC_REPORT="$assembly_qc_report" - source assembly_qc.sh + source get_assembly_qc.sh """ } diff --git a/modules/docker.nf b/modules/docker.nf index 4090957..ef0236b 100644 --- a/modules/docker.nf +++ b/modules/docker.nf @@ -15,7 +15,7 @@ process GET_DOCKER_COMPOSE { NEXTFLOW_CONFIG="$nextflowConfig" COMPOSE="$compose" - source get_docker_compose.sh + source create_docker_compose.sh """ } diff --git a/modules/info.nf b/modules/info.nf index cde8662..a317a3b 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -18,7 +18,7 @@ process IMAGES { NEXTFLOW_CONFIG="$nextflowConfig" JSON_FILE="$json" - source get_images_info.sh + source save_images_info.sh """ } @@ -59,7 +59,7 @@ process DATABASES { POPPUNK_EXT_JSON="$poppunk_ext_json" JSON_FILE="$json" - source get_databases_info.sh + source save_databases_info.sh """ } @@ -106,7 +106,7 @@ process TOOLS { ARIBA_VERSION="$ariba_version" JSON_FILE="$json" - source get_tools_info.sh + source save_tools_info.sh """ } @@ -135,7 +135,7 @@ process COMBINE_INFO { TOOLS="$tools" JSON_FILE="$json" - source combine_info.sh + source save_combined_info.sh """ } diff --git a/modules/lineage.nf b/modules/lineage.nf index 68edae3..9090f02 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -18,7 +18,7 @@ process GET_POPPUNK_DB { DB_LOCAL="$local" JSON_FILE="$json" - source get_poppunk_db.sh + source check-download_poppunk_db.sh """ } @@ -41,7 +41,7 @@ process GET_POPPUNK_EXT_CLUSTERS { EXT_CLUSTERS_LOCAL="$local" JSON_FILE="$json" - source get_poppunk_ext_clusters.sh + source check-download_poppunk_ext_clusters.sh """ } diff --git a/modules/mapping.nf 
b/modules/mapping.nf index d545607..e5d7c4e 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -1,5 +1,5 @@ // Return database path and prefix, construct if necessary -process CREATE_REF_GENOME_BWA_DB { +process GET_REF_GENOME_BWA_DB { label 'bwa_container' label 'farm_mid' @@ -20,7 +20,7 @@ process CREATE_REF_GENOME_BWA_DB { PREFIX="$prefix" JSON_FILE="$json" - source create_ref_genome_bwa_db.sh + source check-create_ref_genome_bwa_db.sh """ } @@ -152,6 +152,6 @@ process MAPPING_QC { QC_HET_SNP_SITE="$qc_het_snp_site" MAPPING_QC_REPORT="$mapping_qc_report" - source mapping_qc.sh + source get_mapping_qc.sh """ } diff --git a/modules/overall_qc.nf b/modules/overall_qc.nf index fa639d9..31c8cca 100644 --- a/modules/overall_qc.nf +++ b/modules/overall_qc.nf @@ -21,6 +21,6 @@ process OVERALL_QC { TAXONOMY_QC="$taxonomy_qc" OVERALL_QC_REPORT="$overall_qc_report" - source overall_qc.sh + source get_overall_qc.sh """ } diff --git a/modules/preprocess.nf b/modules/preprocess.nf index e04b756..e87ef99 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -48,6 +48,6 @@ process READ_QC { QC_DEPTH="$qc_depth" READ_QC_REPORT="$read_qc_report" - source read_qc.sh + source get_read_qc.sh """ } diff --git a/modules/serotype.nf b/modules/serotype.nf index 0c69bad..02327dd 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -1,5 +1,5 @@ -// Return boolean of CREATE_DB, download if necessary -process GET_SEROBA_DB { +// Return boolean of CREATE_DB, remove and clone if necessary +process CHECK_SEROBA_DB { label 'git_container' label 'farm_low' @@ -19,12 +19,12 @@ process GET_SEROBA_DB { KMER="$kmer" JSON_FILE="$json" - source get_seroba_db.sh + source check_seroba_db.sh """ } // Return SeroBA databases path, create databases if necessary -process CREATE_SEROBA_DB { +process GET_SEROBA_DB { label 'seroba_container' label 'farm_low' diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index b4d1e62..34ebeab 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -17,7 +17,7 @@ process GET_KRAKEN2_DB { DB_LOCAL="$local" JSON_FILE="$json" - source get_kraken2_db.sh + source check-download_kraken2_db.sh """ } @@ -73,6 +73,6 @@ process TAXONOMY_QC { QC_SPNEUMO_PERCENTAGE="$qc_spneumo_percentage" TAXONOMY_QC_REPORT="$taxonomy_qc_report" - source taxonomy_qc.sh + source get_taxonomy_qc.sh """ } diff --git a/workflows/init.nf b/workflows/init.nf index 64a748f..20eff25 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -1,25 +1,25 @@ // Import process modules -include { CREATE_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" +include { GET_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" include { GET_KRAKEN2_DB } from "$projectDir/modules/taxonomy" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS } from "$projectDir/modules/lineage" -include { GET_SEROBA_DB; CREATE_SEROBA_DB } from "$projectDir/modules/serotype" +include { CHECK_SEROBA_DB; GET_SEROBA_DB } from "$projectDir/modules/serotype" include { GET_DOCKER_COMPOSE; PULL_IMAGES } from "$projectDir/modules/docker" -include { CREATE_ARIBA_DB } from "$projectDir/modules/amr" +include { GET_ARIBA_DB } from "$projectDir/modules/amr" // Alternative workflow for initialisation only workflow INIT { // Check Reference Genome BWA Database, generate from assembly if necessary - CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + GET_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Check ARIBA database, generate from reference sequences and metadata if ncessary - 
CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) + GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) // Check Kraken2 Database, download if necessary GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Check SeroBA Databases, clone and rebuild if necessary - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - CREATE_SEROBA_DB(params.seroba_remote, params.seroba_local, GET_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Check to PopPUNK Database and External Clusters, download if necessary GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index fd288c9..01f3172 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -1,34 +1,34 @@ // Import process modules include { PREPROCESS; READ_QC } from "$projectDir/modules/preprocess" include { ASSEMBLY_UNICYCLER; ASSEMBLY_SHOVILL; ASSEMBLY_ASSESS; ASSEMBLY_QC } from "$projectDir/modules/assembly" -include { CREATE_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" +include { GET_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" include { GET_KRAKEN2_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" include { OVERALL_QC } from "$projectDir/modules/overall_qc" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/modules/lineage" -include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" +include { CHECK_SEROBA_DB; GET_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" -include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" +include { PBP_RESISTANCE; PARSE_PBP_RESISTANCE; GET_ARIBA_DB; OTHER_RESISTANCE; PARSE_OTHER_RESISTANCE } from "$projectDir/modules/amr" include { GENERATE_SAMPLE_REPORT; GENERATE_OVERALL_REPORT } from "$projectDir/modules/output" // Main pipeline workflow workflow PIPELINE { main: // Get path and prefix of Reference Genome BWA Database, generate from assembly if necessary - CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + GET_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Get path to Kraken2 Database, download if necessary GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Get path to SeroBA Databases, clone and rebuild if necessary - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - CREATE_SEROBA_DB(params.seroba_remote, params.seroba_local, GET_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Get paths to PopPUNK Database and External Clusters, download if necessary GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) // Get path to ARIBA database, generate from reference sequences and metadata if ncessary - CREATE_ARIBA_DB(params.ariba_ref, 
params.ariba_metadata, params.ariba_db_local) + GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) // Get read pairs into Channel raw_read_pairs_ch raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) @@ -38,7 +38,7 @@ workflow PIPELINE { PREPROCESS(raw_read_pairs_ch) // From Channel PREPROCESS.out.json, provide Read QC status - // Output into Channel READ_QC_PASSED_READS_ch + // Output into Channels READ_QC.out.bases, READ_QC.out.result, READ_QC.out.report READ_QC(PREPROCESS.out.json, params.length_low, params.depth) // From Channel PREPROCESS.out.processed_reads, only output reads of samples passed Read QC based on Channel READ_QC.out.result @@ -47,7 +47,7 @@ workflow PIPELINE { .map { it[0, 2..-1] } // From Channel READ_QC_PASSED_READS_ch, assemble the preprocess read pairs - // Output into Channel ASSEMBLY_ch, and hardlink the assemblies to $params.output directory + // Output into Channel ASSEMBLY_ch, and hardlink (default) the assemblies to $params.output directory switch (params.assembler) { case 'shovill': ASSEMBLY_ch = ASSEMBLY_SHOVILL(READ_QC_PASSED_READS_ch, params.min_contig_length) @@ -59,10 +59,11 @@ workflow PIPELINE { } // From Channel ASSEMBLY_ch, assess assembly quality + // Output into Channel ASSEMBLY_ASSESS.out.report ASSEMBLY_ASSESS(ASSEMBLY_ch) // From Channel ASSEMBLY_ASSESS.out.report and Channel READ_QC.out.bases, provide Assembly QC status - // Output into Channels ASSEMBLY_QC.out.detailed_result & ASSEMBLY_QC.out.result + // Output into Channels ASSEMBLY_QC.out.result & ASSEMBLY_QC.out.report ASSEMBLY_QC( ASSEMBLY_ASSESS.out.report .join(READ_QC.out.bases, failOnDuplicate: true), @@ -74,7 +75,7 @@ workflow PIPELINE { // From Channel READ_QC_PASSED_READS_ch map reads to reference // Output into Channel MAPPING.out.sam - MAPPING(CREATE_REF_GENOME_BWA_DB.out.path, CREATE_REF_GENOME_BWA_DB.out.prefix, READ_QC_PASSED_READS_ch) + MAPPING(GET_REF_GENOME_BWA_DB.out.path, GET_REF_GENOME_BWA_DB.out.prefix, READ_QC_PASSED_READS_ch) // From Channel MAPPING.out.sam, Convert SAM into sorted BAM and calculate reference coverage // Output into Channels SAM_TO_SORTED_BAM.out.bam and SAM_TO_SORTED_BAM.out.ref_coverage @@ -82,10 +83,11 @@ workflow PIPELINE { // From Channel SAM_TO_SORTED_BAM.out.bam calculates non-cluster Het-SNP site count // Output into Channel HET_SNP_COUNT.out.result - SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.bam, params.lite) | HET_SNP_COUNT + SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.bam, params.lite) + HET_SNP_COUNT(SNP_CALL.out.vcf) // Merge Channels SAM_TO_SORTED_BAM.out.ref_coverage & HET_SNP_COUNT.out.result to provide Mapping QC Status - // Output into Channels MAPPING_QC.out.detailed_result & MAPPING_QC.out.result + // Output into Channels MAPPING_QC.out.result & MAPPING_QC.out.report MAPPING_QC( SAM_TO_SORTED_BAM.out.ref_coverage .join(HET_SNP_COUNT.out.result, failOnDuplicate: true, failOnMismatch: true), @@ -94,15 +96,15 @@ workflow PIPELINE { ) // From Channel READ_QC_PASSED_READS_ch assess Streptococcus pneumoniae percentage in reads - // Output into Channels TAXONOMY.out.detailed_result & TAXONOMY.out.result report + // Output into Channel TAXONOMY.out.report TAXONOMY(GET_KRAKEN2_DB.out.path, params.kraken2_memory_mapping, READ_QC_PASSED_READS_ch) // From Channel TAXONOMY.out.report, provide taxonomy QC status - // Output into Channels TAXONOMY_QC.out.detailed_result & TAXONOMY_QC.out.result report + // Output into Channels 
TAXONOMY_QC.out.result & TAXONOMY_QC.out.report TAXONOMY_QC(TAXONOMY.out.report, params.spneumo_percentage) - // Merge Channels ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status - // Output into Channel OVERALL_QC.out.result + // Merge Channels AREAD_QC.out.result & SSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status + // Output into Channel OVERALL_QC.out.result & OVERALL_QC.out.report OVERALL_QC( READ_QC.out.result .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) @@ -121,31 +123,31 @@ workflow PIPELINE { .map { it[0, 2..-1] } // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, generate PopPUNK query file containing assemblies of samples passed overall QC - // Output into POPPUNK_QFILE POPPUNK_QFILE = OVERALL_QC_PASSED_ASSEMBLIES_ch .map { it.join'\t' } .collectFile(name: 'qfile.txt', newLine: true) // From generated POPPUNK_QFILE, assign GPSC to samples passed overall QC + // Output into Channel LINEAGE.out.reports (multiple reports from a single process) LINEAGE(GET_POPPUNK_DB.out.path, GET_POPPUNK_DB.out.database, GET_POPPUNK_EXT_CLUSTERS.out.file, POPPUNK_QFILE) // From Channel OVERALL_QC_PASSED_READS_ch, serotype the preprocess reads of samples passed overall QC - // Output into Channel SEROTYPE.out.result - SEROTYPE(CREATE_SEROBA_DB.out.path, CREATE_SEROBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) + // Output into Channel SEROTYPE.out.report + SEROTYPE(GET_SEROBA_DB.out.path, GET_SEROBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, PubMLST typing the assemblies of samples passed overall QC - // Output into Channel MLST.out.result + // Output into Channel MLST.out.report MLST(OVERALL_QC_PASSED_ASSEMBLIES_ch) // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, assign PBP genes and estimate MIC (minimum inhibitory concentration) for 6 Beta-lactam antibiotics - // Output into Channel GET_PBP_RESISTANCE.out.result + // Output into Channel PARSE_PBP_RESISTANCE.out.report PBP_RESISTANCE(OVERALL_QC_PASSED_ASSEMBLIES_ch) - GET_PBP_RESISTANCE(PBP_RESISTANCE.out.json) + PARSE_PBP_RESISTANCE(PBP_RESISTANCE.out.json) - // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials - // Output into Channel GET_OTHER_RESISTANCE.out.result - OTHER_RESISTANCE(CREATE_ARIBA_DB.out.path, CREATE_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) - GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) + // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance and determinants of other antimicrobials + // Output into Channel PARSE_OTHER_RESISTANCE.out.result + OTHER_RESISTANCE(GET_ARIBA_DB.out.path, GET_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) + PARSE_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) // Generate sample reports by merging outputs from all result-generating modules GENERATE_SAMPLE_REPORT( @@ -156,8 +158,8 @@ workflow PIPELINE { .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(MLST.out.report, failOnDuplicate: true, remainder: true) - .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) - .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) + .join(PARSE_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) + .join(PARSE_OTHER_RESISTANCE.out.report, failOnDuplicate: 
true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } // Map Sample_ID to index 0 and all reports (with null entries removed) as a list to index 1 ) @@ -166,10 +168,10 @@ workflow PIPELINE { GENERATE_OVERALL_REPORT(GENERATE_SAMPLE_REPORT.out.report.collect(), params.ariba_metadata) // Pass databases information to SAVE_INFO sub-workflow - DATABASES_INFO = CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } - .merge(CREATE_ARIBA_DB.out.path.map { [["ariba_db_path", it]] }) + DATABASES_INFO = GET_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } + .merge(GET_ARIBA_DB.out.path.map { [["ariba_db_path", it]] }) .merge(GET_KRAKEN2_DB.out.path.map { [["kraken2_db_path", it]] }) - .merge(CREATE_SEROBA_DB.out.path.map { [["seroba_db_path", it]] }) + .merge(GET_SEROBA_DB.out.path.map { [["seroba_db_path", it]] }) .merge(GET_POPPUNK_DB.out.path.map { [["poppunk_db_path", it]] }) .merge(GET_POPPUNK_EXT_CLUSTERS.out.file.map { [["poppunk_ext_file", it]] }) // Save key-value tuples into a map From 9157a20cf7e1264e313d658b2f0725701bd3a28a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:10:13 +0000 Subject: [PATCH 071/157] Extract scripts to separated files Former-commit-id: 76afb5731cc9171b4528393a9155998a7dde0e3d --- bin/call_snp.sh | 8 ++++++++ bin/convert_sam_to_sorted_bam.sh | 11 +++++++++++ modules/mapping.nf | 31 ++++++++++++++----------------- workflows/pipeline.nf | 6 +++--- 4 files changed, 36 insertions(+), 20 deletions(-) create mode 100755 bin/call_snp.sh create mode 100755 bin/convert_sam_to_sorted_bam.sh diff --git a/bin/call_snp.sh b/bin/call_snp.sh new file mode 100755 index 0000000..d3fba63 --- /dev/null +++ b/bin/call_snp.sh @@ -0,0 +1,8 @@ +# Call SNPs and save to .vcf +# Remove source sorted BAM file if $LITE is true + +bcftools mpileup --threads "$(nproc)" -f "$REFERENCE" "$SORTED_BAM" | bcftools call --threads "$(nproc)" -mv -O v -o "$VCF" + +if [ "$LITE" = true ]; then + rm "$(readlink -f "$SORTED_BAM")" +fi diff --git a/bin/convert_sam_to_sorted_bam.sh b/bin/convert_sam_to_sorted_bam.sh new file mode 100755 index 0000000..c730a5f --- /dev/null +++ b/bin/convert_sam_to_sorted_bam.sh @@ -0,0 +1,11 @@ +# Convet SAM to sorted BAM file +# Remove source SAM file if $LITE is true + +samtools view -@ "$(nproc)" -b "$SAM" > "$BAM" + +samtools sort -@ "$(nproc)" -o "$SORTED_BAM" "$BAM" +rm "$BAM" + +if [ "$LITE" = true ]; then + rm "$(readlink -f "$SAM")" +fi diff --git a/modules/mapping.nf b/modules/mapping.nf index e5d7c4e..964a415 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -43,7 +43,7 @@ process MAPPING { script: sam="${sample_id}_mapped.sam" """ - bwa mem -t `nproc` "${bwa_ref_db_dir}/${prefix}" <(zcat -f -- < "$read1") <(zcat -f -- < "$read2") > "$sam" + bwa mem -t "`nproc`" "${bwa_ref_db_dir}/${prefix}" <(zcat -f -- < "$read1") <(zcat -f -- < "$read2") > "$sam" """ } @@ -60,22 +60,18 @@ process SAM_TO_SORTED_BAM { val lite output: - tuple val(sample_id), path(bam), emit: bam + tuple val(sample_id), path(sorted_bam), emit: sorted_bam tuple val(sample_id), env(COVERAGE), emit: ref_coverage script: - bam="${sample_id}_mapped_sorted.bam" + sorted_bam="${sample_id}_mapped_sorted.bam" """ - samtools view -@ `nproc` -b "$sam" > mapped.bam + SAM="$sam" + BAM="mapped.bam" + 
SORTED_BAM="$sorted_bam" + LITE="$lite" - samtools sort -@ `nproc` -o "$bam" mapped.bam - rm mapped.bam - - if [ $lite = true ]; then - rm `readlink -f "$sam"` - fi - - BAM="$bam" + source convert_sam_to_sorted_bam.sh source get_ref_coverage.sh """ } @@ -89,7 +85,7 @@ process SNP_CALL { input: path reference - tuple val(sample_id), path(bam) + tuple val(sample_id), path(sorted_bam) val lite output: @@ -98,11 +94,12 @@ process SNP_CALL { script: vcf="${sample_id}.vcf" """ - bcftools mpileup --threads `nproc` -f "$reference" "$bam" | bcftools call --threads `nproc` -mv -O v -o "$vcf" + REFERENCE="$reference" + SORTED_BAM="$sorted_bam" + VCF="$vcf" + LITE="$lite" - if [ $lite = true ]; then - rm `readlink -f "$bam"` - fi + source call_snp.sh """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 01f3172..51a93f9 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -78,12 +78,12 @@ workflow PIPELINE { MAPPING(GET_REF_GENOME_BWA_DB.out.path, GET_REF_GENOME_BWA_DB.out.prefix, READ_QC_PASSED_READS_ch) // From Channel MAPPING.out.sam, Convert SAM into sorted BAM and calculate reference coverage - // Output into Channels SAM_TO_SORTED_BAM.out.bam and SAM_TO_SORTED_BAM.out.ref_coverage + // Output into Channels SAM_TO_SORTED_BAM.out.sorted_bam and SAM_TO_SORTED_BAM.out.ref_coverage SAM_TO_SORTED_BAM(MAPPING.out.sam, params.lite) - // From Channel SAM_TO_SORTED_BAM.out.bam calculates non-cluster Het-SNP site count + // From Channel SAM_TO_SORTED_BAM.out.sorted_bam calculates non-cluster Het-SNP site count // Output into Channel HET_SNP_COUNT.out.result - SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.bam, params.lite) + SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.sorted_bam, params.lite) HET_SNP_COUNT(SNP_CALL.out.vcf) // Merge Channels SAM_TO_SORTED_BAM.out.ref_coverage & HET_SNP_COUNT.out.result to provide Mapping QC Status From 7b024a18dce9953431865a17ebef2a391828efd6 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:11:11 +0000 Subject: [PATCH 072/157] Improve shell scripts style Former-commit-id: 2e625f5e2ee984c2dbd4966f9eedb538d3f27b96 --- bin/get_ref_coverage.sh | 4 ++-- modules/assembly.nf | 4 ++-- modules/preprocess.nf | 2 +- modules/taxonomy.nf | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/get_ref_coverage.sh b/bin/get_ref_coverage.sh index 69a131b..199c481 100755 --- a/bin/get_ref_coverage.sh +++ b/bin/get_ref_coverage.sh @@ -1,4 +1,4 @@ # Return reference coverage percentage by the reads -samtools index -@ `nproc` "$BAM" -COVERAGE=$(samtools coverage "$BAM" | awk -F'\t' 'FNR==2 {print $6}') +samtools index -@ "$(nproc)" "$SORTED_BAM" +COVERAGE=$(samtools coverage "$SORTED_BAM" | awk -F'\t' 'FNR==2 {print $6}') diff --git a/modules/assembly.nf b/modules/assembly.nf index ab66b6e..6bf7b79 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -20,7 +20,7 @@ process ASSEMBLY_UNICYCLER { script: fasta="${sample_id}.contigs.fasta" """ - unicycler -1 "$read1" -2 "$read2" -s "$unpaired" -o results -t `nproc` --min_fasta_length "$min_contig_length" + unicycler -1 "$read1" -2 "$read2" -s "$unpaired" -o results -t "`nproc`" --min_fasta_length "$min_contig_length" mv results/assembly.fasta "${fasta}" """ } @@ -47,7 +47,7 @@ process ASSEMBLY_SHOVILL { script: fasta="${sample_id}.contigs.fasta" """ - shovill --R1 "$read1" --R2 "$read2" --outdir results --cpus `nproc` --minlen "$min_contig_length" --force + shovill --R1 "$read1" --R2 "$read2" --outdir results --cpus 
"`nproc`" --minlen "$min_contig_length" --force mv results/contigs.fa "${fasta}" """ } diff --git a/modules/preprocess.nf b/modules/preprocess.nf index e87ef99..1e89da7 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -19,7 +19,7 @@ process PREPROCESS { processed_two="processed-${sample_id}_2.fastq.gz" processed_unpaired="processed-${sample_id}_unpaired.fastq.gz" """ - fastp --thread `nproc` --in1 "$read_one" --in2 "$read_two" --out1 "$processed_one" --out2 "$processed_two" --unpaired1 "$processed_unpaired" --unpaired2 "$processed_unpaired" + fastp --thread "`nproc`" --in1 "$read_one" --in2 "$read_two" --out1 "$processed_one" --out2 "$processed_two" --unpaired1 "$processed_unpaired" --unpaired2 "$processed_unpaired" """ } diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index 34ebeab..735b59d 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -41,11 +41,11 @@ process TAXONOMY { if (kraken2_memory_mapping === true) """ - kraken2 --threads `nproc` --use-names --memory-mapping --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - + kraken2 --threads "`nproc`" --use-names --memory-mapping --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - """ else if (kraken2_memory_mapping === false) """ - kraken2 --threads `nproc` --use-names --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - + kraken2 --threads "`nproc`" --use-names --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - """ else error "The value for --kraken2_memory_mapping is not valid." From 2bc27c7e2a81bd9a501603ad0296e30029ff4af5 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:02:31 +0000 Subject: [PATCH 073/157] Improve option names consistency Former-commit-id: 0c70390e8657999edfb717a9a94df8fbfb8806dc --- modules/validate.nf | 6 +++--- nextflow.config | 6 +++--- workflows/info_and_version.nf | 4 ++-- workflows/init.nf | 8 ++++---- workflows/pipeline.nf | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/modules/validate.nf b/modules/validate.nf index 4cf3438..f2bceb0 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -8,8 +8,8 @@ validParams = [ assembler: 'assembler', min_contig_length: 'int', assembly_publish: 'publish_mode', - seroba_remote: 'url_git', - seroba_local: 'path', + seroba_db_remote: 'url_git', + seroba_db_local: 'path', seroba_kmer: 'int', kraken2_db_remote: 'url_targz', kraken2_db_local: 'path', @@ -18,7 +18,7 @@ validParams = [ ref_genome_bwa_db_local: 'path', poppunk_db_remote: 'url_targz', poppunk_ext_remote: 'url_csv', - poppunk_local: 'path', + poppunk_db_local: 'path', spneumo_percentage: 'int_float', ref_coverage: 'int_float', het_snp_site: 'int', diff --git a/nextflow.config b/nextflow.config index 1a322f1..6048efb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -20,8 +20,8 @@ params { assembly_publish = "link" // Default git repository and local directory, and KMC kmer size for SeroBA - seroba_remote = "https://github.com/sanger-pathogens/seroba.git" - seroba_local = "$projectDir/databases/seroba" + seroba_db_remote = "https://github.com/sanger-pathogens/seroba.git" + seroba_db_local = "$projectDir/databases/seroba" seroba_kmer = 71 // Default link and local directory for Kraken2 Database, and usage of memory mapping @@ -36,7 +36,7 @@ params { // Default links for PopPUNK Database and External Clusters, and local directory for both poppunk_db_remote = 
"https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz" poppunk_ext_remote = "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv" - poppunk_local = "$projectDir/databases/poppunk" + poppunk_db_local = "$projectDir/databases/poppunk" // Default values for QC spneumo_percentage = 60.00 diff --git a/workflows/info_and_version.nf b/workflows/info_and_version.nf index bb5ce37..186dadf 100644 --- a/workflows/info_and_version.nf +++ b/workflows/info_and_version.nf @@ -10,8 +10,8 @@ workflow PRINT_VERSION { params.ref_genome_bwa_db_local, params.ariba_db_local, params.kraken2_db_local, - params.seroba_local, - params.poppunk_local, + params.seroba_db_local, + params.poppunk_db_local, pipeline_version ) \ | PARSE \ diff --git a/workflows/init.nf b/workflows/init.nf index 20eff25..7d1ed77 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -18,12 +18,12 @@ workflow INIT { GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Check SeroBA Databases, clone and rebuild if necessary - CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Check to PopPUNK Database and External Clusters, download if necessary - GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) - GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) + GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_db_local) + GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_db_local) // Pull all Docker images mentioned in nextflow.config if using Docker if (workflow.containerEngine === 'docker') { diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 51a93f9..e39599f 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -20,12 +20,12 @@ workflow PIPELINE { GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Get path to SeroBA Databases, clone and rebuild if necessary - CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Get paths to PopPUNK Database and External Clusters, download if necessary - GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) - GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) + GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_db_local) + GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_db_local) // Get path to ARIBA database, generate from reference sequences and metadata if ncessary GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) From 109da30408076c265cc52e43122557b5e93af7fb Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:02:46 +0000 Subject: [PATCH 074/157] Improve help message Former-commit-id: d76351c9528a7ad8e2b0a551e63a0ca361a0b80a --- modules/messages.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/messages.nf b/modules/messages.nf index 7a9d189..78e9cd3 
100644 --- a/modules/messages.nf +++ b/modules/messages.nf @@ -30,7 +30,7 @@ void helpMessage() { |--reads [PATH] Path to the input directory that contains the reads to be processed |--output [PATH] Path to the output directory that save the results |--init Alternative workflow for initialisation - |--version Alternative workflow for getting versions of pipeline, tools and databases + |--version Alternative workflow for getting versions of pipeline, container images, tools and databases | |For all available options, please refer to README.md '''.stripMargin() From 2a1135b5dd2294b019739fe1af73fed8e2a5b9f2 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:03:39 +0000 Subject: [PATCH 075/157] Improve content and update to reflect changes Former-commit-id: 7935001f2b822e7d845d43cdf4ff091b0c46ead0 --- README.md | 127 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 65 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index aa68b3e..cb10fa9 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # GPS Unified Pipeline -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.04.1-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.04.2-23aa62.svg)](https://www.nextflow.io/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/singularity/) The GPS Unified Pipeline is a Nextflow pipeline designed for processing raw reads (FASTQ files) of *Streptococcus pneumoniae* samples. After preprocessing, the pipeline performs initial assessment based on the total bases in reads. Passed samples will be further assess based on assembly, mapping, and taxonomy. If the sample passes all quality controls (QC), the pipeline also provides the sample's serotype, multi-locus sequence typing (MLST), lineage (based on the [Global Pneumococcal Sequence Cluster (GPSC)](https://www.pneumogen.net/gps/GPSC_lineages.html)), and antimicrobial resistance (AMR) against multiple antimicrobials. -The pipeline is designed to be easy to set up and use, and is suitable for use on local machines. It is also offline-capable, making it an ideal option for cases where the FASTQ files being analysed should not leave the local machine. Additionally, the pipeline only downloads essential files to enable the analysis, and no data is uploaded from the local machine. After initialisation or the first successful complete run, the pipeline can be used offline unless you have changed the selection of any database or container image. +The pipeline is designed to be easy to set up and use, and is suitable for use on local machines and high-performance computing (HPC) clusters alike. Additionally, the pipeline only downloads essential files to enable the analysis, and no data is uploaded from the local environment, making it an ideal option for cases where the FASTQ files being analysed is confidential. After initialisation or the first successful complete run, the pipeline can be used offline unless you have changed the selection of any database or container image. The development of this pipeline is part of the GPS Project ([Global Pneumococcal Sequencing Project](https://www.pneumogen.net/gps/)). 
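The offline capability described in the paragraph above can be exercised with the commands documented later in this README; a hypothetical session with Singularity as the container engine might look like this:

```bash
# One-off initialisation while online: downloads all databases and container images
./run_pipeline --init -profile singularity

# Subsequent runs can then be executed without an Internet connection
./run_pipeline --reads /path/to/raw-reads-directory -profile singularity
```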
@@ -57,7 +57,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca
 > - The pipeline generates ~1.8GB intermediate files for each sample on average<br>(These files can be removed when the pipeline run is completed, please refer to [Clean Up](#clean-up))<br>
(To further reduce storage requirement by sacrificing the ability to resume the pipeline, please refer to [Experimental](#experimental)) ## Accepted Inputs -- Currently, only Illumina paired-end short reads are supported +- Only Illumina paired-end short reads are supported - Each sample is expected to be a pair of raw reads following this file name pattern: - `*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}` - example 1: `SampleName_R1_001.fastq.gz`, `SampleName_R2_001.fastq.gz` @@ -70,18 +70,18 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ``` or - Download and unzip the [repository](https://github.com/HarryHung/gps-unified-pipeline/archive/refs/heads/master.zip) -2. Go into the local copy of the repository + Download and unzip the [latest release](https://github.com/HarryHung/gps-unified-pipeline/releases) +2. Go into the local copy of the repository; the pipeline is ready to use without installation ``` cd gps-unified-pipeline ``` 3. (Optional) You could perform an initialisation to download all required additional files and container images, so the pipeline can be used at any time with or without the Internet afterwards. > ⚠️ Docker or Singularity must be running, and an Internet connection is required. - - For those using Docker as the container engine + - Using Docker as the container engine ``` ./run_pipeline --init ``` - - For those using Singularity as the container engine + - Using Singularity as the container engine ``` ./run_pipeline --init -profile singularity ``` @@ -91,8 +91,8 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca > ⚠️ If this is the first run and initialisation was not performed, an Internet connection is required. -> ℹ️ By default, Docker is used as the container engine and all the processes are executed by the local machine. See [Profile](#profile) for details on running the pipeline with Singularity or on a server farm. -- You can run the pipeline without options. It will attempt to get the raw reads from the default location (`input` directory inside the `gps-unified-pipeline` local repository) +> ℹ️ By default, Docker is used as the container engine and all the processes are executed by the local machine. See [Profile](#profile) for details on running the pipeline with Singularity or on an HPC cluster. +- You can run the pipeline without options. It will attempt to get the raw reads from the default location (i.e. `input` directory inside the `gps-unified-pipeline` local repository) ``` ./run_pipeline ``` @@ -113,29 +113,30 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ``` ./run_pipeline -profile [profile name] ``` -- Currently, the following profiles are available +- Available profiles: | Profile Name | Details | | --- | --- | | `standard`
(Default) | Docker is used as the container engine.
Processes are executed locally. | | `singularity` | Singularity is used as the container engine.
Processes are executed locally. | - | `lsf` | **The pipeline should be launched from a LSF cluster head node with this profile.**
Singularity is used as the container engine.
Processes are submitted to your LSF cluster via `bsub`.
(Tested on Sanger farm5) | + | `lsf` | **The pipeline should be launched from an LSF cluster head node with this profile.**
Singularity is used as the container engine.
Processes are submitted to your LSF cluster via `bsub` by the pipeline.
(Tested on Sanger farm5 cluster only) | ## Resume - If the pipeline is interrupted mid-run, Nextflow's built-in `-resume` option can be used to resume the pipeline execution instead of starting from scratch again - You should use the same command as the original run, only add `-resume` at the end (i.e. all pipeline options should be identical) > ℹ️ `-resume` is a built-in Nextflow option; it only has one leading `-` - ``` - # original command - ./run_pipeline --reads /path/to/raw-reads-directory - - # command to resume the pipeline execution - ./run_pipeline --reads /path/to/raw-reads-directory -resume - ``` + - If the original command is + ``` + ./run_pipeline --reads /path/to/raw-reads-directory + ``` + - The command to resume the pipeline execution should be + ``` + ./run_pipeline --reads /path/to/raw-reads-directory -resume + ``` ## Clean Up - During the run of the pipeline, Nextflow generates a considerable amount of intermediate files -- If the run has been completed and you do not intend to use the `-resume` option, you can remove the intermediate files using one of the following ways: - - Run `clean_pipeline` script +- If the run has been completed and you do not intend to use the `-resume` option or those intermediate files, you can remove the intermediate files using one of the following ways: + - Run the included `clean_pipeline` script - It runs the commands in manual removal for you - It removes the `work` directory and log files within the `gps-unified-pipeline` local repository ``` @@ -167,13 +168,13 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca > ℹ️ `$projectDir` is one of the [Nextflow built-in implicit variables](https://www.nextflow.io/docs/latest/script.html?highlight=projectdir#implicit-variables); it is defined as the directory where the `gps-unified-pipeline` local repository is stored. -> ℹ️ They are not built-in Nextflow options, hence lead with `--` instead of `-` +> ℹ️ Pipeline options are not built-in Nextflow options; they lead with `--` instead of `-` ## Alternative Workflows | Option | Values | Description | | --- | ---| --- | - | `--init` | `true` or `false`
(Default: `false`) | Use alternative workflow for initialisation, which means downloading all required additional files and container images.
Can be enabled by including `--init` without value. | - | `--version` | `true` or `false`
(Default: `false`)| Use alternative workflow for getting versions of pipeline, tools and databases.
Can be enabled by including `--version` without value.
(This workflow pulls the required container images if they are not yet available locally) | + | `--init` | `true` or `false`
(Default: `false`) | Use alternative workflow for initialisation, which means downloading all required additional files and container images, and creating databases.
Can be enabled by including `--init` without value. | + | `--version` | `true` or `false`
(Default: `false`) | Use alternative workflow for showing versions of pipeline, container images, tools and databases.
Can be enabled by including `--version` without value.
(This workflow pulls the required container images if they are not yet available locally) | | `--help` | `true` or `false`
(Default: `false`) | Show help message.
Can be enabled by including `--help` without value. | ## Input and Output @@ -199,7 +200,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ## Assembly | Option | Values | Description | | --- | ---| --- | - | `--assembler` | `"shovill"` or `"unicycler"`
(Default: `"shovill"`)| SPAdes Assembler to assemble the reads. | + | `--assembler` | `"shovill"` or `"unicycler"`
(Default: `"shovill"`)| Using which SPAdes-based assembler to assemble the reads. | | `--min_contig_length` | Any integer value
(Default: `500`) | Minimum length of contig to be included in the assembly | ## Mapping @@ -220,22 +221,22 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `--kraken2_memory_mapping` | `true` or `false`
(Default: `true`) | Whether to use the memory mapping option of Kraken2.
`true` means not loading the database into RAM, suitable for memory-limited or fast storage environments. | ## Serotype - > ⚠️ `--seroba_local` does not accept user provided local database, directory content will be overwritten + > ⚠️ `--seroba_db_local` does not accept a user-provided local database; directory content will be overwritten | Option | Values | Description | | --- | ---| --- | - | `--seroba_remote` | Any valid URL to a Git remote repository
(Default: [SeroBA GitHub Repo](https://github.com/sanger-pathogens/seroba.git))| URL to a SeroBA Git remote repository. | - | `--seroba_local` | Any valid path
(Default: `"$projectDir/databases/seroba"`) | Path to the directory where SeroBA local repository should be saved to. | + | `--seroba_db_remote` | Any valid URL to a Git remote repository
(Default: [SeroBA GitHub Repo](https://github.com/sanger-pathogens/seroba.git)) | URL to a SeroBA Git remote repository. | + | `--seroba_db_local` | Any valid path
(Default: `"$projectDir/databases/seroba"`) | Path to the directory where SeroBA local repository should be saved to. | | `--seroba_kmer` | Any integer value
(Default: `71`) | Kmer size for creating the KMC database of SeroBA. | ## Lineage - > ⚠️ `--poppunk_local` does not accept user provided local database, directory content will be overwritten + > ⚠️ `--poppunk_db_local` does not accept a user-provided local database; directory content will be overwritten | Option | Values | Description | | --- | ---| --- | | `--poppunk_db_remote` | Any valid URL to a PopPUNK database in `.tar.gz` or `.tgz` format
(Default: [GPS v6](https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz)) | URL to a PopPUNK database. | | `--poppunk_ext_remote` | Any valid URL to a PopPUNK external clusters file in `.csv` format
(Default: [GPS v6 GPSC Designation](https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv)) | URL to a PopPUNK external clusters file. | - | `--poppunk_local` | Any valid path
(Default: `"$projectDir/databases/poppunk"`) | Path to the directory where the remote PopPUNK database and external clusters file should be saved to. | + | `--poppunk_db_local` | Any valid path
(Default: `"$projectDir/databases/poppunk"`) | Path to the directory where the remote PopPUNK database and external clusters file should be saved to. | ## Other AMR > ⚠️ `--ariba_db_local` does not accept user provided local database, directory content will be overwritten @@ -276,9 +277,11 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca - The following fields can be found in the output `results.csv` > ℹ️ For resistance phenotypes: S = Sensitive/Susceptible; I = Intermediate; R = Resistant + > ℹ️ * The exact output fields of Other AMR depends on the provided ARIBA database, the below table is based on the default ARIBA database + > ⚠️ If the result of `Overall_QC` of a sample is `ASSEMBLER FAILURE`, the assembler has crashed when trying to assembly the reads. You might want to re-run the sample with [another assembler](#assembly), or discard the sample if it is a low quality one. - > ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample. Please report the issue. + > ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample. | Field | Type | Description | | --- | --- | --- | @@ -323,38 +326,38 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `PEN_MIC` | PBP AMR | Estimated MIC of penicillin (PEN) | | `PEN_Res(Meningital)` | PBP AMR | Resistance phenotype against PEN in meningital form | | `PEN_Res(Non-meningital)` | PBP AMR | Resistance phenotype against PEN in non-meningital form | - | `CHL_Res` | Other AMR | Resistance phenotype against Chloramphenicol (CHL) | - | `CHL_Determinant` | Other AMR | Known determinants that inferred the CHL resistance | - | `ERY_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) | - | `ERY_Determinant` | Other AMR | Known determinants that inferred the ERY resistance | - | `CLI_Res` | Other AMR | Resistance phenotype against Clindamycin (CLI) | - | `CLI_Determinant` | Other AMR | Known determinants that inferred the CLI resistance | - | `ERY_CLI_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | - | `ERY_CLI_Determinant` | Other AMR | Known determinants that inferred the ERY and CLI resistance | - | `FQ_Res` | Other AMR | Resistance phenotype against Fluoroquinolones (FQ) | - | `FQ_Determinant` | Other AMR | Known determinants that inferred the FQ resistance | - | `LFX_Res` | Other AMR | Resistance phenotype against Levofloxacin (LFX) | - | `LFX_Determinant` | Other AMR | Known determinants that inferred the LFX resistance | - | `KAN_Res` | Other AMR | Resistance phenotype against Kanamycin (KAN) | - | `KAN_Determinant` | Other AMR | Known determinants that inferred the KAN resistance | - | `TET_Res` | Other AMR | Resistance phenotype against Tetracycline (TET) | - | `TET_Determinant` | Other AMR | Known determinants that inferred the TET resistance | - | `DOX_Res` | Other AMR | Resistance phenotype against Doxycycline (DOX) | - | `DOX_Determinant` | Other AMR | Known determinants that inferred the DOX resistance | - | `TMP_Res` | Other AMR | Resistance phenotype against Trimethoprim (TMP) | - | `TMP_Determinant` | Other AMR | Known determinants that inferred the TMP resistance | - | `SMX_Res` | Other AMR | Resistance phenotype against Sulfamethoxazole (SMX) | - | `SMX_Determinant` | Other AMR | Known determinants that inferred the SMX resistance | - | `COT_Res` | Other AMR | Resistance phenotype against 
Co-Trimoxazole (COT) | - | `COT_Determinant` | Other AMR | Known determinants that inferred the COT resistance | - | `RIF_Res` | Other AMR | Resistance phenotype against Rifampin (RIF) | - | `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance | - | `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) | - | `VAN_Determinant` | Other AMR | Known determinants that inferred the VAN resistance | - | `PILI1` | Other AMR | Expression of PILI-1 | - | `PILI1_Determinant` | Other AMR | Known determinants that inferred the PILI-1 expression | - | `PILI2` | Other AMR | Expression of PILI-2 | - | `PILI2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression | + | `CHL_Res` | Other AMR* | Resistance phenotype against Chloramphenicol (CHL) | + | `CHL_Determinant` | Other AMR* | Known determinants that inferred the CHL resistance | + | `CLI_Res` | Other AMR* | Resistance phenotype against Clindamycin (CLI) | + | `CLI_Determinant` | Other AMR* | Known determinants that inferred the CLI resistance | + | `COT_Res` | Other AMR* | Resistance phenotype against Co-Trimoxazole (COT) | + | `COT_Determinant` | Other AMR* | Known determinants that inferred the COT resistance | + | `DOX_Res` | Other AMR* | Resistance phenotype against Doxycycline (DOX) | + | `DOX_Determinant` | Other AMR* | Known determinants that inferred the DOX resistance | + | `ERY_Res` | Other AMR* | Resistance phenotype against Erythromycin (ERY) | + | `ERY_Determinant` | Other AMR* | Known determinants that inferred the ERY resistance | + | `ERY_CLI_Res` | Other AMR* | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | + | `ERY_CLI_Determinant` | Other AMR* | Known determinants that inferred the ERY and CLI resistance | + | `FQ_Res` | Other AMR* | Resistance phenotype against Fluoroquinolones (FQ) | + | `FQ_Determinant` | Other AMR* | Known determinants that inferred the FQ resistance | + | `KAN_Res` | Other AMR* | Resistance phenotype against Kanamycin (KAN) | + | `KAN_Determinant` | Other AMR* | Known determinants that inferred the KAN resistance | + | `LFX_Res` | Other AMR* | Resistance phenotype against Levofloxacin (LFX) | + | `LFX_Determinant` | Other AMR* | Known determinants that inferred the LFX resistance | + | `RIF_Res` | Other AMR* | Resistance phenotype against Rifampin (RIF) | + | `RIF_Determinant` | Other AMR* | Known determinants that inferred the RIF resistance | + | `SMX_Res` | Other AMR* | Resistance phenotype against Sulfamethoxazole (SMX) | + | `SMX_Determinant` | Other AMR* | Known determinants that inferred the SMX resistance | + | `TET_Res` | Other AMR* | Resistance phenotype against Tetracycline (TET) | + | `TET_Determinant` | Other AMR* | Known determinants that inferred the TET resistance | + | `TMP_Res` | Other AMR* | Resistance phenotype against Trimethoprim (TMP) | + | `TMP_Determinant` | Other AMR* | Known determinants that inferred the TMP resistance | + | `VAN_Res` | Other AMR* | Resistance phenotype against Vancomycin (VAN) | + | `VAN_Determinant` | Other AMR* | Known determinants that inferred the VAN resistance | + | `PILI1` | Other AMR* | Expression of PILI-1 | + | `PILI1_Determinant` | Other AMR* | Known determinants that inferred the PILI-1 expression | + | `PILI2` | Other AMR* | Expression of PILI-2 | + | `PILI2_Determinant` | Other AMR* | Known determinants that inferred the PILI-2 expression |   # Credits From 4fd29070fdbc412cb6762164c62ce60a0424b39f Mon Sep 17 00:00:00 2001 From: Harry Hung 
<4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 09:54:56 +0000 Subject: [PATCH 076/157] Update version of ARIBA container Former-commit-id: 457b99629e7fc812230497d9f04cbe93b3cbea12 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 6048efb..268f05a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -96,7 +96,7 @@ process { container = 'harryhungch/spn-pbp-amr:23.01.16' } withLabel: ariba_container { - container = 'staphb/ariba:2.14.4' + container = 'staphb/ariba:2.14.6' } withLabel: mlst_container { container = 'staphb/mlst:2.23.0' From cdb0251f4619c83fcad940463d7734cfb89ba33c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 09:55:21 +0000 Subject: [PATCH 077/157] Update credits section Former-commit-id: ebc7ce5dd388af8c8a3f821d5f02842d331d799a --- README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index cb10fa9..bda84c2 100644 --- a/README.md +++ b/README.md @@ -366,39 +366,39 @@ This project uses open-source components. You can find the homepage or source co [ARIBA](https://sanger-pathogens.github.io/ariba/) - ARIBA: rapid antimicrobial resistance genotyping directly from sequencing reads Hunt M, Mather AE, Sánchez-Busó L, Page AJ, Parkhill J, Keane JA, Harris SR. Microbial Genomics 2017. doi: [10.1099/mgen.0.000131](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000131) - License (GPL-3.0): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE -- This tool is used in `CREATE_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module +- This tool is used in `GET_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module [BCFtools](https://samtools.github.io/bcftools/) and [SAMtools](https://www.htslib.org/) - Twelve years of SAMtools and BCFtools. Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. **GigaScience**, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008 - Licenses - BCFtools (MIT/Expat or GPL-3.0): https://github.com/samtools/bcftools/blob/develop/LICENSE - SAMtools (MIT/Expat): https://github.com/samtools/samtools/blob/develop/LICENSE -- These tools are used in `SAM_TO_SORTED_BAM`, `REF_COVERAGE` and `SNP_CALL` processes of the `mapping.nf` module +- These tools are used in `SAM_TO_SORTED_BAM` and `SNP_CALL` processes of the `mapping.nf` module [BWA](https://github.com/lh3/bwa) - Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 
[arXiv:1303.3997v2](http://arxiv.org/abs/1303.3997) [q-bio.GN] - License (GPL-3.0): https://github.com/lh3/bwa/blob/master/COPYING -- This tool is used in `GET_REF_GENOME_BWA_DB_PREFIX` and `MAPPING` processes of the `mapping.nf` module +- This tool is used in `GET_REF_GENOME_BWA_DB` and `MAPPING` processes of the `mapping.nf` module -[Docker Images](https://hub.docker.com/u/staphb) of [BCFtools](https://hub.docker.com/r/staphb/bcftools), [BWA](https://hub.docker.com/r/staphb/bwa), [fastp](https://hub.docker.com/r/staphb/fastp), [Kraken 2](https://hub.docker.com/r/staphb/kraken2), [mlst](https://hub.docker.com/r/staphb/mlst), [PopPUNK](https://hub.docker.com/r/staphb/poppunk), [QUAST](https://hub.docker.com/r/staphb/quast), [SAMtools](https://hub.docker.com/r/staphb/samtools), [Shovill](https://hub.docker.com/r/staphb/shovill), [Unicycler](https://hub.docker.com/r/staphb/unicycler) +[Docker Images](https://hub.docker.com/u/staphb) of [ARIBA](https://hub.docker.com/r/staphb/ariba), [BCFtools](https://hub.docker.com/r/staphb/bcftools), [BWA](https://hub.docker.com/r/staphb/bwa), [fastp](https://hub.docker.com/r/staphb/fastp), [Kraken 2](https://hub.docker.com/r/staphb/kraken2), [mlst](https://hub.docker.com/r/staphb/mlst), [PopPUNK](https://hub.docker.com/r/staphb/poppunk), [QUAST](https://hub.docker.com/r/staphb/quast), [SAMtools](https://hub.docker.com/r/staphb/samtools), [Shovill](https://hub.docker.com/r/staphb/shovill), [Unicycler](https://hub.docker.com/r/staphb/unicycler) - [State Public Health Bioinformatics Workgroup](https://staphb.org/) ([@StaPH-B](https://github.com/StaPH-B)) - License (GPL-3.0): https://github.com/StaPH-B/docker-builds/blob/master/LICENSE -- These Docker images provide containerised environments for processes of multiple modules +- These Docker images provide containerised environments with different bioinformatics tools for processes of multiple modules [Docker Image of Git](https://hub.docker.com/r/bitnami/git) - [Bitnami](https://bitnami.com/) ([@Bitnami](https://github.com/bitnami)) - License (Apache 2.0): https://github.com/bitnami/containers/blob/main/LICENSE.md -- This Docker image provides the containerised environment for `GET_SEROBA_DB` process of the `serotype.nf` module +- This Docker image provides the containerised environment with Git for `CHECK_SEROBA_DB` process of the `serotype.nf` module [Docker Image of network-multitool](https://hub.docker.com/r/wbitt/network-multitool) - [Wbitt - We Bring In Tomorrow's Technologies](https://wbitt.com/) ([@WBITT](https://github.com/wbitt)) - License (MIT): https://github.com/wbitt/Network-MultiTool/blob/master/LICENSE -- This Docker image provides the containerised environment for processes of multiple modules +- This Docker image provides the containerised environment with Bash tools for processes of multiple modules -[Docker Image of Python](https://hub.docker.com/_/python) -- The Docker Community ([@docker-library](https://github.com/docker-library)) -- License (MIT): https://github.com/docker-library/python/blob/master/LICENSE -- This Docker image provides the containerised environment for `HET_SNP_COUNT` process of the `mapping.nf` module and `GET_OTHER_RESISTANCE` process of the `amr.nf` module +[Docker Image of Pandas](https://hub.docker.com/r/amancevice/pandas) +- Alexander Mancevice ([@amancevice](https://github.com/amancevice)) +- License (MIT): https://github.com/amancevice/docker-pandas/blob/main/LICENSE +- This Docker image provides the containerised environment with Python and Pandas for 
`GENERATE_OVERALL_REPORT` process of the `output.nf` module, `HET_SNP_COUNT` process of the `mapping.nf` module and `PARSE_OTHER_RESISTANCE` process of the `amr.nf` module [fastp](https://github.com/OpenGene/fastp) - Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, https://doi.org/10.1093/bioinformatics/bty560 @@ -406,9 +406,9 @@ This project uses open-source components. You can find the homepage or source co - This tool is used in `PREPROCESS` process of the `preprocess.nf` module [GPSC_pipeline_nf](https://github.com/sanger-bentley-group/GPSC_pipeline_nf) -- Victoria Carr ([@blue-moon22](https://github.com/blue-moon22)) +- Victoria Dyster ([@blue-moon22](https://github.com/blue-moon22)) - License (GPL-3.0): https://github.com/sanger-bentley-group/GPSC_pipeline_nf/blob/master/LICENSE -- Code adapted into `LINEAGE` process of the `lineage.nf` module +- Code adapted into the `get_lineage.sh` script [Kraken 2](https://ccb.jhu.edu/software/kraken2/) - Wood, D.E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). https://doi.org/10.1186/s13059-019-1891-0 @@ -418,7 +418,7 @@ This project uses open-source components. You can find the homepage or source co [mecA-HetSites-calculator](https://github.com/kumarnaren/mecA-HetSites-calculator) - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) - License (GPL-3.0): https://github.com/kumarnaren/mecA-HetSites-calculator/blob/master/LICENSE -- Code was rewritten into the `het_snp_count.py` script used by `HET_SNP_COUNT` process of the `mapping.nf` module +- Code was rewritten into the `het_snp_count.py` script [mlst](https://github.com/tseemann/mlst) - Torsten Seemann ([@tseemann](https://github.com/tseemann)) @@ -446,14 +446,14 @@ This project uses open-source components. You can find the homepage or source co - License (GPL-3.0): https://github.com/sanger-pathogens/seroba/blob/master/LICENSE - This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/seroba) - The fork includes critical bug fixes for SeroBA as the original repository is no longer maintained - - The Docker image provides the containerised environment for `CREATE_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module + - The Docker image provides the containerised environment with SeroBA for `GET_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module [resistanceDatabase](https://github.com/kumarnaren/resistanceDatabase) - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) - License (GPL-3.0): https://github.com/kumarnaren/resistanceDatabase/blob/main/LICENSE - `sequences.fasta` is renamed to `ariba_ref_sequences-*.fasta` and used as-is - `metadata.tsv` is renamed to `ariba_metadata-*.tsv` and modified -- The files are used as the default inputs of `CREATE_ARIBA_DB` process of the `amr.nf` module +- The files are used as the default inputs of `GET_ARIBA_DB` process of the `amr.nf` module [Shovill](https://github.com/tseemann/shovill) - Torsten Seemann ([@tseemann](https://github.com/tseemann)) @@ -466,7 +466,7 @@ This project uses open-source components. 
You can find the homepage or source co - This is a modified version of [AMR predictor](https://github.com/BenJamesMetcalf/Spn_Scripts_Reference) by Ben Metcalf ([@BenJamesMetcalf](https://github.com/BenJamesMetcalf)) at the Centre for Disease Control (CDC) - This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/spn-resistance-pbp) - The fork changes the Docker image from a Docker executable image to a Docker environment for Nextflow integration - - The Docker image provides the containerised environment for `PBP_RESISTANCE` process of the `amr.nf` module + - The Docker image provides the containerised environment with SPN-PBP-MAR for `PBP_RESISTANCE` process of the `amr.nf` module [Unicycler](https://github.com/rrwick/Unicycler) - **Wick RR, Judd LM, Gorrie CL, Holt KE**. Unicycler: resolving bacterial genome assemblies from short and long sequencing reads. *PLoS Comput Biol* 2017. From 30fbe71480d5a1a83dc74aeb7fc9ddb6ec0c6741 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 10:27:19 +0000 Subject: [PATCH 078/157] Use full Pandas image for NF metrics collection Former-commit-id: db629f27a21165a37d1fe408a649eaed08702ee5 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 268f05a..5a29810 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,7 +66,7 @@ process { container = 'bitnami/git:2.39.0' } withLabel: python_container { - container = 'amancevice/pandas:2.0.2-slim' + container = 'amancevice/pandas:2.0.2' } withLabel: fastp_container { container = 'staphb/fastp:0.23.2' From 545709e0e8fb56e9492d782ee8c29beb6d11617a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 16:36:10 +0000 Subject: [PATCH 079/157] Include Virulence in the chart Former-commit-id: 09672c296931c8216b741bbac1a847f0dec9f215 --- doc/workflow.drawio.svg | 216 ++++++++++++++++++++-------------------- 1 file changed, 108 insertions(+), 108 deletions(-) diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg index 873d1b7..43e654b 100644 --- a/doc/workflow.drawio.svg +++ b/doc/workflow.drawio.svg @@ -1,23 +1,23 @@ - + - - + + Output - - + + Input - + - +
@@ -32,12 +32,12 @@ - - - - + + + + - +
@@ -57,14 +57,14 @@ - + - + - + - -
+ +
FASTQ (Reads) @@ -72,36 +72,36 @@
- + FASTQ (Reads) - - - + + + S. Pneumo:  > 60% - - - + + + Contigs:  < 500 - + Length:   1.9 - 2.3 Mb - + Depth:     ≥ 20x - + - -
+ +
FASTA (Assemblies) @@ -109,16 +109,16 @@
- + FASTA (Assemblies) - + - -
+ +
SAM @@ -126,25 +126,25 @@
- + SAM - - - + + + Ref Coverage:  > 60% - + Het-SNP site:   < 220 - + - -
+ +
Results @@ -152,21 +152,21 @@
- + Results - - - - - - - - + + + + + + + + - +
@@ -188,10 +188,10 @@ - - + + - +
@@ -209,10 +209,10 @@ - - + + - +
@@ -233,10 +233,10 @@ - - + + - +
@@ -254,10 +254,10 @@ - - + + - +
@@ -279,12 +279,12 @@ - + - - + + - +
@@ -302,12 +302,12 @@ - - - - + + + + - +
@@ -325,11 +325,11 @@ - - - + + + - +
@@ -347,12 +347,12 @@ - - + + - + - +
@@ -368,11 +368,11 @@ - - - + + + - +
@@ -390,11 +390,11 @@ - - - + + + - +
@@ -412,16 +412,16 @@ - - - + + + - +
- Other AMR + Other AMR / Virulence
ARIBA, custom script @@ -435,9 +435,9 @@ - + - +
@@ -455,24 +455,24 @@ - - + + Bases: ≥ 38 Mb - + Go / No-go - + - + - +
@@ -491,9 +491,9 @@ - + - +
@@ -512,10 +512,10 @@ - - + + - +
@@ -531,13 +531,13 @@ - + QC values shown in the diagram are the default values - + Go / No-go From 4af22629ea41f50d7e80fd3a357a2119bd7abe69 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 16:36:34 +0000 Subject: [PATCH 080/157] Include information on virulence detection Former-commit-id: 494bfb3ff8afb4efa43382f13bd63baedb338acb --- README.md | 70 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index bda84c2..6557d5c 100644 --- a/README.md +++ b/README.md @@ -275,9 +275,11 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ## Details of `results.csv` - The following fields can be found in the output `results.csv` + > ℹ️ The output fields in Other AMR / Virulence type depends on the provided ARIBA database, the below table is based on the default ARIBA database + > ℹ️ For resistance phenotypes: S = Sensitive/Susceptible; I = Intermediate; R = Resistant - - > ℹ️ * The exact output fields of Other AMR depends on the provided ARIBA database, the below table is based on the default ARIBA database + + > ℹ️ For virulence genes: POS = Positive; NEG = Negative > ⚠️ If the result of `Overall_QC` of a sample is `ASSEMBLER FAILURE`, the assembler has crashed when trying to assembly the reads. You might want to re-run the sample with [another assembler](#assembly), or discard the sample if it is a low quality one. @@ -326,38 +328,38 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `PEN_MIC` | PBP AMR | Estimated MIC of penicillin (PEN) | | `PEN_Res(Meningital)` | PBP AMR | Resistance phenotype against PEN in meningital form | | `PEN_Res(Non-meningital)` | PBP AMR | Resistance phenotype against PEN in non-meningital form | - | `CHL_Res` | Other AMR* | Resistance phenotype against Chloramphenicol (CHL) | - | `CHL_Determinant` | Other AMR* | Known determinants that inferred the CHL resistance | - | `CLI_Res` | Other AMR* | Resistance phenotype against Clindamycin (CLI) | - | `CLI_Determinant` | Other AMR* | Known determinants that inferred the CLI resistance | - | `COT_Res` | Other AMR* | Resistance phenotype against Co-Trimoxazole (COT) | - | `COT_Determinant` | Other AMR* | Known determinants that inferred the COT resistance | - | `DOX_Res` | Other AMR* | Resistance phenotype against Doxycycline (DOX) | - | `DOX_Determinant` | Other AMR* | Known determinants that inferred the DOX resistance | - | `ERY_Res` | Other AMR* | Resistance phenotype against Erythromycin (ERY) | - | `ERY_Determinant` | Other AMR* | Known determinants that inferred the ERY resistance | - | `ERY_CLI_Res` | Other AMR* | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | - | `ERY_CLI_Determinant` | Other AMR* | Known determinants that inferred the ERY and CLI resistance | - | `FQ_Res` | Other AMR* | Resistance phenotype against Fluoroquinolones (FQ) | - | `FQ_Determinant` | Other AMR* | Known determinants that inferred the FQ resistance | - | `KAN_Res` | Other AMR* | Resistance phenotype against Kanamycin (KAN) | - | `KAN_Determinant` | Other AMR* | Known determinants that inferred the KAN resistance | - | `LFX_Res` | Other AMR* | Resistance phenotype against Levofloxacin (LFX) | - | `LFX_Determinant` | Other AMR* | Known determinants that inferred the LFX resistance | - | `RIF_Res` | Other AMR* | Resistance phenotype against Rifampin (RIF) | - | `RIF_Determinant` | Other AMR* | Known 
determinants that inferred the RIF resistance | - | `SMX_Res` | Other AMR* | Resistance phenotype against Sulfamethoxazole (SMX) | - | `SMX_Determinant` | Other AMR* | Known determinants that inferred the SMX resistance | - | `TET_Res` | Other AMR* | Resistance phenotype against Tetracycline (TET) | - | `TET_Determinant` | Other AMR* | Known determinants that inferred the TET resistance | - | `TMP_Res` | Other AMR* | Resistance phenotype against Trimethoprim (TMP) | - | `TMP_Determinant` | Other AMR* | Known determinants that inferred the TMP resistance | - | `VAN_Res` | Other AMR* | Resistance phenotype against Vancomycin (VAN) | - | `VAN_Determinant` | Other AMR* | Known determinants that inferred the VAN resistance | - | `PILI1` | Other AMR* | Expression of PILI-1 | - | `PILI1_Determinant` | Other AMR* | Known determinants that inferred the PILI-1 expression | - | `PILI2` | Other AMR* | Expression of PILI-2 | - | `PILI2_Determinant` | Other AMR* | Known determinants that inferred the PILI-2 expression | + | `CHL_Res` | Other AMR | Resistance phenotype against Chloramphenicol (CHL) | + | `CHL_Determinant` | Other AMR | Known determinants that inferred the CHL resistance | + | `CLI_Res` | Other AMR | Resistance phenotype against Clindamycin (CLI) | + | `CLI_Determinant` | Other AMR | Known determinants that inferred the CLI resistance | + | `COT_Res` | Other AMR | Resistance phenotype against Co-Trimoxazole (COT) | + | `COT_Determinant` | Other AMR | Known determinants that inferred the COT resistance | + | `DOX_Res` | Other AMR | Resistance phenotype against Doxycycline (DOX) | + | `DOX_Determinant` | Other AMR | Known determinants that inferred the DOX resistance | + | `ERY_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) | + | `ERY_Determinant` | Other AMR | Known determinants that inferred the ERY resistance | + | `ERY_CLI_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | + | `ERY_CLI_Determinant` | Other AMR | Known determinants that inferred the ERY and CLI resistance | + | `FQ_Res` | Other AMR | Resistance phenotype against Fluoroquinolones (FQ) | + | `FQ_Determinant` | Other AMR | Known determinants that inferred the FQ resistance | + | `KAN_Res` | Other AMR | Resistance phenotype against Kanamycin (KAN) | + | `KAN_Determinant` | Other AMR | Known determinants that inferred the KAN resistance | + | `LFX_Res` | Other AMR | Resistance phenotype against Levofloxacin (LFX) | + | `LFX_Determinant` | Other AMR | Known determinants that inferred the LFX resistance | + | `RIF_Res` | Other AMR | Resistance phenotype against Rifampin (RIF) | + | `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance | + | `SMX_Res` | Other AMR | Resistance phenotype against Sulfamethoxazole (SMX) | + | `SMX_Determinant` | Other AMR | Known determinants that inferred the SMX resistance | + | `TET_Res` | Other AMR | Resistance phenotype against Tetracycline (TET) | + | `TET_Determinant` | Other AMR | Known determinants that inferred the TET resistance | + | `TMP_Res` | Other AMR | Resistance phenotype against Trimethoprim (TMP) | + | `TMP_Determinant` | Other AMR | Known determinants that inferred the TMP resistance | + | `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) | + | `VAN_Determinant` | Other AMR | Known determinants that inferred the VAN resistance | + | `PILI1` | Virulence | Expression of PILI-1 | + | `PILI1_Determinant` | Virulence | Known determinants that inferred the PILI-1 expression | 
+ | `PILI2` | Virulence | Expression of PILI-2 | + | `PILI2_Determinant` | Virulence | Known determinants that inferred the PILI-2 expression |   # Credits From 1a3c3f32605dac5fee8b367693cee128c05b9972 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 3 Aug 2023 18:14:44 +0100 Subject: [PATCH 081/157] Improve the robustness of LSF profile Former-commit-id: 2dc2f896c34f1ce539c054711d583e9d99d7ec87 --- nextflow.config | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 5a29810..98d95ed 100644 --- a/nextflow.config +++ b/nextflow.config @@ -141,11 +141,12 @@ profiles { lsf { params.singularity_cachedir = "$projectDir/singularity_cache" params.maxretries = 4 + params.kraken2_memory_mapping = false process { executor = 'lsf' scratch = true - time = {10.min * task.attempt} + time = {30.min * task.attempt} withLabel: farm_low { cpus = 1 memory = {1.GB * task.attempt} @@ -171,7 +172,7 @@ profiles { maxRetries = params.maxretries } withLabel: farm_slow { - time = {30.min * task.attempt} + time = {2.hour * task.attempt} } withLabel: farm_scratchless { scratch = false From be38d8240c1b50c7b9257190a64cc185555836e6 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 3 Aug 2023 18:16:02 +0100 Subject: [PATCH 082/157] Improve LSF config for database processes Former-commit-id: 6c075e5db629caa4d772e6899ed20bb86d9558c7 --- modules/amr.nf | 2 ++ modules/lineage.nf | 5 ++++- modules/mapping.nf | 1 + modules/serotype.nf | 3 +++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/modules/amr.nf b/modules/amr.nf index d7a7206..a128b7b 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -45,6 +45,8 @@ process PARSE_PBP_RESISTANCE { process GET_ARIBA_DB { label 'ariba_container' label 'farm_low' + label 'farm_scratchless' + label 'farm_slow' input: path ref_sequences diff --git a/modules/lineage.nf b/modules/lineage.nf index 9090f02..76f456c 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -2,6 +2,8 @@ process GET_POPPUNK_DB { label 'bash_container' label 'farm_low' + label 'farm_scratchless' + label 'farm_slow' input: val db_remote @@ -26,6 +28,8 @@ process GET_POPPUNK_DB { process GET_POPPUNK_EXT_CLUSTERS { label 'bash_container' label 'farm_low' + label 'farm_scratchless' + label 'farm_slow' input: val ext_clusters_remote @@ -51,7 +55,6 @@ process LINEAGE { label 'poppunk_container' label 'farm_high' label 'farm_slow' - label 'farm_scratchless' tag 'All samples' diff --git a/modules/mapping.nf b/modules/mapping.nf index 964a415..5544582 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -2,6 +2,7 @@ process GET_REF_GENOME_BWA_DB { label 'bwa_container' label 'farm_mid' + label 'farm_scratchless' input: path reference diff --git a/modules/serotype.nf b/modules/serotype.nf index 02327dd..5c3211f 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -2,6 +2,8 @@ process CHECK_SEROBA_DB { label 'git_container' label 'farm_low' + label 'farm_scratchless' + label 'farm_slow' input: val remote @@ -27,6 +29,7 @@ process CHECK_SEROBA_DB { process GET_SEROBA_DB { label 'seroba_container' label 'farm_low' + label 'farm_scratchless' input: val remote From b61a186691286bb83faaa9e575a3ff5386712d74 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 4 Aug 2023 12:02:40 +0100 Subject: [PATCH 083/157] Improve lsf profile description Former-commit-id: 4eac3dff2a32621f7d6c2c2b1364f1d82045d5bc --- 
README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6557d5c..e44c12b 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | --- | --- | | `standard`
(Default) | Docker is used as the container engine.
Processes are executed locally. | | `singularity` | Singularity is used as the container engine.
Processes are executed locally. | - | `lsf` | **The pipeline should be launched from a LSF cluster head node with this profile.**
Singularity is used as the container engine.
Processes are submitted to your LSF cluster via `bsub` by the pipeline.
(Tested on Sanger farm5 cluster only) | + | `lsf` | **The pipeline should be launched from an LSF cluster head node with this profile.**
Singularity is used as the container engine.
Processes are submitted to your LSF cluster via `bsub` by the pipeline.
(Tested on Wellcome Sanger Institute farm5 LSF cluster only)
(Option `--kraken2_memory_mapping` default change to `false`.) | ## Resume - If the pipeline is interrupted mid-run, Nextflow's built-in `-resume` option can be used to resume the pipeline execution instead of starting from scratch again From 973b42f115258cf676e50347169d646ac4960eee Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 4 Aug 2023 14:11:03 +0100 Subject: [PATCH 084/157] Update mlst image with latest database Former-commit-id: ad2aa01d8c545478cca889cc68fc30468a2fa24c --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 98d95ed..ea00c6b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -99,7 +99,7 @@ process { container = 'staphb/ariba:2.14.6' } withLabel: mlst_container { - container = 'staphb/mlst:2.23.0' + container = 'staphb/mlst:2.23.0-2023-07' } withLabel: kraken2_container { container = 'staphb/kraken2:2.1.2-no-db' From b6227796bdf34e2c727c1c2f9b665625eff4d962 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 4 Aug 2023 14:41:10 +0000 Subject: [PATCH 085/157] Prototype schema Former-commit-id: 0d4a5c47cd30a29d6dc1de9683c93be0cbf7decc --- nextflow_schema.json | 151 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 nextflow_schema.json diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 0000000..169ac9e --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,151 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", + "title": ". pipeline parameters", + "description": "", + "type": "object", + "definitions": { + "generic_options": { + "title": "Generic options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, + "version": { + "type": "boolean", + "description": "Display version and exit.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, + "init": { + "type": "string", + "default": "false", + "hidden": true + } + } + } + }, + "allOf": [ + { + "$ref": "#/definitions/generic_options" + } + ], + "properties": { + "reads": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/input" + }, + "output": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/output" + }, + "assembler": { + "type": "string", + "default": "shovill" + }, + "min_contig_length": { + "type": "integer", + "default": 500 + }, + "assembly_publish": { + "type": "string", + "default": "link" + }, + "seroba_db_remote": { + "type": "string", + "default": "https://github.com/sanger-pathogens/seroba.git" + }, + "seroba_db_local": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/seroba" + }, + "seroba_kmer": { + "type": "integer", + "default": 71 + }, + "kraken2_db_remote": { + "type": "string", + "default": 
"https://genome-idx.s3.amazonaws.com/kraken/minikraken2_v1_8GB_201904.tgz" + }, + "kraken2_db_local": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/kraken" + }, + "kraken2_memory_mapping": { + "type": "string", + "default": "true" + }, + "ref_genome": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ATCC_700669_v1.fa" + }, + "ref_genome_bwa_db_local": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/bwa_ref_db" + }, + "poppunk_db_remote": { + "type": "string", + "default": "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz" + }, + "poppunk_ext_remote": { + "type": "string", + "default": "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv" + }, + "poppunk_db_local": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/poppunk" + }, + "spneumo_percentage": { + "type": "integer", + "default": 60 + }, + "ref_coverage": { + "type": "integer", + "default": 60 + }, + "het_snp_site": { + "type": "integer", + "default": 220 + }, + "contigs": { + "type": "integer", + "default": 500 + }, + "length_low": { + "type": "integer", + "default": 1900000 + }, + "length_high": { + "type": "integer", + "default": 2300000 + }, + "depth": { + "type": "integer", + "default": 20 + }, + "ariba_ref": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ariba_ref_sequences-20230712.fasta" + }, + "ariba_metadata": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ariba_metadata-20230712.tsv" + }, + "ariba_db_local": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/ariba" + }, + "lite": { + "type": "string", + "default": "false" + } + }, + "required": ["reads", "output"] +} From b5196f1b0e903444c00aa8474106ebed29e1d6c8 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 4 Aug 2023 15:41:40 +0100 Subject: [PATCH 086/157] Add sangertower profile Former-commit-id: 0372bc257c389ad6a1692e74da5b5bfc7f9c1201 --- nextflow.config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nextflow.config b/nextflow.config index ea00c6b..e5b104d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -190,5 +190,11 @@ profiles { cacheDir = params.singularity_cachedir } } + sangertower { + tower { + enabled = true + endpoint = 'https://tower.internal.sanger.ac.uk/api/' + } + } } From a3b8f26c9b5a1980ba91af68930b1adf21d2cd36 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 4 Aug 2023 15:42:05 +0100 Subject: [PATCH 087/157] Fix parameter validation for multiple profiles Former-commit-id: c276926c501b772162c203dcffb57d7925d68c5c --- modules/validate.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/validate.nf b/modules/validate.nf index f2bceb0..bd1abd3 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -53,8 +53,8 @@ void validate(Map params) { validParams.put("singularity_cachedir", "path") } - // Add params.maxretries when workflow.profile == 'lsf' - if (workflow.profile == 'lsf' ) { + // Add params.maxretries when workflow.profile contains 'lsf' + if (workflow.profile.split(',').contains('lsf')) { validParams.put("maxretries", "int") } From a0d2bbddd2757cf8327848dccba743388cb16c4b Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 4 Aug 2023 15:45:53 +0100 Subject: [PATCH 
088/157] Add basic I/O options Former-commit-id: dc78cc66712c9e02ff17addb579a5864616d4895 --- nextflow_schema.json | 291 ++++++++++++++++++++++--------------------- 1 file changed, 152 insertions(+), 139 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 169ac9e..837683d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,147 +5,160 @@ "description": "", "type": "object", "definitions": { - "generic_options": { - "title": "Generic options", - "type": "object", - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", - "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, - "version": { - "type": "boolean", - "description": "Display version and exit.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, - "init": { - "type": "string", - "default": "false", - "hidden": true - } + "input_output": { + "title": "Input / Output", + "type": "object", + "description": "", + "default": "", + "properties": { + "output": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/output" + }, + "reads": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/input" + } + }, + "required": [ + "output", + "reads" + ] + }, + "cli_options": { + "title": "CLI Options", + "type": "object", + "fa_icon": "fas fa-file-import", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, + "version": { + "type": "boolean", + "description": "Display version and exit.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, + "init": { + "type": "boolean", + "hidden": true, + "description": "Initialise pipeline" + } + } } - } }, "allOf": [ - { - "$ref": "#/definitions/generic_options" - } + { + "$ref": "#/definitions/input_output" + }, + { + "$ref": "#/definitions/cli_options" + } ], "properties": { - "reads": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/input" - }, - "output": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/output" - }, - "assembler": { - "type": "string", - "default": "shovill" - }, - "min_contig_length": { - "type": "integer", - "default": 500 - }, - "assembly_publish": { - "type": "string", - "default": "link" - }, - "seroba_db_remote": { - "type": "string", - "default": "https://github.com/sanger-pathogens/seroba.git" - }, - "seroba_db_local": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/seroba" - }, - "seroba_kmer": { - "type": "integer", - "default": 71 - }, - "kraken2_db_remote": { - "type": "string", - "default": "https://genome-idx.s3.amazonaws.com/kraken/minikraken2_v1_8GB_201904.tgz" - }, - "kraken2_db_local": { - "type": "string", - "default": 
"/home/ubuntu/local-repo/gps-unified-pipeline/databases/kraken" - }, - "kraken2_memory_mapping": { - "type": "string", - "default": "true" - }, - "ref_genome": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ATCC_700669_v1.fa" - }, - "ref_genome_bwa_db_local": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/bwa_ref_db" - }, - "poppunk_db_remote": { - "type": "string", - "default": "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz" - }, - "poppunk_ext_remote": { - "type": "string", - "default": "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv" - }, - "poppunk_db_local": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/poppunk" - }, - "spneumo_percentage": { - "type": "integer", - "default": 60 - }, - "ref_coverage": { - "type": "integer", - "default": 60 - }, - "het_snp_site": { - "type": "integer", - "default": 220 - }, - "contigs": { - "type": "integer", - "default": 500 - }, - "length_low": { - "type": "integer", - "default": 1900000 - }, - "length_high": { - "type": "integer", - "default": 2300000 - }, - "depth": { - "type": "integer", - "default": 20 - }, - "ariba_ref": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ariba_ref_sequences-20230712.fasta" - }, - "ariba_metadata": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ariba_metadata-20230712.tsv" - }, - "ariba_db_local": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/ariba" - }, - "lite": { - "type": "string", - "default": "false" - } - }, - "required": ["reads", "output"] -} + "assembler": { + "type": "string", + "default": "shovill" + }, + "min_contig_length": { + "type": "integer", + "default": 500 + }, + "assembly_publish": { + "type": "string", + "default": "link" + }, + "seroba_db_remote": { + "type": "string", + "default": "https://github.com/sanger-pathogens/seroba.git" + }, + "seroba_db_local": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/seroba" + }, + "seroba_kmer": { + "type": "integer", + "default": 71 + }, + "kraken2_db_remote": { + "type": "string", + "default": "https://genome-idx.s3.amazonaws.com/kraken/minikraken2_v1_8GB_201904.tgz" + }, + "kraken2_db_local": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/kraken" + }, + "kraken2_memory_mapping": { + "type": "string", + "default": "true" + }, + "ref_genome": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ATCC_700669_v1.fa" + }, + "ref_genome_bwa_db_local": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/bwa_ref_db" + }, + "poppunk_db_remote": { + "type": "string", + "default": "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz" + }, + "poppunk_ext_remote": { + "type": "string", + "default": "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv" + }, + "poppunk_db_local": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/poppunk" + }, + "spneumo_percentage": { + "type": "integer", + "default": 60 + }, + "ref_coverage": { + "type": "integer", + "default": 60 + }, + "het_snp_site": { + "type": "integer", + "default": 220 + }, + "contigs": { + "type": "integer", + "default": 500 + }, + "length_low": { + "type": "integer", + "default": 1900000 + }, + "length_high": { + "type": "integer", + "default": 
2300000 + }, + "depth": { + "type": "integer", + "default": 20 + }, + "ariba_ref": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ariba_ref_sequences-20230712.fasta" + }, + "ariba_metadata": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ariba_metadata-20230712.tsv" + }, + "ariba_db_local": { + "type": "string", + "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/ariba" + }, + "lite": { + "type": "string", + "default": "false" + } + } } \ No newline at end of file From b10ee28f6d82b7c5126befab66638b517ae49028 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 4 Aug 2023 17:46:53 +0100 Subject: [PATCH 089/157] Update schema Former-commit-id: c79932f13e25e221a99831e1cd8919436af317c6 --- nextflow_schema.json | 395 +++++++++++++++++++++++++++++-------------- 1 file changed, 272 insertions(+), 123 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 837683d..fa5994a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,44 +11,274 @@ "description": "", "default": "", "properties": { + "reads": { + "type": "string", + "description": "Path to the input directory that contains the reads to be processed." + }, "output": { "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/output" + "description": "Path to the output directory that saves the results." }, - "reads": { + "assembly_publish": { "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/input" + "default": "link", + "description": "Method used by Nextflow to publish the generated assemblies.", + "hidden": true } }, "required": [ + "reads", "output", - "reads" + "assembly_publish" ] }, - "cli_options": { - "title": "CLI Options", + "qc_parameters": { + "title": "QC Parameters", "type": "object", - "fa_icon": "fas fa-file-import", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "description": "", + "default": "", "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "fa_icon": "fas fa-question-circle", + "spneumo_percentage": { + "type": "number", + "default": 60, + "description": "Minimum S. pneumoniae percentage in reads to pass Taxonomy QC.", "hidden": true }, - "version": { - "type": "boolean", - "description": "Display version and exit.", - "fa_icon": "fas fa-question-circle", + "ref_coverage": { + "type": "number", + "default": 60, + "description": "Minimum reference coverage percentage by the reads to pass Mapping QC.", + "hidden": true + }, + "het_snp_site": { + "type": "integer", + "default": 220, + "description": "Maximum non-cluster heterozygous SNP (Het-SNP) site count to pass Mapping QC.", + "hidden": true + }, + "contigs": { + "type": "integer", + "default": 500, + "hidden": true, + "description": "Maximum contig count in assembly to pass Assembly QC." + }, + "length_low": { + "type": "integer", + "default": 1900000, + "hidden": true, + "description": "Minimum assembly length to pass Assembly QC." + }, + "length_high": { + "type": "integer", + "default": 2300000, + "hidden": true, + "description": "Maximum assembly length to pass Assembly QC." + }, + "depth": { + "type": "number", + "default": 20, + "hidden": true, + "description": "Minimum sequencing depth to pass Assembly QC." + } + }, + "required": [ + "spneumo_percentage", + "ref_coverage", + "het_snp_site", + "contigs", + "length_low", + "length_high", + "depth" + ] + }, + "assembly": { + "title": "Assembly", + "type": "object", + "description": "", + "default": "", + "properties": { + "assembler": { + "type": "string", + "default": "shovill", + "hidden": true, + "description": "SPAdes-based assembler used to assemble the reads." + }, + "min_contig_length": { + "type": "integer", + "default": 500, + "description": "Minimum length of contig to be included in the assembly.", + "hidden": true + } + }, + "required": [ + "assembler", + "min_contig_length" + ] + }, + "mapping": { + "title": "Mapping", + "type": "object", + "description": "", + "default": "", + "properties": { + "ref_genome": { + "type": "string", + "default": "${projectDir}/data/ATCC_700669_v1.fa", + "hidden": true, + "description": "Path to the reference genome for mapping." + }, + "ref_genome_bwa_db_local": { + "type": "string", + "default": "${projectDir}/databases/bwa_ref_db", + "description": "Path to the directory where the reference genome FM-index database for BWA should be saved to.", "hidden": true + } + }, + "required": [ + "ref_genome", + "ref_genome_bwa_db_local" + ] + }, + "taxonomy": { + "title": "Taxonomy", + "type": "object", + "description": "", + "default": "", + "properties": { + "kraken2_db_remote": { + "type": "string", + "default": "https://genome-idx.s3.amazonaws.com/kraken/minikraken2_v1_8GB_201904.tgz", + "hidden": true, + "description": "URL to a Kraken2 database." }, + "kraken2_db_local": { + "type": "string", + "default": "${projectDir}/databases/kraken", + "hidden": true, + "description": "Path to the directory where the remote Kraken2 database should be saved to." + }, + "kraken2_memory_mapping": { + "type": "boolean", + "default": true, + "hidden": true, + "description": "Whether to use the memory mapping option of Kraken2." + } + }, + "required": [ + "kraken2_db_remote", + "kraken2_db_local", + "kraken2_memory_mapping" + ] + }, + "serotype": { + "title": "Serotype", + "type": "object", + "description": "", + "default": "", + "properties": { + "seroba_db_remote": { + "type": "string", + "default": "https://github.com/sanger-pathogens/seroba.git", + "hidden": true, + "description": "URL to a SeroBA Git remote repository." + }, + "seroba_db_local": { + "type": "string", + "default": "${projectDir}/databases/seroba", + "hidden": true, + "description": "Path to the directory where the SeroBA local repository should be saved to." + }, + "seroba_kmer": { + "type": "integer", + "default": 71, + "hidden": true, + "description": "Kmer size for creating the KMC database of SeroBA." + } + }, + "required": [ + "seroba_db_remote", + "seroba_db_local", + "seroba_kmer" + ] + }, + "lineage": { + "title": "Lineage", + "type": "object", + "description": "", + "default": "", + "properties": { + "poppunk_db_remote": { + "type": "string", + "default": "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz", + "hidden": true, + "description": "URL to a PopPUNK database." + }, + "poppunk_ext_remote": { + "type": "string", + "default": "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv", + "hidden": true, + "description": "URL to a PopPUNK external clusters file."
+ }, + "poppunk_db_local": { + "type": "string", + "default": "${projectDir}/databases/poppunk", + "hidden": true, + "description": "Path to the directory where the remote PopPUNK database and external clusters file should be saved to." + } + }, + "required": [ + "poppunk_db_remote", + "poppunk_ext_remote", + "poppunk_db_local" + ] + }, + "other_amr": { + "title": "Other AMR", + "type": "object", + "description": "", + "default": "", + "properties": { + "ariba_ref": { + "type": "string", + "default": "${projectDir}/data/ariba_ref_sequences-20230712.fasta", + "hidden": true, + "description": "Path to the reference sequences for ARIBA." + }, + "ariba_metadata": { + "type": "string", + "default": "${projectDir}/data/ariba_metadata-20230712.tsv", + "hidden": true, + "description": "Path to the metadata file for ARIBA." + }, + "ariba_db_local": { + "type": "string", + "default": "${projectDir}/databases/ariba", + "hidden": true, + "description": "Path to the directory where ARIBA reference database should be saved to." + } + }, + "required": [ + "ariba_ref", + "ariba_metadata", + "ariba_db_local" + ] + }, + "singularity": { + "title": "Singularity", + "type": "object", + "description": "", + "default": "", + "properties": { + "singularity_cachedir": { + "type": "string", + "description": "Path to the directory where Singularity images should be saved to.", + "hidden": true, + "default": "${projectDir}/singularity_cache" + } + }, + "required": [ + "singularity_cachedir" + ] } }, "allOf": [ @@ -56,109 +286,28 @@ "$ref": "#/definitions/input_output" }, { - "$ref": "#/definitions/cli_options" - } - ], - "properties": { - "assembler": { - "type": "string", - "default": "shovill" - }, - "min_contig_length": { - "type": "integer", - "default": 500 - }, - "assembly_publish": { - "type": "string", - "default": "link" - }, - "seroba_db_remote": { - "type": "string", - "default": "https://github.com/sanger-pathogens/seroba.git" - }, - "seroba_db_local": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/seroba" - }, - "seroba_kmer": { - "type": "integer", - "default": 71 - }, - "kraken2_db_remote": { - "type": "string", - "default": "https://genome-idx.s3.amazonaws.com/kraken/minikraken2_v1_8GB_201904.tgz" - }, - "kraken2_db_local": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/kraken" - }, - "kraken2_memory_mapping": { - "type": "string", - "default": "true" - }, - "ref_genome": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ATCC_700669_v1.fa" - }, - "ref_genome_bwa_db_local": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/bwa_ref_db" - }, - "poppunk_db_remote": { - "type": "string", - "default": "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz" - }, - "poppunk_ext_remote": { - "type": "string", - "default": "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv" - }, - "poppunk_db_local": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/poppunk" - }, - "spneumo_percentage": { - "type": "integer", - "default": 60 - }, - "ref_coverage": { - "type": "integer", - "default": 60 - }, - "het_snp_site": { - "type": "integer", - "default": 220 - }, - "contigs": { - "type": "integer", - "default": 500 - }, - "length_low": { - "type": "integer", - "default": 1900000 - }, - "length_high": { - "type": "integer", - "default": 2300000 - }, - "depth": { - "type": "integer", - "default": 20 - }, - 
"ariba_ref": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ariba_ref_sequences-20230712.fasta" - }, - "ariba_metadata": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/data/ariba_metadata-20230712.tsv" - }, - "ariba_db_local": { - "type": "string", - "default": "/home/ubuntu/local-repo/gps-unified-pipeline/databases/ariba" - }, - "lite": { - "type": "string", - "default": "false" + "$ref": "#/definitions/qc_parameters" + }, + { + "$ref": "#/definitions/assembly" + }, + { + "$ref": "#/definitions/mapping" + }, + { + "$ref": "#/definitions/taxonomy" + }, + { + "$ref": "#/definitions/serotype" + }, + { + "$ref": "#/definitions/lineage" + }, + { + "$ref": "#/definitions/other_amr" + }, + { + "$ref": "#/definitions/singularity" } - } + ] } \ No newline at end of file From 6c7d3dece20f218cce99f9f1ced52d1a4c1f2bcd Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 4 Aug 2023 17:47:35 +0100 Subject: [PATCH 090/157] Fix missing period Former-commit-id: e629017ef44c642e656b781b7961505603887432 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e44c12b..f6cdd91 100644 --- a/README.md +++ b/README.md @@ -201,7 +201,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | Option | Values | Description | | --- | ---| --- | | `--assembler` | `"shovill"` or `"unicycler"`
(Default: `"shovill"`)| SPAdes-based assembler used to assemble the reads. | - | `--min_contig_length` | Any integer value (Default: `500`) | Minimum length of contig to be included in the assembly | + | `--min_contig_length` | Any integer value (Default: `500`) | Minimum length of contig to be included in the assembly. | ## Mapping > ⚠️ `--ref_genome_bwa_db_local` does not accept a user-provided local database, directory content will be overwritten From bffc99b81710a69cff27bde710a3f7bd01f1db3d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Sun, 6 Aug 2023 22:44:18 +0100 Subject: [PATCH 091/157] Ensure usage of DSL2 Former-commit-id: d8b2c7d3840a99151d5df6007711c89153066186 --- nextflow.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nextflow.config b/nextflow.config index e5b104d..98854ed 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,3 +1,5 @@ +nextflow.enable.dsl=2 + // Default parameters that can be overridden params { // Show help message From 59d2ab240e28f255d27eef687104918ef53a6cac Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 7 Aug 2023 08:52:19 +0000 Subject: [PATCH 092/157] Fix type mismatch error Former-commit-id: b77377b61fe98b4519be6c63643314966d482d95 --- modules/info.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/info.nf b/modules/info.nf index a317a3b..1315c13 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -372,7 +372,7 @@ process SAVE { |╔═══════════════════════════════════════════════════════════════════════════════════════════╗ |║ Read QC ║ |╟──────────────────────────────────────────────────────────────┬────────────────────────────╢ - |${qcTextRow('Minimum bases in processed reads', String.format("%.0f", params.length_low * params.depth))} + |${qcTextRow('Minimum bases in processed reads', String.format("%d", (int)(params.length_low * params.depth)))} |╠══════════════════════════════════════════════════════════════╧════════════════════════════╣ |║ Taxonomy QC ║ |╟──────────────────────────────────────────────────────────────┬────────────────────────────╢ From 65d78829a1852d2de301afcafccccd454fa000e4 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 7 Aug 2023 09:59:18 +0000 Subject: [PATCH 093/157] Yield more accurate number Former-commit-id: 1562dbe8a7680dd8ffb3f2892c479fecac89fc98 --- modules/info.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/info.nf b/modules/info.nf index 1315c13..72547d0 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -372,7 +372,7 @@ process SAVE { |╔═══════════════════════════════════════════════════════════════════════════════════════════╗ |║ Read QC ║ |╟──────────────────────────────────────────────────────────────┬────────────────────────────╢ - |${qcTextRow('Minimum bases in processed reads', String.format("%d", (int)(params.length_low * params.depth)))} + |${qcTextRow('Minimum bases in processed reads', String.format("%.0f", Math.ceil(params.length_low * params.depth)))} |╠══════════════════════════════════════════════════════════════╧════════════════════════════╣ |║ Taxonomy QC ║ |╟──────────────────────────────────────────────────────────────┬────────────────────────────╢ From 0b56e639dda6f85b7cd5ff4dce66e711e8155657 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 7 Aug 2023 11:28:57 +0100 Subject: [PATCH 094/157] Remove default values from schema Former-commit-id: 9766fe2a098542fc5b197263bf399fa6dec756e7 --- nextflow_schema.json | 597 +++++++++++++++++++++---------------------- 1 file changed, 286 insertions(+), 311 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index fa5994a..b625e8b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,313 +1,288 @@ { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", - "title": ". pipeline parameters", - "description": "", - "type": "object", - "definitions": { - "input_output": { - "title": "Input / Output", - "type": "object", - "description": "", - "default": "", - "properties": { - "reads": { - "type": "string", - "description": "Path to the input directory that contains the reads to be processed." - }, - "output": { - "type": "string", - "description": "Path to the output directory that saves the results." - }, - "assembly_publish": { - "type": "string", - "default": "link", - "description": "Method used by Nextflow to publish the generated assemblies.", - "hidden": true - } - }, - "required": [ - "reads", - "output", - "assembly_publish" - ] - }, - "qc_parameters": { - "title": "QC Parameters", - "type": "object", - "description": "", - "default": "", - "properties": { - "spneumo_percentage": { - "type": "number", - "default": 60, - "description": "Minimum S. pneumoniae percentage in reads to pass Taxonomy QC.", - "hidden": true - }, - "ref_coverage": { - "type": "number", - "default": 60, - "description": "Minimum reference coverage percentage by the reads to pass Mapping QC.", - "hidden": true - }, - "het_snp_site": { - "type": "integer", - "default": 220, - "description": "Maximum non-cluster heterozygous SNP (Het-SNP) site count to pass Mapping QC.", - "hidden": true - }, - "contigs": { - "type": "integer", - "default": 500, - "hidden": true, - "description": "Maximum contig count in assembly to pass Assembly QC." - }, - "length_low": { - "type": "integer", - "default": 1900000, - "hidden": true, - "description": "Minimum assembly length to pass Assembly QC." - }, - "length_high": { - "type": "integer", - "default": 2300000, - "hidden": true, - "description": "Maximum assembly length to pass Assembly QC." - }, - "depth": { - "type": "number", - "default": 20, - "hidden": true, - "description": "Minimum sequencing depth to pass Assembly QC." - } - }, - "required": [ - "spneumo_percentage", - "ref_coverage", - "het_snp_site", - "contigs", - "length_low", - "length_high", - "depth" - ] - }, - "assembly": { - "title": "Assembly", - "type": "object", - "description": "", - "default": "", - "properties": { - "assembler": { - "type": "string", - "default": "shovill", - "hidden": true, - "description": "SPAdes-based assembler used to assemble the reads." - }, - "min_contig_length": { - "type": "integer", - "default": 500, - "description": "Minimum length of contig to be included in the assembly.", - "hidden": true - } - }, - "required": [ - "assembler", - "min_contig_length" - ] - }, - "mapping": { - "title": "Mapping", - "type": "object", - "description": "", - "default": "", - "properties": { - "ref_genome": { - "type": "string", - "default": "${projectDir}/data/ATCC_700669_v1.fa", - "hidden": true, - "description": "Path to the reference genome for mapping." - }, - "ref_genome_bwa_db_local": { - "type": "string", - "default": "${projectDir}/databases/bwa_ref_db", - "description": "Path to the directory where the reference genome FM-index database for BWA should be saved to.", - "hidden": true - } - }, - "required": [ - "ref_genome", - "ref_genome_bwa_db_local" - ] - }, - "taxonomy": { - "title": "Taxonomy", - "type": "object", - "description": "", - "default": "", - "properties": { - "kraken2_db_remote": { - "type": "string", - "default": "https://genome-idx.s3.amazonaws.com/kraken/minikraken2_v1_8GB_201904.tgz", - "hidden": true, - "description": "URL to a Kraken2 database." - }, - "kraken2_db_local": { - "type": "string", - "default": "${projectDir}/databases/kraken", - "hidden": true, - "description": "Path to the directory where the remote Kraken2 database should be saved to." - }, - "kraken2_memory_mapping": { - "type": "boolean", - "default": true, - "hidden": true, - "description": "Whether to use the memory mapping option of Kraken2." - } - }, - "required": [ - "kraken2_db_remote", - "kraken2_db_local", - "kraken2_memory_mapping" - ] - }, - "serotype": { - "title": "Serotype", - "type": "object", - "description": "", - "default": "", - "properties": { - "seroba_db_remote": { - "type": "string", - "default": "https://github.com/sanger-pathogens/seroba.git", - "hidden": true, - "description": "URL to a SeroBA Git remote repository." - }, - "seroba_db_local": { - "type": "string", - "default": "${projectDir}/databases/seroba", - "hidden": true, - "description": "Path to the directory where the SeroBA local repository should be saved to." - }, - "seroba_kmer": { - "type": "integer", - "default": 71, - "hidden": true, - "description": "Kmer size for creating the KMC database of SeroBA." - } - }, - "required": [ - "seroba_db_remote", - "seroba_db_local", - "seroba_kmer" - ] - }, - "lineage": { - "title": "Lineage", - "type": "object", - "description": "", - "default": "", - "properties": { - "poppunk_db_remote": { - "type": "string", - "default": "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz", - "hidden": true, - "description": "URL to a PopPUNK database." - }, - "poppunk_ext_remote": { - "type": "string", - "default": "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv", - "hidden": true, - "description": "URL to a PopPUNK external clusters file." - }, - "poppunk_db_local": { - "type": "string", - "default": "${projectDir}/databases/poppunk", - "hidden": true, - "description": "Path to the directory where the remote PopPUNK database and external clusters file should be saved to." - } - }, - "required": [ - "poppunk_db_remote", - "poppunk_ext_remote", - "poppunk_db_local" - ] - }, - "other_amr": { - "title": "Other AMR", - "type": "object", - "description": "", - "default": "", - "properties": { - "ariba_ref": { - "type": "string", - "default": "${projectDir}/data/ariba_ref_sequences-20230712.fasta", - "hidden": true, - "description": "Path to the reference sequences for ARIBA." - }, - "ariba_metadata": { - "type": "string", - "default": "${projectDir}/data/ariba_metadata-20230712.tsv", - "hidden": true, - "description": "Path to the metadata file for ARIBA." - }, - "ariba_db_local": { - "type": "string", - "default": "${projectDir}/databases/ariba", - "hidden": true, - "description": "Path to the directory where ARIBA reference database should be saved to." - } - }, - "required": [ - "ariba_ref", - "ariba_metadata", - "ariba_db_local" - ] - }, - "singularity": { - "title": "Singularity", - "type": "object", - "description": "", - "default": "", - "properties": { - "singularity_cachedir": { - "type": "string", - "description": "Path to the directory where Singularity images should be saved to.", - "hidden": true, - "default": "${projectDir}/singularity_cache" - } - }, - "required": [ - "singularity_cachedir" - ] - } - }, - "allOf": [ - { - "$ref": "#/definitions/input_output" - }, - { - "$ref": "#/definitions/qc_parameters" - }, - { - "$ref": "#/definitions/assembly" - }, - { - "$ref": "#/definitions/mapping" - }, - { - "$ref": "#/definitions/taxonomy" - }, - { - "$ref": "#/definitions/serotype" - }, - { - "$ref": "#/definitions/lineage" - }, - { - "$ref": "#/definitions/other_amr" - }, - { - "$ref": "#/definitions/singularity" - } - ] + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", + "title": ". pipeline parameters", + "description": "", + "type": "object", + "definitions": { + "input_output": { + "title": "Input / Output", + "type": "object", + "description": "", + "default": "", + "properties": { + "reads": { + "type": "string", + "description": "Path to the input directory that contains the reads to be processed." + }, + "output": { + "type": "string", + "description": "Path to the output directory that saves the results." + }, + "assembly_publish": { + "type": "string", + "description": "Method used by Nextflow to publish the generated assemblies.", + "hidden": true + } + }, + "required": [ + "reads", + "output", + "assembly_publish" + ] + }, + "qc_parameters": { + "title": "QC Parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "spneumo_percentage": { + "type": "number", + "description": "Minimum S. pneumoniae percentage in reads to pass Taxonomy QC.", + "hidden": true + }, + "ref_coverage": { + "type": "number", + "description": "Minimum reference coverage percentage by the reads to pass Mapping QC.", + "hidden": true + }, + "het_snp_site": { + "type": "integer", + "description": "Maximum non-cluster heterozygous SNP (Het-SNP) site count to pass Mapping QC.", + "hidden": true + }, + "contigs": { + "type": "integer", + "hidden": true, + "description": "Maximum contig count in assembly to pass Assembly QC." + }, + "length_low": { + "type": "integer", + "hidden": true, + "description": "Minimum assembly length to pass Assembly QC." + }, + "length_high": { + "type": "integer", + "hidden": true, + "description": "Maximum assembly length to pass Assembly QC." + }, + "depth": { + "type": "number", + "hidden": true, + "description": "Minimum sequencing depth to pass Assembly QC." + } + }, + "required": [ + "spneumo_percentage", + "ref_coverage", + "het_snp_site", + "contigs", + "length_low", + "length_high", + "depth" + ] + }, + "assembly": { + "title": "Assembly", + "type": "object", + "description": "", + "default": "", + "properties": { + "assembler": { + "type": "string", + "hidden": true, + "description": "SPAdes-based assembler used to assemble the reads." + }, + "min_contig_length": { + "type": "integer", + "description": "Minimum length of contig to be included in the assembly.", + "hidden": true + } + }, + "required": [ + "assembler", + "min_contig_length" + ] + }, + "mapping": { + "title": "Mapping", + "type": "object", + "description": "", + "default": "", + "properties": { + "ref_genome": { + "type": "string", + "hidden": true, + "description": "Path to the reference genome for mapping." + }, + "ref_genome_bwa_db_local": { + "type": "string", + "description": "Path to the directory where the reference genome FM-index database for BWA should be saved to.", + "hidden": true + } + }, + "required": [ + "ref_genome", + "ref_genome_bwa_db_local" + ] + }, + "taxonomy": { + "title": "Taxonomy", + "type": "object", + "description": "", + "default": "", + "properties": { + "kraken2_db_remote": { + "type": "string", + "hidden": true, + "description": "URL to a Kraken2 database." + }, + "kraken2_db_local": { + "type": "string", + "hidden": true, + "description": "Path to the directory where the remote Kraken2 database should be saved to." + }, + "kraken2_memory_mapping": { + "type": "boolean", + "hidden": true, + "description": "Whether to use the memory mapping option of Kraken2." + } + }, + "required": [ + "kraken2_db_remote", + "kraken2_db_local", + "kraken2_memory_mapping" + ] + }, + "serotype": { + "title": "Serotype", + "type": "object", + "description": "", + "default": "", + "properties": { + "seroba_db_remote": { + "type": "string", + "hidden": true, + "description": "URL to a SeroBA Git remote repository." + }, + "seroba_db_local": { + "type": "string", + "hidden": true, + "description": "Path to the directory where the SeroBA local repository should be saved to." + }, + "seroba_kmer": { + "type": "integer", + "hidden": true, + "description": "Kmer size for creating the KMC database of SeroBA." + } + }, + "required": [ + "seroba_db_remote", + "seroba_db_local", + "seroba_kmer" + ] + }, + "lineage": { + "title": "Lineage", + "type": "object", + "description": "", + "default": "", + "properties": { + "poppunk_db_remote": { + "type": "string", + "hidden": true, + "description": "URL to a PopPUNK database." + }, + "poppunk_ext_remote": { + "type": "string", + "hidden": true, + "description": "URL to a PopPUNK external clusters file." + }, + "poppunk_db_local": { + "type": "string", + "hidden": true, + "description": "Path to the directory where the remote PopPUNK database and external clusters file should be saved to." + } + }, + "required": [ + "poppunk_db_remote", + "poppunk_ext_remote", + "poppunk_db_local" + ] + }, + "other_amr": { + "title": "Other AMR", + "type": "object", + "description": "", + "default": "", + "properties": { + "ariba_ref": { + "type": "string", + "hidden": true, + "description": "Path to the reference sequences for ARIBA." + }, + "ariba_metadata": { + "type": "string", + "hidden": true, + "description": "Path to the metadata file for ARIBA." + }, + "ariba_db_local": { + "type": "string", + "hidden": true, + "description": "Path to the directory where ARIBA reference database should be saved to."
+ } + }, + "required": [ + "ariba_ref", + "ariba_metadata", + "ariba_db_local" + ] + }, + "singularity": { + "title": "Singularity", + "type": "object", + "description": "", + "default": "", + "properties": { + "singularity_cachedir": { + "type": "string", + "description": "Path to the directory where Singularity images should be saved to.", + "hidden": true + } + }, + "required": [ + "singularity_cachedir" + ] + } + }, + "allOf": [ + { + "$ref": "#/definitions/input_output" + }, + { + "$ref": "#/definitions/qc_parameters" + }, + { + "$ref": "#/definitions/assembly" + }, + { + "$ref": "#/definitions/mapping" + }, + { + "$ref": "#/definitions/taxonomy" + }, + { + "$ref": "#/definitions/serotype" + }, + { + "$ref": "#/definitions/lineage" + }, + { + "$ref": "#/definitions/other_amr" + }, + { + "$ref": "#/definitions/singularity" + } + ] } \ No newline at end of file From 96d03664f6582d18d23a7cb0a3b72234c8fb32f3 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 7 Aug 2023 11:30:23 +0100 Subject: [PATCH 095/157] Correct schema title Former-commit-id: 8b86a4cc6fd9f5655a576496ed0a5bb4fde71af2 --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b625e8b..be58342 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", - "title": ". pipeline parameters", + "title": "GPS Unified Pipeline Parameters", "description": "", "type": "object", "definitions": { From 554d8155fab694aec85005a7f1f4c77b139bb2d9 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 7 Aug 2023 16:44:11 +0100 Subject: [PATCH 096/157] Display reports on NF Tower UI Former-commit-id: 628aadf318dd567b7636705c81a9ff443735d15a --- tower.yml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tower.yml diff --git a/tower.yml b/tower.yml new file mode 100644 index 0000000..cbc5d54 --- /dev/null +++ b/tower.yml @@ -0,0 +1,3 @@ +reports: + results.csv: + display: "Overall Results" From 9226c7f620c766bcde281d1f2a4a49ed3711a36f Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 8 Aug 2023 16:33:09 +0000 Subject: [PATCH 097/157] Improve clarity of Singularity error message Former-commit-id: 999212cf2919cf4a267f7542de0f54e54ed70ab6 --- modules/singularity.nf | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/modules/singularity.nf b/modules/singularity.nf index 6c09642..a18520e 100644 --- a/modules/singularity.nf +++ b/modules/singularity.nf @@ -33,7 +33,17 @@ void singularityPreflight(Path configPath, String singularityCacheDir) { process.waitFor() if (process.exitValue()) { - log.info("${container} cannot be pulled successfully. Check your Internet connection and re-run the pipeline.\n") + def errorMessage = new BufferedReader(new InputStreamReader(process.getErrorStream())).getText() + + log.info( + """ + |Singularity Error Messages: + |${errorMessage} + | + |${container} cannot be pulled successfully. Resolve the above error and re-run the pipeline. 
+ | + """.stripMargin() + ) System.exit(1) } From 7f596dd95dea655acb905648752196e8acea1836 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 14 Aug 2023 16:09:45 +0000 Subject: [PATCH 098/157] Add info.txt to Tower Reports Former-commit-id: db33914233a83ee2926e94868a985a832b372d4c --- tower.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tower.yml b/tower.yml index cbc5d54..ee4061b 100644 --- a/tower.yml +++ b/tower.yml @@ -1,3 +1,5 @@ reports: results.csv: display: "Overall Results" + info.txt: + display: "Run Information" \ No newline at end of file From 07b4015e4054e9c4c06609d0052b5a0569d476d0 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 14 Aug 2023 18:20:58 +0000 Subject: [PATCH 099/157] Consolidate local databases Former-commit-id: ae7029f960d828630aaa6656896c74fad3fae88a --- bin/check-create_ariba_db.sh | 3 ++- bin/check-create_ref_genome_bwa_db.sh | 5 +++-- bin/check-download_kraken2_db.sh | 3 ++- bin/check-download_poppunk_db.sh | 4 ++-- bin/check-download_poppunk_ext_clusters.sh | 4 ++-- bin/check_seroba_db.sh | 2 +- bin/get_lineage.sh | 2 +- modules/amr.nf | 7 ++++--- modules/lineage.nf | 15 ++++++++++----- modules/mapping.nf | 9 +++++---- modules/serotype.nf | 12 +++++++----- modules/taxonomy.nf | 7 ++++--- modules/validate.nf | 6 +----- nextflow.config | 18 ++++++++---------- workflows/init.nf | 14 +++++++------- workflows/pipeline.nf | 16 ++++++++-------- 16 files changed, 67 insertions(+), 60 deletions(-) diff --git a/bin/check-create_ariba_db.sh b/bin/check-create_ariba_db.sh index 32ff767..6544fa2 100755 --- a/bin/check-create_ariba_db.sh +++ b/bin/check-create_ariba_db.sh @@ -23,8 +23,9 @@ if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa" ] || \ [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa" ] ; then - rm -rf "${DB_LOCAL:?}/${OUTPUT}" + rm -rf "${DB_LOCAL}" + mkdir -p "${DB_LOCAL}" ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "${DB_LOCAL}/${OUTPUT}" echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" diff --git a/bin/check-create_ref_genome_bwa_db.sh b/bin/check-create_ref_genome_bwa_db.sh index 65a7da8..9eb13e2 100755 --- a/bin/check-create_ref_genome_bwa_db.sh +++ b/bin/check-create_ref_genome_bwa_db.sh @@ -12,11 +12,12 @@ if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ [ ! -f "${DB_LOCAL}/${PREFIX}.pac" ] || \ [ ! -f "${DB_LOCAL}/${PREFIX}.sa" ] ; then - rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* + rm -rf "${DB_LOCAL}" bwa index -p "$PREFIX" "$REFERENCE" - mv "${PREFIX}.amb" "${PREFIX}.ann" "${PREFIX}.bwt" "${PREFIX}.pac" "${PREFIX}.sa" -t "$DB_LOCAL" + mkdir -p "${DB_LOCAL}" + mv "${PREFIX}.amb" "${PREFIX}.ann" "${PREFIX}.bwt" "${PREFIX}.pac" "${PREFIX}.sa" -t "${DB_LOCAL}/${OUTPUT}" echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" diff --git a/bin/check-download_kraken2_db.sh b/bin/check-download_kraken2_db.sh index 8632bc8..5ba9703 100755 --- a/bin/check-download_kraken2_db.sh +++ b/bin/check-download_kraken2_db.sh @@ -9,13 +9,14 @@ if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ [ ! -f "${DB_LOCAL}/opts.k2d" ] || \ [ ! 
-f "${DB_LOCAL}/taxo.k2d" ]; then - rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* + rm -rf "${DB_LOCAL}" wget "${DB_REMOTE}" -O $ZIPPED_DB # Use tmp dir and find to ensure files are saved directly at $DB_LOCAL regardless of archive directory structure mkdir tmp tar -xzf $ZIPPED_DB -C tmp + mkdir -p "${DB_LOCAL}" find tmp -type f -exec mv {} "$DB_LOCAL" \; rm -f $ZIPPED_DB diff --git a/bin/check-download_poppunk_db.sh b/bin/check-download_poppunk_db.sh index 48a0198..0dbdd74 100755 --- a/bin/check-download_poppunk_db.sh +++ b/bin/check-download_poppunk_db.sh @@ -17,10 +17,10 @@ if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ [ ! -f "${DB_PATH}/${DB_NAME}_clusters.csv" ] || \ [ ! -f "${DB_PATH}/${DB_NAME}.refs" ]; then - rm -rf "${DB_LOCAL:?}/${JSON_FILE}" - rm -rf "${DB_LOCAL:?}"/*/ + rm -rf "${DB_LOCAL}" wget "$DB_REMOTE" -O poppunk_db.tar.gz + mkdir -p "${DB_LOCAL}" tar -xzf poppunk_db.tar.gz -C "$DB_LOCAL" rm poppunk_db.tar.gz diff --git a/bin/check-download_poppunk_ext_clusters.sh b/bin/check-download_poppunk_ext_clusters.sh index 273ccbb..cef4b67 100755 --- a/bin/check-download_poppunk_ext_clusters.sh +++ b/bin/check-download_poppunk_ext_clusters.sh @@ -9,9 +9,9 @@ if [ ! -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" ] || \ [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}")" ] || \ [ ! -f "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" ]; then - rm -f "${EXT_CLUSTERS_LOCAL}"/*.csv - rm -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" + rm -f "${EXT_CLUSTERS_LOCAL}" + mkdir -p "${EXT_CLUSTERS_LOCAL}" wget "$EXT_CLUSTERS_REMOTE" -O "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" jq -n \ diff --git a/bin/check_seroba_db.sh b/bin/check_seroba_db.sh index 2e6ff2d..fd9578d 100755 --- a/bin/check_seroba_db.sh +++ b/bin/check_seroba_db.sh @@ -8,7 +8,7 @@ if [ ! -f "${DB_LOCAL}"/"${JSON_FILE}" ] || \ [ ! "$(grep 'kmer' "${DB_LOCAL}"/"${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "${KMER}" ] || \ ! 
( (git -C "${DB_LOCAL}" pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date' ); then - rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* + rm -rf "${DB_LOCAL}" git clone "${DB_REMOTE}" "${DB_LOCAL}" CREATE_DB=true diff --git a/bin/get_lineage.sh b/bin/get_lineage.sh index cd57737..24ea2e4 100755 --- a/bin/get_lineage.sh +++ b/bin/get_lineage.sh @@ -6,7 +6,7 @@ # Save results of individual sample into .csv with its name as filename sed 's/^/prefix_/' "$QFILE" > safe_qfile.txt -poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads "$(nproc)" +poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${EXT_CLUSTERS_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads "$(nproc)" sed 's/^prefix_//' output/output_external_clusters.csv > result.txt diff --git a/modules/amr.nf b/modules/amr.nf index a128b7b..cf080e2 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -51,19 +51,20 @@ process GET_ARIBA_DB { input: path ref_sequences path metadata - path local + path db output: - path local, emit: path + path ariba_db, emit: path val output, emit: database script: + ariba_db="${db}/ariba" output='database' json='done_ariba_db.json' """ REF_SEQUENCES="$ref_sequences" METADATA="$metadata" - DB_LOCAL="$local" + DB_LOCAL="$ariba_db" OUTPUT="$output" JSON_FILE="$json" diff --git a/modules/lineage.nf b/modules/lineage.nf index 76f456c..7dba5ac 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -7,17 +7,18 @@ process GET_POPPUNK_DB { input: val db_remote - path local + path db output: - path local, emit: path + path poppunk_db, emit: path env DB_NAME, emit: database script: + poppunk_db="${db}/poppunk" json='done_poppunk.json' """ DB_REMOTE="$db_remote" - DB_LOCAL="$local" + DB_LOCAL="$poppunk_db" JSON_FILE="$json" source check-download_poppunk_db.sh @@ -33,16 +34,18 @@ process GET_POPPUNK_EXT_CLUSTERS { input: val ext_clusters_remote - path local + path db output: + path poppunk_ext, emit: path env EXT_CLUSTERS_CSV, emit: file script: + poppunk_ext="${db}/poppunk_ext" json='done_poppunk_ext.json' """ EXT_CLUSTERS_REMOTE="$ext_clusters_remote" - EXT_CLUSTERS_LOCAL="$local" + EXT_CLUSTERS_LOCAL="$poppunk_ext" JSON_FILE="$json" source check-download_poppunk_ext_clusters.sh @@ -61,6 +64,7 @@ process LINEAGE { input: path poppunk_dir val db_name + path ext_clusters_dir val ext_clusters_file path qfile @@ -72,6 +76,7 @@ process LINEAGE { QFILE="$qfile" POPPUNK_DIR="$poppunk_dir" DB_NAME="$db_name" + EXT_CLUSTERS_DIR="$ext_clusters_dir" EXT_CLUSTERS_FILE="$ext_clusters_file" source get_lineage.sh diff --git a/modules/mapping.nf b/modules/mapping.nf index 5544582..08c298b 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -6,18 +6,19 @@ process GET_REF_GENOME_BWA_DB { input: path reference - path local + path db output: - path(local), emit: path - val(prefix), emit: prefix + path bwa_db, emit: path + val prefix, emit: prefix script: + bwa_db="${db}/bwa" prefix='reference' json='done_bwa_db.json' """ REFERENCE="$reference" - DB_LOCAL="$local" + DB_LOCAL="$bwa_db" PREFIX="$prefix" JSON_FILE="$json" diff --git a/modules/serotype.nf b/modules/serotype.nf index 5c3211f..18a23c7 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -7,17 +7,18 @@ process CHECK_SEROBA_DB { input: val remote - path local + path db val kmer output: env CREATE_DB, emit: create_db script: + seroba_db="${db}/seroba" json='done_seroba.json' """ DB_REMOTE="$remote" - 
DB_LOCAL="$local" + DB_LOCAL="$seroba_db" KMER="$kmer" JSON_FILE="$json" @@ -33,21 +34,22 @@ process GET_SEROBA_DB { input: val remote - path local + path db val create_db val kmer output: - path local, emit: path + path seroba_db, emit: path val database, emit: database script: + seroba_db="${db}/seroba" database='database' json='done_seroba.json' """ DATABASE="$database" DB_REMOTE="$remote" - DB_LOCAL="$local" + DB_LOCAL="$seroba_db" KMER="$kmer" CREATE_DB="$create_db" JSON_FILE="$json" diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index 735b59d..162a986 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -5,16 +5,17 @@ process GET_KRAKEN2_DB { input: val remote - path local + path db output: - path local, emit: path + path kraken2_db, emit: path script: + kraken2_db="${db}/kraken2_db" json='done_kraken.json' """ DB_REMOTE="$remote" - DB_LOCAL="$local" + DB_LOCAL="$kraken2_db" JSON_FILE="$json" source check-download_kraken2_db.sh diff --git a/modules/validate.nf b/modules/validate.nf index bd1abd3..b994678 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -5,20 +5,17 @@ validParams = [ version: 'boolean', reads: 'path_exist', output: 'path', + db: 'path', assembler: 'assembler', min_contig_length: 'int', assembly_publish: 'publish_mode', seroba_db_remote: 'url_git', - seroba_db_local: 'path', seroba_kmer: 'int', kraken2_db_remote: 'url_targz', - kraken2_db_local: 'path', kraken2_memory_mapping: 'boolean', ref_genome: 'path_fasta', - ref_genome_bwa_db_local: 'path', poppunk_db_remote: 'url_targz', poppunk_ext_remote: 'url_csv', - poppunk_db_local: 'path', spneumo_percentage: 'int_float', ref_coverage: 'int_float', het_snp_site: 'int', @@ -28,7 +25,6 @@ validParams = [ depth: 'int_float', ariba_ref: 'path_fasta', ariba_metadata: 'path_tsv', - ariba_db_local: 'path', lite: 'boolean' ] diff --git a/nextflow.config b/nextflow.config index 98854ed..902d9ec 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,9 @@ params { reads = "$projectDir/input" // Default output directory output = "$projectDir/output" + + // Default databases directory for saving all the required databases + db = "$projectDir/databases" // Default assembler assembler = "shovill" @@ -21,24 +24,20 @@ params { // Default assembly publish mode assembly_publish = "link" - // Default git repository and local directory, and KMC kmer size for SeroBA + // Default git repository, and KMC kmer size for SeroBA seroba_db_remote = "https://github.com/sanger-pathogens/seroba.git" - seroba_db_local = "$projectDir/databases/seroba" seroba_kmer = 71 - // Default link and local directory for Kraken2 Database, and usage of memory mapping + // Default link for Kraken2 Database, and usage of memory mapping kraken2_db_remote = "https://genome-idx.s3.amazonaws.com/kraken/minikraken2_v1_8GB_201904.tgz" - kraken2_db_local = "$projectDir/databases/kraken" kraken2_memory_mapping = true - // Default reference genome assembly path and local directory for its BWA database + // Default reference genome assembly path for its BWA database ref_genome = "$projectDir/data/ATCC_700669_v1.fa" - ref_genome_bwa_db_local = "$projectDir/databases/bwa_ref_db" - // Default links for PopPUNK Database and External Clusters, and local directory for both + // Default links for PopPUNK Database and External Clusters poppunk_db_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz" poppunk_ext_remote = "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv" - poppunk_db_local = "$projectDir/databases/poppunk" // Default values for QC spneumo_percentage = 60.00 @@ -49,10 +48,9 @@ params { length_high = 2300000 depth = 20.00 - // Default ARIBA reference sequences and metadata paths, and local directory for its generated database + // Default ARIBA reference sequences and metadata paths ariba_ref = "$projectDir/data/ariba_ref_sequences-20230712.fasta" ariba_metadata = "$projectDir/data/ariba_metadata-20230712.tsv" - ariba_db_local = "$projectDir/databases/ariba" // Toggle for removing .bam and .sam files mid-run to reduce storage requirement // Warning: This will break the -resume function of Nextflow diff --git a/workflows/init.nf b/workflows/init.nf index 7d1ed77..9a29e62 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -9,21 +9,21 @@ include { GET_ARIBA_DB } from "$projectDir/modules/amr" // Alternative workflow for initialisation only workflow INIT { // Check Reference Genome BWA Database, generate from assembly if necessary - GET_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + GET_REF_GENOME_BWA_DB(params.ref_genome, params.db) // Check ARIBA database, generate from reference sequences and metadata if necessary - GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) + GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.db) // Check Kraken2 Database, download if necessary - GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) + GET_KRAKEN2_DB(params.kraken2_db_remote, params.db) // Check SeroBA Databases, clone and rebuild if necessary - CHECK_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, params.seroba_kmer) - GET_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_db_remote, params.db, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_db_remote, params.db, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Check PopPUNK Database and External Clusters, download if necessary - GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_db_local) - GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_db_local) + GET_POPPUNK_DB(params.poppunk_db_remote, params.db) + GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.db) // Pull all Docker images mentioned in nextflow.config if using Docker if (workflow.containerEngine === 'docker') { diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index e39599f..00429da 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -14,21 +14,21 @@ workflow PIPELINE { main: // Get path and prefix of Reference Genome BWA Database, generate from assembly if necessary - GET_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + GET_REF_GENOME_BWA_DB(params.ref_genome, params.db) // Get path to Kraken2 Database, download if necessary - GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) + GET_KRAKEN2_DB(params.kraken2_db_remote, params.db) // Get path to SeroBA Databases, clone and rebuild if necessary - CHECK_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, params.seroba_kmer) - GET_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_db_remote, params.db, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_db_remote, params.db, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Get paths to PopPUNK Database and External Clusters, download if necessary - GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_db_local) - GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_db_local) + GET_POPPUNK_DB(params.poppunk_db_remote, params.db) + GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.db) // Get path to ARIBA database, generate from reference sequences and metadata if necessary - GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) + GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.db) // Get read pairs into Channel raw_read_pairs_ch raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) @@ -129,7 +129,7 @@ workflow PIPELINE { // From generated POPPUNK_QFILE, assign GPSC to samples passed overall QC // Output into Channel LINEAGE.out.reports (multiple reports from a single process) - LINEAGE(GET_POPPUNK_DB.out.path, GET_POPPUNK_DB.out.database, GET_POPPUNK_EXT_CLUSTERS.out.file, POPPUNK_QFILE) + LINEAGE(GET_POPPUNK_DB.out.path, GET_POPPUNK_DB.out.database, GET_POPPUNK_EXT_CLUSTERS.out.path, GET_POPPUNK_EXT_CLUSTERS.out.file, POPPUNK_QFILE) // From Channel OVERALL_QC_PASSED_READS_ch, serotype the preprocessed reads of samples passed overall QC // Output into Channel SEROTYPE.out.report From 47f6ccfc5975c96768d768313b62347bce15769d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 14 Aug 2023 18:57:31 +0000 Subject: [PATCH 100/157] Fix saving of BWA database Former-commit-id: 365cb8e69ce7cfac534783357d04c8eff6ef0c20 --- bin/check-create_ref_genome_bwa_db.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/check-create_ref_genome_bwa_db.sh b/bin/check-create_ref_genome_bwa_db.sh index 9eb13e2..49de9d2 100755 --- a/bin/check-create_ref_genome_bwa_db.sh +++ b/bin/check-create_ref_genome_bwa_db.sh @@ -17,7 +17,7 @@ if [ !
-f "${DB_LOCAL}/${JSON_FILE}" ] || \ bwa index -p "$PREFIX" "$REFERENCE" mkdir -p "${DB_LOCAL}" - mv "${PREFIX}.amb" "${PREFIX}.ann" "${PREFIX}.bwt" "${PREFIX}.pac" "${PREFIX}.sa" -t "${DB_LOCAL}/${OUTPUT}" + mv "${PREFIX}.amb" "${PREFIX}.ann" "${PREFIX}.bwt" "${PREFIX}.pac" "${PREFIX}.sa" -t "${DB_LOCAL}" echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" From b6bf75aa48718df2b54fb18dc3a024c004459519 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 14 Aug 2023 18:58:10 +0000 Subject: [PATCH 101/157] Fix version printing Former-commit-id: 4e7f024852670855da916cd94e2d577be3b3f875 --- bin/save_databases_info.sh | 2 +- modules/info.nf | 2 ++ workflows/info_and_version.nf | 16 ++++++++++------ workflows/pipeline.nf | 2 +- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/bin/save_databases_info.sh b/bin/save_databases_info.sh index 10f2174..95cba9e 100755 --- a/bin/save_databases_info.sh +++ b/bin/save_databases_info.sh @@ -64,5 +64,5 @@ jq -n \ --argjson seroba_db "$(add_seroba_db)" \ --argjson kraken2_db "$(add_url_db "${KRAKEN2_DB_PATH}/${KRAKEN2_JSON}")" \ --argjson poppunnk_db "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_JSON}")" \ - --argjson poppunk_ext "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_EXT_JSON}")" \ + --argjson poppunk_ext "$(add_url_db "${POPPUNK_EXT_PATH}/${POPPUNK_EXT_JSON}")" \ '$ARGS.named' > "$JSON_FILE" diff --git a/modules/info.nf b/modules/info.nf index 72547d0..54b271f 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -33,6 +33,7 @@ process DATABASES { val kraken2_db_path val seroba_db_path val poppunk_db_path + val poppunk_ext_path output: path(json), emit: json @@ -56,6 +57,7 @@ process DATABASES { SEROBA_JSON="$seroba_json" POPPUNK_DB_PATH="$poppunk_db_path" POPPUNK_JSON="$poppunk_json" + POPPUNK_EXT_PATH="$poppunk_ext_path" POPPUNK_EXT_JSON="$poppunk_ext_json" JSON_FILE="$json" diff --git a/workflows/info_and_version.nf b/workflows/info_and_version.nf index 186dadf..696a388 100644 --- a/workflows/info_and_version.nf +++ b/workflows/info_and_version.nf @@ -7,11 +7,12 @@ workflow PRINT_VERSION { main: GET_VERSION( - params.ref_genome_bwa_db_local, - params.ariba_db_local, - params.kraken2_db_local, - params.seroba_db_local, - params.poppunk_db_local, + "${params.db}/bwa", + "${params.db}/ariba", + "${params.db}/kraken2", + "${params.db}/seroba", + "${params.db}/poppunk", + "${params.db}/poppunk_ext", pipeline_version ) \ | PARSE \ @@ -31,6 +32,7 @@ workflow SAVE_INFO { databases_info.kraken2_db_path, databases_info.seroba_db_path, databases_info.poppunk_db_path, + databases_info.poppunk_ext_path, pipeline_version ) \ | PARSE \ @@ -45,6 +47,7 @@ workflow GET_VERSION { kraken2_db_path seroba_db_path poppunk_db_path + poppunk_ext_path pipeline_version main: @@ -55,7 +58,8 @@ workflow GET_VERSION { ariba_db_path, kraken2_db_path, seroba_db_path, - poppunk_db_path + poppunk_db_path, + poppunk_ext_path ) nextflow_version = "$nextflow.version" diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 00429da..39a8398 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -173,7 +173,7 @@ workflow PIPELINE { .merge(GET_KRAKEN2_DB.out.path.map { [["kraken2_db_path", it]] }) .merge(GET_SEROBA_DB.out.path.map { [["seroba_db_path", it]] }) .merge(GET_POPPUNK_DB.out.path.map { [["poppunk_db_path", it]] }) - .merge(GET_POPPUNK_EXT_CLUSTERS.out.file.map { [["poppunk_ext_file", 
it]] }) + .merge(GET_POPPUNK_EXT_CLUSTERS.out.file.map { [["poppunk_ext_path", it]] }) // Save key-value tuples into a map .map { it.collectEntries() } From 65e5f93dd1eb8f417b926a86bdd49f6d4bce84cd Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 14 Aug 2023 18:58:22 +0000 Subject: [PATCH 102/157] Update Kraken2 database directory name Former-commit-id: 384b6e30413b30b9ba758a77baed3d89b600a362 --- modules/taxonomy.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index 162a986..539ee33 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -11,7 +11,7 @@ process GET_KRAKEN2_DB { path kraken2_db, emit: path script: - kraken2_db="${db}/kraken2_db" + kraken2_db="${db}/kraken2" json='done_kraken.json' """ DB_REMOTE="$remote" From 53279089aa1ea57820dbc8d9627a4e377879a232 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 14 Aug 2023 19:02:51 +0000 Subject: [PATCH 103/157] Fix PopPUNK external clusters info output Former-commit-id: 4d0288802c041a6948d0fc91731a9fede6d5d9a4 --- workflows/pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 39a8398..a809c58 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -173,7 +173,7 @@ workflow PIPELINE { .merge(GET_KRAKEN2_DB.out.path.map { [["kraken2_db_path", it]] }) .merge(GET_SEROBA_DB.out.path.map { [["seroba_db_path", it]] }) .merge(GET_POPPUNK_DB.out.path.map { [["poppunk_db_path", it]] }) - .merge(GET_POPPUNK_EXT_CLUSTERS.out.file.map { [["poppunk_ext_path", it]] }) + .merge(GET_POPPUNK_EXT_CLUSTERS.out.path.map { [["poppunk_ext_path", it]] }) // Save key-value tuples into a map .map { it.collectEntries() } From c80c29b9f7781353db611971cf2072ca8ce6ac34 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 14 Aug 2023 19:37:19 +0000 Subject: [PATCH 104/157] Fix saving of PopPUNK External Clusters Former-commit-id: 1369db2e35ef37e232ddad38a02f73f5da7e4cdd --- bin/check-download_poppunk_ext_clusters.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/check-download_poppunk_ext_clusters.sh b/bin/check-download_poppunk_ext_clusters.sh index cef4b67..3239da0 100755 --- a/bin/check-download_poppunk_ext_clusters.sh +++ b/bin/check-download_poppunk_ext_clusters.sh @@ -9,7 +9,7 @@ if [ ! -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" ] || \ [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}")" ] || \ [ ! 
-f "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" ]; then - rm -f "${EXT_CLUSTERS_LOCAL}" + rm -rf "${EXT_CLUSTERS_LOCAL}" mkdir -p "${EXT_CLUSTERS_LOCAL}" wget "$EXT_CLUSTERS_REMOTE" -O "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" jq -n \ From b2162248fe2df6612925efafa9e0c0062d72fb11 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 15 Aug 2023 10:21:35 +0000 Subject: [PATCH 105/157] Update schema based on latest changes Former-commit-id: cd1a0d18f759bc8f7fd64787ef0e6c6b19cb2719 --- nextflow_schema.json | 112 ++++++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 48 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index be58342..59049cd 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -13,21 +13,34 @@ "properties": { "reads": { "type": "string", - "description": "Path to the input directory that contains the reads to be processed." + "description": "Path to the input directory that contains the reads to be processed.", + "format": "directory-path" }, "output": { "type": "string", - "description": "Path to the output directory that saves the results." + "description": "Path to the output directory that saves the results.", + "format": "directory-path" + }, + "db": { + "type": "string", + "description": "Path to the directory for saving the databases used by the pipeline.", + "format": "directory-path" }, "assembly_publish": { "type": "string", "description": "Method used by Nextflow to publish the generated assemblies.", "hidden": true, + "enum": [ + "link", + "symlink", + "copy" + ] } }, "required": [ "reads", "output", + "db", "assembly_publish" ] }, "qc_parameters": { "title": "QC Parameters", "type": "object", "description": "", "default": "", "properties": { "spneumo_percentage": { "type": "number", "description": "Minimum S. pneumoniae percentage in reads to pass Taxonomy QC.", "hidden": true, + "minimum": 0, + "maximum": 100 }, "ref_coverage": { "type": "number", "description": "Minimum reference coverage percentage by the reads to pass Mapping QC.", "hidden": true, + "minimum": 0, + "maximum": 100 }, "het_snp_site": { "type": "integer", "description": "Maximum non-cluster heterozygous SNP (Het-SNP) site count to pass Mapping QC.", "hidden": true, + "minimum": 0 }, "contigs": { "type": "integer", "hidden": true, - "description": "Maximum contig count in assembly to pass Assembly QC." + "description": "Maximum contig count in assembly to pass Assembly QC.", + "minimum": 0 }, "length_low": { "type": "integer", "hidden": true, - "description": "Minimum assembly length to pass Assembly QC." + "description": "Minimum assembly length to pass Assembly QC.", + "minimum": 0 }, "length_high": { "type": "integer", "hidden": true, - "description": "Maximum assembly length to pass Assembly QC." + "description": "Maximum assembly length to pass Assembly QC.", + "minimum": 0 }, "depth": { "type": "number", "hidden": true, - "description": "Minimum sequencing depth to pass Assembly QC." + "description": "Minimum sequencing depth to pass Assembly QC.", + "minimum": 0 } }, "required": [ "spneumo_percentage", "ref_coverage", "het_snp_site", "contigs", "length_low", "length_high", "depth" ] }, "assembly": { "title": "Assembly", "type": "object", "description": "", "default": "", "properties": { "assembler": { "type": "string", "hidden": true, - "description": "SPAdes-based assembler used to assemble the reads." + "description": "SPAdes-based assembler used to assemble the reads.", + "enum": [ "shovill", "unicycler" ] }, "min_contig_length": { "type": "integer", "description": "Minimum length of contig to be included in the assembly.", "hidden": true, + "minimum": 0 } }, "required": [ "assembler", "min_contig_length" ] }, "mapping": { "title": "Mapping", "type": "object", "description": "", "default": "", "properties": { "ref_genome": { "type": "string", "hidden": true, - "description": "Path to the reference genome for mapping." + "description": "Path to the reference genome for mapping.", + "pattern": ".+\\.(fa|fasta)$", + "format": "file-path", + "mimetype": "text/x-fasta" - }, - "ref_genome_bwa_db_local": { - "type": "string", - "description": "Path to the directory where the reference genome FM-index database for BWA should be saved to.", - "hidden": true - } } }, "required": [ - "ref_genome", - "ref_genome_bwa_db_local" + "ref_genome" ] }, "taxonomy": { "title": "Taxonomy", "type": "object", "description": "", "default": "", "properties": { "kraken2_db_remote": { "type": "string", "hidden": true, - "description": "URL to a Kraken2 database." + "description": "URL to a Kraken2 database.", + "pattern": "^(https?:\\/\\/)?(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)\\.(tar\\.gz|tgz)$" - }, - "kraken2_db_local": { - "type": "string", - "hidden": true, - "description": "Path to the directory where the remote Kraken2 database should be saved to." }, "kraken2_memory_mapping": { "type": "boolean", "hidden": true, "description": "Whether to use the memory mapping option of Kraken2." } }, "required": [ "kraken2_db_remote", - "kraken2_db_local", "kraken2_memory_mapping" ] }, "serotype": { "title": "Serotype", "type": "object", "description": "", "default": "", "properties": { "seroba_db_remote": { "type": "string", "hidden": true, - "description": "URL to a SeroBA Git remote repository." + "description": "URL to a SeroBA Git remote repository.", + "pattern": "^(https?:\\/\\/)?(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)\\.git$" - }, - "seroba_db_local": { - "type": "string", - "hidden": true, - "description": "Path to the directory where the SeroBA local repository should be saved to." }, "seroba_kmer": { "type": "integer", "hidden": true, - "description": "Kmer size for creating the KMC database of SeroBA." + "description": "Kmer size for creating the KMC database of SeroBA.", + "minimum": 0 } }, "required": [ "seroba_db_remote", - "seroba_db_local", "seroba_kmer" ] }, "lineage": { "title": "Lineage", "type": "object", "description": "", "default": "", "properties": { "poppunk_db_remote": { "type": "string", "hidden": true, - "description": "URL to a PopPUNK database." + "description": "URL to a PopPUNK database.", + "pattern": "^(https?:\\/\\/)?(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)\\.(tar\\.gz|tgz)$" }, "poppunk_ext_remote": { "type": "string", "hidden": true, - "description": "URL to a PopPUNK external clusters file." + "description": "URL to a PopPUNK external clusters file.", + "pattern": "^(https?:\\/\\/)?(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)\\.csv$" - }, - "poppunk_db_local": { - "type": "string", - "hidden": true, - "description": "Path to the directory where the remote PopPUNK database and external clusters file should be saved to." } }, "required": [ "poppunk_db_remote", - "poppunk_ext_remote", - "poppunk_db_local" + "poppunk_ext_remote" ] }, "other_amr": { "title": "Other AMR", "type": "object", "description": "", "default": "", "properties": { "ariba_ref": { "type": "string", "hidden": true, - "description": "Path to the reference sequences for ARIBA." + "description": "Path to the reference sequences for ARIBA.", + "pattern": ".+\\.(fa|fasta)$", + "format": "file-path", + "mimetype": "text/x-fasta" }, "ariba_metadata": { "type": "string", "hidden": true, - "description": "Path to the metadata file for ARIBA."
+ "description": "Path to the metadata file for ARIBA.", + "pattern": "/.+\\.tsv$", + "format": "file-path", + "mimetype": "text/tab-separated-values" } }, "required": [ @@ -248,7 +263,8 @@ "singularity_cachedir": { "type": "string", "description": "Path to the directory where Singularity images should be saved to.", - "hidden": true + "hidden": true, + "format": "directory-path" } }, "required": [ From 052177f5cd1cce5cfb786d1b1113856090530753 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 15 Aug 2023 10:53:45 +0000 Subject: [PATCH 106/157] Fix database info output when saved externally Former-commit-id: 9ae5c39eadc0c86da645c8d44e73ec425c36bba7 --- modules/info.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/info.nf b/modules/info.nf index 54b271f..264bf8e 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -28,12 +28,12 @@ process DATABASES { label 'farm_low' input: - val bwa_db_path - val ariba_db_path - val kraken2_db_path - val seroba_db_path - val poppunk_db_path - val poppunk_ext_path + path bwa_db_path + path ariba_db_path + path kraken2_db_path + path seroba_db_path + path poppunk_db_path + path poppunk_ext_path output: path(json), emit: json From 7f9b53b0c4cea5f3172bfb9bab99ea3042ccaf95 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 15 Aug 2023 11:47:06 +0000 Subject: [PATCH 107/157] Use publishDir to save info.txt for NF Tower Former-commit-id: a8647192aee88724e3e416569bbc7660a609964e --- modules/info.nf | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/info.nf b/modules/info.nf index 264bf8e..c55f537 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -322,16 +322,20 @@ process PRINT { process SAVE { label 'farm_local' + publishDir "${params.output}", mode: "copy" + input: val coreText val dbText val toolText val imageText + output: + path "info.txt", emit: info + exec: File readsDir = new File(params.reads) File outputDir = new File(params.output) - outputDir.mkdirs() def textRow = { leftSpace, rightSpace, leftContent, rightContent -> String.format("║ %-${leftSpace}s │ %-${rightSpace}s ║", leftContent, rightContent) @@ -407,7 +411,7 @@ process SAVE { |╚═══════════════════════════╧═══════════════════════════════════════════════════════════════╝ |""".stripMargin() - File output = new File("${params.output}/info.txt") + File output = new File("${task.workDir}/info.txt") output.write( """\ |${coreText} From b37320b1fc19c36019a52bde21c2a6dd5e8e219c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 15 Aug 2023 13:03:40 +0000 Subject: [PATCH 108/157] Reflect changes from databases consolidation Former-commit-id: 5d3c0c9618a558d81fa5ac0b717b55fbdb24f000 --- README.md | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index f6cdd91..f6bbe29 100644 --- a/README.md +++ b/README.md @@ -178,10 +178,13 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `--help` | `true` or `false`
(Default: `false`)| Show help message.
Can be enabled by including `--help` without value. | ## Input and Output + > ⚠️ `--db` does not accept user provided local databases, directory content will be overwritten + | Option | Values | Description | | --- | ---| --- | | `--reads` | Any valid path
(Default: `"$projectDir/input"`) | Path to the input directory that contains the reads to be processed. | | `--output` | Any valid path
(Default: `"$projectDir/output"`)| Path to the output directory that save the results. | + | `--db` | Any valid path
(Default: `"$projectDir/databases"`)| Path to the directory saving databases used by the pipeline. | | `--assembly_publish` | `"link"` or `"symlink"` or `"copy"`
(Default: `"link"`)| Method used by Nextflow to publish the generated assemblies.
(The default setting `"link"` means hard link, therefore will fail if the output directory is set to outside of the working file system) | ## QC Parameters @@ -204,48 +207,33 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `--min_contig_length` | Any integer value
(Default: `500`) | Minimum length of contig to be included in the assembly. |
 
 ## Mapping
- > ⚠️ `--ref_genome_bwa_db_local` does not accept user provided local database, directory content will be overwritten
-
 | Option | Values | Description |
 | --- | ---| --- |
 | `--ref_genome` | Any valid path to a `.fa` or `.fasta` file<br>
(Default: `"$projectDir/data/ATCC_700669_v1.fa"`) | Path to the reference genome for mapping. | - | `--ref_genome_bwa_db_local` | Any valid path
(Default: `"$projectDir/databases/bwa_ref_db"`) | Path to the directory where the reference genome FM-index database for BWA should be saved to. | ## Taxonomy - > ⚠️ `--kraken2_db_local` does not accept user provided local database, directory content will be overwritten - | Option | Values | Description | | --- | ---| --- | | `--kraken2_db_remote` | Any valid URL to a Kraken2 database in `.tar.gz` or `.tgz` format
(Default: [Minikraken v1](https://genome-idx.s3.amazonaws.com/kraken/minikraken2_v1_8GB_201904.tgz)) | URL to a Kraken2 database. | - | `--kraken2_db_local` | Any valid path
(Default: `"$projectDir/databases/kraken"`) | Path to the directory where the remote Kraken2 database should be saved to. | | `--kraken2_memory_mapping` | `true` or `false`
(Default: `true`) | Whether to use the memory mapping option of Kraken2.<br>
`true` means the database is not loaded into RAM, which is suitable for environments with limited memory or fast storage. |
 
 ## Serotype
- > ⚠️ `--seroba_db_local` does not accept user provided local database, directory content will be overwritten
-
 | Option | Values | Description |
 | --- | ---| --- |
 | `--seroba_db_remote` | Any valid URL to a Git remote repository<br>
(Default: [SeroBA GitHub Repo](https://github.com/sanger-pathogens/seroba.git))| URL to a SeroBA Git remote repository. | - | `--seroba_db_local` | Any valid path
(Default: `"$projectDir/databases/seroba"`) | Path to the directory where SeroBA local repository should be saved to. | | `--seroba_kmer` | Any integer value
(Default: `71`) | Kmer size for creating the KMC database of SeroBA. | ## Lineage - > ⚠️ `--poppunk_db_local` does not accept user provided local database, directory content will be overwritten - | Option | Values | Description | | --- | ---| --- | | `--poppunk_db_remote` | Any valid URL to a PopPUNK database in `.tar.gz` or `.tgz` format
(Default: [GPS v6](https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz)) | URL to a PopPUNK database. | | `--poppunk_ext_remote` | Any valid URL to a PopPUNK external clusters file in `.csv` format
(Default: [GPS v6 GPSC Designation](https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv)) | URL to a PopPUNK external clusters file. | - | `--poppunk_db_local` | Any valid path
(Default: `"$projectDir/databases/poppunk"`) | Path to the directory where the remote PopPUNK database and external clusters file should be saved to. | ## Other AMR - > ⚠️ `--ariba_db_local` does not accept user provided local database, directory content will be overwritten - | Option | Values | Description | | --- | ---| --- | | `--ariba_ref` | Any valid path to a `.fa` or `.fasta` file
(Default: `"$projectDir/data/ariba_ref_sequences-20230712.fasta"`) | Path to the reference sequences for ARIBA. | | `--ariba_metadata` | Any valid path to a `tsv` file
(Default: `"$projectDir/data/ariba_metadata-20230712.tsv"`) | Path to the metadata file for ARIBA. | - | `--ariba_db_local` | Any valid path
(Default: `"$projectDir/databases/ariba"`) | Path to the directory where ARIBA reference database should be saved to. | ## Singularity > ℹ️ This section is only valid when Singularity is used as the container engine From de2e6bc71b78f5b42b54ef2d16ad1d725e6cc24d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 15 Aug 2023 13:43:28 +0000 Subject: [PATCH 109/157] Remove empty row in Experimental table Former-commit-id: c139b6e6428ffbe4564f32596354a4d83426702b --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f6bbe29..b64caf9 100644 --- a/README.md +++ b/README.md @@ -246,7 +246,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | Option | Values | Description | | --- | ---| --- | | `--lite` | `true` or `false`
(Default: `false`) | ⚠️ Enabling this option breaks the Nextflow resume function.<br>
Reduces the storage requirement by removing intermediate `.sam` and `.bam` files once they are no longer needed while the pipeline is still running.<br>
The amount of storage saved cannot be guaranteed.<br>
Can be enabled by including `--lite` without value. | -  + # Output - By default, the pipeline outputs the results into the `output` directory inside the `gps-unified-pipeline` local repository - It can be changed by adding the option `--output` From b7fbd6642c330ac96197d32890297dff49531d01 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 15 Aug 2023 13:52:15 +0000 Subject: [PATCH 110/157] Update ARIBA reference sequences Former-commit-id: c1b2b97d68baca15136256e1b299454ed0625722 --- ...uences-20230712.fasta => ariba_ref_sequences-20230815.fasta} | 2 +- nextflow.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename data/{ariba_ref_sequences-20230712.fasta => ariba_ref_sequences-20230815.fasta} (99%) diff --git a/data/ariba_ref_sequences-20230712.fasta b/data/ariba_ref_sequences-20230815.fasta similarity index 99% rename from data/ariba_ref_sequences-20230712.fasta rename to data/ariba_ref_sequences-20230815.fasta index aac7dd0..0e57bf2 100644 --- a/data/ariba_ref_sequences-20230712.fasta +++ b/data/ariba_ref_sequences-20230815.fasta @@ -25,7 +25,7 @@ ATGGAATTAATATTAAAAGCAAAAGACATTAGTGTGGAATTCAAAGGACACGATGTTTTAGATATAAATGAATTAGAAGT >mefA_10_AF376746 ATGGAAAAATACAACAATTGGAAACGAAAATTTTATGCAATATGGGCAGGGCAAGCAGTATCATTAATCACTAGTGCCATCCTGCAAATGGCGATTATTTTTTACCTTACAGAAAAAACAGGATCTGCGATGGTCTTGTCTATGGCTTCATTAGTAGGTTTTTTACCCTATGCGATTTTGGGACCTGCCATTGGTGTGCTAGTGGATCGTCATGATAGGAAGAAGATAATGATTGGTGCCGATTTAATTATCGCAGCAGCTGGTGCAGTGCTTGCTATTGTTGCATTCTGTATGGAGCTACCTGTCTGGATGATTATGATAGTATTGTTTATCCGTAGCATTGGAACAGCTTTTCATACCCCAGCACTCAATGCGGTTACACCACTTTTAGTACCAGAAGAACAGCTAACGAAATGCGCAGGCTATAGTCAGTCTTTGCAGTCTATAAGCTATATTGTTAGTCCGGCAGTTGCAGCACTCTTATACTCCGTTTGGGATTTAAATGCTATTATTGCCATCGACGTATTGGGTGCTGTGATTGCATCTATTACGGTAGCAATTGTACGTATACCTAAGCTGGGTAATCAAGTGCAAAGTTTAGAACCAAATTTCATAAGGGAGATGAAAGAAGGAGTTGTGGTTCTGAGACAAAACAAAGGATTGTTTGCCTTATTACTCTTAGGAACACTATATACTTTTGTTTATATGCCAATCAATGCACTATTTCCTTTAATAAGCATGGAACACTTTAATGGAACGCCTGTGCATATTTCTATTACGGAAATTTCCTTTGCATTTGGGATGCTAGCAGGAGGCTTATTATTAGGAAGATTAGGGGGCTTCGAAAAGCATGTATTACTAATAACAAGTTCATTTTTTATAATGGGGACCAGTTTAGCCGTTTCGGGAATACTTCCTCCAAATGGATTTGTAATATTCGTAGTTTGCTGTGCAATAATGGGGCTTTCGGTGCCATTTTATAGCGGTGTGCAAACAGCTCTTTTTCAGGAGAAAATTAAGCCTGAATATTTAGGACGTGTATTTTCTTTGATCGGAAGTATCATGTCACTTGCTATGCCAATTGGGTTAATTCTTTCTGGATTCTTTGCTGATAAAATCGGTGTAAATCATTGGTTTTTACTATCAGGTATTTTAATTATTGGCATTGCTATAGTTTGCCAAATGATAACTGAGGTTAGAAAATTAGATTTAAAATAA >mefE_AE007317 
-TTGAAAATAGATAAAAAAAACGAGGCTTTCCTTATTGTAAGTAGAGGCATATCTCGAATTGGAGATATTATGTTTGACTTTGCGAATAATACCTTTCTTGCAGGATTAAATCCAACATCTTTATCATTGGTTGCAGTATATCAGTCACTAGAAAGTGTGATAGGTGTTCTTTTTAATTTATTTGGTGGAGTCATTGCAGATAGTTTCAAGCGGAAAAAAATTATTATTGTTGCAAATATCTTATGTGGTATTGCTTGTATAATTCTTTCATTCATATCACAAGAGCAGTGGATGGTCTTTGCAATTGTCATCACTAATATTATCTTGGCATTTATGAGTGCTTTTTCTGGACCGTCCTATAAAGCATTTACAAAAGAAATTGTAAAAAAGGATAGTATATCACAACTTAATTCATTGCTAGAGATAACAAGTACTATAATTAAAGTAACAATACCAATGGTAGCAATTTTATTATATAAGCTACTTGGGATACATGGTGTTTTACTATTGGATGGATTCTCATTTCTAATTGCTGCATCACTGATTTCCTTTATTGTACCCGTTAATGACGAAGTGGTCACAAAGGATAAAATGACAATAGGAGGAGTTTTAAATGACTTAAAAATAGGGTTTAAGTATATTTATAGTCATAAGACAATATTTATGATTATTATTCTCTCTGCTTTTGTTAATTTTTTTCTAGCAGCTTATAATTTATTGTTACCTTATAGTAATCAAATGTTTGGAGAAATTTCAGATGGGCTTTATGGTGTTTTTCTAACTGCGGAAGCAATTGGAGGATTTATTGGAGCGATATTAAGTGGTGTTATAAATAAAACCTTGTCAAGCAAACGTTTAATGGTCTTCTTATCATGTTCAGGATTGATGTTAATGCTATCAACGCCACTCTATTTTTTGTTTCAAAACTTCATTATTCTAGCCTTTTCTCCGGCATTATTTAGTCTATTTATTTCTATTTTTAATATTCAATTTTTCTCTATTGTTCAAAGAGAAGTTGATACTGAGTTTCTCGGTAGAGTCTTTGGAATCATCTTTACGGTAGCTATTCTTTTTATGCCAGTTGGGTCTGGATTTTTCTCAGTAGTTTTAAATCCTAACAATACTTTTAATCTTTTTATTATTGGTGTATCTATTACGATATTATCGCTAATATTCAGCACGCTATTGAAGAGGTATGATAAAAATAGCTGA +ATGAAAATAGATAAAAAAAACGAGGCTTTCCTTATTGTAAGTAGAGGCATATCTCGAATTGGAGATATTATGTTTGACTTTGCGAATAATACCTTTCTTGCAGGATTAAATCCAACATCTTTATCATTGGTTGCAGTATATCAGTCACTAGAAAGTGTGATAGGTGTTCTTTTTAATTTATTTGGTGGAGTCATTGCAGATAGTTTCAAGCGGAAAAAAATTATTATTGTTGCAAATATCTTATGTGGTATTGCTTGTATAATTCTTTCATTCATATCACAAGAGCAGTGGATGGTCTTTGCAATTGTCATCACTAATATTATCTTGGCATTTATGAGTGCTTTTTCTGGACCGTCCTATAAAGCATTTACAAAAGAAATTGTAAAAAAGGATAGTATATCACAACTTAATTCATTGCTAGAGATAACAAGTACTATAATTAAAGTAACAATACCAATGGTAGCAATTTTATTATATAAGCTACTTGGGATACATGGTGTTTTACTATTGGATGGATTCTCATTTCTAATTGCTGCATCACTGATTTCCTTTATTGTACCCGTTAATGACGAAGTGGTCACAAAGGATAAAATGACAATAGGAGGAGTTTTAAATGACTTAAAAATAGGGTTTAAGTATATTTATAGTCATAAGACAATATTTATGATTATTATTCTCTCTGCTTTTGTTAATTTTTTTCTAGCAGCTTATAATTTATTGTTACCTTATAGTAATCAAATGTTTGGAGAAATTTCAGATGGGCTTTATGGTGTTTTTCTAACTGCGGAAGCAATTGGAGGATTTATTGGAGCGATATTAAGTGGTGTTATAAATAAAACCTTGTCAAGCAAACGTTTAATGGTCTTCTTATCATGTTCAGGATTGATGTTAATGCTATCAACGCCACTCTATTTTTTGTTTCAAAACTTCATTATTCTAGCCTTTTCTCCGGCATTATTTAGTCTATTTATTTCTATTTTTAATATTCAATTTTTCTCTATTGTTCAAAGAGAAGTTGATACTGAGTTTCTCGGTAGAGTCTTTGGAATCATCTTTACGGTAGCTATTCTTTTTATGCCAGTTGGGTCTGGATTTTTCTCAGTAGTTTTAAATCCTAACAATACTTTTAATCTTTTTATTATTGGTGTATCTATTACGATATTATCGCTAATATTCAGCACGCTATTGAAGAGGTATGATAAAAATAGCTGA >tetM_1_X92947 
ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATGTGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAATTGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTTATTACTAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGAATATACAAAAAAAAGACAACGTCTTGCATATATACGCCTTTATAGTGGAGTACTACATTTACGAGATTCGGTTAGAGTATCAGAAAAAGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAGATTGATAGAGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAACTATTGCCACAGAGAAAAAAGATTGAAAATCCGCACCCTCTACTACAAACAACTGTTGAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGCTTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTATATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGTTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG >tetM_12_FR671418 diff --git a/nextflow.config b/nextflow.config index 902d9ec..9899a29 100644 --- a/nextflow.config +++ b/nextflow.config @@ -49,7 +49,7 @@ params { depth = 20.00 // Default ARIBA referece sequences and metadata paths - ariba_ref = "$projectDir/data/ariba_ref_sequences-20230712.fasta" + ariba_ref = "$projectDir/data/ariba_ref_sequences-20230815.fasta" ariba_metadata = "$projectDir/data/ariba_metadata-20230712.tsv" // Toggle for removing .bam and .sam files mid-run to reduce storage requirement From ae0488a4ab63e40ec366de96d32ded2e5058892b Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 18 Aug 2023 13:13:37 +0000 Subject: [PATCH 111/157] Add information about Nextflow Tower Former-commit-id: f13a3493afba831f774851dde2de8d0496b9a649 --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b64caf9..4a438a0 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca - [Profile](#profile) - [Resume](#resume) - [Clean Up](#clean-up) + - [Nextflow Tower (Optional)](#nextflow-tower-optional) - [Pipeline Options](#pipeline-options) - [Alternative Workflows](#alternative-workflows) - [Input and Output](#input-and-output) @@ -155,8 +156,10 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ``` ./nextflow clean ``` - - + +## Nextflow Tower (Optional) +The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/launchpad/) of [Nextflow tower](https://tower.nf/) and 
[Nextflow `-with-tower` option](https://help.tower.nf/23.2/getting-started/deployment-options/#nextflow-with-tower). For more information, please refer to the [Nextflow Tower documentation](https://help.tower.nf/). +   # Pipeline Options - The tables below contain the available options that can be used when you run the pipeline From d9f9c4c7e36b5ee0680676966d686f864a7f6816 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 22 Aug 2023 12:11:13 +0000 Subject: [PATCH 112/157] Fix incorrect target of ermBups and ermbTr Former-commit-id: 0d5f62ea817d6f26fe9b660f4a0a605385ad64b8 --- ...riba_metadata-20230712.tsv => ariba_metadata-20230822.tsv} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename data/{ariba_metadata-20230712.tsv => ariba_metadata-20230822.tsv} (93%) diff --git a/data/ariba_metadata-20230712.tsv b/data/ariba_metadata-20230822.tsv similarity index 93% rename from data/ariba_metadata-20230712.tsv rename to data/ariba_metadata-20230822.tsv index 1a0a6ea..f68826b 100644 --- a/data/ariba_metadata-20230712.tsv +++ b/data/ariba_metadata-20230822.tsv @@ -60,8 +60,8 @@ parE_AE007317 1 1 D435N . FQ parE_AE007317 1 1 D435H . FQ parE_AE007317 1 1 P454S . FQ tetO_Y07780 1 0 . . TET -ermBups_HG799494 0 0 . . ERY -ermbTr_CP002121 0 0 . . ERY +ermBups_HG799494 0 0 . . ERY_CLI +ermbTr_CP002121 0 0 . . ERY_CLI rpoB_AE007317 1 1 D489E . RIF rpoB_AE007317 1 1 H499N . RIF rpoB_AE007317 1 1 D489N . RIF From a1a0fcc7e287a7a40e0915e595f5d72410513e68 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 22 Aug 2023 12:13:02 +0000 Subject: [PATCH 113/157] Simplify ARIBA reference files naming Former-commit-id: 6be485cd815703aefa51dc5f487c58386515d8e7 --- README.md | 8 ++++---- data/{ariba_metadata-20230822.tsv => ariba_metadata.tsv} | 0 ...sequences-20230815.fasta => ariba_ref_sequences.fasta} | 0 nextflow.config | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) rename data/{ariba_metadata-20230822.tsv => ariba_metadata.tsv} (100%) rename data/{ariba_ref_sequences-20230815.fasta => ariba_ref_sequences.fasta} (100%) diff --git a/README.md b/README.md index 4a438a0..af21bad 100644 --- a/README.md +++ b/README.md @@ -235,8 +235,8 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la ## Other AMR | Option | Values | Description | | --- | ---| --- | - | `--ariba_ref` | Any valid path to a `.fa` or `.fasta` file
(Default: `"$projectDir/data/ariba_ref_sequences-20230712.fasta"`) | Path to the reference sequences for ARIBA. | - | `--ariba_metadata` | Any valid path to a `tsv` file
(Default: `"$projectDir/data/ariba_metadata-20230712.tsv"`) | Path to the metadata file for ARIBA. | + | `--ariba_ref` | Any valid path to a `.fa` or `.fasta` file
(Default: `"$projectDir/data/ariba_ref_sequences.fasta"`) | Path to the reference sequences for ARIBA. | + | `--ariba_metadata` | Any valid path to a `tsv` file
(Default: `"$projectDir/data/ariba_metadata.tsv"`) | Path to the metadata file for ARIBA. | ## Singularity > ℹ️ This section is only valid when Singularity is used as the container engine @@ -444,8 +444,8 @@ This project uses open-source components. You can find the homepage or source co [resistanceDatabase](https://github.com/kumarnaren/resistanceDatabase) - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) - License (GPL-3.0): https://github.com/kumarnaren/resistanceDatabase/blob/main/LICENSE -- `sequences.fasta` is renamed to `ariba_ref_sequences-*.fasta` and used as-is -- `metadata.tsv` is renamed to `ariba_metadata-*.tsv` and modified +- `sequences.fasta` is renamed to `ariba_ref_sequences.fasta` and used as-is +- `metadata.tsv` is renamed to `ariba_metadata.tsv` and modified - The files are used as the default inputs of `GET_ARIBA_DB` process of the `amr.nf` module [Shovill](https://github.com/tseemann/shovill) diff --git a/data/ariba_metadata-20230822.tsv b/data/ariba_metadata.tsv similarity index 100% rename from data/ariba_metadata-20230822.tsv rename to data/ariba_metadata.tsv diff --git a/data/ariba_ref_sequences-20230815.fasta b/data/ariba_ref_sequences.fasta similarity index 100% rename from data/ariba_ref_sequences-20230815.fasta rename to data/ariba_ref_sequences.fasta diff --git a/nextflow.config b/nextflow.config index 9899a29..76bd04a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -49,8 +49,8 @@ params { depth = 20.00 // Default ARIBA referece sequences and metadata paths - ariba_ref = "$projectDir/data/ariba_ref_sequences-20230815.fasta" - ariba_metadata = "$projectDir/data/ariba_metadata-20230712.tsv" + ariba_ref = "$projectDir/data/ariba_ref_sequences.fasta" + ariba_metadata = "$projectDir/data/ariba_metadata.tsv" // Toggle for removing .bam and .sam files mid-run to reduce storage requirement // Warning: This will break the -resume function of Nextflow From c7c1ddc8763bbd014fa64f5fcf9fc191adb0fe75 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 22 Aug 2023 13:01:04 +0000 Subject: [PATCH 114/157] ARIBA AMR detection mechanism update Former-commit-id: 5c3250b0f8fa522528d8519fddbe95e5c3cfdf44 --- bin/parse_other_resistance.py | 39 ++++++++++++++++------------------- modules/amr.nf | 9 ++++---- workflows/pipeline.nf | 4 ++-- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/bin/parse_other_resistance.py b/bin/parse_other_resistance.py index a77eab6..2c5b126 100755 --- a/bin/parse_other_resistance.py +++ b/bin/parse_other_resistance.py @@ -3,20 +3,18 @@ # Output AMR of a sample based on its ARIBA report and ARIBA metadata import sys -from itertools import chain from collections import defaultdict import pandas as pd import csv # Check argv and save to global variables -if len(sys.argv) != 5: - sys.exit('Usage: get_other_resistance.py REPORT_PATH DEBUG_REPORT_PATH METADATA_PATH OUTPUT_FILE') +if len(sys.argv) != 4: + sys.exit('Usage: get_other_resistance.py DEBUG_REPORT_PATH METADATA_PATH OUTPUT_FILE') -REPORT_PATH = sys.argv[1] -DEBUG_REPORT_PATH = sys.argv[2] -METADATA_PATH = sys.argv[3] -OUTPUT_FILE = sys.argv[4] +DEBUG_REPORT_PATH = sys.argv[1] +METADATA_PATH = sys.argv[2] +OUTPUT_FILE = sys.argv[3] def main(): @@ -57,35 +55,34 @@ def prepare_dicts(): # Finding hits in ARIBA results based on targets_dict and save hits to hits_dict def find_hits(targets_dict, hits_dict): - with open(REPORT_PATH) as report, open(DEBUG_REPORT_PATH) as debug_report: - # Skip the header in report and debug 
report - next(report) + with open(DEBUG_REPORT_PATH) as debug_report: + # Skip the header in debug report next(debug_report) - # Go through lines in both report and debug report to detect targets - for line in (line.strip() for line in chain(report, debug_report)): + # Go through lines in the debug report to detect targets + for line in (line.strip() for line in debug_report): # Extract useful fields fields = [str(field) for field in line.split("\t")] ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21] - # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line - if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: - continue - # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line try: target = targets_dict[(ref_name, gene, var_only)][known_var_change] except KeyError: continue - # Logic for gene detection. Found means hit. - if var_only == "0": + # If ref_base_assembled or ref_len variable contains non-numeric value, skip the line + if not ref_base_assembled.isdigit() or not ref_len.isdigit(): + continue + + # Logic for gene detection, check coverage. + if var_only == "0" and int(ref_base_assembled)/int(ref_len) >= 0.8: hits_dict[target].add(f'{ref_name}') - - # Logic for variant detection, further criteria required + + # Logic for variant detection, coverage check is not needed, but check for other criteria if var_only == "1": # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201 - if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): + if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'indels', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' hits_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') # Common criteria: the assembly has that variant diff --git a/modules/amr.nf b/modules/amr.nf index cf080e2..319a358 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -85,13 +85,12 @@ process OTHER_RESISTANCE { tuple val(sample_id), path(read1), path(read2), path(unpaired) output: - tuple val(sample_id), path(report), path(report_debug), emit: reports + tuple val(sample_id), path(report_debug), emit: report script: - report='result/report.tsv' report_debug='result/debug.report.tsv' """ - ariba run --nucmer_min_id 80 --assembled_threshold 0.80 "$ariba_database/$database" "$read1" "$read2" result + ariba run --nucmer_min_id 80 --assembled_threshold 0 "$ariba_database/$database" "$read1" "$read2" result """ } @@ -103,7 +102,7 @@ process PARSE_OTHER_RESISTANCE { tag "$sample_id" input: - tuple val(sample_id), path(report), path(report_debug) + tuple val(sample_id), path(report_debug) path metadata output: @@ -112,6 +111,6 @@ process PARSE_OTHER_RESISTANCE { script: output_file="other_amr_report.csv" """ - parse_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file" + parse_other_resistance.py 
"$report_debug" "$metadata" "$output_file" """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index a809c58..bb58027 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -145,9 +145,9 @@ workflow PIPELINE { PARSE_PBP_RESISTANCE(PBP_RESISTANCE.out.json) // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance and determinants of other antimicrobials - // Output into Channel PARSE_OTHER_RESISTANCE.out.result + // Output into Channel PARSE_OTHER_RESISTANCE.out.report OTHER_RESISTANCE(GET_ARIBA_DB.out.path, GET_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) - PARSE_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) + PARSE_OTHER_RESISTANCE(OTHER_RESISTANCE.out.report, params.ariba_metadata) // Generate sample reports by merging outputs from all result-generating modules GENERATE_SAMPLE_REPORT( From 5daaeea317ac7ab23f01cb58b2927a49d8e58500 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 23 Aug 2023 13:20:13 +0000 Subject: [PATCH 115/157] Remove unnecessary ariba run option Former-commit-id: 7c423887b8def33acff9a1986b9cbb0287a727de --- modules/amr.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/amr.nf b/modules/amr.nf index 319a358..3e3af8e 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -90,7 +90,7 @@ process OTHER_RESISTANCE { script: report_debug='result/debug.report.tsv' """ - ariba run --nucmer_min_id 80 --assembled_threshold 0 "$ariba_database/$database" "$read1" "$read2" result + ariba run --nucmer_min_id 80 "$ariba_database/$database" "$read1" "$read2" result """ } From c234fce19bb1d5161296a59c1e4c0d0a6728eb0d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 24 Aug 2023 12:38:17 +0100 Subject: [PATCH 116/157] Improve schema for NF Tower Former-commit-id: c1a1ba3a620e9795fbd4c4accc24bdf9b4e7b8fe --- nextflow_schema.json | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 59049cd..19ba36a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -26,6 +26,11 @@ "description": "Path to the directory saving databases used by the pipeline", "format": "directory-path" }, + "singularity_cachedir": { + "type": "string", + "description": "Path to the directory where Singularity images should be saved to.", + "format": "directory-path" + }, "assembly_publish": { "type": "string", "description": "Method used by Nextflow to publish the generated assemblies.", @@ -41,6 +46,7 @@ "reads", "output", "db", + "singularity_cachedir", "assembly_publish" ] }, @@ -171,7 +177,6 @@ }, "required": [ "kraken2_db_remote", - "kraken2_db_local", "kraken2_memory_mapping" ] }, @@ -196,7 +201,6 @@ }, "required": [ "seroba_db_remote", - "seroba_db_local", "seroba_kmer" ] }, @@ -221,8 +225,7 @@ }, "required": [ "poppunk_db_remote", - "poppunk_ext_remote", - "poppunk_db_local" + "poppunk_ext_remote" ] }, "other_amr": { @@ -250,25 +253,7 @@ }, "required": [ "ariba_ref", - "ariba_metadata", - "ariba_db_local" - ] - }, - "singularity": { - "title": "Singularity", - "type": "object", - "description": "", - "default": "", - "properties": { - "singularity_cachedir": { - "type": "string", - "description": "Path to the directory where Singularity images should be saved to.", - "hidden": true, - "format": "directory-path" - } - }, - "required": [ - "singularity_cachedir" + "ariba_metadata" ] } }, @@ -296,9 +281,6 @@ }, { "$ref": 
"#/definitions/other_amr" - }, - { - "$ref": "#/definitions/singularity" } ] } \ No newline at end of file From 95882ed38ab2c22ee94f0113669447f3139d0294 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 24 Aug 2023 13:45:37 +0100 Subject: [PATCH 117/157] Add missing process labels to GET_KRAKEN2_DB Former-commit-id: 84b8f4c1a6754bef2293f01e4b636ef9a292da92 --- modules/taxonomy.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index 539ee33..0d945f2 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -2,6 +2,8 @@ process GET_KRAKEN2_DB { label 'bash_container' label 'farm_low' + label 'farm_scratchless' + label 'farm_slow' input: val remote From 0a25fb3225d3d08fd8e15e28b9a6b6b5d5ad7954 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 6 Sep 2023 15:32:27 +0100 Subject: [PATCH 118/157] Update default GPS PopPUNK database URL Former-commit-id: 00804a520d8e93cb1fcbef2aef60da1ed33db90b --- README.md | 2 +- nextflow.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index af21bad..1d2c3ed 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,7 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la | Option | Values | Description | | --- | ---| --- | | `--poppunk_db_remote` | Any valid URL to a PopPUNK database in `.tar.gz` or `.tgz` format
(Default: [GPS v6](https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz)) | URL to a PopPUNK database. | - | `--poppunk_ext_remote` | Any valid URL to a PopPUNK external clusters file in `.csv` format
(Default: [GPS v6 GPSC Designation](https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv)) | URL to a PopPUNK external clusters file. | + | `--poppunk_ext_remote` | Any valid URL to a PopPUNK external clusters file in `.csv` format
(Default: [GPS v6 GPSC Designation](https://gps-project.cog.sanger.ac.uk/GPS_v6_external_clusters.csv)) | URL to a PopPUNK external clusters file. | ## Other AMR | Option | Values | Description | diff --git a/nextflow.config b/nextflow.config index 76bd04a..cfedd6b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -37,7 +37,7 @@ params { // Default links for PopPUNK Database and External Clusters poppunk_db_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz" - poppunk_ext_remote = "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv" + poppunk_ext_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v6_external_clusters.csv" // Default values for QC spneumo_percentage = 60.00 From ee07673f63dcc6409ca878b253ed07803548407f Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:58:05 +0100 Subject: [PATCH 119/157] Update fastp version Former-commit-id: 46e94bbf96210d1a0e24eedbc9eaad674f6989bd --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index cfedd6b..1345d6b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,7 +69,7 @@ process { container = 'amancevice/pandas:2.0.2' } withLabel: fastp_container { - container = 'staphb/fastp:0.23.2' + container = 'staphb/fastp:0.23.4' } withLabel: unicycler_container { container = 'staphb/unicycler:0.5.0' From 85a1955b15cb132a324fac9502963406cef64f23 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 19 Sep 2023 15:23:12 +0000 Subject: [PATCH 120/157] Correct folP variant nucleotide range Former-commit-id: 155efe83c17f26e0794f2ac01f6eace8fdb4bf69 --- bin/parse_other_resistance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/parse_other_resistance.py b/bin/parse_other_resistance.py index 2c5b126..ae346d1 100755 --- a/bin/parse_other_resistance.py +++ b/bin/parse_other_resistance.py @@ -81,8 +81,8 @@ def find_hits(targets_dict, hits_dict): # Logic for variant detection, coverage check is not needed, but check for other criteria if var_only == "1": - # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201 - if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'indels', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): + # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 166-201 (covering changes affecting aa 56 - 67) + if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'indels', 'ins', 'multiple') and (166 <= int(ref_start) <= 201 or 166 <= int(ref_end) <= 201): pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' hits_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') # Common criteria: the assembly has that variant From 0002f249f2f01f2fe8b5724b25e8f113689edb25 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 26 Sep 2023 13:18:02 +0000 Subject: [PATCH 121/157] Add mapped read depth check for gene detection Former-commit-id: 034b6b0dde8c937664c6ad5ef8dbe79f5a5791df --- bin/parse_other_resistance.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/parse_other_resistance.py b/bin/parse_other_resistance.py index 
ae346d1..2077664 100755 --- a/bin/parse_other_resistance.py +++ b/bin/parse_other_resistance.py @@ -63,7 +63,7 @@ def find_hits(targets_dict, hits_dict): for line in (line.strip() for line in debug_report): # Extract useful fields fields = [str(field) for field in line.split("\t")] - ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21] + ref_name, gene, var_only, ref_len, ref_base_assembled, ctg_cov, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[12], fields[16], fields[17], fields[19], fields[20], fields[21] # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line try: @@ -71,12 +71,12 @@ def find_hits(targets_dict, hits_dict): except KeyError: continue - # If ref_base_assembled or ref_len variable contains non-numeric value, skip the line - if not ref_base_assembled.isdigit() or not ref_len.isdigit(): + # If ref_base_assembled or ref_len or ctg_cov variables contain non-numeric value, skip the line + if not ref_base_assembled.isdigit() or not ref_len.isdigit() or not ctg_cov.replace('.', '', 1).isdigit(): continue - # Logic for gene detection, check coverage. - if var_only == "0" and int(ref_base_assembled)/int(ref_len) >= 0.8: + # Logic for gene detection, check coverage and mapped read depth. + if var_only == "0" and int(ref_base_assembled)/int(ref_len) >= 0.8 and float(ctg_cov) >= 20: hits_dict[target].add(f'{ref_name}') # Logic for variant detection, coverage check is not needed, but check for other criteria From ebe5a03f271acc7857dd3997c7f9c9aa2ee3067b Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 27 Sep 2023 15:24:17 +0100 Subject: [PATCH 122/157] Ensure sample report generation is stable Former-commit-id: cc1fa25bbb2834a2f3eb75063c5f29a14c80c320 --- bin/generate_sample_report.sh | 2 +- modules/output.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh index 2a82172..3a03a37 100755 --- a/bin/generate_sample_report.sh +++ b/bin/generate_sample_report.sh @@ -1,5 +1,5 @@ # Combine all csv reports into a single csv, then add Sample_ID as the first field -paste -d , *.csv \ +paste -d , ${sample_id}_process_report_*.csv \ | sed '1 s/^/\"Sample_ID\",/' \ | sed "2 s/^/\"${SAMPLE_ID}\",/" > "$SAMPLE_REPORT" diff --git a/modules/output.nf b/modules/output.nf index 0fdbfc6..9d7f859 100644 --- a/modules/output.nf +++ b/modules/output.nf @@ -5,7 +5,7 @@ process GENERATE_SAMPLE_REPORT { tag "$sample_id" input: - tuple val(sample_id), path ('report*.csv') + tuple val(sample_id), path("${sample_id}_process_report_?.csv") output: path sample_report, emit: report From 3218d043faa23222a8fbea61da33313b45aba9fa Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 27 Sep 2023 15:29:04 +0000 Subject: [PATCH 123/157] Fix variable name Former-commit-id: 747208e1368509a8095d85176ed3d1054b9a41c0 --- bin/generate_sample_report.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh index 3a03a37..4e1ef25 100755 --- a/bin/generate_sample_report.sh +++ b/bin/generate_sample_report.sh @@ -1,5 
+1,5 @@ # Combine all csv reports into a single csv, then add Sample_ID as the first field -paste -d , ${sample_id}_process_report_*.csv \ +paste -d , ${SAMPLE_ID}_process_report_*.csv \ | sed '1 s/^/\"Sample_ID\",/' \ | sed "2 s/^/\"${SAMPLE_ID}\",/" > "$SAMPLE_REPORT" From 4186dcf8b1692a1e2a8ded17405f8a4928b26e06 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 27 Sep 2023 15:29:23 +0000 Subject: [PATCH 124/157] Quote variable input Former-commit-id: 85d4140aefa5232a1d74ff300f31072956478705 --- modules/output.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/output.nf b/modules/output.nf index 9d7f859..a0d20d1 100644 --- a/modules/output.nf +++ b/modules/output.nf @@ -13,8 +13,8 @@ process GENERATE_SAMPLE_REPORT { script: sample_report="${sample_id}_report.csv" """ - SAMPLE_ID=$sample_id - SAMPLE_REPORT=$sample_report + SAMPLE_ID="$sample_id" + SAMPLE_REPORT="$sample_report" source generate_sample_report.sh """ From 40a4c6e98b6c56d23856d720810225f181c0ddea Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 28 Sep 2023 10:48:56 +0000 Subject: [PATCH 125/157] Improve input handling Former-commit-id: 6fc30612a7f2360f34cb46e3b0872879556ec802 --- bin/generate_overall_report.py | 6 +++--- modules/output.nf | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index 59a1737..a02fb38 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -25,8 +25,8 @@ # Check argv and save to global variables if len(sys.argv) != 4: - sys.exit('Usage: generate_overall_report.py WORKDIR_PATH ARIBA_METADATA OUTPUT_FILE') -WORKDIR_PATH = sys.argv[1] + sys.exit('Usage: generate_overall_report.py INPUT_PATTERN ARIBA_METADATA OUTPUT_FILE') +INPUT_PATTERN = sys.argv[1] ARIBA_METADATA = sys.argv[2] OUTPUT_FILE = sys.argv[3] @@ -79,7 +79,7 @@ def get_df_output(output_columns): # Generate a dataframe for each sample report and then concat df_manifest and all dataframes into df_output dfs = [df_manifest] - reports = glob.glob(WORKDIR_PATH +'/*.csv') + reports = glob.glob(INPUT_PATTERN) for report in reports: df = pd.read_csv(report, dtype=str) dfs.append(df) diff --git a/modules/output.nf b/modules/output.nf index a0d20d1..dcfbc26 100644 --- a/modules/output.nf +++ b/modules/output.nf @@ -27,15 +27,16 @@ process GENERATE_OVERALL_REPORT { publishDir "${params.output}", mode: "copy" input: - path 'report*.csv' - path 'ariba_metadata' + path '*' + path ariba_metadata output: path "$overall_report", emit: report script: + input_pattern='*_report.csv' overall_report='results.csv' """ - generate_overall_report.py `pwd` $ariba_metadata $overall_report + generate_overall_report.py '$input_pattern' $ariba_metadata $overall_report """ } From 5a5530bb4c054da4a9aca0ceab668ca2ad013a16 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 2 Oct 2023 22:04:56 +0000 Subject: [PATCH 126/157] Update custom images Former-commit-id: 515a812d30fd7146eb91a2175d1c3b780b555c58 --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 1345d6b..2e2f19b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -93,7 +93,7 @@ process { container = 'staphb/poppunk:2.6.0' } withLabel: spn_pbp_amr_container { - container = 'harryhungch/spn-pbp-amr:23.01.16' + container = 
'sangerbentleygroup/spn-pbp-amr:23.10.2' } withLabel: ariba_container { container = 'staphb/ariba:2.14.6' @@ -105,7 +105,7 @@ process { container = 'staphb/kraken2:2.1.2-no-db' } withLabel: seroba_container { - container = 'harryhungch/seroba:1.0.3' + container = 'sangerbentleygroup/seroba:1.0.4' } } From 61bef6040f5bfd5e23d4cee42f6d6f325fc27c9b Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 2 Oct 2023 22:07:54 +0000 Subject: [PATCH 127/157] Update descriptions of custom images Former-commit-id: 3bd004df80fbded81701dd7f547ddc7f260b3f9a --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1d2c3ed..567262d 100644 --- a/README.md +++ b/README.md @@ -437,7 +437,7 @@ This project uses open-source components. You can find the homepage or source co [SeroBA](https://sanger-pathogens.github.io/seroba/) - **SeroBA: rapid high-throughput serotyping of Streptococcus pneumoniae from whole genome sequence data**. Epping L, van Tonder, AJ, Gladstone RA, GPS Consortium, Bentley SD, Page AJ, Keane JA, Microbial Genomics 2018, doi: [10.1099/mgen.0.000186](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000186) - License (GPL-3.0): https://github.com/sanger-pathogens/seroba/blob/master/LICENSE -- This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/seroba) +- This project uses a Docker image of a [fork](https://github.com/sanger-bentley-group/seroba) - The fork includes critical bug fixes for SeroBA as the original repository is no longer maintained - The Docker image provides the containerised environment with SeroBA for `GET_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module @@ -457,7 +457,7 @@ This project uses open-source components. You can find the homepage or source co - [Pathogenwatch](https://pathogen.watch/) ([@pathogenwatch-oss](https://github.com/pathogenwatch-oss)) - License (MIT): https://github.com/pathogenwatch-oss/spn-resistance-pbp/blob/main/LICENSE - This is a modified version of [AMR predictor](https://github.com/BenJamesMetcalf/Spn_Scripts_Reference) by Ben Metcalf ([@BenJamesMetcalf](https://github.com/BenJamesMetcalf)) at the Centre for Disease Control (CDC) -- This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/spn-resistance-pbp) +- This project uses a Docker image of a [fork](https://github.com/sanger-bentley-group/spn-pbp-amr) - The fork changes the Docker image from a Docker executable image to a Docker environment for Nextflow integration - The Docker image provides the containerised environment with SPN-PBP-MAR for `PBP_RESISTANCE` process of the `amr.nf` module From 624129288ac2360f8dfd7c7f72b3e445a647b577 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 2 Oct 2023 22:49:28 +0000 Subject: [PATCH 128/157] Update default SeroBA remote database Former-commit-id: be3d9ee6cba736c6e17ce009ffcd18cd67be3d2c --- README.md | 2 +- nextflow.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 567262d..34e62b6 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,7 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la ## Serotype | Option | Values | Description | | --- | ---| --- | - | `--seroba_db_remote` | Any valid URL to a Git remote repository
(Default: [SeroBA GitHub Repo](https://github.com/sanger-pathogens/seroba.git))| URL to a SeroBA Git remote repository. | + | `--seroba_db_remote` | Any valid URL to a Git remote repository
(Default: [SeroBA Fork GitHub Repo](https://github.com/sanger-bentley-group/seroba.git))| URL to a SeroBA Git remote repository. | | `--seroba_kmer` | Any integer value
(Default: `71`) | Kmer size for creating the KMC database of SeroBA. | ## Lineage diff --git a/nextflow.config b/nextflow.config index 2e2f19b..afba5b1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,7 +25,7 @@ params { assembly_publish = "link" // Default git repository, and KMC kmer size for SeroBA - seroba_db_remote = "https://github.com/sanger-pathogens/seroba.git" + seroba_db_remote = "https://github.com/sanger-bentley-group/seroba.git" seroba_kmer = 71 // Default link for Kraken2 Database, and usage of memory mapping From c6975d29215a6abea96727542c882afbee0abb54 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 4 Oct 2023 00:30:02 +0000 Subject: [PATCH 129/157] Change SeroBA database to release-based Former-commit-id: a33663d078868e73092b3b1abafc60e7dbc11d1c --- README.md | 7 +---- bin/check-create_seroba_db.sh | 34 ++++++++++++++++++++++++ bin/check_seroba_db.sh | 20 -------------- bin/create_seroba_db.sh | 9 ------- bin/get_serotype.sh | 2 +- bin/save_databases_info.sh | 6 ++--- modules/serotype.nf | 49 +++++------------------------------ modules/validate.nf | 8 +----- nextflow.config | 7 ++--- workflows/init.nf | 7 +++-- workflows/pipeline.nf | 9 +++---- 11 files changed, 56 insertions(+), 102 deletions(-) create mode 100755 bin/check-create_seroba_db.sh delete mode 100755 bin/check_seroba_db.sh delete mode 100755 bin/create_seroba_db.sh diff --git a/README.md b/README.md index 34e62b6..a1f7436 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,7 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la ## Serotype | Option | Values | Description | | --- | ---| --- | - | `--seroba_db_remote` | Any valid URL to a Git remote repository
(Default: [SeroBA Fork GitHub Repo](https://github.com/sanger-bentley-group/seroba.git))| URL to a SeroBA Git remote repository. | + | `--seroba_db_remote` | Any valid URL to a SeroBA release in `.tar.gz` or `.tgz` format
(Default: [SeroBA v1.0.4](https://github.com/sanger-bentley-group/seroba/archive/refs/tags/v1.0.4.tar.gz))| URL to a SeroBA release. | | `--seroba_kmer` | Any integer value
(Default: `71`) | Kmer size for creating the KMC database of SeroBA. | ## Lineage @@ -378,11 +378,6 @@ This project uses open-source components. You can find the homepage or source co - License (GPL-3.0): https://github.com/StaPH-B/docker-builds/blob/master/LICENSE - These Docker images provide containerised environments with different bioinformatics tools for processes of multiple modules -[Docker Image of Git](https://hub.docker.com/r/bitnami/git) -- [Bitnami](https://bitnami.com/) ([@Bitnami](https://github.com/bitnami)) -- License (Apache 2.0): https://github.com/bitnami/containers/blob/main/LICENSE.md -- This Docker image provides the containerised environment with Git for `CHECK_SEROBA_DB` process of the `serotype.nf` module - [Docker Image of network-multitool](https://hub.docker.com/r/wbitt/network-multitool) - [Wbitt - We Bring In Tomorrow's Technolgies](https://wbitt.com/) ([@WBITT](https://github.com/wbitt)) - License (MIT): https://github.com/wbitt/Network-MultiTool/blob/master/LICENSE diff --git a/bin/check-create_seroba_db.sh b/bin/check-create_seroba_db.sh new file mode 100755 index 0000000..365c321 --- /dev/null +++ b/bin/check-create_seroba_db.sh @@ -0,0 +1,34 @@ +# Check if database was downloaded from specific link, also prepared by the specific Kmer +# If not: remove files in database directory and download, re-create KMC and ARIBA databases, also save metadata to JSON + +ZIPPED_REPO='seroba.tar.gz' + +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$(grep '"url"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$DB_REMOTE" ] || \ + [ ! "$(grep '"kmer"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$KMER" ] || \ + [ ! -d "${DB_LOCAL}/ariba_db" ] || \ + [ ! -d "${DB_LOCAL}/kmer_db" ] || \ + [ ! -d "${DB_LOCAL}/streptococcus-pneumoniae-ctvdb"] || \ + [ ! -f "${DB_LOCAL}/cd_cluster.tsv" ] || \ + [ ! -f "${DB_LOCAL}/cdhit_cluster" ] || \ + [ ! -f "${DB_LOCAL}/kmer_size.txt" ] || \ + [ ! -f "${DB_LOCAL}/meta.tsv" ] || \ + [ ! -f "${DB_LOCAL}/reference.fasta" ]; then + + rm -rf "${DB_LOCAL}" + + wget "${DB_REMOTE}" -O $ZIPPED_REPO + + mkdir tmp + tar -xzf $ZIPPED_REPO --strip-components=1 -C tmp + + mkdir -p "${DB_LOCAL}" + mv tmp/database/* "${DB_LOCAL}" + + seroba createDBs "${DB_LOCAL}" "${KMER}" + + rm -f $ZIPPED_REPO + + echo -e "{\n \"url\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" + +fi diff --git a/bin/check_seroba_db.sh b/bin/check_seroba_db.sh deleted file mode 100755 index fd9578d..0000000 --- a/bin/check_seroba_db.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Check if database was cloned from specific link and is up-to-date, also prepared by the specific Kmer -# If not: remove files in database directory and clone, set CREATE_DB to true - -# Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage - -if [ ! -f "${DB_LOCAL}"/"${JSON_FILE}" ] || \ - [ ! "$(grep 'git' "${DB_LOCAL}"/"${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "${DB_REMOTE}" ] || \ - [ ! "$(grep 'kmer' "${DB_LOCAL}"/"${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "${KMER}" ] || \ - ! 
( (git -C "${DB_LOCAL}" pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date' ); then - - rm -rf "${DB_LOCAL}" - git clone "${DB_REMOTE}" "${DB_LOCAL}" - - CREATE_DB=true - -else - - CREATE_DB=false - -fi diff --git a/bin/create_seroba_db.sh b/bin/create_seroba_db.sh deleted file mode 100755 index 3ff36b2..0000000 --- a/bin/create_seroba_db.sh +++ /dev/null @@ -1,9 +0,0 @@ -# If create_db is true: re-create KMC and ARIBA databases, also save metadata to JSON - -if [ "$CREATE_DB" = true ]; then - - seroba createDBs "${DB_LOCAL}/${DATABASE}/" "${KMER}" - - echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" - -fi diff --git a/bin/get_serotype.sh b/bin/get_serotype.sh index 560cdd7..e4d83e5 100755 --- a/bin/get_serotype.sh +++ b/bin/get_serotype.sh @@ -1,7 +1,7 @@ # Run SeroBA to serotype samples { - seroba runSerotyping "${SEROBA_DIR}/${DATABASE}" "$READ1" "$READ2" "$SAMPLE_ID" && SEROTYPE=$(awk -F'\t' '{ print $2 }' "${SAMPLE_ID}/pred.tsv") + seroba runSerotyping "${SEROBA_DB}" "$READ1" "$READ2" "$SAMPLE_ID" && SEROTYPE=$(awk -F'\t' '{ print $2 }' "${SAMPLE_ID}/pred.tsv") } || { SEROTYPE="SEROBA FAILURE" } diff --git a/bin/save_databases_info.sh b/bin/save_databases_info.sh index 95cba9e..a932b03 100755 --- a/bin/save_databases_info.sh +++ b/bin/save_databases_info.sh @@ -35,15 +35,15 @@ add_ariba_db () { add_seroba_db () { SEROBA_DB_JSON="${SEROBA_DB_PATH}/${SEROBA_JSON}" if [ -f "$SEROBA_DB_JSON" ]; then - GIT=$(jq -r .git "$SEROBA_DB_JSON") + URL=$(jq -r .url "$SEROBA_DB_JSON") KMER=$(jq -r .kmer "$SEROBA_DB_JSON") CREATE_TIME=$(jq -r .create_time "$SEROBA_DB_JSON") else - GIT="Not yet created" + URL="Not yet created" KMER="Not yet created" CREATE_TIME="Not yet created" fi - jq -n --arg git "$GIT" --arg kmer "$KMER" --arg create_time "$CREATE_TIME" '. = {"git": $git, "kmer": $kmer, "create_time": $create_time}' + jq -n --arg url "$URL" --arg kmer "$KMER" --arg create_time "$CREATE_TIME" '. 
= {"url": $url, "kmer": $kmer, "create_time": $create_time}' } add_url_db () { diff --git a/modules/serotype.nf b/modules/serotype.nf index 18a23c7..8903f46 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -1,60 +1,28 @@ -// Return boolean of CREATE_DB, remove and clone if necessary -process CHECK_SEROBA_DB { - label 'git_container' - label 'farm_low' - label 'farm_scratchless' - label 'farm_slow' - - input: - val remote - path db - val kmer - - output: - env CREATE_DB, emit: create_db - - script: - seroba_db="${db}/seroba" - json='done_seroba.json' - """ - DB_REMOTE="$remote" - DB_LOCAL="$seroba_db" - KMER="$kmer" - JSON_FILE="$json" - - source check_seroba_db.sh - """ -} - -// Return SeroBA databases path, create databases if necessary +// Return SeroBA databases path, download and create databases if necessary process GET_SEROBA_DB { label 'seroba_container' label 'farm_low' label 'farm_scratchless' + label 'farm_slow' input: val remote path db - val create_db val kmer output: path seroba_db, emit: path - val database, emit: database script: seroba_db="${db}/seroba" - database='database' json='done_seroba.json' """ - DATABASE="$database" DB_REMOTE="$remote" DB_LOCAL="$seroba_db" KMER="$kmer" - CREATE_DB="$create_db" JSON_FILE="$json" - source create_seroba_db.sh + source check-create_seroba_db.sh """ } @@ -66,8 +34,7 @@ process SEROTYPE { tag "$sample_id" input: - path seroba_dir - val database + path seroba_db tuple val(sample_id), path(read1), path(read2), path(unpaired) output: @@ -80,8 +47,7 @@ process SEROTYPE { // Workaround: create and use a subdirectory to alter the path if (workflow.containerEngine === 'docker') """ - SEROBA_DIR="$seroba_dir" - DATABASE="$database" + SEROBA_DB="$seroba_db" READ1="$read1" READ2="$read2" SAMPLE_ID="$sample_id" @@ -91,14 +57,13 @@ process SEROTYPE { """ else if (workflow.containerEngine === 'singularity') """ - SEROBA_DIR="$seroba_dir" - DATABASE="$database" + SEROBA_DB="$seroba_db" READ1="$read1" READ2="$read2" SAMPLE_ID="$sample_id" SEROTYPE_REPORT="$serotype_report" - mkdir SEROBA_WORKDIR && mv $seroba_dir $read1 $read2 SEROBA_WORKDIR && cd SEROBA_WORKDIR + mkdir SEROBA_WORKDIR && mv $seroba_db $read1 $read2 SEROBA_WORKDIR && cd SEROBA_WORKDIR source get_serotype.sh diff --git a/modules/validate.nf b/modules/validate.nf index b994678..ce5a55e 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -9,7 +9,7 @@ validParams = [ assembler: 'assembler', min_contig_length: 'int', assembly_publish: 'publish_mode', - seroba_db_remote: 'url_git', + seroba_db_remote: 'url_targz', seroba_kmer: 'int', kraken2_db_remote: 'url_targz', kraken2_memory_mapping: 'boolean', @@ -150,12 +150,6 @@ void validate(Map params) { } break - case 'url_git': - if (!(value ==~ /^(https?:\/\/)?(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)\.git$/)) { - invalidValues[key] = [value, 'URL that points a Git remote repository (valid URL ending with .git)'] - } - break - case 'url_targz': if (!(value ==~ /^(https?:\/\/)?(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)\.(tar\.gz|tgz)$/)) { invalidValues[key] = [value, 'URL that points a .tar.gz file (valid URL ending with .tar.gz or .tgz)'] diff --git a/nextflow.config b/nextflow.config index afba5b1..b70ee63 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,8 +24,8 @@ params { // Default assembly publish mode assembly_publish = "link" - // Default git repository, and KMC kmer size for SeroBA - seroba_db_remote = 
"https://github.com/sanger-bentley-group/seroba.git" + // Default link for SeroBA repository, and KMC kmer size for SeroBA + seroba_db_remote = "https://github.com/sanger-bentley-group/seroba/archive/refs/tags/v1.0.4.tar.gz" seroba_kmer = 71 // Default link for Kraken2 Database, and usage of memory mapping @@ -62,9 +62,6 @@ process { withLabel: bash_container { container = 'wbitt/network-multitool:69aa4d5' } - withLabel: git_container{ - container = 'bitnami/git:2.39.0' - } withLabel: python_container { container = 'amancevice/pandas:2.0.2' } diff --git a/workflows/init.nf b/workflows/init.nf index 9a29e62..374ec2f 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -2,7 +2,7 @@ include { GET_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" include { GET_KRAKEN2_DB } from "$projectDir/modules/taxonomy" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS } from "$projectDir/modules/lineage" -include { CHECK_SEROBA_DB; GET_SEROBA_DB } from "$projectDir/modules/serotype" +include { GET_SEROBA_DB } from "$projectDir/modules/serotype" include { GET_DOCKER_COMPOSE; PULL_IMAGES } from "$projectDir/modules/docker" include { GET_ARIBA_DB } from "$projectDir/modules/amr" @@ -17,9 +17,8 @@ workflow INIT { // Check Kraken2 Database, download if necessary GET_KRAKEN2_DB(params.kraken2_db_remote, params.db) - // Check SeroBA Databases, clone and rebuild if necessary - CHECK_SEROBA_DB(params.seroba_db_remote, params.db, params.seroba_kmer) - GET_SEROBA_DB(params.seroba_db_remote, params.db, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) + // Check SeroBA Databases, download and rebuild if necessary + GET_SEROBA_DB(params.seroba_db_remote, params.db, params.seroba_kmer) // Check to PopPUNK Database and External Clusters, download if necessary GET_POPPUNK_DB(params.poppunk_db_remote, params.db) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index bb58027..ac427ca 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -5,7 +5,7 @@ include { GET_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_C include { GET_KRAKEN2_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" include { OVERALL_QC } from "$projectDir/modules/overall_qc" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/modules/lineage" -include { CHECK_SEROBA_DB; GET_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" +include { GET_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" include { PBP_RESISTANCE; PARSE_PBP_RESISTANCE; GET_ARIBA_DB; OTHER_RESISTANCE; PARSE_OTHER_RESISTANCE } from "$projectDir/modules/amr" include { GENERATE_SAMPLE_REPORT; GENERATE_OVERALL_REPORT } from "$projectDir/modules/output" @@ -19,9 +19,8 @@ workflow PIPELINE { // Get path to Kraken2 Database, download if necessary GET_KRAKEN2_DB(params.kraken2_db_remote, params.db) - // Get path to SeroBA Databases, clone and rebuild if necessary - CHECK_SEROBA_DB(params.seroba_db_remote, params.db, params.seroba_kmer) - GET_SEROBA_DB(params.seroba_db_remote, params.db, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) + // Get path SeroBA Databases, download and rebuild if necessary + GET_SEROBA_DB(params.seroba_db_remote, params.db, params.seroba_kmer) // Get paths to PopPUNK Database and External Clusters, download if necessary GET_POPPUNK_DB(params.poppunk_db_remote, params.db) @@ -133,7 +132,7 @@ workflow PIPELINE { // From Channel OVERALL_QC_PASSED_READS_ch, serotype the preprocess reads of samples passed overall QC // 
Output into Channel SEROTYPE.out.report - SEROTYPE(GET_SEROBA_DB.out.path, GET_SEROBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) + SEROTYPE(GET_SEROBA_DB.out.path, OVERALL_QC_PASSED_READS_ch) // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, PubMLST typing the assemblies of samples passed overall QC // Output into Channel MLST.out.report From 60bfb70c23555f3e861b18c6db444c83ef06bb9b Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 4 Oct 2023 00:30:38 +0000 Subject: [PATCH 130/157] Remove Git container information Former-commit-id: b0af56303d3deca90201bf03772e8691462a8d86 --- bin/save_images_info.sh | 2 -- bin/save_tools_info.sh | 1 - workflows/info_and_version.nf | 4 +--- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/bin/save_images_info.sh b/bin/save_images_info.sh index 51b20aa..265eb42 100755 --- a/bin/save_images_info.sh +++ b/bin/save_images_info.sh @@ -5,7 +5,6 @@ find_image () { } BASH=$(find_image bash) -GIT=$(find_image git) PYTHON=$(find_image python) FASTP=$(find_image fastp) UNICYCLER=$(find_image unicycler) @@ -27,7 +26,6 @@ add_container () { jq -n \ --argjson bash "$(add_container "$BASH")" \ - --argjson git "$(add_container "$GIT")" \ --argjson python "$(add_container "$PYTHON")" \ --argjson fastp "$(add_container "$FASTP")" \ --argjson unicycler "$(add_container "$UNICYCLER")" \ diff --git a/bin/save_tools_info.sh b/bin/save_tools_info.sh index 9d20e16..37958f0 100755 --- a/bin/save_tools_info.sh +++ b/bin/save_tools_info.sh @@ -5,7 +5,6 @@ add_version () { } jq -n \ - --argjson git "$(add_version "$GIT_VERSION")" \ --argjson python "$(add_version "$PYTHON_VERSION")" \ --argjson fastp "$(add_version "$FASTP_VERSION")" \ --argjson unicycler "$(add_version "$UNICYCLER_VERSION")" \ diff --git a/workflows/info_and_version.nf b/workflows/info_and_version.nf index 696a388..8dd4035 100644 --- a/workflows/info_and_version.nf +++ b/workflows/info_and_version.nf @@ -1,4 +1,4 @@ -include { IMAGES; DATABASES; TOOLS; COMBINE_INFO; PARSE; PRINT; SAVE; GIT_VERSION; PYTHON_VERSION; FASTP_VERSION; UNICYCLER_VERSION; SHOVILL_VERSION; QUAST_VERSION; BWA_VERSION; SAMTOOLS_VERSION; BCFTOOLS_VERSION; POPPUNK_VERSION; MLST_VERSION; KRAKEN2_VERSION; SEROBA_VERSION; ARIBA_VERSION } from "$projectDir/modules/info" +include { IMAGES; DATABASES; TOOLS; COMBINE_INFO; PARSE; PRINT; SAVE; PYTHON_VERSION; FASTP_VERSION; UNICYCLER_VERSION; SHOVILL_VERSION; QUAST_VERSION; BWA_VERSION; SAMTOOLS_VERSION; BCFTOOLS_VERSION; POPPUNK_VERSION; MLST_VERSION; KRAKEN2_VERSION; SEROBA_VERSION; ARIBA_VERSION } from "$projectDir/modules/info" // Alternative workflow that prints versions of pipeline and tools workflow PRINT_VERSION { @@ -64,7 +64,6 @@ workflow GET_VERSION { nextflow_version = "$nextflow.version" - GIT_VERSION() PYTHON_VERSION() FASTP_VERSION() UNICYCLER_VERSION() @@ -80,7 +79,6 @@ workflow GET_VERSION { ARIBA_VERSION() TOOLS( - GIT_VERSION.out, PYTHON_VERSION.out, FASTP_VERSION.out, UNICYCLER_VERSION.out, From d85c1d41b53f78ada85490b96e92d611d648741f Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 4 Oct 2023 00:30:49 +0000 Subject: [PATCH 131/157] Update info messages Former-commit-id: 66ac6b45d0cb883921613289778b9db44bc11c44 --- modules/info.nf | 161 ++++++++++++++++++++++-------------------------- 1 file changed, 72 insertions(+), 89 deletions(-) diff --git a/modules/info.nf b/modules/info.nf index c55f537..aaebd92 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -71,7 +71,6 @@ 
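(Aside — a minimal sketch, not part of this patch: the save_tools_info.sh hunk above drops Git from a jq assembly whose add_version helper body is not shown in this series. Assuming the helper simply wraps a version string into a JSON object, the pattern behaves like this:)

add_version () {
    # assumed shape: wrap a plain version string into {"version": "..."}
    jq -n --arg version "$1" '{"version": $version}'
}

# each --argjson flag then nests a pre-built JSON object under the tool's key:
jq -n --argjson python "$(add_version "3.11.4")" '{"python": $python}'
# prints: {"python":{"version":"3.11.4"}}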
process TOOLS { label 'farm_low' input: - val git_version val python_version val fastp_version val unicycler_version @@ -92,7 +91,6 @@ process TOOLS { script: json='tools.json' """ - GIT_VERSION="$git_version" PYTHON_VERSION="$python_version" FASTP_VERSION="$fastp_version" UNICYCLER_VERSION="$unicycler_version" @@ -164,61 +162,61 @@ process PARSE { } def coreTextRow = { leftContent, rightContent -> - textRow(25, 61, leftContent, rightContent) + textRow(25, 67, leftContent, rightContent) } coreText = """\ - |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Core Software Versions ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ - |╔═══════════════════════════╤═══════════════════════════════════════════════════════════════╗ + |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Core Software Versions ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ + |╔═══════════════════════════╤═════════════════════════════════════════════════════════════════════╗ |${coreTextRow('Software', 'Version')} - |╠═══════════════════════════╪═══════════════════════════════════════════════════════════════╣ + |╠═══════════════════════════╪═════════════════════════════════════════════════════════════════════╣ |${coreTextRow('GPS Unified Pipeline', json.pipeline.version)} |${coreTextRow('Nextflow', json.nextflow.version)} - |╚═══════════════════════════╧═══════════════════════════════════════════════════════════════╝ + |╚═══════════════════════════╧═════════════════════════════════════════════════════════════════════╝ |""".stripMargin() def dbTextRow = { leftContent, rightContent -> - textRow(13, 73, leftContent, rightContent) + textRow(13, 79, leftContent, rightContent) } dbText = """\ - |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Databases Information ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ - |╔═══════════════════════════════════════════════════════════════════════════════════════════╗ - |║ BWA reference genome FM-index database ║ - |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ + |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Databases Information ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ + |╔═════════════════════════════════════════════════════════════════════════════════════════════════╗ + |║ BWA reference genome FM-index database ║ + |╟───────────────┬─────────────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Reference', json.bwa_db.reference)} |${dbTextRow('Reference MD5', json.bwa_db.reference_md5)} |${dbTextRow('Created', json.bwa_db.create_time)} - |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ - |║ Kraken 2 database ║ - |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ + |╠═══════════════╧═════════════════════════════════════════════════════════════════════════════════╣ + |║ Kraken 2 database ║ + |╟───────────────┬─────────────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.kraken2_db.url)} |${dbTextRow('Saved', json.kraken2_db.save_time)} - |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ - |║ PopPUNK database ║ - |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ + |╠═══════════════╧═════════════════════════════════════════════════════════════════════════════════╣ + |║ PopPUNK database ║ + |╟───────────────┬─────────────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.poppunnk_db.url)} |${dbTextRow('Saved', 
json.poppunnk_db.save_time)} - |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ - |║ PopPUNK external clusters file ║ - |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ + |╠═══════════════╧═════════════════════════════════════════════════════════════════════════════════╣ + |║ PopPUNK external clusters file ║ + |╟───────────────┬─────────────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.poppunk_ext.url)} |${dbTextRow('Saved', json.poppunk_ext.save_time)} - |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ - |║ SeroBA database ║ - |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ - |${dbTextRow('Source', json.seroba_db.git)} + |╠═══════════════╧═════════════════════════════════════════════════════════════════════════════════╣ + |║ SeroBA database ║ + |╟───────────────┬─────────────────────────────────────────────────────────────────────────────────╢ + |${dbTextRow('Source', json.seroba_db.url)} |${dbTextRow('Kmer size', json.seroba_db.kmer)} |${dbTextRow('Created', json.seroba_db.create_time)} - |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ - |║ ARIBA database ║ - |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ + |╠═══════════════╧═════════════════════════════════════════════════════════════════════════════════╣ + |║ ARIBA database ║ + |╟───────────────┬─────────────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Reference', json.ariba_db.reference)} |${dbTextRow('Reference MD5', json.ariba_db.reference_md5)} |${dbTextRow('Metadata', json.ariba_db.metadata)} |${dbTextRow('Metadata MD5', json.ariba_db.metadata_md5)} |${dbTextRow('Created', json.ariba_db.create_time)} - |╚═══════════════╧═══════════════════════════════════════════════════════════════════════════╝ + |╚═══════════════╧═════════════════════════════════════════════════════════════════════════════════╝ |""".stripMargin() def getVersion = { tool -> @@ -230,15 +228,14 @@ process PARSE { } def toolTextRow = { leftContent, rightContent -> - textRow(30, 56, leftContent, getVersion(rightContent)) + textRow(30, 62, leftContent, getVersion(rightContent)) } toolText = """\ - |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Tool Versions ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ - |╔════════════════════════════════╤══════════════════════════════════════════════════════════╗ - |${textRow(30, 56, 'Tool', 'Version')} - |╠════════════════════════════════╪══════════════════════════════════════════════════════════╣ - |${toolTextRow('Git', 'git')} + |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Tool Versions ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ + |╔════════════════════════════════╤════════════════════════════════════════════════════════════════╗ + |${textRow(30, 62, 'Tool', 'Version')} + |╠════════════════════════════════╪════════════════════════════════════════════════════════════════╣ |${toolTextRow('Python', 'python')} |${toolTextRow('fastp', 'fastp')} |${toolTextRow('Unicycler', 'unicycler')} @@ -253,7 +250,7 @@ process PARSE { |${toolTextRow('mlst', 'mlst')} |${toolTextRow('Kraken 2', 'kraken2')} |${toolTextRow('SeroBA', 'seroba')} - |╚════════════════════════════════╧══════════════════════════════════════════════════════════╝ + 
|╚════════════════════════════════╧════════════════════════════════════════════════════════════════╝ |""".stripMargin() def getImage = { tool -> @@ -265,16 +262,15 @@ process PARSE { } def imageTextRow = { leftContent, rightContent -> - textRow(30, 56, leftContent, getImage(rightContent)) + textRow(30, 62, leftContent, getImage(rightContent)) } imageText = """\ - |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Container Images ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ - |╔════════════════════════════════╤══════════════════════════════════════════════════════════╗ - |${textRow(30, 56, 'Environment For', 'Image')} - |╠════════════════════════════════╪══════════════════════════════════════════════════════════╣ + |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Container Images ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ + |╔════════════════════════════════╤════════════════════════════════════════════════════════════════╗ + |${textRow(30, 62, 'Environment For', 'Image')} + |╠════════════════════════════════╪════════════════════════════════════════════════════════════════╣ |${imageTextRow('Bash', 'bash')} - |${imageTextRow('Git', 'git')} |${imageTextRow('Python', 'python')} |${imageTextRow('fastp', 'fastp')} |${imageTextRow('Unicycler', 'unicycler')} @@ -289,7 +285,7 @@ process PARSE { |${imageTextRow('mlst', 'mlst')} |${imageTextRow('Kraken 2', 'kraken2')} |${imageTextRow('SeroBA', 'seroba')} - |╚════════════════════════════════╧══════════════════════════════════════════════════════════╝ + |╚════════════════════════════════╧════════════════════════════════════════════════════════════════╝ |""".stripMargin() } @@ -306,9 +302,9 @@ process PRINT { exec: log.info( """ - |╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍ - |╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍ Version Information ╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍ - |╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍ + |╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍ + |╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍ Version Information ╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍ + |╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍╍ | |${coreText} |${dbText} @@ -342,73 +338,73 @@ process SAVE { } def ioTextRow = { leftContent, rightContent -> - textRow(8, 78, leftContent, rightContent) + textRow(8, 84, leftContent, rightContent) } String ioText = """\ - |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Input and Output ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ - |╔══════════╤════════════════════════════════════════════════════════════════════════════════╗ + |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Input and Output ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ + |╔══════════╤══════════════════════════════════════════════════════════════════════════════════════╗ |${ioTextRow('Type', 'Path')} - |╠══════════╪════════════════════════════════════════════════════════════════════════════════╣ + |╠══════════╪══════════════════════════════════════════════════════════════════════════════════════╣ |${ioTextRow('Input', readsDir.canonicalPath)} |${ioTextRow('Output', outputDir.canonicalPath)} - |╚══════════╧════════════════════════════════════════════════════════════════════════════════╝ + |╚══════════╧══════════════════════════════════════════════════════════════════════════════════════╝ |""".stripMargin() def assemblerTextRow = { leftContent, rightContent -> - textRow(25, 61, leftContent, rightContent) + textRow(25, 67, 
leftContent, rightContent) } String assemblerText = """\ - |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Assembler Options ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ - |╔═══════════════════════════╤═══════════════════════════════════════════════════════════════╗ + |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Assembler Options ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ + |╔═══════════════════════════╤═════════════════════════════════════════════════════════════════════╗ |${assemblerTextRow('Option', 'Value')} - |╠═══════════════════════════╪═══════════════════════════════════════════════════════════════╣ + |╠═══════════════════════════╪═════════════════════════════════════════════════════════════════════╣ |${assemblerTextRow('Assembler', params.assembler.capitalize())} |${assemblerTextRow('Minimum contig length', params.min_contig_length)} - |╚═══════════════════════════╧═══════════════════════════════════════════════════════════════╝ + |╚═══════════════════════════╧═════════════════════════════════════════════════════════════════════╝ |""".stripMargin() def qcTextRow = { leftContent, rightContent -> - textRow(60, 26, leftContent, rightContent) + textRow(60, 32, leftContent, rightContent) } String qcText = """\ - |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ QC Parameters ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ - |╔═══════════════════════════════════════════════════════════════════════════════════════════╗ - |║ Read QC ║ - |╟──────────────────────────────────────────────────────────────┬────────────────────────────╢ + |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ QC Parameters ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ + |╔═════════════════════════════════════════════════════════════════════════════════════════════════╗ + |║ Read QC ║ + |╟──────────────────────────────────────────────────────────────┬──────────────────────────────────╢ |${qcTextRow('Minimum bases in processed reads', String.format("%.0f", Math.ceil(params.length_low * params.depth)))} - |╠══════════════════════════════════════════════════════════════╧════════════════════════════╣ - |║ Taxonomy QC ║ - |╟──────────────────────────────────────────────────────────────┬────────────────────────────╢ + |╠══════════════════════════════════════════════════════════════╧══════════════════════════════════╣ + |║ Taxonomy QC ║ + |╟──────────────────────────────────────────────────────────────┬──────────────────────────────────╢ |${qcTextRow('Minimum S. 
pneumoniae percentage in reads', params.spneumo_percentage)} - |╠══════════════════════════════════════════════════════════════╧════════════════════════════╣ - |║ Mapping QC ║ - |╟──────────────────────────────────────────────────────────────┬────────────────────────────╢ + |╠══════════════════════════════════════════════════════════════╧══════════════════════════════════╣ + |║ Mapping QC ║ + |╟──────────────────────────────────────────────────────────────┬──────────────────────────────────╢ |${qcTextRow('Minimum reference coverage percentage by the reads', params.ref_coverage)} |${qcTextRow('Maximum non-cluster heterozygous SNP (Het-SNP) site count', params.het_snp_site)} - |╠══════════════════════════════════════════════════════════════╧════════════════════════════╣ - |║ Assembly QC ║ - |╟──────────────────────────────────────────────────────────────┬────────────────────────────╢ + |╠══════════════════════════════════════════════════════════════╧══════════════════════════════════╣ + |║ Assembly QC ║ + |╟──────────────────────────────────────────────────────────────┬──────────────────────────────────╢ |${qcTextRow('Maximum contig count in assembly', params.contigs)} |${qcTextRow('Minimum assembly length', params.length_low)} |${qcTextRow('Maximum assembly length', params.length_high)} |${qcTextRow('Minimum sequencing depth', params.depth)} - |╚══════════════════════════════════════════════════════════════╧════════════════════════════╝ + |╚══════════════════════════════════════════════════════════════╧══════════════════════════════════╝ |""".stripMargin() def containerEngineTextRow = { leftContent, rightContent -> - textRow(25, 61, leftContent, rightContent) + textRow(25, 67, leftContent, rightContent) } String containerEngineText = """\ - |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Container Engine Options ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ - |╔═══════════════════════════╤═══════════════════════════════════════════════════════════════╗ + |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Container Engine Options ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ + |╔═══════════════════════════╤═════════════════════════════════════════════════════════════════════╗ |${containerEngineTextRow('Option', 'Value')} - |╠═══════════════════════════╪═══════════════════════════════════════════════════════════════╣ + |╠═══════════════════════════╪═════════════════════════════════════════════════════════════════════╣ |${containerEngineTextRow('Container Engine', workflow.containerEngine.capitalize())} - |╚═══════════════════════════╧═══════════════════════════════════════════════════════════════╝ + |╚═══════════════════════════╧═════════════════════════════════════════════════════════════════════╝ |""".stripMargin() File output = new File("${task.workDir}/info.txt") @@ -428,19 +424,6 @@ process SAVE { // Below processes get tool versions within container images by running their containers -process GIT_VERSION { - label 'git_container' - label 'farm_low' - - output: - env VERSION - - shell: - $/ - VERSION=$(git -v | sed -r "s/.*\s(.+)/\1/") - /$ -} - process PYTHON_VERSION { label 'python_container' label 'farm_low' From 39fbf67fb72105f11df52251619b7d3d4404d6e9 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 4 Oct 2023 00:31:07 +0000 Subject: [PATCH 132/157] Update description of seroba_db_remote Former-commit-id: cb878ffba259f9a20852dd8430c5219b8f99ec71 --- nextflow_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json 
b/nextflow_schema.json index 19ba36a..9741aa7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -189,8 +189,8 @@ "seroba_db_remote": { "type": "string", "hidden": true, - "description": "URL to a SeroBA Git remote repository.", - "pattern": "^(https?:\\/\\/)?(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)\\.git$" + "description": "URL to a SeroBA release.", + "pattern": "^(https?:\\/\\/)?(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)\\.(tar\\.gz|tgz)$" }, "seroba_kmer": { "type": "integer", From 46e2c86246e36094b54f76c798fc64bd80531624 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 20 Sep 2023 16:36:07 +0000 Subject: [PATCH 133/157] Add second most abundant species check Former-commit-id: 5e2972f027de0389ef0974c5381e2c3736e244cb --- bin/generate_overall_report.py | 2 +- bin/get_overall_qc.sh | 8 ++++++-- bin/get_taxonomy_qc.sh | 17 +++++++++++++---- modules/taxonomy.nf | 2 ++ modules/validate.nf | 1 + nextflow.config | 1 + workflows/pipeline.nf | 6 +++--- 7 files changed, 27 insertions(+), 10 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index a02fb38..2eeda03 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -15,7 +15,7 @@ 'READ': ['Bases'], 'ASSEMBLY': ['Contigs#' , 'Assembly_Length' , 'Seq_Depth'], 'MAPPING': ['Ref_Cov_%' , 'Het-SNP#'], - 'TAXONOMY': ['S.Pneumo_%'], + 'TAXONOMY': ['S.Pneumo_%', 'Second_Species', 'Second_Species_%'], 'LINEAGE': ['GPSC'], 'SEROTYPE': ['Serotype'], 'MLST': ['ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl'], diff --git a/bin/get_overall_qc.sh b/bin/get_overall_qc.sh index d83e52a..d0eb9b9 100755 --- a/bin/get_overall_qc.sh +++ b/bin/get_overall_qc.sh @@ -1,8 +1,12 @@ # Determine overall QC result based on Assembly QC, Mapping QC and Taxonomy QC # In case of assembler failure, there will be no Assembly QC input, hence output result as ASSEMBLER FAILURE -if [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then - OVERALL_QC="PASS" +if [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" || "$TAXONOMY_QC" == "WARNING" ]]; then + if [[ "$TAXONOMY_QC" == "WARNING" ]]; then + OVERALL_QC="WARNING" + else + OVERALL_QC="PASS" + fi elif [[ "$READ_QC" == "FAIL" ]]; then OVERALL_QC="FAIL" elif [[ "$ASSEMBLY_QC" == "null" ]]; then diff --git a/bin/get_taxonomy_qc.sh b/bin/get_taxonomy_qc.sh index cb1e382..6b118a7 100755 --- a/bin/get_taxonomy_qc.sh +++ b/bin/get_taxonomy_qc.sh @@ -1,16 +1,25 @@ # Extract taxonomy QC information and determine QC result based on $KRAKEN2_REPORT -PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' "$KRAKEN2_REPORT") +PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /^\s*Streptococcus pneumoniae$/ { printf "%.2f", $1 }' "$KRAKEN2_REPORT") if [ -z "$PERCENTAGE" ]; then PERCENTAGE="0.00" fi if [[ "$(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l)" == 1 ]]; then - TAXONOMY_QC="PASS" + SECOND_SPECIES_RECORD=$(sort -nr -k1 "$KRAKEN2_REPORT" | awk -F"\t" '$4 ~ /^S$/ && $6 !~ /^\s*Streptococcus pneumoniae$/ { print; exit }') + + SECOND_SPECIES=$(awk -F"\t" '{ gsub(/^\s+/, "", $6); print $6 }' <<< "$SECOND_SPECIES_RECORD") + SECOND_SPECIES_PERCENTAGE=$(awk -F"\t" '{ printf 
"%.2f", $1 }' <<< "$SECOND_SPECIES_RECORD") + + if [[ "$(echo "$SECOND_SPECIES_PERCENTAGE > $QC_SECOND_SPECIES_PERCENTAGE" | bc -l)" == 1 ]]; then + TAXONOMY_QC="WARNING" + else + TAXONOMY_QC="PASS" + fi else TAXONOMY_QC="FAIL" fi -echo \"Taxonomy_QC\",\"S.Pneumo_%\" > "$TAXONOMY_QC_REPORT" -echo \""$TAXONOMY_QC"\",\""$PERCENTAGE"\" >> "$TAXONOMY_QC_REPORT" +echo \"Taxonomy_QC\",\"S.Pneumo_%\",\"Second_Species\",\"Second_Species_%\" > "$TAXONOMY_QC_REPORT" +echo \""$TAXONOMY_QC"\",\""$PERCENTAGE"\",\""${SECOND_SPECIES:-}"\",\""${SECOND_SPECIES_PERCENTAGE:-}"\" >> "$TAXONOMY_QC_REPORT" diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index 0d945f2..b3f4436 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -64,6 +64,7 @@ process TAXONOMY_QC { input: tuple val(sample_id), path(kraken2_report) val(qc_spneumo_percentage) + val(qc_second_species_percentage) output: tuple val(sample_id), env(TAXONOMY_QC), emit: result @@ -74,6 +75,7 @@ process TAXONOMY_QC { """ KRAKEN2_REPORT="$kraken2_report" QC_SPNEUMO_PERCENTAGE="$qc_spneumo_percentage" + QC_SECOND_SPECIES_PERCENTAGE="$qc_second_species_percentage" TAXONOMY_QC_REPORT="$taxonomy_qc_report" source get_taxonomy_qc.sh diff --git a/modules/validate.nf b/modules/validate.nf index ce5a55e..a59aa78 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -17,6 +17,7 @@ validParams = [ poppunk_db_remote: 'url_targz', poppunk_ext_remote: 'url_csv', spneumo_percentage: 'int_float', + second_sp_percentage: 'int_float', ref_coverage: 'int_float', het_snp_site: 'int', contigs: 'int', diff --git a/nextflow.config b/nextflow.config index b70ee63..9cac461 100644 --- a/nextflow.config +++ b/nextflow.config @@ -41,6 +41,7 @@ params { // Default values for QC spneumo_percentage = 60.00 + second_sp_percentage = 5.00 ref_coverage = 60.00 het_snp_site = 220 contigs = 500 diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index ac427ca..bda7d86 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -100,7 +100,7 @@ workflow PIPELINE { // From Channel TAXONOMY.out.report, provide taxonomy QC status // Output into Channels TAXONOMY_QC.out.result & TAXONOMY_QC.out.report - TAXONOMY_QC(TAXONOMY.out.report, params.spneumo_percentage) + TAXONOMY_QC(TAXONOMY.out.report, params.spneumo_percentage, params.second_sp_percentage) // Merge Channels AREAD_QC.out.result & SSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status // Output into Channel OVERALL_QC.out.result & OVERALL_QC.out.report @@ -113,12 +113,12 @@ workflow PIPELINE { // From Channel READ_QC_PASSED_READS_ch, only output reads of samples passed overall QC based on Channel OVERALL_QC.out.result OVERALL_QC_PASSED_READS_ch = OVERALL_QC.out.result.join(READ_QC_PASSED_READS_ch, failOnDuplicate: true) - .filter { it[1] == 'PASS' } + .filter { it[1] == 'PASS' || it[1] == 'WARNING' } .map { it[0, 2..-1] } // From Channel ASSEMBLY_ch, only output assemblies of samples passed overall QC based on Channel OVERALL_QC.out.result OVERALL_QC_PASSED_ASSEMBLIES_ch = OVERALL_QC.out.result.join(ASSEMBLY_ch, failOnDuplicate: true) - .filter { it[1] == 'PASS' } + .filter { it[1] == 'PASS' || it[1] == 'WARNING'} .map { it[0, 2..-1] } // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, generate PopPUNK query file containing assemblies of samples passed overall QC From 76dc888e5e34f9f7c18c1b6d3fb36e7f5ae47163 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:14:51 +0000 Subject: [PATCH 134/157] 
Add info on the second most abundant species check Former-commit-id: 8282b78a1d78971ad0aef77b93f4fb08ced10af7 --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index a1f7436..9aeaad5 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,7 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la | Option | Values | Description | | --- | ---| --- | | `--spneumo_percentage` | Any integer or float value
(Default: `60.00`) | Minimum *S. pneumoniae* percentage in reads to pass Taxonomy QC. | + | `--second_sp_percentage` | Any integer or float value
(Default: `5.00`) | Threshold for the percentage of reads assigned to the second most abundant species; exceeding it triggers a `WARNING` in Taxonomy QC. | | `--ref_coverage` | Any integer or float value<br>
(Default: `60.00`) | Minimum reference coverage percentage by the reads to pass Mapping QC. | | `--het_snp_site` | Any integer value
(Default: `220`) | Maximum non-cluster heterozygous SNP (Het-SNP) site count to pass Mapping QC. | | `--contigs` | Any integer value
(Default: `500`) | Maximum contig count in assembly to pass Assembly QC. | @@ -274,6 +275,8 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la > ⚠️ If the result of `Overall_QC` of a sample is `ASSEMBLER FAILURE`, the assembler has crashed when trying to assemble the reads. You might want to re-run the sample with [another assembler](#assembly), or discard the sample if it is of low quality. + > ⚠️ If the results of `Taxonomy_QC` and `Overall_QC` of a sample are both `WARNING`, the sample contains a non-pneumococcal species whose percentage in reads exceeds the threshold set by `--second_sp_percentage`, but the sample otherwise passes QC and all the *in silico* data should be available. You should exercise due diligence to determine the trustworthiness of the results from this sample. + > ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample. | Field | Type | Description | @@ -291,6 +294,8 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la | `Ref_Cov_%` | Mapping | Percentage of reference covered by reads<br>
(Default: > 60% to pass Mapping QC) | | `Het-SNP#` | Mapping | Non-cluster heterozygous SNP (Het-SNP) site count
(Default: < 220 to pass Mapping QC) | | `S.Pneumo_%` | Taxonomy | Percentage of reads assigned to *Streptococcus pneumoniae*
(Default: > 60% to pass Taxonomy QC) | + | `Second_Species` | Taxonomy | The second most abundant species in reads
(Only available if `S.Pneumo_%` passed QC) | + | `Second_Species_%` | Taxonomy | Percentage of reads assigned to the second most abundant species
(Only available if `S.Pneumo_%` passed QC)
(Default: > 5.00% will trigger `WARNING` in Taxonomy QC) | | `GPSC` | Lineage | GPSC Lineage | | `Serotype` | Serotype | Serotype | | `ST` | MLST | Sequence Type (ST) | From 1adf78b51c229a8a6358e9b5dd1e7d83b31b76d3 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 3 Oct 2023 02:01:03 +0000 Subject: [PATCH 135/157] Switch from 2nd species to top non-Strep genus Former-commit-id: d7c9772660a546a698a0d8369605b5f971e67eb4 --- bin/generate_overall_report.py | 2 +- bin/get_overall_qc.sh | 12 +++--------- bin/get_taxonomy_qc.sh | 23 +++++++++++------------ modules/taxonomy.nf | 4 ++-- modules/validate.nf | 2 +- nextflow.config | 2 +- workflows/pipeline.nf | 6 +++--- 7 files changed, 22 insertions(+), 29 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index 2eeda03..80ef52c 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -15,7 +15,7 @@ 'READ': ['Bases'], 'ASSEMBLY': ['Contigs#' , 'Assembly_Length' , 'Seq_Depth'], 'MAPPING': ['Ref_Cov_%' , 'Het-SNP#'], - 'TAXONOMY': ['S.Pneumo_%', 'Second_Species', 'Second_Species_%'], + 'TAXONOMY': ['S.Pneumo_%', 'Top_Non-Strep_Genus', 'Top_Non-Strep_Genus_%'], 'LINEAGE': ['GPSC'], 'SEROTYPE': ['Serotype'], 'MLST': ['ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl'], diff --git a/bin/get_overall_qc.sh b/bin/get_overall_qc.sh index d0eb9b9..a0fc3d3 100755 --- a/bin/get_overall_qc.sh +++ b/bin/get_overall_qc.sh @@ -1,15 +1,9 @@ # Determine overall QC result based on Assembly QC, Mapping QC and Taxonomy QC # In case of assembler failure, there will be no Assembly QC input, hence output result as ASSEMBLER FAILURE -if [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" || "$TAXONOMY_QC" == "WARNING" ]]; then - if [[ "$TAXONOMY_QC" == "WARNING" ]]; then - OVERALL_QC="WARNING" - else - OVERALL_QC="PASS" - fi -elif [[ "$READ_QC" == "FAIL" ]]; then - OVERALL_QC="FAIL" -elif [[ "$ASSEMBLY_QC" == "null" ]]; then +if [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then + OVERALL_QC="PASS" +elif [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "null" ]]; then OVERALL_QC="ASSEMBLER FAILURE" else OVERALL_QC="FAIL" diff --git a/bin/get_taxonomy_qc.sh b/bin/get_taxonomy_qc.sh index 6b118a7..9eaf05e 100755 --- a/bin/get_taxonomy_qc.sh +++ b/bin/get_taxonomy_qc.sh @@ -2,24 +2,23 @@ PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /^\s*Streptococcus pneumoniae$/ { printf "%.2f", $1 }' "$KRAKEN2_REPORT") +TOP_NON_STREP_GENUS_RECORD=$(sort -nr -k1,1 -k2,2 "$KRAKEN2_REPORT" | awk -F"\t" '$4 ~ /^G$/ && $6 !~ /^\s*Streptococcus$/ { print; exit }') +TOP_NON_STREP_GENUS=$(awk -F"\t" '{ gsub(/^\s+/, "", $6); print $6 }' <<< "$TOP_NON_STREP_GENUS_RECORD") +TOP_NON_STREP_GENUS_PERCENTAGE=$(awk -F"\t" '{ printf "%.2f", $1 }' <<< "$TOP_NON_STREP_GENUS_RECORD") + if [ -z "$PERCENTAGE" ]; then PERCENTAGE="0.00" fi -if [[ "$(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l)" == 1 ]]; then - SECOND_SPECIES_RECORD=$(sort -nr -k1 "$KRAKEN2_REPORT" | awk -F"\t" '$4 ~ /^S$/ && $6 !~ /^\s*Streptococcus pneumoniae$/ { print; exit }') - - SECOND_SPECIES=$(awk -F"\t" '{ gsub(/^\s+/, "", $6); print $6 }' <<< "$SECOND_SPECIES_RECORD") - SECOND_SPECIES_PERCENTAGE=$(awk -F"\t" '{ printf "%.2f", $1 }' <<< "$SECOND_SPECIES_RECORD") +if [ -z "$TOP_NON_STREP_GENUS_PERCENTAGE" ]; then + TOP_NON_STREP_GENUS_PERCENTAGE="0.00" +fi 
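# (Aside — illustrative only, not part of this patch. A Kraken2 report is
# tab-separated: percentage, clade read count, direct read count, rank code,
# NCBI taxid, indented scientific name. With made-up numbers:
#   91.05  900500  1200  G  1301    Streptococcus
#    1.20   11900   300  G  1279    Staphylococcus
# Sorting the G-ranked rows by the first field and taking the first name that
# is not Streptococcus, as the extraction added above does, would yield
# TOP_NON_STREP_GENUS="Staphylococcus" and TOP_NON_STREP_GENUS_PERCENTAGE="1.20".)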
- if [[ "$(echo "$SECOND_SPECIES_PERCENTAGE > $QC_SECOND_SPECIES_PERCENTAGE" | bc -l)" == 1 ]]; then - TAXONOMY_QC="WARNING" - else - TAXONOMY_QC="PASS" - fi +if [[ "$(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l)" == 1 ]] && [[ "$(echo "$TOP_NON_STREP_GENUS_PERCENTAGE <= $QC_TOP_NON_STREP_GENUS_PERCENTAGE" | bc -l)" == 1 ]]; then + TAXONOMY_QC="PASS" else TAXONOMY_QC="FAIL" fi -echo \"Taxonomy_QC\",\"S.Pneumo_%\",\"Second_Species\",\"Second_Species_%\" > "$TAXONOMY_QC_REPORT" -echo \""$TAXONOMY_QC"\",\""$PERCENTAGE"\",\""${SECOND_SPECIES:-}"\",\""${SECOND_SPECIES_PERCENTAGE:-}"\" >> "$TAXONOMY_QC_REPORT" +echo \"Taxonomy_QC\",\"S.Pneumo_%\",\"Top_Non-Strep_Genus\",\"Top_Non-Strep_Genus_%\" > "$TAXONOMY_QC_REPORT" +echo \""$TAXONOMY_QC"\",\""$PERCENTAGE"\",\""${TOP_NON_STREP_GENUS:-}"\",\""${TOP_NON_STREP_GENUS_PERCENTAGE}"\" >> "$TAXONOMY_QC_REPORT" diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index b3f4436..2f9098f 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -64,7 +64,7 @@ process TAXONOMY_QC { input: tuple val(sample_id), path(kraken2_report) val(qc_spneumo_percentage) - val(qc_second_species_percentage) + val(qc_top_non_strep_genus_percentage) output: tuple val(sample_id), env(TAXONOMY_QC), emit: result @@ -75,7 +75,7 @@ process TAXONOMY_QC { """ KRAKEN2_REPORT="$kraken2_report" QC_SPNEUMO_PERCENTAGE="$qc_spneumo_percentage" - QC_SECOND_SPECIES_PERCENTAGE="$qc_second_species_percentage" + QC_TOP_NON_STREP_GENUS_PERCENTAGE="$qc_top_non_strep_genus_percentage" TAXONOMY_QC_REPORT="$taxonomy_qc_report" source get_taxonomy_qc.sh diff --git a/modules/validate.nf b/modules/validate.nf index a59aa78..1e9974a 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -17,7 +17,7 @@ validParams = [ poppunk_db_remote: 'url_targz', poppunk_ext_remote: 'url_csv', spneumo_percentage: 'int_float', - second_sp_percentage: 'int_float', + non_strep_percentage: 'int_float', ref_coverage: 'int_float', het_snp_site: 'int', contigs: 'int', diff --git a/nextflow.config b/nextflow.config index 9cac461..14fd65a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -41,7 +41,7 @@ params { // Default values for QC spneumo_percentage = 60.00 - second_sp_percentage = 5.00 + non_strep_percentage = 2.00 ref_coverage = 60.00 het_snp_site = 220 contigs = 500 diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index bda7d86..6e1f47e 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -100,7 +100,7 @@ workflow PIPELINE { // From Channel TAXONOMY.out.report, provide taxonomy QC status // Output into Channels TAXONOMY_QC.out.result & TAXONOMY_QC.out.report - TAXONOMY_QC(TAXONOMY.out.report, params.spneumo_percentage, params.second_sp_percentage) + TAXONOMY_QC(TAXONOMY.out.report, params.spneumo_percentage, params.non_strep_percentage) // Merge Channels AREAD_QC.out.result & SSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status // Output into Channel OVERALL_QC.out.result & OVERALL_QC.out.report @@ -113,12 +113,12 @@ workflow PIPELINE { // From Channel READ_QC_PASSED_READS_ch, only output reads of samples passed overall QC based on Channel OVERALL_QC.out.result OVERALL_QC_PASSED_READS_ch = OVERALL_QC.out.result.join(READ_QC_PASSED_READS_ch, failOnDuplicate: true) - .filter { it[1] == 'PASS' || it[1] == 'WARNING' } + .filter { it[1] == 'PASS' } .map { it[0, 2..-1] } // From Channel ASSEMBLY_ch, only output assemblies of samples passed overall QC based on Channel OVERALL_QC.out.result OVERALL_QC_PASSED_ASSEMBLIES_ch = 
OVERALL_QC.out.result.join(ASSEMBLY_ch, failOnDuplicate: true) - .filter { it[1] == 'PASS' || it[1] == 'WARNING'} + .filter { it[1] == 'PASS' } .map { it[0, 2..-1] } // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, generate PopPUNK query file containing assemblies of samples passed overall QC From a2a536fbf9f92a2f387e3d4d85595ec07a1572de Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 3 Oct 2023 02:03:01 +0000 Subject: [PATCH 136/157] Update based on Taxonomy QC changes Former-commit-id: fb6c680e5e0900e876381460cb0c57459220fffb --- README.md | 8 +++----- doc/workflow.drawio.svg | 25 ++++++++++++++----------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 9aeaad5..ab5fd34 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la | Option | Values | Description | | --- | ---| --- | | `--spneumo_percentage` | Any integer or float value
(Default: `60.00`) | Minimum *S. pneumoniae* percentage in reads to pass Taxonomy QC. | - | `--second_sp_percentage` | Any integer or float value
(Default: `5.00`) | Threshold for the percentage of reads assigned to the second most abundant species; exceeding it triggers a `WARNING` in Taxonomy QC. | + | `--non_strep_percentage` | Any integer or float value<br>
(Default: `2.00`) | Maximum non-*Streptococcus* genus percentage in reads to pass Taxonomy QC. | | `--ref_coverage` | Any integer or float value
(Default: `60.00`) | Minimum reference coverage percentage by the reads to pass Mapping QC. | | `--het_snp_site` | Any integer value
(Default: `220`) | Maximum non-cluster heterozygous SNP (Het-SNP) site count to pass Mapping QC. | | `--contigs` | Any integer value
(Default: `500`) | Maximum contig count in assembly to pass Assembly QC. | @@ -275,8 +275,6 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la > ⚠️ If the result of `Overall_QC` of a sample is `ASSEMBLER FAILURE`, the assembler has crashed when trying to assemble the reads. You might want to re-run the sample with [another assembler](#assembly), or discard the sample if it is of low quality. - > ⚠️ If the results of `Taxonomy_QC` and `Overall_QC` of a sample are both `WARNING`, the sample contains a non-pneumococcal species whose percentage in reads exceeds the threshold set by `--second_sp_percentage`, but the sample otherwise passes QC and all the *in silico* data should be available. You should exercise due diligence to determine the trustworthiness of the results from this sample. - > ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample. | Field | Type | Description | @@ -294,8 +292,8 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la | `Ref_Cov_%` | Mapping | Percentage of reference covered by reads<br>
(Default: > 60% to pass Mapping QC) | | `Het-SNP#` | Mapping | Non-cluster heterozygous SNP (Het-SNP) site count
(Default: < 220 to pass Mapping QC) | | `S.Pneumo_%` | Taxonomy | Percentage of reads assigned to *Streptococcus pneumoniae*
(Default: > 60% to pass Taxonomy QC) | - | `Second_Species` | Taxonomy | The second most abundant species in reads
(Only available if `S.Pneumo_%` passed QC) | - | `Second_Species_%` | Taxonomy | Percentage of reads assigned to the second most abundant species
(Only available if `S.Pneumo_%` passed QC)
(Default: > 5.00% will trigger `WARNING` in Taxonomy QC) | + | `Top_Non-Strep_Genus` | Taxonomy | The most abundant non-*Streptococcus* genus in reads | + | `Top_Non-Strep_Genus_%` | Taxonomy | Percentage of reads assigned to the most abundant non-*Streptococcus* genus
(Default: ≤ 2% to pass Taxonomy QC) | | `GPSC` | Lineage | GPSC Lineage | | `Serotype` | Serotype | Serotype | | `ST` | MLST | Sequence Type (ST) | diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg index 43e654b..b88a1a7 100644 --- a/doc/workflow.drawio.svg +++ b/doc/workflow.drawio.svg @@ -1,4 +1,4 @@ - + @@ -81,19 +81,22 @@ - - S. Pneumo:  > 60% + + S. Pneumo:     > 60% + + + Other Genus:  ≤ 2% - + Contigs:  < 500 - + Length:   1.9 - 2.3 Mb - + Depth:     ≥ 20x @@ -133,10 +136,10 @@ - + Ref Coverage:  > 60% - + Het-SNP site:   < 220 @@ -467,9 +470,9 @@ Go / No-go
- - - + + + From 54bbb5b4eabfbe432bacd7fc41428c33ea8fc04d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 4 Oct 2023 18:59:49 +0000 Subject: [PATCH 137/157] Show max non-Strep genus percentage in reads QC Former-commit-id: ae8cceaab2b4ccc1c1899c0283541aa8023df933 --- modules/info.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/info.nf b/modules/info.nf index aaebd92..82415a2 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -379,6 +379,7 @@ process SAVE { |║ Taxonomy QC ║ |╟──────────────────────────────────────────────────────────────┬──────────────────────────────────╢ |${qcTextRow('Minimum S. pneumoniae percentage in reads', params.spneumo_percentage)} + |${qcTextRow('Maximum non-Streptococcus genus percentage in reads', params.non_strep_percentage)} |╠══════════════════════════════════════════════════════════════╧══════════════════════════════════╣ |║ Mapping QC ║ |╟──────────────────────────────────────────────────────────────┬──────────────────────────────────╢ From 48128a0465ae50054933551df47b68ae9755a571 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 16 Oct 2023 11:07:18 +0000 Subject: [PATCH 138/157] Fix relational operators to match descriptions Former-commit-id: 40a7527ffcf9e2e67d1dac985edef3702f64bf8d --- bin/get_assembly_qc.sh | 2 +- bin/get_mapping_qc.sh | 2 +- bin/get_taxonomy_qc.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/get_assembly_qc.sh b/bin/get_assembly_qc.sh index f23ff7f..26ce34b 100755 --- a/bin/get_assembly_qc.sh +++ b/bin/get_assembly_qc.sh @@ -4,7 +4,7 @@ CONTIGS=$(awk -F'\t' '$1 == "# contigs (>= 0 bp)" { print $2 }' "$REPORT") LENGTH=$(awk -F'\t' '$1 == "Total length" { print $2 }' "$REPORT") DEPTH=$(echo "scale=2; $BASES / $LENGTH" | bc -l) -if [[ $CONTIGS -lt $QC_CONTIGS ]] && [[ $LENGTH -ge $QC_LENGTH_LOW ]] && [[ $LENGTH -le $QC_LENGTH_HIGH ]] && [[ "$(echo "$DEPTH >= $QC_DEPTH" | bc -l)" == 1 ]]; then +if [[ $CONTIGS -le $QC_CONTIGS ]] && [[ $LENGTH -ge $QC_LENGTH_LOW ]] && [[ $LENGTH -le $QC_LENGTH_HIGH ]] && [[ "$(echo "$DEPTH >= $QC_DEPTH" | bc -l)" == 1 ]]; then ASSEMBLY_QC="PASS" else ASSEMBLY_QC="FAIL" diff --git a/bin/get_mapping_qc.sh b/bin/get_mapping_qc.sh index 9ed580d..faae6d6 100755 --- a/bin/get_mapping_qc.sh +++ b/bin/get_mapping_qc.sh @@ -2,7 +2,7 @@ COVERAGE=$(printf %.2f "$COVERAGE") -if [[ "$(echo "$COVERAGE > $QC_REF_COVERAGE" | bc -l)" == 1 ]] && [[ $HET_SNP -lt $QC_HET_SNP_SITE ]]; then +if [[ "$(echo "$COVERAGE >= $QC_REF_COVERAGE" | bc -l)" == 1 ]] && [[ $HET_SNP -le $QC_HET_SNP_SITE ]]; then MAPPING_QC="PASS" else MAPPING_QC="FAIL" diff --git a/bin/get_taxonomy_qc.sh b/bin/get_taxonomy_qc.sh index 9eaf05e..5c25579 100755 --- a/bin/get_taxonomy_qc.sh +++ b/bin/get_taxonomy_qc.sh @@ -14,7 +14,7 @@ if [ -z "$TOP_NON_STREP_GENUS_PERCENTAGE" ]; then TOP_NON_STREP_GENUS_PERCENTAGE="0.00" fi -if [[ "$(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l)" == 1 ]] && [[ "$(echo "$TOP_NON_STREP_GENUS_PERCENTAGE <= $QC_TOP_NON_STREP_GENUS_PERCENTAGE" | bc -l)" == 1 ]]; then +if [[ "$(echo "$PERCENTAGE >= $QC_SPNEUMO_PERCENTAGE" | bc -l)" == 1 ]] && [[ "$(echo "$TOP_NON_STREP_GENUS_PERCENTAGE <= $QC_TOP_NON_STREP_GENUS_PERCENTAGE" | bc -l)" == 1 ]]; then TAXONOMY_QC="PASS" else TAXONOMY_QC="FAIL" From e77919e5b953080f0aa01b99bc645b0954ffda66 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 16 Oct 2023 11:07:55 +0000 Subject: [PATCH 139/157] 
Reflect changes in relational operators Former-commit-id: 4663956f5de0d80decada3fd290fbfa520aca913 --- doc/workflow.drawio.svg | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg index b88a1a7..87d8e97 100644 --- a/doc/workflow.drawio.svg +++ b/doc/workflow.drawio.svg @@ -1,4 +1,4 @@ - + @@ -82,7 +82,7 @@ - S. Pneumo:     > 60% + S. Pneumo:     ≥ 60% Other Genus:  ≤ 2% @@ -91,7 +91,7 @@ - Contigs:  < 500 + Contigs:  ≤ 500 Length:   1.9 - 2.3 Mb @@ -137,10 +137,10 @@ - Ref Coverage:  > 60% + Ref Coverage: ≥ 60% - Het-SNP site:   < 220 + Het-SNP site:  ≤ 220 @@ -470,9 +470,9 @@ Go / No-go - - - + + + From 0df43a56546342d33b6a57362c748610e81e394c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 16 Oct 2023 13:54:06 +0000 Subject: [PATCH 140/157] Use dev SeroBA and remove Singularity workaround Former-commit-id: 13c9b7e653a8d2d24b151a054807e478284a81cc --- modules/serotype.nf | 39 +++++++++------------------------------ nextflow.config | 2 +- 2 files changed, 10 insertions(+), 31 deletions(-) diff --git a/modules/serotype.nf b/modules/serotype.nf index 8903f46..bb36fbe 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -29,7 +29,7 @@ process GET_SEROBA_DB { // Run SeroBA to serotype samples process SEROTYPE { label 'seroba_container' - label 'farm_low' + label 'farm_mid' tag "$sample_id" @@ -42,34 +42,13 @@ process SEROTYPE { script: serotype_report='serotype_report.csv' - // When using Singularity as container engine, SeroBA sometimes gives incorrect result or critical error - // Uncertain root cause, happen randomly when input are located directly in a Nextflow process work directory - // Workaround: create and use a subdirectory to alter the path - if (workflow.containerEngine === 'docker') - """ - SEROBA_DB="$seroba_db" - READ1="$read1" - READ2="$read2" - SAMPLE_ID="$sample_id" - SEROTYPE_REPORT="$serotype_report" - - source get_serotype.sh - """ - else if (workflow.containerEngine === 'singularity') - """ - SEROBA_DB="$seroba_db" - READ1="$read1" - READ2="$read2" - SAMPLE_ID="$sample_id" - SEROTYPE_REPORT="$serotype_report" - - mkdir SEROBA_WORKDIR && mv $seroba_db $read1 $read2 SEROBA_WORKDIR && cd SEROBA_WORKDIR - - source get_serotype.sh + """ + SEROBA_DB="$seroba_db" + READ1="$read1" + READ2="$read2" + SAMPLE_ID="$sample_id" + SEROTYPE_REPORT="$serotype_report" - cd ../ - mv SEROBA_WORKDIR/$serotype_report ./ - """ - else - error "The process must be run with Docker or Singularity as container engine." + source get_serotype.sh + """ } diff --git a/nextflow.config b/nextflow.config index 14fd65a..dfb4810 100644 --- a/nextflow.config +++ b/nextflow.config @@ -103,7 +103,7 @@ process { container = 'staphb/kraken2:2.1.2-no-db' } withLabel: seroba_container { - container = 'sangerbentleygroup/seroba:1.0.4' + container = 'oliverlorenzsanger/seroba:test' } } From 997ea668201a062919b33f4ebac059378f07dcc7 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 17 Oct 2023 10:21:57 +0000 Subject: [PATCH 141/157] Update SeroBA descriptions Former-commit-id: 4660719bef0711e85ada19a00a67e9d1ac91776d --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ab5fd34..0afd763 100644 --- a/README.md +++ b/README.md @@ -436,7 +436,7 @@ This project uses open-source components. 
You can find the homepage or source co - **SeroBA: rapid high-throughput serotyping of Streptococcus pneumoniae from whole genome sequence data**. Epping L, van Tonder, AJ, Gladstone RA, GPS Consortium, Bentley SD, Page AJ, Keane JA, Microbial Genomics 2018, doi: [10.1099/mgen.0.000186](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000186) - License (GPL-3.0): https://github.com/sanger-pathogens/seroba/blob/master/LICENSE - This project uses a Docker image of a [fork](https://github.com/sanger-bentley-group/seroba) - - The fork includes critical bug fixes for SeroBA as the original repository is no longer maintained + - The fork provides SeroBA with the latest updates as the original repository is no longer maintained - The Docker image provides the containerised environment with SeroBA for `GET_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module [resistanceDatabase](https://github.com/kumarnaren/resistanceDatabase) From 087f8cc5676213278d573e5da1978e3b7cd130c7 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 17 Oct 2023 11:44:15 +0000 Subject: [PATCH 142/157] Update to the latest SeroBA official release Former-commit-id: f2d86c6969549d7c1d7bddb2486443a69dbcf70e --- README.md | 2 +- nextflow.config | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0afd763..3df5503 100644 --- a/README.md +++ b/README.md @@ -224,7 +224,7 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la ## Serotype | Option | Values | Description | | --- | ---| --- | - | `--seroba_db_remote` | Any valid URL to a SeroBA release in `.tar.gz` or `.tgz` format
(Default: [SeroBA v1.0.4](https://github.com/sanger-bentley-group/seroba/archive/refs/tags/v1.0.4.tar.gz))| URL to a SeroBA release. | + | `--seroba_db_remote` | Any valid URL to a SeroBA release in `.tar.gz` or `.tgz` format
(Default: [SeroBA v1.0.5](https://github.com/sanger-bentley-group/seroba/archive/refs/tags/v1.0.5.tar.gz))| URL to a SeroBA release. | | `--seroba_kmer` | Any integer value
(Default: `71`) | Kmer size for creating the KMC database of SeroBA. | ## Lineage diff --git a/nextflow.config b/nextflow.config index dfb4810..338430f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,7 +25,7 @@ params { assembly_publish = "link" // Default link for SeroBA repository, and KMC kmer size for SeroBA - seroba_db_remote = "https://github.com/sanger-bentley-group/seroba/archive/refs/tags/v1.0.4.tar.gz" + seroba_db_remote = "https://github.com/sanger-bentley-group/seroba/archive/refs/tags/v1.0.5.tar.gz" seroba_kmer = 71 // Default link for Kraken2 Database, and usage of memory mapping @@ -103,7 +103,7 @@ process { container = 'staphb/kraken2:2.1.2-no-db' } withLabel: seroba_container { - container = 'oliverlorenzsanger/seroba:test' + container = 'sangerbentleygroup/seroba:1.0.5' } } From d7716b800022d63723f3cbf4612e02901becf057 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 17 Oct 2023 15:14:07 +0000 Subject: [PATCH 143/157] Update the included Nextflow executable to 23.10.0 Former-commit-id: 93436596ecff659b56dc4480dee02e9fe352f18a --- README.md | 6 ++-- nextflow | 93 +++++++++++++++++++++++++++++++------------------------ 2 files changed, 55 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 3df5503..4972b25 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # GPS Unified Pipeline -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.04.2-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.10.0-23aa62.svg)](https://www.nextflow.io/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/singularity/) @@ -47,8 +47,8 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca   # Usage ## Requirement -- A POSIX-compatible system (e.g. Linux, macOS, Windows with [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)) -- Java 11 or later (up to 20) ([OpenJDK](https://openjdk.org/)/[Oracle Java](https://www.oracle.com/java/)) +- A POSIX-compatible system (e.g. Linux, macOS, Windows with [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)) with Bash 3.2 or later +- Java 11 or later (up to 21) ([OpenJDK](https://openjdk.org/)/[Oracle Java](https://www.oracle.com/java/)) - [Docker](https://www.docker.com/) or [Singularity](https://sylabs.io/singularity/) - It is recommended to have at least 16GB of RAM and 100GB of free storage > ℹ️ Details on storage diff --git a/nextflow b/nextflow index b7725ce..2a73246 100755 --- a/nextflow +++ b/nextflow @@ -15,7 +15,7 @@ # limitations under the License. 
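# (Aside — illustrative only, not part of this diff: the ${VAR:-default}
# expansions used throughout this launcher substitute the quoted default only
# when the variable is unset or empty, so the pinned version below can be
# overridden per invocation without editing the script, e.g.
#   NXF_VER=23.10.1 ./nextflow run main.nf
# makes the launcher fetch and run 23.10.1 instead of the bundled 23.10.0.)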
[[ "$NXF_DEBUG" == 'x' ]] && set -x -NXF_VER=${NXF_VER:-'23.04.2'} +NXF_VER=${NXF_VER:-'23.10.0'} NXF_ORG=${NXF_ORG:-'nextflow-io'} NXF_HOME=${NXF_HOME:-$HOME/.nextflow} NXF_PROT=${NXF_PROT:-'https'} @@ -24,6 +24,7 @@ NXF_TEMP=${NXF_TEMP:-$TMPDIR} NXF_DIST=${NXF_DIST:-$NXF_HOME/framework} NXF_CLI="$0 $@" NXF_CLI_OPTS=${NXF_CLI_OPTS:-} +NXF_REMOTE_DEBUG_PORT=${NXF_REMOTE_DEBUG_PORT:-5005} export NXF_CLI export NXF_ORG @@ -85,6 +86,17 @@ function get() { fi } +function get_ver() { + if command -v curl &>/dev/null; then + curl -fsSL "$1" + elif command -v wget &>/dev/null; then + wget "$1" >/dev/null 2>&1 + else + echo_red "ERROR: Cannot find 'curl' nor 'wget' utility -- please install one of them" + exit 1 + fi +} + function make_temp() { local base=${NXF_TEMP:=$PWD} if [ "$(uname)" = 'Darwin' ]; then mktemp "${base}/nxf-tmp.XXXXXX" || exit $? @@ -111,7 +123,7 @@ function resolve_link() { } function current_ver() { - [[ $NXF_EDGE == 1 ]] && printf 'edge' || printf 'latest' + [[ $NXF_EDGE == 1 || $NXF_VER == *"-edge" ]] && printf 'edge' || printf 'latest' } function install() { @@ -131,10 +143,19 @@ function install() { echo '' } +function check_latest() { + [[ $cmd != run ]] && return 0 + [[ $NXF_OFFLINE == true || $NXF_DISABLE_CHECK_LATEST == true ]] && return 0 + local latest=$(get_ver "$NXF_BASE/$(current_ver)/version?current=$NXF_VER") + if [[ -n "$latest" && "$latest" != $NXF_VER ]]; then + echo_yellow "Nextflow $latest is available - Please consider updating your version to it" + fi +} + function launch_nextflow() { # the launch command line local cmdline=() - # remove leading and trailing double-quotes + # remove leading and trailing double-quotes for x in "${launcher[@]}"; do x="${x%\"}" x="${x#\"}" @@ -174,7 +195,6 @@ unset JAVA_TOOL_OPTIONS # parse the command line bg='' -dockerize='' declare -a jvmopts=() declare -a args=("$@") declare -a commands=(clone config drop help history info ls pull run view node console kuberun) @@ -190,15 +210,12 @@ while [[ $# != 0 ]]; do jvmopts+=("$1") fi ;; - -d|-dockerize) - if [[ ! "$cmd" && ! -f /.nextflow/dockerized ]]; then - dockerize=1 - fi - ;; -bg) - if [[ ! -f /.nextflow/dockerized ]]; then bg=1 - fi + ;; + -remote-debug) + echo_yellow "Enabling script debugging - continue the execution launching the remote VM debugger in your favourite IDE using port $NXF_REMOTE_DEBUG_PORT" + remote_debug=1 ;; -download) if [[ ! "$cmd" ]]; then @@ -222,20 +239,6 @@ while [[ $# != 0 ]]; do shift done -NXF_DOCKER_OPTS=${NXF_DOCKER_OPTS:=''} -if [[ "$dockerize" ]]; then - if [[ "$bg" ]]; then detach='--detach '; else detach=''; fi - NXF_ASSETS=${NXF_ASSETS:-${NXF_HOME:-$HOME/.nextflow}/assets} - mkdir -p "$NXF_ASSETS" - exec docker run $detach --rm --net host \ - -e NXF_ANSI_LOG=false \ - -e USER -e HOME -e NXF_ASSETS=$NXF_ASSETS -e NXF_USRMAP=$(id -u) -e NXF_DOCKER_OPTS='-u $(id -u)' \ - -v /var/run/docker.sock:/var/run/docker.sock \ - -v $HOME:$HOME:ro,Z -v $NXF_ASSETS:$NXF_ASSETS:Z -v $PWD:$PWD:Z -w $PWD $NXF_DOCKER_OPTS \ - nextflow/nextflow:$NXF_VER nextflow "${args[@]}" - exit 1 -fi - CAPSULE_LOG=${CAPSULE_LOG:=''} CAPSULE_RESET=${CAPSULE_RESET:=''} CAPSULE_CACHE_DIR=${CAPSULE_CACHE_DIR:="$NXF_HOME/capsule"} @@ -302,8 +305,8 @@ else version_check="^(1.7|1.8)" version_message="Java 7 or 8" else - version_check="^(1.8|9|10|11|12|13|14|15|16|17|18|19|20)" - version_message="Java 8 or later (up to 20)" + version_check="^(1.8|9|10|11|12|13|14|15|16|17|18|19|20|21)" + version_message="Java 8 or later (up to 21)" fi if [[ ! 
$JAVA_VER =~ $version_check ]]; then echo_red "ERROR: Cannot find Java or it's a wrong version -- please make sure that $version_message is installed" @@ -314,8 +317,8 @@ else fi exit 1 fi - if [[ ! $JAVA_VER =~ ^(11|12|13|14|15|16|17|18|19|20) ]]; then - echo_yellow "NOTE: Nextflow is not tested with Java $JAVA_VER -- It's recommended the use of version 11 up to 20\n" + if [[ ! $JAVA_VER =~ ^(11|12|13|14|15|16|17|18|19|20|21) ]]; then + echo_yellow "NOTE: Nextflow is not tested with Java $JAVA_VER -- It's recommended the use of version 11 up to 21\n" fi mkdir -p $(dirname "$JAVA_KEY") [[ -f $JAVA_VER ]] && echo $JAVA_VER > "$JAVA_KEY" @@ -337,6 +340,7 @@ if [[ $cmd == console ]]; then bg=1; else JAVA_OPTS+=(-Djava.awt.headless=true) fi +[[ "$JAVA_VER" =~ ^(21) ]] && [[ ! "$NXF_ENABLE_VIRTUAL_THREADS" ]] && NXF_ENABLE_VIRTUAL_THREADS=true [[ "$JAVA_HOME" ]] && JAVA_OPTS+=(-Dcapsule.java.home="$JAVA_HOME") [[ "$CAPSULE_LOG" ]] && JAVA_OPTS+=(-Dcapsule.log=$CAPSULE_LOG) [[ "$CAPSULE_RESET" ]] && JAVA_OPTS+=(-Dcapsule.reset=true) @@ -353,6 +357,7 @@ export NXF_PLUGINS_DIR export NXF_PLUGINS_MODE export NXF_PLUGINS_DEFAULT export NXF_PACK +export NXF_ENABLE_VIRTUAL_THREADS # lookup the a `md5` command if hash md5sum 2>/dev/null; then MD5=md5sum; @@ -380,16 +385,14 @@ $NXF_OPTS $NXF_GRAB $NXF_CLASSPATH $NXF_JVM_ARGS +$NXF_ENABLE_VIRTUAL_THREADS EOF } # checked if a cached classpath file exists and it newer that the nextflow boot jar file -if [[ -f /.nextflow/dockerized ]]; then - LAUNCH_FILE=/.nextflow/launch-classpath -else - LAUNCH_FILE="${NXF_LAUNCHER}/classpath-$(env_md5)" -fi -if [ -s "$LAUNCH_FILE" ] && [ "$LAUNCH_FILE" -nt "$NXF_BIN" ]; then +LAUNCH_FILE="${NXF_LAUNCHER}/classpath-$(env_md5)" + +if [ -s "$LAUNCH_FILE" ] && [ "$LAUNCH_FILE" -nt "$NXF_BIN" ] && [[ "$remote_debug" -ne 1 ]]; then declare -a launcher="($(cat "$LAUNCH_FILE"))" else # otherwise run the capsule and get the result classpath in the 'launcher' and save it to a file @@ -404,8 +407,11 @@ else cmd_base=(${BASH_REMATCH[1]}) cmd_tail=(${BASH_REMATCH[2]}) - if [[ "$JAVA_VER" =~ ^(9|10|11|12|13|14|15|16|17|18|19|20) ]]; then - launcher="${cmd_base[@]}" + launcher="${cmd_base[@]}" + [[ "$NXF_JVM_ARGS" ]] && launcher+=($NXF_JVM_ARGS) + [[ "$remote_debug" ]] && launcher+=(-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=$NXF_REMOTE_DEBUG_PORT) + + if [[ "$JAVA_VER" =~ ^(9|10|11|12|13|14|15|16|17|18|19|20|21) ]]; then launcher+=(--add-opens=java.base/java.lang=ALL-UNNAMED) launcher+=(--add-opens=java.base/java.io=ALL-UNNAMED) launcher+=(--add-opens=java.base/java.nio=ALL-UNNAMED) @@ -421,12 +427,15 @@ else launcher+=(--add-opens=java.base/sun.net.www.protocol.ftp=ALL-UNNAMED) launcher+=(--add-opens=java.base/sun.net.www.protocol.file=ALL-UNNAMED) launcher+=(--add-opens=java.base/jdk.internal.misc=ALL-UNNAMED) + launcher+=(--add-opens=java.base/jdk.internal.vm=ALL-UNNAMED) launcher+=(--add-opens=java.base/java.util.regex=ALL-UNNAMED) - [[ "$NXF_JVM_ARGS" ]] && launcher+=($NXF_JVM_ARGS) + if [[ "$NXF_ENABLE_VIRTUAL_THREADS" == 'true' ]]; then + if [[ "$JAVA_VER" =~ ^(19|20) ]]; then launcher+=(--enable-preview) + elif [[ ! 
"$JAVA_VER" =~ ^(21) ]]; then die "Virtual threads require Java 19 or later - current version $JAVA_VER" + fi + fi launcher+=("${cmd_tail[@]}") else - launcher="${cmd_base[@]}" - [[ "$NXF_JVM_ARGS" ]] && launcher+=($NXF_JVM_ARGS) launcher+=("${cmd_tail[@]}") fi @@ -434,7 +443,7 @@ else if mkdir -p "${NXF_LAUNCHER}" 2>/dev/null; then STR='' for x in "${launcher[@]}"; do - [[ "$x" != "\"-Duser.dir=$PWD\"" ]] && STR+="$x " + [[ "$x" != "\"-Duser.dir=$PWD\"" ]] && [[ ! "$x" == *"-agentlib:jdwp"* ]] && STR+="$x " done printf "$STR">"$LAUNCH_FILE" else @@ -443,5 +452,7 @@ else fi +# check for latest version +check_latest # finally run it launch_nextflow From c8dd9e68ade802c9e143e5cc49cbab4ca45c15ea Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 17 Oct 2023 15:16:44 +0000 Subject: [PATCH 144/157] Add Launch on Nextflow Tower shield Former-commit-id: 67f3807b5357ae91341ebdd389b7c442aa632e83 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4972b25..75cf366 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.10.0-23aa62.svg)](https://www.nextflow.io/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/singularity/) +[![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/harryhung/gps-unified-pipeline) The GPS Unified Pipeline is a Nextflow pipeline designed for processing raw reads (FASTQ files) of *Streptococcus pneumoniae* samples. After preprocessing, the pipeline performs initial assessment based on the total bases in reads. Passed samples will be further assess based on assembly, mapping, and taxonomy. If the sample passes all quality controls (QC), the pipeline also provides the sample's serotype, multi-locus sequence typing (MLST), lineage (based on the [Global Pneumococcal Sequence Cluster (GPSC)](https://www.pneumogen.net/gps/GPSC_lineages.html)), and antimicrobial resistance (AMR) against multiple antimicrobials. 
From 35585cfc79517578f4ac004b9e9b2714c22319cc Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 17 Oct 2023 15:48:35 +0000 Subject: [PATCH 145/157] Correct error message Former-commit-id: 7f5b7be4881b3bd85ec9b05f8115c926b0503523 --- bin/parse_other_resistance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/parse_other_resistance.py b/bin/parse_other_resistance.py index 2077664..84ff3d3 100755 --- a/bin/parse_other_resistance.py +++ b/bin/parse_other_resistance.py @@ -10,7 +10,7 @@ # Check argv and save to global variables if len(sys.argv) != 4: - sys.exit('Usage: get_other_resistance.py DEBUG_REPORT_PATH METADATA_PATH OUTPUT_FILE') + sys.exit('Usage: parse_other_resistance.py DEBUG_REPORT_PATH METADATA_PATH OUTPUT_FILE') DEBUG_REPORT_PATH = sys.argv[1] METADATA_PATH = sys.argv[2] From 8bdae113743fdd9c3b2bdd5082b3669d153e3567 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 23 Oct 2023 12:39:20 +0000 Subject: [PATCH 146/157] Add extra folA sequences Former-commit-id: 85f88cf31aca7dc3e43b41b894d362af0df66a4a --- data/ariba_metadata.tsv | 7 ++++ data/ariba_ref_sequences.fasta | 70 ++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/data/ariba_metadata.tsv b/data/ariba_metadata.tsv index f68826b..ecfed73 100644 --- a/data/ariba_metadata.tsv +++ b/data/ariba_metadata.tsv @@ -40,6 +40,13 @@ tetM_M85225 1 0 . . TET tetS_FN555436 1 0 . . TET tetM_MH283017 1 0 . . TET folA_AE007317 1 1 I100L . TMP +folA_4_14410_4_33 1 1 I100L . TMP +folA_3_29330_2_374 1 1 I100L . TMP +folA_2_29594_2_181 1 1 I100L . TMP +folA_5_21127_2_117 1 1 I100L . TMP +folA_6_28042_2_176 1 1 I100L . TMP +folA_7_28790_1_77 1 1 I100L . TMP +folA_9_11511_8_37 1 1 I100L . TMP folP_AE007317 1 1 . . SMX gyrA_AE007317 1 1 S81F . FQ gyrA_AE007317 1 1 S81Y . 
FQ diff --git a/data/ariba_ref_sequences.fasta b/data/ariba_ref_sequences.fasta index 0e57bf2..44c605c 100644 --- a/data/ariba_ref_sequences.fasta +++ b/data/ariba_ref_sequences.fasta @@ -75,6 +75,76 @@ AAAAGGATACCAGCCAGCTATTGGTAAATTTATTTGCCAACCCCGCCGCCCGAATAGCCGTATAGATAAG GTTCGGCATATGTTCCACAAGTTAGCTTAA >folA_AE007317 ATGACTAAGAAAATCGTAGCTATTTGGGCCCAGGATGAAGAGGGTTTGATTGGTAAGGAAAATCGTCTGCCTTGGCATTTGCCAGCAGAATTGCAGCACTTTAAAGAAACAACTCTGAATCATGCTATCTTGATGGGGCGTGTGACCTTTGATGGGATGGGGCGTCGCTTGCTTCCAAAACGGGAAACCCTGATTTTGACGCGTAATCCGGAAGAAAAGATAGATGGGGTTGCTACTTTTCAGGACGTCCAGTCTGTTCTTGACTGGTATCAGGATCAAGAAAAGAATCTCTACATTATCGGTGGGAAGCAAATTTTTCAGGCTTTTGAACCTTACCTTGATGAAGTGATTGTCACTCACATTCATGCTCGGGTGGAAGGAGATACCTATTTCCCTGAAGAGCTTGACTTGTCTCTTTTTGAGACAGTTTCAAGCAAATTTTACGCCAAAGATGAGAAGAATCCTTATGATTTTACCATCCAATACCGCAAGAGAAAGGAAGTCTAA +>folA_2_29594_2_181 +ATGACTAAGAAAATTGTAGCTATATGGGCCCAGGATGAAGAGGGTGTGATTGGTAAGGAC +AATCGTCTGCCTTGGTATTTGCCAGCAGAATTGCAACACTTTAAAGAAACAACTCTGAAT +CATGCCATCTTGATGGGGCGCGTGACCTTTGATGGGATGGGACGTCGCTTGCTTCCAAAA +AGGGAGACCCTGATTTTGACACGTAACAGGGAAGAAAAGATAGACGGAGTTACTACTTTT +TATGATGTCCAGTCTGTCTTGGACTGGTATCAGGCTCAAGACAAGAATCTTTATATTATC +GGTGGGAAGCAAATTTTTCAGGCTTTTGAATCCTATCTTGACGAAGTGATTGTGACTCAA +ATTCATGCTAAGGTTGAAGGAGATACTTATTTCCCTGAAGAATTTGACTTGTCTCTTTTT +GAGCCAGTTTCAAGCAAATCCTACACCAAAGATGAGAAGAATCCTTATGATTTTACCATC +CAATACCGCAAGAGAAAGGAAGTCTAA +>folA_3_29330_2_374 +ATGACAAAGAAAATTGTAGCTATTTGGGCCCAGGATGAAGAGGGGATAATCGGTAAGGAT +AATCGCTTGCCGTGGTATTTACCAGCAGAATTGCAACACTTTAAAGAAACCACTCTCAAT +CATGCTATTTTGATGGGACGAGTTACTTTTGATGGAATGGGTCGTCGTTTGCTTCCCAAA +CGGGAGACCTTGATTTTGACTCGTAATTCAGATGAAAAGATGGATGGAGTGAGTATTTTC +AATGATATAGAGTCTGTATTGGACTGGTATCAAAGTCAAGATAAGAATCTCTATGTCATT +GGTGGAAAGCAAATTTTTCAAGCTTTTGAACCCTACCTTGATGAAGTGATTGTGACTTAC +ATTCATGCTCGGGTGGAGGGAGATACCTATTTCCCTGAGGAGTTTGACTTATCTCTTTTT +GAGACAGTTTCAAGCAAATCCTATACCAAAGATGAGAAAAATCCTTATGATTTTACCATC +CAATACCGCAAGAGAAAGGAAGTCTAA +>folA_4_14410_4_33 +ATGACAAAGAAAATCATAGCCATTTGGGCTCAAGATGAAAAGGGTGTAATCGGGAAAGAG +GATCGTTTACCTTGGCATTTGCCAGCAGAGTTAAAACATTTTAAAGAAACAACCTTGAAT +CATGCCATTTTAATGGGTCGTGTAACCTTTGATGGGATAGGTCGACGATTGCTTCCAGGA +CGTGAAACCTTGATTTTGACGCGTAATCCCGAAGAAACAATCGATGGTGCTCTGGTCTTC +CAAAATGTTGAGGATGTTTTAGACTGGTATCATCATCAGGATAAAAATCTCTACATCATC +GGTGGCAAGCAAATTTTTCAGCTTTTTGAACCTTTCTTAGATGAGCTTATTGTGACACAA +ATTCATGCTCAAGTTGAAGGGGATACCTATTTCCCAGAAGATTTTGACTTGACTGCTTTT +GAAACTATTACAAGTAAATCTTATAGCAAAGATGAAAAAAATGCCTATGATTTTACCATC +GAATATAGAAAGAGAAAGGAAGTCTAA +>folA_5_21127_2_117 +ATGACTAAGAAAATTGTAGCTATTTGGGCCCAGGATGAAGAGGGTGTGATTGGTAAGGAC +AATCGTCTGCCTTGGTATTTACCAGCAGAACTGCAACACTTCAAAGAAACAACTCTGAAT +CATGCTATCTTGATGGGGCGCGTGACCTTTGATGGGATAGGGCGTCGCTTGCTTCCACAA +CGGGAAACTCTGATTTTGACGCGTAATTCTGAAGAAAAGATAGACGGGGTTACTACTTTT +CAGGATGTCCAGTCTGTCTTGGACTGGTATCAGGCTCAAGAAAAGAATCTTTATATTATC +GGTGGGAAGCAAATTTTTCAGGCTTTTGAACCCTACCTTGACGAAGTGATTGTGACTCAA +ATTCATGCTCGGGTGGAGGGAGATACCTATTTCCCTGAGGAGTTTGATTTGTCTCTTTTT +GAGACAGTTTCAAGCAAATCCTATACCAAAGATGAGAAAAATCCTTATGATTTTACCATC +CAATACCGCAAGAGAAAGGAAGTCTAA +>folA_6_28042_2_176 +ATGACTAAGAAAATCGTAGCTATTTGGGCCCAGGATGAAGAGGGTGTGATTGGTAAGGAC +AATCGTCTGCCTTGGTATTTGCCAGCAGAACTGCAACACTTCAAAGAAACAACTCTTAAT +CATGCTATCCTGATGGGGCGCGTGACCTTTGATGGGATGGGACGTCGCTTGCTTCCGAAA +CGGGAGACCTTGATTTTGACACGTAATTCTGAAGAAAAGATAGACGGAGTTGTTACTTTT +CATGATATCCAGTCTGTTCTCGACTGGTATCAGGGTCAAGACAAGAATCTCTATATTATC +GGTGGGAAGCAAATTTTTCAGGCTTTTGAGTCCTATTTGGATGAAGTGATTGTCACTCAC +ATTCATGCTCGGGTAGAGGGAGATACCTATTTCCCTGAGGAATTTGACTTATCTCTTTTT +GAGACAGTTTCAAGCAAATCCTATACCAAAGATGAGAAAAATCCTTATGATTTTACCATC 
+CAATACCGCAAGAGAAAGGAAGTCTAA +>folA_7_28790_1_77 +ATGACTAAGAAAATCGTAGCTATTTGGGCCCAGGATGAAGAGGGTGTGATTGGTAAGGAA +AATCGTCTGCCTTGGCATTTGCCAGCAGAATTGCAGCACTTTAAAGAAACAACTCTGAAT +CATGCTATCTTGATGGGGCGTGTGACCTTTGATGGGATGGGGCGTCGCTTGCTTCCAAAA +CGGGAAACCTTGATTTTGACGCGTAATCCGGAAGAAAAGATAGATGGGGTTACTACTTTT +TATGATGTCCAGTCTGTCTTGGACTGGTATCAGGATCAAGATAAGAATCTCTATATTATC +GGTGGAAAGCAAATTTTTCAGGCTTTTGAACCCTACCTTGACGAAGTGATTGTGACTCAA +ATTCATGCTCGGGTGGAGGGAGATACCTATTTCCCTGAAGAGTTTGACTTGTCCCTTTTT +GAGACAGTTTCAAGCAAATCCTATACCAAAGATGAGAAAAACCCTTATGATTTTACCATC +CAATACCGCAAGAGAAAGGAAGTCTAA +>folA_9_11511_8_37 +ATGACTAAGAAAATTGTAGCTATTTGGGCCCAGGATGAAGAGGGTGTGATTGGTAAGGAC +AATCGTCTGCCTTGGTATTTACCAGCAGAACTGCAACACTTCAAAGAAACAACTCTGAAT +CATGCTATCTTGATGGGGCGCGTGACCTTTGATGGGATGGGGCGTCGACTGCTTCCAAAA +CGGGAGACCTTGATTTTGACACGTAATTCAGAAGAAAAGATAGATGGGGTTACTACTTTT +TATGATGTTCAGTCTGTCTTGGACTGGTATCATGCTCAAGACAAGAATCTCTATATTATT +GGTGGGAAGCAAGTTTTTCAGGCTTTTGAACCCTACCTTGACGAAGTGATTGTGACTCAA +ATTCATGCTCGGGTGGAGGGAGATACCTATTTCCCTGAAGAGTTTGACTTGTCCCTTTTT +GAGACAGTTTCAAGCAAATCCTTTACCAAAGATGAGAAAAACCCTTATGATTTTACCATC +CAATACCGCAAGAGAAAGGAAGTCTAA >gyrA_AE007317 ATGCAGGATAAAAATTTAGTGAATGTCAATCTGACAAAGGAGATGAAGGCAAGTTTTATCGACTACGCCATGAGTGTTATCGTAGCGCGAGCTCTTCCTGATGTTCGAGATGGCTTAAAACCTGTTCACCGTCGCATTCTCTACGGAATGAATGAATTGGGTGTGACCCCAGACAAACCCCATAAAAAATCTGCTCGTATTACAGGGGATGTCATGGGTAAATACCACCCACACGGGGATTCCTCTATTTATGAAGCCATGGTCCGTATGGCTCAATGGTGGAGCTACCGTTACATGCTTGTAGATGGTCATGGGAATTTTGGTTCCATGGATGGAGATAGTGCTGCCGCTCAACGTTATACCGAGGCACGTATGAGCAAGATTGCTCTGGAAATGCTTCGTGATATCAACAAAAATACAGTTGATTTCGTTGATAACTATGATGCCAATGAACGGGAACCCTTGGTCTTGCCAGCGCGTTTTCCAAACCTTTTGGTTAATGGAGCAACTGGTATCGCGGTTGGGATGGCAACCAATATTCCACCTCATAATCTGGGTGAAACCATTGATGCAGTGAAGTTGGTCATGGATAATCCTGAAGTGACTACCAAGGACTTGATGGAAGTCTTGCCTGGACCAGATTTTCCAACTGGTGCTCTTGTCATGGGGAAATCAGGTATCCATAAGGCTTATGAAACAGGTAAAGGTTCGATTGTCCTACGTTCTCGTACAGAGATTGAAACGACTAAGACTGGTCGTGAGCGTATCGTTGTAACAGAATTTCCTTACATGGTCAATAAAACCAAGGTGCATGAGCATATTGTTCGCTTGGTTCAGGAAAAACGCATTGAGGGTATCACAGCAGTACGTGATGAGTCAAACCGTGAAGGTGTTCGATTTGTTATTGAAGTCAAGCGCGACGCCTCAGCCAATGTTATTCTCAATAACCTCTTCAAAATGACCCAAATGCAAACCAATTTTGGTTTCAATATGCTCGCTATCCAAAATGGTATACCGAAAATTTTGTCTCTTCGTCAGATTTTGGATGCTTATATTGAGCACCAAAAAGAAGTGGTTGTTCGTCGTACACGTTTTGATAAGGAAAAAGCGGAAGCGCGCGCTCATATCTTAGAAGGTCTCTTGATTGCGCTAGACCATATCGACGAAGTGATTCGTATCATCCGTGCTAGTGAAACGGATGCGGAAGCTCAAGCTGAGTTGATGAGCAAGTTTAAGCTTTCTGAACGTCAAAGTCAAGCTATCCTTGATATGCGTCTTCGTCGTTTGACAGGTTTGGAACGCGATAAGATTCAATCTGAGTATGATGACCTCTTGGCTCTGATTGCGGATTTAGCAGATATTCTTGCTAAGCCTGAACGTGTTTCTCAAATTATCAAAGACGAATTGGATGAAGTTAAACGTAAATTTTCTGATAAACGCCGTACAGAGTTGATGGTTGGACAGGTCTTGAGTCTCGAGGATGAGGACTTGATTGAAGAATCGGATGTCTTGATTACCCTTTCTAACAGAGGCTACATTAAGCGTTTGGATCAGGACGAGTTCACTGCTCAAAAACGTGGGGGTCGTGGTGTCCAAGGAACGGGAGTGAAAGATGATGACTTTGTTCGTGAGTTAGTGTCAACTAGTACCCATGATCATCTGCTCTTCTTCACAAACAAGGGACGTGTCTATCGTCTTAAAGGTTATGAAATTCCTGAGTATGGTCGGACTGCCAAAGGGCTACCAGTAGTCAATCTCTTGAAATTGGATGAAGACGAAAGTATTCAGACGGTTATCAATGTTGAGTCTGATCGCAGTGATGATGCTTATCTCTTCTTTACAACCCGTCACGGTATTGTGAAGAGAACCAGTGTTAAGGAGTTTGCCAATATTCGTCAAAATGGTCTCAAAGCGCTGAATTTAAAGGATGAAGATGAGTTAATCAATGTCTTGTTGGCAGAAGGAGATATGGATATTATCATTGGTACCAAGTTTGGTTATGCAGTTCGCTTTAATCAATCAGCCGTTCGTGGTATGAGCCGTATCGCCACTGGTGTGAAAGGTGTTAACCTTCGTGAAGGAGACACAGTTGTTGGTGCCAGCTTGATTACTGATCAAGATGAGGTTCTTATTATCACAGAAAAAGGATATGGTAAGCGTACAGTCGCTACTGAATACCCAACAAAAGGTCGTGGTGGTAAGGGAATGCAGACAGCTAAAATTACCGAAAAAAATGGCTTGCTGGCCGGTCTTATGACTGTTCAAGGGGATGAGGATTTGATGATTATCACTGATACAGGTGTCATGATTCGAACCAATCTTGCCAATATTTCACAAACAGGACGTGCAACTATGGGAGTTAAAGTAATGCGCCTGGATCAAGATGCTCAGATAGTGACTTTCACAACGGTTGCGGTGGCA
GAAAAAGAAGAAGTTGGGACAGAAAACGAAACAGAAGGTGAAGCATAA >gyrB_AE007317 From 3b8f04ae8e06578cc042e36114c9ac1a1a2e955c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 25 Oct 2023 15:45:53 +0000 Subject: [PATCH 147/157] Rewrite for variants' gene mapped depth check & FQ Former-commit-id: 1ac7fa4291def64f4bb617c9f5cc015868a52f98 --- bin/parse_other_resistance.py | 248 ++++++++++++++++++++++------------ 1 file changed, 161 insertions(+), 87 deletions(-) diff --git a/bin/parse_other_resistance.py b/bin/parse_other_resistance.py index 84ff3d3..3d957d2 100755 --- a/bin/parse_other_resistance.py +++ b/bin/parse_other_resistance.py @@ -4,123 +4,196 @@ import sys from collections import defaultdict -import pandas as pd +import re import csv +import pandas as pd -# Check argv and save to global variables +# Check argv if len(sys.argv) != 4: sys.exit('Usage: parse_other_resistance.py DEBUG_REPORT_PATH METADATA_PATH OUTPUT_FILE') +# Global Constants DEBUG_REPORT_PATH = sys.argv[1] METADATA_PATH = sys.argv[2] OUTPUT_FILE = sys.argv[3] +LOW_COVERAGE = "Low Coverage" # String for Low Coverage warning def main(): - targets_dict, hits_dict = prepare_dicts() - find_hits(targets_dict, hits_dict) - output = get_output(hits_dict) + target_dict = get_target() + hit_dict = find_hit(target_dict) + output = get_output(hit_dict) # Save output to OUTPUT_FILE in csv format pd.DataFrame([output]).to_csv(OUTPUT_FILE, index=False, quoting=csv.QUOTE_ALL) -# Prepare targets_dict for searching hits and hits_dict for saving hits -def prepare_dicts(): - # For saving (reference, gene, var_only) combinations as keys and their information found in metadata as values in dict format (i.e. {var_change: target}) - # Used to search whether there is a hit in the ARIBA result - targets_dict = defaultdict(dict) +# Saving all targets in metadata as key and their information as values +def get_target(): + df_metadata = pd.read_csv(METADATA_PATH, sep='\t') + target_dict = defaultdict(lambda: defaultdict(set)) - # For saving targets found in metadata as key and their determinants (i.e. 
hits) found in ARIBA result as values in set format
-    hits_dict = {}
+    # Add ref_group based on ref_name to the Dataframe
+    df_metadata['ref_group'] = df_metadata['ref_name'].apply(lambda x: x.split('_')[0])
+
+    # Handle each AMR target one-by-one
+    for target, df_target in df_metadata.groupby('target'):
+        # Create Dataframe slices with presence and variant mechanism respectively
+        df_target_presence = df_target[df_target["var_only"] == 0]
+        df_target_var = df_target[df_target["var_only"] == 1]
+
+        # Logic if presence of gene/non-gene is a mechanism of this target, add relevant ref_name to the existing set (create one by default if set does not exist)
+        if len(df_target_presence.index) != 0:
+            target_dict[target]['presence'].update(df_target_presence['ref_name'])

-    with open(METADATA_PATH) as metadata:
-        # Skip the header in metadata
-        next(metadata)
+        # Logic if variant of gene/non-gene is a mechanism of this target
+        if len(df_target_var.index) != 0:
+            target_dict[target]['variant'] = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))

-        # Go through lines in metadata and save findings to targets_dict and hits_dict
-        for line in (line.strip() for line in metadata):
-            # Extract useful fields
-            fields = [str(field) for field in line.split("\t")]
-            ref_name, gene, var_only, var_change, _, target = fields
+            # Further handle each ref_group one-by-one
+            for ref_group, df_ref_group in df_target_var.groupby('ref_group'):
+                # Each gene/non-gene group can only be gene or non-gene, cannot be both
+                is_gene = df_ref_group['gene'].unique()
+                if len(is_gene) != 1:
+                    raise Exception(f"Error: Conflicting information. {ref_group} is considered as both gene and non-gene in the provided ARIBA metadata.")
+                is_gene = is_gene[0]
+
+                # Save whether this ref_group is a gene or not
+                target_dict[target]['variant'][ref_group]['is_gene'] = bool(is_gene)

-            # Populating targets_dict
-            targets_dict[(ref_name, gene, var_only)].update({var_change: target})
-            # Populating hits_dict
-            hits_dict.update({target: set()})
-
-    return targets_dict, hits_dict
+                # Save variants of each individual gene/non-gene within the ref_group
+                for ref, df_ref in df_ref_group.groupby('ref_name'):
+                    target_dict[target]['variant'][ref_group]['ref'][ref].update(df_ref['var_change'])
+
+    return target_dict

 # Finding hits in ARIBA results based on targets_dict and save hits to hits_dict
-def find_hits(targets_dict, hits_dict):
-    with open(DEBUG_REPORT_PATH) as debug_report:
-        # Skip the header in debug report
-        next(debug_report)
-
-        # Go through lines in the debug report to detect targets
-        for line in (line.strip() for line in debug_report):
-            # Extract useful fields
-            fields = [str(field) for field in line.split("\t")]
-            ref_name, gene, var_only, ref_len, ref_base_assembled, ctg_cov, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[12], fields[16], fields[17], fields[19], fields[20], fields[21]
-
-            # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line
-            try:
-                target = targets_dict[(ref_name, gene, var_only)][known_var_change]
-            except KeyError:
-                continue
-
-            # If ref_base_assembled or ref_len or ctg_cov variables contain non-numeric value, skip the line
-            if not ref_base_assembled.isdigit() or not ref_len.isdigit() or not ctg_cov.replace('.', '', 1).isdigit():
-                continue
-
-            # Logic for gene detection, check coverage and mapped read depth.
- if var_only == "0" and int(ref_base_assembled)/int(ref_len) >= 0.8 and float(ctg_cov) >= 20: - hits_dict[target].add(f'{ref_name}') - - # Logic for variant detection, coverage check is not needed, but check for other criteria - if var_only == "1": - # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 166-201 (covering changes affecting aa 56 - 67) - if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'indels', 'ins', 'multiple') and (166 <= int(ref_start) <= 201 or 166 <= int(ref_end) <= 201): - pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' - hits_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') - # Common criteria: the assembly has that variant - elif has_known_var == "1": - hits_dict[target].add(f'{ref_name} Variant {known_var_change}') - - -# Generating final output dataframe based on hits_dict -def get_output(hits_dict): - # For saving final output, where information is saved per-target +def find_hit(target_dict): + df_report = pd.read_csv(DEBUG_REPORT_PATH, sep="\t") + + # Remove rows with non-numeric value in ref_base_assembled, ref_len, or ctg_cov + df_report['ref_base_assembled'] = pd.to_numeric(df_report['ref_base_assembled'], errors='coerce') + df_report['ref_len'] = pd.to_numeric(df_report['ref_len'], errors='coerce') + df_report['ctg_cov'] = pd.to_numeric(df_report['ctg_cov'], errors='coerce') + df_report.dropna(subset=['ref_base_assembled', 'ref_len', 'ctg_cov'], inplace=True) + + # Calculate reference coverage + df_report['coverage'] = df_report['ref_base_assembled'] / df_report['ref_len'] + + # Saving all targets in metadata as key and their determinants (i.e. hits) found in ARIBA result as values in set format + hit_dict = {target: set() for target in target_dict} + + # Handle each AMR target one-by-one + for target, target_content in target_dict.items(): + # Logic if presence of gene/non-gene is a mechanism of this target + if 'presence' in target_content: + for ref in target_content['presence']: + # Add refs that pass coverage and mapped read depth checks + df_report_hit = df_report[ + (df_report['ref_name'] == ref) & + (df_report['coverage'] >= 0.8) & + (df_report['ctg_cov'] >= 20) + ] + hit_dict[target].update(df_report_hit['ref_name']) + + # Logic if variant of gene/non-gene is a mechanism of this target + if 'variant' in target_content: + # Further handle each ref_group one-by-one + for ref_group, ref_group_content in target_content['variant'].items(): + # Create Dataframe slices of entries in ref_group only + df_ref_group = df_report[df_report['ref_name'].str.startswith(f"{ref_group}_")] + + # If ref_group is gene: + # - further slice Dataframe to include those with 10x mapped depth only + # - if no entry in ref_group has 10x+ depth, mark ref_group as Low Coverage and skip + if ref_group_content['is_gene']: + df_ref_group = df_ref_group[df_ref_group['ctg_cov'] >= 10] + if len(df_ref_group.index) == 0: + hit_dict[target].add(f'{ref_group} {LOW_COVERAGE}') + continue + + # folP ref_group specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 166-201 (covering changes affecting aa 56 - 67) + if ref_group.lower() == "folp": + df_ref_group_hit = df_ref_group[ + (df_ref_group["ref_ctg_effect"].str.lower().isin(['fshift', 'trunc', 'indel', 'indels', 'ins', 'multiple'])) & + 
(df_ref_group["ref_start"].astype(int).between(166, 201) | df_ref_group["ref_end"].astype(int).between(166, 201))
+                    ]
+                    for ref_start, ref_end, ref_name, ref_ctg_effect in df_ref_group_hit[['ref_start', 'ref_end', 'ref_name', 'ref_ctg_effect']].itertuples(index=False, name=None):
+                        pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}'
+                        hit_dict[target].add(f"{ref_name} {ref_ctg_effect} at {pos}")
+
+                # Criteria for other ref_group: known_var_change is one of the known variants and has_known_var is 1
+                else:
+                    # Handle each ref_name within the ref_group one-by-one
+                    for ref, vars in ref_group_content['ref'].items():
+                        df_ref_group_hit = df_ref_group[
+                            (df_ref_group['ref_name'] == ref) &
+                            (df_ref_group['known_var_change'].isin(vars)) &
+                            (df_ref_group['has_known_var'].astype(str) == '1')
+                        ]
+                        if len(df_ref_group_hit.index) != 0:
+                            for var_hit in df_ref_group_hit['known_var_change'].unique():
+                                hit_dict[target].add(f'{ref} Variant {var_hit}')
+
+    return hit_dict
+
+
-# Generating final output dataframe based on hits_dict
-def get_output(hits_dict):
+# Generating final output dataframe based on hit_dict
+def get_output(hit_dict):
     output = {}

-    # Go through targets in hits_dict
-    for target in hits_dict:
-        # If the target has no hit, set output as S or NEG (only for PILI-1/2), and determinant as _
-        if len(hits_dict[target]) == 0:
+    # Go through targets in hit_dict
+    for target in hit_dict:
+        # If the target has no hit, set output as S or NEG (for PILI-1/2), and determinant as _
+        if len(hit_dict[target]) == 0:
             if target.lower().startswith('pili'):
                 output[target] = 'NEG'
             else:
                 output[f'{target}_Res'] = 'S'
             output[f'{target}_Determinant'] = '_'
-        # If the target has hit, set output as R or POS (only for PILI-1/2), and join all hits as determinant
         else:
-            if target.lower().startswith('pili'):
-                output[target] = 'POS'
+            # FQ-specific criteria
+            if target.lower() == 'fq':
+                # If gyrA or gyrB is mutated, FQ is R
+                if any(re.match(rf"^gyr[AB](?!.*{LOW_COVERAGE}$).*$", determinant) for determinant in hit_dict[target]):
+                    output[f'{target}_Res'] = 'R'
+                # else if gyrA or gyrB have low coverage, FQ is Indeterminable as it cannot be determined whether it would be R or not
+                elif any(re.match(rf"^gyr[AB].*{LOW_COVERAGE}$", determinant) for determinant in hit_dict[target]):
+                    output[f'{target}_Res'] = 'Indeterminable'
+                # If parC or parE is mutated, FQ is I as gyrA or gyrB mutation already excluded
+                elif any(re.match(rf"^par[CE](?!.*{LOW_COVERAGE}$).*$", determinant) for determinant in hit_dict[target]):
+                    output[f'{target}_Res'] = 'I'
+                # else if parC or parE have low coverage, FQ is Indeterminable as it cannot be determined whether it would be I or not
+                elif any(re.match(rf"^par[CE].*{LOW_COVERAGE}$", determinant) for determinant in hit_dict[target]):
+                    output[f'{target}_Res'] = 'Indeterminable'
+                # Should only reach this part if all of gyrA, gyrB, parC, parE have good coverage and are not mutated, but other hit(s) exist
+                else:
+                    raise Exception(f"Error: Unexpected determinant scenario of {target}: {'; '.join(hit_dict[target])}")
+
+            # Criteria for other targets
             else:
-                output[f'{target}_Res'] = 'R'
+                # If all determinants have Low Coverage warning, set output as Indeterminable
+                if hit_dict[target] and all(re.match(rf"^.*{LOW_COVERAGE}$", determinant) for determinant in hit_dict[target]):
+                    output[f'{target}_Res'] = 'Indeterminable'
+                # If the target has a hit without Low Coverage warning, set output as R or POS (for PILI-1/2), and join all hits as determinant
+                else:
+                    if target.lower().startswith('pili'):
+                        output[target] = 'POS'
+                    else:
+                        output[f'{target}_Res'] = 'R'
-
output[f'{target}_Determinant'] = '; '.join(sorted(hits_dict[target])) + output[f'{target}_Determinant'] = '; '.join(sorted(hit_dict[target])) - add_output_special_cases(output, hits_dict) + add_inferred_results(output, hit_dict) return output -# Special cases to add to output -def add_output_special_cases(output, hits_dict): +# Inferred cases to add to output +def add_inferred_results(output, hit_dict): # If TET exists and DOX does not: add DOX to output; directly copy output and determinant if 'TET_Res' in output and 'DOX_Res' not in output: output['DOX_Res'] = output['TET_Res'] @@ -132,23 +205,24 @@ def add_output_special_cases(output, hits_dict): output['LFX_Determinant'] = output['FQ_Determinant'] # If both TMP and SMX exists, and COT does not: add COT to output. - # If R in both, COT is R; if R in one of them, COT is I; if S in both, COT is S - # Copy TMP_Determinant and SMX_Determinant to COT_Determinant if 'TMP_Res' in output and 'SMX_Res' in output and 'COT_Res' not in output: - if output['TMP_Res'] == 'R' and output['SMX_Res'] == 'R': + # If Indeterminable in either, COT is Indeterminable; If R in both, COT is R; if R in one of them, COT is I; if S in both, COT is S + if output['TMP_Res'] == 'Indeterminable' or output['SMX_Res'] == 'Indeterminable': + output['COT_Res'] = 'Indeterminable' + elif output['TMP_Res'] == 'R' and output['SMX_Res'] == 'R': output['COT_Res'] = 'R' - output['COT_Determinant'] = '; '.join(sorted(hits_dict['TMP'].union(hits_dict['SMX']))) elif (output['TMP_Res'] == 'R') ^ (output['SMX_Res'] == 'R'): output['COT_Res'] = 'I' - output['COT_Determinant'] = '; '.join(sorted(hits_dict['TMP'].union(hits_dict['SMX']))) elif output['TMP_Res'] == 'S' and output['SMX_Res'] == 'S': output['COT_Res'] = 'S' - output['COT_Determinant'] = '_' + + # Copy TMP_Determinant and SMX_Determinant to COT_Determinant + output['COT_Determinant'] = res if (res := '; '.join(sorted(hit_dict['TMP'].union(hit_dict['SMX'])))) else '_' # If ERY_CLI exists: add ERY and CLI to output. 
- # If ERY_CLI is R, ERY and CLI are R, and add ERY_CLI determinant to their determinants - # If ERY_CLI is S, ERY and CLI are S if they do not already exist, otherwise leave them unchanged if 'ERY_CLI_Res' in output: + # If ERY_CLI is R, ERY and CLI are R, and add ERY_CLI determinant to their determinants + # If ERY_CLI is S, ERY and CLI are S if they do not already exist, otherwise leave them unchanged if output['ERY_CLI_Res'] == 'R': output['ERY_Res'] = 'R' output['CLI_Res'] = 'R' @@ -156,8 +230,8 @@ def add_output_special_cases(output, hits_dict): output['ERY_Res'] = output['ERY_Res'] if 'ERY_Res' in output else 'S' output['CLI_Res'] = output['CLI_Res'] if 'CLI_Res' in output else 'S' - output['ERY_Determinant'] = '; '.join(sorted(hits_dict['ERY_CLI'].union(hits_dict['ERY']))) if 'ERY' in hits_dict and len(hits_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] - output['CLI_Determinant'] = '; '.join(sorted(hits_dict['ERY_CLI'].union(hits_dict['CLI']))) if 'CLI' in hits_dict and len(hits_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] + output['ERY_Determinant'] = '; '.join(sorted(hit_dict['ERY_CLI'].union(hit_dict['ERY']))) if 'ERY' in hit_dict and len(hit_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] + output['CLI_Determinant'] = '; '.join(sorted(hit_dict['ERY_CLI'].union(hit_dict['CLI']))) if 'CLI' in hit_dict and len(hit_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] if __name__ == "__main__": From ae061fe18437006833b21ad6fcd45e707ed16386 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 31 Oct 2023 12:35:15 +0000 Subject: [PATCH 148/157] Bump version to 1.0.0-rc1 Former-commit-id: f8637045096ff1e183c84c9574b0dbb6cdf468a2 --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index ec3e7e5..cae54d5 100644 --- a/main.nf +++ b/main.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow // Version of this release -pipelineVersion = '0.8.2' +pipelineVersion = '1.0.0-rc1' // Import workflow modules include { PIPELINE } from "$projectDir/workflows/pipeline" From e80ce6ec734ef38f1bb9c670e0cd676e3c3754f7 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 31 Oct 2023 16:18:24 +0000 Subject: [PATCH 149/157] Reflect changes in relational operators Former-commit-id: 1054ad6a471362657f39ca262f8eee1e46d8a755 --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 75cf366..28fde75 100644 --- a/README.md +++ b/README.md @@ -287,12 +287,12 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la | `Taxonomy_QC` | QC | Taxonomy quality control result | | `Overall_QC` | QC | Overall quality control result
(Based on `Assembly_QC`, `Mapping_QC` and `Taxonomy_QC`) |
 | `Bases` | Read | Number of bases in the reads<br>(Default: ≥ 38 Mb to pass Read QC) |
-| `Contigs#` | Assembly | Number of contigs in the assembly<br>(Default: < 500 to pass Assembly QC) |
+| `Contigs#` | Assembly | Number of contigs in the assembly<br>(Default: ≤ 500 to pass Assembly QC) |
 | `Assembly_Length` | Assembly | Total length of the assembly<br>(Default: 1.9 - 2.3 Mb to pass Assembly QC) |
 | `Seq_Depth` | Assembly | Sequencing depth of the assembly<br>(Default: ≥ 20x to pass Assembly QC) |
-| `Ref_Cov_%` | Mapping | Percentage of reference covered by reads<br>(Default: > 60% to pass Mapping QC) |
-| `Het-SNP#` | Mapping | Non-cluster heterozygous SNP (Het-SNP) site count<br>(Default: < 220 to pass Mapping QC) |
-| `S.Pneumo_%` | Taxonomy | Percentage of reads assigned to *Streptococcus pneumoniae*<br>(Default: > 60% to pass Taxonomy QC) |
+| `Ref_Cov_%` | Mapping | Percentage of reference covered by reads<br>(Default: ≥ 60% to pass Mapping QC) |
+| `Het-SNP#` | Mapping | Non-cluster heterozygous SNP (Het-SNP) site count<br>(Default: ≤ 220 to pass Mapping QC) |
+| `S.Pneumo_%` | Taxonomy | Percentage of reads assigned to *Streptococcus pneumoniae*<br>(Default: ≥ 60% to pass Taxonomy QC) |
 | `Top_Non-Strep_Genus` | Taxonomy | The most abundant non-*Streptococcus* genus in reads |
 | `Top_Non-Strep_Genus_%` | Taxonomy | Percentage of reads assigned to the most abundant non-*Streptococcus* genus<br>(Default: ≤ 2% to pass Taxonomy QC) |
 | `GPSC` | Lineage | GPSC Lineage |

From 9dd6d6901cf24d264b4a9e9916d9c03538a5b1a2 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Thu, 2 Nov 2023 17:17:43 +0000
Subject: [PATCH 150/157] Add warning about output overwrite

Former-commit-id: be7941ca0d3a821d1ec82b3ff31a655f14ea3ffe
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 28fde75..93dd21c 100644
--- a/README.md
+++ b/README.md
@@ -182,6 +182,8 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la
 | `--help` | `true` or `false`<br>(Default: `false`) | Show help message.<br>
Can be enabled by including `--help` without value. |

 ## Input and Output
+> ⚠️ `--output` overwrites existing results in the target directory if there are any
+> ⚠️ `--db` does not accept user-provided local databases; directory content will be overwritten

 | Option | Values | Description |

From dc1975b70c1820bf83436d6b193d37b1e12cd335 Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Tue, 7 Nov 2023 14:46:59 +0000
Subject: [PATCH 151/157] Add warning about Docker Desktop for Linux

Former-commit-id: a774fb8e2e84a61fca072f9ac28a1464d4366c6c
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 93dd21c..a61958d 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,8 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca
 ## Requirement
 - A POSIX-compatible system (e.g. Linux, macOS, Windows with [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)) with Bash 3.2 or later
 - Java 11 or later (up to 21) ([OpenJDK](https://openjdk.org/)/[Oracle Java](https://www.oracle.com/java/))
-- [Docker](https://www.docker.com/) or [Singularity](https://sylabs.io/singularity/)
+- [Docker](https://www.docker.com/) or [Singularity](https://sylabs.io/singularity/)/[Apptainer](https://apptainer.org/)
+  - For Linux, [Singularity](https://sylabs.io/singularity/)/[Apptainer](https://apptainer.org/) or [Docker Engine](https://docs.docker.com/engine/) is recommended over [Docker Desktop for Linux](https://docs.docker.com/desktop/). The latter is known to cause permission issues when running the pipeline on Linux.
 - It is recommended to have at least 16GB of RAM and 100GB of free storage
 > ℹ️ Details on storage
 > - The pipeline core files use < 1GB

From 5b01810a69376268dbab0daa7b4b94d3f4bb9dfd Mon Sep 17 00:00:00 2001
From: Harry Hung <4848896+HarryHung@users.noreply.github.com>
Date: Wed, 15 Nov 2023 16:51:55 +0000
Subject: [PATCH 152/157] Remove test input from the repository

Former-commit-id: d8243c8a14628a15f430d213a1cb8b4058178835
---
 test_input/17175_7#59_1.fastq.gz.REMOVED.git-id  | 1 -
 test_input/17175_7#59_2.fastq.gz.REMOVED.git-id  | 1 -
 test_input/21127_1#156_1.fastq.gz.REMOVED.git-id | 1 -
 test_input/21127_1#156_2.fastq.gz.REMOVED.git-id | 1 -
 test_input/9870_5#52_1.fastq.gz.REMOVED.git-id   | 1 -
 test_input/9870_5#52_2.fastq.gz.REMOVED.git-id   | 1 -
 6 files changed, 6 deletions(-)
 delete mode 100644 test_input/17175_7#59_1.fastq.gz.REMOVED.git-id
 delete mode 100644 test_input/17175_7#59_2.fastq.gz.REMOVED.git-id
 delete mode 100644 test_input/21127_1#156_1.fastq.gz.REMOVED.git-id
 delete mode 100644 test_input/21127_1#156_2.fastq.gz.REMOVED.git-id
 delete mode 100644 test_input/9870_5#52_1.fastq.gz.REMOVED.git-id
 delete mode 100644 test_input/9870_5#52_2.fastq.gz.REMOVED.git-id

diff --git a/test_input/17175_7#59_1.fastq.gz.REMOVED.git-id b/test_input/17175_7#59_1.fastq.gz.REMOVED.git-id
deleted file mode 100644
index 4a16c6a..0000000
--- a/test_input/17175_7#59_1.fastq.gz.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-75374c4cf798b2792d14e64d5208bf14993d9523
\ No newline at end of file
diff --git a/test_input/17175_7#59_2.fastq.gz.REMOVED.git-id b/test_input/17175_7#59_2.fastq.gz.REMOVED.git-id
deleted file mode 100644
index 95f5c6d..0000000
--- a/test_input/17175_7#59_2.fastq.gz.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-ce243511501675c9ed081f83e81a2d9784f770f8
\ No newline at end of file
diff --git a/test_input/21127_1#156_1.fastq.gz.REMOVED.git-id
b/test_input/21127_1#156_1.fastq.gz.REMOVED.git-id deleted file mode 100644 index dacba3e..0000000 --- a/test_input/21127_1#156_1.fastq.gz.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -d13d4dc97d9579d2e3cb2db5c2835e72b8bbdcc2 \ No newline at end of file diff --git a/test_input/21127_1#156_2.fastq.gz.REMOVED.git-id b/test_input/21127_1#156_2.fastq.gz.REMOVED.git-id deleted file mode 100644 index 51ec076..0000000 --- a/test_input/21127_1#156_2.fastq.gz.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -1c0d2e02831087a61cd116da18f219a254a612dd \ No newline at end of file diff --git a/test_input/9870_5#52_1.fastq.gz.REMOVED.git-id b/test_input/9870_5#52_1.fastq.gz.REMOVED.git-id deleted file mode 100644 index 3bf70d3..0000000 --- a/test_input/9870_5#52_1.fastq.gz.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -ae4998adb7e1725224497ddc7dffe8cc72c4f013 \ No newline at end of file diff --git a/test_input/9870_5#52_2.fastq.gz.REMOVED.git-id b/test_input/9870_5#52_2.fastq.gz.REMOVED.git-id deleted file mode 100644 index 029c57c..0000000 --- a/test_input/9870_5#52_2.fastq.gz.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -512f0c32c025f6e4b82ad88dc55ea5e83a5cada9 \ No newline at end of file From c6faa0c2e8dba6cc2a69967adb43befde0344877 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 15 Nov 2023 16:52:19 +0000 Subject: [PATCH 153/157] Ignore test_input Former-commit-id: 2a92d7ca148a7c01ab32d5e7de748b1e355afc85 --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 904f7e3..7ea9fb9 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ work databases input output +test_input *.html # Singularity cache From bdef2de95ca260c57413a14ec073f99200af8641 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 15 Nov 2023 18:00:50 +0000 Subject: [PATCH 154/157] Script for downloading test input Former-commit-id: e06eecbe95ec00430d7cc245b234fd2f8d80e975 --- download_test_input | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100755 download_test_input diff --git a/download_test_input b/download_test_input new file mode 100755 index 0000000..85d988e --- /dev/null +++ b/download_test_input @@ -0,0 +1,29 @@ +#! /bin/sh + +pipelineDir=$(dirname -- "$(readlink -f -- "$0")") +URL="https://gps-project.cog.sanger.ac.uk/gup_test_input.tar.gz" +TARGET="test_input.tar.gz" + +cd $pipelineDir + +if command -v curl > /dev/null 2>&1; then + GET="curl -fsSL '$URL' -o '$TARGET'" +elif command -v wget > /dev/null 2>&1; then + GET="wget '$URL' -O '$TARGET' >/dev/null 2>&1" +else + echo "ERROR: Cannot find 'curl' nor 'wget' utility -- please install one of them" + echo "" + echo "Alternatively you can try to download and extract content of this file:" + echo "$URL" + exit 1 +fi + +echo "Downloading... Please wait ..." +eval $GET && tar -xf $TARGET && rm -f $TARGET; status=$? + +if [ $status -ne 0 ]; then + echo "ERROR: Cannot download the test input -- make sure you can connect to the internet" + exit 1 +else + echo "Download completed." 
+fi From 1d3a064614231e733bb9b87cefeac5e86c13415e Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:37:34 +0000 Subject: [PATCH 155/157] Update content for optional test data Former-commit-id: b8c3e484b827e69e7973baf2798a4f7b06ed90b7 --- README.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a61958d..53a2d3c 100644 --- a/README.md +++ b/README.md @@ -73,8 +73,8 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ``` or - Download and unzip the [latest release](https://github.com/HarryHung/gps-unified-pipeline/releases) -2. Go into the local copy of the repository and the pipeline is ready to use without installation + Download and unzip/extract the [latest release](https://github.com/HarryHung/gps-unified-pipeline/releases) +2. Go into the local directory of the pipeline and it is ready to use without installation (the directory name might be different) ``` cd gps-unified-pipeline ``` @@ -95,7 +95,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca > ⚠️ If this is the first run and initialisation was not performed, an Internet connection is required. > ℹ️ By default, Docker is used as the container engine and all the processes are executed by the local machine. See [Profile](#profile) for details on running the pipeline with Singularity or on a HPC cluster. -- You can run the pipeline without options. It will attempt to get the raw reads from the default location (i.e. `input` directory inside the `gps-unified-pipeline` local repository) +- You can run the pipeline without options. It will attempt to get the raw reads from the default location (i.e. `input` directory inside the `gps-unified-pipeline` local directory) ``` ./run_pipeline ``` @@ -103,8 +103,9 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ``` ./run_pipeline --reads /path/to/raw-reads-directory ``` -- For a test run, you could use the included test reads in the `test_input` directory +- For a test run, you could obtain a small test dataset by running the included `download_test_input` script. The dataset will be saved to the `test_input` directory inside the pipeline local directory. 
You can then run the pipeline on the test data
   ```
+  ./download_test_input
   ./run_pipeline --reads test_input
   ```
 - `9870_5#52` will fail the Taxonomy QC and hence Overall QC, and therefore has no analysis results
@@ -141,12 +142,12 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca
 - If the run has been completed and you do not intend to use the `-resume` option or those intermediate files, you can remove the intermediate files using one of the following ways:
   - Run the included `clean_pipeline` script
     - It runs the commands in manual removal for you
-    - It removes the `work` directory and log files within the `gps-unified-pipeline` local repository
+    - It removes the `work` directory and log files within the `gps-unified-pipeline` local directory
     ```
     ./clean_pipeline
     ```
   - Manual removal
-    - Remove the `work` directory and log files within the `gps-unified-pipeline` local repository
+    - Remove the `work` directory and log files within the `gps-unified-pipeline` local directory
     ```
     rm -rf work
     rm -rf .nextflow.log*
     ```
@@ -169,9 +170,9 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la
 ```
 ./run_pipeline [option] [value]
 ```
-> ℹ️ To permanently change the value of an option, edit the `nextflow.config` file inside the `gps-unified-pipeline` local repository.
+> ℹ️ To permanently change the value of an option, edit the `nextflow.config` file inside the `gps-unified-pipeline` local directory.

-> ℹ️ `$projectDir` is a [Nextflow built-in implicit variables](https://www.nextflow.io/docs/latest/script.html?highlight=projectdir#implicit-variables), it is defined as the directory where the `gps-unified-pipeline` local repository is stored.
+> ℹ️ `$projectDir` is a [Nextflow built-in implicit variable](https://www.nextflow.io/docs/latest/script.html?highlight=projectdir#implicit-variables); it is defined as the local directory of `gps-unified-pipeline`.

 > ℹ️ Pipeline options are not built-in Nextflow options; they are led with `--` instead of `-`

@@ -256,7 +257,7 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la
 | `--lite` | `true` or `false`<br>
(Default: `false`) | ⚠️ Enabling this option breaks the Nextflow resume function.<br>Reduce storage requirement by removing intermediate `.sam` and `.bam` files once they are no longer needed while the pipeline is still running.<br>The exact amount of storage saved cannot be guaranteed.<br>
Can be enabled by including `--lite` without value. | # Output -- By default, the pipeline outputs the results into the `output` directory inside the `gps-unified-pipeline` local repository +- By default, the pipeline outputs the results into the `output` directory inside the `gps-unified-pipeline` local directory - It can be changed by adding the option `--output` ``` ./run_pipeline --output /path/to/output-directory From 386baaa5f6f3fae614dbefb6de2cdf5c4c7bce6c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 17 Nov 2023 00:50:57 +0000 Subject: [PATCH 156/157] Fix: ref_start and ref_end could be non-numeric Former-commit-id: ec4d97362dd007e1c683ed8c5cc12644e08bc6ea --- bin/parse_other_resistance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/parse_other_resistance.py b/bin/parse_other_resistance.py index 3d957d2..def5488 100755 --- a/bin/parse_other_resistance.py +++ b/bin/parse_other_resistance.py @@ -118,7 +118,7 @@ def find_hit(target_dict): if ref_group.lower() == "folp": df_ref_group_hit = df_ref_group[ (df_ref_group["ref_ctg_effect"].str.lower().isin(['fshift', 'trunc', 'indel', 'indels', 'ins', 'multiple'])) & - (df_ref_group["ref_start"].astype(int).between(166, 201) | df_ref_group["ref_end"].astype(int).between(166, 201)) + (df_ref_group["ref_start"].apply(pd.to_numeric, errors='coerce').between(166, 201) | df_ref_group["ref_end"].apply(pd.to_numeric, errors='coerce').between(166, 201)) ] for ref_start, ref_end, ref_name, ref_ctg_effect in df_ref_group_hit[['ref_start', 'ref_end', 'ref_name', 'ref_ctg_effect']].itertuples(index=False, name=None): pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' From 2d38b1982d6109f312f015bfc69d7b6f88c5ed8d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 17 Nov 2023 12:05:56 +0000 Subject: [PATCH 157/157] Bump version to 1.0.0-rc2 Former-commit-id: c11bb4b2cd2688ee8c37de1133f5f2c611ea47c1 --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index cae54d5..741abd5 100644 --- a/main.nf +++ b/main.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow // Version of this release -pipelineVersion = '1.0.0-rc1' +pipelineVersion = '1.0.0-rc2' // Import workflow modules include { PIPELINE } from "$projectDir/workflows/pipeline"
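A closing note on the `ref_start`/`ref_end` fix in [PATCH 156]: ARIBA debug reports can carry non-numeric placeholders (such as `.`) in those columns, so `astype(int)` raises a `ValueError`, whereas `pd.to_numeric(..., errors='coerce')` maps such values to `NaN`, which `between()` then evaluates as `False`, quietly excluding the row. A small self-contained illustration (the sample values are invented, not taken from a real report):

```python
import pandas as pd

# Invented ref_start values mimicking an ARIBA debug report; the '.'
# placeholder is what made the old astype(int) call raise ValueError.
ref_start = pd.Series(["170", ".", "300"])

numeric = pd.to_numeric(ref_start, errors="coerce")  # 170.0, NaN, 300.0
in_folp_window = numeric.between(166, 201)           # NaN compares as False

print(in_folp_window.tolist())  # [True, False, False]
```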