From 33bdad08989c27f6a77ecd59a34125758a7f877c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 21 Jun 2023 13:32:29 +0000 Subject: [PATCH 001/157] Initial implementation of ARIBA Former-commit-id: 98d9289b72bcf6f111b9f56de90d71c3df70c8fc --- data/ariba_metadata.tsv | 73 ++++ data/ariba_sequences.fasta | 715 +++++++++++++++++++++++++++++++++++++ modules/amr.nf | 43 ++- nextflow.config | 4 +- workflows/pipeline.nf | 15 +- 5 files changed, 828 insertions(+), 22 deletions(-) create mode 100644 data/ariba_metadata.tsv create mode 100644 data/ariba_sequences.fasta diff --git a/data/ariba_metadata.tsv b/data/ariba_metadata.tsv new file mode 100644 index 0000000..2a73517 --- /dev/null +++ b/data/ariba_metadata.tsv @@ -0,0 +1,73 @@ +reference coding_yes(1)_no(0) pr/ab(0)_var(1) description of the variant Group FreeText_Drug +aph_3prime_III_1_M26832 1 0 . . Kanamycin resistance +ermB_1_JN899585 1 0 . . Erythromycin and Clindamycin resistance +ermB_10_U86375 1 0 . . Erythromycin and Clindamycin resistance +ermB_16_X82819 1 0 . . Erythromycin and Clindamycin resistance +ermB_20_AF109075 1 0 . . Erythromycin and Clindamycin resistance +ermC_13_M13761 1 0 . . Erythromycin and Clindamycin resistance +cat_5_U35036 1 0 . . Chloramphenicol resistance +catpC194_1_NC_002013 1 0 . . Chloramphenicol resistance +catpC233_1_AY355285 1 0 . . Chloramphenicol resistance +catQ_1_M55620 1 0 . . Chloramphenicol resistance +msrD_2_AF274302 1 0 . . Erythromycin resistance +msrD_3_AF227520 1 0 . . Erythromycin resistance +mefA_10_AF376746 1 0 . . Erythromycin resistance +mefE_AE007317 1 0 . . Erythromycin resistance +tetM_1_X92947 1 0 . . Tetracycline resistance +tetM_12_FR671418 1 0 . . Tetracycline resistance +tetK_4_U38428 1 0 . . Tetracycline resistance +tetM_13_AM990992 1 0 . . Tetracycline resistance +tetM_2_X90939 1 0 . . Tetracycline resistance +tetM_4_X75073 1 0 . . Tetracycline resistance +tetM_5_U58985 1 0 . . 
Tetracycline resistance +tetM_8_X04388 1 0 . . Tetracycline resistance +tetS_M 1 0 . . Tetracycline resistance +tetS_M_MH283012 1 0 . . Tetracycline resistance +tetAp_L20800 1 0 . . Tetracycline resistance +tetBp_L20800 1 0 . . Tetracycline resistance +tetAQ2_Z21523 1 0 . . Tetracycline resistance +tetS_FN555436 1 0 . . Tetracycline resistance +tetT_L42544 1 0 . . Tetracycline resistance +tetW_AJ222769 1 0 . . Tetracycline resistance +tet32_AJ295238 1 0 . . Tetracycline resistance +tet36_AJ514254 1 0 . . Tetracycline resistance +tet44_FN594949 1 0 . . Tetracycline resistance +tet58_KY887560 1 0 . . Tetracycline resistance +tet_M74049 1 0 . . Tetracycline resistance +tetS_M_HM367711 1 0 . . Tetracycline resistance +tetS_M_AY534326 1 0 . . Tetracycline resistance +tetM_M85225 1 0 . . Tetracycline resistance +tetS_FN555436 1 0 . . Tetracycline resistance +tetM_MH283017 1 0 . . Tetracycline resistance +folA_AE007317 1 1 I100L . Trimethoprim +folP_AE007317 1 1 . . Sulfamethoxazole resistance only if insertions in 56-67 amino acids +gyrA_AE007317 1 1 S81F . Fluoroquinolone +gyrA_AE007317 1 1 S81Y . Fluoroquinolone +gyrA_AE007317 1 1 S81C . Fluoroquinolone +gyrA_AE007317 1 1 S81I . Fluoroquinolone +gyrA_AE007317 1 1 E85K . Fluoroquinolone +gyrA_AE007317 1 1 Q118A . Fluoroquinolone +gyrB_AE007317 1 1 E474K . Fluoroquinolone +parC_AE007317 1 1 A63T . Fluoroquinolone +parC_AE007317 1 1 S79F . Fluoroquinolone +parC_AE007317 1 1 S79Y . Fluoroquinolone +parC_AE007317 1 1 S79L . Fluoroquinolone +parC_AE007317 1 1 S79F . Fluoroquinolone +parC_AE007317 1 1 D83G . Fluoroquinolone +parC_AE007317 1 1 D83N . Fluoroquinolone +parE_AE007317 1 1 E474K . Fluoroquinolone +parE_AE007317 1 1 D435N . Fluoroquinolone +parE_AE007317 1 1 D435H . Fluoroquinolone +parE_AE007317 1 1 P454S . Fluoroquinolone +tetO_Y07780 1 0 . . Tetracycline resistance +ermBups_HG799494 0 0 . . Erythromycin and Clindamycin resistance +ermbTr_CP002121 0 0 . . Erythromycin and Clindamycin resistance +rplD_AE007317 1 1 .
. Linezolid resistance (deletion within the L4 region of the gene PMID:24492357) +rpoB_AE007317 1 1 D489E . rifampicin resistance PMID:10508007-D415E +rpoB_AE007317 1 1 H499N . rifampicin resistance PMID:10508007-H425N +rpoB_AE007317 1 1 D489N . rifampicin resistance PMID:10508007-H415N +vanB_KC489787 1 0 . . Vancomycin resistance +vanD_EU999036 1 0 . . Vancomycin resistance +vanE_FJ872411 1 0 . . Vancomycin resistance +vanG_KF704242 1 0 . . Vancomycin resistance +otrA_X53401 1 0 . . Tetracycline resistance \ No newline at end of file diff --git a/data/ariba_sequences.fasta b/data/ariba_sequences.fasta new file mode 100644 index 0000000..8da1617 --- /dev/null +++ b/data/ariba_sequences.fasta @@ -0,0 +1,715 @@ +>aph_3prime_III_1_M26832 +ATGGCTAAAATGAGAATATCACCGGAATTGAAAAAACTGATCGAAAAATACCGCTGCGTAAAAGATACGGAAGGAATGTCTCCTGCTAAGGTATATAAGCTGGTGGGAGAAAATGAAAACCTATATTTAAAAATGACGGACAGCCGGTATAAAGGGACCACCTATGATGTGGAACGGGAAAAGGACATGATGCTATGGCTGGAAGGAAAGCTGCCTGTTCCAAAGGTCCTGCACTTTGAACGGCATGATGGCTGGAGCAATCTGCTCATGAGTGAGGCCGATGGCGTCCTTTGCTCGGAAGAGTATGAAGATGAACAAAGCCCTGAAAAGATTATCGAGCTGTATGCGGAGTGCATCAGGCTCTTTCACTCCATCGACATATCGGATTGTCCCTATACGAATAGCTTAGACAGCCGCTTAGCCGAATTGGATTACTTACTGAATAACGATCTGGCCGATGTGGATTGCGAAAACTGGGAAGAAGACACTCCATTTAAAGATCCGCGCGAGCTGTATGATTTTTTAAAGACGGAAAAGCCCGAAGAGGAACTTGTCTTTTCCCACGGCGACCTGGGAGACAGCAACATCTTTGTGAAAGATGGCAAAGTAAGTGGCTTTATTGATCTTGGGAGAAGCGGCAGGGCGGACAAGTGGTATGACATTGCCTTCTGCGTCCGGTCGATCAGGGAGGATATCGGGGAAGAACAGTATGTCGAGCTATTTTTTGACTTACTGGGGATCAAGCCTGATTGGGAGAAAATAAAATATTATATTTTACTGGATGAATTGTTTTAG +>ermB_1_JN899585
+ATGAACAAAAATATAAAATATTCTCAAAACTTTTTAACGAGTGAAAAAGTACTCAACCAAATAATAAAACAATTGAATTTAAAAGAAACCGATACCGTTTACGAAATTGGAACAGGTAAAGGGCATTTAACGACGAAACTGGCTAAAATAAGTAAACAGGTAACGTCTATTGAATTAGACAGTCATCTATTCAACTTATCGTCAGAAAAATTAAAACTGAATACTCGTGTCACTTTAATTCACCAAGATATTCTACAGTTTCAATTCCCTAACAAACAGAGGTATAAAATTGTTGGGAATATTCCTTACCATTTAAGCACACAAATTATTAAAAAAGTGGTTTTTGAAAGCCATGCGTCTGACATCTATCTGATTGTTGAAGAAGGATTCTACAAGCGTACCTTGGATATTCACCGAACACTAGGGTTGCTCTTGCACACTCAAGTCTCGATTCAGCAATTGCTTAAGCTGCCAGCGGAATGCTTTCATCCTAAACCAAAAGTAAACAGTGTCTTAATAAAACTTACCCGCCATACCACAGATGTTCCAGATAAATATTGGAAGCTATATACGTACTTTGTTTCAAAATGGGTCAATCGAGAATATCGTCAACTGTTTACTAAAAATCAGTTTCATCAAGCAATGAAACACGCCAAAGTAAACAATTTAAGTACCGTTACTTATGAGCAAGTATTGTCTATTTTTAATAGTTATCTATTATTTAACGGGAGGAAATAA +>ermB_10_U86375 +ATGAACAAAAATATAAAATATTCTCAAAACTTTTTAACGAGTGAAAAAGTACTCAACCAAATAATAAAACAATTGAATTTAAAAGAAACCGATACCGTTTACGAAATTGGAACAGGTAAAGGGCATTTAACGACGAAACTGGCTAAAATAAGTAAACAGGTAACGTCTATTGAATTAGACAGTCATCTATTCAACTTATCGTCAGAAAAATTAAAACTGAACATTCGTGTCACTTTAATTCACCAAGATATTCTACAGTTTCAATTCCCTAACAAACAGAGGTATAAAATTGTTGGGAATATTCCTTACCATTTAAGCACACAAATTATTAAAAAAGTGGTTTTTGAAAGCCATGCGTCTGACATCTATCTGATTGTTGAAGAAGGATTCTACAAGCGTACCTTGGATATTCACCGAACACTAGGGTTGCTCTTGCACACTCAAGTCTCGATTCAGCAATTGCTTAAGCTGCCAGCGGAATGCTTTCATCCTAAACCAAAAGTAAACAGTGTCTTAATAAAACTTACCCGCCATACCACAGATGTTCCAGATAAATATTGGAAGCTATATACGTACTTTGTTTCAAAATGGGTCAATCGAGAATATCGTCAACTGTTTACTAAAAATCAGTTTCATCAAGCAATGAAACACGCCAAAGTAAACAATTTAAGTACCGTTACTTATGAGCAAGTATTGTCTATTTTTAATAGTTATCTATTATTTAACGGGAGGAAATAA +>ermB_16_X82819 
+ATGAACAAAAATATAAAATATTCTCAAAACTTTTTAACGAGTGAAAAAGTACTCAACCAAATAATAAAACAATTGAATTTAAAAGAAACCGATACCGTTTACGAAATTGGAACAGGTAAAGGGCATTTAACGACGAAACTGGCTAAAATAAGTAAACAGGTAACGTCTATTGAATTAGACAGTCATCTATTCAACTTATCGTCAGAAAAATTAAAACTGAATACTCGTGTCACTTTAATTCACCAAGATATTCTACAGTTTCAATTCCCTAACAAACAGAGGTATAAAATTGTTGGGAATATTCCTTACCATTTAAGCACACAAATTATTAAAAAAGTGGTTTTTGAAAGCCGTGCGTCTGACATCTATCTGATTGTTGAAGAAGGATTCTACAAGCGTACCTTGGATATTCACCGAACACTAGGGTTGCTCTTGCACACTCAAGTCTCGATTCAGCAATTGCTTAAGCTGCCAGCGGAATGCTTTCATCCTAAACCAAAAGTAAACAGTGTCTTAATAAAACTTACCCGCCATACCACAGATGTTCCAGATAAATATTGGAAGCTATATACGTACTTTGTTTCAAAATGGGTCAATCGAGAATATCGTCAACTGTTTACTAAAAATCAGTTTCATCAAGCAATGAAACACGCCAAAGTAAACAATTTAAGTACCATTACTTATGAGCAAGTATTGTCTATTTTTAATAGTTATCTATTATTTAACGGGAGGAAATAA +>ermB_20_AF109075 +ATGAACAAAAATATAAAATATTCTCAAAACTTTTTAACGAGTGAAAAAGTACTCAACCAAATAATAAAACAATTGAATTTAAAAGAAACCGATACCGTTTACGAAATTGGAACAGGTAAAGGGCATTTAACGACGAAACTGGCTAAAATAAGTAAACAGGTAACGTCTATTGAATTAGACAGTCATCTATTCAACTTATCGTCAGAAAAATTAAAACTGAATACTCGTGTCACTTTAATTCACCAAGATATTCTACAGTTTCAATTCCCTAACAAACAGAGGTATAAAATTGTTGGGAGTATTCCTTACCATTTAAGCACACAAATTATTAAAAAAGTGGTTTTTGAAAGCCGTGCGTCTGACATCTATCTGATTGTTGAAGAAGGATTCTACAAGCGTACCTTGGATATTCACCGAACACTAGGGTTGCTCTTGCACACTCAAGTCTCGATTAAGCAATTGCTTAAGCTGCCAGCTGAATGCTTTCATCCTAAACCAAAAGTAAACAGTGCCTTAATAAAACTTACCCGCCATACCACAGATGTTCCAGATAAATATTGGAAACTATATACGTACTTTGTTTCAAAATGGGTCAATCGAGAATATCGTCAACTGTTTACTAAAAATCAGTTTCATCAAGCAATGAAATACGCCAAAGTAAACGATTTAAGTACCGTTACTTATGAGCAAGTATTGTCTATTTTTAATAGTTATCTATTATTTAACGGGAGGAAATAA +>ermC_13_M13761 
+ATGAACGAGAAAAATATAAAACACAGTCAAAACTTTATTACTTCAAAACATAATATAGATAAAATAATGACAAATATAAGATTAAATGAACATGATAATATCTTTGAAATCGGCTCAGGAAAAGGGCATTTTACCCTTGAATTAGTACAGAGGTGTAATTTCGTAACTGCCATTGAAATAGACCATAAATTATGCAAAACTACAGAAAATAAACTTGTTGATCACGATAATTTCCAAGTTTTAAACAAGGATATATTGCAGTTTAAATTTCCTAAAAACCAATCCTATAAAATATTTGGTAATATACCTTATAACATAAGTACGGATATAATACGCAAAATTGTTTTTGATAGTATAGCTGATGAGATTTATTTAATCGTGGAATACGGGTTTGCTAAAAGATTATTAAATACAAAACGCTCATTGGCATTATTTTTAATGGCAGAAGTTGATATTTCTATATTAAGTATGGTTCCAAGAGAATATTTTCATCCTAAACCTAAAGTGAATAGCTCACTTATCAGATTAAATAGAAAAAAATCAAGAATATCACACAAAGATAAACAGAAGTATAATTATTTCGTTATGAAATGGGTTAACAAAGAATACAAGAAAATATTTACAAAAAATCAATTTAACAATTCCTTAAAACATGCAGGAATTGACGATTTAAACAATATTAGCTTTGAACAATTCTTATCTCTTTTCAATAGCTATAAATTATTTAATAAGTAA +>cat_5_U35036 +ATGACTTTTAATATTATTAATTTGGAAACTTGGGATAGAAAAGAATATTTTAATCATTATTTCAATCAACAAACAACTTACAGTGTTACTAAAGAATTTGATATCACTTTACTTAAAAGTATGATAAAAAATAAAGGATATGAACTGTATCCTGCTTTGATTTATACAATTGTAAATATTATAAATCAAAATAAAGTATTTAGAACAGGAATTAATAGTGAGGGAAATTTGGGTTATTGGGATAAATTAAACCCTTTATATACAGTCTTTAATAAAGAAACTGAAAAATTTTCTAACATTTGGACAGAATCAAATGTTAGTTTTAATTCTTTTTATAATAGTTATAAGAGTGACTTACTTGAATATAAAGATAAAAATGAAATGTTTCCTAAAAAACCAATACCTGAAAACACAGTTCCTATTTCGATGATTCCTTGGATTGATTTTAGTTCATTTAATTTAAATATTGGTAATAATAGTAGATTCCTATTGCCAATTATTACAATAGGTAAATTTTATAGTAAGAATAATAAGATCTATTTACCAGTCTCATTGCAAGTTCATCATGCGGTATGTGATGGTTACCATGTTTCATTATTTATGAGTGAATTTCAAAATATAGTTGATAGTGTAAATGAATGGATTTAA +>catpC194_1_NC_002013 
+ATGAACTTTAATAAAATTGATTTAGACAATTGGAAGAGAAAAGAGATATTTAATCATTATTTGAACCAACAAACGACTTTTAGTATAACCACAGAAATTGATATTAGTGTTTTATACCGAAACATAAAACAAGAAGGATATAAATTTTACCCTGCATTTATTTTCTTAGTGACAAGGGTGATAAACTCAAATACAGCTTTTAGAACTGGTTACAATAGCGACGGAGAGTTAGGTTATTGGGATAAGTTAGAGCCACTTTATACAATTTTTGATGGTGTATCTAAAACATTCTCTGGTATTTGGACTCCTGTAAAGAATGACTTCAAAGAGTTTTATGATTTATACCTTTCTGATGTAGAGAAATATAATGGTTCGGGGAAATTGTTTCCCAAAACACCTATACCTGAAAATGCTTTTTCTCTTTCTATTATTCCATGGACTTCATTTACTGGGTTTAACTTAAATATCAATAATAATAGTAATTACCTTCTACCCATTATTACAGCAGGAAAATTCATTAATAAAGGTAATTCAATATATTTACCGCTATCTTTACAGGTACATCATTCTGTTTGTGATGGTTATCATGCAGGATTGTTTATGAACTCTATTCAGGAATTGTCAGATAGGCCTAATGACTGGCTTTTATAA +>catpC233_1_AY355285 +ATGACTTTTAATATTATTAATTTAGAAACTTGGGATAGAAAAGAATATTTCAATCATTATTTTAATCAACAAACAACTTATAGTGTTACTAAAGAATTAGATATTACCTTGTTAAAAAGTATGATAAAAGATAAAGGATATGAACTGTATCCTGCTTTGATTCATGCAATTGTAAGTGTTATAAATCGAAATAAAGTATTTAGAACAGGGATTAATAGTGAGGGGAATTTGGGTTATTGGGATAAATTAGAACCTTTATATACAGTCTTTAATAAAGAAACTGAAAAATTTTCTAATATTTGGACAGAATCAAATGCTAGTTTTAACTCTTTTTATAATAGTTATAAGAATGATTTATTTAAATATAAAGATAAAAATGAAATGTTTCCTAAAAAGCCGATACCTGAAAACACAGTTCCTATCTCGATGATTCCTTGGATTGATTTTAGTTCATTTAATTTAAATATTGGTAATAATAGTAGATTTTTATTGCCAATTATTACAATAGGTAAATTTTATAGTAAGGATGATAAGATCTATTTACCATTTTCATTGCAAGTTCATCATGCAGTATGTGATGGTTACCATGTTTCATTATTTATGAATGAATTTCAAAATATAATTGATAATGTAAATGAATGGATTTAA +>catQ_1_M55620 
+ATGAAATTTAATTTGATAGATATTGAGGATTGGAATAGAAAGCCATACTTTGAGCATTATTTAAATGCGGTTAGGTGCACTTACAGTATGACTGCAAATATAGAGATAACTGGTTTACTGCGTGAAATTAAACTTAAGGGCCTGAAACTGTACCCTACGCTTATTTATATCATCACAACTGTGGTTAACCGTCACAAGGAGTTCCGCACCTGTTTTGATCAAAAAGGTAAGTTAGGATACTGGGATAGTATGAACCCAAGTTATACTGTCTTTCATAAGGATAACGAAACTTTTTCAAGTATTTGGACAGAGTATGACGAGAACTTCCCACGTTTTTACTATAATTACCTTGAGGATATTAGAAACTATAGCGACGTTTTGAATTTCATGCCTAAGACAGGTGAACCTGCTAATACAATTAATGTGTCCAGCATTCCTTGGGTGAATTTTACCGGATTCAACCTGAATATATACAATGATGCAACATATCTAATCCCTATTTTTACTTTGGGTAAGTATTTTCAGCAGGATAATAAAATTTTATTACCTATGTCTGTACAGGTGCATCATGCGGTTTGCGACGGTTATCATATAAGCAGATTTTTTAATGAGGCACAGGAATTAGCGTCAAATTATGAGACATGGTTAGGAGAAAAATAA +>msrD_2_AF274302 +ATGGAATTAATATTAAAAGCAAAAGACATTCGTGTGGAATTCAAAGGACGCGATGTTTTAGATATAAATGAATTAGAAGTATATGATTATGACCGTATTGGTTTAGTAGGAGCAAATGGTGCTGGAAAAAGCACTTTACTCAGGGTACTTTTAGGAGAATTAACTCCCCCAGGATGTAAAATGAATCGTCTGGGTGAACTTGCCTATATTCCCCAGTTGGACGAAGTAACTCTGCAGGAGGAAAAAGATTTTGCACTTGTAGGCAAGCTAGGTGTTGAGCAATTAAATATACAGACTATGAGCGGTGGTGAAGAAACAAGGCTTAAAATAGCACAGGCCTTATCGGCACAGGTTCATGGTATTTTAGCGGATGAACCTACGAGCCATTTAGACCGTGAAGGAATTGATTTTCTAATAGGACAGCTAAAATATTTTACAGGTGCACTGTTAGTTATTAGCCATGACCGCTATTTTCTTGATGAAATAGTAGATAAAATATGGGAACTGAAAGATGGCAAAATCACTGAGTATTGGGGAAACTATTCTGATTATCTTCGTCAGAAAGAGGAAGAACGTAAGAGCCAAGCTGCAGAATACGAACAATTTATTGCGGAACGTGCCCGATTGGAAAGGGCTGCGGAGGAAAAGCGAAAACAGGCTCGTAAAATAGAACAGAAGGCAAAAGGTTCTTCAAAGAAAAAAAGTACTGAAGACGGAGGGCGTTTAGCTCATCAAAAATCAATAGGAAGTAAGGAAAAAAAGATGTATAATGCTGCTAAAACCCTAGAGCACAGGATTGCGGCCTTAGGAAAAGTAGAAGCTCCGGAAGGCATTCGCAGAATTCGTTTCAGGCAAAGTAAAGCATTGGAGCTCCATAATCCATACCCTATAGTCGGTGCAGAAATTAATAAAGTATTTGGGGATAAGGCTCTGTTTGAAAATGCATCTTTTCAAATTCCGTTAGGAGCAAAAGTGGCGTTAACTGGTGGTAATGGAATCGGAAAAACAACTTTAATCCAAATGATCTTAAACCATGAAGAAGGAATTTCTATTTCGCCTAAGGCAAAAATAGGTTACTTTGCACAGAATGGTTACAAGTACAACAGTAATCAGAATGTTATGGAGTTTATGCAGAAGGATTGTGACTACAATATATCAGAAATTCGTTCAGTGCTAGCATCTATGGGGTTCAAACAGAACGATATTGGAAAAAGTTTATCTGTTTTAAGCGGTGGAGAAATTATAAAATTGTTGCTTGCTAAAATGCTCATGGGTAGATATAACATCCTAATAATGGATGAACCCAGTAACTTCCTTGA
CATACCAAGTTTAGAGGCTTTGGAAATACTAATGAAGGAGTACACCGGAACTATCGTGTTTATCACCCACGATAAACGATTACTCGAAAATGTAGCAGATGTAGTTTATGAAATTAGAGATAAGAAAATAAATCTGAAACATTAA +>msrD_3_AF227520 +ATGGAATTAATATTAAAAGCAAAAGACATTAGTGTGGAATTCAAAGGACACGATGTTTTAGATATAAATGAATTAGAAGTATATGATTATGACCGTATTGGTTTAGTAGGAGCAAATGGTGCAGGAAAAAGCACTTTATTCAAGGTACTTTTAGGAGAATTAATTCCCCCAGGATGTAAAATGAATCATCTGGGTGAACTTGCCTATATTCCCCAGTTGGACGAAGTAACTCTGCAGGAGGAAAAAGATTTTGCGCTTGTAGGCAAGCTAGGTGTTGAGCAATTAAATATACAGACCATGAGCGGTGGTGAAGAAACAAGGCTTAAAATAGCACAGGCCTTATCGGCACAGGTTCATGGTATTTTAGCGGATGAACCTACGAGCCATTTAGACCGTGAAGGAATTGATTTTCTAATAGGACAGCTAAAATATTTTACAGGTGCACTGTTAGTTATTAGCCATGACCGCTATTTTCTTGATGAAATAGTAGATAAAATATGGGAACTGAAAGATGGCAAAATCACTGAGTATTGGGGAAACTATTCTGATTATCTTCGTCAGAAAGAGGAAGAACGTAAGAGACAAGCTGCAGAATACGAACAATTTATTGCGGAACGTGCTCGATTGGAAAGGGCTGCGGAGGAAAAGCGAAAACAGGCTCGTAAAATAGAACAGAAGGCAAAAGGTTCTTCAAAGAAAAAAAGTACTGAAGGCGGAGGGCGTTTAGCTCATCAAAAATCAATAGGAAGTAAGGAAAAAAAGATGCATAATGCCGCTAAATCCCTAGAGAACAGGATTGCGGCATTAGGAAAAGTAGAAGCTCCGGAAGGCATTCGCAGAATTCGTTTCAGGCAAAGTAAAGCATTGGAGCTCCATAATCCATACCCTATAGTCGGTGCGGAAATTAATAAAGTATTTGGGGATAAGGCACTGTTTGAAAATGCATCTTTTCAAATTCCGCTAGGAGCAAAAGTGGCATTAACGGGTGGTAATGGAACCGGAAAAACAACTTTAATCCAAATGATCTTAAACCATGAAGAAGGAATTTCTATTTCACCTAAGGCAAAAATAGGTTACTTTGCACAGAATGGTTACAAGTACAACAGTAATCAGAATGTTATGGAGTTTATGCAGAAGGATTGTGATTACAATATATCAGAAATTCGTTCTGTGCTAGCATCTATGGGGTTCAAACAGAACGATATTGGAAAAAGCTTATCTGTTTTAAGCGGTGGAGAAATTATAAAATTGTTGCTTGCTAAAATGCTCATGGGTAGATATAACATCCTAATAATGGATGAACCCAGTAACTTCCTTGACATACCAAGTTTAGAGGCTTTGGAAATACTAATGAAGGAGTACACCGGAACTATCGTGTTTATCACCCACGATAAACGATTACTCGAAAATGTAGCTGATGTAGTTTATGAAATTAGAGATAAGAAAATTAAGCTGAAACATTAA +>mefA_10_AF376746 
+ATGGAAAAATACAACAATTGGAAACGAAAATTTTATGCAATATGGGCAGGGCAAGCAGTATCATTAATCACTAGTGCCATCCTGCAAATGGCGATTATTTTTTACCTTACAGAAAAAACAGGATCTGCGATGGTCTTGTCTATGGCTTCATTAGTAGGTTTTTTACCCTATGCGATTTTGGGACCTGCCATTGGTGTGCTAGTGGATCGTCATGATAGGAAGAAGATAATGATTGGTGCCGATTTAATTATCGCAGCAGCTGGTGCAGTGCTTGCTATTGTTGCATTCTGTATGGAGCTACCTGTCTGGATGATTATGATAGTATTGTTTATCCGTAGCATTGGAACAGCTTTTCATACCCCAGCACTCAATGCGGTTACACCACTTTTAGTACCAGAAGAACAGCTAACGAAATGCGCAGGCTATAGTCAGTCTTTGCAGTCTATAAGCTATATTGTTAGTCCGGCAGTTGCAGCACTCTTATACTCCGTTTGGGATTTAAATGCTATTATTGCCATCGACGTATTGGGTGCTGTGATTGCATCTATTACGGTAGCAATTGTACGTATACCTAAGCTGGGTAATCAAGTGCAAAGTTTAGAACCAAATTTCATAAGGGAGATGAAAGAAGGAGTTGTGGTTCTGAGACAAAACAAAGGATTGTTTGCCTTATTACTCTTAGGAACACTATATACTTTTGTTTATATGCCAATCAATGCACTATTTCCTTTAATAAGCATGGAACACTTTAATGGAACGCCTGTGCATATTTCTATTACGGAAATTTCCTTTGCATTTGGGATGCTAGCAGGAGGCTTATTATTAGGAAGATTAGGGGGCTTCGAAAAGCATGTATTACTAATAACAAGTTCATTTTTTATAATGGGGACCAGTTTAGCCGTTTCGGGAATACTTCCTCCAAATGGATTTGTAATATTCGTAGTTTGCTGTGCAATAATGGGGCTTTCGGTGCCATTTTATAGCGGTGTGCAAACAGCTCTTTTTCAGGAGAAAATTAAGCCTGAATATTTAGGACGTGTATTTTCTTTGATCGGAAGTATCATGTCACTTGCTATGCCAATTGGGTTAATTCTTTCTGGATTCTTTGCTGATAAAATCGGTGTAAATCATTGGTTTTTACTATCAGGTATTTTAATTATTGGCATTGCTATAGTTTGCCAAATGATAACTGAGGTTAGAAAATTAGATTTAAAATAA +>mefE_AE007317 
+TTGAAAATAGATAAAAAAAACGAGGCTTTCCTTATTGTAAGTAGAGGCATATCTCGAATTGGAGATATTATGTTTGACTTTGCGAATAATACCTTTCTTGCAGGATTAAATCCAACATCTTTATCATTGGTTGCAGTATATCAGTCACTAGAAAGTGTGATAGGTGTTCTTTTTAATTTATTTGGTGGAGTCATTGCAGATAGTTTCAAGCGGAAAAAAATTATTATTGTTGCAAATATCTTATGTGGTATTGCTTGTATAATTCTTTCATTCATATCACAAGAGCAGTGGATGGTCTTTGCAATTGTCATCACTAATATTATCTTGGCATTTATGAGTGCTTTTTCTGGACCGTCCTATAAAGCATTTACAAAAGAAATTGTAAAAAAGGATAGTATATCACAACTTAATTCATTGCTAGAGATAACAAGTACTATAATTAAAGTAACAATACCAATGGTAGCAATTTTATTATATAAGCTACTTGGGATACATGGTGTTTTACTATTGGATGGATTCTCATTTCTAATTGCTGCATCACTGATTTCCTTTATTGTACCCGTTAATGACGAAGTGGTCACAAAGGATAAAATGACAATAGGAGGAGTTTTAAATGACTTAAAAATAGGGTTTAAGTATATTTATAGTCATAAGACAATATTTATGATTATTATTCTCTCTGCTTTTGTTAATTTTTTTCTAGCAGCTTATAATTTATTGTTACCTTATAGTAATCAAATGTTTGGAGAAATTTCAGATGGGCTTTATGGTGTTTTTCTAACTGCGGAAGCAATTGGAGGATTTATTGGAGCGATATTAAGTGGTGTTATAAATAAAACCTTGTCAAGCAAACGTTTAATGGTCTTCTTATCATGTTCAGGATTGATGTTAATGCTATCAACGCCACTCTATTTTTTGTTTCAAAACTTCATTATTCTAGCCTTTTCTCCGGCATTATTTAGTCTATTTATTTCTATTTTTAATATTCAATTTTTCTCTATTGTTCAAAGAGAAGTTGATACTGAGTTTCTCGGTAGAGTCTTTGGAATCATCTTTACGGTAGCTATTCTTTTTATGCCAGTTGGGTCTGGATTTTTCTCAGTAGTTTTAAATCCTAACAATACTTTTAATCTTTTTATTATTGGTGTATCTATTACGATATTATCGCTAATATTCAGCACGCTATTGAAGAGGTATGATAAAAATAGCTGA +>tetM_1_X92947 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATGTGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAATTGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTTATTACTAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGAATATACAAAAAAAAGACAACGTCTTGCATATATACGCCTTTATAGTGGAGTACTACATTTACGAGATTCGGTTAGAGTATCAGAAAAAGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAGATTGATAGAGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAACTATTGCCACAGAGAAAAAAGATTGAAAATCCGCACCCTCTACTACAAACAACTGTTGAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGCTTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTATATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGTTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetM_12_FR671418 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATATGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAATTGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTGATTACGAATAAATTTTATTCATCAACACATCGAGGTCAGTCTGAACTTTGCGGAAAAGTTTTCAAAATTGAGTATTCGGAAAAAAGACAGCGTCTTGCATATATACGTCTTTATAGTGGCGTACTGCATTTGCGAGATTCGGTTAGAATATCGGAAAAGGAAAAAATAAAAATTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAAATCGATAAGGCTTATTCCGGGGAAATTGTTATTTTGCAGAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAGCTATTGCCACAGAGAGAGAGAATTGAAAATCCCCTCCCTCTGCTGCAAACGACTGTTGAACCGAGCAAACCTCAACAAAGGGAAATGTTACTTGATGCACTTTTAGAAATCTCCGACAGTGACCCGCTTCTGCGATATTATGTGGATTCTGCGACACATGAAATCATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGACTTGTGCTCTGCTGCAAGAAAAGTATCATGTGGAGATAGAAATAAAAGAGCCTACAGTCATTTATATGGAAAGACCGTTAAAAAAAGCAGAGTATACCATTCACATCGAAGTTCCACCGAATCCTTTCTGGGCTTCCATTGGTCTATCTGTAGCACAGCTTCCATTAGGGAGCGGAGTACAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCGTTTCAAAATGCAGTTATGGAGGGGATACGCTATGGCTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetK_4_U38428 
+TTGTTTAGTTTATATAAAAAATTTAAAGGTTTGTTTTATAGCGTTTTATTTTGGCTTTGTATTCTTTCATTTTTTAGTGTATTAAATGAAATGGTTTTAAATGTTTCTTTACCTGATATTGCAAATCATTTTAATACTACTCCTGGAATTACAAACTGGGTAAACACTGCATATATGTTAACTTTTTCGATAGGAACAGCAGTATATGGAAAATTATCTGATTATATAAATATAAAAAAATTGTTAATTATTGGTATTAGTTTGAGCTGTCTTGGTTCATTGATTGCTTTTATTGGTCACAATCACTTTTTTATTTTGATTTTTGGTAGGTTAGTACAAGGAGTAGGATCTGCTGCATTCCCTTCACTGATTATGGTGGTTGTAGCTAGAAATATTACAAGAAAAAAACAAGGCAAAGCCTTTGGTTTTATAGGATCAATTGTAGCTTTAGGTGAAGGGTTAGGTCCTTCAATAGGGGGAATAATAGCACATTATATTCATTGGTCTTACCTACTTATACTTCCTATGATTACAATAGTAACTATACCTTTTCTTATTAAAGTAATGGTACCTGGTAAATCAACAAAAAATACATTAGATATCGTAGGTATTGTTTTAATGTCTATAAGTATTATATGTTTTATGTTATTTACGACAAATTATAATTGGACTTTTTTAATACTCTTCACAATCTTTTTTGTGATTTTTATTAAACATATTTCAAGAGTTTCTAACCCTTTTATTAATCCTAAACTAGGGAAAAACATTCCGTTTATGCTTGGTTTGTTTTCTGGTGGGCTAATATTTTCTATAGTAGCTGGTTTTATATCAATGGTGCCTTATATGATGAAAACTATTTATCATGTAAATGTAGCGACAATAGGTAATAGTGTTATTTTTCCTGGAACCATGAGTGTTATTGTTTTTGGTTATTTTGGTGGTTTTTTAGTGGATAGAAAAGGATCATTATTTGTTTTTATTTTAGGATCATTGTCTATCTCTATAAGTTTTTTAACTATTGCATTTTTTGTTGAGTTTAGTATGTGGTTGACTACTTTTATGTTTATATTTGTTATGGGCGGATTATCTTTTACTAAAACAGTTATATCAAAAATAGTATCAAGTAGTCTTTCTGAAGAAGAAGTTGCTTCTGGAATGAGTTTGCTAAATTTCACAAGTTTTTTATCAGAGGGAACAGGTATAGCAATTGTAGGAGGTTTATTGTCACTACAATTGATTAATCGTAAACTAGTTCTGGAATTTATAAATTATTCTTCTGGAGTGTATAGTAATATTCTTGTAGCCATGGCTATCCTTATTATTTTATGTTGTCTTTTGACGATTATTGTATTTAAACGTTCTGAAAAGCAGTTTGAATAG +>tetM_13_AM990992 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATATGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAATTGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTTATTACTAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGAATATACAAAAAAAAGACAACGTCTTGCATATATACGCCTTTATAGTGGAGTACTACATTTACGAGATTCGGTTAGAGTATCAGAAAAAGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAGATTGATAGAGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAACTATTGCCACAGAGAAAAAAGATTGAAAATCCGCACCCTCTACTACAAACAACTGTTGAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGCTTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTATATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCGTTTCAAAATGCAGTTATGGAGGGGATACGCTATGGCTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetM_2_X90939 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCGGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATATGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCATAATTGTTCCCTGTTCCCTGTTTATCACGGAAGTGCAAAAAACAATATAGGGATTGATAACCTTATAGAAGTGATTACGAATAAATTTTATTCATCAACACATCGAGGTCAGTCTGAACTTTGCGGAAAAGTTTTCAAAATTGAGTATTCGGAAAAAAGACAGCGTCTTGCATATATACGTCTTTATAGTGGCGTACTGCATTTGCGAGATTCGGTTAGAATATCGGAAAAGGAAAAAATAAAAATTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAAATCGATAAGGCTTATTCCGGGGAAATTGTTATTTTGCAGAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAGCTATTGCCACAGAGAGAGAGAATTGAAAATCCCCTCCCTCTGCTGCAAACGACTGTTGAACCGAGCAAACCTCAACAAAGGGAAATGTTACTTGATGCACTTTTAGAAATCTCCGACAGTGACCCGCTTCTGCGATATTATGTGGATTCTGCGACACATGAAATCATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGACTTGTGCTCTGCTGCAAGAAAAGTATCATGTGGAGATAGAAATAAAAGAGCCTACAGTCATTTATATGGAAAGACCGTTAAAAAAAGCAGAGTATACCATTCACATCGAAGTTCCACCGAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGCTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACACCGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetM_4_X75073 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATATGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCATAATTGTTCCCTGTTCCCTGTTTATCACGGAAGTGCAAAAAACAATATAGGGATTGATAACCTTATAGAAGTGATTACGAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGAATATACAAAAAAAAGACAACGTCTTGCATATATACGCCTTTATAGTGGAGTACTACATTTACGAGATTCGGTTAGAGTATCAGAAAAAGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAGATTGATAGAGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAACTATTGCCACAGAGAAAAAAGATTGAAAATCCGCACCCTCTACTACGAACAACTGTTGAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGCTTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTATATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGTTGCGAACAAGGATTATATGGTTGGAATGTGACGGATTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATACGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetM_5_U58985 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTGAACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAGGAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATATGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGAGAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAATTGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTTATTACTAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGAATATACAAAAAAAAGACAACGTCTTGCATATATACGTCTTTATAGTGGAGTACTACATTTACGAGATTCGGTTAGAATATCGGAAAAGGAAAAAATAAAAATTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAAATCGATAAGGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAACTATTGCCACAGAGAAAAAGAATTGAAAATCCGCACCCTCTACTACAAACAACTGTTGAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGCTTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTATATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTTCCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGTTGCGAACAAGGATTATATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetM_8_X04388 
+ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCGGGAAAAACTACCTTAACAGAAAGCTTATTATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAGAGGTACAACGAAAACGGATAATACGCTTTTAGAACGTCAGAGAGGAATTACAATTCAGACGGCGATAACCTCTTTTCAGTGGAAAAATACTAAGGTGAACATCATAGACACGCCAGGACATATGGATTTTTTAGCAGAAGTATATCGTTCATTATCAGTATTAGATGGGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTGTTTCATGCACTTAGGAAAATAGGTATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTTTATCAGGATATTAAAGAGAAACTTTCTGCGGAAATTGTAATCAAACAGAAGGTAGAACTGCATCCTAATATGCGTGTAATGAACTTTACCGAATCTGAACAATGGGATATGGTAATAGAAGGAAATGATTACCTTTTGGAGAAATATACGTCTGGGAAATTATTGGAAGCATTAGAACTCGAACAAGAGGAAAGCATAAGATTTCATAATTGTTCCCTGTTCCCTGTTTATCACGGAAGTGCAAAAAACAATATAGGGATTGATAACCTTATAGAAGTGATTACGAATAAATTTTATTCATCAACACATCGAGGTCAGTCTGAACTTTGCGGAAAAGTTTTCAAAATTGAGTATTCGGAAAAAAGACAGCGTCTTGCATATATACGTCTTTATAGTGGCGTACTGCATTTGCGAGATCCGGTTAGAATATCGGAAAAGGAAAAAATAAAAATTACAGAAATGTATACTTCAATAAATGGTGAATTATGTAAAATCGATAAGGCTTATTCCGGGGAAATTGTTATTTTGCAGAATGAGTTTTTGAAGTTAAATAGTGTTCTTGGAGATACAAAGCTATTGCCACAGAGAGAGAGAATTGAAAATCCCCTCCCTCTGCTGCAAACGACTGTTGAACCGAGCAAACCTCAACAAAGGGAAATGTTACTTGATGCACTTTTAGAAATCTCCGACAGTGACCCGCTTCTGCGATATTATGTGGATTCTGCGACACATGAAATCATACTTTCTTTCTTAGGGAAAGTACAAATGGAAGTGACTTGTGCTCTGCTGCAAGAAAAGTATCATGTGGAGATAGAAATAAAAGAGCCTACAGTCATTTATATGGAAAGACCGTTAAAAAAAGCAGAGTATACCATTCACATCGAAGTTCCACCGAATCCTTTCTGGGCTTCCATTGGTCTATCTGTAGCACCGCTTCCATTAGGGAGCGGAGTACAGTATGAGAGCTCGGTTTCTCTTGGATACTTAAATCAATCGTTTCAAAATGCAGTTATGGAGGGGATACGCTATGGCTGTGAACAAGGATTGTATGGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAGCAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGAGCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAATATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetS_M 
+ATGGAGGAAATAAAATTGAAAATTATTAATATCGGTATCTTAGCACATGTTGATGCAGGAAAAACTACTTTGACAGAAAGCTTACTATACAGTAGCGGAGCAATTAAAGAGTTAGGAAGTGTAGATAGCGGTACAACGAAAACGGATACTATGTTTTTGGAACGCCAGAGAGGTATTACTATTCAGACCGCAATAACATCTTTTCAACGGGAAAATGTTAAAGTAAATATTGTAGATACTCCTGGACACATGGATTTTTTGGCAGATGTATACCGTTCATTATCTGTTTTGGATGGAGCTATTTTGCTAATCTCTGCAAAAGATGGAGTACAGTCACAAACTCGTATACTATTCCATGCACTTAGAAAGATGAACATACCTATAATATTTTTTATTAACAAAATTGATCAAAATGGAATAAATTTGCCAGATGTTTATCAAGATATTAAGGACAAACTTTCTGACGACATCATAATTAAGCAGACTGTGAATCTAAATTTGAAACCTTATGTAATAGATTATACTGAACCAGAACAATGGGAGACAGTAATTGTGGGAAATGATTATTTATTAGAAAAATATACCATTGGGAAAACATTGAATATTGCAGAACTTGAAAAGGAGGAAAACGAAAGAATTCAAAGTTGCTCCTTATATCCTGTTTATCACGGAAGTGCAAAGAATAATATTGGAATTAAACAACTTATAGAGGTAATTACTAGCAAATTATTTTCACCCACACAACTCAATTCAGATAAACTTTGTGGAAATGTTTTTAAAGTAGAATATTCAGATGATGGTCAACGGCTTGTCTATGTACGTCTTTATAGTGGAACGCTACATTTGCGAGACTCAGTCAATATATCAGAAAAGGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGAGAATTACGCCAGATAGATAAGGCAGAGCCTGGTGAGATTATTATTTTAAAAAATGAGCTTTTAAAACTAAATAACGTACTTGGAGATAAAAAAAGATTACCACATAGAGAAATTCTTGAGAATCCTCTTCCTATGTTACAAACAACAATTGAACCATGTAAATCAGTACAAAGAGAAAAGTTACTAGATGCACTTTTTGAAATATCCGATAGTGATCCCCTTCTACAATATTATGTAGATACAGTAACTCACGAAATTGTGCTATCTTTTTTAGGTGAGGTCCAAATGGAGGTAACTTGTACTCTGATTCAAGAAAAATATCATATTGAGATAGAAACAAGAAAACCAACTGTCATTTATATGGAAAGACCATTAAAAAAATCTGAATTTACCATTGATATCGAAGTACCTCCAAATCCTTTCTGGGCTTCTATTGGTTTATCTGTAACACCACTTCCTTTGGGTAGTGGCATTCAGTATGAGAGCCTGGTTTCTCTAGGTTATTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGTATACGCTATGGGTGTGAACAAGGATTGTACGGTTGGAAATTAACAGACTGTAAGATCTGTTTTAAGTATGGTCTATATTACAGCCCTGTCAGTACGCCAGCAGATTTCCGAATGCTTGCGCCTATTGTACTAGAGCAGGCTTTTAGAAAGAGTGGTACAGAGTTATTAGAGCCATATCTTAGCTTCGAAATTTATGTACCACAAGAATATCTTTCGAGAGCATATAATGATGCTTCCAAATATTGTGCAAATATTTTAAATACTAAGTTAAAAGGTAACGAGGTCATTCTCATTGGTGAAATTCCAGCCCGTTGTATTCAAGAGTATCAAAACAGTTTAACTTTCTTTACAAATGGACGCAGTGTCTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetO_Y07780 
+ATGAAAATAATTAACTTAGGCATTCTGGCTCACGTTGACGCAGGAAAGACAACATTAACGGAGAGTTTAT +TGTATACCAGTGGTGCAATTGCAGAACCAGGGAGCGTAGATAAAGGCACAACAAGGACAGATACAATGAA +TTTGGAGCGTCAAAGGGGAATCACTATCCAGACAGCAGTGACATCTTTTCAGTGGGAGGATGTAAAAGTC +AACATTATAGATACGCCAGGCCATATGGATTTTTTGGCGGAAGTATACCGTTCTTTATCCGTATTAGACG +GAGCAGTATTATTAGTTTCTGCAAAGGATGGCATACAGGCACAGACCCGTATACTGTTTCATGCACTACA +GACAATGAAGATTCCGACAATTTTTTTCATCAATAAAATTGACCAAGAGGGGATTGATTTGCCAATGGTA +TATCAAGAAATGAAAGCAAAGCTTTCTTCGGAAATTATAGTGAAGCAAAAGGTTGGGCAGCATCCCCATA +TAAATGTAACGGACAATGACGATATGGAACAGTGGGATGCGGTAATTATGGGAAACGATGAACTATTAGA +GAAATATATGTCAGGGAAACCGTTTAAAATGTCAGAACTGGAACAGGAAGAAAACAGGAGATTCCAAAAC +GGAACGTTATTTCCCGTTTATCACGGAAGTGCTAAAAACAATCTGGGGATTCGGCAGCTTATAGAAGTGA +TTGCCAGTAAGTTTTATTCATCAACGCCTGAAGGTCAATCTGAACTATGCGGGCAGGTTTTTAAGATTGA +ATATTCAGAGAAAAGGCGGCGTTTTGTTTATGTGCGTATATATAGCGGAACATTGCATTTGAGGGATGTT +ATTAAAATATCTGAAAAAGAGAAAATAAAAATCACAGAGATGTGTGTTCCGACAAACGGTGAATTATATT +CATCCGATACAGCCTGCTCTGGTGATATTGTAATTTTACCAAATGATGTTTTGCAGCTAAACAGTATTTT +GGGGAACGAAATGCTGTTGCCGCAGAGAAAATTTATTGAAAATCCTCTCCCTATGCTCCAAACAACGATT +GCAGTAAAGAAATCTGAACAGCGGGAAATATTGCTTGGGGCACTTACAGAAATTTCAGATGGCGACCCTC +TTTTAAAATATTATGTGGATACTACAACGCATGAGATTATACTTTCTTTTTTGGGGAATGTGCAGATGGA +AGTCATTTGTGCCATCCTTGAGGAAAAATACCATGTGGAGGCAGAAATAAAAGAGCCTACTGTTATATAT +ATGGAAAGACCGCTTAGAAAAGCAGAATATACCATCCACATAGAAGTCCCGCCAAATCCTTTCTGGGCTT +CTGTCGGGTTGTCCATAGAGCCGCTCCCTATTGGAAGCGGAGTGCAGTATGAAAGCAGAGTTTCACTTGG +ATATTTAAACCAATCGTTCCAAAATGCGGTTATGGAGGGGGTTCTTTATGGCTGCGAGCAGGGGCTGTAT +GGATGGAAAGTGACAGACTGTAAAATCTGTTTTGAATATGGATTGTATTATAGTCCTGTAAGTACCCCCG +CAGACTTTCGGCTGCTTTCCCCTATCGTATTGGAGCAGGCTTTAAAAAAAGCAGGGACAGAACTATTAGA +GCCATATCTCCACTTTGAAATTTATGCACCGCAGGAATATCTCTCACGGGCGTATCATGATGCCCCAAGG +TATTGTGCAGATATTGTAAGTACTCAGGTAAAGAATGACGAGGTCATTCTGAAAGGAGAAATCCCTGCCA +GATGTATTCAAGAATACAGGAACGATTTAACTTATTTCACAAATGGGCAGGGAGTCTGCTTGACAGAGTT +AAAAGGATACCAGCCAGCTATTGGTAAATTTATTTGCCAACCCCGCCGCCCGAATAGCCGTATAGATAAG +GTTCGGCATATGTTCCACAAGTTAGCTTAA +>folA_AE007317 
+ATGACTAAGAAAATCGTAGCTATTTGGGCCCAGGATGAAGAGGGTTTGATTGGTAAGGAAAATCGTCTGCCTTGGCATTTGCCAGCAGAATTGCAGCACTTTAAAGAAACAACTCTGAATCATGCTATCTTGATGGGGCGTGTGACCTTTGATGGGATGGGGCGTCGCTTGCTTCCAAAACGGGAAACCCTGATTTTGACGCGTAATCCGGAAGAAAAGATAGATGGGGTTGCTACTTTTCAGGACGTCCAGTCTGTTCTTGACTGGTATCAGGATCAAGAAAAGAATCTCTACATTATCGGTGGGAAGCAAATTTTTCAGGCTTTTGAACCTTACCTTGATGAAGTGATTGTCACTCACATTCATGCTCGGGTGGAAGGAGATACCTATTTCCCTGAAGAGCTTGACTTGTCTCTTTTTGAGACAGTTTCAAGCAAATTTTACGCCAAAGATGAGAAGAATCCTTATGATTTTACCATCCAATACCGCAAGAGAAAGGAAGTCTAA +>gyrA_AE007317 +ATGCAGGATAAAAATTTAGTGAATGTCAATCTGACAAAGGAGATGAAGGCAAGTTTTATCGACTACGCCATGAGTGTTATCGTAGCGCGAGCTCTTCCTGATGTTCGAGATGGCTTAAAACCTGTTCACCGTCGCATTCTCTACGGAATGAATGAATTGGGTGTGACCCCAGACAAACCCCATAAAAAATCTGCTCGTATTACAGGGGATGTCATGGGTAAATACCACCCACACGGGGATTCCTCTATTTATGAAGCCATGGTCCGTATGGCTCAATGGTGGAGCTACCGTTACATGCTTGTAGATGGTCATGGGAATTTTGGTTCCATGGATGGAGATAGTGCTGCCGCTCAACGTTATACCGAGGCACGTATGAGCAAGATTGCTCTGGAAATGCTTCGTGATATCAACAAAAATACAGTTGATTTCGTTGATAACTATGATGCCAATGAACGGGAACCCTTGGTCTTGCCAGCGCGTTTTCCAAACCTTTTGGTTAATGGAGCAACTGGTATCGCGGTTGGGATGGCAACCAATATTCCACCTCATAATCTGGGTGAAACCATTGATGCAGTGAAGTTGGTCATGGATAATCCTGAAGTGACTACCAAGGACTTGATGGAAGTCTTGCCTGGACCAGATTTTCCAACTGGTGCTCTTGTCATGGGGAAATCAGGTATCCATAAGGCTTATGAAACAGGTAAAGGTTCGATTGTCCTACGTTCTCGTACAGAGATTGAAACGACTAAGACTGGTCGTGAGCGTATCGTTGTAACAGAATTTCCTTACATGGTCAATAAAACCAAGGTGCATGAGCATATTGTTCGCTTGGTTCAGGAAAAACGCATTGAGGGTATCACAGCAGTACGTGATGAGTCAAACCGTGAAGGTGTTCGATTTGTTATTGAAGTCAAGCGCGACGCCTCAGCCAATGTTATTCTCAATAACCTCTTCAAAATGACCCAAATGCAAACCAATTTTGGTTTCAATATGCTCGCTATCCAAAATGGTATACCGAAAATTTTGTCTCTTCGTCAGATTTTGGATGCTTATATTGAGCACCAAAAAGAAGTGGTTGTTCGTCGTACACGTTTTGATAAGGAAAAAGCGGAAGCGCGCGCTCATATCTTAGAAGGTCTCTTGATTGCGCTAGACCATATCGACGAAGTGATTCGTATCATCCGTGCTAGTGAAACGGATGCGGAAGCTCAAGCTGAGTTGATGAGCAAGTTTAAGCTTTCTGAACGTCAAAGTCAAGCTATCCTTGATATGCGTCTTCGTCGTTTGACAGGTTTGGAACGCGATAAGATTCAATCTGAGTATGATGACCTCTTGGCTCTGATTGCGGATTTAGCAGATATTCTTGCTAAGCCTGAACGTGTTTCTCAAATTATCAAAGACGAATTGGATGAAGTTAAACGTAAATTTTCTGATAAACGCCGTACAGAGTTGATGGTTGGACAGGTCTTGAGTC
TCGAGGATGAGGACTTGATTGAAGAATCGGATGTCTTGATTACCCTTTCTAACAGAGGCTACATTAAGCGTTTGGATCAGGACGAGTTCACTGCTCAAAAACGTGGGGGTCGTGGTGTCCAAGGAACGGGAGTGAAAGATGATGACTTTGTTCGTGAGTTAGTGTCAACTAGTACCCATGATCATCTGCTCTTCTTCACAAACAAGGGACGTGTCTATCGTCTTAAAGGTTATGAAATTCCTGAGTATGGTCGGACTGCCAAAGGGCTACCAGTAGTCAATCTCTTGAAATTGGATGAAGACGAAAGTATTCAGACGGTTATCAATGTTGAGTCTGATCGCAGTGATGATGCTTATCTCTTCTTTACAACCCGTCACGGTATTGTGAAGAGAACCAGTGTTAAGGAGTTTGCCAATATTCGTCAAAATGGTCTCAAAGCGCTGAATTTAAAGGATGAAGATGAGTTAATCAATGTCTTGTTGGCAGAAGGAGATATGGATATTATCATTGGTACCAAGTTTGGTTATGCAGTTCGCTTTAATCAATCAGCCGTTCGTGGTATGAGCCGTATCGCCACTGGTGTGAAAGGTGTTAACCTTCGTGAAGGAGACACAGTTGTTGGTGCCAGCTTGATTACTGATCAAGATGAGGTTCTTATTATCACAGAAAAAGGATATGGTAAGCGTACAGTCGCTACTGAATACCCAACAAAAGGTCGTGGTGGTAAGGGAATGCAGACAGCTAAAATTACCGAAAAAAATGGCTTGCTGGCCGGTCTTATGACTGTTCAAGGGGATGAGGATTTGATGATTATCACTGATACAGGTGTCATGATTCGAACCAATCTTGCCAATATTTCACAAACAGGACGTGCAACTATGGGAGTTAAAGTAATGCGCCTGGATCAAGATGCTCAGATAGTGACTTTCACAACGGTTGCGGTGGCAGAAAAAGAAGAAGTTGGGACAGAAAACGAAACAGAAGGTGAAGCATAA +>gyrB_AE007317 
+ATGACAGAAGAAATCAAAAATCTGCAGGCACAGGATTATGATGCCAGTCAAATTCAAGTTTTAGAGGGCTTAGAGGCTGTTCGTATGCGTCCAGGGATGTACATTGGATCAACCTCAAAAGAAGGTCTTCACCATCTAGTCTGGGAAATTGTTGATAACTCAATTGACGAGGCCTTGGCAGGATTTGCCAGCCATATTCAAGTTTTTATTGAGCCAGATGATTCGATTACTGTTGTGGATGATGGGCGTGGTATCCCAGTCGATATTCAGGAAAAAACAGGTCGTCCTGCTGTTGAGACCGTCTTTACAGTCCTTCACGCTGGAGGAAAGTTCGGCGGTGGTGGATACAAGGTTTCAGGTGGTCTTCACGGGGTGGGGTCGTCAGTTGTTAATGCCCTTTCCACTCAATTAGACGTTCATGTCCATAAAAACGGTAAGATTCATTACCAAGAATACCGTCGTGGTCATGTTGTCGCAGATCTTGAAATAGTTGGAGATACGGATAAAACAGGAACAACTGTTCACTTCACACCGGACCCAAAAATCTTCACTGAAACAACAATCTTTGATTTTGATAAATTAAATAAACGGATTCAAGAGTTGGCCTTTCTAAATCGCGGTCTTCAAATTTCTATCACTGATAAGCGCCAAGGTTTGGAACAAACCAAGCATTATCATTATGAAGGTGGGATTGCTAGTTACGTTGAATATATCAACGAGAACAAGGATGTAATCTTTGATACACCAATCTATACAGACGGTGAGATGGATGATATCACAGTTGAGGTAGCCATGCAGTACACAACGGGTTACCATGAAAATGTCATGAGTTTCGCCAATAATATTCATACACATGAAGGTGGAACGCATGAACAAGGTTTCCGTACAGCCTTGACACGTGTTATCAACGATTATGCTCGTAAGAATAAGTTACTGAAAGACAATGAAGACAATCTAACAGGGGAAGATGTTCGCGAAGGCTTAACTGCAGTTATCTCAGTTAAACACCCAAATCCACAGTTTGAAGGACAAACGAAGACCAAATTGGGAAATAGCGAAGTGGTCAAGATTACCAATCGCCTCTTCAGTGAAGCCTTCTCCGATTTCCTCATGGAAAATCCACAGATTGCCAAACGTATCGTAGAAAAAGGAATTTTGGCTGCCAAGGCTCGTGTGGCTGCCAAGCGTGCGCGTGAAGTCACACGTAAAAAATCTGGTTTGGAAATTTCCAACCTTCCAGGGAAACTAGCAGACTGTTCTTCTAATAACCCTGCTGAAACAGAACTCTTCATCGTCGAAGGAGACTCAGCTGGTGGATCAGCCAAATCTGGTCGTAACCGTGAGTTTCAGGCTATCCTTCCAATTCGCGGTAAGATTTTGAACGTTGAAAAAGCAAGTATGGATAAGATTCTAGCTAACGAAGAAATTCGTAGTCTTTTCACAGCCATGGGAACAGGATTTGGCGCAGAATTTGATGTTTCGAAAGCCCGTTACCAAAAACTCGTTTTGATGACCGATGCCGATGTCGATGGAGCCCACATTCGTACCCTTCTTTTAACCTTGATTTATCGTTATATGAAACCAATCCTAGAAGCTGGCTATGTTTATATTGCCCAACCACCAATCTATGGTGTCAAGGTTGGAAGCGAGATTAAAGAATATATCCAGCCGGGTGCAGATCAAGAAATCAAACTCCAAGAAGCTTTAGCCCGTTATAGTGAAGGTCGTACCAAACCGACTATTCAGCGTTATAAGGGGCTAGGTGAAATGGACGATCATCAGCTGTGGGAAACAACCATGGATCCCGAACATCGCTTGATGGCTAGAGTTTCTGTAGATGATGCTGCAGAAGCAGATAAAATCTTTGATATGTTGATGGGGGATCGAGTAGAGCCTCGTCGTGAGTTTATCGAAGAAAATGCTGTCTATAGTACACTTGATGTCTAA +>parC_AE007317 
+ATGTCTAACATTCAAAACATGTCCCTGGAGGACATCATGGGAGAGCGCTTTGGTCGCTACTCCAAGTACATTATTCAAGACCGGGCTTTGCCAGATATTCGTGATGGGTTGAAGCCGGTTCAACGCCGTATTCTTTATTCTATGAATAAGGATAGCAATACTTTTGACAAGAGCTACCGTAAGTCGGCCAAGTCAGTCGGGAACATCATGGGGAATTTCCACCCACACGGGGATTCTTCTATCTATGATGCCATGGTTCGTATGTCACAGAACTGGAAAAATCGTGAGATTCTAGTTGAAATGCACGGTAATAACGGTTCTATGGACGGAGATCCTCCTGCGGCTATGCGTTATACTGAGGCACGTTTGTCTGAAATTGCAGGCTACCTTCTTCAGGATATCGAGAAAAAGACAGTTCCTTTTGCATGGAACTTTGACGATACGGAGAAAGAACCAACGGTCTTGCCAGCAGCCTTTCCAAACCTCTTGGTCAATGGTTCGACTGGGATTTCGGCTGGTTATGCCACAGACATTCCTCCCCATAATTTAGCTGAGGTCATAGATGCTGCAGTTTACATGATTGACCACCCAACTGCAAAGATTGATAAACTCATGGAATTCTTACCTGGACCAGACTTCCCTACAGGGGCTATTATTCAGGGTCGTGATGAAATCAAGAAAGCCTATGAGACTGGGAAAGGGCGCGTGGTTGTTCGTTCCAAGACTGAAATTGAAAAGCTAAAAGGTGGTAAGGAACAAATCGTTATTACTGAGATTCCTTATGAAATCAATAAGGCCAATCTAGTCAAGAAAATCGATGATGTTCGTGTTAATAACAAGGTAGCTGGGATTGCTGAGGTTCGTGATGAGTCTGACCGTGATGGTCTTCGTATCGCTATTGAACTTAAGAAAGACGCTAATACTGAGCTTGTTCTCAACTACTTATTTAAGTACACCGACCTACAAATCAACTACAACTTTAATATGGTGGCGATTGACAATTTCACACCTCGTCAGGTTGGGATTGTTCCAATCCTGTCTAGCTACATCGCTCACCGTCGAGAAGTGATTTTGGCGCGTTCACGCTTTGACAAAGAAAAGGCTGAGAAACGTCTCCGTATCGTCGAAGGTTTGATTCGTGTGATTTCGATTTTGGATGAAGTCATTGCTCTTATCCGTGCTTCTGAGAATAAGGCGGACGCCAAGGAAAACCTCAAAGTTAGCTATGATTTTACGGAAGAACAGGCTGAGGCTATCGTAACTTTGCAACTGTACCGTTTGACCAATACCGATGTGGTTGTCTTGCAGGAAGAAGAAGCAGAGCTTCGTGAGAAGATTGCTATGCTGGCGGCTATTATCGGTGATGAAAGGACTATGTACAATCTCATGAAGAAAGAACTTCGTGAGGTCAAGAAGAACTTTGCAACTCCTCGTTTGAGTTCTTTAGAAGACACTGCGAAAGCAATTGAGATTGATACAGCTAGTCTTATCGCTGAGGAAGATACCTACGTCAGCGTGACCAAGGCAGGTTACATCAAGCGTACCAGTCCACGTTCCTTTGCGGCTTCCACCTTGGAAGAAATTGGCAAGCGTGATGATGACCGTTTGATTTTTGTTCAATCTGCCAAGACAACCCAGCACCTCTTGATGTTCACAAGTCTTGGAAATGTCATCTACAGACCAATCCATGAGTTGGCAGATATTCGTTGGAAGGACATCGGAGAGCATCTGAGCCAAACCATCACAAACTTTGAAACGAATGAAGCAATCCTTTATGTGGAAGTACTGGATCAGTTTGACGATGCGACAACCTACTTTGCAGCGACTCGCCTTGGTCAAATCAAACGGGTAGAGCGAAAAGAATTCACTCCATGGCGGACCTATAGATCTAAGTCTGTCAAGTATGCTAAGCTCAAAGACGATACAGATCAGATTGTAGCAGTGGCTCCGATTAAACTAGATGATGTTGTCTTGGTTAGTCAAAATGGTTATGCCCTGC
GTTTCAATATCGAAGAGGTTCCGGTTGTCGGTGCTAAGGCAGCAGGTGTCAAGGCTATGAATTTGAAAGAAGATGATGTCCTCCAATCTGGCTTTATCTGTAATACTTCGTCCTTCTACCTCTTGACCCAGCGTGGAAGCTTGAAACGTGTTTCCATTGAGGAAATTCTAGCAACCAGCCGTGCCAAACGAGGATTACAAGTCTTGCGTGAGTTGAAAAACAAACCGCATCGTGTCTTCTTGGCAGGAGCAGTTGCAGAGCAAGGATTTGTTGGCGATTTCTTCAGTACGGAAGTGGATGTGAACGACCAAACTCTGCTTGTCCAATCCAATAAAGGAACAATCTATGAAAGCCGATTGCAAGACTTGAACTTGTCAGAACGCACTAGCAATGGAAGCTTCATTTCTGACACGATTTCAGATGAAGAAGTTTTTGACGCTTATCTTCAGGAAGTAGTTACTGAAGATAAATAA +>parE_AE007317 +GTGTCAAAAAAGGAAATCAATATTAACAATTATAATGATGATGCTATTCAGGTGCTAGAAGGGTTGGATGCGGTCCGAAAACGTCCAGGGATGTATATTGGATCGACCGATGGCGCTGGTCTTCATCACCTAGTTTGGGAAATCGTTGATAATGCAGTCGATGAAGCCTTGTCTGGGTTTGGTGATCGTATTGATGTAACTATCAATAAAGACGGTAGTCTAACGGTTCAAGACCATGGACGTGGGATGCCGACAGGTATGCACGCTATGGGAATTCCAACTGTTGAGGTTATCTTTACCATTCTTCATGCCGGAGGGAAATTCGGTCAAGGTGGCTATAAGACATCAGGTGGACTTCACGGAGTGGGTTCTTCCGTTGTTAACGCCCTTTCTAGCTGGTTAGAAGTTGAAATTACCCGTGATGGCGCAGTTTACAAGCAACGTTTTGAAAATGGTGGAAAACCTGTCACGACTTTGAAGAAAATCGGTACAGCACTCAAGTCTAAAACAGGCACCAAAGTTACTTTTATGCCTGACGCGACTATCTTTTCTACGACAGATTTCAAGTACAATACCATTTCAGAGCGCCTTAATGAATCAGCCTTTCTCTTGAAAAATGTGACCTTGTCTTTAACGGACAAGCGAACAGATGAAGCGATTGAGTTCCACTATGAGAATGGAGTACAAGATTTTGTTTCTTATCTCAACGAAGATAAGGAAATCTTGACGCCAGTTCTTTACTTTGAAGGGGAAGACAATGGTTTTCAAGTGGAAGTAGCCCTCCAGTACAATGACGGATTCTCAGATAACATTCTATCCTTTGTCAATAACGTTCGCACCAAGGACGGTGGAACGCACGAGACAGGACTCAAGTCTGCCATTACCAAGGTCATGAATGACTATGCACGTAAAACAGGTCTTCTCAAGGAAAAAGATAAAAACCTTGAAGGTTCAGACTATCGTGAGGGACTAGCGGCCGTTCTTTCTATCTTAGTTCCTGAAGAACACTTGCAGTTTGAAGGACAGACCAAGGATAAACTAGGAAGCCCCCTAGCTCGCCCAGTTGTGGATGGAATAGTGGCTGATAAGTTGACCTTTTTCCTTATGGAAAATGGGGAATTAGCTTCTAACCTCATCCGCAAGGCTATCAAGGCCCGTGATGCTCGTGAAGCAGCACGTAAGGCGCGTGATGAGAGCCGAAATGGGAAGAAAAACAAGAAAGATAAGGGCTTGTTGTCTGGGAAATTGACCCCAGCCCAATCTAAGAATCCTGCTAAGAATGAACTCTATCTAGTTGAGGGGGACTCTGCCGGTGGTTCTGCCAAACAAGGTCGTGACCGCAAGTTCCAGGCTATTCTACCTCTTCGTGGTAAGGTTATCAATACAGCCAAGGCCAAGATGGCGGATATCCTCAAAAATGAAGAGATCAATACCATGATTTATACCATTGGTGCGGGTGTTGGAGCAGACTTCTCTATTGAAGATGCCAACTATGATAAGATCATTATC
ATGACCGATGCGGATACCGACGGTGCCCATATCCAGACCTTGCTCTTGACATTTTTCTACCGTTACATGCGTCCGCTAGTCGAGGCAGGTCATGTCTATATTGCCCTCCCACCTCTTTACAAGATGTCCAAAGGTAAAGGCAAGAAAGAAGAAGTGGCCTACGCTTGGACGGACGGAGAACTAGAAGAACTCCGTAAACAGTTCGGTAAAGGCGCTACCCTCCAACGATACAAAGGACTTGGTGAGATGAATGCGGACCAGCTCTGGGAAACAACCATGAACCCAGAAACACGTACCCTCATCCGTGTCACAATTGAAGATTTAGCGCGCGCCGAACGCCGCGTCAATGTTCTCATGGGAGATAAGGTAGAACCACGCCGTAAATGGATTGAAGATAATGTCAAGTTTACGCTAGAAGAAGCGACAGTGTTTTAA +>folP_AE007317 +ATGTCAAGTAAAGCCAATCATGCAAAGACAGTTATTTGCGGAATTATCAATGTAACCCCAGACTCCTTTT +CGGACGGTGGTCAATTTTTTGCTCTTGAGCAGGCACTCCAGCAGGCTCGTAAATTGATAGCAGAAGGAGC +CAGTATGCTCGATATCGGCGGAGAATCGACTCGGCCGGGCAGTAGCTATGTTGAGATAGAAGAGGAAATC +CAGCGTGTTGTTCCAGTGATCAAAGCGATTCGCAAGGAAAGTGATGTCCTCATCTCTATTGATACTTGGA +AGAGCCAAGTAGCAGAGGCTGCTTTGGCTGCTGGTGCCGATCTAGTCAATGATATCACTGGTCTTATGGG +TGATGAGAAAATGCCTCATGTGGTAGCTGAAGCGAGAGCGCAAGTGGTCATCATGTTTAATCCAGTTATG +GCGCGACCTCAGCACCCTAGCTCGCTCATCTTCCCTCATTTTGGTTTTGGTCAAGCTTTTACAGAGGAAG +AGTTAGCTGACTTTGAAACATTGCCAATCGAAGAATTGATGGAGGCTTTCTTTGAACGAGCACTAGCGAG +AGCGGCAGAAGCTGGTATTGCACCAGAAAATATCCTGTTGGATCCAGGAATTGGCTTTGGTCTGACCAAG +AAAGAAAATCTGCTTCTTTTACGGGACCTGGATAAACTACATCAGAAGGGCTATCCAATCTTTCTCGGAG +TGTCGCGCAAGCGATTTGTCATCAATATCCTAGAGGAGAATGGTTTTGAAGTCAATCCTGAGACAGAGCT +TGGTTTCCGCAATCGGGACACGGCTTCGGCTCATGTAACCAGTATCGCTGCAAGACAGGGTGTAGAAGTG +GTGCGCGTGCATGACGTAGCTAGTCACAGGATGGCAGTTGAAATTGCCTCTGCCATTCGTCTGGCTGATG +AAGCGGAAAATTTAGATTTAAAACAATATAAATAA +>ermBups_HG799494 +ATGCGTAATGTAGATAAAACATCTACTGTTTTGAAACAGACTAAAAACAGTGATTACGCA +GATAAATAA +>ermbTr_CP002121 +GCTTTTGATAGTCAAGCGAAATATAGCTACCTTATTGTAGAGAGGGGATTTGCTAAAAGG +TTGCAAAA +>rplD_AE007317 +ATGGCAAACGTAACATTATTTGACCAAACTGGTAAAGAAGCTGGCCAAGTTGTTCTTAGCGATGCAGTAT +TTGGTATCGAACCAAATGAATCAGTTGTGTTTGATGTAATCATCAGCCAACGCGCAAGCCTTCGTCAAGG +AACACACGCTGTTAAAAACCGCTCTGCAGTATCAGGTGGTGGACGCAAACCATGGCGTCAAAAAGGAACT +GGACGTGCTCGTCAAGGTTCTATCCGCTCACCACAATGGCGTGGTGGTGGTGTTGTCTTCGGACCAACTC +CACGTTCATACGGCTACAAACTTCCACAAAAAGTTCGTCGCCTAGCTCTTAAATCAGTTTACTCTGAAAA 
+AGTTGCTGAAAACAAATTCGTAGCTGTAGACGCTCTTTCATTTACAGCTCCAAAAACTGCTGAATTTGCA +AAAGTTCTTGCAGCATTGAGCATCGATTCTAAAGTTCTTGTTATCCTTGAAGAAGGAAATGAATTCGCAG +CTCTTTCAGCTCGTAACCTTCCAAACGTGAAAGTTGCAACTGCTACAACTGCAAGTGTTCTTGACATCGC +AAATAGCGACAAACTTCTTGTCACACAAGCAGCTATCTCTAAAATCGAGGAGGTTCTTGCATAA +>rpoB_AE007317 +TTGACAAGGCTTGGAACTTATTTACAAAGGAGAATCATCTTGGCAGGACATGACGTTCAATACGGGAAAC +ATCGTACCCGTCGTAGTTTTTCAAGAATCAAAGAAGTTCTTGACTTACCAAATTTGATTGAAATTCAAAC +TGACTCATTCAAAGCTTTCCTAGACCACGGTCTTAAGGAAGTGTTTGAAGATGTATTGCCAATTTCAAAC +TTCACAGACACAATGGAGTTGGAATTTGTTGGATATGAAATCAAGGAACCAAAATACACGCTAGAAGAAG +CTCGTATCCACGATGCTAGCTACTCAGCACCAATTTTTGTAACCTTCCGTTTGATCAATAAAGAAACAGG +CGAAATCAAGACCCAAGAAGTTTTCTTTGGTGATTTCCCAATCATGACAGAAATGGGTACTTTCATCATC +AATGGTGGTGAACGTATTATCGTTTCTCAGTTGGTCCGCTCACCAGGTGTTTACTTTAACGACAAAGTAG +ACAAAAATGGTAAGGTGGGCTATGGTTCAACTGTTATCCCTAACCGTGGAGCTTGGTTGGAACTTGAAAG +CGACTCAAAAGATATCACCTACACTCGTATCGACCGTACTCGTAAGATTCCATTTACAACCTTGGTTCGT +GCTCTTGGTTTCTCAGGTGATGATGAAATCTTTGATATCTTTGGTGACAGCGAATTGGTTCGCAACACTG +TTGAAAAAGATATCCACAAGAATCCAATGGACTCTCGTACAGACGAAGCCTTGAAAGAAATTTACGAACG +CCTTCGTCCAGGTGAGCCTAAGACAGCTGAAAGCTCACGTAGCTTGCTTGTGGCTCGCTTCTTTGACCCA +CGTCGCTATGACTTGGCAGCAGTTGGTCGTTACAAAATCAATAAAAAACTCAATGTTAAAACACGTTTGC +TCAACCAAACCATTGCAGAGCCATTGGTAGACCCTGAAACTGGAGAAATCTTGGTAGAAGCTGGTACGAT +TATGACTCGTAGCGTGATTGAAAGCATTGAAAGCCATTTGGATGGCGACTTGAACAAGATTGTCTACATC +CCAAACGATGCAGCCGTTGTGACTGAGCCTGTTGTTCTTCAAAAATTCAAGGTTGTTGCTCCAACTGATC +CAGATCGCGTCGTAACGATCATTGGTAATGCTAACCCAGATGACAAGGTTCGTACGGTGACTCCTGCAGA +TATCCTTGCTGAGATGAGCTACTTCCTCAACTTGGCTGAAGGACTTGGCCGTGTAGATGATATCGACCAC +CTTGGAAATCGTCGTATCCGTGCGGTTGGTGAATTGCTTGCCAACCAAGTACGTTTGGGACTTTCTCGTA +TGGAACGTAATGTCCGTGAACGTATGTCTGTTCAGGACAATGAAGTCTTGACACCACAACAAATTATCAA +TATCCGTCCTGTAACAGCTGCAGTTAAAGAATTCTTTGGTTCATCACAGTTGTCACAGTTCATGGACCAA +CACAACCCGCTTTCTGAGTTGTCTCACAAACGCCGTTTGTCAGCCTTAGGACCTGGTGGTTTGACTCGTG +ACCGTGCCGGATATGAAGTGCGTGACGTGCACTACACTCACTATGGTCGTATGTGTCCAATCGAAACACC 
+TGAAGGACCTAACATCGGTTTGATCAATAACTTGTCATCTTACGGACACTTGAACAAATATGGTTTTGTT +CAAACACCATACCGTAAGGTTGACCGTGAAACAGGTGTTGTCACGAACGAAATTGTTTGGTTGACAGCTG +ATGAAGAAGATGAATATACTGTAGCTCAGGCTAACTCTCGTCTGAATGAAGATGGAACCTTTGCTGAGAA +GATTGTCATGGGACGTCACCAAGGGGTCAACCAAGAGTATCCAGCTAATATTGTTGACTACATGGACGTT +TCACCAAAACAGGTAGTTGCCGTTGCGACAGCATGTATTCCTTTCTTGGAAAACGATGACTCCAACCGTG +CCCTCATGGGAGCCAATATGCAACGTCAGGCTGTGCCATTGATTAATCCTCAGGCACCTTACGTTGGTAC +TGGTATGGAATACCAAGCAGCCCACGATTCTGGTGCGGCTGTGATTGCTCAGTATGATGGTAAAGTTACT +TACGCAGATGCTGACAAGGTAGAAGTTCGTCGTGAAGATGGTTCATTGGATGTTTACCACATCCAAAAAT +TCCGTCGTTCAAACTCAGGTACTGCTTACAACCAACGCACTCTCGTAAAAGTTGGTGATGTCGTTGAAAA +AGGCGATTTCATCGCTGACGGACCTTCTATGGAAAATGGAGAAATGGCGCTTGGACAAAACCCAATCGTT +GCCTACATGACTTGGGAAGGTTACAACTTCGAGGATGCCGTTATCATGAGCGAACGCTTGGTGAAGGACG +ATGTCTACACATCTGTTCACCTTGAAGAATACGAATCAGAAACGCGCGATACAAAGCTTGGGCCTGAAGA +AATCACTCGCGAAATTCCAAACGTTGGTGAAGATGCCCTCAAAGACCTTGACGAAATGGGGATTATCCGT +ATTGGTGCTGAGGTTAAAGAAGGTGATATTCTTGTAGGTAAAGTAACACCTAAGGGTGAGAAAGATCTTT +CAGCTGAAGAACGTCTCTTGCACGCTATCTTTGGAGACAAGTCTCGTGAAGTGCGTGATACTTCTCTTCG +TGTACCACACGGTGCCGATGGTGTCGTTCGTGATGTTAAGATCTTTACACGTGTAAATGGAGATGAGTTG +CAATCAGGTGTTAACATGTTGGTTCGTGTTTACATCGCTCAAAAACGTAAGATTAAGGTCGGAGATAAAA +TGGCCGGACGTCACGGAAACAAAGGGGTTGTCTCTCGTATCGTTCCTGTAGAAGACATGCCTTACCTTCC +AGACGGAACTCCAGTCGACATCATGTTGAACCCACTTGGGGTGCCATCACGTATGAATATCGGTCAGGTT +ATGGAGCTTCACCTTGGTATGGCAGCTCGTACTCTTGGTATTCACATTGCGACACCAGTCTTTGATGGAG +CAAGTTCTGAAGATCTTTGGTCAACTGTTAAAGAAGCAGGTATGGATAGCGATGCCAAGACAATCCTTTA +CGATGGACGTACAGGTGAACCATTTGATAACCGTGTTTCTGTTGGAGTCATGTACATGATCAAACTCCAC +CACATGGTTGACGATAAATTGCACGCGCGTTCAGTCGGACCTTACTCAACTGTTACCCAACAACCACTCG +GAGGTAAAGCTCAGTTTGGTGGACAACGTTTCGGTGAGATGGAGGTTTGGGCTCTTGAAGCCTACGGTGC +GTCAAATGTCCTTCAAGAAATCTTGACTTACAAGTCGGACGATATCAACGGACGTTTGAAAGCCTATGAA +GCTATTACAAAAGGCAAACCAATTCCAAAACCAGGTGTTCCAGAATCCTTCCGAGTTCTTGTCAAAGAAT +TGCAATCTCTTGGTCTTGACATGCGTGTCCTAGACGAAGATGACCAAGAAGTGGAACTTCGCGACTTGGA 
+TGAAGGAATGGACGAAGATGTCATCCACGTAGATGACCTTGAAAAAGCCCGCGAAAAAGCAGCCCAAGAG +GCTAAAGCAGCCTTTGAAGCTGAAGAAGCTGAGAAAGCAACAAAAGCGGAAGCAACAGAAGAAGCTGCTG +AACAAGAATAA +>vanB_KC489787 +ATGAATAAAATAAAAGTCGCAATTATCTTCGGCGGTTGCTCGGAGGAACATGATGTATCGGTAAAATCCG +CAATAGAAATTGCTGCGAACATTAATACTGAAAAATTCGATCCGCACTACATCGGAATTACAAAAAACGG +CGTATGGAAGCTATGCAAGAAGCCATGTACGGAATGGGAAGCCGATAGTCTCCCCGCCATATTCTCCCCG +GATAGGAAAACGCATGGTCTGCTTGTCATGAAAGAAAGAGAATACGAAACTCGGCGTATTGACGTGGCTT +TCCCGGTTTTGCATGGCAAATGCGGGGAGGATGGTGCGATACAGGGTCTGTTTGAATTGTCTGGTATCCC +CTATGTAGGCTGCGATATTCAAAGCTCCGCAGCTTGCATGGACAAATCACTGGCCTACATTCTTACAAAA +AATGCGGGCATCGCCGTCCCCGAATTTCAAGTGATTGAAAAAGGTGGCAAACCGGAGGCGAGGACGCTTA +CCTACCCTGTCTTTGTGAAGCCGGCACGGTCAGGTTCGTCCTTTGGCGTAACCAAAGTAAACAGTACGGA +AGAACTAAACGCTGCGATAGAAGCAGCAGGACAATATGATGGAAAAATCTTAATTGAGCAAGCGATTTCG +GGCTGTGAGGTCGGCTGCGCGGTCATGGGAAACGAGGATGATTTGATTGTCGGCGAAGTGGATCAAATCC +GGTTGAGCCACGGTATCTTCCGCATCCATCAGGAAAACGAGCCGGAAAAAGGCTCAGAGAATGCGATGAT +TATCGTTCCAGCAGACATTCCGGTCGAGGAACGAAATCGGGTGCAAGAAACGGCAAAGAAAGTATATCGG +GTGCTTGGATGCAGAGGGCTTGCTCGTGTTGATCTTTTTTTGCAGGAGGATGGCGGCATCGTTCTAAACG +AGGTCAATACCCTGCCCGGTTTTACATCGTACAGCCGCTATCCACGCATGGCGGCTGCCACAGGAATCAC +GCTTCCCGCACTAATTGACAGCCTGATTACATTGGCGATAGAGAGGTGA +>vanD_EU999036 +ATGTTTAAGATTAAAGTTGCAGTTCTGTTTGGGGGCTGTTCAGAGGAACATAATGTTTCGATAAAATCTG +CGATGGAGATTGCCGCAAACATAGATACAAAAAAATATCAGCCTTATTATATTGGAATCACAAAATCCGG +CGTTTGGAAGATGTGTGAAAAACCTTGTTTGGGGTGGGAACAATATGCGGGGGATCCGGTTGTTTTTTCG +CCGGACAGAAGTACGCATGGTCTGCTGATACAAAAAGACACTGGGTATGAAATCCAGCCTGTAGATGTGG +TATTTCCGATGATTCATGGCAAGTTTGGCGAAGATGGATCCATACAAGGCTTGCTTGAATTGTCAGGCAT +TCCGTATGTGGGATGCGATATTCAAAGCTCCGTGATCTGCATGGATAAGGCGCTTGCATATACCGTTGTG +AAAAATGCGGGTATCGCTGTGCCTGGGTTCCGGATCCTTCAGGAGGGGGATCGCCTGGAAACGGAGGATT +TAGTATATCCCGTCTTTGTAAAGCCTGCCCGTTCTGGCTCATCCTTTGGCGTAAACAAGGTATGCAAGGC +AGAAGAACTGCAGGCAGCAATCAGAGAAGCAAGAAAATATGATAGCAAGATTTTGATTGAAGAGGCCGTT +ACCGGGAGTGAGGTAGGCTGCGCCATACTGGGAAACGAAAATGATCTCATGGCTGGCGAGGTGGATCAGA 
+TTGAGCTGAGACACGGCTTTTTTAAGATTCATCAGGAAGCACAGCCGGAGAAGGGATCTGAAAATGCAGT +TATCAGAGTTCCAGCCGCCTTACCGGATGAGGTAAGAGAACGGATTCGGAAAACAGCAATGAAGATTTAC +CGGATACTTGGCTGCCGAGGATTGGCCCGTATTGATCTGTTTTTGCGGGAGGACGGCTGCATTGTGCTGA +ATGAAGTGAATACCATGCCGGGTTTTACTTCCTACAGCCGTTATCCCCGCATGATGACAGCAGCCGGTTT +TACGCTTTCTGAAATACTGGATCGCTTGATTGAATTTTCACTTAGGAGGTAA +>vanE_FJ872411 +ATGAAGACAGTTGCGATTATCTTTGGCGGAGTTTCTTCTGAATATGAAGTTTCACTGAAA +TCTGCTGTAGCGATTATTAAAAATATGGAATCTATTGATTATAACGTAATGAAAATAGGG +ATCACCGAAGAAGGTCATTGGTATCTATTTGAAGGAACGACAGACAAAATAAAGAAAGAT +CGTTGGTTTTTAGATGAAAGCTGTGAAGAAATCGTAGTTGATTTCGCAAAAAAAAGCTTT +GTATTGAAAAACAGTAAAAAAATAATCAAGCCTGATATTTTATTCCCAGTTTTACATGGA +GGTTATGGTGAGAATGGTGCTATGCAGGGAGTATTTGAGTTATTAGATATTCCATATGTA +GGTTGTGGTATCGGAGCTGCAGCAATCTCTATGAATAAAATAATGCTCCATCAATTTGCT +GAAGCAATTGGTGTAAAAAGCACCCCTAGTATGATTATAGAAAAGGGACAAGACCTACAA +AAAGTCGATGCGTTTGCGAAAATACATGGATTTCCTTTATATATTAAACCGAATGAGGCA +GGCTCATCAAAAGGAATTAGCAAGGTAGAACGAAAAAGTGATTTATATAAAGCAATAGAC +GAAGCTTCAAAATATGATAGTCGTATTTTAATTCAAAAGGAAGTGAAAGGGGTAGAAATT +GGTTGTGGTATTTTAGGAAATGAACAATTGGTCGTTGGAGAATGTGACCAAATCAGTCTT +GTGGATGGCTTTTTCGATTATGAAGAGAAATACAATTTAGTAACAGCAGAAATTTTGTTA +CCAGCTAAACTATCAATAGACAAAAAAGAAGATATTCAGATGAAAGCAAAAAAACTATAC +AGACTATTAGGATGCAAAGGATTAGCGAGAATCGACTTTTTCTTAACTGATGACGGAGAA +ATTTTATTAAATGAAATCAATACAATGCCTGGTTTTACAGAGCATTCGAGATTTCCAATG +ATGATGAATGAGATTGGGATGGACTACAAAGAGATTATAGAAAACCTATTAGTATTGGCG +GTGGAAAATCATGAAAAAAAATTATCTACGATTGATTAA +>vanG_KF704242 +ATGCAGAAGAAAAAAATAGCTATTATTTTTGGCGGCAATTCAACAGAGTATGAGGTGTCATTACAATCGG +CATTTTCTGTTTTTGAAAATATCAATAAAGAAAAATTCGACATAGTTCCAATCGGAATTACCAGAAATGG +CGACTGGTATCATTACACAGGCAAAAAAGAAAAGATTGCAAATAATACTTGGTTTGAGGATAACGAAAAC +CTGTATTCTGTTGCGGTATCGCAAAACCGTTCTGTAAAAGGCTTTATAGAATTTAAGGAAGAAAAATTCT +ACATCATTAAGGTTGACTTGATATTTCCTGTATTGCACGGCAAGAACGGCGAGGACGGTACTTTGCAGGG +ATTATTTGAATTGGCAGGAATACCTGTTGTTGGGTGTGATACACTCTCGTCTGCTCTTTGTATGGACAAA +GATAAAGCACATAAACTTGTTAGCCTTGCGGGTATCTCTGTTCCAAAATCAGTAACATTCAAACGCTTTA 
+ACAAAGAAGCAGCGATGAAAGAGATTGAAGCGAATTTAACTTATCCGCTGTTTATTAAACCTGTTCGTGC +AGGCTCTTCCTTTGGAATAACAAAAGTAATTGAAGAGCAAGAGCTTGATGCTGCCATAGAGTTGGCATTT +GAACACGATACAGAAGTCATCGTTGAAGAAACAATAAACGGCTTTGAAGTCGGTTGTGCCGTACTTGGCA +TAGATGAGCTGATTGTTGGCAGAGTTGATGAAATCGAACTGTCAAGCGGCTTTTTTGATTATACAGAGAA +ATATACACTTAAATCTTCAAAGATATATATGCCTGCAAGGATTGATGCTGAAGCAGAAAAACGGATACAA +GAAACGGCTGTAACTATATATAAAGCTCTGGGCTGTTCGGGTTTTTCCAGAGTGGATATGTTTTATACAC +CGTCTGGCGAAATTGTATTTAATGAGGTAAACACAATACCAGGCTTTACCTCGCACAGTCGCTATCCAAA +TATGATGAAAGGCATTGGTCTATCGTTCGCCCAAATGTTGGATAAGCTGATAGGTCTGTATGTGGAATGA +>tetS_M_MH283012 +TTGAAAATTATTAATATCGGTATCTTAGCACATGTTGATGCAGGAAAAACTACTTTGACAGAAAGCTTAC +TATACAGTAGCGGAGCAATTAAAGAGTTAGGAAGTGTAGATAGCGGTACAACGAAAACGGATACTATGTT +TTTGGAACGCCAGAGAGGTATTACTATTCAGACCGCAATAACATCTTTTCAACGGGAAAATGTTAAAGTA +AATATTGTAGATACTCCTGGACACATGGATTTTTTGGCAGATGTATACCGTTCATTATCTGTTTTGGATG +GAGCTATTTTGCTAATCTCTGCAAAAGATGGAGTACAGTCACAAACTCGTATACTATTCCATGCACTTAG +AAAGATGAACATACCTATAATATTTTTTATTAACAAAATTGATCAAAATGGAATAAATTTGCCAGATGTT +TATCAAGATATTAAGGACAAACTTTCTGACGACATCATAATTAAGCAGACTGTGAATCTAAATTTGAAAC +CTTATGTAATAGATTATACTGAACCAGAACAATGGGAGACAGTAATTGTGGGAAATGATTATTTATTAGA +AAAATATACCATTGGGAAAACATTGAATATTGCAGAACTTGAAAAGGAGGAAAACGAAAGAATTCAAAGT +TGCTCCTTATATCCTGTTTATCACGGAAGTGCAAAGAATAATATTGGAATTAAACAACTTATAGAGGTAA +TTACTAGCAAATTATTTTCACCCACACAACTCAATTCAGATAAACTTTGTGGAAATGTTTTTAAAGTAGA +ATATTCAGATGATGGTCAACGGCTTGTCTATGTACGTCTTTATAGTGGAACGCTACATTTGCGAGACTCA +GTCAATATATCAGAAAAGGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGAGAATTACGCC +AGATAGATAAGGCAGAGCCTGGTGAGATTATTATTTTAAAAAATGAGCTTTTAAAACTAAATAACGTACT +TGGAGATAAAAAAAGATTACCACATAGAGAAATTCTTGAGAATCCTCTTCCTATGTTACAAACAACAATT +GAACCATGTAAATCAGTACAAAGAGAAAAGTTACTAGATGCACTTTTTGAAATATCCGATAGTGATCCCC +TTCTACAATATTATGTAGATACAGTAACTCACGAAATTGTGCTATCTTTTTTAGGTGAGGTCCAAATGGA +GGTAACTTGTACTCTGATTCAAGAAAAATATCATATTGAGATAGAAACAAGAAAACCAACTGTCATTTAT +ATGGAAAGACCATTAAAAAAATCTGAATTTACCATTGATATCGAAGTACCTCCAAATCCTTTCTGGGCTT 
+CTATTGGTTTATCTGTAACACCACTTCCTTTGGGTAGTGGCATTCAGTATGAGAGCCTGGTTTCTCTAGG +TTATTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGTATACGCTATGGGTGTGAACAAGGATTGTAC +GGTTGGAAATTAACAGACTGTAAGATCTGTTTTAAGTATGGTCTATATTACAGCCCTGTCAGTACGCCAG +CAGATTTCCGAATGCTTGCGCCTATTGTACTAGAGCAGGCTTTTAGAAAGAGTGGTACAGAGTTATTAGA +GCCATATCTTAGCTTCGAAATTTATGTACCACAAGAATATCTTTCGAGAGCATATAATGATGCTTCCAAA +TATTGTGCAAATATTTTAAATACTAAGTTAAAAGGTAACGAGGTCATTCTCATTGGTGAAATTCCAGCCC +GTTGTATTCAAGAGTATCGAAACAGTTTAACTTTCTTTACAAATGGACGCAGTGTCTGTTTAACAGAGTT +AAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAA +GTACGATATATGTTCAATAAAATAACTTAG +>tetM_MH283017 +ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCGGGAAAAACTACCTTAACAGAAAGCTTAT +TATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAGAGGTACAACGAAAACGGATAATACGCT +TTTAGAACGTCAGAGAGGAATTACAATTCAGACGGCGATAACCTCTTTTCAGTGGAAAAATACTAAGATA +AACATCATAGACACGCCAGGACATATGGATTTTTTAGCAGAAGTATATCGTTCATTATCAGTATTAGATG +GGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTGTTTCATGCACTTAG +GAAAATAGGTATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTT +TATCAGGATATTAAAGAGAAACTTTCTGCGGAAATTGTAATCAAACAGAAGGTAGAACTGCATCCTAATA +TGCGTGTAATGAACTTTACCGAATCTGAACAATGGGATATGGTAATAGAAGGAAATGATTACCTTTTGGA +GAAATATACGTCTGGGAAATTATTGGAAGCATTAGAACTCGAACAAGAGGAAAGCATAAGATTTCATAAT +TGTTCCCTGTTCCCTGTTTATCACGGAAGTGCAAAAAACAATATAGGGATTGATAACCTTATAGAAGTGA +TTACGAATAAATTTTATTCATCAACACATCGAGGTCAGTCTGAACTTTGCGGAAAAGTTTTCAAAATTGA +GTATTCGGAAAAAAGACAGCGTCTTGCATATATACGTCTTTATAGTGGCGTACTGCATTTGCGAGATTCG +GTTAGAATATCGGAAAAGGAAAAAATAAAAATTACAGAAATGTATACTTCAATAAATGGTGAATTATGTA +AAATCGATAAGGCTTATTCCGGGGAAATTGTTATTTTGCAGAATGAGTTTTTGAAGTTAAATAGTGTTCT +TGGAGATACAAAGCTATTGCCACAGAGAGAGAGAATTGAAAATCCCCTCCCTCTGCTGCAAACGACTGTT +GAACCGAGCAAACCTCAACAAAGGGAAATGTTACTTGATGCACTTTTAGAAATCTCCGACAGTGACCCGC +TTCTGCGATATTATGTGGATTCTGCGACACATGAAATCATACTTTCTTTCTTAGGGAAAGTACAAATGGA +AGTGACTTGTGCTCTGCTGCAAGAAAAGTATCATGTGGAGATAGAAATAAAAGAGCCTACAGTCATTTAT +ATGGAAAGACCGTTAAAAAAAGCAGAGTATACCATTCACATCGAAGTTCCACCGAATCCTTTCTGGGCTT 
+CCATTGGTCTATCTGTAGCACCGCTTCCATTAGGGAGCGGAGTACAGTATGAGAGCTCGGTTTCTCTTGG +ATACTTAAATCAATCGTTTCAAAATGCAGTTATGGAGGGGATACGCTATGGCTGTGAACAAGGATTGTAT +GGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAG +CAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGA +GCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAA +TATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTC +GGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTT +AAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAA +GTACGATATATGTTCAATAAAATAACTTAG +>tetAp_L20800 +ATGGTTAATAAACTTTCAGCATATAAAACTTATTTATTATTTTCAGCTATTACAGCAATGTGTTTTTCGT +TAGTAGCTACAGTTATGATGGTGTATCACATTGAAATAGTTCATTTAAATCCACTTCAGCTTATACTTGT +TGGAACTACTTTGGAATTAGCATGCTTTATATTTGAAATTCCTACAGCTATAGTTGCAGATGTGTATAGT +CGTAAACTATCTATTGTTATTGGGGGAGTTTTAACAGGAGTGGGATTTATTTTAGAAGGTTCTATTTCTA +GTTTTGTTTTCGTACTTGTAGCACAGATTGTATGGGGATTAGGGTCTACTTTTATCAGTGGCTCGCTTGA +AGCTTGGATTGCGGAAGAAGAGAAGAATAAAGATTTAGATGAAATTTATATAAAGGGAGCACAAGCAGGG +CAGATAGGAGCATTTATTGGAATAGTACTAAGCACTGTAATAGCTAATTTCTCTGTAAGGCTTCCTATTA +TAGTTAGTGGAGTTTTATTTATAATTCTTGCATTATTTTTATGGTTATATATGCCAGAAAATAATTTTAA +ACCATCTGCTCCTGGGGATTTAAATACATTCAAAAAGATGGTATATACATTTAAATCTGGTCTTAAATTT +GTAAAAAGTAAATCTATAATTATGATTTTACTTGCAGTAACTTTATTTTATGGATTATCAAGTGAAGGTT +ATGATAGACTTTCTAATGCGCATTTTTTACAAGATACTACACTTCCTAAACTTGGAAACCTTAGTTCAGT +GACTTGGTTTGGAATTTTTGGAATTTTAGGAATGATATTGAGCTTCATAGTAATGCATTTTATGGCAAAG +AATCTTAAGAATGAGGATAATAGGAAAAATGGAAAACTATTATTATGCATAAATATACTTTATATATCGT +CTATGTTGATATTTGCTCTTACAAGAAACTTTAGTTTAATGTTAATAGCTTATTTGGCAACAAATACCTT +TAGAATTATAAATAAACCTATATTCAGTGCGTGGTTAAATGGGCATATAGATGATAATTCTAGAGCTACT +GTGCTTTCTATAAATGGACAAATGAATTCCTTAGGTCAAATTTTAGGTGGACCGATTATAGGAATCATAG +CTACAAATATTTCAGTAAGTATTGGTATAGTATGTACTTCGTTATTAGTAACACCGGTATTAGTGTTATA +TATTGTTGCTATGATAATTGATAAAAAGGTGGATGATAGAGTTGGAGGTATTGATTATGAAGAAAATAAT +TAA +>tetBp_L20800 
+ATGAAGAAAATAATTAATATAGGAATCGTAGCACACGTGGATGCAGGAAAAACAACTATAACAGAAAACT +TATTATATTATAGTGGAGCTATAAAATCAGTTGGAAGAGTTGATTTAGGCAATACACAGACGGATTCTAT +GGAGCTTGAGCGTAAGAGAGGAATTACCATTAAATCGTCAACCATATCTTTTAATTGGAATAATGTTAAG +GTGAATATTATTGATACTCCAGGACATGTGGATTTTATTTCGGAAGTTGAACGTTCATTAAATAGCTTAG +ATGGAGCAATACTAGTTATATCAGGAGTAGAGGGGATTCAGTCACAAACAAGAATATTATTTGACACATT +AAAGGAGTTAAATATTCCAACAATAATTTTTGTAAATAAGCTAGATAGAATTGGGGCAAATTTCAACAAA +GTATTTGAAGAAATAAAGAAGAATATGTCCAATAAAGTAGTTAGATTACAAGAAGTATATGATGTAGGAA +GCAAAGCTGTTTATATAAAAAAACTATTTGATACATGCATAATAAATGATGATGCTATTAATGTTTTATC +AGACTTAGACGAAGCATTTTTAGAAAGATATATTGGTGGAATAGAACCTGATAAAGAAGAAATACAAGAA +AAGCTTTCATTATATGCAAGAGAAGGAAGTCTATATCCAGTATTTTGTGGTGCTGCAGCAATTGGACTTG +GAATTGAAGATTTATTAGATGGAATTTGTAGTTATTTTCCATTTGCAAGTAATGATTGTGAAAGTGATTT +ATCTGGGGTAGTATTTAAAATCGAAAGAACAAGTAAAAATGAAAAGAAGGTTTATGTAAGATTATTTGGA +GGAAAAATATCTGTAAGAGATAAAATTCAAGTACCTAATAAGGAGATAGCAGAAAAAGTAAAGAAAATTA +ATAGGTTAGAAAATGGGGGAGTTGTTGAAGCACAGAGGATAGAAGCAGGGGATATAGGTATTTTATATGG +ACTTACAAGTTTCCAAGTGGGAGATGTTATTGGAATTTCAAATGATAAAATTAAAAATATATCTATAGCT +AAACCAGCATTAAAAACAACAATTTCTGCAATTGATAAAGAAAAAAATCCAGAGCTATTTAAAGCATTAA +CATTACTTGCAGAGGAAGATCCACTACTCGCCTTCGCGATGAATGACATAGATAAAGAAATTTATGTCAA +CTTATTCGGTGAAGTTCAAATGGAAATACTAAGTTCCATGTTAGATGATTTATATGGAATAAAAGTAGAG +TTTTCGAATATTGAGACTATCTATAAGGAAACACCTAAAGGTTTTGGAGCGTCAATAATGCATATGCAGG +AAGACTTAAATCCATTTTGGGCGACAGTAGGCTTAGAAATAGAACCAGCAGGGAGAGGCGAAGGTCTTAG +GTATATTTCTAATGTTTCAGTAGGGTCATTGCCAAAATCTTTTCAAAATGCAATTGAAGAAGCAGTTATT +AAGACAAGTAAACAAGGATTATTTGGATGGGAGGTTACAGATGTAAAAGTCACTCTTAGCTGTGGTGAAT +TTTTTAGTCCAGCCAGCACTCCAGCAGATTTTAGAAATGTGACACCTATGGTATTCATGGAAGCATTATA +TAAAGCACAAACTGTTTTATTAGAGCCATTACATGAGTTTGAGTTAAAGATTCCTCAAAATGCTTTAAGC +AAAGCGGTATGGGATTTAGAAACTATGAGGGCAACCTTTGATAATCCTATTGTTATAGGGGATGAATTCT +CAATAAAGGGATTAATTCCAGTAGAAAATTCAAAAGAATATAAAATGAAAATAGCTTCATATACAGAAGG +TAGAGGAATGTTTGTGACAAAATTTTATGGGTATAAGGAAGCTTCAGCTGAATTTTCAAAAGCACGCAAA 
+AAAACAACGTATGATCCATTGAATAAAAAAGAGTATTTGCTTCATAAACTAAACGCAATTAGAGATTAA +>tetAQ2_Z21523 +GTGCGTTTCGACAATGCATCTATTGTAGTATATTATTGCTTAATCCAAATGAATATTATAAATTTAGGAA +TTCTTGCTCACATTGATGCAGGAAAAACTTCCGTAACCGAGAATCTGCTGTTTGCCAGTGGAGCAACGGA +AAAGTGCGGCCGTGTGGATAATGGTGACACCATAACAGACTCTATGGATATAGAGAAACGTAGAGGAATT +ACTGTTCGGGCTTCTACGACATCTATTATCTGGAATGGAGTGAAATGCAATATCATTGACACTCCGGGAC +ACATGGATTTTATTGCGGAAGTGGAGCGGACATTCAAAATGCTTGATGGAGCAGTCCTCATCTTATCCGC +AAAGGAAGGCATACAAGCGCAAACAAAGTTGCTGTTCAATACTTTACAAAAACTGCAAATCCCGACAATT +ATATTTATCAATAAAATTGACCGTGACGGTGTGAATTTAGAGCGTTTGTATCTGGATATAAAAACAAATC +TGTCTCAAGATGTCCTGTTTATGCAAACTGTTGTCGATGGATTGGTTTATCCGATTTGCTCCCAAACATA +TATAAAGGAAGAATACAAAGAATTTGTATGCAACCATGACGACAATATATTAGAACGATATTTGGCGGAT +AGCGAAATTTCACCGGCTGATTATTGGAATACGATAATCGATCTTGTGGCAAAAGCCAAAGTCTATCCGG +TACTACATGGATCAGCAATGTTCAATATCGGTATCAATGAGTTGTTGGACGCCATCTCTTCTTTTATACT +TCCTCCAGAATCAGTCTCAAACAGACTTTCAGCTTATCTCTATAAGATAGAGCATGACCCCAAAGGACAT +AAAAGAAGTTTTCTAAAAATAATTGACGGAAGTCTGAGACTTCGAGACATTGTAAGAATCAACGATTCGG +AAAAATTCATCAAGATTAAAAATCTAAAGACTATTTATCAGGGCAGAGAGATAAATGTTGATGAAGTGGG +GGCCAATGATATCGCGATTGTAGAAGATATGGAAGATTTTCGAATCGGAGATTATTTAGGTACTAAACCT +TGTTTGATTCAAGGGTTATCTCATCAGCATCCCGCTCTCAAATCCTCCGTCCGGCCAGACAGGTCCGAAG +AGAGAAGCAAGGTGATATCCGCTCTGAATACATTGTGGATTGAAGACCCGTCTTTGTCCTTTTCCATAAA +CTCATATAGTGATGAATTGGAAATCTCGTTATATGGTTTGACACAAAAGGAAATCATACAGACATTGCTG +GAAGAACGATTTTCCGTAAAGGTCCATTTTGATGAGATCAAGACTATCTACAAAGAACGACCTGTAAAAA +AGGTCAATAAGATTATTCAGATCGAAGTGCCACCCAACCCTTACTGGGCCACAATAGGGCTGACGCTTGA +ACCCTTGCCGTTAGGGACAGGGTTGCAAATCGAAAGTGACATCTCCTATGGTTATCTGAACCATTCTTTT +CAAAATGCCGTTTTTGAAGGGATTCGTATGTCTTGCCAATCTGGTTTACATGGATGGGAAGTGACTGATC +TGAAAGTAACTTTTACTCAAGCCGAGTATTATAGCCCGGTAAGTACACCTGCTGATTTCAGACAGCTGAC +CCCTTATGTCTTCAGGCTGGCCTTGCAACAGTCAGGTGTGGACATTCTCGAACCGATGCTCTATTTTGAG +TTGCAGATACCCCAAGCGGCAAGTTCCAAAGCTATTACAGATTTGCAAAAAATGATGTCTGAGATTGAAG +ACATCAGTTGCAATAATGAGTGGTGTCATATTAAAGGGAAAGTTCCATTAAATACAAGTAAAGACTACGC 
+CTCAGAAGTAAGTTCATACACTAAGGGCTTAGGCGTTTTTATGGTCAAGCCATGCGGGTATCAAATAACA +AAAGGCGATTATTCTGATAATATCCGCATGAACGAAAAAGATAAACTTTTATTCATGTTCCAAAAATCAA +TGTCATCAAAATAA +>tetS_FN555436 +TTGAAAATTATTAATATCGGTATCTTAGCACATGTTGATGCAGGAAAAACTACTTTGACAGAAAGCTTAC +TATACAGTAGCGGAGCAATTAAAGAGTTAGGAAGTGTAGATAGCGGTACAACGAAAACGGATACTATGTT +TTTGGAACGCCAGAGAGGTATTACTATTCAGACCGCAATAACATCTTTTCAACGGGAAAATGTTAAAGTA +AATATTGTAGATACTCCTGGACACATGGATTTTTTGGCAGATGTATACCGTTCATTATCTGTTTTGGATG +GAGCTATTTTGCTAATCTCTGCAAAAGATGGAGTACAGTCACAAACTCGTATACTATTCCATGCACTTAG +AAAGATGAACATACCTATAATATTTTTTATTAACAAAATTGATCAAAATGGAATAAATTTGCCAGATGTT +TATCAAGATATTAAGGACAAACTTTCTGACGACATCATAATTAAGCAGACTGTGAATCTAAATTTGAAAC +CTTATGTAATAGATTATACTGAACCAGAACAATGGGAGACAGTAATTGTGGGAAATGATTATTTATTAGA +AAAATATACCATTGGGAAAACATTGAATATTGCAGAACTTGAAAAGGAGGAAAACGAAAGAATTCAAAGT +TGCTCCTTATATCCTGTTTATCACGGAAGTGCAAAGAATAATATTGGAATTAAACAACTTATAGAGGTAA +TTACTAGCAAATTATTTTCACCCACACAACTCAATTCAGATAAACTTTGTGGAAATGTTTTTAAAGTAGA +ATATTCAGATGATGGTCAACGGCTTGTCTATGTACGTCTTTATAGTGGAACGCTACATTTGCGAGACTCA +GTCAATATATCAGAAAAGGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGAGAATTACGCC +AGATAGATAAGGCAGAGCCTGGTGAGATTATTATTTTAAAAAATGAGCTTTTAAAACTAAATAACGTACT +TGGAGATAAAAAAAGATTACCACATAGAGAAATTCTTGAGAATCCTCTTCCTATGTTACAAACAACAATT +GAACCATGTAAATCAGTACAAAGAGAAAAGTTACTAGATGCACTTTTTGAAATATCCGATAGTGATCCCC +TTCTACAATATTATGTAGATACAGTAACTCACGAAATTGTGCTATCTTTTTTAGGTGAGGTCCAAATGGA +GGTAACTTGTACTCTGATTCAAGAAAAATATCATATTGAGATAGAAACAAGAAAACCAACTGTCATTTAT +ATGGAAAGACCATTAAAAAAATCTGAATTTACCATTGATATCGAAGTACCTCCAAATCCTTTCTGGGCTT +CTATTGGTTTATCTGTAACACCACTTCCTTTGGGTAGTGGCATTCAGTATGAGAGCCTGGTTTCTCTAGG +TTATTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGTATACGCTATGGGTGTGAACAAGGATTGTAC +GGTTGGAAATTAACAGACTGTAAGATCTGTTTTAAGTATGGTCTATATTACAGCCCTGTCAGTACGCCAG +CAGATTTCCGAATGCTTGCGCCTATTGTACTAGAGCAGGCTTTTAGAAAGAGTGGTACAGAGTTATTAGA +GCCATATCTTAGCTTCGAAATTTATGTACCACAAGAATATCTTTCGAGAGCATATAATGATGCTTCCAAA +TATTGTGCAAATATTTTAAATACTAAGTTAAAAGGTAACGAGGTCATTCTCATTGGTGAAATTCCAGCCC 
+GTTGTATTCAAGAGTATCGAAACAGTTTAACTTTCTTTACAAATGGACGCAGTGTCTGTTTAACAGAGTT +AAAAGGTTATCAGGTTACTAACATTAAGTCTGCTTTCCAACCACGTCGTCCAAATAATAGAATAGACAAA +GTAAGGCATATGTTTAATAAAATCAACTTACATTGA +>tetT_L42544 +ATGAAAATTATTAATATAGGAATATTAGCACATGTTGATGCAGGTAAAACAACTGTTACAGAAGGTTTAT +TATATAAAAGTGGGGCGATTAATAAAATTGGAAGAGTTGATAATGCTACAACGACAACGGATTCGATGGA +ACTTGAAAGAGATAGGGGAATAACTATACGGGCGTCTACAGTTTCATTTAATTACAATGATACAAAGGTA +AATATCATAGATACACCTGGGCACATGGATTTCATAGCCGAAGTTGAGCGAACTCTGAAAGTGTTAGATG +GAGCTATTTTAGTAATTTCAGCAAAAGAAGGAATTCAAGTCCAAACTAAAGTGATTTTTAATACTTTAGT +GAAATTAAATATACCAACACTTATATTTGTGAATAAAATAGATCGAAAGGGAGTATGTTTGGATGAGATA +TACACTCAAATACAGGAGAAATTAACTTCTAATCTTGCAATAATGCAATCAGTTAAAATAAAAGATAAAG +GTGATTTTGAATTGACAAATGTAAGGGATGATAAAGTAATTCAAAGTCAAATAATAGAGAAGTTACTGGA +TATAAATGATTATCTAGCAGAAAAATATATAAATGGCGATGTCATTGCAGAAAAAGAATATAATGATGTA +TTTTTGGATGAGATTAATAACTGCAATCTTTATCCTGTATTTCATGGTTCGGCTTTAAAAAATATTGGAA +TTGACGAGCTATTATTTGCCATTACTAAATATCTTCCTACCAAGAGCTATAATACTGAAGACCTTTTATC +AGCGTATGTTTATAAGATTGATAGGGATGAAAAATCTAGAAAGATGACTTTCTTAAGAGTATTCAGTGGG +AATATAAGGACACGTCAAGATGTTTATATAAATGGCACAGAAGAAACTTTCAAGATAAAAAGTCTGGAAT +CAATTATGAATGGTGAAATTGTGAAGGTAGGTCAGGTTAATAGTGGGGATATTGCTATTATTTCTAATGC +TAATTCTCTGAAGATAGGTGATTATATTGGTAAGAAATATGACGGGATTTTAGATATAAAGATAGCCCAA +CCGGCATTGAGAGCATCAATTAAACCTTGTGATTTAAGCAAAAGAAGCAAACTGATAGAAGCACTATTTG +AATTAACTGAAGAAGACCCATTTCTCGATTGTGAAATTAACGGAGATACTGGAGAAATCATATTGAGGCT +ATTTGGAAATATTCAAATGGAAGTAATAGAATCACTACTTAAAAGCCGATACAAAATAGATGCTAGATTT +GGTGAATTGAAAACAATATATAAAGAACGACCTAAGAGAAACTCTAAAGCAGTAATCCATATAGAGGTTC +CACCAAATCCTTATTGGGCATCTATTGGACTGTCAATAGAACCACTACCAATAGGGTCAGGATTATTATA +TAAGACAGAGGTGTCCTATGGATATTTAAATAATTCATTTCAAAATGCAGTAAAAGATGCTGTAGAGAAG +GCTTGTAAAGAAGGGCTTTATGGATGGGAAGTTACAGACTTAAAGGTAACTTTTGACTACGGATTATACT +ATAGCCCGGTAAGTACCCCCTCTGACTTTAGGAATTTAACACCATATGTATTTTGGGAAGCTCTTCGAAA +AGCAGGAACTGAAATATTAGAACCTTATTTAAAATATACAGTTCAAGTTCCAAATGATTTCTGCGGAAGG +GTTATGAGTGATCTTAGAAAGATGAGGGCTTCTATTGAAGATATAATAGCCAAGGGAGAGGAGACAACTT 
+TAAGTGGAAAGATACCTGTTGATACATCGAAGTCCTATCAGTCAGAATTACTTTCTTATTCAAATGGAAA +GGGTATATTTATTACTGAGCCTTATGGGTATGATATATATAATGATAAGCCTATAATTAATGATATTGGG +AACGACAATAATGATAGCAACAAGGAAGGGTTAAGATATTTATTTCAAAAACAGGATGAAAATTGA +>tetW_AJ222769 +ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAAAGACGACCTTGACGGAGAGCCTGC +TATATGCCAGCGGAGCCATTTCAGAACCGGGGAGCGTCGAAAAAGGGACAACGAGGACGGACACCATGTT +TTTGGAGCGGCAGCGTGGGATTACCATTCAAGCGGCAGTCACTTCCTTCCAGTGGCACAGATGTAAAGTT +AACATTGTGGATACGCCCGGCCACATGGATTTTTTGGCGGAGGTGTACCGCTCTTTGGCTGTTTTAGATG +GGGCCATCTTGGTGATCTCCGCTAAAGATGGCGTGCAGGCCCAGACCCGTATTCTGTTCCATGCCCTGCG +GAAAATGAACATTCCCACCGTTATCTTTATCAACAAGATCGACCAGGCTGGCGTTGATTTGCAGAGCGTG +GTTCAGTCTGTTCGGGATAAGCTCTCCGCCGATATTATCATCAAGCAGACGGTGTCGCTGTCCCCGGAAA +TAGTCCTGGAGGAAAATACCGACATAGAAGCATGGGATGCGGTCATCGAAAATAACGATGAATTATTGGA +AAAGTATATCGCAGGAGAACCAATCAGCCGGGAAAAACTTGCGCGGGAGGAACAGCAGCGGGTTCAAGAC +GCCTCCCTGTTCCCAGTCTATCATGGCAGCGCCAAAAATGGCCTTGGCATTCAACCGTTGATGGATGCGG +TGACAGGGCTGTTCCAACCGATTGGGGAACAGGGGGGCGCCGCCCTATGCGGCAGCGTTTTCAAGGTTGA +GTACACCGATTGCGGCCAGCGGCGTGTCTATCTACGGTTATACAGCGGAACGCTGCGCCTGCGGGATACG +GTGGCCCTGGCCGGGAGAGAAAAGCTGAAAATCACAGAGATGCGTATTCCATCCAAAGGGGAAATTGTTC +GGACAGACACCGCTTATCAGGGTGAAATTGTTATCCTTCCCAGCGACAGCGTGAGGTTAAACGATGTATT +AGGGGACCAAACCCGGCTCCCTCGTAAAAGGTGGCGCGAGGACCCCCTCCCCATGCTGCGGACGACGATT +GCGCCGAAAACGGCAGCGCAAAGAGAACGGCTGCTGGACGCTCTTACGCAACTTGCGGATACTGACCCGC +TTTTGCGTTGCGAAGTGGATTCCATCACCCATGAGATCATTCTTTCTTTTTTGGGCCGGGTGCAGTTGGA +GGTTGTTTCCGCTTTGCTGTCGGAAAAATACAAGCTTGAAACAGTGGTAAAGGAACCCTCCGTCATTTAT +ATGGAGCGGCCGCTCAAAGCAGCCAGCCACACCATCCATATCGAGGTGCCGCCCAACCCGTTTTGGGCAT +CCATAGGACTGTCTGTTACACCACTCTCGCTTGGCTCCGGTGTACAATACGAGAGCCGGGTTTCGCTGGG +ATACTTGAACCAGAGTTTTCAAAACGCTGTCAGGGATGGTATCCGTTACGGGCTGGAGCAGGGCTTGTTC +GGCTGGAACGTAACGGACTGTAAGATTTGCTTTGAATACGGGCTTTATTACAGTCCGGTCAGCACGCCGG +CGGACTTCCGCTCATTGGCCCCGATTGTATTGGAACAGGCATTGAAGGAATCGGGGACGCAGCTGCTGGA +ACCTTATCTCTCCTTCATCCTCTATGCGCCCCAGGAATACCTTTCCAGGGCTTATCATGATGCACCGAAA 
+TACTGTGCCACCATCGAAACGGCCCAGGTAAAAAAGGATGAAGTTGTCTTTACTGGCGAGATTCCCGCCC +GCTGTATACAGGCATACCGTACTGATCTGGCCTTTTACACCAACGGGCGGAGCGTATGCCTTACAGAGCT +GAAAGGATATCAGGCCGCTGTCGGTCAGCCGGTCATCCAGCCCCGCCGTCCAAACAGCCGCCTGGACAAG +GTGCGCCATATGTTTCAGAAGGTAATGTAA +>tet32_AJ295238 +ATGAAAATAATTAACTTAGGCATTCTGGCTCACGTTGACGCAGGAAAGACAACATTAACGGAAAGTTTAT +TGTATACCAGTGGTGCAATTGCAGAACTAGGGAGCGTAGATGAAGGCACAACAAGGACAGATACAATGAA +TTTGGAGCGTCAAAGGGGAATCACTATCCAGACAGCAGTGACATCTTTTCAGTGGGAGGATGTAAAAGTC +AACATTATAGATACGCCAGGCCATATGGATTTTTTAACCGAAGCATACCGCTCTTTATCTGTCCTTGACG +GAGCTGTTTTAGTCATTTCGGCAAAAGACGGCGTACAGGCACAGACGCGTATATTATTCCATGCGCTTCA +GAAAATGAACATTCCGACAATTATCTTTATAAATAAGATAGACCAAAATGGAATCGACCTACAGCGTGTT +TACCAAAGCATTAAAGACAAACTTACCAGTGATATGATTGTCATGCAGGAGGTTTCCCTGTCGCCAAAGA +TAACCATGACCGATATTTCTGATTTGGACAAATGGGATATGATTATTTCCGGAAGCGATGAACTATTAGA +ACGATATGTTGCAGAGGATTCTTTGGATATACAGGAATTACAATATGAAAAGTGCAAAAGAACCAGATGC +TGCTCTTTGTTTCCTGTTTATCATGGGAGTGCAAAAGACAATTTAGGAACAGAAAAACTGATTGAAGCGA +TTACAGAAACTTTCATTACAGAAACAGACGATATTCAGTCTGAATTATGTGGATATGTTTTTAAGGTTGA +GTATACAGAGCGGAAAAAACGGCTTTCTTATTTACGCCTGTATCATGGGACGCTCCATTTACGGGATACC +CTGCTGCTGTCAAAAAAGGAAAAAATAAAGATTACAGAAATGTGTATTCCGTCAAATGGTGAAATCGTCC +CGGTTGACCATGCCTGTCCGGGAGAAATTGTTATTTTAGCTGATGATACTTTGAAACTGAACGACATTCT +GGGAAATGAAAAACTCCTGCCTCACAAAACACGGATTGATAATCCCATGCCATTACTTCGGACAACGGTA +GAGCCGCAAAAGCCGGAGCAAAGGGAAGCCCTGTTAAATGCCCTCACAGAGATTGCTGATACAGACCCTC +TTTTGCATTTTGACATTGATACTGTTACACATGAGATTATATTATCTTTTTTGGGAAAAGTACAGTTAGA +AGTTATTTGTTCGCTATTAGAAGAAAAATATCATGTGGGCGTGGCTATGAAAGAGCCTTCGGTTATTTAT +CTGGAAAGACCGCTTAGAAAAGCAGAATATACCATCCACATAGAAGTCCCGCCAAATCCTTTCTGGGCTT +CTGTCGGGTTGTCCATAGAGCCGCTCCCTATTGGAAGCGGAGTGCAGTATGAAAGCAGAGTTTCACTTGG +ATATTTAAATCAATCGTTCCAAAATGCGGTTATGGAGGGGGTTCTTTATGGCTGCGAGCAGGGGCTGTAT +GGATGGAAAGTGACAGACTGTAAAATCTGTTTTGAATATGGATTGTATTATAGTCCTGTAAGTACCCCCG +CAGACTTTCGGCTGCTTTCCCCTATCGTATTGGAGCAGGCTTTAAAAAAAGCAGGGACAGAACTATTAGA +GCCATATCTCCACTTTGAAATTTATGCACCGCAGGAATATCTCTCACGGGCGTATCATGATGCTCCAAGG 
+TATTGTGCAGATATTGTAAGTACTCAGATAAAGAATGACGAGGTCATTCTGAAAGGAGAAATCCCTGCTA +GATGTATTCAAGAATACAGGAACGATTTAACTTATTTCACAAATGGGCAGGGAGTCTGCTTGACAGAGTT +AAAAGGATACCAGCCAGCTATTGGTAAATTTATTTGCCAACCCCGCCGCCCGAATAGCCGTATAGATAAG +GTTCGGCATATGTTCCACAAGTTAGCTTAA +>tet36_AJ514254 +ATGAGAACTATAAATATAGGTATTCTTGCACATATTGATGCAGGAAAGACCTCCATTACAGAGAACTTGC +TATTTGCGAGTGGAGCAACCATAGTACGTGGAAGTGTGGACAAAGGAAACACTACAACCGATTCGATGGA +TATCGAAAAACGAAGAGGTATCACAGTTAGAGCGTCTACAACATCTATTCAATGGAATGATACAAAGATT +AATATCATCGACACTCCTGGACACATGGACTTTCTGGCAGAGGTAGAACGCACTTTTAGGATGCTAGATG +GTGCTATACTTGTGGTGTCTGCCAAAGAGGGCATTCAAGCTCAAACAAGGTTGTTGTTCAATGTCCTGCA +ACAACTAGAAATACCTACAATTCTATTCGTCAACAAAATAGACAGAGAGGGAGTCAATCTAAATCAGCTT +TATTTAGAGATACAAAATAGCCTTTCTAAAGATATTATCTTTATGCAATCCGTTGAAGGCAAGGAATTAA +CATCTAGCTGTACAATACACTACATATCAGAAAAGAACAGAGAAACAATTTTAGAGAAAGATGATCTCTT +GCTTGAAAAATACTTGAGTGATACACAGCTTTCTAATTTAGATTATTGGAATTCAATGGTTCGTCTTGTT +CAAGCTGCTAAATTACATCCTATCTATCATGGTTCAGCAATGTATGGCATTGGTATTGAAGATTTGCTAA +ACTCAATCACTACTTTTATCGAAACATCTCTACCTCAAGAGAACGCTTTGTCTGCCTATGTTTATAAAAT +TGAGCATAATAAGAAGGAACAGAAACGAGCCTATCTAAAGATTATAGGTGGAACCCTTAAATCTCGAAAA +TTATATAGCCTCAATGGCTCAGATGAGAATCTGAAGATAAGAGGTTTAAAGACCTTTTACTCAGGAGACG +AAATAGATGTAGACGAAGTTTTTACAAATGATATTGCAATTGCAGATCATGCTGATAACTTAATGGTAGG +AGATTATCTAGGAATAATGCCAAACTTATTCGACAAATTGAATATTCCTAGTCCTGCTCTCAAATCGTCT +ATACATCCTGCAAAAGTAGAGAATAGGAGTAAATTGATTTCTGCTATGAATGTATTATCAGTAGAAGATC +CATCTTTGGCCTTTAGCATTAATGCTGATAATAATGAATTGGAGGTTTCGCTTTATGGAGCAACTCAACG +GGAGGTGATTTTGACTTTATTGGAAGAGAGATTTTCGGTAGATGCTTACTTTGAAGAGGTGAAAACTATC +TATAAAGAACGTCTTAAAACAAAATCGGAATACACCATTCATATCGAAGTGCCACCTAATCCGTATTGGG +CATCTATTGGCTTGATTATAGAGCCTTTGCCAATTGGGGCGGGACTTGTAATGGAGAGTGAAATATCATT +GGGATATTTGAATCGATCCTTTCAGAATGCAGTATTCGATGGAGTCAAGAAAGCCTGTGAATCGGGTTTG +TACGGTTGGGAAGTAACTGACCTTAAAGTCACTTTTTCTCACGGAATCTATTATAGCCCAGTGAGTACAC +CTGCCGACTTTAGAAGTTTAGCACCTTATGTTTTTCGATTAGCTTTGCAACAAGCTGATGTTGAGTTATT +GGAGCCAATCTTAGATTTTAAATTGCAAATTCCACTAGCTGTGAATGCTAGAGCTATTACAGACATCAAC 
+AAGATGCAAGGCGAAATATCTACTATTACTTCAGATGGTGATTGGACTACTATTTTGGGTAATATTCCTT +TAGATACTAGTAAAGAATACTCAGCAGAGGTCAGTTCCTACACACAAGGCTTGGGCGTTTTTGTTACTCG +ATTTTCGGGTTATCGACCTACCAACAAAAAGGTAAGCAGAAGTGTAGAACTGAATGAAAAAGATAAGCTG +ATGTATATGTTTGAGAAGGAAAGTATCAAATAA +>tet44_FN594949 +ATGAAAATAATCAACATTGGTATTCTTGCTCATGTAGATGCAGGAAAGACGACCTTAACGGAAAGTCTGC +TTTATACAAGTGGAGCAATTTTAGAATTAGGCAGTGTAGATAAGGGAACAACAAGGACAGATACTATGTT +TTTAGAACGTCAGCGTGGAATCACAATTCAGGCAGCAGTTACTTCTTTTAATTGGAATGACTACAAAATC +AATATTGTAGATACTCCTGGACATACAGATTTTATAACAGAAGTGTATCGTTCCTTATCTGTTCTTGATG +GAGCAATTTTAGTAATTTCTGCTAAAGATGGTGTACAAGCACAAACCCGAATACTATTCCATGCACTTCA +AAAAATGAATATACCAACAATTATTTTTATAAATAAAATAGATCAGGATGGAATTAACTTAAATAATATT +TATCAAAATATCAAAGAAAAACTTTCAAATGATATTATTGTTATGCAAAATGTAACATTAACTCCAGAAA +TATCAATTAAAAATATCATTGATTTAGATGATTGGGATCCTGTAATTTCCAAAAATGATAAACTTTTAGA +AAAATATATTGTAGGAGAAAAATTGACTATACAAGAATTAATGTATGAAGAATATAGGTGTGTTAAAAAA +GGTTCGTTGTTTCCTATATACCATGGAAGTGCTAGAAATAATATAGGGACTCAACAACTTATCGAAGCTA +TTTCAAATCTTTTTTGTTCTGAAATGAATGAGAATGATTCAGAACTATGTGGAAGAGTTTTTAAAATTGA +ATATACAGACCATAAGCAAAGATTAGTTTATTTGCGTCTTTATAGTGGAACATTACACTTACGAGATACA +ATTATATTGCCAGAAAAAAAGAAAGTGAAACTTACAGAAATATATATTCCTTCAAATGGAGAAATGATAC +AGACAAAAACAGTTTGTTCTGGAGATATTTTTATTATACCTAACAATACATTAAGATTGAACGATATTAT +AGGAAATGAAAAGCTTTTGCCATGCAATGTATGGAATGACAAGACTGTACCAATACTACGAACAAGAATT +GAACCGATAAAAATAGAAGAGAGAGAAAAATTATTGGATGCTCTTACAGAAATTGCAGATACTGATCCTC +TTTTACGTTATTATGTTGATACGATAACACATGAAATCATCATTTCTTTTTTAGGAACAGTGCAGTTAGA +AGTTATCTGTTCTCTGTTGATTGAAAAATATCACATAAACATAAGAATCGAAGATCCAACCGTAATTTAT +TTGGAAAAACCATTACAAAAGGCAGATTATACTATTCATATTGAAGTACCACCAAATCCATTTTGGGCAT +CGATTGGATTATCAATAACTCCACTTCCAATTGGCAGTGGAATACAGTACGAAAGCAAAGTTTCACTCGG +TTATTTAAATCAAAGTTTCCAAAATGCAGTAAGAGAAGGTATTAATTATGGACTGGAGCAAGGTTTGTAT +GGTTGGGAAGTAACAGATTGTAAAATATGTTTTGAATATGGTGTTTATTATAGCCCTGTTAGTACTCCCT +CGGATTTTCGCTTTCTTGCCCCAATTGTACTTGAACAAACATTGAAAAAAGCGGGAACGCAATTATTAGA +GCCATATCTTTCGTTTATACTTTTTACGCCACAGGGATACTTTTCTCGTGCATATAAAGATGCACAAAAA 
+CATTGTGCAATAATTGAAACAAGTCAATCAAAAAATGATGAAGTTATTTTTACAGGACATATTCCTGTAC +GTTGTATTAATGAATATCGTAATACTTTAACTCTATATACAAATGGGCAAGCAGTTTTTTTGACAGAATT +AAAAGATTATCAAATTGCTACTTGTGAACCAGTTATTCAATCACGTAGACCAAATAATCGAATAGATAAA +GTACGCCATATGTTTAATAAAAAAGAAAATTAA +>tet58_KY887560 +ATGAATTCTAATTCGTCAAACCATAAATCACAATACAACAAATTATTACTTTGGCTTTGCTTTTTATCTT +TCTTTAGTGTACTAAATGAAATGGTTCTAAATGTATCTTTTCCTGATGTAGCGAATTACTTTGGAAAAGC +TCCTGCAAGTATAAATTGGATTAATACATCGTTCATGTTAAGTTTTTCTGTAGGAACAGCCATATATGGC +AAAGTTTCTGATTATGTTGGTATTAAGAAACTGTTATTAACAGGAATTTTATTAAATTGTATAGGTTCAA +TTATGGGTTTTATAGGTCATACATCATTTCCTGTATTATTATTATCACGATTCATTCAAGGTACAGGAGC +TGCAGCTTTTCCTGCGTTAATTATGGTGGTTGTTGCTAAATATATTCCAAGGGAGAGCCAGGGAAAGGCT +TTTGGACTTATTGGTTCCATTGTTGCAATGGGTGAAGCTCTAGGTCCATCTATCGGTGGAATGATAGCTG +AATATATTCATTGGTCTTATCTATTGATATTACCTTTAGGAACTTTAATATCAGTTCCTTTTCTTATCAA +AATGCTTGATCATGAACCGATTAAAAAGGGAAGTTTTGATTTTATAGGATTAGTATTAATGTCGTTAAGC +ATAGTAACTTTTATGGTGTTTACCACATCATATAAATTATATTTCTTAGGAATAAGTTTCGTTATTTTCA +TTATTTTTATTAAGCACATAAAAAAAGTGGATGAGCCGTTTATTGAGCCTAAATTAGGTGAAAACCGATC +TTTTATGGTTGGTATTGTTTGTGGAGGTCTTTTTTTTGGAACGGTGGCAGGATTTATTTCAATGGTTCCT +TATATGATGAGAGATTTATATCAGTTAAGTACACTTGCCATTGGTAACGGGATTATCTTTCCAGGAGCTG +TTAGTGTCATTATTTTTGGTTACTTTGGTGGAATACTAGTAGATAAAAAAGGACCAATATTCGTGTTAAC +TATAGGAGCTATGTTGTTATCAATTAGTTTTCTATTGGCTGCACTGTTTGTTGAAACGACACCTTTTTTA +ATTACTATATTAATTATTTTTATATTTGGAGGTCTGTCCTTTACAAAAACGGTTATATCTACAATTGTTT +CAAGTAGTTTAACTACAAAAGAAAGTGGTTCAGGAATGAGTTTACTTAATTTTACAAGTTTCTTATCTGA +AGGACTAGGAATCGCAGTTGTAGGAGGATTACTGTCTGTAGACATACTAAATAAAAAAATTATTCCTATA +AATGTTTCTTCTCAATCATATTTATATAGCAATATGTTACTTATTTTTTCTATAATAATTATTTTTAGTT +GGTTAATTACTATCAAAGTGTACTCTGAGCCAAAAATAAAATAG +>otrA_X53401 +ATGAACAAGCTGAATCTGGGCATCCTGGCCCACGTTGACGCCGGCAAGACCAGCCTCACCGAGCGCCTGC +TGCACCGCACCGGTGTGATCGACGAGGTCGGCAGCGTGGACGCCGGCACCACGACGACCGACTCGATGGA +GCTGGAGCGGCAGCGCGGCATCACCATCCGGTCCGCCGTGGCCACGTTCGTCCTGGACGATCTCAAGGTC +AACCTCATCGACACCCCGGGCCACTCCGACTTCATCTCCGAGGTCGAGCGGGCGCTCGGGGTGCTCGACG 
+GCGCGGTCCTGGTGGTCTCGGCCGTCGAGGGCGTCCAGCCGCAGACCCGCATCCTGATGCGGACCCTGCG +CAGGCTGGGCATTCCCACGCTGGTCTTCGTCAACAAGATCGACCGGGGCGGCGCGCGTCCCGACGGTGTG +CTGCGGGAGATCCGCGACCGGCTCACCCCCGCCGCGGTGGCACTGTCCGCCGTGGCGGACGCCGGCACGC +CGCGGGCCCGCGCGATCGCGCTCGGCCCGGACACCGACCCGGACTTCGCCGTCCGGGTCGGTGAGCTGCT +GGCCGACCACGACGACGCGTTCCTCACCGCCTACCTGGACGAGGAACACGTACTGACCGAGAAGGAGTAC +GCGGAGGAACTGGCCGCGCAGACCGCGCGCGGTCTGGTGCACCCGGTGTACTTCGGGTCCGCGCTGACCG +GCGAGGGCCTGGACCATCTGGTGCACGGCATCCGGGAGTTGCTGCCGTCCGTGCACGCGTCGCAGGACGC +GCCGCTGCGGGCCACCGTGTTCAAGGTGGACCGTGGCGCGCGCGGCGAGGCCGTCGCGTACCTGCGGCTG +GTCTCCGGCACGCTGGGCACCCGCGATTCGGTGACGCTGCACCGCGTCGACCACACCGGCCGGGTCACCG +AGCACGCCGGACGCATCACCGCGCTGCGGGTCTTCGAGCACGGGTCGGCCACCAGCGAGACCCGGGCGAC +CGCCGGGGACATCGCGCAGGCGTGGGGCCTGAAGGACGTACGGGTCGGTGACCGGGCCGGGCACCTCGAC +GGTCCCCCGCCGCGCAACTTCTTCGCGCCGCCCAGCCTGGAGACCGTGATCAGGCCGGAGCGCCCGGAGG +AAGCGGGACGGCTGCACGCCGCGCTGCGCATGCTGGACGAGCAGGACCCCTCGATCGACCTGCGGCAGGA +CGAGGAGAACGCGGCCGGCGCGGTGGTCCGCCTCTACGGGGAGGTGCAGAAGGAGATCCTCGGCAGCACG +CTCGCGGAGTCCTTCGGCGTACGGGTGCGCTTCGACCCGACCCGTACGGTCTGCATCGAAAAGCCCGTGG +GGACCGGCGAGGCGCTGATCGAGCTGGACACGCGGACGCACAACTACTTCTGGGGCGCACCGTGGGTCTG +CGCGTCGGACCGGCCGAGCCCGGCGCGGGCGATCACGTTCCGTTTGGCGGTGGAACTGGGCTCGCTCCCC +CTGGCCTTCCACAAGGCCATCGAGGAGACGGTGCACACCACCCTGCGGCACGGTCTGTACGGCTGGCAGG +TCACCGACTGCGCCGTCACCCTGACCCGTACCGGCGTTCGCAGTCCGGTCAGCGCGGCCGACGACTTCCG +CAAGGCCAACGCGCGCTTGGTCCTGATGGACGCGCTCGGCAGGGCCGGTACGGAGGTGCACGAGCCGGTC +AGCTCCTTCGAACTGGAGGTGCCCGCCGCCCGGCTCAGCCCGGTACTTGCGAAACTCGCGGAACTGGGCG +CGACGCCCGGTGTGCCCACGGCCGAGGGGGACGTCTTCCGCCTGGAGGGCACGATGCCGACCAGCCTCGT +GCACGACTTCAACCAGCGGGTTCCCGGACTGACCCAGGGCGAGGGCGTGTTCCTGGCCGAGCACCGGGGC +TACCGGCCCGCCGTCGGACAGCCGCCCGTGCGGCCGCGGCCCGAGGGGCCCAACCCGCTCAACCGCGACG +AGTACATCCTGCACGTGCTCAAGCGCGTGTGA +>tet_M74049 +ATGCGCACCCTGAACATCGGCATTCTGGCCCACGTCGACGCGGGTAAGACCAGCCTGACCGAACGGCTCC +TGTTCGACCACGGCGCCGTCGACCGGCTCGGCAGCGTCGACGCCGGCGACACCCGTACGGTCGACGGCGG +TATCGAGCGCCGCCGCGGCATCACCATCCGCTCCGCCGTCGCCGCCTTCACCGTCGGCGACACGCGCGTC 
+AACCTGATCGACACCCCGGGACACTCCGACTTCGTCGCGGAGGTCGAGCGGGCCCTGGAAGTGCTCGACG +GGGCGGTGCTGCTGCTGTCCGCCGTCGAGGGCGTCCAGGCGCGGACCCGCGTCCTGATGCGCGCGCTGCG +GCGGCTGCGGCTGCCCACGATCGTGTTCGTCAACAAGATCGACCGGGCCGGCGCGCGCACCGACGGCCTC +CTCGGTGACGTCCGGCGCCTGCTGACGCCGCACGTCGCGCCGCTGACCGAGGTGGCGGACGCCGGTACCC +CGCGCGCCCGGGTCACCCGCCGCCCGCCGGACGGGCGGACCGCGGAGGCCCTCGCCGAGGTCGACACGGA +GGTCCTGGCCGCGCTGGTCGACGGCCCCGAGCCGACCGGGGAGGACGTGGCCCGCGCCCTCGCCGCCCGT +ACCGCCGACGGCTCGTTCCACCCGCTGTACCACGGCTCCGCGCTCGGCGGACAGGGCGTCGCGGAGCTGG +TCGAGGGCCTGCTCGGCCTGATCCCGGCCGCCACGCCGGGCACGTCCGGCGGCACGTCCGGCGGCACGGA +ACCGCGCGGCACGGTCTTCGCCGTGCGCCCCGGACCCGCCGGCGAGCGCACCGCGTACCTCAGGCTGTAC +GGCGGCGAGGTGCACCCGCGCCGGCGGCTCACCTTCCTGCGGCGCGAGTCCGACGGGCGGACCACCGAGG +TCTCCGGCCGGGTGACCCGCCTCGACGTCGTCGGCGGCGACGCCACGCTCACCGCCGGGAACATCGCCGC +GCTCACCGTTCCCGGGGGCCTGCGCGTCGGCGACCGGCTCGGCGGACCGACCGACCGTGCACCGCAGTTC +GCGCCACCGACCCTGCAGACGCTGGTCCGGGCCCGGCACCCGGAGCAGGCGGCGCCGCTGCGCTCCGCCC +TGCTGGCGCTGGCCGACCAGGACCCGCTGCTGCACGCCCGACCGGCGGCGTCCGGCGCCACCGCCCTGCT +CCTGTACGGCGAGGTCCAGATGGAGGTGCTCGCGGCGACACTGGCCGAGGACTTCGGGATCGAGGCGGAG +TTCACGCCGGGCCGCGTCCGGTTCCTGGAGCGTCCGGCGGGCACCGACGAGGCCGCGGAGGAGATGCCGT +GGCTCGACCGCACCCGGTACTTCGCGACGATCGGGCTGCGCGTCGAACCGGGTCCGCGCGGCTCCGGCGG +GGCCTTCGGGTACGAGACGGAGCTCGGCGCGCTCCCCCGGGCCTTCCACCAGGCCGTCGAGGAGACCGTC +CACGACACGCTGCGGACCGGGCTCACCGGTGCGGCGGTCACCGACTACCGGGTCACGCTGATCCGCTCCG +GCTTCAGCTCGCCGCTCAGCACGGCCGCCGACTTCCGCGGGCTGACACCGCTCGTGCTGCGCCGTGCCCT +CGCCCGCGCGGGGACCGTGCTCCACGAGCCGTACCAGGCCTTCGAGGCGGAGGTCCCGGCGGACACGCTG +GCCGCCGTGACGGCCCTGCTGGCCTCGCTGGGCGCGGACTTCACCGGAACGACGGGGGGCGACCCGGCCT +GGATCGTCACCGGCGAGCTGCCGGCCCGGCGGGTGCGGGAGGCCGAGCTGCGGCTGCCGGGGCTGACGCA +CGGGGAGGCGGTCTGGTCCTCCCGCCCTTGCGAGGACCGACCGCTGAAGGCCGGAAACTCTGGGCCTGGC +ACGGGAGTTGGCGGGCATTCGGGTGAGTAG +>tetS_M_AY534326 +ATGGAGGAAATAAAATTGAAAATTATTAATATCGGTATCTTAGCACATGTTGATGCAGGAAAAACTACTT +TGACAGAAAGCTTACTATACAGTAGCGGAGCAATTAAAGAGTTAGGAAGTGTAGATAGCGGTACAACGAA +AACGGATACTATGTTTTTGGAACGCCAGAGAGGTATTACTATTCAGACCGCAATAACATCTTTTCAACGG 
+GAAAATGTTAAAGTAAATATTGTAGATACTCCTGGACACATGGATTTTTTGGCAGATGTATACCGTTCAT +TATCTGTTTTGGATGGAGCTATTTTGCTAATCTCTGCAAAAGATGGAGTACAGTCACAAACTCGTATACT +ATTCCATGCACTTAGAAAGATGAACATACCTATAATATTTTTTATTAACAAAATTGATCAAAATGGAATA +AATTTGCCAGATGTTTATCAAGATATTAAGGACAAACTTTCTGACGACATCATAATTAAGCAGACTGTGA +ATCTAAATTTGAAACCTTATGTAATAGATTATACTGAACCAGAACAATGGGAGACAGTAATTGTGGGAAA +TGATTATTTATTAGAAAAATATACCATTGGGAAAACATTGAATATTGCAGAACTTGAAAAGGAGGAAAAC +GAAAGAATTCAAAGTTGCTCCTTATATCCTGTTTATCACGGAAGTGCAAAGAATAATATTGGAATTAAAC +AACTTATAGAGGTAATTACTAGCAAATTATTTTCACCCACACAACTCAATTCAGATAAACTTTGTGGAAA +TGTTTTTAAAGTAGAATATTCAGATGATGGTCAACGGCTTGTCTATGTACGTCTTTATAGTGGAACGCTA +CATTTGCGAGACTCAGTCAATATATCAGAAAAGGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAA +ATGGAGAATTACGCCAGATAGATAAGGCAGAGCCTGGTGAGATTATTATTTTAAAAAATGAGCTTTTAAA +ACTAAATAACGTACTTGGAGATAAAAAAAGATTACCACATAGAGAAATTCTTGAGAATCCTCTTCCTATG +TTACAAACAACAATTGAACCATGTAAATCAGTACAAAGAGAAAAGTTACTAGATGCACTTTTTGAAATAT +CCGATAGTGATCCCCTTCTACAATATTATGTAGATACAGTAACTCACGAAATTGTGCTATCTTTTTTAGG +TGAGGTCCAAATGGAGGTAACTTGTACTCTGATTCAAGAAAAATATCATATTGAGATAGAAACAAGAAAA +CCAACTGTCATTTATATGGAAAGACCATTAAAAAAATCTGAATTTACCATTGATATCGAAGTACCTCCAA +ATCCTTTCTGGGCTTCTATTGGTTTATCTGTAACACCACTTCCTTTGGGTAGTGGCATTCAGTATGAGAG +CCTGGTTTCTCTAGGTTATTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGTATACGCTATGGGTGT +GAACAAGGATTGTACGGTTGGAAATTAACAGACTGTAAGATCTGTTTTAAGTATGGTCTATATTACAGCC +CTGTCAGTACGCCAGCAGATTTCCGAATGCTTGCGCCTATTGTACTAGAGCAGGCTTTTAGAAAGAGTGG +TACAGAGTTATTAGAGCCATATCTTAGCTTCGAAATTTATGTACCACAAGAATATCTTTCGAGAGCATAT +AATGATGCTTCCAAATATTGTGCAAATATTTTAAATACTAAGTTAAAAGGTAACGAGGTCATTCTCATTG +GTGAAATCCCTGCTCGGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGT +TTGTTTAACAGAGTTAAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAAT +AGTCGGATAGATAAAGTACGATATATGTTCAATAAAATAACTTAG +>tetS_M_HM367711 +CTAAGTTATTTTATTGAACATATATCGTACTTTATCTATCCGACTATTTGGACGACGGGGCTGGCAAACA +GGTTCACCGGTAGTAACATGGTACCCTTTTAACTCTGTTAAACAAACACTACGTCCATTTGTAAAGAAAG 
+TTAAATCACTACGATATTCTTGAATACACCGATCAGGGATTTCTCCACTAAGAATGACCTCATTATTTTT +CAATTGAGTGTCTACGATGTTCGCACAATATTTAGGAGCATCGTTGTATGCTCGTGAAAGATATTCCTGT +GGCGCATAAATTTTAAAACTAAGATATGGCTCTAACAATTCTGTTCCAGCTTTTTTTAAGACTTGTTCCA +ATACAATAGGAGCAAGCATCCGAAAATCTGCTGGGGTACTAACAGGGCTATAGTATAAGCCATACTTAAA +ACAGATTTTACAGTCCGTCACATTCCAACCATACAATCCTTGTTCACAGCCATAGCGTATCCCTTCCATA +ACTGCATTTTGAAATGATTGATTTAAGTATCCAAGAGAAACCGAGCTCTCATACTGCATTCCACTTCCCA +ACGGAAGCGGTGATACAGATAAACCAATGGAAGCCCAGAAAGGATTTGGCGGCACTTCGATGTGAATGGT +ATATTCTGCATTTTTTAACGGTCTCTCCATATAAATGACTGTAGGCTCTTTTAGTTCTATCTCCACATGA +TACTTTTCTTGCAACAGTGCACTAATCACTTCCATTTGTACTTTCCCTAAGAAAGAAAGTATAATTTCAT +GTGTCGTAGAATCCACGTAATATCGTAGAAGCGGATCACTATCTGAGATTTCCAAAAGGGCATCAAGCAA +CATTTCTCTCTGTTCAGGTTTACTCGGTTCAACAGTTGTTTGTAGTAGAGGGTGCGGATTTTCAATCTTT +TTTCTCTGTGGCAATAGTTTTGTATCTCCAAGAACACTATTTAACTTCAAAAACTCATTTTGCAAAATAA +CAATTTCTCCAGAATAAGCTCTATCAATCTTACATAATTCACCATTTATTGAAGTATACATTTCTGTAAC +TTTTATTTTTTCTTTTTCTGATACTCTAACCGAATCTCGTAAATGTAGTACTCCACTATAAAGGCGTATA +TATGCAAGACGTTGTCTTTTTTTTGTATATTCAATTTTGAAAACATTTCCGCAAAGTTCAGACGGACCTC +GATGTGTTGATGAATAAAATTTATTAGTAATAACTTCTATAAGGTTATCAATCCCTATATTACTTTTTGC +ACTTCCATGATAAAGAGGGAACAGAGAACAATTCTGAAATCTTATGCTTTCCTCTTGTTCGAGTTCCAAT +GCTTCTAATGATTTACCGGACATATATTTCTCTAAAAGGTCATCGTTTCCCTCTATTACCGTATCCCATT +GTTCAGATTCGGTAAAGTTCGTCACACACATATTAGGATACAGTTCTACCTTCTGTTTGATTACAATTTC +CGCAGAAAGTTTCTCTTTAATATCCTGATAAACCGTTGATAAATCAATTCCATTTTGGTCAATCTTATTG +ATAAAAAAGATTGTGGGAATACCTATTTTCCTAAGTGCATGAAACAATATACGAGTTTGTGCTTGTACGC +CATCTTTTGCAGAAATCAGTAGAATTGCCCCATCTAAAACTGATAATGAACGATATACTTCTGCTAAGAA +ATCCATATGTCCTGGCGTGTCTATGATGTTCACCTTCGTATTTTCCCACTGAAAAGAGGTTATTCCTGTC +TGAATTGTAATTCCTCTCTGACGTTCTAAAAGCGTATTATCCGTCCTCGTTGTACCTTTGTCCACGCTTC +CTAATTCTGTAATCGCTCCGCTACTGTATAGTAAGCTTTCTGTCAAAGTAGTTTTTCCTGCATCAACATG +TGCTAAGATACCGATATTAATAATTTTCAATTTTATTTCCTCCAT +>tetM_M85225 +ATGAAAATTATTAATATTGGAGTTTTAGCTCATGTTGATGCAGGAAAAACTACCTTAACAGAAAGCTTAT 
+TATATAACAGTGGAGCGATTACAGAATTAGGAAGCGTGGACAAAGGTACAACGAGGACGGATAATACGCT +TTTAGAACGTCAGAGAGGAATTACAATTCAGACAGGAATAACCTCTTTTCAGTGGGAAAATACGAAGGTG +AACATCATAGACACGCCAGGACATATGGATTTCTTAGCAGAAGTATATCGTTCATTATCAGTTTTAGATG +GGGCAATTCTACTGATTTCTGCAAAAGATGGCGTACAAGCACAAACTCGTATATTATTTCATGCACTTAG +GAAAATGGGGATTCCCACAATCTTTTTTATCAATAAGATTGACCAAAATGGAATTGATTTATCAACGGTT +TATCAGGATATTAAAGAGAAACTTTCTGCCGAAATTGTAATCAAACAGAAGGTAGAACTGTATCCTAATG +TGTGTGTGACGAACTTTACCGAATCTGAACAATGGGATACGGTAATAGAGGGAAACGATGACCTTTTAGA +GAAATATATGTCCGGTAAATCATTAGAAGCATTGGAACTCGAACAAGAGGAAAGCATAAGATTTCAGAAT +TGTTCTCTGTTCCCTCTTTATCATGGAAGTGCAAAAAGTAATATAGGGATTGATAACCTTATAGAAGTTA +TTACTAATAAATTTTATTCATCAACACATCGAGGTCCGTCTGAACTTTGCGGAAATGTTTTCAAAATTGA +ATATACAAAAAAAAGACAACGTCTTGCATATATACGCCTTTATAGTGGAGTACTACATTTACGAGATTCG +GTTAGAGTATCAGAAAAAGAAAAAATAAAAGTTACAGAAATGTATACTTCAATAAATGGTGAATTATGTA +AGATTGATAGAGCTTATTCTGGAGAAATTGTTATTTTGCAAAATGAGTTTTTGAAGTTAAATAGTGTTCT +TGGAGATACAAAACTATTGCCACAGAGAAAAAAGATTGAAAATCCGCACCCTCTACTACAAACAACTGTT +GAACCGAGTAAACCTGAACAGAGAGAAATGTTGCTTGATGCCCTTTTGGAAATCTCAGATAGTGATCCGC +TTCTACGATATTACGTGGATTCTACGACACATGAAATTATACTTTCTTTCTTAGGGAAAGTACAAATGGA +AGTGATTAGTGCACTGTTGCAAGAAAAGTATCATGTGGAGATAGAACTAAAAGAGCCTACAGTCATTTAT +ATGGAGAGACCGTTAAAAAATGCAGAATATACCATTCACATCGAAGTGCCGCCAAATCCTTTCTGGGCTT +CCATTGGTTTATCTGTATCACCGCTTCCGTTGGGAAGTGGAATGCAGTATGAGAGCTCGGTTTCTCTTGG +ATACTTAAATCAATCATTTCAAAATGCAGTTATGGAAGGGATACGCTATGGTTGTGAACAAGGATTGTAT +GGTTGGAATGTGACGGACTGTAAAATCTGTTTTAAGTATGGCTTATACTATAGCCCTGTTAGTACCCCAG +CAGATTTTCGGATGCTTGCTCCTATTGTATTGGAACAAGTCTTAAAAAAAGCTGGAACAGAATTGTTAGA +GCCATATCTTAGTTTTAAAATTTATGCGCCACAGGAATATCTTTCACGAGCATACAACGATGCTCCTAAA +TATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTC +GGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTT +AAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAA +GTACGATATATGTTCAATAAAATAACTTAG diff --git a/modules/amr.nf b/modules/amr.nf index 176ea37..94c3ddb 100644 --- a/modules/amr.nf 
+++ b/modules/amr.nf @@ -39,27 +39,47 @@ process GET_PBP_RESISTANCE { """ } -// Run AMRsearch to infer resistance (also determinants if any) of other antimicrobials +// Create ARIBA database and return database path +process CREATE_ARIBA_DB { + label 'ariba_container' + label 'farm_low' + + input: + path(ref_genome) + path(metadata) + + output: + path ariba_database + + script: + """ + ariba prepareref -f "$ref_genome" -m "$metadata" ariba_database + """ +} + +// Run ARIBA to identify AMR process OTHER_RESISTANCE { - label 'amrsearch_container' + label 'ariba_container' label 'farm_low' tag "$sample_id" input: - tuple val(sample_id), path(assembly) + path ariba_database + tuple val(sample_id), path(read1), path(read2), path(unpaired) output: - tuple val(sample_id), path(json), emit: json + tuple val(sample_id), path(tsv), emit: tsv script: - json='result.json' + tsv='report.tsv' """ - java -jar /paarsnp/paarsnp.jar -i "$assembly" -s 1313 -o > $json + ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembly_cov 10 $ariba_database $read1 $read2 result + mv result/report.tsv "${tsv}" """ } -// Extract the results from the output file of the AMRsearch +// WIP, for extracting information from ARIBA report process GET_OTHER_RESISTANCE { label 'bash_container' label 'farm_low' @@ -67,15 +87,10 @@ process GET_OTHER_RESISTANCE { tag "$sample_id" input: - tuple val(sample_id), path(json) - - output: - tuple val(sample_id), env(CHL_RES), env(CHL_DETERMINANTS), env(CLI_RES), env(CLI_DETERMINANTS), env(ERY_RES), env(ERY_DETERMINANTS), env(FQ_RES), env(FQ_DETERMINANTS), env(KAN_RES), env(KAN_DETERMINANTS), env(LZO_RES), env(LZO_DETERMINANTS), env(TET_RES), env(TET_DETERMINANTS), env(TMP_RES), env(TMP_DETERMINANTS), env(SMX_RES), env(SMX_DETERMINANTS), env(COT_RES), env(COT_DETERMINANTS), emit: result + tuple val(sample_id), path(tsv) script: """ - JSON_FILE="$json" - - source get_other_resistance.sh + # TBC """ } diff --git a/nextflow.config b/nextflow.config index 
7687a2f..ee0459d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -90,8 +90,8 @@ process { withLabel: spn_pbp_amr_container { container = 'harryhungch/spn-pbp-amr:23.01.16' } - withLabel: amrsearch_container { - container = 'harryhungch/amrsearch:23.02.23' + withLabel: ariba_container { + container = 'staphb/ariba:2.14.4' } withLabel: mlst_container { container = 'staphb/mlst:2.23.0' diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 27bb589..4e020ce 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -7,7 +7,7 @@ include { OVERALL_QC } from "$projectDir/modules/overall_qc" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/modules/lineage" include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" -include { PBP_RESISTANCE; GET_PBP_RESISTANCE; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" +include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" // Main pipeline workflow workflow PIPELINE { @@ -26,6 +26,9 @@ workflow PIPELINE { poppunk_db = GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) poppunk_ext_clusters = GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) + // Get path to ARIBA database, create from reference and metadata + ariba_db = CREATE_ARIBA_DB("$projectDir/data/ariba_sequences.fasta", "$projectDir/data/ariba_metadata.tsv") + // Get read pairs into Channel raw_read_pairs_ch raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) @@ -139,8 +142,8 @@ workflow PIPELINE { // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials // Output into Channel GET_OTHER_RESISTANCE.out.result - OTHER_RESISTANCE(OVERALL_QC_PASSED_ASSEMBLIES_ch) - 
GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.json) + OTHER_RESISTANCE(ariba_db, OVERALL_QC_PASSED_READS_ch) + GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.tsv) // Generate results.csv by sorted sample_id based on merged Channels // READ_QC.out.result, ASSEMBLY_QC.out.result, MAPPING_QC.out.result, TAXONOMY_QC.out.result, OVERALL_QC.out.result, @@ -176,8 +179,8 @@ workflow PIPELINE { .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } - .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 20 : it } + // .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 20 : it } .map { it.collect {"\"$it\""}.join',' } .collectFile( name: 'results.csv', @@ -193,7 +196,7 @@ workflow PIPELINE { 'Serotype', 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - 'CHL_Res', 'CHL_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_Res', 'ERY_Determinant', 'FQ_Res', 'FQ_Determinant', 'KAN_Res', 'KAN_Determinant', 'LZO_Res', 'LZO_Determinant', 'TET_Res', 'TET_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant' + // 'CHL_Res', 'CHL_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_Res', 'ERY_Determinant', 'FQ_Res', 'FQ_Determinant', 'KAN_Res', 'KAN_Determinant', 'LZO_Res', 'LZO_Determinant', 'TET_Res', 'TET_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant' ].join(','), sort: { it.split(',')[0] }, newLine: true From 4510697ee539318b7c8722b0d8bb6bf24180f11c 
Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 21 Jun 2023 13:32:55 +0000 Subject: [PATCH 002/157] Include ARIBA info Former-commit-id: c1fd7c2234289bcf7dfcc760580ce9701f70ce6d --- bin/get_images_info.sh | 4 ++-- bin/get_tools_info.sh | 1 + modules/info.nf | 19 +++++++++++++++++-- workflows/info_and_version.nf | 6 ++++-- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/bin/get_images_info.sh b/bin/get_images_info.sh index c4474db..ba5428f 100755 --- a/bin/get_images_info.sh +++ b/bin/get_images_info.sh @@ -16,7 +16,7 @@ SAMTOOLS=$(grep samtools <<< $IMAGES) BCFTOOLS=$(grep bcftools <<< $IMAGES) POPPUNK=$(grep poppunk <<< $IMAGES) SPN_PBP_AMR=$(grep spn-pbp-amr <<< $IMAGES) -AMRSEARCH=$(grep amrsearch <<< $IMAGES) +ARIBA=$(grep ariba <<< $IMAGES) MLST=$(grep mlst <<< $IMAGES) KRAKEN2=$(grep kraken2 <<< $IMAGES) SEROBA=$(grep seroba <<< $IMAGES) @@ -38,7 +38,7 @@ jq -n \ --argjson bcftools "$(add_container $BCFTOOLS)" \ --argjson poppunk "$(add_container $POPPUNK)" \ --argjson spn_pbp_amr "$(add_container $SPN_PBP_AMR)" \ - --argjson amrsearch "$(add_container $AMRSEARCH)" \ + --argjson ariba "$(add_container $ARIBA)" \ --argjson mlst "$(add_container $MLST)" \ --argjson kraken2 "$(add_container $KRAKEN2)" \ --argjson seroba "$(add_container $SEROBA)" \ diff --git a/bin/get_tools_info.sh b/bin/get_tools_info.sh index c2cc99c..23d9520 100755 --- a/bin/get_tools_info.sh +++ b/bin/get_tools_info.sh @@ -18,4 +18,5 @@ jq -n \ --argjson mlst "$(add_version "$MLST_VERSION")" \ --argjson kraken2 "$(add_version "$KRAKEN2_VERSION")" \ --argjson seroba "$(add_version "$SEROBA_VERSION")" \ + --argjson ariba "$(add_version "$ARIBA_VERSION")" \ '$ARGS.named' > $JSON_FILE diff --git a/modules/info.nf b/modules/info.nf index 3f70fee..5dadd65 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -68,6 +68,7 @@ process TOOLS { val mlst_version val kraken2_version val seroba_version + val ariba_version output: path(json), 
emit: json @@ -88,6 +89,7 @@ process TOOLS { MLST_VERSION="$mlst_version" KRAKEN2_VERSION="$kraken2_version" SEROBA_VERSION="$seroba_version" + ARIBA_VERSION="$ariba_version" JSON_FILE="$json" source get_tools_info.sh @@ -223,7 +225,7 @@ process PARSE { |${toolTextRow('Het-SNP Counter', 'het_snp_count')} |${toolTextRow('PopPUNK', 'poppunk')} |${toolTextRow('CDC PBP AMR Predictor', 'spn_pbp_amr')} - |${toolTextRow('AMRsearch', 'amrsearch')} + |${toolTextRow('ARIBA', 'ariba')} |${toolTextRow('mlst', 'mlst')} |${toolTextRow('Kraken 2', 'kraken2')} |${toolTextRow('SeroBA', 'seroba')} @@ -259,7 +261,7 @@ process PARSE { |${imageTextRow('BCFtools', 'bcftools')} |${imageTextRow('PopPUNK', 'poppunk')} |${imageTextRow('CDC PBP AMR Predictor', 'spn_pbp_amr')} - |${imageTextRow('AMRsearch', 'amrsearch')} + |${imageTextRow('ARIBA', 'ariba')} |${imageTextRow('mlst', 'mlst')} |${imageTextRow('Kraken 2', 'kraken2')} |${imageTextRow('SeroBA', 'seroba')} @@ -566,3 +568,16 @@ process SEROBA_VERSION { VERSION=$(seroba version) /$ } + +process ARIBA_VERSION { + label 'ariba_container' + label 'farm_low' + + output: + env VERSION + + shell: + $/ + VERSION=$(ariba version | grep ARIBA | sed -r "s/.*:\s(.+)/\1/") + /$ +} diff --git a/workflows/info_and_version.nf b/workflows/info_and_version.nf index d6f0d50..3808a57 100644 --- a/workflows/info_and_version.nf +++ b/workflows/info_and_version.nf @@ -1,4 +1,4 @@ -include { IMAGES; DATABASES; TOOLS; COMBINE_INFO; PARSE; PRINT; SAVE; GIT_VERSION; PYTHON_VERSION; FASTP_VERSION; UNICYCLER_VERSION; SHOVILL_VERSION; QUAST_VERSION; BWA_VERSION; SAMTOOLS_VERSION; BCFTOOLS_VERSION; POPPUNK_VERSION; MLST_VERSION; KRAKEN2_VERSION; SEROBA_VERSION } from "$projectDir/modules/info" +include { IMAGES; DATABASES; TOOLS; COMBINE_INFO; PARSE; PRINT; SAVE; GIT_VERSION; PYTHON_VERSION; FASTP_VERSION; UNICYCLER_VERSION; SHOVILL_VERSION; QUAST_VERSION; BWA_VERSION; SAMTOOLS_VERSION; BCFTOOLS_VERSION; POPPUNK_VERSION; MLST_VERSION; KRAKEN2_VERSION; 
SEROBA_VERSION; ARIBA_VERSION } from "$projectDir/modules/info" // Alternative workflow that prints versions of pipeline and tools workflow PRINT_VERSION { @@ -69,6 +69,7 @@ workflow GET_VERSION { MLST_VERSION() KRAKEN2_VERSION() SEROBA_VERSION() + ARIBA_VERSION() TOOLS( GIT_VERSION.out, @@ -83,7 +84,8 @@ workflow GET_VERSION { POPPUNK_VERSION.out, MLST_VERSION.out, KRAKEN2_VERSION.out, - SEROBA_VERSION.out + SEROBA_VERSION.out, + ARIBA_VERSION.out ) COMBINE_INFO( From 648d3807705610b7ea4f5d485312abbcdad0e32c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 21 Jun 2023 15:39:40 +0000 Subject: [PATCH 003/157] Update ARIBA reference and metadata Former-commit-id: 0106a59bac2e9a2916947a0617168d44d58cbd7f --- data/ariba_metadata.tsv | 21 ++++++++----- data/ariba_sequences.fasta | 64 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/data/ariba_metadata.tsv b/data/ariba_metadata.tsv index 2a73517..9d35ca5 100644 --- a/data/ariba_metadata.tsv +++ b/data/ariba_metadata.tsv @@ -39,8 +39,8 @@ tetS_M_AY534326 1 0 . . Tetracycline resistance tetM_M85225 1 0 . . Tetracycline resistance tetS_FN555436 1 0 . . Tetracycline resistance tetM_MH283017 1 0 . . tetracycline resistance -folA_AE007317 1 1 I100L . Trimethoprim -folP_AE007317 1 1 . . Sulfamethoxazole resistance on if insertions in 56-67 amino acids +folA_AE007317 1 1 I100L . "proteinID-AAL00232.1, Trimethoprim" +folP_AE007317 1 1 . . "proteinID-AAK99071.1, Sulfamethoxazole resistance on if insertions in 56-67 amino acids" gyrA_AE007317 1 1 S81F . Fluoroquinolone gyrA_AE007317 1 1 S81Y . Fluoroquinolone gyrA_AE007317 1 1 S81C . Fluoroquinolone @@ -62,12 +62,19 @@ parE_AE007317 1 1 P454S . Fluoroquinolone tetO_Y07780 1 0 . . Tetracycline resistance ermBups_HG799494 0 0 . . Erythromycin and Clindamycin resistance ermbTr_CP002121 0 0 . . Erythromycin and Clindamycin resistance -rplD_AE007317 1 1 . . 
Linezolid resistance (deletion within the L4 region of the gene PMID:24492357) -rpoB_AE007317 1 1 D489E . rifampicin resistance PMID:10508007-D415E -rpoB_AE007317 1 1 H499N . rifampicin resistance PMID:10508007-H425N -rpoB_AE007317 1 1 D489N . rifampicin resistance PMID:10508007-H415N +rplD_AE007317 1 1 . . Linezolid resistance (deletion within the L4 region of the gene ) +rpoB_AE007317 1 1 D489E . rifampicin resistance -D415E +rpoB_AE007317 1 1 H499N . rifampicin resistance -H425N +rpoB_AE007317 1 1 D489N . rifampicin resistance -H415N vanB_KC489787 1 0 . . Vacomycin resistance vanD_EU999036 1 0 . . Vacomycin resistance vanE_FJ872411 1 0 . . Vacomycin resistance vanG_KF704242 1 0 . . Vacomycin resistance -otrA_X53401 1 0 . . Tetracycline resistance \ No newline at end of file +otrA_X53401 1 0 . . Tetracycline resistance +vanA_M97297 1 0 . . Vacomycin resistance (E.faecium) +vanC_AF162694 1 0 . . Vacomycin resistance (E.gallinarum) +23S_NZ_CP018347 0 1 A2114G . Macrolide:32347-35250 +23S_NZ_CP018347 0 1 A2115G . Macrolide:32347-35250 +23S_NZ_CP018347 0 1 A2118G . Macrolide/Streptogramin:32347-35250 +23S_NZ_CP018347 0 1 C2630A . Macrolide:32347-35250 +23S_NZ_CP018347 0 1 C2630G . 
Macrolide:32347-35250 \ No newline at end of file diff --git a/data/ariba_sequences.fasta b/data/ariba_sequences.fasta index 8da1617..4509177 100644 --- a/data/ariba_sequences.fasta +++ b/data/ariba_sequences.fasta @@ -713,3 +713,67 @@ TATTGTGCGAACATCGTAGACACTCAATTGAAAAATAATGAGGTCATTCTTAGTGGAGAAATCCCTGCTC GGTGTATTCAAGAATATCGTAGTGATTTAACTTTCTTTACAAATGGACGTAGTGTTTGTTTAACAGAGTT AAAAGGGTACCATGTTACTACCGGTGAACCTGTTTGCCAGCCCCGTCGTCCAAATAGTCGGATAGATAAA GTACGATATATGTTCAATAAAATAACTTAG +>vanA_M97297 +ATGAATAGAATAAAAGTTGCAATACTGTTTGGGGGTTGCTCAGAGGAGCATGACGTATCGGTAAAATCTGCAATAGAGATAGCCGCTAAC +ATTAATAAAGAAAAATACGAGCCGTTATACATTGGAATTACGAAATCTGGTGTATGGAAAATGTGCGAAAAACCTTGCGCGGAATGGGAA +AACGACAATTGCTATTCAGCTGTACTCTCGCCGGATAAAAAAATGCACGGATTACTTGTTAAAAAGAACCATGAATATGAAATCAACCAT +GTTGATGTAGCATTTTCAGCTTTGCATGGCAAGTCAGGTGAAGATGGATCCATACAAGGTCTGTTTGAATTGTCCGGTATCCCTTTTGTA +GGCTGCGATATTCAAAGCTCAGCAATTTGTATGGACAAATCGTTGACATACATCGTTGCGAAAAATGCTGGGATAGCTACTCCCGCCTTT +TGGGTTATTAATAAAGATGATAGGCCGGTGGCAGCTACGTTTACCTATCCTGTTTTTGTTAAGCCGGCGCGTTCAGGCTCATCCTTCGGT +GTGAAAAAAGTCAATAGCGCGGACGAATTGGACTACGCAATTGAATCGGCAAGACAATATGACAGCAAAATCTTAATTGAGCAGGCTGTT +TCGGGCTGTGAGGTCGGTTGTGCGGTATTGGGAAACAGTGCCGCGTTAGTTGTTGGCGAGGTGGACCAAATCAGGCTGCAGTACGGAATC +TTTCGTATTCATCAGGAAGTCGAGCCGGAAAAAGGCTCTGAAAACGCAGTTATAACCGTTCCCGCAGACCTTTCAGCAGAGGAGCGAGGA +CGGATACAGGAAACGGCAAAAAAAATATATAAAGCGCTCGGCTGTAGAGGTCTAGCCCGTGTGGATATGTTTTTACAAGATAACGGCCGC +ATTGTACTGAACGAAGTCAATACTCTGCCCGGTTTCACGTCATACAGTCGTTATCCCCGTATGATGGCCGCTGCAGGTATTGCACTTCCC +GAACTGATTGACCGCTTGATCGTATTAGCGTTAAAGGGGTGA +>vanC_AF162694 +ATGAAAAAAATTGCCGTTTTATTTGGAGGGAATTCTCCAGAATACTCAGTGTCACTAACCTCAGCAGCAAGTGTGATCCAAGCTATTGAC +CCGCTGAAATATGAAGTAATGACCATTGGCATCGCACCAACAATGGATTGGTATTGGTATCAAGGAAACCTCGCGAATGTTCGCAATGAT +ACTTGGCTAGAAGATCACAAAAACTGTCACCAGCTGACTTTTTCTAGCCAAGGATTTATATTAGGAGAAAAACGAATCGTCCCTGATGTC +CTCTTTCCAGTCTTGCATGGGAAGTATGGCGAGGATGGCTGTATCCAAGGACTGCTTGAACTAATGAACCTGCCTTATGTTGGTTGCCAT 
+GTCGCTGCCTCCGCATTATGTATGAACAAATGGCTCTTGCATCAACTTGCTGATACCATGGGAATCGCTAGTGCTCCCACTTTGCTTTTA +TCCCGCTATGAAAACGATCCTGCCACAATCGATCGTTTTATTCAAGACCATGGATTCCCGATCTTTATCAAGCCGAATGAAGCCGGTTCT +TCAAAAGGGATCACAAAAGTAACTGACAAAACAGCGCTCCAATCTGCATTAACGACTGCTTTTGCTTACGGTTCTACTGTGTTGATCCAA +AAGGCGATAGCGGGTATTGAAATTGGCTGCGGCATCTTAGGAAATGAGCAATTGACGATTGGTGCTTGTGATGCGATTTCTCTTGTCGAC +GGTTTTTTTGATTTTGAAGAGAAATACCAATTAATCAGCGCCACGATCACTGTCCCAGCACCATTGCCTCTCGCGCTTGAATCACAGATC +AAGGAGCAGGCACAGCTGCTTTATCGAAACTTGGGATTGACGGGTCTGGCTCGAATCGATTTTTTCGTCACCAATCAAGGAGCGATTTAT +TTAAACGAAATCAACACCATGCCGGGATTTACTGGGCACTCCCGCTACCCAGCTATGATGGCGGAAGTCGGGTTATCCTACGAAATATTA +GTAGAGCAATTGATTGCACTGGCAGAGGAGGACAAACGATGA +>23S_NZ_CP018347 +tttggataagtcctcgagctattagtattagtccgctacatgtgtcgccacacttccacttctaacctatctacctgatc +atctctcagggctcttactgatatataatcatgggaaatctcatcttgaggtgggtttcacacttagatgctttcagcgt +ttatcccttccctacatagctacccagcgatgcctttggcaagacaactggtacaccagcggtaagtccactctggtcct +ctcgtactaggagcagatcctctcaaatttcctacgcccgcgacggatagggaccgaactgtctcacgacgttctgaacc +cagctcgcgtgccgctttaatgggcgaacagcccaacccttgggaccgactacagccccaggatgcgacgagccgacatc +gaggtgccaaacctccccgtcgatgtgaactcttgggggagataagcctgttatccccagggtagcttttatccgttgag +cgatggcccttccatacggaaccaccggatcactaagcccgactttcgtccctgctcgagttgtagctctcgcagtcaag +ctcccttatacctttacactctgcgaatgatttccaaccattctgagggaacctttgggcgcctccgttaccttttagga +ggcgaccgccccagtcaaactgcccgtcagacactgtctccgatagggatcacctatctgggttagagtggccataacac +aagggtagtatcccaacagcgtctccttcgaaactggcgtcccgatctcttagactcctacctatcctgtacatgtggta +cagacactcaatatcaaactgcagtaaagctccatggggtctttccgtcctgtcgcgggtaacctgcatcttcacaggta +ctaaaatttcaccgagtctctcgttgagacagtgcccaaatcattacgcctttcgtgcgggtcggaacttacccgacaag +gaatttcgctaccttaggaccgttatagttacggccgccgtttactggggcttcaattcataccttcgcttacgctaagc +actcctcttaaccttccagcaccgggcaggcgtcaccccctatacatcatcttacgatttagcagagagctgtgtttttg +ataaacagttgcttgggcctattcactgcggctgacctaaagtcagcaccccttctcccgaagttacggggtcattttgc 
+cgagttccttaacgagagttctctcgctcacctgaggctactcgcctcgactacctgtgtcggtttgcggtacgggtaga +gtatgtttaaacgctagaagcttttcttggcagtgtgacgtcactaacttcgctactaaacttcgctccccatcacagct +caatgttatagaattaagcatttgactcaattcacacctcactgcttagacagactcttccaatcgtctgctttagttag +cctactgcgtccctccatcactacatactctagtacaggaatatcaacctgttgtccatcggatacacctttcggtctct +ccttaggtcccgactaacccagggcggacgagccttcccctggaaaccttagtcttacggtggacaggattctcacctgt +ctttcgctactcataccggcattctcacttctatgcgttccagcactcctcacggtataccttcatcacacatagaacgc +tctcctaccatacctataaaggtatccacagcttcggtaaattgttttagccccggtacattttcggcgcagggtcactc +gactagtgagctattacgcactctttgaatgaatagctgcttctaagctaacatcctagttgtctgtgcaaccccacatc +cttttccacttaacaattattttgggaccttagctggtggtctgggctgtttccctttcgactacggatcttagcactcg +cagtctgactgccgaccataattcattggcattcggagtttatctgagattggtaatccgggatggacccctcacccaaa +cagtgctctacctccaagaatctctaatgtcgacgctagccctaaagctatttcggagagaaccagctatctccaagttc +gtttggaatttctccgctacccacaagtcatccaagcacttttcaacgtgccctggttcggtcctccagtgcgtcttacc +gcaccttcaacctgctcatgggtaggtcacatggtttcgggtctacgtcatgatactaattcgccctgttcagactcggt +ttccctacggctccgtctcttcaacttaacctcgcatcataacgtaactcgccggttcattctacaaaaggcacgctctc +acccattaacgggctcgaacttgttgtaggcacacggtttcaggttctatttcactcccctcccggggtgcttttcacct +ttccctcacggtactggttcactatcggtcactagggagtatttagggttgggagatggtcctcccagattccgacggga +tttcacgtgtcccgccgtactcaggatactgctaggtacaaagactattttaaatacgaggctattactctctttggctg +atcttcccaaatcattcttctataatctttgagtccacattgcagtcctacaaccccgaagagtaaactcttcggtttgc +ccttctgccgtttcgctcgccgctactaaggcaatcgcttttgctttctcttcctgcagctacttagatgtttcagttca +ctgcgtcttcctcctcacatccttaacagatgtgggtaacaggtattacctgttgggttcccccattcggaaatccctgg +atcatcgcttacttacagctacccaaggtatatcgtcgtttgtcacgtccttcgtcggctcctagtgccaaggcatccac +cgtgcgcccttattaacttaacct From c7c22b3271dfd4ea49de74f8325aea955d6079ee Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 28 Jun 2023 14:11:58 +0000 Subject: [PATCH 004/157] Rename included ARIBA references Former-commit-id: 
ede4f8469abbed6593b5925268e33005c18a02cc --- data/{ariba_metadata.tsv => ariba_metadata-20230628.tsv} | 0 .../{ariba_sequences.fasta => ariba_ref_sequences-20230628.fasta} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename data/{ariba_metadata.tsv => ariba_metadata-20230628.tsv} (100%) rename data/{ariba_sequences.fasta => ariba_ref_sequences-20230628.fasta} (100%) diff --git a/data/ariba_metadata.tsv b/data/ariba_metadata-20230628.tsv similarity index 100% rename from data/ariba_metadata.tsv rename to data/ariba_metadata-20230628.tsv diff --git a/data/ariba_sequences.fasta b/data/ariba_ref_sequences-20230628.fasta similarity index 100% rename from data/ariba_sequences.fasta rename to data/ariba_ref_sequences-20230628.fasta From 94e3d8e7d18741aceaf8a8d6fc10f05a6607badf Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 28 Jun 2023 14:12:40 +0000 Subject: [PATCH 005/157] Allow custom ARIBA references and save database Former-commit-id: 1233fdecf005871f57fc669c9350bb7d93832141 --- modules/amr.nf | 20 +++++++++++--------- modules/validate.nf | 12 ++++++++++++ nextflow.config | 5 +++++ workflows/pipeline.nf | 7 ++++--- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/modules/amr.nf b/modules/amr.nf index 94c3ddb..b6e3e24 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -45,15 +45,17 @@ process CREATE_ARIBA_DB { label 'farm_low' input: - path(ref_genome) - path(metadata) + path ref_sequences + path metadata + path local output: - path ariba_database + path "${local}/database" script: """ - ariba prepareref -f "$ref_genome" -m "$metadata" ariba_database + rm -rf "$local/database" + ariba prepareref -f "$ref_sequences" -m "$metadata" "$local/database" """ } @@ -69,13 +71,13 @@ process OTHER_RESISTANCE { tuple val(sample_id), path(read1), path(read2), path(unpaired) output: - tuple val(sample_id), path(tsv), emit: tsv + tuple val(sample_id), path(report), path(report_debug), emit: reports script: - 
tsv='report.tsv' + report='result/report.tsv' + report_debug='result/debug.report.tsv' """ - ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembly_cov 10 $ariba_database $read1 $read2 result - mv result/report.tsv "${tsv}" + ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembler spades $ariba_database $read1 $read2 result """ } @@ -87,7 +89,7 @@ process GET_OTHER_RESISTANCE { tag "$sample_id" input: - tuple val(sample_id), path(tsv) + tuple val(sample_id), path(report), path(report_debug) script: """ diff --git a/modules/validate.nf b/modules/validate.nf index b8fb6be..4cf3438 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -26,6 +26,9 @@ validParams = [ length_low: 'int', length_high: 'int', depth: 'int_float', + ariba_ref: 'path_fasta', + ariba_metadata: 'path_tsv', + ariba_db_local: 'path', lite: 'boolean' ] @@ -141,6 +144,15 @@ void validate(Map params) { invalidValues[key] = [value, 'path to a fasta file (file does not have an filename extension of .fasta or .fa)'] } break + + case 'path_tsv': + File tsv = new File(value) + if (!tsv.exists()) { + invalidValues[key] = [value, 'path to a TSV file (file does not exist)'] + } else if (!(value ==~ /.+\.tsv$/)) { + invalidValues[key] = [value, 'path to a TSV file (file does not have an filename extension of .tsv)'] + } + break case 'url_git': if (!(value ==~ /^(https?:\/\/)?(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)\.git$/)) { diff --git a/nextflow.config b/nextflow.config index ee0459d..191a4e5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -47,6 +47,11 @@ params { length_high = 2300000 depth = 20.00 + // Default ARIBA referece sequences and metadata paths, and local directory for its generated database + ariba_ref = "$projectDir/data/ariba_ref_sequences-20230628.fasta" + ariba_metadata = "$projectDir/data/ariba_metadata-20230628.tsv" + ariba_db_local = "$projectDir/databases/ariba" + // Toggle for removing .bam and 
.sam files mid-run to reduce storage requirement // Warning: This will break the -resume function of Nextflow lite = false diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 4e020ce..46e6a6f 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -26,8 +26,8 @@ workflow PIPELINE { poppunk_db = GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) poppunk_ext_clusters = GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) - // Get path to ARIBA database, create from reference and metadata - ariba_db = CREATE_ARIBA_DB("$projectDir/data/ariba_sequences.fasta", "$projectDir/data/ariba_metadata.tsv") + // Get path to ARIBA database, generate from reference sequences and metadata if ncessary + ariba_db = CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) // Get read pairs into Channel raw_read_pairs_ch raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) @@ -143,7 +143,8 @@ workflow PIPELINE { // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials // Output into Channel GET_OTHER_RESISTANCE.out.result OTHER_RESISTANCE(ariba_db, OVERALL_QC_PASSED_READS_ch) - GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.tsv) + OTHER_RESISTANCE.out.reports.view() + GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports) // Generate results.csv by sorted sample_id based on merged Channels // READ_QC.out.result, ASSEMBLY_QC.out.result, MAPPING_QC.out.result, TAXONOMY_QC.out.result, OVERALL_QC.out.result, From e1c91e702d9d80b1c96036eb0e39d9e79194dc84 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 11:31:23 +0000 Subject: [PATCH 006/157] Implement validation of existing ARIBA database Former-commit-id: de24962f929e06658774ba006d188f612db064e1 --- bin/create_ariba_db.sh | 34 ++++++++++++++++++++++++++++++++++ modules/amr.nf | 11 
++++++++--- 2 files changed, 42 insertions(+), 3 deletions(-) create mode 100755 bin/create_ariba_db.sh diff --git a/bin/create_ariba_db.sh b/bin/create_ariba_db.sh new file mode 100755 index 0000000..03d9997 --- /dev/null +++ b/bin/create_ariba_db.sh @@ -0,0 +1,34 @@ +# Check if CREATE_ARIBA_DB has run successfully on the specific reference sequences and metadata. +# If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to done_ariba_db.json + +JSON="done_ariba_db.json" + +REF_SEQUENCES_MD5=$(md5sum $REF_SEQUENCES | awk '{ print $1 }') +METADATA_MD5=$(md5sum $METADATA | awk '{ print $1 }') + +if [ ! -f ${DB_LOCAL}/${JSON} ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES_MD5" ] || \ + [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA" ] || \ + [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA_MD5" ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/00.info.txt ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/00.version_info.txt ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.log ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.tsv ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_noncoding.log ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.all.fa ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.pickle ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.tsv ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.fa ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.varonly.fa ] || \ + [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa ] || \ + [ ! 
-f ${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa ] ; then + + rm -rf "$DB_LOCAL/$OUTPUT" + + ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "$DB_LOCAL/$OUTPUT" + + echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON} + +fi \ No newline at end of file diff --git a/modules/amr.nf b/modules/amr.nf index b6e3e24..bafff49 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -50,12 +50,17 @@ process CREATE_ARIBA_DB { path local output: - path "${local}/database" + path "${local}/${output}" script: + output='database' """ - rm -rf "$local/database" - ariba prepareref -f "$ref_sequences" -m "$metadata" "$local/database" + REF_SEQUENCES="$ref_sequences" + METADATA="$metadata" + DB_LOCAL="$local" + OUTPUT="$output" + + source create_ariba_db.sh """ } From de2e2a3f29019d82828a736466cd9040a1feefd2 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 11:53:03 +0000 Subject: [PATCH 007/157] Improve naming of BWA database module and script Former-commit-id: 93a926df184a9b607cf7b64d30eccc554bff8b9c --- ...me_bwa_db_prefix.sh => create_ref_genome_bwa_db.sh} | 2 +- modules/mapping.nf | 6 +++--- workflows/init.nf | 4 ++-- workflows/pipeline.nf | 10 +++++----- 4 files changed, 11 insertions(+), 11 deletions(-) rename bin/{get_ref_genome_bwa_db_prefix.sh => create_ref_genome_bwa_db.sh} (90%) diff --git a/bin/get_ref_genome_bwa_db_prefix.sh b/bin/create_ref_genome_bwa_db.sh similarity index 90% rename from bin/get_ref_genome_bwa_db_prefix.sh rename to bin/create_ref_genome_bwa_db.sh index 7a155a1..e9f5225 100755 --- a/bin/get_ref_genome_bwa_db_prefix.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -1,4 +1,4 @@ -# Check if GET_REF_GENOME_BWA_DB_PREFIX has run successfully on the specific reference. 
+# Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. # If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to done_bwa_db.json if [ ! -f ${DB_LOCAL}/done_bwa_db.json ] || \ diff --git a/modules/mapping.nf b/modules/mapping.nf index 46b04c8..c058954 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -1,5 +1,5 @@ -// Return database prefix with path, construct if necessary -process GET_REF_GENOME_BWA_DB_PREFIX { +// Return database path and prefix, construct if necessary +process CREATE_REF_GENOME_BWA_DB { label 'bwa_container' label 'farm_mid' @@ -17,7 +17,7 @@ process GET_REF_GENOME_BWA_DB_PREFIX { DB_LOCAL="$local" PREFIX="$prefix" - source get_ref_genome_bwa_db_prefix.sh + source create_ref_genome_bwa_db.sh """ } diff --git a/workflows/init.nf b/workflows/init.nf index 24e1deb..e92c2da 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -1,5 +1,5 @@ // Import process modules -include { GET_REF_GENOME_BWA_DB_PREFIX } from "$projectDir/modules/mapping" +include { CREATE_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" include { GET_KRAKEN_DB } from "$projectDir/modules/taxonomy" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS } from "$projectDir/modules/lineage" include { GET_SEROBA_DB; CREATE_SEROBA_DB } from "$projectDir/modules/serotype" @@ -8,7 +8,7 @@ include { GET_DOCKER_COMPOSE; PULL_IMAGES } from "$projectDir/modules/docker" // Alternative workflow for initialisation only workflow INIT { // Check Reference Genome BWA Database, generate from assembly if necessary - GET_REF_GENOME_BWA_DB_PREFIX(params.ref_genome, params.ref_genome_bwa_db_local) + CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Check Kraken2 Database, download if necessary kraken2_db = GET_KRAKEN_DB(params.kraken2_db_remote, params.kraken2_db_local) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 46e6a6f..f08f357 
100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -1,7 +1,7 @@ // Import process modules include { PREPROCESS; READ_QC } from "$projectDir/modules/preprocess" include { ASSEMBLY_UNICYCLER; ASSEMBLY_SHOVILL; ASSEMBLY_ASSESS; ASSEMBLY_QC } from "$projectDir/modules/assembly" -include { GET_REF_GENOME_BWA_DB_PREFIX; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" +include { CREATE_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" include { GET_KRAKEN_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" include { OVERALL_QC } from "$projectDir/modules/overall_qc" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/modules/lineage" @@ -12,8 +12,8 @@ include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; // Main pipeline workflow workflow PIPELINE { main: - // Get path to prefix of Reference Genome BWA Database, generate from assembly if necessary - ref_genome_bwa_db_prefix = GET_REF_GENOME_BWA_DB_PREFIX(params.ref_genome, params.ref_genome_bwa_db_local) + // Get path and prefix of Reference Genome BWA Database, generate from assembly if necessary + ref_genome_bwa_db = CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Get path to Kraken2 Database, download if necessary kraken2_db = GET_KRAKEN_DB(params.kraken2_db_remote, params.kraken2_db_local) @@ -73,7 +73,7 @@ workflow PIPELINE { // From Channel READ_QC_PASSED_READS_ch map reads to reference // Output into Channel MAPPING.out.sam - MAPPING(ref_genome_bwa_db_prefix, READ_QC_PASSED_READS_ch) + MAPPING(ref_genome_bwa_db, READ_QC_PASSED_READS_ch) // From Channel MAPPING.out.sam, Convert SAM into sorted BAM and calculate reference coverage // Output into Channels SAM_TO_SORTED_BAM.out.bam and SAM_TO_SORTED_BAM.out.ref_coverage @@ -204,7 +204,7 @@ workflow PIPELINE { ) // Pass to SAVE_INFO 
sub-workflow - DATABASES_INFO = ref_genome_bwa_db_prefix.map { it[0] } + DATABASES_INFO = ref_genome_bwa_db.map { it[0] } .merge(kraken2_db) .merge(seroba_db.map { it[0] }) .merge(poppunk_db.map { it[0] }) From 1b9851633627debc9836142e841a5da940df57de Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 12:30:39 +0000 Subject: [PATCH 008/157] Add MD5 check to BWA database Former-commit-id: 199e21324c6a793a16ad7ea4ec079df5aa18bbde --- bin/create_ref_genome_bwa_db.sh | 11 ++++++++--- bin/get_databases_info.sh | 3 ++- modules/info.nf | 23 ++++++++++++----------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/create_ref_genome_bwa_db.sh index e9f5225..aae6537 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -1,8 +1,13 @@ # Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. # If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to done_bwa_db.json -if [ ! -f ${DB_LOCAL}/done_bwa_db.json ] || \ - [ ! "$(grep 'reference' ${DB_LOCAL}/done_bwa_db.json | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE" ] || \ +JSON="done_bwa_db.json" + +REFERENCE_MD5=$(md5sum $REFERENCE | awk '{ print $1 }') + +if [ ! -f ${DB_LOCAL}/${JSON} ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE_MD5" ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.amb ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.ann ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.bwt ] || \ @@ -15,6 +20,6 @@ if [ ! 
-f ${DB_LOCAL}/done_bwa_db.json ] || \ mv ${PREFIX}.amb ${PREFIX}.ann ${PREFIX}.bwt ${PREFIX}.pac ${PREFIX}.sa -t $DB_LOCAL - echo -e "{\n \"reference\": \"$REFERENCE\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/done_bwa_db.json + echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON} fi diff --git a/bin/get_databases_info.sh b/bin/get_databases_info.sh index acc9258..45d126b 100755 --- a/bin/get_databases_info.sh +++ b/bin/get_databases_info.sh @@ -4,12 +4,13 @@ add_bwa_db () { BWA_DB_JSON=${BWA_DB_PATH}/done_bwa_db.json if [ -f "$BWA_DB_JSON" ]; then REFERENCE=$(jq -r .reference $BWA_DB_JSON) + REFERENCE_MD5=$(jq -r .reference_md5 $BWA_DB_JSON) CREATE_TIME=$(jq -r .create_time $BWA_DB_JSON) else REFERENCE="Not yet created" CREATE_TIME="Not yet created" fi - jq -n --arg ref "$REFERENCE" --arg create_time "$CREATE_TIME" '. = {"reference": $ref, "create_time": $create_time}' + jq -n --arg ref "$REFERENCE" --arg ref_md5 "$REFERENCE_MD5" --arg create_time "$CREATE_TIME" '. 
= {"reference": $ref, "reference_md5": $ref_md5, "create_time": $create_time}' } add_seroba_db () { diff --git a/modules/info.nf b/modules/info.nf index 5dadd65..dd456ed 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -162,38 +162,39 @@ process PARSE { |""".stripMargin() def dbTextRow = { leftContent, rightContent -> - textRow(9, 77, leftContent, rightContent) + textRow(13, 73, leftContent, rightContent) } dbText = """\ |┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ Databases Information ┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈┈ |╔═══════════════════════════════════════════════════════════════════════════════════════════╗ |║ BWA reference genome FM-index database ║ - |╟───────────┬───────────────────────────────────────────────────────────────────────────────╢ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Reference', json.bwa_db.reference)} + |${dbTextRow('Reference MD5', json.bwa_db.reference_md5)} |${dbTextRow('Created', json.bwa_db.create_time)} - |╠═══════════╧═══════════════════════════════════════════════════════════════════════════════╣ + |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ |║ Kraken 2 database ║ - |╟───────────┬───────────────────────────────────────────────────────────────────────────────╢ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.kraken2_db.url)} |${dbTextRow('Saved', json.kraken2_db.save_time)} - |╠═══════════╧═══════════════════════════════════════════════════════════════════════════════╣ + |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ |║ PopPUNK database ║ - |╟───────────┬───────────────────────────────────────────────────────────────────────────────╢ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.poppunnk_db.url)} 
|${dbTextRow('Saved', json.poppunnk_db.save_time)} - |╠═══════════╧═══════════════════════════════════════════════════════════════════════════════╣ + |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ |║ PopPUNK external clusters file ║ - |╟───────────┬───────────────────────────────────────────────────────────────────────────────╢ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.poppunk_ext.url)} |${dbTextRow('Saved', json.poppunk_ext.save_time)} - |╠═══════════╧═══════════════════════════════════════════════════════════════════════════════╣ + |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ |║ SeroBA database ║ - |╟───────────┬───────────────────────────────────────────────────────────────────────────────╢ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ |${dbTextRow('Source', json.seroba_db.git)} |${dbTextRow('Kmer size', json.seroba_db.kmer)} |${dbTextRow('Created', json.seroba_db.create_time)} - |╚═══════════╧═══════════════════════════════════════════════════════════════════════════════╝ + |╚═══════════════╧═══════════════════════════════════════════════════════════════════════════╝ |""".stripMargin() def getVersion = { tool -> From 7e8cfad4d5505b9ad44d58e06e46e39dba9d9762 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 13:41:38 +0000 Subject: [PATCH 009/157] Output ARIBA database info Former-commit-id: 92797594e9261dc669247a89b60bda709eac6eee --- bin/get_databases_info.sh | 30 +++++++++++++++++++++++++----- modules/amr.nf | 6 +++--- modules/info.nf | 22 ++++++++++++++++++++++ workflows/info_and_version.nf | 4 ++++ workflows/pipeline.nf | 8 +++++--- 5 files changed, 59 insertions(+), 11 deletions(-) diff --git a/bin/get_databases_info.sh b/bin/get_databases_info.sh index 45d126b..c87d56f 
100755 --- a/bin/get_databases_info.sh +++ b/bin/get_databases_info.sh @@ -1,20 +1,39 @@ # Save received databases information into a JSON file add_bwa_db () { - BWA_DB_JSON=${BWA_DB_PATH}/done_bwa_db.json + BWA_DB_JSON=${BWA_DB_PATH}/${BWA_JSON} if [ -f "$BWA_DB_JSON" ]; then REFERENCE=$(jq -r .reference $BWA_DB_JSON) REFERENCE_MD5=$(jq -r .reference_md5 $BWA_DB_JSON) CREATE_TIME=$(jq -r .create_time $BWA_DB_JSON) else REFERENCE="Not yet created" + REFERENCE_MD5="Not yet created" CREATE_TIME="Not yet created" fi jq -n --arg ref "$REFERENCE" --arg ref_md5 "$REFERENCE_MD5" --arg create_time "$CREATE_TIME" '. = {"reference": $ref, "reference_md5": $ref_md5, "create_time": $create_time}' } +add_ariba_db () { + ARIBA_DB_JSON=${ARIBA_DB_PATH}/${ARIBA_JSON} + if [ -f "$ARIBA_DB_JSON" ]; then + REFERENCE=$(jq -r .reference $ARIBA_DB_JSON) + REFERENCE_MD5=$(jq -r .reference_md5 $ARIBA_DB_JSON) + METADATA=$(jq -r .metadata $ARIBA_DB_JSON) + METADATA_MD5=$(jq -r .metadata_md5 $ARIBA_DB_JSON) + CREATE_TIME=$(jq -r .create_time $ARIBA_DB_JSON) + else + REFERENCE="Not yet created" + REFERENCE_MD5="Not yet created" + METADATA="Not yet created" + METADATA_MD5="Not yet created" + CREATE_TIME="Not yet created" + fi + jq -n --arg ref "$REFERENCE" --arg ref_md5 "$REFERENCE_MD5" --arg meta "$METADATA" --arg meta_md5 "$METADATA_MD5" --arg create_time "$CREATE_TIME" '. 
= {"reference": $ref, "reference_md5": $ref_md5, "metadata": $meta, "metadata_md5": $meta_md5, "create_time": $create_time}' +} + add_seroba_db () { - SEROBA_DB_JSON=${SEROBA_DB_PATH}/done_seroba.json + SEROBA_DB_JSON=${SEROBA_DB_PATH}/${SEROBA_JSON} if [ -f "$SEROBA_DB_JSON" ]; then GIT=$(jq -r .git $SEROBA_DB_JSON) KMER=$(jq -r .kmer $SEROBA_DB_JSON) @@ -41,8 +60,9 @@ add_url_db () { jq -n \ --argjson bwa_db "$(add_bwa_db)" \ + --argjson ariba_db "$(add_ariba_db)" \ --argjson seroba_db "$(add_seroba_db)" \ - --argjson kraken2_db "$(add_url_db "${KRAKEN2_DB_PATH}/done_kraken.json")" \ - --argjson poppunnk_db "$(add_url_db "${POPPUNK_DB_PATH}/done_poppunk.json")" \ - --argjson poppunk_ext "$(add_url_db "${POPPUNK_DB_PATH}/done_poppunk_ext.json")" \ + --argjson kraken2_db "$(add_url_db "${KRAKEN2_DB_PATH}/${KRAKEN2_JSON}")" \ + --argjson poppunnk_db "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_JSON}")" \ + --argjson poppunk_ext "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_EXT_JSON}")" \ '$ARGS.named' > $JSON_FILE diff --git a/modules/amr.nf b/modules/amr.nf index bafff49..6507c41 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -50,7 +50,7 @@ process CREATE_ARIBA_DB { path local output: - path "${local}/${output}" + tuple path(local), val(output) script: output='database' @@ -72,7 +72,7 @@ process OTHER_RESISTANCE { tag "$sample_id" input: - path ariba_database + tuple path(ariba_database), val(database) tuple val(sample_id), path(read1), path(read2), path(unpaired) output: @@ -82,7 +82,7 @@ process OTHER_RESISTANCE { report='result/report.tsv' report_debug='result/debug.report.tsv' """ - ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembler spades $ariba_database $read1 $read2 result + ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembler spades $ariba_database/$database $read1 $read2 result """ } diff --git a/modules/info.nf b/modules/info.nf index dd456ed..4cabccb 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -29,6 +29,7 @@ 
process DATABASES { input: val bwa_db_path + val ariba_db_path val kraken2_db_path val seroba_db_path val poppunk_db_path @@ -38,11 +39,24 @@ process DATABASES { script: json='databases.json' + bwa_json='done_bwa_db.json' + ariba_json='done_ariba_db.json' + seroba_json='done_seroba.json' + kraken2_json='done_kraken.json' + poppunk_json='done_poppunk.json' + poppunk_ext_json='done_poppunk_ext.json' """ BWA_DB_PATH="$bwa_db_path" + BWA_JSON="$bwa_json" + ARIBA_DB_PATH="$ariba_db_path" + ARIBA_JSON="$ariba_json" KRAKEN2_DB_PATH="$kraken2_db_path" + KRAKEN2_JSON="$kraken2_json" SEROBA_DB_PATH="$seroba_db_path" + SEROBA_JSON="$seroba_json" POPPUNK_DB_PATH="$poppunk_db_path" + POPPUNK_JSON="$poppunk_json" + POPPUNK_EXT_JSON="$poppunk_ext_json" JSON_FILE="$json" source get_databases_info.sh @@ -194,6 +208,14 @@ process PARSE { |${dbTextRow('Source', json.seroba_db.git)} |${dbTextRow('Kmer size', json.seroba_db.kmer)} |${dbTextRow('Created', json.seroba_db.create_time)} + |╠═══════════════╧═══════════════════════════════════════════════════════════════════════════╣ + |║ ARIBA database ║ + |╟───────────────┬───────────────────────────────────────────────────────────────────────────╢ + |${dbTextRow('Reference', json.ariba_db.reference)} + |${dbTextRow('Reference MD5', json.ariba_db.reference_md5)} + |${dbTextRow('Metadata', json.ariba_db.metadata)} + |${dbTextRow('Metadata MD5', json.ariba_db.metadata_md5)} + |${dbTextRow('Created', json.ariba_db.create_time)} |╚═══════════════╧═══════════════════════════════════════════════════════════════════════════╝ |""".stripMargin() diff --git a/workflows/info_and_version.nf b/workflows/info_and_version.nf index 3808a57..bb5ce37 100644 --- a/workflows/info_and_version.nf +++ b/workflows/info_and_version.nf @@ -8,6 +8,7 @@ workflow PRINT_VERSION { main: GET_VERSION( params.ref_genome_bwa_db_local, + params.ariba_db_local, params.kraken2_db_local, params.seroba_local, params.poppunk_local, @@ -26,6 +27,7 @@ workflow SAVE_INFO { main: 
GET_VERSION( databases_info.bwa_db_path, + databases_info.ariba_db_path, databases_info.kraken2_db_path, databases_info.seroba_db_path, databases_info.poppunk_db_path, @@ -39,6 +41,7 @@ workflow SAVE_INFO { workflow GET_VERSION { take: bwa_db_path + ariba_db_path kraken2_db_path seroba_db_path poppunk_db_path @@ -49,6 +52,7 @@ workflow GET_VERSION { DATABASES( bwa_db_path, + ariba_db_path, kraken2_db_path, seroba_db_path, poppunk_db_path diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index f08f357..21a0593 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -205,6 +205,7 @@ workflow PIPELINE { // Pass to SAVE_INFO sub-workflow DATABASES_INFO = ref_genome_bwa_db.map { it[0] } + .merge(ariba_db.map { it[0] }) .merge(kraken2_db) .merge(seroba_db.map { it[0] }) .merge(poppunk_db.map { it[0] }) @@ -212,9 +213,10 @@ workflow PIPELINE { .map { [ bwa_db_path: it[0], - kraken2_db_path: it[1], - seroba_db_path: it[2], - poppunk_db_path: it[3] + ariba_db_path: it[1], + kraken2_db_path: it[2], + seroba_db_path: it[3], + poppunk_db_path: it[4] ] } From fd31c316cd6a5bbebf334cbb2c778d3e04abe4de Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 15:09:59 +0000 Subject: [PATCH 010/157] Improve DATABASES_INFO maintainability Former-commit-id: 9ebff00173b08cc4f05a0b45d54477566500a8d6 --- workflows/pipeline.nf | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 21a0593..d11726d 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -143,7 +143,6 @@ workflow PIPELINE { // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials // Output into Channel GET_OTHER_RESISTANCE.out.result OTHER_RESISTANCE(ariba_db, OVERALL_QC_PASSED_READS_ch) - OTHER_RESISTANCE.out.reports.view() GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports) // Generate 
results.csv by sorted sample_id based on merged Channels @@ -204,21 +203,14 @@ workflow PIPELINE { ) // Pass to SAVE_INFO sub-workflow - DATABASES_INFO = ref_genome_bwa_db.map { it[0] } - .merge(ariba_db.map { it[0] }) - .merge(kraken2_db) - .merge(seroba_db.map { it[0] }) - .merge(poppunk_db.map { it[0] }) - .merge(poppunk_ext_clusters) - .map { - [ - bwa_db_path: it[0], - ariba_db_path: it[1], - kraken2_db_path: it[2], - seroba_db_path: it[3], - poppunk_db_path: it[4] - ] - } + DATABASES_INFO = ref_genome_bwa_db.map { [["bwa_db_path", it[0]]] } + .merge(ariba_db.map { [["ariba_db_path", it[0]]] }) + .merge(kraken2_db.map { [["kraken2_db_path", it]] }) + .merge(seroba_db.map { [["seroba_db_path", it[0]]] }) + .merge(poppunk_db.map { [["poppunk_db_path", it[0]]] }) + .merge(poppunk_ext_clusters.map { [["poppunk_ext_path", it]] }) + // Save key-value tuples into a map + .map { it.collectEntries() } emit: databases_info = DATABASES_INFO From 671fce649e5dc6a988dfc005a56bb455b64b3c64 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 15:48:05 +0000 Subject: [PATCH 011/157] Improve code maintainability Former-commit-id: f8b857a959f6517e5a7807ba17f39c04593abfa6 --- bin/{get_kraken_db.sh => get_kraken2_db.sh} | 7 ++-- bin/taxonomy_qc.sh | 4 +-- modules/amr.nf | 6 ++-- modules/lineage.nf | 8 +++-- modules/mapping.nf | 6 ++-- modules/serotype.nf | 6 ++-- modules/taxonomy.nf | 20 ++++++------ workflows/init.nf | 4 +-- workflows/pipeline.nf | 36 ++++++++++----------- 9 files changed, 53 insertions(+), 44 deletions(-) rename bin/{get_kraken_db.sh => get_kraken2_db.sh} (88%) diff --git a/bin/get_kraken_db.sh b/bin/get_kraken2_db.sh similarity index 88% rename from bin/get_kraken_db.sh rename to bin/get_kraken2_db.sh index 862e868..cb42d7c 100755 --- a/bin/get_kraken_db.sh +++ b/bin/get_kraken2_db.sh @@ -2,6 +2,7 @@ # If not: remove files in database directory, download, and unzip to database directory, also save 
metadata to done_kraken.json DB_NAME=$(basename $DB_REMOTE) +ZIPPED_DB='kraken2_db.tar.gz' if [ ! -f ${DB_LOCAL}/done_kraken.json ] || \ [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/done_kraken.json)" ] || \ @@ -11,14 +12,14 @@ if [ ! -f ${DB_LOCAL}/done_kraken.json ] || \ rm -rf ${DB_LOCAL}/{,.[!.],..?}* - wget ${DB_REMOTE} -O kraken_db.tar.gz + wget ${DB_REMOTE} -O $ZIPPED_DB # Use tmp dir and find to ensure files are saved directly at $DB_LOCAL regardless of archive directory structure mkdir tmp - tar -xzf kraken_db.tar.gz -C tmp + tar -xzf $ZIPPED_DB -C tmp find tmp -type f -exec mv {} $DB_LOCAL \; - rm -f kraken_db.tar.gz + rm -f $ZIPPED_DB jq -n \ --arg url "${DB_REMOTE}" \ diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh index 232e61b..c468b14 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/taxonomy_qc.sh @@ -1,6 +1,6 @@ -# Extract taxonomy QC information and determine QC result based on kraken_report.txt +# Extract taxonomy QC information and determine QC result based on kraken2_report.txt -PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' $KRAKEN_REPORT) +PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' $KRAKEN2_REPORT) if [ -z "$PERCENTAGE" ]; then PERCENTAGE="0.00" diff --git a/modules/amr.nf b/modules/amr.nf index 6507c41..8d3fd53 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -50,7 +50,8 @@ process CREATE_ARIBA_DB { path local output: - tuple path(local), val(output) + path local, emit: path + val output, emit: database script: output='database' @@ -72,7 +73,8 @@ process OTHER_RESISTANCE { tag "$sample_id" input: - tuple path(ariba_database), val(database) + path ariba_database + val database tuple val(sample_id), path(read1), path(read2), path(unpaired) output: diff --git a/modules/lineage.nf b/modules/lineage.nf index 646077b..40156a3 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -8,7 +8,8 @@ 
process GET_POPPUNK_DB { path local output: - tuple path(local), env(DB_NAME) + path local, emit: path + env DB_NAME, emit: database script: """ @@ -29,7 +30,7 @@ process GET_POPPUNK_EXT_CLUSTERS { path local output: - env EXT_CLUSTERS_CSV + env EXT_CLUSTERS_CSV, emit: file script: """ @@ -52,7 +53,8 @@ process LINEAGE { tag 'All samples' input: - tuple path(poppunk_dir), val(db_name) + path poppunk_dir + val db_name val ext_clusters_file path qfile diff --git a/modules/mapping.nf b/modules/mapping.nf index c058954..c640544 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -8,7 +8,8 @@ process CREATE_REF_GENOME_BWA_DB { path local output: - tuple path(local), val(prefix) + path(local), emit: path + val(prefix), emit: prefix script: prefix='reference' @@ -30,7 +31,8 @@ process MAPPING { tag "$sample_id" input: - tuple path(bwa_ref_db_dir), val(prefix) + path bwa_ref_db_dir + val prefix tuple val(sample_id), path(read1), path(read2), path(unpaired) output: diff --git a/modules/serotype.nf b/modules/serotype.nf index 2e3ff3d..764fceb 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -33,7 +33,8 @@ process CREATE_SEROBA_DB { val kmer output: - tuple path(local), val(database) + path local, emit: path + val database, emit: database script: database='database' @@ -56,7 +57,8 @@ process SEROTYPE { tag "$sample_id" input: - tuple path(seroba_dir), val(database) + path seroba_dir + val database tuple val(sample_id), path(read1), path(read2), path(unpaired) output: diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index aefa4ad..c76771c 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -1,5 +1,5 @@ // Return Kraken 2 database path, download if necessary -process GET_KRAKEN_DB { +process GET_KRAKEN2_DB { label 'bash_container' label 'farm_low' @@ -8,14 +8,14 @@ process GET_KRAKEN_DB { path local output: - path local + path local, emit: path script: """ DB_REMOTE="$remote" DB_LOCAL="$local" - source get_kraken_db.sh + source 
get_kraken2_db.sh """ } @@ -27,7 +27,7 @@ process TAXONOMY { tag "$sample_id" input: - path kraken_db + path kraken2_db val kraken2_memory_mapping tuple val(sample_id), path(read1), path(read2), path(unpaired) @@ -35,21 +35,21 @@ process TAXONOMY { tuple val(sample_id), path(report), emit: report script: - report='kraken_report.txt' + report='kraken2_report.txt' if (kraken2_memory_mapping === true) """ - kraken2 --threads `nproc` --use-names --memory-mapping --db "$kraken_db" --paired "$read1" "$read2" --report "$report" --output - + kraken2 --threads `nproc` --use-names --memory-mapping --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - """ else if (kraken2_memory_mapping === false) """ - kraken2 --threads `nproc` --use-names --db "$kraken_db" --paired "$read1" "$read2" --report "$report" --output - + kraken2 --threads `nproc` --use-names --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - """ else error "The value for --kraken2_memory_mapping is not valid." 
} -// Extract taxonomy QC information and determine QC result based on kraken_report.txt +// Extract taxonomy QC information and determine QC result based on kraken2_report.txt process TAXONOMY_QC { label 'bash_container' label 'farm_low' @@ -57,7 +57,7 @@ process TAXONOMY_QC { tag "$sample_id" input: - tuple val(sample_id), path(kraken_report) + tuple val(sample_id), path(kraken2_report) val(qc_spneumo_percentage) output: @@ -66,7 +66,7 @@ process TAXONOMY_QC { script: """ - KRAKEN_REPORT="$kraken_report" + KRAKEN2_REPORT="$kraken2_report" QC_SPNEUMO_PERCENTAGE="$qc_spneumo_percentage" source taxonomy_qc.sh diff --git a/workflows/init.nf b/workflows/init.nf index e92c2da..0650922 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -1,6 +1,6 @@ // Import process modules include { CREATE_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" -include { GET_KRAKEN_DB } from "$projectDir/modules/taxonomy" +include { GET_KRAKEN2_DB } from "$projectDir/modules/taxonomy" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS } from "$projectDir/modules/lineage" include { GET_SEROBA_DB; CREATE_SEROBA_DB } from "$projectDir/modules/serotype" include { GET_DOCKER_COMPOSE; PULL_IMAGES } from "$projectDir/modules/docker" @@ -11,7 +11,7 @@ workflow INIT { CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Check Kraken2 Database, download if necessary - kraken2_db = GET_KRAKEN_DB(params.kraken2_db_remote, params.kraken2_db_local) + GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Check SeroBA Databases, clone and rebuild if necessary GET_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index d11726d..1aea8d7 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -2,7 +2,7 @@ include { PREPROCESS; READ_QC } from "$projectDir/modules/preprocess" include { ASSEMBLY_UNICYCLER; ASSEMBLY_SHOVILL; ASSEMBLY_ASSESS; ASSEMBLY_QC } from 
"$projectDir/modules/assembly" include { CREATE_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" -include { GET_KRAKEN_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" +include { GET_KRAKEN2_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" include { OVERALL_QC } from "$projectDir/modules/overall_qc" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/modules/lineage" include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" @@ -13,21 +13,21 @@ include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; workflow PIPELINE { main: // Get path and prefix of Reference Genome BWA Database, generate from assembly if necessary - ref_genome_bwa_db = CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Get path to Kraken2 Database, download if necessary - kraken2_db = GET_KRAKEN_DB(params.kraken2_db_remote, params.kraken2_db_local) + GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Get path to SeroBA Databases, clone and rebuild if necessary GET_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - seroba_db = CREATE_SEROBA_DB(params.seroba_remote, params.seroba_local, GET_SEROBA_DB.out.create_db, params.seroba_kmer) + CREATE_SEROBA_DB(params.seroba_remote, params.seroba_local, GET_SEROBA_DB.out.create_db, params.seroba_kmer) // Get paths to PopPUNK Database and External Clusters, download if necessary - poppunk_db = GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) - poppunk_ext_clusters = GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) + GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) + GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) // Get path to ARIBA database, generate 
from reference sequences and metadata if ncessary - ariba_db = CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) + CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) // Get read pairs into Channel raw_read_pairs_ch raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) @@ -73,7 +73,7 @@ workflow PIPELINE { // From Channel READ_QC_PASSED_READS_ch map reads to reference // Output into Channel MAPPING.out.sam - MAPPING(ref_genome_bwa_db, READ_QC_PASSED_READS_ch) + MAPPING(CREATE_REF_GENOME_BWA_DB.out.path, CREATE_REF_GENOME_BWA_DB.out.prefix, READ_QC_PASSED_READS_ch) // From Channel MAPPING.out.sam, Convert SAM into sorted BAM and calculate reference coverage // Output into Channels SAM_TO_SORTED_BAM.out.bam and SAM_TO_SORTED_BAM.out.ref_coverage @@ -94,7 +94,7 @@ workflow PIPELINE { // From Channel READ_QC_PASSED_READS_ch assess Streptococcus pneumoniae percentage in reads // Output into Channels TAXONOMY.out.detailed_result & TAXONOMY.out.result report - TAXONOMY(kraken2_db, params.kraken2_memory_mapping, READ_QC_PASSED_READS_ch) + TAXONOMY(GET_KRAKEN2_DB.out.path, params.kraken2_memory_mapping, READ_QC_PASSED_READS_ch) // From Channel TAXONOMY.out.report, provide taxonomy QC status // Output into Channels TAXONOMY_QC.out.detailed_result & TAXONOMY_QC.out.result report @@ -125,11 +125,11 @@ workflow PIPELINE { .collectFile(name: 'qfile.txt', newLine: true) // From generated POPPUNK_QFILE, assign GPSC to samples passed overall QC - LINEAGE(poppunk_db, poppunk_ext_clusters, POPPUNK_QFILE) + LINEAGE(GET_POPPUNK_DB.out.path, GET_POPPUNK_DB.out.database, GET_POPPUNK_EXT_CLUSTERS.out.file, POPPUNK_QFILE) // From Channel OVERALL_QC_PASSED_READS_ch, serotype the preprocess reads of samples passed overall QC // Output into Channel SEROTYPE.out.result - SEROTYPE(seroba_db, OVERALL_QC_PASSED_READS_ch) + SEROTYPE(CREATE_SEROBA_DB.out.path, 
CREATE_SEROBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, PubMLST typing the assemblies of samples passed overall QC // Output into Channel MLST.out.result @@ -142,7 +142,7 @@ workflow PIPELINE { // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials // Output into Channel GET_OTHER_RESISTANCE.out.result - OTHER_RESISTANCE(ariba_db, OVERALL_QC_PASSED_READS_ch) + OTHER_RESISTANCE(CREATE_ARIBA_DB.out.path, CREATE_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports) // Generate results.csv by sorted sample_id based on merged Channels @@ -203,12 +203,12 @@ workflow PIPELINE { ) // Pass to SAVE_INFO sub-workflow - DATABASES_INFO = ref_genome_bwa_db.map { [["bwa_db_path", it[0]]] } - .merge(ariba_db.map { [["ariba_db_path", it[0]]] }) - .merge(kraken2_db.map { [["kraken2_db_path", it]] }) - .merge(seroba_db.map { [["seroba_db_path", it[0]]] }) - .merge(poppunk_db.map { [["poppunk_db_path", it[0]]] }) - .merge(poppunk_ext_clusters.map { [["poppunk_ext_path", it]] }) + DATABASES_INFO = CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } + .merge(CREATE_ARIBA_DB.out.path.map { [["ariba_db_path", it]] }) + .merge(GET_KRAKEN2_DB.out.path.map { [["kraken2_db_path", it]] }) + .merge(CREATE_SEROBA_DB.out.path.map { [["seroba_db_path", it]] }) + .merge(GET_POPPUNK_DB.out.path.map { [["poppunk_db_path", it]] }) + .merge(GET_POPPUNK_EXT_CLUSTERS.out.file.map { [["poppunk_ext_file", it]] }) // Save key-value tuples into a map .map { it.collectEntries() } From 81c0fc3072d9ff04557a89e8f3f960d553484732 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 16:32:17 +0000 Subject: [PATCH 012/157] Include ARIBA database generation Former-commit-id: d6ab8c63ce7367dd6c9af1b7ee4bbb5ae7484f67 --- workflows/init.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/workflows/init.nf b/workflows/init.nf index 0650922..64a748f 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -4,12 +4,16 @@ include { GET_KRAKEN2_DB } from "$projectDir/modules/taxonomy" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS } from "$projectDir/modules/lineage" include { GET_SEROBA_DB; CREATE_SEROBA_DB } from "$projectDir/modules/serotype" include { GET_DOCKER_COMPOSE; PULL_IMAGES } from "$projectDir/modules/docker" +include { CREATE_ARIBA_DB } from "$projectDir/modules/amr" // Alternative workflow for initialisation only workflow INIT { // Check Reference Genome BWA Database, generate from assembly if necessary CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + // Check ARIBA database, generate from reference sequences and metadata if ncessary + CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) + // Check Kraken2 Database, download if necessary GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) From 9e5eb5b9ca7f37241ebb5b2674c589f857c02d26 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 30 Jun 2023 16:40:03 +0000 Subject: [PATCH 013/157] Improve code maintainability Former-commit-id: 4140943a45e80f5e2fc3235c8db3ca122aec8926 --- bin/create_ariba_db.sh | 16 +++++++--------- bin/create_ref_genome_bwa_db.sh | 12 +++++------- bin/create_seroba_db.sh | 4 ++-- bin/get_kraken2_db.sh | 8 ++++---- bin/get_poppunk_db.sh | 10 +++++----- bin/get_poppunk_ext_clusters.sh | 10 +++++----- bin/get_seroba_db.sh | 8 ++++---- modules/amr.nf | 2 ++ modules/lineage.nf | 4 ++++ modules/mapping.nf | 2 ++ modules/serotype.nf | 4 ++++ modules/taxonomy.nf | 2 ++ 12 files changed, 46 insertions(+), 36 deletions(-) diff --git a/bin/create_ariba_db.sh b/bin/create_ariba_db.sh index 03d9997..073028e 100755 --- a/bin/create_ariba_db.sh +++ b/bin/create_ariba_db.sh @@ -1,16 +1,14 @@ # Check if CREATE_ARIBA_DB has run successfully on the specific 
reference sequences and metadata. -# If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to done_ariba_db.json - -JSON="done_ariba_db.json" +# If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to JSON REF_SEQUENCES_MD5=$(md5sum $REF_SEQUENCES | awk '{ print $1 }') METADATA_MD5=$(md5sum $METADATA | awk '{ print $1 }') -if [ ! -f ${DB_LOCAL}/${JSON} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES_MD5" ] || \ - [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA" ] || \ - [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA_MD5" ] || \ +if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES_MD5" ] || \ + [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA" ] || \ + [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA_MD5" ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/00.info.txt ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/00.version_info.txt ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log ] || \ @@ -29,6 +27,6 @@ if [ ! 
-f ${DB_LOCAL}/${JSON} ] || \ ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "$DB_LOCAL/$OUTPUT" - echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON} + echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} fi \ No newline at end of file diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/create_ref_genome_bwa_db.sh index aae6537..6cee335 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -1,13 +1,11 @@ # Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. -# If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to done_bwa_db.json - -JSON="done_bwa_db.json" +# If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to JSON REFERENCE_MD5=$(md5sum $REFERENCE | awk '{ print $1 }') -if [ ! -f ${DB_LOCAL}/${JSON} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE_MD5" ] || \ +if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE_MD5" ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.amb ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.ann ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.bwt ] || \ @@ -20,6 +18,6 @@ if [ ! 
-f ${DB_LOCAL}/${JSON} ] || \ mv ${PREFIX}.amb ${PREFIX}.ann ${PREFIX}.bwt ${PREFIX}.pac ${PREFIX}.sa -t $DB_LOCAL - echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON} + echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} fi diff --git a/bin/create_seroba_db.sh b/bin/create_seroba_db.sh index 44b3be4..21a058f 100755 --- a/bin/create_seroba_db.sh +++ b/bin/create_seroba_db.sh @@ -1,9 +1,9 @@ -# If create_db is true: re-create KMC and ARIBA databases, also save metadata to done_seroba.json +# If create_db is true: re-create KMC and ARIBA databases, also save metadata to JSON if [ $CREATE_DB = true ]; then seroba createDBs ${DB_LOCAL}/${DATABASE}/ ${KMER} - echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/done_seroba.json + echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} fi diff --git a/bin/get_kraken2_db.sh b/bin/get_kraken2_db.sh index cb42d7c..c53cc52 100755 --- a/bin/get_kraken2_db.sh +++ b/bin/get_kraken2_db.sh @@ -1,11 +1,11 @@ # Check if all file exists and were obtained from the database at the specific link. -# If not: remove files in database directory, download, and unzip to database directory, also save metadata to done_kraken.json +# If not: remove files in database directory, download, and unzip to database directory, also save metadata to JSON DB_NAME=$(basename $DB_REMOTE) ZIPPED_DB='kraken2_db.tar.gz' -if [ ! -f ${DB_LOCAL}/done_kraken.json ] || \ - [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/done_kraken.json)" ] || \ +if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ + [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/${JSON_FILE})" ] || \ [ ! 
-f ${DB_LOCAL}/hash.k2d ] || \ [ ! -f ${DB_LOCAL}/opts.k2d ] || \ [ ! -f ${DB_LOCAL}/taxo.k2d ]; then @@ -24,6 +24,6 @@ if [ ! -f ${DB_LOCAL}/done_kraken.json ] || \ jq -n \ --arg url "${DB_REMOTE}" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/done_kraken.json + '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/${JSON_FILE} fi diff --git a/bin/get_poppunk_db.sh b/bin/get_poppunk_db.sh index 33420b5..d4e705a 100755 --- a/bin/get_poppunk_db.sh +++ b/bin/get_poppunk_db.sh @@ -1,13 +1,13 @@ # Return PopPUNK database name # Check if all files exist and were obtained from the database at the specific link. -# If not: remove all sub-directories, download, and unzip to database directory, also save metadata to done_poppunk.json +# If not: remove all sub-directories, download, and unzip to database directory, also save metadata to JSON DB_NAME=$(basename "$DB_REMOTE" .tar.gz) DB_PATH=${DB_LOCAL}/${DB_NAME} -if [ ! -f ${DB_LOCAL}/done_poppunk.json ] || \ - [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/done_poppunk.json)" ] || \ +if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ + [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/${JSON_FILE})" ] || \ [ ! -f ${DB_PATH}/${DB_NAME}.h5 ] || \ [ ! -f ${DB_PATH}/${DB_NAME}.dists.npy ] || \ [ ! -f ${DB_PATH}/${DB_NAME}.dists.pkl ] || \ @@ -17,7 +17,7 @@ if [ ! -f ${DB_LOCAL}/done_poppunk.json ] || \ [ ! -f ${DB_PATH}/${DB_NAME}_clusters.csv ] || \ [ ! -f ${DB_PATH}/${DB_NAME}.refs ]; then - rm -rf ${DB_LOCAL}/done_poppunk.json + rm -rf ${DB_LOCAL}/${JSON_FILE} rm -rf ${DB_LOCAL}/*/ wget $DB_REMOTE -O poppunk_db.tar.gz @@ -27,6 +27,6 @@ if [ ! 
-f ${DB_LOCAL}/done_poppunk.json ] || \ jq -n \ --arg url "$DB_REMOTE" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/done_poppunk.json + '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/${JSON_FILE} fi diff --git a/bin/get_poppunk_ext_clusters.sh b/bin/get_poppunk_ext_clusters.sh index c971567..e330968 100755 --- a/bin/get_poppunk_ext_clusters.sh +++ b/bin/get_poppunk_ext_clusters.sh @@ -1,23 +1,23 @@ # Return PopPUNK External Clusters file name # Check if specific external clusters file exists and was obtained from the specific link. -# If not: remove all csv files, and download to database directory, also save metadata to done_poppunk_ext.json +# If not: remove all csv files, and download to database directory, also save metadata to JSON EXT_CLUSTERS_CSV=$(basename "$EXT_CLUSTERS_REMOTE") EXT_CLUSTERS_NAME=$(basename "$EXT_CLUSTERS_REMOTE" .csv) -if [ ! -f ${EXT_CLUSTERS_LOCAL}/done_poppunk_ext.json ] || \ - [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url ${EXT_CLUSTERS_LOCAL}/done_poppunk_ext.json)" ] || \ +if [ ! -f ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} ] || \ + [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url ${EXT_CLUSTERS_LOCAL}/${JSON_FILE})" ] || \ [ ! 
-f ${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV} ]; then rm -f ${EXT_CLUSTERS_LOCAL}/*.csv - rm -f ${EXT_CLUSTERS_LOCAL}/done_${EXT_CLUSTERS_NAME}.json + rm -f ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} wget $EXT_CLUSTERS_REMOTE -O ${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV} jq -n \ --arg url "$EXT_CLUSTERS_REMOTE" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${EXT_CLUSTERS_LOCAL}/done_poppunk_ext.json + '{"url" : $url, "save_time": $save_time}' > ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} fi diff --git a/bin/get_seroba_db.sh b/bin/get_seroba_db.sh index 1b2235e..736a99b 100755 --- a/bin/get_seroba_db.sh +++ b/bin/get_seroba_db.sh @@ -3,11 +3,11 @@ # Check if GET_SEROBA_DB and CREATE_SEROBA_DB has run successfully on the database at the specific link, CREATE_SEROBA_DB used the specific Kmerm and pull to check if SeroBA database is up-to-date. # If outdated or does not exist: remove files in database directory and clone, set CREATE_DB to true -# Assume up-to-date if done_seroba.json passes checks and the host cannot be resolved to allow offline usage +# Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage -if [ ! -f ${DB_LOCAL}/done_seroba.json ] || \ - [ ! "$(grep 'git' ${DB_LOCAL}/done_seroba.json | sed -r 's/.+: "(.*)",/\1/')" == "${DB_REMOTE}" ] || \ - [ ! "$(grep 'kmer' ${DB_LOCAL}/done_seroba.json | sed -r 's/.+: "(.*)",/\1/')" == "${KMER}" ] || \ +if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ + [ ! "$(grep 'git' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "${DB_REMOTE}" ] || \ + [ ! 
"$(grep 'kmer' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "${KMER}" ] || \ !((git -C ${DB_LOCAL} pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date'); then rm -rf ${DB_LOCAL}/{,.[!.],..?}* diff --git a/modules/amr.nf b/modules/amr.nf index 8d3fd53..feaa3ad 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -55,11 +55,13 @@ process CREATE_ARIBA_DB { script: output='database' + json='done_ariba_db.json' """ REF_SEQUENCES="$ref_sequences" METADATA="$metadata" DB_LOCAL="$local" OUTPUT="$output" + JSON_FILE="$json" source create_ariba_db.sh """ diff --git a/modules/lineage.nf b/modules/lineage.nf index 40156a3..6e13fab 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -12,9 +12,11 @@ process GET_POPPUNK_DB { env DB_NAME, emit: database script: + json='done_poppunk.json' """ DB_REMOTE="$db_remote" DB_LOCAL="$local" + JSON_FILE="$json" source get_poppunk_db.sh """ @@ -33,9 +35,11 @@ process GET_POPPUNK_EXT_CLUSTERS { env EXT_CLUSTERS_CSV, emit: file script: + json='done_poppunk_ext.json' """ EXT_CLUSTERS_REMOTE="$ext_clusters_remote" EXT_CLUSTERS_LOCAL="$local" + JSON_FILE="$json" source get_poppunk_ext_clusters.sh """ diff --git a/modules/mapping.nf b/modules/mapping.nf index c640544..f0d1e0e 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -13,10 +13,12 @@ process CREATE_REF_GENOME_BWA_DB { script: prefix='reference' + json='done_bwa_db.json' """ REFERENCE="$reference" DB_LOCAL="$local" PREFIX="$prefix" + JSON_FILE="$json" source create_ref_genome_bwa_db.sh """ diff --git a/modules/serotype.nf b/modules/serotype.nf index 764fceb..5c268fc 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -12,10 +12,12 @@ process GET_SEROBA_DB { env CREATE_DB, emit: create_db script: + json='done_seroba.json' """ DB_REMOTE="$remote" DB_LOCAL="$local" KMER="$kmer" + JSON_FILE="$json" source get_seroba_db.sh """ @@ -38,12 +40,14 @@ process CREATE_SEROBA_DB { script: database='database' + json='done_seroba.json' """ 
DATABASE="$database" DB_REMOTE="$remote" DB_LOCAL="$local" KMER="$kmer" CREATE_DB="$create_db" + JSON_FILE="$json" source create_seroba_db.sh """ diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index c76771c..af6266d 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -11,9 +11,11 @@ process GET_KRAKEN2_DB { path local, emit: path script: + json='done_kraken.json' """ DB_REMOTE="$remote" DB_LOCAL="$local" + JSON_FILE="$json" source get_kraken2_db.sh """ From ee1a725837386591f41ea30f5968af33ac95f2ed Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 4 Jul 2023 16:16:26 +0000 Subject: [PATCH 014/157] Update ARIBA database Former-commit-id: 3bbde9b138528817562ba23699797da2cd3a5972 --- ...230628.tsv => ariba_metadata-20230629.tsv} | 4 +- ...sta => ariba_ref_sequences-20230629.fasta} | 59 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) rename data/{ariba_metadata-20230628.tsv => ariba_metadata-20230629.tsv} (94%) rename data/{ariba_ref_sequences-20230628.fasta => ariba_ref_sequences-20230629.fasta} (95%) diff --git a/data/ariba_metadata-20230628.tsv b/data/ariba_metadata-20230629.tsv similarity index 94% rename from data/ariba_metadata-20230628.tsv rename to data/ariba_metadata-20230629.tsv index 9d35ca5..89ce2e8 100644 --- a/data/ariba_metadata-20230628.tsv +++ b/data/ariba_metadata-20230629.tsv @@ -77,4 +77,6 @@ vanC_AF162694 1 0 . . Vacomycin resistance (E.gallinarum) 23S_NZ_CP018347 0 1 A2115G . Macrolide:32347-35250 23S_NZ_CP018347 0 1 A2118G . Macrolide/Streptogramin:32347-35250 23S_NZ_CP018347 0 1 C2630A . Macrolide:32347-35250 -23S_NZ_CP018347 0 1 C2630G . Macrolide:32347-35250 \ No newline at end of file +23S_NZ_CP018347 0 1 C2630G . Macrolide:32347-35250 +rrgA_EF560637 1 0 . . Pili-1-(RlrA pilus-1-2279-4939) +pitB_GU256423 1 0 . . 
Pili-2-(pitB pilus-3504-4736) diff --git a/data/ariba_ref_sequences-20230628.fasta b/data/ariba_ref_sequences-20230629.fasta similarity index 95% rename from data/ariba_ref_sequences-20230628.fasta rename to data/ariba_ref_sequences-20230629.fasta index 4509177..5dffbd5 100644 --- a/data/ariba_ref_sequences-20230628.fasta +++ b/data/ariba_ref_sequences-20230629.fasta @@ -777,3 +777,62 @@ ccttctgccgtttcgctcgccgctactaaggcaatcgcttttgctttctcttcctgcagctacttagatgtttcagttca ctgcgtcttcctcctcacatccttaacagatgtgggtaacaggtattacctgttgggttcccccattcggaaatccctgg atcatcgcttacttacagctacccaaggtatatcgtcgtttgtcacgtccttcgtcggctcctagtgccaaggcatccac cgtgcgcccttattaacttaacct +>rrgA_EF560637 +ATGAAAAAAGTAAGAAAGATATTTCAGAAGGCAGTTGCAGGACTGTGCTGTATATCTCAGTTGACAGCTT +TTTCTTCGATAGTTGCTTTAGCAGAAACGCCTGAAACCAGTCCAGCGATAGGAAAAGTAGTGATTAAGGA +GACAGGCGAAGGAGGAGCGCTTCTAGGAGATGCCGTCTTTGAGTTGAAAAACAATACGAATGGCACAACT +GTTTCGCAAAGGACAGAGGCGCAAACAGGAGAAGCGATATTTTCAAACATAAAACCTGGGACATACACCT +TGACAGAAGCCCAACCTCCAGTTGGTTATAAACCCTCTACTAAACAACGGACTGTTGAAGTTGAGAAGAA +TGGTCGGACGACTGTCCAAGGTGAACAGGTAGAAAATCGAGAAGAGGCTCTATCTGACCAGTATCCACAA +ACAGGGACTTATCCAGATGTTCAAACACCTTATCAGATTATTAAGGTAGATGGTTCGGAAAAAAACGGAC +AGCACAAGGCGTTGAATCCGAATCCATATGAACGTGTGATTCCAGAAGGTACACTTTCAAAGAGAATTTA +TCAAGTGAATAATTTGGATGATAACCAATATGGAATCGAATTGACGGTTAGTGGGAAAACAGTGTATGAA +CGAAAAGATAAGTCTGTGCCGCTGGATGTCGTTATCTTGCTCGATAACTCAAATAGTATGAGTAACATTC +GAAACAAGAATGCTCGACGTGCGGAAAGAGCTGGTGAGGCGACACGTTCTCTTATTGATAAAATTACATC +TGATCCAGAAAATAGGGTAGCGCTTGTGACTTATGCTTCCACTATCTTTGATGGGACCGAGTTTACAGTA +GAAAAAGGGGTAGCAGATAAAAACGGAAAACGATTGAATGATTCTCTTTTTTGGAATTATGATCAGACGA +GTTTTACAACCAATACCAAAGATTATAGTTATTTAAAGCTGACTAATGATAAGAATGACATTGTAGAATT +AAAAAATAAGGTACCTACCGAGGCAGAAGACCATGATGGAAATAGATTGATGTACCAATTCGGTGCCACT +TTTACTCAGAAAGCTTTGATGAAGGCCGATGAGATTTTGACACAACAAGCGAGACAAAATAGTCAAAAAG +TCATTTTCCATATTACGGATGGTGTCCCAACTATGTCGTATCCGATTAATTTTAATCATGCTACGTTTGC +TCCATCATATCAAAATCAACTAAATGTATTTTTTAGTAAATCTCCTAATAAAGATGGAATACTATTAAGT 
+GATTTTATTACGCAAGCAACTAGTGGAGAACATACAATTGTACGCGGAGATGGGCAAAGTTACCAGATGT +TTACAGATAAGACAGTTTATGAAAAAGGTGCTCCTGCAGCTTTCCCAGTTAAACCTGAAAAATATTCTGA +AATGAAGGCGGTTGGTTATGCAGTTATAGGCGATCCAATTAATGGTGGATATATTTGGCTTAATTGGAGA +GAGAGTATTCTGGCTTATCCGTTTAATTCTAATACTGCTAAAATTACCAATCATGGTGACCCTACAAGAT +GGTACTATAACGGGAATATTGCTCCTGATGGGTATGATGTCTTTACGGTAGGTATTGGTATTAACGGAGA +TCCTGGTACGGATGAAGCAACGGCTACTAGTTTTATGCAAAGTATTTCTAGTAAACCTGAAAACTATACC +AATGTTACTGACACGACAAAAATATTGGAACAGTTGAATCGTTATTTCCACACCATCGTAACTGAAAAGA +AATCAATTGAGAATGGTACGATTACAGATCCGATGGGTGAGTTAATTGATTTGCAATTGGGCACAGATGG +AAGATTTGATCCAGCAGATTACACTTTAACTGCAAACGATGGTAGTCGCTTGGAGAATGGACAAGCTGTA +GGTGGTCCACAAAATGATGGTGGCTTGCTAAAAAATGCAAAAGTGTTCTATGATACGACTGAGAAAAGGA +TTCGTGTAACAGGTTTGTACCTTGGAACGGGTGAAAAAGTTACATTGACTTATAATGTTCGCTTGAATGA +CCAATTTGTAAGCAATAAATTCTATGACACGAATGGTCGAACAACCCTACACCCTAAGGAAGTAGAAAAG +AACACAGTGCGCGACTTCCCGATTCCTAAGATTCGTGATGTGCGAAAATATCCAGCAATTACGATTGCAA +AAGAGAAAAAACTTGGTGAAATTGAGTTTATTAAGATCAATAAGAATGATAAAAAACCACTGAGAGATGC +GGTCTTTAGTCTTCAAAAACAACATCCGGATTATCCAGATATTTATGGAGCTATTGATCAAAATGGCACT +TATCAAAATGTGAGAACAGGTGAAGATGGTAAGTTGACCTTTAAAAATCTGTCAGATGGGAAATATCGAT +TATTTGAAAATTCTGAACCAGCTGGTTATAAACCCGTTCAAAATAAGCCTATCGTTGCCTTCCAAATAGT +AAATGGAGAAGTCAGAGATGTGACTTCAATCGTTCCACAAGATATACCAGCGGGTTACGAGTTTACGAAT +GATAAGCACTATATTACCAATGAACCTATTCCTCCAAAGAGAGAATATCCTCGAACTGGTGGTATCGGAA +TGTTGCTATTCTATCTGATAGGTTGCATGATGATGGGAGGAGTTCTATTATACACACGGAAACATCCGTA +A +>pitB_GU256423 +ATGAAAAAAGAAAATAAAAAAACAAAAGAAATAATCATGAAAAAAACATTCTTTAAAAAGCTATTCACTG +CAAGCATTGCAGCTATAACCGCTTTGTCCGTATTCAGAGGTGTCCCGACTTTTGCGGATGATAATTCAGC +AATAACCAAAGCAAATGGTGAAAATAATGCTGTTGTGAAGATTAATAAAACGTTGAATATTGCAGAGGGA +ATAACAACACCAACAGCGACATTTACATTTAAGTTTACAGAAAAAACAGGACAATCTTCTAACGGTGCGC +CATATCAAACCGGAGTTGCAATTCCAGATAGAAATGTAGAATACAATAAAAATGATCACCCAACTGCTGA +TAAGATTCAAAAAGCAACAGAAGACATTTTTTCGGGAGTTGCTTATGGCCATGCTGGTGAATACGTTTAT +GATGTAGCGGAAGCAAAAACTGGATGGCAGGCGATTACCAAAAATGGTAAAACAATTGATGCCATGAGAT 
+ACGACAAACGTACATATGAAATGCACGTTATTGTTAAGAATAAAGTAAATGGTGGTGTCTATATTTCATC +AGTATACTTTAAGGAAAATAATAAATCTAACGCCCCTAAAGTAGAACCAAGTGAACAAGGCGTTTATAAT +TTATTTGATAACACATATACCAAAGACGCAAGTAAGGAGCCTAATCCTGATGATCCGAGTCAAGTAGACC +CCAATGCGAAAGCATTAACAATTACTAAAAAAGTTGATGGAGCTTCAGGGGATAAAACAAGAGATTTCCA +ATTCCATATCAAGATTCAACTTCCAAGTACAAATAAAACAGCAGAAACCCCTGTTACGAATATTATAGTA +AAACATGGATCTAAGTCAGAGGTGTTGGCAGTAGTGACCCCAGCAGATACAGTTGAGTACAATTTTACTC +TTAAAGATGGTGAAACATTTACAGTTGAACAACTACCAGCAGGTTCTAAATATACAGTAACTGAAACTGG +AGTAGCAGGTTATACAGATTCATCAATTTATACTACAAATGGTGCAGAACAAACATCTCAAGGACAAAAA +AATGTAGATTTTACATTAACAGATATCCTCATAGGTGAAAAGAAAAACGACAACAAAGTTACTAACAAAA +TCGACGACGTTACTCCTACTGGTCTCTTGATTGATAACCTTCCATTCATTTTGATGATTGGTCTTGGTTT +GGCTGGATTTGTTGTCTTGTCTAAAAAACGTAGAGAAGCCTAA From dec7f680fc0bba6ed3e6478b3ee7e95d35e2c630 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 5 Jul 2023 10:45:46 +0000 Subject: [PATCH 015/157] Update default ARIBA reference files Former-commit-id: bb3c15889772d749bd2cae42d11ebd229513efc3 --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 191a4e5..ba6ff1a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -48,8 +48,8 @@ params { depth = 20.00 // Default ARIBA referece sequences and metadata paths, and local directory for its generated database - ariba_ref = "$projectDir/data/ariba_ref_sequences-20230628.fasta" - ariba_metadata = "$projectDir/data/ariba_metadata-20230628.tsv" + ariba_ref = "$projectDir/data/ariba_ref_sequences-20230629.fasta" + ariba_metadata = "$projectDir/data/ariba_metadata-20230629.tsv" ariba_db_local = "$projectDir/databases/ariba" // Toggle for removing .bam and .sam files mid-run to reduce storage requirement From e5ca3e629e392dcc3b0703c1b3c86d6bc330b0a8 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:26:14 +0000 Subject: [PATCH 016/157] Remove 
unused script Former-commit-id: 0a898f8946925b38fc27b4c9ea6cf34e27b0b9aa --- bin/get_other_resistance.sh | 63 ------------------------------------- 1 file changed, 63 deletions(-) delete mode 100755 bin/get_other_resistance.sh diff --git a/bin/get_other_resistance.sh b/bin/get_other_resistance.sh deleted file mode 100755 index 610059d..0000000 --- a/bin/get_other_resistance.sh +++ /dev/null @@ -1,63 +0,0 @@ -# Extract the results from the output file of the AMRsearch - -# For resistances, change NOT_FOUND to S, lower cases to upper cases, SENSITIVE to S, INTERMEDIATE to I, RESISTANT to R, null or space-only string to empty string -# For determinants, determinants are sorted and separated by "; ", and no determinant is output as "_". Each acquired gene is output as "*gene*", each variant is output as "*gene*_*variant*" - -function GET_RES { - echo $( < $JSON_FILE jq -r --arg target "$1" '.resistanceProfile[] | select( .agent.key == $target ) | .state' \ - | tr '[:lower:]' '[:upper:]' \ - | sed 's/^NOT_FOUND$/S/g;s/^SENSITIVE$/S/g;s/^INTERMEDIATE$/I/g;s/^RESISTANT$/R/g;s/^null$//g;s/^\s+$//g' ) -} - -function GET_DETERMINANTS { - DETERMINANTS=() - - ACQUIRED=( $(< $JSON_FILE jq -r --arg target "$1" '.resistanceProfile[] | select( .agent.key == $target ) | .determinants | .acquired | map(.gene)[]') ) - VARIANTS=( $(< $JSON_FILE jq -r --arg target "$1" '.resistanceProfile[] | select( .agent.key == $target ) | .determinants | .variants | map(.gene + "_" +.variant)[]') ) - - if (( ${#ACQUIRED[@]} != 0 )); then - DETERMINANTS+=( "${ACQUIRED[@]}" ) - fi - - if (( ${#VARIANTS[@]} != 0 )); then - DETERMINANTS+=( "${VARIANTS[@]}" ) - fi - - if (( ${#DETERMINANTS[@]} == 0 )); then - DETERMINANTS+=("_") - fi - - IFS=$'\n' SORTED_DETERMINANTS=($(sort -f <<<"${DETERMINANTS[*]}")); unset IFS - printf -v JOINED_DETERMINANTS '; %s' "${SORTED_DETERMINANTS[@]}" - echo ${JOINED_DETERMINANTS:2} -} - -CHL_RES=$(GET_RES "CHL") -CHL_DETERMINANTS=$(GET_DETERMINANTS "CHL") - 
-CLI_RES=$(GET_RES "CLI") -CLI_DETERMINANTS=$(GET_DETERMINANTS "CLI") - -ERY_RES=$(GET_RES "ERY") -ERY_DETERMINANTS=$(GET_DETERMINANTS "ERY") - -FQ_RES=$(GET_RES "FLQ") -FQ_DETERMINANTS=$(GET_DETERMINANTS "FLQ") - -KAN_RES=$(GET_RES "KAN") -KAN_DETERMINANTS=$(GET_DETERMINANTS "KAN") - -LZO_RES=$(GET_RES "LNZ") -LZO_DETERMINANTS=$(GET_DETERMINANTS "LNZ") - -TET_RES=$(GET_RES "TCY") -TET_DETERMINANTS=$(GET_DETERMINANTS "TCY") - -TMP_RES=$(GET_RES "TMP") -TMP_DETERMINANTS=$(GET_DETERMINANTS "TMP") - -SMX_RES=$(GET_RES "SSS") -SMX_DETERMINANTS=$(GET_DETERMINANTS "SSS") - -COT_RES=$(GET_RES "SXT") -COT_DETERMINANTS=$(GET_DETERMINANTS "SXT") From 3feab10b3985a1cc85bad0cc13ddc8c7d437a8c0 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:30:55 +0000 Subject: [PATCH 017/157] Initial work on extracting AMR from ARIBA report Former-commit-id: 3ec708917092711711520e4498d4775d00b51ba4 --- bin/get_other_resistance.py | 43 +++++++++++++++++++++++++++++++++++++ modules/amr.nf | 5 +++-- workflows/pipeline.nf | 2 +- 3 files changed, 47 insertions(+), 3 deletions(-) create mode 100755 bin/get_other_resistance.py diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py new file mode 100755 index 0000000..df0899e --- /dev/null +++ b/bin/get_other_resistance.py @@ -0,0 +1,43 @@ +#! 
/usr/bin/env python3 + +import sys + +report_path = sys.argv[1] +metadata_path = sys.argv[2] + +with open(report_path) as report, open(metadata_path) as metadata: + # Save (reference, gene, var_only) combination found in metadata + gene_dict = {} + # Save drug found in metadata + drug_set = set() + + # Skip the header in metadata + next(metadata) + # Go through lines and save findings to gene_dict and drug_set + lines = [line.strip() for line in metadata] + for line in lines: + fields = line.split("\t") + reference, gene, var_only, var_change, _, drug = fields + gene_dict[(reference, gene, var_only)] = {"var_change": var_change, "drug": drug} + drug_set.add(drug) + + # Skip the header in report + next(report) + # Go through lines to detect targets + lines = [line.strip() for line in report] + for line in lines: + # Extract useful fields + fields = line.split("\t") + ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17] + + # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line + if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: + continue + + # WIP + gene_dict_key = (ref_name, gene, var_only) + if gene_dict_key in gene_dict: + if var_only == 0: + print(gene_dict[gene_dict_key]) + if var_only == 1 and gene_dict[gene_dict_key]['var_change'] == known_var_change and has_known_var == 1: + print(gene_dict[gene_dict_key]) diff --git a/modules/amr.nf b/modules/amr.nf index feaa3ad..905fd2b 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -92,16 +92,17 @@ process OTHER_RESISTANCE { // WIP, for extracting information from ARIBA report process GET_OTHER_RESISTANCE { - label 'bash_container' + label 'python_container' label 'farm_low' tag "$sample_id" input: tuple val(sample_id), path(report), path(report_debug) + path metadata script: """ - # TBC + 
get_other_resistance.py "$report_debug" "$metadata" """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 1aea8d7..811da5a 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -143,7 +143,7 @@ workflow PIPELINE { // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials // Output into Channel GET_OTHER_RESISTANCE.out.result OTHER_RESISTANCE(CREATE_ARIBA_DB.out.path, CREATE_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) - GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports) + GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) // Generate results.csv by sorted sample_id based on merged Channels // READ_QC.out.result, ASSEMBLY_QC.out.result, MAPPING_QC.out.result, TAXONOMY_QC.out.result, OVERALL_QC.out.result, From fe57162c1a2a4032c6288105d5483e7f6d46b04d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:31:29 +0000 Subject: [PATCH 018/157] Save AMR info to FreeText_Drug column Former-commit-id: 67277d0022fbb2ef53e8d492d61bb5cd47718e02 --- data/ariba_metadata-20230629.tsv | 162 +++++++++++++++---------------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/data/ariba_metadata-20230629.tsv b/data/ariba_metadata-20230629.tsv index 89ce2e8..88c3a4f 100644 --- a/data/ariba_metadata-20230629.tsv +++ b/data/ariba_metadata-20230629.tsv @@ -1,82 +1,82 @@ reference coding_yes(1)_no(0) pr/ab(0)_var(1) description of the variant Group FreeText_Drug -aph_3prime_III_1_M26832 1 0 . . Kanamycin resistance -ermB_1_JN899585 1 0 . . Erythromycin and Clindamycin resistance -ermB_10_U86375 1 0 . . Erythromycin and Clindamycin resistance -ermB_16_X82819 1 0 . . Erythromycin and Clindamycin resistance -ermB_20_AF109075 1 0 . . Erythromycin and Clindamycin resistance -ermC_13_M13761 1 0 . . Erythromycin and Clindamycin resistance -cat_5_U35036 1 0 . . 
Chloramphenicol resistance -catpC194_1_NC_002013 1 0 . . Chloramphenicol resistance -catpC233_1_AY355285 1 0 . . Chloramphenicol resistance -catQ_1_M55620 1 0 . . Chloramphenicol resistance -msrD_2_AF274302 1 0 . . Erythromycin resistance -msrD_3_AF227520 1 0 . . Erythromycin resistance -mefA_10_AF376746 1 0 . . Erythromycin resistance -mefE_AE007317 1 0 . . Erythromycin resistance -tetM_1_X92947 1 0 . . Tetracycline resistance -tetM_12_FR671418 1 0 . . Tetracycline resistance -tetK_4_U38428 1 0 . . Tetracycline resistance -tetM_13_AM990992 1 0 . . Tetracycline resistance -tetM_2_X90939 1 0 . . Tetracycline resistance -tetM_4_X75073 1 0 . . Tetracycline resistance -tetM_5_U58985 1 0 . . Tetracycline resistance -tetM_8_X04388 1 0 . . Tetracycline resistance -tetS_M 1 0 . . Tetracycline resistance -tetS_M_MH283012 1 0 . . Tetracycline resistance -tetAp_L20800 1 0 . . Tetracycline resistance -tetBp_L20800 1 0 . . Tetracycline resistance -tetAQ2_Z21523 1 0 . . Tetracycline resistance -tetS_FN555436 1 0 . . Tetracycline resistance -tetT_L42544 1 0 . . Tetracycline resistance -tetW_AJ222769 1 0 . . Tetracycline resistance -tet32_AJ295238 1 0 . . Tetracycline resistance -tet36_AJ514254 1 0 . . Tetracycline resistance -tet44_FN594949 1 0 . . Tetracycline resistance -tet58_KY887560 1 0 . . Tetracycline resistance -tet_M74049 1 0 . . Tetracycline resistance -tetS_M_HM367711 1 0 . . Tetracycline resistance -tetS_M_AY534326 1 0 . . Tetracycline resistance -tetM_M85225 1 0 . . Tetracycline resistance -tetS_FN555436 1 0 . . Tetracycline resistance -tetM_MH283017 1 0 . . tetracycline resistance -folA_AE007317 1 1 I100L . "proteinID-AAL00232.1, Trimethoprim" -folP_AE007317 1 1 . . "proteinID-AAK99071.1, Sulfamethoxazole resistance on if insertions in 56-67 amino acids" -gyrA_AE007317 1 1 S81F . Fluoroquinolone -gyrA_AE007317 1 1 S81Y . Fluoroquinolone -gyrA_AE007317 1 1 S81C . Fluoroquinolone -gyrA_AE007317 1 1 S81I . Fluoroquinolone -gyrA_AE007317 1 1 E85K . 
Fluoroquinolone -gyrA_AE007317 1 1 Q118A . Fluoroquinolone -gyrB_AE007317 1 1 E474K . Fluoroquinolone -parC_AE007317 1 1 A63T . Fluoroquinolone -parC_AE007317 1 1 S79F . Fluoroquinolone -parC_AE007317 1 1 S79Y . Fluoroquinolone -parC_AE007317 1 1 S79L . Fluoroquinolone -parC_AE007317 1 1 S79F . Fluoroquinolone -parC_AE007317 1 1 D83G . Fluoroquinolone -parC_AE007317 1 1 D83N . Fluoroquinolone -parE_AE007317 1 1 E474K . Fluoroquinolone -parE_AE007317 1 1 D435N . Fluoroquinolone -parE_AE007317 1 1 D435H . Fluoroquinolone -parE_AE007317 1 1 P454S . Fluoroquinolone -tetO_Y07780 1 0 . . Tetracycline resistance -ermBups_HG799494 0 0 . . Erythromycin and Clindamycin resistance -ermbTr_CP002121 0 0 . . Erythromycin and Clindamycin resistance -rplD_AE007317 1 1 . . Linezolid resistance (deletion within the L4 region of the gene ) -rpoB_AE007317 1 1 D489E . rifampicin resistance -D415E -rpoB_AE007317 1 1 H499N . rifampicin resistance -H425N -rpoB_AE007317 1 1 D489N . rifampicin resistance -H415N -vanB_KC489787 1 0 . . Vacomycin resistance -vanD_EU999036 1 0 . . Vacomycin resistance -vanE_FJ872411 1 0 . . Vacomycin resistance -vanG_KF704242 1 0 . . Vacomycin resistance -otrA_X53401 1 0 . . Tetracycline resistance -vanA_M97297 1 0 . . Vacomycin resistance (E.faecium) -vanC_AF162694 1 0 . . Vacomycin resistance (E.gallinarum) -23S_NZ_CP018347 0 1 A2114G . Macrolide:32347-35250 -23S_NZ_CP018347 0 1 A2115G . Macrolide:32347-35250 -23S_NZ_CP018347 0 1 A2118G . Macrolide/Streptogramin:32347-35250 -23S_NZ_CP018347 0 1 C2630A . Macrolide:32347-35250 -23S_NZ_CP018347 0 1 C2630G . Macrolide:32347-35250 -rrgA_EF560637 1 0 . . Pili-1-(RlrA pilus-1-2279-4939) -pitB_GU256423 1 0 . . Pili-2-(pitB pilus-3504-4736) +aph_3prime_III_1_M26832 1 0 . . KAN +ermB_1_JN899585 1 0 . . ERY CLI +ermB_10_U86375 1 0 . . ERY CLI +ermB_16_X82819 1 0 . . ERY CLI +ermB_20_AF109075 1 0 . . ERY CLI +ermC_13_M13761 1 0 . . ERY CLI +cat_5_U35036 1 0 . . CHL +catpC194_1_NC_002013 1 0 . . 
CHL +catpC233_1_AY355285 1 0 . . CHL +catQ_1_M55620 1 0 . . CHL +msrD_2_AF274302 1 0 . . ERY +msrD_3_AF227520 1 0 . . ERY +mefA_10_AF376746 1 0 . . ERY +mefE_AE007317 1 0 . . ERY +tetM_1_X92947 1 0 . . TET +tetM_12_FR671418 1 0 . . TET +tetK_4_U38428 1 0 . . TET +tetM_13_AM990992 1 0 . . TET +tetM_2_X90939 1 0 . . TET +tetM_4_X75073 1 0 . . TET +tetM_5_U58985 1 0 . . TET +tetM_8_X04388 1 0 . . TET +tetS_M 1 0 . . TET +tetS_M_MH283012 1 0 . . TET +tetAp_L20800 1 0 . . TET +tetBp_L20800 1 0 . . TET +tetAQ2_Z21523 1 0 . . TET +tetS_FN555436 1 0 . . TET +tetT_L42544 1 0 . . TET +tetW_AJ222769 1 0 . . TET +tet32_AJ295238 1 0 . . TET +tet36_AJ514254 1 0 . . TET +tet44_FN594949 1 0 . . TET +tet58_KY887560 1 0 . . TET +tet_M74049 1 0 . . TET +tetS_M_HM367711 1 0 . . TET +tetS_M_AY534326 1 0 . . TET +tetM_M85225 1 0 . . TET +tetS_FN555436 1 0 . . TET +tetM_MH283017 1 0 . . TET +folA_AE007317 1 1 I100L . TMP +folP_AE007317 1 1 . . SMX +gyrA_AE007317 1 1 S81F . FLQ +gyrA_AE007317 1 1 S81Y . FLQ +gyrA_AE007317 1 1 S81C . FLQ +gyrA_AE007317 1 1 S81I . FLQ +gyrA_AE007317 1 1 E85K . FLQ +gyrA_AE007317 1 1 Q118A . FLQ +gyrB_AE007317 1 1 E474K . FLQ +parC_AE007317 1 1 A63T . FLQ +parC_AE007317 1 1 S79F . FLQ +parC_AE007317 1 1 S79Y . FLQ +parC_AE007317 1 1 S79L . FLQ +parC_AE007317 1 1 S79F . FLQ +parC_AE007317 1 1 D83G . FLQ +parC_AE007317 1 1 D83N . FLQ +parE_AE007317 1 1 E474K . FLQ +parE_AE007317 1 1 D435N . FLQ +parE_AE007317 1 1 D435H . FLQ +parE_AE007317 1 1 P454S . FLQ +tetO_Y07780 1 0 . . TET +ermBups_HG799494 0 0 . . ERY +ermbTr_CP002121 0 0 . . ERY +rplD_AE007317 1 1 . . LNZ +rpoB_AE007317 1 1 D489E . RIF +rpoB_AE007317 1 1 H499N . RIF +rpoB_AE007317 1 1 D489N . RIF +vanB_KC489787 1 0 . . VAN +vanD_EU999036 1 0 . . VAN +vanE_FJ872411 1 0 . . VAN +vanG_KF704242 1 0 . . VAN +otrA_X53401 1 0 . . TET +vanA_M97297 1 0 . . TET +vanC_AF162694 1 0 . . TET +23S_NZ_CP018347 0 1 A2114G . ERY +23S_NZ_CP018347 0 1 A2115G . ERY +23S_NZ_CP018347 0 1 A2118G . 
ERY +23S_NZ_CP018347 0 1 C2630A . ERY +23S_NZ_CP018347 0 1 C2630G . ERY +rrgA_EF560637 1 0 . . PILI-1 +pitB_GU256423 1 0 . . PILI-2 From aa3d09b8e2d0f50ddf6d286d1c234ffddc31e212 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 6 Jul 2023 12:13:30 +0000 Subject: [PATCH 019/157] Revert ARIBA assembler back to default Former-commit-id: ad3cdc824697aecb1b0704a0cb489d5279e97083 --- modules/amr.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/amr.nf b/modules/amr.nf index 905fd2b..0079b1e 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -86,7 +86,7 @@ process OTHER_RESISTANCE { report='result/report.tsv' report_debug='result/debug.report.tsv' """ - ariba run --nucmer_min_id 80 --assembled_threshold 0.80 --assembler spades $ariba_database/$database $read1 $read2 result + ariba run --nucmer_min_id 80 --assembled_threshold 0.80 $ariba_database/$database $read1 $read2 result """ } From 840b26208fcafe0c7d93b788207b2397971d8278 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 7 Jul 2023 09:09:53 +0000 Subject: [PATCH 020/157] Ensure type matching of variable comparsion Former-commit-id: 2f8c6dc502140522f3583fbe75895d353aa24f1c --- bin/get_other_resistance.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index df0899e..56ae244 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -13,21 +13,21 @@ # Skip the header in metadata next(metadata) - # Go through lines and save findings to gene_dict and drug_set + # Go through lines in metadata and save findings to gene_dict and drug_set lines = [line.strip() for line in metadata] for line in lines: - fields = line.split("\t") + fields = [str(field) for field in line.split("\t")] reference, gene, var_only, var_change, _, drug = fields gene_dict[(reference, gene, var_only)] = {"var_change": 
var_change, "drug": drug} drug_set.add(drug) # Skip the header in report next(report) - # Go through lines to detect targets + # Go through lines in report to detect targets lines = [line.strip() for line in report] for line in lines: # Extract useful fields - fields = line.split("\t") + fields = [str(field) for field in line.split("\t")] ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17] # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line @@ -37,7 +37,7 @@ # WIP gene_dict_key = (ref_name, gene, var_only) if gene_dict_key in gene_dict: - if var_only == 0: - print(gene_dict[gene_dict_key]) - if var_only == 1 and gene_dict[gene_dict_key]['var_change'] == known_var_change and has_known_var == 1: - print(gene_dict[gene_dict_key]) + if var_only == "0": + print(ref_name, gene_dict[gene_dict_key]) + if var_only == "1" and gene_dict[gene_dict_key]['var_change'] == known_var_change and has_known_var == "1": + print(ref_name, gene_dict[gene_dict_key]) From 82eb910b04f43e7bb5b8a6f40eb4108ef7485a44 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 13 Jul 2023 09:18:13 +0000 Subject: [PATCH 021/157] Remove LNZ from ARIBA database Former-commit-id: 7bf53626f236254d8683b31eb6f64fa933dc1582 --- ...tadata-20230629.tsv => ariba_metadata-20230712.tsv} | 1 - ...230629.fasta => ariba_ref_sequences-20230712.fasta} | 10 ---------- nextflow.config | 6 +++--- 3 files changed, 3 insertions(+), 14 deletions(-) rename data/{ariba_metadata-20230629.tsv => ariba_metadata-20230712.tsv} (95%) rename data/{ariba_ref_sequences-20230629.fasta => ariba_ref_sequences-20230712.fasta} (99%) diff --git a/data/ariba_metadata-20230629.tsv b/data/ariba_metadata-20230712.tsv similarity index 95% rename from data/ariba_metadata-20230629.tsv rename to data/ariba_metadata-20230712.tsv index 
88c3a4f..de44254 100644 --- a/data/ariba_metadata-20230629.tsv +++ b/data/ariba_metadata-20230712.tsv @@ -62,7 +62,6 @@ parE_AE007317 1 1 P454S . FLQ tetO_Y07780 1 0 . . TET ermBups_HG799494 0 0 . . ERY ermbTr_CP002121 0 0 . . ERY -rplD_AE007317 1 1 . . LNZ rpoB_AE007317 1 1 D489E . RIF rpoB_AE007317 1 1 H499N . RIF rpoB_AE007317 1 1 D489N . RIF diff --git a/data/ariba_ref_sequences-20230629.fasta b/data/ariba_ref_sequences-20230712.fasta similarity index 99% rename from data/ariba_ref_sequences-20230629.fasta rename to data/ariba_ref_sequences-20230712.fasta index 5dffbd5..aac7dd0 100644 --- a/data/ariba_ref_sequences-20230629.fasta +++ b/data/ariba_ref_sequences-20230712.fasta @@ -104,16 +104,6 @@ GATAAATAA >ermbTr_CP002121 GCTTTTGATAGTCAAGCGAAATATAGCTACCTTATTGTAGAGAGGGGATTTGCTAAAAGG TTGCAAAA ->rplD_AE007317 -ATGGCAAACGTAACATTATTTGACCAAACTGGTAAAGAAGCTGGCCAAGTTGTTCTTAGCGATGCAGTAT -TTGGTATCGAACCAAATGAATCAGTTGTGTTTGATGTAATCATCAGCCAACGCGCAAGCCTTCGTCAAGG -AACACACGCTGTTAAAAACCGCTCTGCAGTATCAGGTGGTGGACGCAAACCATGGCGTCAAAAAGGAACT -GGACGTGCTCGTCAAGGTTCTATCCGCTCACCACAATGGCGTGGTGGTGGTGTTGTCTTCGGACCAACTC -CACGTTCATACGGCTACAAACTTCCACAAAAAGTTCGTCGCCTAGCTCTTAAATCAGTTTACTCTGAAAA -AGTTGCTGAAAACAAATTCGTAGCTGTAGACGCTCTTTCATTTACAGCTCCAAAAACTGCTGAATTTGCA -AAAGTTCTTGCAGCATTGAGCATCGATTCTAAAGTTCTTGTTATCCTTGAAGAAGGAAATGAATTCGCAG -CTCTTTCAGCTCGTAACCTTCCAAACGTGAAAGTTGCAACTGCTACAACTGCAAGTGTTCTTGACATCGC -AAATAGCGACAAACTTCTTGTCACACAAGCAGCTATCTCTAAAATCGAGGAGGTTCTTGCATAA >rpoB_AE007317 TTGACAAGGCTTGGAACTTATTTACAAAGGAGAATCATCTTGGCAGGACATGACGTTCAATACGGGAAAC ATCGTACCCGTCGTAGTTTTTCAAGAATCAAAGAAGTTCTTGACTTACCAAATTTGATTGAAATTCAAAC diff --git a/nextflow.config b/nextflow.config index ba6ff1a..fd30ded 100644 --- a/nextflow.config +++ b/nextflow.config @@ -48,8 +48,8 @@ params { depth = 20.00 // Default ARIBA referece sequences and metadata paths, and local directory for its generated database - ariba_ref = "$projectDir/data/ariba_ref_sequences-20230629.fasta" - ariba_metadata = 
"$projectDir/data/ariba_metadata-20230629.tsv" + ariba_ref = "$projectDir/data/ariba_ref_sequences-20230712.fasta" + ariba_metadata = "$projectDir/data/ariba_metadata-20230712.tsv" ariba_db_local = "$projectDir/databases/ariba" // Toggle for removing .bam and .sam files mid-run to reduce storage requirement @@ -105,7 +105,7 @@ process { container = 'staphb/kraken2:2.1.2-no-db' } withLabel: seroba_container { - container = 'staphb/seroba:1.0.2' + container = 'harryhungch/seroba:test' } } From 3495e5caa1046ed4e2e217d61ff1704fb7180338 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 13 Jul 2023 13:24:45 +0000 Subject: [PATCH 022/157] Update SeroBA image Former-commit-id: 619a2e97ab11d93c59adbf4a9420ed9d6820aca0 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index fd30ded..1ab6cb0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -105,7 +105,7 @@ process { container = 'staphb/kraken2:2.1.2-no-db' } withLabel: seroba_container { - container = 'harryhungch/seroba:test' + container = 'harryhungch/seroba:1.0.3' } } From dae98893b173f81c71016c16e509fc6b2363cac4 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 13 Jul 2023 16:33:10 +0000 Subject: [PATCH 023/157] Use both normal and debug reports of ARIBA Former-commit-id: 20af1fd7bbf6bfe374af4b8960aaafe0bf06ec18 --- modules/amr.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/amr.nf b/modules/amr.nf index 0079b1e..982e348 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -103,6 +103,6 @@ process GET_OTHER_RESISTANCE { script: """ - get_other_resistance.py "$report_debug" "$metadata" + get_other_resistance.py "$report" "$report_debug" "$metadata" """ } From df95121c2f13ea19e93d1c757113d4b37550cc58 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 13 Jul 2023 16:33:30 +0000 
Subject: [PATCH 024/157] Improve header of ARIBA metadata Former-commit-id: 9c747b31a229a1d7885a4852c5fcbd3402622a4b --- data/ariba_metadata-20230712.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/ariba_metadata-20230712.tsv b/data/ariba_metadata-20230712.tsv index de44254..9afa30a 100644 --- a/data/ariba_metadata-20230712.tsv +++ b/data/ariba_metadata-20230712.tsv @@ -1,4 +1,4 @@ -reference coding_yes(1)_no(0) pr/ab(0)_var(1) description of the variant Group FreeText_Drug +ref_name gene var_only var_change group target aph_3prime_III_1_M26832 1 0 . . KAN ermB_1_JN899585 1 0 . . ERY CLI ermB_10_U86375 1 0 . . ERY CLI From 4f889b70b76bc3e8cb0e59f14f11f04dd556e72b Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 13 Jul 2023 17:03:44 +0000 Subject: [PATCH 025/157] Further work on extracting info from ARIBA reports Former-commit-id: 9f649c3940a3e20738c88cda529bf93f4019743d --- bin/get_other_resistance.py | 68 ++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index 56ae244..ae6d154 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -1,43 +1,65 @@ #! 
/usr/bin/env python3 import sys +from itertools import chain +from collections import defaultdict report_path = sys.argv[1] -metadata_path = sys.argv[2] +debug_report_path = sys.argv[2] +metadata_path = sys.argv[3] -with open(report_path) as report, open(metadata_path) as metadata: - # Save (reference, gene, var_only) combination found in metadata - gene_dict = {} - # Save drug found in metadata - drug_set = set() +with open(report_path) as report, open(debug_report_path) as debug_report, open(metadata_path) as metadata: + # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata + gene_dict = defaultdict(dict) + + # For saving targets found in metadata as key and their determinants (add to a set) as value + target_dict = {} # Skip the header in metadata next(metadata) - # Go through lines in metadata and save findings to gene_dict and drug_set - lines = [line.strip() for line in metadata] - for line in lines: + # Go through lines in metadata and save findings to gene_dict and target_dict + for line in (line.strip() for line in metadata): + # Extract useful fields fields = [str(field) for field in line.split("\t")] - reference, gene, var_only, var_change, _, drug = fields - gene_dict[(reference, gene, var_only)] = {"var_change": var_change, "drug": drug} - drug_set.add(drug) + ref_name, gene, var_only, var_change, _, target = fields - # Skip the header in report + # Populating gene_dict + gene_dict[(ref_name, gene, var_only)].update({var_change: target}) + # Populating target_dict + target_dict.update({target: set()}) + + # Skip the header in report and debug report next(report) - # Go through lines in report to detect targets - lines = [line.strip() for line in report] - for line in lines: + next(debug_report) + # Go through lines in both report and debug report to detect targets + for line in (line.strip() for line in chain(report, debug_report)): # Extract useful fields fields = 
[str(field) for field in line.split("\t")] - ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17] + ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21] # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: continue - # WIP + # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line gene_dict_key = (ref_name, gene, var_only) - if gene_dict_key in gene_dict: - if var_only == "0": - print(ref_name, gene_dict[gene_dict_key]) - if var_only == "1" and gene_dict[gene_dict_key]['var_change'] == known_var_change and has_known_var == "1": - print(ref_name, gene_dict[gene_dict_key]) + try: + target = gene_dict[gene_dict_key][known_var_change] + except KeyError: + continue + + # Logic for gene detection. Found means hit. 
+ if var_only == "0": + target_dict[target].add(f'Found {ref_name}') + + # Logic for variant detection, further criteria required + if var_only == "1": + # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201 + if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): + pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' + target_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') + # Common criteria: the assembly has that variant + elif has_known_var == "1": + target_dict[target].add(f'{ref_name} {known_var_change}') + + print(target_dict) \ No newline at end of file From 38241206ef4b27d63a066a01c33aeb03d02eac7d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 14 Jul 2023 15:05:35 +0000 Subject: [PATCH 026/157] Improve robustness of JSON capture Former-commit-id: 44b133139bc6df54d85d4afcf3869e67d92fa236 --- bin/create_ariba_db.sh | 8 ++++---- bin/create_ref_genome_bwa_db.sh | 4 ++-- bin/get_seroba_db.sh | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/create_ariba_db.sh b/bin/create_ariba_db.sh index 073028e..289fff4 100755 --- a/bin/create_ariba_db.sh +++ b/bin/create_ariba_db.sh @@ -5,10 +5,10 @@ REF_SEQUENCES_MD5=$(md5sum $REF_SEQUENCES | awk '{ print $1 }') METADATA_MD5=$(md5sum $METADATA | awk '{ print $1 }') if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REF_SEQUENCES_MD5" ] || \ - [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA" ] || \ - [ ! 
"$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$METADATA_MD5" ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES_MD5" ] || \ + [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA" ] || \ + [ ! "$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA_MD5" ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/00.info.txt ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/00.version_info.txt ] || \ [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log ] || \ diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/create_ref_genome_bwa_db.sh index 6cee335..5bd277a 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -4,8 +4,8 @@ REFERENCE_MD5=$(md5sum $REFERENCE | awk '{ print $1 }') if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "$REFERENCE_MD5" ] || \ + [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE" ] || \ + [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE_MD5" ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.amb ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.ann ] || \ [ ! -f ${DB_LOCAL}/${PREFIX}.bwt ] || \ diff --git a/bin/get_seroba_db.sh b/bin/get_seroba_db.sh index 736a99b..a3e1d3c 100755 --- a/bin/get_seroba_db.sh +++ b/bin/get_seroba_db.sh @@ -6,8 +6,8 @@ # Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep 'git' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "${DB_REMOTE}" ] || \ - [ ! 
"$(grep 'kmer' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",/\1/')" == "${KMER}" ] || \ + [ ! "$(grep 'git' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "${DB_REMOTE}" ] || \ + [ ! "$(grep 'kmer' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "${KMER}" ] || \ !((git -C ${DB_LOCAL} pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date'); then rm -rf ${DB_LOCAL}/{,.[!.],..?}* From 5784155f2141a6f075e74d997e340c96cfd46aa6 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 14 Jul 2023 15:06:03 +0000 Subject: [PATCH 027/157] Improve target names Former-commit-id: a0938d18db7980c3c6dc049d01610c98462a0fd8 --- data/ariba_metadata-20230712.tsv | 50 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/data/ariba_metadata-20230712.tsv b/data/ariba_metadata-20230712.tsv index 9afa30a..1a0a6ea 100644 --- a/data/ariba_metadata-20230712.tsv +++ b/data/ariba_metadata-20230712.tsv @@ -1,10 +1,10 @@ ref_name gene var_only var_change group target aph_3prime_III_1_M26832 1 0 . . KAN -ermB_1_JN899585 1 0 . . ERY CLI -ermB_10_U86375 1 0 . . ERY CLI -ermB_16_X82819 1 0 . . ERY CLI -ermB_20_AF109075 1 0 . . ERY CLI -ermC_13_M13761 1 0 . . ERY CLI +ermB_1_JN899585 1 0 . . ERY_CLI +ermB_10_U86375 1 0 . . ERY_CLI +ermB_16_X82819 1 0 . . ERY_CLI +ermB_20_AF109075 1 0 . . ERY_CLI +ermC_13_M13761 1 0 . . ERY_CLI cat_5_U35036 1 0 . . CHL catpC194_1_NC_002013 1 0 . . CHL catpC233_1_AY355285 1 0 . . CHL @@ -41,24 +41,24 @@ tetS_FN555436 1 0 . . TET tetM_MH283017 1 0 . . TET folA_AE007317 1 1 I100L . TMP folP_AE007317 1 1 . . SMX -gyrA_AE007317 1 1 S81F . FLQ -gyrA_AE007317 1 1 S81Y . FLQ -gyrA_AE007317 1 1 S81C . FLQ -gyrA_AE007317 1 1 S81I . FLQ -gyrA_AE007317 1 1 E85K . FLQ -gyrA_AE007317 1 1 Q118A . FLQ -gyrB_AE007317 1 1 E474K . FLQ -parC_AE007317 1 1 A63T . FLQ -parC_AE007317 1 1 S79F . FLQ -parC_AE007317 1 1 S79Y . FLQ -parC_AE007317 1 1 S79L . 
FLQ -parC_AE007317 1 1 S79F . FLQ -parC_AE007317 1 1 D83G . FLQ -parC_AE007317 1 1 D83N . FLQ -parE_AE007317 1 1 E474K . FLQ -parE_AE007317 1 1 D435N . FLQ -parE_AE007317 1 1 D435H . FLQ -parE_AE007317 1 1 P454S . FLQ +gyrA_AE007317 1 1 S81F . FQ +gyrA_AE007317 1 1 S81Y . FQ +gyrA_AE007317 1 1 S81C . FQ +gyrA_AE007317 1 1 S81I . FQ +gyrA_AE007317 1 1 E85K . FQ +gyrA_AE007317 1 1 Q118A . FQ +gyrB_AE007317 1 1 E474K . FQ +parC_AE007317 1 1 A63T . FQ +parC_AE007317 1 1 S79F . FQ +parC_AE007317 1 1 S79Y . FQ +parC_AE007317 1 1 S79L . FQ +parC_AE007317 1 1 S79F . FQ +parC_AE007317 1 1 D83G . FQ +parC_AE007317 1 1 D83N . FQ +parE_AE007317 1 1 E474K . FQ +parE_AE007317 1 1 D435N . FQ +parE_AE007317 1 1 D435H . FQ +parE_AE007317 1 1 P454S . FQ tetO_Y07780 1 0 . . TET ermBups_HG799494 0 0 . . ERY ermbTr_CP002121 0 0 . . ERY @@ -77,5 +77,5 @@ vanC_AF162694 1 0 . . TET 23S_NZ_CP018347 0 1 A2118G . ERY 23S_NZ_CP018347 0 1 C2630A . ERY 23S_NZ_CP018347 0 1 C2630G . ERY -rrgA_EF560637 1 0 . . PILI-1 -pitB_GU256423 1 0 . . PILI-2 +rrgA_EF560637 1 0 . . PILI1 +pitB_GU256423 1 0 . . PILI2 From 9c95da62fe5f03f222eabf2fa0deaca29ed53d4c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 14 Jul 2023 15:06:50 +0000 Subject: [PATCH 028/157] ARIBA-based AMR detection prototype Former-commit-id: a4d225166c6978a5ed780ec25311a4801aa0b516 --- bin/get_other_resistance.py | 30 +++++++++++++++++++++++++++--- bin/get_other_resistance.sh | 32 ++++++++++++++++++++++++++++++++ modules/amr.nf | 11 +++++++++-- workflows/pipeline.nf | 6 +++--- 4 files changed, 71 insertions(+), 8 deletions(-) create mode 100755 bin/get_other_resistance.sh diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index ae6d154..29f8234 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -1,8 +1,11 @@ #! 
/usr/bin/env python3 +# Output AMR of a sample based on its ARIBA report and ARIBA metadata + import sys from itertools import chain from collections import defaultdict +import json report_path = sys.argv[1] debug_report_path = sys.argv[2] @@ -50,7 +53,7 @@ # Logic for gene detection. Found means hit. if var_only == "0": - target_dict[target].add(f'Found {ref_name}') + target_dict[target].add(f'{ref_name}') # Logic for variant detection, further criteria required if var_only == "1": @@ -60,6 +63,27 @@ target_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') # Common criteria: the assembly has that variant elif has_known_var == "1": - target_dict[target].add(f'{ref_name} {known_var_change}') + target_dict[target].add(f'{ref_name} Variant {known_var_change}') + + # For saving final output, where information is saved per-target + output = {} + + # Go through targets in metadata + for target in target_dict: + # + if len(target_dict[target]) == 0: + if target.lower().startswith('pili'): + output[target] = 'NEG' + else: + output[f'{target}_Res'] = 'S' - print(target_dict) \ No newline at end of file + output[f'{target}_Determinant'] = '_' + else: + if target.lower().startswith('pili'): + output[target] = 'POS' + else: + output[f'{target}_Res'] = 'R' + + output[f'{target}_Determinant'] = '; '.join(target_dict[target]) + + print(json.dumps(output, indent=4)) \ No newline at end of file diff --git a/bin/get_other_resistance.sh b/bin/get_other_resistance.sh new file mode 100755 index 0000000..fdd5f58 --- /dev/null +++ b/bin/get_other_resistance.sh @@ -0,0 +1,32 @@ +# Run get_other_resistance.py to infer AMR from ARIBA reports, then capture individual AMR from the output for Nextflow + +function GET_VALUE { + echo $(grep \"$1\" <<< $OUTPUT | sed -r 's/.+: "(.*)",?/\1/') +} + +OUTPUT=$(get_other_resistance.py "$REPORT" "$REPORT_DEBUG" "$METADATA") + +CHL_Res=$(GET_VALUE "CHL_Res") +CHL_Determinant=$(GET_VALUE "CHL_Determinant") +ERY_Res=$(GET_VALUE "ERY_Res") 
+ERY_Determinant=$(GET_VALUE "ERY_Determinant") +FQ_Res=$(GET_VALUE "FQ_Res") +FQ_Determinant=$(GET_VALUE "FQ_Determinant") +KAN_Res=$(GET_VALUE "KAN_Res") +KAN_Determinant=$(GET_VALUE "KAN_Determinant") +TET_Res=$(GET_VALUE "TET_Res") +TET_Determinant=$(GET_VALUE "TET_Determinant") +TMP_Res=$(GET_VALUE "TMP_Res") +TMP_Determinant=$(GET_VALUE "TMP_Determinant") +SMX_Res=$(GET_VALUE "SMX_Res") +SMX_Determinant=$(GET_VALUE "SMX_Determinant") +ERY_CLI_Res=$(GET_VALUE "ERY_CLI_Res") +ERY_CLI_Determinant=$(GET_VALUE "ERY_CLI_Determinant") +RIF_Res=$(GET_VALUE "RIF_Res") +RIF_Determinant=$(GET_VALUE "RIF_Determinant") +VAN_Res=$(GET_VALUE "VAN_Res") +VAN_Determinant=$(GET_VALUE "VAN_Determinant") +PILI1=$(GET_VALUE "PILI1") +PILI1_Determinant=$(GET_VALUE "PILI1_Determinant") +PILI2=$(GET_VALUE "PILI2") +PILI2_Determinant=$(GET_VALUE "PILI2_Determinant") \ No newline at end of file diff --git a/modules/amr.nf b/modules/amr.nf index 982e348..19429f7 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -90,7 +90,7 @@ process OTHER_RESISTANCE { """ } -// WIP, for extracting information from ARIBA report +// Extracting resistance information from ARIBA report process GET_OTHER_RESISTANCE { label 'python_container' label 'farm_low' @@ -101,8 +101,15 @@ process GET_OTHER_RESISTANCE { tuple val(sample_id), path(report), path(report_debug) path metadata + output: + tuple val(sample_id), env(CHL_Res), env(CHL_Determinant), env(ERY_Res), env(ERY_Determinant), env(FQ_Res), env(FQ_Determinant), env(KAN_Res), env(KAN_Determinant), env(TET_Res), env(TET_Determinant), env(TMP_Res), env(TMP_Determinant), env(SMX_Res), env(SMX_Determinant), env(ERY_CLI_Res), env(ERY_CLI_Determinant), env(RIF_Res), env(RIF_Determinant), env(VAN_Res), env(VAN_Determinant), env(PILI1), env(PILI1_Determinant), env(PILI2), env(PILI2_Determinant), emit: result + script: """ - get_other_resistance.py "$report" "$report_debug" "$metadata" + REPORT="$report" + REPORT_DEBUG="$report_debug" + METADATA="$metadata" + + 
source get_other_resistance.sh """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 811da5a..bdead12 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -179,8 +179,8 @@ workflow PIPELINE { .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } - // .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 20 : it } + .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) + .map { (it[-1] == null) ? it[0..-2] + ['_'] * 24 : it } .map { it.collect {"\"$it\""}.join',' } .collectFile( name: 'results.csv', @@ -196,7 +196,7 @@ workflow PIPELINE { 'Serotype', 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - // 'CHL_Res', 'CHL_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_Res', 'ERY_Determinant', 'FQ_Res', 'FQ_Determinant', 'KAN_Res', 'KAN_Determinant', 'LZO_Res', 'LZO_Determinant', 'TET_Res', 'TET_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant' + 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'FQ_Res', 'FQ_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI-1', 'PILI-1_Determinant', 'PILI-2', 'PILI-2_Determinant' ].join(','), sort: { it.split(',')[0] }, newLine: true From 864644128e7cf6dfbe1c21e0cbde9e465a650a47 Mon Sep 17 00:00:00 2001 From: Harry Hung 
<4848896+HarryHung@users.noreply.github.com> Date: Mon, 17 Jul 2023 17:50:58 +0000 Subject: [PATCH 029/157] Update to reflect change from AMRsearch to ARIBA Former-commit-id: 7a38ec57cf10d8dc62d1687f1fbef4a9de52ae41 --- doc/workflow.drawio.svg | 542 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 538 insertions(+), 4 deletions(-) diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg index c5d85f5..f2e08ab 100644 --- a/doc/workflow.drawio.svg +++ b/doc/workflow.drawio.svg @@ -1,4 +1,538 @@ - - - -OutputInputData TypeData TypeRaw Reads*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}Raw Reads...FASTQ (Reads)FASTQ (Reads)S. Pneumo: > 60%Contigs: < 500Length: 1.9 - 2.3 MbDepth: ≥ 20xFASTA (Assemblies)FASTA (Assemblies)SAMSAMRef Coverage: > 60%Het-SNP site: < 220ResultsResultsAssemblySPAdes(Default: Shovill)Asse...MappingBWA MEMMapp...Taxonomy & Taxonomy QCKraken 2Taxo...Assembly QCQUASTAsse...Mapping QCSAMtools, BCFtools,Het-SNP CounterMapp...PreprocessfastpPrep...PBP AMRCDC PBP AMR PredictorPBP...MLSTmlstMLST...Overall QCOver...LineagePopPUNKLine...SerotypeSeroBASero...Other AMRAMRsearchOthe...Informationinfo.txtInformatio...Read QC+Go / No-goBases: ≥ Min Length x Depth Go / No-goAssembliesassemblies/*.contigs.fastaAssemblies...Resultsresults.csvResults...Text is not SVG - cannot display \ No newline at end of file + + + + + + + + Output + + + + + + Input + + + + + + + + + + Data Type + + + + + + Data Type + + + + + + + + + + + + + + + Raw Reads + + + + *_{,R}{1,2}{,_001}.{fq,fastq}{,.gz} + + + + + + + Raw Reads... + + + + + + + + + + + + + + FASTQ (Reads) + + + + + + FASTQ (Reads) + + + + + + + + + S. 
Pneumo: > 60% + + + + + + Contigs: < 500 + + + Length: 1.9 - 2.3 Mb + + + Depth: ≥ 20x + + + + + + + + + + FASTA (Assemblies) + + + + + + FASTA (Assemblies) + + + + + + + + + + + SAM + + + + + + SAM + + + + + + + Ref Coverage: > 60% + + + Het-SNP site: < 220 + + + + + + + + + + Results + + + + + + Results + + + + + + + + + + + + + + + + + + Assembly + + + SPAdes + + + (Default: Shovill) + + + + + + + Asse... + + + + + + + + + + + + Mapping + + + BWA MEM + + + + + + Mapp... + + + + + + + + + + + + + Taxonomy & Taxonomy QC + + + + Kraken 2 + + + + + + + Taxo... + + + + + + + + + + + + Assembly QC + + + QUAST + + + + + + Asse... + + + + + + + + + + + + Mapping QC + + + + + SAMtools, BCFtools, + + custom script + + + + + + Mapp... + + + + + + + + + + + + + + Preprocess + + + fastp + + + + + + Prep... + + + + + + + + + + + + + + PBP AMR + + + CDC PBP AMR Predictor + + + + + + PBP... + + + + + + + + + + + + + MLST + + + mlst + + + + + + MLST... + + + + + + + + + + + + + + Overall QC + + + + + + + Over... + + + + + + + + + + + + + Lineage + + + PopPUNK + + + + + + Line... + + + + + + + + + + + + + Serotype + + + SeroBA + + + + + + Sero... + + + + + + + + + + + + + Other AMR + + + ARIBA, custom script + + + + + + Othe... + + + + + + + + + + + + Information + + + info.txt + + + + + + Informatio... + + + + + + + + Read QC + + + + + + + Go / No-go + + + + + Bases: + + + ≥ Min Length x Depth + + + + + + Go / No-go + + + + + + + + + + + + + Assemblies + + + assemblies/*.contigs.fasta + + + + + + Assemblies... + + + + + + + + + + + + Results + + + results.csv + + + + + + Results... 
+ + + + + + + + + Text is not SVG - cannot display + + + + \ No newline at end of file From 6b8def4e564d3f8d296a990c5e48b0ab106f77dc Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 17 Jul 2023 17:51:45 +0000 Subject: [PATCH 030/157] Add ARIBA-related options, and update credits Former-commit-id: 5905964b46720da1877b76c2f22f3dd99c00a682 --- README.md | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index df7cb44..aab4261 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca - [Taxonomy](#taxonomy) - [Serotype](#serotype) - [Lineage](#lineage) + - [Other AMR](#other-amr) - [Singularity](#singularity) - [Experimental](#experimental) - [Output](#output) @@ -236,6 +237,15 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `--poppunk_ext_remote` | Any valid URL to a PopPUNK external clusters file in `.csv` format(Default: [GPS v6 GPSC Designation](https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv)) | URL to a PopPUNK external clusters file. | | `--poppunk_local` | Any valid path(Default: `"$projectDir/databases/poppunk"`) | Path to the directory where the remote PopPUNK database and external clusters file should be saved to. | +## Other AMR + > ⚠️ `--ariba_db_local` does not accept user provided local database, directory content will be overwritten + + | Option | Values | Description | + | --- | ---| --- | + | `--ariba_ref` | Any valid path to a `.fa` or `.fasta` file(Default: `"$projectDir/data/ariba_ref_sequences-20230712.fasta"`) | Path to the reference sequences for ARIBA. | + | `--ariba_metadata` | Any valid path to a `tsv` file(Default: `"$projectDir/data/ariba_metadata-20230712.tsv"`) | Path to the metadata file for ARIBA. 
| + | `--ariba_db_local` | Any valid path(Default: `"$projectDir/databases/ariba"`) | Path to the directory where ARIBA reference database should be saved to. | + ## Singularity > ℹ️ This section is only valid when Singularity is used as the container engine @@ -336,12 +346,10 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca # Credits This project uses open-source components. You can find the homepage or source code of their open-source projects along with license information below. I acknowledge and am grateful to these developers for their contributions to open source. -[AMRsearch](https://github.com/pathogenwatch-oss/amr-search) -- [Pathogenwatch](https://pathogen.watch/) ([@pathogenwatch-oss](https://github.com/pathogenwatch-oss)) -- License (MIT): https://github.com/pathogenwatch-oss/amr-search/blob/main/LICENSE -- This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/amr-search) - - The fork changes the Docker image from a Docker executable image to a Docker environment for Nextflow integration - - The Docker image provides the containerised environment for `OTHER_RESISTANCE` process of the `amr.nf` module +[ARIBA](https://sanger-pathogens.github.io/ariba/) +- ARIBA: rapid antimicrobial resistance genotyping directly from sequencing reads Hunt M, Mather AE, Sánchez-Busó L, Page AJ, Parkhill J , Keane JA, Harris SR. Microbial Genomics 2017. doi: [110.1099/mgen.0.000131](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000131) +- License (GNU): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE +- This tool is used in `CREATE_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module [BCFtools](https://samtools.github.io/bcftools/) and [SAMtools](https://www.htslib.org/) - Twelve years of SAMtools and BCFtools. 
Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. **GigaScience**, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008 @@ -355,7 +363,7 @@ This project uses open-source components. You can find the homepage or source co - License (GPL-3.0): https://github.com/lh3/bwa/blob/master/COPYING - This tool is used in `GET_REF_GENOME_BWA_DB_PREFIX` and `MAPPING` processes of the `mapping.nf` module -[Docker Images](https://hub.docker.com/u/staphb) of [BCFtools](https://hub.docker.com/r/staphb/bcftools), [BWA](https://hub.docker.com/r/staphb/bwa), [fastp](https://hub.docker.com/r/staphb/fastp), [Kraken 2](https://hub.docker.com/r/staphb/kraken2), [mlst](https://hub.docker.com/r/staphb/mlst), [PopPUNK](https://hub.docker.com/r/staphb/poppunk), [QUAST](https://hub.docker.com/r/staphb/quast), [SAMtools](https://hub.docker.com/r/staphb/samtools), [SeroBA](https://hub.docker.com/r/staphb/seroba), [Shovill](https://hub.docker.com/r/staphb/shovill), [Unicycler](https://hub.docker.com/r/staphb/unicycler) +[Docker Images](https://hub.docker.com/u/staphb) of [BCFtools](https://hub.docker.com/r/staphb/bcftools), [BWA](https://hub.docker.com/r/staphb/bwa), [fastp](https://hub.docker.com/r/staphb/fastp), [Kraken 2](https://hub.docker.com/r/staphb/kraken2), [mlst](https://hub.docker.com/r/staphb/mlst), [PopPUNK](https://hub.docker.com/r/staphb/poppunk), [QUAST](https://hub.docker.com/r/staphb/quast), [SAMtools](https://hub.docker.com/r/staphb/samtools), [Shovill](https://hub.docker.com/r/staphb/shovill), [Unicycler](https://hub.docker.com/r/staphb/unicycler) - [State Public Health Bioinformatics Workgroup](https://staphb.org/) ([@StaPH-B](https://github.com/StaPH-B)) - License (GPL-3.0): https://github.com/StaPH-B/docker-builds/blob/master/LICENSE - These Docker images provide containerised environments for processes of multiple 
modules @@ -373,7 +381,7 @@ This project uses open-source components. You can find the homepage or source co [Docker Image of Python](https://hub.docker.com/_/python) - The Docker Community ([@docker-library](https://github.com/docker-library)) - License (MIT): https://github.com/docker-library/python/blob/master/LICENSE -- This Docker image provides the containerised environment for `HET_SNP_COUNT` process of the `mapping.nf` module +- This Docker image provides the containerised environment for `HET_SNP_COUNT` process of the `mapping.nf` module and `GET_OTHER_RESISTANCE` process of the `amr.nf` module [fastp](https://github.com/OpenGene/fastp) - Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, https://doi.org/10.1093/bioinformatics/bty560 @@ -393,7 +401,7 @@ This project uses open-source components. You can find the homepage or source co [mecA-HetSites-calculator](https://github.com/kumarnaren/mecA-HetSites-calculator) - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) - License (GPL-3.0): https://github.com/kumarnaren/mecA-HetSites-calculator/blob/master/LICENSE -- Code was rewritten into `HET_SNP_COUNT` process of the `mapping.nf` module +- Code was rewritten into the `het_snp_count.py` script used by `HET_SNP_COUNT` process of the `mapping.nf` module [mlst](https://github.com/tseemann/mlst) - Torsten Seemann ([@tseemann](https://github.com/tseemann)) @@ -419,7 +427,15 @@ This project uses open-source components. You can find the homepage or source co [SeroBA](https://sanger-pathogens.github.io/seroba/) - **SeroBA: rapid high-throughput serotyping of Streptococcus pneumoniae from whole genome sequence data**. 
Epping L, van Tonder, AJ, Gladstone RA, GPS Consortium, Bentley SD, Page AJ, Keane JA, Microbial Genomics 2018, doi: [10.1099/mgen.0.000186](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000186) - License (GPL-3.0): https://github.com/sanger-pathogens/seroba/blob/master/LICENSE -- This tool is used in `CREATE_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module +- This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/seroba) + - The fork integrates bug fixes + - The Docker image provides the containerised environment for `CREATE_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module + +[resistanceDatabase](https://github.com/kumarnaren/resistanceDatabase) +- Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) +- `sequences.fasta` is renamed to `ariba_ref_sequences-*.fasta` and used as-is +- `metadata.tsv` is renamed to `ariba_metadata-*.tsv` and modified +- The files are used as the default inputs of `CREATE_ARIBA_DB` process of the `amr.nf` module [Shovill](https://github.com/tseemann/shovill) - Torsten Seemann ([@tseemann](https://github.com/tseemann)) From 06c16d0078e92080da312b53eb94fab7b4b2a015 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 17 Jul 2023 19:31:40 +0000 Subject: [PATCH 031/157] Update License of ARIBA and resistanceDatabase Former-commit-id: 69009e2417f1106d201320ae733977035ce80e3c --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index aab4261..0bbc36f 100644 --- a/README.md +++ b/README.md @@ -348,7 +348,7 @@ This project uses open-source components. You can find the homepage or source co [ARIBA](https://sanger-pathogens.github.io/ariba/) - ARIBA: rapid antimicrobial resistance genotyping directly from sequencing reads Hunt M, Mather AE, Sánchez-Busó L, Page AJ, Parkhill J , Keane JA, Harris SR. Microbial Genomics 2017. 
doi: [110.1099/mgen.0.000131](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000131) -- License (GNU): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE +- License (GPL-3.0): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE - This tool is used in `CREATE_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module [BCFtools](https://samtools.github.io/bcftools/) and [SAMtools](https://www.htslib.org/) @@ -433,6 +433,7 @@ This project uses open-source components. You can find the homepage or source co [resistanceDatabase](https://github.com/kumarnaren/resistanceDatabase) - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) +- License (GPL-3.0): https://github.com/kumarnaren/resistanceDatabase/blob/main/LICENSE - `sequences.fasta` is renamed to `ariba_ref_sequences-*.fasta` and used as-is - `metadata.tsv` is renamed to `ariba_metadata-*.tsv` and modified - The files are used as the default inputs of `CREATE_ARIBA_DB` process of the `amr.nf` module From a980fd78b9a2ad5cdd1983bd7860ffc9c32735e3 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 18 Jul 2023 08:56:59 +0000 Subject: [PATCH 032/157] Update Output section based on ARIBA-based AMR Former-commit-id: d0ea4fc4d79134df1ca48eaa5a87155541e8abe2 --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0bbc36f..99535cb 100644 --- a/README.md +++ b/README.md @@ -323,24 +323,28 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `PEN_Res(Non-meningital)` | PBP AMR | Resistance phenotype against PEN in non-meningital form | | `CHL_Res` | Other AMR | Resistance phenotype against Chloramphenicol (CHL) | | `CHL_Determinant` | Other AMR | Known determinants that inferred the CHL resistance | - | `CLI_Res` | Other AMR | Resistance phenotype against Clindamycin (CLI) | - | `CLI_Determinant` | Other AMR | Known 
determinants that inferred the CLI resistance | | `ERY_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) | | `ERY_Determinant` | Other AMR | Known determinants that inferred the ERY resistance | | `FQ_Res` | Other AMR | Resistance phenotype against Fluoroquinolones (FQ) | | `FQ_Determinant` | Other AMR | Known determinants that inferred the FQ resistance | | `KAN_Res` | Other AMR | Resistance phenotype against Kanamycin (KAN) | | `KAN_Determinant` | Other AMR | Known determinants that inferred the KAN resistance | - | `LZO_Res` | Other AMR | Resistance phenotype against Linezolid (LZO) | - | `LZO_Determinant` | Other AMR | Known determinants that inferred the LZO resistance | | `TET_Res` | Other AMR | Resistance phenotype against Tetracycline (TET) | | `TET_Determinant` | Other AMR | Known determinants that inferred the TET resistance | | `TMP_Res` | Other AMR | Resistance phenotype against Trimethoprim (TMP) | | `TMP_Determinant` | Other AMR | Known determinants that inferred the TMP resistance | | `SMX_Res` | Other AMR | Resistance phenotype against Sulfamethoxazole (SMX) | | `SMX_Determinant` | Other AMR | Known determinants that inferred the SMX resistance | - | `COT_Res` | Other AMR | Resistance phenotype against Co-Trimoxazole (COT) | - | `COT_Determinant` | Other AMR | Known determinants that inferred the COT resistance | + | `ERY_CLI_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | + | `ERY_CLI_Determinant` | Other AMR | Known determinants that inferred the ERY and CLI resistance | + | `RIF_Res` | Other AMR | Resistance phenotype against Rifampin (RIF) | + | `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance | + | `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) | + | `VAN_Determinant` | Other AMR | Known determinants that inferred the VAN resistance | + | `PILI-1` | Other AMR | Expression of PILI-1 | + | `PILI-1_Determinant` | Other AMR | Known 
determinants that inferred the PILI-1 expression | + | `PILI-2` | Other AMR | Expression of PILI-2 | + | `PILI-2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression | # Credits From c9f27009998d652ff74f3f9d38ab931cb57dbc4d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 18 Jul 2023 15:13:51 +0000 Subject: [PATCH 033/157] Add AMR inference based on other AMR Former-commit-id: 08ba429a8b7cada1d5cd1ee0ec37ea3f520fe400 --- bin/get_other_resistance.py | 43 ++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index 29f8234..a4e42d1 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -70,7 +70,7 @@ # Go through targets in metadata for target in target_dict: - # + # If the target has no hit, set output as S or NEG (only for PILI-1/2), and determinant as _ if len(target_dict[target]) == 0: if target.lower().startswith('pili'): output[target] = 'NEG' @@ -78,6 +78,7 @@ output[f'{target}_Res'] = 'S' output[f'{target}_Determinant'] = '_' + # If the target has hit, set output as R or POS (only for PILI-1/2), and join all hits as determinant else: if target.lower().startswith('pili'): output[target] = 'POS' @@ -85,5 +86,45 @@ output[f'{target}_Res'] = 'R' output[f'{target}_Determinant'] = '; '.join(target_dict[target]) + + # Special cases to add to output + + # If TET exists and DOX does not: add DOX to output; directly copy output and determinant + if 'TET_Res' in output and 'DOX_Res' not in output: + output['DOX_Res'] = output['TET_Res'] + output['DOX_Determinant'] = output['TET_Determinant'] + + # If FQ exists and LFX does not: add LFX to output; directly copy output and determinant + if 'FQ_Res' in output and 'LFX_Res' not in output: + output['LFX_Res'] = output['FQ_Res'] + output['LFX_Determinant'] = output['FQ_Determinant'] + + # If both TMP and SMX exists, and COT 
does not: add COT to output. + # If R in both, COT is R; if R in one of them, COT is I; if S in both, COT is S + # Copy TMP_Determinant and SMX_Determinant to COT_Determinant + if 'TMP_Res' in output and 'SMX_Res' in output and 'COT_Res' not in output: + if output['TMP_Res'] == 'R' and output['SMX_Res'] == 'R': + output['COT_Res'] = 'R' + output['COT_Determinant'] = '; '.join(target_dict['TMP'].union(target_dict['SMX'])) + elif (output['TMP_Res'] == 'R') ^ (output['SMX_Res'] == 'R'): + output['COT_Res'] = 'I' + output['COT_Determinant'] = '; '.join(target_dict['TMP'].union(target_dict['SMX'])) + elif output['TMP_Res'] == 'S' and output['SMX_Res'] == 'S': + output['COT_Res'] = 'S' + output['COT_Determinant'] = '_' + + # If ERY_CLI exists, add ERY and CLI to output. + # If ERY_CLI is R, ERY and CLI are R, and add ERY_CLI determinant to their determinants + # If ERY_CLI is S, ERY and CLI are S if they do not already exist, otherwise leave them unchanged + if 'ERY_CLI_Res' in output: + if output['ERY_CLI_Res'] == 'R': + output['ERY_Res'] = 'R' + output['CLI_Res'] = 'R' + elif output['ERY_CLI_Res'] == 'S': + output['ERY_Res'] = output['ERY_Res'] if 'ERY_Res' in output else 'S' + output['CLI_Res'] = output['CLI_Res'] if 'CLI_Res' in output else 'S' + output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict else output['ERY_CLI_Determinant'] + output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict else output['ERY_CLI_Determinant'] + print(json.dumps(output, indent=4)) \ No newline at end of file From baaacb7132dad659984c3f7971df0bc7a5fb4b52 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 18 Jul 2023 15:31:34 +0000 Subject: [PATCH 034/157] Include new AMR in results.csv Former-commit-id: 82c6fa948e5c99d92651479f385d6027e4d9d792 --- bin/get_other_resistance.sh | 12 ++++++++++-- modules/amr.nf | 2 +- 
workflows/pipeline.nf | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/bin/get_other_resistance.sh b/bin/get_other_resistance.sh index fdd5f58..befd4a4 100755 --- a/bin/get_other_resistance.sh +++ b/bin/get_other_resistance.sh @@ -10,18 +10,26 @@ CHL_Res=$(GET_VALUE "CHL_Res") CHL_Determinant=$(GET_VALUE "CHL_Determinant") ERY_Res=$(GET_VALUE "ERY_Res") ERY_Determinant=$(GET_VALUE "ERY_Determinant") +CLI_Res=$(GET_VALUE "CLI_Res") +CLI_Determinant=$(GET_VALUE "CLI_Determinant") +ERY_CLI_Res=$(GET_VALUE "ERY_CLI_Res") +ERY_CLI_Determinant=$(GET_VALUE "ERY_CLI_Determinant") FQ_Res=$(GET_VALUE "FQ_Res") FQ_Determinant=$(GET_VALUE "FQ_Determinant") +LFX_Res=$(GET_VALUE "LFX_Res") +LFX_Determinant=$(GET_VALUE "LFX_Determinant") KAN_Res=$(GET_VALUE "KAN_Res") KAN_Determinant=$(GET_VALUE "KAN_Determinant") TET_Res=$(GET_VALUE "TET_Res") TET_Determinant=$(GET_VALUE "TET_Determinant") +DOX_Res=$(GET_VALUE "DOX_Res") +DOX_Determinant=$(GET_VALUE "DOX_Determinant") TMP_Res=$(GET_VALUE "TMP_Res") TMP_Determinant=$(GET_VALUE "TMP_Determinant") SMX_Res=$(GET_VALUE "SMX_Res") SMX_Determinant=$(GET_VALUE "SMX_Determinant") -ERY_CLI_Res=$(GET_VALUE "ERY_CLI_Res") -ERY_CLI_Determinant=$(GET_VALUE "ERY_CLI_Determinant") +COT_Res=$(GET_VALUE "COT_Res") +COT_Determinant=$(GET_VALUE "COT_Determinant") RIF_Res=$(GET_VALUE "RIF_Res") RIF_Determinant=$(GET_VALUE "RIF_Determinant") VAN_Res=$(GET_VALUE "VAN_Res") diff --git a/modules/amr.nf b/modules/amr.nf index 19429f7..6a2a0bf 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -102,7 +102,7 @@ process GET_OTHER_RESISTANCE { path metadata output: - tuple val(sample_id), env(CHL_Res), env(CHL_Determinant), env(ERY_Res), env(ERY_Determinant), env(FQ_Res), env(FQ_Determinant), env(KAN_Res), env(KAN_Determinant), env(TET_Res), env(TET_Determinant), env(TMP_Res), env(TMP_Determinant), env(SMX_Res), env(SMX_Determinant), env(ERY_CLI_Res), env(ERY_CLI_Determinant), env(RIF_Res), env(RIF_Determinant), env(VAN_Res), 
env(VAN_Determinant), env(PILI1), env(PILI1_Determinant), env(PILI2), env(PILI2_Determinant), emit: result + tuple val(sample_id), env(CHL_Res), env(CHL_Determinant), env(ERY_Res), env(ERY_Determinant), env(CLI_Res), env(CLI_Determinant), env(ERY_CLI_Res), env(ERY_CLI_Determinant), env(FQ_Res), env(FQ_Determinant), env(LFX_Res), env(LFX_Determinant), env(KAN_Res), env(KAN_Determinant), env(TET_Res), env(TET_Determinant), env(DOX_Res), env(DOX_Determinant), env(TMP_Res), env(TMP_Determinant), env(SMX_Res), env(SMX_Determinant), env(COT_Res), env(COT_Determinant), env(RIF_Res), env(RIF_Determinant), env(VAN_Res), env(VAN_Determinant), env(PILI1), env(PILI1_Determinant), env(PILI2), env(PILI2_Determinant), emit: result script: """ diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index bdead12..6dc59fe 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -196,7 +196,7 @@ workflow PIPELINE { 'Serotype', 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'FQ_Res', 'FQ_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI-1', 'PILI-1_Determinant', 'PILI-2', 'PILI-2_Determinant' + 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 
'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' ].join(','), sort: { it.split(',')[0] }, newLine: true From e2ff36e522930058a06f981fe936ee79219f7a6a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 18 Jul 2023 15:40:14 +0000 Subject: [PATCH 035/157] Fixing ERY and CLI determinant output when empty Former-commit-id: c2d321ce266856f17b8bc09f80ce52313c53c0dc --- bin/get_other_resistance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index a4e42d1..4f71294 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -124,7 +124,7 @@ output['ERY_Res'] = output['ERY_Res'] if 'ERY_Res' in output else 'S' output['CLI_Res'] = output['CLI_Res'] if 'CLI_Res' in output else 'S' - output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict else output['ERY_CLI_Determinant'] - output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict else output['ERY_CLI_Determinant'] + output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict and len(target_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] + output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict and len(target_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] print(json.dumps(output, indent=4)) \ No newline at end of file From f217c516fe5e0f99cb6b2ebfb28960aae4787884 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 18 Jul 2023 15:59:33 +0000 Subject: [PATCH 036/157] Add information on new AMR Former-commit-id: 7de88411bad5ac33ac7f9729377aeaa16678678f --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 
deletions(-) diff --git a/README.md b/README.md index 99535cb..e0a2220 100644 --- a/README.md +++ b/README.md @@ -325,18 +325,26 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `CHL_Determinant` | Other AMR | Known determinants that inferred the CHL resistance | | `ERY_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) | | `ERY_Determinant` | Other AMR | Known determinants that inferred the ERY resistance | + | `CLI_Res` | Other AMR | Resistance phenotype against Clindamycin (CLI) | + | `CLI_Determinant` | Other AMR | Known determinants that inferred the CLI resistance | + | `ERY_CLI_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | + | `ERY_CLI_Determinant` | Other AMR | Known determinants that inferred the ERY and CLI resistance | | `FQ_Res` | Other AMR | Resistance phenotype against Fluoroquinolones (FQ) | | `FQ_Determinant` | Other AMR | Known determinants that inferred the FQ resistance | + | `LFX_Res` | Other AMR | Resistance phenotype against Levofloxacin (LFX) | + | `LFX_Determinant` | Other AMR | Known determinants that inferred the LFX resistance | | `KAN_Res` | Other AMR | Resistance phenotype against Kanamycin (KAN) | | `KAN_Determinant` | Other AMR | Known determinants that inferred the KAN resistance | | `TET_Res` | Other AMR | Resistance phenotype against Tetracycline (TET) | | `TET_Determinant` | Other AMR | Known determinants that inferred the TET resistance | + | `DOX_Res` | Other AMR | Resistance phenotype against Doxycycline (DOX) | + | `DOX_Determinant` | Other AMR | Known determinants that inferred the DOX resistance | | `TMP_Res` | Other AMR | Resistance phenotype against Trimethoprim (TMP) | | `TMP_Determinant` | Other AMR | Known determinants that inferred the TMP resistance | | `SMX_Res` | Other AMR | Resistance phenotype against Sulfamethoxazole (SMX) | | `SMX_Determinant` | Other AMR | Known determinants that inferred the SMX resistance | - | 
`ERY_CLI_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | - | `ERY_CLI_Determinant` | Other AMR | Known determinants that inferred the ERY and CLI resistance | + | `COT_Res` | Other AMR | Resistance phenotype against Co-Trimoxazole (COT) | + | `COT_Determinant` | Other AMR | Known determinants that inferred the COT resistance | | `RIF_Res` | Other AMR | Resistance phenotype against Rifampin (RIF) | | `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance | | `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) | From 5ebf6047cdf77b95dfc18bbd7899e335cbdc5866 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:17:17 +0000 Subject: [PATCH 037/157] Ensure version of Python 3 is captured Former-commit-id: e8a2905c8a533ea2535f861815071d1008667eee --- modules/info.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/info.nf b/modules/info.nf index 4cabccb..90cbc5b 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -445,7 +445,7 @@ process PYTHON_VERSION { shell: $/ - VERSION=$(python --version | sed -r "s/.*\s(.+)/\1/") + VERSION=$(python3 --version | sed -r "s/.*\s(.+)/\1/") /$ } From a291074aa544664df09a6b9af74a90426b9cd2ee Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:20:26 +0000 Subject: [PATCH 038/157] Remove unnecessary versioning of Het-SNP Counter Former-commit-id: dc1064e8b6eac5887a968bef99a2b3d117d18532 --- modules/info.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/info.nf b/modules/info.nf index 90cbc5b..cde8662 100644 --- a/modules/info.nf +++ b/modules/info.nf @@ -245,7 +245,6 @@ process PARSE { |${toolTextRow('BWA', 'bwa')} |${toolTextRow('SAMtools', 'samtools')} |${toolTextRow('BCFtools', 'bcftools')} - |${toolTextRow('Het-SNP Counter', 'het_snp_count')} |${toolTextRow('PopPUNK', 'poppunk')} 
|${toolTextRow('CDC PBP AMR Predictor', 'spn_pbp_amr')} |${toolTextRow('ARIBA', 'ariba')} From ebd212ea124576beb8a15425292c5858347e433c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:20:51 +0000 Subject: [PATCH 039/157] Change default Python image to include Pandas Former-commit-id: 47bb796fbbb8fcb79e0d78a79ebf303940f49cd5 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 1ab6cb0..1a322f1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,7 +66,7 @@ process { container = 'bitnami/git:2.39.0' } withLabel: python_container { - container = 'python:3.11.1-bullseye' + container = 'amancevice/pandas:2.0.2-slim' } withLabel: fastp_container { container = 'staphb/fastp:0.23.2' From af43d4f077c3dd15e00fa18ab8e19395bade2d3c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:21:36 +0000 Subject: [PATCH 040/157] Improve Docker Image capturing Former-commit-id: 231b7b1ae78c1fde380b7056da22c3202b8940a8 --- bin/get_images_info.sh | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/bin/get_images_info.sh b/bin/get_images_info.sh index ba5428f..95dd83f 100755 --- a/bin/get_images_info.sh +++ b/bin/get_images_info.sh @@ -1,25 +1,25 @@ # Extract containers information from nextflow.config and save into a JSON file -IMAGES=$(grep -E "container\s?=" $NEXTFLOW_CONFIG \ - | sort -u \ - | sed -r "s/\s+container\s?=\s?'(.+)'/\1/") +find_image () { + grep -E "container\s?=" -B 1 $NEXTFLOW_CONFIG | grep -v -- "^--$" | paste - - | sort -u | grep $1 | sed -r "s/.+container\s?=\s?'(.+)'/\1/" +} -BASH=$(grep network-multitool <<< $IMAGES) -GIT=$(grep git <<< $IMAGES) -PYTHON=$(grep python <<< $IMAGES) -FASTP=$(grep fastp <<< $IMAGES) -UNICYCLER=$(grep unicycler <<< $IMAGES) -SHOVILL=$(grep shovill <<< $IMAGES) -QUAST=$(grep quast <<< 
$IMAGES) -BWA=$(grep bwa <<< $IMAGES) -SAMTOOLS=$(grep samtools <<< $IMAGES) -BCFTOOLS=$(grep bcftools <<< $IMAGES) -POPPUNK=$(grep poppunk <<< $IMAGES) -SPN_PBP_AMR=$(grep spn-pbp-amr <<< $IMAGES) -ARIBA=$(grep ariba <<< $IMAGES) -MLST=$(grep mlst <<< $IMAGES) -KRAKEN2=$(grep kraken2 <<< $IMAGES) -SEROBA=$(grep seroba <<< $IMAGES) +BASH=$(find_image bash) +GIT=$(find_image git) +PYTHON=$(find_image python) +FASTP=$(find_image fastp) +UNICYCLER=$(find_image unicycler) +SHOVILL=$(find_image shovill) +QUAST=$(find_image quast) +BWA=$(find_image bwa) +SAMTOOLS=$(find_image samtools) +BCFTOOLS=$(find_image bcftools) +POPPUNK=$(find_image poppunk) +SPN_PBP_AMR=$(find_image spn-pbp-amr) +ARIBA=$(find_image ariba) +MLST=$(find_image mlst) +KRAKEN2=$(find_image kraken2) +SEROBA=$(find_image seroba) add_container () { jq -n --arg container $1 '.container = $container' From a0877b052c29f3d4f290fdbf54083647a311f627 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 20 Jul 2023 16:16:27 +0000 Subject: [PATCH 041/157] Save reports as .csv Former-commit-id: 5d072252de429525558fb157ee858317c566b71d --- bin/assembly_qc.sh | 3 +++ bin/mapping_qc.sh | 3 +++ bin/overall_qc.sh | 7 ++++++- bin/read_qc.sh | 3 +++ bin/taxonomy_qc.sh | 3 +++ modules/assembly.nf | 4 +++- modules/mapping.nf | 4 +++- modules/overall_qc.nf | 6 +++++- modules/preprocess.nf | 3 +++ modules/taxonomy.nf | 4 +++- 10 files changed, 35 insertions(+), 5 deletions(-) diff --git a/bin/assembly_qc.sh b/bin/assembly_qc.sh index 160ed72..9d399fc 100755 --- a/bin/assembly_qc.sh +++ b/bin/assembly_qc.sh @@ -9,3 +9,6 @@ if (( $CONTIGS < $QC_CONTIGS )) && (( $LENGTH >= $QC_LENGTH_LOW )) && (( $LENGTH else ASSEMBLY_QC="FAIL" fi + +echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > $ASSEMBLY_QC_REPORT +echo \"$ASSEMBLY_QC\",\"$CONTIGS\",\"$LENGTH\",\"$DEPTH\" >> $ASSEMBLY_QC_REPORT \ No newline at end of file diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh 
index 450e871..75b18a0 100755 --- a/bin/mapping_qc.sh +++ b/bin/mapping_qc.sh @@ -7,3 +7,6 @@ if (( $(echo "$COVERAGE > $QC_REF_COVERAGE" | bc -l) )) && (( $HET_SNP < $QC_HET else MAPPING_QC="FAIL" fi + +echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > $MAPPING_QC_REPORT +echo \"$MAPPING_QC\",\"$COVERAGE\",\"$QC_HET_SNP_SITE\" >> $MAPPING_QC_REPORT \ No newline at end of file diff --git a/bin/overall_qc.sh b/bin/overall_qc.sh index ccdb803..de7e116 100755 --- a/bin/overall_qc.sh +++ b/bin/overall_qc.sh @@ -1,10 +1,15 @@ # Determine overall QC result based on Assembly QC, Mapping QC and Taxonomy QC # In case of assembler failure, there will be no Assembly QC input, hence output result as ASSEMBLER FAILURE -if [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then +if [[ "$READ_QC" == "PASS" ]] && [[ "$ASSEMBLY_QC" == "PASS" ]] && [[ "$MAPPING_QC" == "PASS" ]] && [[ "$TAXONOMY_QC" == "PASS" ]]; then OVERALL_QC="PASS" +elif [[ "$READ_QC" == "FAIL" ]]; then + OVERALL_QC="FAIL" elif [[ "$ASSEMBLY_QC" == "null" ]]; then OVERALL_QC="ASSEMBLER FAILURE" else OVERALL_QC="FAIL" fi + +echo \"Overall_QC\" > $OVERALL_QC_REPORT +echo \"$OVERALL_QC\" >> $OVERALL_QC_REPORT \ No newline at end of file diff --git a/bin/read_qc.sh b/bin/read_qc.sh index 040c6ec..6ce8382 100755 --- a/bin/read_qc.sh +++ b/bin/read_qc.sh @@ -7,3 +7,6 @@ if (( $(echo "$BASES >= ($QC_LENGTH_LOW*$QC_DEPTH)" | bc -l) )); then else READ_QC="FAIL" fi + +echo \"Read_QC\",\"Bases\" > $READ_QC_REPORT +echo \"$READ_QC\",\"$BASES\" >> $READ_QC_REPORT \ No newline at end of file diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh index c468b14..23254b1 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/taxonomy_qc.sh @@ -11,3 +11,6 @@ if (( $(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l) )); then else TAXONOMY_QC="FAIL" fi + +echo \"Taxonomy_QC\",\"S.Pneumo_%\" > $TAXONOMY_QC_REPORT +echo \"$TAXONOMY_QC\",\"$PERCENTAGE\" >> $TAXONOMY_QC_REPORT \ No newline at end of 
file diff --git a/modules/assembly.nf b/modules/assembly.nf index 4699289..fd84c76 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -85,10 +85,11 @@ process ASSEMBLY_QC { val(qc_depth) output: - tuple val(sample_id), env(CONTIGS), env(LENGTH), env(DEPTH), emit: info tuple val(sample_id), env(ASSEMBLY_QC), emit: result + tuple val(sample_id), path(assembly_qc_report), emit: report script: + assembly_qc_report='assembly_qc_report.csv' """ REPORT="$report" BASES="$bases" @@ -96,6 +97,7 @@ process ASSEMBLY_QC { QC_LENGTH_LOW="$qc_length_low" QC_LENGTH_HIGH="$qc_length_high" QC_DEPTH="$qc_depth" + ASSEMBLY_QC_REPORT="$assembly_qc_report" source assembly_qc.sh """ diff --git a/modules/mapping.nf b/modules/mapping.nf index f0d1e0e..0a37628 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -138,15 +138,17 @@ process MAPPING_QC { val(qc_het_snp_site) output: - tuple val(sample_id), env(COVERAGE), env(HET_SNP), emit: info tuple val(sample_id), env(MAPPING_QC), emit: result + tuple val(sample_id), path(mapping_qc_report), emit: report script: + mapping_qc_report='mapping_qc_report.csv' """ COVERAGE="$ref_coverage" HET_SNP="$het_snp_count" QC_REF_COVERAGE="$qc_ref_coverage" QC_HET_SNP_SITE="$qc_het_snp_site" + MAPPING_QC_REPORT="$mapping_qc_report" source mapping_qc.sh """ diff --git a/modules/overall_qc.nf b/modules/overall_qc.nf index b212595..fa639d9 100644 --- a/modules/overall_qc.nf +++ b/modules/overall_qc.nf @@ -6,16 +6,20 @@ process OVERALL_QC { tag "$sample_id" input: - tuple val(sample_id), val(assembly_qc), val(mapping_qc), val(taxonomy_qc) + tuple val(sample_id), val(read_qc), val(assembly_qc), val(mapping_qc), val(taxonomy_qc) output: tuple val(sample_id), env(OVERALL_QC), emit: result + tuple val(sample_id), path(overall_qc_report), emit: report script: + overall_qc_report='overall_qc_report.csv' """ + READ_QC="$read_qc" ASSEMBLY_QC="$assembly_qc" MAPPING_QC="$mapping_qc" TAXONOMY_QC="$taxonomy_qc" + OVERALL_QC_REPORT="$overall_qc_report" 
source overall_qc.sh """ diff --git a/modules/preprocess.nf b/modules/preprocess.nf index 4e8e18c..e04b756 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -38,12 +38,15 @@ process READ_QC { output: tuple val(sample_id), env(BASES), emit: bases tuple val(sample_id), env(READ_QC), emit: result + tuple val(sample_id), path(read_qc_report), emit: report script: + read_qc_report='read_qc_report.csv' """ JSON="$json" QC_LENGTH_LOW="$qc_length_low" QC_DEPTH="$qc_depth" + READ_QC_REPORT="$read_qc_report" source read_qc.sh """ diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index af6266d..b4d1e62 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -63,13 +63,15 @@ process TAXONOMY_QC { val(qc_spneumo_percentage) output: - tuple val(sample_id), env(PERCENTAGE), emit: percentage tuple val(sample_id), env(TAXONOMY_QC), emit: result + tuple val(sample_id), path(taxonomy_qc_report), emit: report script: + taxonomy_qc_report='taxonomy_qc_report.csv' """ KRAKEN2_REPORT="$kraken2_report" QC_SPNEUMO_PERCENTAGE="$qc_spneumo_percentage" + TAXONOMY_QC_REPORT="$taxonomy_qc_report" source taxonomy_qc.sh """ From d3d28be81945b08be19ea3be79a8a3eea20bf8c6 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 20 Jul 2023 16:17:14 +0000 Subject: [PATCH 042/157] Combining reports to generate sample report Former-commit-id: ea1b1bab32ded4e1ba187b693acfed0e91bc733f --- bin/generate_sample_report.sh | 3 +++ modules/output.nf | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100755 bin/generate_sample_report.sh create mode 100644 modules/output.nf diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh new file mode 100755 index 0000000..ec769f3 --- /dev/null +++ b/bin/generate_sample_report.sh @@ -0,0 +1,3 @@ +paste -d , *.csv \ +| sed '1 s/^/\"Sample_ID\",/' \ +| sed "2 s/^/\"${SAMPLE_ID}\",/" > $SAMPLE_REPORT \ No newline at end of file diff --git a/modules/output.nf 
b/modules/output.nf new file mode 100644 index 0000000..a711425 --- /dev/null +++ b/modules/output.nf @@ -0,0 +1,21 @@ +process GENERATE_SAMPLE_REPORT { + label 'bash_container' + label 'farm_low' + + tag "$sample_id" + + input: + tuple val(sample_id), path ('report*.csv') + + output: + path sample_report + + script: + sample_report="${sample_id}_report.csv" + """ + SAMPLE_ID=$sample_id + SAMPLE_REPORT=$sample_report + + source generate_sample_report.sh + """ +} \ No newline at end of file From 9c5c5be5c22629f023ea755f04ab1ea5206d77c2 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 20 Jul 2023 16:17:23 +0000 Subject: [PATCH 043/157] Initial work on output revamp (WIP) Former-commit-id: 01b22b87a70de4c9077b879c74a69aeeb03fedbd --- workflows/pipeline.nf | 110 ++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 48 deletions(-) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 6dc59fe..ef8f650 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -8,6 +8,7 @@ include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/ include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" +include { GENERATE_SAMPLE_REPORT } from "$projectDir/modules/output" // Main pipeline workflow workflow PIPELINE { @@ -103,9 +104,10 @@ workflow PIPELINE { // Merge Channels ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status // Output into Channel OVERALL_QC.out.result OVERALL_QC( - ASSEMBLY_QC.out.result + READ_QC.out.result + .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) - .join(TAXONOMY_QC.out.result, failOnDuplicate: 
true) + .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) ) // From Channel READ_QC_PASSED_READS_ch, only output reads of samples passed overall QC based on Channel OVERALL_QC.out.result @@ -155,52 +157,64 @@ workflow PIPELINE { // GET_OTHER_RESISTANCE.out.result // // Replace null with approiate amount of "_" items when sample_id does not exist in that output (i.e. QC rejected) - READ_QC.out.result - .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(OVERALL_QC.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['FAIL'] : it } - .join(READ_QC.out.bases, failOnDuplicate: true, failOnMismatch: true) - .join(ASSEMBLY_QC.out.info, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 3 : it } - .join(MAPPING_QC.out.info, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 2 : it } - .join(TAXONOMY_QC.out.percentage, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(LINEAGE.out.csv.splitCsv(skip: 1), failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(SEROTYPE.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - .join(MLST.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } - .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } - .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) - .map { (it[-1] == null) ? 
it[0..-2] + ['_'] * 24 : it } - .map { it.collect {"\"$it\""}.join',' } - .collectFile( - name: 'results.csv', - storeDir: "$params.output", - seed: [ - 'Sample_ID', - 'Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC', - 'Bases', - 'Contigs#' , 'Assembly_Length', 'Seq_Depth', - 'Ref_Cov_%', 'Het-SNP#' , - 'S.Pneumo_%', - 'GPSC', - 'Serotype', - 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', - 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' - ].join(','), - sort: { it.split(',')[0] }, - newLine: true - ) + + GENERATE_SAMPLE_REPORT( + READ_QC.out.report + .join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true) + .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) + .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) + .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) + .map { [it[0], it[1..-1].minus(null)] } + ).view() + + // GENERATE_OVERALL_REPORT + + // READ_QC.out.result + // .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? 
it[0..-2] + ['_'] : it } + // .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(OVERALL_QC.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['FAIL'] : it } + // .join(READ_QC.out.bases, failOnDuplicate: true, failOnMismatch: true) + // .join(ASSEMBLY_QC.out.info, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 3 : it } + // .join(MAPPING_QC.out.info, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 2 : it } + // .join(TAXONOMY_QC.out.percentage, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(LINEAGE.out.csv.splitCsv(skip: 1), failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(SEROTYPE.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } + // .join(MLST.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } + // .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } + // .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) + // .map { (it[-1] == null) ? 
it[0..-2] + ['_'] * 24 : it } + // .map { it.collect {"\"$it\""}.join',' } + // .collectFile( + // name: 'results.csv', + // storeDir: "$params.output", + // seed: [ + // 'Sample_ID', + // 'Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC', + // 'Bases', + // 'Contigs#' , 'Assembly_Length', 'Seq_Depth', + // 'Ref_Cov_%', 'Het-SNP#' , + // 'S.Pneumo_%', + // 'GPSC', + // 'Serotype', + // 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', + // 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', + // 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' + // ].join(','), + // sort: { it.split(',')[0] }, + // newLine: true + // ) // Pass to SAVE_INFO sub-workflow DATABASES_INFO = CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } From 897d4a297fb1ca149a0f6198f2ebdcc2047d8b1f Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 11:40:03 +0000 Subject: [PATCH 044/157] Save Lineage report per sample as .csv Former-commit-id: d0c5123b34773a7832dd7ff7ea3740c078c29dda --- bin/get_lineage.sh | 13 +++++++++++++ modules/lineage.nf | 15 ++++++++------- workflows/pipeline.nf | 1 + 3 files changed, 22 insertions(+), 7 deletions(-) create mode 100755 bin/get_lineage.sh diff --git a/bin/get_lineage.sh b/bin/get_lineage.sh new file mode 100755 index 
0000000..70e4f08 --- /dev/null +++ b/bin/get_lineage.sh @@ -0,0 +1,13 @@ +# Run PopPUNK to assign GPSCs to samples + +# Add "prefix_" to all sample names in qfile to avoid poppunk_assign crashing due to sample name already exists in database +# Remove "prefix_" from all sample names in the result + +# Save results of individual sample into .csv with its name as filename + +sed 's/^/prefix_/' "$QFILE" > safe_qfile.txt +poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads $(nproc) +sed 's/^prefix_//' output/output_external_clusters.csv > result.txt + + +awk -F , 'NR!=1 { print "GPSC\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file diff --git a/modules/lineage.nf b/modules/lineage.nf index 6e13fab..68edae3 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -46,8 +46,7 @@ process GET_POPPUNK_EXT_CLUSTERS { } // Run PopPUNK to assign GPSCs to samples -// Add "prefix_" to all sample names in qfile to avoid poppunk_assign crashing due to sample name already exists in database -// Remove "prefix_" from all sample names in the output +// Save results of individual sample into .csv with its name as filename process LINEAGE { label 'poppunk_container' label 'farm_high' @@ -63,13 +62,15 @@ process LINEAGE { path qfile output: - path(result), emit: csv + path '*.csv', emit: reports script: - result='result.csv' """ - sed 's/^/prefix_/' "$qfile" > safe_qfile.txt - poppunk_assign --db "${poppunk_dir}/${db_name}" --external-clustering "${poppunk_dir}/${ext_clusters_file}" --query safe_qfile.txt --output output --threads `nproc` - sed 's/^prefix_//' output/output_external_clusters.csv > "$result" + QFILE="$qfile" + POPPUNK_DIR="$poppunk_dir" + DB_NAME="$db_name" + EXT_CLUSTERS_FILE="$ext_clusters_file" + + source get_lineage.sh """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index ef8f650..688e928 100644 --- a/workflows/pipeline.nf 
+++ b/workflows/pipeline.nf @@ -164,6 +164,7 @@ workflow PIPELINE { .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) + .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From b19ffa403b9d72a249c7d66a668f05ee1336ef39 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 14:04:07 +0000 Subject: [PATCH 045/157] Add quote to csv header Former-commit-id: bfcca3163f37354fda219cfab3c2feec9ddd01f6 --- bin/get_lineage.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get_lineage.sh b/bin/get_lineage.sh index 70e4f08..63b6ec0 100755 --- a/bin/get_lineage.sh +++ b/bin/get_lineage.sh @@ -10,4 +10,4 @@ poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK sed 's/^prefix_//' output/output_external_clusters.csv > result.txt -awk -F , 'NR!=1 { print "GPSC\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file +awk -F , 'NR!=1 { print "\"GPSC\"\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file From 3811c8be69d36a384b6dca1b190a7860cced052d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 14:05:13 +0000 Subject: [PATCH 046/157] Save Serotype report as .csv Former-commit-id: 897aa091a42ef4372a0476bea63114de9c51eef8 --- bin/get_serotype.sh | 3 +++ modules/serotype.nf | 6 +++++- workflows/pipeline.nf | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/bin/get_serotype.sh b/bin/get_serotype.sh index b17c2de..80bfcc7 100755 --- a/bin/get_serotype.sh +++ b/bin/get_serotype.sh @@ -4,3 +4,6 @@ } || { 
SEROTYPE="SEROBA FAILURE" } + +echo \"Serotype\" > $SEROTYPE_REPORT +echo \"$SEROTYPE\" >> $SEROTYPE_REPORT \ No newline at end of file diff --git a/modules/serotype.nf b/modules/serotype.nf index 5c268fc..0c69bad 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -66,9 +66,10 @@ process SEROTYPE { tuple val(sample_id), path(read1), path(read2), path(unpaired) output: - tuple val(sample_id), env(SEROTYPE), emit: result + tuple val(sample_id), path(serotype_report), emit: report script: + serotype_report='serotype_report.csv' // When using Singularity as container engine, SeroBA sometimes gives incorrect result or critical error // Uncertain root cause, happen randomly when input are located directly in a Nextflow process work directory // Workaround: create and use a subdirectory to alter the path @@ -79,6 +80,7 @@ process SEROTYPE { READ1="$read1" READ2="$read2" SAMPLE_ID="$sample_id" + SEROTYPE_REPORT="$serotype_report" source get_serotype.sh """ @@ -89,12 +91,14 @@ process SEROTYPE { READ1="$read1" READ2="$read2" SAMPLE_ID="$sample_id" + SEROTYPE_REPORT="$serotype_report" mkdir SEROBA_WORKDIR && mv $seroba_dir $read1 $read2 SEROBA_WORKDIR && cd SEROBA_WORKDIR source get_serotype.sh cd ../ + mv SEROBA_WORKDIR/$serotype_report ./ """ else error "The process must be run with Docker or Singularity as container engine." 
diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 688e928..257d4dd 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -164,6 +164,7 @@ workflow PIPELINE { .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) + .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From 41287de8e37d2dd86a27fa0cc0c2f5ab62e13c74 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 14:18:04 +0000 Subject: [PATCH 047/157] Save MLST report as .csv Former-commit-id: 58a549f948e3c9581779f1b753448bdc8e7acfc6 --- bin/get_mlst.sh | 3 +++ modules/mlst.nf | 4 +++- workflows/pipeline.nf | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/get_mlst.sh b/bin/get_mlst.sh index 7e5c61a..ab7c8e9 100755 --- a/bin/get_mlst.sh +++ b/bin/get_mlst.sh @@ -12,3 +12,6 @@ recP=$(awk -F'\t' 'FNR == 2 {print $7}' $OUTPUT) spi=$(awk -F'\t' 'FNR == 2 {print $8}' $OUTPUT) xpt=$(awk -F'\t' 'FNR == 2 {print $9}' $OUTPUT) ddl=$(awk -F'\t' 'FNR == 2 {print $10}' $OUTPUT) + +echo \"ST\",\"aroE\",\"gdh\",\"gki\",\"recP\",\"spi\",\"xpt\",\"ddl\" > $MLST_REPORT +echo \"$ST\",\"$aroE\",\"$gdh\",\"$gki\",\"$recP\",\"$spi\",\"$xpt\",\"$ddl\" >> $MLST_REPORT \ No newline at end of file diff --git a/modules/mlst.nf b/modules/mlst.nf index cc766d4..c8d12e4 100644 --- a/modules/mlst.nf +++ b/modules/mlst.nf @@ -9,11 +9,13 @@ process MLST { tuple val(sample_id), path(assembly) output: - tuple val(sample_id), env(ST), env(aroE), env(gdh), env(gki), env(recP), env(spi), env(xpt), env(ddl), emit: result + 
tuple val(sample_id), path(mlst_report), emit: report script: + mlst_report='mlst_report.csv' """ ASSEMBLY="$assembly" + MLST_REPORT="$mlst_report" source get_mlst.sh """ diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 257d4dd..32089a9 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -165,6 +165,7 @@ workflow PIPELINE { .join(TAXONOMY_QC.out.report, failOnDuplicate: true, remainder: true) .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) + .join(MLST.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From 6faf88f4ddab167c23fcde04efe2ecd14e65bd60 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 15:21:49 +0000 Subject: [PATCH 048/157] Save PBP AMR report as .csv Former-commit-id: 51aebdc959fab1e66639e7567ac369175169f5a8 --- bin/get_pbp_resistance.sh | 3 +++ modules/amr.nf | 4 +++- workflows/pipeline.nf | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/get_pbp_resistance.sh b/bin/get_pbp_resistance.sh index d7082eb..5e833c3 100755 --- a/bin/get_pbp_resistance.sh +++ b/bin/get_pbp_resistance.sh @@ -30,3 +30,6 @@ MER=$(GET_RES "mem") PEN_MIC=$(GET_VALUE "penMic") PEN_NONMENINGITIS=$(GET_RES "penNonMeningitis") PEN_MENINGITIS=$(GET_RES "penMeningitis") + +echo \"pbp1a\",\"pbp2b\",\"pbp2x\",\"AMO_MIC\",\"AMO_Res\",\"CFT_MIC\",\"CFT_Res\(Meningital\)\",\"CFT_Res\(Non-meningital\)\",\"TAX_MIC\",\"TAX_Res\(Meningital\)\",\"TAX_Res\(Non-meningital\)\",\"CFX_MIC\",\"CFX_Res\",\"MER_MIC\",\"MER_Res\",\"PEN_MIC\",\"PEN_Res\(Meningital\)\",\"PEN_Res\(Non-meningital\)\" > $PBP_AMR_REPORT +echo 
\"$pbp1a\",\"$pbp2b\",\"$pbp2x\",\"$AMO_MIC\",\"$AMO\",\"$CFT_MIC\",\"$CFT_MENINGITIS\",\"$CFT_NONMENINGITIS\",\"$TAX_MIC\",\"$TAX_MENINGITIS\",\"$TAX_NONMENINGITIS\",\"$CFX_MIC\",\"$CFX\",\"$MER_MIC\",\"$MER\",\"$PEN_MIC\",\"$PEN_MENINGITIS\",\"$PEN_NONMENINGITIS\" >> $PBP_AMR_REPORT \ No newline at end of file diff --git a/modules/amr.nf b/modules/amr.nf index 6a2a0bf..3aabed7 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -29,11 +29,13 @@ process GET_PBP_RESISTANCE { tuple val(sample_id), path(json) output: - tuple val(sample_id), env(pbp1a), env(pbp2b), env(pbp2x), env(AMO_MIC), env(AMO), env(CFT_MIC), env(CFT_MENINGITIS), env(CFT_NONMENINGITIS), env(TAX_MIC), env(TAX_MENINGITIS), env(TAX_NONMENINGITIS), env(CFX_MIC), env(CFX), env(MER_MIC), env(MER), env(PEN_MIC), env(PEN_MENINGITIS), env(PEN_NONMENINGITIS), emit: result + tuple val(sample_id), path(pbp_amr_report), emit: report script: + pbp_amr_report='pbp_amr_report.csv' """ JSON_FILE="$json" + PBP_AMR_REPORT="$pbp_amr_report" source get_pbp_resistance.sh """ diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 32089a9..c78f5fa 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -166,6 +166,7 @@ workflow PIPELINE { .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(MLST.out.report, failOnDuplicate: true, remainder: true) + .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From 505152f09c98a5758bb952ea4795f2f1f120a158 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 21 Jul 2023 15:39:52 +0000 Subject: [PATCH 049/157] Save other AMR report as .csv 
Former-commit-id: 2bb7ad12dd1b2555da6e52cd4432525929925680 --- bin/get_other_resistance.py | 7 +++++-- bin/get_other_resistance.sh | 40 ------------------------------------- modules/amr.nf | 9 +++------ workflows/pipeline.nf | 1 + 4 files changed, 9 insertions(+), 48 deletions(-) delete mode 100755 bin/get_other_resistance.sh diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index 4f71294..d902f59 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -5,11 +5,13 @@ import sys from itertools import chain from collections import defaultdict -import json +import pandas as pd +import csv report_path = sys.argv[1] debug_report_path = sys.argv[2] metadata_path = sys.argv[3] +output_file = sys.argv[4] with open(report_path) as report, open(debug_report_path) as debug_report, open(metadata_path) as metadata: # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata @@ -127,4 +129,5 @@ output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict and len(target_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict and len(target_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] - print(json.dumps(output, indent=4)) \ No newline at end of file + # Save output dict as csv + pd.DataFrame([output]).to_csv(output_file, index=False, quoting=csv.QUOTE_ALL) \ No newline at end of file diff --git a/bin/get_other_resistance.sh b/bin/get_other_resistance.sh deleted file mode 100755 index befd4a4..0000000 --- a/bin/get_other_resistance.sh +++ /dev/null @@ -1,40 +0,0 @@ -# Run get_other_resistance.py to infer AMR from ARIBA reports, then capture individual AMR from the output for Nextflow - -function GET_VALUE { - echo $(grep \"$1\" <<< $OUTPUT | sed -r 's/.+: "(.*)",?/\1/') -} - 
-OUTPUT=$(get_other_resistance.py "$REPORT" "$REPORT_DEBUG" "$METADATA") - -CHL_Res=$(GET_VALUE "CHL_Res") -CHL_Determinant=$(GET_VALUE "CHL_Determinant") -ERY_Res=$(GET_VALUE "ERY_Res") -ERY_Determinant=$(GET_VALUE "ERY_Determinant") -CLI_Res=$(GET_VALUE "CLI_Res") -CLI_Determinant=$(GET_VALUE "CLI_Determinant") -ERY_CLI_Res=$(GET_VALUE "ERY_CLI_Res") -ERY_CLI_Determinant=$(GET_VALUE "ERY_CLI_Determinant") -FQ_Res=$(GET_VALUE "FQ_Res") -FQ_Determinant=$(GET_VALUE "FQ_Determinant") -LFX_Res=$(GET_VALUE "LFX_Res") -LFX_Determinant=$(GET_VALUE "LFX_Determinant") -KAN_Res=$(GET_VALUE "KAN_Res") -KAN_Determinant=$(GET_VALUE "KAN_Determinant") -TET_Res=$(GET_VALUE "TET_Res") -TET_Determinant=$(GET_VALUE "TET_Determinant") -DOX_Res=$(GET_VALUE "DOX_Res") -DOX_Determinant=$(GET_VALUE "DOX_Determinant") -TMP_Res=$(GET_VALUE "TMP_Res") -TMP_Determinant=$(GET_VALUE "TMP_Determinant") -SMX_Res=$(GET_VALUE "SMX_Res") -SMX_Determinant=$(GET_VALUE "SMX_Determinant") -COT_Res=$(GET_VALUE "COT_Res") -COT_Determinant=$(GET_VALUE "COT_Determinant") -RIF_Res=$(GET_VALUE "RIF_Res") -RIF_Determinant=$(GET_VALUE "RIF_Determinant") -VAN_Res=$(GET_VALUE "VAN_Res") -VAN_Determinant=$(GET_VALUE "VAN_Determinant") -PILI1=$(GET_VALUE "PILI1") -PILI1_Determinant=$(GET_VALUE "PILI1_Determinant") -PILI2=$(GET_VALUE "PILI2") -PILI2_Determinant=$(GET_VALUE "PILI2_Determinant") \ No newline at end of file diff --git a/modules/amr.nf b/modules/amr.nf index 3aabed7..1fd57f4 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -104,14 +104,11 @@ process GET_OTHER_RESISTANCE { path metadata output: - tuple val(sample_id), env(CHL_Res), env(CHL_Determinant), env(ERY_Res), env(ERY_Determinant), env(CLI_Res), env(CLI_Determinant), env(ERY_CLI_Res), env(ERY_CLI_Determinant), env(FQ_Res), env(FQ_Determinant), env(LFX_Res), env(LFX_Determinant), env(KAN_Res), env(KAN_Determinant), env(TET_Res), env(TET_Determinant), env(DOX_Res), env(DOX_Determinant), env(TMP_Res), env(TMP_Determinant), env(SMX_Res), 
env(SMX_Determinant), env(COT_Res), env(COT_Determinant), env(RIF_Res), env(RIF_Determinant), env(VAN_Res), env(VAN_Determinant), env(PILI1), env(PILI1_Determinant), env(PILI2), env(PILI2_Determinant), emit: result + tuple val(sample_id), path(output_file), emit: report script: + output_file="other_amr_report.csv" """ - REPORT="$report" - REPORT_DEBUG="$report_debug" - METADATA="$metadata" - - source get_other_resistance.sh + get_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file" """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index c78f5fa..f8fc630 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -167,6 +167,7 @@ workflow PIPELINE { .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(MLST.out.report, failOnDuplicate: true, remainder: true) .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) + .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } ).view() From 6eff23c3a2555ada827883b9b906c4dab7e5c691 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 13:58:13 +0000 Subject: [PATCH 050/157] Improve clarity of Read QC module Former-commit-id: e94cc56378de3fef6c4ff3debfcec9296dc31ff8 --- README.md | 2 +- doc/workflow.drawio.svg | 100 +++++++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 8fdc0d6..149928e 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `--assembly_publish` | `"link"` or `"symlink"` or `"copy"`(Default: `"link"`)| Method used by Nextflow to publish the 
generated assemblies.(The default setting `"link"` means hard link, therefore will fail if the output directory is set to outside of the working file system) | ## QC Parameters -> ℹ️ Read QC does not have directly accessible parameters. The minimum base count in reads of Read QC is based on the multiplication of `--length_low` and `--depth` of Assembly QC. +> ℹ️ Read QC does not have directly accessible parameters. The minimum base count in reads of Read QC is based on the multiplication of `--length_low` and `--depth` of Assembly QC (i.e. default value is `38000000`). | Option | Values | Description | | --- | ---| --- | diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg index f2e08ab..00766b6 100644 --- a/doc/workflow.drawio.svg +++ b/doc/workflow.drawio.svg @@ -1,4 +1,4 @@ - + @@ -157,10 +157,10 @@ - - - - + + + + @@ -279,8 +279,8 @@ - - + + @@ -302,13 +302,13 @@ - - - + + + - + @@ -320,17 +320,17 @@ - + PBP... - - + + - + @@ -342,7 +342,7 @@ - + MLST... @@ -368,12 +368,12 @@ - - + + - + @@ -385,17 +385,17 @@ - + Line... - - + + - + @@ -407,17 +407,17 @@ - + Sero... - - + + - + @@ -429,7 +429,7 @@ - + Othe... @@ -459,21 +459,12 @@ - Read QC - - - + - - Go / No-go - - Bases: - - - ≥ Min Length x Depth + + Bases: ≥ 38 Mb @@ -482,9 +473,9 @@ Go / No-go - - - + + + @@ -526,6 +517,31 @@ + + + + + + + + + + Read QC + + + + + + + Read... 
+ + + + + + QC values shown in the diagram are the default values + + From f1c30b345354f83c101e7be3e25a38db99320d6a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 15:47:35 +0000 Subject: [PATCH 051/157] Correct output column names Former-commit-id: fe079b96442e9ba534703488a885f74f90584e94 --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 149928e..aa68b3e 100644 --- a/README.md +++ b/README.md @@ -351,10 +351,10 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance | | `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) | | `VAN_Determinant` | Other AMR | Known determinants that inferred the VAN resistance | - | `PILI-1` | Other AMR | Expression of PILI-1 | - | `PILI-1_Determinant` | Other AMR | Known determinants that inferred the PILI-1 expression | - | `PILI-2` | Other AMR | Expression of PILI-2 | - | `PILI-2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression | + | `PILI1` | Other AMR | Expression of PILI-1 | + | `PILI1_Determinant` | Other AMR | Known determinants that inferred the PILI-1 expression | + | `PILI2` | Other AMR | Expression of PILI-2 | + | `PILI2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression | # Credits From 5b59a0bc31c4730342280b6d09e17284de07169d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 15:53:21 +0000 Subject: [PATCH 052/157] Initial implementation of overall report revamp Former-commit-id: 19312f9574d53d1cfd3f5b49e27044d264865c37 --- bin/generate_overall_report.py | 22 ++++++++++++++++++++++ modules/output.nf | 25 +++++++++++++++++++++++-- workflows/pipeline.nf | 6 +++--- 3 files changed, 48 insertions(+), 5 deletions(-) create mode 100755 
bin/generate_overall_report.py diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py new file mode 100755 index 0000000..55287d2 --- /dev/null +++ b/bin/generate_overall_report.py @@ -0,0 +1,22 @@ +#! /usr/bin/env python3 + +import sys +import glob +import pandas as pd + +workdir_path = sys.argv[1] +ariba_metadata = sys.argv[2] +output_file = sys.argv[3] + +output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)' , 'CHL_Res' , 'CHL_Determinant' , 'ERY_Res' , 'ERY_Determinant' , 'CLI_Res' , 'CLI_Determinant' , 'ERY_CLI_Res' , 'ERY_CLI_Determinant' , 'FQ_Res' , 'FQ_Determinant' , 'LFX_Res' , 'LFX_Determinant' , 'KAN_Res' , 'KAN_Determinant' , 'TET_Res' , 'TET_Determinant' , 'DOX_Res' , 'DOX_Determinant' , 'TMP_Res' , 'TMP_Determinant' , 'SMX_Res' , 'SMX_Determinant' , 'COT_Res' , 'COT_Determinant' , 'RIF_Res' , 'RIF_Determinant' , 'VAN_Res' , 'VAN_Determinant' , 'PILI1' , 'PILI1_Determinant' , 'PILI2' , 'PILI2_Determinant'] +df_manifest = pd.DataFrame(columns=output_columns) + +dfs = [df_manifest] + +reports = glob.glob(workdir_path +'/*.csv') +for report in reports: + df = pd.read_csv(report) + dfs.append(df) + +df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) +df_output.to_csv(output_file, index=False, na_rep='_') diff --git a/modules/output.nf b/modules/output.nf index a711425..6bc6a27 100644 --- a/modules/output.nf +++ b/modules/output.nf @@ -8,7 +8,7 @@ process GENERATE_SAMPLE_REPORT { tuple 
val(sample_id), path ('report*.csv') output: - path sample_report + path sample_report, emit: report script: sample_report="${sample_id}_report.csv" @@ -18,4 +18,25 @@ process GENERATE_SAMPLE_REPORT { source generate_sample_report.sh """ -} \ No newline at end of file +} + +process GENERATE_OVERALL_REPORT { + label 'python_container' + label 'farm_low' + + publishDir "${params.output}", mode: "copy" + + input: + path 'report*.csv' + path "$ariba_metadata" + + output: + path "$overall_report", emit: report + + script: + overall_report='results.csv' + ariba_metadata='ariba_metadata.tsv' + """ + generate_overall_report.py `pwd` $ariba_metadata $overall_report + """ +} diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index f8fc630..e962742 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -8,7 +8,7 @@ include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/ include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" -include { GENERATE_SAMPLE_REPORT } from "$projectDir/modules/output" +include { GENERATE_SAMPLE_REPORT; GENERATE_OVERALL_REPORT } from "$projectDir/modules/output" // Main pipeline workflow workflow PIPELINE { @@ -170,9 +170,9 @@ workflow PIPELINE { .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } - ).view() + ) - // GENERATE_OVERALL_REPORT + GENERATE_OVERALL_REPORT(GENERATE_SAMPLE_REPORT.out.report.collect(), params.ariba_metadata) // READ_QC.out.result // .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, 
remainder: true) From 8949b84bed03b0be56da14999c98e97b6029405f Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 16:01:39 +0000 Subject: [PATCH 053/157] Improve comments; remove obsolete code Former-commit-id: 4c7fc286c0a754820b2490a9cb9d08d486810e2c --- workflows/pipeline.nf | 64 +++---------------------------------------- 1 file changed, 4 insertions(+), 60 deletions(-) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index e962742..fd288c9 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -147,17 +147,7 @@ workflow PIPELINE { OTHER_RESISTANCE(CREATE_ARIBA_DB.out.path, CREATE_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) - // Generate results.csv by sorted sample_id based on merged Channels - // READ_QC.out.result, ASSEMBLY_QC.out.result, MAPPING_QC.out.result, TAXONOMY_QC.out.result, OVERALL_QC.out.result, - // READ_QC.out.bases, ASSEMBLY_QC.out.info, MAPPING_QC.out.info, TAXONOMY_QC.out.percentage - // LINEAGE.out.csv, - // SEROTYPE.out.result, - // MLST.out.result, - // GET_PBP_RESISTANCE.out.result, - // GET_OTHER_RESISTANCE.out.result - // - // Replace null with approiate amount of "_" items when sample_id does not exist in that output (i.e. 
QC rejected) - + // Generate sample reports by merging outputs from all result-generating modules GENERATE_SAMPLE_REPORT( READ_QC.out.report .join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true) @@ -169,59 +159,13 @@ workflow PIPELINE { .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name - .map { [it[0], it[1..-1].minus(null)] } + .map { [it[0], it[1..-1].minus(null)] } // Map Sample_ID to index 0 and all reports (with null entries removed) as a list to index 1 ) + // Generate overall report by concatenating sample reports GENERATE_OVERALL_REPORT(GENERATE_SAMPLE_REPORT.out.report.collect(), params.ariba_metadata) - // READ_QC.out.result - // .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(MAPPING_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(TAXONOMY_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(OVERALL_QC.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['FAIL'] : it } - // .join(READ_QC.out.bases, failOnDuplicate: true, failOnMismatch: true) - // .join(ASSEMBLY_QC.out.info, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 3 : it } - // .join(MAPPING_QC.out.info, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 2 : it } - // .join(TAXONOMY_QC.out.percentage, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? 
it[0..-2] + ['_'] : it } - // .join(LINEAGE.out.csv.splitCsv(skip: 1), failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(SEROTYPE.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] : it } - // .join(MLST.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 8 : it } - // .join(GET_PBP_RESISTANCE.out.result, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 18 : it } - // .join(GET_OTHER_RESISTANCE.out, failOnDuplicate: true, remainder: true) - // .map { (it[-1] == null) ? it[0..-2] + ['_'] * 24 : it } - // .map { it.collect {"\"$it\""}.join',' } - // .collectFile( - // name: 'results.csv', - // storeDir: "$params.output", - // seed: [ - // 'Sample_ID', - // 'Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC', - // 'Bases', - // 'Contigs#' , 'Assembly_Length', 'Seq_Depth', - // 'Ref_Cov_%', 'Het-SNP#' , - // 'S.Pneumo_%', - // 'GPSC', - // 'Serotype', - // 'ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl', - // 'pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)', - // 'CHL_Res', 'CHL_Determinant', 'ERY_Res', 'ERY_Determinant', 'CLI_Res', 'CLI_Determinant', 'ERY_CLI_Res', 'ERY_CLI_Determinant', 'FQ_Res', 'FQ_Determinant', 'LFX_Res', 'LFX_Determinant', 'KAN_Res', 'KAN_Determinant', 'TET_Res', 'TET_Determinant', 'DOX_Res', 'DOX_Determinant', 'TMP_Res', 'TMP_Determinant', 'SMX_Res', 'SMX_Determinant', 'COT_Res', 'COT_Determinant', 'RIF_Res', 'RIF_Determinant', 'VAN_Res', 'VAN_Determinant', 'PILI1', 'PILI1_Determinant', 'PILI2', 'PILI2_Determinant' - // ].join(','), - // sort: { it.split(',')[0] }, - // newLine: true - // ) - - // Pass 
to SAVE_INFO sub-workflow + // Pass databases information to SAVE_INFO sub-workflow DATABASES_INFO = CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } .merge(CREATE_ARIBA_DB.out.path.map { [["ariba_db_path", it]] }) .merge(GET_KRAKEN2_DB.out.path.map { [["kraken2_db_path", it]] }) From 4c86c59e0d066a47b0898d687c3bf59287e7677a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 24 Jul 2023 17:18:24 +0000 Subject: [PATCH 054/157] Generate results.csv based on ARIBA metadata Former-commit-id: 073ddd3b1bdd090026017eb34c57f1538e7b90e2 --- bin/generate_overall_report.py | 33 ++++++++++++++++++++++++++++++++- modules/output.nf | 3 +-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index 55287d2..e48fb53 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -8,7 +8,35 @@ ariba_metadata = sys.argv[2] output_file = sys.argv[3] -output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)' , 'CHL_Res' , 'CHL_Determinant' , 'ERY_Res' , 'ERY_Determinant' , 'CLI_Res' , 'CLI_Determinant' , 'ERY_CLI_Res' , 'ERY_CLI_Determinant' , 'FQ_Res' , 'FQ_Determinant' , 'LFX_Res' , 'LFX_Determinant' , 'KAN_Res' , 'KAN_Determinant' , 'TET_Res' , 'TET_Determinant' , 'DOX_Res' , 'DOX_Determinant' , 'TMP_Res' , 'TMP_Determinant' , 'SMX_Res' , 'SMX_Determinant' , 'COT_Res' , 'COT_Determinant' , 'RIF_Res' , 
'RIF_Determinant' , 'VAN_Res' , 'VAN_Determinant' , 'PILI1' , 'PILI1_Determinant' , 'PILI2' , 'PILI2_Determinant'] +output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)'] + +ariba_targets = set(pd.read_csv(ariba_metadata, sep='\t')['target'].unique()) + +if 'TET' in ariba_targets: + ariba_targets.add('DOX') + +if 'FQ' in ariba_targets: + ariba_targets.add('LFX') + +if 'TMP' in ariba_targets and 'SMX' in ariba_targets: + ariba_targets.add('COT') + +if 'ERY_CLI' in ariba_targets: + ariba_targets.update(['ERY', 'CLI']) + +ariba_targets = sorted(ariba_targets) + +pilis = [] + +for target in ariba_targets: + if target.lower().startswith('pili'): + pilis.append(target) + else: + output_columns.extend([f'{target}_Res', f'{target}_Determinant']) + +for pili in pilis: + output_columns.extend([f'{pili}', f'{pili}_Determinant']) + df_manifest = pd.DataFrame(columns=output_columns) dfs = [df_manifest] @@ -19,4 +47,7 @@ dfs.append(df) df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) + +df_output = df_output[output_columns] + df_output.to_csv(output_file, index=False, na_rep='_') diff --git a/modules/output.nf b/modules/output.nf index 6bc6a27..0fdbfc6 100644 --- a/modules/output.nf +++ b/modules/output.nf @@ -28,14 +28,13 @@ process GENERATE_OVERALL_REPORT { input: path 'report*.csv' - path "$ariba_metadata" + path 'ariba_metadata' output: path "$overall_report", emit: report script: overall_report='results.csv' - 
ariba_metadata='ariba_metadata.tsv' """ generate_overall_report.py `pwd` $ariba_metadata $overall_report """ From 7d9fb4a39521c3816fcf9a14cfbbcef478544dfb Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 25 Jul 2023 20:06:43 +0000 Subject: [PATCH 055/157] Improve shell scripts style Former-commit-id: fd905cfa221f04810dbbd64b64147854bc0ffef3 --- bin/assembly_qc.sh | 12 ++++----- bin/combine_info.sh | 6 ++--- bin/create_ariba_db.sh | 48 ++++++++++++++++----------------- bin/create_ref_genome_bwa_db.sh | 26 +++++++++--------- bin/create_seroba_db.sh | 6 ++--- 5 files changed, 49 insertions(+), 49 deletions(-) diff --git a/bin/assembly_qc.sh b/bin/assembly_qc.sh index 9d399fc..f23ff7f 100755 --- a/bin/assembly_qc.sh +++ b/bin/assembly_qc.sh @@ -1,14 +1,14 @@ # Extract assembly QC information and determine QC result based on report.tsv from Quast, total base count -CONTIGS=$(awk -F'\t' '$1 == "# contigs (>= 0 bp)" { print $2 }' $REPORT) -LENGTH=$(awk -F'\t' '$1 == "Total length" { print $2 }' $REPORT) -DEPTH=$(printf %.2f $(echo "$BASES / $LENGTH" | bc -l) ) +CONTIGS=$(awk -F'\t' '$1 == "# contigs (>= 0 bp)" { print $2 }' "$REPORT") +LENGTH=$(awk -F'\t' '$1 == "Total length" { print $2 }' "$REPORT") +DEPTH=$(echo "scale=2; $BASES / $LENGTH" | bc -l) -if (( $CONTIGS < $QC_CONTIGS )) && (( $LENGTH >= $QC_LENGTH_LOW )) && (( $LENGTH <= $QC_LENGTH_HIGH )) && (( $(echo "$DEPTH >= $QC_DEPTH" | bc -l) )); then +if [[ $CONTIGS -lt $QC_CONTIGS ]] && [[ $LENGTH -ge $QC_LENGTH_LOW ]] && [[ $LENGTH -le $QC_LENGTH_HIGH ]] && [[ "$(echo "$DEPTH >= $QC_DEPTH" | bc -l)" == 1 ]]; then ASSEMBLY_QC="PASS" else ASSEMBLY_QC="FAIL" fi -echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > $ASSEMBLY_QC_REPORT -echo \"$ASSEMBLY_QC\",\"$CONTIGS\",\"$LENGTH\",\"$DEPTH\" >> $ASSEMBLY_QC_REPORT \ No newline at end of file +echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > "$ASSEMBLY_QC_REPORT" +echo 
\""$ASSEMBLY_QC"\",\""$CONTIGS"\",\""$LENGTH"\",\""$DEPTH"\" >> "$ASSEMBLY_QC_REPORT" diff --git a/bin/combine_info.sh b/bin/combine_info.sh index 7409046..5275baa 100755 --- a/bin/combine_info.sh +++ b/bin/combine_info.sh @@ -1,12 +1,12 @@ # Combine pipeline version, Nextflow version, databases information, container images, tools version JSON files into the a single JSON file -jq -s '.[0] * .[1] * .[2]' $DATABASE $IMAGES $TOOLS > working.json +jq -s '.[0] * .[1] * .[2]' "$DATABASE" "$IMAGES" "$TOOLS" > working.json add_version () { - jq --arg entry $1 --arg version "$2" '.[$entry] += {"version": $version}' working.json > tmp.json && mv tmp.json working.json + jq --arg entry "$1" --arg version "$2" '.[$entry] += {"version": $version}' working.json > tmp.json && mv tmp.json working.json } add_version pipeline "$PIPELINE_VERSION" add_version nextflow "$NEXTFLOW_VERSION" -mv working.json $JSON_FILE +mv working.json "$JSON_FILE" diff --git a/bin/create_ariba_db.sh b/bin/create_ariba_db.sh index 289fff4..fb2b657 100755 --- a/bin/create_ariba_db.sh +++ b/bin/create_ariba_db.sh @@ -1,32 +1,32 @@ # Check if CREATE_ARIBA_DB has run successfully on the specific reference sequences and metadata. # If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to JSON -REF_SEQUENCES_MD5=$(md5sum $REF_SEQUENCES | awk '{ print $1 }') -METADATA_MD5=$(md5sum $METADATA | awk '{ print $1 }') +REF_SEQUENCES_MD5=$(md5sum "$REF_SEQUENCES" | awk '{ print $1 }') +METADATA_MD5=$(md5sum "$METADATA" | awk '{ print $1 }') -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES_MD5" ] || \ - [ ! "$(grep '"metadata"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA" ] || \ - [ ! 
"$(grep '"metadata_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA_MD5" ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/00.info.txt ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/00.version_info.txt ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.log ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.tsv ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/01.filter.check_noncoding.log ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.all.fa ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.pickle ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.tsv ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.fa ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.varonly.fa ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa ] || \ - [ ! -f ${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa ] ; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$(grep '"reference"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES" ] || \ + [ ! "$(grep '"reference_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES_MD5" ] || \ + [ ! "$(grep '"metadata"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA" ] || \ + [ ! "$(grep '"metadata_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA_MD5" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/00.info.txt" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/00.version_info.txt" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.log" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.tsv" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_noncoding.log" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.all.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.pickle" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.tsv" ] || \ + [ ! 
-f "${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.varonly.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa" ] || \ + [ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa" ] ; then - rm -rf "$DB_LOCAL/$OUTPUT" + rm -rf "${DB_LOCAL:?}/${OUTPUT}" - ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "$DB_LOCAL/$OUTPUT" + ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "${DB_LOCAL}/${OUTPUT}" - echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} + echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" -fi \ No newline at end of file +fi diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/create_ref_genome_bwa_db.sh index 5bd277a..385b609 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/create_ref_genome_bwa_db.sh @@ -1,23 +1,23 @@ # Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. # If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to JSON -REFERENCE_MD5=$(md5sum $REFERENCE | awk '{ print $1 }') +REFERENCE_MD5=$(md5sum "$REFERENCE" | awk '{ print $1 }') -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep '"reference"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE" ] || \ - [ ! "$(grep '"reference_md5"' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE_MD5" ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.amb ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.ann ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.bwt ] || \ - [ ! -f ${DB_LOCAL}/${PREFIX}.pac ] || \ - [ ! 
-f ${DB_LOCAL}/${PREFIX}.sa ] ; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$(grep '"reference"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE" ] || \ + [ ! "$(grep '"reference_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE_MD5" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.amb" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.ann" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.bwt" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.pac" ] || \ + [ ! -f "${DB_LOCAL}/${PREFIX}.sa" ] ; then - rm -rf ${DB_LOCAL}/{,.[!.],..?}* + rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* - bwa index -p $PREFIX $REFERENCE + bwa index -p "$PREFIX" "$REFERENCE" - mv ${PREFIX}.amb ${PREFIX}.ann ${PREFIX}.bwt ${PREFIX}.pac ${PREFIX}.sa -t $DB_LOCAL + mv "${PREFIX}.amb" "${PREFIX}.ann" "${PREFIX}.bwt" "${PREFIX}.pac" "${PREFIX}.sa" -t "$DB_LOCAL" - echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} + echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" fi diff --git a/bin/create_seroba_db.sh b/bin/create_seroba_db.sh index 21a058f..3ff36b2 100755 --- a/bin/create_seroba_db.sh +++ b/bin/create_seroba_db.sh @@ -1,9 +1,9 @@ # If create_db is true: re-create KMC and ARIBA databases, also save metadata to JSON -if [ $CREATE_DB = true ]; then +if [ "$CREATE_DB" = true ]; then - seroba createDBs ${DB_LOCAL}/${DATABASE}/ ${KMER} + seroba createDBs "${DB_LOCAL}/${DATABASE}/" "${KMER}" - echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > ${DB_LOCAL}/${JSON_FILE} + echo -e "{\n \"git\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}" fi From 69290926d97146d7ee1ee6e5bbd43d67ded61759 Mon Sep 
17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 13:40:52 +0000 Subject: [PATCH 056/157] Refactor to improve maintainability & readability Former-commit-id: 29caffdda1fbb8858cb80aeba3ef93c699a86f0d --- bin/generate_overall_report.py | 136 +++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 47 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index e48fb53..602e1d0 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -1,53 +1,95 @@ #! /usr/bin/env python3 +# Generate overall report based on sample reports and columns specified by COLUMNS_BY_CATEGORY and ARIBA metadata + import sys +from itertools import chain +import pandas as pd import glob -import pandas as pd - -workdir_path = sys.argv[1] -ariba_metadata = sys.argv[2] -output_file = sys.argv[3] - -output_columns = ['Sample_ID' , 'Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC' , 'Bases' , 'Contigs#' , 'Assembly_Length' , 'Seq_Depth' , 'Ref_Cov_%' , 'Het-SNP#' , 'S.Pneumo_%' , 'GPSC' , 'Serotype' , 'ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl' , 'pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)'] - -ariba_targets = set(pd.read_csv(ariba_metadata, sep='\t')['target'].unique()) - -if 'TET' in ariba_targets: - ariba_targets.add('DOX') - -if 'FQ' in ariba_targets: - ariba_targets.add('LFX') - -if 'TMP' in ariba_targets and 'SMX' in ariba_targets: - ariba_targets.add('COT') - -if 'ERY_CLI' in ariba_targets: - ariba_targets.update(['ERY', 'CLI']) - -ariba_targets = sorted(ariba_targets) - -pilis = [] - -for target in ariba_targets: - if target.lower().startswith('pili'): - pilis.append(target) - else: - 
output_columns.extend([f'{target}_Res', f'{target}_Determinant']) - -for pili in pilis: - output_columns.extend([f'{pili}', f'{pili}_Determinant']) - -df_manifest = pd.DataFrame(columns=output_columns) - -dfs = [df_manifest] - -reports = glob.glob(workdir_path +'/*.csv') -for report in reports: - df = pd.read_csv(report) - dfs.append(df) - -df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) -df_output = df_output[output_columns] -df_output.to_csv(output_file, index=False, na_rep='_') +# Specify columns need to be included in the output file and their orders (except those based on ARIBA metadata) +COLUMNS_BY_CATEGORY = { + 'IDENTIFICATION': ['Sample_ID'], + 'QC': ['Read_QC' , 'Assembly_QC' , 'Mapping_QC' , 'Taxonomy_QC' , 'Overall_QC'] , + 'READ': ['Bases'], + 'ASSEMBLY': ['Contigs#' , 'Assembly_Length' , 'Seq_Depth'], + 'MAPPING': ['Ref_Cov_%' , 'Het-SNP#'], + 'TAXONOMY': ['S.Pneumo_%'], + 'LINEAGE': ['GPSC'], + 'SEROTYPE': ['Serotype'], + 'MLST': ['ST' , 'aroE' , 'gdh' , 'gki' , 'recP' , 'spi' , 'xpt' , 'ddl'], + 'PBP': ['pbp1a' , 'pbp2b' , 'pbp2x' , 'AMO_MIC' , 'AMO_Res' , 'CFT_MIC' , 'CFT_Res(Meningital)' , 'CFT_Res(Non-meningital)' , 'TAX_MIC' , 'TAX_Res(Meningital)' , 'TAX_Res(Non-meningital)' , 'CFX_MIC' , 'CFX_Res' , 'MER_MIC' , 'MER_Res' , 'PEN_MIC' , 'PEN_Res(Meningital)' , 'PEN_Res(Non-meningital)'] +} + + +# Check argv and save the global variables +if len(sys.argv) != 4: + sys.exit('Usage: generate_overall_report.py WORKDIR_PATH ARIBA_METADATA OUTPUT_FILE') +WORKDIR_PATH = sys.argv[1] +ARIBA_METADATA = sys.argv[2] +OUTPUT_FILE = sys.argv[3] + + +def main(): + output_columns = get_output_columns() + df_output = get_df_output(output_columns) + + # Saving df_output to output_file in csv format + df_output.to_csv(OUTPUT_FILE, index=False, na_rep='_') + + +# Get output columns based on COLUMNS_BY_CATEGORY and ARIBA metadata +def get_output_columns(): + output_columns = list(chain.from_iterable(COLUMNS_BY_CATEGORY.values())) + 
add_ariba_columns(output_columns) + return output_columns + + +# Based on ARIBA metadata, add additional output columns +def add_ariba_columns(output_columns): + # Get all targets in ARIBA metadata + ariba_targets = set(pd.read_csv(ARIBA_METADATA, sep='\t')['target'].unique()) + + # Adding special cases if certain targets exist + if 'TET' in ariba_targets: + ariba_targets.add('DOX') + if 'FQ' in ariba_targets: + ariba_targets.add('LFX') + if 'TMP' in ariba_targets and 'SMX' in ariba_targets: + ariba_targets.add('COT') + if 'ERY_CLI' in ariba_targets: + ariba_targets.update(['ERY', 'CLI']) + + # Add all targets alphabetically, except always adding PILI at the end + pilis = [] + for target in sorted(ariba_targets): + if target.lower().startswith('pili'): + pilis.append(target) + else: + output_columns.extend([f'{target}_Res', f'{target}_Determinant']) + for pili in pilis: + output_columns.extend([f'{pili}', f'{pili}_Determinant']) + + +# Generating df_output based on all sample reports with columns in the order of output_columns +def get_df_output(output_columns): + # Generate an empty dataframe as df_manifest based on output_columns + df_manifest = pd.DataFrame(columns=output_columns) + + # Generate a dataframe for each sample report and then concat df_manifest and all dataframes into df_output + dfs = [df_manifest] + reports = glob.glob(WORKDIR_PATH +'/*.csv') + for report in reports: + df = pd.read_csv(report) + dfs.append(df) + df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) + + # Ensure column order in df_output is the same as output_columns + df_output = df_output[output_columns] + + return df_output + + +if __name__ == "__main__": + main() From ae3cce669b98644ec3ca8e96c9586bf648ba0b39 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 14:57:36 +0000 Subject: [PATCH 057/157] Improve shell scripts style Former-commit-id: 60614c8ef117a5641b4fbbf92343e2721e6d27db --- 
bin/generate_sample_report.sh | 2 +- bin/get_databases_info.sh | 28 +++++++++++++------------- bin/get_docker_compose.sh | 8 ++++---- bin/get_images_info.sh | 38 +++++++++++++++++------------------ bin/get_kraken2_db.sh | 19 +++++++++--------- bin/get_lineage.sh | 4 ++-- bin/get_mlst.sh | 6 +++--- 7 files changed, 52 insertions(+), 53 deletions(-) diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh index ec769f3..cb7ab52 100755 --- a/bin/generate_sample_report.sh +++ b/bin/generate_sample_report.sh @@ -1,3 +1,3 @@ paste -d , *.csv \ | sed '1 s/^/\"Sample_ID\",/' \ -| sed "2 s/^/\"${SAMPLE_ID}\",/" > $SAMPLE_REPORT \ No newline at end of file +| sed "2 s/^/\"${SAMPLE_ID}\",/" > "$SAMPLE_REPORT" diff --git a/bin/get_databases_info.sh b/bin/get_databases_info.sh index c87d56f..3d9dd98 100755 --- a/bin/get_databases_info.sh +++ b/bin/get_databases_info.sh @@ -3,9 +3,9 @@ add_bwa_db () { BWA_DB_JSON=${BWA_DB_PATH}/${BWA_JSON} if [ -f "$BWA_DB_JSON" ]; then - REFERENCE=$(jq -r .reference $BWA_DB_JSON) - REFERENCE_MD5=$(jq -r .reference_md5 $BWA_DB_JSON) - CREATE_TIME=$(jq -r .create_time $BWA_DB_JSON) + REFERENCE=$(jq -r .reference "$BWA_DB_JSON") + REFERENCE_MD5=$(jq -r .reference_md5 "$BWA_DB_JSON") + CREATE_TIME=$(jq -r .create_time "$BWA_DB_JSON") else REFERENCE="Not yet created" REFERENCE_MD5="Not yet created" @@ -17,11 +17,11 @@ add_bwa_db () { add_ariba_db () { ARIBA_DB_JSON=${ARIBA_DB_PATH}/${ARIBA_JSON} if [ -f "$ARIBA_DB_JSON" ]; then - REFERENCE=$(jq -r .reference $ARIBA_DB_JSON) - REFERENCE_MD5=$(jq -r .reference_md5 $ARIBA_DB_JSON) - METADATA=$(jq -r .metadata $ARIBA_DB_JSON) - METADATA_MD5=$(jq -r .metadata_md5 $ARIBA_DB_JSON) - CREATE_TIME=$(jq -r .create_time $ARIBA_DB_JSON) + REFERENCE=$(jq -r .reference "$ARIBA_DB_JSON") + REFERENCE_MD5=$(jq -r .reference_md5 "$ARIBA_DB_JSON") + METADATA=$(jq -r .metadata "$ARIBA_DB_JSON") + METADATA_MD5=$(jq -r .metadata_md5 "$ARIBA_DB_JSON") + CREATE_TIME=$(jq -r .create_time "$ARIBA_DB_JSON") 
else REFERENCE="Not yet created" REFERENCE_MD5="Not yet created" @@ -35,9 +35,9 @@ add_ariba_db () { add_seroba_db () { SEROBA_DB_JSON=${SEROBA_DB_PATH}/${SEROBA_JSON} if [ -f "$SEROBA_DB_JSON" ]; then - GIT=$(jq -r .git $SEROBA_DB_JSON) - KMER=$(jq -r .kmer $SEROBA_DB_JSON) - CREATE_TIME=$(jq -r .create_time $SEROBA_DB_JSON) + GIT=$(jq -r .git "$SEROBA_DB_JSON") + KMER=$(jq -r .kmer "$SEROBA_DB_JSON") + CREATE_TIME=$(jq -r .create_time "$SEROBA_DB_JSON") else GIT="Not yet created" KMER="Not yet created" @@ -49,8 +49,8 @@ add_seroba_db () { add_url_db () { DB_JSON=$1 if [ -f "$DB_JSON" ]; then - URL=$(jq -r .url $DB_JSON) - SAVE_TIME=$(jq -r .save_time $DB_JSON) + URL=$(jq -r .url "$DB_JSON") + SAVE_TIME=$(jq -r .save_time "$DB_JSON") else URL="Not yet downloaded" SAVE_TIME="Not yet downloaded" @@ -65,4 +65,4 @@ jq -n \ --argjson kraken2_db "$(add_url_db "${KRAKEN2_DB_PATH}/${KRAKEN2_JSON}")" \ --argjson poppunnk_db "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_JSON}")" \ --argjson poppunk_ext "$(add_url_db "${POPPUNK_DB_PATH}/${POPPUNK_EXT_JSON}")" \ - '$ARGS.named' > $JSON_FILE + '$ARGS.named' > "$JSON_FILE" diff --git a/bin/get_docker_compose.sh b/bin/get_docker_compose.sh index e581e54..5f8ff8b 100755 --- a/bin/get_docker_compose.sh +++ b/bin/get_docker_compose.sh @@ -2,13 +2,13 @@ COUNT=0 -echo "services:" >> $COMPOSE +echo "services:" >> "$COMPOSE" -grep -E "container\s?=" $NEXTFLOW_CONFIG \ +grep -E "container\s?=" "$NEXTFLOW_CONFIG" \ | sort -u \ | sed -r "s/\s+container\s?=\s?'(.+)'/\1/" \ | while read -r IMAGE ; do COUNT=$((COUNT+1)) - echo " SERVICE${COUNT}:" >> $COMPOSE - echo " image: $IMAGE" >> $COMPOSE + echo " SERVICE${COUNT}:" >> "$COMPOSE" + echo " image: $IMAGE" >> "$COMPOSE" done diff --git a/bin/get_images_info.sh b/bin/get_images_info.sh index 95dd83f..51b20aa 100755 --- a/bin/get_images_info.sh +++ b/bin/get_images_info.sh @@ -1,7 +1,7 @@ # Extract containers information from nextflow.config and save into a JSON file find_image () { - grep -E 
"container\s?=" -B 1 $NEXTFLOW_CONFIG | grep -v -- "^--$" | paste - - | sort -u | grep $1 | sed -r "s/.+container\s?=\s?'(.+)'/\1/" + grep -E "container\s?=" -B 1 "$NEXTFLOW_CONFIG" | grep -v -- "^--$" | paste - - | sort -u | grep "$1" | sed -r "s/.+container\s?=\s?'(.+)'/\1/" } BASH=$(find_image bash) @@ -22,24 +22,24 @@ KRAKEN2=$(find_image kraken2) SEROBA=$(find_image seroba) add_container () { - jq -n --arg container $1 '.container = $container' + jq -n --arg container "$1" '.container = $container' } jq -n \ - --argjson bash "$(add_container $BASH)" \ - --argjson git "$(add_container $GIT)" \ - --argjson python "$(add_container $PYTHON)" \ - --argjson fastp "$(add_container $FASTP)" \ - --argjson unicycler "$(add_container $UNICYCLER)" \ - --argjson shovill "$(add_container $SHOVILL)" \ - --argjson quast "$(add_container $QUAST)" \ - --argjson bwa "$(add_container $BWA)" \ - --argjson samtools "$(add_container $SAMTOOLS)" \ - --argjson bcftools "$(add_container $BCFTOOLS)" \ - --argjson poppunk "$(add_container $POPPUNK)" \ - --argjson spn_pbp_amr "$(add_container $SPN_PBP_AMR)" \ - --argjson ariba "$(add_container $ARIBA)" \ - --argjson mlst "$(add_container $MLST)" \ - --argjson kraken2 "$(add_container $KRAKEN2)" \ - --argjson seroba "$(add_container $SEROBA)" \ - '$ARGS.named' > $JSON_FILE + --argjson bash "$(add_container "$BASH")" \ + --argjson git "$(add_container "$GIT")" \ + --argjson python "$(add_container "$PYTHON")" \ + --argjson fastp "$(add_container "$FASTP")" \ + --argjson unicycler "$(add_container "$UNICYCLER")" \ + --argjson shovill "$(add_container "$SHOVILL")" \ + --argjson quast "$(add_container "$QUAST")" \ + --argjson bwa "$(add_container "$BWA")" \ + --argjson samtools "$(add_container "$SAMTOOLS")" \ + --argjson bcftools "$(add_container "$BCFTOOLS")" \ + --argjson poppunk "$(add_container "$POPPUNK")" \ + --argjson spn_pbp_amr "$(add_container "$SPN_PBP_AMR")" \ + --argjson ariba "$(add_container "$ARIBA")" \ + --argjson mlst 
"$(add_container "$MLST")" \ + --argjson kraken2 "$(add_container "$KRAKEN2")" \ + --argjson seroba "$(add_container "$SEROBA")" \ + '$ARGS.named' > "$JSON_FILE" diff --git a/bin/get_kraken2_db.sh b/bin/get_kraken2_db.sh index c53cc52..8632bc8 100755 --- a/bin/get_kraken2_db.sh +++ b/bin/get_kraken2_db.sh @@ -1,29 +1,28 @@ # Check if all file exists and were obtained from the database at the specific link. # If not: remove files in database directory, download, and unzip to database directory, also save metadata to JSON -DB_NAME=$(basename $DB_REMOTE) ZIPPED_DB='kraken2_db.tar.gz' -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/${JSON_FILE})" ] || \ - [ ! -f ${DB_LOCAL}/hash.k2d ] || \ - [ ! -f ${DB_LOCAL}/opts.k2d ] || \ - [ ! -f ${DB_LOCAL}/taxo.k2d ]; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$DB_REMOTE" == "$(jq -r .url "${DB_LOCAL}/${JSON_FILE}")" ] || \ + [ ! -f "${DB_LOCAL}/hash.k2d" ] || \ + [ ! -f "${DB_LOCAL}/opts.k2d" ] || \ + [ ! 
-f "${DB_LOCAL}/taxo.k2d" ]; then - rm -rf ${DB_LOCAL}/{,.[!.],..?}* + rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* - wget ${DB_REMOTE} -O $ZIPPED_DB + wget "${DB_REMOTE}" -O $ZIPPED_DB # Use tmp dir and find to ensure files are saved directly at $DB_LOCAL regardless of archive directory structure mkdir tmp tar -xzf $ZIPPED_DB -C tmp - find tmp -type f -exec mv {} $DB_LOCAL \; + find tmp -type f -exec mv {} "$DB_LOCAL" \; rm -f $ZIPPED_DB jq -n \ --arg url "${DB_REMOTE}" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/${JSON_FILE} + '{"url" : $url, "save_time": $save_time}' > "${DB_LOCAL}/${JSON_FILE}" fi diff --git a/bin/get_lineage.sh b/bin/get_lineage.sh index 63b6ec0..cd57737 100755 --- a/bin/get_lineage.sh +++ b/bin/get_lineage.sh @@ -6,8 +6,8 @@ # Save results of individual sample into .csv with its name as filename sed 's/^/prefix_/' "$QFILE" > safe_qfile.txt -poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads $(nproc) +poppunk_assign --db "${POPPUNK_DIR}/${DB_NAME}" --external-clustering "${POPPUNK_DIR}/${EXT_CLUSTERS_FILE}" --query safe_qfile.txt --output output --threads "$(nproc)" sed 's/^prefix_//' output/output_external_clusters.csv > result.txt -awk -F , 'NR!=1 { print "\"GPSC\"\n" "\"" $2 "\"" > $1 ".csv" }' result.txt \ No newline at end of file +awk -F , 'NR!=1 { print "\"GPSC\"\n" "\"" $2 "\"" > $1 ".csv" }' result.txt diff --git a/bin/get_mlst.sh b/bin/get_mlst.sh index ab7c8e9..72e0400 100755 --- a/bin/get_mlst.sh +++ b/bin/get_mlst.sh @@ -2,7 +2,7 @@ OUTPUT='output.tsv' -mlst --legacy --scheme spneumoniae "$ASSEMBLY" > $OUTPUT +mlst --legacy --scheme spneumoniae "$ASSEMBLY" > "$OUTPUT" ST=$(awk -F'\t' 'FNR == 2 {print $3}' $OUTPUT) aroE=$(awk -F'\t' 'FNR == 2 {print $4}' $OUTPUT) @@ -13,5 +13,5 @@ spi=$(awk -F'\t' 'FNR == 2 {print $8}' $OUTPUT) xpt=$(awk -F'\t' 'FNR == 2 {print $9}' 
$OUTPUT) ddl=$(awk -F'\t' 'FNR == 2 {print $10}' $OUTPUT) -echo \"ST\",\"aroE\",\"gdh\",\"gki\",\"recP\",\"spi\",\"xpt\",\"ddl\" > $MLST_REPORT -echo \"$ST\",\"$aroE\",\"$gdh\",\"$gki\",\"$recP\",\"$spi\",\"$xpt\",\"$ddl\" >> $MLST_REPORT \ No newline at end of file +echo \"ST\",\"aroE\",\"gdh\",\"gki\",\"recP\",\"spi\",\"xpt\",\"ddl\" > "$MLST_REPORT" +echo \""$ST"\",\""$aroE"\",\""$gdh"\",\""$gki"\",\""$recP"\",\""$spi"\",\""$xpt"\",\""$ddl"\" >> "$MLST_REPORT" From 2449fdaf9e26ec61bf606682d8067edf6bf28656 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 17:04:47 +0000 Subject: [PATCH 058/157] Fix comment Former-commit-id: 00e3de37a77c7ee13aa179509a78d69e635870b0 --- bin/generate_overall_report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index 602e1d0..f238979 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -23,7 +23,7 @@ } -# Check argv and save the global variables +# Check argv and save to global variables if len(sys.argv) != 4: sys.exit('Usage: generate_overall_report.py WORKDIR_PATH ARIBA_METADATA OUTPUT_FILE') WORKDIR_PATH = sys.argv[1] @@ -35,7 +35,7 @@ def main(): output_columns = get_output_columns() df_output = get_df_output(output_columns) - # Saving df_output to output_file in csv format + # Saving df_output to OUTPUT_FILE in csv format df_output.to_csv(OUTPUT_FILE, index=False, na_rep='_') From e0d2010b02cb46362e05f815e3e1e3552a5affba Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Thu, 27 Jul 2023 17:10:04 +0000 Subject: [PATCH 059/157] Refactor to improve maintainability & readability Former-commit-id: b813ad0a8aba3f2e252b5d00d659e96e64d024cc --- bin/get_other_resistance.py | 169 +++++++++++++++++++++--------------- 1 file changed, 100 insertions(+), 69 deletions(-) diff --git a/bin/get_other_resistance.py 
b/bin/get_other_resistance.py index d902f59..8743ff7 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -8,72 +8,97 @@ import pandas as pd import csv -report_path = sys.argv[1] -debug_report_path = sys.argv[2] -metadata_path = sys.argv[3] -output_file = sys.argv[4] -with open(report_path) as report, open(debug_report_path) as debug_report, open(metadata_path) as metadata: - # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata - gene_dict = defaultdict(dict) - - # For saving targets found in metadata as key and their determinants (add to a set) as value - target_dict = {} - - # Skip the header in metadata - next(metadata) - # Go through lines in metadata and save findings to gene_dict and target_dict - for line in (line.strip() for line in metadata): - # Extract useful fields - fields = [str(field) for field in line.split("\t")] - ref_name, gene, var_only, var_change, _, target = fields - - # Populating gene_dict - gene_dict[(ref_name, gene, var_only)].update({var_change: target}) - # Populating target_dict - target_dict.update({target: set()}) - - # Skip the header in report and debug report - next(report) - next(debug_report) - # Go through lines in both report and debug report to detect targets - for line in (line.strip() for line in chain(report, debug_report)): - # Extract useful fields - fields = [str(field) for field in line.split("\t")] - ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21] - - # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line - if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: - continue - - # If the known_var_change (. 
for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line - gene_dict_key = (ref_name, gene, var_only) - try: - target = gene_dict[gene_dict_key][known_var_change] - except KeyError: - continue - - # Logic for gene detection. Found means hit. - if var_only == "0": - target_dict[target].add(f'{ref_name}') - - # Logic for variant detection, further criteria required - if var_only == "1": - # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201 - if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): - pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' - target_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') - # Common criteria: the assembly has that variant - elif has_known_var == "1": - target_dict[target].add(f'{ref_name} Variant {known_var_change}') +# Check argv and save to global variables +if len(sys.argv) != 5: + sys.exit('Usage: get_other_resistance.py REPORT_PATH DEBUG_REPORT_PATH METADATA_PATH OUTPUT_FILE') + +REPORT_PATH = sys.argv[1] +DEBUG_REPORT_PATH = sys.argv[2] +METADATA_PATH = sys.argv[3] +OUTPUT_FILE = sys.argv[4] + + +def main(): + targets_dict, hits_dict = prepare_dicts() + find_hits(targets_dict, hits_dict) + output = get_output(hits_dict) + # Save output to OUTPUT_FILE in csv format + pd.DataFrame([output]).to_csv(OUTPUT_FILE, index=False, quoting=csv.QUOTE_ALL) + + +def prepare_dicts(): + # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata + # Used to search whether there is a hit in the ARIBA result + targets_dict = defaultdict(dict) + + # For saving targets found in metadata as key and their determinants (i.e. 
hits) found in ARIBA result as values in set + hits_dict = {} + + with open(METADATA_PATH) as metadata: + # Skip the header in metadata + next(metadata) + + # Go through lines in metadata and save findings to targets_dict and hits_dict + for line in (line.strip() for line in metadata): + # Extract useful fields + fields = [str(field) for field in line.split("\t")] + ref_name, gene, var_only, var_change, _, target = fields + + # Populating targets_dict + targets_dict[(ref_name, gene, var_only)].update({var_change: target}) + # Populating hits_dict + hits_dict.update({target: set()}) + + return targets_dict, hits_dict + + +def find_hits(targets_dict, hits_dict): + with open(REPORT_PATH) as report, open(DEBUG_REPORT_PATH) as debug_report: + # Skip the header in report and debug report + next(report) + next(debug_report) + + # Go through lines in both report and debug report to detect targets + for line in (line.strip() for line in chain(report, debug_report)): + # Extract useful fields + fields = [str(field) for field in line.split("\t")] + ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21] + + # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line + if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9: + continue + + # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line + try: + target = targets_dict[(ref_name, gene, var_only)][known_var_change] + except KeyError: + continue + + # Logic for gene detection. Found means hit. 
+ if var_only == "0": + hits_dict[target].add(f'{ref_name}') + + # Logic for variant detection, further criteria required + if var_only == "1": + # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201 + if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201): + pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}' + hits_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}') + # Common criteria: the assembly has that variant + elif has_known_var == "1": + hits_dict[target].add(f'{ref_name} Variant {known_var_change}') + + +def get_output(hits_dict): # For saving final output, where information is saved per-target output = {} - # Go through targets in metadata - for target in target_dict: + # Go through targets in hits_dict + for target in hits_dict: # If the target has no hit, set output as S or NEG (only for PILI-1/2), and determinant as _ - if len(target_dict[target]) == 0: + if len(hits_dict[target]) == 0: if target.lower().startswith('pili'): output[target] = 'NEG' else: @@ -87,10 +112,15 @@ else: output[f'{target}_Res'] = 'R' - output[f'{target}_Determinant'] = '; '.join(target_dict[target]) + output[f'{target}_Determinant'] = '; '.join(sorted(hits_dict[target])) - # Special cases to add to output + add_output_special_cases(output, hits_dict) + return output + + +# Special cases to add to output +def add_output_special_cases(output, hits_dict): # If TET exists and DOX does not: add DOX to output; directly copy output and determinant if 'TET_Res' in output and 'DOX_Res' not in output: output['DOX_Res'] = output['TET_Res'] @@ -107,15 +137,15 @@ if 'TMP_Res' in output and 'SMX_Res' in output and 'COT_Res' not in output: if output['TMP_Res'] == 'R' and output['SMX_Res'] == 'R': output['COT_Res'] = 'R' - 
output['COT_Determinant'] = '; '.join(target_dict['TMP'].union(target_dict['SMX'])) + output['COT_Determinant'] = '; '.join(sorted(hits_dict['TMP'].union(hits_dict['SMX']))) elif (output['TMP_Res'] == 'R') ^ (output['SMX_Res'] == 'R'): output['COT_Res'] = 'I' - output['COT_Determinant'] = '; '.join(target_dict['TMP'].union(target_dict['SMX'])) + output['COT_Determinant'] = '; '.join(sorted(hits_dict['TMP'].union(hits_dict['SMX']))) elif output['TMP_Res'] == 'S' and output['SMX_Res'] == 'S': output['COT_Res'] = 'S' output['COT_Determinant'] = '_' - # If ERY_CLI exists, add ERY and CLI to output. + # If ERY_CLI exists: add ERY and CLI to output. # If ERY_CLI is R, ERY and CLI are R, and add ERY_CLI determinant to their determinants # If ERY_CLI is S, ERY and CLI are S if they do not already exist, otherwise leave them unchanged if 'ERY_CLI_Res' in output: @@ -126,8 +156,9 @@ output['ERY_Res'] = output['ERY_Res'] if 'ERY_Res' in output else 'S' output['CLI_Res'] = output['CLI_Res'] if 'CLI_Res' in output else 'S' - output['ERY_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['ERY'])) if 'ERY' in target_dict and len(target_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] - output['CLI_Determinant'] = '; '.join(target_dict['ERY_CLI'].union(target_dict['CLI'])) if 'CLI' in target_dict and len(target_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] + output['ERY_Determinant'] = '; '.join(sorted(hits_dict['ERY_CLI'].union(hits_dict['ERY']))) if 'ERY' in hits_dict and len(hits_dict['ERY']) != 0 else output['ERY_CLI_Determinant'] + output['CLI_Determinant'] = '; '.join(sorted(hits_dict['ERY_CLI'].union(hits_dict['CLI']))) if 'CLI' in hits_dict and len(hits_dict['CLI']) != 0 else output['ERY_CLI_Determinant'] + - # Save output dict as csv - pd.DataFrame([output]).to_csv(output_file, index=False, quoting=csv.QUOTE_ALL) \ No newline at end of file +if __name__ == "__main__": + main() From d377c2928068a9e9238b467c80dd752f86705644 Mon Sep 17 00:00:00 
2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 09:31:21 +0000 Subject: [PATCH 060/157] Improve code comments Former-commit-id: 6002529434876cfc7e91d35f6782fc043e8c327c --- bin/get_other_resistance.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/get_other_resistance.py b/bin/get_other_resistance.py index 8743ff7..a77eab6 100755 --- a/bin/get_other_resistance.py +++ b/bin/get_other_resistance.py @@ -28,12 +28,13 @@ def main(): pd.DataFrame([output]).to_csv(OUTPUT_FILE, index=False, quoting=csv.QUOTE_ALL) +# Prepare targets_dict for searching hits and hits_dict for saving hits def prepare_dicts(): - # For saving (reference, gene, var_only) combinations as key and their information ({var_change: target}) as value found in metadata + # For saving (reference, gene, var_only) combinations as keys and their information found in metadata as values in dict format (i.e. {var_change: target}) # Used to search whether there is a hit in the ARIBA result targets_dict = defaultdict(dict) - # For saving targets found in metadata as key and their determinants (i.e. hits) found in ARIBA result as values in set + # For saving targets found in metadata as key and their determinants (i.e. 
hits) found in ARIBA result as values in set format hits_dict = {} with open(METADATA_PATH) as metadata: @@ -54,6 +55,7 @@ def prepare_dicts(): return targets_dict, hits_dict +# Finding hits in ARIBA results based on targets_dict and save hits to hits_dict def find_hits(targets_dict, hits_dict): with open(REPORT_PATH) as report, open(DEBUG_REPORT_PATH) as debug_report: # Skip the header in report and debug report @@ -91,6 +93,7 @@ def find_hits(targets_dict, hits_dict): hits_dict[target].add(f'{ref_name} Variant {known_var_change}') +# Generating final output dataframe based on hits_dict def get_output(hits_dict): # For saving final output, where information is saved per-target output = {} From 40613bbb22e12e670a1cf12eff6eb764a87d8059 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 11:14:12 +0000 Subject: [PATCH 061/157] Improve shell scripts style Former-commit-id: 63ad10beee0e35b3578e30c985a4e9fe732a4637 --- bin/generate_sample_report.sh | 2 ++ bin/get_pbp_resistance.sh | 12 ++++++------ bin/get_poppunk_db.sh | 30 +++++++++++++++--------------- bin/get_poppunk_ext_clusters.sh | 15 +++++++-------- bin/get_seroba_db.sh | 12 ++++++------ bin/get_serotype.sh | 7 ++++--- bin/get_tools_info.sh | 4 ++-- bin/mapping_qc.sh | 8 ++++---- bin/overall_qc.sh | 4 ++-- bin/read_qc.sh | 8 ++++---- bin/taxonomy_qc.sh | 8 ++++---- 11 files changed, 56 insertions(+), 54 deletions(-) diff --git a/bin/generate_sample_report.sh b/bin/generate_sample_report.sh index cb7ab52..2a82172 100755 --- a/bin/generate_sample_report.sh +++ b/bin/generate_sample_report.sh @@ -1,3 +1,5 @@ +# Combine all csv reports into a single csv, then add Sample_ID as the first field + paste -d , *.csv \ | sed '1 s/^/\"Sample_ID\",/' \ | sed "2 s/^/\"${SAMPLE_ID}\",/" > "$SAMPLE_REPORT" diff --git a/bin/get_pbp_resistance.sh b/bin/get_pbp_resistance.sh index 5e833c3..0c9c943 100755 --- a/bin/get_pbp_resistance.sh +++ b/bin/get_pbp_resistance.sh @@ 
-3,13 +3,13 @@ # For all, replace null or space-only string with empty string function GET_VALUE { - echo $( < $JSON_FILE jq -r --arg target "$1" '.[$target]' \ - | sed 's/^null$//g;s/^\s+$//g' ) + < "$JSON_FILE" jq -r --arg target "$1" '.[$target]' \ + | sed 's/^null$//g;s/^\s+$//g' } function GET_RES { - echo $( < $JSON_FILE jq -r --arg target "$1" '.[$target]' \ - | sed 's/^null$//g;s/^\s+$//g' ) + < "$JSON_FILE" jq -r --arg target "$1" '.[$target]' \ + | sed 's/^null$//g;s/^\s+$//g' } pbp1a=$(GET_VALUE "pbp1a") @@ -31,5 +31,5 @@ PEN_MIC=$(GET_VALUE "penMic") PEN_NONMENINGITIS=$(GET_RES "penNonMeningitis") PEN_MENINGITIS=$(GET_RES "penMeningitis") -echo \"pbp1a\",\"pbp2b\",\"pbp2x\",\"AMO_MIC\",\"AMO_Res\",\"CFT_MIC\",\"CFT_Res\(Meningital\)\",\"CFT_Res\(Non-meningital\)\",\"TAX_MIC\",\"TAX_Res\(Meningital\)\",\"TAX_Res\(Non-meningital\)\",\"CFX_MIC\",\"CFX_Res\",\"MER_MIC\",\"MER_Res\",\"PEN_MIC\",\"PEN_Res\(Meningital\)\",\"PEN_Res\(Non-meningital\)\" > $PBP_AMR_REPORT -echo \"$pbp1a\",\"$pbp2b\",\"$pbp2x\",\"$AMO_MIC\",\"$AMO\",\"$CFT_MIC\",\"$CFT_MENINGITIS\",\"$CFT_NONMENINGITIS\",\"$TAX_MIC\",\"$TAX_MENINGITIS\",\"$TAX_NONMENINGITIS\",\"$CFX_MIC\",\"$CFX\",\"$MER_MIC\",\"$MER\",\"$PEN_MIC\",\"$PEN_MENINGITIS\",\"$PEN_NONMENINGITIS\" >> $PBP_AMR_REPORT \ No newline at end of file +echo \"pbp1a\",\"pbp2b\",\"pbp2x\",\"AMO_MIC\",\"AMO_Res\",\"CFT_MIC\",\"CFT_Res\(Meningital\)\",\"CFT_Res\(Non-meningital\)\",\"TAX_MIC\",\"TAX_Res\(Meningital\)\",\"TAX_Res\(Non-meningital\)\",\"CFX_MIC\",\"CFX_Res\",\"MER_MIC\",\"MER_Res\",\"PEN_MIC\",\"PEN_Res\(Meningital\)\",\"PEN_Res\(Non-meningital\)\" > "$PBP_AMR_REPORT" +echo \""$pbp1a"\",\""$pbp2b"\",\""$pbp2x"\",\""$AMO_MIC"\",\""$AMO"\",\""$CFT_MIC"\",\""$CFT_MENINGITIS"\",\""$CFT_NONMENINGITIS"\",\""$TAX_MIC"\",\""$TAX_MENINGITIS"\",\""$TAX_NONMENINGITIS"\",\""$CFX_MIC"\",\""$CFX"\",\""$MER_MIC"\",\""$MER"\",\""$PEN_MIC"\",\""$PEN_MENINGITIS"\",\""$PEN_NONMENINGITIS"\" >> "$PBP_AMR_REPORT" diff --git 
a/bin/get_poppunk_db.sh b/bin/get_poppunk_db.sh index d4e705a..48a0198 100755 --- a/bin/get_poppunk_db.sh +++ b/bin/get_poppunk_db.sh @@ -6,27 +6,27 @@ DB_NAME=$(basename "$DB_REMOTE" .tar.gz) DB_PATH=${DB_LOCAL}/${DB_NAME} -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$DB_REMOTE" == "$(jq -r .url ${DB_LOCAL}/${JSON_FILE})" ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.h5 ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.dists.npy ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.dists.pkl ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_fit.npz ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_fit.pkl ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_graph.gt ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}_clusters.csv ] || \ - [ ! -f ${DB_PATH}/${DB_NAME}.refs ]; then +if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$DB_REMOTE" == "$(jq -r .url "${DB_LOCAL}/${JSON_FILE}")" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.h5" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.dists.npy" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}.dists.pkl" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_fit.npz" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_fit.pkl" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_graph.gt" ] || \ + [ ! -f "${DB_PATH}/${DB_NAME}_clusters.csv" ] || \ + [ ! 
-f "${DB_PATH}/${DB_NAME}.refs" ]; then - rm -rf ${DB_LOCAL}/${JSON_FILE} - rm -rf ${DB_LOCAL}/*/ + rm -rf "${DB_LOCAL:?}/${JSON_FILE}" + rm -rf "${DB_LOCAL:?}"/*/ - wget $DB_REMOTE -O poppunk_db.tar.gz - tar -xzf poppunk_db.tar.gz -C $DB_LOCAL + wget "$DB_REMOTE" -O poppunk_db.tar.gz + tar -xzf poppunk_db.tar.gz -C "$DB_LOCAL" rm poppunk_db.tar.gz jq -n \ --arg url "$DB_REMOTE" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${DB_LOCAL}/${JSON_FILE} + '{"url" : $url, "save_time": $save_time}' > "${DB_LOCAL}/${JSON_FILE}" fi diff --git a/bin/get_poppunk_ext_clusters.sh b/bin/get_poppunk_ext_clusters.sh index e330968..273ccbb 100755 --- a/bin/get_poppunk_ext_clusters.sh +++ b/bin/get_poppunk_ext_clusters.sh @@ -4,20 +4,19 @@ # If not: remove all csv files, and download to database directory, also save metadata to JSON EXT_CLUSTERS_CSV=$(basename "$EXT_CLUSTERS_REMOTE") -EXT_CLUSTERS_NAME=$(basename "$EXT_CLUSTERS_REMOTE" .csv) -if [ ! -f ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} ] || \ - [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url ${EXT_CLUSTERS_LOCAL}/${JSON_FILE})" ] || \ - [ ! -f ${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV} ]; then +if [ ! -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" ] || \ + [ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}")" ] || \ + [ ! 
-f "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" ]; then - rm -f ${EXT_CLUSTERS_LOCAL}/*.csv - rm -f ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} + rm -f "${EXT_CLUSTERS_LOCAL}"/*.csv + rm -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" - wget $EXT_CLUSTERS_REMOTE -O ${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV} + wget "$EXT_CLUSTERS_REMOTE" -O "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" jq -n \ --arg url "$EXT_CLUSTERS_REMOTE" \ --arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \ - '{"url" : $url, "save_time": $save_time}' > ${EXT_CLUSTERS_LOCAL}/${JSON_FILE} + '{"url" : $url, "save_time": $save_time}' > "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" fi diff --git a/bin/get_seroba_db.sh b/bin/get_seroba_db.sh index a3e1d3c..0cda2fc 100755 --- a/bin/get_seroba_db.sh +++ b/bin/get_seroba_db.sh @@ -5,13 +5,13 @@ # Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage -if [ ! -f ${DB_LOCAL}/${JSON_FILE} ] || \ - [ ! "$(grep 'git' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "${DB_REMOTE}" ] || \ - [ ! "$(grep 'kmer' ${DB_LOCAL}/${JSON_FILE} | sed -r 's/.+: "(.*)",?/\1/')" == "${KMER}" ] || \ - !((git -C ${DB_LOCAL} pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date'); then +if [ ! -f "${DB_LOCAL}"/"${JSON_FILE}" ] || \ + [ ! "$(grep 'git' "${DB_LOCAL}"/"${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "${DB_REMOTE}" ] || \ + [ ! "$(grep 'kmer' "${DB_LOCAL}"/"${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "${KMER}" ] || \ + ! 
( (git -C "${DB_LOCAL}" pull || echo 'Already up-to-date') | grep -q 'Already up[- ]to[- ]date' ); then - rm -rf ${DB_LOCAL}/{,.[!.],..?}* - git clone ${DB_REMOTE} ${DB_LOCAL} + rm -rf "${DB_LOCAL:?}"/{,.[!.],..?}* + git clone "${DB_REMOTE}" "${DB_LOCAL}" CREATE_DB=true diff --git a/bin/get_serotype.sh b/bin/get_serotype.sh index 80bfcc7..560cdd7 100755 --- a/bin/get_serotype.sh +++ b/bin/get_serotype.sh @@ -1,9 +1,10 @@ # Run SeroBA to serotype samples + { - seroba runSerotyping "$SEROBA_DIR"/"$DATABASE" "$READ1" "$READ2" "$SAMPLE_ID" && SEROTYPE=$(awk -F'\t' '{ print $2 }' ${SAMPLE_ID}/pred.tsv) + seroba runSerotyping "${SEROBA_DIR}/${DATABASE}" "$READ1" "$READ2" "$SAMPLE_ID" && SEROTYPE=$(awk -F'\t' '{ print $2 }' "${SAMPLE_ID}/pred.tsv") } || { SEROTYPE="SEROBA FAILURE" } -echo \"Serotype\" > $SEROTYPE_REPORT -echo \"$SEROTYPE\" >> $SEROTYPE_REPORT \ No newline at end of file +echo \"Serotype\" > "$SEROTYPE_REPORT" +echo \""$SEROTYPE"\" >> "$SEROTYPE_REPORT" diff --git a/bin/get_tools_info.sh b/bin/get_tools_info.sh index 23d9520..9d20e16 100755 --- a/bin/get_tools_info.sh +++ b/bin/get_tools_info.sh @@ -1,7 +1,7 @@ # Save received tools versions into a JSON file add_version () { - jq -n --arg version $1 '.version = $version' + jq -n --arg version "$1" '.version = $version' } jq -n \ @@ -19,4 +19,4 @@ jq -n \ --argjson kraken2 "$(add_version "$KRAKEN2_VERSION")" \ --argjson seroba "$(add_version "$SEROBA_VERSION")" \ --argjson ariba "$(add_version "$ARIBA_VERSION")" \ - '$ARGS.named' > $JSON_FILE + '$ARGS.named' > "$JSON_FILE" diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh index 75b18a0..f6eb48e 100755 --- a/bin/mapping_qc.sh +++ b/bin/mapping_qc.sh @@ -1,12 +1,12 @@ # Extract mapping QC information and determine QC result based on reference coverage and count of Het-SNP sites -COVERAGE=$(printf %.2f $COVERAGE) +COVERAGE=$(printf %.2f "$COVERAGE") -if (( $(echo "$COVERAGE > $QC_REF_COVERAGE" | bc -l) )) && (( $HET_SNP < $QC_HET_SNP_SITE )); then +if [[ 
"$(echo "$COVERAGE > $QC_REF_COVERAGE" | bc -l)" == 1 ]] && [[ $HET_SNP -lt $QC_HET_SNP_SITE ]]; then MAPPING_QC="PASS" else MAPPING_QC="FAIL" fi -echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > $MAPPING_QC_REPORT -echo \"$MAPPING_QC\",\"$COVERAGE\",\"$QC_HET_SNP_SITE\" >> $MAPPING_QC_REPORT \ No newline at end of file +echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > "$MAPPING_QC_REPORT" +echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$QC_HET_SNP_SITE"\" >> "$MAPPING_QC_REPORT" diff --git a/bin/overall_qc.sh b/bin/overall_qc.sh index de7e116..d83e52a 100755 --- a/bin/overall_qc.sh +++ b/bin/overall_qc.sh @@ -11,5 +11,5 @@ else OVERALL_QC="FAIL" fi -echo \"Overall_QC\" > $OVERALL_QC_REPORT -echo \"$OVERALL_QC\" >> $OVERALL_QC_REPORT \ No newline at end of file +echo \"Overall_QC\" > "$OVERALL_QC_REPORT" +echo \""$OVERALL_QC"\" >> "$OVERALL_QC_REPORT" diff --git a/bin/read_qc.sh b/bin/read_qc.sh index 6ce8382..14d7519 100755 --- a/bin/read_qc.sh +++ b/bin/read_qc.sh @@ -1,12 +1,12 @@ # Extract total base count and determine QC result based on output JSON file of fastp -BASES=$(< $JSON jq -r .summary.after_filtering.total_bases) +BASES=$(< "$JSON" jq -r .summary.after_filtering.total_bases) -if (( $(echo "$BASES >= ($QC_LENGTH_LOW*$QC_DEPTH)" | bc -l) )); then +if [[ "$(echo "$BASES >= ($QC_LENGTH_LOW*$QC_DEPTH)" | bc -l)" == 1 ]]; then READ_QC="PASS" else READ_QC="FAIL" fi -echo \"Read_QC\",\"Bases\" > $READ_QC_REPORT -echo \"$READ_QC\",\"$BASES\" >> $READ_QC_REPORT \ No newline at end of file +echo \"Read_QC\",\"Bases\" > "$READ_QC_REPORT" +echo \"$READ_QC\",\""$BASES"\" >> "$READ_QC_REPORT" diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh index 23254b1..7528b1d 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/taxonomy_qc.sh @@ -1,16 +1,16 @@ # Extract taxonomy QC information and determine QC result based on kraken2_report.txt -PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' $KRAKEN2_REPORT) 
+PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' "$KRAKEN2_REPORT") if [ -z "$PERCENTAGE" ]; then PERCENTAGE="0.00" fi -if (( $(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l) )); then +if [[ "$(echo "$PERCENTAGE > $QC_SPNEUMO_PERCENTAGE" | bc -l)" == 1 ]]; then TAXONOMY_QC="PASS" else TAXONOMY_QC="FAIL" fi -echo \"Taxonomy_QC\",\"S.Pneumo_%\" > $TAXONOMY_QC_REPORT -echo \"$TAXONOMY_QC\",\"$PERCENTAGE\" >> $TAXONOMY_QC_REPORT \ No newline at end of file +echo \"Taxonomy_QC\",\"S.Pneumo_%\" > "$TAXONOMY_QC_REPORT" +echo \"$TAXONOMY_QC\",\""$PERCENTAGE"\" >> "$TAXONOMY_QC_REPORT" From c47ca63a1c6e04dc9c2f755c8fd5e3147b508207 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 14:49:25 +0000 Subject: [PATCH 062/157] Fix outputing incorrect variable for Het-SNP# Former-commit-id: 2e6707ad4889d1018856400a405d6b5a35003e93 --- bin/mapping_qc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh index f6eb48e..a737d09 100755 --- a/bin/mapping_qc.sh +++ b/bin/mapping_qc.sh @@ -9,4 +9,4 @@ else fi echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > "$MAPPING_QC_REPORT" -echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$QC_HET_SNP_SITE"\" >> "$MAPPING_QC_REPORT" +echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$HET_SNP"\" >> "$MAPPING_QC_REPORT" From 346e6be5e8c12f379a20f617f892dd1235e77729 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 15:42:41 +0000 Subject: [PATCH 063/157] Avoid numbers output as float Former-commit-id: 8d352582bcbd82a34797e22c2f0433d588e5d6a7 --- bin/generate_overall_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/generate_overall_report.py b/bin/generate_overall_report.py index f238979..59a1737 100755 --- a/bin/generate_overall_report.py +++ b/bin/generate_overall_report.py @@ -81,7 +81,7 @@ def 
get_df_output(output_columns): dfs = [df_manifest] reports = glob.glob(WORKDIR_PATH +'/*.csv') for report in reports: - df = pd.read_csv(report) + df = pd.read_csv(report, dtype=str) dfs.append(df) df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID']) From 4e60552ec4a66f7c661477d8ef096c0418f16fac Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 28 Jul 2023 15:44:33 +0000 Subject: [PATCH 064/157] Refactor to improve maintainability & readability Former-commit-id: 1327f764db5a22a74da889a101104c99cbbaeb22 --- bin/het_snp_count.py | 171 ++++++++++++++++++++++++++----------------- modules/mapping.nf | 4 +- 2 files changed, 107 insertions(+), 68 deletions(-) diff --git a/bin/het_snp_count.py b/bin/het_snp_count.py index 74d45ee..cf92dc6 100755 --- a/bin/het_snp_count.py +++ b/bin/het_snp_count.py @@ -5,72 +5,109 @@ import re import sys -# Input VCF path -vcf = sys.argv[1] -# Minimum distance between SNPs to not consider as part of cluster -min_snp_distance = int(sys.argv[2]) - - -with open(vcf) as f: - lines = [line.strip() for line in f] - - # List of positions of non-cluster Het-SNPs - het_noncluster_pos = [] - # Previous Het-SNP position. 
Initialise with the negative of min_snp_distance for calculation of the sites in starting positions - prev_het_pos = -min_snp_distance - - for line in lines: - # Skip lines of header and INDEL calls - if line.startswith("#") or "INDEL" in line: - continue - - # Get fields from the call - chrom, pos, id, ref, alt, qual, filter, info, format, sample = line.split("\t") - - # Get DP (The number of reads covering or bridging POS) from the INFO field - dp = re.search(r'DP=([0-9]+)', info).group(1) - # Get DP4 (Number of forward ref alleles; reverse ref; forward non-ref; reverse non-ref alleles, used in variant calling) from the INFO field - reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref = re.search(r'DP4=([0-9,]+)', info).group(1).split(",") - # Get MQ (Root-Mean-Square mapping quality of covering reads) from the INFO field - mq = re.search(r'MQ=([0-9]+)', info).group(1) - - # Get PV4 (P-values for strand bias; baseQ bias; mapQ bias; tail distance bias) from the INFO field; set to None if it is not found - try: - pv4 = re.search(r'PV4=([0-9,.]+)', info).group(1) - except AttributeError: - pv4 = None - - # Ensure qual is float - qual = float(qual) - # Ensure pos, dp, mq, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref are int - pos, dp, mq, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref = map(int, [pos, dp, mq, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref]) - - # Basic quality filter, skip this call if fails - if not(qual > 50 and dp > 5 and mq > 30 and reads_for_non_ref > 2 and reads_rev_non_ref > 2): - continue - - # Further quality filter if PV4 exists, skip this call if fails - if pv4 is not None: - pv_strand, pv_baseq, pv_mapq, pv_tail_distance = map(float, pv4.split(",")) - if not (pv_strand > 0.001 and pv_mapq > 0.001 and pv_tail_distance > 0.001): + +# Check argv and save to global variables +if len(sys.argv) != 4: + sys.exit('Usage: het_snp_count.py VCF MIN_SNP_DISTANCE OUTPUT_FILE') 
+VCF = sys.argv[1] +MIN_SNP_DISTANCE = int(sys.argv[2]) # Minimum distance between SNPs to not consider as part of cluster +OUTPUT_FILE=sys.argv[3] + + +def main(): + with open(VCF) as vcf, open(OUTPUT_FILE, 'w') as output_file: + lines = [line.strip() for line in vcf] + + # List of positions of non-cluster Het-SNPs + het_noncluster_pos = [] + # Previous Het-SNP position + prev_het_pos = None + + for line in lines: + # Skip lines of header and INDEL calls + if line.startswith("#") or "INDEL" in line: continue + + pos, qual, info = extract_vcf_fields(line) + + dp, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref, mq, pv4 = extract_info(info) + + if not quality_check(qual, dp, mq, reads_for_non_ref, reads_rev_non_ref, pv4): + continue + + if is_het_snp(het_noncluster_pos, pos, prev_het_pos, reads_for_non_ref, reads_for_ref, reads_rev_non_ref, reads_rev_ref): + # Mark current pos as previous Het-SNP pos for the next Het-SNP + prev_het_pos = pos + + # Save amount of non-cluster Het-SNP sites to OUTPUT_FILE + output_file.write(f'{len(het_noncluster_pos)}') + + +# Extract relevant fields from the call +def extract_vcf_fields(line): + fields = line.split("\t") + pos, qual, info = fields[1], fields[5], fields[7] + + # Ensure pos is int and qual is float + return int(pos), float(qual), info + + +# Extract information from the INFO field +def extract_info(info): + # Get DP (The number of reads covering or bridging POS) + dp = re.search(r'DP=([0-9]+)', info).group(1) + + # Get DP4 (Number of forward ref alleles; reverse ref; forward non-ref; reverse non-ref alleles, used in variant calling) + reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref = re.search(r'DP4=([0-9,]+)', info).group(1).split(",") + + # Get MQ (Root-Mean-Square mapping quality of covering reads) + mq = re.search(r'MQ=([0-9]+)', info).group(1) + + # Get PV4 (P-values for strand bias; baseQ bias; mapQ bias; tail distance bias); set to None if it is not found + try: + pv4 = 
re.search(r'PV4=([0-9,.]+)', info).group(1) + except AttributeError: + pv4 = None + + # Ensure dp, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref, mq are int + return *map(int, [dp, reads_for_ref, reads_rev_ref, reads_for_non_ref, reads_rev_non_ref, mq]), pv4 + + +# Quality check for call +def quality_check(qual, dp, mq, reads_for_non_ref, reads_rev_non_ref, pv4): + # Basic quality check, skip this call if fails + if not(qual > 50 and dp > 5 and mq > 30 and reads_for_non_ref > 2 and reads_rev_non_ref > 2): + return False + + # Further quality check if PV4 exists, skip this call if fails + if pv4 is not None: + pv_strand, pv_baseq, pv_mapq, pv_tail_distance = map(float, pv4.split(",")) + if not (pv_strand > 0.001 and pv_mapq > 0.001 and pv_tail_distance > 0.001): + return False + + return True + + +# Check if this call is a Het-SNP and add/remove Het-SNP to/from het_noncluster_pos +def is_het_snp(het_noncluster_pos, pos, prev_het_pos, reads_for_non_ref, reads_for_ref, reads_rev_non_ref, reads_rev_ref): + # Calculate forward and reverse non-reference reads ratios (variant allele frequencies) + forward_non_ref_ratio = reads_for_non_ref / (reads_for_non_ref + reads_for_ref) + reverse_non_ref_ratio = reads_rev_non_ref / (reads_rev_non_ref + reads_rev_ref) + + # Consider as Het-SNP when both forward and reverse non-reference reads ratios are below 0.90 + if forward_non_ref_ratio < 0.90 and reverse_non_ref_ratio < 0.90: + # If the distance between current and previous Het-SNP position is >= the minimum non-cluster SNP distance or there is no previous Het-SNP, + # add the position to the list of non-cluster Het-SNP positions + if prev_het_pos is None or pos - prev_het_pos >= MIN_SNP_DISTANCE: + het_noncluster_pos.append(pos) + # If the last Het-SNP in the list of non-cluster Het-SNP positions is part of the current cluster, remove it + elif het_noncluster_pos and pos - het_noncluster_pos[-1] < MIN_SNP_DISTANCE: + het_noncluster_pos.pop() + + return True 
+ + return False + - # Calculate forward and reverse non-reference reads ratios (variant allele frequencies) - forward_non_ref_ratio = reads_for_non_ref / (reads_for_non_ref + reads_for_ref) - reverse_non_ref_ratio = reads_rev_non_ref / (reads_rev_non_ref + reads_rev_ref) - - # Consider as Het-SNP when both forward and reverse non-reference reads ratios are below 0.90 - if forward_non_ref_ratio < 0.90 and reverse_non_ref_ratio < 0.90: - # If the distance between current and previous Het-SNP position is >= the minimum non-cluster SNP distance, - # add the position to the list of non-cluster Het-SNP positions - if pos - prev_het_pos >= min_snp_distance: - het_noncluster_pos.append(pos) - # If the last Het-SNP in the list of non-cluster Het-SNP positions is part of the current cluster, remove it - elif het_noncluster_pos and pos - het_noncluster_pos[-1] < min_snp_distance: - het_noncluster_pos.pop() - # Mark current pos as previous Het-SNP pos for the next Het-SNP - prev_het_pos = pos - - # Amount of non-cluster Het-SNP sites, print to be captured by Nextflow - het_noncluster_sites = len(het_noncluster_pos) - print(het_noncluster_sites, end="") +if __name__ == "__main__": + main() diff --git a/modules/mapping.nf b/modules/mapping.nf index 0a37628..d545607 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -120,8 +120,10 @@ process HET_SNP_COUNT { tuple val(sample_id), env(OUTPUT), emit: result script: + het_snp_count_output='output.txt' """ - OUTPUT=`het_snp_count.py "$vcf" 50` + het_snp_count.py "$vcf" 50 "$het_snp_count_output" + OUTPUT=`cat $het_snp_count_output` """ } From fb375e084fa90e0a3f076d8b2c8cfb6a2194b6c1 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 13:45:59 +0000 Subject: [PATCH 065/157] Improve shell scripts style Former-commit-id: 8f15a7de7d3a77d34a95d8ebf1ac166af9bb8a96 --- bin/mapping_qc.sh | 2 +- bin/read_qc.sh | 2 +- bin/taxonomy_qc.sh | 2 +- 3 files changed, 3 insertions(+), 
3 deletions(-) diff --git a/bin/mapping_qc.sh b/bin/mapping_qc.sh index a737d09..9ed580d 100755 --- a/bin/mapping_qc.sh +++ b/bin/mapping_qc.sh @@ -9,4 +9,4 @@ else fi echo \"Mapping_QC\",\"Ref_Cov_%\",\"Het-SNP#\" > "$MAPPING_QC_REPORT" -echo \"$MAPPING_QC\",\""$COVERAGE"\",\""$HET_SNP"\" >> "$MAPPING_QC_REPORT" +echo \""$MAPPING_QC"\",\""$COVERAGE"\",\""$HET_SNP"\" >> "$MAPPING_QC_REPORT" diff --git a/bin/read_qc.sh b/bin/read_qc.sh index 14d7519..a72c1d7 100755 --- a/bin/read_qc.sh +++ b/bin/read_qc.sh @@ -9,4 +9,4 @@ else fi echo \"Read_QC\",\"Bases\" > "$READ_QC_REPORT" -echo \"$READ_QC\",\""$BASES"\" >> "$READ_QC_REPORT" +echo \""$READ_QC"\",\""$BASES"\" >> "$READ_QC_REPORT" diff --git a/bin/taxonomy_qc.sh b/bin/taxonomy_qc.sh index 7528b1d..a867804 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/taxonomy_qc.sh @@ -13,4 +13,4 @@ else fi echo \"Taxonomy_QC\",\"S.Pneumo_%\" > "$TAXONOMY_QC_REPORT" -echo \"$TAXONOMY_QC\",\""$PERCENTAGE"\" >> "$TAXONOMY_QC_REPORT" +echo \""$TAXONOMY_QC"\",\""$PERCENTAGE"\" >> "$TAXONOMY_QC_REPORT" From 532509dc62daf5c1705a24e2d86dc971171df6e1 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:06:38 +0000 Subject: [PATCH 066/157] Improve chart Former-commit-id: 9d6ead6f62d07b58ca5e304f4fc45702d4a25b81 --- doc/workflow.drawio.svg | 238 ++++++++++++++++++++-------------------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg index 00766b6..873d1b7 100644 --- a/doc/workflow.drawio.svg +++ b/doc/workflow.drawio.svg @@ -1,23 +1,23 @@ - + - - + + Output - - + + Input - + - + @@ -32,12 +32,12 @@ - - - - + + + + - + @@ -57,14 +57,14 @@ - + - + - + - - + + FASTQ (Reads) @@ -72,36 +72,36 @@ - + FASTQ (Reads) - - - + + + S. 
Pneumo: > 60% - - - + + + Contigs: < 500 - + Length: 1.9 - 2.3 Mb - + Depth: ≥ 20x - + - - + + FASTA (Assemblies) @@ -109,16 +109,16 @@ - + FASTA (Assemblies) - + - - + + SAM @@ -126,25 +126,25 @@ - + SAM - - - + + + Ref Coverage: > 60% - + Het-SNP site: < 220 - + - - + + Results @@ -152,21 +152,21 @@ - + Results - - - - - - - - + + + + + + + + - + @@ -188,10 +188,10 @@ - - + + - + @@ -209,10 +209,10 @@ - - + + - + @@ -233,10 +233,10 @@ - - + + - + @@ -254,10 +254,10 @@ - - + + - + @@ -279,12 +279,12 @@ - - - - + + + + - + @@ -302,12 +302,12 @@ - - - - + + + + - + @@ -325,11 +325,11 @@ - - - + + + - + @@ -347,13 +347,13 @@ - - - - + + + + - - + + @@ -363,16 +363,16 @@ - + Over... - - - + + + - + @@ -390,11 +390,11 @@ - - - + + + - + @@ -412,11 +412,11 @@ - - - + + + - + @@ -435,9 +435,9 @@ - + - + @@ -455,30 +455,24 @@ - - - - - Go / No-go - - - - + + + Bases: ≥ 38 Mb - + Go / No-go - + - - + + - + @@ -497,9 +491,9 @@ - + - + @@ -517,11 +511,11 @@ - - - + + + - + @@ -537,11 +531,17 @@ - + QC values shown in the diagram are the default values + + + + Go / No-go + + From 119622d167e569163ef47083324e3c3d83984501 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:37:56 +0000 Subject: [PATCH 067/157] Update Nextflow executable to 23.04.2 Former-commit-id: d899ec958206749174162c4e5e82a1511980bb17 --- nextflow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow b/nextflow index a6ece4c..b7725ce 100755 --- a/nextflow +++ b/nextflow @@ -15,7 +15,7 @@ # limitations under the License. 
[[ "$NXF_DEBUG" == 'x' ]] && set -x -NXF_VER=${NXF_VER:-'23.04.1'} +NXF_VER=${NXF_VER:-'23.04.2'} NXF_ORG=${NXF_ORG:-'nextflow-io'} NXF_HOME=${NXF_HOME:-$HOME/.nextflow} NXF_PROT=${NXF_PROT:-'https'} From 5f0e3011e2ef2d74fc56a9dc02b0e81197e59a25 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:47:49 +0000 Subject: [PATCH 068/157] Improve wording of messages. Former-commit-id: a98510ee41c53e924b7fdd46949c1104d62b6bef --- modules/messages.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/messages.nf b/modules/messages.nf index 8e13998..7a9d189 100644 --- a/modules/messages.nf +++ b/modules/messages.nf @@ -46,7 +46,7 @@ void workflowSelectMessage(String selectedWorkflow) { switch (selectedWorkflow) { case 'pipeline': message = """ - |The main pipeline workflow was selected. + |The main pipeline workflow has been selected. | |Input Directory: ${readsDir.canonicalPath} |Output Directory: ${outputDir.canonicalPath} @@ -54,12 +54,12 @@ void workflowSelectMessage(String selectedWorkflow) { break case 'init': message = ''' - |The alternative workflow for initialisation was selected. + |The alternative workflow for initialisation has been selected. '''.stripMargin() break case 'version': message = ''' - |The alternative workflow for getting versions of pipeline, tools and databases was selected. + |The alternative workflow for getting versions of pipeline, tools and databases has been selected. 
'''.stripMargin() break } From 873ca1c426ab33303abc8b0a546b4ee9ab768c90 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 14:53:48 +0000 Subject: [PATCH 069/157] Improve shell scripts style Former-commit-id: fa3f58f26c148d373780e12886af10517c8335be --- bin/get_databases_info.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/get_databases_info.sh b/bin/get_databases_info.sh index 3d9dd98..10f2174 100755 --- a/bin/get_databases_info.sh +++ b/bin/get_databases_info.sh @@ -1,7 +1,7 @@ # Save received databases information into a JSON file add_bwa_db () { - BWA_DB_JSON=${BWA_DB_PATH}/${BWA_JSON} + BWA_DB_JSON="${BWA_DB_PATH}/${BWA_JSON}" if [ -f "$BWA_DB_JSON" ]; then REFERENCE=$(jq -r .reference "$BWA_DB_JSON") REFERENCE_MD5=$(jq -r .reference_md5 "$BWA_DB_JSON") @@ -15,7 +15,7 @@ add_bwa_db () { } add_ariba_db () { - ARIBA_DB_JSON=${ARIBA_DB_PATH}/${ARIBA_JSON} + ARIBA_DB_JSON="${ARIBA_DB_PATH}/${ARIBA_JSON}" if [ -f "$ARIBA_DB_JSON" ]; then REFERENCE=$(jq -r .reference "$ARIBA_DB_JSON") REFERENCE_MD5=$(jq -r .reference_md5 "$ARIBA_DB_JSON") @@ -33,7 +33,7 @@ add_ariba_db () { } add_seroba_db () { - SEROBA_DB_JSON=${SEROBA_DB_PATH}/${SEROBA_JSON} + SEROBA_DB_JSON="${SEROBA_DB_PATH}/${SEROBA_JSON}" if [ -f "$SEROBA_DB_JSON" ]; then GIT=$(jq -r .git "$SEROBA_DB_JSON") KMER=$(jq -r .kmer "$SEROBA_DB_JSON") @@ -47,7 +47,7 @@ add_seroba_db () { } add_url_db () { - DB_JSON=$1 + DB_JSON="$1" if [ -f "$DB_JSON" ]; then URL=$(jq -r .url "$DB_JSON") SAVE_TIME=$(jq -r .save_time "$DB_JSON") From ab20087a04365363751a15ed80f4811c5afba1a1 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Mon, 31 Jul 2023 17:29:22 +0000 Subject: [PATCH 070/157] Improve names & comments of processes & scripts Former-commit-id: 6c54b3e998ca66a1ba47bda6d8845edaed8e62ce --- ...e_ariba_db.sh => check-create_ariba_db.sh} | 2 +- ...b.sh => check-create_ref_genome_bwa_db.sh} 
| 2 +- ...en2_db.sh => check-download_kraken2_db.sh} | 0 ...unk_db.sh => check-download_poppunk_db.sh} | 0 ...=> check-download_poppunk_ext_clusters.sh} | 0 bin/{get_seroba_db.sh => check_seroba_db.sh} | 6 +- ...er_compose.sh => create_docker_compose.sh} | 2 +- bin/{assembly_qc.sh => get_assembly_qc.sh} | 0 bin/{mapping_qc.sh => get_mapping_qc.sh} | 0 bin/{overall_qc.sh => get_overall_qc.sh} | 0 bin/{read_qc.sh => get_read_qc.sh} | 0 bin/{taxonomy_qc.sh => get_taxonomy_qc.sh} | 2 +- ...esistance.py => parse_other_resistance.py} | 0 ..._resistance.sh => parse_pbp_resistance.sh} | 0 ...{combine_info.sh => save_combined_info.sh} | 0 ...tabases_info.sh => save_databases_info.sh} | 0 ...get_images_info.sh => save_images_info.sh} | 0 bin/{get_tools_info.sh => save_tools_info.sh} | 0 modules/amr.nf | 16 ++--- modules/assembly.nf | 6 +- modules/docker.nf | 2 +- modules/info.nf | 8 +-- modules/lineage.nf | 4 +- modules/mapping.nf | 6 +- modules/overall_qc.nf | 2 +- modules/preprocess.nf | 2 +- modules/serotype.nf | 8 +-- modules/taxonomy.nf | 4 +- workflows/init.nf | 14 ++-- workflows/pipeline.nf | 66 ++++++++++--------- 30 files changed, 76 insertions(+), 76 deletions(-) rename bin/{create_ariba_db.sh => check-create_ariba_db.sh} (95%) rename bin/{create_ref_genome_bwa_db.sh => check-create_ref_genome_bwa_db.sh} (92%) rename bin/{get_kraken2_db.sh => check-download_kraken2_db.sh} (100%) rename bin/{get_poppunk_db.sh => check-download_poppunk_db.sh} (100%) rename bin/{get_poppunk_ext_clusters.sh => check-download_poppunk_ext_clusters.sh} (100%) rename bin/{get_seroba_db.sh => check_seroba_db.sh} (63%) rename bin/{get_docker_compose.sh => create_docker_compose.sh} (95%) rename bin/{assembly_qc.sh => get_assembly_qc.sh} (100%) rename bin/{mapping_qc.sh => get_mapping_qc.sh} (100%) rename bin/{overall_qc.sh => get_overall_qc.sh} (100%) rename bin/{read_qc.sh => get_read_qc.sh} (100%) rename bin/{taxonomy_qc.sh => get_taxonomy_qc.sh} (95%) rename bin/{get_other_resistance.py => 
parse_other_resistance.py} (100%) rename bin/{get_pbp_resistance.sh => parse_pbp_resistance.sh} (100%) rename bin/{combine_info.sh => save_combined_info.sh} (100%) rename bin/{get_databases_info.sh => save_databases_info.sh} (100%) rename bin/{get_images_info.sh => save_images_info.sh} (100%) rename bin/{get_tools_info.sh => save_tools_info.sh} (100%) diff --git a/bin/create_ariba_db.sh b/bin/check-create_ariba_db.sh similarity index 95% rename from bin/create_ariba_db.sh rename to bin/check-create_ariba_db.sh index fb2b657..32ff767 100755 --- a/bin/create_ariba_db.sh +++ b/bin/check-create_ariba_db.sh @@ -1,4 +1,4 @@ -# Check if CREATE_ARIBA_DB has run successfully on the specific reference sequences and metadata. +# Check if ARIBA database was prepared from the specific reference sequences and metadata. # If not: remove the $OUTPUT directory, and prepare the ARIBA database from reference sequences and metadata, also save metadata to JSON REF_SEQUENCES_MD5=$(md5sum "$REF_SEQUENCES" | awk '{ print $1 }') diff --git a/bin/create_ref_genome_bwa_db.sh b/bin/check-create_ref_genome_bwa_db.sh similarity index 92% rename from bin/create_ref_genome_bwa_db.sh rename to bin/check-create_ref_genome_bwa_db.sh index 385b609..65a7da8 100755 --- a/bin/create_ref_genome_bwa_db.sh +++ b/bin/check-create_ref_genome_bwa_db.sh @@ -1,4 +1,4 @@ -# Check if CREATE_REF_GENOME_BWA_DB has run successfully on the specific reference. +# Check if BWA database was prepared from the specific reference. 
# If not: remove files in database directory, and construct the FM-index database of the reference genome for BWA, also save metadata to JSON REFERENCE_MD5=$(md5sum "$REFERENCE" | awk '{ print $1 }') diff --git a/bin/get_kraken2_db.sh b/bin/check-download_kraken2_db.sh similarity index 100% rename from bin/get_kraken2_db.sh rename to bin/check-download_kraken2_db.sh diff --git a/bin/get_poppunk_db.sh b/bin/check-download_poppunk_db.sh similarity index 100% rename from bin/get_poppunk_db.sh rename to bin/check-download_poppunk_db.sh diff --git a/bin/get_poppunk_ext_clusters.sh b/bin/check-download_poppunk_ext_clusters.sh similarity index 100% rename from bin/get_poppunk_ext_clusters.sh rename to bin/check-download_poppunk_ext_clusters.sh diff --git a/bin/get_seroba_db.sh b/bin/check_seroba_db.sh similarity index 63% rename from bin/get_seroba_db.sh rename to bin/check_seroba_db.sh index 0cda2fc..2e6ff2d 100755 --- a/bin/get_seroba_db.sh +++ b/bin/check_seroba_db.sh @@ -1,7 +1,5 @@ -# Return boolean of CREATE_DB, download if necessary - -# Check if GET_SEROBA_DB and CREATE_SEROBA_DB has run successfully on the database at the specific link, CREATE_SEROBA_DB used the specific Kmerm and pull to check if SeroBA database is up-to-date. 
-# If outdated or does not exist: remove files in database directory and clone, set CREATE_DB to true +# Check if database was cloned from specific link and is up-to-date, also prepared by the specific Kmer +# If not: remove files in database directory and clone, set CREATE_DB to true # Assume up-to-date if JSON passes checks and the host cannot be resolved to allow offline usage diff --git a/bin/get_docker_compose.sh b/bin/create_docker_compose.sh similarity index 95% rename from bin/get_docker_compose.sh rename to bin/create_docker_compose.sh index 5f8ff8b..d6fc3ba 100755 --- a/bin/get_docker_compose.sh +++ b/bin/create_docker_compose.sh @@ -1,4 +1,4 @@ -# Generate a Docker compose file that includes all images used in nextflow.config +# Generate a Docker compose file that includes all images used in $NEXTFLOW_CONFIG COUNT=0 diff --git a/bin/assembly_qc.sh b/bin/get_assembly_qc.sh similarity index 100% rename from bin/assembly_qc.sh rename to bin/get_assembly_qc.sh diff --git a/bin/mapping_qc.sh b/bin/get_mapping_qc.sh similarity index 100% rename from bin/mapping_qc.sh rename to bin/get_mapping_qc.sh diff --git a/bin/overall_qc.sh b/bin/get_overall_qc.sh similarity index 100% rename from bin/overall_qc.sh rename to bin/get_overall_qc.sh diff --git a/bin/read_qc.sh b/bin/get_read_qc.sh similarity index 100% rename from bin/read_qc.sh rename to bin/get_read_qc.sh diff --git a/bin/taxonomy_qc.sh b/bin/get_taxonomy_qc.sh similarity index 95% rename from bin/taxonomy_qc.sh rename to bin/get_taxonomy_qc.sh index a867804..cb1e382 100755 --- a/bin/taxonomy_qc.sh +++ b/bin/get_taxonomy_qc.sh @@ -1,4 +1,4 @@ -# Extract taxonomy QC information and determine QC result based on kraken2_report.txt +# Extract taxonomy QC information and determine QC result based on $KRAKEN2_REPORT PERCENTAGE=$(awk -F"\t" '$4 ~ /^S$/ && $6 ~ /Streptococcus pneumoniae$/ { gsub(/^[ \t]+/, "", $1); printf "%.2f", $1 }' "$KRAKEN2_REPORT") diff --git a/bin/get_other_resistance.py 
b/bin/parse_other_resistance.py similarity index 100% rename from bin/get_other_resistance.py rename to bin/parse_other_resistance.py diff --git a/bin/get_pbp_resistance.sh b/bin/parse_pbp_resistance.sh similarity index 100% rename from bin/get_pbp_resistance.sh rename to bin/parse_pbp_resistance.sh diff --git a/bin/combine_info.sh b/bin/save_combined_info.sh similarity index 100% rename from bin/combine_info.sh rename to bin/save_combined_info.sh diff --git a/bin/get_databases_info.sh b/bin/save_databases_info.sh similarity index 100% rename from bin/get_databases_info.sh rename to bin/save_databases_info.sh diff --git a/bin/get_images_info.sh b/bin/save_images_info.sh similarity index 100% rename from bin/get_images_info.sh rename to bin/save_images_info.sh diff --git a/bin/get_tools_info.sh b/bin/save_tools_info.sh similarity index 100% rename from bin/get_tools_info.sh rename to bin/save_tools_info.sh diff --git a/modules/amr.nf b/modules/amr.nf index 1fd57f4..d7a7206 100644 --- a/modules/amr.nf +++ b/modules/amr.nf @@ -19,7 +19,7 @@ process PBP_RESISTANCE { } // Extract the results from the output file of the PBP AMR predictor -process GET_PBP_RESISTANCE { +process PARSE_PBP_RESISTANCE { label 'bash_container' label 'farm_low' @@ -37,12 +37,12 @@ process GET_PBP_RESISTANCE { JSON_FILE="$json" PBP_AMR_REPORT="$pbp_amr_report" - source get_pbp_resistance.sh + source parse_pbp_resistance.sh """ } -// Create ARIBA database and return database path -process CREATE_ARIBA_DB { +// Return database path, create if necessary +process GET_ARIBA_DB { label 'ariba_container' label 'farm_low' @@ -65,7 +65,7 @@ process CREATE_ARIBA_DB { OUTPUT="$output" JSON_FILE="$json" - source create_ariba_db.sh + source check-create_ariba_db.sh """ } @@ -88,12 +88,12 @@ process OTHER_RESISTANCE { report='result/report.tsv' report_debug='result/debug.report.tsv' """ - ariba run --nucmer_min_id 80 --assembled_threshold 0.80 $ariba_database/$database $read1 $read2 result + ariba run 
--nucmer_min_id 80 --assembled_threshold 0.80 "$ariba_database/$database" "$read1" "$read2" result """ } // Extracting resistance information from ARIBA report -process GET_OTHER_RESISTANCE { +process PARSE_OTHER_RESISTANCE { label 'python_container' label 'farm_low' @@ -109,6 +109,6 @@ process GET_OTHER_RESISTANCE { script: output_file="other_amr_report.csv" """ - get_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file" + parse_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file" """ } diff --git a/modules/assembly.nf b/modules/assembly.nf index fd84c76..ab66b6e 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -1,5 +1,5 @@ // Run Unicycler to get assembly -// Return sample_id and assembly, and hardlink the assembly to ${params.output}/assemblies directory +// Return sample_id and assembly, and publish the assembly to ${params.output}/assemblies directory based on ${params.assembly_publish} process ASSEMBLY_UNICYCLER { label 'unicycler_container' label 'farm_high_fallible' @@ -26,7 +26,7 @@ process ASSEMBLY_UNICYCLER { } // Run Shovill to get assembly -// Return sample_id and assembly, and hardlink the assembly to ${params.output}/assemblies directory +// Return sample_id and assembly, and publish the assembly to ${params.output}/assemblies directory based on ${params.assembly_publish} process ASSEMBLY_SHOVILL { label 'shovill_container' label 'farm_high_fallible' @@ -99,6 +99,6 @@ process ASSEMBLY_QC { QC_DEPTH="$qc_depth" ASSEMBLY_QC_REPORT="$assembly_qc_report" - source assembly_qc.sh + source get_assembly_qc.sh """ } diff --git a/modules/docker.nf b/modules/docker.nf index 4090957..ef0236b 100644 --- a/modules/docker.nf +++ b/modules/docker.nf @@ -15,7 +15,7 @@ process GET_DOCKER_COMPOSE { NEXTFLOW_CONFIG="$nextflowConfig" COMPOSE="$compose" - source get_docker_compose.sh + source create_docker_compose.sh """ } diff --git a/modules/info.nf b/modules/info.nf index cde8662..a317a3b 100644 --- a/modules/info.nf 
+++ b/modules/info.nf @@ -18,7 +18,7 @@ process IMAGES { NEXTFLOW_CONFIG="$nextflowConfig" JSON_FILE="$json" - source get_images_info.sh + source save_images_info.sh """ } @@ -59,7 +59,7 @@ process DATABASES { POPPUNK_EXT_JSON="$poppunk_ext_json" JSON_FILE="$json" - source get_databases_info.sh + source save_databases_info.sh """ } @@ -106,7 +106,7 @@ process TOOLS { ARIBA_VERSION="$ariba_version" JSON_FILE="$json" - source get_tools_info.sh + source save_tools_info.sh """ } @@ -135,7 +135,7 @@ process COMBINE_INFO { TOOLS="$tools" JSON_FILE="$json" - source combine_info.sh + source save_combined_info.sh """ } diff --git a/modules/lineage.nf b/modules/lineage.nf index 68edae3..9090f02 100644 --- a/modules/lineage.nf +++ b/modules/lineage.nf @@ -18,7 +18,7 @@ process GET_POPPUNK_DB { DB_LOCAL="$local" JSON_FILE="$json" - source get_poppunk_db.sh + source check-download_poppunk_db.sh """ } @@ -41,7 +41,7 @@ process GET_POPPUNK_EXT_CLUSTERS { EXT_CLUSTERS_LOCAL="$local" JSON_FILE="$json" - source get_poppunk_ext_clusters.sh + source check-download_poppunk_ext_clusters.sh """ } diff --git a/modules/mapping.nf b/modules/mapping.nf index d545607..e5d7c4e 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -1,5 +1,5 @@ // Return database path and prefix, construct if necessary -process CREATE_REF_GENOME_BWA_DB { +process GET_REF_GENOME_BWA_DB { label 'bwa_container' label 'farm_mid' @@ -20,7 +20,7 @@ process CREATE_REF_GENOME_BWA_DB { PREFIX="$prefix" JSON_FILE="$json" - source create_ref_genome_bwa_db.sh + source check-create_ref_genome_bwa_db.sh """ } @@ -152,6 +152,6 @@ process MAPPING_QC { QC_HET_SNP_SITE="$qc_het_snp_site" MAPPING_QC_REPORT="$mapping_qc_report" - source mapping_qc.sh + source get_mapping_qc.sh """ } diff --git a/modules/overall_qc.nf b/modules/overall_qc.nf index fa639d9..31c8cca 100644 --- a/modules/overall_qc.nf +++ b/modules/overall_qc.nf @@ -21,6 +21,6 @@ process OVERALL_QC { TAXONOMY_QC="$taxonomy_qc" 
OVERALL_QC_REPORT="$overall_qc_report" - source overall_qc.sh + source get_overall_qc.sh """ } diff --git a/modules/preprocess.nf b/modules/preprocess.nf index e04b756..e87ef99 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -48,6 +48,6 @@ process READ_QC { QC_DEPTH="$qc_depth" READ_QC_REPORT="$read_qc_report" - source read_qc.sh + source get_read_qc.sh """ } diff --git a/modules/serotype.nf b/modules/serotype.nf index 0c69bad..02327dd 100644 --- a/modules/serotype.nf +++ b/modules/serotype.nf @@ -1,5 +1,5 @@ -// Return boolean of CREATE_DB, download if necessary -process GET_SEROBA_DB { +// Return boolean of CREATE_DB, remove and clone if necessary +process CHECK_SEROBA_DB { label 'git_container' label 'farm_low' @@ -19,12 +19,12 @@ process GET_SEROBA_DB { KMER="$kmer" JSON_FILE="$json" - source get_seroba_db.sh + source check_seroba_db.sh """ } // Return SeroBA databases path, create databases if necessary -process CREATE_SEROBA_DB { +process GET_SEROBA_DB { label 'seroba_container' label 'farm_low' diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index b4d1e62..34ebeab 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -17,7 +17,7 @@ process GET_KRAKEN2_DB { DB_LOCAL="$local" JSON_FILE="$json" - source get_kraken2_db.sh + source check-download_kraken2_db.sh """ } @@ -73,6 +73,6 @@ process TAXONOMY_QC { QC_SPNEUMO_PERCENTAGE="$qc_spneumo_percentage" TAXONOMY_QC_REPORT="$taxonomy_qc_report" - source taxonomy_qc.sh + source get_taxonomy_qc.sh """ } diff --git a/workflows/init.nf b/workflows/init.nf index 64a748f..20eff25 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -1,25 +1,25 @@ // Import process modules -include { CREATE_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" +include { GET_REF_GENOME_BWA_DB } from "$projectDir/modules/mapping" include { GET_KRAKEN2_DB } from "$projectDir/modules/taxonomy" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS } from "$projectDir/modules/lineage" -include { 
GET_SEROBA_DB; CREATE_SEROBA_DB } from "$projectDir/modules/serotype" +include { CHECK_SEROBA_DB; GET_SEROBA_DB } from "$projectDir/modules/serotype" include { GET_DOCKER_COMPOSE; PULL_IMAGES } from "$projectDir/modules/docker" -include { CREATE_ARIBA_DB } from "$projectDir/modules/amr" +include { GET_ARIBA_DB } from "$projectDir/modules/amr" // Alternative workflow for initialisation only workflow INIT { // Check Reference Genome BWA Database, generate from assembly if necessary - CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + GET_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Check ARIBA database, generate from reference sequences and metadata if ncessary - CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) + GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) // Check Kraken2 Database, download if necessary GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Check SeroBA Databases, clone and rebuild if necessary - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - CREATE_SEROBA_DB(params.seroba_remote, params.seroba_local, GET_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Check to PopPUNK Database and External Clusters, download if necessary GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index fd288c9..01f3172 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -1,34 +1,34 @@ // Import process modules include { PREPROCESS; READ_QC } from "$projectDir/modules/preprocess" include { ASSEMBLY_UNICYCLER; ASSEMBLY_SHOVILL; ASSEMBLY_ASSESS; ASSEMBLY_QC } from "$projectDir/modules/assembly" -include { CREATE_REF_GENOME_BWA_DB; MAPPING; 
SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" +include { GET_REF_GENOME_BWA_DB; MAPPING; SAM_TO_SORTED_BAM; SNP_CALL; HET_SNP_COUNT; MAPPING_QC } from "$projectDir/modules/mapping" include { GET_KRAKEN2_DB; TAXONOMY; TAXONOMY_QC } from "$projectDir/modules/taxonomy" include { OVERALL_QC } from "$projectDir/modules/overall_qc" include { GET_POPPUNK_DB; GET_POPPUNK_EXT_CLUSTERS; LINEAGE } from "$projectDir/modules/lineage" -include { GET_SEROBA_DB; CREATE_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" +include { CHECK_SEROBA_DB; GET_SEROBA_DB; SEROTYPE } from "$projectDir/modules/serotype" include { MLST } from "$projectDir/modules/mlst" -include { PBP_RESISTANCE; GET_PBP_RESISTANCE; CREATE_ARIBA_DB; OTHER_RESISTANCE; GET_OTHER_RESISTANCE } from "$projectDir/modules/amr" +include { PBP_RESISTANCE; PARSE_PBP_RESISTANCE; GET_ARIBA_DB; OTHER_RESISTANCE; PARSE_OTHER_RESISTANCE } from "$projectDir/modules/amr" include { GENERATE_SAMPLE_REPORT; GENERATE_OVERALL_REPORT } from "$projectDir/modules/output" // Main pipeline workflow workflow PIPELINE { main: // Get path and prefix of Reference Genome BWA Database, generate from assembly if necessary - CREATE_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) + GET_REF_GENOME_BWA_DB(params.ref_genome, params.ref_genome_bwa_db_local) // Get path to Kraken2 Database, download if necessary GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Get path to SeroBA Databases, clone and rebuild if necessary - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - CREATE_SEROBA_DB(params.seroba_remote, params.seroba_local, GET_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Get paths to PopPUNK Database and External Clusters, download if 
necessary GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) // Get path to ARIBA database, generate from reference sequences and metadata if ncessary - CREATE_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) + GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) // Get read pairs into Channel raw_read_pairs_ch raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) @@ -38,7 +38,7 @@ workflow PIPELINE { PREPROCESS(raw_read_pairs_ch) // From Channel PREPROCESS.out.json, provide Read QC status - // Output into Channel READ_QC_PASSED_READS_ch + // Output into Channels READ_QC.out.bases, READ_QC.out.result, READ_QC.out.report READ_QC(PREPROCESS.out.json, params.length_low, params.depth) // From Channel PREPROCESS.out.processed_reads, only output reads of samples passed Read QC based on Channel READ_QC.out.result @@ -47,7 +47,7 @@ workflow PIPELINE { .map { it[0, 2..-1] } // From Channel READ_QC_PASSED_READS_ch, assemble the preprocess read pairs - // Output into Channel ASSEMBLY_ch, and hardlink the assemblies to $params.output directory + // Output into Channel ASSEMBLY_ch, and hardlink (default) the assemblies to $params.output directory switch (params.assembler) { case 'shovill': ASSEMBLY_ch = ASSEMBLY_SHOVILL(READ_QC_PASSED_READS_ch, params.min_contig_length) @@ -59,10 +59,11 @@ workflow PIPELINE { } // From Channel ASSEMBLY_ch, assess assembly quality + // Output into Channel ASSEMBLY_ASSESS.out.report ASSEMBLY_ASSESS(ASSEMBLY_ch) // From Channel ASSEMBLY_ASSESS.out.report and Channel READ_QC.out.bases, provide Assembly QC status - // Output into Channels ASSEMBLY_QC.out.detailed_result & ASSEMBLY_QC.out.result + // Output into Channels ASSEMBLY_QC.out.result & ASSEMBLY_QC.out.report ASSEMBLY_QC( ASSEMBLY_ASSESS.out.report .join(READ_QC.out.bases, failOnDuplicate: true), @@ 
-74,7 +75,7 @@ workflow PIPELINE { // From Channel READ_QC_PASSED_READS_ch map reads to reference // Output into Channel MAPPING.out.sam - MAPPING(CREATE_REF_GENOME_BWA_DB.out.path, CREATE_REF_GENOME_BWA_DB.out.prefix, READ_QC_PASSED_READS_ch) + MAPPING(GET_REF_GENOME_BWA_DB.out.path, GET_REF_GENOME_BWA_DB.out.prefix, READ_QC_PASSED_READS_ch) // From Channel MAPPING.out.sam, Convert SAM into sorted BAM and calculate reference coverage // Output into Channels SAM_TO_SORTED_BAM.out.bam and SAM_TO_SORTED_BAM.out.ref_coverage @@ -82,10 +83,11 @@ workflow PIPELINE { // From Channel SAM_TO_SORTED_BAM.out.bam calculates non-cluster Het-SNP site count // Output into Channel HET_SNP_COUNT.out.result - SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.bam, params.lite) | HET_SNP_COUNT + SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.bam, params.lite) + HET_SNP_COUNT(SNP_CALL.out.vcf) // Merge Channels SAM_TO_SORTED_BAM.out.ref_coverage & HET_SNP_COUNT.out.result to provide Mapping QC Status - // Output into Channels MAPPING_QC.out.detailed_result & MAPPING_QC.out.result + // Output into Channels MAPPING_QC.out.result & MAPPING_QC.out.report MAPPING_QC( SAM_TO_SORTED_BAM.out.ref_coverage .join(HET_SNP_COUNT.out.result, failOnDuplicate: true, failOnMismatch: true), @@ -94,15 +96,15 @@ workflow PIPELINE { ) // From Channel READ_QC_PASSED_READS_ch assess Streptococcus pneumoniae percentage in reads - // Output into Channels TAXONOMY.out.detailed_result & TAXONOMY.out.result report + // Output into Channel TAXONOMY.out.report TAXONOMY(GET_KRAKEN2_DB.out.path, params.kraken2_memory_mapping, READ_QC_PASSED_READS_ch) // From Channel TAXONOMY.out.report, provide taxonomy QC status - // Output into Channels TAXONOMY_QC.out.detailed_result & TAXONOMY_QC.out.result report + // Output into Channels TAXONOMY_QC.out.result & TAXONOMY_QC.out.report TAXONOMY_QC(TAXONOMY.out.report, params.spneumo_percentage) - // Merge Channels ASSEMBLY_QC.out.result & MAPPING_QC.out.result & 
TAXONOMY_QC.out.result to provide Overall QC Status - // Output into Channel OVERALL_QC.out.result + // Merge Channels AREAD_QC.out.result & SSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status + // Output into Channel OVERALL_QC.out.result & OVERALL_QC.out.report OVERALL_QC( READ_QC.out.result .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) @@ -121,31 +123,31 @@ workflow PIPELINE { .map { it[0, 2..-1] } // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, generate PopPUNK query file containing assemblies of samples passed overall QC - // Output into POPPUNK_QFILE POPPUNK_QFILE = OVERALL_QC_PASSED_ASSEMBLIES_ch .map { it.join'\t' } .collectFile(name: 'qfile.txt', newLine: true) // From generated POPPUNK_QFILE, assign GPSC to samples passed overall QC + // Output into Channel LINEAGE.out.reports (multiple reports from a single process) LINEAGE(GET_POPPUNK_DB.out.path, GET_POPPUNK_DB.out.database, GET_POPPUNK_EXT_CLUSTERS.out.file, POPPUNK_QFILE) // From Channel OVERALL_QC_PASSED_READS_ch, serotype the preprocess reads of samples passed overall QC - // Output into Channel SEROTYPE.out.result - SEROTYPE(CREATE_SEROBA_DB.out.path, CREATE_SEROBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) + // Output into Channel SEROTYPE.out.report + SEROTYPE(GET_SEROBA_DB.out.path, GET_SEROBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, PubMLST typing the assemblies of samples passed overall QC - // Output into Channel MLST.out.result + // Output into Channel MLST.out.report MLST(OVERALL_QC_PASSED_ASSEMBLIES_ch) // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, assign PBP genes and estimate MIC (minimum inhibitory concentration) for 6 Beta-lactam antibiotics - // Output into Channel GET_PBP_RESISTANCE.out.result + // Output into Channel PARSE_PBP_RESISTANCE.out.report PBP_RESISTANCE(OVERALL_QC_PASSED_ASSEMBLIES_ch) - GET_PBP_RESISTANCE(PBP_RESISTANCE.out.json) + 
PARSE_PBP_RESISTANCE(PBP_RESISTANCE.out.json) - // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance (also determinants if any) of other antimicrobials - // Output into Channel GET_OTHER_RESISTANCE.out.result - OTHER_RESISTANCE(CREATE_ARIBA_DB.out.path, CREATE_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) - GET_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) + // From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance and determinants of other antimicrobials + // Output into Channel PARSE_OTHER_RESISTANCE.out.result + OTHER_RESISTANCE(GET_ARIBA_DB.out.path, GET_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch) + PARSE_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata) // Generate sample reports by merging outputs from all result-generating modules GENERATE_SAMPLE_REPORT( @@ -156,8 +158,8 @@ workflow PIPELINE { .join(OVERALL_QC.out.report, failOnDuplicate: true, remainder: true) .join(SEROTYPE.out.report, failOnDuplicate: true, remainder: true) .join(MLST.out.report, failOnDuplicate: true, remainder: true) - .join(GET_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) - .join(GET_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) + .join(PARSE_PBP_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) + .join(PARSE_OTHER_RESISTANCE.out.report, failOnDuplicate: true, remainder: true) .join(LINEAGE.out.reports.flatten().map { [it.name.take(it.name.lastIndexOf('.')), it] }, failOnDuplicate: true, remainder: true) // Turn reports list into channel, and map back Sample_ID based on output file name .map { [it[0], it[1..-1].minus(null)] } // Map Sample_ID to index 0 and all reports (with null entries removed) as a list to index 1 ) @@ -166,10 +168,10 @@ workflow PIPELINE { GENERATE_OVERALL_REPORT(GENERATE_SAMPLE_REPORT.out.report.collect(), params.ariba_metadata) // Pass databases information to SAVE_INFO sub-workflow - DATABASES_INFO = 
CREATE_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } - .merge(CREATE_ARIBA_DB.out.path.map { [["ariba_db_path", it]] }) + DATABASES_INFO = GET_REF_GENOME_BWA_DB.out.path.map { [["bwa_db_path", it]] } + .merge(GET_ARIBA_DB.out.path.map { [["ariba_db_path", it]] }) .merge(GET_KRAKEN2_DB.out.path.map { [["kraken2_db_path", it]] }) - .merge(CREATE_SEROBA_DB.out.path.map { [["seroba_db_path", it]] }) + .merge(GET_SEROBA_DB.out.path.map { [["seroba_db_path", it]] }) .merge(GET_POPPUNK_DB.out.path.map { [["poppunk_db_path", it]] }) .merge(GET_POPPUNK_EXT_CLUSTERS.out.file.map { [["poppunk_ext_file", it]] }) // Save key-value tuples into a map From 9157a20cf7e1264e313d658b2f0725701bd3a28a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:10:13 +0000 Subject: [PATCH 071/157] Extract scripts to separated files Former-commit-id: 76afb5731cc9171b4528393a9155998a7dde0e3d --- bin/call_snp.sh | 8 ++++++++ bin/convert_sam_to_sorted_bam.sh | 11 +++++++++++ modules/mapping.nf | 31 ++++++++++++++----------------- workflows/pipeline.nf | 6 +++--- 4 files changed, 36 insertions(+), 20 deletions(-) create mode 100755 bin/call_snp.sh create mode 100755 bin/convert_sam_to_sorted_bam.sh diff --git a/bin/call_snp.sh b/bin/call_snp.sh new file mode 100755 index 0000000..d3fba63 --- /dev/null +++ b/bin/call_snp.sh @@ -0,0 +1,8 @@ +# Call SNPs and save to .vcf +# Remove source sorted BAM file if $LITE is true + +bcftools mpileup --threads "$(nproc)" -f "$REFERENCE" "$SORTED_BAM" | bcftools call --threads "$(nproc)" -mv -O v -o "$VCF" + +if [ "$LITE" = true ]; then + rm "$(readlink -f "$SORTED_BAM")" +fi diff --git a/bin/convert_sam_to_sorted_bam.sh b/bin/convert_sam_to_sorted_bam.sh new file mode 100755 index 0000000..c730a5f --- /dev/null +++ b/bin/convert_sam_to_sorted_bam.sh @@ -0,0 +1,11 @@ +# Convet SAM to sorted BAM file +# Remove source SAM file if $LITE is true + +samtools view -@ "$(nproc)" -b "$SAM" > "$BAM" + 
+samtools sort -@ "$(nproc)" -o "$SORTED_BAM" "$BAM" +rm "$BAM" + +if [ "$LITE" = true ]; then + rm "$(readlink -f "$SAM")" +fi diff --git a/modules/mapping.nf b/modules/mapping.nf index e5d7c4e..964a415 100644 --- a/modules/mapping.nf +++ b/modules/mapping.nf @@ -43,7 +43,7 @@ process MAPPING { script: sam="${sample_id}_mapped.sam" """ - bwa mem -t `nproc` "${bwa_ref_db_dir}/${prefix}" <(zcat -f -- < "$read1") <(zcat -f -- < "$read2") > "$sam" + bwa mem -t "`nproc`" "${bwa_ref_db_dir}/${prefix}" <(zcat -f -- < "$read1") <(zcat -f -- < "$read2") > "$sam" """ } @@ -60,22 +60,18 @@ process SAM_TO_SORTED_BAM { val lite output: - tuple val(sample_id), path(bam), emit: bam + tuple val(sample_id), path(sorted_bam), emit: sorted_bam tuple val(sample_id), env(COVERAGE), emit: ref_coverage script: - bam="${sample_id}_mapped_sorted.bam" + sorted_bam="${sample_id}_mapped_sorted.bam" """ - samtools view -@ `nproc` -b "$sam" > mapped.bam + SAM="$sam" + BAM="mapped.bam" + SORTED_BAM="$sorted_bam" + LITE="$lite" - samtools sort -@ `nproc` -o "$bam" mapped.bam - rm mapped.bam - - if [ $lite = true ]; then - rm `readlink -f "$sam"` - fi - - BAM="$bam" + source convert_sam_to_sorted_bam.sh source get_ref_coverage.sh """ } @@ -89,7 +85,7 @@ process SNP_CALL { input: path reference - tuple val(sample_id), path(bam) + tuple val(sample_id), path(sorted_bam) val lite output: @@ -98,11 +94,12 @@ process SNP_CALL { script: vcf="${sample_id}.vcf" """ - bcftools mpileup --threads `nproc` -f "$reference" "$bam" | bcftools call --threads `nproc` -mv -O v -o "$vcf" + REFERENCE="$reference" + SORTED_BAM="$sorted_bam" + VCF="$vcf" + LITE="$lite" - if [ $lite = true ]; then - rm `readlink -f "$bam"` - fi + source call_snp.sh """ } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 01f3172..51a93f9 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -78,12 +78,12 @@ workflow PIPELINE { MAPPING(GET_REF_GENOME_BWA_DB.out.path, GET_REF_GENOME_BWA_DB.out.prefix, 
READ_QC_PASSED_READS_ch) // From Channel MAPPING.out.sam, Convert SAM into sorted BAM and calculate reference coverage - // Output into Channels SAM_TO_SORTED_BAM.out.bam and SAM_TO_SORTED_BAM.out.ref_coverage + // Output into Channels SAM_TO_SORTED_BAM.out.sorted_bam and SAM_TO_SORTED_BAM.out.ref_coverage SAM_TO_SORTED_BAM(MAPPING.out.sam, params.lite) - // From Channel SAM_TO_SORTED_BAM.out.bam calculates non-cluster Het-SNP site count + // From Channel SAM_TO_SORTED_BAM.out.sorted_bam calculates non-cluster Het-SNP site count // Output into Channel HET_SNP_COUNT.out.result - SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.bam, params.lite) + SNP_CALL(params.ref_genome, SAM_TO_SORTED_BAM.out.sorted_bam, params.lite) HET_SNP_COUNT(SNP_CALL.out.vcf) // Merge Channels SAM_TO_SORTED_BAM.out.ref_coverage & HET_SNP_COUNT.out.result to provide Mapping QC Status From 7b024a18dce9953431865a17ebef2a391828efd6 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:11:11 +0000 Subject: [PATCH 072/157] Improve shell scripts style Former-commit-id: 2e625f5e2ee984c2dbd4966f9eedb538d3f27b96 --- bin/get_ref_coverage.sh | 4 ++-- modules/assembly.nf | 4 ++-- modules/preprocess.nf | 2 +- modules/taxonomy.nf | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/get_ref_coverage.sh b/bin/get_ref_coverage.sh index 69a131b..199c481 100755 --- a/bin/get_ref_coverage.sh +++ b/bin/get_ref_coverage.sh @@ -1,4 +1,4 @@ # Return reference coverage percentage by the reads -samtools index -@ `nproc` "$BAM" -COVERAGE=$(samtools coverage "$BAM" | awk -F'\t' 'FNR==2 {print $6}') +samtools index -@ "$(nproc)" "$SORTED_BAM" +COVERAGE=$(samtools coverage "$SORTED_BAM" | awk -F'\t' 'FNR==2 {print $6}') diff --git a/modules/assembly.nf b/modules/assembly.nf index ab66b6e..6bf7b79 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -20,7 +20,7 @@ process ASSEMBLY_UNICYCLER { script: 
fasta="${sample_id}.contigs.fasta" """ - unicycler -1 "$read1" -2 "$read2" -s "$unpaired" -o results -t `nproc` --min_fasta_length "$min_contig_length" + unicycler -1 "$read1" -2 "$read2" -s "$unpaired" -o results -t "`nproc`" --min_fasta_length "$min_contig_length" mv results/assembly.fasta "${fasta}" """ } @@ -47,7 +47,7 @@ process ASSEMBLY_SHOVILL { script: fasta="${sample_id}.contigs.fasta" """ - shovill --R1 "$read1" --R2 "$read2" --outdir results --cpus `nproc` --minlen "$min_contig_length" --force + shovill --R1 "$read1" --R2 "$read2" --outdir results --cpus "`nproc`" --minlen "$min_contig_length" --force mv results/contigs.fa "${fasta}" """ } diff --git a/modules/preprocess.nf b/modules/preprocess.nf index e87ef99..1e89da7 100644 --- a/modules/preprocess.nf +++ b/modules/preprocess.nf @@ -19,7 +19,7 @@ process PREPROCESS { processed_two="processed-${sample_id}_2.fastq.gz" processed_unpaired="processed-${sample_id}_unpaired.fastq.gz" """ - fastp --thread `nproc` --in1 "$read_one" --in2 "$read_two" --out1 "$processed_one" --out2 "$processed_two" --unpaired1 "$processed_unpaired" --unpaired2 "$processed_unpaired" + fastp --thread "`nproc`" --in1 "$read_one" --in2 "$read_two" --out1 "$processed_one" --out2 "$processed_two" --unpaired1 "$processed_unpaired" --unpaired2 "$processed_unpaired" """ } diff --git a/modules/taxonomy.nf b/modules/taxonomy.nf index 34ebeab..735b59d 100644 --- a/modules/taxonomy.nf +++ b/modules/taxonomy.nf @@ -41,11 +41,11 @@ process TAXONOMY { if (kraken2_memory_mapping === true) """ - kraken2 --threads `nproc` --use-names --memory-mapping --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - + kraken2 --threads "`nproc`" --use-names --memory-mapping --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - """ else if (kraken2_memory_mapping === false) """ - kraken2 --threads `nproc` --use-names --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - + kraken2 --threads 
"`nproc`" --use-names --db "$kraken2_db" --paired "$read1" "$read2" --report "$report" --output - """ else error "The value for --kraken2_memory_mapping is not valid." From 2bc27c7e2a81bd9a501603ad0296e30029ff4af5 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:02:31 +0000 Subject: [PATCH 073/157] Improve option names consistency Former-commit-id: 0c70390e8657999edfb717a9a94df8fbfb8806dc --- modules/validate.nf | 6 +++--- nextflow.config | 6 +++--- workflows/info_and_version.nf | 4 ++-- workflows/init.nf | 8 ++++---- workflows/pipeline.nf | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/modules/validate.nf b/modules/validate.nf index 4cf3438..f2bceb0 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -8,8 +8,8 @@ validParams = [ assembler: 'assembler', min_contig_length: 'int', assembly_publish: 'publish_mode', - seroba_remote: 'url_git', - seroba_local: 'path', + seroba_db_remote: 'url_git', + seroba_db_local: 'path', seroba_kmer: 'int', kraken2_db_remote: 'url_targz', kraken2_db_local: 'path', @@ -18,7 +18,7 @@ validParams = [ ref_genome_bwa_db_local: 'path', poppunk_db_remote: 'url_targz', poppunk_ext_remote: 'url_csv', - poppunk_local: 'path', + poppunk_db_local: 'path', spneumo_percentage: 'int_float', ref_coverage: 'int_float', het_snp_site: 'int', diff --git a/nextflow.config b/nextflow.config index 1a322f1..6048efb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -20,8 +20,8 @@ params { assembly_publish = "link" // Default git repository and local directory, and KMC kmer size for SeroBA - seroba_remote = "https://github.com/sanger-pathogens/seroba.git" - seroba_local = "$projectDir/databases/seroba" + seroba_db_remote = "https://github.com/sanger-pathogens/seroba.git" + seroba_db_local = "$projectDir/databases/seroba" seroba_kmer = 71 // Default link and local directory for Kraken2 Database, and usage of memory mapping @@ -36,7 +36,7 @@ params { // 
Default links for PopPUNK Database and External Clusters, and local directory for both poppunk_db_remote = "https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz" poppunk_ext_remote = "https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv" - poppunk_local = "$projectDir/databases/poppunk" + poppunk_db_local = "$projectDir/databases/poppunk" // Default values for QC spneumo_percentage = 60.00 diff --git a/workflows/info_and_version.nf b/workflows/info_and_version.nf index bb5ce37..186dadf 100644 --- a/workflows/info_and_version.nf +++ b/workflows/info_and_version.nf @@ -10,8 +10,8 @@ workflow PRINT_VERSION { params.ref_genome_bwa_db_local, params.ariba_db_local, params.kraken2_db_local, - params.seroba_local, - params.poppunk_local, + params.seroba_db_local, + params.poppunk_db_local, pipeline_version ) \ | PARSE \ diff --git a/workflows/init.nf b/workflows/init.nf index 20eff25..7d1ed77 100644 --- a/workflows/init.nf +++ b/workflows/init.nf @@ -18,12 +18,12 @@ workflow INIT { GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Check SeroBA Databases, clone and rebuild if necessary - CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Check to PopPUNK Database and External Clusters, download if necessary - GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) - GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) + GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_db_local) + GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_db_local) // Pull all Docker images mentioned in nextflow.config if using Docker if (workflow.containerEngine === 'docker') { diff --git 
a/workflows/pipeline.nf b/workflows/pipeline.nf index 51a93f9..e39599f 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -20,12 +20,12 @@ workflow PIPELINE { GET_KRAKEN2_DB(params.kraken2_db_remote, params.kraken2_db_local) // Get path to SeroBA Databases, clone and rebuild if necessary - CHECK_SEROBA_DB(params.seroba_remote, params.seroba_local, params.seroba_kmer) - GET_SEROBA_DB(params.seroba_remote, params.seroba_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) + CHECK_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, params.seroba_kmer) + GET_SEROBA_DB(params.seroba_db_remote, params.seroba_db_local, CHECK_SEROBA_DB.out.create_db, params.seroba_kmer) // Get paths to PopPUNK Database and External Clusters, download if necessary - GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_local) - GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_local) + GET_POPPUNK_DB(params.poppunk_db_remote, params.poppunk_db_local) + GET_POPPUNK_EXT_CLUSTERS(params.poppunk_ext_remote, params.poppunk_db_local) // Get path to ARIBA database, generate from reference sequences and metadata if ncessary GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.ariba_db_local) From 109da30408076c265cc52e43122557b5e93af7fb Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:02:46 +0000 Subject: [PATCH 074/157] Improve help message Former-commit-id: d76351c9528a7ad8e2b0a551e63a0ca361a0b80a --- modules/messages.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/messages.nf b/modules/messages.nf index 7a9d189..78e9cd3 100644 --- a/modules/messages.nf +++ b/modules/messages.nf @@ -30,7 +30,7 @@ void helpMessage() { |--reads [PATH] Path to the input directory that contains the reads to be processed |--output [PATH] Path to the output directory that save the results |--init Alternative workflow for initialisation - |--version Alternative workflow for 
getting versions of pipeline, tools and databases + |--version Alternative workflow for getting versions of pipeline, container images, tools and databases | |For all available options, please refer to README.md '''.stripMargin() From 2a1135b5dd2294b019739fe1af73fed8e2a5b9f2 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:03:39 +0000 Subject: [PATCH 075/157] Improve content and update to reflect changes Former-commit-id: 7935001f2b822e7d845d43cdf4ff091b0c46ead0 --- README.md | 127 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 65 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index aa68b3e..cb10fa9 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # GPS Unified Pipeline -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.04.1-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-23.04.2-23aa62.svg)](https://www.nextflow.io/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/singularity/) The GPS Unified Pipeline is a Nextflow pipeline designed for processing raw reads (FASTQ files) of *Streptococcus pneumoniae* samples. After preprocessing, the pipeline performs initial assessment based on the total bases in reads. Passed samples will be further assess based on assembly, mapping, and taxonomy. If the sample passes all quality controls (QC), the pipeline also provides the sample's serotype, multi-locus sequence typing (MLST), lineage (based on the [Global Pneumococcal Sequence Cluster (GPSC)](https://www.pneumogen.net/gps/GPSC_lineages.html)), and antimicrobial resistance (AMR) against multiple antimicrobials. 
-The pipeline is designed to be easy to set up and use, and is suitable for use on local machines. It is also offline-capable, making it an ideal option for cases where the FASTQ files being analysed should not leave the local machine. Additionally, the pipeline only downloads essential files to enable the analysis, and no data is uploaded from the local machine. After initialisation or the first successful complete run, the pipeline can be used offline unless you have changed the selection of any database or container image. +The pipeline is designed to be easy to set up and use, and is suitable for use on local machines and high-performance computing (HPC) clusters alike. Additionally, the pipeline only downloads essential files to enable the analysis, and no data is uploaded from the local environment, making it an ideal option for cases where the FASTQ files being analysed is confidential. After initialisation or the first successful complete run, the pipeline can be used offline unless you have changed the selection of any database or container image. The development of this pipeline is part of the GPS Project ([Global Pneumococcal Sequencing Project](https://www.pneumogen.net/gps/)). 
@@ -57,7 +57,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca > - The pipeline generates ~1.8GB intermediate files for each sample on average(These files can be removed when the pipeline run is completed, please refer to [Clean Up](#clean-up)) (To further reduce storage requirement by sacrificing the ability to resume the pipeline, please refer to [Experimental](#experimental)) ## Accepted Inputs -- Currently, only Illumina paired-end short reads are supported +- Only Illumina paired-end short reads are supported - Each sample is expected to be a pair of raw reads following this file name pattern: - `*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}` - example 1: `SampleName_R1_001.fastq.gz`, `SampleName_R2_001.fastq.gz` @@ -70,18 +70,18 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ``` or - Download and unzip the [repository](https://github.com/HarryHung/gps-unified-pipeline/archive/refs/heads/master.zip) -2. Go into the local copy of the repository + Download and unzip the [latest release](https://github.com/HarryHung/gps-unified-pipeline/releases) +2. Go into the local copy of the repository and the pipeline is ready to use without installation ``` cd gps-unified-pipeline ``` 3. (Optional) You could perform an initialisation to download all required additional files and container images, so the pipeline can be used at any time with or without the Internet afterwards. > ⚠️ Docker or Singularity must be running, and an Internet connection is required. 
- - For those using Docker as the container engine + - Using Docker as the container engine ``` ./run_pipeline --init ``` - - For those using Singularity as the container engine + - Using Singularity as the container engine ``` ./run_pipeline --init -profile singularity ``` @@ -91,8 +91,8 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca > ⚠️ If this is the first run and initialisation was not performed, an Internet connection is required. -> ℹ️ By default, Docker is used as the container engine and all the processes are executed by the local machine. See [Profile](#profile) for details on running the pipeline with Singularity or on a server farm. -- You can run the pipeline without options. It will attempt to get the raw reads from the default location (`input` directory inside the `gps-unified-pipeline` local repository) +> ℹ️ By default, Docker is used as the container engine and all the processes are executed by the local machine. See [Profile](#profile) for details on running the pipeline with Singularity or on a HPC cluster. +- You can run the pipeline without options. It will attempt to get the raw reads from the default location (i.e. `input` directory inside the `gps-unified-pipeline` local repository) ``` ./run_pipeline ``` @@ -113,29 +113,30 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ``` ./run_pipeline -profile [profile name] ``` -- Currently, the following profiles are available +- Available profiles: | Profile Name | Details | | --- | --- | | `standard` (Default) | Docker is used as the container engine. Processes are executed locally. | | `singularity` | Singularity is used as the container engine. Processes are executed locally. | - | `lsf` | **The pipeline should be launched from a LSF cluster head node with this profile.** Singularity is used as the container engine. Processes are submitted to your LSF cluster via `bsub`. 
(Tested on Sanger farm5) | + | `lsf` | **The pipeline should be launched from a LSF cluster head node with this profile.** Singularity is used as the container engine. Processes are submitted to your LSF cluster via `bsub` by the pipeline. (Tested on Sanger farm5 cluster only) | ## Resume - If the pipeline is interrupted mid-run, Nextflow's built-in `-resume` option can be used to resume the pipeline execution instead of starting from scratch again - You should use the same command of the original run, only add `-resume` at the end (i.e. all pipeline options should be identical) > ℹ️ `-resume` is a built-in Nextflow option, it only has one leading `-` - ``` - # original command - ./run_pipeline --reads /path/to/raw-reads-directory - - # command to resume the pipeline execution - ./run_pipeline --reads /path/to/raw-reads-directory -resume - ``` + - If the original command is + ``` + ./run_pipeline --reads /path/to/raw-reads-directory + ``` + - The command to resume the pipeline execution should be + ``` + ./run_pipeline --reads /path/to/raw-reads-directory -resume + ``` ## Clean Up - During the run of the pipeline, Nextflow generates a considerable amount of intermediate files -- If the run has been completed and you do not intend to use the `-resume` option, you can remove the intermediate files using one of the following ways: - - Run `clean_pipeline` script +- If the run has been completed and you do not intend to use the `-resume` option or those intermediate files, you can remove the intermediate files using one of the following ways: + - Run the included `clean_pipeline` script - It runs the commands in manual removal for you - It removes the `work` directory and log files within the `gps-unified-pipeline` local repository ``` @@ -167,13 +168,13 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca > ℹ️ `$projectDir` is a [Nextflow built-in implicit 
variables](https://www.nextflow.io/docs/latest/script.html?highlight=projectdir#implicit-variables), it is defined as the directory where the `gps-unified-pipeline` local repository is stored. -> ℹ️ They are not built-in Nextflow options, hence lead with `--` instead of `-` +> ℹ️ Pipeline options are not built-in Nextflow options, they are lead with `--` instead of `-` ## Alternative Workflows | Option | Values | Description | | --- | ---| --- | - | `--init` | `true` or `false`(Default: `false`) | Use alternative workflow for initialisation, which means downloading all required additional files and container images.Can be enabled by including `--init` without value. | - | `--version` | `true` or `false`(Default: `false`)| Use alternative workflow for getting versions of pipeline, tools and databases.Can be enabled by including `--version` without value. (This workflow pulls the required container images if they are not yet available locally) | + | `--init` | `true` or `false`(Default: `false`) | Use alternative workflow for initialisation, which means downloading all required additional files and container images, and creating databases.Can be enabled by including `--init` without value. | + | `--version` | `true` or `false`(Default: `false`)| Use alternative workflow for showing versions of pipeline, container images, tools and databases.Can be enabled by including `--version` without value. (This workflow pulls the required container images if they are not yet available locally) | | `--help` | `true` or `false`(Default: `false`)| Show help message.Can be enabled by including `--help` without value. | ## Input and Output @@ -199,7 +200,7 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca ## Assembly | Option | Values | Description | | --- | ---| --- | - | `--assembler` | `"shovill"` or `"unicycler"`(Default: `"shovill"`)| SPAdes Assembler to assemble the reads. 
| + | `--assembler` | `"shovill"` or `"unicycler"`(Default: `"shovill"`)| Using which SPAdes-based assembler to assemble the reads. | | `--min_contig_length` | Any integer value(Default: `500`) | Minimum legnth of contig to be included in the assembly | ## Mapping @@ -220,22 +221,22 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `--kraken2_memory_mapping` | `true` or `false`(Default: `true`) | Using the memory mapping option of Kraken2 or not.`true` means not loading the database into RAM, suitable for memory-limited or fast storage environments. | ## Serotype - > ⚠️ `--seroba_local` does not accept user provided local database, directory content will be overwritten + > ⚠️ `--seroba_db_local` does not accept user provided local database, directory content will be overwritten | Option | Values | Description | | --- | ---| --- | - | `--seroba_remote` | Any valid URL to a Git remote repository(Default: [SeroBA GitHub Repo](https://github.com/sanger-pathogens/seroba.git))| URL to a SeroBA Git remote repository. | - | `--seroba_local` | Any valid path(Default: `"$projectDir/databases/seroba"`) | Path to the directory where SeroBA local repository should be saved to. | + | `--seroba_db_remote` | Any valid URL to a Git remote repository(Default: [SeroBA GitHub Repo](https://github.com/sanger-pathogens/seroba.git))| URL to a SeroBA Git remote repository. | + | `--seroba_db_local` | Any valid path(Default: `"$projectDir/databases/seroba"`) | Path to the directory where SeroBA local repository should be saved to. | | `--seroba_kmer` | Any integer value(Default: `71`) | Kmer size for creating the KMC database of SeroBA. 
| ## Lineage - > ⚠️ `--poppunk_local` does not accept user provided local database, directory content will be overwritten + > ⚠️ `--poppunk_db_local` does not accept user provided local database, directory content will be overwritten | Option | Values | Description | | --- | ---| --- | | `--poppunk_db_remote` | Any valid URL to a PopPUNK database in `.tar.gz` or `.tgz` format(Default: [GPS v6](https://gps-project.cog.sanger.ac.uk/GPS_v6.tar.gz)) | URL to a PopPUNK database. | | `--poppunk_ext_remote` | Any valid URL to a PopPUNK external clusters file in `.csv` format(Default: [GPS v6 GPSC Designation](https://www.pneumogen.net/gps/GPS_v6_external_clusters.csv)) | URL to a PopPUNK external clusters file. | - | `--poppunk_local` | Any valid path(Default: `"$projectDir/databases/poppunk"`) | Path to the directory where the remote PopPUNK database and external clusters file should be saved to. | + | `--poppunk_db_local` | Any valid path(Default: `"$projectDir/databases/poppunk"`) | Path to the directory where the remote PopPUNK database and external clusters file should be saved to. | ## Other AMR > ⚠️ `--ariba_db_local` does not accept user provided local database, directory content will be overwritten @@ -276,9 +277,11 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca - The following fields can be found in the output `results.csv` > ℹ️ For resistance phenotypes: S = Sensitive/Susceptible; I = Intermediate; R = Resistant + > ℹ️ * The exact output fields of Other AMR depends on the provided ARIBA database, the below table is based on the default ARIBA database + > ⚠️ If the result of `Overall_QC` of a sample is `ASSEMBLER FAILURE`, the assembler has crashed when trying to assembly the reads. You might want to re-run the sample with [another assembler](#assembly), or discard the sample if it is a low quality one. - > ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample. 
Please report the issue. + > ⚠️ If the result of `Serotype` of a sample is `SEROBA FAILURE`, SeroBA has crashed when trying to serotype the sample. | Field | Type | Description | | --- | --- | --- | @@ -323,38 +326,38 @@ The development of this pipeline is part of the GPS Project ([Global Pneumococca | `PEN_MIC` | PBP AMR | Estimated MIC of penicillin (PEN) | | `PEN_Res(Meningital)` | PBP AMR | Resistance phenotype against PEN in meningital form | | `PEN_Res(Non-meningital)` | PBP AMR | Resistance phenotype against PEN in non-meningital form | - | `CHL_Res` | Other AMR | Resistance phenotype against Chloramphenicol (CHL) | - | `CHL_Determinant` | Other AMR | Known determinants that inferred the CHL resistance | - | `ERY_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) | - | `ERY_Determinant` | Other AMR | Known determinants that inferred the ERY resistance | - | `CLI_Res` | Other AMR | Resistance phenotype against Clindamycin (CLI) | - | `CLI_Determinant` | Other AMR | Known determinants that inferred the CLI resistance | - | `ERY_CLI_Res` | Other AMR | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | - | `ERY_CLI_Determinant` | Other AMR | Known determinants that inferred the ERY and CLI resistance | - | `FQ_Res` | Other AMR | Resistance phenotype against Fluoroquinolones (FQ) | - | `FQ_Determinant` | Other AMR | Known determinants that inferred the FQ resistance | - | `LFX_Res` | Other AMR | Resistance phenotype against Levofloxacin (LFX) | - | `LFX_Determinant` | Other AMR | Known determinants that inferred the LFX resistance | - | `KAN_Res` | Other AMR | Resistance phenotype against Kanamycin (KAN) | - | `KAN_Determinant` | Other AMR | Known determinants that inferred the KAN resistance | - | `TET_Res` | Other AMR | Resistance phenotype against Tetracycline (TET) | - | `TET_Determinant` | Other AMR | Known determinants that inferred the TET resistance | - | `DOX_Res` | Other AMR | Resistance phenotype against Doxycycline 
(DOX) | - | `DOX_Determinant` | Other AMR | Known determinants that inferred the DOX resistance | - | `TMP_Res` | Other AMR | Resistance phenotype against Trimethoprim (TMP) | - | `TMP_Determinant` | Other AMR | Known determinants that inferred the TMP resistance | - | `SMX_Res` | Other AMR | Resistance phenotype against Sulfamethoxazole (SMX) | - | `SMX_Determinant` | Other AMR | Known determinants that inferred the SMX resistance | - | `COT_Res` | Other AMR | Resistance phenotype against Co-Trimoxazole (COT) | - | `COT_Determinant` | Other AMR | Known determinants that inferred the COT resistance | - | `RIF_Res` | Other AMR | Resistance phenotype against Rifampin (RIF) | - | `RIF_Determinant` | Other AMR | Known determinants that inferred the RIF resistance | - | `VAN_Res` | Other AMR | Resistance phenotype against Vancomycin (VAN) | - | `VAN_Determinant` | Other AMR | Known determinants that inferred the VAN resistance | - | `PILI1` | Other AMR | Expression of PILI-1 | - | `PILI1_Determinant` | Other AMR | Known determinants that inferred the PILI-1 expression | - | `PILI2` | Other AMR | Expression of PILI-2 | - | `PILI2_Determinant` | Other AMR | Known determinants that inferred the PILI-2 expression | + | `CHL_Res` | Other AMR* | Resistance phenotype against Chloramphenicol (CHL) | + | `CHL_Determinant` | Other AMR* | Known determinants that inferred the CHL resistance | + | `CLI_Res` | Other AMR* | Resistance phenotype against Clindamycin (CLI) | + | `CLI_Determinant` | Other AMR* | Known determinants that inferred the CLI resistance | + | `COT_Res` | Other AMR* | Resistance phenotype against Co-Trimoxazole (COT) | + | `COT_Determinant` | Other AMR* | Known determinants that inferred the COT resistance | + | `DOX_Res` | Other AMR* | Resistance phenotype against Doxycycline (DOX) | + | `DOX_Determinant` | Other AMR* | Known determinants that inferred the DOX resistance | + | `ERY_Res` | Other AMR* | Resistance phenotype against Erythromycin (ERY) | + | 
`ERY_Determinant` | Other AMR* | Known determinants that inferred the ERY resistance | + | `ERY_CLI_Res` | Other AMR* | Resistance phenotype against Erythromycin (ERY) and Clindamycin (CLI) | + | `ERY_CLI_Determinant` | Other AMR* | Known determinants that inferred the ERY and CLI resistance | + | `FQ_Res` | Other AMR* | Resistance phenotype against Fluoroquinolones (FQ) | + | `FQ_Determinant` | Other AMR* | Known determinants that inferred the FQ resistance | + | `KAN_Res` | Other AMR* | Resistance phenotype against Kanamycin (KAN) | + | `KAN_Determinant` | Other AMR* | Known determinants that inferred the KAN resistance | + | `LFX_Res` | Other AMR* | Resistance phenotype against Levofloxacin (LFX) | + | `LFX_Determinant` | Other AMR* | Known determinants that inferred the LFX resistance | + | `RIF_Res` | Other AMR* | Resistance phenotype against Rifampin (RIF) | + | `RIF_Determinant` | Other AMR* | Known determinants that inferred the RIF resistance | + | `SMX_Res` | Other AMR* | Resistance phenotype against Sulfamethoxazole (SMX) | + | `SMX_Determinant` | Other AMR* | Known determinants that inferred the SMX resistance | + | `TET_Res` | Other AMR* | Resistance phenotype against Tetracycline (TET) | + | `TET_Determinant` | Other AMR* | Known determinants that inferred the TET resistance | + | `TMP_Res` | Other AMR* | Resistance phenotype against Trimethoprim (TMP) | + | `TMP_Determinant` | Other AMR* | Known determinants that inferred the TMP resistance | + | `VAN_Res` | Other AMR* | Resistance phenotype against Vancomycin (VAN) | + | `VAN_Determinant` | Other AMR* | Known determinants that inferred the VAN resistance | + | `PILI1` | Other AMR* | Expression of PILI-1 | + | `PILI1_Determinant` | Other AMR* | Known determinants that inferred the PILI-1 expression | + | `PILI2` | Other AMR* | Expression of PILI-2 | + | `PILI2_Determinant` | Other AMR* | Known determinants that inferred the PILI-2 expression | # Credits From 4fd29070fdbc412cb6762164c62ce60a0424b39f 
Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 09:54:56 +0000 Subject: [PATCH 076/157] Update version of ARIBA container Former-commit-id: 457b99629e7fc812230497d9f04cbe93b3cbea12 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 6048efb..268f05a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -96,7 +96,7 @@ process { container = 'harryhungch/spn-pbp-amr:23.01.16' } withLabel: ariba_container { - container = 'staphb/ariba:2.14.4' + container = 'staphb/ariba:2.14.6' } withLabel: mlst_container { container = 'staphb/mlst:2.23.0' From cdb0251f4619c83fcad940463d7734cfb89ba33c Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 09:55:21 +0000 Subject: [PATCH 077/157] Update credits section Former-commit-id: ebc7ce5dd388af8c8a3f821d5f02842d331d799a --- README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index cb10fa9..bda84c2 100644 --- a/README.md +++ b/README.md @@ -366,39 +366,39 @@ This project uses open-source components. You can find the homepage or source co [ARIBA](https://sanger-pathogens.github.io/ariba/) - ARIBA: rapid antimicrobial resistance genotyping directly from sequencing reads Hunt M, Mather AE, Sánchez-Busó L, Page AJ, Parkhill J , Keane JA, Harris SR. Microbial Genomics 2017. 
doi: [110.1099/mgen.0.000131](http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000131) - License (GPL-3.0): https://github.com/sanger-pathogens/ariba/blob/master/LICENSE -- This tool is used in `CREATE_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module +- This tool is used in `GET_ARIBA_DB` and `OTHER_RESISTANCE` processes of the `amr.nf` module [BCFtools](https://samtools.github.io/bcftools/) and [SAMtools](https://www.htslib.org/) - Twelve years of SAMtools and BCFtools. Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. **GigaScience**, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008 - Licenses - BCFtools (MIT/Expat or GPL-3.0): https://github.com/samtools/bcftools/blob/develop/LICENSE - SAMtools (MIT/Expat): https://github.com/samtools/samtools/blob/develop/LICENSE -- These tools are used in `SAM_TO_SORTED_BAM`, `REF_COVERAGE` and `SNP_CALL` processes of the `mapping.nf` module +- These tools are used in `SAM_TO_SORTED_BAM` and `SNP_CALL` processes of the `mapping.nf` module [BWA](https://github.com/lh3/bwa) - Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 
[arXiv:1303.3997v2](http://arxiv.org/abs/1303.3997) [q-bio.GN] - License (GPL-3.0): https://github.com/lh3/bwa/blob/master/COPYING -- This tool is used in `GET_REF_GENOME_BWA_DB_PREFIX` and `MAPPING` processes of the `mapping.nf` module +- This tool is used in `GET_REF_GENOME_BWA_DB` and `MAPPING` processes of the `mapping.nf` module -[Docker Images](https://hub.docker.com/u/staphb) of [BCFtools](https://hub.docker.com/r/staphb/bcftools), [BWA](https://hub.docker.com/r/staphb/bwa), [fastp](https://hub.docker.com/r/staphb/fastp), [Kraken 2](https://hub.docker.com/r/staphb/kraken2), [mlst](https://hub.docker.com/r/staphb/mlst), [PopPUNK](https://hub.docker.com/r/staphb/poppunk), [QUAST](https://hub.docker.com/r/staphb/quast), [SAMtools](https://hub.docker.com/r/staphb/samtools), [Shovill](https://hub.docker.com/r/staphb/shovill), [Unicycler](https://hub.docker.com/r/staphb/unicycler) +[Docker Images](https://hub.docker.com/u/staphb) of [ARIBA](https://hub.docker.com/r/staphb/ariba), [BCFtools](https://hub.docker.com/r/staphb/bcftools), [BWA](https://hub.docker.com/r/staphb/bwa), [fastp](https://hub.docker.com/r/staphb/fastp), [Kraken 2](https://hub.docker.com/r/staphb/kraken2), [mlst](https://hub.docker.com/r/staphb/mlst), [PopPUNK](https://hub.docker.com/r/staphb/poppunk), [QUAST](https://hub.docker.com/r/staphb/quast), [SAMtools](https://hub.docker.com/r/staphb/samtools), [Shovill](https://hub.docker.com/r/staphb/shovill), [Unicycler](https://hub.docker.com/r/staphb/unicycler) - [State Public Health Bioinformatics Workgroup](https://staphb.org/) ([@StaPH-B](https://github.com/StaPH-B)) - License (GPL-3.0): https://github.com/StaPH-B/docker-builds/blob/master/LICENSE -- These Docker images provide containerised environments for processes of multiple modules +- These Docker images provide containerised environments with different bioinformatics tools for processes of multiple modules [Docker Image of Git](https://hub.docker.com/r/bitnami/git) - 
[Bitnami](https://bitnami.com/) ([@Bitnami](https://github.com/bitnami)) - License (Apache 2.0): https://github.com/bitnami/containers/blob/main/LICENSE.md -- This Docker image provides the containerised environment for `GET_SEROBA_DB` process of the `serotype.nf` module +- This Docker image provides the containerised environment with Git for `CHECK_SEROBA_DB` process of the `serotype.nf` module [Docker Image of network-multitool](https://hub.docker.com/r/wbitt/network-multitool) - [Wbitt - We Bring In Tomorrow's Technolgies](https://wbitt.com/) ([@WBITT](https://github.com/wbitt)) - License (MIT): https://github.com/wbitt/Network-MultiTool/blob/master/LICENSE -- This Docker image provides the containerised environment for processes of multiple modules +- This Docker image provides the containerised environment with Bash tools for processes of multiple modules -[Docker Image of Python](https://hub.docker.com/_/python) -- The Docker Community ([@docker-library](https://github.com/docker-library)) -- License (MIT): https://github.com/docker-library/python/blob/master/LICENSE -- This Docker image provides the containerised environment for `HET_SNP_COUNT` process of the `mapping.nf` module and `GET_OTHER_RESISTANCE` process of the `amr.nf` module +[Docker Image of Pandas](https://hub.docker.com/r/amancevice/pandas) +- Alexander Mancevice ([@amancevice](https://github.com/amancevice)) +- License (MIT): https://github.com/amancevice/docker-pandas/blob/main/LICENSE +- This Docker image provides the containerised environment with Python and Pandas for `GENERATE_OVERALL_REPORT` process of the `output.nf` module, `HET_SNP_COUNT` process of the `mapping.nf` module and `PARSE_OTHER_RESISTANCE` process of the `amr.nf` module [fastp](https://github.com/OpenGene/fastp) - Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, 
https://doi.org/10.1093/bioinformatics/bty560 @@ -406,9 +406,9 @@ This project uses open-source components. You can find the homepage or source co - This tool is used in `PREPROCESS` process of the `preprocess.nf` module [GPSC_pipeline_nf](https://github.com/sanger-bentley-group/GPSC_pipeline_nf) -- Victoria Carr ([@blue-moon22](https://github.com/blue-moon22)) +- Victoria Dyster ([@blue-moon22](https://github.com/blue-moon22)) - License (GPL-3.0): https://github.com/sanger-bentley-group/GPSC_pipeline_nf/blob/master/LICENSE -- Code adapted into `LINEAGE` process of the `lineage.nf` module +- Code adapted into the `get_lineage.sh` script [Kraken 2](https://ccb.jhu.edu/software/kraken2/) - Wood, D.E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). https://doi.org/10.1186/s13059-019-1891-0 @@ -418,7 +418,7 @@ This project uses open-source components. You can find the homepage or source co [mecA-HetSites-calculator](https://github.com/kumarnaren/mecA-HetSites-calculator) - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) - License (GPL-3.0): https://github.com/kumarnaren/mecA-HetSites-calculator/blob/master/LICENSE -- Code was rewritten into the `het_snp_count.py` script used by `HET_SNP_COUNT` process of the `mapping.nf` module +- Code was rewritten into the `het_snp_count.py` script [mlst](https://github.com/tseemann/mlst) - Torsten Seemann ([@tseemann](https://github.com/tseemann)) @@ -446,14 +446,14 @@ This project uses open-source components. 
You can find the homepage or source co - License (GPL-3.0): https://github.com/sanger-pathogens/seroba/blob/master/LICENSE - This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/seroba) - The fork includes critical bug fixes for SeroBA as the original repository is no longer maintained - - The Docker image provides the containerised environment for `CREATE_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module + - The Docker image provides the containerised environment with SeroBA for `GET_SEROBA_DB` and `SEROTYPE` processes of the `serotype.nf` module [resistanceDatabase](https://github.com/kumarnaren/resistanceDatabase) - Narender Kumar ([@kumarnaren](https://github.com/kumarnaren)) - License (GPL-3.0): https://github.com/kumarnaren/resistanceDatabase/blob/main/LICENSE - `sequences.fasta` is renamed to `ariba_ref_sequences-*.fasta` and used as-is - `metadata.tsv` is renamed to `ariba_metadata-*.tsv` and modified -- The files are used as the default inputs of `CREATE_ARIBA_DB` process of the `amr.nf` module +- The files are used as the default inputs of `GET_ARIBA_DB` process of the `amr.nf` module [Shovill](https://github.com/tseemann/shovill) - Torsten Seemann ([@tseemann](https://github.com/tseemann)) @@ -466,7 +466,7 @@ This project uses open-source components. 
You can find the homepage or source co - This is a modified version of [AMR predictor](https://github.com/BenJamesMetcalf/Spn_Scripts_Reference) by Ben Metcalf ([@BenJamesMetcalf](https://github.com/BenJamesMetcalf)) at the Centre for Disease Control (CDC) - This project uses a Docker image built from a [custom fork](https://github.com/HarryHung/spn-resistance-pbp) - The fork changes the Docker image from a Docker executable image to a Docker environment for Nextflow integration - - The Docker image provides the containerised environment for `PBP_RESISTANCE` process of the `amr.nf` module + - The Docker image provides the containerised environment with SPN-PBP-MAR for `PBP_RESISTANCE` process of the `amr.nf` module [Unicycler](https://github.com/rrwick/Unicycler) - **Wick RR, Judd LM, Gorrie CL, Holt KE**. Unicycler: resolving bacterial genome assemblies from short and long sequencing reads. *PLoS Comput Biol* 2017. From 30fbe71480d5a1a83dc74aeb7fc9ddb6ec0c6741 Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 10:27:19 +0000 Subject: [PATCH 078/157] Use full Pandas image for NF metrics collection Former-commit-id: db629f27a21165a37d1fe408a649eaed08702ee5 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 268f05a..5a29810 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,7 +66,7 @@ process { container = 'bitnami/git:2.39.0' } withLabel: python_container { - container = 'amancevice/pandas:2.0.2-slim' + container = 'amancevice/pandas:2.0.2' } withLabel: fastp_container { container = 'staphb/fastp:0.23.2' From 545709e0e8fb56e9492d782ee8c29beb6d11617a Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Wed, 2 Aug 2023 16:36:10 +0000 Subject: [PATCH 079/157] Include Virulence in the chart Former-commit-id: 09672c296931c8216b741bbac1a847f0dec9f215 --- doc/workflow.drawio.svg | 216 
++++++++++++++++++++-------------------- 1 file changed, 108 insertions(+), 108 deletions(-) diff --git a/doc/workflow.drawio.svg b/doc/workflow.drawio.svg index 873d1b7..43e654b 100644 --- a/doc/workflow.drawio.svg +++ b/doc/workflow.drawio.svg @@ -1,23 +1,23 @@ - + - - + + Output - - + + Input - + - + @@ -32,12 +32,12 @@ - - - - + + + + - + @@ -57,14 +57,14 @@ - + - + - + - - + + FASTQ (Reads) @@ -72,36 +72,36 @@ - + FASTQ (Reads) - - - + + +