Update ARIBA Module AMR Detection Mechanism #67

Merged · 4 commits · Aug 23, 2023
README.md (8 changes: 4 additions & 4 deletions)
@@ -235,8 +235,8 @@ The pipeline is compatible with [Launchpad](https://help.tower.nf/23.2/launch/la
## Other AMR
| Option | Values | Description |
| --- | ---| --- |
-| `--ariba_ref` | Any valid path to a `.fa` or `.fasta` file<br />(Default: `"$projectDir/data/ariba_ref_sequences-20230712.fasta"`) | Path to the reference sequences for ARIBA. |
-| `--ariba_metadata` | Any valid path to a `tsv` file<br />(Default: `"$projectDir/data/ariba_metadata-20230712.tsv"`) | Path to the metadata file for ARIBA. |
+| `--ariba_ref` | Any valid path to a `.fa` or `.fasta` file<br />(Default: `"$projectDir/data/ariba_ref_sequences.fasta"`) | Path to the reference sequences for ARIBA. |
+| `--ariba_metadata` | Any valid path to a `tsv` file<br />(Default: `"$projectDir/data/ariba_metadata.tsv"`) | Path to the metadata file for ARIBA. |
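These two inputs are expected to agree with each other: each metadata row is keyed by a sequence name that should exist in the reference FASTA. If a custom pair is supplied, a quick pre-flight check along the lines below can catch mismatches early. This is a minimal sketch, not part of the pipeline; the paths are placeholders, and the assumption that column 1 of the TSV holds the sequence name follows the metadata rows shown later in this PR.

```python
# Illustrative pre-flight check (not part of the pipeline): confirm that every
# name in a custom ARIBA metadata TSV exists in the matching reference FASTA.
# Both paths are placeholders for user-supplied files.
import csv

ariba_ref = "data/ariba_ref_sequences.fasta"
ariba_metadata = "data/ariba_metadata.tsv"

# FASTA sequence names: text between '>' and the first whitespace of each header line.
with open(ariba_ref) as fasta:
    seq_names = {line[1:].split()[0] for line in fasta if line.startswith(">")}

# Column 1 of the metadata TSV is assumed to hold the reference sequence name.
with open(ariba_metadata) as tsv:
    unmatched = {row[0] for row in csv.reader(tsv, delimiter="\t") if row and row[0] not in seq_names}

if unmatched:
    print(f"Metadata rows with no matching reference sequence: {sorted(unmatched)}")
else:
    print("All metadata rows reference a sequence in the FASTA.")
```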

## Singularity
> ℹ️ This section is only valid when Singularity is used as the container engine
@@ -444,8 +444,8 @@ This project uses open-source components. You can find the homepage or source co
[resistanceDatabase](https://github.com/kumarnaren/resistanceDatabase)
- Narender Kumar ([@kumarnaren](https://github.com/kumarnaren))
- License (GPL-3.0): https://github.com/kumarnaren/resistanceDatabase/blob/main/LICENSE
-- `sequences.fasta` is renamed to `ariba_ref_sequences-*.fasta` and used as-is
-- `metadata.tsv` is renamed to `ariba_metadata-*.tsv` and modified
+- `sequences.fasta` is renamed to `ariba_ref_sequences.fasta` and used as-is
+- `metadata.tsv` is renamed to `ariba_metadata.tsv` and modified
- The files are used as the default inputs of `GET_ARIBA_DB` process of the `amr.nf` module

[Shovill](https://github.com/tseemann/shovill)
bin/parse_other_resistance.py (39 changes: 18 additions & 21 deletions)
@@ -3,20 +3,18 @@
# Output AMR of a sample based on its ARIBA report and ARIBA metadata

import sys
-from itertools import chain
from collections import defaultdict
import pandas as pd
import csv


# Check argv and save to global variables
-if len(sys.argv) != 5:
-    sys.exit('Usage: get_other_resistance.py REPORT_PATH DEBUG_REPORT_PATH METADATA_PATH OUTPUT_FILE')
+if len(sys.argv) != 4:
+    sys.exit('Usage: get_other_resistance.py DEBUG_REPORT_PATH METADATA_PATH OUTPUT_FILE')

-REPORT_PATH = sys.argv[1]
-DEBUG_REPORT_PATH = sys.argv[2]
-METADATA_PATH = sys.argv[3]
-OUTPUT_FILE = sys.argv[4]
+DEBUG_REPORT_PATH = sys.argv[1]
+METADATA_PATH = sys.argv[2]
+OUTPUT_FILE = sys.argv[3]


def main():
@@ -57,35 +55,34 @@ def prepare_dicts():

# Finding hits in ARIBA results based on targets_dict and save hits to hits_dict
def find_hits(targets_dict, hits_dict):
-    with open(REPORT_PATH) as report, open(DEBUG_REPORT_PATH) as debug_report:
-        # Skip the header in report and debug report
-        next(report)
+    with open(DEBUG_REPORT_PATH) as debug_report:
+        # Skip the header in debug report
         next(debug_report)

-        # Go through lines in both report and debug report to detect targets
-        for line in (line.strip() for line in chain(report, debug_report)):
+        # Go through lines in the debug report to detect targets
+        for line in (line.strip() for line in debug_report):
             # Extract useful fields
             fields = [str(field) for field in line.split("\t")]
             ref_name, gene, var_only, ref_len, ref_base_assembled, known_var_change, has_known_var, ref_ctg_effect, ref_start, ref_end = fields[1], fields[2], fields[3], fields[7], fields[8], fields[16], fields[17], fields[19], fields[20], fields[21]

-            # If coverage (ref_base_assembled / ref_len) < 0.9 or either variable contains non-numeric value, skip the line
-            if not ref_base_assembled.isdigit() or not ref_len.isdigit() or int(ref_base_assembled)/int(ref_len) < 0.9:
-                continue

             # If the known_var_change (. for genes, specific change for variants) is not found in the metadata of the (ref_name, gene, var_only) combination, skip the line
             try:
                 target = targets_dict[(ref_name, gene, var_only)][known_var_change]
             except KeyError:
                 continue

-            # Logic for gene detection. Found means hit.
-            if var_only == "0":
+            # If ref_base_assembled or ref_len variable contains non-numeric value, skip the line
+            if not ref_base_assembled.isdigit() or not ref_len.isdigit():
+                continue

+            # Logic for gene detection, check coverage.
+            if var_only == "0" and int(ref_base_assembled)/int(ref_len) >= 0.8:
                 hits_dict[target].add(f'{ref_name}')
-            # Logic for variant detection, further criteria required

+            # Logic for variant detection, coverage check is not needed, but check for other criteria
             if var_only == "1":
                 # folP-specific criteria: ref_ctg_effect (effect of change between reference and contig) is one of the keywords and the change occurs within nt 168-201
-                if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201):
+                if ref_name.lower().startswith("folp") and ref_ctg_effect.lower() in ('fshift', 'trunc', 'indel', 'indels', 'ins', 'multiple') and (168 <= int(ref_start) <= 201 or 168 <= int(ref_end) <= 201):
                     pos = ref_start if ref_start == ref_end else f'{ref_start}-{ref_end}'
                     hits_dict[target].add(f'{ref_name} {ref_ctg_effect} at {pos}')
                 # Common criteria: the assembly has that variant
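In short, the parser now reads only the ARIBA debug report: a gene reference (`var_only == "0"`) counts as a hit only when at least 80% of the reference is assembled, while known variants (`var_only == "1"`) skip the coverage check and are judged on variant-specific criteria instead. The sketch below restates that decision flow outside the diff. It assumes the same debug-report column indices as the script, omits the folP-specific handling, and represents the "assembly has that variant" check as `has_known_var == "1"`, which is an inference since that line sits below the visible part of the diff.

```python
# Simplified, illustrative restatement of the updated hit logic in parse_other_resistance.py.
# Column indices follow the script: ref_name=1, gene=2, var_only=3, ref_len=7,
# ref_base_assembled=8, known_var_change=16, has_known_var=17.
GENE_COVERAGE_THRESHOLD = 0.8  # the coverage check now applies to genes only

def classify(fields, targets_dict):
    """Return (target, hit_label) for one debug-report row, or None if the row is not a hit."""
    ref_name, gene, var_only = fields[1], fields[2], fields[3]
    ref_len, ref_base_assembled = fields[7], fields[8]
    known_var_change, has_known_var = fields[16], fields[17]

    # The row must be described in the metadata, otherwise it is ignored.
    try:
        target = targets_dict[(ref_name, gene, var_only)][known_var_change]
    except KeyError:
        return None

    # Skip rows whose length/coverage fields are not numeric.
    if not ref_base_assembled.isdigit() or not ref_len.isdigit():
        return None

    # Gene detection: presence plus at least 80% reference coverage.
    if var_only == "0" and int(ref_base_assembled) / int(ref_len) >= GENE_COVERAGE_THRESHOLD:
        return target, ref_name

    # Variant detection: no coverage requirement; the assembly must carry the known variant
    # (assumed here to be signalled by has_known_var == "1"; folP-specific criteria omitted).
    if var_only == "1" and has_known_var == "1":
        return target, f"{ref_name} {known_var_change}"

    return None
```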
data/ariba_metadata-20230712.tsv → data/ariba_metadata.tsv (4 changes: 2 additions & 2 deletions)
@@ -60,8 +60,8 @@ parE_AE007317 1 1 D435N . FQ
parE_AE007317 1 1 D435H . FQ
parE_AE007317 1 1 P454S . FQ
tetO_Y07780 1 0 . . TET
-ermBups_HG799494 0 0 . . ERY
-ermbTr_CP002121 0 0 . . ERY
+ermBups_HG799494 0 0 . . ERY_CLI
+ermbTr_CP002121 0 0 . . ERY_CLI
rpoB_AE007317 1 1 D489E . RIF
rpoB_AE007317 1 1 H499N . RIF
rpoB_AE007317 1 1 D489N . RIF
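The last column of each row is the resistance target that the parser reports, so switching ERY to ERY_CLI presumably lets erm determinants count towards both erythromycin and clindamycin downstream. As a sketch of how rows like these feed the `(ref_name, gene, var_only) → {var_change: target}` lookup used in `parse_other_resistance.py`, a minimal loader could look like the following. The six-column interpretation (name, gene flag, var_only flag, variant, group, target) is an assumption based on the rows shown above, and the code is illustrative rather than taken from the PR.

```python
# Illustrative sketch: build the (ref_name, gene, var_only) -> {var_change: target}
# lookup that parse_other_resistance.py queries, from rows like the ones above.
# Assumed column meanings: name, gene flag, var_only flag, variant, group, target.
import csv
from collections import defaultdict

def load_targets(metadata_path):
    targets_dict = defaultdict(dict)
    with open(metadata_path) as tsv:
        for row in csv.reader(tsv, delimiter="\t"):
            if len(row) < 6:
                continue  # skip malformed or header-like rows
            ref_name, gene, var_only, var_change, _group, target = row[:6]
            # e.g. ("ermBups_HG799494", "0", "0")["."]   -> "ERY_CLI"
            #      ("parE_AE007317", "1", "1")["D435N"] -> "FQ"
            targets_dict[(ref_name, gene, var_only)][var_change] = target
    return targets_dict
```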
File renamed without changes.
modules/amr.nf (9 changes: 4 additions & 5 deletions)
@@ -85,13 +85,12 @@ process OTHER_RESISTANCE {
tuple val(sample_id), path(read1), path(read2), path(unpaired)

output:
tuple val(sample_id), path(report), path(report_debug), emit: reports
tuple val(sample_id), path(report_debug), emit: report

script:
-report='result/report.tsv'
report_debug='result/debug.report.tsv'
"""
-ariba run --nucmer_min_id 80 --assembled_threshold 0.80 "$ariba_database/$database" "$read1" "$read2" result
+ariba run --nucmer_min_id 80 "$ariba_database/$database" "$read1" "$read2" result
"""
}

@@ -103,7 +102,7 @@ process PARSE_OTHER_RESISTANCE {
tag "$sample_id"

input:
-tuple val(sample_id), path(report), path(report_debug)
+tuple val(sample_id), path(report_debug)
path metadata

output:
@@ -112,6 +111,6 @@
script:
output_file="other_amr_report.csv"
"""
parse_other_resistance.py "$report" "$report_debug" "$metadata" "$output_file"
parse_other_resistance.py "$report_debug" "$metadata" "$output_file"
"""
}
nextflow.config (4 changes: 2 additions & 2 deletions)
@@ -49,8 +49,8 @@ params {
depth = 20.00

// Default ARIBA reference sequences and metadata paths
ariba_ref = "$projectDir/data/ariba_ref_sequences-20230815.fasta"
ariba_metadata = "$projectDir/data/ariba_metadata-20230712.tsv"
ariba_ref = "$projectDir/data/ariba_ref_sequences.fasta"
ariba_metadata = "$projectDir/data/ariba_metadata.tsv"

// Toggle for removing .bam and .sam files mid-run to reduce storage requirement
// Warning: This will break the -resume function of Nextflow
workflows/pipeline.nf (4 changes: 2 additions & 2 deletions)
@@ -145,9 +145,9 @@ workflow PIPELINE {
PARSE_PBP_RESISTANCE(PBP_RESISTANCE.out.json)

// From Channel OVERALL_QC_PASSED_ASSEMBLIES_ch, infer resistance and determinants of other antimicrobials
-// Output into Channel PARSE_OTHER_RESISTANCE.out.result
+// Output into Channel PARSE_OTHER_RESISTANCE.out.report
OTHER_RESISTANCE(GET_ARIBA_DB.out.path, GET_ARIBA_DB.out.database, OVERALL_QC_PASSED_READS_ch)
-PARSE_OTHER_RESISTANCE(OTHER_RESISTANCE.out.reports, params.ariba_metadata)
+PARSE_OTHER_RESISTANCE(OTHER_RESISTANCE.out.report, params.ariba_metadata)

// Generate sample reports by merging outputs from all result-generating modules
GENERATE_SAMPLE_REPORT(