forked from pha4ge/hAMRonization
-
Notifications
You must be signed in to change notification settings - Fork 0
/
RgiIO.py
111 lines (100 loc) · 4.77 KB
/
RgiIO.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
import csv
import warnings
import math
from .Interfaces import hAMRonizedResultIterator
required_metadata = ['analysis_software_version',
'reference_database_version',
'input_file_name']
class RgiIterator(hAMRonizedResultIterator):
def __init__(self, source, metadata):
metadata['analysis_software_name'] = 'rgi'
metadata['reference_database_id'] = 'CARD'
self.metadata = metadata
with open(source) as fh:
header = next(fh)
# i.e. RGI-bwt
if 'Resistomes & Variants: Observed in Genome(s)' in \
header.strip().split('\t'):
self.field_mapping = {
'ARO Term': 'gene_symbol',
'ARO Accession': 'reference_accession',
'Reference Model Type': None,
'Reference DB': 'reference_database_id',
'Alleles with Mapped Reads': None,
'Reference Allele(s) Identity '
'to CARD Reference Protein (%)': 'sequence_identity',
'Resistomes & Variants: Observed in Genome(s)': None,
'Resistomes & Variants: Observed in Plasmid(s)': None,
'Resistomes & Variants: Observed Pathogen(s)': None,
'Completely Mapped Reads': None,
'Mapped Reads with Flanking Sequence': None,
'All Mapped Reads': None,
'Average Percent Coverage': 'coverage_percentage',
'Average Length Coverage (bp)': 'input_gene_length',
'Average MAPQ (Completely Mapped Reads)': None,
'Number of Mapped Baits': None,
'Number of Mapped Baits with Reads': None,
'Average Number of reads per Bait': None,
'Number of reads per Bait '
'Coefficient of Variation (%)': None,
'Number of reads mapping to baits '
'and mapping to complete gene': None,
'Number of reads mapping to baits and '
'mapping to complete gene (%)': None,
'Mate Pair Linkage (# reads)': None,
'Reference Length': 'reference_gene_length',
'AMR Gene Family': 'gene_name',
'Drug Class': 'drug_class',
'Resistance Mechanism': 'resistance_mechanism'}
else:
# normal RGI mode
self.field_mapping = {
'ORF_ID': 'input_sequence_id',
'Contig': None,
'Start': 'input_gene_start',
'Stop': 'input_gene_stop',
'Orientation': 'strand_orientation',
'Cut_Off': None,
'Pass_Bitscore': None,
'Best_Hit_Bitscore': None,
'Best_Hit_ARO': 'gene_symbol',
'Best_Identities': 'sequence_identity',
'ARO': 'reference_accession',
'Model_type': None,
'SNPs_in_Best_Hit_ARO': None,
'Other_SNPs': None,
'Drug Class': 'drug_class',
'Resistance Mechanism': 'resistance_mechanism',
'AMR Gene Family': 'gene_name',
'Predicted_DNA': None,
'Predicted_Protein': None,
'CARD_Protein_Sequence': None,
'Percentage Length of '
'Reference Sequence': 'coverage_percentage',
'ID': None,
'Model_ID': None,
'Nudged': None,
'Note': None}
super().__init__(source, self.field_mapping, self.metadata)
def parse(self, handle):
"""
Read each and return it
"""
# skip any manually specified fields for later
reader = csv.DictReader(handle, delimiter='\t')
skipped_mutational = 0
for result in reader:
if 'Model_type' in result:
if result['Model_type'] != 'protein homolog model':
skipped_mutational += 1
continue
# round down average length of coverage so its comparable to other
# target lengths
if 'Average Length Coverage (bp)' in result:
result['Average Length Coverage (bp)'] = \
math.floor(float(result['Average Length Coverage (bp)']))
yield self.hAMRonize(result, self.metadata)
if skipped_mutational > 0:
warnings.warn(f"Skipping {skipped_mutational} mutational AMR "
f"records from {self.metadata['input_file_name']}")