-
Notifications
You must be signed in to change notification settings - Fork 1
/
allele_transvar.py
138 lines (102 loc) · 6.92 KB
/
allele_transvar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
Uses transvar to represent the allele modifications in standard variant nomenclature.
Removes all lines with sequence errors (needs_fixing == True).
"""
import pandas
import pickle
import argparse
from grammar import aminoacid_grammar, nucleotide_grammar
from models import SyntaxRule, find_rule
from transvar_functions import parse_transvar_string, get_transvar_str_annotation, get_anno_db, TransvarAnnotation
from genome_functions import handle_systematic_id_for_allele_qc
from tqdm import tqdm
tqdm.pandas()
def format_transvar_input_list(row, genome, syntax_rules_aminoacids, syntax_rules_nucleotides) -> list[str]:
# Transvar uses only gene_ids, while the allele_qc uses a mix to handle multi-transcripts
gene_systematic_id = row['systematic_id']
allele_qc_id = handle_systematic_id_for_allele_qc(row['systematic_id'], row['allele_name'], genome)
gene = genome[allele_qc_id]
chromosome = gene['contig'].id
if 'amino_acid' in row['allele_type']:
syntax_rule = find_rule(syntax_rules_aminoacids, *row['rules_applied'].split(':'))
prefix = gene_systematic_id
else:
syntax_rule = find_rule(syntax_rules_nucleotides, *row['rules_applied'].split(':'))
prefix = chromosome
capture_groups = syntax_rule.get_groups(row['allele_parts'], gene)
transvar_input_list = ['{}:{}'.format(prefix, x) for x in syntax_rule.format_for_transvar(capture_groups, gene)]
return transvar_input_list
def get_transvar_annotation_coordinates(annotations: list[TransvarAnnotation], gene_id: str, transcript_id: str) -> TransvarAnnotation:
# There may be multiple annotations for the same systematic_id if there are multiple
# transcripts
for annotation in annotations:
if annotation.gene == gene_id:
if transcript_id is None:
return annotation.coordinates
elif annotation.transcript.split()[0] == transcript_id:
return annotation.coordinates
# Some variants fall out of the transcript, so we just return the genomic variant
return '{}/./.'.format(annotations[0].coordinates.split('/')[0])
def get_transvar_coordinates(row, db, genome, exclude_transcripts, sgd_mode=False):
allele_qc_id = handle_systematic_id_for_allele_qc(row['systematic_id'], row['allele_name'], genome)
transcript_id = None if (allele_qc_id == row['systematic_id']) else allele_qc_id
try:
transvar_output = list()
for var in row['transvar_input_list']:
transvar_annotation_list = parse_transvar_string(get_transvar_str_annotation('panno' if 'amino_acid' in row['allele_type'] else 'ganno', var, db))
transvar_output.append(get_transvar_annotation_coordinates(transvar_annotation_list, row['systematic_id'], transcript_id))
return transvar_output
except ValueError as e:
# For now we skip the transcripts that don't work, but we should address this
if sgd_mode:
print('skipping transcript {} for {}'.format(row['systematic_id'], row['allele_name']))
return []
elif e.args[0] == 'no_valid_transcript_found' and row['systematic_id'] in exclude_transcripts:
return []
else:
raise e
def main(genome_file, allele_results_file, exclude_transcripts_file, output_file, sgd_mode, transvardb, genome_fasta):
with open(genome_file, 'rb') as ins:
genome = pickle.load(ins)
with open(exclude_transcripts_file) as ins:
exclude_transcripts = set(map(str.strip, ins.readlines()))
syntax_rules_aminoacids = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar]
syntax_rules_nucleotides = [SyntaxRule.parse_obj(r) for r in nucleotide_grammar]
data = pandas.read_csv(allele_results_file, sep='\t', na_filter=False)
if sgd_mode:
# Ammend wrong type:
wrong_type = (data['change_type_to'] != '') & \
(data['pattern_error'] == '') & \
(data['invalid_error'] == '') & \
(data['sequence_error'] == '')
data.loc[wrong_type, 'allele_type'] = data.loc[wrong_type, 'change_type_to']
data.loc[wrong_type, 'needs_fixing'] = False
# Remove all errors
data = data[~data['needs_fixing']].copy()
# Only allele types that can be fixed
data = data[data['allele_type'].str.contains('amino_acid|nucleotide', regex=True)].copy()
# Explode the allele_parts and the rules_applied
data_exploded = data.copy()
data_exploded.loc[:, 'allele_parts'] = data_exploded['allele_parts'].apply(str.split, args=['|'])
data_exploded.loc[:, 'rules_applied'] = data_exploded['rules_applied'].apply(str.split, args=['|'])
data_exploded = data_exploded.explode(['allele_parts', 'rules_applied'])
# Apply transvar to each allele_parts
data_exploded['transvar_input_list'] = data_exploded.apply(format_transvar_input_list, axis=1, args=(genome, syntax_rules_aminoacids, syntax_rules_nucleotides))
anno_db = get_anno_db(transvardb, genome_fasta)
print('Running transvar on variants... (will take a while)')
data_exploded['transvar_coordinates'] = data_exploded.progress_apply(get_transvar_coordinates, args=(anno_db, genome, exclude_transcripts, sgd_mode), axis=1)
aggregated_data = data_exploded[['systematic_id', 'allele_description', 'allele_type', 'transvar_coordinates']].groupby(['systematic_id', 'allele_description', 'allele_type'], as_index=False).agg({'transvar_coordinates': lambda x: '|'.join(sum(x, []))})
data.merge(aggregated_data, on=['systematic_id', 'allele_description', 'allele_type'], how='left').to_csv(output_file, sep='\t', index=False)
if __name__ == '__main__':
class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
pass
parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
parser.add_argument('--genome', default='data/genome.pickle', help='input: genome dictionary built from contig files. (see load_genome.py)')
parser.add_argument('--allele_results', default='results/allele_results.tsv', help='input: output of allele_qc.py')
parser.add_argument('--exclude_transcripts', default='data/frame_shifted_transcripts.tsv', help='input: transcripts to exclude from transvar because they are known to be problematic')
parser.add_argument('--genome_fasta', default='data/pombe_genome.fa', help='input: genome fasta file used by transvar')
parser.add_argument('--transvardb', default='data/pombe_genome.gtf.transvardb', help='input: path of transvardb file')
parser.add_argument('--output', default='results/allele_results_transvar.tsv', help='output: file with extra column with transvar coordinates')
parser.add_argument('--sgd_mode', type=bool, default=False, help='Skip transcripts that don\'t work and fix allele types, this arg should be removed in the future.')
args = parser.parse_args()
main(args.genome, args.allele_results, args.exclude_transcripts, args.output, args.sgd_mode, args.transvardb, args.genome_fasta)