Skip to content

Commit

Permalink
Issue 96 (#98)
Browse files Browse the repository at this point in the history
* getting there

* working in the pipeline and the API

* changed convert_ctd api endpoint response type
  • Loading branch information
manulera authored Aug 16, 2023
1 parent 5ddf478 commit cda2b07
Show file tree
Hide file tree
Showing 12 changed files with 374 additions and 125 deletions.
26 changes: 15 additions & 11 deletions allele_transvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
tqdm.pandas()


def format_for_transvar(row, genome, syntax_rules_aminoacids, syntax_rules_nucleotides) -> str:
def format_transvar_input_list(row, genome, syntax_rules_aminoacids, syntax_rules_nucleotides) -> list[str]:

# Transvar uses only gene_ids, while the allele_qc uses a mix to handle multi-transcripts
gene_systematic_id = row['systematic_id']
Expand All @@ -25,10 +25,10 @@ def format_for_transvar(row, genome, syntax_rules_aminoacids, syntax_rules_nucle
syntax_rule = find_rule(syntax_rules_nucleotides, *row['rules_applied'].split(':'))
prefix = chromosome

capture_groups = syntax_rule.get_groups(row['allele_parts'])
transvar_input = '{}:{}'.format(prefix, syntax_rule.format_for_transvar(capture_groups, gene))
capture_groups = syntax_rule.get_groups(row['allele_parts'], gene)
transvar_input_list = ['{}:{}'.format(prefix, x) for x in syntax_rule.format_for_transvar(capture_groups, gene)]

return transvar_input
return transvar_input_list


def get_transvar_annotation_coordinates(annotations: list[TransvarAnnotation], gene_id: str, transcript_id: str) -> TransvarAnnotation:
Expand All @@ -46,20 +46,24 @@ def get_transvar_annotation_coordinates(annotations: list[TransvarAnnotation], g


def get_transvar_coordinates(row, db, genome, exclude_transcripts):
# print(row['systematic_id'], '<<<>>>', row['transvar_input'])
# print(row['systematic_id'], '<<<>>>', row['transvar_input_list'])
allele_qc_id = handle_systematic_id_for_allele_qc(row['systematic_id'], row['allele_name'], genome)
transcript_id = None if (allele_qc_id == row['systematic_id']) else allele_qc_id
try:
transvar_annotation_list = parse_transvar_string(get_transvar_str_annotation('panno' if 'amino_acid' in row['allele_type'] else 'ganno', row['transvar_input'], db))
return get_transvar_annotation_coordinates(transvar_annotation_list, row['systematic_id'], transcript_id)
transvar_output = list()
for var in row['transvar_input_list']:
transvar_annotation_list = parse_transvar_string(get_transvar_str_annotation('panno' if 'amino_acid' in row['allele_type'] else 'ganno', var, db))
transvar_output.append(get_transvar_annotation_coordinates(transvar_annotation_list, row['systematic_id'], transcript_id))
return transvar_output
except ValueError as e:
if e.args[0] == 'no_valid_transcript_found' and row['systematic_id'] in exclude_transcripts:
return ''
return []
else:
raise e


def main(genome_file, allele_results_file, exclude_transcripts_file, output_file):

with open(genome_file, 'rb') as ins:
genome = pickle.load(ins)

Expand All @@ -84,13 +88,13 @@ def main(genome_file, allele_results_file, exclude_transcripts_file, output_file
data_exploded = data_exploded.explode(['allele_parts', 'rules_applied'])

# Apply transvar to each allele_parts
data_exploded['transvar_input'] = data_exploded.apply(format_for_transvar, axis=1, args=(genome, syntax_rules_aminoacids, syntax_rules_nucleotides))
data_exploded['transvar_input_list'] = data_exploded.apply(format_transvar_input_list, axis=1, args=(genome, syntax_rules_aminoacids, syntax_rules_nucleotides))

anno_db = get_anno_db()
print('Running transvar on variants... (will take a while)')
data_exploded['transvar_coordinates'] = data_exploded.progress_apply(get_transvar_coordinates, args=(anno_db, genome, exclude_transcripts), axis=1)

aggregated_data = data_exploded[['systematic_id', 'allele_description', 'allele_type', 'transvar_coordinates']].groupby(['systematic_id', 'allele_description', 'allele_type'], as_index=False).agg({'transvar_coordinates': '|'.join})
aggregated_data = data_exploded[['systematic_id', 'allele_description', 'allele_type', 'transvar_coordinates']].groupby(['systematic_id', 'allele_description', 'allele_type'], as_index=False).agg({'transvar_coordinates': lambda x: '|'.join(sum(x, []))})

data.merge(aggregated_data, on=['systematic_id', 'allele_description', 'allele_type'], how='left').to_csv(output_file, sep='\t', index=False)

Expand Down
19 changes: 15 additions & 4 deletions api.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from common_autofix_functions import apply_histone_fix
from protein_modification_qc import check_func as check_modification_description
from protein_modification_transvar import get_transvar_coordinates as get_transvar_coordinates_modification, format_for_transvar as format_for_transvar_modification
from allele_transvar import get_transvar_coordinates as get_transvar_coordinates_allele, format_for_transvar as format_for_transvar_allele
from allele_transvar import get_transvar_coordinates as get_transvar_coordinates_allele, format_transvar_input_list as format_for_transvar_allele
from ctd_support import ctd_convert_to_normal_variant
import re
from typing import Optional
from Bio import SeqIO
Expand Down Expand Up @@ -417,14 +418,14 @@ async def allele_transvar_coordinates(systematic_id: str = Query(example="SPBC35
out_list = list()
for allele_part, rule_applied in zip(check_allele_resp.allele_parts.split('|'), check_allele_resp.rules_applied.split('|')):
input_dict = {'systematic_id': systematic_id, 'allele_description': allele_description, 'allele_type': allele_type, 'allele_name': allele_name, 'allele_parts': allele_part, 'rules_applied': rule_applied}
input_dict['transvar_input'] = format_for_transvar_allele(input_dict, genome, syntax_rules_aminoacids, syntax_rules_nucleotides)
input_dict['transvar_input_list'] = format_for_transvar_allele(input_dict, genome, syntax_rules_aminoacids, syntax_rules_nucleotides)
# Can give errors for certain transcripts, e.g. it was giving error for frame-shifted transcripts such as SPAC688.08 S1137
try:
out_list.append(get_transvar_coordinates_allele(input_dict, db, genome, []))
except ValueError as e:
raise HTTPException(400, str(e))

return '|'.join(out_list)
return PlainTextResponse('|'.join(sum(out_list, [])))


@ app.get("/protein_modification_transvar_coordinates", response_class=PlainTextResponse)
Expand All @@ -451,4 +452,14 @@ async def protein_modification_coordinates(systematic_id: str = Query(example="S
except ValueError as e:
raise HTTPException(400, str(e))

return '|'.join(out_list)
return PlainTextResponse('|'.join(out_list))

@ app.get("/convert_ctd_description_to_normal_description", response_class=PlainTextResponse)
async def convert_ctd_description_to_normal_description(ctd_description: str = Query(example="CTD-S2A(r1-r6-2),S7A")):
return PlainTextResponse(ctd_convert_to_normal_variant(ctd_description))

@ app.get("/convert_ctd_position_to_position_list", response_class=PlainTextResponse)
async def convert_ctd_position_to_position_list(ctd_description: str = Query(example="CTD-S2")):
# A bit of a hack, but better than using another function entirely, we append a triptophan (not in the repeats),
# to be able to use the variant function, and then we remove it.
return PlainTextResponse(ctd_convert_to_normal_variant(ctd_description + 'W').replace('W', ''))
139 changes: 139 additions & 0 deletions ctd_support.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import re

aa = 'GPAVLIMCFYWHKRQNEDST'
aa = aa + aa.lower()
aa = f'[{aa}]'

ctd_repetition_regex = r'\(r\d+-r\d+(?:-\d+)?\)'
ctd_repetition_regex_with_groups = r'\(r(\d+)-r(\d+)(?:-(\d+))?\)'
ctd_mutation_regex = f'{aa}\d+{aa}(?:{ctd_repetition_regex})?'
ctd_deletion_regex = f'(?:delta|\u0394|∆)(?:{ctd_repetition_regex})?'

# This dictionary has keys that are the systematic IDs of the genes that have CTDs,
# the value is a dictionary with:
# - positions: a dictionary with keys that are the residues of the canonical repeat
# e.g. YSPTSPS for SPBC28F2.12, and the value is the repeats in which the residue is
# present, zero-based index.
# - shifts: a tuple of two integers the first value is the position in the protein where
# an aminoacid is missing (-1) or inserted (+1) with respect to the canonical repeat.
# There is only one in rpb1, but a list of tuples could be used if more are needed.
# - start: the position of the first residue of the CTD in the protein (zero-based index)
# - length: the length of the CTD

rpb1_dictionary = {
'positions': {
'Y1': range(29),
'S2': [1, 2] + list(range(4, 29)),
'P3': [1, ] + list(range(3, 8)) + list(range(9, 29)),
'T4': [0, 2] + list(range(4, 29)),
'S5': range(29),
'P6': range(29),
'S7': [0, 3] + list(range(4, 29)),
},
'shift': (1566, -1),
'start': 1550,
'length': 202,
'nb_repeats': 29,
}


def apply_shift(match: re.match, shift: tuple[int, int]):
num_str = match.group()
num = int(num_str)
# If the number is smaller than the position of the missing aminoacid,
# return the number
if num <= shift[0]:
return num_str
# If the number is bigger than the position of the missing aminoacid,
# return the number plus the shift
return str(shift[1] + num)


def ctd_further_check(g, gg):
return gg['CDS'].qualifiers['systematic_id'][0] in ['SPBC28F2.12']


def ctd_convert_to_normal_variant(ctd_substring: str):

mutations = re.findall(ctd_mutation_regex, ctd_substring)
deletions= re.findall(ctd_deletion_regex, ctd_substring)
starting_position = rpb1_dictionary['start']
repeat_length = len(rpb1_dictionary['positions'])
out_list = []
deleted_repeats = list()
for deletion in deletions:
# Entire deletion, always correct, and takes over everything else
if '(' not in deletion:
return '{}-{}'.format(rpb1_dictionary['start'] + 1, rpb1_dictionary['start'] + rpb1_dictionary['length'])
# Deletion with repeat number
match = re.search(ctd_repetition_regex_with_groups, deletion)
start, stop, step = match.groups()
step = int(step) if step is not None else 1
start, stop = int(start), int(stop)
deleted_ranges = list()

for repeat_number in range(start, stop + 1, step):
deleted_repeats.append(repeat_number)
repeat_start = (repeat_number - 1) * repeat_length + starting_position + 1
repeat_end = repeat_start + repeat_length - 1
deleted_ranges.append((repeat_start, repeat_end))
if step == 1:
out_list.append('{}-{}'.format(deleted_ranges[0][0], deleted_ranges[-1][1]))
else:
out_list.append(','.join(['{}-{}'.format(*x) for x in deleted_ranges]))

for mutation in mutations:
original_residue, index_in_repeat, replaced_by = re.search(r'([A-Za-z])(\d+)([A-Za-z])', mutation).groups()
index_in_repeat = int(index_in_repeat)
repeats_where_residue_is_present = rpb1_dictionary['positions'][mutation[:2]]
match = re.search(ctd_repetition_regex_with_groups, mutation)
if match is None:
start, stop, step = 1, rpb1_dictionary['nb_repeats'], 1
else:
start, stop, step = match.groups()
step = int(step) if step is not None else 1
start, stop = int(start), int(stop)
for repeat_number in range(start, stop + 1, step):
if repeat_number in deleted_repeats:
continue
if repeat_number - 1 not in repeats_where_residue_is_present:
continue
mutated_position = index_in_repeat + (repeat_number - 1) * repeat_length + starting_position
out_list.append('{}{}{}'.format(original_residue, mutated_position, replaced_by))
out_str = ','.join(out_list)

return re.sub(r'(\d+)', lambda x: apply_shift(x, rpb1_dictionary['shift']), out_str)


def ctd_check_sequence(ctd_substring: str):
sequence_errors = []
mutations = re.findall(ctd_mutation_regex, ctd_substring)

for mutation in mutations:
if mutation[:2] not in rpb1_dictionary['positions']:
sequence_errors.append('CTD-' + mutation[:2])

match = re.search(ctd_repetition_regex_with_groups, ctd_substring)
start, stop, step = match.groups()
if int(stop) > rpb1_dictionary['nb_repeats']:
sequence_errors.append('CTD-r' + stop)
if int(start) > rpb1_dictionary['nb_repeats']:
sequence_errors.append('CTD-r' + start)

return '/'.join(sequence_errors)


def ctd_format_for_transvar(capture_groups: list[str], gene: dict) -> list[str]:
ctd_substring = capture_groups[0]
result = list()
for ele in ctd_convert_to_normal_variant(ctd_substring).split(','):
if '-' in ele:
result.append('p.{}_{}del'.format(*ele.split('-')))
else:
result.append('p.{}'.format(ele))
return result


def ctd_apply_syntax(ctd_substring: str):
ctd_bits = re.findall(f'({ctd_mutation_regex}|{ctd_deletion_regex})', ctd_substring)
return 'CTD-' + ','.join(ctd_bits)
Loading

0 comments on commit cda2b07

Please sign in to comment.