Issue 96 (#98)

* getting there * working in the pipeline and the API * changed convert_ctd api endpoint response type
pombase · Aug 16, 2023 · cda2b07 · cda2b07
1 parent 5ddf478
commit cda2b07
Show file tree

Hide file tree

Showing 12 changed files with 374 additions and 125 deletions.
diff --git a/allele_transvar.py b/allele_transvar.py
@@ -10,7 +10,7 @@
 tqdm.pandas()
 
 
-def format_for_transvar(row, genome, syntax_rules_aminoacids, syntax_rules_nucleotides) -> str:
+def format_transvar_input_list(row, genome, syntax_rules_aminoacids, syntax_rules_nucleotides) -> list[str]:
 
     # Transvar uses only gene_ids, while the allele_qc uses a mix to handle multi-transcripts
     gene_systematic_id = row['systematic_id']
@@ -25,10 +25,10 @@ def format_for_transvar(row, genome, syntax_rules_aminoacids, syntax_rules_nucle
         syntax_rule = find_rule(syntax_rules_nucleotides, *row['rules_applied'].split(':'))
         prefix = chromosome
 
-    capture_groups = syntax_rule.get_groups(row['allele_parts'])
-    transvar_input = '{}:{}'.format(prefix, syntax_rule.format_for_transvar(capture_groups, gene))
+    capture_groups = syntax_rule.get_groups(row['allele_parts'], gene)
+    transvar_input_list = ['{}:{}'.format(prefix, x) for x in syntax_rule.format_for_transvar(capture_groups, gene)]
 
-    return transvar_input
+    return transvar_input_list
 
 
 def get_transvar_annotation_coordinates(annotations: list[TransvarAnnotation], gene_id: str, transcript_id: str) -> TransvarAnnotation:
@@ -46,20 +46,24 @@ def get_transvar_annotation_coordinates(annotations: list[TransvarAnnotation], g
 
 
 def get_transvar_coordinates(row, db, genome, exclude_transcripts):
-    # print(row['systematic_id'], '<<<>>>', row['transvar_input'])
+    # print(row['systematic_id'], '<<<>>>', row['transvar_input_list'])
     allele_qc_id = handle_systematic_id_for_allele_qc(row['systematic_id'], row['allele_name'], genome)
     transcript_id = None if (allele_qc_id == row['systematic_id']) else allele_qc_id
     try:
-        transvar_annotation_list = parse_transvar_string(get_transvar_str_annotation('panno' if 'amino_acid' in row['allele_type'] else 'ganno', row['transvar_input'], db))
-        return get_transvar_annotation_coordinates(transvar_annotation_list, row['systematic_id'], transcript_id)
+        transvar_output = list()
+        for var in row['transvar_input_list']:
+            transvar_annotation_list = parse_transvar_string(get_transvar_str_annotation('panno' if 'amino_acid' in row['allele_type'] else 'ganno', var, db))
+            transvar_output.append(get_transvar_annotation_coordinates(transvar_annotation_list, row['systematic_id'], transcript_id))
+        return transvar_output
     except ValueError as e:
         if e.args[0] == 'no_valid_transcript_found' and row['systematic_id'] in exclude_transcripts:
-            return ''
+            return []
         else:
             raise e
 
+
 def main(genome_file, allele_results_file, exclude_transcripts_file, output_file):
-    
+
     with open(genome_file, 'rb') as ins:
         genome = pickle.load(ins)
 
@@ -84,13 +88,13 @@ def main(genome_file, allele_results_file, exclude_transcripts_file, output_file
     data_exploded = data_exploded.explode(['allele_parts', 'rules_applied'])
 
     # Apply transvar to each allele_parts
-    data_exploded['transvar_input'] = data_exploded.apply(format_for_transvar, axis=1, args=(genome, syntax_rules_aminoacids, syntax_rules_nucleotides))
+    data_exploded['transvar_input_list'] = data_exploded.apply(format_transvar_input_list, axis=1, args=(genome, syntax_rules_aminoacids, syntax_rules_nucleotides))
 
     anno_db = get_anno_db()
     print('Running transvar on variants... (will take a while)')
     data_exploded['transvar_coordinates'] = data_exploded.progress_apply(get_transvar_coordinates, args=(anno_db, genome, exclude_transcripts), axis=1)
 
-    aggregated_data = data_exploded[['systematic_id', 'allele_description', 'allele_type', 'transvar_coordinates']].groupby(['systematic_id', 'allele_description', 'allele_type'], as_index=False).agg({'transvar_coordinates': '|'.join})
+    aggregated_data = data_exploded[['systematic_id', 'allele_description', 'allele_type', 'transvar_coordinates']].groupby(['systematic_id', 'allele_description', 'allele_type'], as_index=False).agg({'transvar_coordinates': lambda x: '|'.join(sum(x, []))})
 
     data.merge(aggregated_data, on=['systematic_id', 'allele_description', 'allele_type'], how='left').to_csv(output_file, sep='\t', index=False)
 

diff --git a/api.py b/api.py
@@ -11,7 +11,8 @@
 from common_autofix_functions import apply_histone_fix
 from protein_modification_qc import check_func as check_modification_description
 from protein_modification_transvar import get_transvar_coordinates as get_transvar_coordinates_modification, format_for_transvar as format_for_transvar_modification
-from allele_transvar import get_transvar_coordinates as get_transvar_coordinates_allele, format_for_transvar as format_for_transvar_allele
+from allele_transvar import get_transvar_coordinates as get_transvar_coordinates_allele, format_transvar_input_list as format_for_transvar_allele
+from ctd_support import ctd_convert_to_normal_variant
 import re
 from typing import Optional
 from Bio import SeqIO
@@ -417,14 +418,14 @@ async def allele_transvar_coordinates(systematic_id: str = Query(example="SPBC35
     out_list = list()
     for allele_part, rule_applied in zip(check_allele_resp.allele_parts.split('|'), check_allele_resp.rules_applied.split('|')):
         input_dict = {'systematic_id': systematic_id, 'allele_description': allele_description, 'allele_type': allele_type, 'allele_name': allele_name, 'allele_parts': allele_part, 'rules_applied': rule_applied}
-        input_dict['transvar_input'] = format_for_transvar_allele(input_dict, genome, syntax_rules_aminoacids, syntax_rules_nucleotides)
+        input_dict['transvar_input_list'] = format_for_transvar_allele(input_dict, genome, syntax_rules_aminoacids, syntax_rules_nucleotides)
         # Can give errors for certain transcripts, e.g. it was giving error for frame-shifted transcripts such as SPAC688.08 S1137
         try:
             out_list.append(get_transvar_coordinates_allele(input_dict, db, genome, []))
         except ValueError as e:
             raise HTTPException(400, str(e))
 
-    return '|'.join(out_list)
+    return PlainTextResponse('|'.join(sum(out_list, [])))
 
 
 @ app.get("/protein_modification_transvar_coordinates", response_class=PlainTextResponse)
@@ -451,4 +452,14 @@ async def protein_modification_coordinates(systematic_id: str = Query(example="S
         except ValueError as e:
             raise HTTPException(400, str(e))
 
-    return '|'.join(out_list)
+    return PlainTextResponse('|'.join(out_list))
+
+@ app.get("/convert_ctd_description_to_normal_description", response_class=PlainTextResponse)
+async def convert_ctd_description_to_normal_description(ctd_description: str = Query(example="CTD-S2A(r1-r6-2),S7A")):
+    return PlainTextResponse(ctd_convert_to_normal_variant(ctd_description))
+
+@ app.get("/convert_ctd_position_to_position_list", response_class=PlainTextResponse)
+async def convert_ctd_position_to_position_list(ctd_description: str = Query(example="CTD-S2")):
+    # A bit of a hack, but better than using another function entirely, we append a triptophan (not in the repeats),
+    # to be able to use the variant function, and then we remove it.
+    return PlainTextResponse(ctd_convert_to_normal_variant(ctd_description + 'W').replace('W', ''))
diff --git a/ctd_support.py b/ctd_support.py
@@ -0,0 +1,139 @@
+import re
+
+aa = 'GPAVLIMCFYWHKRQNEDST'
+aa = aa + aa.lower()
+aa = f'[{aa}]'
+
+ctd_repetition_regex = r'\(r\d+-r\d+(?:-\d+)?\)'
+ctd_repetition_regex_with_groups = r'\(r(\d+)-r(\d+)(?:-(\d+))?\)'
+ctd_mutation_regex = f'{aa}\d+{aa}(?:{ctd_repetition_regex})?'
+ctd_deletion_regex = f'(?:delta|\u0394|∆)(?:{ctd_repetition_regex})?'
+
+# This dictionary has keys that are the systematic IDs of the genes that have CTDs,
+# the value is a dictionary with:
+# - positions: a dictionary with keys that are the residues of the canonical repeat
+#   e.g. YSPTSPS for SPBC28F2.12, and the value is the repeats in which the residue is
+#   present, zero-based index.
+# - shifts: a tuple of two integers the first value is the position in the protein where
+#   an aminoacid is missing (-1) or inserted (+1) with respect to the canonical repeat.
+#   There is only one in rpb1, but a list of tuples could be used if more are needed.
+# - start: the position of the first residue of the CTD in the protein (zero-based index)
+# - length: the length of the CTD
+
+rpb1_dictionary = {
+    'positions': {
+        'Y1': range(29),
+        'S2': [1, 2] + list(range(4, 29)),
+        'P3': [1, ] + list(range(3, 8)) + list(range(9, 29)),
+        'T4': [0, 2] + list(range(4, 29)),
+        'S5': range(29),
+        'P6': range(29),
+        'S7': [0, 3] + list(range(4, 29)),
+    },
+    'shift': (1566, -1),
+    'start': 1550,
+    'length': 202,
+    'nb_repeats': 29,
+}
+
+
+def apply_shift(match: re.match, shift: tuple[int, int]):
+    num_str = match.group()
+    num = int(num_str)
+    # If the number is smaller than the position of the missing aminoacid,
+    # return the number
+    if num <= shift[0]:
+        return num_str
+    # If the number is bigger than the position of the missing aminoacid,
+    # return the number plus the shift
+    return str(shift[1] + num)
+
+
+def ctd_further_check(g, gg):
+    return gg['CDS'].qualifiers['systematic_id'][0] in ['SPBC28F2.12'] 
+
+
+def ctd_convert_to_normal_variant(ctd_substring: str):
+
+    mutations = re.findall(ctd_mutation_regex, ctd_substring)
+    deletions= re.findall(ctd_deletion_regex, ctd_substring)
+    starting_position = rpb1_dictionary['start']
+    repeat_length = len(rpb1_dictionary['positions'])
+    out_list = []
+    deleted_repeats = list()
+    for deletion in deletions:
+        # Entire deletion, always correct, and takes over everything else
+        if '(' not in deletion:
+            return '{}-{}'.format(rpb1_dictionary['start'] + 1, rpb1_dictionary['start'] + rpb1_dictionary['length'])
+        # Deletion with repeat number
+        match = re.search(ctd_repetition_regex_with_groups, deletion)
+        start, stop, step = match.groups()
+        step = int(step) if step is not None else 1
+        start, stop = int(start), int(stop)
+        deleted_ranges = list()
+
+        for repeat_number in range(start, stop + 1, step):
+            deleted_repeats.append(repeat_number)
+            repeat_start = (repeat_number - 1) * repeat_length + starting_position + 1
+            repeat_end = repeat_start + repeat_length - 1
+            deleted_ranges.append((repeat_start, repeat_end))
+        if step == 1:
+            out_list.append('{}-{}'.format(deleted_ranges[0][0], deleted_ranges[-1][1]))
+        else:
+            out_list.append(','.join(['{}-{}'.format(*x) for x in deleted_ranges]))
+
+    for mutation in mutations:
+        original_residue, index_in_repeat, replaced_by = re.search(r'([A-Za-z])(\d+)([A-Za-z])', mutation).groups()
+        index_in_repeat = int(index_in_repeat)
+        repeats_where_residue_is_present = rpb1_dictionary['positions'][mutation[:2]]
+        match = re.search(ctd_repetition_regex_with_groups, mutation)
+        if match is None:
+            start, stop, step = 1, rpb1_dictionary['nb_repeats'], 1
+        else:
+            start, stop, step = match.groups()
+            step = int(step) if step is not None else 1
+            start, stop = int(start), int(stop)
+        for repeat_number in range(start, stop + 1, step):
+            if repeat_number in deleted_repeats:
+                continue
+            if repeat_number - 1 not in repeats_where_residue_is_present:
+                continue
+            mutated_position = index_in_repeat + (repeat_number - 1) * repeat_length + starting_position
+            out_list.append('{}{}{}'.format(original_residue, mutated_position, replaced_by))
+    out_str = ','.join(out_list)
+
+    return re.sub(r'(\d+)', lambda x: apply_shift(x, rpb1_dictionary['shift']), out_str)
+
+
+def ctd_check_sequence(ctd_substring: str):
+    sequence_errors = []
+    mutations = re.findall(ctd_mutation_regex, ctd_substring)
+
+    for mutation in mutations:
+        if mutation[:2] not in rpb1_dictionary['positions']:
+            sequence_errors.append('CTD-' + mutation[:2])
+
+    match = re.search(ctd_repetition_regex_with_groups, ctd_substring)
+    start, stop, step = match.groups()
+    if int(stop) > rpb1_dictionary['nb_repeats']:
+        sequence_errors.append('CTD-r' + stop)
+    if int(start) > rpb1_dictionary['nb_repeats']:
+        sequence_errors.append('CTD-r' + start)
+
+    return '/'.join(sequence_errors)
+
+
+def ctd_format_for_transvar(capture_groups: list[str], gene: dict) -> list[str]:
+    ctd_substring = capture_groups[0]
+    result = list()
+    for ele in ctd_convert_to_normal_variant(ctd_substring).split(','):
+        if '-' in ele:
+            result.append('p.{}_{}del'.format(*ele.split('-')))
+        else:
+            result.append('p.{}'.format(ele))
+    return result
+
+
+def ctd_apply_syntax(ctd_substring: str):
+    ctd_bits = re.findall(f'({ctd_mutation_regex}|{ctd_deletion_regex})', ctd_substring)
+    return 'CTD-' + ','.join(ctd_bits)