Commit
Sgd allele auto fix + transvar support (#105)
* intermediary commit prior to applying the fixes only to the alleles with sequence errors

* auto_fix and transvar work (hacky)

* remove type_error for showing

* Improved sgd support + changed grammar for better invalid error messages for #101

* fix tests and better handling of transvar for SGD vs. PomBase
manulera authored Aug 28, 2023
1 parent 2d39a55 commit f3571cb
Showing 24 changed files with 42,694 additions and 218 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -25,3 +25,7 @@ results/all_protein_variant_sequences.fasta

data/sgd/genome_embl_files
data/sgd/genome.pickle
data/sgd/current_protein_seqs.fasta
data/sgd/features.gtf.*
data/sgd/genome_sequence.fsa.fai
data/sgd/protein_coding_sgd.txt
357 changes: 191 additions & 166 deletions allele_auto_fix.py

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions allele_fixes.py
@@ -70,7 +70,7 @@ def old_coords_fix(coordinate_changes, targets):
new_sequence = new_alignment.replace('-', '')
old_sequence = old_alignment.replace('-', '')

this_revision = {'revision': prev_coord['revision'], 'location': prev_coord['old_coord'], 'values': list()}
this_revision = {'revision': prev_coord['revision'], 'location': prev_coord['old_coord'] if 'old_coord' in prev_coord else '', 'values': list()}
# remap the coordinates
for t in targets:
# The position must exist in the old sequence
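A minimal sketch (not part of this diff) of the fallback the changed line introduces: when a coordinate-change entry lacks an 'old_coord' key, the revision record gets an empty location string; dict.get with a default is an equivalent spelling.

# Illustration only; prev_coord stands for one coordinate-change entry.
prev_coord = {'revision': '20180201'}  # hypothetical entry without 'old_coord'

location = prev_coord['old_coord'] if 'old_coord' in prev_coord else ''
location = prev_coord.get('old_coord', '')  # equivalent, shorter form

this_revision = {'revision': prev_coord['revision'], 'location': location, 'values': list()}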
@@ -129,7 +129,8 @@ def multi_shift_fix(seq, targets):
targets = ['A3', 'V4', '8']
returns ['A1,V2'] # Because position 10 does not exist
"""

if 'G71V' in targets:
print(targets)
all_indexes = list()
for target in targets:
all_indexes.extend([(int(i) - 1) for i in re.findall(r'\d+', target)])
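A standalone sketch of the position extraction multi_shift_fix relies on, using the targets from the docstring above (each run of digits becomes a zero-based index):

import re

targets = ['A3', 'V4', '8']
all_indexes = list()
for target in targets:
    all_indexes.extend([(int(i) - 1) for i in re.findall(r'\d+', target)])
print(all_indexes)  # [2, 3, 7]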
29 changes: 23 additions & 6 deletions allele_transvar.py
@@ -45,8 +45,8 @@ def get_transvar_annotation_coordinates(annotations: list[TransvarAnnotation], g
return '{}/./.'.format(annotations[0].coordinates.split('/')[0])


def get_transvar_coordinates(row, db, genome, exclude_transcripts):
# print(row['systematic_id'], '<<<>>>', row['transvar_input_list'])
def get_transvar_coordinates(row, db, genome, exclude_transcripts, sgd_mode=False):

allele_qc_id = handle_systematic_id_for_allele_qc(row['systematic_id'], row['allele_name'], genome)
transcript_id = None if (allele_qc_id == row['systematic_id']) else allele_qc_id
try:
@@ -56,13 +56,17 @@ def get_transvar_coordinates(row, db, genome, exclude_transcripts):
transvar_output.append(get_transvar_annotation_coordinates(transvar_annotation_list, row['systematic_id'], transcript_id))
return transvar_output
except ValueError as e:
if e.args[0] == 'no_valid_transcript_found' and row['systematic_id'] in exclude_transcripts:
# For now we skip the transcripts that don't work, but we should address this
if sgd_mode:
print('skipping transcript {} for {}'.format(row['systematic_id'], row['allele_name']))
return []
elif e.args[0] == 'no_valid_transcript_found' and row['systematic_id'] in exclude_transcripts:
return []
else:
raise e


def main(genome_file, allele_results_file, exclude_transcripts_file, output_file):
def main(genome_file, allele_results_file, exclude_transcripts_file, output_file, sgd_mode, transvardb, genome_fasta):

with open(genome_file, 'rb') as ins:
genome = pickle.load(ins)
@@ -75,6 +79,15 @@ def main(genome_file, allele_results_file, exclude_transcripts_file, output_file

data = pandas.read_csv(allele_results_file, sep='\t', na_filter=False)

if sgd_mode:
# Amend wrong type:
wrong_type = (data['change_type_to'] != '') & \
(data['pattern_error'] == '') & \
(data['invalid_error'] == '') & \
(data['sequence_error'] == '')
data.loc[wrong_type, 'allele_type'] = data.loc[wrong_type, 'change_type_to']
data.loc[wrong_type, 'needs_fixing'] = False

# Remove all errors
data = data[~data['needs_fixing']].copy()
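A sketch of the sgd_mode amendment above on a toy DataFrame (column values are invented for illustration): rows with a suggested change_type_to and no pattern, invalid or sequence errors get their allele_type replaced and are no longer flagged as needing fixing.

import pandas

# Invented example rows with the columns used above.
data = pandas.DataFrame({
    'allele_type': ['type_a', 'type_b'],
    'change_type_to': ['type_c', ''],
    'pattern_error': ['', ''],
    'invalid_error': ['', 'some_error'],
    'sequence_error': ['', ''],
    'needs_fixing': [True, True],
})

wrong_type = (data['change_type_to'] != '') & \
    (data['pattern_error'] == '') & \
    (data['invalid_error'] == '') & \
    (data['sequence_error'] == '')

data.loc[wrong_type, 'allele_type'] = data.loc[wrong_type, 'change_type_to']
data.loc[wrong_type, 'needs_fixing'] = False
# First row becomes allele_type 'type_c' with needs_fixing False; the second row is untouched.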

@@ -90,7 +103,7 @@ def main(genome_file, allele_results_file, exclude_transcripts_file, output_file
# Apply transvar to each allele_parts
data_exploded['transvar_input_list'] = data_exploded.apply(format_transvar_input_list, axis=1, args=(genome, syntax_rules_aminoacids, syntax_rules_nucleotides))

anno_db = get_anno_db()
anno_db = get_anno_db(transvardb, genome_fasta)
print('Running transvar on variants... (will take a while)')
data_exploded['transvar_coordinates'] = data_exploded.progress_apply(get_transvar_coordinates, args=(anno_db, genome, exclude_transcripts), axis=1)

@@ -107,8 +120,12 @@ class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionH
parser.add_argument('--genome', default='data/genome.pickle', help='genome dictionary built from contig files.')
parser.add_argument('--allele_results', default='results/allele_results.tsv')
parser.add_argument('--exclude_transcripts', default='data/frame_shifted_transcripts.tsv')
parser.add_argument('--genome_fasta', default='data/pombe_genome.fa')
parser.add_argument('--transvardb', default='data/pombe_genome.gtf.transvardb')
parser.add_argument('--output', default='results/allele_results_transvar.tsv')

parser.add_argument('--sgd_mode', type=bool, default=False, help='Skip transcripts that don\'t work and fix allele types, this arg should be removed in the future.')

args = parser.parse_args()

main(args.genome, args.allele_results, args.exclude_transcripts, args.output)
main(args.genome, args.allele_results, args.exclude_transcripts, args.output, args.sgd_mode, args.transvardb, args.genome_fasta)
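One behavioural note on the new --sgd_mode option: argparse applies type=bool to the raw string, and bool() of any non-empty string is True, so passing --sgd_mode False still enables the mode; only omitting the flag keeps the default of False. A hypothetical store_true spelling (not what this commit uses) would look like:

# Hypothetical alternative, shown for comparison only:
parser.add_argument('--sgd_mode', action='store_true',
                    help="Skip transcripts that don't work and fix allele types.")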
7 changes: 5 additions & 2 deletions api.py
@@ -378,6 +378,7 @@ async def get_residue_at_position(systematic_id: str = Query(example='SPAPB1A10.
@ app.get("/ganno", summary='Variant described at the genome level (gDNA)', response_model=list[TransvarAnnotation])
async def ganno(variant_description: str = Query(example="II:g.178497T>A", description='Variant described at the genome level (gDNA)')) -> list[TransvarAnnotation]:
try:
db = get_anno_db('data/pombe_genome.gtf.transvardb', 'data/pombe_genome.fa')
return parse_transvar_string(get_transvar_str_annotation('ganno', variant_description))
except Exception as e:
raise HTTPException(400, str(e))
@@ -386,6 +387,7 @@ async def ganno(variant_description: str = Query(example="II:g.178497T>A", descr
@ app.get("/canno", summary='Variant described at the coding DNA level (cDNA)', response_model=list[TransvarAnnotation])
async def canno(variant_description: str = Query(example="SPAC3F10.09:c.5A>T", description='Variant described at the coding DNA level (cDNA)')) -> list[TransvarAnnotation]:
try:
db = get_anno_db('data/pombe_genome.gtf.transvardb', 'data/pombe_genome.fa')
return parse_transvar_string(get_transvar_str_annotation('canno', variant_description))
except Exception as e:
raise HTTPException(400, str(e))
@@ -394,6 +396,7 @@ async def canno(variant_description: str = Query(example="SPAC3F10.09:c.5A>T", d
@ app.get("/panno", summary='Variant described at the protein level', response_model=list[TransvarAnnotation])
async def panno(variant_description: str = Query(example="SPBC1198.04c:p.N3A", description='Variant described at the protein level')) -> list[TransvarAnnotation]:
try:
db = get_anno_db('data/pombe_genome.gtf.transvardb', 'data/pombe_genome.fa')
return parse_transvar_string(get_transvar_str_annotation('panno', variant_description))
except Exception as e:
raise HTTPException(400, str(e))
@@ -413,7 +416,7 @@ async def allele_transvar_coordinates(systematic_id: str = Query(example="SPBC35
with open('data/genome.pickle', 'rb') as ins:
genome = pickle.load(ins)

db = get_anno_db()
db = get_anno_db('data/pombe_genome.gtf.transvardb', 'data/pombe_genome.fa')

out_list = list()
for allele_part, rule_applied in zip(check_allele_resp.allele_parts.split('|'), check_allele_resp.rules_applied.split('|')):
@@ -440,7 +443,7 @@ async def protein_modification_coordinates(systematic_id: str = Query(example="S
with open('data/genome.pickle', 'rb') as ins:
genome = pickle.load(ins)

db = get_anno_db()
db = get_anno_db('data/pombe_genome.gtf.transvardb', 'data/pombe_genome.fa')

out_list = list()
for sequence_position_i in sequence_position.split(','):
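For reference, a minimal sketch of the pattern these endpoints now follow, assuming the same imports as api.py: get_anno_db is called with explicit paths to the transvar database and the genome FASTA (the PomBase defaults shown in the diff). The SGD paths below are an assumption inferred from the new data/sgd entries in .gitignore, not something this diff confirms.

# PomBase annotation database, as used by the endpoints above.
db = get_anno_db('data/pombe_genome.gtf.transvardb', 'data/pombe_genome.fa')

# Hypothetical SGD counterpart (paths are guesses based on .gitignore):
# sgd_db = get_anno_db('data/sgd/features.gtf.transvardb', 'data/sgd/genome_sequence.fsa')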
10 changes: 6 additions & 4 deletions build_alignment_dict.py → build_alignment_dict_from_genome.py
@@ -6,8 +6,7 @@
The dictionary structure, where keys are the systematic_id of genes:
"SPAC23E2.02": [{
"new_revision": "new_revision",
"old_revision": "old_revision",
"revision": "revision",
"new_coord": "join(446770..449241,449295..450530)",
"old_coord": "join(446491..446513,446679..449241,449295..450530)",
"new_alignment": "--------------------------------------MNTSENDP ... GYNGTRY*",
@@ -48,6 +47,9 @@ class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionH
parser.add_argument('--genome', default='data/genome.pickle')
parser.add_argument('--coords', default='data/only_modified_coordinates.tsv')
parser.add_argument('--output', default='data/coordinate_changes_dict.json')
parser.add_argument('--old_genomes', default='data/old_genome_versions/*/*.contig')
parser.add_argument('--genome_sequence_changes', default='data/genome_sequence_changes.tsv')

args = parser.parse_args()


@@ -109,12 +111,12 @@ def choose_old_genome(previous_coordinate, latest_genome_seq, old_genomes_dict,


# Load info about changes in genome sequence
genome_seq_changes = pandas.read_csv('data/genome_sequence_changes.tsv', sep='\t', na_filter=False, dtype=str)
genome_seq_changes = pandas.read_csv(args.genome_sequence_changes, sep='\t', na_filter=False, dtype=str)
# We skip current versions
genome_seq_changes = genome_seq_changes[genome_seq_changes['chromosome'].duplicated()].copy()

print('\033[0;32mreading old genomes...\033[0m')
old_genomes_dict = read_old_genomes(glob.glob('data/old_genome_versions/*/*.contig'), 'embl')
old_genomes_dict = read_old_genomes(glob.glob(args.old_genomes), 'embl')
print('\033[0;32mold genomes read\033[0m')

with open(args.genome, 'rb') as ins:
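A small standalone sketch of consuming the dictionary this script writes (keys as documented in the docstring above; the path is the script's --output default):

import json

with open('data/coordinate_changes_dict.json') as ins:
    coordinate_changes_dict = json.load(ins)

for systematic_id, revisions in coordinate_changes_dict.items():
    for change in revisions:
        # Each entry carries 'revision', 'new_coord', 'old_coord',
        # 'new_alignment' and 'old_alignment', as described in the docstring.
        print(systematic_id, change['revision'], change['old_coord'], '->', change['new_coord'])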
80 changes: 80 additions & 0 deletions build_alignment_dict_from_peptides.py
@@ -0,0 +1,80 @@
"""
Build a dictionary of alignments based on current and all previous peptide coordinates.
The input is the all_previous_seqs.tsv file generated in https://github.com/pombase/all_previous_sgd_peptide_sequences
The dictionary structure, where keys are the systematic_id of genes:
"SPAC23E2.02": [{
"new_alignment": "--------------------------------------MNTSENDP ... GYNGTRY*",
"old_alignment": "MPLGRSSWICCAKYFVNTKSRFNEILPPRFTLIVSFYSMNTSENDP ... SGYNGTRY*"
}],
In the alignment gaps have the maximum penalty, to avoid scattered matches.
-----CAV
MAACATAV
And not
---C--AV
MAACATAV
The reason for this is that the alignment is based on changing / removing introns or changing the start or end coordinate of the CDS, so you want maximal identity with a minimum number of gaps.
"""

import argparse
from Bio import pairwise2, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import json


def main(previous_seqs_file, current_seqs_file, output_file):
old_seq_dict = dict()
# This one is a tsv file
with open(previous_seqs_file) as ins:
for line in map(str.rstrip, ins):
ls = line.split('\t')
gene_id = ls[0]
if gene_id in old_seq_dict:
old_seq_dict[ls[0]].append((Seq(ls[1]), ls[2]))
else:
old_seq_dict[ls[0]] = [(Seq(ls[1]), ls[2])]

changes_dict = dict()
record: SeqRecord
for record in SeqIO.parse(current_seqs_file, 'fasta'):

if record.id not in old_seq_dict:
continue

changes_dict[record.id] = list()
for old_seq, revision in old_seq_dict[record.id]:

alignments = pairwise2.align.globalms(record.seq, old_seq, match=1, mismatch=-2, open=-2, extend=0, penalize_end_gaps=False)
if len(alignments) == 0:
print('> No alignment found for {}, skipping'.format(record.id))
continue
changes_dict[record.id].append({
'revision': revision,
'new_alignment': alignments[0].seqA,
'old_alignment': alignments[0].seqB
})

with open(output_file, 'w') as out:
json.dump(changes_dict, out, indent=4)


if __name__ == '__main__':
class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
pass

parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
parser.add_argument('--previous_seqs', default='data/sgd/all_previous_seqs.tsv')
parser.add_argument('--current_seqs', default='data/sgd/current_protein_seqs.fasta')
parser.add_argument('--output', default='data/sgd/coordinate_changes_dict.json')
args = parser.parse_args()

main(args.previous_seqs, args.current_seqs, args.output)
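A minimal usage sketch of the same Biopython call this script relies on (the two peptide sequences are invented stand-ins, not real SGD data):

from Bio import pairwise2

new_seq = 'MNTSENDPAKLLS'   # invented current peptide
old_seq = 'MNTSENDPLLS'     # invented previous peptide

# Same scoring as above: match +1, mismatch -2, gap open -2, gap extend 0,
# end gaps not penalized.
alignments = pairwise2.align.globalms(new_seq, old_seq, match=1, mismatch=-2, open=-2, extend=0, penalize_end_gaps=False)
if alignments:
    print(alignments[0].seqA)
    print(alignments[0].seqB)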
1 change: 1 addition & 0 deletions common_autofix_functions.py
@@ -37,6 +37,7 @@ def apply_old_coords_fix(row, coordinate_changes_dict, target_column):

# These are histone proteins that typically did not count the methionine
histones = ['SPBC1105.11c', 'SPBC1105.12', 'SPAC1834.03c', 'SPAC1834.04', 'SPAC19G12.06c', 'SPBC8D2.03c', 'SPBC8D2.04', 'SPCC622.08c', 'SPCC622.09', 'SPBC11B10.10c', 'SPBC1105.17']
histones += ['YBL002W', 'YBL003C', 'YBR009C', 'YBR010W', 'YDR224C', 'YDR225W', 'YKL049C', 'YNL030W', 'YNL031C', 'YOL012C']


def apply_histone_fix(row, genome, target_column):
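The body of apply_histone_fix is not shown in this hunk; a hedged sketch of the idea the comment describes (shifting residue numbers because the initiator methionine was not counted) could look like the following, where the helper name and the +1 direction are assumptions, not something this diff confirms:

import re

def shift_uncounted_methionine(target):
    # Assumption: legacy histone allele positions skip the initiator Met,
    # so each residue number is shifted up by one to match current coordinates.
    return re.sub(r'\d+', lambda m: str(int(m.group()) + 1), target)

# e.g. shift_uncounted_methionine('K9R') -> 'K10R' for a gene in the histones list above.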

0 comments on commit f3571cb
