Skip to content

Commit

Permalink
protein modif docs improved
Browse files Browse the repository at this point in the history
  • Loading branch information
manulera committed Sep 29, 2023
1 parent c96f85a commit ebc50d7
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 16 deletions.
6 changes: 3 additions & 3 deletions allele_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionH
pass

parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
parser.add_argument('--genome', default='data/genome.pickle', help='genome dictionary built from contig files.')
parser.add_argument('--alleles', default='data/alleles.tsv')
parser.add_argument('--output', default='results/allele_results.tsv')
parser.add_argument('--genome', default='data/genome.pickle', help='input: genome dictionary built from contig files.')
parser.add_argument('--alleles', default='data/alleles.tsv', help='input allele dataset')
parser.add_argument('--output', default='results/allele_results.tsv', help='output file, also creates two extra files with the extension _errors.tsv and _errors_summarised.tsv')
args = parser.parse_args()

with open(args.genome, 'rb') as ins:
Expand Down
14 changes: 7 additions & 7 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions protein_modification_auto_fix.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,24 @@
"""
Tries to apply a series of fixes to the protein modification data (see functions apply_*_fix), and outputs the results.
Inputs:
- results/protein_modification_results_errors_aggregated.tsv: the aggregated errors found in the analysis (created by protein_modification_qc.py)
- data/genome.pickle: the genome data from PomBase (see load_genome.py)
- data/coordinate_changes_dict.json: the coordinate changes dictionary from PomBase (see build_alignment_dict_from_genome.py)
- results/protein_modification_results_errors.tsv: the errors found in the analysis (created by protein_modification_qc.py)
Outputs:
- results/protein_modification_auto_fix.tsv: the data with the fixes applied
- results/protein_modification_cannot_fix_sequence_errors.tsv: the sequence errors that could not be fixed (sequence errors)
- results/protein_modification_cannot_fix_other_errors.tsv: the syntax errors that could not be fixed (pattern errors)
- results/protein_modification_auto_fix_info.tsv: contains all possible fixes (does not prioritise which one to pick). This is not committed or used
anywhere, but it can be useful to track unexpected outcomes.
The extra columns generated in the results file are described in the readme.
For now it works for PomBase data with the default paths, but it can be easily adapted to other data sources.
"""

import json
import pickle
import pandas
Expand Down
31 changes: 31 additions & 0 deletions protein_modification_qc.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
"""
Runs the analysis pipeline for protein modifications, identifying sequence or syntax errors and aggregating them.
Inputs:
- data/pombase-chado.modifications: the protein modification data from PomBase
- data/genome.pickle: the genome data from PomBase (see load_genome.py)
- data/allowed_mod_dict.json: the allowed modifications for each modification type
Outputs:
- results/protein_modification_results.tsv: the results of the analysis
- results/protein_modification_results_errors.tsv: the errors found in the analysis (a subset of the previous file)
- results/protein_modification_results_errors_aggregated.tsv: the aggregated errors found in the analysis (used in protein_modification_auto_fix.py)
The extra columns of the output are:
- sequence_error: the residues that are incorrect, separated by "|"
- change_sequence_position_to: the sequence position that the error should be changed to (only fixes syntax errors)
For now it works for PomBase data with the default paths, but it can be easily adapted to other data sources.
"""

import pandas
from models import SyntaxRule
from grammar import check_sequence_single_pos, aa
Expand All @@ -9,6 +29,17 @@


def check_func(row, genome, allowed_mod_dict):
"""
Checks whether the sequence position is correct and returns two values describing any errors found.
The first one can be:
- '': no error
- 'systematic_id not in genome': the systematic_id of the gene is not present in the genome
- 'pattern_error': the sequence position does not match the pattern
- 'not_protein_gene': the gene is not a protein coding gene
- 'residue_not_allowed': the residue is not allowed for this modification
The second one is normally empty, but if the sequence_position has syntax errors, it contains the corrected sequence_position
"""

# Handle multiple transcripts, we pick the first (.1) by default
try:
Expand Down
13 changes: 9 additions & 4 deletions protein_modification_transvar.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
Uses transvar to represent the modification positions in standard genomic coordinates.
"""

import pandas
import pickle
import argparse
Expand Down Expand Up @@ -87,10 +92,10 @@ class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionH
pass

parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
parser.add_argument('--genome', default='data/genome.pickle', help='genome dictionary built from contig files.')
parser.add_argument('--protein_modification_results', default='results/protein_modification_results.tsv')
parser.add_argument('--exclude_transcripts', default='data/frame_shifted_transcripts.tsv')
parser.add_argument('--output', default='results/protein_modification_results_transvar.tsv')
parser.add_argument('--genome', default='data/genome.pickle', help='genome dictionary built from contig files (see load_genome.py).')
parser.add_argument('--protein_modification_results', default='results/protein_modification_results.tsv', help='output of protein_modification_qc.py')
parser.add_argument('--exclude_transcripts', default='data/frame_shifted_transcripts.tsv', help='transcripts to exclude from transvar because they are known to be problematic')
parser.add_argument('--output', default='results/protein_modification_results_transvar.tsv', help='output file')

args = parser.parse_args()
main(args.genome, args.protein_modification_results, args.exclude_transcripts, args.output)
Expand Down
5 changes: 3 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ poetry install
# Activate python environment
poetry shell

# Install transvar in the project directory and set up necessary env variables
bash set_up_transvar.sh
# Set up the necessary transvar variables (you must have installed transvar, see next section)
. transvar_env_vars.sh
bash set_up_transvar.sh


# Run this script (See the comments in the subscripts)
bash run_analysis.sh
Expand Down
4 changes: 4 additions & 0 deletions run_analysis.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
set -e
bash get_data.sh
python build_alignment_dict_from_genome.py

# Check and fix protein modification
python protein_modification_qc.py
python protein_modification_auto_fix.py
python protein_modification_transvar.py

# Check and fix allele descriptions, types and names
python allele_qc.py
python allele_auto_fix.py
python allele_transvar.py

0 comments on commit ebc50d7

Please sign in to comment.