protein modif docs improved

pombase · Sep 29, 2023 · ebc50d7 · ebc50d7
1 parent c96f85a
commit ebc50d7
Show file tree

Hide file tree

Showing 7 changed files with 78 additions and 16 deletions.
diff --git a/allele_qc.py b/allele_qc.py
@@ -71,9 +71,9 @@ class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionH
         pass
 
     parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
-    parser.add_argument('--genome', default='data/genome.pickle', help='genome dictionary built from contig files.')
-    parser.add_argument('--alleles', default='data/alleles.tsv')
-    parser.add_argument('--output', default='results/allele_results.tsv')
+    parser.add_argument('--genome', default='data/genome.pickle', help='input: genome dictionary built from contig files.')
+    parser.add_argument('--alleles', default='data/alleles.tsv', help='input allele dataset')
+    parser.add_argument('--output', default='results/allele_results.tsv', help='output file, also creates two extra files with the extension _errors.tsv and _errors_summarised.tsv')
     args = parser.parse_args()
 
     with open(args.genome, 'rb') as ins:

diff --git a/poetry.lock b/poetry.lock
diff --git a/protein_modification_auto_fix.py b/protein_modification_auto_fix.py
@@ -1,3 +1,24 @@
+"""
+Tries to apply a series of fixes to the protein modification data (see functions apply_*_fix), and outputs the results.
+
+Inputs:
+    - results/protein_modification_results_errors_aggregated.tsv: the aggregated errors found in the analysis (created by protein_modification_qc.py)
+    - data/genome.pickle: the genome data from PomBase (see load_genome.py)
+    - data/coordinate_changes_dict.json: the coordinate changes dictionary from PomBase (see build_alignment_dict_from_genome.py)
+    - results/protein_modification_results_errors.tsv: the errors found in the analysis (created by protein_modification_qc.py)
+
+Outputs:
+    - results/protein_modification_auto_fix.tsv: the data with the fixes applied
+    - results/protein_modification_cannot_fix_sequence_errors.tsv: the entries with sequence errors that could not be fixed
+    - results/protein_modification_cannot_fix_other_errors.tsv: the syntax errors that could not be fixed (pattern errors)
+    - results/protein_modification_auto_fix_info.tsv: contains all possible fixes (does not prioritise which one to pick). This is not committed or used
+      anywhere, but it can be useful to track unexpected outcomes.
+
+The extra columns generated in the results file are described in the readme.
+
+For now it works for PomBase data with the default paths, but it can be easily adapted to other data sources.
+"""
+
 import json
 import pickle
 import pandas

diff --git a/protein_modification_qc.py b/protein_modification_qc.py
@@ -1,3 +1,23 @@
+"""
+Runs the analysis pipeline for protein modifications, identifying sequence or syntax errors and aggregating them.
+
+Inputs:
+    - data/pombase-chado.modifications: the protein modification data from PomBase
+    - data/genome.pickle: the genome data from PomBase (see load_genome.py)
+    - data/allowed_mod_dict.json: the allowed modifications for each modification type
+
+Outputs:
+    - results/protein_modification_results.tsv: the results of the analysis
+    - results/protein_modification_results_errors.tsv: the errors found in the analysis (a subset of the previous file)
+    - results/protein_modification_results_errors_aggregated.tsv: the aggregated errors found in the analysis (used in protein_modification_auto_fix.py)
+
+The extra columns of the output are:
+    - sequence_error: the residues that are incorrect, separated by "|"
+    - change_sequence_position_to: the corrected value that sequence_position should be changed to (only provided for syntax errors)
+
+For now it works for PomBase data with the default paths, but it can be easily adapted to other data sources.
+"""
+
 import pandas
 from models import SyntaxRule
 from grammar import check_sequence_single_pos, aa
@@ -9,6 +29,17 @@
 
 
 def check_func(row, genome, allowed_mod_dict):
+    """
+    Checks whether the sequence position is correct and returns two values describing any errors found.
+    The first one is the error type, and can be:
+        - '': no error
+        - 'systematic_id not in genome': the systematic_id of the gene is not present in the genome
+        - 'pattern_error': the sequence position does not match the pattern
+        - 'not_protein_gene': the gene is not a protein coding gene
+        - 'residue_not_allowed': the residue is not allowed for this modification
+
+    The second one is normally empty, but if the sequence_position has syntax errors, it contains the corrected sequence_position
+    """
 
     # Handle multiple transcripts, we pick the first (.1) by default
     try:

diff --git a/protein_modification_transvar.py b/protein_modification_transvar.py
@@ -1,3 +1,8 @@
+"""
+Uses transvar to represent the modification positions in standard genomic coordinates.
+
+"""
+
 import pandas
 import pickle
 import argparse
@@ -87,10 +92,10 @@ class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionH
         pass
 
     parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
-    parser.add_argument('--genome', default='data/genome.pickle', help='genome dictionary built from contig files.')
-    parser.add_argument('--protein_modification_results', default='results/protein_modification_results.tsv')
-    parser.add_argument('--exclude_transcripts', default='data/frame_shifted_transcripts.tsv')
-    parser.add_argument('--output', default='results/protein_modification_results_transvar.tsv')
+    parser.add_argument('--genome', default='data/genome.pickle', help='genome dictionary built from contig files (see load_genome.py).')
+    parser.add_argument('--protein_modification_results', default='results/protein_modification_results.tsv', help='output of protein_modification_qc.py')
+    parser.add_argument('--exclude_transcripts', default='data/frame_shifted_transcripts.tsv', help='transcripts to exclude from transvar because they are known to be problematic')
+    parser.add_argument('--output', default='results/protein_modification_results_transvar.tsv', help='output file')
 
     args = parser.parse_args()
     main(args.genome, args.protein_modification_results, args.exclude_transcripts, args.output)

diff --git a/readme.md b/readme.md
@@ -17,9 +17,10 @@ poetry install
 # Activate python environment
 poetry shell
 
-# Install transvar in the project directory and set up necessary env variables
-bash set_up_transvar.sh
+# Set up the necessary transvar variables (you must have installed transvar, see next section)
 . transvar_env_vars.sh
+bash set_up_transvar.sh
+
 
 # Run this script (See the comments in the subscripts)
 bash run_analysis.sh

diff --git a/run_analysis.sh b/run_analysis.sh
@@ -1,9 +1,13 @@
 set -e
 bash get_data.sh
 python build_alignment_dict_from_genome.py
+
+# Check and fix protein modification
 python protein_modification_qc.py
 python protein_modification_auto_fix.py
 python protein_modification_transvar.py
+
+# Check and fix allele descriptions, types and names
 python allele_qc.py
 python allele_auto_fix.py
 python allele_transvar.py