From 5d4e57c28773f85abcbdabec98a6da927f2020ae Mon Sep 17 00:00:00 2001 From: Trent Hinkle Date: Mon, 23 Dec 2024 13:50:15 -0800 Subject: [PATCH] Fixed OpenMS XML reading so that the posterior_error_prob, percolator_score, and q_value scores are assigned properly and scored correctly. Added an error when no decoys are found based on the input decoy symbol. Added an error when the input custom score or other score does not exist in the input data for all PSMs, so that proteins are scored properly --- pyproteininference/datastore.py | 8 ++++++++ pyproteininference/physical.py | 1 + pyproteininference/pipeline.py | 4 +--- pyproteininference/reader.py | 4 ++-- pyproteininference/scoring.py | 17 +++++++++++++++++ 5 files changed, 29 insertions(+), 5 deletions(-) diff --git a/pyproteininference/datastore.py b/pyproteininference/datastore.py index 207fe80..eebc8f7 100644 --- a/pyproteininference/datastore.py +++ b/pyproteininference/datastore.py @@ -62,6 +62,8 @@ class DataStore(object): "q-value": "qvalue", "posterior_error_prob": "pepvalue", "posterior_error_probability": "pepvalue", + "MS:1001493": "pepvalue", # Added to make sure custom input for pep/qval accession gets mapped to pep/qval + "MS:1001491": "qvalue", } CUSTOM_SCORE_KEY = "custom_score" @@ -1179,6 +1181,12 @@ def _check_target_decoy_split(self): decoys = [ x for x in self.digest.protein_to_peptide_dictionary.keys() if self.parameter_file_object.decoy_symbol in x ] + if len(decoys) == 0: + raise ValueError( + "No decoy proteins found in digest file with decoy symbol: {}. 
Please double check your decoy symbol and make sure decoy proteins are present in your input file(s).".format( + self.parameter_file_object.decoy_symbol + ) + ) ratio = float(len(targets)) / float(len(decoys)) logger.info("Number of Target Proteins in Digest: {}".format(len(targets))) logger.info("Number of Decoy Proteins in Digest: {}".format(len(decoys))) diff --git a/pyproteininference/physical.py b/pyproteininference/physical.py index eefa87e..90453e3 100644 --- a/pyproteininference/physical.py +++ b/pyproteininference/physical.py @@ -330,6 +330,7 @@ def assign_main_score(self, score): if score not in self.SCORE_ATTRIBUTE_NAMES: raise ValueError("Scores must either be one of: '{}'".format(", ".join(self.SCORE_ATTRIBUTE_NAMES))) else: + score_attribute = getattr(self, score) self.main_score = getattr(self, score) diff --git a/pyproteininference/pipeline.py b/pyproteininference/pipeline.py index 2cad67f..a397457 100644 --- a/pyproteininference/pipeline.py +++ b/pyproteininference/pipeline.py @@ -210,9 +210,7 @@ def _as_list(x: Union[str, List[str]]) -> List[str]: else ( _as_list(self.decoy_files) if self.decoy_files - else _as_list(self.combined_files) - if self.combined_files - else list() + else _as_list(self.combined_files) if self.combined_files else list() ) ) extensions = set([os.path.splitext(x)[1].lower() for x in input_files]) diff --git a/pyproteininference/reader.py b/pyproteininference/reader.py index 327a5c8..d4cffc6 100644 --- a/pyproteininference/reader.py +++ b/pyproteininference/reader.py @@ -1111,13 +1111,13 @@ def __init__( if self.scoring_variable != self.Q_VALUE and self.scoring_variable != self.POSTERIOR_ERROR_PROB: self.load_custom_score = True logger.info( - "Pulling custom column based on parameter file input for score, Column: {}".format( + "Pulling custom column based on parameter file input for score, Attribute: {}".format( self.scoring_variable ) ) else: logger.info( - "Pulling no custom columns based on parameter file input for 
score, using standard Column: {}".format( + "Pulling no custom columns based on parameter file input for score, using standard Attribute: {}".format( self.scoring_variable ) ) diff --git a/pyproteininference/scoring.py b/pyproteininference/scoring.py index e5e8176..e61af6d 100644 --- a/pyproteininference/scoring.py +++ b/pyproteininference/scoring.py @@ -120,6 +120,8 @@ def score_psms(self, score_method="multiplicative_log"): >>> score.score_psms(score_method="best_peptide_per_protein") """ + self._validate_scoring_input() + if score_method not in self.SCORE_METHODS: raise ValueError( "score method '{}' is not a proper method. Score method must be one of the following: '{}'".format( @@ -472,3 +474,18 @@ def additive(self): self.data.protein_score = self.ADDITIVE self.data.short_protein_score = self.SHORT_ADDITIVE self.data.scored_proteins = all_scores + + def _validate_scoring_input(self): + validated_psm_scores = all(x.main_score is not None for x in self.data.get_psm_data()) + if validated_psm_scores: + logger.info( + "PSM scores validated. Score: {} read from file correctly for all PSMs".format( + self.data.parameter_file_object.psm_score + ) + ) + else: + raise ValueError( + "PSM scores not validated. Score: {} not read from file correctly for all PSMs".format( + self.data.parameter_file_object.psm_score + ) + )