Skip to content

Commit

Permalink
Fixed xml openms reading to properly assign posterior_error_prob, per…
Browse files Browse the repository at this point in the history
…colator_score, and q_value so that PSMs are scored correctly. Added an error that is raised if no decoy proteins are found for the input decoy symbol. Added an error that is raised if the input custom score (or other selected score) is missing from the input data for any PSM, so that proteins are only scored when every PSM has a valid score
  • Loading branch information
thinkle12 committed Dec 23, 2024
1 parent 8b47da4 commit 5d4e57c
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 5 deletions.
8 changes: 8 additions & 0 deletions pyproteininference/datastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ class DataStore(object):
"q-value": "qvalue",
"posterior_error_prob": "pepvalue",
"posterior_error_probability": "pepvalue",
"MS:1001493": "pepvalue", # Added to make sure custom input for pep/qval accession gets mapped to pep/qval
"MS:1001491": "qvalue",
}

CUSTOM_SCORE_KEY = "custom_score"
Expand Down Expand Up @@ -1179,6 +1181,12 @@ def _check_target_decoy_split(self):
decoys = [
x for x in self.digest.protein_to_peptide_dictionary.keys() if self.parameter_file_object.decoy_symbol in x
]
if len(decoys) == 0:
raise ValueError(
"No decoy proteins found in digest file with decoy symbol: {}. Please double check your decoy symbol and make sure decoy proteins are present in your input file(s).".format(
self.parameter_file_object.decoy_symbol
)
)
ratio = float(len(targets)) / float(len(decoys))
logger.info("Number of Target Proteins in Digest: {}".format(len(targets)))
logger.info("Number of Decoy Proteins in Digest: {}".format(len(decoys)))
Expand Down
1 change: 1 addition & 0 deletions pyproteininference/physical.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ def assign_main_score(self, score):
if score not in self.SCORE_ATTRIBUTE_NAMES:
raise ValueError("Scores must either be one of: '{}'".format(", ".join(self.SCORE_ATTRIBUTE_NAMES)))
else:
score_attribute = getattr(self, score)
self.main_score = getattr(self, score)


Expand Down
4 changes: 1 addition & 3 deletions pyproteininference/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,7 @@ def _as_list(x: Union[str, List[str]]) -> List[str]:
else (
_as_list(self.decoy_files)
if self.decoy_files
else _as_list(self.combined_files)
if self.combined_files
else list()
else _as_list(self.combined_files) if self.combined_files else list()
)
)
extensions = set([os.path.splitext(x)[1].lower() for x in input_files])
Expand Down
4 changes: 2 additions & 2 deletions pyproteininference/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1111,13 +1111,13 @@ def __init__(
if self.scoring_variable != self.Q_VALUE and self.scoring_variable != self.POSTERIOR_ERROR_PROB:
self.load_custom_score = True
logger.info(
"Pulling custom column based on parameter file input for score, Column: {}".format(
"Pulling custom column based on parameter file input for score, Attribute: {}".format(
self.scoring_variable
)
)
else:
logger.info(
"Pulling no custom columns based on parameter file input for score, using standard Column: {}".format(
"Pulling no custom columns based on parameter file input for score, using standard Attribute: {}".format(
self.scoring_variable
)
)
Expand Down
17 changes: 17 additions & 0 deletions pyproteininference/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ def score_psms(self, score_method="multiplicative_log"):
>>> score.score_psms(score_method="best_peptide_per_protein")
"""

self._validate_scoring_input()

if score_method not in self.SCORE_METHODS:
raise ValueError(
"score method '{}' is not a proper method. Score method must be one of the following: '{}'".format(
Expand Down Expand Up @@ -472,3 +474,18 @@ def additive(self):
self.data.protein_score = self.ADDITIVE
self.data.short_protein_score = self.SHORT_ADDITIVE
self.data.scored_proteins = all_scores

def _validate_scoring_input(self):
    """
    Check that every PSM has a main score assigned before scoring proceeds.

    Walks the PSMs returned by ``self.data.get_psm_data()`` and stops at the first
    one whose ``main_score`` is None. An empty PSM collection passes validation.

    Returns:
        None: On success an info message naming the configured PSM score is logged.

    Raises:
        ValueError: If at least one PSM has a ``main_score`` of None.
    """
    # Guard clause: any() short-circuits on the first PSM that lacks a score.
    has_unscored_psm = any(psm.main_score is None for psm in self.data.get_psm_data())
    if has_unscored_psm:
        failure_template = "PSM scores not validated. Score: {} not read from file correctly for all PSMs"
        raise ValueError(failure_template.format(self.data.parameter_file_object.psm_score))

    success_template = "PSM scores validated. Score: {} read from file correctly for all PSMs"
    logger.info(success_template.format(self.data.parameter_file_object.psm_score))

0 comments on commit 5d4e57c

Please sign in to comment.