Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor parsing of precursor data from spectrum files #162

Merged
merged 4 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 18 additions & 55 deletions ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@
from multiprocessing import cpu_count
from typing import Dict, Optional

import numpy as np
import psm_utils.io
from mokapot.dataset import LinearPsmDataset
from psm_utils import PSMList

from ms2rescore import exceptions
from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
from ms2rescore.parse_spectra import get_missing_values
from ms2rescore.parse_spectra import add_precursor_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
Expand Down Expand Up @@ -62,20 +61,28 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
)

# Add missing precursor info from spectrum file if needed
psm_list = _fill_missing_precursor_info(psm_list, config)
available_ms_data = add_precursor_values(
psm_list, config["spectrum_path"], config["spectrum_id_pattern"]
)

# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
# TODO: Handle this somewhere else, more generally?
if fgen_name == "maxquant" and not (psm_list["source"] == "msms").all():
logger.warning(
"MaxQuant feature generator requires PSMs from a MaxQuant msms.txt file. Skipping "
"this feature generator."
)
continue
# Compile configuration
conf = config.copy()
conf.update(fgen_config)
fgen = FEATURE_GENERATORS[fgen_name](**conf)

# Check if required MS data is available
missing_ms_data = fgen.required_ms_data - available_ms_data
if missing_ms_data:
logger.warning(
f"Skipping feature generator {fgen_name} because required MS data is missing: "
f"{missing_ms_data}. Ensure that the required MS data is present in the input "
"files or disable the feature generator."
)
continue

# Add features
fgen.add_features(psm_list)
logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}")
feature_names[fgen_name] = set(fgen.feature_names)
Expand All @@ -102,6 +109,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
# Write feature names to file
_write_feature_names(feature_names, output_file_root)

# Rename PSMs to USIs if requested
if config["rename_to_usi"]:
logging.debug(f"Creating USIs for {len(psm_list)} PSMs")
psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list]
Expand Down Expand Up @@ -173,51 +181,6 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
logger.exception(e)


def _fill_missing_precursor_info(psm_list: PSMList, config: Dict) -> PSMList:
"""Fill missing precursor info from spectrum file if needed."""
# Check if required
# TODO: avoid hard coding feature generators in some way
rt_required = ("deeplc" in config["feature_generators"]) and any(
v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"]
)
im_required = (
"ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"]
) and any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"])
logger.debug(f"RT required: {rt_required}, IM required: {im_required}")

# Add missing values
if rt_required or im_required:
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)

# Check if values are now present
for value_name, required in [("retention_time", rt_required), ("ion_mobility", im_required)]:
if required and (
0.0 in psm_list[value_name]
or None in psm_list[value_name]
or np.isnan(psm_list[value_name]).any()
):
if all(v is None or v == 0.0 or np.isnan(v) for v in psm_list[value_name]):
raise exceptions.MissingValuesError(
f"Could not find any '{value_name}' values in PSM or spectrum files. Disable "
f"feature generators that require '{value_name}' or ensure that the values are "
"present in the input files."
)
else:
missing_value_psms = psm_list[
[v is None or np.isnan(v) for v in psm_list[value_name]]
]
logger.warning(
f"Found {len(missing_value_psms)} PSMs with missing '{value_name}' values. "
"These PSMs will be removed."
)
psm_list = psm_list[
[v is not None and not np.isnan(v) for v in psm_list[value_name]]
]

return psm_list


def _filter_by_rank(psm_list: PSMList, max_rank: int, lower_score_better: bool) -> PSMList:
"""Filter PSMs by rank."""
psm_list.set_ranks(lower_score_better=lower_score_better)
Expand Down
6 changes: 6 additions & 0 deletions ms2rescore/feature_generators/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
from abc import ABC, abstractmethod
from typing import Set

from psm_utils import PSMList

from ms2rescore.parse_spectra import MSDataType


class FeatureGeneratorBase(ABC):
"""Base class from which all feature generators must inherit."""

# List of required MS data types for feature generation
required_ms_data: Set[MSDataType] = set()

def __init__(self, *args, **kwargs) -> None:
super().__init__()

Expand Down
11 changes: 8 additions & 3 deletions ms2rescore/feature_generators/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from psm_utils import PSMList

from ms2rescore.feature_generators.base import FeatureGeneratorBase
from ms2rescore.parse_spectra import MSDataType

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
logger = logging.getLogger(__name__)
Expand All @@ -35,6 +36,8 @@
class DeepLCFeatureGenerator(FeatureGeneratorBase):
"""DeepLC retention time-based feature generator."""

required_ms_data = {MSDataType.retention_time}

def __init__(
self,
*args,
Expand Down Expand Up @@ -138,9 +141,11 @@ def add_features(self, psm_list: PSMList) -> None:
)

# Disable wild logging to stdout by Tensorflow, unless in debug mode
with contextlib.redirect_stdout(
open(os.devnull, "w")
) if not self._verbose else contextlib.nullcontext():
with (
contextlib.redirect_stdout(open(os.devnull, "w"))
if not self._verbose
else contextlib.nullcontext()
):
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

Expand Down
3 changes: 3 additions & 0 deletions ms2rescore/feature_generators/im2deep.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from psm_utils import PSMList

from ms2rescore.feature_generators.base import FeatureGeneratorBase
from ms2rescore.parse_spectra import MSDataType

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
logger = logging.getLogger(__name__)
Expand All @@ -30,6 +31,8 @@
class IM2DeepFeatureGenerator(FeatureGeneratorBase):
"""IM2Deep collision cross section feature generator."""

required_ms_data = {MSDataType.ion_mobility}

def __init__(
self,
*args,
Expand Down
3 changes: 3 additions & 0 deletions ms2rescore/feature_generators/ionmob.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from psm_utils import Peptidoform, PSMList

from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException
from ms2rescore.parse_spectra import MSDataType

try:
from ionmob import __file__ as ionmob_file
Expand Down Expand Up @@ -55,6 +56,8 @@
class IonMobFeatureGenerator(FeatureGeneratorBase):
"""Ionmob collisional cross section (CCS)-based feature generator."""

required_ms_data = {MSDataType.ion_mobility}

def __init__(
self,
*args,
Expand Down
37 changes: 26 additions & 11 deletions ms2rescore/feature_generators/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@
class MaxQuantFeatureGenerator(FeatureGeneratorBase):
"""Generate MaxQuant-derived features."""

available_features = [
"mean_error_top7",
"sq_mean_error_top7",
"stdev_error_top7",
"ln_explained_ion_current",
"ln_nterm_ion_current_ratio",
"ln_cterm_ion_current_ratio",
"ln_ms2_ion_current",
]

def __init__(self, *args, **kwargs) -> None:
"""
Generate MaxQuant-derived features.
Expand All @@ -39,30 +49,30 @@ def __init__(self, *args, **kwargs) -> None:

"""
super().__init__(*args, **kwargs)
self._feature_names = self.available_features[:] # Copy list
RalfG marked this conversation as resolved.
Show resolved Hide resolved

@property
def feature_names(self) -> List[str]:
return [
"mean_error_top7",
"sq_mean_error_top7",
"stdev_error_top7",
"ln_explained_ion_current",
"ln_nterm_ion_current_ratio",
"ln_cterm_ion_current_ratio",
"ln_ms2_ion_current",
]
return self._feature_names

def add_features(self, psm_list: PSMList):
"""
Add MS²PIP-derived features to PSMs.
Add MaxQuant-derived features to PSMs.

Parameters
----------
psm_list
PSMs to add features to.

"""
logger.info("Adding MaxQuant-derived features to PSMs.")
# Check if all PSMs are from MaxQuant
if not self._all_psms_from_maxquant(psm_list):
self._feature_names = [] # Set feature names to empty list to indicate none added
logger.warning("Not all PSMs are from MaxQuant. Skipping MaxQuant feature generation.")
return
else:
self._feature_names = self.available_features # Reset feature names
logger.info("Adding MaxQuant-derived features to PSMs.")

# Infer mass deviations column name
for column_name in [
Expand Down Expand Up @@ -90,6 +100,11 @@ def add_features(self, psm_list: PSMList):
for psm in psm_list:
psm["rescoring_features"].update(self._compute_features(psm["metadata"]))

@staticmethod
def _all_psms_from_maxquant(psm_list):
"""Check if the PSMs are from MaxQuant."""
return (psm_list["source"] == "msms").all()

def _compute_features(self, psm_metadata):
"""Compute features from derived from intensities and mass errors."""
features = {}
Expand Down
3 changes: 3 additions & 0 deletions ms2rescore/feature_generators/ms2pip.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from rich.progress import track

from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException
from ms2rescore.parse_spectra import MSDataType
from ms2rescore.utils import infer_spectrum_path

logger = logging.getLogger(__name__)
Expand All @@ -46,6 +47,8 @@
class MS2PIPFeatureGenerator(FeatureGeneratorBase):
"""Generate MS²PIP-based features."""

required_ms_data = {MSDataType.ms2_spectra}

def __init__(
self,
*args,
Expand Down
Loading
Loading