Skip to content

Commit

Permalink
Merge pull request #95 from compomics/CCS_feature_generator
Browse files Browse the repository at this point in the history
Add `ionmob` feature generator
  • Loading branch information
RalfG authored Sep 21, 2023
2 parents 502b31c + 96c708d commit 09f8970
Show file tree
Hide file tree
Showing 12 changed files with 494 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ steps.txt
old_files/
prepare_pin_files.py
*.jar
ms2rescore-v3.0.0.dev3.tar
*.tar

# Ruff
.ruff_cache/
Expand Down
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ repos:
- id: end-of-file-fixer
- id: trailing-whitespace

# - repo: https://github.com/pycqa/isort
# rev: 5.11.2
# hooks:
# - id: isort
# name: isort (python)

- repo: https://github.com/psf/black
rev: 22.10.0
hooks:
Expand Down
5 changes: 5 additions & 0 deletions docs/source/config_schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- **`ms2pip`**: Refer to *[#/definitions/ms2pip](#definitions/ms2pip)*.
- **`deeplc`**: Refer to *[#/definitions/deeplc](#definitions/deeplc)*.
- **`maxquant`**: Refer to *[#/definitions/maxquant](#definitions/maxquant)*.
- **`ionmob`**: Refer to *[#/definitions/ionmob](#definitions/ionmob)*.
- **`rescoring_engine`** *(object)*: Rescoring engine to use and its configuration. Leave empty to skip rescoring and write features to file. Default: `{"mokapot": {}}`.
- **`.*`**: Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
- **`percolator`**: Refer to *[#/definitions/percolator](#definitions/percolator)*.
Expand Down Expand Up @@ -68,6 +69,10 @@
- *integer*
- *number*
- <a id="definitions/maxquant"></a>**`maxquant`** *(object)*: MaxQuant feature generator configuration. Can contain additional properties. Refer to *[#/definitions/feature_generator](#definitions/feature_generator)*.
- <a id="definitions/ionmob"></a>**`ionmob`** *(object)*: Ion mobility feature generator configuration using Ionmob. Can contain additional properties. Refer to *[#/definitions/feature_generator](#definitions/feature_generator)*.
- **`ionmob_model`** *(string)*: Path to Ionmob model directory. Default: `"GRUPredictor"`.
- **`reference_dataset`** *(string)*: Path to Ionmob reference dataset file. Default: `"Meier_unimod.parquet"`.
- **`tokenizer`** *(string)*: Path to tokenizer json file. Default: `"tokenizer.json"`.
- <a id="definitions/mokapot"></a>**`mokapot`** *(object)*: Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
- **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`.
- **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`.
Expand Down
2 changes: 2 additions & 0 deletions ms2rescore/feature_generators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ms2rescore.feature_generators.basic import BasicFeatureGenerator
from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator
from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator
from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator

Expand All @@ -12,4 +13,5 @@
"ms2pip": MS2PIPFeatureGenerator,
"deeplc": DeepLCFeatureGenerator,
"maxquant": MaxQuantFeatureGenerator,
"ionmob": IonMobFeatureGenerator,
}
7 changes: 4 additions & 3 deletions ms2rescore/feature_generators/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,10 @@ def add_features(self, psm_list: PSMList) -> None:
# Get easy-access nested version of PSMList
psm_dict = psm_list.get_psm_dict()

# Run MS²PIP for each spectrum file
total_runs = len(psm_dict.values())
# Run DeepLC for each spectrum file
current_run = 1
total_runs = sum(len(runs) for runs in psm_dict.values())

for runs in psm_dict.values():
# Reset DeepLC predictor for each collection of runs
self.deeplc_predictor = None
Expand Down Expand Up @@ -216,7 +217,7 @@ def add_features(self, psm_list: PSMList) -> None:
psm["rescoring_features"].update(
peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
)
current_run += 1
current_run += 1

# TODO: Remove when DeepLC supports PSMList directly
@staticmethod
Expand Down
282 changes: 282 additions & 0 deletions ms2rescore/feature_generators/ionmob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
import contextlib
import logging
import os
from itertools import chain
from pathlib import Path
from typing import Dict, Optional

import pandas as pd
import tensorflow as tf
from psm_utils import Peptidoform, PSMList

from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException

try:
from ionmob import __file__ as ionmob_file
from ionmob.preprocess.data import to_tf_dataset_inference
from ionmob.utilities.chemistry import VARIANT_DICT, calculate_mz, reduced_mobility_to_ccs
from ionmob.utilities.tokenization import tokenizer_from_json
from ionmob.utilities.utility import get_ccs_shift
except ImportError:
IONMOB_INSTALLED = False
else:
IONMOB_INSTALLED = True

logger = logging.getLogger(__name__)

if IONMOB_INSTALLED:
IONMOB_DIR = Path(ionmob_file).parent
DEFAULT_MODELS_IONMOB = {
Path("pretrained_models/DeepTwoMerModel"),
Path("pretrained_models/GRUPredictor"),
Path("pretrained_models/SqrtModel"),
}
DEFAULT_MODELS_DICT = {
mod_path.stem: IONMOB_DIR / mod_path for mod_path in DEFAULT_MODELS_IONMOB
}
DEFAULT_TOKENIZER = IONMOB_DIR / "pretrained_models/tokenizers/tokenizer.json"
DEFAULT_REFERENCE_DATASET = IONMOB_DIR / "example_data/Tenzer_unimod.parquet"


class IonMobFeatureGenerator(FeatureGeneratorBase):
"""Ionmob collisional cross section (CCS)-based feature generator."""

def __init__(
self,
*args,
ionmob_model: str = "GRUPredictor",
reference_dataset: Optional[str] = None,
tokenizer: Optional[str] = None,
**kwargs,
) -> None:
"""
Ionmob collisional cross section (CCS)-based feature generator.
Parameters
----------
*args
Additional arguments passed to the base class.
ionmob_model
Path to a trained Ionmob model or one of the default models (``DeepTwoMerModel``,
``GRUPredictor``, or ``SqrtModel``). Default: ``GRUPredictor``.
reference_dataset
Path to a reference dataset for CCS shift calculation. Uses the default reference
dataset if not specified.
tokenizer
Path to a tokenizer or one of the default tokenizers. Uses the default tokenizer if
not specified.
**kwargs
Additional keyword arguments passed to the base class.
"""
super().__init__(*args, **kwargs)

# Check if Ionmob could be imported
if not IONMOB_INSTALLED:
raise ImportError(
"Ionmob not installed. Please install Ionmob to use this feature generator."
)

# Get model from file or one of the default models
if Path(ionmob_model).is_file():
self.ionmob_model = tf.keras.models.load_model(ionmob_model)
elif ionmob_model in DEFAULT_MODELS_DICT:
self.ionmob_model = tf.keras.models.load_model(
DEFAULT_MODELS_DICT[ionmob_model].as_posix()
)
else:
raise IonmobException(
f"Invalid Ionmob model: {ionmob_model}. Should be path to a model file or one of "
f"the default models: {DEFAULT_MODELS_DICT.keys()}."
)
self.reference_dataset = pd.read_parquet(reference_dataset or DEFAULT_REFERENCE_DATASET)
self.tokenizer = tokenizer_from_json(tokenizer or DEFAULT_TOKENIZER)

self._verbose = logger.getEffectiveLevel() <= logging.DEBUG

@property
def feature_names(self):
return [
"ccs_predicted",
"ccs_observed",
"ccs_error",
"abs_ccs_error",
"perc_ccs_error",
]

@property
def allowed_modifications(self):
"""Return a list of modifications that are allowed in ionmob."""
return [token for aa_tokens in VARIANT_DICT.values() for token in aa_tokens]

def add_features(self, psm_list: PSMList) -> None:
"""
Add Ionmob-derived features to PSMs.
Parameters
----------
psm_list
PSMs to add features to.
"""
logger.info("Adding Ionmob-derived features to PSMs.")
psm_dict = psm_list.get_psm_dict()
current_run = 1
total_runs = sum(len(runs) for runs in psm_dict.values())

for runs in psm_dict.values():
for run, psms in runs.items():
logger.info(
f"Running Ionmob for PSMs from run ({current_run}/{total_runs}): `{run}`..."
)
with contextlib.redirect_stdout(
open(os.devnull, "w")
) if not self._verbose else contextlib.nullcontext():
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
psm_list_run_df = psm_list_run.to_dataframe()

# prepare data frames for CCS prediction
psm_list_run_df["charge"] = [
peptidoform.precursor_charge
for peptidoform in psm_list_run_df["peptidoform"]
]
psm_list_run_df = psm_list_run_df[
psm_list_run_df["charge"] < 5
] # predictions do not go higher for ionmob

psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply(
lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1
)
psm_list_run_df = psm_list_run_df[
psm_list_run_df.apply(
lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]),
axis=1,
)
]

psm_list_run_df["mz"] = psm_list_run_df.apply(
lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
) # use precursor m/z from PSMs?

psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]),
axis=1,
)
# calibrate CCS values
shift_factor = self.calculate_ccs_shift(psm_list_run_df)
psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
lambda x: x["ccs_observed"] + shift_factor, axis=1
)
# predict CCS values
tf_ds = to_tf_dataset_inference(
psm_list_run_df["mz"],
psm_list_run_df["charge"],
psm_list_run_df["sequence-tokenized"],
self.tokenizer,
)

psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds)

# calculate CCS features
ccs_features = self._calculate_features(psm_list_run_df)

# add CCS features to PSMs
for psm in psm_list_run:
try:
psm["rescoring_features"].update(ccs_features[psm.spectrum_id])
except KeyError:
psm["rescoring_features"].update({})
current_run += 1

def _calculate_features(self, feature_df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
"""Get CCS features for PSMs."""
ccs_features = {}
for row in feature_df.itertuples():
ccs_features[row.spectrum_id] = {
"ccs_predicted": row.ccs_predicted,
"ccs_observed": row.ccs_observed,
"ccs_error": row.ccs_observed - row.ccs_predicted,
"abs_ccs_error": abs(row.ccs_observed - row.ccs_predicted),
"perc_ccs_error": ((abs(row.ccs_observed - row.ccs_predicted)) / row.ccs_observed)
* 100,
}
return ccs_features

@staticmethod
def tokenize_peptidoform(peptidoform: Peptidoform) -> list:
"""Tokenize proforma sequence and add modifications."""
tokenized_seq = []

if peptidoform.properties["n_term"]:
tokenized_seq.append(
f"<START>[UNIMOD:{peptidoform.properties['n_term'][0].definition['id']}]"
)
else:
tokenized_seq.append("<START>")

for amino_acid, modification in peptidoform.parsed_sequence:
tokenized_seq.append(amino_acid)
if modification:
tokenized_seq[-1] = (
tokenized_seq[-1] + f"[UNIMOD:{modification[0].definition['id']}]"
)

if peptidoform.properties["c_term"]:
pass # provide if c-term mods are supported
else:
tokenized_seq.append("<END>")

return tokenized_seq

def calculate_ccs_shift(self, psm_dataframe: pd.DataFrame) -> float:
"""
Apply CCS shift to CCS values.
Parameters
----------
psm_dataframe
Dataframe with PSMs as returned by :py:meth:`psm_utils.PSMList.to_dataframe`.
"""
df = psm_dataframe.copy()
df.rename({"ccs_observed": "ccs"}, axis=1, inplace=True)
high_conf_hits = list(df["spectrum_id"][df["score"].rank(pct=True) > 0.95])
logger.debug(
f"Number of high confidence hits for calculating shift: {len(high_conf_hits)}"
)

shift_factor = get_ccs_shift(
df[["charge", "sequence-tokenized", "ccs"]][df["spectrum_id"].isin(high_conf_hits)],
self.reference_dataset,
)

logger.debug(f"CCS shift factor: {shift_factor}")

return shift_factor

def _is_valid_tokenized_sequence(self, tokenized_seq):
"""
Check if peptide sequence contains invalid tokens.
Parameters
----------
tokenized_seq
Tokenized peptide sequence.
Returns
-------
bool
False if invalid tokens are present, True otherwise.
"""
for token in tokenized_seq:
if token not in self.allowed_modifications:
logger.debug(f"Invalid modification found: {token}")
return False
return True


class IonmobException(FeatureGeneratorException):
"""Exception raised by Ionmob feature generator."""

pass
11 changes: 9 additions & 2 deletions ms2rescore/feature_generators/ms2pip.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,15 @@ def add_features(self, psm_list: PSMList) -> None:
"""
logger.info("Adding MS²PIP-derived features to PSMs.")
for runs in psm_list.get_psm_dict().values():
psm_dict = psm_list.get_psm_dict()
current_run = 1
total_runs = sum(len(runs) for runs in psm_dict.values())

for runs in psm_dict.values():
for run, psms in runs.items():
logger.info(f"Running MS²PIP for PSMs from run `{run}`...")
logger.info(
f"Running MS²PIP for PSMs from run ({current_run}/{total_runs}) `{run}`..."
)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
spectrum_filename = infer_spectrum_path(self.spectrum_path, run)
logger.debug(f"Using spectrum file `{spectrum_filename}`")
Expand All @@ -199,6 +205,7 @@ def add_features(self, psm_list: PSMList) -> None:
" for more information."
) from e
self._calculate_features(psm_list_run, ms2pip_results)
current_run += 1

def _calculate_features(
self, psm_list: PSMList, ms2pip_results: List[ProcessingResult]
Expand Down
Loading

0 comments on commit 09f8970

Please sign in to comment.