Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cit pipeline to the release #276

Merged
merged 6 commits into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions oktoberfest/preprocessing/preprocessing.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,8 @@ def convert_search(
tmt_label: str = "",
custom_mods: Optional[dict[str, int]] = None,
output_file: Optional[Union[str, Path]] = None,
ptm_unimod_id: Optional[int] = 0,
ptm_sites: Optional[list] = None,
) -> pd.DataFrame:
r"""
Convert search results to Oktoberfest format.
Expand All @@ -414,7 +416,8 @@ def convert_search(
static and variable mods as keys. The values are the integer values of the respective unimod identifier
:param output_file: Optional path to the location where the converted search results should be written to.
If this is omitted, the results are not stored.

:param ptm_unimod_id: unimod id used for site localization
:param ptm_sites: possible sites that the ptm can exist on
:raises ValueError: if an unsupported search engine was given
:return: A dataframe containing the converted results.

Expand Down Expand Up @@ -503,7 +506,11 @@ def convert_search(
raise ValueError(f"Unknown search engine provided: {search_engine}")

return search_result(input_path).generate_internal(
tmt_label=tmt_label, out_path=output_file, custom_mods=custom_mods
tmt_label=tmt_label,
out_path=output_file,
custom_mods=custom_mods,
ptm_unimod_id=ptm_unimod_id,
ptm_sites=ptm_sites,
)


Expand Down Expand Up @@ -850,6 +857,7 @@ def annotate_spectral_library(
mass_tol: Optional[float] = None,
unit_mass_tol: Optional[str] = None,
custom_mods: Optional[dict[str, float]] = None,
annotate_neutral_loss: Optional[bool] = False,
) -> Spectra:
"""
Annotate all specified ion peaks of given PSMs (Default b and y ions).
Expand All @@ -865,6 +873,7 @@ def annotate_spectral_library(
:param unit_mass_tol: The unit in which the mass tolerance is given
:param fragmentation_method: fragmentation method that was used
:param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass
:param annotate_neutral_loss: flag to indicate whether to annotate neutral loss peaks or not

:return: Spectra object containing the annotated b and y ion peaks including metadata

Expand Down Expand Up @@ -892,6 +901,7 @@ def annotate_spectral_library(
unit_mass_tolerance=unit_mass_tol,
fragmentation_method=fragmentation_method,
custom_mods=custom_mods,
annotate_neutral_loss=annotate_neutral_loss,
)

ion_types = retrieve_ion_types(fragmentation_method)
Expand All @@ -903,6 +913,8 @@ def annotate_spectral_library(
)
aspec.add_mzs(np.stack(df_annotated_spectra["MZ"]), FragmentType.MZ)
aspec.add_column(df_annotated_spectra["CALCULATED_MASS"].values, "CALCULATED_MASS")
aspec.add_column(df_annotated_spectra["EXPECTED_NL_COUNT"].values, "EXPECTED_NL_COUNT")
aspec.add_column(df_annotated_spectra["ANNOTATED_NL_COUNT"].values, "ANNOTATED_NL_COUNT")
aspec.strings_to_categoricals()

logger.info("Finished annotating.")
Expand Down
6 changes: 6 additions & 0 deletions oktoberfest/rescore/rescore.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def generate_features(
additional_columns: str | list | None = None,
all_features: bool = False,
regression_method: str = "spline",
add_neutral_loss_features: bool = False,
remove_miss_cleavage_features: bool = False,
):
"""
Generate features to be used for percolator or mokapot target decoy separation.
Expand All @@ -38,6 +40,8 @@ def generate_features(
:param additional_columns: additional columns supplied in the search results to be used as features (either a list or "all")
:param all_features: whether to use all features or only the standard set TODO
:param regression_method: The regression method to use for iRT alignment
:param add_neutral_loss_features: Flag to indicate whether to add neutral loss features to percolator or not
:param remove_miss_cleavage_features: Flag to indicate whether to remove miss cleavage features from percolator or not

:Example:

Expand Down Expand Up @@ -87,6 +91,8 @@ def generate_features(
additional_columns=additional_columns,
all_features_flag=all_features,
regression_method=regression_method,
neutral_loss_flag=add_neutral_loss_features,
drop_miss_cleavage_flag=remove_miss_cleavage_features,
)
perc_features.calc()
perc_features.write_to_file(str(output_file))
Expand Down
62 changes: 56 additions & 6 deletions oktoberfest/runner.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,16 @@ def _preprocess(spectra_files: list[Path], config: Config) -> list[Path]:
msms_output.mkdir(exist_ok=True)
internal_search_file = msms_output / "msms.prosit"
tmt_label = config.tag

ptm_unimods = config.ptm_unimod_id
ptm_sites = config.ptm_possible_sites
search_results = pp.convert_search(
input_path=config.search_results,
search_engine=config.search_results_type,
tmt_label=tmt_label,
custom_mods=config.custom_to_unimod(),
output_file=internal_search_file,
ptm_unimod_id=ptm_unimods,
ptm_sites=ptm_sites,
)
if config.spectra_type.lower() in ["d", "hdf"]:
timstof_metadata = pp.convert_timstof_metadata(
Expand Down Expand Up @@ -149,12 +152,14 @@ def _annotate_and_get_library(spectra_file: Path, config: Config, tims_meta_file
spectra["INSTRUMENT_TYPES"] = config_instrument_type
search = pp.load_search(config.output / "msms" / spectra_file.with_suffix(".rescore").name)
library = pp.merge_spectra_and_peptides(spectra, search)
annotate_neutral_loss = config.ptm_use_neutral_loss
aspec = pp.annotate_spectral_library(
psms=library,
mass_tol=config.mass_tolerance,
unit_mass_tol=config.unit_mass_tolerance,
fragmentation_method=config.fragmentation_method,
custom_mods=config.unimod_to_mass(),
annotate_neutral_loss=annotate_neutral_loss,
)
aspec.write_as_hdf5(hdf5_path) # write_metadata_annotation

Expand Down Expand Up @@ -562,7 +567,6 @@ def _refinement_learn(spectra_files: list[Path], config: Config):

def _calculate_features(spectra_file: Path, config: Config):
library = _ce_calib(spectra_file, config)

calc_feature_step = ProcessStep(config.output, "calculate_features." + spectra_file.stem)
if calc_feature_step.is_done():
return
Expand Down Expand Up @@ -597,7 +601,8 @@ def _calculate_features(spectra_file: Path, config: Config):
# produce percolator tab files
fdr_dir = config.output / "results" / config.fdr_estimation_method
fdr_dir.mkdir(exist_ok=True)

add_neutral_loss_features = config.ptm_use_neutral_loss
remove_miss_cleavage_features = ("R" in config.ptm_possible_sites) or ("K" in config.ptm_possible_sites)
re.generate_features(
library=library,
search_type="original",
Expand All @@ -613,6 +618,8 @@ def _calculate_features(spectra_file: Path, config: Config):
additional_columns=config.use_feature_cols,
all_features=config.all_features,
regression_method=config.curve_fitting_method,
add_neutral_loss_features=add_neutral_loss_features,
remove_miss_cleavage_features=remove_miss_cleavage_features,
)

calc_feature_step.mark_done()
Expand All @@ -634,8 +641,13 @@ def _rescore(fdr_dir: Path, config: Config):
re.rescore_with_percolator(input_file=fdr_dir / "original.tab", output_folder=fdr_dir)
rescore_original_step.mark_done()
if not rescore_prosit_step.is_done():
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir)
rescore_prosit_step.mark_done()
logger.info("Start percolator rescoring")
logger.info(config.ptm_localization)
if config.ptm_localization:
_ptm_localization_rescore(fdr_dir, config)
else:
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir)
rescore_prosit_step.mark_done()
elif config.fdr_estimation_method == "mokapot":
if not rescore_original_step.is_done():
re.rescore_with_mokapot(input_file=fdr_dir / "original.tab", output_folder=fdr_dir)
Expand All @@ -649,6 +661,43 @@ def _rescore(fdr_dir: Path, config: Config):
)


def _ptm_localization_rescore(fdr_dir: Path, config: Config):
"""
Helper function for running percolator to do PTM localization.

:param fdr_dir: the output directory
:param config: the configuration object
"""
df_rescore = pd.read_csv(fdr_dir / "rescore.tab", sep="\t")
unimod_id = config.ptm_unimod_id
df_rescore["id"] = df_rescore["filename"] + df_rescore["ScanNr"].astype(str)
df_rescore_fil = df_rescore[df_rescore["Peptide"].str.contains("UNIMOD:" + str(unimod_id))]
df_rescore = df_rescore[df_rescore["id"].isin(df_rescore_fil["id"].tolist())]
df_rescore.drop(columns=["id"], inplace=True)
df_rescore.to_csv(fdr_dir / "rescore.tab", sep="\t", index=False)
new_rescore_dir = fdr_dir / "localize_mod"
new_rescore_dir.mkdir(parents=True, exist_ok=True)

if unimod_id == 7:
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir)
df_rescore_psms_targets = pd.read_csv(fdr_dir / "rescore.percolator.psms.txt", sep="\t")
df_rescore_psms_decoys = pd.read_csv(fdr_dir / "rescore.percolator.decoy.psms.txt", sep="\t")
df_rescore_psms_targets = df_rescore_psms_targets[
df_rescore_psms_targets["peptide"].apply(lambda x: "UNIMOD:" + str(unimod_id) in x)
]
df_rescore_psms_decoys = df_rescore_psms_decoys[
df_rescore_psms_decoys["peptide"].apply(lambda x: "UNIMOD:" + str(unimod_id) in x)
]
df_rescore_psms = pd.concat([df_rescore_psms_targets, df_rescore_psms_decoys])
df_rescore_psms = df_rescore_psms[["PSMId"]]
df_rescore_psms.rename(columns={"PSMId": "SpecId"}, inplace=True)
df_rescore = df_rescore.merge(df_rescore_psms, on="SpecId", how="inner")
df_rescore.to_csv(new_rescore_dir / "rescore.tab", sep="\t", index=False)
re.rescore_with_percolator(input_file=new_rescore_dir / "rescore.tab", output_folder=new_rescore_dir)
else:
re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=new_rescore_dir)


def run_rescoring(config_path: Union[str, Path]):
"""
Create a ReScore object and run the rescoring.
Expand Down Expand Up @@ -717,7 +766,8 @@ def run_rescoring(config_path: Union[str, Path]):

# plotting
logger.info("Generating summary plots...")
pl.plot_all(fdr_dir)
if not config.ptm_localization:
pl.plot_all(fdr_dir)
logger.info("Finished rescoring.")

if config.quantification:
Expand Down
30 changes: 30 additions & 0 deletions oktoberfest/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,36 @@ def nr_ox(self) -> int:
"""Get the maximum number of oxidations allowed on M residues in peptides during spectral library generation."""
return self.spec_lib_options.get("nrOx", 1)

###########################
# these are PTM localization options #
###########################

@property
def ptm_localization(self) -> bool:
"""Get ptm localization flag from the config file."""
return self.data.get("ptm_localization", False)

@property
def ptm_localization_options(self) -> dict:
"""Get ptm localization dictionary from the config file."""
return self.data.get("ptmLocalizationOptions", {})

@property
def ptm_unimod_id(self) -> int:
"""Get the unimod id required for localization."""
unimod_id = self.ptm_localization_options.get("unimod_id", 0)
return unimod_id

@property
def ptm_possible_sites(self) -> list:
"""Get which sites ptm can exist on."""
return self.ptm_localization_options.get("possible_sites", [])

@property
def ptm_use_neutral_loss(self) -> bool:
"""Get neutral loss flag to indicate whether to add a score for this or not."""
return self.ptm_localization_options.get("neutral_loss", False)

#####################################################################
# these are local prediction / transfer&refinement learning options #
#####################################################################
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ matplotlib = "^3.6.3"
mokapot = ">=0.9.1,<0.11.0"
numpy = ">=1.20,<1.25"
seaborn = ">=0.12.2,<0.14.0"
spectrum_fundamentals = ">=0.7.3,<0.8.0"
spectrum-io = ">=0.6.2,<0.7.0"
spectrum_fundamentals = ">=0.7.4,<0.8.0"
spectrum-io = ">=0.6.3,<0.7.0"
picked_group_fdr = ">=0.7.1"

dlomix = {extras = ["rltl-report", "wandb"], git = "git@github.com:wilhelm-lab/dlomix.git", branch = "feature/bmpc", optional = true}
Expand Down
Loading