From f4c74554c2e601bfce8d9036263fb6da4e26e1bd Mon Sep 17 00:00:00 2001 From: Timo Sachsenberg Date: Wed, 31 Jan 2024 04:38:03 +0100 Subject: [PATCH] started moving some parts that can be used by LFQ and Isobaric Workflows to helper class --- .../QUANTITATION/DDAWorkflowCommons.h | 221 ++++++++++++++++++ .../QUANTITATION/IsobaricIsotopeCorrector.h | 2 + .../ANALYSIS/QUANTITATION/sources.cmake | 1 + .../QUANTITATION/DDAWorkflowCommons.cpp | 9 + .../ANALYSIS/QUANTITATION/sources.cmake | 1 + src/topp/ProteomicsLFQ.cpp | 155 +----------- 6 files changed, 244 insertions(+), 145 deletions(-) create mode 100644 src/openms/include/OpenMS/ANALYSIS/QUANTITATION/DDAWorkflowCommons.h create mode 100644 src/openms/source/ANALYSIS/QUANTITATION/DDAWorkflowCommons.cpp diff --git a/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/DDAWorkflowCommons.h b/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/DDAWorkflowCommons.h new file mode 100644 index 00000000000..efe490ece12 --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/DDAWorkflowCommons.h @@ -0,0 +1,221 @@ +// Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin +// SPDX-License-Identifier: BSD-3-Clause +// +// -------------------------------------------------------------------------- +// $Maintainer: Timo Sachsenberg $ +// $Authors: Timo Sachsenberg $ +// -------------------------------------------------------------------------- + +#pragma once + +#include +#include +#include + +#include + +namespace OpenMS +{ + /** + @brief Common functions for DDA workflows + + @ingroup Analysis_ID + */ + class OPENMS_DLLAPI DDAWorkflowCommons + { + public: + /* @brief create Map between mzML file and corresponding id file + * Checks implemented: + * 1. Check if the number of spectra and id files match. + * 2. If spectra and id files share common base names (without extension) + * but appear in different order, throw an error. + */ + static std::map mapMzML2Ids(StringList & in, StringList & in_ids) + { + // Detect the common case that ID files have same names as spectra files + if (!File::validateMatchingFileNames(in, in_ids, true, true, false)) // only basenames, without extension, only order + { + // Spectra and id files have the same set of basenames but appear in different order. -> this is most likely an error + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + "ID and spectra file match but order of file names seem to differ. They need to be provided in the same order."); + } + + std::map mzfile2idfile; + for (Size i = 0; i != in.size(); ++i) + { + const String& in_abs_path = File::absolutePath(in[i]); + const String& id_abs_path = File::absolutePath(in_ids[i]); + mzfile2idfile[in_abs_path] = id_abs_path; + OPENMS_LOG_DEBUG << ("Spectra: " + in[i] + "\t Ids: " + in_ids[i], 1) << std::endl; + } + return mzfile2idfile; + } + } + +/** + * @brief Small helper to get the mapping from id files to mzML files + * + * Basically just reverses the mapMzML2Ids function. + * Potential improvement: Could be combined into a single functionexposed to the user. + */ + std::map mapId2MzMLs(const map& m2i) + { + map idfile2mzfile; + for (const auto& m : m2i) + { + idfile2mzfile[m.second] = m.first; + } + return idfile2mzfile; + } + + +/** + * Estimates the median chromatographic full width at half maximum (FWHM) for a given MSExperiment. + * + * @param ms_centroided The centroided MSExperiment for which to estimate the FWHM. + * @return The estimated median chromatographic FWHM based on the top 1000 intensity mass traces. + */ + double estimateMedianChromatographicFWHM(MSExperiment & ms_centroided) + { + MassTraceDetection mt_ext; + Param mtd_param = mt_ext.getParameters(); + + OPENMS_LOG_DEBUG << "Parameters passed to MassTraceDetection" << mtd_param << std::endl; + + std::vector m_traces; + mt_ext.run(ms_centroided, m_traces, 1000); + + std::vector fwhm_1000; + for (auto &m : m_traces) + { + if (m.getSize() == 0) continue; + m.updateMeanMZ(); + m.updateWeightedMZsd(); + double fwhm = m.estimateFWHM(false); + fwhm_1000.push_back(fwhm); + } + + double median_fwhm = Math::median(fwhm_1000.begin(), fwhm_1000.end()); + + return median_fwhm; + } + +/** + * @brief Recalibrates the masses of the MSExperiment using peptide identifications. + * + * This function recalibrates the masses of the MSExperiment by applying a mass recalibration + * based on the theoretical masses from identification data. + * + * @param ms_centroided The MSExperiment object containing the centroided spectra. + * @param peptide_ids The vector of PeptideIdentification objects containing the peptide identifications. + * @param id_file_abs_path The absolute path of the identification file. + */ + void recalibrateMS1(MSExperiment & ms_centroided, + vector& peptide_ids, + const String & id_file_abs_path = "") + { + InternalCalibration ic; + ic.setLogType(log_type_); + ic.fillCalibrants(peptide_ids, 25.0); // >25 ppm maximum deviation defines an outlier TODO: check if we need to adapt this + if (ic.getCalibrationPoints().size() <= 1) return; + + // choose calibration model based on number of calibration points + + // there seem to be some problems with the QUADRATIC model that we first need to investigate + //MZTrafoModel::MODELTYPE md = (ic.getCalibrationPoints().size() == 2) ? MZTrafoModel::LINEAR : MZTrafoModel::QUADRATIC; + //bool use_RANSAC = (md == MZTrafoModel::LINEAR || md == MZTrafoModel::QUADRATIC); + + MZTrafoModel::MODELTYPE md = MZTrafoModel::LINEAR; + bool use_RANSAC = true; + + Size RANSAC_initial_points = (md == MZTrafoModel::LINEAR) ? 2 : 3; + Math::RANSACParam p(RANSAC_initial_points, 70, 10, 30, true); // TODO: check defaults (taken from tool) + MZTrafoModel::setRANSACParams(p); + // these limits are a little loose, but should prevent grossly wrong models without burdening the user with yet another parameter. + MZTrafoModel::setCoefficientLimits(25.0, 25.0, 0.5); + + IntList ms_level = {1}; + double rt_chunk = 300.0; // 5 minutes + String qc_residual_path, qc_residual_png_path; + if (!id_file_abs_path.empty()) + { + const String & id_basename = File::basename(id_file_abs_path); + qc_residual_path = id_basename + "qc_residuals.tsv"; + qc_residual_png_path = id_basename + "qc_residuals.png"; + } + + if (!ic.calibrate(ms_centroided, + ms_level, md, rt_chunk, use_RANSAC, + 10.0, + 5.0, + "", + "", + qc_residual_path, + qc_residual_png_path, + "Rscript")) + { + OPENMS_LOG_WARN << "\nCalibration failed. See error message above!" << std::endl; + } + } + + +/** + * @brief Extracts seeding features from centroided MS data (e.g., for untarged extraction). + * + * MS1 spectra are subjected to a threshold filter to removelow-intensity peaks, + * and then uses the FeatureFinderMultiplex algorithm to identify potential seeding features. + * The function also takes into account the median full width at half maximum (FWHM) of the peaks + * to adjust the FeatureFinderMultiplex parameters for better seed detection. + * + * @param[in] ms_centroided The MSExperiment object containing centroided mass spectrometry data. Only MS1 level + * spectra are considered for seed calculation. + * @param[out] seeds The FeatureMap object where the identified seeding features will be stored. + * @param[in] median_fwhm The median FWHM of the peaks, used to adjust the FeatureFinderMultiplex parameters for + * seed detection. + * + * @note The function currently uses hardcoded parameters for the threshold filter and FeatureFinderMultiplex + * algorithm, which may need to be derived from the data or provided as function arguments in future + * implementations. + */ + void calculateSeeds( + const MSExperiment & ms_centroided, + const double intensity_threshold, + FeatureMap & seeds, + double median_fwhm + Size charge_min = 2, + Size charge_max = 5) + { + //TODO: Actually FFM provides a parameter for minimum intensity. Also it copies the full experiment again once or twice. + MSExperiment e; + for (const auto& s : ms_centroided) + { + if (s.getMSLevel() == 1) + { + e.addSpectrum(s); + } + } + + ThresholdMower threshold_mower_filter; + Param tm = threshold_mower_filter.getParameters(); + tm.setValue("threshold", intensity_threshold);; // TODO: derive from data + threshold_mower_filter.setParameters(tm); + threshold_mower_filter.filterPeakMap(e); + + FeatureFinderMultiplexAlgorithm algorithm; + Param p = algorithm.getParameters(); + p.setValue("algorithm:labels", ""); // unlabeled only + p.setValue("algorithm:charge", String(charge_min) + ":" + String(charge_max)); + p.setValue("algorithm:rt_typical", median_fwhm * 3.0); + p.setValue("algorithm:rt_band", 3.0); // max 3 seconds shifts between isotopic traces (not sure if needed) + p.setValue("algorithm:rt_min", median_fwhm * 0.5); + p.setValue("algorithm:spectrum_type", "centroid"); + algorithm.setParameters(p); + //FIXME progress of FFM is not printed at all + const bool progress(true); + algorithm.run(e, progress); + seeds = algorithm.getFeatureMap(); + OPENMS_LOG_INFO << "Using " << seeds.size() << " seeds from untargeted feature extraction." << endl; + } + +} + diff --git a/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/IsobaricIsotopeCorrector.h b/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/IsobaricIsotopeCorrector.h index 6857ae61f8c..648daee7e61 100644 --- a/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/IsobaricIsotopeCorrector.h +++ b/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/IsobaricIsotopeCorrector.h @@ -9,6 +9,7 @@ #pragma once #include +#include // forward decl namespace Eigen @@ -78,6 +79,7 @@ namespace OpenMS */ static void solveNNLS_(const Matrix& correction_matrix, const Matrix& m_b, Matrix& m_x); + // @jpfeuffer why shared_ptr needed here? static void solveNNLS_(std::shared_ptr & correction_matrix, std::vector & b, std::vector & x); static void solveNNLS_(std::shared_ptr & correction_matrix, std::vector & b, std::vector & x); diff --git a/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/sources.cmake b/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/sources.cmake index f98de702e5f..1b1900438e2 100644 --- a/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/sources.cmake +++ b/src/openms/include/OpenMS/ANALYSIS/QUANTITATION/sources.cmake @@ -3,6 +3,7 @@ set(directory include/OpenMS/ANALYSIS/QUANTITATION) ### list all header files of the directory here set(sources_list_h +DDAWorkflowCommons.h IsobaricChannelExtractor.h IsobaricIsotopeCorrector.h IsobaricNormalizer.h diff --git a/src/openms/source/ANALYSIS/QUANTITATION/DDAWorkflowCommons.cpp b/src/openms/source/ANALYSIS/QUANTITATION/DDAWorkflowCommons.cpp new file mode 100644 index 00000000000..bccb921f37e --- /dev/null +++ b/src/openms/source/ANALYSIS/QUANTITATION/DDAWorkflowCommons.cpp @@ -0,0 +1,9 @@ +// Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin +// SPDX-License-Identifier: BSD-3-Clause +// +// -------------------------------------------------------------------------- +// $Maintainer: Timo Sachsenberg $ +// $Authors: Timo Sachsenberg $ +// -------------------------------------------------------------------------- + +#include diff --git a/src/openms/source/ANALYSIS/QUANTITATION/sources.cmake b/src/openms/source/ANALYSIS/QUANTITATION/sources.cmake index 9f7ae4a3e9c..d515f2fb738 100644 --- a/src/openms/source/ANALYSIS/QUANTITATION/sources.cmake +++ b/src/openms/source/ANALYSIS/QUANTITATION/sources.cmake @@ -3,6 +3,7 @@ set(directory source/ANALYSIS/QUANTITATION) ### list all filenames of the directory here set(sources_list +DDAWorkflowCommons.cpp IsobaricChannelExtractor.cpp IsobaricIsotopeCorrector.cpp IsobaricNormalizer.cpp diff --git a/src/topp/ProteomicsLFQ.cpp b/src/topp/ProteomicsLFQ.cpp index eba7217bac1..8a799a498ac 100644 --- a/src/topp/ProteomicsLFQ.cpp +++ b/src/topp/ProteomicsLFQ.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -274,40 +275,6 @@ class ProteomicsLFQ : registerFullParam_(combined); } - // Map between mzML file and corresponding id file - // Warn if the primaryMSRun indicates that files were provided in the wrong order. - map mapMzML2Ids_(StringList & in, StringList & in_ids) - { - // Detect the common case that ID files have same names as spectra files - if (!File::validateMatchingFileNames(in, in_ids, true, true, false)) // only basenames, without extension, only order - { - // Spectra and id files have the same set of basenames but appear in different order. -> this is most likely an error - throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, - "ID and spectra file match but order of file names seem to differ. They need to be provided in the same order."); - } - - map mzfile2idfile; - for (Size i = 0; i != in.size(); ++i) - { - const String& in_abs_path = File::absolutePath(in[i]); - const String& id_abs_path = File::absolutePath(in_ids[i]); - mzfile2idfile[in_abs_path] = id_abs_path; - writeDebug_("Spectra: " + in[i] + "\t Ids: " + in_ids[i], 1); - } - return mzfile2idfile; - } - - // map back - map mapId2MzMLs_(const map& m2i) - { - map idfile2mzfile; - for (const auto& m : m2i) - { - idfile2mzfile[m.second] = m.first; - } - return idfile2mzfile; - } - ExitCodes centroidAndCorrectPrecursors_(const String & mz_file, MSExperiment & ms_centroided) { Param pp_param = getParam_().copy("Centroiding:", true); @@ -384,111 +351,6 @@ class ProteomicsLFQ : return EXECUTION_OK; } - void recalibrateMasses_(MSExperiment & ms_centroided, vector& peptide_ids, const String & id_file_abs_path) - { - InternalCalibration ic; - ic.setLogType(log_type_); - ic.fillCalibrants(peptide_ids, 25.0); // >25 ppm maximum deviation defines an outlier TODO: check if we need to adapt this - if (ic.getCalibrationPoints().size() <= 1) return; - - // choose calibration model based on number of calibration points - - // there seem to be some problems with the QUADRATIC model that we first need to investigate - //MZTrafoModel::MODELTYPE md = (ic.getCalibrationPoints().size() == 2) ? MZTrafoModel::LINEAR : MZTrafoModel::QUADRATIC; - //bool use_RANSAC = (md == MZTrafoModel::LINEAR || md == MZTrafoModel::QUADRATIC); - - MZTrafoModel::MODELTYPE md = MZTrafoModel::LINEAR; - bool use_RANSAC = true; - - Size RANSAC_initial_points = (md == MZTrafoModel::LINEAR) ? 2 : 3; - Math::RANSACParam p(RANSAC_initial_points, 70, 10, 30, true); // TODO: check defaults (taken from tool) - MZTrafoModel::setRANSACParams(p); - // these limits are a little loose, but should prevent grossly wrong models without burdening the user with yet another parameter. - MZTrafoModel::setCoefficientLimits(25.0, 25.0, 0.5); - - IntList ms_level = {1}; - double rt_chunk = 300.0; // 5 minutes - String qc_residual_path, qc_residual_png_path; - if (debug_level_ >= 1) - { - const String & id_basename = File::basename(id_file_abs_path); - qc_residual_path = id_basename + "qc_residuals.tsv"; - qc_residual_png_path = id_basename + "qc_residuals.png"; - } - - if (!ic.calibrate(ms_centroided, ms_level, md, rt_chunk, use_RANSAC, - 10.0, - 5.0, - "", - "", - qc_residual_path, - qc_residual_png_path, - "Rscript")) - { - OPENMS_LOG_WARN << "\nCalibration failed. See error message above!" << std::endl; - } - } - - double estimateMedianChromatographicFWHM_(MSExperiment & ms_centroided) - { - MassTraceDetection mt_ext; - Param mtd_param = mt_ext.getParameters(); - writeDebug_("Parameters passed to MassTraceDetection", mtd_param, 3); - - std::vector m_traces; - mt_ext.run(ms_centroided, m_traces, 1000); - - std::vector fwhm_1000; - for (auto &m : m_traces) - { - if (m.getSize() == 0) continue; - m.updateMeanMZ(); - m.updateWeightedMZsd(); - double fwhm = m.estimateFWHM(false); - fwhm_1000.push_back(fwhm); - } - - double median_fwhm = Math::median(fwhm_1000.begin(), fwhm_1000.end()); - - OPENMS_LOG_INFO << "Median chromatographic FWHM: " << median_fwhm << std::endl; - - return median_fwhm; - } - - void calculateSeeds_(const MSExperiment & ms_centroided, FeatureMap & seeds, double median_fwhm) - { - //TODO: Actually FFM provides a parameter for minimum intensity. Also it copies the full experiment again once or twice. - MSExperiment e; - for (const auto& s : ms_centroided) - { - if (s.getMSLevel() == 1) - { - e.addSpectrum(s); - } - } - - ThresholdMower threshold_mower_filter; - Param tm = threshold_mower_filter.getParameters(); - tm.setValue("threshold", getDoubleOption_("Seeding:intThreshold")); // TODO: derive from data - threshold_mower_filter.setParameters(tm); - threshold_mower_filter.filterPeakMap(e); - - FeatureFinderMultiplexAlgorithm algorithm; - Param p = algorithm.getParameters(); - p.setValue("algorithm:labels", ""); // unlabeled only - p.setValue("algorithm:charge", getStringOption_("Seeding:charge")); //TODO infer from IDs? - p.setValue("algorithm:rt_typical", median_fwhm * 3.0); - p.setValue("algorithm:rt_band", getDoubleOption_("Seeding:traceRTTolerance")); // max 3 seconds shifts between isotopic traces - p.setValue("algorithm:rt_min", median_fwhm * 0.5); - p.setValue("algorithm:spectrum_type", "centroid"); - algorithm.setParameters(p); - //FIXME progress of FFM is not printed at all - const bool progress(true); - algorithm.run(e, progress); - seeds = algorithm.getFeatureMap(); - OPENMS_LOG_INFO << "Using " << seeds.size() << " seeds from untargeted feature extraction." << endl; - } - // aligns the feature maps double align_( @@ -1019,7 +881,8 @@ class ProteomicsLFQ : //------------------------------------------------------------- if (getStringOption_("mass_recalibration") == "true") { - recalibrateMasses_(ms_centroided, peptide_ids, id_file_abs_path); + String debug_output_basename = (debug_level_ > 666) ? id_file_abs_path : ""; + DDAWorkflowCommons::recalibrateMS1(ms_centroided, peptide_ids, debug_output_basename); } vector ext_protein_ids; @@ -1028,7 +891,8 @@ class ProteomicsLFQ : ////////////////////////////////////////// // Chromatographic parameter estimation ////////////////////////////////////////// - median_fwhm = estimateMedianChromatographicFWHM_(ms_centroided); + median_fwhm = DDAWorfklowCommons::estimateMedianChromatographicFWHM(ms_centroided); + OPENMS_LOG_INFO << "Median chromatographic FWHM: " << median_fwhm << std::endl; //------------------------------------------------------------- // Feature detection @@ -1044,7 +908,8 @@ class ProteomicsLFQ : if (!targeted_only) { - calculateSeeds_(ms_centroided, seeds, median_fwhm); + // TODO: infer min/max charge from ID data + DDAWorkflowCommons::calculateSeeds(ms_centroided, getDoubleOption_("Seeding:intThreshold"), seeds, median_fwhm, 2, 5); if (debug_level_ > 666) { FileHandler().storeFeatures("debug_seeds_fraction_" + String(ms_files.first) + "_" + String(fraction_group) + ".featureXML", seeds, {FileTypes::FEATUREXML}); @@ -1669,9 +1534,9 @@ class ProteomicsLFQ : } // Map between mzML file and corresponding id file - // Here we currently assume that these are provided in the exact same order. - map mzfile2idfile = mapMzML2Ids_(in, in_ids); - map idfile2mzfile = mapId2MzMLs_(mzfile2idfile); + // We assume that these are provided in the exact same order. + map mzfile2idfile = DDAWorkflowCommons::mapMzML2Ids(in, in_ids); + map idfile2mzfile = DDAWorkflowCommons::mapId2MzMLs(mzfile2idfile); // TODO maybe check if mzMLs in experimental design match to mzMLs passed as in parameter // IF both are present