Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add documentation and docstrings, improve DIA-NN parsing, refactoring #378

Merged
merged 50 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
be3f18c
fix posixpath error in maxquant parameter parsing file
SamvPy Sep 4, 2024
05e6d2a
add documentation
Cajac102 Sep 4, 2024
6f16282
add documentation, uncomment error handling
Cajac102 Sep 4, 2024
819125c
add new parameter fields and refactor diann parameter parsing
SamvPy Sep 3, 2024
225ed78
all_datapoints now optional argument, added documentation
Cajac102 Sep 4, 2024
f38966a
adapt calls to data point functions
Cajac102 Sep 4, 2024
94dcf8d
Fix DIA-NN parsing, add module_dia_quant test
rodvrees Sep 4, 2024
f3fc148
undo changes to parse_settings_fragpipe.toml
rodvrees Sep 4, 2024
31eea5e
Remove debugging statements
rodvrees Sep 4, 2024
01fca23
Add further tests for module_dia_quant
rodvrees Sep 4, 2024
76f080a
Add Datapoint constructor unittest
rodvrees Sep 5, 2024
0ffe509
bugfix for DIA quant page
Cajac102 Sep 5, 2024
9e6bd57
add documentation to base quant
Cajac102 Sep 5, 2024
37f0de9
Merge branch 'main' into DIA
RobbinBouwmeester Sep 5, 2024
5e50667
DIA-NN support
rodvrees Sep 5, 2024
48cd756
add documentation
Cajac102 Sep 5, 2024
e47e5cc
fix identation
Cajac102 Sep 5, 2024
b658ca9
Merge branch 'main' into DIA
RobbinBouwmeester Sep 5, 2024
1e89222
adapt plotquant import
Cajac102 Sep 5, 2024
46dd487
Add DIA_quant_peptidoform page, make separate custom parse files and …
Alirezak2n Sep 5, 2024
cc6e8a7
reformatting
Cajac102 Sep 5, 2024
f11c571
Fix placeholder_download bug
Alirezak2n Sep 5, 2024
67e5985
Merge branch 'DIA' of https://github.com/Proteobench/ProteoBench into…
Alirezak2n Sep 5, 2024
f5265ec
Fix black
RobbinBouwmeester Sep 5, 2024
1adb2ce
Merge branch 'DIA' of https://github.com/Proteobench/ProteoBench into…
RobbinBouwmeester Sep 5, 2024
5bbe462
Merge branch 'main' into DIA
RobbinBouwmeester Sep 5, 2024
3ce21c1
Merge branch 'main' into DIA
RobbinBouwmeester Sep 5, 2024
bdec42a
AlphaDIA support
rodvrees Sep 5, 2024
5c6dd5e
Update parse_settings_ion.py
Alirezak2n Sep 5, 2024
13cf3eb
Update parse_ion.py
Alirezak2n Sep 5, 2024
d3b85ca
Update parse_ion.py
Alirezak2n Sep 5, 2024
9cb8e08
Update parse_ion.py
Alirezak2n Sep 5, 2024
b758625
Update parse_ion.py
Alirezak2n Sep 5, 2024
f037389
Merge branch 'main' into DIA
RobbinBouwmeester Sep 5, 2024
fc7a684
AlphaDIA support
rodvrees Sep 5, 2024
61f02c9
remove unused ModuleInterface class
Cajac102 Sep 5, 2024
696b688
add documentation
Cajac102 Sep 5, 2024
de76f14
remove abstract moduleInterface class, add documentation
Cajac102 Sep 5, 2024
e96627d
remove abstract Interface class
Cajac102 Sep 5, 2024
a980ff6
black
Cajac102 Sep 5, 2024
de38fd8
Undo debug statements, formatting
rodvrees Sep 5, 2024
db492cb
undo debug statements
rodvrees Sep 5, 2024
4a294be
Fix AlphaDIA contaminant detection
rodvrees Sep 5, 2024
a17102c
Update contributions
rodvrees Sep 5, 2024
c20b263
Merge branch 'main' into DIA
RobbinBouwmeester Sep 5, 2024
fc5f601
add alphadia parameter parsing and edit param parsing test files to n…
SamvPy Sep 3, 2024
592d24c
Merge branch 'main' into DIA
RobbinBouwmeester Sep 5, 2024
5d7a39b
fix maxquant param parsing tests
SamvPy Sep 4, 2024
3d841cb
MaxDIA support
rodvrees Sep 5, 2024
d0e7d6b
black formatting
SamvPy Sep 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions proteobench/io/params/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,17 @@ class ProteoBenchParameters:
Minimum precursor charge allowed.
max_precursor_charge : Optional[int]
Maximum precursor charge allowed.
predictors_library : Optional[dict]
Models used to generate the spectral library, keyed by predicted property (DIA-specific).
scan_window : Optional[int]
Scan window radius. Ideally corresponds to approximate
average number of data points per peak (DIA-specific).
quantification_method_DIANN : Optional[str]
Quantification strategy used in the DIA-NN engine (DIANN-specific).
second_pass : Optional[bool]
Whether second pass search is enabled (DIANN-specific).
protein_inference : Optional[str]
Protein inference method used.
"""

software_name: Optional[str] = None
Expand All @@ -77,3 +88,8 @@ class ProteoBenchParameters:
max_mods: Optional[int] = None # max_num_modifications
min_precursor_charge: Optional[int] = None # precursor_charge
max_precursor_charge: Optional[int] = None
scan_window: Optional[int] = None # DIA-specific
quantification_method_DIANN: Optional[str] = None # DIANN-specific
second_pass: Optional[bool] = None # DIANN specific
protein_inference: Optional[str] = None
predictors_library: Optional[dict] = None
204 changes: 151 additions & 53 deletions proteobench/io/params/diann.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
from typing import List, Optional, Any
from proteobench.io.params import ProteoBenchParameters

mass_tolerance_regex = "(?<=Optimised mass accuracy: )\d*\.?\d+(?= ppm)"
software_version_regex = "(?<=DIA-NN\s)(.*?)(?=\s\(Data-Independent Acquisition by Neural Networks\))"
mass_tolerance_regex = r"(?<=Optimised mass accuracy: )\d*\.?\d+(?= ppm)"
software_version_regex = r"(?<=DIA-NN\s)(.*?)(?=\s\(Data-Independent Acquisition by Neural Networks\))"
scan_window_regex = r"(?<=Scan window radius set to )\d+"

PARAM_CMD_DICT = {
"ident_fdr_psm": "qvalue",
"ident_fdr_peptide": "qvalue",
"ident_fdr_protein": "qvalue",
"enable_match_between_runs": "reanalyse",
Expand All @@ -22,23 +23,25 @@
"max_mods": "var-mods",
"min_precursor_charge": "min-pr-charge",
"max_precursor_charge": "max-pr-charge",
"scan_window": "window"
}
FLOAT_SETTINGS = [
SETTINGS_PB_FLOAT = [
"ident_fdr_psm",
"ident_fdr_peptide",
"ident_fdr_protein",
"precursor_mass_tolerance",
"fragment_mass_tolerance",
]
INTEGER_SETTINGS = [
SETTINGS_PB_INT = [
"allowed_miscleavages",
"min_peptide_length",
"max_peptide_length",
"max_mods",
"min_precursor_charge",
"max_precursor_charge",
"scan_window"
]
MODIFICATION_SETTINGS = ["fixed_mods", "variable_mods"]
SETTINGS_PB_MOD = ["fixed_mods", "variable_mods"]


def find_cmdline_string(lines: List[str]) -> Optional[str]:
Expand Down Expand Up @@ -101,32 +104,6 @@ def parse_cmdline_string(line: str) -> dict:
setting_dict["mod"] = fixed_mods
return setting_dict


def get_version_number(lines: List[str]) -> Optional[str]:
    """
    Look up the DIA-NN version number in the log lines.

    The version is expected on a line that carries the prefix 'DIA-NN ' and the
    suffix ' (Data-Independent Acquisition by Neural Networks)'; every line is
    searched with ``software_version_regex`` and the first hit wins.

    Parameter
    ---------
    lines: list[str]
        All input lines from the DIA-NN log file.

    Return
    ------
    str
        The software version, or None when no line matches.
    """
    # Lazily search line by line so scanning stops at the first match.
    hits = (re.search(software_version_regex, log_line) for log_line in lines)
    return next((hit.group() for hit in hits if hit), None)


def parse_setting(setting_name: str, setting_list: list) -> Any:
"""
Parse individual settings based on their setting type.
Expand All @@ -143,18 +120,17 @@ def parse_setting(setting_name: str, setting_list: list) -> Any:
Any
The parsed setting.
"""
if setting_name in FLOAT_SETTINGS:
if setting_name in SETTINGS_PB_FLOAT:
assert len(setting_list) == 1
return float(setting_list[0])
if setting_name in INTEGER_SETTINGS:
if setting_name in SETTINGS_PB_INT:
assert len(setting_list) == 1
return int(setting_list[0])
if setting_name in MODIFICATION_SETTINGS:
if setting_name in SETTINGS_PB_MOD:
return ",".join(setting_list)
return "".join(setting_list)


def extract_mass_accuracy(lines: List[str]) -> str:
def extract_with_regex(lines: List[str], regex) -> str:
"""
If no mass accuracy was specified in the cmd string, extract it from the log-file.

Expand All @@ -169,42 +145,164 @@ def extract_mass_accuracy(lines: List[str]) -> str:
The MS1 and MS2 mass accuracy specified in ppm.
"""
for line in lines:
mass_tolerance_match = re.search(mass_tolerance_regex, line)
if mass_tolerance_match:
fragment_mass_tolerance = mass_tolerance_match.group()
return fragment_mass_tolerance
regex_match = re.search(regex, line)
if regex_match:
x = regex_match.group(0)
return x
return None

def parse_protein_inference_method(cmdline_dict: dict) -> str:
    """
    Parse the protein inference method from the parsed execution command string.

    This setting is defined by disparate setting tags, namely:
    - no-prot-inf: No protein inference
    - pg-level: Code specifies inference method

    Parameter
    ---------
    cmdline_dict: dict
        Parsed execution command string

    Return
    ------
    str
        The protein inference method.
        Possibilities:
        - Disabled
        - Isoforms
        - Protein_names
        - Genes
        - Heuristic protein inference (neither tag is present)

    Raises
    ------
    ValueError
        If the value passed to --pg-level is not one of "0", "1" or "2".
    """
    if "no-prot-inf" in cmdline_dict:
        return "Disabled"

    if "pg-level" not in cmdline_dict:
        # Same label that extract_params seeds as its default; returning it here
        # (instead of an implicit None) keeps that default from being erased.
        # NOTE(review): confirm this is the label wanted for "no flag given".
        return "Heuristic protein inference"

    pg_level_mapping = {
        "0": "Isoforms",
        "1": "Protein_names",
        "2": "Genes",
    }

    raw_value = cmdline_dict["pg-level"]
    # Values are normally sequences of arguments, but extract_params also checks
    # for bare bool flags, so do not index blindly.
    if isinstance(raw_value, (list, tuple, str)) and raw_value:
        pg_setting = raw_value[0]
    else:
        pg_setting = raw_value

    try:
        return pg_level_mapping[pg_setting]
    except (KeyError, TypeError):
        # The original built this exception without raising it, so the bad
        # value was silently turned into None.
        raise ValueError(
            f"Unexpected setting passed to --pg-level in diann.exe: {pg_setting}"
        ) from None

def parse_quantification_strategy(cmdline_dict: dict):
    """
    Derive the quantification method from the parsed execution command string.

    The method is encoded through the presence of flags, checked in this order:
    - direct-quant: use legacy quantification within DIANN
    - high-acc: QuantUMS high-accuracy setting
    - neither flag: default is QuantUMS high-precision

    Parameter
    ---------
    cmdline_dict: dict
        Parsed execution command string

    Return
    ------
    str
        The quantification method.
        Possibilities:
        - Legacy
        - QuantUMS high-accuracy
        - QuantUMS high-precision
    """
    # Ordered (flag, label) pairs: the first flag found decides the outcome.
    flag_labels = (
        ("direct-quant", "Legacy"),
        ("high-acc", "QuantUMS high-accuracy"),
    )
    for flag, label in flag_labels:
        if flag in cmdline_dict:
            return label

    # No quantification flag on the command line: fall back to the default.
    return "QuantUMS high-precision"

def parse_predictors_library(cmdline_dict: dict):
    """
    Determine which predictor produced the spectral library.

    For now, only 'DIANN' and 'User defined speclib' are supported.
    In the future, the user might specify which algorithm was used for library generation.

    Parameter
    ---------
    cmdline_dict: dict
        Parsed execution command string

    Return
    ------
    dict
        Dictionary specifying algorithm name for RT, IM and MS2_int.
        None is returned when 'predictor' is absent and 'lib' is either absent
        or stored as a bare boolean flag.
    """
    if "predictor" in cmdline_dict:
        source = "DIANN"
    elif "lib" in cmdline_dict and not isinstance(cmdline_dict["lib"], bool):
        # 'lib' carries an actual value (not just a flag): a library was supplied.
        source = "User defined speclib"
    else:
        return None

    # The same source label applies to all three predicted properties.
    return {component: source for component in ("RT", "IM", "MS2_int")}


def extract_params(fname: str) -> ProteoBenchParameters:
    """
    Parse a DIA-NN log file and extract relevant parameters.

    Settings are taken from the execution command string found in the log;
    the software version, and (when absent from the command) the mass accuracy
    and scan window, are read from the log lines themselves.

    Parameter
    ---------
    fname: str
        Path to the DIA-NN log file.

    Return
    ------
    ProteoBenchParameters
        The parameters collected from the log file. Values that can be found
        neither in the command string nor in the log are left unset.
    """
    # Some default and flag settings
    parameters = {
        "software_name": "DIA-NN",
        "search_engine": "DIA-NN",
        "enable_match_between_runs": False,
        "quantification_method_DIANN": "QuantUMS high-precision",
        "protein_inference": "Heuristic protein inference",
    }

    # Read in the log file
    with open(fname) as f:
        lines = f.readlines()

    # Extract software versions from the log file.
    software_version = extract_with_regex(lines, software_version_regex)
    parameters["software_version"] = software_version
    parameters["search_engine_version"] = software_version

    # Get settings from the execution command string
    cmdline_string = find_cmdline_string(lines)
    cmdline_dict = parse_cmdline_string(cmdline_string)

    parameters["second_pass"] = "double-search" in cmdline_dict or "double-pass" in cmdline_dict
    parameters["quantification_method_DIANN"] = parse_quantification_strategy(cmdline_dict)
    # Keep the default above when the helper has nothing to report, instead of
    # overwriting it with None.
    protein_inference = parse_protein_inference_method(cmdline_dict)
    if protein_inference is not None:
        parameters["protein_inference"] = protein_inference
    parameters["predictors_library"] = parse_predictors_library(cmdline_dict)

    # Parse as many settings as possible from the execution command using PARAM_CMD_DICT for mapping.
    for proteobench_setting, cmd_setting in PARAM_CMD_DICT.items():
        if cmd_setting not in cmdline_dict:
            continue
        cmd_value = cmdline_dict[cmd_setting]
        if isinstance(cmd_value, bool):
            # Bare flags are stored as-is.
            parameters[proteobench_setting] = cmd_value
        else:
            parameters[proteobench_setting] = parse_setting(proteobench_setting, cmd_value)

    # If mass-acc flag is not present in cmdline string, extract it from the log file
    tolerance_settings = ("precursor_mass_tolerance", "fragment_mass_tolerance")
    if "precursor_mass_tolerance" not in parameters:
        mass_tol = extract_with_regex(lines, mass_tolerance_regex)
        if mass_tol is not None:
            for tolerance_setting in tolerance_settings:
                parameters[tolerance_setting] = mass_tol
    # Values parsed from the command are floats, so format rather than
    # concatenate; tolerances that could not be determined are skipped.
    for tolerance_setting in tolerance_settings:
        if parameters.get(tolerance_setting) is not None:
            parameters[tolerance_setting] = f"{parameters[tolerance_setting]} ppm"

    # If scan window is not custom-set on the command line, extract it from the log file
    if "scan_window" not in parameters:
        scan_window = extract_with_regex(lines, scan_window_regex)
        if scan_window is not None:
            parameters["scan_window"] = int(scan_window)

    return ProteoBenchParameters(**parameters)


Expand All @@ -217,4 +315,4 @@ def extract_params(fname: str) -> ProteoBenchParameters:
params = extract_params(file)
data_dict = params.__dict__
series = pd.Series(data_dict)
series.to_csv(file.with_suffix(".csv"))
series.to_csv(file.with_suffix(".csv"))
34 changes: 17 additions & 17 deletions proteobench/io/parsing/io_parse_settings/parse_settings_diann.toml
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
[mapper]
"Modified.Sequence" = "Sequence"
"File.Name" = "Raw file"
"Run" = "Raw file"
"Protein.Names" = "Proteins"
"Precursor.Charge" = "Charge"
"Precursor.Quantity" = "Intensity"

[condition_mapper]
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01.raw" = "A"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02.raw" = "A"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03.raw" = "A"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01.raw" = "B"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02.raw" = "B"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03.raw" = "B"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01" = "A"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02" = "A"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03" = "A"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01" = "B"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02" = "B"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03" = "B"

[run_mapper]
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01.raw" = "Condition_A_Sample_Alpha_01"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02.raw" = "Condition_A_Sample_Alpha_02"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03.raw" = "Condition_A_Sample_Alpha_03"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01.raw" = "Condition_B_Sample_Alpha_01"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02.raw" = "Condition_B_Sample_Alpha_02"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03.raw" = "Condition_B_Sample_Alpha_03"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01" = "Condition_A_Sample_Alpha_01"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02" = "Condition_A_Sample_Alpha_02"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03" = "Condition_A_Sample_Alpha_03"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01" = "Condition_B_Sample_Alpha_01"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02" = "Condition_B_Sample_Alpha_02"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03" = "Condition_B_Sample_Alpha_03"

[species_mapper]
"_YEAST" = "YEAST"
Expand All @@ -30,10 +30,10 @@
contaminant_flag = "Cont_"
decoy_flag = true

[modification_mapper]
"parse_column" = "Modified.Sequence"
[modifications_parser]
"parse_column" = "Sequence"
"before_aa" = false
"isalpha" = true
"isupper" = true
"pattern" = "\\[([^]]+)\\]"
"modification_dict" = {"UniMod:35" = "Oxidation", "UniMod:1" = "Acetyl", "UniMod:4" = "Carbamidomethyl"}
"pattern" = "\\(([^()]*)\\)"
"modification_dict" = {"(unimod:35)" = "Oxidation", "(unimod:1)" = "Acetyl", "(unimod:4)" = "Carbamidomethyl"}
Loading