diff --git a/proteobench/io/parsing/io_parse_settings/parse_settings_alphadia.toml b/proteobench/io/parsing/io_parse_settings/parse_settings_alphadia.toml index 4a30703b..c3feaf49 100644 --- a/proteobench/io/parsing/io_parse_settings/parse_settings_alphadia.toml +++ b/proteobench/io/parsing/io_parse_settings/parse_settings_alphadia.toml @@ -3,6 +3,7 @@ "run" = "Raw file" "charge" = "Charge" "intensity" = "Intensity" +"genes" = "Proteins" [condition_mapper] "LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01" = "A" diff --git a/proteobench/io/parsing/io_parse_settings/parse_settings_fragpipe_DIA.toml b/proteobench/io/parsing/io_parse_settings/parse_settings_fragpipe_DIA.toml index 2e3e5604..ee03d89e 100644 --- a/proteobench/io/parsing/io_parse_settings/parse_settings_fragpipe_DIA.toml +++ b/proteobench/io/parsing/io_parse_settings/parse_settings_fragpipe_DIA.toml @@ -29,8 +29,8 @@ "before_aa" = false "isalpha" = true "isupper" = true -"pattern"="\\[([^]]+)\\]" -"modification_dict" = {"[57.0215]" = "Carbamidomethyl", "[57.0216]" = "Carbamidomethyl", "[15.9949]" = "Oxidation", "[-17.026548]" = "Gln->pyro-Glu", "[-18.010565]" = "Glu->pyro-Glu", "[42.0106]" = "Acetyl"} +"pattern"="(?<=\\[).+?(?=\\])" +"modification_dict" = {"57.0215" = "Carbamidomethyl", "57.0216" = "Carbamidomethyl", "15.9949" = "Oxidation", "-17.026548" = "Gln->pyro-Glu", "-18.010565" = "Glu->pyro-Glu", "42.0106" = "Acetyl"} [general] "contaminant_flag" = "Cont_" diff --git a/proteobench/io/parsing/io_parse_settings/parse_settings_spectronaut.toml b/proteobench/io/parsing/io_parse_settings/parse_settings_spectronaut.toml index 47bcdb5c..ffdc0cae 100644 --- a/proteobench/io/parsing/io_parse_settings/parse_settings_spectronaut.toml +++ b/proteobench/io/parsing/io_parse_settings/parse_settings_spectronaut.toml @@ -3,6 +3,7 @@ "R.FileName" = "Raw file" "FG.Charge" = "Charge" "FG.Quantity" = "Intensity" +"PG.ProteinGroups" = "Proteins" [condition_mapper] "LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01" = "A" diff --git a/proteobench/io/parsing/parse_ion.py b/proteobench/io/parsing/parse_ion.py index 80dfcabd..917e26a8 100644 --- a/proteobench/io/parsing/parse_ion.py +++ b/proteobench/io/parsing/parse_ion.py @@ -50,7 +50,7 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame: mapper_path = os.path.join(os.path.dirname(__file__), "io_parse_settings/mapper.csv") mapper_df = pd.read_csv(mapper_path).set_index("gene_name") mapper = mapper_df["description"].to_dict() - input_data_frame["Proteins"] = input_data_frame["genes"].map( + input_data_frame["genes"] = input_data_frame["genes"].map( lambda x: ";".join([mapper[protein] if protein in mapper.keys() else protein for protein in x.split(";")]) ) input_data_frame["proforma"] = input_data_frame.apply( @@ -75,11 +75,11 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame: mapper_path = os.path.join(os.path.dirname(__file__), "io_parse_settings/mapper.csv") mapper_df = pd.read_csv(mapper_path).set_index("gene_name") mapper = mapper_df["description"].to_dict() - input_data_frame["Protein_list"] = input_data_frame["PG.ProteinGroups"].str.split(";") - input_data_frame["Proteins"] = input_data_frame["Protein_list"].map( + input_data_frame["PG.ProteinGroups"] = input_data_frame["PG.ProteinGroups"].str.split(";") + input_data_frame["PG.ProteinGroups"] = input_data_frame["PG.ProteinGroups"].map( lambda x: [mapper[protein] if protein in mapper.keys() else protein for protein in x] ) - input_data_frame["Proteins"] = input_data_frame["Proteins"].str.join(";") + input_data_frame["PG.ProteinGroups"] = input_data_frame["PG.ProteinGroups"].str.join(";") elif input_format == "MSAID": input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")