From 325f10c3d682af96ebe0793a260b93498c7f7747 Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Mon, 13 Nov 2023 11:18:56 +0100 Subject: [PATCH] Reintroduce CCS predictie --- deeplc/deeplc.py | 53 +++++++++++++++++++++++++--------------- deeplc/feat_extractor.py | 25 +++++++++---------- 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index b9d3b71..efbc0c9 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -214,6 +214,7 @@ def __init__( pygam_calibration=True, deepcallc_mod=False, deeplc_retrain=False, + predict_ccs=False, n_epochs=20, ): # if a config file is defined overwrite standard parameters @@ -278,6 +279,8 @@ def __init__( self.deepcallc_mod = deepcallc_mod + self.predict_ccs = predict_ccs + if self.deepcallc_mod: self.write_library = False self.use_library = None @@ -315,14 +318,14 @@ def do_f_extraction(self, seqs, mods, identifiers, charges=[]): feature matrix """ list_of_psms = [] - # TODO include charge here - if len(charges) > 0: + + if not self.predict_ccs: for seq,mod,ident in zip(seqs,mods,identifiers): list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident)) else: for seq,mod,ident,z in zip(seqs,mods,identifiers,charges): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident)) - # TODO include charge here + list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod,z),spectrum_id=ident)) + psm_list = PSMList(psm_list=list_of_psms) return self.f_extractor.full_feat_extract(psm_list) @@ -353,8 +356,8 @@ def do_f_extraction_pd(self, list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident)) else: for seq,mod,ident,z in zip(df_instances["seq"],df_instances["modifications"],df_instances.index,charges=df_instances["charges"]): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident)) - # TODO include charge here + list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident)) + psm_list = PSMList(psm_list=list_of_psms) return self.f_extractor.full_feat_extract(psm_list) @@ -630,14 +633,16 @@ def make_preds(self, """ if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] - # TODO include charge here - for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident)) + if self.predict_ccs: + for seq,mod,ident,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["charge"]): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident)) + else: + for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident)) psm_list = PSMList(psm_list=list_of_psms) if len(infile) > 0: psm_list = read_file(infile) - # TODO is charge included here? if "msms" in infile and ".txt" in infile: mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict() psm_list.rename_modifications(mapper) @@ -709,8 +714,12 @@ def calibrate_preds_func_pygam(self, if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] # TODO include charge here - for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) + if self.predict_ccs: + for seq,mod,ident,tr,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"],seq_df["charge"]): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr)) + else: + for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) psm_list = PSMList(psm_list=list_of_psms) measured_tr = [psm.retention_time for psm in psm_list] @@ -787,8 +796,12 @@ def calibrate_preds_func(self, if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] # TODO include charge here - for seq,mod,tr,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) + if self.predict_ccs: + for seq,mod,tr,ident,z in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index,seq_df["charge"]): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr)) + else: + for seq,mod,tr,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) psm_list = PSMList(psm_list=list_of_psms) measured_tr = [psm.retention_time for psm in psm_list] @@ -933,19 +946,20 @@ def calibrate_preds(self, """ if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] - # TODO include charge here - for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) + if self.predict_ccs: + for seq,mod,ident,tr,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"],seq_df["charge"]): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr)) + else: + for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) psm_list = PSMList(psm_list=list_of_psms) elif psm_utils_obj: - # TODO include charge here psm_list = psm_utils_obj if isinstance(self.model, str): self.model = [self.model] if len(infile) > 0: - # TODO include charge here psm_list = read_file(infile) if "msms" in infile and ".txt" in infile: mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict() @@ -1005,7 +1019,6 @@ def calibrate_preds(self, self.model = models if isinstance(sample_for_calibration_curve, int): - # TODO include charge here psm_list = random.sample(list(psm_list), sample_for_calibration_curve) measured_tr = [psm.retention_time for psm in psm_list] diff --git a/deeplc/feat_extractor.py b/deeplc/feat_extractor.py index c7b2b0f..c1cb666 100644 --- a/deeplc/feat_extractor.py +++ b/deeplc/feat_extractor.py @@ -332,6 +332,7 @@ def encode_atoms(self, psm_list, indexes, charges=[], + predict_ccs=False, padding_length=60, positions=set([0, 1, 2, 3, -1, -2, -3, -4]), positions_pos=set([0, 1, 2, 3]), @@ -430,15 +431,12 @@ def rolling_sum(a, n=2): ret_list["pos_matrix"] = {} ret_list["matrix_hc"] = {} - # TODO Reintroduce for CCS, check CCS flag - #if len(charges) == 0: - # charges = [-1] * len(indexes) - # Iterate over all instances for psm,row_index in zip(psm_list,indexes): peptidoform = psm.peptidoform seq = peptidoform.sequence seq_len = len(seq) + charge = psm.get_precursor_charge() # For now anything longer than padding length is cut away # (C-terminal cutting) @@ -541,14 +539,14 @@ def rolling_sum(a, n=2): matrix_all = np.sum(matrix, axis=0) matrix_all = np.append(matrix_all, seq_len) - - # TODO Reintroduce for CCS, check CCS flag - #if charge != -1: - # matrix_all = np.append(matrix_all,(seq.count("H"))/float(seq_len)) - # matrix_all = np.append(matrix_all,(seq.count("F")+seq.count("W")+seq.count("Y"))/float(seq_len)) - # matrix_all = np.append(matrix_all,(seq.count("D")+seq.count("E"))/float(seq_len)) - # matrix_all = np.append(matrix_all,(seq.count("K")+seq.count("R"))/float(seq_len)) - # matrix_all = np.append(matrix_all,charge) + + if predict_ccs: + matrix_all = np.append(matrix_all,(seq.count("H"))/float(seq_len)) + matrix_all = np.append(matrix_all,(seq.count("F")+seq.count("W")+seq.count("Y"))/float(seq_len)) + matrix_all = np.append(matrix_all,(seq.count("D")+seq.count("E"))/float(seq_len)) + matrix_all = np.append(matrix_all,(seq.count("K")+seq.count("R"))/float(seq_len)) + matrix_all = np.append(matrix_all,charge) + matrix_sum = rolling_sum(matrix.T, n=2)[:, ::2].T ret_list["matrix"][row_index] = matrix @@ -564,6 +562,7 @@ def full_feat_extract(self, psm_list=[], seqs=[], mods=[], + predict_ccs=False, identifiers=[], charges=[]): """ @@ -621,7 +620,7 @@ def full_feat_extract(self, if self.verbose: logger.debug("Extracting CNN features") X_cnn = self.encode_atoms( - psm_list, list(range(len(psm_list))), charges=charges) + psm_list, list(range(len(psm_list))), charges=charges, predict_ccs=predict_ccs) if self.cnn_feats: X = X_cnn