From 325f10c3d682af96ebe0793a260b93498c7f7747 Mon Sep 17 00:00:00 2001
From: RobbinBouwmeester <robbin.bouwmeester@ugent.be>
Date: Mon, 13 Nov 2023 11:18:56 +0100
Subject: [PATCH] Reintroduce CCS prediction

---
 deeplc/deeplc.py         | 53 +++++++++++++++++++++++++---------------
 deeplc/feat_extractor.py | 25 +++++++++----------
 2 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py
index b9d3b71..efbc0c9 100644
--- a/deeplc/deeplc.py
+++ b/deeplc/deeplc.py
@@ -214,6 +214,7 @@ def __init__(
         pygam_calibration=True,
         deepcallc_mod=False,
         deeplc_retrain=False,
+        predict_ccs=False,
         n_epochs=20,
     ):  
         # if a config file is defined overwrite standard parameters
@@ -278,6 +279,8 @@ def __init__(
 
         self.deepcallc_mod = deepcallc_mod
 
+        self.predict_ccs = predict_ccs
+
         if self.deepcallc_mod:
             self.write_library = False
             self.use_library = None
@@ -315,14 +318,14 @@ def do_f_extraction(self, seqs, mods, identifiers, charges=[]):
             feature matrix
         """
         list_of_psms = []
-        # TODO include charge here
-        if len(charges) > 0:
+
+        if not self.predict_ccs:
             for seq,mod,ident in zip(seqs,mods,identifiers):
                 list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
         else:
             for seq,mod,ident,z in zip(seqs,mods,identifiers,charges):
-                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
-        # TODO include charge here
+                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod,z),spectrum_id=ident))
+
         psm_list = PSMList(psm_list=list_of_psms)
 
         return self.f_extractor.full_feat_extract(psm_list)
@@ -353,8 +356,8 @@ def do_f_extraction_pd(self,
                 list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
         else:
             for seq,mod,ident,z in zip(df_instances["seq"],df_instances["modifications"],df_instances.index,charges=df_instances["charges"]):
-                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
-        # TODO include charge here
+                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident))
+
         psm_list = PSMList(psm_list=list_of_psms)
 
         return self.f_extractor.full_feat_extract(psm_list)
@@ -630,14 +633,16 @@ def make_preds(self,
         """
         if type(seq_df) == pd.core.frame.DataFrame:
             list_of_psms = []
-            # TODO include charge here
-            for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
-                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
+            if self.predict_ccs:
+                for seq,mod,ident,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["charge"]):
+                    list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident))
+            else:
+                for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
+                    list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
             psm_list = PSMList(psm_list=list_of_psms)
         
         if len(infile) > 0:
             psm_list = read_file(infile)
-            # TODO is charge included here?
             if "msms" in infile and ".txt" in infile:
                 mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict()
                 psm_list.rename_modifications(mapper)
@@ -709,8 +714,12 @@ def calibrate_preds_func_pygam(self,
         if type(seq_df) == pd.core.frame.DataFrame:
             list_of_psms = []
             # TODO include charge here
-            for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
-                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
+            if self.predict_ccs:
+                for seq,mod,ident,tr,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"],seq_df["charge"]):
+                    list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr))
+            else:
+                for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
+                    list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
             psm_list = PSMList(psm_list=list_of_psms)
 
             measured_tr = [psm.retention_time for psm in psm_list]
@@ -787,8 +796,12 @@ def calibrate_preds_func(self,
         if type(seq_df) == pd.core.frame.DataFrame:
             list_of_psms = []
             # TODO include charge here
-            for seq,mod,tr,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index):
-                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
+            if self.predict_ccs:
+                for seq,mod,tr,ident,z in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index,seq_df["charge"]):
+                    list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr))
+            else:
+                for seq,mod,tr,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index):
+                    list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
             psm_list = PSMList(psm_list=list_of_psms)
         
         measured_tr = [psm.retention_time for psm in psm_list]
@@ -933,19 +946,20 @@ def calibrate_preds(self,
         """
         if type(seq_df) == pd.core.frame.DataFrame:
             list_of_psms = []
-            # TODO include charge here
-            for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
-                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
+            if self.predict_ccs:
+                for seq,mod,ident,tr,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"],seq_df["charge"]):
+                    list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr))
+            else:
+                for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
+                    list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
             psm_list = PSMList(psm_list=list_of_psms)
         elif psm_utils_obj:
-            # TODO include charge here
             psm_list = psm_utils_obj    
 
         if isinstance(self.model, str):
             self.model = [self.model]
         
         if len(infile) > 0:
-            # TODO include charge here
             psm_list = read_file(infile)
             if "msms" in infile and ".txt" in infile:
                 mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict()
@@ -1005,7 +1019,6 @@ def calibrate_preds(self,
             self.model = models
 
         if isinstance(sample_for_calibration_curve, int):
-            # TODO include charge here
             psm_list = random.sample(list(psm_list), sample_for_calibration_curve)
             measured_tr = [psm.retention_time for psm in psm_list]
 
diff --git a/deeplc/feat_extractor.py b/deeplc/feat_extractor.py
index c7b2b0f..c1cb666 100644
--- a/deeplc/feat_extractor.py
+++ b/deeplc/feat_extractor.py
@@ -332,6 +332,7 @@ def encode_atoms(self,
                      psm_list,
                      indexes,
                      charges=[],
+                     predict_ccs=False,
                      padding_length=60,
                      positions=set([0, 1, 2, 3, -1, -2, -3, -4]),
                      positions_pos=set([0, 1, 2, 3]),
@@ -430,15 +431,12 @@ def rolling_sum(a, n=2):
         ret_list["pos_matrix"] = {}
         ret_list["matrix_hc"] = {}
 
-        # TODO Reintroduce for CCS, check CCS flag
-        #if len(charges) == 0:
-        #    charges = [-1] * len(indexes)
-
         # Iterate over all instances
         for psm,row_index in zip(psm_list,indexes):
             peptidoform = psm.peptidoform
             seq = peptidoform.sequence
             seq_len = len(seq)
+            charge = psm.get_precursor_charge()
             
             # For now anything longer than padding length is cut away
             # (C-terminal cutting)
@@ -541,14 +539,14 @@ def rolling_sum(a, n=2):
 
             matrix_all = np.sum(matrix, axis=0)
             matrix_all = np.append(matrix_all, seq_len)
-            
-            # TODO Reintroduce for CCS, check CCS flag
-            #if charge != -1:
-            #    matrix_all = np.append(matrix_all,(seq.count("H"))/float(seq_len))
-            #    matrix_all = np.append(matrix_all,(seq.count("F")+seq.count("W")+seq.count("Y"))/float(seq_len))
-            #    matrix_all = np.append(matrix_all,(seq.count("D")+seq.count("E"))/float(seq_len))
-            #    matrix_all = np.append(matrix_all,(seq.count("K")+seq.count("R"))/float(seq_len))
-            #    matrix_all = np.append(matrix_all,charge)
+
+            if predict_ccs:
+                matrix_all = np.append(matrix_all,(seq.count("H"))/float(seq_len))
+                matrix_all = np.append(matrix_all,(seq.count("F")+seq.count("W")+seq.count("Y"))/float(seq_len))
+                matrix_all = np.append(matrix_all,(seq.count("D")+seq.count("E"))/float(seq_len))
+                matrix_all = np.append(matrix_all,(seq.count("K")+seq.count("R"))/float(seq_len))
+                matrix_all = np.append(matrix_all,charge)
+
             matrix_sum = rolling_sum(matrix.T, n=2)[:, ::2].T
 
             ret_list["matrix"][row_index] = matrix 
@@ -564,6 +562,7 @@ def full_feat_extract(self,
                           psm_list=[],
                           seqs=[],
                           mods=[],
+                          predict_ccs=False,
                           identifiers=[],
                           charges=[]):
         """
@@ -621,7 +620,7 @@ def full_feat_extract(self,
             if self.verbose:
                 logger.debug("Extracting CNN features")
             X_cnn = self.encode_atoms(
-                psm_list, list(range(len(psm_list))), charges=charges)
+                psm_list, list(range(len(psm_list))), charges=charges, predict_ccs=predict_ccs)
 
         if self.cnn_feats:
             X = X_cnn