Skip to content

Commit

Permalink
Reintroduce CCS prediction
Browse files: browse the repository at this point in the history
  • Loading branch information
RobbinBouwmeester committed Nov 13, 2023
1 parent daa1e46 commit 325f10c
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 33 deletions.
53 changes: 33 additions & 20 deletions deeplc/deeplc.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -214,6 +214,7 @@ def __init__(
pygam_calibration=True,
deepcallc_mod=False,
deeplc_retrain=False,
predict_ccs=False,
n_epochs=20,
):
# if a config file is defined overwrite standard parameters
Expand Down Expand Up @@ -278,6 +279,8 @@ def __init__(

self.deepcallc_mod = deepcallc_mod

self.predict_ccs = predict_ccs

if self.deepcallc_mod:
self.write_library = False
self.use_library = None
Expand Down Expand Up @@ -315,14 +318,14 @@ def do_f_extraction(self, seqs, mods, identifiers, charges=[]):
feature matrix
"""
list_of_psms = []
# TODO include charge here
if len(charges) > 0:

if not self.predict_ccs:
for seq,mod,ident in zip(seqs,mods,identifiers):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
else:
for seq,mod,ident,z in zip(seqs,mods,identifiers,charges):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
# TODO include charge here
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod,z),spectrum_id=ident))

psm_list = PSMList(psm_list=list_of_psms)

return self.f_extractor.full_feat_extract(psm_list)
Expand Down Expand Up @@ -353,8 +356,8 @@ def do_f_extraction_pd(self,
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
else:
for seq,mod,ident,z in zip(df_instances["seq"],df_instances["modifications"],df_instances.index,charges=df_instances["charges"]):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
# TODO include charge here
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident))

psm_list = PSMList(psm_list=list_of_psms)

return self.f_extractor.full_feat_extract(psm_list)
Expand Down Expand Up @@ -630,14 +633,16 @@ def make_preds(self,
"""
if type(seq_df) == pd.core.frame.DataFrame:
list_of_psms = []
# TODO include charge here
for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
if self.predict_ccs:
for seq,mod,ident,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["charge"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident))
else:
for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
psm_list = PSMList(psm_list=list_of_psms)

if len(infile) > 0:
psm_list = read_file(infile)
# TODO is charge included here?
if "msms" in infile and ".txt" in infile:
mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict()
psm_list.rename_modifications(mapper)
Expand Down Expand Up @@ -709,8 +714,12 @@ def calibrate_preds_func_pygam(self,
if type(seq_df) == pd.core.frame.DataFrame:
list_of_psms = []
# TODO include charge here
for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
if self.predict_ccs:
for seq,mod,ident,tr,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"],seq_df["charge"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr))
else:
for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
psm_list = PSMList(psm_list=list_of_psms)

measured_tr = [psm.retention_time for psm in psm_list]
Expand Down Expand Up @@ -787,8 +796,12 @@ def calibrate_preds_func(self,
if type(seq_df) == pd.core.frame.DataFrame:
list_of_psms = []
# TODO include charge here
for seq,mod,tr,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
if self.predict_ccs:
for seq,mod,tr,ident,z in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index,seq_df["charge"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr))
else:
for seq,mod,tr,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
psm_list = PSMList(psm_list=list_of_psms)

measured_tr = [psm.retention_time for psm in psm_list]
Expand Down Expand Up @@ -933,19 +946,20 @@ def calibrate_preds(self,
"""
if type(seq_df) == pd.core.frame.DataFrame:
list_of_psms = []
# TODO include charge here
for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
if self.predict_ccs:
for seq,mod,ident,tr,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"],seq_df["charge"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr))
else:
for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
psm_list = PSMList(psm_list=list_of_psms)
elif psm_utils_obj:
# TODO include charge here
psm_list = psm_utils_obj

if isinstance(self.model, str):
self.model = [self.model]

if len(infile) > 0:
# TODO include charge here
psm_list = read_file(infile)
if "msms" in infile and ".txt" in infile:
mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict()
Expand Down Expand Up @@ -1005,7 +1019,6 @@ def calibrate_preds(self,
self.model = models

if isinstance(sample_for_calibration_curve, int):
# TODO include charge here
psm_list = random.sample(list(psm_list), sample_for_calibration_curve)
measured_tr = [psm.retention_time for psm in psm_list]

Expand Down
25 changes: 12 additions & 13 deletions deeplc/feat_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ def encode_atoms(self,
psm_list,
indexes,
charges=[],
predict_ccs=False,
padding_length=60,
positions=set([0, 1, 2, 3, -1, -2, -3, -4]),
positions_pos=set([0, 1, 2, 3]),
Expand Down Expand Up @@ -430,15 +431,12 @@ def rolling_sum(a, n=2):
ret_list["pos_matrix"] = {}
ret_list["matrix_hc"] = {}

# TODO Reintroduce for CCS, check CCS flag
#if len(charges) == 0:
# charges = [-1] * len(indexes)

# Iterate over all instances
for psm,row_index in zip(psm_list,indexes):
peptidoform = psm.peptidoform
seq = peptidoform.sequence
seq_len = len(seq)
charge = psm.get_precursor_charge()

# For now anything longer than padding length is cut away
# (C-terminal cutting)
Expand Down Expand Up @@ -541,14 +539,14 @@ def rolling_sum(a, n=2):

matrix_all = np.sum(matrix, axis=0)
matrix_all = np.append(matrix_all, seq_len)
# TODO Reintroduce for CCS, check CCS flag
#if charge != -1:
# matrix_all = np.append(matrix_all,(seq.count("H"))/float(seq_len))
# matrix_all = np.append(matrix_all,(seq.count("F")+seq.count("W")+seq.count("Y"))/float(seq_len))
# matrix_all = np.append(matrix_all,(seq.count("D")+seq.count("E"))/float(seq_len))
# matrix_all = np.append(matrix_all,(seq.count("K")+seq.count("R"))/float(seq_len))
# matrix_all = np.append(matrix_all,charge)

if predict_ccs:
matrix_all = np.append(matrix_all,(seq.count("H"))/float(seq_len))
matrix_all = np.append(matrix_all,(seq.count("F")+seq.count("W")+seq.count("Y"))/float(seq_len))
matrix_all = np.append(matrix_all,(seq.count("D")+seq.count("E"))/float(seq_len))
matrix_all = np.append(matrix_all,(seq.count("K")+seq.count("R"))/float(seq_len))
matrix_all = np.append(matrix_all,charge)

matrix_sum = rolling_sum(matrix.T, n=2)[:, ::2].T

ret_list["matrix"][row_index] = matrix
Expand All @@ -564,6 +562,7 @@ def full_feat_extract(self,
psm_list=[],
seqs=[],
mods=[],
predict_ccs=False,
identifiers=[],
charges=[]):
"""
Expand Down Expand Up @@ -621,7 +620,7 @@ def full_feat_extract(self,
if self.verbose:
logger.debug("Extracting CNN features")
X_cnn = self.encode_atoms(
psm_list, list(range(len(psm_list))), charges=charges)
psm_list, list(range(len(psm_list))), charges=charges, predict_ccs=predict_ccs)

if self.cnn_feats:
X = X_cnn
Expand Down

0 comments on commit 325f10c

Please sign in to comment.