From e63fb08bda91a5ed9d9440efbc1a3cd87b44ab4b Mon Sep 17 00:00:00 2001 From: Jerome Date: Mon, 12 Feb 2024 14:01:37 -0800 Subject: [PATCH 1/4] Update mmsplice.yml removed spliceAI dependency. The code MMsplice.py only calls mmsplice --- envs/mmsplice.yml | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/envs/mmsplice.yml b/envs/mmsplice.yml index 8b3092f..2b8cb44 100644 --- a/envs/mmsplice.yml +++ b/envs/mmsplice.yml @@ -6,19 +6,17 @@ channels: - defaults dependencies: - python - - cython=0.29.13 - - pyranges=0.0.51 - - libgcc=7.2.0 + - cython + - libgcc + - keras - tensorflow - - keras=2.2.4 - - numpy=1.16.1 - - scikit-learn - - cyvcf2=0.8.4 - - pandas<0.25.0 + - numpy + - pandas - pysam - htslib - pip - pip: - - kipoiseq==0.2.5 - - git+https://github.com/Aerval/SpliceAI.git - - mmsplice==1.0.1 + - mmsplice + - cyvcf2 + - pyranges + - kipoiseq From 30edf815f3e8e8e95e7896c225c56b763b7fced0 Mon Sep 17 00:00:00 2001 From: Jerome Date: Mon, 12 Feb 2024 14:03:38 -0800 Subject: [PATCH 2/4] Update MMSplice.py to remove concise.encodeDNA dependency removed legacy dependency to concise.encodeDNA. mmsplice bumps to v2.4.0. --- src/scripts/lib/tools/MMSplice.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/scripts/lib/tools/MMSplice.py b/src/scripts/lib/tools/MMSplice.py index d9cf835..cafd562 100644 --- a/src/scripts/lib/tools/MMSplice.py +++ b/src/scripts/lib/tools/MMSplice.py @@ -2,11 +2,12 @@ # Import from tqdm import tqdm -from concise.preprocessing import encodeDNA +# from concise.preprocessing import encodeDNA from mmsplice.vcf_dataloader import SplicingVCFDataloader from mmsplice import MMSplice from mmsplice.utils import logit, predict_deltaLogitPsi, \ - predict_pathogenicity, predict_splicing_efficiency + predict_pathogenicity, predict_splicing_efficiency, \ + encodeDNA import pandas as pd import numpy as np From 862146861245d0f377f9b36962bbb8463e1e980d Mon Sep 17 00:00:00 2001 From: Jerome Date: Mon, 12 Feb 2024 23:23:34 -0800 Subject: [PATCH 3/4] Update MMSplice.py remove redundant functions --- src/scripts/lib/tools/MMSplice.py | 101 +----------------------------- 1 file changed, 2 insertions(+), 99 deletions(-) diff --git a/src/scripts/lib/tools/MMSplice.py b/src/scripts/lib/tools/MMSplice.py index cafd562..0d20665 100644 --- a/src/scripts/lib/tools/MMSplice.py +++ b/src/scripts/lib/tools/MMSplice.py @@ -4,10 +4,7 @@ from tqdm import tqdm # from concise.preprocessing import encodeDNA from mmsplice.vcf_dataloader import SplicingVCFDataloader -from mmsplice import MMSplice -from mmsplice.utils import logit, predict_deltaLogitPsi, \ - predict_pathogenicity, predict_splicing_efficiency, \ - encodeDNA +from mmsplice import MMSplice, predict_all_table import pandas as pd import numpy as np @@ -28,100 +25,6 @@ def max_geneEff(df): df_max = df_max.drop_duplicates(subset=['ID', 'gene_name', 'delta_logit_psi']) return df_max -def predict_batch_fast(model, dataloader, batch_size=512, progress=True, - splicing_efficiency=False): - """ - Return the prediction as a table - Args: - model: mmsplice model object. - dataloader: dataloader object. - progress: show progress bar. - splicing_efficiency: adds splicing_efficiency prediction as column - Returns: - iterator of pd.DataFrame of modular prediction, delta_logit_psi, - splicing_efficiency, pathogenicity. - """ - dataloader.encode = False - dt_iter = dataloader.batch_iter(batch_size=batch_size) - if progress: - dt_iter = tqdm(dt_iter) - - ref_cols = ['ref_acceptorIntron', 'ref_acceptor', - 'ref_exon', 'ref_donor', 'ref_donorIntron'] - alt_cols = ['alt_acceptorIntron', 'alt_acceptor', - 'alt_exon', 'alt_donor', 'alt_donorIntron'] - - cat_list = ['acceptor_intron', 'acceptor', 'exon', 'donor', 'donor_intron'] - cats = {'acceptor_intron': lambda x: model.acceptor_intronM.predict(x), - 'acceptor': lambda x: logit(model.acceptorM.predict(x)), - 'exon': lambda x: model.exonM.predict(x), - 'donor': lambda x: logit(model.donorM.predict(x)), - 'donor_intron': lambda x: model.donor_intronM.predict(x)} - - for batch in dt_iter: - refs,alts = {},{} - for cat, model_eval in cats.items(): - alterations = batch['inputs']['seq'][cat] != \ - batch['inputs']['mut_seq'][cat] - if np.any(alterations): - sequences = list(set([str(s) for s in list(batch['inputs']['seq'][cat][alterations]) + \ - list(batch['inputs']['mut_seq'][cat][alterations])])) - prediction = model_eval(encodeDNA(sequences)).flatten() - pred_dict = {s: p for s, p in zip(sequences, prediction)} - - refs[cat] = [pred_dict[batch['inputs']['seq'][cat][i]] \ - if a else 0 for i, a in enumerate(alterations)] - alts[cat] = [pred_dict[batch['inputs']['mut_seq'][cat][i]] \ - if a else 0 for i, a in enumerate(alterations)] - else: - refs[cat] = [0] * len(alterations) - alts[cat] = [0] * len(alterations) - X_ref = np.array([refs[cat] for cat in cat_list]).T - X_alt = np.array([alts[cat] for cat in cat_list]).T - ref_pred = pd.DataFrame(X_ref, columns=ref_cols) - alt_pred = pd.DataFrame(X_alt, columns=alt_cols) - - df = pd.DataFrame({ - 'ID': batch['metadata']['variant']['STR'], - 'exons': batch['metadata']['exon']['annotation'], - }) - for k in ['exon_id', 'gene_id', 'gene_name', 'transcript_id']: - if k in batch['metadata']['exon']: - df[k] = batch['metadata']['exon'][k] - - df['delta_logit_psi'] = predict_deltaLogitPsi(X_ref, X_alt) - df = pd.concat([df, ref_pred, alt_pred], axis=1) - - # pathogenicity does not work - #if pathogenicity: - # df['pathogenicity'] = predict_pathogenicity(X_ref, X_alt) - - if splicing_efficiency: - df['efficiency'] = predict_splicing_efficiency(X_ref, X_alt) - - yield df - -def predict_table_fast(model, - dataloader, - batch_size=512, - progress=True, - pathogenicity=False, - splicing_efficiency=False): - """ - Return the prediction as a table - Args: - model: mmsplice model object. - dataloader: dataloader object. - progress: show progress bar. - splicing_efficiency: adds splicing_efficiency prediction as column - Returns: - pd.DataFrame of modular prediction, delta_logit_psi, splicing_efficiency, - pathogenicity. - """ - return pd.concat(predict_batch_fast(model, dataloader, batch_size=batch_size, - progress=progress, - splicing_efficiency=splicing_efficiency)) - parser = ArgumentParser(description="%prog name") parser.add_argument("-o", "--output", dest="output", type=str, help="Output vcf file {default stdout}") @@ -143,7 +46,7 @@ def predict_table_fast(model, try: # Do prediction - predictions = predict_table_fast(model, dl, batch_size=512) + predictions = predict_all_table(model, dl, batch_size=512) # Summerize with maximum effect size predictionsMax = max_geneEff(predictions) From 997c866868c6dad1eda5c85ac806fa521c6c9815 Mon Sep 17 00:00:00 2001 From: Jerome Date: Mon, 12 Feb 2024 23:25:33 -0800 Subject: [PATCH 4/4] Update mmsplice.yml mmsplice via pip takes care of tensorflow and keras --- envs/mmsplice.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/envs/mmsplice.yml b/envs/mmsplice.yml index 2b8cb44..df2c536 100644 --- a/envs/mmsplice.yml +++ b/envs/mmsplice.yml @@ -8,8 +8,6 @@ dependencies: - python - cython - libgcc - - keras - - tensorflow - numpy - pandas - pysam