From 77fab3470de03e8ce2cb800112f423555edd6e41 Mon Sep 17 00:00:00 2001 From: Christopher Andrew Mancuso Date: Fri, 15 Nov 2024 12:20:18 -0500 Subject: [PATCH] updated the example directory (#337) * updated the example directory * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- example/example_download.py | 30 ++++++++++++++ example/example_run.py | 20 ++++------ example/test_download.py | 73 ---------------------------------- example/test_download_slurm.py | 65 ------------------------------ geneplexus/geneplexus.py | 13 +++--- 5 files changed, 44 insertions(+), 157 deletions(-) create mode 100755 example/example_download.py delete mode 100755 example/test_download.py delete mode 100755 example/test_download_slurm.py diff --git a/example/example_download.py b/example/example_download.py new file mode 100755 index 0000000..d21e11d --- /dev/null +++ b/example/example_download.py @@ -0,0 +1,30 @@ +import os +import os.path as osp +import pathlib +import shutil +import time + +import numpy as np + +import geneplexus + +# Set up data directory +homedir = pathlib.Path(__file__).absolute().parent +datadir = osp.join(homedir, "data") +os.makedirs(datadir, exist_ok=True) + + +""" +each file is separated by the species. Can select all by using +species = ["Human", "Mouse", "Fly", "Worm", "Zebrafish", "Yeast"] +or +species = "All" + +for a subset just include desired species (example for just Mouse and Human) +species = ["Human", "Mouse"] +""" + +geneplexus.download.download_select_data( + datadir, + species=["Human", "Mouse", "Fly", "Worm", "Zebrafish", "Yeast"], +) diff --git a/example/example_run.py b/example/example_run.py index 3fcfe6e..b868e01 100755 --- a/example/example_run.py +++ b/example/example_run.py @@ -38,23 +38,19 @@ # Get the data from URL geneplexus.download.download_select_data( datadir, - tasks="All", - networks="STRING", - features="SixSpeciesN2V", - sp_trn="Human", - sp_tst="Mouse", - gsc="GO", + species=["Human", "Mouse"], ) # Run through the pipeline # First initialize the geneplexus object myclass = geneplexus.GenePlexus( - file_loc=fp_data, - gsc="Combined", - features="SixSpeciesN2V", + file_loc=datadir, net_type="STRING", + features="SixSpeciesN2V", sp_trn="Human", - sp_tst="Mouse", + sp_res="Mouse", + gsc_trn="Combined", + gsc_res="Combined", ) # Load the input genes into the class and set up positives/negatives @@ -65,7 +61,7 @@ # The makes the tables that have the model weight similarity to other models # trained on known GO and DisGeNet sets -df_sim_GO, df_sim_Dis, weights_GO, weights_Dis = myclass.make_sim_dfs() +df_sim, sim_weights = myclass.make_sim_dfs() # Return an edgelist df_edge, isolated_genes, df_edge_sym, isolated_genes_sym = myclass.make_small_edgelist(num_nodes=50) @@ -75,5 +71,5 @@ # Save a few things for checking df_probs.to_csv(osp.join(outdir, "df_probs.tsv"), sep="\t", header=True, index=False) -df_sim_GO.to_csv(osp.join(outdir, "df_sim_GO.tsv"), sep="\t", header=True, index=False) +df_sim.to_csv(osp.join(outdir, "df_sim_GO.tsv"), sep="\t", header=True, index=False) df_convert_out_subset.to_csv(osp.join(outdir, "df_convert_out_subset.tsv"), sep="\t", header=True, index=False) diff --git a/example/test_download.py b/example/test_download.py deleted file mode 100755 index 848a98c..0000000 --- a/example/test_download.py +++ /dev/null @@ -1,73 +0,0 @@ -import argparse -import os -import os.path as osp -import pathlib -import shutil -import time - -import numpy as np - -import geneplexus - -parser = argparse.ArgumentParser() -parser.add_argument( - "-dirname", - default="dir1", - type=str, - help="This dir will be end point", -) -parser.add_argument( - "-data_loc", - default="Zenodo", - type=str, - help="Where the data is stored (Azure or Zenodo)", -) -parser.add_argument( - "-amount", - default="full", - type=str, - help="either full or subset", -) -args = parser.parse_args() -dirname = args.dirname -data_loc = args.data_loc -amount = args.amount - -dirname_full = f"{data_loc}_{amount}_{dirname}" - -tic = time.time() - -# Set up directories -homedir = pathlib.Path(__file__).absolute().parent -datadir = osp.join(homedir, "data", dirname_full) -tic1 = time.time() -if os.path.exists(datadir): - shutil.rmtree(datadir) -os.makedirs(datadir) -print("The time it took to delete/create the directory was", (time.time() - tic1) / 60) - -# Get the data -if amount == "full": - print(f"Start downloading data and saving to: {datadir}") - geneplexus.download.download_select_data( - datadir, - tasks="All", - networks="All", - features="All", - gscs="All", - data_loc=data_loc, - ) - -elif amount == "subset": - print(f"Start downloading data and saving to: {datadir}") - geneplexus.download.download_select_data( - datadir, - tasks="All", - networks="BioGRID", - features="Embedding", - gscs="GO", - data_loc=data_loc, - ) - -print("Done downlaoding") -print("The time it took in minutes was", (time.time() - tic) / 60) diff --git a/example/test_download_slurm.py b/example/test_download_slurm.py deleted file mode 100755 index 98fa8ab..0000000 --- a/example/test_download_slurm.py +++ /dev/null @@ -1,65 +0,0 @@ -import datetime -import glob -import os -import time -from subprocess import PIPE -from subprocess import Popen - -import numpy as np - -tic = time.time() - -myloc = "Zenodo" -myamount = "full" -num_runs = 5 - -jobs_final = [] -for i in range(num_runs): - jobs_final.append([myloc, myamount, f"dir{i}"]) - -# dir to put the slurm files -slurm_dir = f"/mnt/home/mancus16/GenePlexus/speedtest/zipped/{myloc}/" - -for idx, ajob in enumerate(jobs_final): - # make the script - mylist = ["#!/bin/bash"] - mylist.append("### define resources needed:") - mylist.append("#SBATCH --time=03:50:00") - mylist.append("#SBATCH --nodes=1") - mylist.append("#SBATCH --cpus-per-task=1") - mylist.append("#SBATCH --mem=50G") - mylist.append("#SBATCH --job-name={}-{}-{}".format(ajob[0], ajob[1], ajob[2])) - mylist.append("#SBATCH --account=mancuso") - mylist.append("#SBATCH --output=%sslurm-%%x-%%j.out" % slurm_dir) - mylist.append("cd") - mylist.append("module load powertools") - mylist.append("umask g+rw") - mylist.append("module use /mnt/research/compbio/krishnanlab/modules/") - mylist.append("source .bashrc") - mylist.append('export PATH="/mnt/home/mancus16/software/anaconda3/bin:$PATH"') - mylist.append("conda activate pygpdpwnloadtest") - mylist.append("cd /mnt/home/mancus16/GenePlexus/PyGeneplexus/example") - mylist.append("which python") - mylist.append(f"python test_download.py -data_loc {ajob[0]} -amount {ajob[1]} -dirname {ajob[2]}") - - with open(slurm_dir + "{}-{}-{}.sb".format(ajob[0], ajob[1], ajob[2]), "w") as thefile: - for item in mylist: - thefile.write("%s\n" % item) - - os.system("sbatch " + slurm_dir + "{}-{}-{}.sb".format(ajob[0], ajob[1], ajob[2])) - - p1 = Popen(["squeue", "-u", "mancus16"], stdout=PIPE) - p2 = Popen(["wc", "-l"], stdin=p1.stdout, stdout=PIPE) - # njobs has like 7 extra lines becuase of the header, but it is good to over estimate the number of jobs in the queue - njobs, err = p2.communicate() - njobs = int(njobs) - - while njobs > 800: - print("More than 800 jobs in queue") - time.sleep(360) - p1 = Popen(["squeue", "-u", "mancus16"], stdout=PIPE) - p2 = Popen(["wc", "-l"], stdin=p1.stdout, stdout=PIPE) - njobs, err = p2.communicate() - njobs = int(njobs) - -print("This script took %i minutes to run " % ((time.time() - tic) / 60)) diff --git a/geneplexus/geneplexus.py b/geneplexus/geneplexus.py index 0a63d4a..71bd535 100755 --- a/geneplexus/geneplexus.py +++ b/geneplexus/geneplexus.py @@ -382,12 +382,6 @@ def fit_and_predict( :attr:`GenePlexus.mdl_weights` (array of float) Trained model parameters. - :attr:`GenePlexus.probs` (array of float) - Genome-wide gene prediction scores. A high value indicates the - relevance of the gene to the input gene list. - :attr:`GenePlexus.avgps` (array of float) - Cross validation results. Performance is measured using - log2(auprc/prior). :attr:`GenePlexus.df_probs` (DataFrame) A table with 7 columns: **Entrez** (the gene Entrez ID), **Symbol** (the gene Symbol), **Name** (the gene Name), **Probability** (the @@ -395,7 +389,12 @@ def fit_and_predict( **Known/Novel** (whether the gene is in the input gene list), **Class-Label** (positive, negative, or neutral), **Rank** (rank of relevance of the gene to the input gene list). - + :attr:`GenePlexus.avgps` (array of float) + Cross validation results. Performance is measured using + log2(auprc/prior). + :attr:`GenePlexus.probs` (array of float) + Genome-wide gene prediction scores. A high value indicates the + relevance of the gene to the input gene list. """ self._get_pos_and_neg_genes() self.mdl_weights, self.probs, self.avgps = _geneplexus._run_sl(