From 77fab3470de03e8ce2cb800112f423555edd6e41 Mon Sep 17 00:00:00 2001
From: Christopher Andrew Mancuso <chrismancuso1984@gmail.com>
Date: Fri, 15 Nov 2024 12:20:18 -0500
Subject: [PATCH] updated the example directory (#337)

* updated the example directory

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 example/example_download.py    | 30 ++++++++++++++
 example/example_run.py         | 20 ++++------
 example/test_download.py       | 73 ----------------------------------
 example/test_download_slurm.py | 65 ------------------------------
 geneplexus/geneplexus.py       | 13 +++---
 5 files changed, 44 insertions(+), 157 deletions(-)
 create mode 100755 example/example_download.py
 delete mode 100755 example/test_download.py
 delete mode 100755 example/test_download_slurm.py

diff --git a/example/example_download.py b/example/example_download.py
new file mode 100755
index 0000000..d21e11d
--- /dev/null
+++ b/example/example_download.py
@@ -0,0 +1,30 @@
+import os
+import os.path as osp
+import pathlib
+import shutil
+import time
+
+import numpy as np
+
+import geneplexus
+
+# Set up data directory
+homedir = pathlib.Path(__file__).absolute().parent
+datadir = osp.join(homedir, "data")
+os.makedirs(datadir, exist_ok=True)
+
+
+"""
+The data files are separated by species. To select all of them, use
+species = ["Human", "Mouse", "Fly", "Worm", "Zebrafish", "Yeast"]
+or
+species = "All"
+
+For a subset, include only the desired species (e.g., just Human and Mouse):
+species = ["Human", "Mouse"]
+"""
+
+geneplexus.download.download_select_data(
+    datadir,
+    species=["Human", "Mouse", "Fly", "Worm", "Zebrafish", "Yeast"],
+)
diff --git a/example/example_run.py b/example/example_run.py
index 3fcfe6e..b868e01 100755
--- a/example/example_run.py
+++ b/example/example_run.py
@@ -38,23 +38,19 @@
 # Get the data from URL
 geneplexus.download.download_select_data(
     datadir,
-    tasks="All",
-    networks="STRING",
-    features="SixSpeciesN2V",
-    sp_trn="Human",
-    sp_tst="Mouse",
-    gsc="GO",
+    species=["Human", "Mouse"],
 )
 
 # Run through the pipeline
 # First initialize the geneplexus object
 myclass = geneplexus.GenePlexus(
-    file_loc=fp_data,
-    gsc="Combined",
-    features="SixSpeciesN2V",
+    file_loc=datadir,
     net_type="STRING",
+    features="SixSpeciesN2V",
     sp_trn="Human",
-    sp_tst="Mouse",
+    sp_res="Mouse",
+    gsc_trn="Combined",
+    gsc_res="Combined",
 )
 
 # Load the input genes into the class and set up positives/negatives
@@ -65,7 +61,7 @@
 
 # The makes the tables that have the model weight similarity to other models
 # trained on known GO and DisGeNet sets
-df_sim_GO, df_sim_Dis, weights_GO, weights_Dis = myclass.make_sim_dfs()
+df_sim, sim_weights = myclass.make_sim_dfs()
 
 # Return an edgelist
 df_edge, isolated_genes, df_edge_sym, isolated_genes_sym = myclass.make_small_edgelist(num_nodes=50)
@@ -75,5 +71,5 @@
 
 # Save a few things for checking
 df_probs.to_csv(osp.join(outdir, "df_probs.tsv"), sep="\t", header=True, index=False)
-df_sim_GO.to_csv(osp.join(outdir, "df_sim_GO.tsv"), sep="\t", header=True, index=False)
+df_sim.to_csv(osp.join(outdir, "df_sim_GO.tsv"), sep="\t", header=True, index=False)
 df_convert_out_subset.to_csv(osp.join(outdir, "df_convert_out_subset.tsv"), sep="\t", header=True, index=False)
diff --git a/example/test_download.py b/example/test_download.py
deleted file mode 100755
index 848a98c..0000000
--- a/example/test_download.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import argparse
-import os
-import os.path as osp
-import pathlib
-import shutil
-import time
-
-import numpy as np
-
-import geneplexus
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "-dirname",
-    default="dir1",
-    type=str,
-    help="This dir will be end point",
-)
-parser.add_argument(
-    "-data_loc",
-    default="Zenodo",
-    type=str,
-    help="Where the data is stored (Azure or Zenodo)",
-)
-parser.add_argument(
-    "-amount",
-    default="full",
-    type=str,
-    help="either full or subset",
-)
-args = parser.parse_args()
-dirname = args.dirname
-data_loc = args.data_loc
-amount = args.amount
-
-dirname_full = f"{data_loc}_{amount}_{dirname}"
-
-tic = time.time()
-
-# Set up directories
-homedir = pathlib.Path(__file__).absolute().parent
-datadir = osp.join(homedir, "data", dirname_full)
-tic1 = time.time()
-if os.path.exists(datadir):
-    shutil.rmtree(datadir)
-os.makedirs(datadir)
-print("The time it took to delete/create the directory was", (time.time() - tic1) / 60)
-
-# Get the data
-if amount == "full":
-    print(f"Start downloading data and saving to: {datadir}")
-    geneplexus.download.download_select_data(
-        datadir,
-        tasks="All",
-        networks="All",
-        features="All",
-        gscs="All",
-        data_loc=data_loc,
-    )
-
-elif amount == "subset":
-    print(f"Start downloading data and saving to: {datadir}")
-    geneplexus.download.download_select_data(
-        datadir,
-        tasks="All",
-        networks="BioGRID",
-        features="Embedding",
-        gscs="GO",
-        data_loc=data_loc,
-    )
-
-print("Done downlaoding")
-print("The time it took in minutes was", (time.time() - tic) / 60)
diff --git a/example/test_download_slurm.py b/example/test_download_slurm.py
deleted file mode 100755
index 98fa8ab..0000000
--- a/example/test_download_slurm.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import datetime
-import glob
-import os
-import time
-from subprocess import PIPE
-from subprocess import Popen
-
-import numpy as np
-
-tic = time.time()
-
-myloc = "Zenodo"
-myamount = "full"
-num_runs = 5
-
-jobs_final = []
-for i in range(num_runs):
-    jobs_final.append([myloc, myamount, f"dir{i}"])
-
-# dir to put the slurm files
-slurm_dir = f"/mnt/home/mancus16/GenePlexus/speedtest/zipped/{myloc}/"
-
-for idx, ajob in enumerate(jobs_final):
-    # make the script
-    mylist = ["#!/bin/bash"]
-    mylist.append("### define resources needed:")
-    mylist.append("#SBATCH --time=03:50:00")
-    mylist.append("#SBATCH --nodes=1")
-    mylist.append("#SBATCH --cpus-per-task=1")
-    mylist.append("#SBATCH --mem=50G")
-    mylist.append("#SBATCH --job-name={}-{}-{}".format(ajob[0], ajob[1], ajob[2]))
-    mylist.append("#SBATCH --account=mancuso")
-    mylist.append("#SBATCH --output=%sslurm-%%x-%%j.out" % slurm_dir)
-    mylist.append("cd")
-    mylist.append("module load powertools")
-    mylist.append("umask g+rw")
-    mylist.append("module use /mnt/research/compbio/krishnanlab/modules/")
-    mylist.append("source .bashrc")
-    mylist.append('export PATH="/mnt/home/mancus16/software/anaconda3/bin:$PATH"')
-    mylist.append("conda activate pygpdpwnloadtest")
-    mylist.append("cd /mnt/home/mancus16/GenePlexus/PyGeneplexus/example")
-    mylist.append("which python")
-    mylist.append(f"python test_download.py -data_loc {ajob[0]} -amount {ajob[1]} -dirname {ajob[2]}")
-
-    with open(slurm_dir + "{}-{}-{}.sb".format(ajob[0], ajob[1], ajob[2]), "w") as thefile:
-        for item in mylist:
-            thefile.write("%s\n" % item)
-
-    os.system("sbatch " + slurm_dir + "{}-{}-{}.sb".format(ajob[0], ajob[1], ajob[2]))
-
-    p1 = Popen(["squeue", "-u", "mancus16"], stdout=PIPE)
-    p2 = Popen(["wc", "-l"], stdin=p1.stdout, stdout=PIPE)
-    # njobs has like 7 extra lines becuase of the header, but it is good to over estimate the number of jobs in the queue
-    njobs, err = p2.communicate()
-    njobs = int(njobs)
-
-    while njobs > 800:
-        print("More than 800 jobs in queue")
-        time.sleep(360)
-        p1 = Popen(["squeue", "-u", "mancus16"], stdout=PIPE)
-        p2 = Popen(["wc", "-l"], stdin=p1.stdout, stdout=PIPE)
-        njobs, err = p2.communicate()
-        njobs = int(njobs)
-
-print("This script took %i minutes to run " % ((time.time() - tic) / 60))
diff --git a/geneplexus/geneplexus.py b/geneplexus/geneplexus.py
index 0a63d4a..71bd535 100755
--- a/geneplexus/geneplexus.py
+++ b/geneplexus/geneplexus.py
@@ -382,12 +382,6 @@ def fit_and_predict(
 
         :attr:`GenePlexus.mdl_weights` (array of float)
             Trained model parameters.
-        :attr:`GenePlexus.probs` (array of float)
-            Genome-wide gene prediction scores. A high value indicates the
-            relevance of the gene to the input gene list.
-        :attr:`GenePlexus.avgps` (array of float)
-            Cross validation results. Performance is measured using
-            log2(auprc/prior).
         :attr:`GenePlexus.df_probs` (DataFrame)
             A table with 7 columns: **Entrez** (the gene Entrez ID), **Symbol**
             (the gene Symbol), **Name** (the gene Name), **Probability** (the
@@ -395,7 +389,12 @@ def fit_and_predict(
             **Known/Novel** (whether the gene is in the input gene list),
             **Class-Label** (positive, negative, or neutral), **Rank** (rank of
             relevance of the gene to the input gene list).
-
+        :attr:`GenePlexus.avgps` (array of float)
+            Cross validation results. Performance is measured using
+            log2(auprc/prior).
+        :attr:`GenePlexus.probs` (array of float)
+            Genome-wide gene prediction scores. A high value indicates the
+            relevance of the gene to the input gene list.
         """
         self._get_pos_and_neg_genes()
         self.mdl_weights, self.probs, self.avgps = _geneplexus._run_sl(