Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

multiple updates to CLI #331

Merged
Merged 2 commits on Oct 31, 2024.
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 77 additions & 40 deletions geneplexus/cli.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
"""Command line interface for the GenePlexus pipeline."""
import argparse
import atexit
import json
import os
import os.path as osp
import pathlib
Expand Down Expand Up @@ -37,7 +38,7 @@ def parse_args() -> argparse.Namespace:
"--input_file",
metavar="",
required=True,
help="Input gene list (.txt) file (one gene per line).",
help="Input gene list (.txt) file.",
)

parser.add_argument(
Expand All @@ -50,6 +51,15 @@ def parse_args() -> argparse.Namespace:
"tabs. Other generic separator are also supported, e.g. ', '.",
)

parser.add_argument(
"-dd",
"--data_dir",
default=None,
metavar="",
help="Directory in which the data are stored, if set to None, then use "
"the default data directory ~/.data/geneplexus",
)

parser.add_argument(
"-n",
"--network",
Expand All @@ -72,24 +82,31 @@ def parse_args() -> argparse.Namespace:
"--sp_trn",
default="Human",
metavar="",
help="Species of training data {format_choices(config.ALL_SPECIES}",
help=f"Species of training data {format_choices(config.ALL_SPECIES)}",
)

parser.add_argument(
"-s2",
"--sp_tst",
default="Human",
default="Mouse",
metavar="",
help=f"Species of test data {format_choices(config.ALL_SPECIES)}",
)

parser.add_argument(
"-g1",
"--gsc_trn",
default="GO",
metavar="",
help="Species of test data {format_choices(config.ALL_SPECIES}",
help=f"Geneset collection used to generate negatives. {format_choices(config.ALL_GSCS)}",
)

parser.add_argument(
"-g",
"--gsc",
"-g2",
"--gsc_tst",
default="GO",
metavar="",
help="Geneset collection used to generate negatives and the model"
f"similarities. {format_choices(config.ALL_GSCS)}",
help=f"Geneset collection used for model similarities. {format_choices(config.ALL_GSCS)}",
)

parser.add_argument(
Expand All @@ -101,15 +118,6 @@ def parse_args() -> argparse.Namespace:
help="Number of nodes in the small edgelist.",
)

parser.add_argument(
"-dd",
"--data_dir",
default=None,
metavar="",
help="Directory in which the data are stored, if set to None, then use "
"the default data directory ~/.data/geneplexus",
)

parser.add_argument(
"-od",
"--output_dir",
Expand All @@ -126,6 +134,13 @@ def parse_args() -> argparse.Namespace:
help=f"Logging level. {format_choices(config.LOG_LEVELS)}",
)

parser.add_argument(
"-ad",
"--auto_download_off",
action="store_true",
help="Turns off autodownloader which is on by default.",
)

parser.add_argument(
"-q",
"--quiet",
Expand Down Expand Up @@ -155,15 +170,19 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--skip-mdl-sim",
action="store_true",
help="Skip model similarity computation. This computation is not yet "
"available when using custom networks due to the lack of pretrained "
"models for comparison.",
help="Skip model similarity computation",
)

parser.add_argument(
"--skip-sm-edgelist",
action="store_true",
help="Skip making small edgelist.",
)

return parser.parse_args()


def run_pipeline(gp: GenePlexus, num_nodes: int, skip_mdl_sim: bool):
def run_pipeline(gp: GenePlexus, num_nodes: int, skip_mdl_sim: bool, skip_sm_edgelist: bool):
"""Run the full GenePlexus pipeline.

Args:
Expand All @@ -175,12 +194,15 @@ def run_pipeline(gp: GenePlexus, num_nodes: int, skip_mdl_sim: bool):

"""
gp.fit_and_predict()
gp.make_small_edgelist(num_nodes=num_nodes)
gp.alter_validation_df()
if not skip_mdl_sim:
gp.make_sim_dfs()
else:
logger.info("Skipping model similarity computation.")
if not skip_sm_edgelist:
gp.make_small_edgelist(num_nodes=num_nodes)
else:
logger.info("Skipping making small edgelist.")
gp.alter_validation_df()


def df_to_tsv(df: pd.DataFrame, root: str, name: str):
Expand All @@ -195,29 +217,37 @@ def df_to_tsv(df: pd.DataFrame, root: str, name: str):
df.to_csv(osp.join(root, name), sep="\t", index=False)


def save_results(gp, outdir, zip_output, overwrite, skip_mdl_sim):
def save_results(gp, outdir, zip_output, overwrite, skip_mdl_sim, skip_sm_edgelist):
"""Save all results generated by the GenePlexus pipeline.

Args:
outdir: Output directory.
zip_output: Whether or not to zip the output directory into a zip file.
overwrite: Whether or not to overwrite existing results.
skip_mdl_sim: Whether or not to skip the computation of model
similarities with GO and Mondo. This option is not yet available
for custom networks.
similarities with GO, Monarch and/or Mondo.
skip_sm_edgelist: Whether or not to skip making the small edgelist.

"""
zip_outpath = _suffix_fn(f"{outdir}.zip", overwrite=overwrite)
outdir = _suffix_dir(outdir, overwrite=overwrite, mktmp=zip_output)

np.savetxt(osp.join(outdir, "cross_validation.txt"), gp.avgps, fmt="%.18f")
df_to_tsv(gp.df_convert_out, outdir, "df_convert_out.tsv")
np.savetxt(osp.join(outdir, "pos_genes_in_net.txt"), gp.pos_genes_in_net, fmt="%s")
np.savetxt(osp.join(outdir, "negative_genes.txt"), gp.negative_genes, fmt="%s")
np.savetxt(osp.join(outdir, "net_genes.txt"), gp.net_genes, fmt="%s")
with open(osp.join(outdir, "neutral_gene_info.json"), "w") as f:
json.dump(gp.neutral_gene_info, f)
np.savetxt(osp.join(outdir, "avgps.txt"), gp.avgps, fmt="%.18f")
np.savetxt(osp.join(outdir, "mdl_weights.txt"), gp.mdl_weights, fmt="%.18f")
df_to_tsv(gp.df_probs, outdir, "df_probs.tsv")
df_to_tsv(gp.df_edge, outdir, "df_edge.tsv")
df_to_tsv(gp.df_edge_sym, outdir, "df_edge_sym.tsv")
df_to_tsv(gp.df_convert_out_subset, outdir, "df_convert_out_subset.tsv")
if not skip_mdl_sim:
df_to_tsv(gp.df_sim, outdir, "df_sim.tsv")
if not skip_mdl_sim:
df_to_tsv(gp.df_edge, outdir, "df_edge.tsv")
df_to_tsv(gp.df_edge_sym, outdir, "df_edge_sym.tsv")
np.savetxt(osp.join(outdir, "isolated_genes.txt"), gp.isolated_genes, fmt="%s")
np.savetxt(osp.join(outdir, "isolated_genes_sym.txt"), gp.isolated_genes_sym, fmt="%s")
df_to_tsv(gp.df_convert_out_subset, outdir, "df_convert_out_subset.tsv")

# Dump config, close file handler and move run log to result directory
gp.dump_config(outdir)
Expand Down Expand Up @@ -302,18 +332,23 @@ def main():
"""Command line interface."""
args = parse_args()
log_level = "CRITICAL" if args.quiet else args.log_level
if args.auto_download_off:
auto_download = False
else:
auto_download = True

clear_data(args)

# Create geneplexus object and auto download data files
gp = GenePlexus(
args.data_dir,
args.network,
args.feature,
args.sp_trn,
args.sp_tst,
args.gsc,
auto_download=True,
file_loc=args.data_dir,
net_type=args.network,
features=args.feature,
sp_trn=args.sp_trn,
sp_tst=args.sp_tst,
gsc_trn=args.gsc_trn,
gsc_tst=args.gsc_tst,
auto_download=auto_download,
log_level=log_level,
)

Expand All @@ -323,8 +358,10 @@ def main():
# Save config

# Run pipeline and save results
run_pipeline(gp, args.small_edgelist_num_nodes, args.skip_mdl_sim)
save_results(gp, normexpand(args.output_dir), args.zip_output, args.overwrite, args.skip_mdl_sim)
run_pipeline(gp, args.small_edgelist_num_nodes, args.skip_mdl_sim, args.skip_sm_edgelist)
save_results(
gp, normexpand(args.output_dir), args.zip_output, args.overwrite, args.skip_mdl_sim, args.skip_sm_edgelist
)


if __name__ == "__main__":
Expand Down
Loading