From 2913dc7210ca94c453572ec2ea46af08aa6bf6fa Mon Sep 17 00:00:00 2001 From: Leonardo macOS Date: Wed, 4 Sep 2024 12:21:11 +0200 Subject: [PATCH] Moved cache files, tested it, minor cleanups --- pyproject.toml | 3 ++- src/malco/post_process/ranking_utils.py | 25 +++++++++++++++---------- src/malco/runner.py | 15 +++++---------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f9da07d1..ef7cafc1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ packages = [{include = "malco", from = "src"}] python = "^3.10" pheval = "^0.3.2" setuptools = "^69.5.1" +shelved-cache = "^0.3.1" [tool.poetry.plugins."pheval.plugins"] @@ -20,7 +21,7 @@ pytest = "^7.1.2" pylint = "^2.15.6" pycodestyle = "^2.10.0" coverage = "^6.5.0" -ontogpt = {git = "https://github.com/monarch-initiative/ontogpt.git", branch = "main"} +ontogpt = {git = "https://github.com/monarch-initiative/ontogpt.git", tag = "v1.0.3"} [tool.poetry.group.dev.dependencies] tox = "^4.15.0" diff --git a/src/malco/post_process/ranking_utils.py b/src/malco/post_process/ranking_utils.py index eab822ff..b19ee02f 100644 --- a/src/malco/post_process/ranking_utils.py +++ b/src/malco/post_process/ranking_utils.py @@ -35,19 +35,24 @@ def mondo_adapter() -> OboGraphInterface: return get_adapter("sqlite:obo:mondo") def compute_mrr_and_ranks( - comparing, - output_dir, - prompt_dir, - correct_answer_file, + comparing: str, + output_dir: Path, + out_subdir: str, + prompt_dir: str, + correct_answer_file: str, ) -> Path: + # Read in results TSVs from self.output_dir that match glob results*tsv + out_caches = output_dir / "caches" + out_caches.mkdir(exist_ok=True) + output_dir = output_dir / out_subdir results_data = [] results_files = [] num_ppkt = 0 - pc2_cache_file = str(output_dir / "score_grounded_result_cache") - pc2 = PersistentCache(LRUCache, pc2_cache_file, maxsize=4096) - pc1_cache_file = str(output_dir / "omim_mappings_cache") - pc1 = PersistentCache(LRUCache, pc1_cache_file, maxsize=16384) + pc2_cache_file = str(out_caches / "score_grounded_result_cache") + pc2 = PersistentCache(LRUCache, pc2_cache_file, maxsize=524288) + pc1_cache_file = str(out_caches / "omim_mappings_cache") + pc1 = PersistentCache(LRUCache, pc1_cache_file, maxsize=524288) # Treat hits and misses as run-specific arguments, write them cache_log pc1.hits = pc1.misses = 0 pc2.hits = pc2.misses = 0 @@ -78,7 +83,7 @@ def compute_mrr_and_ranks( header = [comparing, "n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10", "n10p", "nf"] rank_df = pd.DataFrame(0, index=np.arange(len(results_files)), columns=header) - cache_file = output_dir / "cache_log.txt" + cache_file = out_caches / "cache_log.txt" with cache_file.open('a', newline = '') as cf: now_is = datetime.now().strftime("%Y%m%d-%H%M%S") @@ -120,7 +125,7 @@ def compute_mrr_and_ranks( # Save full data frame full_df_path = output_dir / results_files[i].split("/")[0] full_df_filename = "full_df_results.tsv" - safe_save_tsv(full_df_path, df, full_df_filename) + safe_save_tsv(full_df_path, full_df_filename, df) # Calculate MRR for this file mrr = df.groupby("label")["reciprocal_rank"].max().mean() diff --git a/src/malco/runner.py b/src/malco/runner.py index 3b904c88..278bf8f5 100644 --- a/src/malco/runner.py +++ b/src/malco/runner.py @@ -23,14 +23,7 @@ class MalcoRunner(PhEvalRunner): #languages: tuple #models: tuple #just_run: bool - #just_postprocess: bool - - #languages = ("en", "es", "nl", "it", "de") - #models = ("gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o") # Decide on list of models: Claude-Sonnet (Anthropic key), - #models = ("gpt-3.5-turbo", "gpt-4-turbo") # Decide on list of models: Claude-Sonnet (Anthropic key), - #just_run = 0 # only run the run part of the code - #just_postprocess = 1 # only run the postprocess part of the code - + #just_postprocess: bool def prepare(self): @@ -75,7 +68,8 @@ def post_process(self, ''' comparing = "language" mrr_file, plot_dir, num_ppkt, topn_aggr_file = compute_mrr_and_ranks(comparing, - output_dir=self.output_dir / "multilingual" , + output_dir=self.output_dir, + out_subdir="multilingual", prompt_dir=os.path.join(self.input_dir, prompts_subdir_name), correct_answer_file=correct_answer_file) @@ -85,7 +79,8 @@ def post_process(self, ''' comparing = "model" mrr_file, data_dir, num_ppkt, topn_aggr_file = compute_mrr_and_ranks(comparing, - output_dir=self.output_dir / "multimodel" , + output_dir=self.output_dir, + out_subdir="multimodel", prompt_dir=os.path.join(self.input_dir, prompts_subdir_name), correct_answer_file=correct_answer_file)