From 2913dc7210ca94c453572ec2ea46af08aa6bf6fa Mon Sep 17 00:00:00 2001
From: Leonardo macOS <leonardochimirri94@gmail.com>
Date: Wed, 4 Sep 2024 12:21:11 +0200
Subject: [PATCH] Moved cache files, tested it, minor cleanups

---
 pyproject.toml                          |  3 ++-
 src/malco/post_process/ranking_utils.py | 25 +++++++++++++++----------
 src/malco/runner.py                     | 15 +++++----------
 3 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f9da07d1..ef7cafc1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ packages = [{include = "malco", from = "src"}]
 python = "^3.10"
 pheval = "^0.3.2"
 setuptools = "^69.5.1"
+shelved-cache = "^0.3.1"
 
 
 [tool.poetry.plugins."pheval.plugins"]
@@ -20,7 +21,7 @@ pytest = "^7.1.2"
 pylint = "^2.15.6"
 pycodestyle = "^2.10.0"
 coverage = "^6.5.0"
-ontogpt = {git = "https://github.com/monarch-initiative/ontogpt.git", branch = "main"}
+ontogpt = {git = "https://github.com/monarch-initiative/ontogpt.git", tag = "v1.0.3"}
 
 [tool.poetry.group.dev.dependencies]
 tox = "^4.15.0"
diff --git a/src/malco/post_process/ranking_utils.py b/src/malco/post_process/ranking_utils.py
index eab822ff..b19ee02f 100644
--- a/src/malco/post_process/ranking_utils.py
+++ b/src/malco/post_process/ranking_utils.py
@@ -35,19 +35,24 @@ def mondo_adapter() -> OboGraphInterface:
     return get_adapter("sqlite:obo:mondo") 
 
 def compute_mrr_and_ranks(
-    comparing, 
-    output_dir, 
-    prompt_dir, 
-    correct_answer_file,
+    comparing: str, 
+    output_dir: Path, 
+    out_subdir: str,
+    prompt_dir: str, 
+    correct_answer_file: str,
     ) -> Path:
+
     # Read in results TSVs from self.output_dir that match glob results*tsv 
+    out_caches = output_dir / "caches"
+    out_caches.mkdir(exist_ok=True)
+    output_dir = output_dir / out_subdir
     results_data = []
     results_files = []
     num_ppkt = 0
-    pc2_cache_file = str(output_dir / "score_grounded_result_cache")
-    pc2 = PersistentCache(LRUCache, pc2_cache_file, maxsize=4096)        
-    pc1_cache_file = str(output_dir / "omim_mappings_cache")
-    pc1 = PersistentCache(LRUCache, pc1_cache_file, maxsize=16384)
+    pc2_cache_file = str(out_caches / "score_grounded_result_cache")
+    pc2 = PersistentCache(LRUCache, pc2_cache_file, maxsize=524288)        
+    pc1_cache_file = str(out_caches / "omim_mappings_cache")
+    pc1 = PersistentCache(LRUCache, pc1_cache_file, maxsize=524288)
     # Treat hits and misses as run-specific arguments, write them cache_log
     pc1.hits = pc1.misses = 0
     pc2.hits = pc2.misses = 0
@@ -78,7 +83,7 @@ def compute_mrr_and_ranks(
     header = [comparing, "n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10", "n10p", "nf"]
     rank_df = pd.DataFrame(0, index=np.arange(len(results_files)), columns=header)
 
-    cache_file = output_dir / "cache_log.txt"
+    cache_file = out_caches / "cache_log.txt"
 
     with cache_file.open('a', newline = '') as cf:
         now_is = datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -120,7 +125,7 @@ def compute_mrr_and_ranks(
             # Save full data frame
             full_df_path = output_dir / results_files[i].split("/")[0]
             full_df_filename = "full_df_results.tsv"
-            safe_save_tsv(full_df_path, df, full_df_filename)
+            safe_save_tsv(full_df_path, full_df_filename, df)
             
             # Calculate MRR for this file
             mrr = df.groupby("label")["reciprocal_rank"].max().mean()
diff --git a/src/malco/runner.py b/src/malco/runner.py
index 3b904c88..278bf8f5 100644
--- a/src/malco/runner.py
+++ b/src/malco/runner.py
@@ -23,14 +23,7 @@ class MalcoRunner(PhEvalRunner):
     #languages: tuple
     #models: tuple
     #just_run: bool
-    #just_postprocess: bool
-
-    #languages = ("en", "es", "nl", "it", "de")
-    #models = ("gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o") # Decide on list of models: Claude-Sonnet (Anthropic key), 
-    #models = ("gpt-3.5-turbo", "gpt-4-turbo") # Decide on list of models: Claude-Sonnet (Anthropic key), 
-    #just_run = 0          # only run the run part of the code
-    #just_postprocess = 1  # only run the postprocess part of the code
-    
+    #just_postprocess: bool  
 
 
     def prepare(self):
@@ -75,7 +68,8 @@ def post_process(self,
             '''
             comparing = "language"
             mrr_file, plot_dir, num_ppkt, topn_aggr_file = compute_mrr_and_ranks(comparing,
-                output_dir=self.output_dir / "multilingual" ,
+                output_dir=self.output_dir,
+                out_subdir="multilingual",
                 prompt_dir=os.path.join(self.input_dir, prompts_subdir_name),
                 correct_answer_file=correct_answer_file)
             
@@ -85,7 +79,8 @@ def post_process(self,
             '''
             comparing = "model"
             mrr_file, data_dir, num_ppkt, topn_aggr_file = compute_mrr_and_ranks(comparing,
-                output_dir=self.output_dir / "multimodel" ,
+                output_dir=self.output_dir,
+                out_subdir="multimodel",
                 prompt_dir=os.path.join(self.input_dir, prompts_subdir_name),
                 correct_answer_file=correct_answer_file)