Adjust Alpaca and Llama 2 reference scores in PyTorch model tests (#267)

AmpereComputingAI · Oct 14, 2024 · 02fb34e
1 parent 36444eb
commit 02fb34e
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/tests/test_pytorch_models.py b/tests/test_pytorch_models.py
@@ -51,7 +51,7 @@ def wrapper(**kwargs):
     @unittest.skipIf(psutil.virtual_memory().available / 1024 ** 3 < 100, "too little memory")
     @unittest.skipUnless('_aio_profiler_print' in dir(torch._C), "Ampere optimized PyTorch required")
     def test_llama2_7b(self):
-        f1_ref = 0.349
+        f1_ref = 0.330
         acc = run_process(self.wrapper,
                           {"model_name": "meta-llama/Llama-2-7b-chat-hf", "batch_size": 1, "num_runs": 50,
                            "timeout": None, "dataset_path": self.dataset_path})
@@ -60,7 +60,7 @@ def test_llama2_7b(self):
     @unittest.skipIf(psutil.virtual_memory().available / 1024 ** 3 < 200, "too little memory")
     @unittest.skipUnless('_aio_profiler_print' in dir(torch._C), "Ampere optimized PyTorch required")
     def test_llama2_13b(self):
-        f1_ref = 0.195
+        f1_ref = 0.261
         acc = run_process(self.wrapper,
                           {"model_name": "meta-llama/Llama-2-13b-chat-hf", "batch_size": 1, "num_runs": 50,
                            "timeout": None, "dataset_path": self.dataset_path})
@@ -93,7 +93,7 @@ def test_alpaca(self):
         def wrapper(**kwargs):
             kwargs["q"].put(run_pytorch_fp32(**kwargs)[0])
 
-        exact_match_ref, f1_ref = 0.260, 0.616
+        exact_match_ref, f1_ref = 0.180, 0.548
         acc = run_process(wrapper, {"model_path": self.model_path, "batch_size": 1, "num_runs": 50,
                                     "timeout": None, "dataset_path": self.dataset_path})
         self.assertTrue(acc["exact_match"] / exact_match_ref > 0.95)