
Commit

Merge branch 'main' into integrate-spice
biobootloader committed Apr 3, 2024
2 parents 36c2a5a + cc6716a commit d727903
Showing 28 changed files with 1,018 additions and 394 deletions.
Binary file removed .DS_Store
6 changes: 6 additions & 0 deletions CHANGELOG.rst
@@ -3,6 +3,12 @@ Changelog
 
 In this changelog focus on user facing highlights and stick to the format. This information will be used to motivate users to upgrade or after upgrading to inform them of features that might otherwise not be very discoverable.
 
+`1.0.12 <https://pypi.org/project/mentat/1.0.12/>`__
+--------------------------------------------------
+
+- Added helpful message when no api key found
+- Fixed errors relating to embedding models
+
 `1.0.11 <https://pypi.org/project/mentat/1.0.11/>`__
 --------------------------------------------------
 
6 changes: 6 additions & 0 deletions benchmarks/arg_parser.py
@@ -72,5 +72,11 @@ def common_benchmark_parser():
         type=str,
         help="Fetch or load SWE-bench examples from split: dev (default), train or test.",
     )
+    parser.add_argument(
+        "--auto_context_tokens",
+        default=0,
+        type=int,
+        help="Include auto-selected tokens in benchmark runs and evaluate precision/recall",
+    )
 
     return parser
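
For orientation, the new flag can be exercised directly through the shared parser. A minimal sketch, assuming the benchmarks package is importable from the repository root:

from benchmarks.arg_parser import common_benchmark_parser

# Build the shared benchmark parser and parse a hypothetical command line.
parser = common_benchmark_parser()
args = parser.parse_args(["--auto_context_tokens", "8000"])
print(args.auto_context_tokens)  # -> 8000, coerced to int by type=int
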
5 changes: 5 additions & 0 deletions benchmarks/benchmark_result.py
@@ -36,6 +36,11 @@ class BenchmarkResult:
     missing_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
     extra_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
     referenced_format: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
+    test_eval_results: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
+    test_eval_passed: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
+    context_results: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
+    context_precision: Optional[float] = attr.ib(default=None, metadata={"aggregation": "average"})
+    context_recall: Optional[float] = attr.ib(default=None, metadata={"aggregation": "average"})
 
     def display_color(self) -> str:
         if self.passed is None:
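
The new context_precision and context_recall fields are filled in by run_auto_context_benchmark from benchmarks/context_benchmark.py, which is not part of this diff. As a rough sketch of what such a score could look like (the function name and the set-based comparison below are assumptions for illustration, not the project's actual implementation):

# Illustrative only: compare the files a sample expects in context against
# the files the auto-context step actually selected.
def context_score(expected: set[str], selected: set[str]) -> dict[str, float]:
    hits = len(expected & selected)
    precision = hits / len(selected) if selected else 0.0  # share of the selection that was relevant
    recall = hits / len(expected) if expected else 0.0     # share of the relevant context that was selected
    return {"precision": precision, "recall": recall}
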
30 changes: 25 additions & 5 deletions benchmarks/benchmark_runner.py
@@ -15,6 +15,7 @@
 from benchmarks.arg_parser import common_benchmark_parser
 from benchmarks.benchmark_result import BenchmarkResult
 from benchmarks.benchmark_run import BenchmarkRun
+from benchmarks.context_benchmark import run_auto_context_benchmark
 from benchmarks.run_sample import run_sample
 from benchmarks.swe_bench_runner import SWE_BENCH_SAMPLES_DIR, get_swe_samples
 from mentat.config import Config
@@ -202,11 +203,12 @@ def from_module(cls, path_to_module: Path, module_name: str) -> Benchmark:
         return output
 
     @classmethod
-    def from_sample(cls, path_to_sample: Path) -> Benchmark:
+    def from_sample(cls, path_to_sample: Path, config: Config | None = None) -> Benchmark:
         sample = Sample.load(path_to_sample)
         return cls(
             title=sample.title,
             description=sample.description,
+            config=config or Config(),
             samples=[sample],
         )
 
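A minimal sketch of calling the updated classmethod with an explicit config; the sample path is hypothetical and the import locations are assumed from this file layout:

from pathlib import Path

from benchmarks.benchmark_runner import Benchmark
from mentat.config import Config

# Hypothetical sample file; any Sample JSON on disk would do.
config = Config(auto_context_tokens=8000)
benchmark = Benchmark.from_sample(Path("benchmarks/samples/example.json"), config)
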
@@ -223,10 +225,17 @@ async def run(self, retries: int = 1) -> list[BenchmarkResult]:
                     family=formatted_title,
                 )
                 try:
-                    sample_result = await run_sample(sample)
+                    if sample.context and self.config.auto_context_tokens:
+                        score = await run_auto_context_benchmark(sample, self.config, include_context=False)
+                        result.context_results = {**score, "auto_context_tokens": self.config.auto_context_tokens}
+                        result.context_precision = score["precision"]
+                        result.context_recall = score["recall"]
+                    sample_result = await run_sample(sample, config=self.config)
                     result.cost = sample_result["cost"]
                     result.tokens = sample_result["tokens"]
                     result.transcript = sample_result["transcript"]
+                    result.test_eval_results = sample_result["test_eval_results"]
+                    result.test_eval_passed = sample_result["test_eval_passed"]
                     if self.verify is not None:
                         result.verify = self.verify()
 
@@ -251,7 +260,13 @@ def benchmark_listed(title, benchmarks):
     return False
 
 
-def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1):
+def run_benchmarks(
+    user_benchmarks: list[str],
+    directory: str,
+    retries: int = 1,
+    max_benchmarks: int | None = None,
+    auto_context_tokens: int = 0,
+):
     # Load benchmarks
     dir_path = Path(directory).resolve()
     assert dir_path.exists(), f"Invalid directory: {directory}"
@@ -263,7 +278,8 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
             if file.endswith(".py"):
                 benchmark = Benchmark.from_module(path, "benchmark")
             elif file.endswith(".json"):
-                benchmark = Benchmark.from_sample(path)
+                config = Config(auto_context_tokens=auto_context_tokens)
+                benchmark = Benchmark.from_sample(path, config)
             else:
                 continue
 
@@ -277,7 +293,9 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
     results_cache = dir_path / f"benchmark_results_cache_{uuid4()}.jsonl"
     results_cache.touch()
     total_cost = 0.0
-    for benchmark in benchmarks:
+    for i, benchmark in enumerate(benchmarks):
+        if max_benchmarks and i >= max_benchmarks:
+            break
         # Run benchmark.run() with timeout
         try:
             result = asyncio.run(benchmark.run(retries=retries))
@@ -328,4 +346,6 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
         args.benchmarks,
         args.directory,
         args.retries,
+        args.max_benchmarks,
+        args.auto_context_tokens,
     )
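
Putting the new parameters together, a programmatic call mirroring the updated __main__ block might look like the following; the directory path and values are purely illustrative:

from benchmarks.benchmark_runner import run_benchmarks

run_benchmarks(
    user_benchmarks=[],                 # run everything found in the directory
    directory="benchmarks/benchmarks",  # hypothetical benchmark directory
    retries=1,
    max_benchmarks=5,                   # stop after the first five benchmarks
    auto_context_tokens=8000,           # enable auto-context and precision/recall scoring
)
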