
SWE-bench results #549

Merged · 10 commits · Apr 2, 2024
2 changes: 2 additions & 0 deletions benchmarks/benchmark_result.py
@@ -36,6 +36,8 @@ class BenchmarkResult:
missing_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
extra_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
referenced_format: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
test_eval_results: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
test_eval_passed: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})

def display_color(self) -> str:
if self.passed is None:
9 changes: 7 additions & 2 deletions benchmarks/benchmark_runner.py
@@ -227,6 +227,8 @@ async def run(self, retries: int = 1) -> list[BenchmarkResult]:
result.cost = sample_result["cost"]
result.tokens = sample_result["tokens"]
result.transcript = sample_result["transcript"]
result.test_eval_results = sample_result["test_eval_results"]
result.test_eval_passed = sample_result["test_eval_passed"]
Contributor

Consider adding a try-except block around benchmark.run(retries=retries) to catch and handle any exceptions that might occur during the execution of a benchmark. This would allow the benchmarking process to continue even if one benchmark fails, improving the robustness of the benchmark runner.

Suggested change
result.test_eval_passed = sample_result["test_eval_passed"]
try:
    result = asyncio.run(benchmark.run(retries=retries))
    with open(results_cache, "a") as f:
        for r in result:
            total_cost += r.cost if r.cost else 0.0
            f.write(r.to_json() + "\n")
except Exception as e:
    print(f"Error running benchmark {benchmark.title}: {e}")

Handling exceptions in this manner ensures that the benchmarking process is not halted due to a single failure, providing a more resilient and user-friendly experience.
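A minimal, self-contained sketch of this pattern follows. FakeBenchmark and FakeResult are illustrative stand-ins for the repo's Benchmark and BenchmarkResult classes, not the actual implementations:

import asyncio
import json
from dataclasses import asdict, dataclass
from pathlib import Path


@dataclass
class FakeResult:
    # Illustrative stand-in for BenchmarkResult; only the fields used below.
    title: str
    cost: float = 0.0

    def to_json(self) -> str:
        return json.dumps(asdict(self))


class FakeBenchmark:
    # Illustrative stand-in for the real Benchmark class.
    def __init__(self, title: str, should_fail: bool = False):
        self.title = title
        self.should_fail = should_fail

    async def run(self, retries: int = 1) -> list[FakeResult]:
        if self.should_fail:
            raise RuntimeError("simulated failure")
        return [FakeResult(self.title, cost=0.01)]


def run_all(benchmarks: list[FakeBenchmark], results_cache: Path) -> float:
    total_cost = 0.0
    for benchmark in benchmarks:
        # A failing benchmark is logged and skipped instead of halting the loop.
        try:
            results = asyncio.run(benchmark.run(retries=1))
            with open(results_cache, "a") as f:
                for r in results:
                    total_cost += r.cost if r.cost else 0.0
                    f.write(r.to_json() + "\n")
        except Exception as e:
            print(f"Error running benchmark {benchmark.title}: {e}")
    return total_cost


if __name__ == "__main__":
    cache = Path("benchmark_results_cache_demo.jsonl")
    cache.touch()
    total = run_all([FakeBenchmark("ok"), FakeBenchmark("broken", should_fail=True)], cache)
    print(f"Total cost: {total}")

Note that the run_benchmarks loop later in this diff already wraps benchmark.run in a try block under "# Run benchmark.run() with timeout", so the sketch mainly illustrates persisting partial results and continuing past failures.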


if self.verify is not None:
result.verify = self.verify()

@@ -251,7 +253,7 @@ def benchmark_listed(title, benchmarks):
return False


def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1):
def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1, max_benchmarks: int | None = None):
# Load benchmarks
dir_path = Path(directory).resolve()
assert dir_path.exists(), f"Invalid directory: {directory}"
@@ -277,7 +279,9 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
results_cache = dir_path / f"benchmark_results_cache_{uuid4()}.jsonl"
results_cache.touch()
total_cost = 0.0
for benchmark in benchmarks:
for i, benchmark in enumerate(benchmarks):
if max_benchmarks and i >= max_benchmarks:
break
Contributor

After the loop that processes each benchmark, it would be useful to log the total number of benchmarks processed and how many were skipped due to reaching the max_benchmarks limit. This information could be valuable for users to understand the scope of the benchmark run and ensure transparency.

Suggested change
break
print(f"Processed {i+1} benchmarks. Skipped {len(benchmarks) - i - 1} benchmarks due to max_benchmarks limit.")

Including such logging can enhance the user's understanding of the benchmarking process and provide insights into the execution flow.
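As a small illustration of the capped loop with a post-loop summary, here is a sketch; run_capped and the placeholder benchmark names are illustrative, not the repo's actual helpers:

def run_capped(benchmarks: list[str], max_benchmarks: int | None = None) -> None:
    # Process at most max_benchmarks entries, then report how many were skipped.
    processed = 0
    for i, benchmark in enumerate(benchmarks):
        if max_benchmarks is not None and i >= max_benchmarks:
            break
        print(f"Running {benchmark}...")  # placeholder for the real per-benchmark work
        processed = i + 1
    skipped = len(benchmarks) - processed
    print(f"Processed {processed} benchmarks. Skipped {skipped} due to the max_benchmarks limit.")


if __name__ == "__main__":
    run_capped(["sample-a", "sample-b", "sample-c", "sample-d"], max_benchmarks=2)

Whether the summary belongs inside the loop body (as in the suggestion above) or after the loop is a judgment call; placing it after the loop reports the final counts exactly once.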

# Run benchmark.run() with timeout
try:
result = asyncio.run(benchmark.run(retries=retries))
@@ -328,4 +332,5 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
args.benchmarks,
args.directory,
args.retries,
args.max_benchmarks,
)
298 changes: 273 additions & 25 deletions benchmarks/run_sample.py
@@ -1,8 +1,12 @@
from pathlib import Path
import json
import random
import subprocess
import re
import sys
from pathlib import Path
from typing import Any

import tqdm

from mentat import Mentat
from mentat.errors import SampleError
from mentat.git_handler import get_git_diff
@@ -16,17 +20,39 @@
async def run_sample(sample: Sample, cwd: Path | str | None = None) -> dict[str, Any]:
"""Run a sample using Mentat and return the resulting diff"""

setup_commit = sample.environment_setup_commit or sample.merge_base
repo = setup_repo(
url=sample.repo,
cwd=cwd,
commit=sample.merge_base,
commit=setup_commit,
diff_merge_base=sample.diff_merge_base,
diff_active=sample.diff_active,
)
cwd = Path(repo.working_dir)

test_executable = None
if sample.FAIL_TO_PASS:
# If there's an environment_setup_commit, this is what it's needed for.
try:
test_executable = get_test_executable(Path(repo.working_dir))
except SampleError as e:
print(f"Error setting up virtual environment: {e}")
print("Using default python executable instead.")
if test_executable is None:
test_executable = Path(sys.executable)

if sample.environment_setup_commit and sample.merge_base:
# SWE-Bench samples have a setup commit and a merge base.
repo.git.reset("--hard")
repo.git.checkout(sample.merge_base)
else:
# Mentat Samples have an active diff set in setup_repo that needs to be preserved.
pass

# Make a commit from the pre-edited state (should match diff_active)
commit_active = get_active_snapshot_commit(repo)
commit_active = get_active_snapshot_commit(repo) # TODO: This throws an error in some benchmarks: sqlfluff..

# Pre-validation took place here

# Run sample in PythonClient
paths = list[Path]()
@@ -59,26 +85,30 @@ async def run_sample(sample: Sample, cwd: Path | str | None = None) -> dict[str,
message_eval = str(transcript_messages[-1].get("message", ""))
diff_eval = get_git_diff(commit_active or "HEAD", cwd=cwd)

test_results = {"passed": 0, "failed": 0, "error": ""}
if sample.test_command:
if sample.test_patch:
apply_diff_to_repo(sample.test_patch, repo)
try:
output = subprocess.run(
sample.test_command,
shell=True,
capture_output=True,
text=True,
cwd=cwd,
)
matches = re.search(r"(?:(\d+) passed)?(?:, )?(?:(\d+) failed)?", output.stdout)
if matches:
test_results["passed"] = int(matches.group(1)) or 0
test_results["failed"] = int(matches.group(2)) or 0
else:
raise SampleError(f"Test command failed: {output.stdout}")
except Exception as e:
test_results["error"] = str(e)
test_results = None
test_passed = None
if sample.test_patch:
apply_diff_to_repo(sample.test_patch, repo)
if sample.FAIL_TO_PASS:
if test_executable is None:
raise SampleError("No test executable found")
tests = json.loads(sample.FAIL_TO_PASS)
total = len(tests)
passed = 0
errors = list[dict[str, str]]()
for test in tests:
_passed, _error = get_test_result(test, cwd, test_executable)
if _passed:
passed += 1
if _error:
errors.append({"test": test, "error": _error})
test_results = {
"passed": passed,
"total": total,
"passed_percent": passed / total * 100,
"errors": errors,
}
test_passed = passed == total

return {
"id": sample.id,
Expand All @@ -90,5 +120,223 @@ async def run_sample(sample: Sample, cwd: Path | str | None = None) -> dict[str,
"id": sample.id,
"messages": transcript_messages,
},
"test_results": test_results,
"test_eval_results": test_results,
"test_eval_passed": test_passed,
}


test_requirements_for_repo = {
"pvlib-python": [
"setuptools",
"pytest",
"pytest-cov",
"pytest-mock",
"requests-mock",
"pytest-timeout",
"pytest-rerunfailures",
"pytest-remotedata",
],
"pydicom": ["setuptools", "pytest"],
"sqlfluff": [
"setuptools",
"pytest",
"pytest-cov",
"pytest-mock",
"Jinja2",
"oyaml",
],
"pyvista": [
"setuptools",
"pytest",
"ipython",
"ipywidgets",
"ipykernel",
"tqdm",
],
"astroid": [
"setuptools",
"pytest",
"attrs",
"types-attrs",
"nose",
"numpy",
"python-dateutil",
"types-python-dateutil",
"six",
"types-six",
],
"marshmallow": [
"setuptools",
"pytest",
"pytz",
"simplejson",
],
}


def get_test_executable(cwd: Path) -> Path:
"""Rebuild every time with the latest setup."""

venv_dir = cwd / ".venv"
repo_name = cwd.name

try:
python_executable = "python3" if sys.platform != "win32" else "python"
subprocess.run([python_executable, "-m", "venv", str(venv_dir)], check=True, cwd=cwd, capture_output=True)
except Exception as e:
raise SampleError(f"Error creating virtual environment: {e}")

# Install as a pip module
try:
output = subprocess.run(
[venv_dir / "bin" / "pip", "install", "-e", "."], check=True, cwd=cwd, capture_output=True
)
if output.returncode != 0:
raise SampleError(f"Error installing sample as a pip module: {output.stderr}")
except Exception as e:
raise SampleError(f"Error installing sample as a pip module: {e}")

# Requirements are hard-coded by repo
if repo_name not in test_requirements_for_repo:
raise SampleError(f"No requirements found for repo '{repo_name}'")
requirements = test_requirements_for_repo[repo_name]

# Install them all with pip
try:
output = subprocess.run(
[venv_dir / "bin" / "pip", "install", *list(requirements)], check=True, cwd=cwd, capture_output=True
)
if output.returncode != 0:
raise SampleError(f"Error installing requirements: {output.stderr}")
except Exception as e:
raise SampleError(f"Error installing requirements: {e}")

return venv_dir / "bin" / "python"


def get_test_result(test: str, cwd: Path, test_executable: Path) -> tuple[bool, str]:
passed, error = False, ""
command = [str(test_executable), "-m", "pytest"]
if "[" in test:
# Some tests include parameters, like "..is_valid[3.1415]".
# Running '-k' over the whole suite is very slow.
path, params = test.split("[", 1)
params = params[:-1] # Remove trailing ']'
command += [path, "-k", params]
else:
command += [test]
try:
output = subprocess.run(
command,
capture_output=True,
text=True,
cwd=cwd,
)
if (output.returncode != 0 and output.stderr) or not output.stdout:
raise SampleError(f"Test command failed: {output.stderr}")

# Starting from the end, find the first line that contains "passed" or "failed"
lines = output.stdout.splitlines()
result_line = next(
line for line in reversed(lines) if any(key in line for key in {"passed", "failed", "skipped"})
)
_passed = "passed" in result_line or "skipped" in result_line
_failed = "failed" in result_line
if _passed == _failed:
raise SampleError(f"Could not determine test result from line: {result_line}")
passed = _passed
if _failed:
raise SampleError("Test failed:\n" + "\n".join(lines))
except (SampleError, StopIteration, Exception) as e:
error = str(e)
return passed, error


def validate_test_fields(sample: Sample) -> dict[str, Any]:
test_results: dict[str, Any] = {
"PASS_TO_PASS": {"passed": 0, "total": 0, "errors": []},
"FAIL_TO_PASS_PRE": {"passed": 0, "total": 0, "errors": []},
"FAIL_TO_PASS_POST": {"passed": 0, "total": 0, "errors": []},
}

if not sample.FAIL_TO_PASS and not sample.PASS_TO_PASS:
return test_results

setup_commit = sample.environment_setup_commit or sample.merge_base
repo = setup_repo(
url=sample.repo,
commit=setup_commit,
diff_merge_base=sample.diff_merge_base,
diff_active=sample.diff_active,
)
cwd = Path(repo.working_dir)
test_executable = get_test_executable(cwd)

repo.git.checkout(sample.merge_base)

# Run the PASS_TO_PASS test, expected to PASS
if sample.PASS_TO_PASS:
tests = json.loads(sample.PASS_TO_PASS)

# There are sometimes hundreds of tests which take ~30 minutes to all complete.
# Since we'll check all the FAIL_TO_PASS tests, here we just want to confirm the
# environment is set up correctly, so we sample 10 tests.
tests = random.sample(tests, min(10, len(tests)))

# Iterate with tqdm
for test in tqdm.tqdm(tests, desc="PASS_TO_PASS tests", unit="test"):
test_results["PASS_TO_PASS"]["total"] += 1
passed, error = False, ""
try:
passed, error = get_test_result(test, cwd, test_executable)
if passed:
test_results["PASS_TO_PASS"]["passed"] += 1
elif error:
raise SampleError(error)
except SampleError as e:
test_results["PASS_TO_PASS"]["errors"].append({"test": test, "error": str(e)})
print("PASS_TO_PASS results: ", test_results["PASS_TO_PASS"])

# Apply test patch
if sample.test_patch:
print("Applying test patch...")
apply_diff_to_repo(sample.test_patch, repo)

# Run FAIL_TO_PASS tests expected to FAIL
if sample.FAIL_TO_PASS:
tests = json.loads(sample.FAIL_TO_PASS)
for test in tqdm.tqdm(tests, desc="FAIL_TO_PASS tests", unit="test"):
test_results["FAIL_TO_PASS_PRE"]["total"] += 1
passed, error = False, ""
try:
passed, error = get_test_result(test, cwd, test_executable)
if passed:
test_results["FAIL_TO_PASS_PRE"]["passed"] += 1
elif error:
raise SampleError(error)
except SampleError as e:
test_results["FAIL_TO_PASS_PRE"]["errors"].append({"test": test, "error": str(e)})
print("FAIL_TO_PASS_PRE results: ", test_results["FAIL_TO_PASS_PRE"])

# Apply golden patch
if sample.diff_edit:
print("Applying diff_edit...")
apply_diff_to_repo(sample.diff_edit, repo)

# Run FAIL_TO_PASS tests expected to PASS
if sample.FAIL_TO_PASS:
tests = json.loads(sample.FAIL_TO_PASS)
for test in tqdm.tqdm(tests, desc="FAIL_TO_PASS tests", unit="test"):
test_results["FAIL_TO_PASS_POST"]["total"] += 1
passed, error = False, ""
try:
passed, error = get_test_result(test, cwd, test_executable)
if passed:
test_results["FAIL_TO_PASS_POST"]["passed"] += 1
elif error:
raise SampleError(error)
except SampleError as e:
test_results["FAIL_TO_PASS_POST"]["errors"].append({"test": test, "error": str(e)})
print("FAIL_TO_PASS_POST results: ", test_results["FAIL_TO_PASS_POST"])

return test_results