
SWE-bench results #549

Merged · 10 commits · Apr 2, 2024
2 changes: 2 additions & 0 deletions benchmarks/benchmark_result.py
@@ -36,6 +36,8 @@ class BenchmarkResult:
missing_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
extra_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
referenced_format: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
test_eval_results: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
test_eval_passed: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})

def display_color(self) -> str:
if self.passed is None:
9 changes: 7 additions & 2 deletions benchmarks/benchmark_runner.py
@@ -227,6 +227,8 @@ async def run(self, retries: int = 1) -> list[BenchmarkResult]:
result.cost = sample_result["cost"]
result.tokens = sample_result["tokens"]
result.transcript = sample_result["transcript"]
result.test_eval_results = sample_result["test_eval_results"]
result.test_eval_passed = sample_result["test_eval_passed"]
Contributor

Consider adding a try-except block around benchmark.run(retries=retries) to catch and handle any exceptions that might occur during the execution of a benchmark. This would allow the benchmarking process to continue even if one benchmark fails, improving the robustness of the benchmark runner.

Suggested change
result.test_eval_passed = sample_result["test_eval_passed"]
try:
    result = asyncio.run(benchmark.run(retries=retries))
    with open(results_cache, "a") as f:
        for r in result:
            total_cost += r.cost if r.cost else 0.0
            f.write(r.to_json() + "\n")
except Exception as e:
    print(f"Error running benchmark {benchmark.title}: {e}")

Handling exceptions in this manner ensures that the benchmarking process is not halted due to a single failure, providing a more resilient and user-friendly experience.
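A minimal, self-contained sketch of this pattern follows. FakeBenchmark and FakeResult are illustrative stand-ins for the repo's Benchmark and BenchmarkResult classes, not the actual implementations:

import asyncio
import json
from dataclasses import asdict, dataclass
from pathlib import Path


@dataclass
class FakeResult:
    # Illustrative stand-in for BenchmarkResult; only the fields used below.
    title: str
    cost: float = 0.0

    def to_json(self) -> str:
        return json.dumps(asdict(self))


class FakeBenchmark:
    # Illustrative stand-in for the real Benchmark class.
    def __init__(self, title: str, should_fail: bool = False):
        self.title = title
        self.should_fail = should_fail

    async def run(self, retries: int = 1) -> list[FakeResult]:
        if self.should_fail:
            raise RuntimeError("simulated failure")
        return [FakeResult(self.title, cost=0.01)]


def run_all(benchmarks: list[FakeBenchmark], results_cache: Path) -> float:
    total_cost = 0.0
    for benchmark in benchmarks:
        # A failing benchmark is logged and skipped instead of halting the loop.
        try:
            results = asyncio.run(benchmark.run(retries=1))
            with open(results_cache, "a") as f:
                for r in results:
                    total_cost += r.cost if r.cost else 0.0
                    f.write(r.to_json() + "\n")
        except Exception as e:
            print(f"Error running benchmark {benchmark.title}: {e}")
    return total_cost


if __name__ == "__main__":
    cache = Path("benchmark_results_cache_demo.jsonl")
    cache.touch()
    total = run_all([FakeBenchmark("ok"), FakeBenchmark("broken", should_fail=True)], cache)
    print(f"Total cost: {total}")

Note that the run_benchmarks loop later in this diff already wraps benchmark.run in a try block under "# Run benchmark.run() with timeout", so the sketch mainly illustrates persisting partial results and continuing past failures.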


if self.verify is not None:
result.verify = self.verify()

@@ -251,7 +253,7 @@ def benchmark_listed(title, benchmarks):
return False


def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1):
def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1, max_benchmarks: int | None = None):
# Load benchmarks
dir_path = Path(directory).resolve()
assert dir_path.exists(), f"Invalid directory: {directory}"
@@ -277,7 +279,9 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
results_cache = dir_path / f"benchmark_results_cache_{uuid4()}.jsonl"
results_cache.touch()
total_cost = 0.0
for benchmark in benchmarks:
for i, benchmark in enumerate(benchmarks):
if max_benchmarks and i >= max_benchmarks:
break
Contributor

After the loop that processes each benchmark, it would be useful to log the total number of benchmarks processed and how many were skipped due to reaching the max_benchmarks limit. This information could be valuable for users to understand the scope of the benchmark run and ensure transparency.

Suggested change
break
print(f"Processed {i+1} benchmarks. Skipped {len(benchmarks) - i - 1} benchmarks due to max_benchmarks limit.")

Including such logging can enhance the user's understanding of the benchmarking process and provide insights into the execution flow.
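As a small illustration of the capped loop with a post-loop summary, here is a sketch; run_capped and the placeholder benchmark names are illustrative, not the repo's actual helpers:

def run_capped(benchmarks: list[str], max_benchmarks: int | None = None) -> None:
    # Process at most max_benchmarks entries, then report how many were skipped.
    processed = 0
    for i, benchmark in enumerate(benchmarks):
        if max_benchmarks is not None and i >= max_benchmarks:
            break
        print(f"Running {benchmark}...")  # placeholder for the real per-benchmark work
        processed = i + 1
    skipped = len(benchmarks) - processed
    print(f"Processed {processed} benchmarks. Skipped {skipped} due to the max_benchmarks limit.")


if __name__ == "__main__":
    run_capped(["sample-a", "sample-b", "sample-c", "sample-d"], max_benchmarks=2)

Whether the summary belongs inside the loop body (as in the suggestion above) or after the loop is a judgment call; placing it after the loop reports the final counts exactly once.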

# Run benchmark.run() with timeout
try:
result = asyncio.run(benchmark.run(retries=retries))
@@ -328,4 +332,5 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
args.benchmarks,
args.directory,
args.retries,
args.max_benchmarks,
)
298 changes: 273 additions & 25 deletions benchmarks/run_sample.py
@@ -1,8 +1,12 @@
from pathlib import Path
import json
import random
import subprocess
import re
import sys
from pathlib import Path
from typing import Any

import tqdm

from mentat import Mentat
from mentat.errors import SampleError
from mentat.git_handler import get_git_diff
@@ -16,17 +20,39 @@
async def run_sample(sample: Sample, cwd: Path | str | None = None) -> dict[str, Any]:
"""Run a sample using Mentat and return the resulting diff"""

setup_commit = sample.environment_setup_commit or sample.merge_base
repo = setup_repo(
url=sample.repo,
cwd=cwd,
commit=sample.merge_base,
commit=setup_commit,
diff_merge_base=sample.diff_merge_base,
diff_active=sample.diff_active,
)
cwd = Path(repo.working_dir)

test_executable = None
if sample.FAIL_TO_PASS:
# If there's an environment_setup_commit, this is what it's needed for.
try:
test_executable = get_test_executable(Path(repo.working_dir))
except SampleError as e:
print(f"Error setting up virtual environment: {e}")
print("Using default python executable instead.")
if test_executable is None:
test_executable = Path(sys.executable)

if sample.environment_setup_commit and sample.merge_base:
# SWE-Bench samples have a setup commit and a merge base.
repo.git.reset("--hard")
repo.git.checkout(sample.merge_base)
else:
# Mentat Samples have an active diff set in setup_repo that needs to be preserved.
pass

# Make a commit from the pre-edited state (should match diff_active)
commit_active = get_active_snapshot_commit(repo)
commit_active = get_active_snapshot_commit(repo) # TODO: This throws an error in some benchmarks: sqlfluff..

# Pre-validation took place here

# Run sample in PythonClient
paths = list[Path]()
@@ -59,26 +85,30 @@ async def run_sample(sample: Sample, cwd: Path | str | None = None) -> dict[str,
message_eval = str(transcript_messages[-1].get("message", ""))
diff_eval = get_git_diff(commit_active or "HEAD", cwd=cwd)

test_results = {"passed": 0, "failed": 0, "error": ""}
if sample.test_command:
if sample.test_patch:
apply_diff_to_repo(sample.test_patch, repo)
try:
output = subprocess.run(
sample.test_command,
shell=True,
capture_output=True,
text=True,
cwd=cwd,
)
matches = re.search(r"(?:(\d+) passed)?(?:, )?(?:(\d+) failed)?", output.stdout)
if matches:
test_results["passed"] = int(matches.group(1)) or 0
test_results["failed"] = int(matches.group(2)) or 0
else:
raise SampleError(f"Test command failed: {output.stdout}")
except Exception as e:
test_results["error"] = str(e)
test_results = None
test_passed = None
if sample.test_patch:
apply_diff_to_repo(sample.test_patch, repo)
if sample.FAIL_TO_PASS:
if test_executable is None:
raise SampleError("No test executable found")
tests = json.loads(sample.FAIL_TO_PASS)
total = len(tests)
passed = 0
errors = list[dict[str, str]]()
for test in tests:
_passed, _error = get_test_result(test, cwd, test_executable)
if _passed:
passed += 1
if _error:
errors.append({"test": test, "error": _error})
test_results = {
"passed": passed,
"total": total,
"passed_percent": passed / total * 100,
"errors": errors,
}
test_passed = passed == total

return {
"id": sample.id,
Expand All @@ -90,5 +120,223 @@ async def run_sample(sample: Sample, cwd: Path | str | None = None) -> dict[str,
"id": sample.id,
"messages": transcript_messages,
},
"test_results": test_results,
"test_eval_results": test_results,
"test_eval_passed": test_passed,
}


test_requirements_for_repo = {
"pvlib-python": [
"setuptools",
"pytest",
"pytest-cov",
"pytest-mock",
"requests-mock",
"pytest-timeout",
"pytest-rerunfailures",
"pytest-remotedata",
],
"pydicom": ["setuptools", "pytest"],
"sqlfluff": [
"setuptools",
"pytest",
"pytest-cov",
"pytest-mock",
"Jinja2",
"oyaml",
],
"pyvista": [
"setuptools",
"pytest",
"ipython",
"ipywidgets",
"ipykernel",
"tqdm",
],
"astroid": [
"setuptools",
"pytest",
"attrs",
"types-attrs",
"nose",
"numpy",
"python-dateutil",
"types-python-dateutil",
"six",
"types-six",
],
"marshmallow": [
"setuptools",
"pytest",
"pytz",
"simplejson",
],
}


def get_test_executable(cwd: Path) -> Path:
"""Rebuild every time with the latest setup."""

venv_dir = cwd / ".venv"
repo_name = cwd.name

try:
python_executable = "python3" if sys.platform != "win32" else "python"
subprocess.run([python_executable, "-m", "venv", str(venv_dir)], check=True, cwd=cwd, capture_output=True)
except Exception as e:
raise SampleError(f"Error creating virtual environment: {e}")

# Install as a pip module
try:
output = subprocess.run(
[venv_dir / "bin" / "pip", "install", "-e", "."], check=True, cwd=cwd, capture_output=True
)
if output.returncode != 0:
raise SampleError(f"Error installing sample as a pip module: {output.stderr}")
except Exception as e:
raise SampleError(f"Error installing sample as a pip module: {e}")

# Requirements are hard-coded by repo
if repo_name not in test_requirements_for_repo:
raise SampleError(f"No requirements found for repo '{repo_name}'")
requirements = test_requirements_for_repo[repo_name]

# Install them all with pip
try:
output = subprocess.run(
[venv_dir / "bin" / "pip", "install", *list(requirements)], check=True, cwd=cwd, capture_output=True
)
if output.returncode != 0:
raise SampleError(f"Error installing requirements: {output.stderr}")
except Exception as e:
raise SampleError(f"Error installing requirements: {e}")

return venv_dir / "bin" / "python"


def get_test_result(test: str, cwd: Path, test_executable: Path) -> tuple[bool, str]:
passed, error = False, ""
command = [str(test_executable), "-m", "pytest"]
if "[" in test:
# Some tests include parameters, like "..is_valid[3.1415]".
# Running '-k' over the whole suite is very slow.
path, params = test.split("[", 1)
params = params[:-1] # Remove trailing ']'
command += [path, "-k", params]
else:
command += [test]
try:
output = subprocess.run(
command,
capture_output=True,
text=True,
cwd=cwd,
)
if (output.returncode != 0 and output.stderr) or not output.stdout:
raise SampleError(f"Test command failed: {output.stderr}")

# Starting from the end, find the first line that contains "passed" or "failed"
lines = output.stdout.splitlines()
result_line = next(
line for line in reversed(lines) if any(key in line for key in {"passed", "failed", "skipped"})
)
_passed = "passed" in result_line or "skipped" in result_line
_failed = "failed" in result_line
if _passed == _failed:
raise SampleError(f"Could not determine test result from line: {result_line}")
passed = _passed
if _failed:
raise SampleError("Test failed:\n" + "\n".join(lines))
except (SampleError, StopIteration, Exception) as e:
error = str(e)
return passed, error


def validate_test_fields(sample: Sample) -> dict[str, Any]:
test_results: dict[str, Any] = {
"PASS_TO_PASS": {"passed": 0, "total": 0, "errors": []},
"FAIL_TO_PASS_PRE": {"passed": 0, "total": 0, "errors": []},
"FAIL_TO_PASS_POST": {"passed": 0, "total": 0, "errors": []},
}

if not sample.FAIL_TO_PASS and not sample.PASS_TO_PASS:
return test_results

setup_commit = sample.environment_setup_commit or sample.merge_base
repo = setup_repo(
url=sample.repo,
commit=setup_commit,
diff_merge_base=sample.diff_merge_base,
diff_active=sample.diff_active,
)
cwd = Path(repo.working_dir)
test_executable = get_test_executable(cwd)

repo.git.checkout(sample.merge_base)

# Run the PASS_TO_PASS test, expected to PASS
if sample.PASS_TO_PASS:
tests = json.loads(sample.PASS_TO_PASS)

# There are sometimes hundreds of tests which take ~30 minutes to all complete.
# Since we'll check all the FAIL_TO_PASS tests, here we just want to confirm the
# environment is set up correctly, so we sample 10 tests.
tests = random.sample(tests, min(10, len(tests)))

# Iterate with tqdm
for test in tqdm.tqdm(tests, desc="PASS_TO_PASS tests", unit="test"):
test_results["PASS_TO_PASS"]["total"] += 1
passed, error = False, ""
try:
passed, error = get_test_result(test, cwd, test_executable)
if passed:
test_results["PASS_TO_PASS"]["passed"] += 1
elif error:
raise SampleError(error)
except SampleError as e:
test_results["PASS_TO_PASS"]["errors"].append({"test": test, "error": str(e)})
print("PASS_TO_PASS results: ", test_results["PASS_TO_PASS"])

# Apply test patch
if sample.test_patch:
print("Applying test patch...")
apply_diff_to_repo(sample.test_patch, repo)

# Run FAIL_TO_PASS tests expected to FAIL
if sample.FAIL_TO_PASS:
tests = json.loads(sample.FAIL_TO_PASS)
for test in tqdm.tqdm(tests, desc="FAIL_TO_PASS tests", unit="test"):
test_results["FAIL_TO_PASS_PRE"]["total"] += 1
passed, error = False, ""
try:
passed, error = get_test_result(test, cwd, test_executable)
if passed:
test_results["FAIL_TO_PASS_PRE"]["passed"] += 1
elif error:
raise SampleError(error)
except SampleError as e:
test_results["FAIL_TO_PASS_PRE"]["errors"].append({"test": test, "error": str(e)})
print("FAIL_TO_PASS_PRE results: ", test_results["FAIL_TO_PASS_PRE"])

# Apply golden patch
if sample.diff_edit:
print("Applying diff_edit...")
apply_diff_to_repo(sample.diff_edit, repo)

# Run FAIL_TO_PASS tests expected to PASS
if sample.FAIL_TO_PASS:
tests = json.loads(sample.FAIL_TO_PASS)
for test in tqdm.tqdm(tests, desc="FAIL_TO_PASS tests", unit="test"):
test_results["FAIL_TO_PASS_POST"]["total"] += 1
passed, error = False, ""
try:
passed, error = get_test_result(test, cwd, test_executable)
if passed:
test_results["FAIL_TO_PASS_POST"]["passed"] += 1
elif error:
raise SampleError(error)
except SampleError as e:
test_results["FAIL_TO_PASS_POST"]["errors"].append({"test": test, "error": str(e)})
print("FAIL_TO_PASS_POST results: ", test_results["FAIL_TO_PASS_POST"])

return test_results