
Commit

Merge branch 'main' into integrate-spice
biobootloader committed Apr 3, 2024
2 parents 36c2a5a + cc6716a commit d727903
Showing 28 changed files with 1,018 additions and 394 deletions.
Binary file removed .DS_Store
6 changes: 6 additions & 0 deletions CHANGELOG.rst
@@ -3,6 +3,12 @@ Changelog
 
 In this changelog focus on user facing highlights and stick to the format. This information will be used to motivate users to upgrade or after upgrading to inform them of features that might otherwise not be very discoverable.
 
+`1.0.12 <https://pypi.org/project/mentat/1.0.12/>`__
+--------------------------------------------------
+
+- Added helpful message when no api key found
+- Fixed errors relating to embedding models
+
 `1.0.11 <https://pypi.org/project/mentat/1.0.11/>`__
 --------------------------------------------------
 
6 changes: 6 additions & 0 deletions benchmarks/arg_parser.py
@@ -72,5 +72,11 @@ def common_benchmark_parser():
         type=str,
         help="Fetch or load SWE-bench examples from split: dev (default), train or test.",
     )
+    parser.add_argument(
+        "--auto_context_tokens",
+        default=0,
+        type=int,
+        help="Include auto-selected tokens in benchmark runs and evaluate precision/recall",
+    )
 
     return parser
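
For orientation, the new flag can be exercised directly through the shared parser. A minimal sketch, assuming the benchmarks package is importable from the repository root:

from benchmarks.arg_parser import common_benchmark_parser

# Build the shared benchmark parser and parse a hypothetical command line.
parser = common_benchmark_parser()
args = parser.parse_args(["--auto_context_tokens", "8000"])
print(args.auto_context_tokens)  # -> 8000, coerced to int by type=int
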
5 changes: 5 additions & 0 deletions benchmarks/benchmark_result.py
@@ -36,6 +36,11 @@ class BenchmarkResult:
     missing_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
     extra_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
     referenced_format: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
+    test_eval_results: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
+    test_eval_passed: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
+    context_results: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
+    context_precision: Optional[float] = attr.ib(default=None, metadata={"aggregation": "average"})
+    context_recall: Optional[float] = attr.ib(default=None, metadata={"aggregation": "average"})
 
     def display_color(self) -> str:
         if self.passed is None:
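
The new context_precision and context_recall fields are filled in by run_auto_context_benchmark from benchmarks/context_benchmark.py, which is not part of this diff. As a rough sketch of what such a score could look like (the function name and the set-based comparison below are assumptions for illustration, not the project's actual implementation):

# Illustrative only: compare the files a sample expects in context against
# the files the auto-context step actually selected.
def context_score(expected: set[str], selected: set[str]) -> dict[str, float]:
    hits = len(expected & selected)
    precision = hits / len(selected) if selected else 0.0  # share of the selection that was relevant
    recall = hits / len(expected) if expected else 0.0     # share of the relevant context that was selected
    return {"precision": precision, "recall": recall}
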
30 changes: 25 additions & 5 deletions benchmarks/benchmark_runner.py
@@ -15,6 +15,7 @@
 from benchmarks.arg_parser import common_benchmark_parser
 from benchmarks.benchmark_result import BenchmarkResult
 from benchmarks.benchmark_run import BenchmarkRun
+from benchmarks.context_benchmark import run_auto_context_benchmark
 from benchmarks.run_sample import run_sample
 from benchmarks.swe_bench_runner import SWE_BENCH_SAMPLES_DIR, get_swe_samples
 from mentat.config import Config
@@ -202,11 +203,12 @@ def from_module(cls, path_to_module: Path, module_name: str) -> Benchmark:
         return output
 
     @classmethod
-    def from_sample(cls, path_to_sample: Path) -> Benchmark:
+    def from_sample(cls, path_to_sample: Path, config: Config | None = None) -> Benchmark:
         sample = Sample.load(path_to_sample)
         return cls(
             title=sample.title,
             description=sample.description,
+            config=config or Config(),
             samples=[sample],
         )
 
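A minimal sketch of calling the updated classmethod with an explicit config; the sample path is hypothetical and the import locations are assumed from this file layout:

from pathlib import Path

from benchmarks.benchmark_runner import Benchmark
from mentat.config import Config

# Hypothetical sample file; any Sample JSON on disk would do.
config = Config(auto_context_tokens=8000)
benchmark = Benchmark.from_sample(Path("benchmarks/samples/example.json"), config)
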
@@ -223,10 +225,17 @@ async def run(self, retries: int = 1) -> list[BenchmarkResult]:
                     family=formatted_title,
                 )
                 try:
-                    sample_result = await run_sample(sample)
+                    if sample.context and self.config.auto_context_tokens:
+                        score = await run_auto_context_benchmark(sample, self.config, include_context=False)
+                        result.context_results = {**score, "auto_context_tokens": self.config.auto_context_tokens}
+                        result.context_precision = score["precision"]
+                        result.context_recall = score["recall"]
+                    sample_result = await run_sample(sample, config=self.config)
                     result.cost = sample_result["cost"]
                     result.tokens = sample_result["tokens"]
                     result.transcript = sample_result["transcript"]
+                    result.test_eval_results = sample_result["test_eval_results"]
+                    result.test_eval_passed = sample_result["test_eval_passed"]
                     if self.verify is not None:
                         result.verify = self.verify()
 
@@ -251,7 +260,13 @@ def benchmark_listed(title, benchmarks):
     return False
 
 
-def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1):
+def run_benchmarks(
+    user_benchmarks: list[str],
+    directory: str,
+    retries: int = 1,
+    max_benchmarks: int | None = None,
+    auto_context_tokens: int = 0,
+):
     # Load benchmarks
     dir_path = Path(directory).resolve()
     assert dir_path.exists(), f"Invalid directory: {directory}"
@@ -263,7 +278,8 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
             if file.endswith(".py"):
                 benchmark = Benchmark.from_module(path, "benchmark")
             elif file.endswith(".json"):
-                benchmark = Benchmark.from_sample(path)
+                config = Config(auto_context_tokens=auto_context_tokens)
+                benchmark = Benchmark.from_sample(path, config)
             else:
                 continue
 
@@ -277,7 +293,9 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
     results_cache = dir_path / f"benchmark_results_cache_{uuid4()}.jsonl"
     results_cache.touch()
     total_cost = 0.0
-    for benchmark in benchmarks:
+    for i, benchmark in enumerate(benchmarks):
+        if max_benchmarks and i >= max_benchmarks:
+            break
         # Run benchmark.run() with timeout
         try:
             result = asyncio.run(benchmark.run(retries=retries))
@@ -328,4 +346,6 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
         args.benchmarks,
         args.directory,
         args.retries,
+        args.max_benchmarks,
+        args.auto_context_tokens,
     )
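
Putting the new parameters together, a programmatic call mirroring the updated __main__ block might look like the following; the directory path and values are purely illustrative:

from benchmarks.benchmark_runner import run_benchmarks

run_benchmarks(
    user_benchmarks=[],                 # run everything found in the directory
    directory="benchmarks/benchmarks",  # hypothetical benchmark directory
    retries=1,
    max_benchmarks=5,                   # stop after the first five benchmarks
    auto_context_tokens=8000,           # enable auto-context and precision/recall scoring
)
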