diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py
index 82608bab..05463714 100644
--- a/swebench/harness/run_evaluation.py
+++ b/swebench/harness/run_evaluation.py
@@ -306,17 +306,8 @@ def get_dataset_from_preds(
     dataset_ids = {i[KEY_INSTANCE_ID] for i in dataset}
 
     if instance_ids:
-        # check that all instance IDs are in the dataset
-        instance_ids = set(instance_ids)
-        if instance_ids - dataset_ids:
-            raise ValueError(
-                (
-                    "Some instance IDs not found in dataset!"
-                    f"\nMissing IDs:\n{' '.join(instance_ids - dataset_ids)}"
-                )
-            )
         # check that all instance IDs have predictions
-        missing_preds = instance_ids - set(predictions.keys())
+        missing_preds = set(instance_ids) - set(predictions.keys())
         if missing_preds:
             print(f"Warning: Missing predictions for {len(missing_preds)} instance IDs.")
 
@@ -329,9 +320,7 @@ def get_dataset_from_preds(
                 f"\nMissing IDs:\n{' '.join(prediction_ids - dataset_ids)}"
             )
         )
-
     if instance_ids:
-        # filter dataset to just the instance IDs
         dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in instance_ids]
 
     # check which instance IDs have already been run
@@ -437,8 +426,9 @@ def make_run_report(
                 unstopped_containers.add(container.name)
 
     # print final report
+    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
     print(f"Total instances: {len(full_dataset)}")
-    print(f"Instances submitted: {len(predictions)}")
+    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
     print(f"Instances completed: {len(completed_ids)}")
     print(f"Instances incomplete: {len(incomplete_ids)}")
     print(f"Instances resolved: {len(resolved_ids)}")
@@ -532,7 +522,7 @@ def main(
     # get dataset from predictions
     dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id)
-    full_dataset = load_swebench_dataset(dataset_name, split)
+    full_dataset = load_swebench_dataset(dataset_name, split, instance_ids)
     existing_images = list_images(client)
     print(f"Running {len(dataset)} unevaluated instances...")
     if not dataset:
diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py
index e3160ee5..4e53a4e8 100644
--- a/swebench/harness/utils.py
+++ b/swebench/harness/utils.py
@@ -16,28 +16,40 @@
     MAP_REPO_TO_REQS_PATHS,
     NON_TEST_EXTS,
     SWE_BENCH_URL_RAW,
+    KEY_INSTANCE_ID,
 )
 
 load_dotenv()
 
 
-def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test") -> list[SWEbenchInstance]:
+def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test", instance_ids=None) -> list[SWEbenchInstance]:
     """
     Load SWE-bench dataset from Hugging Face Datasets or local .json/.jsonl file
     """
+    # check that all instance IDs are in the dataset
+    if instance_ids:
+        instance_ids = set(instance_ids)
     # Load from local .json/.jsonl file
     if name.endswith(".json") or name.endswith(".jsonl"):
-        return [
-            cast(SWEbenchInstance, instance)
-            for instance in json.loads(Path(name).read_text())
-        ]
-
-    # Load from Hugging Face Datasets
-    if name.lower() in {"swe-bench", "swebench", "swe_bench"}:
-        name = "princeton-nlp/SWE-bench"
-    elif name.lower() in {"swe-bench-lite", "swebench-lite", "swe_bench_lite", "swe-bench_lite", "lite"}:
-        name = "princeton-nlp/SWE-bench_Lite"
-    dataset = cast(Dataset, load_dataset(name, split=split))
+        dataset = json.loads(Path(name).read_text())
+        dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset}
+    else:
+        # Load from Hugging Face Datasets
+        if name.lower() in {"swe-bench", "swebench", "swe_bench"}:
+            name = "princeton-nlp/SWE-bench"
+        elif name.lower() in {"swe-bench-lite", "swebench-lite", "swe_bench_lite", "swe-bench_lite", "lite"}:
+            name = "princeton-nlp/SWE-bench_Lite"
+        dataset = cast(Dataset, load_dataset(name, split=split))
+        dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset}
+    if instance_ids:
+        if instance_ids - dataset_ids:
+            raise ValueError(
+                (
+                    "Some instance IDs not found in dataset!"
+                    f"\nMissing IDs:\n{' '.join(instance_ids - dataset_ids)}"
+                )
+            )
+        dataset = [instance for instance in dataset if instance[KEY_INSTANCE_ID] in instance_ids]
     return [cast(SWEbenchInstance, instance) for instance in dataset]