Make run_report more intuitive when using instance_ids filter

princeton-nlp · Aug 15, 2024 · 312b914
1 parent a8df201
commit 312b914
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 26 deletions.
diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py
@@ -306,17 +306,8 @@ def get_dataset_from_preds(
     dataset_ids = {i[KEY_INSTANCE_ID] for i in dataset}
 
     if instance_ids:
-        # check that all instance IDs are in the dataset
-        instance_ids = set(instance_ids)
-        if instance_ids - dataset_ids:
-            raise ValueError(
-                (
-                    "Some instance IDs not found in dataset!"
-                    f"\nMissing IDs:\n{' '.join(instance_ids - dataset_ids)}"
-                )
-            )
         # check that all instance IDs have predictions
-        missing_preds = instance_ids - set(predictions.keys())
+        missing_preds = set(instance_ids) - set(predictions.keys())
         if missing_preds:
             print(f"Warning: Missing predictions for {len(missing_preds)} instance IDs.")
 
@@ -329,9 +320,7 @@ def get_dataset_from_preds(
                 f"\nMissing IDs:\n{' '.join(prediction_ids - dataset_ids)}"
             )
         )
-
     if instance_ids:
-        # filter dataset to just the instance IDs
         dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in instance_ids]
 
     # check which instance IDs have already been run
@@ -437,8 +426,9 @@ def make_run_report(
             unstopped_containers.add(container.name)
 
     # print final report
+    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
     print(f"Total instances: {len(full_dataset)}")
-    print(f"Instances submitted: {len(predictions)}")
+    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
     print(f"Instances completed: {len(completed_ids)}")
     print(f"Instances incomplete: {len(incomplete_ids)}")
     print(f"Instances resolved: {len(resolved_ids)}")
@@ -532,7 +522,7 @@ def main(
 
     # get dataset from predictions
     dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id)
-    full_dataset = load_swebench_dataset(dataset_name, split)
+    full_dataset = load_swebench_dataset(dataset_name, split, instance_ids)
     existing_images = list_images(client)
     print(f"Running {len(dataset)} unevaluated instances...")
     if not dataset:

diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py
@@ -16,28 +16,40 @@
     MAP_REPO_TO_REQS_PATHS,
     NON_TEST_EXTS,
     SWE_BENCH_URL_RAW,
+    KEY_INSTANCE_ID,
 )
 
 load_dotenv()
 
 
-def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test") -> list[SWEbenchInstance]:
+def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test", instance_ids=None) -> list[SWEbenchInstance]:
     """
     Load SWE-bench dataset from Hugging Face Datasets or local .json/.jsonl file
     """
+    # normalize the requested instance IDs to a set (they are validated against the dataset after loading)
+    if instance_ids:
+        instance_ids = set(instance_ids)
     # Load from local .json/.jsonl file
     if name.endswith(".json") or name.endswith(".jsonl"):
-        return [
-            cast(SWEbenchInstance, instance)
-            for instance in json.loads(Path(name).read_text())
-        ]
-
-    # Load from Hugging Face Datasets
-    if name.lower() in {"swe-bench", "swebench", "swe_bench"}:
-        name = "princeton-nlp/SWE-bench"
-    elif name.lower() in {"swe-bench-lite", "swebench-lite", "swe_bench_lite", "swe-bench_lite", "lite"}:
-        name = "princeton-nlp/SWE-bench_Lite"
-    dataset = cast(Dataset, load_dataset(name, split=split))
+        dataset = json.loads(Path(name).read_text())
+        dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset}
+    else:
+        # Load from Hugging Face Datasets
+        if name.lower() in {"swe-bench", "swebench", "swe_bench"}:
+            name = "princeton-nlp/SWE-bench"
+        elif name.lower() in {"swe-bench-lite", "swebench-lite", "swe_bench_lite", "swe-bench_lite", "lite"}:
+            name = "princeton-nlp/SWE-bench_Lite"
+        dataset = cast(Dataset, load_dataset(name, split=split))
+        dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset}
+    if instance_ids:
+        if instance_ids - dataset_ids:
+            raise ValueError(
+                (
+                    "Some instance IDs not found in dataset!"
+                    f"\nMissing IDs:\n{' '.join(instance_ids - dataset_ids)}"
+                )
+            )
+        dataset = [instance for instance in dataset if instance[KEY_INSTANCE_ID] in instance_ids]
     return [cast(SWEbenchInstance, instance) for instance in dataset]