diff --git a/swebench/harness/docker_build.py b/swebench/harness/docker_build.py
index 7eb490b2..6af012f7 100644
--- a/swebench/harness/docker_build.py
+++ b/swebench/harness/docker_build.py
@@ -44,10 +44,13 @@ def __str__(self):
         )


-def setup_logger(instance_id: str, log_file: Path, mode="w"):
+def setup_logger(instance_id: str, log_file: Path, mode="w", add_stdout: bool = False):
     """
     This logger is used for logging the build process of images and containers.
     It writes logs to the log file.
+
+    If `add_stdout` is True, logs will also be sent to stdout, which can be used for
+    streaming ephemeral output from Modal containers.
     """
     log_file.parent.mkdir(parents=True, exist_ok=True)
     logger = logging.getLogger(f"{instance_id}.{log_file.name}")
@@ -58,6 +61,12 @@ def setup_logger(instance_id: str, log_file: Path, mode="w"):
     logger.setLevel(logging.INFO)
     logger.propagate = False
     setattr(logger, "log_file", log_file)
+    if add_stdout:
+        import sys
+        handler = logging.StreamHandler(sys.stdout)
+        formatter = logging.Formatter(f"%(asctime)s - {instance_id} - %(levelname)s - %(message)s")
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
     return logger


diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py
index 05463714..fc410705 100644
--- a/swebench/harness/run_evaluation.py
+++ b/swebench/harness/run_evaluation.py
@@ -35,23 +35,10 @@
 )
 from swebench.harness.grading import get_eval_report
 from swebench.harness.test_spec import make_test_spec, TestSpec
-from swebench.harness.utils import load_swebench_dataset, str2bool
+from swebench.harness.utils import load_swebench_dataset, str2bool, EvaluationError
+from swebench.harness.run_evaluation_modal import run_instances_modal


-class EvaluationError(Exception):
-    def __init__(self, instance_id, message, logger):
-        super().__init__(message)
-        self.super_str = super().__str__()
-        self.instance_id = instance_id
-        self.log_file = logger.log_file
-        self.logger = logger
-
-    def __str__(self):
-        return (
-            f"Evaluation error for {self.instance_id}: {self.super_str}\n"
-            f"Check ({self.log_file}) for more information."
-        )
-

 def run_instance(
     test_spec: TestSpec,
@@ -217,7 +204,6 @@ def run_instance(
         close_logger(logger)
     return

-
 def run_instances(
     predictions: dict,
     instances: list,
@@ -287,7 +273,6 @@ def run_instances(
             continue
     print("All instances run.")

-
 def get_dataset_from_preds(
     dataset_name: str,
     split: str,
@@ -483,6 +468,18 @@ def get_gold_predictions(dataset_name: str, split: str):
         }
         for datum in dataset
     ]
+def get_predictions_from_file(predictions_path: str, dataset_name: str, split: str):
+    if predictions_path == "gold":
+        print("Using gold predictions - ignoring predictions_path")
+        return get_gold_predictions(dataset_name, split)
+    if predictions_path.endswith(".json"):
+        with open(predictions_path, "r") as f:
+            return json.load(f)
+    elif predictions_path.endswith(".jsonl"):
+        with open(predictions_path, "r") as f:
+            return [json.loads(line) for line in f]
+    else:
+        raise ValueError("Predictions path must be .json or .jsonl")

 def main(
     dataset_name: str,
@@ -496,33 +493,34 @@ def main(
     open_file_limit: int,
     run_id: str,
     timeout: int,
+    modal: bool,
 ):
     """
     Run evaluation harness for the given dataset and predictions.
     """
     # set open file limit
     assert len(run_id) > 0, "Run ID must be provided"
-    resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit))
-    client = docker.from_env()

     # load predictions as map of instance_id to prediction
-    if predictions_path == 'gold':
-        print("Using gold predictions - ignoring predictions_path")
-        predictions = get_gold_predictions(dataset_name, split)
-    else:
-        if predictions_path.endswith(".json"):
-            with open(predictions_path, "r") as f:
-                predictions = json.load(f)
-        elif predictions_path.endswith(".jsonl"):
-            with open(predictions_path, "r") as f:
-                predictions = [json.loads(line) for line in f]
-        else:
-            raise ValueError("Predictions path must be \"gold\", .json, or .jsonl")
+    predictions = get_predictions_from_file(predictions_path, dataset_name, split)
     predictions = {pred[KEY_INSTANCE_ID]: pred for pred in predictions}

     # get dataset from predictions
     dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id)
     full_dataset = load_swebench_dataset(dataset_name, split, instance_ids)
+
+    if modal:
+        # run instances on Modal
+        if not dataset:
+            print("No instances to run.")
+        else:
+            run_instances_modal(predictions, dataset, full_dataset, run_id, timeout)
+        return
+
+    # run instances locally
+    resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit))
+    client = docker.from_env()
+
     existing_images = list_images(client)
     print(f"Running {len(dataset)} unevaluated instances...")
     if not dataset:
@@ -536,18 +534,21 @@ def main(
         clean_images(client, existing_images, cache_level, clean)
         make_run_report(predictions, full_dataset, client, run_id)

-
 if __name__ == "__main__":
     parser = ArgumentParser()
+
+    # Common args
     parser.add_argument("--dataset_name", default="princeton-nlp/SWE-bench_Lite", type=str, help="Name of dataset or path to JSON file.")
     parser.add_argument("--split", type=str, default="test", help="Split of the dataset")
     parser.add_argument("--instance_ids", nargs="+", type=str, help="Instance IDs to run (space separated)")
     parser.add_argument("--predictions_path", type=str, help="Path to predictions file - if 'gold', uses gold predictions", required=True)
+
+    # Local execution args
     parser.add_argument("--max_workers", type=int, default=4, help="Maximum number of workers (should be <= 75%% of CPU cores)")
     parser.add_argument("--open_file_limit", type=int, default=4096, help="Open file limit")
     parser.add_argument(
         "--timeout", type=int, default=1_800, help="Timeout (in seconds) for running tests for each instance"
-    )
+    )
     parser.add_argument(
         "--force_rebuild", type=str2bool, default=False, help="Force rebuild of all images"
     )
@@ -564,6 +565,9 @@ def main(
         "--clean", type=str2bool, default=False, help="Clean images above cache level"
     )
     parser.add_argument("--run_id", type=str, required=True, help="Run ID - identifies the run")
-    args = parser.parse_args()

+    # Modal execution args
+    parser.add_argument("--modal", action="store_true", default=False, help="Run on Modal")
+
+    args = parser.parse_args()
     main(**vars(args))
diff --git a/swebench/harness/run_evaluation_modal.py b/swebench/harness/run_evaluation_modal.py
new file mode 100644
index 00000000..6817d33b
--- /dev/null
+++ b/swebench/harness/run_evaluation_modal.py
@@ -0,0 +1,528 @@
+from __future__ import annotations
+
+import json
+import traceback
+
+import time
+
+from pathlib import Path
+
+from dataclasses import dataclass
+import modal
+import modal.container_process
+import modal.io_streams
+
+from typing import cast
+
+from logging import Logger
+
+from swebench.harness.docker_build import setup_logger
+from swebench.harness.constants import KEY_INSTANCE_ID
+from swebench.harness.utils import EvaluationError
+
+import asyncio
+import tenacity
+
+app = modal.App("swebench-evaluation")
+
+swebench_image = modal.Image.debian_slim().pip_install("swebench", "tenacity")
+
+from swebench.harness.constants import (
+    APPLY_PATCH_FAIL,
+    APPLY_PATCH_PASS,
+    RUN_EVALUATION_LOG_DIR,
+)
+from swebench.harness.grading import get_eval_report
+from swebench.harness.test_spec import make_test_spec, TestSpec
+
+
+@dataclass
+class TestOutput:
+    instance_id: str
+    test_output: str
+    report_json_str: str
+    run_instance_log: str
+    patch_diff: str
+    log_dir: Path
+    errored: bool
+
+class ModalSandboxRuntime:
+    """
+    Runtime for running instances in a Modal Sandbox.
+    """
+    def __init__(self, test_spec: TestSpec, timeout: int | None = None, verbose: bool = True):
+        self.test_spec = test_spec
+        self.image = ModalSandboxRuntime.get_instance_image(test_spec)
+        self.sandbox = self._get_sandbox(timeout)
+        self.verbose = verbose
+        self._stream_tasks = []
+
+        # Hack for pylint
+        self.write_file("/sys/fs/cgroup/cpu/cpu.shares", "2048")
+
+    @tenacity.retry(stop=tenacity.stop_after_attempt(5))
+    def _get_sandbox(self, timeout: int | None = None):
+        # Sometimes network flakiness causes the image build to fail,
+        # so we retry a few times.
+        if timeout is None:
+            # Default 30 minutes
+            timeout = 60 * 30
+
+        return modal.Sandbox.create(image=self.image, timeout=timeout, cpu=4)
+
+    async def _read_stream(self, stream: modal.io_streams.StreamReader, output_list: list[str]):
+        try:
+            async for line in stream:
+                output_list.append(line)
+                if self.verbose:
+                    print(line)
+        except asyncio.CancelledError:
+            pass
+        except Exception as e:
+            print(f"Error reading stream: {e}")
+
+    async def _read_output(self, p: modal.container_process.ContainerProcess, stdout: list[str], stderr: list[str]):
+        self._stream_tasks = [
+            asyncio.create_task(self._read_stream(p.stdout, stdout)),
+            asyncio.create_task(self._read_stream(p.stderr, stderr))
+        ]
+        try:
+            await asyncio.gather(*self._stream_tasks)
+        except asyncio.CancelledError:
+            pass
+
+    def write_file(self, file_path: str, content: str) -> modal.container_process.ContainerProcess:
+        bash_command = f"""cat <<'EOF' > {file_path}
+{content}
+EOF"""
+        p = self.sandbox.exec("bash", "-c", bash_command)
+        p.wait()
+        return p
+
+    def exec(self, *args, **kwargs) -> tuple[str, int]:
+        """
+        Execute a command in the sandbox.
+
+        Returns:
+            tuple[str, int]: Sandbox output and return code.
+        """
+        p = self.sandbox.exec(*args, **kwargs)
+        stdout = []
+        stderr = []
+        try:
+            # We separate stdout/stderr because some tests rely on them being separate.
+            # We still read stdout/stderr simultaneously to continuously
+            # flush both streams and avoid blocking.
+            asyncio.run(self._read_output(p, stdout, stderr))
+        except Exception as e:
+            print(f"Error during command execution: {e}")
+        p.wait()
+        return "".join(stdout + stderr), p.returncode
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self._stream_tasks:
+            try:
+                # Forcefully kill remaining streams
+                for task in self._stream_tasks:
+                    if not task.done():
+                        task.cancel()
+                        try:
+                            asyncio.wait_for(task, timeout=0.1)
+                        except asyncio.TimeoutError:
+                            pass
+                        except Exception:
+                            pass
+
+                self.sandbox.terminate()
+            except Exception:
+                pass
+            finally:
+                self._stream_tasks = []
+
+    @staticmethod
+    def get_instance_image(test_spec: TestSpec) -> modal.Image:
+        env_script = test_spec.setup_env_script
+        repo_script = test_spec.install_repo_script
+
+        remote_env_script_path = "/root/setup_env.sh"
+        remote_repo_script_path = "/root/setup_repo.sh"
+
+        Path(remote_env_script_path).write_text(env_script)
+        Path(remote_repo_script_path).write_text(repo_script)
+
+        # Modal automatically caches images
+        # https://modal.com/docs/guide/custom-container#image-caching-and-rebuilds
+        return (
+            modal.Image.from_registry("ubuntu:22.04", add_python="3.11")
+            .run_commands("apt update")
+            .env({"DEBIAN_FRONTEND": "noninteractive", "TZ": "Etc/UTC"})
+            .apt_install(
+                "wget",
+                "git",
+                "build-essential",
+                "libffi-dev",
+                "libtiff-dev",
+                "jq",
+                "curl",
+                "locales",
+                "locales-all",
+                "tzdata",
+            )
+            .run_commands(
+                "wget 'https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-x86_64.sh' -O miniconda.sh",
+                "bash miniconda.sh -b -p /opt/miniconda3",
+                "echo 'export PATH=/opt/miniconda3/bin:$PATH' >> ~/.bashrc",
+                "/opt/miniconda3/bin/conda init --all",
+                "/opt/miniconda3/bin/conda config --append channels conda-forge",
+                "adduser --disabled-password --gecos 'dog' nonroot",
+            )
+            .copy_local_file(Path(remote_env_script_path), remote_env_script_path)
+            .copy_local_file(Path(remote_repo_script_path), remote_repo_script_path)
+            .run_commands(
+                f"chmod +x {remote_env_script_path}",
+                f"/bin/bash -c 'source ~/.bashrc && {remote_env_script_path}'",
+                "echo 'source /opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed' >> /root/.bashrc",
+                f"/bin/bash {remote_repo_script_path}",
+            )
+            .workdir("/testbed/")
+        )
+
+def make_run_report(
+        predictions: dict,
+        full_dataset: list,
+        run_id: str
+    ) -> Path:
+    """
+    Make a final evaluation and run report of the instances that have been run.
+
+    Args:
+        predictions (dict): Predictions dict generated by the model
+        full_dataset (list): List of all instances
+        run_id (str): Run ID
+
+    Returns:
+        Path to report file
+    """
+    # Sets to store IDs of different outcomes
+    completed_ids = set()
+    resolved_ids = set()
+    error_ids = set()
+    unresolved_ids = set()
+    incomplete_ids = set()
+    empty_patch_ids = set()
+
+    for instance in full_dataset:
+        instance_id = instance[KEY_INSTANCE_ID]
+
+        # Instances that were not submitted
+        if instance_id not in predictions:
+            incomplete_ids.add(instance_id)
+            continue
+
+        # Instances with empty patches
+        prediction = predictions[instance_id]
+        if prediction.get("model_patch", None) in ["", None]:
+            empty_patch_ids.add(instance_id)
+            continue
+
+        # Instances that errored
+        log_dir = get_log_dir(predictions[instance_id], run_id, instance_id)
+        report_file = log_dir / "report.json"
+        if not report_file.exists():
+            error_ids.add(instance_id)
+            continue
+
+        # Instance completed successfully
+        completed_ids.add(instance_id)
+        try:
+            report = json.loads(report_file.read_text())
+            if report[instance_id]["resolved"]:
+                resolved_ids.add(instance_id)
+            else:
+                unresolved_ids.add(instance_id)
+        except Exception as e:
+            print(f"{instance_id}: error loading report.json: {e}")
+            error_ids.add(instance_id)
+
+    # Print final report
+    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
+    print(f"Total instances: {len(full_dataset)}")
+    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
+    print(f"Instances completed: {len(completed_ids)}")
+    print(f"Instances incomplete: {len(incomplete_ids)}")
+    print(f"Instances resolved: {len(resolved_ids)}")
+    print(f"Instances unresolved: {len(unresolved_ids)}")
+    print(f"Instances with empty patches: {len(empty_patch_ids)}")
+    print(f"Instances with errors: {len(error_ids)}")
+
+    # Write report to file
+    report = {
+        "total_instances": len(full_dataset),
+        "submitted_instances": len(predictions),
+        "completed_instances": len(completed_ids),
+        "resolved_instances": len(resolved_ids),
+        "unresolved_instances": len(unresolved_ids),
+        "empty_patch_instances": len(empty_patch_ids),
+        "error_instances": len(error_ids),
+        "completed_ids": list(sorted(completed_ids)),
+        "incomplete_ids": list(sorted(incomplete_ids)),
+        "empty_patch_ids": list(sorted(empty_patch_ids)),
+        "submitted_ids": list(sorted(predictions.keys())),
+        "resolved_ids": list(sorted(resolved_ids)),
+        "unresolved_ids": list(sorted(unresolved_ids)),
+        "error_ids": list(sorted(error_ids)),
+        "schema_version": 2,
+    }
+
+    report_file = Path(
+        list(predictions.values())[0]["model_name_or_path"].replace("/", "__")
+        + f".{run_id}"
+        + ".json"
+    )
+
+    with open(report_file, "w") as f:
+        print(json.dumps(report, indent=4), file=f)
+
+    print(f"Report written to {report_file}")
+    return report_file
+
+def get_log_dir(pred: dict, run_id: str, instance_id: str) -> Path:
+    model_name_or_path = cast(str, pred.get("model_name_or_path", "None").replace("/", "__"))
+    return RUN_EVALUATION_LOG_DIR / run_id / model_name_or_path / instance_id
+
+@app.function(
+    image=swebench_image,
+    timeout=120*60,  # Much larger than default timeout to account for image build time
+)
+def run_instance_modal(
+        test_spec: TestSpec,
+        pred: dict,
+        run_id: str,
+        timeout: int | None = None,
+    ) -> TestOutput:
+    """
+    Run a single instance with the given prediction.
+
+    Args:
+        test_spec (TestSpec): TestSpec instance
+        pred (dict): Prediction w/ model_name_or_path, model_patch, instance_id
+        run_id (str): Run ID
+        timeout (int): Timeout for running tests
+    """
+    instance_id = test_spec.instance_id
+    log_dir = get_log_dir(pred, run_id, instance_id)
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    log_file = log_dir / "run_instance.log"
+
+    logger = setup_logger(instance_id, log_file, add_stdout=True)
+
+    try:
+        runner = ModalSandboxRuntime(test_spec, timeout)
+    except Exception as e:
+        print(f"Error creating sandbox: {e}")
+        raise EvaluationError(
+            instance_id,
+            f"Error creating sandbox: {e}",
+            logger,
+        ) from e
+
+    patch_diff = pred.get("model_patch", "")
+
+    try:
+        patch_file = "/tmp/patch.diff"
+        runner.write_file(patch_file, patch_diff)
+
+        apply_patch_output, returncode = runner.exec(
+            "bash",
+            "-c",
+            "cd /testbed && git apply -v /tmp/patch.diff",
+        )
+
+        if returncode != 0:
+            logger.info(f"Failed to apply patch to container, trying again...")
+
+            apply_patch_output, returncode = runner.exec(
+                "bash",
+                "-c",
+                "cd /testbed && patch --batch --fuzz=5 -p1 -i /tmp/patch.diff",
+            )
+
+            if returncode != 0:
+                logger.info(f"{APPLY_PATCH_FAIL}:\n{apply_patch_output}")
+                raise EvaluationError(
+                    instance_id,
+                    f"{APPLY_PATCH_FAIL}:\n{apply_patch_output}",
+                    logger,
+                )
+            else:
+                logger.info(f"{APPLY_PATCH_PASS}:\n{apply_patch_output}")
+        else:
+            logger.info(f"{APPLY_PATCH_PASS}:\n{apply_patch_output}")
+
+
+        # Get git diff before running eval script
+        git_diff_output_before, returncode = runner.exec(
+            "bash",
+            "-c",
+            "cd /testbed && git diff",
+        )
+        logger.info(f"Git diff before:\n{git_diff_output_before}")
+
+        eval_file = "/root/eval.sh"
+        eval_script = test_spec.eval_script
+        # Hack for django
+        eval_script = eval_script.replace("locale-gen", "locale-gen en_US.UTF-8")
+        runner.write_file(eval_file, eval_script)
+
+        start_time = time.time()
+
+        run_command = "cd /testbed"
+        if "pylint" in test_spec.instance_id:
+            run_command += " && PYTHONPATH="
+        run_command += " && /bin/bash /root/eval.sh"
+        test_output, returncode = runner.exec(
+            "bash",
+            "-c",
+            run_command,
+        )
+
+        total_runtime = time.time() - start_time
+
+        test_output_path = log_dir / "test_output.txt"
+        logger.info(f'Test runtime: {total_runtime:_.2f} seconds')
+        with open(test_output_path, "w") as f:
+            f.write(test_output)
+            logger.info(f"Test output for {instance_id} written to {test_output_path}")
+            print(f"Test output for {instance_id} written to {test_output_path}")
+
+        # Get git diff after running eval script
+        git_diff_output_after, returncode = runner.exec(
+            "bash",
+            "-c",
+            "cd /testbed && git diff",
+        )
+
+        # Check if git diff changed after running eval script
+        logger.info(f"Git diff after:\n{git_diff_output_after}")
+        if git_diff_output_after != git_diff_output_before:
+            logger.info(f"Git diff changed after running eval script")
+
+        # Get report from test output
+        logger.info(f"Grading answer for {instance_id}...")
+        report = get_eval_report(
+            test_spec=test_spec,
+            prediction=pred,
+            log_path=test_output_path,
+            include_tests_status=True,
+        )
+        logger.info(
+            f"report: {report}\n"
+            f"Result for {instance_id}: resolved: {report[instance_id]['resolved']}"
+        )
+
+        return TestOutput(
+            instance_id=instance_id,
+            test_output=test_output,
+            report_json_str=json.dumps(report, indent=4),
+            run_instance_log=log_file.read_text(),
+            patch_diff=patch_diff,
+            log_dir=log_dir,
+            errored=False,
+        )
+    except modal.exception.SandboxTimeoutError as e:
+        raise EvaluationError(
+            instance_id,
+            f"Test timed out after {timeout} seconds.",
+            logger,
+        ) from e
+    except EvaluationError as e:
+        error_msg = traceback.format_exc()
+        logger.info(error_msg)
+        return TestOutput(
+            instance_id=instance_id,
+            test_output="",
+            report_json_str="",
+            run_instance_log=log_file.read_text(),
+            patch_diff=patch_diff,
+            log_dir=log_dir,
+            errored=True,
+        )
+    except Exception as e:
+        error_msg = (f"Error in evaluating model for {instance_id}: {e}\n"
+                     f"{traceback.format_exc()}\n"
+                     f"Check ({logger.log_file}) for more information.")
+        logger.error(error_msg)
+        return TestOutput(
+            instance_id=instance_id,
+            test_output="",
+            report_json_str="",
+            run_instance_log=log_file.read_text(),
+            patch_diff=patch_diff,
+            log_dir=log_dir,
+            errored=True,
+        )
+
+def run_instances_modal(
+        predictions: dict,
+        instances: list,
+        full_dataset: list,
+        run_id: str,
+        timeout: int,
+    ):
+    """
+    Run all instances for the given predictions on Modal.
+
+    Args:
+        predictions (dict): Predictions dict generated by the model
+        instances (list): List of instances
+        run_id (str): Run ID
+        timeout (int): Timeout for running tests
+    """
+    test_specs = list(map(make_test_spec, instances))
+
+    with modal.enable_output():
+        with app.run():
+            run_test_specs = []
+
+            # Check for instances that have already been run
+            for test_spec in test_specs:
+                log_dir = get_log_dir(predictions[test_spec.instance_id], run_id, test_spec.instance_id)
+                if log_dir.exists():
+                    continue
+                run_test_specs.append(test_spec)
+
+            # Run instances that haven't been run yet
+            results = run_instance_modal.starmap(
+                [
+                    (
+                        test_spec,
+                        predictions[test_spec.instance_id],
+                        run_id,
+                        timeout,
+                    )
+                    for test_spec in run_test_specs
+                ],
+            )
+
+            for result in results:
+                result = cast(TestOutput, result)
+
+                # Save logs locally
+                log_dir = result.log_dir
+                log_dir.mkdir(parents=True, exist_ok=True)
+                with open(log_dir / "run_instance.log", "w") as f:
+                    f.write(result.run_instance_log)
+                with open(log_dir / "test_output.txt", "w") as f:
+                    f.write(result.test_output)
+                with open(log_dir / "patch.diff", "w") as f:
+                    f.write(result.patch_diff)
+                with open(log_dir / "report.json", "w") as f:
+                    try:
+                        report_json = json.loads(result.report_json_str)
+                        json.dump(report_json, f, indent=4)
+                    except Exception:
+                        # This happens if the test fails with any exception
+                        print(f"{result.instance_id}: no report.json")
+
+    make_run_report(predictions, full_dataset, run_id)
diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py
index 4e53a4e8..10ee638d 100644
--- a/swebench/harness/utils.py
+++ b/swebench/harness/utils.py
@@ -21,6 +21,20 @@
 load_dotenv()


+class EvaluationError(Exception):
+    def __init__(self, instance_id, message, logger):
+        super().__init__(message)
+        self.super_str = super().__str__()
+        self.instance_id = instance_id
+        self.log_file = logger.log_file
+        self.logger = logger
+
+    def __str__(self):
+        return (
+            f"Evaluation error for {self.instance_id}: {self.super_str}\n"
+            f"Check ({self.log_file}) for more information."
+        )
+
 def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test", instance_ids=None) -> list[SWEbenchInstance]:
     """
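Usage sketch (not part of the patch): with the hunks above, `swebench.harness.run_evaluation.main` loads predictions, resolves the dataset, and then hands off to `run_instances_modal` instead of the local Docker path when `modal=True`. The snippet below is a minimal illustration of that entry point, assuming Modal credentials are already configured; the values mirror the argparse defaults visible in the diff, except `cache_level="env"`, which is an assumption (that flag's default is not shown here). The local-only arguments are ignored once the Modal branch returns early.

```python
# Rough equivalent of:
#   python -m swebench.harness.run_evaluation --predictions_path gold \
#       --run_id modal-smoke-test --modal
from swebench.harness.run_evaluation import main

main(
    dataset_name="princeton-nlp/SWE-bench_Lite",
    split="test",
    instance_ids=None,           # or a list such as ["astropy__astropy-12907"]
    predictions_path="gold",     # "gold" evaluates the dataset's own patches
    max_workers=4,               # local-only; unused when modal=True
    force_rebuild=False,         # local-only; unused when modal=True
    cache_level="env",           # assumed default; local-only
    clean=False,                 # local-only; unused when modal=True
    open_file_limit=4096,        # local-only; unused when modal=True
    run_id="modal-smoke-test",
    timeout=1_800,
    modal=True,                  # route evaluation to Modal sandboxes
)
```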