Merge pull request #66 from yuntongzhang/main
Keep experiment run script in sync with the source code
yuntongzhang authored Sep 28, 2024
2 parents 1405ec9 + 804625c commit cd4b645
Showing 2 changed files with 40 additions and 55 deletions.
11 changes: 7 additions & 4 deletions EXPERIMENT.md
@@ -29,6 +29,7 @@ docker run -it yuntongzhang/auto-code-rover:experiment
Activate the conda environment in it.

```
source activate base
conda activate auto-code-rover
```

@@ -55,9 +56,12 @@ the experiment run using `gpt-4-0125-preview` on SWE-bench-lite.
python scripts/run.py conf/vanilla-lite.conf
```

This command runs auto-code-rover on the 300 instances in SWE-bench-lite, and runs the SWE-bench
evaluation on the generated patches. The final results of the experiment will be at
`/opt/auto-code-rover/experiment/vanilla-lite/final_report.json`.
This command runs auto-code-rover on the 300 instances in SWE-bench-lite and consolidates all the generated
patches into a single file named `predictions_for_swebench.json`. To evaluate the correctness of the generated
patches, copy this file out of the container (a sketch follows the list below) and evaluate it with either:

1. The [Containerized Evaluation Harness](https://github.com/princeton-nlp/SWE-bench/tree/main/docs/20240627_docker) by the SWE-bench team.
2. The [Moatless EvalTools](https://github.com/aorwall/SWE-bench-docker).
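
As a hedged sketch of that hand-off, assuming the experiment container is named `acr` (a placeholder) and the default `vanilla-lite` experiment directory, copying the predictions out and invoking the containerized harness might look like the following. The harness module path and flags are assumptions taken from its own documentation and may differ between harness versions, so verify them against the linked README.

```
# Copy the consolidated predictions out of the container.
# "acr" is a placeholder container name -- substitute your own.
docker cp acr:/opt/auto-code-rover/experiment/vanilla-lite/predictions_for_swebench.json .

# Evaluate on the host with the SWE-bench containerized harness.
# Assumed invocation -- check the harness README for the flags of your version.
python -m swebench.harness.run_evaluation \
    --dataset_name princeton-nlp/SWE-bench_Lite \
    --predictions_path predictions_for_swebench.json \
    --max_workers 8 \
    --run_id vanilla-lite
```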


### Running experiments multiple times
@@ -90,7 +94,6 @@ Here are a few useful fields in the conf file:
- id: determines the name of the experiment output folder
- experiment_dir: where output will be stored
- setup_result_dir: must point to the directory where SWE-bench setup writes its results
- eval_log_dir: where the SWE-bench evaluation log is written to

- model: the model to be used by auto-code-rover
- temperature: model temperature
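For orientation (not part of this diff), a conf file covering the fields that `scripts/run.py` reads could be sketched as below, written as a shell heredoc so it can be pasted into the container. The field names come from `main()` in this commit, the `[DEFAULT]` section matches how the script queries `configparser`, and every value is a placeholder.

```
cat > conf/my-experiment.conf <<'EOF'
[DEFAULT]
id = my-experiment
experiment_dir = /opt/auto-code-rover/experiment
setup_result_dir = /opt/SWE-bench/setup_result
selected_tasks_file = conf/my_tasks.txt
model = gpt-4-0125-preview
temperature = 0.2
num_processes = 4
conv_round_limit = 15
enable_sbfl = False
enable_validation = False
print = True
# Only needed for the deprecated --with-eval path:
# swe_bench_dir = /opt/SWE-bench
# eval_log_dir = /opt/auto-code-rover/eval_logs
EOF
```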
84 changes: 33 additions & 51 deletions scripts/run.py
@@ -9,7 +9,6 @@
import os
import shutil
import subprocess
from datetime import datetime
from glob import glob
from os.path import dirname as pdirname
from os.path import join as pjoin
@@ -109,7 +108,6 @@ def run_agent(
model: str,
temperature: float,
enbale_sbfl: bool,
enable_layered: bool,
enable_validation: bool,
enable_angelic: bool,
enable_perfect_angelic: bool,
@@ -132,7 +130,7 @@
added_env = {"PYTHONPATH": root_dir}
modified_env = {**os.environ, **added_env}

cmd = "python app/main.py "
cmd = "python app/main.py swe-bench "
cmd += f"--setup-map {setup_map_json} "
cmd += f"--tasks-map {tasks_map_json} "
cmd += f"--output-dir {expr_dir} "
@@ -143,8 +141,6 @@
cmd += f"--num-processes {num_processes} "
if enbale_sbfl:
cmd += "--enable-sbfl "
if enable_layered:
cmd += "--enable-layered "
if enable_validation:
cmd += "--enable-validation "
if enable_angelic:
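
For context, with the new `swe-bench` subcommand the assembled invocation resembles the sketch below. The angle-bracket pieces are placeholders for values the script computes from the conf, only flags visible in this hunk are shown, and optional ones appear only when the corresponding conf options are enabled.

```
# Sketch of the command run_agent() assembles (placeholders in angle brackets;
# optional flags are appended only when enabled in the conf).
python app/main.py swe-bench \
    --setup-map <setup_map_json> \
    --tasks-map <tasks_map_json> \
    --output-dir <expr_dir> \
    --num-processes <num_processes> \
    --enable-sbfl \
    --enable-validation
```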
@@ -272,7 +268,7 @@ def generate_report(
return final_report_path


def generate_stats(expr_dir: str, eval_start_epoch: float, eval_end_epoch: float):
def generate_stats(expr_dir: str):
cost_files = glob(pjoin(expr_dir, "**", "*__*", "cost.json"))
cost_data = [json.loads(Path(file).read_text()) for file in cost_files]

@@ -307,14 +303,6 @@ def generate_stats(expr_dir: str, eval_start_epoch: float, eval_end_epoch: float
mean(x["elapsed_seconds"] for x in cost_data), 1
)

stats["eval_start_epoch"] = eval_start_epoch
stats["eval_end_epoch"] = eval_end_epoch
eval_elapsed = eval_end_epoch - eval_start_epoch
stats["eval_elapsed_mins"] = round(eval_elapsed / 60, 2)
stats["eval_avg_elapsed_secs"] = round(eval_elapsed / len(cost_data), 1)

stats["total_elapsed_mins"] = round((inference_elapsed + eval_elapsed) / 60, 2)

with open(pjoin(expr_dir, "stats.json"), "w") as f:
json.dump(stats, f, indent=4)

@@ -324,10 +312,10 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("conf_file", help="Configuration file")
parser.add_argument(
"--eval-only",
"--with-eval",
action="store_true",
default=False,
help="Only do SWE-bench evaluation",
help="Additionally perform SWE-bench evaluation (deprecated)",
)
parser.add_argument(
"-f",
@@ -363,15 +351,14 @@ def main():
config_dict = config["DEFAULT"]
expr_id = config_dict["id"]
overall_expr_dir = config_dict["experiment_dir"]
swe_bench_dir = config_dict["swe_bench_dir"]
setup_result_dir = config_dict["setup_result_dir"]
eval_log_dir = config_dict["eval_log_dir"]
swe_bench_dir = config_dict.get("swe_bench_dir", fallback="")
eval_log_dir = config_dict.get("eval_log_dir", fallback="")

model = config_dict["model"]
temperature = float(config_dict["temperature"])
selected_tasks_file = config_dict["selected_tasks_file"]
enable_sbfl = config.getboolean("DEFAULT", "enable_sbfl", fallback=False)
enable_layered = config.getboolean("DEFAULT", "enable_layered", fallback=False)
enable_validation = config.getboolean(
"DEFAULT", "enable_validation", fallback=False
)
@@ -381,52 +368,47 @@
)
conv_round_limit = config.getint("DEFAULT", "conv_round_limit", fallback=15)

# with_sbfl = config.getboolean("DEFAULT", "with_sbfl", fallback=False)
print_more = config.getboolean("DEFAULT", "print", fallback=False)
num_processes = int(config_dict["num_processes"])

expr_dir = pjoin(overall_expr_dir, expr_id)
task_list_file_path = pjoin(expr_dir, os.path.basename(selected_tasks_file))
if not args.eval_only:
create_fresh_dir(expr_dir)

create_fresh_dir(expr_dir)

shutil.copy(selected_tasks_file, expr_dir)

script_dir = pdirname(os.path.realpath(__file__))
root_dir = pdirname(script_dir) # root of this repo

if args.eval_only:
swe_input_file = pjoin(expr_dir, "predictions_for_swebench.json")
else:
swe_input_file = run_agent(
root_dir,
setup_result_dir,
expr_dir,
task_list_file_path,
model,
temperature,
enable_sbfl,
enable_layered,
enable_validation,
enable_angelic,
enable_perfect_angelic,
print_more,
conv_round_limit,
num_processes,
)

eval_start_time = datetime.now()
expr_eval_log_dir = run_swe_bench_eval(
expr_id, swe_bench_dir, swe_input_file, eval_log_dir
swe_input_file = run_agent(
root_dir,
setup_result_dir,
expr_dir,
task_list_file_path,
model,
temperature,
enable_sbfl,
enable_validation,
enable_angelic,
enable_perfect_angelic,
print_more,
conv_round_limit,
num_processes,
)
eval_end_time = datetime.now()

final_report_path = generate_report(
expr_dir, swe_bench_dir, swe_input_file, expr_eval_log_dir, model
)
if args.with_eval:
expr_eval_log_dir = run_swe_bench_eval(
expr_id, swe_bench_dir, swe_input_file, eval_log_dir
)

final_report_path = generate_report(
expr_dir, swe_bench_dir, swe_input_file, expr_eval_log_dir, model
)

generate_stats(expr_dir, eval_start_time.timestamp(), eval_end_time.timestamp())
generate_stats(expr_dir)

print(f"Experiment {expr_id} done. Final report is at {final_report_path}.")
print(f"Experiment {expr_id} done. SWE-bench eval input file is at {swe_input_file}.")

if running_combined:
create_separate_reports(expr_dir, final_report_path)
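Taken together, a hedged sketch of how the updated driver is used after this change (conf path as in the docs above; output lands under the configured experiment directory):

```
# Run ACR on SWE-bench-lite; predictions_for_swebench.json is written under
# <experiment_dir>/<id>/ and is the input to the external SWE-bench evaluation.
python scripts/run.py conf/vanilla-lite.conf

# Deprecated: additionally run the old in-script SWE-bench evaluation.
python scripts/run.py conf/vanilla-lite.conf --with-eval
```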
