diff --git a/.gitignore b/.gitignore
index e0582af..9062184 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,4 +167,4 @@ config.yml
 hydra_outputs/
 .commit0*
 .agent*
-docs/analysis_*.md
\ No newline at end of file
+docs/analysis*.md
\ No newline at end of file
diff --git a/docs/analysis.md b/docs/analysis.md
deleted file mode 100644
index bc2a8f9..0000000
--- a/docs/analysis.md
+++ /dev/null
@@ -1,6 +0,0 @@
-
-| | Name | Summary | |
-|--|--------|----------|--|
-||[reference](/analysis_reference)|3628 / 33 ; duration: 18.66s||
-||[test-save-commit0](/analysis_test-save-commit0)|0 / 0 ; duration: 0.00s||
-||[model_name-claude-3-5-sonnet-20240620__run_tests-0__use_lint_info-0__use_spec_info-0](/analysis_model_name-claude-3-5-sonnet-20240620__run_tests-0__use_lint_info-0__use_spec_info-0)|0 / 0 ; duration: 0.00s||
\ No newline at end of file
diff --git a/docs/javascripts/tablesort.js b/docs/javascripts/tablesort.js
new file mode 100644
index 0000000..c916015
--- /dev/null
+++ b/docs/javascripts/tablesort.js
@@ -0,0 +1,6 @@
+document$.subscribe(function() {
+    var tables = document.querySelectorAll("article table:not([class])")
+    tables.forEach(function(table) {
+      new Tablesort(table)
+    })
+  })
\ No newline at end of file
diff --git a/docs/render_submissions.py b/docs/render_submissions.py
index 799bb16..e2c95c2 100644
--- a/docs/render_submissions.py
+++ b/docs/render_submissions.py
@@ -2,15 +2,19 @@
 import os
 import glob
 import ast
-from datasets import load_dataset
+import subprocess
 import json
 import shutil
 import argparse
+import pypdf
+import tqdm
+
+from datasets import load_dataset
 from transformers import AutoTokenizer
+
 from commit0.harness.constants import SPLIT
 from commit0.harness.utils import clone_repo
 from commit0.cli import write_commit0_dot_file
-import pypdf
 
 import logging
 
@@ -35,26 +39,34 @@ def get_pytest_info(path_to_logs, repo_name, branch_name):
             }
             report_file_path = os.path.join(path_to_logs, pytest_hash, "report.json")
             if not os.path.exists(report_file_path):
-                reason_for_failure = open(
+                if os.path.exists(
                     os.path.join(path_to_logs, pytest_hash, "test_output.txt")
-                ).read()
+                ):
+                    reason_for_failure = open(
+                        os.path.join(path_to_logs, pytest_hash, "test_output.txt")
+                    ).read()
+                else:
+                    reason_for_failure = "Unknown failure."
                 pytest_info[testname]["failed_to_run"] = reason_for_failure
                 return pytest_info
             pytest_report = json.load(open(report_file_path))
             pytest_summary = pytest_report["summary"]
             pytest_info[testname]["summary"] = pytest_summary
+            # TODO this is a hacky fix, should eventually do a check against true num collected
+            if pytest_summary["collected"] < 5:
+                reason_for_failure = "Pytest collection failure."
+                pytest_info[testname]["failed_to_run"] = reason_for_failure
+                return pytest_info
             pytest_info[testname]["duration"] = pytest_report["duration"]
             if "passed" not in pytest_summary:
                 pytest_summary["passed"] = 0
             for test in pytest_report["tests"]:
-                if test["outcome"] == "passed":
+                if test["outcome"] in {"passed", "skipped"}:
                     continue
                 if "longrepr" in test:
                     failure_string = test["longrepr"]
                 elif "???" in test:
                     failure_string = test["???"]["longrepr"]
-                elif test["outcome"] == "error":
-                    failure_string = test["setup"]["longrepr"]
                 elif "setup" in test and "longrepr" in test["setup"]:
                     failure_string = test["setup"]["longrepr"]
                 elif "call" in test and "longrepr" in test["call"]:
@@ -76,17 +88,6 @@ def get_pytest_info(path_to_logs, repo_name, branch_name):
 
 
 def get_coverage_info(path_to_logs, repo_name, branch_name):
-    # coverage_fp = open(os.path.join(path_to_logs, pytest_hash, "coverage.json"))
-    # for filename, file_coverage in json.load(coverage_fp)["files"].items():
-    #     if not any(relevant_function.startswith(filename) for relevant_function in relevant_functions):
-    #         continue
-    #     for funcname, func_coverage in file_coverage["functions"].items():
-    #         if f"{filename}::{funcname}" not in relevant_functions: continue
-    #         pycov_info[testname][f"{filename}::{funcname}"] = {
-    #             "implementation": submission_info["function_impls"][f"{filename}::{funcname}"],
-    #             "executed_lines": func_coverage["executed_lines"],
-    #             "executed_branches": func_coverage["executed_branches"]
-    #         }
     raise NotImplementedError
 
 
@@ -119,7 +120,7 @@ def get_blank_repo_metrics(
             print(f"{e}: Trouble opening {filename}")
             continue
 
-        filename = filename[len(blank_source_code_folder):].lstrip(" /")
+        filename = filename[len(blank_source_code_folder) :].lstrip(" /")
         try:
             code_tree = ast.parse(code)
         except Exception as e:
@@ -162,145 +163,212 @@ def get_blank_repo_metrics(
     return blank_repo_metrics
 
 
-def render_mds(subfolder="docs"):
-    all_submissions = {}
+leaderboard_header = """\n\n## Leaderboard ({split})
+| Name | Repos Resolved (/{num_repos}) | Total Tests Passed (/{total_num_tests}) | Test Duration (s) | Date | Analysis | Github |
+|------|:-------------------------:|:--------------------:|:--------------------:|:----------:|----|----|
+"""
-    method_repo_pytests = {}
-    for branch_name in glob.glob(os.path.join(analysis_files_path, "*")):
-        branch_name = os.path.basename(branch_name)
-        if branch_name in {"blank", "repos", "submission_repos"}:
-            continue
-        all_submissions[branch_name] = {}
-        for repo_file in glob.glob(
-            os.path.join(analysis_files_path, branch_name, "*.json")
-        ):
+submission_table_header = """# Submission Name: **{display_name}** (split: {split})
 
-            repo_metrics_output_file = os.path.join(
-                analysis_files_path, branch_name, repo_file
-            )
-            repo_metrics = json.load(open(repo_metrics_output_file))
-            repo_name = os.path.basename(repo_file[: -len(".json")])
-
-            all_submissions[branch_name][repo_name] = {}
-
-            method_repo_pytests[
-                f"{branch_name}_{repo_name}"
-            ] = f"# Submission Name: {branch_name}\n# Repository: {repo_name}"
-            if "pytest_results" in repo_metrics:
-                repo_metrics = repo_metrics["pytest_results"]
-            for pytest_group, pytest_info in repo_metrics.items():
-                pytest_group = os.path.basename(pytest_group.strip("/"))
-                patch_diff = (
-                    f"""\n\n### Patch diff\n```diff\n{pytest_info['patch_diff']}```"""
-                )
-                if "failed_to_run" in pytest_info:
-                    all_submissions[branch_name][repo_name][pytest_group] = {
-                        "failed_to_run": pytest_info["failed_to_run"]
-                    }
-                    method_repo_pytests[
-                        f"{branch_name}_{repo_name}"
-                    ] += f"""\n## Failed to run pytests\n```\n{pytest_info['failed_to_run']}\n```"""
-                else:
-                    all_submissions[branch_name][repo_name][pytest_group] = {
-                        "summary": pytest_info["summary"],
-                        "duration": pytest_info["duration"],
-                    }
-                    method_repo_pytests[
-                        f"{branch_name}_{repo_name}"
-                    ] += f"""\n## Pytest Summary: {pytest_group}
+| Repository | Resolved | Pass Rate | Test Duration (s) | Analysis | Github Link |
+|------------|---------|:-----:|:-----:|-----|-----|"""
+
+pytest_summary_table_header = """\n## Pytest Summary for test `{pytest_group}`
 | status   | count |
 |:---------|:-----:|
 """
-                    for category, count in pytest_info["summary"].items():
-                        if category not in {"duration"}:
-                            method_repo_pytests[
-                                f"{branch_name}_{repo_name}"
-                            ] += f"""| {category} | {count} |\n"""
-                        else:
-                            method_repo_pytests[
-                                f"{branch_name}_{repo_name}"
-                            ] += f"""| {category} | {float(count):.2f}s |\n"""
-
-                    method_repo_pytests[
-                        f"{branch_name}_{repo_name}"
-                    ] += f"\n## Failed pytest outputs: {pytest_group}\n\n"
-                    for testname, failure in pytest_info["failures"].items():
-                        shortened_testname = os.path.basename(testname)
-                        method_repo_pytests[f"{branch_name}_{repo_name}"] += (
-                            f"### {shortened_testname}\n\n<details><summary> {shortened_testname}</summary>"
-                            f"<pre>\n{failure['failure_string']}\n</pre>\n</details>\n"
-                        )
-            back_button = f"[back to {branch_name} summary]({f'analysis_{branch_name}'})\n\n"
-            with open(
-                os.path.join(subfolder, f"analysis_{branch_name}_{repo_name}.md"), "w"
-            ) as wf:
-                wf.write(
-                    back_button
-                    + method_repo_pytests[f"{branch_name}_{repo_name}"]
-                    + patch_diff
+
+def render_mds(overwrite_previous, subfolder="docs"):
+    leaderboard = {}
+
+    split_to_total_tests = {
+        "lite": 3628,
+        "all": 140926,
+    }  # hard-coded to skip running it later
+    for split in tqdm.tqdm(["lite", "all"]):
+        num_repos = len(SPLIT[split])
+        # total_num_tests = 0
+        # for repo_name in SPLIT[split]:
+        #     repo_tests = subprocess.run(['commit0', 'get-tests', repo_name], capture_output=True, text=True).stdout.strip()
+        #     total_num_tests += len(repo_tests.splitlines())
+        leaderboard[split] = leaderboard_header.format(
+            split=split,
+            num_repos=num_repos,
+            total_num_tests=split_to_total_tests[split],
+        )
+
+    for org_path in tqdm.tqdm(glob.glob(os.path.join(analysis_files_path, "*"))):
+        org_name = os.path.basename(org_path)
+        if org_name in {"blank", "repos", "submission_repos"}:
+            continue
+        for branch_path in glob.glob(os.path.join(org_path, "*.json")):
+            cum_tests_passed = 0
+            repos_resolved = 0
+            total_duration = 0.0
+            branch_metrics = json.load(open(branch_path))
+            submission_info = branch_metrics["submission_info"]
+            split = submission_info["split"]
+            org_name = submission_info["org_name"]
+            project_page_link = submission_info["project_page"]
+            display_name = submission_info["display_name"]
+            submission_date = submission_info["submission_date"]
+            branch_name = submission_info["branch"]
+            org_branch_filepath = os.path.join(
+                subfolder, f"analysis_{org_name}_{branch_name}.md"
+            )
+            write_submission = True
+            if os.path.exists(org_branch_filepath) and not overwrite_previous:
+                write_submission = False
+
+            if write_submission:
+                submission_page = submission_table_header.format(
+                    display_name=display_name, split=split
                 )
-    # Render general page. Has buttons to all methods
-    leaderboard = """
-| | Name | Summary | |
-|--|--------|----------|--|"""
-    # Render method page. Per method, buttons to all repos.
-    method_to_repos = {}
-    # Render method & repo page. Has "back" button.
-    for branch_name, branch_info in all_submissions.items():
-        cum_pytests = {"passed": 0}
-        method_to_repos[branch_name] = """
-| | Repository | Summary | |
-|-|------------|---------|-|"""
-        total_duration = 0.0
-        for repo_name, repo_test_info in branch_info.items():
-            for testname, test_info in repo_test_info.items():
-                if "failed_to_run" in test_info:
-                    summary_pytests_string = "failure"
-                else:
-                    total_duration += test_info["duration"]
-                    summary_pytests_string = (
-                        f"`{testname}`: {test_info['summary']['passed']} / "
-                        f"{test_info['summary']['collected']} ; duration: {test_info['duration']:.2f}s"
+            for repo_name, repo_pytest_results in branch_metrics.items():
+                if repo_name == "submission_info":
+                    continue
+                if write_submission:
+                    submission_repo_page = f"# **{display_name}**: {repo_name}"
+                    org_branch_repo_filepath = os.path.join(
+                        subfolder, f"analysis_{org_name}_{branch_name}_{repo_name}.md"
                     )
-                    for category, count in test_info["summary"].items():
-                        if category not in cum_pytests:
-                            cum_pytests[category] = 0
-                        if isinstance(count, int):
-                            cum_pytests[category] += int(count)
-                        elif isinstance(count, float):
-                            cum_pytests[category] += float(count)
-                method_to_repos[branch_name] += (
-                    f"\n||[{repo_name}]({f'analysis_{branch_name}_{repo_name}'})|"
-                    f"{summary_pytests_string}||"
-                )
-                break  # assume we ran all tests. will add functionality for checking diff tests later, as we need it.
-        summary_pytests_string = (
-            f"{cum_pytests['passed']} / {cum_pytests['collected']} ; duration: {total_duration:.2f}s"
-        )
-        leaderboard += f"\n||[{branch_name}]({f'analysis_{branch_name}'})|{summary_pytests_string}||"
-        back_button = f"[back to all submissions]({f'analysis'})\n\n"
-        with open(os.path.join(subfolder, f"analysis_{branch_name}.md"), "w") as wf:
-            wf.write(back_button + "\n" + method_to_repos[branch_name])
-    with open(os.path.join(subfolder, "analysis.md"), "w") as wf:
-        wf.write(leaderboard)
+                if isinstance(repo_pytest_results, str):
+                    submission_repo_page = f"# **{display_name}**: {repo_name}\n\n## Failed to clone\n\n{repo_pytest_results}"
+                    org_branch_repo_filepath = os.path.join(
+                        subfolder, f"analysis_{org_name}_{branch_name}_{repo_name}.md"
+                    )
+                    github_hyperlink = (
+                        f"{project_page_link}/{repo_name}/tree/{branch_name}"
+                    )
+                    if branch_name == "reference":
+                        github_hyperlink = f"{project_page_link}/{repo_name}"
+                    submission_page = submission_table_header.format(
+                        display_name=display_name, split=split
+                    ) + (
+                        f"\n| {repo_name} | No; Failed to clone. | - | - | "
+                        f"[Analysis](/{f'analysis_{org_name}_{branch_name}_{repo_name}'}) | "
+                        f"[Github]({github_hyperlink}) |"
+                    )
+                    back_button = f"[back to {display_name} summary](/{f'analysis_{org_name}_{branch_name}'})\n\n"
+                    with open(org_branch_repo_filepath, "w") as wf:
+                        wf.write(back_button + submission_repo_page)
+                    continue
+
+                for pytest_group, pytest_info in repo_pytest_results.items():
+                    pytest_group = os.path.basename(pytest_group.strip("/"))
+                    patch_diff = f"""\n\n## Patch diff\n```diff\n{pytest_info['patch_diff']}```"""
+                    if "failed_to_run" in pytest_info:
+                        resolved = False
+                        if write_submission:
+                            submission_repo_page += (
+                                f"\n## Failed to run pytests for test `{pytest_group}`\n"
+                                f"```\n{pytest_info['failed_to_run']}\n```"
+                            )
+                        pytest_details = "Pytest failed"
+                        duration = "Failed."
+                    else:
+                        resolved = False
+                        if "passed" in pytest_info["summary"]:
+                            if "skipped" in pytest_info["summary"]:
+                                resolved = pytest_info["summary"]["passed"] + pytest_info["summary"]["skipped"] == pytest_info["summary"]["total"]
+                            else:
+                                resolved = pytest_info["summary"]["passed"] == pytest_info["summary"]["total"]
+                        if write_submission:
+                            submission_repo_page += pytest_summary_table_header.format(
+                                pytest_group=pytest_group
+                            )
+                            for category, count in pytest_info["summary"].items():
+                                if category not in {"duration"}:
+                                    submission_repo_page += (
+                                        f"""| {category} | {count} |\n"""
+                                    )
+                                else:
+                                    submission_repo_page += (
+                                        f"""| {category} | {float(count):.2f}s |\n"""
+                                    )
+
+                            submission_repo_page += "\n## Failed pytests:\n\n"
+                            for testname, failure in pytest_info["failures"].items():
+                                shortened_testname = os.path.basename(testname)
+                                submission_repo_page += (
+                                    f"### {shortened_testname}\n\n<details><summary> {shortened_testname}</summary>"
+                                    f"<pre>\n{failure['failure_string']}\n</pre>\n</details>\n"
+                                )
+                        cum_tests_passed += pytest_info["summary"]["passed"]
+                        total_duration += pytest_info["duration"]
+                        repos_resolved += int(resolved)
+                        if write_submission:
+                            pytest_details = f"{pytest_info['summary']['passed']} / {pytest_info['summary']['total']}"
+                            duration = f"{pytest_info['duration']:.2f}"
+                    break
+                if write_submission:
+                    github_hyperlink = (
+                        f"{project_page_link}/{repo_name}/tree/{branch_name}"
+                    )
+                    if branch_name == "reference":
+                        github_hyperlink = f"{project_page_link}/{repo_name}"
+                    submission_page += (
+                        f"\n| {repo_name} | {'Yes' if resolved else 'No'} | {pytest_details} | "
+                        f"{duration} | [Analysis](/{f'analysis_{org_name}_{branch_name}_{repo_name}'}) | "
+                        f"[Github]({github_hyperlink}) |"
+                    )
+                    back_button = f"[back to {display_name} summary](/{f'analysis_{org_name}_{branch_name}'})\n\n"
+                    with open(org_branch_repo_filepath, "w") as wf:
+                        wf.write(back_button + submission_repo_page + patch_diff)
+            if write_submission:
+                back_button = f"[back to all submissions](/{f'analysis'})\n\n"
+                with open(org_branch_filepath, "w") as wf:
+                    wf.write(back_button + "\n" + submission_page)
+            analysis_link = f"[Analysis](/{f'analysis_{org_name}_{branch_name}'})"
+            github_link = f"[Github]({project_page_link})"
+            leaderboard[split] += (
+                f"\n|{display_name}|"
+                f"{repos_resolved}|"
+                f"{cum_tests_passed}|"
+                f"{total_duration:.2f}|"
+                f"{submission_date}|"
+                f"{analysis_link}|"
+                f"{github_link}|"
+            )
+
+    leaderboard_filepath = os.path.join(subfolder, "analysis.md")
+    with open(leaderboard_filepath, "w") as wf:
+        wf.write(leaderboard["lite"] + "\n\n" + leaderboard["all"])
 
 
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--do_setup", action="store_true")
-    parser.add_argument("--get_blank_details", action="store_true")
-    parser.add_argument("--get_reference_details", action="store_true")
-    parser.add_argument("--keep_previous_eval", action="store_true")
-    parser.add_argument("--analyze_submissions", action="store_true")
+    parser.add_argument(
+        "--do_setup", action="store_true", help="Run commit0 setup with specified split"
+    )
+    parser.add_argument(
+        "--get_blank_details",
+        action="store_true",
+        help="Get difficulty metrics of blank repository",
+    )
+    parser.add_argument(
+        "--get_reference_details",
+        action="store_true",
+        help="Get pytest results from reference",
+    )
+    parser.add_argument(
+        "--analyze_submissions",
+        action="store_true",
+        help="Get pytest results from submissions with split",
+    )
     parser.add_argument("--render_webpages", action="store_true")
-
-    parser.add_argument("--split", type=str, default="lite")
+    parser.add_argument("--split", type=str, help="all or lite")
     parser.add_argument(
         "--tokenizer_name", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct"
    )
+    parser.add_argument(
+        "--overwrite_previous_eval",
+        action="store_true",
+        help="Overwrite cached pytest info"
+        # TODO add finer granularity so can specify which ones to overwrite
+    )
     return parser.parse_args()
@@ -320,14 +388,14 @@ def main(args):
             f"--commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml"
         )
         branch_name = "blank"
-        if not args.keep_previous_eval:
+        if args.overwrite_previous_eval:
             if os.path.exists(os.path.join(analysis_files_path, branch_name)):
                 shutil.rmtree(os.path.join(analysis_files_path, branch_name))
         os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True)
         tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
         for example in dataset:
             repo_name = example["repo"].split("/")[-1]
-            if args.split != "all" and repo_name not in SPLIT[args.split]:
+            if repo_name not in SPLIT[args.split]:
                 continue
 
             repo_metrics_output_file = os.path.join(
@@ -350,73 +418,114 @@ def main(args):
             json.dump(repo_metrics, open(repo_metrics_output_file, "w"), indent=4)
 
     if args.get_reference_details:
+        branch_name = "reference"
+        org_name = f"commit0_{args.split}"
+        commit0_dot_file_path = os.path.join(
+            analysis_files_path, "repos", org_name, branch_name, ".commit0.yaml"
+        )
+        submission_repos_path = os.path.join(
+            analysis_files_path, "repos", org_name, branch_name
+        )
         if args.do_setup:
             os.system(
-                f"commit0 setup {args.split} --base-dir {analysis_files_path}/repos "
-                f"--commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml"
+                f"commit0 setup {args.split} --base-dir {submission_repos_path} "
+                f"--commit0-dot-file-path {commit0_dot_file_path}"
             )
-        branch_name = "reference"
-        os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True)
-        if not args.keep_previous_eval:
-            for repo_log_path in glob.glob(f"{os.getcwd()}/logs/pytest/*"):
-                if os.path.exists(os.path.join(repo_log_path, branch_name)):
-                    shutil.rmtree(os.path.join(repo_log_path, branch_name))
-        os.system(
-            "commit0 evaluate --reference "
-            f"--commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml"
+        submission_metrics_output_file = os.path.join(
+            analysis_files_path, org_name, f"{branch_name}.json"
         )
+        submission_details = {
+            "submission_info": {
+                "org_name": org_name,
+                "branch": branch_name,
+                "display_name": "Reference (Gold)",
+                "submission_date": "NA",
+                "split": args.split,
+                "project_page": "https://github.com/commit-0",
+            }
+        }
+        os.makedirs(os.path.join(analysis_files_path, org_name), exist_ok=True)
+        need_re_eval = False
+        for repo_log_path in glob.glob(f"{os.getcwd()}/logs/pytest/*"):
+            if os.path.exists(os.path.join(repo_log_path, branch_name)):
+                if args.overwrite_previous_eval:
+                    shutil.rmtree(os.path.join(repo_log_path, branch_name))
+                else:
+                    need_re_eval = True
+        if args.overwrite_previous_eval or need_re_eval:
+            os.system(
+                "commit0 evaluate --reference "
+                f"--commit0-dot-file-path {commit0_dot_file_path}"
+            )
         # get coverage and pytest info for each repo
         for example in dataset:
             repo_name = example["repo"].split("/")[-1]
-            if args.split != "all" and repo_name not in SPLIT[args.split]:
+            if repo_name not in SPLIT[args.split]:
                 continue
-            repo_metrics_output_file = os.path.join(
-                analysis_files_path, branch_name, f"{repo_name}.json"
-            )
-
             path_to_logs = f"{os.getcwd()}/logs/pytest/{repo_name}/{branch_name}"
             pytest_results = get_pytest_info(path_to_logs, repo_name, branch_name)
-            json.dump(pytest_results, open(repo_metrics_output_file, "w"), indent=4)
+            submission_details[repo_name] = pytest_results
+        json.dump(
+            submission_details, open(submission_metrics_output_file, "w"), indent=4
+        )
+        print(f"Saved pytest info to {submission_metrics_output_file}")
 
     if args.analyze_submissions:
-        commit0_dot_file_path = os.path.join(
-            analysis_files_path, "submission_repos", ".commit0.yaml"
-        )
-        if not args.keep_previous_eval:
-            for subfolder in glob.glob(os.path.join(analysis_files_path, "*")):
-                if os.path.basename(subfolder.rstrip("/")) not in {"blank", "reference", "repos", "submission_repos"}:
-                    try:
-                        print(f"Clearing {subfolder}")
-                        shutil.rmtree(subfolder)
-                    except Exception as e:
-                        print(f"{e}: when removing {subfolder}")
-
-        for submission in submission_dataset:
-            branch_name = submission["name"]
-            os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True)
-            if not args.keep_previous_eval:
-                for repo_log_path in glob.glob(f"{os.getcwd()}/logs/pytest/*"):
-                    if os.path.exists(os.path.join(repo_log_path, branch_name)):
-                        shutil.rmtree(os.path.join(repo_log_path, branch_name))
+        for submission in tqdm.tqdm(submission_dataset):
+            submission_details = {"submission_info": submission}
+            branch_name = submission["branch"]
+            org_name = submission["org_name"]
+            split = submission["split"]
+            if split != args.split:
+                continue
+            submission_metrics_output_file = os.path.join(
+                analysis_files_path, org_name, f"{branch_name}.json"
+            )
+            if (
+                os.path.exists(submission_metrics_output_file)
+                and not args.overwrite_previous_eval
+            ):
+                continue
+            submission_repos_path = os.path.join(
+                analysis_files_path, "submission_repos", org_name, branch_name
+            )
+            if os.path.exists(submission_repos_path):
+                shutil.rmtree(submission_repos_path)
+            os.makedirs(os.path.join(analysis_files_path, org_name), exist_ok=True)
+            commit0_dot_file_path = os.path.join(
+                analysis_files_path,
+                "submission_repos",
+                org_name,
+                branch_name,
+                ".commit0.yaml",
+            )
+            for repo_log_path in glob.glob(f"{os.getcwd()}/logs/pytest/*"):
+                if os.path.exists(os.path.join(repo_log_path, branch_name)):
+                    shutil.rmtree(os.path.join(repo_log_path, branch_name))
             for example in dataset:
                 repo_name = example["repo"].split("/")[-1]
-                if args.split != "all" and repo_name not in SPLIT[args.split]:
+                if split != "all" and repo_name not in SPLIT[split]:
                     continue
-                clone_url = f"https://github.com/test-save-commit0/{repo_name}.git"
+                clone_url = f"https://github.com/{org_name}/{repo_name}.git"
                 clone_dir = os.path.abspath(
-                    os.path.join(analysis_files_path, "submission_repos", repo_name)
+                    os.path.join(submission_repos_path, repo_name)
                 )
-                clone_repo(clone_url, clone_dir, branch_name, logger)
+                try:
+                    clone_repo(clone_url, clone_dir, branch_name, logger)
+                except Exception as e:
+                    submission_details[repo_name] = f"Error cloning: {e}"
+                    if os.path.exists(clone_dir):
+                        shutil.rmtree(clone_dir)
             # after successfully setup, write the commit0 dot file
             write_commit0_dot_file(
                 commit0_dot_file_path,
                 {
                     "dataset_name": commit0_dataset_name,
                     "dataset_split": "test",
-                    "repo_split": args.split,
-                    "base_dir": os.path.join(analysis_files_path, "submission_repos"),
+                    "repo_split": split,
+                    "base_dir": submission_repos_path,
                 },
             )
             # run pytests
@@ -426,22 +535,21 @@ def main(args):
             )
             for example in dataset:
                 repo_name = example["repo"].split("/")[-1]
-                if args.split != "all" and repo_name not in SPLIT[args.split]:
+                if split != "all" and repo_name not in SPLIT[split]:
+                    continue
+                if repo_name in submission_details:  # Failed to clone earlier, skip.
                     continue
-
-                repo_metrics_output_file = os.path.join(
-                    analysis_files_path, branch_name, f"{repo_name}.json"
-                )
-
                 path_to_logs = f"{os.getcwd()}/logs/pytest/{repo_name}/{branch_name}"
                 pytest_results = get_pytest_info(path_to_logs, repo_name, branch_name)
-                json.dump(pytest_results, open(repo_metrics_output_file, "w"), indent=4)
+                submission_details[repo_name] = pytest_results
+            json.dump(
+                submission_details, open(submission_metrics_output_file, "w"), indent=4
+            )
+            print(f"Saved pytest info to {submission_metrics_output_file}")
 
-    if not args.keep_previous_eval:
-        for analysis_file in glob.glob("docs/analysis*.md"):
-            os.unlink(analysis_file)
     if args.render_webpages:
-        render_mds()
+        # Render only updated leaderboard and new submissions
+        render_mds(args.overwrite_previous_eval)
 
 
 main(get_args())
diff --git a/docs/update_submissions_dataset.py b/docs/update_submissions_dataset.py
new file mode 100644
index 0000000..134847e
--- /dev/null
+++ b/docs/update_submissions_dataset.py
@@ -0,0 +1,12 @@
+from datasets import Dataset
+
+submissions = {
+    "org_name": ["test-save-commit0", "commit0-lite-with-test", "commit0-lite-plain", "commit0-all-plain"],
+    "branch": ["baseline", "fillin", "fillin", "fillin"],
+    "display_name": ["Claude Sonnet 3.5 - Base", "Claude Sonnet 3.5 - Fill-in + Unit Test Feedback", "Claude Sonnet 3.5 - Fill-in", "Claude Sonnet 3.5 - Fill-in"],
+    "submission_date": ["09/25/2024", "09/25/2024", "09/25/2024", "09/25/2024"],
+    "split": ["lite", "lite", "lite", "all"],
+    "project_page": ["https://github.com/test-save-commit0", "https://github.com/commit0-lite-with-test", "https://github.com/commit0-lite-plain", "https://github.com/commit0-all-plain"]
+}
+
+Dataset.from_dict(submissions).push_to_hub("celinelee/commit0_submissions")
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 5c5f1f9..2a7d83b 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -6,7 +6,7 @@ nav:
   - Commit0: setupdist.md
   - Agent: agent.md
   - API: api.md
-  - Submission Analysis: analysis.md
+  - Leaderboard: analysis.md
 theme:
   name: material
   logo: "logo2.webp"
@@ -19,3 +19,7 @@ markdown_extensions:
   - pymdownx.snippets
   - pymdownx.highlight
   - pymdownx.superfences
+
+extra_javascript:
+  - https://unpkg.com/tablesort@5.3.0/dist/tablesort.min.js
+  - javascripts/tablesort.js
\ No newline at end of file