-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_dataset.py
132 lines (118 loc) · 4.36 KB
/
build_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
import argparse
import logging
import os
import yaml
from typing import Optional
from datasets import Dataset, DatasetDict
from utils import (
generate_base_commit,
Repo,
)
# Configure the root logger once at import time so that every message emitted
# by this script carries a timestamp, the logger name and the severity.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)
def create_instance(
    repo: Repo, original_owner: str, base_branch_name: str, removal: str, raw_info: dict
) -> dict:
    """Build a single task instance (a plain dict) for one repository.

    Args:
    ----
        repo (Repo): repository handle; its ``owner``, ``name`` and ``commit``
            attributes are read here
        original_owner (str): the original owner of the repo
        base_branch_name (str): forwarded unchanged to ``generate_base_commit``
        removal (str): forwarded unchanged to ``generate_base_commit``
        raw_info (dict): configuration of the repository. ``src_dir``,
            ``test_dir``, ``specification``, ``python``, ``install`` and
            ``test_cmd`` are required; ``pre_install``, ``packages`` and
            ``pip_packages`` are copied only when present

    Returns:
    -------
        dict: the task instance, with keys
            repo (str): "<repo.owner>/<repo.name>",
            original_repo (str): "<original_owner>/<repo.name>",
            base_commit: value returned by ``generate_base_commit``
                (SHA of the base commit for starter repo),
            reference_commit: ``repo.commit``
                (SHA of the commit for setting up environment),
            setup (dict): python / install / specification plus any of the
                optional install-related entries found in ``raw_info``,
            test (dict): ``test_cmd`` and ``test_dir``,
            src_dir: source directory taken from ``raw_info``

    """
    mandatory_setup_keys = ("python", "install", "specification")
    optional_setup_keys = ("pre_install", "packages", "pip_packages")

    base_commit = generate_base_commit(
        repo,
        raw_info["src_dir"],
        raw_info["test_dir"],
        spec_url=raw_info["specification"],
        base_branch_name=base_branch_name,
        removal=removal,
    )

    # Mandatory entries first, then optional ones, so the key order of the
    # resulting mapping is stable regardless of how raw_info is ordered.
    setup = {key: raw_info[key] for key in mandatory_setup_keys}
    setup.update(
        {key: raw_info[key] for key in optional_setup_keys if key in raw_info}
    )

    instance = {
        "repo": f"{repo.owner}/{repo.name}",
        "original_repo": f"{original_owner}/{repo.name}",
        "base_commit": base_commit,
        "reference_commit": repo.commit,
        "setup": setup,
        "test": {"test_cmd": raw_info["test_cmd"], "test_dir": raw_info["test_dir"]},
        "src_dir": raw_info["src_dir"],
    }
    return instance
def main(
    repo_file: str,
    hf_name: str,
    organization: str,
    base_branch_name: str,
    removal: str,
    token: Optional[str] = None,
) -> None:
    """Main thread for creating task instances from existing repositories

    Args:
    ----
        repo_file (str): path to repository YAML file
        hf_name (str): where to upload the dataset; the removal strategy is
            appended, so the dataset is pushed as "<hf_name>_<removal>"
        organization (str): under which organization to fork repos to
        base_branch_name (str): base of the branch name under which the base commit will be sent to
        removal (str): strategy to remove code body
        token (str): GitHub token; falls back to the GITHUB_TOKEN environment
            variable when not given

    Raises:
    ------
        ValueError: if the YAML file does not contain a mapping, or if an entry
            does not specify exactly one of ``tag`` and ``commit``

    """
    if token is None:
        # Get GitHub token from environment variable if not provided
        token = os.environ.get("GITHUB_TOKEN")

    with open(repo_file, "r", encoding="utf-8") as f:
        repo_specs = yaml.safe_load(f)
    # An empty file parses to None; fail with a clear message instead of an
    # AttributeError on ``None.items()``.
    if not isinstance(repo_specs, dict):
        raise ValueError(
            f"{repo_file} must contain a mapping of repository entries, "
            f"got {type(repo_specs).__name__}"
        )

    examples = []
    for info in repo_specs.values():
        logger.info("Working on %s", info["name"])

        # Exactly one of tag/commit may be provided. This is an explicit check
        # rather than an assert so it still runs under ``python -O``.
        tag = info.get("tag")
        commit = info.get("commit")
        if (tag is None) == (commit is None):
            raise ValueError(
                f"{info['name']}: exactly one of 'tag' or 'commit' must be set"
            )
        if tag is not None:
            # Tags are passed on with a "tags/" prefix.
            head = tag if tag.startswith("tags/") else "tags/" + tag
        else:
            head = commit

        owner, repo_name = info["name"].split("/")
        repo = Repo(owner, repo_name, organization=organization, head=head, token=token)
        # Create task instance
        examples.append(create_instance(repo, owner, base_branch_name, removal, info))

    ds = DatasetDict({"test": Dataset.from_list(examples)})
    hf_name = f"{hf_name}_{removal}"
    ds.push_to_hub(hf_name, private=True)
if __name__ == "__main__":
    # Command-line entry point: every parsed option maps one-to-one onto a
    # keyword argument of main().
    parser = argparse.ArgumentParser()
    parser.add_argument("repo_file", type=str, help="Path to repository YAML file")
    # Required: without it the dataset would be pushed under the literal name
    # "None_<removal>".
    parser.add_argument("--hf_name", type=str, required=True, help="HF dataset name")
    parser.add_argument(
        "--organization",
        type=str,
        default="commit-0",
        help="under which organization to fork repos to",
    )
    # When omitted, main() falls back to the GITHUB_TOKEN environment variable.
    parser.add_argument("--token", type=str, help="GitHub token")
    parser.add_argument(
        "--base_branch_name",
        type=str,
        default="commit0",
        help="base of the branch name under which the base commit will be sent to",
    )
    parser.add_argument("--removal", type=str, default="all", help="Removal method")
    args = parser.parse_args()
    main(**vars(args))