Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update docker support for Nemo/Nccl testing #18

Merged
merged 4 commits into from
May 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,17 @@ def gen_exec_command(
srun_command = self._generate_srun_command(slurm_args, final_env_vars, final_cmd_args, extra_cmd_args)
return self._write_sbatch_script(slurm_args, env_vars_str, srun_command, output_path)

def get_docker_image_path(self, cmd_args: Dict[str, str]) -> str:
if os.path.isfile(cmd_args["docker_image_url"]):
image_path = cmd_args["docker_image_url"]
else:
image_path = os.path.join(
self.install_path,
NcclTestSlurmInstallStrategy.SUBDIR_PATH,
NcclTestSlurmInstallStrategy.DOCKER_IMAGE_FILENAME,
)
return image_path

def _parse_slurm_args(
self,
job_name_prefix: str,
Expand All @@ -69,11 +80,7 @@ def _parse_slurm_args(
) -> Dict[str, Any]:
base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, nodes)

image_path = os.path.join(
self.install_path,
NcclTestSlurmInstallStrategy.SUBDIR_PATH,
NcclTestSlurmInstallStrategy.DOCKER_IMAGE_FILENAME,
)
image_path = self.get_docker_image_path(cmd_args)

container_mounts = ""
if "NCCL_TOPO_FILE" in env_vars and "DOCKER_NCCL_TOPO_FILE" in env_vars:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,9 @@ def gen_exec_command(
nodes = self.slurm_system.parse_nodes(nodes)
if nodes:
self.final_cmd_args["training.trainer.num_nodes"] = str(len(nodes))
self.final_cmd_args["container"] = self.final_cmd_args["docker_image_url"]

self.set_container_arg()

del self.final_cmd_args["repository_url"]
del self.final_cmd_args["repository_commit_hash"]
del self.final_cmd_args["docker_image_url"]
Expand All @@ -102,6 +104,16 @@ def gen_exec_command(

return full_cmd.strip()

def set_container_arg(self) -> None:
if os.path.isfile(self.final_cmd_args["docker_image_url"]):
self.final_cmd_args["container"] = self.final_cmd_args["docker_image_url"]
else:
self.final_cmd_args["container"] = os.path.join(
self.install_path,
NeMoLauncherSlurmInstallStrategy.SUBDIR_PATH,
NeMoLauncherSlurmInstallStrategy.DOCKER_IMAGE_FILENAME,
)

def _handle_special_keys(self, key: str, value: Any, launcher_path: str, output_path: str) -> Any:
"""
Handles special formatting for specific keys.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,11 @@ def is_installed(self) -> bool:
docker_image_path = os.path.join(subdir_path, self.DOCKER_IMAGE_FILENAME)
repo_path = os.path.join(subdir_path, self.REPOSITORY_NAME)
repo_installed = os.path.isdir(repo_path)
docker_image_installed = os.path.isfile(docker_image_path)

if not os.path.isfile(self.docker_image_url):
docker_image_installed = os.path.isfile(docker_image_path)
else:
docker_image_installed = True

data_dir_path = self.default_cmd_args["data_dir"]
datasets_ready = self._check_datasets_on_nodes(data_dir_path)
Expand Down Expand Up @@ -142,7 +146,8 @@ def install(self) -> None:
)

self._clone_repository(subdir_path)
self._setup_docker_image(self.slurm_system, subdir_path)
if not os.path.isfile(self.docker_image_url):
self._setup_docker_image(self.slurm_system, subdir_path)

def _check_install_path_access(self):
"""
Expand Down
58 changes: 55 additions & 3 deletions tests/test_slurm_command_gen_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,26 @@
from cloudai.schema.system import SlurmSystem
from cloudai.schema.system.slurm import SlurmNode, SlurmNodeState
from cloudai.schema.system.slurm.strategy import SlurmCommandGenStrategy
from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy
from cloudai.schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy


@pytest.fixture
def strategy_fixture() -> SlurmCommandGenStrategy:
def slurm_system(tmp_path: Path) -> SlurmSystem:
slurm_system = SlurmSystem(
name="TestSystem",
install_path="/path/to/install",
output_path="/path/to/output",
install_path=str(tmp_path / "install"),
output_path=str(tmp_path / "output"),
default_partition="main",
partitions={"main": [SlurmNode(name="node1", partition="main", state=SlurmNodeState.IDLE)]},
)
Path(slurm_system.install_path).mkdir()
Path(slurm_system.output_path).mkdir()
return slurm_system


@pytest.fixture
def strategy_fixture(slurm_system: SlurmSystem) -> SlurmCommandGenStrategy:
env_vars = {"TEST_VAR": "VALUE"}
cmd_args = {"test_arg": "test_value"}
strategy = SlurmCommandGenStrategy(slurm_system, env_vars, cmd_args)
Expand Down Expand Up @@ -42,3 +51,46 @@ def test_filename_generation(strategy_fixture: SlurmCommandGenStrategy, tmp_path

# Check the correctness of the sbatch command format
assert sbatch_command == f"sbatch {filepath_from_command}"


class TestNcclTestSlurmCommandGenStrategy__GetDockerImagePath:
@pytest.fixture
def nccl_slurm_cmd_gen_strategy_fixture(self, slurm_system: SlurmSystem) -> NcclTestSlurmCommandGenStrategy:
env_vars = {"TEST_VAR": "VALUE"}
cmd_args = {"test_arg": "test_value"}
strategy = NcclTestSlurmCommandGenStrategy(slurm_system, env_vars, cmd_args)
return strategy

def test_cmd_arg_file_doesnt_exist(self, nccl_slurm_cmd_gen_strategy_fixture: NcclTestSlurmCommandGenStrategy):
cmd_args = {"docker_image_url": f"{nccl_slurm_cmd_gen_strategy_fixture.install_path}/docker_image"}
image_path = nccl_slurm_cmd_gen_strategy_fixture.get_docker_image_path(cmd_args)
assert image_path == f"{nccl_slurm_cmd_gen_strategy_fixture.install_path}/nccl-test/nccl_test.sqsh"

def test_cmd_arg_file_exists(self, nccl_slurm_cmd_gen_strategy_fixture: NcclTestSlurmCommandGenStrategy):
cmd_args = {"docker_image_url": f"{nccl_slurm_cmd_gen_strategy_fixture.install_path}/docker_image"}
Path(cmd_args["docker_image_url"]).touch()
image_path = nccl_slurm_cmd_gen_strategy_fixture.get_docker_image_path(cmd_args)
assert image_path == cmd_args["docker_image_url"]


class TestNeMoLauncherSlurmCommandGenStrategy__SetContainerArg:
@pytest.fixture
def nemo_cmd_gen(self, slurm_system: SlurmSystem) -> NeMoLauncherSlurmCommandGenStrategy:
env_vars = {"TEST_VAR": "VALUE"}
cmd_args = {"test_arg": "test_value"}
strategy = NeMoLauncherSlurmCommandGenStrategy(slurm_system, env_vars, cmd_args)
return strategy

def test_docker_image_url_is_not_file(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy):
nemo_cmd_gen.final_cmd_args["docker_image_url"] = f"{nemo_cmd_gen.install_path}/docker_image"
nemo_cmd_gen.set_container_arg()
assert (
nemo_cmd_gen.final_cmd_args["container"]
== f"{nemo_cmd_gen.install_path}/NeMo-Megatron-Launcher/nemo_megatron_launcher.sqsh"
)

def test_docker_image_url_is_file(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy):
nemo_cmd_gen.final_cmd_args["docker_image_url"] = f"{nemo_cmd_gen.install_path}/docker_image"
Path(nemo_cmd_gen.final_cmd_args["docker_image_url"]).touch()
nemo_cmd_gen.set_container_arg()
assert nemo_cmd_gen.final_cmd_args["container"] == nemo_cmd_gen.final_cmd_args["docker_image_url"]