diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py index 0198332f..47d86e01 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py @@ -60,6 +60,17 @@ def gen_exec_command( srun_command = self._generate_srun_command(slurm_args, final_env_vars, final_cmd_args, extra_cmd_args) return self._write_sbatch_script(slurm_args, env_vars_str, srun_command, output_path) + def get_docker_image_path(self, cmd_args: Dict[str, str]) -> str: + if os.path.isfile(cmd_args["docker_image_url"]): + image_path = cmd_args["docker_image_url"] + else: + image_path = os.path.join( + self.install_path, + NcclTestSlurmInstallStrategy.SUBDIR_PATH, + NcclTestSlurmInstallStrategy.DOCKER_IMAGE_FILENAME, + ) + return image_path + def _parse_slurm_args( self, job_name_prefix: str, @@ -69,11 +80,7 @@ def _parse_slurm_args( ) -> Dict[str, Any]: base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, nodes) - image_path = os.path.join( - self.install_path, - NcclTestSlurmInstallStrategy.SUBDIR_PATH, - NcclTestSlurmInstallStrategy.DOCKER_IMAGE_FILENAME, - ) + image_path = self.get_docker_image_path(cmd_args) container_mounts = "" if "NCCL_TOPO_FILE" in env_vars and "DOCKER_NCCL_TOPO_FILE" in env_vars: diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py index c7da8865..984e5bdf 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py @@ -85,7 +85,9 @@ def gen_exec_command( nodes = self.slurm_system.parse_nodes(nodes) if nodes: self.final_cmd_args["training.trainer.num_nodes"] = str(len(nodes)) - self.final_cmd_args["container"] = self.final_cmd_args["docker_image_url"] + + self.set_container_arg() + del self.final_cmd_args["repository_url"] del self.final_cmd_args["repository_commit_hash"] del self.final_cmd_args["docker_image_url"] @@ -102,6 +104,16 @@ def gen_exec_command( return full_cmd.strip() + def set_container_arg(self) -> None: + if os.path.isfile(self.final_cmd_args["docker_image_url"]): + self.final_cmd_args["container"] = self.final_cmd_args["docker_image_url"] + else: + self.final_cmd_args["container"] = os.path.join( + self.install_path, + NeMoLauncherSlurmInstallStrategy.SUBDIR_PATH, + NeMoLauncherSlurmInstallStrategy.DOCKER_IMAGE_FILENAME, + ) + def _handle_special_keys(self, key: str, value: Any, launcher_path: str, output_path: str) -> Any: """ Handles special formatting for specific keys. diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py index a23f04d9..28eff405 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py @@ -108,7 +108,11 @@ def is_installed(self) -> bool: docker_image_path = os.path.join(subdir_path, self.DOCKER_IMAGE_FILENAME) repo_path = os.path.join(subdir_path, self.REPOSITORY_NAME) repo_installed = os.path.isdir(repo_path) - docker_image_installed = os.path.isfile(docker_image_path) + + if not os.path.isfile(self.docker_image_url): + docker_image_installed = os.path.isfile(docker_image_path) + else: + docker_image_installed = True data_dir_path = self.default_cmd_args["data_dir"] datasets_ready = self._check_datasets_on_nodes(data_dir_path) @@ -142,7 +146,8 @@ def install(self) -> None: ) self._clone_repository(subdir_path) - self._setup_docker_image(self.slurm_system, subdir_path) + if not os.path.isfile(self.docker_image_url): + self._setup_docker_image(self.slurm_system, subdir_path) def _check_install_path_access(self): """ diff --git a/tests/test_slurm_command_gen_strategy.py b/tests/test_slurm_command_gen_strategy.py index ac6448ec..ead951c6 100644 --- a/tests/test_slurm_command_gen_strategy.py +++ b/tests/test_slurm_command_gen_strategy.py @@ -4,17 +4,26 @@ from cloudai.schema.system import SlurmSystem from cloudai.schema.system.slurm import SlurmNode, SlurmNodeState from cloudai.schema.system.slurm.strategy import SlurmCommandGenStrategy +from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy +from cloudai.schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy @pytest.fixture -def strategy_fixture() -> SlurmCommandGenStrategy: +def slurm_system(tmp_path: Path) -> SlurmSystem: slurm_system = SlurmSystem( name="TestSystem", - install_path="/path/to/install", - output_path="/path/to/output", + install_path=str(tmp_path / "install"), + output_path=str(tmp_path / "output"), default_partition="main", partitions={"main": [SlurmNode(name="node1", partition="main", state=SlurmNodeState.IDLE)]}, ) + Path(slurm_system.install_path).mkdir() + Path(slurm_system.output_path).mkdir() + return slurm_system + + +@pytest.fixture +def strategy_fixture(slurm_system: SlurmSystem) -> SlurmCommandGenStrategy: env_vars = {"TEST_VAR": "VALUE"} cmd_args = {"test_arg": "test_value"} strategy = SlurmCommandGenStrategy(slurm_system, env_vars, cmd_args) @@ -42,3 +51,46 @@ def test_filename_generation(strategy_fixture: SlurmCommandGenStrategy, tmp_path # Check the correctness of the sbatch command format assert sbatch_command == f"sbatch {filepath_from_command}" + + +class TestNcclTestSlurmCommandGenStrategy__GetDockerImagePath: + @pytest.fixture + def nccl_slurm_cmd_gen_strategy_fixture(self, slurm_system: SlurmSystem) -> NcclTestSlurmCommandGenStrategy: + env_vars = {"TEST_VAR": "VALUE"} + cmd_args = {"test_arg": "test_value"} + strategy = NcclTestSlurmCommandGenStrategy(slurm_system, env_vars, cmd_args) + return strategy + + def test_cmd_arg_file_doesnt_exist(self, nccl_slurm_cmd_gen_strategy_fixture: NcclTestSlurmCommandGenStrategy): + cmd_args = {"docker_image_url": f"{nccl_slurm_cmd_gen_strategy_fixture.install_path}/docker_image"} + image_path = nccl_slurm_cmd_gen_strategy_fixture.get_docker_image_path(cmd_args) + assert image_path == f"{nccl_slurm_cmd_gen_strategy_fixture.install_path}/nccl-test/nccl_test.sqsh" + + def test_cmd_arg_file_exists(self, nccl_slurm_cmd_gen_strategy_fixture: NcclTestSlurmCommandGenStrategy): + cmd_args = {"docker_image_url": f"{nccl_slurm_cmd_gen_strategy_fixture.install_path}/docker_image"} + Path(cmd_args["docker_image_url"]).touch() + image_path = nccl_slurm_cmd_gen_strategy_fixture.get_docker_image_path(cmd_args) + assert image_path == cmd_args["docker_image_url"] + + +class TestNeMoLauncherSlurmCommandGenStrategy__SetContainerArg: + @pytest.fixture + def nemo_cmd_gen(self, slurm_system: SlurmSystem) -> NeMoLauncherSlurmCommandGenStrategy: + env_vars = {"TEST_VAR": "VALUE"} + cmd_args = {"test_arg": "test_value"} + strategy = NeMoLauncherSlurmCommandGenStrategy(slurm_system, env_vars, cmd_args) + return strategy + + def test_docker_image_url_is_not_file(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy): + nemo_cmd_gen.final_cmd_args["docker_image_url"] = f"{nemo_cmd_gen.install_path}/docker_image" + nemo_cmd_gen.set_container_arg() + assert ( + nemo_cmd_gen.final_cmd_args["container"] + == f"{nemo_cmd_gen.install_path}/NeMo-Megatron-Launcher/nemo_megatron_launcher.sqsh" + ) + + def test_docker_image_url_is_file(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy): + nemo_cmd_gen.final_cmd_args["docker_image_url"] = f"{nemo_cmd_gen.install_path}/docker_image" + Path(nemo_cmd_gen.final_cmd_args["docker_image_url"]).touch() + nemo_cmd_gen.set_container_arg() + assert nemo_cmd_gen.final_cmd_args["container"] == nemo_cmd_gen.final_cmd_args["docker_image_url"]