From 48f3984cec0e04e3276aed77a9fda49119516af5 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Fri, 17 May 2024 12:15:24 +0200 Subject: [PATCH 1/3] Add get_docker_image_path() for NcclTestSlurmCommandGenStrategy --- .../nccl_test/slurm_command_gen_strategy.py | 17 +++++++--- tests/test_slurm_command_gen_strategy.py | 31 +++++++++++++++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py index 0198332f..47d86e01 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py @@ -60,6 +60,17 @@ def gen_exec_command( srun_command = self._generate_srun_command(slurm_args, final_env_vars, final_cmd_args, extra_cmd_args) return self._write_sbatch_script(slurm_args, env_vars_str, srun_command, output_path) + def get_docker_image_path(self, cmd_args: Dict[str, str]) -> str: + if os.path.isfile(cmd_args["docker_image_url"]): + image_path = cmd_args["docker_image_url"] + else: + image_path = os.path.join( + self.install_path, + NcclTestSlurmInstallStrategy.SUBDIR_PATH, + NcclTestSlurmInstallStrategy.DOCKER_IMAGE_FILENAME, + ) + return image_path + def _parse_slurm_args( self, job_name_prefix: str, @@ -69,11 +80,7 @@ def _parse_slurm_args( ) -> Dict[str, Any]: base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, nodes) - image_path = os.path.join( - self.install_path, - NcclTestSlurmInstallStrategy.SUBDIR_PATH, - NcclTestSlurmInstallStrategy.DOCKER_IMAGE_FILENAME, - ) + image_path = self.get_docker_image_path(cmd_args) container_mounts = "" if "NCCL_TOPO_FILE" in env_vars and "DOCKER_NCCL_TOPO_FILE" in env_vars: diff --git a/tests/test_slurm_command_gen_strategy.py b/tests/test_slurm_command_gen_strategy.py index ac6448ec..c1949268 100644 --- a/tests/test_slurm_command_gen_strategy.py +++ b/tests/test_slurm_command_gen_strategy.py @@ -4,6 +4,7 @@ from cloudai.schema.system import SlurmSystem from cloudai.schema.system.slurm import SlurmNode, SlurmNodeState from cloudai.schema.system.slurm.strategy import SlurmCommandGenStrategy +from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy @pytest.fixture @@ -42,3 +43,33 @@ def test_filename_generation(strategy_fixture: SlurmCommandGenStrategy, tmp_path # Check the correctness of the sbatch command format assert sbatch_command == f"sbatch {filepath_from_command}" + + +class TestNcclTestSlurmCommandGenStrategy__GetDockerImagePath: + @pytest.fixture + def nccl_slurm_cmd_gen_strategy_fixture(self, tmp_path: Path) -> NcclTestSlurmCommandGenStrategy: + slurm_system = SlurmSystem( + name="TestSystem", + install_path=str(tmp_path / "install"), + output_path=str(tmp_path / "output"), + default_partition="main", + partitions={"main": [SlurmNode(name="node1", partition="main", state=SlurmNodeState.IDLE)]}, + ) + Path(slurm_system.install_path).mkdir() + Path(slurm_system.output_path).mkdir() + + env_vars = {"TEST_VAR": "VALUE"} + cmd_args = {"test_arg": "test_value"} + strategy = NcclTestSlurmCommandGenStrategy(slurm_system, env_vars, cmd_args) + return strategy + + def test_cmd_arg_file_doesnt_exist(self, nccl_slurm_cmd_gen_strategy_fixture: NcclTestSlurmCommandGenStrategy): + cmd_args = {"docker_image_url": f"{nccl_slurm_cmd_gen_strategy_fixture.install_path}/docker_image"} + image_path = nccl_slurm_cmd_gen_strategy_fixture.get_docker_image_path(cmd_args) + assert image_path == f"{nccl_slurm_cmd_gen_strategy_fixture.install_path}/nccl-test/nccl_test.sqsh" + + def test_cmd_arg_file_exists(self, nccl_slurm_cmd_gen_strategy_fixture: NcclTestSlurmCommandGenStrategy): + cmd_args = {"docker_image_url": f"{nccl_slurm_cmd_gen_strategy_fixture.install_path}/docker_image"} + Path(cmd_args["docker_image_url"]).touch() + image_path = nccl_slurm_cmd_gen_strategy_fixture.get_docker_image_path(cmd_args) + assert image_path == cmd_args["docker_image_url"] From f0356b25e9c632ffb91885361666b4d3ed9206b6 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Fri, 17 May 2024 12:26:41 +0200 Subject: [PATCH 2/3] Add set_container_arg() for NeMoLauncherSlurmCommandGenStrategy --- .../slurm_command_gen_strategy.py | 14 +++++- tests/test_slurm_command_gen_strategy.py | 49 +++++++++++++------ 2 files changed, 48 insertions(+), 15 deletions(-) diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py index c7da8865..984e5bdf 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py @@ -85,7 +85,9 @@ def gen_exec_command( nodes = self.slurm_system.parse_nodes(nodes) if nodes: self.final_cmd_args["training.trainer.num_nodes"] = str(len(nodes)) - self.final_cmd_args["container"] = self.final_cmd_args["docker_image_url"] + + self.set_container_arg() + del self.final_cmd_args["repository_url"] del self.final_cmd_args["repository_commit_hash"] del self.final_cmd_args["docker_image_url"] @@ -102,6 +104,16 @@ def gen_exec_command( return full_cmd.strip() + def set_container_arg(self) -> None: + if os.path.isfile(self.final_cmd_args["docker_image_url"]): + self.final_cmd_args["container"] = self.final_cmd_args["docker_image_url"] + else: + self.final_cmd_args["container"] = os.path.join( + self.install_path, + NeMoLauncherSlurmInstallStrategy.SUBDIR_PATH, + NeMoLauncherSlurmInstallStrategy.DOCKER_IMAGE_FILENAME, + ) + def _handle_special_keys(self, key: str, value: Any, launcher_path: str, output_path: str) -> Any: """ Handles special formatting for specific keys. diff --git a/tests/test_slurm_command_gen_strategy.py b/tests/test_slurm_command_gen_strategy.py index c1949268..ead951c6 100644 --- a/tests/test_slurm_command_gen_strategy.py +++ b/tests/test_slurm_command_gen_strategy.py @@ -5,17 +5,25 @@ from cloudai.schema.system.slurm import SlurmNode, SlurmNodeState from cloudai.schema.system.slurm.strategy import SlurmCommandGenStrategy from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy +from cloudai.schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy @pytest.fixture -def strategy_fixture() -> SlurmCommandGenStrategy: +def slurm_system(tmp_path: Path) -> SlurmSystem: slurm_system = SlurmSystem( name="TestSystem", - install_path="/path/to/install", - output_path="/path/to/output", + install_path=str(tmp_path / "install"), + output_path=str(tmp_path / "output"), default_partition="main", partitions={"main": [SlurmNode(name="node1", partition="main", state=SlurmNodeState.IDLE)]}, ) + Path(slurm_system.install_path).mkdir() + Path(slurm_system.output_path).mkdir() + return slurm_system + + +@pytest.fixture +def strategy_fixture(slurm_system: SlurmSystem) -> SlurmCommandGenStrategy: env_vars = {"TEST_VAR": "VALUE"} cmd_args = {"test_arg": "test_value"} strategy = SlurmCommandGenStrategy(slurm_system, env_vars, cmd_args) @@ -47,17 +55,7 @@ def test_filename_generation(strategy_fixture: SlurmCommandGenStrategy, tmp_path class TestNcclTestSlurmCommandGenStrategy__GetDockerImagePath: @pytest.fixture - def nccl_slurm_cmd_gen_strategy_fixture(self, tmp_path: Path) -> NcclTestSlurmCommandGenStrategy: - slurm_system = SlurmSystem( - name="TestSystem", - install_path=str(tmp_path / "install"), - output_path=str(tmp_path / "output"), - default_partition="main", - partitions={"main": [SlurmNode(name="node1", partition="main", state=SlurmNodeState.IDLE)]}, - ) - Path(slurm_system.install_path).mkdir() - Path(slurm_system.output_path).mkdir() - + def nccl_slurm_cmd_gen_strategy_fixture(self, slurm_system: SlurmSystem) -> NcclTestSlurmCommandGenStrategy: env_vars = {"TEST_VAR": "VALUE"} cmd_args = {"test_arg": "test_value"} strategy = NcclTestSlurmCommandGenStrategy(slurm_system, env_vars, cmd_args) @@ -73,3 +71,26 @@ def test_cmd_arg_file_exists(self, nccl_slurm_cmd_gen_strategy_fixture: NcclTest Path(cmd_args["docker_image_url"]).touch() image_path = nccl_slurm_cmd_gen_strategy_fixture.get_docker_image_path(cmd_args) assert image_path == cmd_args["docker_image_url"] + + +class TestNeMoLauncherSlurmCommandGenStrategy__SetContainerArg: + @pytest.fixture + def nemo_cmd_gen(self, slurm_system: SlurmSystem) -> NeMoLauncherSlurmCommandGenStrategy: + env_vars = {"TEST_VAR": "VALUE"} + cmd_args = {"test_arg": "test_value"} + strategy = NeMoLauncherSlurmCommandGenStrategy(slurm_system, env_vars, cmd_args) + return strategy + + def test_docker_image_url_is_not_file(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy): + nemo_cmd_gen.final_cmd_args["docker_image_url"] = f"{nemo_cmd_gen.install_path}/docker_image" + nemo_cmd_gen.set_container_arg() + assert ( + nemo_cmd_gen.final_cmd_args["container"] + == f"{nemo_cmd_gen.install_path}/NeMo-Megatron-Launcher/nemo_megatron_launcher.sqsh" + ) + + def test_docker_image_url_is_file(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy): + nemo_cmd_gen.final_cmd_args["docker_image_url"] = f"{nemo_cmd_gen.install_path}/docker_image" + Path(nemo_cmd_gen.final_cmd_args["docker_image_url"]).touch() + nemo_cmd_gen.set_container_arg() + assert nemo_cmd_gen.final_cmd_args["container"] == nemo_cmd_gen.final_cmd_args["docker_image_url"] From 70f2f99ebfdef4f4c3514b6401e6f5572095a68f Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Fri, 17 May 2024 12:28:59 +0200 Subject: [PATCH 3/3] Extend NeMoLauncherSlurmInstallStrategy docker image support --- .../nemo_launcher/slurm_install_strategy.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py index a23f04d9..28eff405 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py @@ -108,7 +108,11 @@ def is_installed(self) -> bool: docker_image_path = os.path.join(subdir_path, self.DOCKER_IMAGE_FILENAME) repo_path = os.path.join(subdir_path, self.REPOSITORY_NAME) repo_installed = os.path.isdir(repo_path) - docker_image_installed = os.path.isfile(docker_image_path) + + if not os.path.isfile(self.docker_image_url): + docker_image_installed = os.path.isfile(docker_image_path) + else: + docker_image_installed = True data_dir_path = self.default_cmd_args["data_dir"] datasets_ready = self._check_datasets_on_nodes(data_dir_path) @@ -142,7 +146,8 @@ def install(self) -> None: ) self._clone_repository(subdir_path) - self._setup_docker_image(self.slurm_system, subdir_path) + if not os.path.isfile(self.docker_image_url): + self._setup_docker_image(self.slurm_system, subdir_path) def _check_install_path_access(self): """