From c4ec66506ee4163665aae5b0550aa5f2e1e5c8d2 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:32:35 -0400 Subject: [PATCH] Add Kubernetes support for Sleep Co-authored-by: Peng Wang --- src/cloudai/__init__.py | 6 ++ .../schema/test_template/sleep/__init__.py | 4 ++ .../sleep/kubernetes_install_strategy.py | 30 +++++++++ .../sleep/kubernetes_json_gen_strategy.py | 65 +++++++++++++++++++ tests/test_init.py | 4 ++ 5 files changed, 109 insertions(+) create mode 100644 src/cloudai/schema/test_template/sleep/kubernetes_install_strategy.py create mode 100644 src/cloudai/schema/test_template/sleep/kubernetes_json_gen_strategy.py diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index f702fb47..d6e0754a 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -76,6 +76,8 @@ ) from .schema.test_template.nemo_launcher.template import NeMoLauncher from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy +from .schema.test_template.sleep.kubernetes_install_strategy import SleepKubernetesInstallStrategy +from .schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy from .schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy from .schema.test_template.sleep.standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy @@ -86,6 +88,7 @@ from .schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy from .schema.test_template.ucc_test.slurm_install_strategy import UCCTestSlurmInstallStrategy from .schema.test_template.ucc_test.template import UCCTest +from .systems.kubernetes.kubernetes_system import KubernetesSystem from .systems.slurm.slurm_system import SlurmSystem from .systems.standalone_system import StandaloneSystem @@ -102,10 +105,12 @@ 
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NcclTest], NcclTestReportGenerationStrategy) Registry().add_strategy(CommandGenStrategy, [StandaloneSystem], [Sleep], SleepStandaloneCommandGenStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [Sleep], SleepSlurmCommandGenStrategy) +Registry().add_strategy(JsonGenStrategy, [KubernetesSystem], [Sleep], SleepKubernetesJsonGenStrategy) Registry().add_strategy(InstallStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmInstallStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [NcclTest], NcclTestGradingStrategy) Registry().add_strategy(InstallStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmInstallStrategy) Registry().add_strategy(InstallStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepStandaloneInstallStrategy) +Registry().add_strategy(InstallStrategy, [KubernetesSystem], [Sleep], SleepKubernetesInstallStrategy) Registry().add_strategy( ReportGenerationStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepReportGenerationStrategy ) @@ -128,6 +133,7 @@ ) Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy) Registry().add_strategy(JobStatusRetrievalStrategy, [StandaloneSystem], [Sleep], DefaultJobStatusRetrievalStrategy) +Registry().add_strategy(JobStatusRetrievalStrategy, [KubernetesSystem], [Sleep], DefaultJobStatusRetrievalStrategy) Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [NcclTest], NcclTestJobStatusRetrievalStrategy) Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxJobStatusRetrievalStrategy) Registry().add_strategy( diff --git a/src/cloudai/schema/test_template/sleep/__init__.py b/src/cloudai/schema/test_template/sleep/__init__.py index 28070e74..c55f27a4 100644 --- a/src/cloudai/schema/test_template/sleep/__init__.py +++ b/src/cloudai/schema/test_template/sleep/__init__.py @@ -15,6 +15,8 @@ # limitations under the License. 
from .grading_strategy import SleepGradingStrategy +from .kubernetes_install_strategy import SleepKubernetesInstallStrategy +from .kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy from .report_generation_strategy import SleepReportGenerationStrategy from .slurm_command_gen_strategy import SleepSlurmCommandGenStrategy from .standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy @@ -25,6 +27,8 @@ "Sleep", "SleepStandaloneInstallStrategy", "SleepStandaloneCommandGenStrategy", + "SleepKubernetesJsonGenStrategy", + "SleepKubernetesInstallStrategy", "SleepSlurmCommandGenStrategy", "SleepReportGenerationStrategy", "SleepGradingStrategy", diff --git a/src/cloudai/schema/test_template/sleep/kubernetes_install_strategy.py b/src/cloudai/schema/test_template/sleep/kubernetes_install_strategy.py new file mode 100644 index 00000000..6f21adb0 --- /dev/null +++ b/src/cloudai/schema/test_template/sleep/kubernetes_install_strategy.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from cloudai import InstallStatusResult, InstallStrategy


class SleepKubernetesInstallStrategy(InstallStrategy):
    """
    Install strategy registered for the Sleep test on Kubernetes systems.

    All three lifecycle operations are no-ops: none of them inspects or
    changes anything, and each one unconditionally reports success.
    """

    @staticmethod
    def _succeeded() -> InstallStatusResult:
        """Build a new result object flagged as successful."""
        return InstallStatusResult(success=True)

    def is_installed(self) -> InstallStatusResult:
        """Report success without performing any check."""
        return self._succeeded()

    def install(self) -> InstallStatusResult:
        """Report success without installing anything."""
        return self._succeeded()

    def uninstall(self) -> InstallStatusResult:
        """Report success without removing anything."""
        return self._succeeded()
from pathlib import Path
from typing import Any, Dict, List, cast

from cloudai import JsonGenStrategy
from cloudai.systems import KubernetesSystem


class SleepKubernetesJsonGenStrategy(JsonGenStrategy):
    """JSON generation strategy for Sleep on Kubernetes systems."""

    def gen_json(
        self,
        env_vars: Dict[str, str],
        cmd_args: Dict[str, str],
        extra_env_vars: Dict[str, str],
        extra_cmd_args: str,
        output_path: Path,
        job_name: str,
        num_nodes: int,
        nodes: List[str],
    ) -> Dict[Any, Any]:
        """
        Build a ``batch/v1`` Job manifest whose single container runs ``sleep <seconds>``.

        Args:
            env_vars: Not used by this strategy.
            cmd_args: Command-line arguments passed, together with ``self.default_cmd_args``,
                to ``self._override_cmd_args``; the result must provide a ``"seconds"`` entry.
            extra_env_vars: Not used by this strategy.
            extra_cmd_args: Not used by this strategy.
            output_path: Not used by this strategy.
            job_name: Value written to ``metadata.name`` of the Job.
            num_nodes: Not used by this strategy.
            nodes: Not used by this strategy.

        Returns:
            The Job specification as a nested dictionary. The namespace and the container
            image are taken from ``default_namespace`` and ``default_image`` of ``self.system``.

        Raises:
            KeyError: If the resolved command arguments contain no ``"seconds"`` key
                (assuming ``_override_cmd_args`` returns a plain mapping).
        """
        # The resolved arguments are stored on the instance, exactly as before.
        self.final_cmd_args = self._override_cmd_args(self.default_cmd_args, cmd_args)
        seconds = self.final_cmd_args["seconds"]

        kubernetes_system = cast(KubernetesSystem, self.system)

        container = {
            # Formatted rather than concatenated so that a non-str value
            # (e.g. an int) does not raise TypeError; str input yields the
            # same text as before.
            "args": [f"sleep {seconds}"],
            "command": ["/bin/bash", "-c"],
            "image": kubernetes_system.default_image,
            "name": "task",
        }

        return {
            "apiVersion": "batch/v1",
            "kind": "Job",
            "metadata": {"name": job_name, "namespace": kubernetes_system.default_namespace},
            "spec": {
                # Per the Kubernetes TTL-after-finished mechanism, 0 makes the
                # Job eligible for automatic deletion as soon as it finishes.
                "ttlSecondsAfterFinished": 0,
                "template": {
                    "spec": {
                        "containers": [container],
                        "restartPolicy": "Never",
                    }
                },
            },
        }
SleepStandaloneCommandGenStrategy @@ -62,6 +64,7 @@ from cloudai.schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy from cloudai.schema.test_template.ucc_test.slurm_install_strategy import UCCTestSlurmInstallStrategy from cloudai.schema.test_template.ucc_test.template import UCCTest +from cloudai.systems.kubernetes.kubernetes_system import KubernetesSystem from cloudai.systems.slurm.slurm_system import SlurmSystem from cloudai.systems.standalone_system import StandaloneSystem @@ -111,6 +114,7 @@ def test_runners(): ((JobIdRetrievalStrategy, SlurmSystem, NeMoLauncher), NeMoLauncherSlurmJobIdRetrievalStrategy), ((JobIdRetrievalStrategy, SlurmSystem, UCCTest), SlurmJobIdRetrievalStrategy), ((JobIdRetrievalStrategy, StandaloneSystem, Sleep), StandaloneJobIdRetrievalStrategy), + ((JsonGenStrategy, KubernetesSystem, Sleep), SleepKubernetesJsonGenStrategy), ((ReportGenerationStrategy, SlurmSystem, ChakraReplay), ChakraReplayReportGenerationStrategy), ((ReportGenerationStrategy, SlurmSystem, JaxToolbox), JaxToolboxReportGenerationStrategy), ((ReportGenerationStrategy, SlurmSystem, NcclTest), NcclTestReportGenerationStrategy),