Skip to content

Commit

Permalink
Add Kubernetes support for Sleep
Browse files Browse the repository at this point in the history
Co-authored-by: Peng Wang <pengwang@nvidia.com>
  • Loading branch information
TaekyungHeo and wpeng102 committed Aug 29, 2024
1 parent 75084ae commit 691c6e9
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 0 deletions.
6 changes: 6 additions & 0 deletions src/cloudai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@
)
from .schema.test_template.nemo_launcher.template import NeMoLauncher
from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy
from .schema.test_template.sleep.kubernetes_install_strategy import SleepKubernetesInstallStrategy
from .schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy
from .schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
from .schema.test_template.sleep.standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy
Expand All @@ -86,6 +88,7 @@
from .schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
from .schema.test_template.ucc_test.slurm_install_strategy import UCCTestSlurmInstallStrategy
from .schema.test_template.ucc_test.template import UCCTest
from .systems.kubernetes.kubernetes_system import KubernetesSystem
from .systems.slurm.slurm_system import SlurmSystem
from .systems.standalone_system import StandaloneSystem

Expand All @@ -102,10 +105,12 @@
# Strategy registrations. Each call passes, in order: a strategy interface, a list
# of system classes, a list of test-template classes, and the concrete strategy
# class to associate with those combinations.
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NcclTest], NcclTestReportGenerationStrategy)
Registry().add_strategy(CommandGenStrategy, [StandaloneSystem], [Sleep], SleepStandaloneCommandGenStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [Sleep], SleepSlurmCommandGenStrategy)
# Sleep on Kubernetes registers a JsonGenStrategy, whereas the standalone and Slurm
# Sleep entries above register a CommandGenStrategy.
Registry().add_strategy(JsonGenStrategy, [KubernetesSystem], [Sleep], SleepKubernetesJsonGenStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmInstallStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NcclTest], NcclTestGradingStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmInstallStrategy)
Registry().add_strategy(InstallStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepStandaloneInstallStrategy)
# Kubernetes gets its own Sleep install strategy instead of sharing the standalone one.
Registry().add_strategy(InstallStrategy, [KubernetesSystem], [Sleep], SleepKubernetesInstallStrategy)
Registry().add_strategy(
ReportGenerationStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepReportGenerationStrategy
)
Expand All @@ -128,6 +133,7 @@
)
# Job-id and job-status retrieval registrations, using the same argument order:
# strategy interface, system classes, test-template classes, concrete strategy.
Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [StandaloneSystem], [Sleep], DefaultJobStatusRetrievalStrategy)
# Sleep on Kubernetes uses the same DefaultJobStatusRetrievalStrategy as Sleep on the standalone system.
Registry().add_strategy(JobStatusRetrievalStrategy, [KubernetesSystem], [Sleep], DefaultJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [NcclTest], NcclTestJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxJobStatusRetrievalStrategy)
Registry().add_strategy(
Expand Down
4 changes: 4 additions & 0 deletions src/cloudai/schema/test_template/sleep/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
# limitations under the License.

from .grading_strategy import SleepGradingStrategy
from .kubernetes_install_strategy import SleepKubernetesInstallStrategy
from .kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
from .report_generation_strategy import SleepReportGenerationStrategy
from .slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
from .standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy
Expand All @@ -25,6 +27,8 @@
"Sleep",
"SleepStandaloneInstallStrategy",
"SleepStandaloneCommandGenStrategy",
"SleepKubernetesJsonGenStrategy",
"SleepKubernetesInstallStrategy",
"SleepSlurmCommandGenStrategy",
"SleepReportGenerationStrategy",
"SleepGradingStrategy",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cloudai import InstallStatusResult, InstallStrategy


class SleepKubernetesInstallStrategy(InstallStrategy):
    """
    Install strategy for running the Sleep test on Kubernetes systems.

    All three operations perform no work and unconditionally report success.
    """

    def is_installed(self) -> InstallStatusResult:
        """
        Report the installation state of the Sleep test.

        Returns:
            InstallStatusResult: Always a successful result.
        """
        already_installed = InstallStatusResult(success=True)
        return already_installed

    def install(self) -> InstallStatusResult:
        """
        Install the Sleep test (no action is taken).

        Returns:
            InstallStatusResult: Always a successful result.
        """
        install_outcome = InstallStatusResult(success=True)
        return install_outcome

    def uninstall(self) -> InstallStatusResult:
        """
        Uninstall the Sleep test (no action is taken).

        Returns:
            InstallStatusResult: Always a successful result.
        """
        uninstall_outcome = InstallStatusResult(success=True)
        return uninstall_outcome
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Any, Dict, List, cast

from cloudai import JsonGenStrategy
from cloudai.systems import KubernetesSystem


class SleepKubernetesJsonGenStrategy(JsonGenStrategy):
    """JSON generation strategy for Sleep on Kubernetes systems."""

    def gen_json(
        self,
        env_vars: Dict[str, str],
        cmd_args: Dict[str, str],
        extra_env_vars: Dict[str, str],
        extra_cmd_args: str,
        output_path: Path,
        job_name: str,
        num_nodes: int,
        nodes: List[str],
    ) -> Dict[Any, Any]:
        """
        Build a Kubernetes ``batch/v1`` Job spec that runs ``sleep <seconds>``.

        The effective command arguments are computed by passing the strategy's
        default arguments and ``cmd_args`` to ``_override_cmd_args``; the result
        is stored on ``self.final_cmd_args``.

        Args:
            env_vars (Dict[str, str]): Not used by this strategy.
            cmd_args (Dict[str, str]): Command-line arguments combined with the
                defaults; the combined result must contain ``"seconds"``.
            extra_env_vars (Dict[str, str]): Not used by this strategy.
            extra_cmd_args (str): Not used by this strategy.
            output_path (Path): Not used by this strategy.
            job_name (str): Name placed in the Job's metadata.
            num_nodes (int): Not used by this strategy.
            nodes (List[str]): Not used by this strategy.

        Returns:
            Dict[Any, Any]: The Job specification as a plain dictionary.

        Raises:
            KeyError: If the combined command arguments have no ``"seconds"`` entry.
        """
        self.final_cmd_args = self._override_cmd_args(self.default_cmd_args, cmd_args)
        sec = self.final_cmd_args["seconds"]

        kubernetes_system = cast(KubernetesSystem, self.system)

        container = {
            # An f-string (rather than ``"sleep " + sec``) keeps this working when
            # the configured value is not already a str, e.g. an int.
            "args": [f"sleep {sec}"],
            # The single args entry is the script string handed to ``bash -c``.
            "command": ["/bin/bash", "-c"],
            "image": kubernetes_system.default_image,
            "name": "task",
        }

        job_spec = {
            "apiVersion": "batch/v1",
            "kind": "Job",
            "metadata": {"name": job_name, "namespace": kubernetes_system.default_namespace},
            "spec": {
                # NOTE(review): per the Kubernetes docs, a TTL of 0 makes the Job
                # eligible for deletion immediately after it finishes. Confirm
                # that job-status retrieval does not need the finished Job object.
                "ttlSecondsAfterFinished": 0,
                "template": {
                    "spec": {
                        "containers": [container],
                        "restartPolicy": "Never",
                    }
                },
            },
        }

        return job_spec
4 changes: 4 additions & 0 deletions tests/test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
GradingStrategy,
InstallStrategy,
JobIdRetrievalStrategy,
JsonGenStrategy,
Registry,
ReportGenerationStrategy,
)
Expand Down Expand Up @@ -52,6 +53,7 @@
)
from cloudai.schema.test_template.nemo_launcher.template import NeMoLauncher
from cloudai.schema.test_template.sleep.grading_strategy import SleepGradingStrategy
from cloudai.schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
from cloudai.schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy
from cloudai.schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
from cloudai.schema.test_template.sleep.standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy
Expand All @@ -62,6 +64,7 @@
from cloudai.schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
from cloudai.schema.test_template.ucc_test.slurm_install_strategy import UCCTestSlurmInstallStrategy
from cloudai.schema.test_template.ucc_test.template import UCCTest
from cloudai.systems.kubernetes.kubernetes_system import KubernetesSystem
from cloudai.systems.slurm.slurm_system import SlurmSystem
from cloudai.systems.standalone_system import StandaloneSystem

Expand Down Expand Up @@ -111,6 +114,7 @@ def test_runners():
((JobIdRetrievalStrategy, SlurmSystem, NeMoLauncher), NeMoLauncherSlurmJobIdRetrievalStrategy),
((JobIdRetrievalStrategy, SlurmSystem, UCCTest), SlurmJobIdRetrievalStrategy),
((JobIdRetrievalStrategy, StandaloneSystem, Sleep), StandaloneJobIdRetrievalStrategy),
((JsonGenStrategy, KubernetesSystem, Sleep), SleepKubernetesJsonGenStrategy),
((ReportGenerationStrategy, SlurmSystem, ChakraReplay), ChakraReplayReportGenerationStrategy),
((ReportGenerationStrategy, SlurmSystem, JaxToolbox), JaxToolboxReportGenerationStrategy),
((ReportGenerationStrategy, SlurmSystem, NcclTest), NcclTestReportGenerationStrategy),
Expand Down

0 comments on commit 691c6e9

Please sign in to comment.