Skip to content

Commit

Permalink
Merge pull request #180 from TaekyungHeo/k8s-base
Browse files Browse the repository at this point in the history
Add Kubernetes Support
  • Loading branch information
srinivas212 committed Sep 3, 2024
2 parents c8b032b + a4d0fef commit 1e6d618
Show file tree
Hide file tree
Showing 19 changed files with 1,299 additions and 5 deletions.
33 changes: 33 additions & 0 deletions conf/common/system/kubernetes_cluster.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# System definition for a Kubernetes cluster.
# `scheduler` selects the scheduler type by name.
name = "kubernetes-cluster"
scheduler = "kubernetes"
# NOTE(review): presumably the path to the kubeconfig file for the target cluster.
# It is empty here; how an empty value is resolved is not visible in this file —
# confirm against the Kubernetes system parser.
kube_config_path = ""

# Directories for installed artifacts and for test results (relative paths).
install_path = "./install"
output_path = "./results"
# NOTE(review): presumably the container image and Kubernetes namespace used
# when a test does not specify its own — confirm against the Kubernetes system.
default_image = "ubuntu:22.04"
default_namespace = "default"

# NCCL / UCX tuning variables. The values below are specific to one fabric
# layout (InfiniBand interface `ib0`, adapter `mlx5_0`); adjust per cluster.
[global_env_vars]
# Global ID index used by NCCL in RoCE mode.
NCCL_IB_GID_INDEX = "3"
# IP interface NCCL uses for socket communication.
NCCL_SOCKET_IFNAME = "ib0"
# Host Channel Adapter (RDMA interface) NCCL uses.
NCCL_IB_HCA = "mlx5_0"
# Network device and port UCX is restricted to, as `device:port`.
UCX_NET_DEVICES = "mlx5_0:1"
# PIX: use GPU peer-to-peer only when the GPUs are on the same PCI switch.
NCCL_P2P_LEVEL = "PIX"
# UCX transports: accelerated RC verbs, shared memory, and CUDA copy.
UCX_TLS = "rc_x,sm,cuda_copy"
# InfiniBand traffic class field used by NCCL.
NCCL_IB_TC = "96"
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ bokeh==3.4.1
pandas==2.2.1
tbparse==0.0.8
toml==0.10.2
kubernetes==30.1.0
14 changes: 14 additions & 0 deletions src/cloudai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ._core.job_id_retrieval_strategy import JobIdRetrievalStrategy
from ._core.job_status_result import JobStatusResult
from ._core.job_status_retrieval_strategy import JobStatusRetrievalStrategy
from ._core.json_gen_strategy import JsonGenStrategy
from ._core.parser import Parser
from ._core.registry import Registry
from ._core.report_generation_strategy import ReportGenerationStrategy
Expand All @@ -36,11 +37,14 @@
from ._core.test_template import TestTemplate
from ._core.test_template_strategy import TestTemplateStrategy
from .installer.installer import Installer
from .installer.kubernetes_installer import KubernetesInstaller
from .installer.slurm_installer import SlurmInstaller
from .installer.standalone_installer import StandaloneInstaller
from .parser.system_parser.kubernetes_system_parser import KubernetesSystemParser
from .parser.system_parser.slurm_system_parser import SlurmSystemParser
from .parser.system_parser.standalone_system_parser import StandaloneSystemParser
from .report_generator import ReportGenerator
from .runner.kubernetes.kubernetes_runner import KubernetesRunner
from .runner.slurm.slurm_runner import SlurmRunner
from .runner.standalone.standalone_runner import StandaloneRunner
from .schema.test_template.chakra_replay.grading_strategy import ChakraReplayGradingStrategy
Expand Down Expand Up @@ -72,6 +76,8 @@
)
from .schema.test_template.nemo_launcher.template import NeMoLauncher
from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy
from .schema.test_template.sleep.kubernetes_install_strategy import SleepKubernetesInstallStrategy
from .schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy
from .schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
from .schema.test_template.sleep.standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy
Expand All @@ -82,24 +88,29 @@
from .schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
from .schema.test_template.ucc_test.slurm_install_strategy import UCCTestSlurmInstallStrategy
from .schema.test_template.ucc_test.template import UCCTest
from .systems.kubernetes.kubernetes_system import KubernetesSystem
from .systems.slurm.slurm_system import SlurmSystem
from .systems.standalone_system import StandaloneSystem

# System parsers, one class per scheduler name.
Registry().add_system_parser("standalone", StandaloneSystemParser)
Registry().add_system_parser("slurm", SlurmSystemParser)
Registry().add_system_parser("kubernetes", KubernetesSystemParser)

# Runners, one class per scheduler name.
Registry().add_runner("slurm", SlurmRunner)
Registry().add_runner("kubernetes", KubernetesRunner)
Registry().add_runner("standalone", StandaloneRunner)

# Strategy registrations. Each call passes, in order: the strategy interface,
# a list of system classes, a list of test template classes, and the strategy
# class being registered.
# NOTE(review): argument roles are inferred from these call sites; confirm
# against Registry.add_strategy.
# For Kubernetes, only the Sleep test has install and JSON-generation
# strategies registered in this section.
Registry().add_strategy(InstallStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmInstallStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmInstallStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NcclTest], NcclTestReportGenerationStrategy)
Registry().add_strategy(CommandGenStrategy, [StandaloneSystem], [Sleep], SleepStandaloneCommandGenStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [Sleep], SleepSlurmCommandGenStrategy)
Registry().add_strategy(JsonGenStrategy, [KubernetesSystem], [Sleep], SleepKubernetesJsonGenStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmInstallStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NcclTest], NcclTestGradingStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmInstallStrategy)
Registry().add_strategy(InstallStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepStandaloneInstallStrategy)
Registry().add_strategy(InstallStrategy, [KubernetesSystem], [Sleep], SleepKubernetesInstallStrategy)
Registry().add_strategy(
ReportGenerationStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepReportGenerationStrategy
)
Expand All @@ -122,6 +133,7 @@
)
# Job ID and job status retrieval strategies.
# The Kubernetes Sleep test is registered with DefaultJobStatusRetrievalStrategy,
# the same class used for the standalone Sleep test.
Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [StandaloneSystem], [Sleep], DefaultJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [KubernetesSystem], [Sleep], DefaultJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [NcclTest], NcclTestJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxJobStatusRetrievalStrategy)
Registry().add_strategy(
Expand All @@ -145,13 +157,15 @@

# Installers, one class per scheduler name.
Registry().add_installer("slurm", SlurmInstaller)
Registry().add_installer("standalone", StandaloneInstaller)
Registry().add_installer("kubernetes", KubernetesInstaller)

__all__ = [
"BaseInstaller",
"BaseJob",
"BaseRunner",
"BaseSystemParser",
"CommandGenStrategy",
"JsonGenStrategy",
"Grader",
"GradingStrategy",
"Installer",
Expand Down
59 changes: 59 additions & 0 deletions src/cloudai/_core/json_gen_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from pathlib import Path
from typing import Any, Dict, List

from .test_template_strategy import TestTemplateStrategy


class JsonGenStrategy(TestTemplateStrategy):
    """
    Strategy interface for producing Kubernetes job specifications.

    Subclasses implement `gen_json` to turn a test's environment variables, command-line
    arguments, and node settings into a job specification dictionary.
    """

    @abstractmethod
    def gen_json(
        self,
        env_vars: Dict[str, str],
        cmd_args: Dict[str, str],
        extra_env_vars: Dict[str, str],
        extra_cmd_args: str,
        output_path: Path,
        job_name: str,
        num_nodes: int,
        nodes: List[str],
    ) -> Dict[Any, Any]:
        """
        Build the Kubernetes job specification for one job.

        Args:
            env_vars (Dict[str, str]): Environment variables for the job.
            cmd_args (Dict[str, str]): Command-line arguments for the job.
            extra_env_vars (Dict[str, str]): Additional environment variables.
            extra_cmd_args (str): Additional command-line arguments.
            output_path (Path): Path to the output directory.
            job_name (str): The name of the job.
            num_nodes (int): The number of nodes to be used for job execution.
            nodes (List[str]): Nodes for job execution; may be an empty list.

        Returns:
            Dict[Any, Any]: The job specification as a dictionary.
        """
        ...
41 changes: 39 additions & 2 deletions src/cloudai/_core/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

import sys
from pathlib import Path
from typing import Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union

from .job_status_result import JobStatusResult
from .test_template import TestTemplate
Expand Down Expand Up @@ -120,7 +120,7 @@ def gen_exec_command(
Generate the command to run this specific test.
Args:
output_path (str): Path to the output directory.
output_path (Path): Path to the output directory where logs and results will be stored.
time_limit (Optional[str]): Time limit for the test execution.
num_nodes (Optional[int]): Number of nodes to be used for the test execution.
nodes (Optional[List[str]]): List of nodes involved in the test.
Expand All @@ -143,6 +143,43 @@ def gen_exec_command(
nodes,
)

def gen_json(
    self,
    output_path: Path,
    job_name: str,
    time_limit: Optional[str] = None,
    num_nodes: int = 1,
    nodes: Optional[List[str]] = None,
) -> Dict[Any, Any]:
    """
    Build the Kubernetes job specification dictionary for this test.

    The work is delegated to the test template's `gen_json`, which receives this test's
    environment variables and command-line arguments along with the job settings.

    Args:
        output_path (Path): Path to the output directory where logs and results will be stored.
        job_name (str): The name assigned to the Kubernetes job.
        time_limit (Optional[str]): Time limit for the test execution. When provided, it is
            written to `self.cmd_args["time_limit"]` before delegating.
        num_nodes (int): Number of nodes to be used for the test execution.
        nodes (Optional[List[str]]): Nodes involved in the test. `None` or an empty list is
            forwarded as an empty list.

    Returns:
        Dict[Any, Any]: A dictionary representing the Kubernetes job specification.
    """
    # A falsy value (None or an empty list) is forwarded as a fresh empty list.
    node_list = nodes if nodes else []

    # The time limit is stored on this test's cmd_args, so it remains set after the call.
    if time_limit is not None:
        self.cmd_args["time_limit"] = time_limit

    job_spec = self.test_template.gen_json(
        self.env_vars,
        self.cmd_args,
        self.extra_env_vars,
        self.extra_cmd_args,
        output_path,
        job_name,
        num_nodes,
        node_list,
    )
    return job_spec

def get_job_id(self, stdout: str, stderr: str) -> Optional[int]:
"""
Retrieve the job ID using the test template's method.
Expand Down
48 changes: 48 additions & 0 deletions src/cloudai/_core/test_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from .job_id_retrieval_strategy import JobIdRetrievalStrategy
from .job_status_result import JobStatusResult
from .job_status_retrieval_strategy import JobStatusRetrievalStrategy
from .json_gen_strategy import JsonGenStrategy
from .report_generation_strategy import ReportGenerationStrategy
from .system import System

Expand All @@ -42,6 +43,7 @@ class TestTemplate:
logger (logging.Logger): Logger for the test template.
install_strategy (InstallStrategy): Strategy for installing test prerequisites.
command_gen_strategy (CommandGenStrategy): Strategy for generating execution commands.
json_gen_strategy (JsonGenStrategy): Strategy for generating Kubernetes job specifications as dictionaries.
job_id_retrieval_strategy (JobIdRetrievalStrategy): Strategy for retrieving job IDs.
report_generation_strategy (ReportGenerationStrategy): Strategy for generating reports.
grading_strategy (GradingStrategy): Strategy for grading performance based on test outcomes.
Expand Down Expand Up @@ -72,6 +74,7 @@ def __init__(
self.cmd_args = cmd_args
self.install_strategy: Optional[InstallStrategy] = None
self.command_gen_strategy: Optional[CommandGenStrategy] = None
self.json_gen_strategy: Optional[JsonGenStrategy] = None
self.job_id_retrieval_strategy: Optional[JobIdRetrievalStrategy] = None
self.job_status_retrieval_strategy: Optional[JobStatusRetrievalStrategy] = None
self.report_generation_strategy: Optional[ReportGenerationStrategy] = None
Expand Down Expand Up @@ -166,6 +169,51 @@ def gen_exec_command(
nodes,
)

def gen_json(
    self,
    env_vars: Dict[str, str],
    cmd_args: Dict[str, str],
    extra_env_vars: Dict[str, str],
    extra_cmd_args: str,
    output_path: Path,
    job_name: str,
    num_nodes: int,
    nodes: List[str],
) -> Dict[Any, Any]:
    """
    Build the Kubernetes job specification dictionary for a test using this template.

    The arguments are forwarded unchanged to the template's JSON generation strategy, except
    that a falsy `nodes` value is replaced by an empty list.

    Args:
        env_vars (Dict[str, str]): Environment variables for the test.
        cmd_args (Dict[str, str]): Command-line arguments for the test.
        extra_env_vars (Dict[str, str]): Extra environment variables.
        extra_cmd_args (str): Extra command-line arguments.
        output_path (Path): Path to the output directory.
        job_name (str): The name of the job.
        num_nodes (int): The number of nodes to be used for the test execution.
        nodes (List[str]): A list of nodes where the test will be executed.

    Returns:
        Dict[Any, Any]: A dictionary representing the Kubernetes job specification.

    Raises:
        ValueError: If no JSON generation strategy has been set on this template.
    """
    strategy = self.json_gen_strategy
    if strategy is None:
        raise ValueError(
            "json_gen_strategy is missing. Ensure the strategy is registered in the Registry "
            "by calling the appropriate registration function for the system type."
        )

    return strategy.gen_json(
        env_vars,
        cmd_args,
        extra_env_vars,
        extra_cmd_args,
        output_path,
        job_name,
        num_nodes,
        nodes or [],
    )

def get_job_id(self, stdout: str, stderr: str) -> Optional[int]:
"""
Retrieve the job ID from the execution output using the job ID retrieval strategy.
Expand Down
5 changes: 5 additions & 0 deletions src/cloudai/_core/test_template_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from .install_strategy import InstallStrategy
from .job_id_retrieval_strategy import JobIdRetrievalStrategy
from .job_status_retrieval_strategy import JobStatusRetrievalStrategy
from .json_gen_strategy import JsonGenStrategy
from .registry import Registry
from .report_generation_strategy import ReportGenerationStrategy
from .system import System
Expand Down Expand Up @@ -127,6 +128,10 @@ def _parse_data(self, data: Dict[str, Any]) -> TestTemplate:
CommandGenStrategy,
self._fetch_strategy(CommandGenStrategy, type(obj.system), type(obj), env_vars, cmd_args),
)
obj.json_gen_strategy = cast(
JsonGenStrategy,
self._fetch_strategy(JsonGenStrategy, type(obj.system), type(obj), env_vars, cmd_args),
)
obj.job_id_retrieval_strategy = cast(
JobIdRetrievalStrategy,
self._fetch_strategy(JobIdRetrievalStrategy, type(obj.system), type(obj), env_vars, cmd_args),
Expand Down
Loading

0 comments on commit 1e6d618

Please sign in to comment.