Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Kubernetes Support #180

Merged
merged 10 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions conf/common/system/kubernetes_cluster.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "kubernetes-cluster"
scheduler = "kubernetes"
kube_config_path = ""
TaekyungHeo marked this conversation as resolved.
Show resolved Hide resolved

install_path = "./install"
output_path = "./results"
default_image = "ubuntu:22.04"
TaekyungHeo marked this conversation as resolved.
Show resolved Hide resolved
default_namespace = "default"

[global_env_vars]
NCCL_IB_GID_INDEX = "3"
NCCL_SOCKET_IFNAME = "ib0"
NCCL_IB_HCA = "mlx5_0"
UCX_NET_DEVICES = "mlx5_0:1"
NCCL_P2P_LEVEL = "PIX"
UCX_TLS = "rc_x,sm,cuda_copy"
NCCL_IB_TC = "96"
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ bokeh==3.4.1
pandas==2.2.1
tbparse==0.0.8
toml==0.10.2
kubernetes==30.1.0
TaekyungHeo marked this conversation as resolved.
Show resolved Hide resolved
14 changes: 14 additions & 0 deletions src/cloudai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ._core.job_id_retrieval_strategy import JobIdRetrievalStrategy
from ._core.job_status_result import JobStatusResult
from ._core.job_status_retrieval_strategy import JobStatusRetrievalStrategy
from ._core.json_gen_strategy import JsonGenStrategy
from ._core.parser import Parser
from ._core.registry import Registry
from ._core.report_generation_strategy import ReportGenerationStrategy
Expand All @@ -36,11 +37,14 @@
from ._core.test_template import TestTemplate
from ._core.test_template_strategy import TestTemplateStrategy
from .installer.installer import Installer
from .installer.kubernetes_installer import KubernetesInstaller
from .installer.slurm_installer import SlurmInstaller
from .installer.standalone_installer import StandaloneInstaller
from .parser.system_parser.kubernetes_system_parser import KubernetesSystemParser
from .parser.system_parser.slurm_system_parser import SlurmSystemParser
from .parser.system_parser.standalone_system_parser import StandaloneSystemParser
from .report_generator import ReportGenerator
from .runner.kubernetes.kubernetes_runner import KubernetesRunner
from .runner.slurm.slurm_runner import SlurmRunner
from .runner.standalone.standalone_runner import StandaloneRunner
from .schema.test_template.chakra_replay.grading_strategy import ChakraReplayGradingStrategy
Expand Down Expand Up @@ -72,6 +76,8 @@
)
from .schema.test_template.nemo_launcher.template import NeMoLauncher
from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy
from .schema.test_template.sleep.kubernetes_install_strategy import SleepKubernetesInstallStrategy
from .schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy
from .schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
from .schema.test_template.sleep.standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy
Expand All @@ -82,24 +88,29 @@
from .schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
from .schema.test_template.ucc_test.slurm_install_strategy import UCCTestSlurmInstallStrategy
from .schema.test_template.ucc_test.template import UCCTest
from .systems.kubernetes.kubernetes_system import KubernetesSystem
from .systems.slurm.slurm_system import SlurmSystem
from .systems.standalone_system import StandaloneSystem

Registry().add_system_parser("standalone", StandaloneSystemParser)
Registry().add_system_parser("slurm", SlurmSystemParser)
Registry().add_system_parser("kubernetes", KubernetesSystemParser)

Registry().add_runner("slurm", SlurmRunner)
Registry().add_runner("kubernetes", KubernetesRunner)
Registry().add_runner("standalone", StandaloneRunner)

Registry().add_strategy(InstallStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmInstallStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmInstallStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NcclTest], NcclTestReportGenerationStrategy)
Registry().add_strategy(CommandGenStrategy, [StandaloneSystem], [Sleep], SleepStandaloneCommandGenStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [Sleep], SleepSlurmCommandGenStrategy)
Registry().add_strategy(JsonGenStrategy, [KubernetesSystem], [Sleep], SleepKubernetesJsonGenStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmInstallStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NcclTest], NcclTestGradingStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmInstallStrategy)
Registry().add_strategy(InstallStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepStandaloneInstallStrategy)
Registry().add_strategy(InstallStrategy, [KubernetesSystem], [Sleep], SleepKubernetesInstallStrategy)
Registry().add_strategy(
ReportGenerationStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepReportGenerationStrategy
)
Expand All @@ -122,6 +133,7 @@
)
Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [StandaloneSystem], [Sleep], DefaultJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [KubernetesSystem], [Sleep], DefaultJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [NcclTest], NcclTestJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxJobStatusRetrievalStrategy)
Registry().add_strategy(
Expand All @@ -145,13 +157,15 @@

Registry().add_installer("slurm", SlurmInstaller)
Registry().add_installer("standalone", StandaloneInstaller)
Registry().add_installer("kubernetes", KubernetesInstaller)

__all__ = [
"BaseInstaller",
"BaseJob",
"BaseRunner",
"BaseSystemParser",
"CommandGenStrategy",
"JsonGenStrategy",
"Grader",
"GradingStrategy",
"Installer",
Expand Down
59 changes: 59 additions & 0 deletions src/cloudai/_core/json_gen_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from pathlib import Path
from typing import Any, Dict, List

from .test_template_strategy import TestTemplateStrategy


class JsonGenStrategy(TestTemplateStrategy):
    """
    Strategy interface for producing Kubernetes job specifications.

    Concrete strategies turn system and test parameters into a dictionary
    that describes the job in JSON-compatible form.
    """

    @abstractmethod
    def gen_json(
        self,
        env_vars: Dict[str, str],
        cmd_args: Dict[str, str],
        extra_env_vars: Dict[str, str],
        extra_cmd_args: str,
        output_path: Path,
        job_name: str,
        num_nodes: int,
        nodes: List[str],
    ) -> Dict[Any, Any]:
        """
        Build the Kubernetes job specification for the given parameters.

        Args:
            env_vars (Dict[str, str]): Environment variables for the job.
            cmd_args (Dict[str, str]): Command-line arguments for the job.
            extra_env_vars (Dict[str, str]): Additional environment variables.
            extra_cmd_args (str): Additional command-line arguments.
            output_path (Path): Path to the output directory.
            job_name (str): The name of the job.
            num_nodes (int): The number of nodes to be used for job execution.
            nodes (List[str]): Nodes for job execution; may be an empty list.

        Returns:
            Dict[Any, Any]: The generated Kubernetes job specification.
        """
41 changes: 39 additions & 2 deletions src/cloudai/_core/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

import sys
from pathlib import Path
from typing import Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union

from .job_status_result import JobStatusResult
from .test_template import TestTemplate
Expand Down Expand Up @@ -120,7 +120,7 @@ def gen_exec_command(
Generate the command to run this specific test.

Args:
output_path (str): Path to the output directory.
output_path (Path): Path to the output directory where logs and results will be stored.
time_limit (Optional[str]): Time limit for the test execution.
num_nodes (Optional[int]): Number of nodes to be used for the test execution.
nodes (Optional[List[str]]): List of nodes involved in the test.
Expand All @@ -143,6 +143,43 @@ def gen_exec_command(
nodes,
)

def gen_json(
    self,
    output_path: Path,
    job_name: str,
    time_limit: Optional[str] = None,
    num_nodes: int = 1,
    nodes: Optional[List[str]] = None,
) -> Dict[Any, Any]:
    """
    Generate a dictionary representing the Kubernetes job specification for this test.

    The work is delegated to ``self.test_template.gen_json``; this method only
    prepares the arguments.

    Args:
        output_path (Path): Path to the output directory where logs and results will be stored.
        job_name (str): The name assigned to the Kubernetes job.
        time_limit (Optional[str]): Time limit for the test execution. When given, it is
            stored in ``self.cmd_args["time_limit"]`` before the template is invoked.
        num_nodes (int): Number of nodes to be used for the test execution. Defaults to 1.
        nodes (Optional[List[str]]): List of nodes involved in the test. ``None`` or an
            empty list is forwarded as an empty list.

    Returns:
        Dict[Any, Any]: A dictionary representing the Kubernetes job specification.
    """
    if time_limit is not None:
        # Side effect: the time limit is recorded on the test's own cmd_args,
        # so it stays set after this call returns.
        self.cmd_args["time_limit"] = time_limit

    return self.test_template.gen_json(
        self.env_vars,
        self.cmd_args,
        self.extra_env_vars,
        self.extra_cmd_args,
        output_path,
        job_name,
        num_nodes,
        nodes or [],
    )

def get_job_id(self, stdout: str, stderr: str) -> Optional[int]:
"""
Retrieve the job ID using the test template's method.
Expand Down
48 changes: 48 additions & 0 deletions src/cloudai/_core/test_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from .job_id_retrieval_strategy import JobIdRetrievalStrategy
from .job_status_result import JobStatusResult
from .job_status_retrieval_strategy import JobStatusRetrievalStrategy
from .json_gen_strategy import JsonGenStrategy
from .report_generation_strategy import ReportGenerationStrategy
from .system import System

Expand All @@ -42,6 +43,7 @@ class TestTemplate:
logger (logging.Logger): Logger for the test template.
install_strategy (InstallStrategy): Strategy for installing test prerequisites.
command_gen_strategy (CommandGenStrategy): Strategy for generating execution commands.
json_gen_strategy (JsonGenStrategy): Strategy for generating json string.
job_id_retrieval_strategy (JobIdRetrievalStrategy): Strategy for retrieving job IDs.
report_generation_strategy (ReportGenerationStrategy): Strategy for generating reports.
grading_strategy (GradingStrategy): Strategy for grading performance based on test outcomes.
Expand Down Expand Up @@ -72,6 +74,7 @@ def __init__(
self.cmd_args = cmd_args
self.install_strategy: Optional[InstallStrategy] = None
self.command_gen_strategy: Optional[CommandGenStrategy] = None
self.json_gen_strategy: Optional[JsonGenStrategy] = None
self.job_id_retrieval_strategy: Optional[JobIdRetrievalStrategy] = None
self.job_status_retrieval_strategy: Optional[JobStatusRetrievalStrategy] = None
self.report_generation_strategy: Optional[ReportGenerationStrategy] = None
Expand Down Expand Up @@ -166,6 +169,51 @@ def gen_exec_command(
nodes,
)

def gen_json(
    self,
    env_vars: Dict[str, str],
    cmd_args: Dict[str, str],
    extra_env_vars: Dict[str, str],
    extra_cmd_args: str,
    output_path: Path,
    job_name: str,
    num_nodes: int,
    nodes: List[str],
) -> Dict[Any, Any]:
    """
    Generate a dictionary representing the Kubernetes job specification for this test using this template.

    The call is forwarded unchanged to ``self.json_gen_strategy.gen_json``.

    Args:
        env_vars (Dict[str, str]): Environment variables for the test.
        cmd_args (Dict[str, str]): Command-line arguments for the test.
        extra_env_vars (Dict[str, str]): Extra environment variables.
        extra_cmd_args (str): Extra command-line arguments.
        output_path (Path): Path to the output directory.
        job_name (str): The name of the job.
        num_nodes (int): The number of nodes to be used for the test execution.
        nodes (List[str]): A list of nodes where the test will be executed. A falsy
            value is forwarded as an empty list.

    Returns:
        Dict[Any, Any]: A dictionary representing the Kubernetes job specification.

    Raises:
        ValueError: If no JSON generation strategy has been set on this template.
    """
    if self.json_gen_strategy is None:
        raise ValueError(
            "json_gen_strategy is missing. Ensure the strategy is registered in the Registry "
            "by calling the appropriate registration function for the system type."
        )

    return self.json_gen_strategy.gen_json(
        env_vars,
        cmd_args,
        extra_env_vars,
        extra_cmd_args,
        output_path,
        job_name,
        num_nodes,
        nodes or [],
    )

def get_job_id(self, stdout: str, stderr: str) -> Optional[int]:
"""
Retrieve the job ID from the execution output using the job ID retrieval strategy.
Expand Down
5 changes: 5 additions & 0 deletions src/cloudai/_core/test_template_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from .install_strategy import InstallStrategy
from .job_id_retrieval_strategy import JobIdRetrievalStrategy
from .job_status_retrieval_strategy import JobStatusRetrievalStrategy
from .json_gen_strategy import JsonGenStrategy
from .registry import Registry
from .report_generation_strategy import ReportGenerationStrategy
from .system import System
Expand Down Expand Up @@ -127,6 +128,10 @@ def _parse_data(self, data: Dict[str, Any]) -> TestTemplate:
CommandGenStrategy,
self._fetch_strategy(CommandGenStrategy, type(obj.system), type(obj), env_vars, cmd_args),
)
obj.json_gen_strategy = cast(
JsonGenStrategy,
self._fetch_strategy(JsonGenStrategy, type(obj.system), type(obj), env_vars, cmd_args),
)
obj.job_id_retrieval_strategy = cast(
JobIdRetrievalStrategy,
self._fetch_strategy(JobIdRetrievalStrategy, type(obj.system), type(obj), env_vars, cmd_args),
Expand Down
Loading