Merge pull request #180 from TaekyungHeo/k8s-base

Add Kubernetes Support
NVIDIA · Sep 3, 2024 · 1e6d618 · 1e6d618
2 parents c8b032b + a4d0fef
commit 1e6d618
Show file tree

Hide file tree

Showing 19 changed files with 1,299 additions and 5 deletions.
diff --git a/conf/common/system/kubernetes_cluster.toml b/conf/common/system/kubernetes_cluster.toml
@@ -0,0 +1,33 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "kubernetes-cluster"
+scheduler = "kubernetes"
+kube_config_path = ""
+
+install_path = "./install"
+output_path = "./results"
+default_image = "ubuntu:22.04"
+default_namespace = "default"
+
+[global_env_vars]
+NCCL_IB_GID_INDEX = "3"
+NCCL_SOCKET_IFNAME = "ib0"
+NCCL_IB_HCA = "mlx5_0"
+UCX_NET_DEVICES = "mlx5_0:1"
+NCCL_P2P_LEVEL = "PIX"
+UCX_TLS = "rc_x,sm,cuda_copy"
+NCCL_IB_TC = "96"
diff --git a/requirements.txt b/requirements.txt
@@ -2,3 +2,4 @@ bokeh==3.4.1
 pandas==2.2.1
 tbparse==0.0.8
 toml==0.10.2
+kubernetes==30.1.0
diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py
@@ -26,6 +26,7 @@
 from ._core.job_id_retrieval_strategy import JobIdRetrievalStrategy
 from ._core.job_status_result import JobStatusResult
 from ._core.job_status_retrieval_strategy import JobStatusRetrievalStrategy
+from ._core.json_gen_strategy import JsonGenStrategy
 from ._core.parser import Parser
 from ._core.registry import Registry
 from ._core.report_generation_strategy import ReportGenerationStrategy
@@ -36,11 +37,14 @@
 from ._core.test_template import TestTemplate
 from ._core.test_template_strategy import TestTemplateStrategy
 from .installer.installer import Installer
+from .installer.kubernetes_installer import KubernetesInstaller
 from .installer.slurm_installer import SlurmInstaller
 from .installer.standalone_installer import StandaloneInstaller
+from .parser.system_parser.kubernetes_system_parser import KubernetesSystemParser
 from .parser.system_parser.slurm_system_parser import SlurmSystemParser
 from .parser.system_parser.standalone_system_parser import StandaloneSystemParser
 from .report_generator import ReportGenerator
+from .runner.kubernetes.kubernetes_runner import KubernetesRunner
 from .runner.slurm.slurm_runner import SlurmRunner
 from .runner.standalone.standalone_runner import StandaloneRunner
 from .schema.test_template.chakra_replay.grading_strategy import ChakraReplayGradingStrategy
@@ -72,6 +76,8 @@
 )
 from .schema.test_template.nemo_launcher.template import NeMoLauncher
 from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy
+from .schema.test_template.sleep.kubernetes_install_strategy import SleepKubernetesInstallStrategy
+from .schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
 from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy
 from .schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
 from .schema.test_template.sleep.standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy
@@ -82,24 +88,29 @@
 from .schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
 from .schema.test_template.ucc_test.slurm_install_strategy import UCCTestSlurmInstallStrategy
 from .schema.test_template.ucc_test.template import UCCTest
+from .systems.kubernetes.kubernetes_system import KubernetesSystem
 from .systems.slurm.slurm_system import SlurmSystem
 from .systems.standalone_system import StandaloneSystem
 
 Registry().add_system_parser("standalone", StandaloneSystemParser)
 Registry().add_system_parser("slurm", SlurmSystemParser)
+Registry().add_system_parser("kubernetes", KubernetesSystemParser)
 
 Registry().add_runner("slurm", SlurmRunner)
+Registry().add_runner("kubernetes", KubernetesRunner)
 Registry().add_runner("standalone", StandaloneRunner)
 
 Registry().add_strategy(InstallStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmInstallStrategy)
 Registry().add_strategy(InstallStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmInstallStrategy)
 Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NcclTest], NcclTestReportGenerationStrategy)
 Registry().add_strategy(CommandGenStrategy, [StandaloneSystem], [Sleep], SleepStandaloneCommandGenStrategy)
 Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [Sleep], SleepSlurmCommandGenStrategy)
+Registry().add_strategy(JsonGenStrategy, [KubernetesSystem], [Sleep], SleepKubernetesJsonGenStrategy)
 Registry().add_strategy(InstallStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmInstallStrategy)
 Registry().add_strategy(GradingStrategy, [SlurmSystem], [NcclTest], NcclTestGradingStrategy)
 Registry().add_strategy(InstallStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmInstallStrategy)
 Registry().add_strategy(InstallStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepStandaloneInstallStrategy)
+Registry().add_strategy(InstallStrategy, [KubernetesSystem], [Sleep], SleepKubernetesInstallStrategy)
 Registry().add_strategy(
     ReportGenerationStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepReportGenerationStrategy
 )
@@ -122,6 +133,7 @@
 )
 Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy)
 Registry().add_strategy(JobStatusRetrievalStrategy, [StandaloneSystem], [Sleep], DefaultJobStatusRetrievalStrategy)
+Registry().add_strategy(JobStatusRetrievalStrategy, [KubernetesSystem], [Sleep], DefaultJobStatusRetrievalStrategy)
 Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [NcclTest], NcclTestJobStatusRetrievalStrategy)
 Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxJobStatusRetrievalStrategy)
 Registry().add_strategy(
@@ -145,13 +157,15 @@
 
 Registry().add_installer("slurm", SlurmInstaller)
 Registry().add_installer("standalone", StandaloneInstaller)
+Registry().add_installer("kubernetes", KubernetesInstaller)
 
 __all__ = [
     "BaseInstaller",
     "BaseJob",
     "BaseRunner",
     "BaseSystemParser",
     "CommandGenStrategy",
+    "JsonGenStrategy",
     "Grader",
     "GradingStrategy",
     "Installer",

diff --git a/src/cloudai/_core/json_gen_strategy.py b/src/cloudai/_core/json_gen_strategy.py
@@ -0,0 +1,59 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+from pathlib import Path
+from typing import Any, Dict, List
+
+from .test_template_strategy import TestTemplateStrategy
+
+
+class JsonGenStrategy(TestTemplateStrategy):
+    """
+    Abstract base class for generating Kubernetes job specifications based on system and test parameters.
+
+    Subclasses implement gen_json to build the job specification and return it as a dictionary.
+    """
+
+    @abstractmethod
+    def gen_json(
+        self,
+        env_vars: Dict[str, str],
+        cmd_args: Dict[str, str],
+        extra_env_vars: Dict[str, str],
+        extra_cmd_args: str,
+        output_path: Path,
+        job_name: str,
+        num_nodes: int,
+        nodes: List[str],
+    ) -> Dict[Any, Any]:
+        """
+        Generate the Kubernetes job specification based on the given parameters.
+
+        Args:
+            env_vars (Dict[str, str]): Environment variables for the job.
+            cmd_args (Dict[str, str]): Command-line arguments for the job.
+            extra_env_vars (Dict[str, str]): Additional environment variables.
+            extra_cmd_args (str): Additional command-line arguments.
+            output_path (Path): Path to the output directory.
+            job_name (str): The name of the job.
+            num_nodes (int): The number of nodes to be used for job execution.
+            nodes (List[str]): List of nodes for job execution; may be an empty list.
+
+        Returns:
+            Dict[Any, Any]: The generated Kubernetes job specification as a dictionary.
+        """
+        pass
diff --git a/src/cloudai/_core/test.py b/src/cloudai/_core/test.py
@@ -16,7 +16,7 @@
 
 import sys
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from .job_status_result import JobStatusResult
 from .test_template import TestTemplate
@@ -120,7 +120,7 @@ def gen_exec_command(
         Generate the command to run this specific test.
 
         Args:
-            output_path (str): Path to the output directory.
+            output_path (Path): Path to the output directory where logs and results will be stored.
             time_limit (Optional[str]): Time limit for the test execution.
             num_nodes (Optional[int]): Number of nodes to be used for the test execution.
             nodes (Optional[List[str]]): List of nodes involved in the test.
@@ -143,6 +143,43 @@ def gen_exec_command(
             nodes,
         )
 
+    def gen_json(
+        self,
+        output_path: Path,
+        job_name: str,
+        time_limit: Optional[str] = None,
+        num_nodes: int = 1,
+        nodes: Optional[List[str]] = None,
+    ) -> Dict[Any, Any]:
+        """
+        Generate a dictionary representing the Kubernetes job specification for this test.
+
+        Args:
+            output_path (Path): Path to the output directory where logs and results will be stored.
+            job_name (str): The name assigned to the Kubernetes job.
+            time_limit (Optional[str]): Time limit for the test; when given, it is stored in cmd_args["time_limit"].
+            num_nodes (int): Number of nodes to be used for the test execution. Defaults to 1.
+            nodes (Optional[List[str]]): List of nodes involved in the test; None or empty is passed on as [].
+
+        Returns:
+            Dict[Any, Any]: A dictionary representing the Kubernetes job specification.
+        """
+        if time_limit is not None:
+            self.cmd_args["time_limit"] = time_limit
+        if not nodes:
+            nodes = []
+
+        return self.test_template.gen_json(
+            self.env_vars,
+            self.cmd_args,
+            self.extra_env_vars,
+            self.extra_cmd_args,
+            output_path,
+            job_name,
+            num_nodes,
+            nodes,
+        )
+
     def get_job_id(self, stdout: str, stderr: str) -> Optional[int]:
         """
         Retrieve the job ID using the test template's method.

diff --git a/src/cloudai/_core/test_template.py b/src/cloudai/_core/test_template.py
@@ -24,6 +24,7 @@
 from .job_id_retrieval_strategy import JobIdRetrievalStrategy
 from .job_status_result import JobStatusResult
 from .job_status_retrieval_strategy import JobStatusRetrievalStrategy
+from .json_gen_strategy import JsonGenStrategy
 from .report_generation_strategy import ReportGenerationStrategy
 from .system import System
 
@@ -42,6 +43,7 @@ class TestTemplate:
         logger (logging.Logger): Logger for the test template.
         install_strategy (InstallStrategy): Strategy for installing test prerequisites.
         command_gen_strategy (CommandGenStrategy): Strategy for generating execution commands.
+        json_gen_strategy (JsonGenStrategy): Strategy for generating Kubernetes job specifications.
         job_id_retrieval_strategy (JobIdRetrievalStrategy): Strategy for retrieving job IDs.
         report_generation_strategy (ReportGenerationStrategy): Strategy for generating reports.
         grading_strategy (GradingStrategy): Strategy for grading performance based on test outcomes.
@@ -72,6 +74,7 @@ def __init__(
         self.cmd_args = cmd_args
         self.install_strategy: Optional[InstallStrategy] = None
         self.command_gen_strategy: Optional[CommandGenStrategy] = None
+        self.json_gen_strategy: Optional[JsonGenStrategy] = None
         self.job_id_retrieval_strategy: Optional[JobIdRetrievalStrategy] = None
         self.job_status_retrieval_strategy: Optional[JobStatusRetrievalStrategy] = None
         self.report_generation_strategy: Optional[ReportGenerationStrategy] = None
@@ -166,6 +169,51 @@ def gen_exec_command(
             nodes,
         )
 
+    def gen_json(
+        self,
+        env_vars: Dict[str, str],
+        cmd_args: Dict[str, str],
+        extra_env_vars: Dict[str, str],
+        extra_cmd_args: str,
+        output_path: Path,
+        job_name: str,
+        num_nodes: int,
+        nodes: List[str],
+    ) -> Dict[Any, Any]:
+        """
+        Generate a dictionary representing the Kubernetes job specification for this test using this template.
+
+        Args:
+            env_vars (Dict[str, str]): Environment variables for the test.
+            cmd_args (Dict[str, str]): Command-line arguments for the test.
+            extra_env_vars (Dict[str, str]): Extra environment variables.
+            extra_cmd_args (str): Extra command-line arguments.
+            output_path (Path): Path to the output directory.
+            job_name (str): The name of the job.
+            num_nodes (int): The number of nodes to be used for the test execution.
+            nodes (List[str]): A list of nodes where the test will be executed; a falsy value is replaced by [].
+
+        Returns:
+            Dict[Any, Any]: The specification from json_gen_strategy; ValueError is raised if no strategy is set.
+        """
+        if not nodes:
+            nodes = []
+        if self.json_gen_strategy is None:
+            raise ValueError(
+                "json_gen_strategy is missing. Ensure the strategy is registered in the Registry "
+                "by calling the appropriate registration function for the system type."
+            )
+        return self.json_gen_strategy.gen_json(
+            env_vars,
+            cmd_args,
+            extra_env_vars,
+            extra_cmd_args,
+            output_path,
+            job_name,
+            num_nodes,
+            nodes,
+        )
+
     def get_job_id(self, stdout: str, stderr: str) -> Optional[int]:
         """
         Retrieve the job ID from the execution output using the job ID retrieval strategy.

diff --git a/src/cloudai/_core/test_template_parser.py b/src/cloudai/_core/test_template_parser.py
@@ -24,6 +24,7 @@
 from .install_strategy import InstallStrategy
 from .job_id_retrieval_strategy import JobIdRetrievalStrategy
 from .job_status_retrieval_strategy import JobStatusRetrievalStrategy
+from .json_gen_strategy import JsonGenStrategy
 from .registry import Registry
 from .report_generation_strategy import ReportGenerationStrategy
 from .system import System
@@ -127,6 +128,10 @@ def _parse_data(self, data: Dict[str, Any]) -> TestTemplate:
             CommandGenStrategy,
             self._fetch_strategy(CommandGenStrategy, type(obj.system), type(obj), env_vars, cmd_args),
         )
+        obj.json_gen_strategy = cast(
+            JsonGenStrategy,
+            self._fetch_strategy(JsonGenStrategy, type(obj.system), type(obj), env_vars, cmd_args),
+        )
         obj.job_id_retrieval_strategy = cast(
             JobIdRetrievalStrategy,
             self._fetch_strategy(JobIdRetrievalStrategy, type(obj.system), type(obj), env_vars, cmd_args),