Skip to content

Commit

Permalink
Unify job spec generation interface across Slurm and Kubernetes
Browse files Browse the repository at this point in the history
  • Loading branch information
TaekyungHeo committed Aug 22, 2024
1 parent 9e8eae3 commit 62be1ab
Show file tree
Hide file tree
Showing 31 changed files with 372 additions and 478 deletions.
46 changes: 24 additions & 22 deletions src/cloudai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,16 @@
from ._core.base_job import BaseJob
from ._core.base_runner import BaseRunner
from ._core.base_system_parser import BaseSystemParser
from ._core.command_gen_strategy import CommandGenStrategy
from ._core.exceptions import JobIdRetrievalError
from ._core.grader import Grader
from ._core.grading_strategy import GradingStrategy
from ._core.install_strategy import InstallStrategy
from ._core.job_context import JobContext
from ._core.job_id_retrieval_strategy import JobIdRetrievalStrategy
from ._core.job_spec_gen_strategy import JobSpecGenStrategy
from ._core.job_specification import JobSpecification
from ._core.job_status_result import JobStatusResult
from ._core.job_status_retrieval_strategy import JobStatusRetrievalStrategy
from ._core.json_gen_strategy import JsonGenStrategy
from ._core.parser import Parser
from ._core.registry import Registry
from ._core.report_generation_strategy import ReportGenerationStrategy
Expand All @@ -49,54 +50,54 @@
from .runner.standalone.standalone_runner import StandaloneRunner
from .schema.test_template.chakra_replay.grading_strategy import ChakraReplayGradingStrategy
from .schema.test_template.chakra_replay.report_generation_strategy import ChakraReplayReportGenerationStrategy
from .schema.test_template.chakra_replay.slurm_command_gen_strategy import ChakraReplaySlurmCommandGenStrategy
from .schema.test_template.chakra_replay.slurm_install_strategy import ChakraReplaySlurmInstallStrategy
from .schema.test_template.chakra_replay.slurm_job_spec_gen_strategy import ChakraReplaySlurmJobSpecGenStrategy
from .schema.test_template.chakra_replay.template import ChakraReplay
from .schema.test_template.common.default_job_status_retrieval_strategy import DefaultJobStatusRetrievalStrategy
from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy
from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy
from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy
from .schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy
from .schema.test_template.jax_toolbox.report_generation_strategy import JaxToolboxReportGenerationStrategy
from .schema.test_template.jax_toolbox.slurm_command_gen_strategy import JaxToolboxSlurmCommandGenStrategy
from .schema.test_template.jax_toolbox.slurm_install_strategy import JaxToolboxSlurmInstallStrategy
from .schema.test_template.jax_toolbox.slurm_job_spec_gen_strategy import JaxToolboxSlurmJobSpecGenStrategy
from .schema.test_template.jax_toolbox.template import JaxToolbox
from .schema.test_template.nccl_test.kubernetes_grading_strategy import KubernetesNcclTestGradingStrategy
from .schema.test_template.nccl_test.kubernetes_job_spec_gen_strategy import NcclTestKubernetesJobSpecGenStrategy
from .schema.test_template.nccl_test.kubernetes_job_status_retrieval_strategy import (
KubernetesNcclTestJobStatusRetrievalStrategy,
)
from .schema.test_template.nccl_test.kubernetes_json_gen_strategy import NcclTestKubernetesJsonGenStrategy
from .schema.test_template.nccl_test.kubernetes_report_generation_strategy import (
KubernetesNcclTestReportGenerationStrategy,
)
from .schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy
from .schema.test_template.nccl_test.slurm_grading_strategy import SlurmNcclTestGradingStrategy
from .schema.test_template.nccl_test.slurm_install_strategy import NcclTestSlurmInstallStrategy
from .schema.test_template.nccl_test.slurm_job_spec_gen_strategy import NcclTestSlurmJobSpecGenStrategy
from .schema.test_template.nccl_test.slurm_job_status_retrieval_strategy import (
SlurmNcclTestJobStatusRetrievalStrategy,
)
from .schema.test_template.nccl_test.slurm_report_generation_strategy import SlurmNcclTestReportGenerationStrategy
from .schema.test_template.nccl_test.template import NcclTest
from .schema.test_template.nemo_launcher.grading_strategy import NeMoLauncherGradingStrategy
from .schema.test_template.nemo_launcher.report_generation_strategy import NeMoLauncherReportGenerationStrategy
from .schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy
from .schema.test_template.nemo_launcher.slurm_install_strategy import NeMoLauncherSlurmInstallStrategy
from .schema.test_template.nemo_launcher.slurm_job_id_retrieval_strategy import (
NeMoLauncherSlurmJobIdRetrievalStrategy,
)
from .schema.test_template.nemo_launcher.slurm_job_spec_gen_strategy import NeMoLauncherSlurmJobSpecGenStrategy
from .schema.test_template.nemo_launcher.template import NeMoLauncher
from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy
from .schema.test_template.sleep.kubernetes_install_strategy import SleepKubernetesInstallStrategy
from .schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
from .schema.test_template.sleep.kubernetes_job_spec_gen_strategy import SleepKubernetesJobSpecGenStrategy
from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy
from .schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
from .schema.test_template.sleep.standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy
from .schema.test_template.sleep.slurm_job_spec_gen_strategy import SleepSlurmJobSpecGenStrategy
from .schema.test_template.sleep.standalone_install_strategy import SleepStandaloneInstallStrategy
from .schema.test_template.sleep.standalone_job_spec_gen_strategy import SleepStandaloneJobSpecGenStrategy
from .schema.test_template.sleep.template import Sleep
from .schema.test_template.ucc_test.grading_strategy import UCCTestGradingStrategy
from .schema.test_template.ucc_test.report_generation_strategy import UCCTestReportGenerationStrategy
from .schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
from .schema.test_template.ucc_test.slurm_install_strategy import UCCTestSlurmInstallStrategy
from .schema.test_template.ucc_test.slurm_job_spec_gen_strategy import UCCTestSlurmJobSpecGenStrategy
from .schema.test_template.ucc_test.template import UCCTest
from .systems.kubernetes.kubernetes_system import KubernetesSystem
from .systems.slurm.slurm_system import SlurmSystem
Expand All @@ -116,11 +117,11 @@
Registry().add_strategy(
ReportGenerationStrategy, [KubernetesSystem], [NcclTest], KubernetesNcclTestReportGenerationStrategy
)
Registry().add_strategy(CommandGenStrategy, [StandaloneSystem], [Sleep], SleepStandaloneCommandGenStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [Sleep], SleepSlurmCommandGenStrategy)
Registry().add_strategy(JsonGenStrategy, [KubernetesSystem], [Sleep], SleepKubernetesJsonGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [StandaloneSystem], [Sleep], SleepStandaloneJobSpecGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [Sleep], SleepSlurmJobSpecGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [KubernetesSystem], [Sleep], SleepKubernetesJobSpecGenStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmInstallStrategy)
Registry().add_strategy(JsonGenStrategy, [KubernetesSystem], [NcclTest], NcclTestKubernetesJsonGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [KubernetesSystem], [NcclTest], NcclTestKubernetesJobSpecGenStrategy)
Registry().add_strategy(GradingStrategy, [KubernetesSystem], [NcclTest], KubernetesNcclTestGradingStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NcclTest], SlurmNcclTestGradingStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmInstallStrategy)
Expand All @@ -130,16 +131,16 @@
ReportGenerationStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepReportGenerationStrategy
)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherReportGenerationStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmCommandGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmJobSpecGenStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [Sleep], SleepGradingStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxReportGenerationStrategy)
Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobIdRetrievalStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobSpecGenStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxGradingStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [UCCTest], UCCTestGradingStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmCommandGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmJobSpecGenStrategy)
Registry().add_strategy(
JobIdRetrievalStrategy,
[SlurmSystem],
Expand All @@ -160,11 +161,11 @@
[ChakraReplay, UCCTest, NeMoLauncher, Sleep],
DefaultJobStatusRetrievalStrategy,
)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmJobSpecGenStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmInstallStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayReportGenerationStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayGradingStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmCommandGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmJobSpecGenStrategy)

Registry().add_test_template("ChakraReplay", ChakraReplay)
Registry().add_test_template("JaxToolbox", JaxToolbox)
Expand All @@ -182,8 +183,9 @@
"BaseJob",
"BaseRunner",
"BaseSystemParser",
"CommandGenStrategy",
"JsonGenStrategy",
"JobContext",
"JobSpecification",
"JobSpecGenStrategy",
"Grader",
"GradingStrategy",
"Installer",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,44 +14,54 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from pathlib import Path
from typing import Dict, List
from typing import Dict, List, Optional

from .test_template_strategy import TestTemplateStrategy


class JobContext:
    """
    Encapsulates all necessary parameters required to generate a job specification.

    The object is a plain value holder: the constructor stores every argument
    on the instance under the same name and performs no validation or copying.

    Attributes
        env_vars (Dict[str, str]): Environment variables for the test.
        cmd_args (Dict[str, str]): Command-line arguments for the test.
        extra_env_vars (Dict[str, str]): Additional environment variables.
        extra_cmd_args (str): Additional command-line arguments.
        output_path (Path): Path to the output directory.
        job_name (Optional[str]): The name of the job, if applicable.
        num_nodes (int): The number of nodes to be used for the test execution.
        nodes (List[str]): List of nodes for test execution. Described as
            optional, but no default is provided, so callers must always pass
            a list.
    """

    def __init__(
        self,
        env_vars: Dict[str, str],
        cmd_args: Dict[str, str],
        extra_env_vars: Dict[str, str],
        extra_cmd_args: str,
        output_path: Path,
        job_name: Optional[str],
        num_nodes: int,
        nodes: List[str],
    ) -> None:
        """
        Initialize a new JobContext instance with the provided parameters.

        Args:
            env_vars (Dict[str, str]): Environment variables for the test.
            cmd_args (Dict[str, str]): Command-line arguments for the test.
            extra_env_vars (Dict[str, str]): Additional environment variables.
            extra_cmd_args (str): Additional command-line arguments.
            output_path (Path): Path to the output directory.
            job_name (Optional[str]): The name of the job, if applicable.
            num_nodes (int): The number of nodes to be used for the test execution.
            nodes (List[str]): List of nodes for test execution.
        """
        # Arguments are stored by reference; mutating a dict or list passed in
        # here is visible through the context afterwards.
        self.env_vars = env_vars
        self.cmd_args = cmd_args
        self.extra_env_vars = extra_env_vars
        self.extra_cmd_args = extra_cmd_args
        self.output_path = output_path
        self.job_name = job_name
        self.num_nodes = num_nodes
        self.nodes = nodes
43 changes: 43 additions & 0 deletions src/cloudai/_core/job_spec_gen_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod

from .job_context import JobContext
from .job_specification import JobSpecification
from .test_template_strategy import TestTemplateStrategy


class JobSpecGenStrategy(TestTemplateStrategy):
    """
    Abstract interface for strategies that turn a job context into a job specification.

    Concrete strategies implement ``gen_job_spec`` to build the specification
    from the system and test parameters carried by the context.
    """

    @abstractmethod
    def gen_job_spec(self, context: JobContext) -> JobSpecification:
        """
        Build the job specification for a test from the supplied context.

        Args:
            context (JobContext): The context containing all necessary parameters.

        Returns:
            JobSpecification: The generated job specification, which could be a command string,
                a JSON object, or other format suitable for the system environment.
        """
36 changes: 36 additions & 0 deletions src/cloudai/_core/job_specification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any


class JobSpecification:
    """
    Represents the job specification generated by the JobSpecGenStrategy.

    Attributes
        spec_data (Any): The underlying data representing the job specification.
    """

    def __init__(self, spec_data: Any):
        """
        Initialize a new JobSpecification instance.

        Args:
            spec_data (Any): The underlying data representing the job specification.
                This could be a command string, a JSON object, or any other format.
        """
        # Stored by reference and without validation; the wrapper does not
        # interpret the payload.
        self.spec_data = spec_data

    def __repr__(self) -> str:
        """Return a debug representation that shows the wrapped payload."""
        return f"{type(self).__name__}(spec_data={self.spec_data!r})"
59 changes: 0 additions & 59 deletions src/cloudai/_core/json_gen_strategy.py

This file was deleted.

Loading

0 comments on commit 62be1ab

Please sign in to comment.