Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unify Job Spec Generation Interface Across Slurm and Kubernetes #182

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions conf/common/system/kubernetes_cluster.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# System definition for a cluster that uses the "kubernetes" scheduler.
name = "kubernetes-cluster"
scheduler = "kubernetes"
# NOTE(review): left empty here; presumably the consumer falls back to a
# default kubeconfig location when this is blank — confirm before relying on it.
kube_config_path = ""

# Relative paths. NOTE(review): presumably resolved against the directory the
# tool is launched from — confirm.
install_path = "./install"
output_path = "./results"
# NOTE(review): presumably used when a test does not name its own container
# image / namespace — TODO confirm against the Kubernetes system implementation.
default_image = "ubuntu:22.04"
default_namespace = "default"

# NCCL and UCX environment settings; every value is written as a string.
# NOTE(review): the interface and adapter names below (ib0, mlx5_0) are
# hardware-specific — verify them against the target cluster's nodes.
[global_env_vars]
NCCL_IB_GID_INDEX = "3"
NCCL_SOCKET_IFNAME = "ib0"
NCCL_IB_HCA = "mlx5_0"
UCX_NET_DEVICES = "mlx5_0:1"
NCCL_P2P_LEVEL = "PIX"
UCX_TLS = "rc_x,sm,cuda_copy"
NCCL_IB_TC = "96"
2 changes: 1 addition & 1 deletion conf/common/test/nccl_test_all_gather.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ test_template_name = "NcclTest"
extra_cmd_args = "--stepfactor 2"

[cmd_args]
"subtest_name" = "all_gather_perf_mpi"
"subtest_name" = "all_gather_perf"
"ngpus" = "1"
"minbytes" = "128"
"maxbytes" = "4G"
Expand Down
2 changes: 1 addition & 1 deletion conf/common/test/nccl_test_all_reduce.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ test_template_name = "NcclTest"
extra_cmd_args = "--stepfactor 2"

[cmd_args]
"subtest_name" = "all_reduce_perf_mpi"
"subtest_name" = "all_reduce_perf"
"ngpus" = "1"
"minbytes" = "128"
"maxbytes" = "16G"
Expand Down
2 changes: 1 addition & 1 deletion conf/common/test/nccl_test_alltoall.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ test_template_name = "NcclTest"
extra_cmd_args = "--stepfactor 2"

[cmd_args]
"subtest_name" = "alltoall_perf_mpi"
"subtest_name" = "alltoall_perf"
"ngpus" = "1"
"minbytes" = "128"
"maxbytes" = "4G"
Expand Down
2 changes: 1 addition & 1 deletion conf/common/test/nccl_test_broadcast.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ description = "broadcast"
test_template_name = "NcclTest"

[cmd_args]
"subtest_name" = "broadcast_perf_mpi"
"subtest_name" = "broadcast_perf"
2 changes: 1 addition & 1 deletion conf/common/test/nccl_test_gather.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ description = "gather"
test_template_name = "NcclTest"

[cmd_args]
"subtest_name" = "gather_perf_mpi"
"subtest_name" = "gather_perf"
2 changes: 1 addition & 1 deletion conf/common/test/nccl_test_hypercube.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ description = "hypercube"
test_template_name = "NcclTest"

[cmd_args]
"subtest_name" = "hypercube_perf_mpi"
"subtest_name" = "hypercube_perf"
2 changes: 1 addition & 1 deletion conf/common/test/nccl_test_reduce.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ description = "reduce"
test_template_name = "NcclTest"

[cmd_args]
"subtest_name" = "reduce_perf_mpi"
"subtest_name" = "reduce_perf"
2 changes: 1 addition & 1 deletion conf/common/test/nccl_test_reduce_scatter.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ test_template_name = "NcclTest"
extra_cmd_args = "--stepfactor 2"

[cmd_args]
"subtest_name" = "reduce_scatter_perf_mpi"
"subtest_name" = "reduce_scatter_perf"
"ngpus" = "1"
"minbytes" = "128"
"maxbytes" = "4G"
Expand Down
2 changes: 1 addition & 1 deletion conf/common/test/nccl_test_scatter.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ description = "scatter"
test_template_name = "NcclTest"

[cmd_args]
"subtest_name" = "scatter_perf_mpi"
"subtest_name" = "scatter_perf"
2 changes: 1 addition & 1 deletion conf/common/test/nccl_test_sendrecv.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ description = "sendrecv"
test_template_name = "NcclTest"

[cmd_args]
"subtest_name" = "sendrecv_perf_mpi"
"subtest_name" = "sendrecv_perf"
24 changes: 12 additions & 12 deletions conf/common/test_template/nccl_test.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,23 @@ name = "NcclTest"
[cmd_args]
[cmd_args.docker_image_url]
type = "str"
default = "nvcr.io/nvidia/pytorch:24.02-py3"
default = "ghcr.io/coreweave/nccl-tests:12.4.1-cudnn-devel-ubuntu20.04-nccl2.21.5-1-85f9143"

[cmd_args.subtest_name]
type = "preset"
values = [
"all_reduce_perf_mpi",
"all_gather_perf_mpi",
"alltoall_perf_mpi",
"broadcast_perf_mpi",
"gather_perf_mpi",
"hypercube_perf_mpi",
"reduce_perf_mpi",
"reduce_scatter_perf_mpi",
"scatter_perf_mpi",
"sendrecv_perf_mpi",
"all_reduce_perf",
"all_gather_perf",
"alltoall_perf",
"broadcast_perf",
"gather_perf",
"hypercube_perf",
"reduce_perf",
"reduce_scatter_perf",
"scatter_perf",
"sendrecv_perf",
]
default = "all_reduce_perf_mpi"
default = "all_reduce_perf"

[cmd_args.nthreads]
type = "int"
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ bokeh==3.4.1
pandas==2.2.1
tbparse==0.0.8
toml==0.10.2
kubernetes==30.1.0
78 changes: 56 additions & 22 deletions src/cloudai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
from ._core.base_job import BaseJob
from ._core.base_runner import BaseRunner
from ._core.base_system_parser import BaseSystemParser
from ._core.command_gen_strategy import CommandGenStrategy
from ._core.exceptions import JobIdRetrievalError
from ._core.grader import Grader
from ._core.grading_strategy import GradingStrategy
from ._core.install_strategy import InstallStrategy
from ._core.job_context import JobContext
from ._core.job_id_retrieval_strategy import JobIdRetrievalStrategy
from ._core.job_spec_gen_strategy import JobSpecGenStrategy
from ._core.job_specification import JobSpecification
from ._core.job_status_result import JobStatusResult
from ._core.job_status_retrieval_strategy import JobStatusRetrievalStrategy
from ._core.parser import Parser
Expand All @@ -36,84 +38,109 @@
from ._core.test_template import TestTemplate
from ._core.test_template_strategy import TestTemplateStrategy
from .installer.installer import Installer
from .installer.kubernetes_installer import KubernetesInstaller
from .installer.slurm_installer import SlurmInstaller
from .installer.standalone_installer import StandaloneInstaller
from .parser.system_parser.kubernetes_system_parser import KubernetesSystemParser
from .parser.system_parser.slurm_system_parser import SlurmSystemParser
from .parser.system_parser.standalone_system_parser import StandaloneSystemParser
from .report_generator import ReportGenerator
from .runner.kubernetes.kubernetes_runner import KubernetesRunner
from .runner.slurm.slurm_runner import SlurmRunner
from .runner.standalone.standalone_runner import StandaloneRunner
from .schema.test_template.chakra_replay.grading_strategy import ChakraReplayGradingStrategy
from .schema.test_template.chakra_replay.report_generation_strategy import ChakraReplayReportGenerationStrategy
from .schema.test_template.chakra_replay.slurm_command_gen_strategy import ChakraReplaySlurmCommandGenStrategy
from .schema.test_template.chakra_replay.slurm_install_strategy import ChakraReplaySlurmInstallStrategy
from .schema.test_template.chakra_replay.slurm_job_spec_gen_strategy import ChakraReplaySlurmJobSpecGenStrategy
from .schema.test_template.chakra_replay.template import ChakraReplay
from .schema.test_template.common.default_job_status_retrieval_strategy import DefaultJobStatusRetrievalStrategy
from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy
from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy
from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy
from .schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy
from .schema.test_template.jax_toolbox.report_generation_strategy import JaxToolboxReportGenerationStrategy
from .schema.test_template.jax_toolbox.slurm_command_gen_strategy import JaxToolboxSlurmCommandGenStrategy
from .schema.test_template.jax_toolbox.slurm_install_strategy import JaxToolboxSlurmInstallStrategy
from .schema.test_template.jax_toolbox.slurm_job_spec_gen_strategy import JaxToolboxSlurmJobSpecGenStrategy
from .schema.test_template.jax_toolbox.template import JaxToolbox
from .schema.test_template.nccl_test.grading_strategy import NcclTestGradingStrategy
from .schema.test_template.nccl_test.job_status_retrieval_strategy import NcclTestJobStatusRetrievalStrategy
from .schema.test_template.nccl_test.report_generation_strategy import NcclTestReportGenerationStrategy
from .schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy
from .schema.test_template.nccl_test.kubernetes_grading_strategy import KubernetesNcclTestGradingStrategy
from .schema.test_template.nccl_test.kubernetes_job_spec_gen_strategy import NcclTestKubernetesJobSpecGenStrategy
from .schema.test_template.nccl_test.kubernetes_job_status_retrieval_strategy import (
KubernetesNcclTestJobStatusRetrievalStrategy,
)
from .schema.test_template.nccl_test.kubernetes_report_generation_strategy import (
KubernetesNcclTestReportGenerationStrategy,
)
from .schema.test_template.nccl_test.slurm_grading_strategy import SlurmNcclTestGradingStrategy
from .schema.test_template.nccl_test.slurm_install_strategy import NcclTestSlurmInstallStrategy
from .schema.test_template.nccl_test.slurm_job_spec_gen_strategy import NcclTestSlurmJobSpecGenStrategy
from .schema.test_template.nccl_test.slurm_job_status_retrieval_strategy import (
SlurmNcclTestJobStatusRetrievalStrategy,
)
from .schema.test_template.nccl_test.slurm_report_generation_strategy import SlurmNcclTestReportGenerationStrategy
from .schema.test_template.nccl_test.template import NcclTest
from .schema.test_template.nemo_launcher.grading_strategy import NeMoLauncherGradingStrategy
from .schema.test_template.nemo_launcher.report_generation_strategy import NeMoLauncherReportGenerationStrategy
from .schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy
from .schema.test_template.nemo_launcher.slurm_install_strategy import NeMoLauncherSlurmInstallStrategy
from .schema.test_template.nemo_launcher.slurm_job_id_retrieval_strategy import (
NeMoLauncherSlurmJobIdRetrievalStrategy,
)
from .schema.test_template.nemo_launcher.slurm_job_spec_gen_strategy import NeMoLauncherSlurmJobSpecGenStrategy
from .schema.test_template.nemo_launcher.template import NeMoLauncher
from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy
from .schema.test_template.sleep.kubernetes_install_strategy import SleepKubernetesInstallStrategy
from .schema.test_template.sleep.kubernetes_job_spec_gen_strategy import SleepKubernetesJobSpecGenStrategy
from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy
from .schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
from .schema.test_template.sleep.standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy
from .schema.test_template.sleep.slurm_job_spec_gen_strategy import SleepSlurmJobSpecGenStrategy
from .schema.test_template.sleep.standalone_install_strategy import SleepStandaloneInstallStrategy
from .schema.test_template.sleep.standalone_job_spec_gen_strategy import SleepStandaloneJobSpecGenStrategy
from .schema.test_template.sleep.template import Sleep
from .schema.test_template.ucc_test.grading_strategy import UCCTestGradingStrategy
from .schema.test_template.ucc_test.report_generation_strategy import UCCTestReportGenerationStrategy
from .schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
from .schema.test_template.ucc_test.slurm_install_strategy import UCCTestSlurmInstallStrategy
from .schema.test_template.ucc_test.slurm_job_spec_gen_strategy import UCCTestSlurmJobSpecGenStrategy
from .schema.test_template.ucc_test.template import UCCTest
from .systems.kubernetes.kubernetes_system import KubernetesSystem
from .systems.slurm.slurm_system import SlurmSystem
from .systems.standalone_system import StandaloneSystem

# Register one system parser class per system-type key.
# NOTE(review): the key presumably matches the `scheduler` value in a system
# TOML file — confirm against the parser lookup.
Registry().add_system_parser("standalone", StandaloneSystemParser)
Registry().add_system_parser("slurm", SlurmSystemParser)
Registry().add_system_parser("kubernetes", KubernetesSystemParser)

# Register one runner class under each of the same three keys.
Registry().add_runner("slurm", SlurmRunner)
Registry().add_runner("kubernetes", KubernetesRunner)
Registry().add_runner("standalone", StandaloneRunner)

Registry().add_strategy(InstallStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmInstallStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmInstallStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NcclTest], NcclTestReportGenerationStrategy)
Registry().add_strategy(CommandGenStrategy, [StandaloneSystem], [Sleep], SleepStandaloneCommandGenStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [Sleep], SleepSlurmCommandGenStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NcclTest], SlurmNcclTestReportGenerationStrategy)
Registry().add_strategy(
ReportGenerationStrategy, [KubernetesSystem], [NcclTest], KubernetesNcclTestReportGenerationStrategy
)
Registry().add_strategy(JobSpecGenStrategy, [StandaloneSystem], [Sleep], SleepStandaloneJobSpecGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [Sleep], SleepSlurmJobSpecGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [KubernetesSystem], [Sleep], SleepKubernetesJobSpecGenStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmInstallStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NcclTest], NcclTestGradingStrategy)
Registry().add_strategy(JobSpecGenStrategy, [KubernetesSystem], [NcclTest], NcclTestKubernetesJobSpecGenStrategy)
Registry().add_strategy(GradingStrategy, [KubernetesSystem], [NcclTest], KubernetesNcclTestGradingStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NcclTest], SlurmNcclTestGradingStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmInstallStrategy)
Registry().add_strategy(InstallStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepStandaloneInstallStrategy)
Registry().add_strategy(InstallStrategy, [KubernetesSystem], [Sleep], SleepKubernetesInstallStrategy)
Registry().add_strategy(
ReportGenerationStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepReportGenerationStrategy
)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherReportGenerationStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmCommandGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmJobSpecGenStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [Sleep], SleepGradingStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxReportGenerationStrategy)
Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobIdRetrievalStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobSpecGenStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxGradingStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [UCCTest], UCCTestGradingStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmCommandGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmJobSpecGenStrategy)
Registry().add_strategy(
JobIdRetrievalStrategy,
[SlurmSystem],
Expand All @@ -122,19 +149,23 @@
)
Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [StandaloneSystem], [Sleep], DefaultJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [NcclTest], NcclTestJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [KubernetesSystem], [Sleep], DefaultJobStatusRetrievalStrategy)
Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [NcclTest], SlurmNcclTestJobStatusRetrievalStrategy)
Registry().add_strategy(
JobStatusRetrievalStrategy, [KubernetesSystem], [NcclTest], KubernetesNcclTestJobStatusRetrievalStrategy
)
Registry().add_strategy(JobStatusRetrievalStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxJobStatusRetrievalStrategy)
Registry().add_strategy(
JobStatusRetrievalStrategy,
[SlurmSystem],
[ChakraReplay, UCCTest, NeMoLauncher, Sleep],
DefaultJobStatusRetrievalStrategy,
)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmJobSpecGenStrategy)
Registry().add_strategy(InstallStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmInstallStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayReportGenerationStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayGradingStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmCommandGenStrategy)
Registry().add_strategy(JobSpecGenStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmJobSpecGenStrategy)

Registry().add_test_template("ChakraReplay", ChakraReplay)
Registry().add_test_template("JaxToolbox", JaxToolbox)
Expand All @@ -145,13 +176,16 @@

# Register one installer class per system-type key; the keys are the same
# three strings used for the parser and runner registrations in this module.
Registry().add_installer("slurm", SlurmInstaller)
Registry().add_installer("standalone", StandaloneInstaller)
Registry().add_installer("kubernetes", KubernetesInstaller)

__all__ = [
"BaseInstaller",
"BaseJob",
"BaseRunner",
"BaseSystemParser",
"CommandGenStrategy",
"JobContext",
"JobSpecification",
"JobSpecGenStrategy",
"Grader",
"GradingStrategy",
"Installer",
Expand Down
4 changes: 2 additions & 2 deletions src/cloudai/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def handle_generate_report(test_scenario: TestScenario, output_dir: Path) -> Non
output_dir (Path): The path to the output directory.
"""
logging.info("Generating report based on system and test scenario")
generator = ReportGenerator(str(output_dir))
generator = ReportGenerator(output_dir)
generator.generate_report(test_scenario)

logging.info("Report generation completed.")
Expand All @@ -274,7 +274,7 @@ def main() -> None:
system, tests, test_scenario = parser.parse(tests_dir, test_scenario_path)

if output_dir:
system.output_path = str(output_dir.absolute())
system.output_path = Path(output_dir.absolute())
system.update()

if args.mode in ["install", "uninstall"]:
Expand Down
Loading
Loading