Skip to content

Commit

Permalink
Use Pydantic to verify System schemas (#158)
Browse files Browse the repository at this point in the history
* Move Parser to a higher level

* Dirty implementation of Pydantic models for Systems

* Remove System Parsers

* Simplify code a bit

* Enable 'groups' parsing

* Handle group name the same way as partition name

* Make ruff happy

* Add missing module

* Fixes

* Add pydantic to requirements

* Fix tests

* Fix tests

* Update test, still fails

* Make it work

* Add missing file

* Test all systems

* Add mode for verifying system TOMLs

* Fixes

* Make ruff happy

* Extend testing

* Fixes

* Update README

* Address review comments
  • Loading branch information
amaslenn authored Sep 16, 2024
1 parent d5e4497 commit 11c5592
Show file tree
Hide file tree
Showing 31 changed files with 483 additions and 938 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,16 @@ cloudai\
--tests-dir conf/common/test
```

Verify if system configs are valid:
```bash
cloudai\
--mode verify-systems\
--tests-dir conf/common/test\
--test-templates-dir conf/common/test_template\
--system-config conf/common/system
```
`--system-config` can be either a single file or a directory; when a directory is given, every `*.toml` config in it is verified.

## Contributing
Feel free to contribute to the CloudAI project. Your contributions are highly appreciated.

Expand Down
48 changes: 23 additions & 25 deletions conf/common/system/example_slurm_cluster.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,31 +25,29 @@ mpi = "pmix"
gpus_per_node = 8
ntasks_per_node = 8

[partitions]
[partitions.partition_1]
name = "partition_1"
nodes = ["node-[001-100]"]

[partitions.partition_2]
name = "partition_2"
nodes = ["node-[101-200]"]

[partitions.partition_1.groups]
[partitions.partition_1.groups.group_1]
name = "group_1"
nodes = ["node-[001-025]"]

[partitions.partition_1.groups.group_2]
name = "group_2"
nodes = ["node-[026-050]"]

[partitions.partition_1.groups.group_3]
name = "group_3"
nodes = ["node-[051-075]"]

[partitions.partition_1.groups.group_4]
name = "group_4"
nodes = ["node-[076-100]"]
[[partitions]]
name = "partition_1"
nodes = ["node-[001-100]"]

[[partitions.groups]]
name = "group_1"
nodes = ["node-[001-025]"]

[[partitions.groups]]
name = "group_2"
nodes = ["node-[026-050]"]

[[partitions.groups]]
name = "group_3"
nodes = ["node-[051-075]"]

[[partitions.groups]]
name = "group_4"
nodes = ["node-[076-100]"]

[[partitions]]
name = "partition_2"
nodes = ["node-[101-200]"]

[global_env_vars]
# NCCL Specific Configurations
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"tbparse==0.0.8",
"toml==0.10.2",
"kubernetes==30.1.0",
"pydantic==2.8.2",
]

[build-system]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ pandas==2.2.1
tbparse==0.0.8
toml==0.10.2
kubernetes==30.1.0
pydantic==2.8.2
19 changes: 11 additions & 8 deletions src/cloudai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,22 @@
from ._core.job_status_result import JobStatusResult
from ._core.job_status_retrieval_strategy import JobStatusRetrievalStrategy
from ._core.json_gen_strategy import JsonGenStrategy
from ._core.parser import Parser
from ._core.registry import Registry
from ._core.report_generation_strategy import ReportGenerationStrategy
from ._core.runner import Runner
from ._core.system import System
from ._core.test import Test
from ._core.test_parser import TestParser
from ._core.test_scenario import TestRun, TestScenario
from ._core.test_scenario_parser import TestScenarioParser
from ._core.test_template import TestTemplate
from ._core.test_template_parser import TestTemplateParser
from ._core.test_template_strategy import TestTemplateStrategy
from .installer.installer import Installer
from .installer.kubernetes_installer import KubernetesInstaller
from .installer.slurm_installer import SlurmInstaller
from .installer.standalone_installer import StandaloneInstaller
from .parser.system_parser.kubernetes_system_parser import KubernetesSystemParser
from .parser.system_parser.slurm_system_parser import SlurmSystemParser
from .parser.system_parser.standalone_system_parser import StandaloneSystemParser
from .parser import Parser
from .report_generator import ReportGenerator
from .runner.kubernetes.kubernetes_runner import KubernetesRunner
from .runner.slurm.slurm_runner import SlurmRunner
Expand Down Expand Up @@ -93,10 +93,6 @@
from .systems.slurm.slurm_system import SlurmSystem
from .systems.standalone_system import StandaloneSystem

Registry().add_system_parser("standalone", StandaloneSystemParser)
Registry().add_system_parser("slurm", SlurmSystemParser)
Registry().add_system_parser("kubernetes", KubernetesSystemParser)

Registry().add_runner("slurm", SlurmRunner)
Registry().add_runner("kubernetes", KubernetesRunner)
Registry().add_runner("standalone", StandaloneRunner)
Expand Down Expand Up @@ -165,6 +161,10 @@
Registry().add_installer("standalone", StandaloneInstaller)
Registry().add_installer("kubernetes", KubernetesInstaller)

Registry().add_system("slurm", SlurmSystem)
Registry().add_system("standalone", StandaloneSystem)
Registry().add_system("kubernetes", KubernetesSystem)

__all__ = [
"BaseInstaller",
"BaseJob",
Expand All @@ -189,4 +189,7 @@
"TestScenario",
"TestTemplate",
"TestTemplateStrategy",
"TestParser",
"TestScenarioParser",
"TestTemplateParser",
]
29 changes: 29 additions & 0 deletions src/cloudai/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def parse_arguments() -> argparse.Namespace:
"run",
"generate-report",
"uninstall",
"verify-systems",
],
help=(
"Operating mode: 'install' to install test templates, 'dry-run' "
Expand Down Expand Up @@ -253,11 +254,39 @@ def handle_generate_report(test_scenario: TestScenario, output_dir: Path) -> Non
logging.info("Report generation completed.")


def handle_verify_systems(root: Path) -> int:
    """
    Verify that one or more system TOML configs can be parsed.

    Args:
        root (Path): Path to a single system TOML file, or to a directory whose
            top-level ``*.toml`` files are all verified.

    Returns:
        int: 0 if every config was parsed successfully; 1 if the path does not
            exist, the directory contains no TOML files, or at least one config
            failed to parse.
    """
    if not root.exists():
        logging.error(f"System config path {root} does not exist.")
        return 1

    system_tomls = [root]
    if root.is_dir():
        # Path.glob() yields entries in arbitrary order; sort for reproducible output.
        system_tomls = sorted(root.glob("*.toml"))
        if not system_tomls:
            logging.error(f"No system TOMLs found in {root}")
            return 1

    rc = 0
    for system_toml in system_tomls:
        logging.info(f"Verifying {system_toml}...")
        try:
            Parser.parse_system(system_toml)
        except Exception as e:
            # Top-level boundary: report the failure and keep going so that every
            # broken config is reported in a single run, not just the first one.
            logging.error(f"Failed to verify {system_toml}: {e}")
            rc = 1

    return rc


def main() -> None:
args = parse_arguments()

setup_logging(args.log_file, args.log_level)

if args.mode == "verify-systems":
rc = handle_verify_systems(Path(args.system_config))
exit(rc)

system_config_path = Path(args.system_config)
test_templates_dir = Path(args.test_templates_dir)
tests_dir = Path(args.tests_dir)
Expand Down
63 changes: 31 additions & 32 deletions src/cloudai/_core/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

from .base_installer import BaseInstaller
from .base_runner import BaseRunner
from .base_system_parser import BaseSystemParser
from .grading_strategy import GradingStrategy
from .job_id_retrieval_strategy import JobIdRetrievalStrategy
from .job_status_retrieval_strategy import JobStatusRetrievalStrategy
Expand All @@ -42,7 +41,6 @@ def __new__(cls, name, bases, dct):
class Registry(metaclass=Singleton):
"""Registry for implementations mappings."""

system_parsers_map: Dict[str, Type[BaseSystemParser]] = {}
runners_map: Dict[str, Type[BaseRunner]] = {}
strategies_map: Dict[
Tuple[
Expand Down Expand Up @@ -70,36 +68,7 @@ class Registry(metaclass=Singleton):
] = {}
test_templates_map: Dict[str, Type[TestTemplate]] = {}
installers_map: Dict[str, Type[BaseInstaller]] = {}

def add_system_parser(self, name: str, value: Type[BaseSystemParser]) -> None:
    """
    Register a system parser implementation under a name that is not yet taken.

    Args:
        name (str): The name of the system parser.
        value (Type[BaseSystemParser]): The system parser implementation.

    Raises:
        ValueError: If an implementation is already registered under this name.
    """
    is_new_name = name not in self.system_parsers_map
    if is_new_name:
        # Validation and the actual insertion are delegated to the update method.
        self.update_system_parser(name, value)
        return
    raise ValueError(f"Duplicating implementation for '{name}', use 'update()' for replacement.")

def update_system_parser(self, name: str, value: Type[BaseSystemParser]) -> None:
    """
    Create or replace system parser implementation mapping.

    Args:
        name (str): The name of the system parser.
        value (Type[BaseSystemParser]): The system parser implementation.

    Raises:
        ValueError: If value is not a subclass of BaseSystemParser.
    """
    if not issubclass(value, BaseSystemParser):
        # The message names the base class that is actually checked above.
        raise ValueError(
            f"Invalid system parser implementation for '{name}', should be subclass of 'BaseSystemParser'."
        )
    self.system_parsers_map[name] = value
systems_map: Dict[str, Type[System]] = {}

def add_runner(self, name: str, value: Type[BaseRunner]) -> None:
"""
Expand Down Expand Up @@ -274,3 +243,33 @@ def update_installer(self, name: str, value: Type[BaseInstaller]) -> None:
if not issubclass(value, BaseInstaller):
raise ValueError(f"Invalid installer implementation for '{name}', should be subclass of 'BaseInstaller'.")
self.installers_map[name] = value

def add_system(self, name: str, value: Type[System]) -> None:
    """
    Register a system implementation under a name that is not yet taken.

    Args:
        name (str): The name of the system.
        value (Type[System]): The system implementation.

    Raises:
        ValueError: If an implementation is already registered under this name.
    """
    is_new_name = name not in self.systems_map
    if is_new_name:
        # Validation and the actual insertion are delegated to the update method.
        self.update_system(name, value)
        return
    raise ValueError(f"Duplicating implementation for '{name}', use 'update()' for replacement.")

def update_system(self, name: str, value: Type[System]) -> None:
    """
    Register a system implementation, overwriting any existing entry for the name.

    Args:
        name (str): The name of the system.
        value (Type[System]): The system implementation.

    Raises:
        ValueError: If value is not a subclass of System.
    """
    if issubclass(value, System):
        self.systems_map[name] = value
        return
    raise ValueError(f"Invalid system implementation for '{name}', should be subclass of 'System'.")
74 changes: 0 additions & 74 deletions src/cloudai/_core/system_parser.py

This file was deleted.

Loading

0 comments on commit 11c5592

Please sign in to comment.