From ab76fe9d32fa7631492385ecb046a4120abd80d1 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 20 Aug 2024 09:16:45 -0400 Subject: [PATCH 1/4] Refactor to use pathlib.Path for path-related variables --- src/cloudai/__main__.py | 4 +- src/cloudai/_core/base_job.py | 8 +- src/cloudai/_core/base_runner.py | 39 ++--- src/cloudai/_core/command_gen_strategy.py | 5 +- src/cloudai/_core/grader.py | 28 ++-- src/cloudai/_core/grading_strategy.py | 5 +- .../_core/job_status_retrieval_strategy.py | 5 +- .../_core/report_generation_strategy.py | 9 +- src/cloudai/_core/system.py | 13 +- src/cloudai/_core/test.py | 9 +- src/cloudai/_core/test_template.py | 21 +-- .../system_parser/slurm_system_parser.py | 5 +- .../system_parser/standalone_system_parser.py | 3 +- .../report_generator/report_generator.py | 23 ++- .../tool/bokeh_report_tool.py | 12 +- .../report_generator/tool/csv_report_tool.py | 12 +- .../tool/report_tool_interface.py | 5 +- .../tool/tensorboard_data_reader.py | 11 +- .../chakra_replay/grading_strategy.py | 7 +- .../report_generation_strategy.py | 36 ++--- .../slurm_command_gen_strategy.py | 3 +- .../default_job_status_retrieval_strategy.py | 4 +- .../jax_toolbox/grading_strategy.py | 3 +- .../job_status_retrieval_strategy.py | 33 ++-- .../jax_toolbox/report_generation_strategy.py | 27 ++-- .../jax_toolbox/slurm_command_gen_strategy.py | 9 +- .../nccl_test/grading_strategy.py | 22 ++- .../job_status_retrieval_strategy.py | 12 +- .../nccl_test/report_generation_strategy.py | 34 ++-- .../nccl_test/slurm_command_gen_strategy.py | 3 +- .../nemo_launcher/grading_strategy.py | 6 +- .../report_generation_strategy.py | 7 +- .../slurm_command_gen_strategy.py | 28 ++-- .../test_template/sleep/grading_strategy.py | 6 +- .../sleep/report_generation_strategy.py | 5 +- .../sleep/slurm_command_gen_strategy.py | 3 +- .../sleep/standalone_command_gen_strategy.py | 3 +- .../ucc_test/grading_strategy.py | 18 +-- 
.../ucc_test/report_generation_strategy.py | 25 ++- .../ucc_test/slurm_command_gen_strategy.py | 3 +- src/cloudai/systems/slurm/slurm_system.py | 153 +++++++++--------- .../strategy/slurm_command_gen_strategy.py | 20 +-- src/cloudai/systems/standalone_system.py | 6 +- .../util/docker_image_cache_manager.py | 82 +++++----- tests/test_acceptance.py | 12 +- tests/test_base_installer.py | 5 +- tests/test_base_runner.py | 8 +- tests/test_csv_report_tool.py | 34 ++-- tests/test_docker_image_cache_manager.py | 149 +++++++---------- tests/test_job_status_retrieval_strategy.py | 50 +++--- tests/test_report_generation_strategy.py | 6 +- tests/test_slurm_command_gen_strategy.py | 36 ++--- tests/test_slurm_install_strategy.py | 4 +- .../test_slurm_report_generation_strategy.py | 3 +- tests/test_slurm_system.py | 5 +- tests/test_slurm_system_parser.py | 5 +- 56 files changed, 542 insertions(+), 550 deletions(-) diff --git a/src/cloudai/__main__.py b/src/cloudai/__main__.py index 86b3718b..c5146f4b 100644 --- a/src/cloudai/__main__.py +++ b/src/cloudai/__main__.py @@ -247,7 +247,7 @@ def handle_generate_report(test_scenario: TestScenario, output_dir: Path) -> Non output_dir (Path): The path to the output directory. 
""" logging.info("Generating report based on system and test scenario") - generator = ReportGenerator(str(output_dir)) + generator = ReportGenerator(output_dir) generator.generate_report(test_scenario) logging.info("Report generation completed.") @@ -274,7 +274,7 @@ def main() -> None: system, tests, test_scenario = parser.parse(tests_dir, test_scenario_path) if output_dir: - system.output_path = str(output_dir.absolute()) + system.output_path = Path(output_dir.absolute()) system.update() if args.mode in ["install", "uninstall"]: diff --git a/src/cloudai/_core/base_job.py b/src/cloudai/_core/base_job.py index c97fb228..53f4a1ab 100644 --- a/src/cloudai/_core/base_job.py +++ b/src/cloudai/_core/base_job.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path + from .test import Test @@ -24,17 +26,17 @@ class BaseJob: Attributes id (int): The unique identifier of the job. test (Test): The test instance associated with this job. - output_path (str): The path where the job's output is stored. + output_path (Path): The path where the job's output is stored. terminated_by_dependency (bool): Flag to indicate if the job was terminated due to a dependency. """ - def __init__(self, job_id: int, test: Test, output_path: str): + def __init__(self, job_id: int, test: Test, output_path: Path): """ Initialize a BaseJob instance. Args: job_id (int): The unique identifier of the job. - output_path (str): The path where the job's output is stored. + output_path (Path): The path where the job's output is stored. test (Test): The test instance associated with the job. 
""" self.id = job_id diff --git a/src/cloudai/_core/base_runner.py b/src/cloudai/_core/base_runner.py index 37348326..1f34b614 100644 --- a/src/cloudai/_core/base_runner.py +++ b/src/cloudai/_core/base_runner.py @@ -16,12 +16,12 @@ import asyncio import logging -import os import signal import sys from abc import ABC, abstractmethod from asyncio import Task from datetime import datetime +from pathlib import Path from types import FrameType from typing import Dict, List, Optional @@ -44,7 +44,7 @@ class BaseRunner(ABC): mode (str): The operation mode ('dry-run', 'run'). system (System): The system schema object. test_scenario (TestScenario): The test scenario to run. - output_path (str): Path to the output directory. + output_path (Path): Path to the output directory. monitor_interval (int): Interval in seconds for monitoring jobs. jobs (List[BaseJob]): List to track jobs created by the runner. test_to_job_map (Dict[Test, BaseJob]): Mapping from tests to their jobs. @@ -78,21 +78,21 @@ def __init__( self.shutting_down = False self.register_signal_handlers() - def setup_output_directory(self, base_output_path: str) -> str: + def setup_output_directory(self, base_output_path: Path) -> Path: """ Set up and return the output directory path for the runner instance. Args: - base_output_path (str): The base output directory. + base_output_path (Path): The base output directory. Returns: - str: The path to the output directory. + Path: The path to the output directory. 
""" - if not os.path.exists(base_output_path): - os.makedirs(base_output_path) + if not base_output_path.exists(): + base_output_path.mkdir() current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - output_subpath = os.path.join(base_output_path, f"{self.test_scenario.name}_{current_time}") - os.makedirs(output_subpath) + output_subpath = base_output_path / f"{self.test_scenario.name}_{current_time}" + output_subpath.mkdir() return output_subpath def register_signal_handlers(self): @@ -242,9 +242,9 @@ def find_dependency_free_tests(self) -> List[Test]: return dependency_free_tests - def get_job_output_path(self, test: Test) -> str: + def get_job_output_path(self, test: Test) -> Path: """ - Generate and ensures the existence of the output directory for a given test. + Generate and ensure the existence of the output directory for a given test. It constructs the path based on the test's section name and current iteration, creating the directories if they do not exist. @@ -253,23 +253,24 @@ def get_job_output_path(self, test: Test) -> str: test (Test): The test instance for which to generate the output directory path. Returns: - str: The path to the job's output directory. + Path: The path to the job's output directory. Raises: ValueError: If the test's section name is None. FileNotFoundError: If the base output directory does not exist. PermissionError: If there is a permission issue creating the directories. 
""" - job_output_path = "" + if not self.output_path.exists(): + raise FileNotFoundError(f"Output directory {self.output_path} does not exist") + + job_output_path = None # Initialize the variable - if not os.path.exists(self.output_path): - raise FileNotFoundError(f"Output directory {self.output_path} " f"does not exist") try: assert test.section_name is not None, "test.section_name must not be None" - test_output_path = os.path.join(self.output_path, test.section_name) - os.makedirs(test_output_path, exist_ok=True) - job_output_path = os.path.join(test_output_path, str(test.current_iteration)) - os.makedirs(job_output_path, exist_ok=True) + test_output_path = self.output_path / test.section_name + test_output_path.mkdir() + job_output_path = test_output_path / str(test.current_iteration) + job_output_path.mkdir() except PermissionError as e: raise PermissionError(f"Cannot create directory {job_output_path}: {e}") from e diff --git a/src/cloudai/_core/command_gen_strategy.py b/src/cloudai/_core/command_gen_strategy.py index 3d0164bc..19cc712f 100644 --- a/src/cloudai/_core/command_gen_strategy.py +++ b/src/cloudai/_core/command_gen_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. from abc import abstractmethod +from pathlib import Path from typing import Dict, List from .test_template_strategy import TestTemplateStrategy @@ -34,7 +35,7 @@ def gen_exec_command( cmd_args: Dict[str, str], extra_env_vars: Dict[str, str], extra_cmd_args: str, - output_path: str, + output_path: Path, num_nodes: int, nodes: List[str], ) -> str: @@ -46,7 +47,7 @@ def gen_exec_command( cmd_args (Dict[str, str]): Command-line arguments for the test. extra_env_vars (Dict[str, str]): Additional environment variables. extra_cmd_args (str): Additional command-line arguments. - output_path (str): Path to the output directory. + output_path (Path): Path to the output directory. num_nodes (int): The number of nodes to be used for the test execution. 
nodes (List[str]): List of nodes for test execution, optional. diff --git a/src/cloudai/_core/grader.py b/src/cloudai/_core/grader.py index f14d557a..b33559d5 100644 --- a/src/cloudai/_core/grader.py +++ b/src/cloudai/_core/grader.py @@ -16,7 +16,7 @@ import csv import logging -import os +from pathlib import Path from typing import Dict, List from .test import Test @@ -28,11 +28,11 @@ class Grader: Class responsible for grading the performance of tests within a test scenario and generating a report. Attributes - output_path (str): The path where the performance results are stored. + output_path (Path): The path where the performance results are stored. logger (logging.Logger): Logger for the class, used to log messages related to the grading process. """ - def __init__(self, output_path: str) -> None: + def __init__(self, output_path: Path) -> None: self.output_path = output_path def grade(self, test_scenario: TestScenario) -> str: @@ -57,7 +57,7 @@ def grade(self, test_scenario: TestScenario) -> str: if not section_name: logging.warning(f"Missing section name for test {test.name}") continue - test_output_dir = os.path.join(self.output_path, section_name) + test_output_dir = self.output_path / section_name perfs = self._get_perfs_from_subdirs(test_output_dir, test) avg_perf = sum(perfs) / len(perfs) if perfs else 0 test_perfs[test.name] = perfs + [avg_perf] @@ -69,24 +69,22 @@ def grade(self, test_scenario: TestScenario) -> str: self._save_report(report) return report - def _get_perfs_from_subdirs(self, directory_path: str, test: Test) -> List[float]: + def _get_perfs_from_subdirs(self, directory_path: Path, test: Test) -> List[float]: """ Average performance values from subdirectories within a given path, according to the test's grading template. Args: - directory_path (str): Directory path. + directory_path (Path): Directory path. test (Test): The test to grade. Returns: List[float]: A list of performance values. 
""" perfs = [] - for subdir in os.listdir(directory_path): - if subdir.isdigit(): - subdir_path = os.path.join(directory_path, subdir) - if os.path.isdir(subdir_path): - perf = test.test_template.grade(subdir_path, test.ideal_perf) - perfs.append(perf) + for subdir in directory_path.iterdir(): + if subdir.is_dir() and subdir.name.isdigit(): + perf = test.test_template.grade(subdir, test.ideal_perf) + perfs.append(perf) return perfs def _generate_report(self, test_perfs: Dict[str, List[float]], overall_avg: float) -> str: @@ -102,7 +100,7 @@ def _generate_report(self, test_perfs: Dict[str, List[float]], overall_avg: floa """ report_lines = ["Test Performance Report:"] for test, perfs in test_perfs.items(): - report_lines.append(f"{test}: Min: {min(perfs[:-1])}, " f"Max: {max(perfs[:-1])}, " f"Avg: {perfs[-1]}") + report_lines.append(f"{test}: Min: {min(perfs[:-1])}, Max: {max(perfs[:-1])}, Avg: {perfs[-1]}") report_lines.append(f"Overall Average Performance: {overall_avg}") return "\n".join(report_lines) @@ -113,8 +111,8 @@ def _save_report(self, report: str) -> None: Args: report (str): The report to save. """ - report_path = os.path.join(self.output_path, "performance_report.csv") - with open(report_path, "w", newline="") as file: + report_path = self.output_path / "performance_report.csv" + with report_path.open("w", newline="") as file: writer = csv.writer(file) for line in report.split("\n"): writer.writerow([line]) diff --git a/src/cloudai/_core/grading_strategy.py b/src/cloudai/_core/grading_strategy.py index 3a3d888c..c4590026 100644 --- a/src/cloudai/_core/grading_strategy.py +++ b/src/cloudai/_core/grading_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. 
from abc import abstractmethod +from pathlib import Path from .test_template_strategy import TestTemplateStrategy @@ -23,12 +24,12 @@ class GradingStrategy(TestTemplateStrategy): """Abstract class for grading test performance.""" @abstractmethod - def grade(self, directory_path: str, ideal_perf: float) -> float: + def grade(self, directory_path: Path, ideal_perf: float) -> float: """ Grades the performance of a test. Args: - directory_path (str): Path to the directory containing the test's output. + directory_path (Path): Path to the directory containing the test's output. ideal_perf (float): The ideal performance value for comparison. Returns: diff --git a/src/cloudai/_core/job_status_retrieval_strategy.py b/src/cloudai/_core/job_status_retrieval_strategy.py index 63551b18..5c3eb682 100644 --- a/src/cloudai/_core/job_status_retrieval_strategy.py +++ b/src/cloudai/_core/job_status_retrieval_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. from abc import abstractmethod +from pathlib import Path from .job_status_result import JobStatusResult @@ -23,12 +24,12 @@ class JobStatusRetrievalStrategy: """Abstract class to define a strategy for retrieving job statuses from a given output directory.""" @abstractmethod - def get_job_status(self, output_path: str) -> JobStatusResult: + def get_job_status(self, output_path: Path) -> JobStatusResult: """ Retrieve the job status from a specified output directory. Args: - output_path (str): Path to the output directory. + output_path (Path): Path to the output directory. Returns: JobStatusResult: The result containing the job status and an optional error message. diff --git a/src/cloudai/_core/report_generation_strategy.py b/src/cloudai/_core/report_generation_strategy.py index c6287401..16d8c404 100644 --- a/src/cloudai/_core/report_generation_strategy.py +++ b/src/cloudai/_core/report_generation_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. 
from abc import abstractmethod +from pathlib import Path from typing import Optional @@ -27,12 +28,12 @@ class ReportGenerationStrategy: """ @abstractmethod - def can_handle_directory(self, directory_path: str) -> bool: + def can_handle_directory(self, directory_path: Path) -> bool: """ Determine if the strategy can handle the directory. Args: - directory_path (str): Path to the directory. + directory_path (Path): Path to the directory. Returns: bool: True if can handle, False otherwise. @@ -40,13 +41,13 @@ def can_handle_directory(self, directory_path: str) -> bool: pass @abstractmethod - def generate_report(self, test_name: str, directory_path: str, sol: Optional[float] = None) -> None: + def generate_report(self, test_name: str, directory_path: Path, sol: Optional[float] = None) -> None: """ Generate a report from the directory. Args: test_name (str): The name of the test. - directory_path (str): Path to the directory. + directory_path (Path): Path to the directory. sol (Optional[float]): Speed-of-light performance for reference. """ pass diff --git a/src/cloudai/_core/system.py b/src/cloudai/_core/system.py index 3a6cb024..73484021 100644 --- a/src/cloudai/_core/system.py +++ b/src/cloudai/_core/system.py @@ -16,6 +16,7 @@ from abc import ABC, abstractmethod +from pathlib import Path class System(ABC): @@ -25,24 +26,18 @@ class System(ABC): Attributes name (str): Unique name of the system. scheduler (str): Type of scheduler used by the system, determining the specific subclass of System to be used. - output_path (str): Path to the output directory. + output_path (Path): Path to the output directory. monitor_interval (int): Interval in seconds for monitoring jobs. """ - def __init__( - self, - name: str, - scheduler: str, - output_path: str, - monitor_interval: int = 1, - ) -> None: + def __init__(self, name: str, scheduler: str, output_path: Path, monitor_interval: int = 1) -> None: """ Initialize a System instance. Args: name (str): Name of the system. 
scheduler (str): Type of scheduler used by the system. - output_path (str): Path to the output directory. + output_path (Path): Path to the output directory. monitor_interval (int): Interval in seconds for monitoring jobs. """ self.name = name diff --git a/src/cloudai/_core/test.py b/src/cloudai/_core/test.py index 3e744a85..51925fa2 100644 --- a/src/cloudai/_core/test.py +++ b/src/cloudai/_core/test.py @@ -15,6 +15,7 @@ # limitations under the License. import sys +from pathlib import Path from typing import Dict, List, Optional, Union from .job_status_result import JobStatusResult @@ -126,12 +127,12 @@ def __repr__(self) -> str: f"nodes={self.nodes})" ) - def gen_exec_command(self, output_path: str) -> str: + def gen_exec_command(self, output_path: Path) -> str: """ Generate the command to run this specific test. Args: - output_path (str): Path to the output directory. + output_path (Path): Path to the output directory. Returns: str: The command string. @@ -162,12 +163,12 @@ def get_job_id(self, stdout: str, stderr: str) -> Optional[int]: """ return self.test_template.get_job_id(stdout, stderr) - def get_job_status(self, output_path: str) -> JobStatusResult: + def get_job_status(self, output_path: Path) -> JobStatusResult: """ Determine the status of a job based on the outputs located in the given output directory. Args: - output_path (str): Path to the output directory. + output_path (Path): Path to the output directory. Returns: JobStatusResult: The result containing the job status and an optional error message. diff --git a/src/cloudai/_core/test_template.py b/src/cloudai/_core/test_template.py index ce1574d2..06d3f1ac 100644 --- a/src/cloudai/_core/test_template.py +++ b/src/cloudai/_core/test_template.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from pathlib import Path from typing import Any, Dict, List, Optional from .command_gen_strategy import CommandGenStrategy @@ -127,7 +128,7 @@ def gen_exec_command( cmd_args: Dict[str, str], extra_env_vars: Dict[str, str], extra_cmd_args: str, - output_path: str, + output_path: Path, num_nodes: int, nodes: List[str], ) -> str: @@ -141,7 +142,7 @@ def gen_exec_command( cmd_args (Dict[str, str]): Command-line arguments for the test. extra_env_vars (Dict[str, str]): Extra environment variables. extra_cmd_args (str): Extra command-line arguments. - output_path (str): Path to the output directory. + output_path (Path): Path to the output directory. num_nodes (int): The number of nodes to be used for the test execution. nodes (List[str]): A list of nodes where the test will be executed. @@ -183,12 +184,12 @@ def get_job_id(self, stdout: str, stderr: str) -> Optional[int]: ) return self.job_id_retrieval_strategy.get_job_id(stdout, stderr) - def get_job_status(self, output_path: str) -> JobStatusResult: + def get_job_status(self, output_path: Path) -> JobStatusResult: """ Determine the job status by evaluating the contents or results in a specified output directory. Args: - output_path (str): Path to the output directory. + output_path (Path): Path to the output directory. Returns: JobStatusResult: The result containing the job status and an optional error message. @@ -200,12 +201,12 @@ def get_job_status(self, output_path: str) -> JobStatusResult: ) return self.job_status_retrieval_strategy.get_job_status(output_path) - def can_handle_directory(self, directory_path: str) -> bool: + def can_handle_directory(self, directory_path: Path) -> bool: """ Determine if the strategy can handle the directory. Args: - directory_path (str): Path to the directory. + directory_path (Path): Path to the directory. Returns: bool: True if can handle, False otherwise. 
@@ -215,24 +216,24 @@ def can_handle_directory(self, directory_path: str) -> bool: else: return False - def generate_report(self, test_name: str, directory_path: str, sol: Optional[float] = None) -> None: + def generate_report(self, test_name: str, directory_path: Path, sol: Optional[float] = None) -> None: """ Generate a report from the directory. Args: test_name (str): The name of the test. - directory_path (str): Path to the directory. + directory_path (Path): Path to the directory. sol (Optional[float]): Speed-of-light performance for reference. """ if self.report_generation_strategy is not None: return self.report_generation_strategy.generate_report(test_name, directory_path, sol) - def grade(self, directory_path: str, ideal_perf: float) -> Optional[float]: + def grade(self, directory_path: Path, ideal_perf: float) -> Optional[float]: """ Read the performance value from the directory. Args: - directory_path (str): Path to the directory containing performance data. + directory_path (Path): Path to the directory containing performance data. ideal_perf (float): The ideal performance metric to compare against. Returns: diff --git a/src/cloudai/parser/system_parser/slurm_system_parser.py b/src/cloudai/parser/system_parser/slurm_system_parser.py index 045e62c2..1236793a 100644 --- a/src/cloudai/parser/system_parser/slurm_system_parser.py +++ b/src/cloudai/parser/system_parser/slurm_system_parser.py @@ -15,6 +15,7 @@ # limitations under the License. 
import os +from pathlib import Path from typing import Any, Dict, List from cloudai import BaseSystemParser @@ -57,12 +58,12 @@ def str_to_bool(value: Any) -> bool: install_path = data.get("install_path") if not install_path: raise ValueError("Field 'install_path' is required.") - install_path = os.path.abspath(install_path) + install_path = Path(os.path.abspath(install_path)) output_path = data.get("output_path") if not output_path: raise ValueError("Field 'output_path' is required.") - output_path = os.path.abspath(output_path) + output_path = Path(os.path.abspath(output_path)) default_partition = data.get("default_partition") if not default_partition: diff --git a/src/cloudai/parser/system_parser/standalone_system_parser.py b/src/cloudai/parser/system_parser/standalone_system_parser.py index f649044b..4bcd24d0 100644 --- a/src/cloudai/parser/system_parser/standalone_system_parser.py +++ b/src/cloudai/parser/system_parser/standalone_system_parser.py @@ -15,6 +15,7 @@ # limitations under the License. import os +from pathlib import Path from typing import Any, Dict from cloudai import BaseSystemParser @@ -45,6 +46,6 @@ def parse(self, data: Dict[str, Any]) -> StandaloneSystem: output_path = data.get("output_path") if not output_path: raise ValueError("Field 'output_path' is required.") - output_path = os.path.abspath(output_path) + output_path = Path(os.path.abspath(output_path)) return StandaloneSystem(name=name, output_path=output_path) diff --git a/src/cloudai/report_generator/report_generator.py b/src/cloudai/report_generator/report_generator.py index 61843134..2680cda3 100644 --- a/src/cloudai/report_generator/report_generator.py +++ b/src/cloudai/report_generator/report_generator.py @@ -15,7 +15,7 @@ # limitations under the License. import logging -import os +from pathlib import Path from cloudai import Test, TestScenario @@ -28,12 +28,12 @@ class ReportGenerator: based on subdirectories. 
""" - def __init__(self, output_path: str) -> None: + def __init__(self, output_path: Path) -> None: """ Initialize the ReportGenerator with the path for output. Args: - output_path (str): Output directory path. + output_path (Path): Output directory path. """ self.output_path = output_path @@ -52,26 +52,25 @@ def generate_report(self, test_scenario: TestScenario) -> None: if not section_name: logging.warning(f"Missing section name for test {test.name}") continue - test_output_dir = os.path.join(self.output_path, section_name) - if not os.path.exists(test_output_dir): + test_output_dir = self.output_path / section_name + if not test_output_dir.exists(): logging.warning(f"Directory '{test_output_dir}' not found.") continue self._generate_test_report(test_output_dir, test) - def _generate_test_report(self, directory_path: str, test: Test) -> None: + def _generate_test_report(self, directory_path: Path, test: Test) -> None: """ Generate reports for a test by iterating through subdirectories within the directory path. Checks if the test's template can handle each, and generating reports accordingly. Args: - directory_path (str): Directory for the test's section. + directory_path (Path): Directory for the test's section. test (Test): The test for report generation. 
""" - for subdir in os.listdir(directory_path): - subdir_path = os.path.join(directory_path, subdir) - if os.path.isdir(subdir_path) and test.test_template.can_handle_directory(subdir_path): - test.test_template.generate_report(test.name, subdir_path, test.sol) + for subdir in directory_path.iterdir(): + if subdir.is_dir() and test.test_template.can_handle_directory(subdir): + test.test_template.generate_report(test.name, subdir, test.sol) else: - logging.warning(f"Skipping directory '{subdir_path}' for test '{test.name}'") + logging.warning(f"Skipping directory '{subdir}' for test '{test.name}'") diff --git a/src/cloudai/report_generator/tool/bokeh_report_tool.py b/src/cloudai/report_generator/tool/bokeh_report_tool.py index f54086cb..c63cda96 100644 --- a/src/cloudai/report_generator/tool/bokeh_report_tool.py +++ b/src/cloudai/report_generator/tool/bokeh_report_tool.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from math import pi +from pathlib import Path from typing import List, Optional, Tuple import pandas as pd @@ -31,10 +31,10 @@ class BokehReportTool: Tool for creating interactive Bokeh plots. Attributes - output_directory (str): Directory to save the generated reports. + output_directory (Path): Directory to save the generated reports. """ - def __init__(self, output_directory: str): + def __init__(self, output_directory: Path): self.output_directory = output_directory self.plots = [] @@ -238,14 +238,14 @@ def add_log_x_linear_y_multi_line_plot( self.plots.append(p) - def finalize_report(self, output_filename: str): + def finalize_report(self, output_filename: Path): """ Save all accumulated plots to a single HTML file. Args: - output_filename (str): output_filename to save the final report. + output_filename (Path): Path to save the final report. 
""" - output_filepath = os.path.join(self.output_directory, output_filename) + output_filepath = self.output_directory / output_filename output_file(output_filepath) save(column(*self.plots)) self.plots = [] # Clear the list after saving to prepare for future use. diff --git a/src/cloudai/report_generator/tool/csv_report_tool.py b/src/cloudai/report_generator/tool/csv_report_tool.py index cba89bf3..e560974f 100644 --- a/src/cloudai/report_generator/tool/csv_report_tool.py +++ b/src/cloudai/report_generator/tool/csv_report_tool.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +from pathlib import Path import pandas as pd @@ -26,11 +26,11 @@ class CSVReportTool(ReportToolInterface): Tool for creating CSV reports. Attributes - output_directory (str): Directory to save the generated reports. + output_directory (Path): Directory to save the generated reports. dataframe (pd.DataFrame): DataFrame containing the data to be saved. """ - def __init__(self, output_directory: str): + def __init__(self, output_directory: Path): self.output_directory = output_directory self.dataframe = None @@ -43,16 +43,16 @@ def set_dataframe(self, df: pd.DataFrame): """ self.dataframe = df - def finalize_report(self, output_filename: str): + def finalize_report(self, output_filename: Path): """ Save the DataFrame to a CSV file. Args: - output_filename (str): The filename to save the final report. + output_filename (Path): The filename to save the final report. 
""" if self.dataframe is None: raise ValueError("No DataFrame has been set for the report.") - output_filepath = os.path.join(self.output_directory, output_filename) + output_filepath = self.output_directory / output_filename self.dataframe.to_csv(output_filepath, index=False) self.dataframe = None diff --git a/src/cloudai/report_generator/tool/report_tool_interface.py b/src/cloudai/report_generator/tool/report_tool_interface.py index 9d8ad69c..79c1430a 100644 --- a/src/cloudai/report_generator/tool/report_tool_interface.py +++ b/src/cloudai/report_generator/tool/report_tool_interface.py @@ -15,17 +15,18 @@ # limitations under the License. from abc import ABC, abstractmethod +from pathlib import Path class ReportToolInterface(ABC): """Interface for report tools, defining methods to add and finalize reports.""" @abstractmethod - def finalize_report(self, output_filename: str) -> None: + def finalize_report(self, output_filename: Path) -> None: """ Finalize the report and save it to the specified filename. Args: - output_filename (str): The filename where the report will be saved. + output_filename (Path): The filename where the report will be saved. """ pass diff --git a/src/cloudai/report_generator/tool/tensorboard_data_reader.py b/src/cloudai/report_generator/tool/tensorboard_data_reader.py index 89c70a14..dc67891d 100644 --- a/src/cloudai/report_generator/tool/tensorboard_data_reader.py +++ b/src/cloudai/report_generator/tool/tensorboard_data_reader.py @@ -15,6 +15,7 @@ # limitations under the License. import os +from pathlib import Path from typing import List, Tuple @@ -23,10 +24,10 @@ class TensorBoardDataReader: Reads scalar data from TensorBoard log files for specified tags. Attributes - directory_path (str): Path to the directory containing TensorBoard logs. + directory_path (Path): Path to the directory containing TensorBoard logs. 
""" - def __init__(self, directory_path: str): + def __init__(self, directory_path: Path): self.directory_path = directory_path def extract_data(self, tag: str) -> List[Tuple[int, float]]: @@ -45,8 +46,8 @@ def extract_data(self, tag: str) -> List[Tuple[int, float]]: for root, _, files in os.walk(self.directory_path): for file in files: if file.startswith("events.out.tfevents"): - path = os.path.join(root, file) - reader = SummaryReader(path) + path = Path(root) / file + reader = SummaryReader(str(path)) df = reader.scalars if tag in df["tag"].values: filtered_data = df[df["tag"] == tag] @@ -61,7 +62,7 @@ def main(): print("Usage: python script_name.py ") return - directory_path = sys.argv[1] + directory_path = Path(sys.argv[1]) tag = sys.argv[2] try: diff --git a/src/cloudai/schema/test_template/chakra_replay/grading_strategy.py b/src/cloudai/schema/test_template/chakra_replay/grading_strategy.py index 051cafce..fabd5f78 100644 --- a/src/cloudai/schema/test_template/chakra_replay/grading_strategy.py +++ b/src/cloudai/schema/test_template/chakra_replay/grading_strategy.py @@ -14,19 +14,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path + from cloudai import GradingStrategy class ChakraReplayGradingStrategy(GradingStrategy): """Performance grading strategy for ChakraReplay test templates on Slurm systems.""" - def grade(self, directory_path: str, ideal_perf: float) -> float: + def grade(self, directory_path: Path, ideal_perf: float) -> float: """ Grades the performance of a test. Args: - directory_path (str): Path to the directory containing the - test's output. + directory_path (Path): Path to the directory containing the test's output. ideal_perf (float): The ideal performance value for comparison. 
Returns: diff --git a/src/cloudai/schema/test_template/chakra_replay/report_generation_strategy.py b/src/cloudai/schema/test_template/chakra_replay/report_generation_strategy.py index 46a07bab..dcb89164 100644 --- a/src/cloudai/schema/test_template/chakra_replay/report_generation_strategy.py +++ b/src/cloudai/schema/test_template/chakra_replay/report_generation_strategy.py @@ -14,9 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import re from math import pi +from pathlib import Path from typing import Dict, Optional import pandas as pd @@ -37,17 +37,17 @@ class ChakraReplayReportGenerationStrategy(ReportGenerationStrategy): tensor sizes from stdout files, and generate a report using Bokeh. """ - def can_handle_directory(self, directory_path: str) -> bool: - stdout_path = os.path.join(directory_path, "stdout.txt") - if os.path.exists(stdout_path): - with open(stdout_path, "r") as file: + def can_handle_directory(self, directory_path: Path) -> bool: + stdout_path = directory_path / "stdout.txt" + if stdout_path.exists(): + with stdout_path.open("r") as file: if re.search(r"Hello from Rank \d+: \[Rank\s+\d+\]", file.read()): return True return False - def generate_report(self, test_name: str, directory_path: str, sol: Optional[float] = None) -> None: - stdout_path = os.path.join(directory_path, "stdout.txt") - if not os.path.isfile(stdout_path): + def generate_report(self, test_name: str, directory_path: Path, sol: Optional[float] = None) -> None: + stdout_path = directory_path / "stdout.txt" + if not stdout_path.is_file(): return comms_data = self._extract_comms_data(stdout_path) @@ -56,7 +56,7 @@ def generate_report(self, test_name: str, directory_path: str, sol: Optional[flo self._generate_bokeh_content(comms_data, latency_tables, tensor_sizes, directory_path) - def _extract_comms_data(self, file_path: str) -> Dict[str, int]: + def _extract_comms_data(self, file_path: Path) -> Dict[str, 
int]: """ Extract the number of times each communication operation is called from the specified file. @@ -70,7 +70,7 @@ def _extract_comms_data(self, file_path: str) -> Dict[str, int]: comms_data = {} pattern = r"Replayed (\d+) (\w+)" - with open(file_path, "r") as file: + with file_path.open("r") as file: for line in file: match = re.search(pattern, line) if match: @@ -79,7 +79,7 @@ def _extract_comms_data(self, file_path: str) -> Dict[str, int]: return comms_data - def _extract_latency_tables(self, file_path: str) -> Dict[str, pd.DataFrame]: + def _extract_latency_tables(self, file_path: Path) -> Dict[str, pd.DataFrame]: """ Extract latency distribution tables for communication operations from the specified file. @@ -93,7 +93,7 @@ def _extract_latency_tables(self, file_path: str) -> Dict[str, pd.DataFrame]: comms_data = {} headers = ["Total", "Max", "Min", "Average", "p50", "p95"] - with open(file_path, "r") as file: + with file_path.open("r") as file: lines = file.readlines() op_name = None @@ -124,7 +124,7 @@ def _extract_latency_tables(self, file_path: str) -> Dict[str, pd.DataFrame]: return comms_data - def _extract_tensor_sizes(self, file_path: str) -> Dict[str, Dict[str, pd.DataFrame]]: # noqa: C901 + def _extract_tensor_sizes(self, file_path: Path) -> Dict[str, Dict[str, pd.DataFrame]]: # noqa: C901 """ Extract input and output tensor size distribution tables from the specified file. @@ -139,7 +139,7 @@ def _extract_tensor_sizes(self, file_path: str) -> Dict[str, Dict[str, pd.DataFr tensor_sizes = {} headers = ["Total (MB)", "Max.", "Min.", "Average", "p50", "p95"] - with open(file_path, "r") as file: + with file_path.open("r") as file: lines = file.readlines() op_name = None @@ -197,7 +197,7 @@ def _generate_bokeh_content( comms_data: Dict[str, int], latency_tables: Dict[str, pd.DataFrame], tensor_sizes: Dict[str, Dict[str, pd.DataFrame]], - directory_path: str, + directory_path: Path, ) -> None: """ Generate Bokeh visualizations for the report. 
@@ -289,9 +289,9 @@ def _generate_bokeh_content( layout_final = column(layout, table_title_div, data_table) # Combine layouts # Generate and save the report - output_filepath = os.path.join(directory_path, "chakra_replay_report.html") - if os.path.exists(output_filepath): - os.remove(output_filepath) + output_filepath = directory_path / "chakra_replay_report.html" + if output_filepath.exists(): + output_filepath.unlink() output_file(output_filepath) save(layout_final) # Generates an HTML file with the specified filename diff --git a/src/cloudai/schema/test_template/chakra_replay/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/chakra_replay/slurm_command_gen_strategy.py index e8095757..789f7e45 100644 --- a/src/cloudai/schema/test_template/chakra_replay/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/chakra_replay/slurm_command_gen_strategy.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path from typing import Any, Dict, List from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy @@ -28,7 +29,7 @@ def gen_exec_command( cmd_args: Dict[str, str], extra_env_vars: Dict[str, str], extra_cmd_args: str, - output_path: str, + output_path: Path, num_nodes: int, nodes: List[str], ) -> str: diff --git a/src/cloudai/schema/test_template/common/default_job_status_retrieval_strategy.py b/src/cloudai/schema/test_template/common/default_job_status_retrieval_strategy.py index 2a39b336..47f3fe76 100644 --- a/src/cloudai/schema/test_template/common/default_job_status_retrieval_strategy.py +++ b/src/cloudai/schema/test_template/common/default_job_status_retrieval_strategy.py @@ -15,6 +15,8 @@ # limitations under the License. 
+from pathlib import Path + from cloudai import JobStatusResult, JobStatusRetrievalStrategy @@ -26,5 +28,5 @@ class DefaultJobStatusRetrievalStrategy(JobStatusRetrievalStrategy): It always returns a success result, indicating that the job has successfully completed. """ - def get_job_status(self, output_path: str) -> JobStatusResult: + def get_job_status(self, output_path: Path) -> JobStatusResult: return JobStatusResult(is_successful=True) diff --git a/src/cloudai/schema/test_template/jax_toolbox/grading_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/grading_strategy.py index 5a13766a..1f027746 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/grading_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/grading_strategy.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path from cloudai import GradingStrategy @@ -20,5 +21,5 @@ class JaxToolboxGradingStrategy(GradingStrategy): """Performance grading strategy for JaxToolbox test templates on Slurm systems.""" - def grade(self, directory_path: str, ideal_perf: float) -> float: + def grade(self, directory_path: Path, ideal_perf: float) -> float: return 0.0 diff --git a/src/cloudai/schema/test_template/jax_toolbox/job_status_retrieval_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/job_status_retrieval_strategy.py index 932b4ff6..3b3d3a88 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/job_status_retrieval_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/job_status_retrieval_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os import re from pathlib import Path @@ -24,23 +23,23 @@ class JaxToolboxJobStatusRetrievalStrategy(JobStatusRetrievalStrategy): """Strategy to retrieve job status for JaxToolbox by checking the contents of output_path.""" - def get_job_status(self, output_path: str) -> JobStatusResult: + def get_job_status(self, output_path: Path) -> JobStatusResult: """ Determine the job status by examining 'profile_stderr.txt' and 'error-*.txt' in the output directory. Args: - output_path (str): Path to the directory containing 'profile_stderr.txt' and 'error-*.txt'. + output_path (Path): Path to the directory containing 'profile_stderr.txt' and 'error-*.txt'. Returns: JobStatusResult: The result containing the job status and an optional error message. """ - profile_stderr_path = os.path.join(output_path, "profile_stderr.txt") + profile_stderr_path = output_path / "profile_stderr.txt" result = self.check_profile_stderr(profile_stderr_path, output_path) if not result.is_successful: return result - error_files = list(Path(output_path).glob("error-*.txt")) + error_files = list(output_path.glob("error-*.txt")) if not error_files: return JobStatusResult( is_successful=False, @@ -54,18 +53,18 @@ def get_job_status(self, output_path: str) -> JobStatusResult: return self.check_error_files(error_files, output_path) - def check_profile_stderr(self, profile_stderr_path: str, output_path: str) -> JobStatusResult: + def check_profile_stderr(self, profile_stderr_path: Path, output_path: Path) -> JobStatusResult: """ Check the profile_stderr.txt file for known error messages. Args: - profile_stderr_path (str): Path to the 'profile_stderr.txt' file. - output_path (str): Path to the output directory. + profile_stderr_path (Path): Path to the 'profile_stderr.txt' file. + output_path (Path): Path to the output directory. Returns: JobStatusResult: The result containing the job status and an optional error message. 
""" - if not os.path.isfile(profile_stderr_path): + if not profile_stderr_path.is_file(): return JobStatusResult( is_successful=False, error_message=( @@ -76,7 +75,7 @@ def check_profile_stderr(self, profile_stderr_path: str, output_path: str) -> Jo ), ) - with open(profile_stderr_path, "r") as file: + with profile_stderr_path.open("r") as file: content = file.read() if "[PAX STATUS]: E2E time: Elapsed time for " not in content: @@ -96,14 +95,14 @@ def check_profile_stderr(self, profile_stderr_path: str, output_path: str) -> Jo return JobStatusResult(is_successful=True) - def check_common_errors(self, content: str, file_path: str, output_path: str) -> JobStatusResult: + def check_common_errors(self, content: str, file_path: Path, output_path: Path) -> JobStatusResult: """ Check for common errors in the file content. Args: content (str): The content of the file to check. - file_path (str): The path of the file being checked. - output_path (str): Path to the output directory. + file_path (Path): The path of the file being checked. + output_path (Path): Path to the output directory. Returns: JobStatusResult: The result containing the job status and an optional error message. @@ -153,19 +152,19 @@ def check_common_errors(self, content: str, file_path: str, output_path: str) -> return JobStatusResult(is_successful=True) - def check_error_files(self, error_files: list, output_path: str) -> JobStatusResult: + def check_error_files(self, error_files: list[Path], output_path: Path) -> JobStatusResult: """ Check the error-*.txt files for known error messages. Args: - error_files (list): List of paths to error files. - output_path (str): Path to the output directory. + error_files (list[Path]): List of paths to error files. + output_path (Path): Path to the output directory. Returns: JobStatusResult: The result containing the job status and an optional error message. 
""" for error_file in error_files: - with open(error_file, "r") as file: + with error_file.open("r") as file: content = file.read() result = self.check_common_errors(content, error_file, output_path) if not result.is_successful: diff --git a/src/cloudai/schema/test_template/jax_toolbox/report_generation_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/report_generation_strategy.py index 23ea6773..1699ba63 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/report_generation_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/report_generation_strategy.py @@ -14,10 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import glob import logging -import os import statistics +from pathlib import Path from typing import List, Optional from cloudai import ReportGenerationStrategy @@ -26,16 +25,16 @@ class JaxToolboxReportGenerationStrategy(ReportGenerationStrategy): """Strategy for generating reports from JaxToolbox.""" - def can_handle_directory(self, directory_path: str) -> bool: - error_files = glob.glob(os.path.join(directory_path, "error-*.txt")) + def can_handle_directory(self, directory_path: Path) -> bool: + error_files = directory_path.glob("error-*.txt") for error_file in error_files: - with open(error_file, "r") as file: + with error_file.open("r") as file: content = file.read() if "[PAX STATUS]: E2E time: Elapsed time for <_main>: " in content: return True return False - def generate_report(self, test_name: str, directory_path: str, sol: Optional[float] = None) -> None: + def generate_report(self, test_name: str, directory_path: Path, sol: Optional[float] = None) -> None: times = self._extract_times(directory_path) if times: stats = { @@ -47,24 +46,24 @@ def generate_report(self, test_name: str, directory_path: str, sol: Optional[flo } self._write_report(directory_path, stats) - def _extract_times(self, directory_path: str) -> List[float]: + def _extract_times(self, 
directory_path: Path) -> List[float]: """ Extract elapsed times from all error files matching the pattern in the directory. Starting after the 10th occurrence of a line matching the "[PAX STATUS]: train_step() took" pattern. Args: - directory_path (str): Directory containing error files. + directory_path (Path): Directory containing error files. Returns: List[float]: List of extracted times as floats, starting from the epoch after the 10th occurrence. """ times = [] - error_files = glob.glob(os.path.join(directory_path, "error-*.txt")) + error_files = directory_path.glob("error-*.txt") for stderr_path in error_files: file_times = [] epoch_count = 0 - with open(stderr_path, "r") as file: + with stderr_path.open("r") as file: for line in file: if "[PAX STATUS]: train_step() took" in line: epoch_count += 1 @@ -89,15 +88,15 @@ def _extract_times(self, directory_path: str) -> List[float]: return times - def _write_report(self, directory_path: str, stats: dict) -> None: + def _write_report(self, directory_path: Path, stats: dict) -> None: """ Write the computed statistics to a file named 'report.txt' in the same directory. Args: - directory_path (str): Path to the directory. + directory_path (Path): Path to the directory. stats (dict): Dictionary containing computed statistics. """ - report_path = os.path.join(directory_path, "report.txt") - with open(report_path, "w") as file: + report_path = directory_path / "report.txt" + with report_path.open("w") as file: for key, value in stats.items(): file.write(f"{key.capitalize()}: {value}\n") diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index c5190217..9df231a3 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. 
import os +from pathlib import Path from typing import Any, Dict, List from cloudai.systems import SlurmSystem @@ -36,14 +37,14 @@ def gen_exec_command( cmd_args: Dict[str, str], extra_env_vars: Dict[str, str], extra_cmd_args: str, - output_path: str, + output_path: Path, num_nodes: int, nodes: List[str], ) -> str: final_env_vars = self._override_env_vars(self.default_env_vars, env_vars) final_env_vars = self._override_env_vars(final_env_vars, extra_env_vars) final_cmd_args = self._override_cmd_args(self.default_cmd_args, cmd_args) - final_cmd_args["output_path"] = output_path + final_cmd_args["output_path"] = str(output_path) self.test_name = self._extract_test_name(cmd_args) @@ -262,11 +263,11 @@ def set_xla_flags(test_name: str, profile_enabled: bool): # Combine both parts into the run script content run_script_content = profile_content + perf_content - run_script_path = os.path.join(cmd_args["output_path"], "run.sh") + run_script_path = Path(cmd_args["output_path"]) / "run.sh" with open(run_script_path, "w") as run_file: run_file.write("\n".join(run_script_content)) os.chmod(run_script_path, 0o755) - return run_script_path + return str(run_script_path) def _script_content( self, diff --git a/src/cloudai/schema/test_template/nccl_test/grading_strategy.py b/src/cloudai/schema/test_template/nccl_test/grading_strategy.py index fd0c4bda..e51ab756 100644 --- a/src/cloudai/schema/test_template/nccl_test/grading_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/grading_strategy.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +from pathlib import Path from cloudai import GradingStrategy @@ -27,23 +27,21 @@ class NcclTestGradingStrategy(GradingStrategy): ideal performance metric. The grade is normalized and scaled between 0 and 100. 
""" - def grade(self, directory_path: str, ideal_perf: float) -> float: + def grade(self, directory_path: Path, ideal_perf: float) -> float: """ - Gradesthe performance of an NcclTest based on the maximum bus bandwidth. + Grades the performance of an NcclTest based on the maximum bus bandwidth. Reported in the test's stdout.txt file, considering both in-place and out-of-place updates. Args: - directory_path (str): Path to the directory containing the - test's output. + directory_path (Path): Path to the directory containing the test's output. ideal_perf (float): The ideal performance value for comparison. Returns: - float: The performance grade of the test, normalized and - scaled between 0 and 100. + float: The performance grade of the test, normalized and scaled between 0 and 100. """ - stdout_path = os.path.join(directory_path, "stdout.txt") - if not os.path.isfile(stdout_path): + stdout_path = directory_path / "stdout.txt" + if not stdout_path.is_file(): return 0.0 max_bus_bw = self._extract_max_bus_bandwidth(stdout_path) @@ -54,18 +52,18 @@ def grade(self, directory_path: str, ideal_perf: float) -> float: grade = min(max(normalized_perf, 0), 100) return grade - def _extract_max_bus_bandwidth(self, stdout_path: str) -> float: + def _extract_max_bus_bandwidth(self, stdout_path: Path) -> float: """ Extract the maximum bus bandwidth from the NcclTest output file. Args: - stdout_path (str): Path to the stdout.txt file containing the NcclTest output. + stdout_path (Path): Path to the stdout.txt file containing the NcclTest output. Returns: float: The maximum bus bandwidth value. 
""" max_bus_bw = 0.0 - with open(stdout_path, "r") as file: + with stdout_path.open("r") as file: for line in file: if line.strip().startswith("#"): continue diff --git a/src/cloudai/schema/test_template/nccl_test/job_status_retrieval_strategy.py b/src/cloudai/schema/test_template/nccl_test/job_status_retrieval_strategy.py index 597896a2..d1bce565 100644 --- a/src/cloudai/schema/test_template/nccl_test/job_status_retrieval_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/job_status_retrieval_strategy.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +from pathlib import Path from cloudai import JobStatusResult, JobStatusRetrievalStrategy @@ -22,19 +22,19 @@ class NcclTestJobStatusRetrievalStrategy(JobStatusRetrievalStrategy): """Strategy to retrieve job status for NCCL tests by checking the contents of 'stdout.txt'.""" - def get_job_status(self, output_path: str) -> JobStatusResult: + def get_job_status(self, output_path: Path) -> JobStatusResult: """ Determine the job status by examining 'stdout.txt' in the output directory. Args: - output_path (str): Path to the directory containing 'stdout.txt'. + output_path (Path): Path to the directory containing 'stdout.txt'. Returns: JobStatusResult: The result containing the job status and an optional error message. 
""" - stdout_path = os.path.join(output_path, "stdout.txt") - if os.path.isfile(stdout_path): - with open(stdout_path, "r") as file: + stdout_path = output_path / "stdout.txt" + if stdout_path.is_file(): + with stdout_path.open("r") as file: content = file.read() # Check for specific error patterns diff --git a/src/cloudai/schema/test_template/nccl_test/report_generation_strategy.py b/src/cloudai/schema/test_template/nccl_test/report_generation_strategy.py index 12903c97..d38526c6 100644 --- a/src/cloudai/schema/test_template/nccl_test/report_generation_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/report_generation_strategy.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import re +from pathlib import Path from typing import List, Optional, Tuple import pandas as pd @@ -33,10 +33,10 @@ class NcclTestReportGenerationStrategy(ReportGenerationStrategy): Visualizing bus bandwidth changes over epochs using interactive Bokeh plots. 
""" - def can_handle_directory(self, directory_path: str) -> bool: - stdout_path = os.path.join(directory_path, "stdout.txt") - if os.path.exists(stdout_path): - with open(stdout_path, "r") as file: + def can_handle_directory(self, directory_path: Path) -> bool: + stdout_path = directory_path / "stdout.txt" + if stdout_path.exists(): + with stdout_path.open("r") as file: content = file.read() if re.search(r"out-of-place|in-place", content) and re.search( r"\b(size\s+count\s+type\s+redop\s+root\s+" @@ -48,7 +48,7 @@ def can_handle_directory(self, directory_path: str) -> bool: return True return False - def generate_report(self, test_name: str, directory_path: str, sol: Optional[float] = None) -> None: + def generate_report(self, test_name: str, directory_path: Path, sol: Optional[float] = None) -> None: report_data, _ = self._parse_output(directory_path) if report_data: df = pd.DataFrame( @@ -78,21 +78,21 @@ def generate_report(self, test_name: str, directory_path: str, sol: Optional[flo self._generate_bokeh_report(test_name, df, directory_path, sol) self._generate_csv_report(df, directory_path) - def _parse_output(self, directory_path: str) -> Tuple[List[List[str]], Optional[float]]: + def _parse_output(self, directory_path: Path) -> Tuple[List[List[str]], Optional[float]]: """ Extract data from 'stdout.txt' for report generation. Args: - directory_path (str): Directory containing 'stdout.txt'. + directory_path (Path): Directory containing 'stdout.txt'. Returns: Tuple[List[List[str]], Optional[float]]: Parsed data and optional average bus bandwidth. 
""" - stdout_path = os.path.join(directory_path, "stdout.txt") + stdout_path = directory_path / "stdout.txt" avg_bus_bw = None data = [] - if os.path.isfile(stdout_path): - with open(stdout_path, "r") as file: + if stdout_path.is_file(): + with stdout_path.open("r") as file: for line in file: line = line.strip() if re.match(r"^\d", line): @@ -103,7 +103,7 @@ def _parse_output(self, directory_path: str) -> Tuple[List[List[str]], Optional[ return data, avg_bus_bw def _generate_bokeh_report( - self, test_name: str, df: pd.DataFrame, directory_path: str, sol: Optional[float] + self, test_name: str, df: pd.DataFrame, directory_path: Path, sol: Optional[float] ) -> None: """ Create and saves plots to visualize NCCL test metrics. @@ -111,7 +111,7 @@ def _generate_bokeh_report( Args: test_name (str): The name of the test. df (pd.DataFrame): DataFrame containing the NCCL test data. - directory_path (str): Output directory path for saving the plots. + directory_path (Path): Output directory path for saving the plots. sol (Optional[float]): Speed-of-light performance for reference. """ report_tool = BokehReportTool(directory_path) @@ -141,16 +141,16 @@ def _generate_bokeh_report( sol=sol, ) - report_tool.finalize_report("cloudai_nccl_test_bokeh_report.html") + report_tool.finalize_report(Path("cloudai_nccl_test_bokeh_report.html")) - def _generate_csv_report(self, df: pd.DataFrame, directory_path: str) -> None: + def _generate_csv_report(self, df: pd.DataFrame, directory_path: Path) -> None: """ Generate a CSV report from the DataFrame. Args: df (pd.DataFrame): DataFrame containing the NCCL test data. - directory_path (str): Output directory path for saving the CSV report. + directory_path (Path): Output directory path for saving the CSV report. 
""" csv_report_tool = CSVReportTool(directory_path) csv_report_tool.set_dataframe(df) - csv_report_tool.finalize_report("cloudai_nccl_test_csv_report.csv") + csv_report_tool.finalize_report(Path("cloudai_nccl_test_csv_report.csv")) diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py index 0f522c4e..427de545 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. import os +from pathlib import Path from typing import Any, Dict, List from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy @@ -31,7 +32,7 @@ def gen_exec_command( cmd_args: Dict[str, str], extra_env_vars: Dict[str, str], extra_cmd_args: str, - output_path: str, + output_path: Path, num_nodes: int, nodes: List[str], ) -> str: diff --git a/src/cloudai/schema/test_template/nemo_launcher/grading_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/grading_strategy.py index 09bbf4f3..353387d0 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/grading_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/grading_strategy.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path + import numpy as np from cloudai import GradingStrategy @@ -23,12 +25,12 @@ class NeMoLauncherGradingStrategy(GradingStrategy): """Performance grading strategy for NeMoLauncher test templates on Slurm systems.""" - def grade(self, directory_path: str, ideal_perf: float) -> float: + def grade(self, directory_path: Path, ideal_perf: float) -> float: """ Grades the performance of a test. Args: - directory_path (str): Path to the directory containing the test's output. + directory_path (Path): Path to the directory containing the test's output. 
ideal_perf (float): The ideal performance value for comparison. Returns: diff --git a/src/cloudai/schema/test_template/nemo_launcher/report_generation_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/report_generation_strategy.py index 0ff49ee8..3a43002e 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/report_generation_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/report_generation_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. import os +from pathlib import Path from typing import Optional import pandas as pd @@ -31,14 +32,14 @@ class NeMoLauncherReportGenerationStrategy(ReportGenerationStrategy): Now updated to handle TensorBoard log files and visualize data using Bokeh plots. """ - def can_handle_directory(self, directory_path: str) -> bool: + def can_handle_directory(self, directory_path: Path) -> bool: for _, __, files in os.walk(directory_path): for file in files: if file.startswith("events.out.tfevents"): return True return False - def generate_report(self, test_name: str, directory_path: str, sol: Optional[float] = None) -> None: + def generate_report(self, test_name: str, directory_path: Path, sol: Optional[float] = None) -> None: tags = ["train_step_timing in s"] data_reader = TensorBoardDataReader(directory_path) report_tool = BokehReportTool(directory_path) @@ -57,4 +58,4 @@ def generate_report(self, test_name: str, directory_path: str, sol: Optional[flo color="black", ) - report_tool.finalize_report("cloudai_nemo_launcher_bokeh_report.html") + report_tool.finalize_report(Path("cloudai_nemo_launcher_bokeh_report.html")) diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py index dc28ddc5..903ca751 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py @@ -14,7 +14,7 
@@ # See the License for the specific language governing permissions and # limitations under the License. -import os +from pathlib import Path from typing import Any, Dict, List from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy @@ -27,7 +27,7 @@ class NeMoLauncherSlurmCommandGenStrategy(SlurmCommandGenStrategy): Command generation strategy for NeMo Megatron Launcher on Slurm systems. Attributes - install_path (str): The installation path of CloudAI. + install_path (Path): The installation path of CloudAI. """ def gen_exec_command( @@ -36,32 +36,36 @@ def gen_exec_command( cmd_args: Dict[str, str], extra_env_vars: Dict[str, str], extra_cmd_args: str, - output_path: str, + output_path: Path, num_nodes: int, nodes: List[str], ) -> str: final_env_vars = self._override_env_vars(self.default_env_vars, env_vars) final_env_vars = self._override_env_vars(final_env_vars, extra_env_vars) - launcher_path = os.path.join( - self.install_path, - NeMoLauncherSlurmInstallStrategy.SUBDIR_PATH, - NeMoLauncherSlurmInstallStrategy.REPOSITORY_NAME, + launcher_path = ( + self.install_path + / NeMoLauncherSlurmInstallStrategy.SUBDIR_PATH + / NeMoLauncherSlurmInstallStrategy.REPOSITORY_NAME ) overriden_cmd_args = self._override_cmd_args(self.default_cmd_args, cmd_args) self.final_cmd_args = { - k: self._handle_special_keys(k, v, launcher_path, output_path) for k, v in overriden_cmd_args.items() + k: self._handle_special_keys(k, v, str(launcher_path), str(output_path)) + for k, v in overriden_cmd_args.items() } - self.final_cmd_args["base_results_dir"] = output_path - self.final_cmd_args["training.model.data.index_mapping_dir"] = output_path - self.final_cmd_args["launcher_scripts_path"] = os.path.join(launcher_path, "launcher_scripts") + self.final_cmd_args["base_results_dir"] = str(output_path) + self.final_cmd_args["training.model.data.index_mapping_dir"] = str(output_path) + self.final_cmd_args["launcher_scripts_path"] = str(launcher_path / "launcher_scripts") + for 
key, value in final_env_vars.items(): self.final_cmd_args[f"env_vars.{key}"] = value + self.final_cmd_args["cluster.partition"] = self.slurm_system.default_partition reservation_key = "--reservation " if self.slurm_system.extra_srun_args and reservation_key in self.slurm_system.extra_srun_args: reservation = self.slurm_system.extra_srun_args.split(reservation_key, 1)[1].split(" ", 1)[0] self.final_cmd_args["+cluster.reservation"] = reservation + nodes = self.slurm_system.parse_nodes(nodes) if nodes: self.final_cmd_args["training.trainer.num_nodes"] = str(len(nodes)) @@ -87,7 +91,7 @@ def gen_exec_command( tokenizer_key = "training.model.tokenizer.model=" if tokenizer_key in extra_cmd_args: tokenizer_path = extra_cmd_args.split(tokenizer_key, 1)[1].split(" ", 1)[0] - if not os.path.isfile(tokenizer_path): + if not Path(tokenizer_path).is_file(): raise ValueError( f"The provided tokenizer path '{tokenizer_path}' is not valid. " "Please review the test schema file to ensure the tokenizer path is correct. " diff --git a/src/cloudai/schema/test_template/sleep/grading_strategy.py b/src/cloudai/schema/test_template/sleep/grading_strategy.py index e9a64538..414c00ad 100644 --- a/src/cloudai/schema/test_template/sleep/grading_strategy.py +++ b/src/cloudai/schema/test_template/sleep/grading_strategy.py @@ -14,18 +14,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path + from cloudai import GradingStrategy class SleepGradingStrategy(GradingStrategy): """Performance grading strategy for Sleep test templates on Slurm systems.""" - def grade(self, directory_path: str, ideal_perf: float) -> float: + def grade(self, directory_path: Path, ideal_perf: float) -> float: """ Grades the performance of a test. Args: - directory_path (str): Path to the directory containing the test's output. + directory_path (Path): Path to the directory containing the test's output. 
ideal_perf (float): The ideal performance value for comparison. Returns: diff --git a/src/cloudai/schema/test_template/sleep/report_generation_strategy.py b/src/cloudai/schema/test_template/sleep/report_generation_strategy.py index 4a3cc79f..2e693b37 100644 --- a/src/cloudai/schema/test_template/sleep/report_generation_strategy.py +++ b/src/cloudai/schema/test_template/sleep/report_generation_strategy.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path from typing import Optional from cloudai import ReportGenerationStrategy @@ -22,8 +23,8 @@ class SleepReportGenerationStrategy(ReportGenerationStrategy): """Strategy for generating reports from sleep directories.""" - def can_handle_directory(self, directory_path: str) -> bool: + def can_handle_directory(self, directory_path: Path) -> bool: return False - def generate_report(self, test_name: str, directory_path: str, sol: Optional[float] = None) -> None: + def generate_report(self, test_name: str, directory_path: Path, sol: Optional[float] = None) -> None: pass diff --git a/src/cloudai/schema/test_template/sleep/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/sleep/slurm_command_gen_strategy.py index ddb4a4b9..fac11094 100644 --- a/src/cloudai/schema/test_template/sleep/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/sleep/slurm_command_gen_strategy.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from pathlib import Path from typing import Any, Dict, List from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy @@ -28,7 +29,7 @@ def gen_exec_command( cmd_args: Dict[str, str], extra_env_vars: Dict[str, str], extra_cmd_args: str, - output_path: str, + output_path: Path, num_nodes: int, nodes: List[str], ) -> str: diff --git a/src/cloudai/schema/test_template/sleep/standalone_command_gen_strategy.py b/src/cloudai/schema/test_template/sleep/standalone_command_gen_strategy.py index 88e32fe7..f6267f1c 100644 --- a/src/cloudai/schema/test_template/sleep/standalone_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/sleep/standalone_command_gen_strategy.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path from typing import Dict, List from cloudai import CommandGenStrategy @@ -32,7 +33,7 @@ def gen_exec_command( cmd_args: Dict[str, str], extra_env_vars: Dict[str, str], extra_cmd_args: str, - output_path: str, + output_path: Path, num_nodes: int, nodes: List[str], ) -> str: diff --git a/src/cloudai/schema/test_template/ucc_test/grading_strategy.py b/src/cloudai/schema/test_template/ucc_test/grading_strategy.py index 9a122367..a2f119ff 100644 --- a/src/cloudai/schema/test_template/ucc_test/grading_strategy.py +++ b/src/cloudai/schema/test_template/ucc_test/grading_strategy.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +from pathlib import Path from cloudai import GradingStrategy @@ -27,22 +27,22 @@ class UCCTestGradingStrategy(GradingStrategy): ideal performance metric. The grade is normalized and scaled between 0 and 100. """ - def grade(self, directory_path: str, ideal_perf: float) -> float: + def grade(self, directory_path: Path, ideal_perf: float) -> float: """ - Grade the performance of an UCCTest based on the maximum bus bandwidth. 
+ Grade the performance of a UCCTest based on the maximum bus bandwidth. Reported in the test's stdout.txt file, considering both in-place and out-of-place updates. Args: - directory_path (str): Path to the directory containing the test's output. + directory_path (Path): Path to the directory containing the test's output. ideal_perf (float): The ideal performance value for comparison. Returns: float: The performance grade of the test, normalized and scaled between 0 and 100. """ - stdout_path = os.path.join(directory_path, "stdout.txt") - if not os.path.isfile(stdout_path): + stdout_path = directory_path / "stdout.txt" + if not stdout_path.is_file(): return 0.0 max_bus_bw = self._extract_max_bus_bandwidth(stdout_path) @@ -53,18 +53,18 @@ def grade(self, directory_path: str, ideal_perf: float) -> float: grade = min(max(normalized_perf, 0), 100) return grade - def _extract_max_bus_bandwidth(self, stdout_path: str) -> float: + def _extract_max_bus_bandwidth(self, stdout_path: Path) -> float: """ Extract the maximum bus bandwidth from the UCCTest output file. Args: - stdout_path (str): Path to the stdout.txt file containing the UCCTest output. + stdout_path (Path): Path to the stdout.txt file containing the UCCTest output. Returns: float: The maximum bus bandwidth value. """ max_bus_bw = 0.0 - with open(stdout_path, "r") as file: + with stdout_path.open("r") as file: for line in file: parts = line.split() if len(parts) == 8: # Ensure it's a data line diff --git a/src/cloudai/schema/test_template/ucc_test/report_generation_strategy.py b/src/cloudai/schema/test_template/ucc_test/report_generation_strategy.py index 54bddf9e..2b263630 100644 --- a/src/cloudai/schema/test_template/ucc_test/report_generation_strategy.py +++ b/src/cloudai/schema/test_template/ucc_test/report_generation_strategy.py @@ -13,9 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - -import os import re +from pathlib import Path from typing import List, Optional import pandas as pd @@ -32,20 +31,20 @@ class UCCTestReportGenerationStrategy(ReportGenerationStrategy): Visualizing bus bandwidth changes over epochs using interactive Bokeh plots. """ - def can_handle_directory(self, directory_path: str) -> bool: - stdout_path = os.path.join(directory_path, "stdout.txt") - if os.path.exists(stdout_path): - with open(stdout_path, "r") as file: + def can_handle_directory(self, directory_path: Path) -> bool: + stdout_path = directory_path / "stdout.txt" + if stdout_path.exists(): + with stdout_path.open("r") as file: content = file.read() if re.search(r"\b(avg\s+min\s+max\s+avg\s+max\s+min)\b", content): return True return False - def generate_report(self, test_name: str, directory_path: str, sol: Optional[float] = None) -> None: + def generate_report(self, test_name: str, directory_path: Path, sol: Optional[float] = None) -> None: report_data = [] - stdout_path = os.path.join(directory_path, "stdout.txt") - if os.path.isfile(stdout_path): - with open(stdout_path, "r") as file: + stdout_path = directory_path / "stdout.txt" + if stdout_path.is_file(): + with stdout_path.open("r") as file: content = file.read() report_data = self._parse_output(content) @@ -84,13 +83,13 @@ def _parse_output(self, content: str) -> List[List[str]]: data.append(values) return data - def _generate_plots(self, df: pd.DataFrame, directory_path: str, sol: Optional[float]) -> None: + def _generate_plots(self, df: pd.DataFrame, directory_path: Path, sol: Optional[float]) -> None: """ Create and saves plots to visualize UCC test metrics. Args: df (pd.DataFrame): DataFrame containing the NCCL test data. - directory_path (str): Output directory path for saving the plots. + directory_path (Path): Output directory path for saving the plots. 
sol (Optional[float]): Speed-of-light performance for reference. """ report_tool = BokehReportTool(directory_path) @@ -121,4 +120,4 @@ def _generate_plots(self, df: pd.DataFrame, directory_path: str, sol: Optional[f sol=sol, ) - report_tool.finalize_report("cloudai_ucc_test_bokeh_report.html") + report_tool.finalize_report(Path("cloudai_ucc_test_bokeh_report.html")) diff --git a/src/cloudai/schema/test_template/ucc_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/ucc_test/slurm_command_gen_strategy.py index 3d741bf6..559db258 100644 --- a/src/cloudai/schema/test_template/ucc_test/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/ucc_test/slurm_command_gen_strategy.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path from typing import Any, Dict, List from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy @@ -31,7 +32,7 @@ def gen_exec_command( cmd_args: Dict[str, str], extra_env_vars: Dict[str, str], extra_cmd_args: str, - output_path: str, + output_path: Path, num_nodes: int, nodes: List[str], ) -> str: diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py index c337121a..a5fd202b 100644 --- a/src/cloudai/systems/slurm/slurm_system.py +++ b/src/cloudai/systems/slurm/slurm_system.py @@ -17,6 +17,7 @@ import getpass import logging import re +from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from cloudai import System @@ -31,8 +32,8 @@ class SlurmSystem(System): Attributes name (str): The name of the Slurm system. - install_path (str): Installation path of CloudAI software. - output_path (str): Directory path for output files. + install_path (Path): Installation path of CloudAI software. + output_path (Path): Path to the output directory. default_partition (str): The default partition for job submission. 
partitions (Dict[str, List[SlurmNode]]): Mapping of partition names to lists of SlurmNodes. account (Optional[str]): Account name for charging resources used by this job. @@ -50,6 +51,80 @@ class SlurmSystem(System): cmd_shell (CommandShell): An instance of CommandShell for executing system commands. """ + def __init__( + self, + name: str, + install_path: Path, + output_path: Path, + default_partition: str, + partitions: Dict[str, List[SlurmNode]], + account: Optional[str] = None, + distribution: Optional[str] = None, + mpi: Optional[str] = None, + gpus_per_node: Optional[int] = None, + ntasks_per_node: Optional[int] = None, + cache_docker_images_locally: bool = False, + groups: Optional[Dict[str, Dict[str, List[SlurmNode]]]] = None, + global_env_vars: Optional[Dict[str, Any]] = None, + extra_srun_args: Optional[str] = None, + ) -> None: + """ + Initialize a SlurmSystem instance. + + Args: + name (str): Name of the Slurm system. + install_path (Path): The installation path of CloudAI. + output_path (Path): Path to the output directory. + default_partition (str): Default partition. + partitions (Dict[str, List[SlurmNode]]): Partitions in the system. + account (Optional[str]): Account name for charging resources used by this job. + distribution (Optional[str]): Specifies alternate distribution methods for remote processes. + mpi (Optional[str]): Indicates the Process Management Interface (PMI) implementation to be used for + inter-process communication. + gpus_per_node (Optional[int]): Specifies the number of GPUs available per node. + ntasks_per_node (Optional[int]): Specifies the number of tasks that can run concurrently on a single node. + cache_docker_images_locally (bool): Whether to cache Docker images locally for the Slurm system. + groups (Optional[Dict[str, Dict[str, List[SlurmNode]]]]): Nested mapping of group names to lists of + SlurmNodes within partitions, defining the group composition within each partition. 
Defaults to an + empty dictionary if not provided. + global_env_vars (Optional[Dict[str, Any]]): Dictionary containing additional configuration settings for + the system. + extra_srun_args (Optional[str]): Additional arguments to be passed to the srun command. + """ + super().__init__(name, "slurm", output_path) + self.install_path = install_path + self.default_partition = default_partition + self.partitions = partitions + self.account = account + self.distribution = distribution + self.mpi = mpi + self.gpus_per_node = gpus_per_node + self.ntasks_per_node = ntasks_per_node + self.cache_docker_images_locally = cache_docker_images_locally + self.groups = groups if groups is not None else {} + self.global_env_vars = global_env_vars if global_env_vars is not None else {} + self.extra_srun_args = extra_srun_args + self.cmd_shell = CommandShell() + logging.debug(f"{self.__class__.__name__} initialized") + + def __repr__(self) -> str: + """ + Provide a structured string representation of the system. + + Including the system name, scheduler type, and a simplified view similar to the `sinfo` command output, + focusing on the partition, state, and nodelist. + """ + header = f"System Name: {self.name}\nScheduler Type: {self.scheduler}" + parts = [header, "\tPARTITION STATE NODELIST"] + for partition_name, nodes in self.partitions.items(): + state_count = {} + for node in nodes: + state_count.setdefault(node.state, []).append(node.name) + for state, names in state_count.items(): + node_list_str = self.format_node_list(names) + parts.append(f"\t{partition_name:<10} {state.name:<7} {node_list_str}") + return "\n".join(parts) + def update(self) -> None: """ Update the system object for a SLURM system. 
@@ -176,80 +251,6 @@ def format_range(lst: List[int], padding: int) -> List[str]: return ", ".join(formatted_ranges) - def __init__( - self, - name: str, - install_path: str, - output_path: str, - default_partition: str, - partitions: Dict[str, List[SlurmNode]], - account: Optional[str] = None, - distribution: Optional[str] = None, - mpi: Optional[str] = None, - gpus_per_node: Optional[int] = None, - ntasks_per_node: Optional[int] = None, - cache_docker_images_locally: bool = False, - groups: Optional[Dict[str, Dict[str, List[SlurmNode]]]] = None, - global_env_vars: Optional[Dict[str, Any]] = None, - extra_srun_args: Optional[str] = None, - ) -> None: - """ - Initialize a SlurmSystem instance. - - Args: - name (str): Name of the Slurm system. - install_path (str): The installation path of CloudAI. - output_path (str): Path to the output directory. - default_partition (str): Default partition. - partitions (Dict[str, List[SlurmNode]]): Partitions in the system. - account (Optional[str]): Account name for charging resources used by this job. - distribution (Optional[str]): Specifies alternate distribution methods for remote processes. - mpi (Optional[str]): Indicates the Process Management Interface (PMI) implementation to be used for - inter-process communication. - gpus_per_node (Optional[int]): Specifies the number of GPUs available per node. - ntasks_per_node (Optional[int]): Specifies the number of tasks that can run concurrently on a single node. - cache_docker_images_locally (bool): Whether to cache Docker images locally for the Slurm system. - groups (Optional[Dict[str, Dict[str, List[SlurmNode]]]]): Nested mapping of group names to lists of - SlurmNodes within partitions, defining the group composition within each partition. Defaults to an - empty dictionary if not provided. - global_env_vars (Optional[Dict[str, Any]]): Dictionary containing additional configuration settings for - the system. 
- extra_srun_args (Optional[str]): Additional arguments to be passed to the srun command. - """ - super().__init__(name, "slurm", output_path) - self.install_path = install_path - self.default_partition = default_partition - self.partitions = partitions - self.account = account - self.distribution = distribution - self.mpi = mpi - self.gpus_per_node = gpus_per_node - self.ntasks_per_node = ntasks_per_node - self.cache_docker_images_locally = cache_docker_images_locally - self.groups = groups if groups is not None else {} - self.global_env_vars = global_env_vars if global_env_vars is not None else {} - self.extra_srun_args = extra_srun_args - self.cmd_shell = CommandShell() - logging.debug(f"{self.__class__.__name__} initialized") - - def __repr__(self) -> str: - """ - Provide a structured string representation of the system. - - Including the system name, scheduler type, and a simplified view similar to the `sinfo` command output, - focusing on the partition, state, and nodelist. - """ - header = f"System Name: {self.name}\nScheduler Type: {self.scheduler}" - parts = [header, "\tPARTITION STATE NODELIST"] - for partition_name, nodes in self.partitions.items(): - state_count = {} - for node in nodes: - state_count.setdefault(node.state, []).append(node.name) - for state, names in state_count.items(): - node_list_str = self.format_node_list(names) - parts.append(f"\t{partition_name:<10} {state.name:<7} {node_list_str}") - return "\n".join(parts) - def get_partition_names(self) -> List[str]: """Return a list of all partition names.""" return list(self.partitions.keys()) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 71b5854f..7412c194 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions 
and # limitations under the License. -import os from datetime import datetime +from pathlib import Path from typing import Any, Dict, List from cloudai import CommandGenStrategy @@ -37,7 +37,7 @@ def __init__(self, system: SlurmSystem, env_vars: Dict[str, Any], cmd_args: Dict Initialize a new SlurmCommandGenStrategy instance. Args: - system (System): The system schema object. + system (SlurmSystem): The system schema object. env_vars (Dict[str, Any]): Environment variables. cmd_args (Dict[str, Any]): Command-line arguments. """ @@ -173,15 +173,17 @@ def _add_reservation(self, batch_script_content: List[str]): return batch_script_content - def _write_sbatch_script(self, args: Dict[str, Any], env_vars_str: str, srun_command: str, output_path: str) -> str: + def _write_sbatch_script( + self, args: Dict[str, Any], env_vars_str: str, srun_command: str, output_path: Path + ) -> str: """ - Write the batch script for Slurm submission and returns the sbatch command. + Write the batch script for Slurm submission and return the sbatch command. Args: args (Dict[str, Any]): Arguments including job settings. env_vars_str (str): Environment variables. srun_command (str): srun command. - output_path (str): Output directory for script and logs. + output_path (Path): Output directory for script and logs. Returns: str: sbatch command to submit the job. 
@@ -194,9 +196,9 @@ def _write_sbatch_script(self, args: Dict[str, Any], env_vars_str: str, srun_com batch_script_content = self._add_reservation(batch_script_content) if "output" not in args: - batch_script_content.append(f"#SBATCH --output={os.path.join(output_path, 'stdout.txt')}") + batch_script_content.append(f"#SBATCH --output={output_path / 'stdout.txt'}") if "error" not in args: - batch_script_content.append(f"#SBATCH --error={os.path.join(output_path, 'stderr.txt')}") + batch_script_content.append(f"#SBATCH --error={output_path / 'stderr.txt'}") if args["partition"]: batch_script_content.append(f"#SBATCH --partition={args['partition']}") if args["node_list_str"]: @@ -218,8 +220,8 @@ def _write_sbatch_script(self, args: Dict[str, Any], env_vars_str: str, srun_com batch_script_content.extend(["", env_vars_str, "", srun_command]) - batch_script_path = os.path.join(output_path, "cloudai_sbatch_script.sh") - with open(batch_script_path, "w") as batch_file: + batch_script_path = output_path / "cloudai_sbatch_script.sh" + with batch_script_path.open("w") as batch_file: batch_file.write("\n".join(batch_script_content)) return f"sbatch {batch_script_path}" diff --git a/src/cloudai/systems/standalone_system.py b/src/cloudai/systems/standalone_system.py index ed3166b2..609cf9b0 100644 --- a/src/cloudai/systems/standalone_system.py +++ b/src/cloudai/systems/standalone_system.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path + from cloudai import System @@ -29,13 +31,13 @@ class StandaloneSystem(System): output_path (str): Path to the output directory. """ - def __init__(self, name: str, output_path: str) -> None: + def __init__(self, name: str, output_path: Path) -> None: """ Initialize a StandaloneSystem instance. Args: name (str): Name of the standalone system. - output_path (str): Path to the output directory. + output_path (Path): Path to the output directory. 
""" super().__init__(name, "standalone", output_path) diff --git a/src/cloudai/util/docker_image_cache_manager.py b/src/cloudai/util/docker_image_cache_manager.py index 6a302b65..476bdc18 100644 --- a/src/cloudai/util/docker_image_cache_manager.py +++ b/src/cloudai/util/docker_image_cache_manager.py @@ -19,6 +19,8 @@ import shutil import subprocess import tempfile +from pathlib import Path +from typing import Optional class PrerequisiteCheckResult: @@ -66,21 +68,21 @@ class DockerImageCacheResult: Attributes success (bool): Indicates whether the operation was successful. - docker_image_path (str): The path to the Docker image. + docker_image_path (Path): The path to the Docker image. message (str): A message providing additional information about the result. """ - def __init__(self, success: bool, docker_image_path: str = "", message: str = "") -> None: + def __init__(self, success: bool, docker_image_path: Optional[Path] = None, message: str = "") -> None: """ Initialize the DockerImageCacheResult. Args: success (bool): Indicates whether the operation was successful. - docker_image_path (str): The path to the Docker image. + docker_image_path (Path): The path to the Docker image. message (str): A message providing additional information about the result. """ self.success = success - self.docker_image_path = docker_image_path + self.docker_image_path = docker_image_path or Path() self.message = message def __bool__(self): @@ -107,12 +109,12 @@ class DockerImageCacheManager: Manages the caching of Docker images for installation strategies. Attributes - install_path (str): The base installation path. + install_path (Path): The base installation path. cache_docker_images_locally (bool): Whether to cache Docker image files locally. partition_name (str): The partition name to use in the srun command. 
""" - def __init__(self, install_path: str, cache_docker_images_locally: bool, partition_name: str) -> None: + def __init__(self, install_path: Path, cache_docker_images_locally: bool, partition_name: str) -> None: self.install_path = install_path self.cache_docker_images_locally = cache_docker_images_locally self.partition_name = partition_name @@ -162,35 +164,35 @@ def check_docker_image_exists( # If not caching locally, return True. Defer checking URL accessibility to srun. if not self.cache_docker_images_locally: - return DockerImageCacheResult(True, docker_image_url, "") + return DockerImageCacheResult(True, Path(docker_image_url), "") - # Check if docker_image_url is a file path and exists - if os.path.isfile(docker_image_url) and os.path.exists(docker_image_url): + docker_image_path = Path(docker_image_url) + if docker_image_path.is_file() and docker_image_path.exists(): return DockerImageCacheResult( - True, docker_image_url, f"Docker image file path is valid: {docker_image_url}." + True, docker_image_path, f"Docker image file path is valid: {docker_image_url}." ) # Check if the cache file exists - if not os.path.exists(self.install_path): + if not self.install_path.exists(): message = f"Install path {self.install_path} does not exist." logging.debug(message) - return DockerImageCacheResult(False, "", message) + return DockerImageCacheResult(False, Path(), message) - subdir_path = os.path.join(self.install_path, subdir_name) - if not os.path.exists(subdir_path): + subdir_path = self.install_path / subdir_name + if not subdir_path.exists(): message = f"Subdirectory path {subdir_path} does not exist." 
logging.debug(message) - return DockerImageCacheResult(False, "", message) + return DockerImageCacheResult(False, Path(), message) - docker_image_path = os.path.join(subdir_path, docker_image_filename) - if os.path.isfile(docker_image_path) and os.path.exists(docker_image_path): + docker_image_path = subdir_path / docker_image_filename + if docker_image_path.is_file() and docker_image_path.exists(): message = f"Cached Docker image already exists at {docker_image_path}." logging.debug(message) return DockerImageCacheResult(True, docker_image_path, message) message = f"Docker image does not exist at the specified path: {docker_image_path}." logging.debug(message) - return DockerImageCacheResult(False, "", message) + return DockerImageCacheResult(False, Path(), message) def cache_docker_image( self, docker_image_url: str, subdir_name: str, docker_image_filename: str @@ -206,10 +208,10 @@ def cache_docker_image( Returns: DockerImageCacheResult: Result of the Docker image caching operation. """ - subdir_path = os.path.join(self.install_path, subdir_name) - docker_image_path = os.path.join(subdir_path, docker_image_filename) + subdir_path = self.install_path / subdir_name + docker_image_path = subdir_path / docker_image_filename - if os.path.isfile(docker_image_path): + if docker_image_path.is_file(): success_message = f"Cached Docker image already exists at {docker_image_path}." 
logging.info(success_message) return DockerImageCacheResult(True, docker_image_path, success_message) @@ -217,25 +219,25 @@ def cache_docker_image( prerequisite_check = self._check_prerequisites(docker_image_url) if not prerequisite_check: logging.error(f"Prerequisite check failed: {prerequisite_check.message}") - return DockerImageCacheResult(False, "", prerequisite_check.message) + return DockerImageCacheResult(False, Path(), prerequisite_check.message) - if not os.path.exists(self.install_path): + if not self.install_path.exists(): error_message = f"Install path {self.install_path} does not exist." logging.error(error_message) - return DockerImageCacheResult(False, "", error_message) + return DockerImageCacheResult(False, Path(), error_message) if not os.access(self.install_path, os.W_OK): error_message = f"No permission to write in install path {self.install_path}." logging.error(error_message) - return DockerImageCacheResult(False, "", error_message) + return DockerImageCacheResult(False, Path(), error_message) - if not os.path.exists(subdir_path): + if not subdir_path.exists(): try: - os.makedirs(subdir_path) + subdir_path.mkdir(parents=True) except OSError as e: error_message = f"Failed to create subdirectory {subdir_path}. Error: {e}" logging.error(error_message) - return DockerImageCacheResult(False, "", error_message) + return DockerImageCacheResult(False, Path(), error_message) enroot_import_cmd = ( f"srun --export=ALL --partition={self.partition_name} " @@ -254,7 +256,7 @@ def cache_docker_image( "If the disk is full, consider using a different disk or removing unnecessary files." ) logging.error(error_message) - return DockerImageCacheResult(False, "", error_message) + return DockerImageCacheResult(False, Path(), error_message) success_message = f"Docker image cached successfully at {docker_image_path}." 
logging.debug(success_message) @@ -267,7 +269,7 @@ def cache_docker_image( logging.error(error_message) return DockerImageCacheResult( False, - "", + Path(), ( f"Failed to import Docker image from {docker_image_url}. " f"Command: {enroot_import_cmd}. " @@ -314,7 +316,7 @@ def _check_docker_image_accessibility(self, docker_image_url: str) -> Prerequisi PrerequisiteCheckResult: Result of the Docker image accessibility check. """ with tempfile.TemporaryDirectory() as temp_dir: - docker_image_path = os.path.join(temp_dir, "docker_image.sqsh") + docker_image_path = Path(temp_dir) / "docker_image.sqsh" enroot_import_cmd = f"enroot import -o {docker_image_path} docker://{docker_image_url}" logging.debug(f"Checking Docker image accessibility: {enroot_import_cmd}") @@ -365,9 +367,9 @@ def _check_docker_image_accessibility(self, docker_image_url: str) -> Prerequisi ) finally: # Ensure the temporary docker image file is removed - if os.path.exists(docker_image_path): + if docker_image_path.exists(): try: - os.remove(docker_image_path) + docker_image_path.unlink() logging.debug(f"Temporary Docker image file removed: {docker_image_path}") except OSError as e: logging.error(f"Failed to remove temporary Docker image file {docker_image_path}. Error: {e}") @@ -387,11 +389,11 @@ def uninstall_cached_image(self, subdir_name: str, docker_image_filename: str) - if not result.success: return result - subdir_path = os.path.join(self.install_path, subdir_name) - if os.path.isdir(subdir_path): + subdir_path = self.install_path / subdir_name + if subdir_path.is_dir(): try: - if not os.listdir(subdir_path): - os.rmdir(subdir_path) + if not any(subdir_path.iterdir()): + subdir_path.rmdir() success_message = f"Subdirectory removed successfully: {subdir_path}." 
logging.info(success_message) return DockerImageCacheResult(True, subdir_path, success_message) @@ -414,10 +416,10 @@ def remove_cached_image(self, subdir_name: str, docker_image_filename: str) -> D Returns: DockerImageCacheResult: Result of the removal operation. """ - docker_image_path = os.path.join(self.install_path, subdir_name, docker_image_filename) - if os.path.isfile(docker_image_path): + docker_image_path = self.install_path / subdir_name / docker_image_filename + if docker_image_path.is_file(): try: - os.remove(docker_image_path) + docker_image_path.unlink() success_message = f"Cached Docker image removed successfully from {docker_image_path}." logging.info(success_message) return DockerImageCacheResult(True, docker_image_path, success_message) diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 707cf832..f69895d0 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -40,14 +40,14 @@ def test_slurm(tmp_path: Path, scenario: Dict): test_scenario_path = scenario["path"] expected_dirs_number = scenario.get("expected_dirs_number") - log_file = scenario.get("log_file") - log_file_path = tmp_path / str(log_file) + log_file = scenario.get("log_file", ".") + log_file_path = tmp_path / log_file parser = Parser(Path("conf/common/system/example_slurm_cluster.toml"), Path("conf/common/test_template")) system, tests, test_scenario = parser.parse(Path("conf/common/test"), test_scenario_path) - system.output_path = str(tmp_path) + system.output_path = tmp_path assert test_scenario is not None, "Test scenario is None" - setup_logging(str(log_file_path), "DEBUG") + setup_logging(log_file_path, "DEBUG") handle_dry_run_and_run("dry-run", system, tests, test_scenario) # Find the directory that was created for the test results @@ -78,8 +78,8 @@ def slurm_system() -> SlurmSystem: system = SlurmSystem( name="test_system", - install_path="/fake/path", - output_path="/fake/output", + install_path=Path("/fake/path"), + 
output_path=Path("/fake/output"), default_partition="main", partitions={"main": nodes, "backup": backup_nodes}, ) diff --git a/tests/test_base_installer.py b/tests/test_base_installer.py index 9e13d8ae..0b84d83a 100644 --- a/tests/test_base_installer.py +++ b/tests/test_base_installer.py @@ -15,6 +15,7 @@ # limitations under the License. from concurrent.futures import Future +from pathlib import Path from unittest.mock import MagicMock, Mock, patch import pytest @@ -34,8 +35,8 @@ def slurm_system() -> SlurmSystem: system = SlurmSystem( name="test_system", - install_path="/fake/path", - output_path="/fake/output", + install_path=Path("/fake/path"), + output_path=Path("/fake/output"), default_partition="main", partitions={"main": nodes, "backup": backup_nodes}, ) diff --git a/tests/test_base_runner.py b/tests/test_base_runner.py index 6e0000da..1d8ef2fc 100644 --- a/tests/test_base_runner.py +++ b/tests/test_base_runner.py @@ -55,14 +55,14 @@ def test_setup_output_directory(mock_datetime_now, tmp_path): mock_test_scenario = MagicMock(spec=TestScenario) mock_test_scenario.name = scenario_name mock_system = MagicMock(spec=System) - mock_system.output_path = str(base_output_path) + mock_system.output_path = base_output_path mock_system.monitor_interval = 5 runner = MockRunner("run", mock_system, mock_test_scenario) assert base_output_path.exists() assert expected_path.exists() - assert runner.output_path == str(expected_path) + assert runner.output_path == expected_path def test_setup_output_directory_existing_base_path(mock_datetime_now, tmp_path): @@ -77,10 +77,10 @@ def test_setup_output_directory_existing_base_path(mock_datetime_now, tmp_path): mock_test_scenario = MagicMock(spec=TestScenario) mock_test_scenario.name = scenario_name mock_system = MagicMock(spec=System) - mock_system.output_path = str(base_output_path) + mock_system.output_path = base_output_path mock_system.monitor_interval = 5 runner = MockRunner("run", mock_system, mock_test_scenario) assert 
expected_path.exists() - assert runner.output_path == str(expected_path) + assert runner.output_path == expected_path diff --git a/tests/test_csv_report_tool.py b/tests/test_csv_report_tool.py index be5d7df4..1f90bb42 100644 --- a/tests/test_csv_report_tool.py +++ b/tests/test_csv_report_tool.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +from pathlib import Path import pandas as pd import pytest @@ -22,44 +22,44 @@ @pytest.fixture -def output_directory(tmpdir): - return tmpdir.mkdir("reports") +def output_directory(tmpdir) -> Path: + return Path(tmpdir.mkdir("reports")) @pytest.fixture -def example_dataframe(): +def example_dataframe() -> pd.DataFrame: data = {"A": [1, 2, 3], "B": [4, 5, 6]} return pd.DataFrame(data) -def test_set_dataframe(example_dataframe): - csv_tool = CSVReportTool(output_directory=".") +def test_set_dataframe(example_dataframe: pd.DataFrame): + csv_tool = CSVReportTool(output_directory=Path(".")) csv_tool.set_dataframe(example_dataframe) assert csv_tool.dataframe is not None, "The dataframe was not set." assert csv_tool.dataframe.equals(example_dataframe), "The dataframe was not set correctly." 
-def test_finalize_report_no_dataframe(output_directory): - csv_tool = CSVReportTool(output_directory=str(output_directory)) +def test_finalize_report_no_dataframe(output_directory: Path): + csv_tool = CSVReportTool(output_directory=output_directory) with pytest.raises(ValueError, match="No DataFrame has been set for the report."): - csv_tool.finalize_report("report.csv") + csv_tool.finalize_report(Path("report.csv")) -def test_finalize_report(output_directory, example_dataframe): - csv_tool = CSVReportTool(output_directory=str(output_directory)) +def test_finalize_report(output_directory: Path, example_dataframe: pd.DataFrame): + csv_tool = CSVReportTool(output_directory=output_directory) csv_tool.set_dataframe(example_dataframe) - output_filename = "report.csv" + output_filename = Path("report.csv") csv_tool.finalize_report(output_filename) - output_filepath = os.path.join(str(output_directory), output_filename) - assert os.path.exists(output_filepath), "The report file was not created." + output_filepath = output_directory / output_filename + assert output_filepath.exists(), "The report file was not created." saved_df = pd.read_csv(output_filepath) assert saved_df.equals(example_dataframe), "The saved dataframe does not match the original dataframe." -def test_finalize_report_resets_dataframe(output_directory, example_dataframe): - csv_tool = CSVReportTool(output_directory=str(output_directory)) +def test_finalize_report_resets_dataframe(output_directory: Path, example_dataframe: pd.DataFrame): + csv_tool = CSVReportTool(output_directory=output_directory) csv_tool.set_dataframe(example_dataframe) - csv_tool.finalize_report("report.csv") + csv_tool.finalize_report(Path("report.csv")) assert csv_tool.dataframe is None, "The dataframe was not reset after finalizing the report." 
diff --git a/tests/test_docker_image_cache_manager.py b/tests/test_docker_image_cache_manager.py index 1e9a29f5..7e2c7c9e 100644 --- a/tests/test_docker_image_cache_manager.py +++ b/tests/test_docker_image_cache_manager.py @@ -15,6 +15,7 @@ # limitations under the License. import subprocess +from pathlib import Path from unittest.mock import MagicMock, patch from cloudai.util.docker_image_cache_manager import ( @@ -24,61 +25,59 @@ ) -@patch("os.path.isfile") -@patch("os.path.exists") +@patch("pathlib.Path.is_file") +@patch("pathlib.Path.exists") @patch("os.access") -def test_ensure_docker_image_file_exists(mock_access, mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") - mock_isfile.return_value = True +def test_ensure_docker_image_file_exists(mock_access, mock_exists, mock_is_file): + manager = DockerImageCacheManager(Path("/fake/install/path"), True, "default") + mock_is_file.return_value = True mock_exists.return_value = True result = manager.ensure_docker_image("/tmp/existing_file.sqsh", "subdir", "docker_image.sqsh") assert result.success - assert result.docker_image_path == "/tmp/existing_file.sqsh" + assert result.docker_image_path == Path("/tmp/existing_file.sqsh") assert result.message == "Docker image file path is valid: /tmp/existing_file.sqsh." 
-@patch("os.path.isfile") -@patch("os.path.exists") +@patch("pathlib.Path.is_file") +@patch("pathlib.Path.exists") @patch("os.access") -def test_ensure_docker_image_url_cache_enabled(mock_access, mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") - mock_isfile.return_value = False +def test_ensure_docker_image_url_cache_enabled(mock_access, mock_exists, mock_is_file): + manager = DockerImageCacheManager(Path("/fake/install/path"), True, "default") + mock_is_file.return_value = False mock_exists.return_value = True mock_access.return_value = True with patch.object( manager, "cache_docker_image", return_value=DockerImageCacheResult( - True, "/fake/install/path/subdir/docker_image.sqsh", "Docker image cached successfully." + True, Path("/fake/install/path/subdir/docker_image.sqsh"), "Docker image cached successfully." ), ): result = manager.ensure_docker_image("docker.io/hello-world", "subdir", "docker_image.sqsh") assert result.success - assert result.docker_image_path == "/fake/install/path/subdir/docker_image.sqsh" + assert result.docker_image_path == Path("/fake/install/path/subdir/docker_image.sqsh") assert result.message == "Docker image cached successfully." 
-@patch("os.path.isfile") -@patch("os.path.exists") +@patch("pathlib.Path.is_file") +@patch("pathlib.Path.exists") @patch("os.access") -@patch("os.makedirs") @patch("subprocess.run") @patch("cloudai.util.docker_image_cache_manager.DockerImageCacheManager._check_prerequisites") -def test_cache_docker_image(mock_check_prerequisites, mock_run, mock_makedirs, mock_access, mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") +def test_cache_docker_image(mock_check_prerequisites, mock_run, mock_access, mock_exists, mock_is_file): + manager = DockerImageCacheManager(Path("/fake/install/path"), True, "default") # Test when cached file already exists - mock_isfile.return_value = True + mock_is_file.return_value = True result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") assert result.success assert result.message == "Cached Docker image already exists at /fake/install/path/subdir/image.tar.gz." # Test creating subdirectory when it doesn't exist - mock_isfile.return_value = False - mock_exists.side_effect = [True, False, False] # install_path exists, subdir_path does not - with patch("os.makedirs") as mock_makedirs: - result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") - mock_makedirs.assert_called_once_with("/fake/install/path/subdir") + mock_is_file.return_value = False + mock_exists.side_effect = [True, False, True] # install_path exists, subdir_path does not, install_path again + result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") + assert (manager.install_path / "subdir").exists(), "Subdirectory should exist after caching the image" # Ensure prerequisites are always met for the following tests mock_check_prerequisites.return_value = PrerequisiteCheckResult(True, "All prerequisites are met.") @@ -88,7 +87,7 @@ def test_cache_docker_image(mock_check_prerequisites, mock_run, mock_makedirs, m mock_exists.side_effect = None 
# Test caching success with subprocess command (removal of default partition keyword) - mock_isfile.return_value = False + mock_is_file.return_value = False mock_exists.side_effect = [True, True, True, True, True] # Ensure all path checks return True mock_run.return_value = subprocess.CompletedProcess(args=["cmd"], returncode=0, stderr="") result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") @@ -103,13 +102,13 @@ def test_cache_docker_image(mock_check_prerequisites, mock_run, mock_makedirs, m assert result.message == "Docker image cached successfully at /fake/install/path/subdir/image.tar.gz." # Test caching failure due to subprocess error - mock_isfile.return_value = False + mock_is_file.return_value = False mock_run.side_effect = subprocess.CalledProcessError(1, "cmd") result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") assert not result.success # Test caching failure due to disk-related errors - mock_isfile.return_value = False + mock_is_file.return_value = False mock_run.side_effect = None mock_run.return_value = subprocess.CompletedProcess(args=["cmd"], returncode=1, stderr="Disk quota exceeded\n") mock_exists.side_effect = [True, True, True, True, True] @@ -123,68 +122,74 @@ def test_cache_docker_image(mock_check_prerequisites, mock_run, mock_makedirs, m assert "Write error" in result.message -@patch("os.path.isfile") -@patch("os.path.exists") -@patch("os.access") -@patch("os.remove") -def test_remove_cached_image(mock_remove, mock_access, mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") +@patch("pathlib.Path.unlink") +@patch("pathlib.Path.is_file") +def test_remove_cached_image(mock_is_file, mock_unlink): + # Mock setup + manager = DockerImageCacheManager(Path("/fake/install/path"), True, "default") # Test successful removal - mock_isfile.return_value = True + mock_is_file.return_value = True result = manager.remove_cached_image("subdir", 
"image.tar.gz") assert result.success assert result.message == "Cached Docker image removed successfully from /fake/install/path/subdir/image.tar.gz." - mock_remove.assert_called_once_with("/fake/install/path/subdir/image.tar.gz") + mock_unlink.assert_called_once() # Test failed removal due to OSError - mock_remove.side_effect = OSError("Mocked OSError") + mock_unlink.side_effect = OSError("Mocked OSError") result = manager.remove_cached_image("subdir", "image.tar.gz") assert not result.success assert "Failed to remove cached Docker image" in result.message # Test no file to remove - mock_isfile.return_value = False + mock_is_file.return_value = False result = manager.remove_cached_image("subdir", "image.tar.gz") assert result.success assert result.message == "No cached Docker image found to remove at /fake/install/path/subdir/image.tar.gz." -@patch("os.path.isfile") -@patch("os.path.exists") +@patch("pathlib.Path.unlink") +@patch("pathlib.Path.is_file") +@patch("pathlib.Path.exists") +@patch("pathlib.Path.iterdir") @patch("os.access") -@patch("os.remove") -def test_uninstall_cached_image(mock_remove, mock_access, mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") +def test_uninstall_cached_image(mock_access, mock_iterdir, mock_exists, mock_is_file, mock_unlink): + manager = DockerImageCacheManager(Path("/fake/install/path"), True, "default") # Test successful uninstallation and subdirectory removal - mock_isfile.return_value = True + mock_is_file.return_value = True mock_exists.return_value = True mock_access.return_value = True - mock_listdir = patch("os.listdir", return_value=[]).start() + mock_iterdir.return_value = iter([]) # Simulate empty directory result = manager.uninstall_cached_image("subdir", "image.tar.gz") + + # Ensure the unlink call was successful + mock_unlink.assert_called_once_with() + + # Ensure the directory is empty before attempting to remove it + assert list(mock_iterdir()) == [] + assert 
result.success assert result.message in [ "Cached Docker image uninstalled successfully from /fake/install/path/subdir.", "Subdirectory removed successfully: /fake/install/path/subdir.", ] - mock_remove.assert_called_once_with("/fake/install/path/subdir/image.tar.gz") # Test successful uninstallation but subdirectory not empty - mock_listdir.return_value = ["otherfile"] + mock_iterdir.return_value = iter([Path("otherfile")]) # Simulate directory with other files result = manager.uninstall_cached_image("subdir", "image.tar.gz") assert result.success assert result.message == "Cached Docker image uninstalled successfully from /fake/install/path/subdir." - mock_remove.assert_called_with("/fake/install/path/subdir/image.tar.gz") + mock_unlink.assert_called_with() # Test failed removal due to OSError - mock_remove.side_effect = OSError("Mocked OSError") + mock_unlink.side_effect = OSError("Mocked OSError") result = manager.uninstall_cached_image("subdir", "image.tar.gz") assert not result.success assert "Failed to remove cached Docker image" in result.message # Test no subdirectory to remove - mock_isfile.return_value = False + mock_is_file.return_value = False mock_exists.return_value = False result = manager.uninstall_cached_image("subdir", "image.tar.gz") assert result.success @@ -197,7 +202,7 @@ def test_uninstall_cached_image(mock_remove, mock_access, mock_exists, mock_isfi @patch("shutil.which") @patch("cloudai.util.docker_image_cache_manager.DockerImageCacheManager._check_docker_image_accessibility") def test_check_prerequisites(mock_check_docker_image_accessibility, mock_which): - manager = DockerImageCacheManager("/fake/install/path", True, "default") + manager = DockerImageCacheManager(Path("/fake/install/path"), True, "default") # Ensure enroot and srun are installed mock_which.side_effect = lambda x: x in ["enroot", "srun"] @@ -238,7 +243,7 @@ def test_check_prerequisites(mock_check_docker_image_accessibility, mock_which): @patch("shutil.which") 
@patch("tempfile.TemporaryDirectory") def test_check_docker_image_accessibility_with_enroot(mock_tempdir, mock_which, mock_popen): - manager = DockerImageCacheManager("/fake/install/path", True, "default") + manager = DockerImageCacheManager(Path("/fake/install/path"), True, "default") # Ensure docker binary is not available mock_which.return_value = None @@ -246,7 +251,7 @@ def test_check_docker_image_accessibility_with_enroot(mock_tempdir, mock_which, # Mocking the TemporaryDirectory context manager mock_temp_dir = MagicMock() mock_tempdir.return_value.__enter__.return_value = mock_temp_dir - temp_dir_path = "/fake/temp/dir" + temp_dir_path = Path("/fake/temp/dir") mock_temp_dir.__enter__.return_value = temp_dir_path # Mock Popen for enroot command with success scenario @@ -294,43 +299,3 @@ def test_check_docker_image_accessibility_with_enroot(mock_tempdir, mock_which, command = mock_popen.call_args[0][0] assert "enroot import -o" in command assert "docker://docker.io/hello-world" in command - - -@patch("os.path.isfile") -@patch("os.path.exists") -def test_check_docker_image_exists_with_only_cached_file(mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") - - # Simulate only the cached file being a valid file path - mock_isfile.side_effect = lambda path: path == "/fake/install/path/subdir/docker_image.sqsh" - mock_exists.side_effect = lambda path: path in [ - "/fake/install/path", - "/fake/install/path/subdir", - "/fake/install/path/subdir/docker_image.sqsh", - ] - - result = manager.check_docker_image_exists("/tmp/non_existing_file.sqsh", "subdir", "docker_image.sqsh") - assert result.success - assert result.docker_image_path == "/fake/install/path/subdir/docker_image.sqsh" - assert result.message == "Cached Docker image already exists at /fake/install/path/subdir/docker_image.sqsh." 
- - -@patch("os.path.isfile") -@patch("os.path.exists") -def test_check_docker_image_exists_with_both_valid_files(mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") - - # Simulate both cached file and docker image URL being valid file paths - mock_isfile.side_effect = lambda path: path in [ - "/tmp/existing_file.sqsh", - "/fake/install/path/subdir/docker_image.sqsh", - ] - mock_exists.side_effect = lambda path: path in [ - "/tmp/existing_file.sqsh", - "/fake/install/path/subdir/docker_image.sqsh", - ] - - result = manager.check_docker_image_exists("/tmp/existing_file.sqsh", "subdir", "docker_image.sqsh") - assert result.success - assert result.docker_image_path == "/tmp/existing_file.sqsh" - assert result.message == "Docker image file path is valid: /tmp/existing_file.sqsh." diff --git a/tests/test_job_status_retrieval_strategy.py b/tests/test_job_status_retrieval_strategy.py index 858c77e2..1b910390 100644 --- a/tests/test_job_status_retrieval_strategy.py +++ b/tests/test_job_status_retrieval_strategy.py @@ -29,7 +29,7 @@ def setup_method(self) -> None: def test_no_stdout_file(self, tmp_path: Path) -> None: """Test that job status is False when no stdout.txt file is present.""" - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( f"stdout.txt file not found in the specified output directory {tmp_path}. 
" @@ -50,7 +50,7 @@ def test_successful_job(self, tmp_path: Path) -> None: # Some final output """ stdout_file.write_text(stdout_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert result.is_successful assert result.error_message == "" @@ -63,7 +63,7 @@ def test_failed_job(self, tmp_path: Path) -> None: # Some final output without success indicators """ stdout_file.write_text(stdout_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( f"Missing success indicators in {stdout_file}: '# Out of bounds values', '# Avg bus bandwidth'. " @@ -88,7 +88,7 @@ def test_nccl_failure_job(self, tmp_path: Path) -> None: .. node pid: Test failure common.cu:844 """ stdout_file.write_text(stdout_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( f"NCCL test failure detected in {stdout_file}. " @@ -105,7 +105,7 @@ def test_generic_test_failure_job(self, tmp_path: Path) -> None: .. node pid: Test failure common.cu:401 """ stdout_file.write_text(stdout_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( f"Test failure detected in {stdout_file}. " @@ -124,10 +124,10 @@ def setup_method(self) -> None: def test_no_profile_stderr_file(self, tmp_path: Path) -> None: """Test that job status is False when no profile_stderr.txt file is present.""" - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( - f"profile_stderr.txt file not found in the specified output directory, {str(tmp_path)}. " + f"profile_stderr.txt file not found in the specified output directory, {tmp_path}. 
" "This file is expected to be created during the profiling stage. " "Please ensure the profiling stage completed successfully. " "Run the generated sbatch script manually to debug." @@ -138,7 +138,7 @@ def test_missing_pax_status_keyword(self, tmp_path: Path) -> None: profile_stderr_file = tmp_path / "profile_stderr.txt" profile_stderr_content = "Some initialization output\nMore output\nFinal output without the expected keyword" profile_stderr_file.write_text(profile_stderr_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( "The profiling stage completed but did not generate the expected '[PAX STATUS]: E2E time: " @@ -152,10 +152,10 @@ def test_no_error_files(self, tmp_path: Path) -> None: profile_stderr_file = tmp_path / "profile_stderr.txt" profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling" profile_stderr_file.write_text(profile_stderr_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( - f"No 'error-*.txt' files found in the output directory, {str(tmp_path)}. There are two stages in the Grok " + f"No 'error-*.txt' files found in the output directory, {tmp_path}. There are two stages in the Grok " "run. The profiling stage passed successfully, but something went wrong in the actual run stage. " "Please ensure the actual run stage completed successfully. " "Run the generated sbatch script manually to debug." 
@@ -168,7 +168,7 @@ def test_cuda_no_device_error_in_profile_stderr(self, tmp_path: Path) -> None: profile_stderr_content += "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected" profile_stderr_file.write_text(profile_stderr_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing " @@ -190,10 +190,10 @@ def test_missing_e2e_time_keyword(self, tmp_path: Path) -> None: error_content = "Some initialization output\nMore output\nFinal output without the expected keyword" error_file.write_text(error_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( - f"The file {str(error_file)} does not contain the expected 'E2E time: Elapsed time for' keyword at the " + f"The file {error_file} does not contain the expected 'E2E time: Elapsed time for' keyword at the " "end. This indicates the actual run did not complete successfully. " "Please debug this manually to ensure the actual run stage completes as expected." ) @@ -208,7 +208,7 @@ def test_cuda_no_device_error_in_error_file(self, tmp_path: Path) -> None: error_content = "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected" error_file.write_text(error_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. 
This may be due to missing " @@ -230,7 +230,7 @@ def test_successful_job(self, tmp_path: Path) -> None: error_content = "Some initialization output\nMore output\nE2E time: Elapsed time for actual run\nFinal output" error_file.write_text(error_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert result.is_successful assert result.error_message == "" @@ -241,7 +241,7 @@ def test_nccl_group_end_error_in_profile_stderr(self, tmp_path: Path) -> None: profile_stderr_content += "NCCL operation ncclGroupEnd() failed: unhandled system error" profile_stderr_file.write_text(profile_stderr_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( "NCCL operation ncclGroupEnd() failed: unhandled system error. Please check if the NCCL-test " @@ -258,7 +258,7 @@ def test_nccl_group_end_error_in_error_file(self, tmp_path: Path) -> None: error_content = "NCCL operation ncclGroupEnd() failed: unhandled system error" error_file.write_text(error_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( "NCCL operation ncclGroupEnd() failed: unhandled system error. Please check if the NCCL-test " @@ -272,12 +272,12 @@ def test_heartbeat_error_in_profile_stderr(self, tmp_path: Path) -> None: profile_stderr_content += "Terminating process because the coordinator detected missing heartbeats" profile_stderr_file.write_text(profile_stderr_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( "Terminating process because the coordinator detected missing heartbeats. This most likely " - f"indicates that another task died. 
Please review the file at {str(profile_stderr_file)} and any relevant " - f"logs in {str(tmp_path)}. Ensure the servers allocated for this task can reach each other with their " + f"indicates that another task died. Please review the file at {profile_stderr_file} and any relevant " + f"logs in {tmp_path}. Ensure the servers allocated for this task can reach each other with their " "hostnames, and they can open any ports and reach others' ports." ) @@ -291,12 +291,12 @@ def test_heartbeat_error_in_error_file(self, tmp_path: Path) -> None: error_content = "Terminating process because the coordinator detected missing heartbeats" error_file.write_text(error_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( "Terminating process because the coordinator detected missing heartbeats. This most likely " - f"indicates that another task died. Please review the file at {str(error_file)} and any relevant logs in" - f" {str(tmp_path)}. Ensure the servers allocated for this task can reach each other with their " + f"indicates that another task died. Please review the file at {error_file} and any relevant logs in" + f" {tmp_path}. Ensure the servers allocated for this task can reach each other with their " "hostnames, and they can open any ports and reach others' ports." ) @@ -310,7 +310,7 @@ def test_pyxis_mktemp_error_in_profile_stderr(self, tmp_path: Path) -> None: ) profile_stderr_file.write_text(profile_stderr_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( "pyxis: mktemp: failed to create directory via template. 
This is due to insufficient disk cache " @@ -333,7 +333,7 @@ def test_pyxis_mktemp_error_in_error_file(self, tmp_path: Path) -> None: ) error_file.write_text(error_content) - result = self.js.get_job_status(str(tmp_path)) + result = self.js.get_job_status(tmp_path) assert not result.is_successful assert result.error_message == ( "pyxis: mktemp: failed to create directory via template. This is due to insufficient disk cache " diff --git a/tests/test_report_generation_strategy.py b/tests/test_report_generation_strategy.py index 764fe24b..0627c115 100644 --- a/tests/test_report_generation_strategy.py +++ b/tests/test_report_generation_strategy.py @@ -30,12 +30,12 @@ def setup_method(self) -> None: def test_no_files(self, tmp_path: Path) -> None: """Test that no times are extracted when no files are present.""" - assert self.js._extract_times(str(tmp_path)) == [] + assert self.js._extract_times(tmp_path) == [] def test_no_matches(self, tmp_path: Path) -> None: """Test that no times are extracted when no matching lines are present.""" (tmp_path / "error-1.txt").write_text("fake line") - assert self.js._extract_times(str(tmp_path)) == [] + assert self.js._extract_times(tmp_path) == [] def test_one_match(self, tmp_path: Path) -> None: """Test that the correct time is extracted when one matching line is present.""" @@ -47,4 +47,4 @@ def test_one_match(self, tmp_path: Path) -> None: with err_file.open("w") as f: for _ in range(11): f.write(sample_line) - assert self.js._extract_times(str(err_file.parent)) == [38.727223] + assert self.js._extract_times(err_file.parent) == [38.727223] diff --git a/tests/test_slurm_command_gen_strategy.py b/tests/test_slurm_command_gen_strategy.py index 75b8d811..a375438b 100644 --- a/tests/test_slurm_command_gen_strategy.py +++ b/tests/test_slurm_command_gen_strategy.py @@ -32,8 +32,8 @@ def slurm_system(tmp_path: Path) -> SlurmSystem: slurm_system = SlurmSystem( name="TestSystem", - install_path=str(tmp_path / "install"), - 
output_path=str(tmp_path / "output"), + install_path=tmp_path / "install", + output_path=tmp_path / "output", default_partition="main", extra_srun_args="", partitions={ @@ -46,8 +46,8 @@ def slurm_system(tmp_path: Path) -> SlurmSystem: }, mpi="fake-mpi", ) - Path(slurm_system.install_path).mkdir() - Path(slurm_system.output_path).mkdir() + slurm_system.install_path.mkdir() + slurm_system.output_path.mkdir() return slurm_system @@ -81,13 +81,13 @@ def test_filename_generation(strategy_fixture: SlurmCommandGenStrategy, tmp_path args = {"job_name": "test_job", "num_nodes": 2, "partition": "test_partition", "node_list_str": "node1,node2"} env_vars_str = "export TEST_VAR=VALUE" srun_command = "srun --test test_arg" - output_path = str(tmp_path) + output_path = tmp_path sbatch_command = strategy_fixture._write_sbatch_script(args, env_vars_str, srun_command, output_path) filepath_from_command = sbatch_command.split()[-1] # Check that the file exists at the specified path - assert tmp_path.joinpath("cloudai_sbatch_script.sh").exists() + assert output_path.joinpath("cloudai_sbatch_script.sh").exists() # Read the file and check the contents with open(filepath_from_command, "r") as file: @@ -315,7 +315,7 @@ def test_extra_env_vars_added(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStr cmd_args=cmd_args, extra_env_vars=extra_env_vars, extra_cmd_args="", - output_path="", + output_path=Path(""), num_nodes=1, nodes=[], ) @@ -335,7 +335,7 @@ def test_env_var_escaping(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrateg cmd_args=cmd_args, extra_env_vars=extra_env_vars, extra_cmd_args="", - output_path="", + output_path=Path(""), num_nodes=1, nodes=[], ) @@ -357,7 +357,7 @@ def test_tokenizer_handled(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrate cmd_args=cmd_args, extra_env_vars=extra_env_vars, extra_cmd_args=f"training.model.tokenizer.model={tokenizer_path}", - output_path="", + output_path=Path(""), num_nodes=1, nodes=[], ) @@ -377,7 +377,7 @@ def 
test_reservation_handled(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStra cmd_args=cmd_args, extra_cmd_args="", extra_env_vars=extra_env_vars, - output_path="", + output_path=Path(""), num_nodes=1, nodes=[], ) @@ -391,7 +391,7 @@ def test_invalid_tokenizer_path(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenS "repository_url": "fake", "repository_commit_hash": "fake", } - invalid_tokenizer_path = "/invalid/path/to/tokenizer" + invalid_tokenizer_path = Path("/invalid/path/to/tokenizer") with pytest.raises( ValueError, @@ -406,7 +406,7 @@ def test_invalid_tokenizer_path(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenS cmd_args=cmd_args, extra_env_vars=extra_env_vars, extra_cmd_args=f"training.model.tokenizer.model={invalid_tokenizer_path}", - output_path="", + output_path=Path(""), num_nodes=1, nodes=[], ) @@ -439,12 +439,12 @@ def test_raises_on_missing_args(self, missing_arg: str, strategy_fixture: SlurmC del args[missing_arg] with pytest.raises(KeyError) as exc_info: - strategy_fixture._write_sbatch_script(args, self.env_vars_str, self.srun_command, str(tmp_path)) + strategy_fixture._write_sbatch_script(args, self.env_vars_str, self.srun_command, tmp_path) assert f"KeyError('{missing_arg}')" in str(exc_info) def test_only_mandatory_args(self, strategy_fixture: SlurmCommandGenStrategy, tmp_path: Path): sbatch_command = strategy_fixture._write_sbatch_script( - self.MANDATORY_ARGS, self.env_vars_str, self.srun_command, str(tmp_path) + self.MANDATORY_ARGS, self.env_vars_str, self.srun_command, tmp_path ) filepath_from_command = sbatch_command.split()[-1] @@ -482,9 +482,7 @@ def test_extra_args( args = self.MANDATORY_ARGS.copy() args[arg] = arg_value - sbatch_command = strategy_fixture._write_sbatch_script( - args, self.env_vars_str, self.srun_command, str(tmp_path) - ) + sbatch_command = strategy_fixture._write_sbatch_script(args, self.env_vars_str, self.srun_command, tmp_path) filepath_from_command = sbatch_command.split()[-1] with open(filepath_from_command, "r") 
as file: @@ -498,9 +496,7 @@ def test_disable_output_and_error(self, add_arg: str, strategy_fixture: SlurmCom args = self.MANDATORY_ARGS.copy() args[add_arg] = "fake" - sbatch_command = strategy_fixture._write_sbatch_script( - args, self.env_vars_str, self.srun_command, str(tmp_path) - ) + sbatch_command = strategy_fixture._write_sbatch_script(args, self.env_vars_str, self.srun_command, tmp_path) filepath_from_command = sbatch_command.split()[-1] with open(filepath_from_command, "r") as file: diff --git a/tests/test_slurm_install_strategy.py b/tests/test_slurm_install_strategy.py index 35d26637..4a2dcd24 100644 --- a/tests/test_slurm_install_strategy.py +++ b/tests/test_slurm_install_strategy.py @@ -34,8 +34,8 @@ def slurm_system(tmp_path: Path) -> SlurmSystem: slurm_system = SlurmSystem( name="TestSystem", - install_path=str(tmp_path / "install"), - output_path=str(tmp_path / "output"), + install_path=tmp_path / "install", + output_path=tmp_path / "output", default_partition="main", partitions={ "main": [ diff --git a/tests/test_slurm_report_generation_strategy.py b/tests/test_slurm_report_generation_strategy.py index 41c76702..198c8ea6 100644 --- a/tests/test_slurm_report_generation_strategy.py +++ b/tests/test_slurm_report_generation_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. 
import os +from pathlib import Path import pandas as pd import pytest @@ -75,7 +76,7 @@ def test_nccl_report_generation(setup_test_environment): assert strategy.can_handle_directory(test_dir) is True # Generate the report - strategy.generate_report("nccl_test", test_dir) + strategy.generate_report("nccl_test", Path(test_dir)) # Verify the CSV report csv_report_path = os.path.join(test_dir, "cloudai_nccl_test_csv_report.csv") diff --git a/tests/test_slurm_system.py b/tests/test_slurm_system.py index b32cc0a6..f802bb5b 100644 --- a/tests/test_slurm_system.py +++ b/tests/test_slurm_system.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path from typing import List from unittest.mock import patch @@ -31,8 +32,8 @@ def slurm_system(): system = SlurmSystem( name="test_system", - install_path="/fake/path", - output_path="/fake/output", + install_path=Path("/fake/path"), + output_path=Path("/fake/output"), default_partition="main", partitions={"main": nodes, "backup": backup_nodes}, ) diff --git a/tests/test_slurm_system_parser.py b/tests/test_slurm_system_parser.py index db366cf1..d4d274a1 100644 --- a/tests/test_slurm_system_parser.py +++ b/tests/test_slurm_system_parser.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from pathlib import Path from typing import Any, Dict import pytest @@ -61,8 +62,8 @@ def test_parse_slurm_system_parser_with_mpi(example_data, mpi_value, expected_mp assert isinstance(slurm_system, SlurmSystem) assert slurm_system.name == "test_system" - assert slurm_system.install_path == "/fake/path" - assert slurm_system.output_path == "/fake/output" + assert slurm_system.install_path == Path("/fake/path") + assert slurm_system.output_path == Path("/fake/output") assert slurm_system.default_partition == "main" assert slurm_system.cache_docker_images_locally is True assert "main" in slurm_system.partitions From 770c444318c4d04eafd2d4dfd6036da8bdf7236a Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 26 Aug 2024 11:30:29 -0400 Subject: [PATCH 2/4] Reflect Andrei's comments --- src/cloudai/__main__.py | 2 +- .../parser/system_parser/slurm_system_parser.py | 5 ++--- .../system_parser/standalone_system_parser.py | 3 +-- .../tool/tensorboard_data_reader.py | 16 ++++++---------- .../jax_toolbox/slurm_command_gen_strategy.py | 6 +----- tests/test_slurm_report_generation_strategy.py | 5 +++-- 6 files changed, 14 insertions(+), 23 deletions(-) diff --git a/src/cloudai/__main__.py b/src/cloudai/__main__.py index c5146f4b..fcce2238 100644 --- a/src/cloudai/__main__.py +++ b/src/cloudai/__main__.py @@ -274,7 +274,7 @@ def main() -> None: system, tests, test_scenario = parser.parse(tests_dir, test_scenario_path) if output_dir: - system.output_path = Path(output_dir.absolute()) + system.output_path = output_dir.absolute() system.update() if args.mode in ["install", "uninstall"]: diff --git a/src/cloudai/parser/system_parser/slurm_system_parser.py b/src/cloudai/parser/system_parser/slurm_system_parser.py index 1236793a..27bdd696 100644 --- a/src/cloudai/parser/system_parser/slurm_system_parser.py +++ b/src/cloudai/parser/system_parser/slurm_system_parser.py @@ -14,7 +14,6 @@ # See the License for the specific language 
governing permissions and # limitations under the License. -import os from pathlib import Path from typing import Any, Dict, List @@ -58,12 +57,12 @@ def str_to_bool(value: Any) -> bool: install_path = data.get("install_path") if not install_path: raise ValueError("Field 'install_path' is required.") - install_path = Path(os.path.abspath(install_path)) + install_path = Path(install_path).absolute() output_path = data.get("output_path") if not output_path: raise ValueError("Field 'output_path' is required.") - output_path = Path(os.path.abspath(output_path)) + output_path = Path(output_path).absolute() default_partition = data.get("default_partition") if not default_partition: diff --git a/src/cloudai/parser/system_parser/standalone_system_parser.py b/src/cloudai/parser/system_parser/standalone_system_parser.py index 4bcd24d0..4e8e4869 100644 --- a/src/cloudai/parser/system_parser/standalone_system_parser.py +++ b/src/cloudai/parser/system_parser/standalone_system_parser.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from pathlib import Path from typing import Any, Dict @@ -46,6 +45,6 @@ def parse(self, data: Dict[str, Any]) -> StandaloneSystem: output_path = data.get("output_path") if not output_path: raise ValueError("Field 'output_path' is required.") - output_path = Path(os.path.abspath(output_path)) + output_path = Path(output_path).absolute() return StandaloneSystem(name=name, output_path=output_path) diff --git a/src/cloudai/report_generator/tool/tensorboard_data_reader.py b/src/cloudai/report_generator/tool/tensorboard_data_reader.py index dc67891d..2f355483 100644 --- a/src/cloudai/report_generator/tool/tensorboard_data_reader.py +++ b/src/cloudai/report_generator/tool/tensorboard_data_reader.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from pathlib import Path from typing import List, Tuple @@ -43,15 +42,12 @@ def extract_data(self, tag: str) -> List[Tuple[int, float]]: from tbparse import SummaryReader # lazy import to improve overall performance data = [] - for root, _, files in os.walk(self.directory_path): - for file in files: - if file.startswith("events.out.tfevents"): - path = Path(root) / file - reader = SummaryReader(str(path)) - df = reader.scalars - if tag in df["tag"].values: - filtered_data = df[df["tag"] == tag] - data.extend(filtered_data[["step", "value"]].values) + for path in self.directory_path.rglob("events.out.tfevents*"): + reader = SummaryReader(str(path)) + df = reader.scalars + if tag in df["tag"].values: + filtered_data = df[df["tag"] == tag] + data.extend(filtered_data[["step", "value"]].values) return data diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index 9df231a3..f30d815b 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -216,7 +216,7 @@ def _create_run_script( env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str, - ) -> str: + ) -> None: """ Generate and writes the run.sh script to the specified output directory. @@ -230,9 +230,6 @@ def _create_run_script( cmd_args (Dict[str, str]): Command-line arguments. extra_cmd_args (str): Additional command-line arguments to be included in the srun command. - - Returns: - str: The path to the run.sh script that was created. 
""" test_name = self.test_name @@ -267,7 +264,6 @@ def set_xla_flags(test_name: str, profile_enabled: bool): with open(run_script_path, "w") as run_file: run_file.write("\n".join(run_script_content)) os.chmod(run_script_path, 0o755) - return str(run_script_path) def _script_content( self, diff --git a/tests/test_slurm_report_generation_strategy.py b/tests/test_slurm_report_generation_strategy.py index 198c8ea6..8f1875cd 100644 --- a/tests/test_slurm_report_generation_strategy.py +++ b/tests/test_slurm_report_generation_strategy.py @@ -23,9 +23,10 @@ @pytest.fixture -def setup_test_environment(tmpdir): +def setup_test_environment(tmp_path: Path): # Create a temporary directory for the test - test_dir = tmpdir.mkdir("test_env") + test_dir = tmp_path / "test_env" + test_dir.mkdir() # Create the mock stdout.txt file stdout_content = """ From f842597bee1e5127b8abcd58990c0ec80a97f96c Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 26 Aug 2024 11:30:29 -0400 Subject: [PATCH 3/4] Reflect Andrei's comments --- src/cloudai/_core/base_runner.py | 2 +- tests/test_slurm_report_generation_strategy.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/cloudai/_core/base_runner.py b/src/cloudai/_core/base_runner.py index 1f34b614..d1f8e93c 100644 --- a/src/cloudai/_core/base_runner.py +++ b/src/cloudai/_core/base_runner.py @@ -263,7 +263,7 @@ def get_job_output_path(self, test: Test) -> Path: if not self.output_path.exists(): raise FileNotFoundError(f"Output directory {self.output_path} does not exist") - job_output_path = None # Initialize the variable + job_output_path = Path() # avoid reportPossiblyUnboundVariable from pyright try: assert test.section_name is not None, "test.section_name must not be None" diff --git a/tests/test_slurm_report_generation_strategy.py b/tests/test_slurm_report_generation_strategy.py index 8f1875cd..c27d137d 100644 --- 
a/tests/test_slurm_report_generation_strategy.py +++ b/tests/test_slurm_report_generation_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from pathlib import Path import pandas as pd @@ -60,9 +59,8 @@ def setup_test_environment(tmp_path: Path): # Avg bus bandwidth : 111.111 # """ - stdout_path = os.path.join(test_dir, "stdout.txt") - with open(stdout_path, "w") as f: - f.write(stdout_content) + stdout_path = test_dir / "stdout.txt" + stdout_path.write_text(stdout_content) return test_dir @@ -77,11 +75,11 @@ def test_nccl_report_generation(setup_test_environment): assert strategy.can_handle_directory(test_dir) is True # Generate the report - strategy.generate_report("nccl_test", Path(test_dir)) + strategy.generate_report("nccl_test", test_dir) # Verify the CSV report - csv_report_path = os.path.join(test_dir, "cloudai_nccl_test_csv_report.csv") - assert os.path.isfile(csv_report_path), "CSV report was not generated." + csv_report_path = test_dir / "cloudai_nccl_test_csv_report.csv" + assert csv_report_path.is_file(), "CSV report was not generated." 
# Read the CSV and validate the content df = pd.read_csv(csv_report_path) From 1f5b36edb2761d09dc9b1a026d39ea442c3ee545 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 26 Aug 2024 11:30:29 -0400 Subject: [PATCH 4/4] Reflect Andrei's comments Co-authored-by: Andrei Maslennikov --- src/cloudai/_core/parser.py | 2 +- src/cloudai/_core/system_parser.py | 14 ++--- src/cloudai/installer/slurm_installer.py | 27 +++++---- .../jax_toolbox/slurm_command_gen_strategy.py | 11 ++-- .../jax_toolbox/slurm_install_strategy.py | 5 +- .../nccl_test/slurm_command_gen_strategy.py | 3 +- .../nccl_test/slurm_install_strategy.py | 7 +-- .../nemo_launcher/slurm_install_strategy.py | 59 +++++++++---------- .../ucc_test/slurm_install_strategy.py | 7 +-- src/cloudai/util/command_shell.py | 12 ++-- tests/test_slurm_install_strategy.py | 8 +-- 11 files changed, 75 insertions(+), 80 deletions(-) diff --git a/src/cloudai/_core/parser.py b/src/cloudai/_core/parser.py index 3cec818d..9ee1854d 100644 --- a/src/cloudai/_core/parser.py +++ b/src/cloudai/_core/parser.py @@ -65,7 +65,7 @@ def parse( if not test_path.exists(): raise FileNotFoundError(f"Test path '{test_path}' not found.") - system_parser = SystemParser(str(self.system_config_path)) + system_parser = SystemParser(self.system_config_path) system = system_parser.parse() logging.debug("Parsed system config") diff --git a/src/cloudai/_core/system_parser.py b/src/cloudai/_core/system_parser.py index 64616400..4b93fd8a 100644 --- a/src/cloudai/_core/system_parser.py +++ b/src/cloudai/_core/system_parser.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +from pathlib import Path import toml @@ -29,19 +29,19 @@ class SystemParser: Attributes _parsers (Dict[str, Type[BaseSystemParser]]): A mapping from system types to their corresponding parser classes. 
- file_path (str): The file path to the system configuration file. + file_path (Path): The file path to the system configuration file. """ _parsers = {} - def __init__(self, file_path: str): + def __init__(self, file_path: Path): """ Initialize a SystemParser instance. Args: - file_path (str): The file path to the system configuration file. + file_path (Path): The file path to the system configuration file. """ - self.file_path: str = file_path + self.file_path: Path = file_path def parse(self) -> System: """ @@ -55,10 +55,10 @@ def parse(self) -> System: Returns System: The parsed system object. """ - if not os.path.isfile(self.file_path): + if not self.file_path.is_file(): raise FileNotFoundError(f"The file '{self.file_path}' does not exist.") - with open(self.file_path, "r") as file: + with self.file_path.open("r") as file: data = toml.load(file) scheduler = data.get("scheduler", "").lower() registry = Registry() diff --git a/src/cloudai/installer/slurm_installer.py b/src/cloudai/installer/slurm_installer.py index a944be1a..ec260b1a 100644 --- a/src/cloudai/installer/slurm_installer.py +++ b/src/cloudai/installer/slurm_installer.py @@ -17,6 +17,7 @@ import contextlib import os import subprocess +from pathlib import Path from typing import Dict, Iterable, cast import toml @@ -35,9 +36,9 @@ class SlurmInstaller(BaseInstaller): CONFIG_FILE_NAME (str): The name of the configuration file. PREREQUISITES (List[str]): A list of required binaries for the installer. REQUIRED_SRUN_OPTIONS (List[str]): A list of required srun options to check. - install_path (str): Path where the benchmarks are to be installed. This is optional since uninstallation does + install_path (Path): Path where the benchmarks are to be installed. This is optional since uninstallation does not require it. - config_path (str): Path to the installation configuration file. + config_path (Path): Path to the installation configuration file. 
""" CONFIG_FILE_NAME = ".cloudai.toml" @@ -60,7 +61,7 @@ def __init__(self, system: System): super().__init__(system) slurm_system = cast(SlurmSystem, self.system) self.install_path = slurm_system.install_path - self.config_path = os.path.join(os.path.expanduser("~"), self.CONFIG_FILE_NAME) + self.config_path = Path.home() / self.CONFIG_FILE_NAME def _check_prerequisites(self) -> InstallStatusResult: """ @@ -108,16 +109,16 @@ def _check_srun_options(self) -> None: def _write_config(self) -> InstallStatusResult: """Write the installation configuration to a TOML file atomically.""" - absolute_install_path = os.path.abspath(self.install_path) - config_data: Dict[str, str] = {"install_path": absolute_install_path} + absolute_install_path = self.install_path.resolve() + config_data: Dict[str, str] = {"install_path": str(absolute_install_path)} try: - with open(self.config_path, "w") as file: + with self.config_path.open("w") as file: toml.dump(config_data, file) return InstallStatusResult(True) except Exception as e: with contextlib.suppress(OSError): - os.remove(self.config_path) + self.config_path.unlink() return InstallStatusResult(False, str(e)) def _read_config(self) -> Dict[str, str]: @@ -128,7 +129,7 @@ def _read_config(self) -> Dict[str, str]: Dict[str, str]: Configuration, including installation path. 
""" try: - with open(self.config_path, "r") as file: + with self.config_path.open("r") as file: return toml.load(file) except FileNotFoundError as e: raise FileNotFoundError( @@ -138,8 +139,8 @@ def _read_config(self) -> Dict[str, str]: def _remove_config(self) -> None: """Remove the installation configuration file.""" - if os.path.exists(self.config_path): - os.remove(self.config_path) + if self.config_path.exists(): + self.config_path.unlink() def is_installed(self, test_templates: Iterable[TestTemplate]) -> InstallStatusResult: """ @@ -153,7 +154,7 @@ def is_installed(self, test_templates: Iterable[TestTemplate]) -> InstallStatusR Returns: InstallStatusResult: Result containing the installation status and error message if not installed. """ - if not os.path.exists(self.config_path): + if not self.config_path.exists(): return InstallStatusResult( False, f"Configuration file does not exist at {self.config_path}. " @@ -189,11 +190,11 @@ def install(self, test_templates: Iterable[TestTemplate]) -> InstallStatusResult return prerequisites_result try: - os.makedirs(self.install_path, exist_ok=True) + self.install_path.mkdir(parents=True, exist_ok=True) except OSError as e: return InstallStatusResult(False, f"Failed to create installation directory at {self.install_path}: {e}") - if not os.access(self.install_path, os.W_OK): + if not self.install_path.is_dir() or not os.access(self.install_path, os.W_OK): return InstallStatusResult(False, f"The installation path {self.install_path} is not writable.") install_result = super().install(test_templates) diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index f30d815b..0e9b342c 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -173,21 +173,20 @@ def _parse_slurm_args( 
JaxToolboxSlurmInstallStrategy.DOCKER_IMAGE_FILENAME, ).docker_image_path - local_workspace_dir = os.path.abspath(cmd_args["output_path"]) - # Use the dynamic key_prefix for accessing docker_workspace_dir + local_workspace_dir = Path(cmd_args["output_path"]).resolve() docker_workspace_dir = cmd_args[f"{key_prefix}.setup_flags.docker_workspace_dir"] container_mounts = f"{local_workspace_dir}:{docker_workspace_dir}" if "pgo_nsys_converter.profile_path" in cmd_args: - profile_path = cmd_args["pgo_nsys_converter.profile_path"] + profile_path = Path(cmd_args["pgo_nsys_converter.profile_path"]).resolve() container_mounts += f",{profile_path}:{profile_path}" base_args.update({"image_path": image_path, "container_mounts": container_mounts}) - output_path = os.path.abspath(cmd_args["output_path"]) + output_path = Path(cmd_args["output_path"]).resolve() output_suffix = "-%j.txt" if env_vars.get("UNIFIED_STDOUT_STDERR") == "1" else "-%j-%n-%t.txt" - base_args["output"] = os.path.join(output_path, f"output{output_suffix}") - base_args["error"] = os.path.join(output_path, f"error{output_suffix}") + base_args["output"] = str(output_path / f"output{output_suffix}") + base_args["error"] = str(output_path / f"error{output_suffix}") return base_args diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_install_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_install_strategy.py index 29fd81c1..f84368a6 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_install_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_install_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from cloudai import InstallStatusResult from cloudai.systems.slurm.strategy import SlurmInstallStrategy @@ -34,8 +33,8 @@ def is_installed(self) -> InstallStatusResult: return InstallStatusResult(success=True) else: if self.docker_image_cache_manager.cache_docker_images_locally: - expected_docker_image_path = os.path.join( - self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME + expected_docker_image_path = ( + self.docker_image_cache_manager.install_path / self.SUBDIR_PATH / self.DOCKER_IMAGE_FILENAME ) return InstallStatusResult( success=False, diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py index 427de545..e8043255 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from pathlib import Path from typing import Any, Dict, List @@ -73,7 +72,7 @@ def _parse_slurm_args( container_mounts = "" if "NCCL_TOPO_FILE" in env_vars and "DOCKER_NCCL_TOPO_FILE" in env_vars: - nccl_graph_path = os.path.abspath(env_vars["NCCL_TOPO_FILE"]) + nccl_graph_path = Path(env_vars["NCCL_TOPO_FILE"]).resolve() nccl_graph_file = env_vars["DOCKER_NCCL_TOPO_FILE"] container_mounts = f"{nccl_graph_path}:{nccl_graph_file}" elif "NCCL_TOPO_FILE" in env_vars: diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_install_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_install_strategy.py index e4706b48..499d1d1c 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_install_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_install_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from cloudai import InstallStatusResult from cloudai.systems.slurm.strategy import SlurmInstallStrategy @@ -40,8 +39,8 @@ def is_installed(self) -> InstallStatusResult: return InstallStatusResult(success=True) else: if self.docker_image_cache_manager.cache_docker_images_locally: - expected_docker_image_path = os.path.join( - self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME + expected_docker_image_path = ( + self.docker_image_cache_manager.install_path / self.SUBDIR_PATH / self.DOCKER_IMAGE_FILENAME ) return InstallStatusResult( success=False, @@ -55,7 +54,7 @@ def is_installed(self) -> InstallStatusResult: return InstallStatusResult( success=False, message=( - f"Docker image for NCCL test is not accessible.\n" f" - Error: {docker_image_result.message}" + f"Docker image for NCCL test is not accessible.\n - Error: {docker_image_result.message}" ), ) diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py 
b/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py index a0e58e50..7aae7419 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py @@ -18,6 +18,7 @@ import os import subprocess from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path from typing import Any, Dict, List from cloudai import InstallStatusResult, System @@ -89,15 +90,15 @@ def _validate_cmd_arg(self, cmd_args: Dict[str, Any], arg_name: str) -> str: return arg_value def is_installed(self) -> InstallStatusResult: - subdir_path = os.path.join(self.install_path, self.SUBDIR_PATH) - repo_path = os.path.join(subdir_path, self.REPOSITORY_NAME) - repo_installed = os.path.isdir(repo_path) + subdir_path = self.install_path / self.SUBDIR_PATH + repo_path = subdir_path / self.REPOSITORY_NAME + repo_installed = repo_path.is_dir() docker_image_installed = self.docker_image_cache_manager.check_docker_image_exists( self.docker_image_url, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME ).success - data_dir_path = self.default_cmd_args["data_dir"] + data_dir_path = Path(self.default_cmd_args["data_dir"]) datasets_check_result = self._check_datasets_on_nodes(data_dir_path) if not datasets_check_result.success: return InstallStatusResult( @@ -121,8 +122,8 @@ def is_installed(self) -> InstallStatusResult: f"with commit hash {self.repository_commit_hash}" ) if not docker_image_installed: - docker_image_path = os.path.join(subdir_path, self.DOCKER_IMAGE_FILENAME) - missing_components.append(f"Docker image at {docker_image_path} " f"from URL {self.docker_image_url}") + docker_image_path = subdir_path / self.DOCKER_IMAGE_FILENAME + missing_components.append(f"Docker image at {docker_image_path} from URL {self.docker_image_url}") if not datasets_check_result.success: missing_components.append(f"Datasets in {data_dir_path} on some nodes") return InstallStatusResult( @@ 
-141,10 +142,10 @@ def install(self) -> InstallStatusResult: except PermissionError as e: return InstallStatusResult(success=False, message=str(e)) - subdir_path = os.path.join(self.install_path, self.SUBDIR_PATH) - os.makedirs(subdir_path, exist_ok=True) + subdir_path = self.install_path / self.SUBDIR_PATH + subdir_path.mkdir(parents=True, exist_ok=True) - data_dir_path = self.default_cmd_args["data_dir"] + data_dir_path = Path(self.default_cmd_args["data_dir"]) datasets_check_result = self._check_datasets_on_nodes(data_dir_path) if not datasets_check_result.success: return InstallStatusResult( @@ -195,12 +196,12 @@ def _check_install_path_access(self): PermissionError: If the install path does not exist or if there is no permission to create directories and files. """ - if not os.path.exists(self.install_path): + if not self.install_path.exists(): raise PermissionError(f"Install path {self.install_path} does not exist.") - if not os.access(self.install_path, os.W_OK): + if not self.install_path.is_dir() or not os.access(self.install_path, os.W_OK): raise PermissionError(f"No permission to write in install path {self.install_path}.") - def _check_datasets_on_nodes(self, data_dir_path: str) -> DatasetCheckResult: + def _check_datasets_on_nodes(self, data_dir_path: Path) -> DatasetCheckResult: """ Verify the presence of specified dataset files and directories on all idle compute nodes. @@ -210,7 +211,7 @@ def _check_datasets_on_nodes(self, data_dir_path: str) -> DatasetCheckResult: for systems with multiple nodes. Args: - data_dir_path (str): Path where dataset files and directories are stored. + data_dir_path (Path): Path where dataset files and directories are stored. Returns: DatasetCheckResult: Result object containing success status and nodes without datasets. 
@@ -248,22 +249,20 @@ def _check_datasets_on_nodes(self, data_dir_path: str) -> DatasetCheckResult: return DatasetCheckResult(success=not nodes_without_datasets, nodes_without_datasets=nodes_without_datasets) - def _check_dataset_on_node(self, node: str, data_dir_path: str, dataset_items: List[str]) -> bool: + def _check_dataset_on_node(self, node: str, data_dir_path: Path, dataset_items: List[str]) -> bool: """ Check if dataset files and directories exist on a single compute node. Args: node (str): The name of the compute node. - data_dir_path (str): Path to the data directory. + data_dir_path (Path): Path to the data directory. dataset_items (List[str]): List of dataset file and directory names to check. Returns: bool: True if all dataset files and directories exist on the node, False otherwise. """ python_check_script = ( - f"import os;print(all(os.path.isfile(os.path.join('{data_dir_path}', " - f"item)) or os.path.isdir(os.path.join('{data_dir_path}', item)) " - f"for item in {dataset_items}))" + f"from pathlib import Path;print(all((Path('{data_dir_path}') / item).exists() for item in {dataset_items}))" ) cmd = ( f"srun --nodes=1 --nodelist={node} " @@ -273,43 +272,43 @@ def _check_dataset_on_node(self, node: str, data_dir_path: str, dataset_items: L result = subprocess.run(cmd, shell=True, check=False, capture_output=True, text=True) return result.returncode == 0 and result.stdout.strip() == "True" - def _clone_repository(self, subdir_path: str) -> None: + def _clone_repository(self, subdir_path: Path) -> None: """ Clones NeMo-Launcher repository into specified path if it does not already exist. Args: - subdir_path (str): Subdirectory path for installation. + subdir_path (Path): Subdirectory path for installation.
""" - repo_path = os.path.join(subdir_path, self.REPOSITORY_NAME) + repo_path = subdir_path / self.REPOSITORY_NAME - if os.path.exists(repo_path): + if repo_path.exists(): logging.warning("Repository already exists at %s, clone skipped", repo_path) else: logging.debug("Cloning NeMo-Launcher repository into %s", repo_path) - clone_cmd = ["git", "clone", self.repository_url, repo_path] + clone_cmd = ["git", "clone", self.repository_url, str(repo_path)] result = subprocess.run(clone_cmd, capture_output=True, text=True) if result.returncode != 0: raise RuntimeError(f"Failed to clone repository: {result.stderr}") logging.debug("Checking out specific commit %s in repository", self.repository_commit_hash) checkout_cmd = ["git", "checkout", self.repository_commit_hash] - result = subprocess.run(checkout_cmd, cwd=repo_path, capture_output=True, text=True) + result = subprocess.run(checkout_cmd, cwd=str(repo_path), capture_output=True, text=True) if result.returncode != 0: raise RuntimeError(f"Failed to checkout commit: {result.stderr}") - def _install_requirements(self, subdir_path: str) -> None: + def _install_requirements(self, subdir_path: Path) -> None: """ Installs the required Python packages from the requirements.txt file in the cloned repository. Args: - subdir_path (str): Subdirectory path for installation. + subdir_path (Path): Subdirectory path for installation. 
""" - repo_path = os.path.join(subdir_path, self.REPOSITORY_NAME) - requirements_file = os.path.join(repo_path, "requirements.txt") + repo_path = subdir_path / self.REPOSITORY_NAME + requirements_file = repo_path / "requirements.txt" - if os.path.isfile(requirements_file): + if requirements_file.is_file(): logging.debug("Installing requirements from %s", requirements_file) - install_cmd = ["pip", "install", "-r", requirements_file] + install_cmd = ["pip", "install", "-r", str(requirements_file)] result = subprocess.run(install_cmd, capture_output=True, text=True) if result.returncode != 0: raise RuntimeError(f"Failed to install requirements: {result.stderr}") diff --git a/src/cloudai/schema/test_template/ucc_test/slurm_install_strategy.py b/src/cloudai/schema/test_template/ucc_test/slurm_install_strategy.py index a633848a..9cb2f67d 100644 --- a/src/cloudai/schema/test_template/ucc_test/slurm_install_strategy.py +++ b/src/cloudai/schema/test_template/ucc_test/slurm_install_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from cloudai import InstallStatusResult from cloudai.systems.slurm.strategy import SlurmInstallStrategy @@ -40,8 +39,8 @@ def is_installed(self) -> InstallStatusResult: return InstallStatusResult(success=True) else: if self.docker_image_cache_manager.cache_docker_images_locally: - expected_docker_image_path = os.path.join( - self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME + expected_docker_image_path = ( + self.docker_image_cache_manager.install_path / self.SUBDIR_PATH / self.DOCKER_IMAGE_FILENAME ) return InstallStatusResult( success=False, @@ -55,7 +54,7 @@ def is_installed(self) -> InstallStatusResult: return InstallStatusResult( success=False, message=( - f"Docker image for UCC test is not accessible.\n" f" - Error: {docker_image_result.message}" + f"Docker image for UCC test is not accessible.\n - Error: {docker_image_result.message}" ), ) diff --git a/src/cloudai/util/command_shell.py b/src/cloudai/util/command_shell.py index 2db47c3e..86358498 100644 --- a/src/cloudai/util/command_shell.py +++ b/src/cloudai/util/command_shell.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import subprocess +from pathlib import Path class CommandShell: @@ -23,20 +23,20 @@ class CommandShell: A class responsible for executing shell commands using a specified shell executable. Attributes - executable (str): The path to the shell executable used for running commands. + executable (Path): The path to the shell executable used for running commands. """ - def __init__(self, executable: str = "/bin/bash"): + def __init__(self, executable: Path = Path("/bin/bash")): """ Initialize the CommandShell with a shell executable. Args: - executable (str): The shell executable path. Defaults to "/bin/bash". + executable (Path): The shell executable path. Defaults to Path("/bin/bash"). 
Raises: FileNotFoundError: If the specified executable does not exist. """ - if not os.path.exists(executable): + if not executable.exists(): raise FileNotFoundError(f"Executable '{executable}' not found.") self.executable = executable @@ -56,7 +56,7 @@ def execute(self, command: str) -> subprocess.Popen: process = subprocess.Popen( command, shell=True, - executable=self.executable, + executable=str(self.executable), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, diff --git a/tests/test_slurm_install_strategy.py b/tests/test_slurm_install_strategy.py index 4a2dcd24..f96b2dc7 100644 --- a/tests/test_slurm_install_strategy.py +++ b/tests/test_slurm_install_strategy.py @@ -156,8 +156,8 @@ def test_clone_repository_when_path_does_not_exist(self, strategy: NeMoLauncherS with patch("subprocess.run") as mock_run, patch("os.path.exists") as mock_exists: mock_run.return_value.returncode = 0 mock_exists.side_effect = lambda path: path == str(subdir_path) - strategy._clone_repository(str(subdir_path)) - strategy._install_requirements(str(subdir_path)) + strategy._clone_repository(subdir_path) + strategy._install_requirements(subdir_path) mock_run.assert_any_call( ["git", "clone", strategy.repository_url, str(repo_path)], capture_output=True, text=True @@ -178,7 +178,7 @@ def test_install_requirements(self, strategy: NeMoLauncherSlurmInstallStrategy): with patch("subprocess.run") as mock_run: mock_run.return_value.returncode = 0 - strategy._install_requirements(str(subdir_path)) + strategy._install_requirements(subdir_path) mock_run.assert_called_with( ["pip", "install", "-r", str(requirements_file)], capture_output=True, text=True ) @@ -191,7 +191,7 @@ def test_clone_repository_when_path_exists(self, strategy: NeMoLauncherSlurmInst with patch("subprocess.run") as mock_run, patch("os.path.exists") as mock_exists: mock_run.return_value.returncode = 0 mock_exists.side_effect = lambda path: path in [str(subdir_path), str(repo_path)] - 
strategy._clone_repository(str(subdir_path)) + strategy._clone_repository(subdir_path) # Ensure that the checkout command was run mock_run.assert_any_call(