From 64fb0b5057ba1bce5a6e974a570b6a2b94ebbbe7 Mon Sep 17 00:00:00 2001 From: Hanlin Tang Date: Fri, 3 Mar 2023 14:39:44 -0800 Subject: [PATCH 1/6] slack_sdk import is conditional on using the slack webhook url --- composer/callbacks/health_checker.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/composer/callbacks/health_checker.py b/composer/callbacks/health_checker.py index b052bb47cb..94a1427b33 100644 --- a/composer/callbacks/health_checker.py +++ b/composer/callbacks/health_checker.py @@ -17,12 +17,12 @@ import os import numpy as np -from slack_sdk.webhook import WebhookClient from composer.core import Callback, State from composer.core.time import Timestamp from composer.loggers import Logger from composer.utils import dist +from composer.utils.import_helpers import MissingConditionalImportError log = logging.getLogger(__name__) @@ -48,7 +48,7 @@ class HealthChecker(Callback): slack_webhook_url (str, optional): Slack URL to send alerts. Can also be set with the SLACK_WEBHOOK_URL environment variable. Default: None test_mode (bool, optional): If True, will send a test alert at the first check. - Default: False + Default: True """ def __init__( @@ -58,7 +58,7 @@ def __init__( window_size: int = 120, wait: int = 120, slack_webhook_url: Optional[str] = None, - test_mode: bool = False, + test_mode: bool = True, ) -> None: self.sample_freq = sample_freq self.window_size = window_size @@ -69,6 +69,13 @@ def __init__( if not self.slack_webhook_url: self.slack_webhook_url = os.environ.get('SLACK_WEBHOOK_URL', None) + if self.slack_webhook_url: + # fail fast if missing import + try: + import slack_sdk as slack_sdk + except ImportError as e: + raise MissingConditionalImportError('health_checker', 'slack_sdk') from e + self.last_sample = 0 self.last_check = 0 @@ -133,6 +140,7 @@ def _alert(self, message: str, state: State) -> None: logging.warning(message) if self.slack_webhook_url: + from slack_sdk.webhook import WebhookClient client = WebhookClient(url=self.slack_webhook_url) client.send(text=message) From 1fde983a2e8f83fbf3fe411aa7cffceec99deed9 Mon Sep 17 00:00:00 2001 From: Hanlin Tang Date: Fri, 3 Mar 2023 15:21:16 -0800 Subject: [PATCH 2/6] conditionally import pynvml --- composer/callbacks/health_checker.py | 35 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/composer/callbacks/health_checker.py b/composer/callbacks/health_checker.py index 94a1427b33..0ecb7b722b 100644 --- a/composer/callbacks/health_checker.py +++ b/composer/callbacks/health_checker.py @@ -3,20 +3,13 @@ """Check GPU Health during training.""" import logging +import os from collections import deque from datetime import datetime from typing import List, Optional, Tuple -import torch - -try: - import pynvml -except ImportError: - pynvml = None - -import os - import numpy as np +import torch from composer.core import Callback, State from composer.core.time import Timestamp @@ -48,7 +41,7 @@ class HealthChecker(Callback): slack_webhook_url (str, optional): Slack URL to send alerts. Can also be set with the SLACK_WEBHOOK_URL environment variable. Default: None test_mode (bool, optional): If True, will send a test alert at the first check. - Default: True + Default: False """ def __init__( @@ -58,7 +51,7 @@ def __init__( window_size: int = 120, wait: int = 120, slack_webhook_url: Optional[str] = None, - test_mode: bool = True, + test_mode: bool = False, ) -> None: self.sample_freq = sample_freq self.window_size = window_size @@ -74,7 +67,7 @@ def __init__( try: import slack_sdk as slack_sdk except ImportError as e: - raise MissingConditionalImportError('health_checker', 'slack_sdk') from e + raise MissingConditionalImportError('health_checker', 'slack_sdk', None) from e self.last_sample = 0 self.last_check = 0 @@ -149,12 +142,13 @@ def _is_available() -> bool: if not torch.cuda.is_available(): return False try: + import pynvml pynvml.nvmlInit() # type: ignore return True + except ImportError: + raise MissingConditionalImportError('health_checker', 'pynvml', None) except pynvml.NVMLError_LibraryNotFound: # type: ignore logging.warning('NVML not found, disabling GPU health checking') - except ImportError: - logging.warning('pynvml library not found, disabling GPU health checking.') except Exception as e: logging.warning(f'Error initializing NVML: {e}') @@ -176,13 +170,18 @@ def sample(self) -> None: self.samples.append(sample) def _sample(self) -> Optional[List]: + try: + import pynvml + except ImportError: + raise MissingConditionalImportError('health_checker', 'pynvml', None) + try: samples = [] - device_count = pynvml.nvmlDeviceGetCount() # type: ignore + device_count = pynvml.nvmlDeviceGetCount() for i in range(device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) # type: ignore - samples.append(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu) # type: ignore - except pynvml.NVMLError: # type: ignore + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + samples.append(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu) + except pynvml.NVMLError: return None return samples From 4e2bc75b821da82b925122f7acaefafcceb50cdb Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 3 Mar 2023 15:34:25 -0800 Subject: [PATCH 3/6] Update composer/callbacks/health_checker.py --- composer/callbacks/health_checker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer/callbacks/health_checker.py b/composer/callbacks/health_checker.py index 0ecb7b722b..1a032a6e1d 100644 --- a/composer/callbacks/health_checker.py +++ b/composer/callbacks/health_checker.py @@ -65,7 +65,8 @@ def __init__( if self.slack_webhook_url: # fail fast if missing import try: - import slack_sdk as slack_sdk + import slack_sdk + del slack_sdk except ImportError as e: raise MissingConditionalImportError('health_checker', 'slack_sdk', None) from e From a08dc929e760459e27f3e624d435faec018ca3b9 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 3 Mar 2023 15:56:57 -0800 Subject: [PATCH 4/6] Update composer/callbacks/health_checker.py Co-authored-by: Mihir Patel --- composer/callbacks/health_checker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/composer/callbacks/health_checker.py b/composer/callbacks/health_checker.py index 1a032a6e1d..e7608df35a 100644 --- a/composer/callbacks/health_checker.py +++ b/composer/callbacks/health_checker.py @@ -14,8 +14,7 @@ from composer.core import Callback, State from composer.core.time import Timestamp from composer.loggers import Logger -from composer.utils import dist -from composer.utils.import_helpers import MissingConditionalImportError +from composer.utils import dist, MissingConditionalImportError log = logging.getLogger(__name__) From 0fc6d587a63b166bff7cbc2fc33fb70161cde89e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 3 Mar 2023 16:04:46 -0800 Subject: [PATCH 5/6] rerun tests From fab675a71397b3ac50b156b325c97e48d3175ab2 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 3 Mar 2023 16:05:24 -0800 Subject: [PATCH 6/6] precommit --- composer/callbacks/health_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/callbacks/health_checker.py b/composer/callbacks/health_checker.py index e7608df35a..272a612eae 100644 --- a/composer/callbacks/health_checker.py +++ b/composer/callbacks/health_checker.py @@ -14,7 +14,7 @@ from composer.core import Callback, State from composer.core.time import Timestamp from composer.loggers import Logger -from composer.utils import dist, MissingConditionalImportError +from composer.utils import MissingConditionalImportError, dist log = logging.getLogger(__name__)