Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Protect for missing slack_sdk import #2031

Merged
merged 6 commits on Mar 4, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 24 additions & 17 deletions composer/callbacks/health_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,18 @@

"""Check GPU Health during training."""
import logging
import os
from collections import deque
from datetime import datetime
from typing import List, Optional, Tuple

import torch

try:
import pynvml
except ImportError:
pynvml = None

import os

import numpy as np
from slack_sdk.webhook import WebhookClient
import torch

from composer.core import Callback, State
from composer.core.time import Timestamp
from composer.loggers import Logger
from composer.utils import dist
from composer.utils import MissingConditionalImportError, dist

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -69,6 +61,14 @@ def __init__(
if not self.slack_webhook_url:
self.slack_webhook_url = os.environ.get('SLACK_WEBHOOK_URL', None)

if self.slack_webhook_url:
# fail fast if missing import
try:
import slack_sdk
del slack_sdk
except ImportError as e:
raise MissingConditionalImportError('health_checker', 'slack_sdk', None) from e
dakinggg marked this conversation as resolved.
Show resolved Hide resolved

self.last_sample = 0
self.last_check = 0

Expand Down Expand Up @@ -133,6 +133,7 @@ def _alert(self, message: str, state: State) -> None:

logging.warning(message)
if self.slack_webhook_url:
from slack_sdk.webhook import WebhookClient
client = WebhookClient(url=self.slack_webhook_url)
client.send(text=message)

Expand All @@ -141,12 +142,13 @@ def _is_available() -> bool:
if not torch.cuda.is_available():
return False
try:
import pynvml
pynvml.nvmlInit() # type: ignore
return True
except ImportError:
raise MissingConditionalImportError('health_checker', 'pynvml', None)
except pynvml.NVMLError_LibraryNotFound: # type: ignore
logging.warning('NVML not found, disabling GPU health checking')
except ImportError:
logging.warning('pynvml library not found, disabling GPU health checking.')
except Exception as e:
logging.warning(f'Error initializing NVML: {e}')

Expand All @@ -168,13 +170,18 @@ def sample(self) -> None:
self.samples.append(sample)

def _sample(self) -> Optional[List]:
try:
import pynvml
except ImportError:
raise MissingConditionalImportError('health_checker', 'pynvml', None)

try:
samples = []
device_count = pynvml.nvmlDeviceGetCount() # type: ignore
device_count = pynvml.nvmlDeviceGetCount()
for i in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i) # type: ignore
samples.append(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu) # type: ignore
except pynvml.NVMLError: # type: ignore
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
samples.append(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu)
except pynvml.NVMLError:
return None
return samples

Expand Down