Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend checkpoint loading to accept a validation function #2726

Merged
merged 23 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 59 additions & 2 deletions composer/utils/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import tempfile
import textwrap
import warnings
from importlib import import_module
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Optional, Union

Expand All @@ -39,6 +40,62 @@
_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME = f'__{dist.get_global_rank()}_0.distcp'


def _get_checkpoint_validation_function(name: str) -> Callable[[Union[Path, str]], bool]:
    """Look up a checkpoint validation function by its qualified name.

    Args:
        name (str): Qualified name of the checkpoint validation function.
            It should be in the form '{module_name}.{fn_name}'.

    Returns:
        Callable[[Union[Path, str]], bool]: The checkpoint validation function that returns
            True given a valid checkpoint and False otherwise.

    Raises:
        ValueError: If ``name`` has no module part, i.e. it is not of the form
            '{module_name}.{fn_name}'.
        ModuleNotFoundError: If the module cannot be imported.
        AttributeError: If the imported module has no attribute named ``fn_name``.
    """
    # Split on the last dot only: everything before it is the (possibly nested) module path.
    module_name, _, fn_name = name.rpartition('.')

    # Without a module part, import_module('') would fail with an opaque "Empty module name".
    # Raise the same exception type with a message that points at the misconfigured value.
    if not module_name:
        raise ValueError(f'Invalid checkpoint validation function name {name!r}. '
                         "It should be in the form '{module_name}.{fn_name}'.")

    module = import_module(module_name)
    fn = getattr(module, fn_name)
    log.debug(f'Checkpoint validation function {name} has been found.')
    return fn


def _ensure_valid_checkpoint(checkpoint_filepath: Union[Path, str]) -> Union[Path, str]:
    """Ensure that the checkpoint at ``checkpoint_filepath`` is valid.

    Validation uses the function named by the ``CHECKPOINT_VALIDATION_FUNCTION`` environment
    variable, given in the form '{module_name}.{fn_name}'. If the variable is unset or empty,
    validation is skipped.

    Args:
        checkpoint_filepath (Union[Path,str]): The path to the checkpoint file.

    Returns:
        Union[Path, str]: ``checkpoint_filepath``, unchanged, so the call can be passed
            straight to a loader.

    Raises:
        ValueError: If the validation function returns a falsy value for the checkpoint.
    """
    fn_name = os.environ.get('CHECKPOINT_VALIDATION_FUNCTION')

    # No function name has been specified. A variable exported as an empty string is treated
    # the same as an unset one instead of being passed on as a (bogus) function name.
    if not fn_name:
        log.debug('No validation function specified. Skipping checkpoint validation.')
        return checkpoint_filepath

    # Get the validation function by name.
    validate = _get_checkpoint_validation_function(fn_name)

    # Validate the checkpoint.
    if not validate(checkpoint_filepath):
        raise ValueError(f'Checkpoint at {checkpoint_filepath} is invalid.')

    log.debug(f'Checkpoint at {checkpoint_filepath} is valid.')
    return checkpoint_filepath


def _torch_load_with_validation(checkpoint_filepath: Union[Path, str], map_location: str) -> Any:
    """Load a torch checkpoint after validating it.

    Args:
        checkpoint_filepath (Union[Path,str]): The path to the checkpoint file.
        map_location (str): The location to load the checkpoint to.

    Returns:
        Any: Whatever ``torch.load`` returns for the checkpoint.
    """
    # Validate first; the path handed to torch.load is the one the validation step returns.
    validated_filepath = _ensure_valid_checkpoint(checkpoint_filepath)
    return torch.load(validated_filepath, map_location=map_location)


def _format_path_with_rank_zero(path: str) -> str:
"""Formats ``path`` with the rank zero values."""
return path.format(
Expand Down Expand Up @@ -695,7 +752,7 @@ def safe_torch_load(
model = None
optimizer = None
if dist.get_global_rank() == 0:
state_dict_list[0] = torch.load(composer_states_filepath, map_location=map_location)
state_dict_list[0] = _torch_load_with_validation(composer_states_filepath, map_location=map_location)
# Don't broadcast model/optimizer state if they exist
if 'model' in state_dict_list[0]['state']:
model = state_dict_list[0]['state']['model']
Expand All @@ -716,7 +773,7 @@ def safe_torch_load(

return state_dict
else:
return torch.load(composer_states_filepath, map_location=map_location)
return _torch_load_with_validation(composer_states_filepath, map_location=map_location)
irenedea marked this conversation as resolved.
Show resolved Hide resolved
except TypeError as e:
if 'Accuracy.__new__() missing 1 required positional argument' in str(e):
raise Exception('As of v0.10.0, torchmetrics introduces a new required argument to Accuracy which '
Expand Down
42 changes: 41 additions & 1 deletion tests/trainer/test_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import time
from glob import glob
from typing import Any, Dict, List, Optional, Union
from unittest import mock
from unittest.mock import MagicMock

import pytest
Expand All @@ -29,7 +30,7 @@
from composer.trainer import trainer
from composer.trainer.trainer import Trainer
from composer.utils import dist, is_tar
from composer.utils.checkpoint import glob_filter
from composer.utils.checkpoint import _ensure_valid_checkpoint, glob_filter
from composer.utils.object_store.object_store import ObjectStore
from composer.utils.object_store.s3_object_store import S3ObjectStore
from tests.common import (RandomClassificationDataset, RandomImageDataset, RandomTextLMDataset, SimpleConvModel,
Expand Down Expand Up @@ -1289,3 +1290,42 @@ def test_rotate_checkpoints(
assert len(symlink_files) == ((1 if not deepspeed_enabled else world_size) if num_keep != 0 else 0)

dist.barrier() # all ranks finish before cleaning up tmpdir


def simple_validate(filepath: str):
    """Validator used by the tests: a file is valid only if its entire content is 'good'."""
    with open(filepath, 'r') as checkpoint_file:
        contents = checkpoint_file.read()
    return contents == 'good'


def test_checkpoint_validation(tmp_path):
    """Exercise ``_ensure_valid_checkpoint`` for each CHECKPOINT_VALIDATION_FUNCTION setting."""
    env_var = 'CHECKPOINT_VALIDATION_FUNCTION'
    checkpoint_filepath = tmp_path / 'dummy'
    with open(checkpoint_filepath, 'w') as f:
        f.write('good')

    # No validation function specified. Drop any value inherited from the ambient environment
    # so this case does not depend on where the tests run; patch.dict restores os.environ
    # when the block exits.
    with mock.patch.dict(os.environ):
        os.environ.pop(env_var, None)
        result = _ensure_valid_checkpoint(checkpoint_filepath)
        assert result == checkpoint_filepath

    # Non-existent module specified.
    with mock.patch.dict(os.environ, {env_var: 'bad_module.bad_function'}):
        with pytest.raises(ModuleNotFoundError):
            _ensure_valid_checkpoint(checkpoint_filepath)

    # Non-existent function specified.
    with mock.patch.dict(os.environ, {env_var: 'tests.trainer.test_checkpoint.bad_function'}):
        with pytest.raises(AttributeError):
            _ensure_valid_checkpoint(checkpoint_filepath)

    # Correct usage and successful validation.
    with mock.patch.dict(os.environ, {env_var: 'tests.trainer.test_checkpoint.simple_validate'}):
        result = _ensure_valid_checkpoint(checkpoint_filepath)
        assert result == checkpoint_filepath

    # Correct usage and failed validation.
    with open(checkpoint_filepath, 'w') as f:
        f.write('bad')
    with mock.patch.dict(os.environ, {env_var: 'tests.trainer.test_checkpoint.simple_validate'}):
        with pytest.raises(ValueError):
            _ensure_valid_checkpoint(checkpoint_filepath)
Loading