Extend checkpoint loading to accept a validation function #2726

Merged 23 commits on Dec 14, 2023

Changes from all commits
100 changes: 95 additions & 5 deletions composer/utils/checkpoint.py
@@ -14,6 +14,7 @@
import tempfile
import textwrap
import warnings
from importlib import import_module
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Optional, Union

@@ -39,6 +40,66 @@
_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME = f'__{dist.get_global_rank()}_0.distcp'


def _get_checkpoint_validation_function() -> Optional[Callable[[Union[Path, str]], bool]]:
"""Get the validation function by name.

Args:
name (str): Qualified name of the checkpoint validation function.
It should be in the form '{module_name}.{fn_name}'.

Returns:
Callable[[Union[Path, str]], bool] The checkpoint validation function that returns
True given a valid checkpoint and False otherwise.
"""
name = os.environ.get('CHECKPOINT_VALIDATION_FUNCTION', None)
if name is None:
return None
splits = name.split('.')
module_name, fn_name = '.'.join(splits[:-1]), splits[-1]
module = import_module(module_name)
fn = getattr(module, fn_name)
log.debug(f'Checkpoint validation function {name} has been found.')
return fn
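
For illustration, a minimal sketch of a validator this hook could import; the module path my_project.validation is hypothetical and not part of this PR:

# my_project/validation.py (hypothetical module)
import os
from pathlib import Path
from typing import Union

def validate_checkpoint(checkpoint_filepath: Union[Path, str]) -> bool:
    """Treat any non-empty file as a valid checkpoint (illustrative check only)."""
    return os.path.getsize(checkpoint_filepath) > 0

# Registered by qualified name so _get_checkpoint_validation_function can import it:
# os.environ['CHECKPOINT_VALIDATION_FUNCTION'] = 'my_project.validation.validate_checkpoint'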


def _ensure_valid_checkpoint(checkpoint_filepath: Union[Path, str]) -> Union[Path, str]:
"""Ensures that the checkpoint at checkpoint_filepath is valid.

using the function specified by the CHECKPOINT_VALIDATION_FUNCTION environment variable.
If CHECKPOINT_VALIDATION_FUNCTION is not set, we skip validation.

Args:
checkpoint_filepath (Union[Path,str]): The path to the checkpoint file.

Raises:
ValueError if checkpoint file is invalid.
"""
# Get the validation function by name.
validate = _get_checkpoint_validation_function()

# No function name has been specified.
if validate is None:
log.debug('No validation function specified. Skipping checkpoint validation.')
return checkpoint_filepath

# Validate the checkpoint.
if not validate(checkpoint_filepath):
raise ValueError(f'Checkpoint at {checkpoint_filepath} is invalid.')

log.debug(f'Checkpoint at {checkpoint_filepath} is valid.')
return checkpoint_filepath


def _torch_load_with_validation(checkpoint_filepath: Union[Path, str], map_location: str) -> Any:
"""Validates and loads a torch checkpoint.

Args:
checkpoint_filepath (Union[Path,str]): The path to the checkpoint file.
map_location (str): The location to load the checkpoint to.
"""
return torch.load(_ensure_valid_checkpoint(checkpoint_filepath), map_location=map_location)
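
As a usage sketch (the path below is a placeholder, borrowing the ba1-rank0.pt naming from the tests), this behaves as a drop-in for torch.load:

# Validates first if CHECKPOINT_VALIDATION_FUNCTION is set, then loads as usual.
state = _torch_load_with_validation('checkpoints/ba1-rank0.pt', map_location='cpu')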


def _format_path_with_rank_zero(path: str) -> str:
"""Formats ``path`` with the rank zero values."""
return path.format(
@@ -338,8 +399,37 @@ def _get_num_ranks_that_saved_rng(metadata: Metadata):
rng_inds = set(rng_inds)
return len(rng_inds)

class FileSystemReaderWithValidation(dist_cp.FileSystemReader):
"""FileSystemReader that validates checkpoint files prior to reading."""

def __init__(self, path: str):
if _get_checkpoint_validation_function() is None:
log.info('No checkpoint validation function found when loading sharded checkpoints.')
super().__init__(path)

def read_data(self, plan: LoadPlan, planner: LoadPlanner):
"""Reads data file.

Raises:
ValueError if the data file is invalid.
"""
for read_item in plan.items:
data_path = self.path / self.storage_data[read_item.storage_index].relative_path
_ensure_valid_checkpoint(data_path)
return super().read_data(plan, planner)

def read_metadata(self) -> Metadata:
"""Reads metadata file.

Raises:
ValueError if the metadata file is invalid.
"""
metadata_file_path = self.path / '.metadata'
_ensure_valid_checkpoint(metadata_file_path)
return super().read_metadata()
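
As a usage sketch (not part of the diff), the class drops in wherever a FileSystemReader would be passed to torch.distributed.checkpoint; the path and model below are placeholders:

import torch.nn as nn
import torch.distributed.checkpoint as dist_cp

model = nn.Linear(8, 8)  # stand-in for the real model
state_dict = {'model': model.state_dict()}
storage_reader = FileSystemReaderWithValidation('/path/to/sharded/checkpoint')
# The .metadata file and each shard's data file are validated before being read.
dist_cp.load_state_dict(state_dict=state_dict, storage_reader=storage_reader)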

# A subclass of FileSystemReaderWithValidation that downloads files from the object store before reading them from the local filesystem.
class DistCPObjectStoreReader(FileSystemReaderWithValidation):

def __init__(self, source_path: str, destination_path: str, object_store):
self.source_path = source_path
@@ -401,7 +491,7 @@ def read_data(self, plan: LoadPlan, planner: LoadPlanner):
Path(rank0_download_tempdir) / Path('checkpoints')),
object_store=object_store)
else:
storage_reader = FileSystemReaderWithValidation(source_path)

# We need no_grad because we overwrite tensor values with set_() when we do elastic loading and we don't want the set_ op recorded in the computation graph.
with torch.no_grad():
@@ -695,7 +785,7 @@ def safe_torch_load(
model = None
optimizer = None
if dist.get_global_rank() == 0:
state_dict_list[0] = _torch_load_with_validation(composer_states_filepath, map_location=map_location)
# Don't broadcast model/optimizer state if they exist
if 'model' in state_dict_list[0]['state']:
model = state_dict_list[0]['state']['model']
@@ -716,7 +806,7 @@

return state_dict
else:
return _torch_load_with_validation(composer_states_filepath, map_location=map_location)
except TypeError as e:
if 'Accuracy.__new__() missing 1 required positional argument' in str(e):
raise Exception('As of v0.10.0, torchmetrics introduces a new required argument to Accuracy which '
41 changes: 39 additions & 2 deletions tests/trainer/test_checkpoint.py
@@ -11,7 +11,7 @@
import time
from glob import glob
from typing import Any, Dict, List, Optional, Union
from unittest.mock import MagicMock, patch

import pytest
import torch
@@ -29,7 +29,7 @@
from composer.trainer import trainer
from composer.trainer.trainer import Trainer
from composer.utils import dist, is_tar
from composer.utils.checkpoint import _ensure_valid_checkpoint, glob_filter
from composer.utils.object_store.object_store import ObjectStore
from composer.utils.object_store.s3_object_store import S3ObjectStore
from tests.common import (RandomClassificationDataset, RandomImageDataset, RandomTextLMDataset, SimpleConvModel,
@@ -1289,3 +1289,40 @@ def test_rotate_checkpoints(
assert len(symlink_files) == ((1 if not deepspeed_enabled else world_size) if num_keep != 0 else 0)

dist.barrier() # all ranks finish before cleaning up tmpdir


def simple_validate(filepath: str):
with open(filepath, 'r') as f:
return f.read() == 'good'


def test_checkpoint_validation(tmp_path):
checkpoint_filepath = tmp_path / 'dummy'
with open(checkpoint_filepath, 'w') as f:
f.write('good')

# No validation function specified.
result = _ensure_valid_checkpoint(checkpoint_filepath)
assert result == checkpoint_filepath

# Non-existent module specified.
with patch.dict(os.environ, {'CHECKPOINT_VALIDATION_FUNCTION': 'bad_module.bad_function'}):
with pytest.raises(ModuleNotFoundError):
_ensure_valid_checkpoint(checkpoint_filepath)

# Non-existent function specified.
with patch.dict(os.environ, {'CHECKPOINT_VALIDATION_FUNCTION': 'tests.trainer.test_checkpoint.bad_function'}):
with pytest.raises(AttributeError):
_ensure_valid_checkpoint(checkpoint_filepath)

# Correct usage and successful validation.
with patch.dict(os.environ, {'CHECKPOINT_VALIDATION_FUNCTION': 'tests.trainer.test_checkpoint.simple_validate'}):
result = _ensure_valid_checkpoint(checkpoint_filepath)
assert result == checkpoint_filepath

# Correct usage and failed validation.
with open(checkpoint_filepath, 'w') as f:
f.write('bad')
with patch.dict(os.environ, {'CHECKPOINT_VALIDATION_FUNCTION': 'tests.trainer.test_checkpoint.simple_validate'}):
with pytest.raises(ValueError):
_ensure_valid_checkpoint(checkpoint_filepath)
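
Outside the test's patch.dict, the same hook is enabled process-wide by setting the variable before any checkpoint is loaded; a minimal sketch reusing the test's validator:

import os

# Any subsequent checkpoint load in this process now calls simple_validate first.
os.environ['CHECKPOINT_VALIDATION_FUNCTION'] = 'tests.trainer.test_checkpoint.simple_validate'
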
41 changes: 41 additions & 0 deletions tests/trainer/test_fsdp_checkpoint.py
@@ -9,8 +9,10 @@
import pathlib
import textwrap
import uuid
from contextlib import nullcontext as does_not_raise
from functools import partial
from typing import Any, Callable, Optional, Sequence
from unittest.mock import patch

import numpy as np
import pytest
@@ -545,6 +547,45 @@ def test_fsdp_full_state_dict_load_with_ema(
trainer2.close()


@pytest.mark.gpu
@world_size(2)
@pytest.mark.parametrize('is_valid_checkpoint', [True, False])
@pytest.mark.parametrize('state_dict_type', ['full', 'sharded'])
@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'),
reason='requires PyTorch 1.13 or higher')
@pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning')
@pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning')
@pytest.mark.filterwarnings(r'ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning')
def test_checkpoint_loading_with_validation(world_size, tmp_path, is_valid_checkpoint: bool, state_dict_type: str):
from torch.distributed.checkpoint.api import CheckpointException

def mock_get_checkpoint_validation_function():
return lambda _: is_valid_checkpoint

tmp_paths = dist.all_gather_object(os.path.abspath(tmp_path))
save_folder = os.path.join(tmp_paths[0], 'checkpoints')
fsdp_config = FSDPConfig(state_dict_type=state_dict_type)

# First trainer saves checkpoints.
trainer = get_trainer(save_folder=save_folder, fsdp_config=fsdp_config, max_duration='1ba')
trainer.fit()
trainer.close()

expectation = does_not_raise() if is_valid_checkpoint else pytest.raises((ValueError, CheckpointException))

checkpoint_relpath = 'ba1-rank0.pt' if state_dict_type == 'full' else 'ba1'

# Load checkpoints with checkpoint validation.
with expectation:
with patch('composer.utils.checkpoint._get_checkpoint_validation_function',
mock_get_checkpoint_validation_function):
trainer = get_trainer(load_path=os.path.join(save_folder, checkpoint_relpath),
max_duration='2ba',
fsdp_config=fsdp_config)
trainer.fit()
trainer.close()


@pytest.mark.gpu
@world_size(2)
@pytest.mark.parametrize('weights_only', [False, True])