diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
index 5552d6c19c..8ca5b6fc44 100644
--- a/.github/workflows/daily.yaml
+++ b/.github/workflows/daily.yaml
@@ -22,18 +22,23 @@ jobs:
             markers: not daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
             composer_package_name: mosaicml
-          - name: cpu-3.11-2.2-composer
-            container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
-            markers: not daily and (remote or not remote) and not gpu and not doctest
-            pytest_command: coverage run -m pytest
-            composer_package_name: composer
           - name: cpu-3.11-2.3
             container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
             markers: not daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
             composer_package_name: mosaicml
+          - name: cpu-3.11-2.4
+            container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+            markers: not daily and (remote or not remote) and not gpu and not doctest
+            pytest_command: coverage run -m pytest
+            composer_package_name: mosaicml
+          - name: cpu-3.11-2.4-composer
+            container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+            markers: not daily and (remote or not remote) and not gpu and not doctest
+            pytest_command: coverage run -m pytest
+            composer_package_name: composer
           - name: cpu-doctest
-            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+            container: mosaicml/pytorch:2.4.0_cpu-python3.10-ubuntu20.04
             markers: not daily and (remote or not remote) and not gpu and doctest
             pytest_command: coverage run -m pytest tests/test_docs.py
             composer_package_name: mosaicml
@@ -42,18 +47,23 @@ jobs:
             markers: daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
             composer_package_name: mosaicml
-          - name: daily-cpu-3.11-2.2-composer
-            container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
+          - name: daily-cpu-3.11-2.3
+            container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
             markers: daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
-            composer_package_name: composer
-          - name: daily-cpu-3.11-2.3-composer
-            container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+            composer_package_name: mosaicml
+          - name: daily-cpu-3.11-2.4
+            container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+            markers: daily and (remote or not remote) and not gpu and not doctest
+            pytest_command: coverage run -m pytest
+            composer_package_name: mosaicml
+          - name: daily-cpu-3.11-2.4-composer
+            container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
             markers: daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
             composer_package_name: composer
           - name: daily-cpu-doctest
-            container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
+            container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
             markers: daily and (remote or not remote) and not gpu and doctest
             pytest_command: coverage run -m pytest tests/test_docs.py
             composer_package_name: mosaicml
@@ -104,6 +114,12 @@ jobs:
             pytest_command: "coverage run -m pytest"
             composer_package_name: "mosaicml"
             gpu_num: 1
+          - name: "gpu-3.11-2.4-1-gpu"
+            container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
+            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+            pytest_command: "coverage run -m pytest"
+            composer_package_name: "mosaicml"
+            gpu_num: 1
           - name: "gpu-3.11-2.2-2-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -116,6 +132,12 @@ jobs:
             pytest_command: "coverage run -m pytest"
             composer_package_name: "mosaicml"
             gpu_num: 2
+          - name: "gpu-3.11-2.4-2-gpu"
+            container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
+            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+            pytest_command: "coverage run -m pytest"
+            composer_package_name: "mosaicml"
+            gpu_num: 2
           - name: "gpu-3.11-2.2-4-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -128,6 +150,12 @@ jobs:
             pytest_command: "coverage run -m pytest"
             composer_package_name: "mosaicml"
             gpu_num: 4
+          - name: "gpu-3.11-2.4-4-gpu"
+            container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
+            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+            pytest_command: "coverage run -m pytest"
+            composer_package_name: "mosaicml"
+            gpu_num: 4
     name: ${{ matrix.name }}
     if: github.repository_owner == 'mosaicml'
     with:
diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
index 4d44e69824..1483fc060e 100644
--- a/.github/workflows/pr-cpu.yaml
+++ b/.github/workflows/pr-cpu.yaml
@@ -21,8 +21,12 @@ jobs:
             container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
             markers: not daily and not remote and not gpu and not doctest
             pytest_command: coverage run -m pytest
+          - name: cpu-3.11-2.4
+            container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+            markers: not daily and not remote and not gpu and not doctest
+            pytest_command: coverage run -m pytest
           - name: cpu-doctest
-            container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+            container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
             markers: not daily and not remote and not gpu and doctest
             pytest_command: coverage run -m pytest tests/test_docs.py
     name: ${{ matrix.name }}
diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
index 2f335a5a68..e74689e597 100644
--- a/.github/workflows/pr-gpu.yaml
+++ b/.github/workflows/pr-gpu.yaml
@@ -13,8 +13,8 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: gpu-3.11-2.3-1
-            container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+          - name: gpu-3.11-2.4-1
+            container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
             markers: not daily and not remote and gpu and (doctest or not doctest)
             pytest_command: coverage run -m pytest
             composer_package_name: mosaicml
@@ -39,8 +39,8 @@ jobs:
     strategy:
      matrix:
        include:
-          - name: gpu-3.11-2.3-2
-            container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+          - name: gpu-3.11-2.4-2
+            container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
             markers: not daily and not remote and gpu and (doctest or not doctest)
             pytest_command: coverage run -m pytest
             composer_package_name: mosaicml
@@ -66,8 +66,8 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: gpu-3.11-2.3-4
-            container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+          - name: gpu-3.11-2.4-4
+            container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
             markers: not daily and not remote and gpu and (doctest or not doctest)
             pytest_command: coverage run -m pytest
             composer_package_name: mosaicml
diff --git a/composer/core/precision.py b/composer/core/precision.py
index 2723159261..c3aa06643b 100644
--- a/composer/core/precision.py
+++ b/composer/core/precision.py
@@ -26,9 +26,9 @@ class Precision(StringEnum):

     Attributes:
         FP32: Use 32-bit floating-point precision. Compatible with CPUs and GPUs.
-        AMP_FP16: Use :mod:`torch.cuda.amp` with 16-bit floating-point precision. Only compatible
+        AMP_FP16: Use :mod:`torch.amp` with 16-bit floating-point precision. Only compatible
             with GPUs.
-        AMP_BF16: Use :mod:`torch.cuda.amp` with 16-bit BFloat precision.
+        AMP_BF16: Use :mod:`torch.amp` with 16-bit BFloat precision.
         AMP_FP8: Use :mod:`transformer_engine.pytorch.fp8_autocast` with 8-bit FP8 precison.
     """
     FP32 = 'fp32'
@@ -60,7 +60,7 @@ def get_precision_context(
     precision = Precision(precision)
     if precision == Precision.FP32:
         if torch.cuda.is_available():
-            with torch.cuda.amp.autocast(False):
+            with torch.autocast('cuda', enabled=False):
                 yield
         else:
             # Yield here to avoid warnings about cuda not being available
@@ -68,7 +68,7 @@
     elif precision == Precision.AMP_FP16:
         # Retain compatibility with PyTorch < 1.10
         if torch.cuda.is_available():
-            with torch.cuda.amp.autocast(True):
+            with torch.autocast('cuda', enabled=True):
                 yield
         elif is_xla_installed():
             with torch.autocast('xla', dtype=torch.float16):
@@ -77,7 +77,7 @@
             yield
     elif precision == Precision.AMP_BF16:
         if torch.cuda.is_available():
-            with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):
+            with torch.autocast('cuda', dtype=torch.bfloat16, enabled=True):
                 yield
         elif is_xla_installed():
             with torch.autocast('xla', dtype=torch.bfloat16):
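The precision changes above track PyTorch 2.4's deprecation of `torch.cuda.amp.autocast` in favor of the device-generic `torch.autocast` entry point. As a rough standalone sketch of the equivalence (not Composer code; the tensors and the `torch.cuda.is_available()` guard are just illustration, mirroring the checks in `get_precision_context`):

```python
import torch

# Old spelling, deprecated in PyTorch 2.4 (emits a FutureWarning):
#   with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): ...
# New device-generic spelling used in the hunk above:
if torch.cuda.is_available():
    with torch.autocast('cuda', dtype=torch.bfloat16, enabled=True):
        x = torch.randn(8, 8, device='cuda')
        y = x @ x  # matmul runs under bf16 autocast inside this context
```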
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index 27323718fc..695136f552 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -1725,7 +1725,7 @@ def __init__(

         # Suppressing GradScaler warnings as they are always created
         # self._use_grad_scaling() will raise a RuntimeError if grad scaling is not available when it is required
-        warnings.filterwarnings(action='ignore', message='torch.cuda.amp.GradScaler')
+        warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.GradScaler.*')
         self.state.scaler = ClosureGradScaler() if self._use_closures() else GradScaler()

         if self.state.fsdp_config is not None:
@@ -2442,6 +2442,17 @@ def fit(

         self.first_batch_complete = False
         self._train_loop()

+        # Zero gradients at the end of fit so same model/optimizer can be used for further training
+        # with checkpoint loading. See https://github.com/pytorch/pytorch/issues/133415
+        for optimizer in self.state.optimizers:
+            try:
+                try:
+                    optimizer.zero_grad(set_to_none=True)
+                except TypeError:
+                    optimizer.zero_grad()
+            except:
+                log.exception('Failed to zero out optimizer at end of fit')
+
     def close(self):
         """Shutdown the trainer.
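The new end-of-`fit` block clears gradients so the same model and optimizer can be reused for further training or checkpoint loading afterwards (per the linked pytorch/pytorch#133415). A standalone sketch of that cleanup, using a hypothetical `finish_fit` helper over a plain optimizer list rather than Composer's trainer state:

```python
import logging

import torch

log = logging.getLogger(__name__)


def finish_fit(optimizers):  # hypothetical helper; mirrors the loop added to Trainer.fit above
    for optimizer in optimizers:
        try:
            try:
                optimizer.zero_grad(set_to_none=True)
            except TypeError:
                optimizer.zero_grad()  # fallback for optimizers without the keyword
        except Exception:
            log.exception('Failed to zero out optimizer at end of fit')


model = torch.nn.Linear(4, 2)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
model(torch.randn(3, 4)).sum().backward()
finish_fit([opt])
assert all(p.grad is None for p in model.parameters())  # no stale gradients left behind
```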
diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py
index f54d1f69e1..60dea99ff9 100644
--- a/docs/source/doctest_fixtures.py
+++ b/docs/source/doctest_fixtures.py
@@ -16,6 +16,7 @@
 import os
 import sys
 import tempfile
+import warnings
 from typing import Any
 from typing import Callable as Callable
 from urllib.parse import urlparse
@@ -53,10 +54,16 @@
 from composer.loggers import Logger as Logger
 from composer.loggers import RemoteUploaderDownloader
 from composer.models import ComposerModel as ComposerModel
-from composer.optim.scheduler import ConstantScheduler
+from composer.optim import ConstantScheduler, DecoupledSGDW
 from composer.utils import LibcloudObjectStore, RemoteUploader
 from composer.utils import ensure_tuple as ensure_tuple

+# Ignore certain warnings for doctest
+warnings.filterwarnings(action='ignore', message='.*Deterministic mode.*')  # Expected
+warnings.filterwarnings(action='ignore', message='.*Some weights of Bert*')  # Expected
+warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*')  # DeepSpeed
+warnings.filterwarnings(action='ignore', message='.*The distutils.sysconfig module*')  # DeepSpeed
+
 try:
     import wandb
     _WANDB_INSTALLED = True
@@ -117,7 +124,7 @@

 model = SimpleModel(num_channels, num_classes)

-optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
+optimizer = DecoupledSGDW(model.parameters(), lr=0.001)

 scheduler = CosineAnnealingLR(optimizer, T_max=1)

@@ -188,7 +195,7 @@ def _new_trainer_init(self, fake_ellipses: None = None, **kwargs: Any):
     if 'model' not in kwargs:
         kwargs['model'] = model
     if 'optimizers' not in kwargs:
-        kwargs['optimizers'] = torch.optim.SGD(kwargs['model'].parameters(), lr=0.01)
+        kwargs['optimizers'] = DecoupledSGDW(kwargs['model'].parameters(), lr=0.01)
     if 'schedulers' not in kwargs:
         kwargs['schedulers'] = ConstantScheduler()
     if 'max_duration' not in kwargs:
diff --git a/docs/source/trainer/checkpointing.rst b/docs/source/trainer/checkpointing.rst
index 0cb2f39898..36f0b358a7 100644
--- a/docs/source/trainer/checkpointing.rst
+++ b/docs/source/trainer/checkpointing.rst
@@ -531,10 +531,10 @@ object stores like WandB or LibCloud, you must still specify a ``load_object_sto
     :skipif: not _LIBCLOUD_INSTALLED

     new_trainer = Trainer(
-            model=model,
-            train_dataloader=train_dataloader,
-            max_duration="10ep",
-            load_path="s3://checkpoint-debugging/checkpoints/ep1.pt",
+        model=model,
+        train_dataloader=train_dataloader,
+        max_duration="10ep",
+        load_path="s3://checkpoint-debugging/checkpoints/ep1.pt",
     )

     new_trainer.fit()
@@ -547,10 +547,10 @@ Similarly for OCI:
     :skipif: not _LIBCLOUD_INSTALLED

     new_trainer = Trainer(
-            model=model,
-            train_dataloader=train_dataloader,
-            max_duration="10ep",
-            load_path="oci://checkpoint-debugging/checkpoints/ep1.pt",
+        model=model,
+        train_dataloader=train_dataloader,
+        max_duration="10ep",
+        load_path="oci://checkpoint-debugging/checkpoints/ep1.pt",
     )

     new_trainer.fit()
@@ -564,10 +564,10 @@ Similarly for GCS:
     :skipif: not _LIBCLOUD_INSTALLED

     new_trainer = Trainer(
-            model=model,
-            train_dataloader=train_dataloader,
-            max_duration="10ep",
-            load_path="gs://checkpoint-debugging/checkpoints/ep1.pt",
+        model=model,
+        train_dataloader=train_dataloader,
+        max_duration="10ep",
+        load_path="gs://checkpoint-debugging/checkpoints/ep1.pt",
     )

     new_trainer.fit()
diff --git a/pyproject.toml b/pyproject.toml
index f153616f0d..7b7a54dde7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -164,6 +164,10 @@ filterwarnings = [
    '''ignore:The 'transformers' MLflow Models integration.*:UserWarning''',
    # Ignore our own deprecation warnings,
    '''ignore::composer.utils.warnings.VersionedDeprecationWarning''',
+    # Ignore deprecation warning for torch.load
+    '''ignore:You are using `torch.load` with `weights_only=False`.*:FutureWarning''',
+    # Ignore deprecation warning as DeepSpeed uses old path
+    '''ignore:.*torch.cuda.amp.custom.*:FutureWarning''',
 ]

 # Coverage
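The two new pytest filters cover warnings introduced with PyTorch 2.4: the `FutureWarning` that `torch.load` emits when `weights_only` is left at its default of `False`, and the deprecation warning triggered by DeepSpeed's use of the old `torch.cuda.amp.custom_*` decorators. For reference, a small sketch of the `torch.load` behavior being filtered (the scratch file path is purely illustrative):

```python
import os
import tempfile

import torch

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'state.pt')
    torch.save({'weight': torch.randn(2, 2)}, path)

    # PyTorch 2.4 warns here: "You are using `torch.load` with `weights_only=False`..."
    state = torch.load(path)

    # Restricted unpickling: no warning, and sufficient for plain tensor/state-dict payloads.
    state = torch.load(path, weights_only=True)
```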
diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py
index 47ced249db..824be91646 100644
--- a/tests/algorithms/test_required_on_load.py
+++ b/tests/algorithms/test_required_on_load.py
@@ -174,7 +174,18 @@ def test_autoload(
         context = pytest.warns(UserWarning, match='Automatically adding required_on_load algorithm*')
     # Excluding some algorithms leads to errors when loading
     elif exclude:
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0'):
+            if algo_name in [
+                'BlurPool',
+                'Factorize',
+                'GatedLinearUnits',
+                'GhostBatchNorm',
+                'SqueezeExcite',
+            ]:
+                context = pytest.raises(KeyError)  # Optimizer loading is strict
+            elif algo_name == 'Alibi':
+                context = pytest.raises(RuntimeError)  # Alibi has shape issues
+        elif version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
             if algo_name in [
                 'Alibi',
                 'BlurPool',
diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py
index ede864d13b..d91b1beea6 100644
--- a/tests/trainer/test_checkpoint.py
+++ b/tests/trainer/test_checkpoint.py
@@ -996,7 +996,9 @@ def test_strict_errors(self, missing_key: bool, unexpected_key: bool):
         last_checkpoint = os.path.join('first', 'ep2.pt')
         if missing_key or unexpected_key:
             message = r'Error\(s\) in loading state_dict'
-            if version.parse(torch.__version__) < version.parse('2.2.3') or not dist.is_initialized():
+            if version.parse(torch.__version__) < version.parse('2.2.3') or (
+                version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()
+            ):
                 # Composer implements strict for older torch versions
                 message = 'Failed to load checkpoint due to'
             error_context = pytest.raises(RuntimeError, match=message)
@@ -1354,7 +1356,9 @@ def test_autoload_algorithm_old_checkpoint(self):
         NoOpModel.__init__ = lambda self, x: None  # type: ignore
         NoOpModel.__repr__ = lambda self: 'NoOpModel(3)'
         error_context = pytest.raises(KeyError, match='module.0.weight')
-        if version.parse(torch.__version__) < version.parse('2.2.3') or not dist.is_initialized():
+        if version.parse(torch.__version__) < version.parse('2.2.3') or (
+            version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()
+        ):
             error_context = pytest.raises(ValueError, match='loaded state dict contains a parameter group.*')
         with pytest.warns(UserWarning, match='required_on_load algorithm.*'), error_context:
             trainer_3 = self.get_trainer(load_path=os.path.join('first', 'ep1.pt'))
diff --git a/tests/trainer/test_ddp_sync_strategy.py b/tests/trainer/test_ddp_sync_strategy.py
index 05efeb6960..dfe711b92f 100644
--- a/tests/trainer/test_ddp_sync_strategy.py
+++ b/tests/trainer/test_ddp_sync_strategy.py
@@ -6,6 +6,7 @@
 import pytest
 import torch
 import torch.nn as nn
+from packaging import version
 from torch import Tensor
 from torch.utils.data import DataLoader

@@ -45,7 +46,11 @@ def loss(self, output: Tensor, target: Tensor):
 @pytest.mark.parametrize(
     'ddp_sync_strategy,expected_grads',
     [
-        pytest.param('single_auto_sync', ([-1, None, None], [-1, -1.5, None], [-1, -1.5, None]), id='single_auto_sync'),
+        pytest.param(
+            'single_auto_sync',
+            ([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]),
+            id='single_auto_sync',
+        ),
         pytest.param(
             'multi_auto_sync',
             ([-1.5, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]),
@@ -61,8 +66,9 @@ def test_ddp_sync_strategy(
     rank_zero_seed: int,
     request: pytest.FixtureRequest,
 ):
+    if version.parse(torch.__version__) < version.parse('2.4.0'):
+        pytest.skip('Before PyTorch 2.4, single_auto_sync did not properly run on last microbatch')
     original_model = MinimalConditionalModel()
-    # ddp = DDP(backend="gloo", find_unused_parameters=True, sync_strategy=ddp_sync_strategy, timeout=5.)
     optimizer = torch.optim.SGD(original_model.parameters(), 0.1)
     device = None
     for item in request.session.items:
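The test changes above all follow the same version-gating pattern: parse `torch.__version__` with `packaging.version` and either adjust the expected error or skip the test outright. A minimal, generic sketch of that pattern (the `requires_torch` marker is a hypothetical helper, not part of Composer's test suite):

```python
import pytest
import torch
from packaging import version


def requires_torch(min_version: str):
    """Skip the decorated test when the installed torch is older than ``min_version``."""
    return pytest.mark.skipif(
        version.parse(torch.__version__) < version.parse(min_version),
        reason=f'requires torch >= {min_version}',
    )


@requires_torch('2.4.0')
def test_torch_24_only_behavior():
    # Only runs on torch >= 2.4.0; older installs report the test as skipped.
    assert version.parse(torch.__version__) >= version.parse('2.4.0')
```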