From 539dded443f1e178297858adbef4009d1101df21 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 19:23:12 -0400 Subject: [PATCH 01/30] add torch 24 tests --- .github/workflows/daily.yaml | 52 +++++++++++++++++++++++++++-------- .github/workflows/pr-cpu.yaml | 6 +++- .github/workflows/pr-gpu.yaml | 12 ++++---- 3 files changed, 51 insertions(+), 19 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 5552d6c19c..8ca5b6fc44 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -22,18 +22,23 @@ jobs: markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - - name: cpu-3.11-2.2-composer - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not doctest - pytest_command: coverage run -m pytest - composer_package_name: composer - name: cpu-3.11-2.3 container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml + - name: cpu-3.11-2.4 + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + markers: not daily and (remote or not remote) and not gpu and not doctest + pytest_command: coverage run -m pytest + composer_package_name: mosaicml + - name: cpu-3.11-2.4-composer + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + markers: not daily and (remote or not remote) and not gpu and not doctest + pytest_command: coverage run -m pytest + composer_package_name: composer - name: cpu-doctest - container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 + container: mosaicml/pytorch:2.4.0_cpu-python3.10-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml @@ -42,18 +47,23 @@ jobs: markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - - name: daily-cpu-3.11-2.2-composer - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 + - name: daily-cpu-3.11-2.3 + container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest - composer_package_name: composer - - name: daily-cpu-3.11-2.3-composer - container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + composer_package_name: mosaicml + - name: daily-cpu-3.11-2.4 + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + markers: daily and (remote or not remote) and not gpu and not doctest + pytest_command: coverage run -m pytest + composer_package_name: mosaicml + - name: daily-cpu-3.11-2.4-composer + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: daily-cpu-doctest - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml @@ -104,6 +114,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 1 + - name: 
"gpu-3.11-2.4-1-gpu" + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 + markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + pytest_command: "coverage run -m pytest" + composer_package_name: "mosaicml" + gpu_num: 1 - name: "gpu-3.11-2.2-2-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -116,6 +132,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 2 + - name: "gpu-3.11-2.4-2-gpu" + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 + markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + pytest_command: "coverage run -m pytest" + composer_package_name: "mosaicml" + gpu_num: 2 - name: "gpu-3.11-2.2-4-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -128,6 +150,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 4 + - name: "gpu-3.11-2.4-4-gpu" + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 + markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + pytest_command: "coverage run -m pytest" + composer_package_name: "mosaicml" + gpu_num: 4 name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 4d44e69824..1483fc060e 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -21,8 +21,12 @@ jobs: container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest + - name: cpu-3.11-2.4 + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + markers: not daily and not remote and not gpu and not doctest + pytest_command: coverage run -m pytest - name: cpu-doctest - container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py name: ${{ matrix.name }} diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 2f335a5a68..625c06072a 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -13,8 +13,8 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.3-1 - container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - name: gpu-3.11-2.4-1 + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -39,8 +39,8 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.3-2 - container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - name: gpu-3.11-2.342 + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -66,8 +66,8 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.3-4 - container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - name: gpu-3.11-2.4-4 + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 markers: not daily and 
not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml From 9611fe6dc241ad299f521e4cc1e586ef14b337cd Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 20:14:25 -0400 Subject: [PATCH 02/30] filter warnings --- composer/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 27323718fc..b36572db4c 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1725,7 +1725,7 @@ def __init__( # Suppressing GradScaler warnings as they are always created # self._use_grad_scaling() will raise a RuntimeError if grad scaling is not available when it is required - warnings.filterwarnings(action='ignore', message='torch.cuda.amp.GradScaler') + warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.GradScaler.*') self.state.scaler = ClosureGradScaler() if self._use_closures() else GradScaler() if self.state.fsdp_config is not None: From 4af03202f8c4a77ea3156a130020c8c0a1df86a4 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 20:32:34 -0400 Subject: [PATCH 03/30] fix --- .github/workflows/pr-cpu.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 1483fc060e..caa3e0fb05 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -25,10 +25,10 @@ jobs: container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - - name: cpu-doctest - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 - markers: not daily and not remote and not gpu and doctest - pytest_command: coverage run -m pytest tests/test_docs.py + # - name: cpu-doctest + # container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + # markers: not daily and not remote and not gpu and doctest + # pytest_command: coverage run -m pytest tests/test_docs.py name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: From ada84f0590cad1fc0c1fedb2ad235878c3531881 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 20:54:16 -0400 Subject: [PATCH 04/30] filter deprecation warning --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index f153616f0d..a763ec3bf5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -164,6 +164,10 @@ filterwarnings = [ '''ignore:The 'transformers' MLflow Models integration.*:UserWarning''', # Ignore our own deprecation warnings, '''ignore::composer.utils.warnings.VersionedDeprecationWarning''', + # Ignore deprecation warning for torch.load + '''ignore::.*You are using `torch.load` with `weights_only=False`.*:FutureWarning''', + # Ignore deprecation warning as DeepSpeed uses old path + '''ignore:.*`torch.cuda.amp.custom_fwd(args...)` is deprecated.*:FutureWarning''', ] # Coverage From 922651d534c41f7b850c7849eb4e98442ad96998 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 20:58:06 -0400 Subject: [PATCH 05/30] fix filter --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a763ec3bf5..7ddbc9f1a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -165,9 +165,9 @@ filterwarnings = [ # Ignore our own deprecation warnings, '''ignore::composer.utils.warnings.VersionedDeprecationWarning''', # Ignore deprecation warning for 
torch.load - '''ignore::.*You are using `torch.load` with `weights_only=False`.*:FutureWarning''', + '''ignore:You are using `torch.load` with `weights_only=False`.*:FutureWarning''', # Ignore deprecation warning as DeepSpeed uses old path - '''ignore:.*`torch.cuda.amp.custom_fwd(args...)` is deprecated.*:FutureWarning''', + '''ignore:`torch.cuda.amp.custom_fwd(args...)` is deprecated.*:FutureWarning''', ] # Coverage From 93c9a6e9a970030d0bd4c9eb2af61fdca905f93f Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 21:15:55 -0400 Subject: [PATCH 06/30] fix filter --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7ddbc9f1a5..ce281a7279 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,7 +167,7 @@ filterwarnings = [ # Ignore deprecation warning for torch.load '''ignore:You are using `torch.load` with `weights_only=False`.*:FutureWarning''', # Ignore deprecation warning as DeepSpeed uses old path - '''ignore:`torch.cuda.amp.custom_fwd(args...)` is deprecated.*:FutureWarning''', + '''ignore:.*torch.cuda.amp.custom_fwd.*:FutureWarning''', ] # Coverage From 23e77b3ea820f3b228fbdad9961bea2c0bfff312 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 12:07:19 -0400 Subject: [PATCH 07/30] fix regex --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ce281a7279..7b7a54dde7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,7 +167,7 @@ filterwarnings = [ # Ignore deprecation warning for torch.load '''ignore:You are using `torch.load` with `weights_only=False`.*:FutureWarning''', # Ignore deprecation warning as DeepSpeed uses old path - '''ignore:.*torch.cuda.amp.custom_fwd.*:FutureWarning''', + '''ignore:.*torch.cuda.amp.custom.*:FutureWarning''', ] # Coverage From 5eacf6a157e768c5494aa1f5529e4154416ca527 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 12:27:39 -0400 Subject: [PATCH 08/30] restore magic mock --- composer/core/state.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/composer/core/state.py b/composer/core/state.py index 7c43473ace..73244095ee 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -1445,8 +1445,7 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True): # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing # errors) before discarding the output. Accordingly, we mock the state dict. # See: https://github.com/pytorch/pytorch/issues/125177 - if version.parse(torch.__version__) < version.parse('2.4.0'): - optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict + optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict set_optimizer_state_dict( model=self.model, optimizers=optimizer, From 4317a6bbf7cf78bae9c43113e09d1796b0e64348 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 12:48:02 -0400 Subject: [PATCH 09/30] revert magicmock --- composer/core/state.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer/core/state.py b/composer/core/state.py index 73244095ee..7c43473ace 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -1445,7 +1445,8 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True): # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing # errors) before discarding the output. Accordingly, we mock the state dict. 
# See: https://github.com/pytorch/pytorch/issues/125177 - optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict + if version.parse(torch.__version__) < version.parse('2.4.0'): + optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict set_optimizer_state_dict( model=self.model, optimizers=optimizer, From ba968dff4c2c2e3b70c64bce2df937fa2cea9bf9 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 18:28:33 +0000 Subject: [PATCH 10/30] fix warnings --- composer/core/precision.py | 11 ++++++----- tests/trainer/test_checkpoint.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/composer/core/precision.py b/composer/core/precision.py index 2723159261..4e7091bd24 100644 --- a/composer/core/precision.py +++ b/composer/core/precision.py @@ -5,6 +5,7 @@ import contextlib import textwrap +from packaging import version from typing import Any, Generator, Optional, Union import torch @@ -26,9 +27,9 @@ class Precision(StringEnum): Attributes: FP32: Use 32-bit floating-point precision. Compatible with CPUs and GPUs. - AMP_FP16: Use :mod:`torch.cuda.amp` with 16-bit floating-point precision. Only compatible + AMP_FP16: Use :mod:`torch.amp` with 16-bit floating-point precision. Only compatible with GPUs. - AMP_BF16: Use :mod:`torch.cuda.amp` with 16-bit BFloat precision. + AMP_BF16: Use :mod:`torch.amp` with 16-bit BFloat precision. AMP_FP8: Use :mod:`transformer_engine.pytorch.fp8_autocast` with 8-bit FP8 precison. """ FP32 = 'fp32' @@ -60,7 +61,7 @@ def get_precision_context( precision = Precision(precision) if precision == Precision.FP32: if torch.cuda.is_available(): - with torch.cuda.amp.autocast(False): + with torch.autocast('cuda', enabled=False): yield else: # Yield here to avoid warnings about cuda not being available @@ -68,7 +69,7 @@ def get_precision_context( elif precision == Precision.AMP_FP16: # Retain compatibility with PyTorch < 1.10 if torch.cuda.is_available(): - with torch.cuda.amp.autocast(True): + with torch.autocast('cuda', enabled=True): yield elif is_xla_installed(): with torch.autocast('xla', dtype=torch.float16): @@ -77,7 +78,7 @@ def get_precision_context( yield elif precision == Precision.AMP_BF16: if torch.cuda.is_available(): - with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + with torch.autocast('cuda', dtype=torch.bfloat16, enabled=True): yield elif is_xla_installed(): with torch.autocast('xla', dtype=torch.bfloat16): diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index ede864d13b..892ea5434b 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -996,7 +996,7 @@ def test_strict_errors(self, missing_key: bool, unexpected_key: bool): last_checkpoint = os.path.join('first', 'ep2.pt') if missing_key or unexpected_key: message = r'Error\(s\) in loading state_dict' - if version.parse(torch.__version__) < version.parse('2.2.3') or not dist.is_initialized(): + if version.parse(torch.__version__) < version.parse('2.2.3') or (version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()): # Composer implements strict for older torch versions message = 'Failed to load checkpoint due to' error_context = pytest.raises(RuntimeError, match=message) From f072bc1252440835fa6058b5bdc391dd5d7d8342 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 14:29:36 -0400 Subject: [PATCH 11/30] fix amp warnings --- composer/core/precision.py | 1 - tests/trainer/test_checkpoint.py | 4 +++- 2 
files changed, 3 insertions(+), 2 deletions(-) diff --git a/composer/core/precision.py b/composer/core/precision.py index 4e7091bd24..c3aa06643b 100644 --- a/composer/core/precision.py +++ b/composer/core/precision.py @@ -5,7 +5,6 @@ import contextlib import textwrap -from packaging import version from typing import Any, Generator, Optional, Union import torch diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index 892ea5434b..70a38f44ba 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -996,7 +996,9 @@ def test_strict_errors(self, missing_key: bool, unexpected_key: bool): last_checkpoint = os.path.join('first', 'ep2.pt') if missing_key or unexpected_key: message = r'Error\(s\) in loading state_dict' - if version.parse(torch.__version__) < version.parse('2.2.3') or (version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()): + if version.parse(torch.__version__) < version.parse('2.2.3') or ( + version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized() + ): # Composer implements strict for older torch versions message = 'Failed to load checkpoint due to' error_context = pytest.raises(RuntimeError, match=message) From 7f0e81dffb1808ad1657d2103a6136a7e1ff4b72 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 18:40:59 +0000 Subject: [PATCH 12/30] fix test --- tests/algorithms/test_required_on_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py index 47ced249db..b5216955fb 100644 --- a/tests/algorithms/test_required_on_load.py +++ b/tests/algorithms/test_required_on_load.py @@ -174,7 +174,7 @@ def test_autoload( context = pytest.warns(UserWarning, match='Automatically adding required_on_load algorithm*') # Excluding some algorithms leads to errors when loading elif exclude: - if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): + if version.parse(torch.__version__) >= version.parse('2.4.0') or (version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()): if algo_name in [ 'Alibi', 'BlurPool', From fc8d7f46b280c043ff973ddd98e6468591718a29 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 14:44:11 -0400 Subject: [PATCH 13/30] lint --- tests/algorithms/test_required_on_load.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py index b5216955fb..ce62d6aefe 100644 --- a/tests/algorithms/test_required_on_load.py +++ b/tests/algorithms/test_required_on_load.py @@ -174,7 +174,9 @@ def test_autoload( context = pytest.warns(UserWarning, match='Automatically adding required_on_load algorithm*') # Excluding some algorithms leads to errors when loading elif exclude: - if version.parse(torch.__version__) >= version.parse('2.4.0') or (version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()): + if version.parse(torch.__version__) >= version.parse('2.4.0') or ( + version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() + ): if algo_name in [ 'Alibi', 'BlurPool', From 92ea9ce46dcc34c3fef6450c1ecddd5cdef5085a Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 18:50:43 +0000 Subject: [PATCH 14/30] fix errors --- tests/algorithms/test_required_on_load.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 
deletions(-) diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py index ce62d6aefe..824be91646 100644 --- a/tests/algorithms/test_required_on_load.py +++ b/tests/algorithms/test_required_on_load.py @@ -174,9 +174,18 @@ def test_autoload( context = pytest.warns(UserWarning, match='Automatically adding required_on_load algorithm*') # Excluding some algorithms leads to errors when loading elif exclude: - if version.parse(torch.__version__) >= version.parse('2.4.0') or ( - version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() - ): + if version.parse(torch.__version__) >= version.parse('2.4.0'): + if algo_name in [ + 'BlurPool', + 'Factorize', + 'GatedLinearUnits', + 'GhostBatchNorm', + 'SqueezeExcite', + ]: + context = pytest.raises(KeyError) # Optimizer loading is strict + elif algo_name == 'Alibi': + context = pytest.raises(RuntimeError) # Alibi has shape issues + elif version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): if algo_name in [ 'Alibi', 'BlurPool', From 0ba2c47183689c892b3f8f449012fdace4d97293 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 20:02:03 +0000 Subject: [PATCH 15/30] fix last test --- tests/trainer/test_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index 70a38f44ba..eb4e60f7d9 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -1356,7 +1356,7 @@ def test_autoload_algorithm_old_checkpoint(self): NoOpModel.__init__ = lambda self, x: None # type: ignore NoOpModel.__repr__ = lambda self: 'NoOpModel(3)' error_context = pytest.raises(KeyError, match='module.0.weight') - if version.parse(torch.__version__) < version.parse('2.2.3') or not dist.is_initialized(): + if version.parse(torch.__version__) < version.parse('2.2.3') or (version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()): error_context = pytest.raises(ValueError, match='loaded state dict contains a parameter group.*') with pytest.warns(UserWarning, match='required_on_load algorithm.*'), error_context: trainer_3 = self.get_trainer(load_path=os.path.join('first', 'ep1.pt')) From fedfbeeb637f7808fecbb1142a577acf835897f4 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 16:02:54 -0400 Subject: [PATCH 16/30] lint --- tests/trainer/test_checkpoint.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index eb4e60f7d9..d91b1beea6 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -1356,7 +1356,9 @@ def test_autoload_algorithm_old_checkpoint(self): NoOpModel.__init__ = lambda self, x: None # type: ignore NoOpModel.__repr__ = lambda self: 'NoOpModel(3)' error_context = pytest.raises(KeyError, match='module.0.weight') - if version.parse(torch.__version__) < version.parse('2.2.3') or (version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()): + if version.parse(torch.__version__) < version.parse('2.2.3') or ( + version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized() + ): error_context = pytest.raises(ValueError, match='loaded state dict contains a parameter group.*') with pytest.warns(UserWarning, match='required_on_load algorithm.*'), error_context: trainer_3 = self.get_trainer(load_path=os.path.join('first', 'ep1.pt')) From 
d77f82589bd77489d3fb775da60d97a9bc139fc9 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 16:03:08 -0400 Subject: [PATCH 17/30] enable doctests --- .github/workflows/pr-cpu.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index caa3e0fb05..1483fc060e 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -25,10 +25,10 @@ jobs: container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - # - name: cpu-doctest - # container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 - # markers: not daily and not remote and not gpu and doctest - # pytest_command: coverage run -m pytest tests/test_docs.py + - name: cpu-doctest + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + markers: not daily and not remote and not gpu and doctest + pytest_command: coverage run -m pytest tests/test_docs.py name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: From 78cfcb08c235cb8dacf21980ef8f6fb9bb8a0922 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 18:31:09 -0400 Subject: [PATCH 18/30] fix doctests --- docs/source/doctest_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index f54d1f69e1..a0583d422d 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -53,7 +53,7 @@ from composer.loggers import Logger as Logger from composer.loggers import RemoteUploaderDownloader from composer.models import ComposerModel as ComposerModel -from composer.optim.scheduler import ConstantScheduler +from composer.optim import ConstantScheduler, DecoupledSGDW from composer.utils import LibcloudObjectStore, RemoteUploader from composer.utils import ensure_tuple as ensure_tuple @@ -117,7 +117,7 @@ model = SimpleModel(num_channels, num_classes) -optimizer = torch.optim.SGD(model.parameters(), lr=0.001) +optimizer = DecoupledSGDW(model.parameters(), lr=0.001) scheduler = CosineAnnealingLR(optimizer, T_max=1) From ca2c1f91b665973a74388b7de94d5208659ccddf Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 18:43:05 -0400 Subject: [PATCH 19/30] fix doctests --- docs/source/doctest_fixtures.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index a0583d422d..90e9ca6ed0 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -19,6 +19,7 @@ from typing import Any from typing import Callable as Callable from urllib.parse import urlparse +import warnings import numpy as np import pytest @@ -57,6 +58,9 @@ from composer.utils import LibcloudObjectStore, RemoteUploader from composer.utils import ensure_tuple as ensure_tuple +# Ignore certain warnings for doctest +warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*') + try: import wandb _WANDB_INSTALLED = True From 7215c68928ed23f2e91c51fe21ab7285704d6fc9 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 23:08:08 +0000 Subject: [PATCH 20/30] switch to decoupled adamw --- docs/source/doctest_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 90e9ca6ed0..56d9073f77 100644 --- a/docs/source/doctest_fixtures.py +++ 
b/docs/source/doctest_fixtures.py @@ -192,7 +192,7 @@ def _new_trainer_init(self, fake_ellipses: None = None, **kwargs: Any): if 'model' not in kwargs: kwargs['model'] = model if 'optimizers' not in kwargs: - kwargs['optimizers'] = torch.optim.SGD(kwargs['model'].parameters(), lr=0.01) + kwargs['optimizers'] = DecoupledSGDW(kwargs['model'].parameters(), lr=0.01) if 'schedulers' not in kwargs: kwargs['schedulers'] = ConstantScheduler() if 'max_duration' not in kwargs: From fac29614691c86c7c2d87aa9958c9d9717511389 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 19:29:51 -0400 Subject: [PATCH 21/30] lint --- docs/source/doctest_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 56d9073f77..2fccf71b1d 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -16,10 +16,10 @@ import os import sys import tempfile +import warnings from typing import Any from typing import Callable as Callable from urllib.parse import urlparse -import warnings import numpy as np import pytest From 6432889b34725ef7299f3ae2ea659265f3c93102 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 06:09:42 +0000 Subject: [PATCH 22/30] fix tests --- composer/trainer/trainer.py | 8 ++++++++ docs/source/trainer/checkpointing.rst | 24 ++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index b36572db4c..c09702a65f 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -2442,6 +2442,14 @@ def fit( self.first_batch_complete = False self._train_loop() + # Zero gradients at the end of fit so same model/optimizer can be used for further training + # with checkpoint loading. See https://github.com/pytorch/pytorch/issues/133415 + for optimizer in self.state.optimizers: + try: + optimizer.zero_grad(set_to_none=True) + except TypeError: + optimizer.zero_grad() + def close(self): """Shutdown the trainer. 
diff --git a/docs/source/trainer/checkpointing.rst b/docs/source/trainer/checkpointing.rst index 0cb2f39898..36f0b358a7 100644 --- a/docs/source/trainer/checkpointing.rst +++ b/docs/source/trainer/checkpointing.rst @@ -531,10 +531,10 @@ object stores like WandB or LibCloud, you must still specify a ``load_object_sto :skipif: not _LIBCLOUD_INSTALLED new_trainer = Trainer( - model=model, - train_dataloader=train_dataloader, - max_duration="10ep", - load_path="s3://checkpoint-debugging/checkpoints/ep1.pt", + model=model, + train_dataloader=train_dataloader, + max_duration="10ep", + load_path="s3://checkpoint-debugging/checkpoints/ep1.pt", ) new_trainer.fit() @@ -547,10 +547,10 @@ Similarly for OCI: :skipif: not _LIBCLOUD_INSTALLED new_trainer = Trainer( - model=model, - train_dataloader=train_dataloader, - max_duration="10ep", - load_path="oci://checkpoint-debugging/checkpoints/ep1.pt", + model=model, + train_dataloader=train_dataloader, + max_duration="10ep", + load_path="oci://checkpoint-debugging/checkpoints/ep1.pt", ) new_trainer.fit() @@ -564,10 +564,10 @@ Similarly for GCS: :skipif: not _LIBCLOUD_INSTALLED new_trainer = Trainer( - model=model, - train_dataloader=train_dataloader, - max_duration="10ep", - load_path="gs://checkpoint-debugging/checkpoints/ep1.pt", + model=model, + train_dataloader=train_dataloader, + max_duration="10ep", + load_path="gs://checkpoint-debugging/checkpoints/ep1.pt", ) new_trainer.fit() From 333c2345442b42227f4c9bda7c2d9d4f1ecd31a8 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 02:10:31 -0400 Subject: [PATCH 23/30] revert to sgd --- docs/source/doctest_fixtures.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 2fccf71b1d..e152e740f8 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -54,7 +54,7 @@ from composer.loggers import Logger as Logger from composer.loggers import RemoteUploaderDownloader from composer.models import ComposerModel as ComposerModel -from composer.optim import ConstantScheduler, DecoupledSGDW +from composer.optim import ConstantScheduler from composer.utils import LibcloudObjectStore, RemoteUploader from composer.utils import ensure_tuple as ensure_tuple @@ -121,7 +121,7 @@ model = SimpleModel(num_channels, num_classes) -optimizer = DecoupledSGDW(model.parameters(), lr=0.001) +optimizer = torch.optim.SGD(model.parameters(), lr=0.001) scheduler = CosineAnnealingLR(optimizer, T_max=1) @@ -192,7 +192,7 @@ def _new_trainer_init(self, fake_ellipses: None = None, **kwargs: Any): if 'model' not in kwargs: kwargs['model'] = model if 'optimizers' not in kwargs: - kwargs['optimizers'] = DecoupledSGDW(kwargs['model'].parameters(), lr=0.01) + kwargs['optimizers'] = torch.optim.SGD(kwargs['model'].parameters(), lr=0.01) if 'schedulers' not in kwargs: kwargs['schedulers'] = ConstantScheduler() if 'max_duration' not in kwargs: From 7332a2b5db165aeb8957bafc2cdd226eee44e92f Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 02:19:53 -0400 Subject: [PATCH 24/30] add more warnings filters --- docs/source/doctest_fixtures.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index e152e740f8..67c0cd7723 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -59,7 +59,10 @@ from composer.utils import ensure_tuple as ensure_tuple # Ignore certain warnings for doctest 
-warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*') +warnings.filterwarnings(action='ignore', message='.*Deterministic mode.*') # Expected +warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*') # DeepSpeed +warnings.filterwarnings(action='ignore', message='.*The distutils.sysconfig module*') # DeepSpeed + try: import wandb From 691167a49068a220eb581e636b6e4d6967299a2d Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 02:33:06 -0400 Subject: [PATCH 25/30] decoupled adamw --- docs/source/doctest_fixtures.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 67c0cd7723..60dea99ff9 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -54,16 +54,16 @@ from composer.loggers import Logger as Logger from composer.loggers import RemoteUploaderDownloader from composer.models import ComposerModel as ComposerModel -from composer.optim import ConstantScheduler +from composer.optim import ConstantScheduler, DecoupledSGDW from composer.utils import LibcloudObjectStore, RemoteUploader from composer.utils import ensure_tuple as ensure_tuple # Ignore certain warnings for doctest warnings.filterwarnings(action='ignore', message='.*Deterministic mode.*') # Expected +warnings.filterwarnings(action='ignore', message='.*Some weights of Bert*') # Expected warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*') # DeepSpeed warnings.filterwarnings(action='ignore', message='.*The distutils.sysconfig module*') # DeepSpeed - try: import wandb _WANDB_INSTALLED = True @@ -124,7 +124,7 @@ model = SimpleModel(num_channels, num_classes) -optimizer = torch.optim.SGD(model.parameters(), lr=0.001) +optimizer = DecoupledSGDW(model.parameters(), lr=0.001) scheduler = CosineAnnealingLR(optimizer, T_max=1) @@ -195,7 +195,7 @@ def _new_trainer_init(self, fake_ellipses: None = None, **kwargs: Any): if 'model' not in kwargs: kwargs['model'] = model if 'optimizers' not in kwargs: - kwargs['optimizers'] = torch.optim.SGD(kwargs['model'].parameters(), lr=0.01) + kwargs['optimizers'] = DecoupledSGDW(kwargs['model'].parameters(), lr=0.01) if 'schedulers' not in kwargs: kwargs['schedulers'] = ConstantScheduler() if 'max_duration' not in kwargs: From 2f05c221010a72f40564aecf7380b6966e334fda Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 11:29:55 -0400 Subject: [PATCH 26/30] update logging --- composer/trainer/trainer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index c09702a65f..695136f552 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -2446,9 +2446,12 @@ def fit( # with checkpoint loading. See https://github.com/pytorch/pytorch/issues/133415 for optimizer in self.state.optimizers: try: - optimizer.zero_grad(set_to_none=True) - except TypeError: - optimizer.zero_grad() + try: + optimizer.zero_grad(set_to_none=True) + except TypeError: + optimizer.zero_grad() + except: + log.exception('Failed to zero out optimizer at end of fit') def close(self): """Shutdown the trainer. 
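A note for readers following the series: the end-of-fit cleanup introduced in PATCH 22/30 and hardened in PATCH 26/30 reduces to the defensive pattern below. This is a minimal standalone sketch for illustration only — the `zero_grads_after_fit` helper name is hypothetical (inside Composer the loop lives at the end of `Trainer.fit`), and `except Exception` stands in for the patch's bare `except`.

import logging

log = logging.getLogger(__name__)

def zero_grads_after_fit(optimizers):
    # Zero gradients so the same model/optimizer pair can keep training after
    # a checkpoint load; see https://github.com/pytorch/pytorch/issues/133415.
    for optimizer in optimizers:
        try:
            try:
                # set_to_none=True drops the gradient tensors entirely rather
                # than filling them with zeros, which also frees memory.
                optimizer.zero_grad(set_to_none=True)
            except TypeError:
                # Fallback for custom optimizers whose zero_grad() predates
                # the set_to_none keyword.
                optimizer.zero_grad()
        except Exception:
            # Mirror the trainer's intent: cleanup must never take down a
            # completed fit() call.
            log.exception('Failed to zero out optimizer at end of fit')
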
From 4f5b592baac43be95be1d354a42a5f892d3fbdeb Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 17:38:33 +0000 Subject: [PATCH 27/30] fix test --- tests/trainer/test_ddp_sync_strategy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_ddp_sync_strategy.py b/tests/trainer/test_ddp_sync_strategy.py index 05efeb6960..838d3cbaa3 100644 --- a/tests/trainer/test_ddp_sync_strategy.py +++ b/tests/trainer/test_ddp_sync_strategy.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Optional +from packaging import version import pytest import torch @@ -45,7 +46,7 @@ def loss(self, output: Tensor, target: Tensor): @pytest.mark.parametrize( 'ddp_sync_strategy,expected_grads', [ - pytest.param('single_auto_sync', ([-1, None, None], [-1, -1.5, None], [-1, -1.5, None]), id='single_auto_sync'), + pytest.param('single_auto_sync', ([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), id='single_auto_sync'), pytest.param( 'multi_auto_sync', ([-1.5, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), @@ -61,8 +62,9 @@ def test_ddp_sync_strategy( rank_zero_seed: int, request: pytest.FixtureRequest, ): + if version.parse(torch.__version__) < version.parse('2.4.0'): + pytest.skip('Before PyTorch 2.4, single_auto_sync did not properly run on last microbatch') original_model = MinimalConditionalModel() - # ddp = DDP(backend="gloo", find_unused_parameters=True, sync_strategy=ddp_sync_strategy, timeout=5.) optimizer = torch.optim.SGD(original_model.parameters(), 0.1) device = None for item in request.session.items: From db381a7322f7139e0f2ffd64dc2f8bee0dd96bd6 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 13:56:26 -0400 Subject: [PATCH 28/30] lint --- tests/trainer/test_ddp_sync_strategy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_ddp_sync_strategy.py b/tests/trainer/test_ddp_sync_strategy.py index 838d3cbaa3..9a25d30f8b 100644 --- a/tests/trainer/test_ddp_sync_strategy.py +++ b/tests/trainer/test_ddp_sync_strategy.py @@ -2,11 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Optional -from packaging import version import pytest import torch import torch.nn as nn +from packaging import version from torch import Tensor from torch.utils.data import DataLoader @@ -46,7 +46,9 @@ def loss(self, output: Tensor, target: Tensor): @pytest.mark.parametrize( 'ddp_sync_strategy,expected_grads', [ - pytest.param('single_auto_sync', ([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), id='single_auto_sync'), + pytest.param( + 'single_auto_sync', ([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), id='single_auto_sync' + ), pytest.param( 'multi_auto_sync', ([-1.5, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), From 0df4cb3cd5d43d512bf007f46ad6169146541b96 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 14:50:56 -0400 Subject: [PATCH 29/30] lit --- tests/trainer/test_ddp_sync_strategy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/trainer/test_ddp_sync_strategy.py b/tests/trainer/test_ddp_sync_strategy.py index 9a25d30f8b..dfe711b92f 100644 --- a/tests/trainer/test_ddp_sync_strategy.py +++ b/tests/trainer/test_ddp_sync_strategy.py @@ -47,7 +47,9 @@ def loss(self, output: Tensor, target: Tensor): 'ddp_sync_strategy,expected_grads', [ pytest.param( - 'single_auto_sync', ([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), id='single_auto_sync' + 'single_auto_sync', + ([-1, None, None], 
[-1.5, -1.5, None], [-1.5, -1.5, None]), + id='single_auto_sync', ), pytest.param( 'multi_auto_sync', From 9a47973e24aa8a629e8af030c6a3f0fcf612c0dd Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 17:09:22 -0400 Subject: [PATCH 30/30] fix test name --- .github/workflows/pr-gpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 625c06072a..e74689e597 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -39,7 +39,7 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.342 + - name: gpu-3.11-2.4-2 container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest
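
A closing note on the warning-filter churn in PATCHES 04 through 07: the message field of a warnings filter (whether in pyproject.toml's `filterwarnings` or a `warnings.filterwarnings` call) is a regular expression matched against the start of the warning text. PATCH 04's `ignore::...` form is invalid because the empty second field pushes the message text into the category slot, and PATCH 05's literal `custom_fwd(args...)` never matches because parentheses and dots are regex metacharacters. The sketch below is illustrative only; the message string approximates, rather than quotes, the PyTorch 2.4 deprecation text.

import re
import warnings

# Approximation of the warning PyTorch 2.4 emits from torch.cuda.amp.
msg = '`torch.cuda.amp.custom_fwd(args...)` is deprecated.'

# PATCH 05's pattern: '(' opens a regex group and '.' matches any character,
# so the literal parentheses in the message are never matched.
assert re.compile(r'`torch.cuda.amp.custom_fwd(args...)` is deprecated.*').match(msg) is None

# PATCH 07's loosened pattern sidesteps escaping altogether.
assert re.compile(r'.*torch.cuda.amp.custom.*').match(msg) is not None

# Escaping the literal text would be an equivalent alternative (not what the
# series does).
assert re.compile(re.escape('`torch.cuda.amp.custom_fwd(args...)`')).match(msg) is not None

# The same start-anchored regex semantics apply to runtime filters, e.g. the
# doctest fixture filter added in PATCH 19/30:
warnings.filterwarnings(action='ignore', message=r'.*torch.cuda.amp.custom.*', category=FutureWarning)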