From 539dded443f1e178297858adbef4009d1101df21 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 19:23:12 -0400 Subject: [PATCH 01/30] add torch 24 tests --- .github/workflows/daily.yaml | 52 +++++++++++++++++++++++++++-------- .github/workflows/pr-cpu.yaml | 6 +++- .github/workflows/pr-gpu.yaml | 12 ++++---- 3 files changed, 51 insertions(+), 19 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 5552d6c19c..8ca5b6fc44 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -22,18 +22,23 @@ jobs: markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - - name: cpu-3.11-2.2-composer - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not doctest - pytest_command: coverage run -m pytest - composer_package_name: composer - name: cpu-3.11-2.3 container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml + - name: cpu-3.11-2.4 + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + markers: not daily and (remote or not remote) and not gpu and not doctest + pytest_command: coverage run -m pytest + composer_package_name: mosaicml + - name: cpu-3.11-2.4-composer + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + markers: not daily and (remote or not remote) and not gpu and not doctest + pytest_command: coverage run -m pytest + composer_package_name: composer - name: cpu-doctest - container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 + container: mosaicml/pytorch:2.4.0_cpu-python3.10-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml @@ -42,18 +47,23 @@ jobs: markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: mosaicml - - name: daily-cpu-3.11-2.2-composer - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 + - name: daily-cpu-3.11-2.3 + container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest - composer_package_name: composer - - name: daily-cpu-3.11-2.3-composer - container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + composer_package_name: mosaicml + - name: daily-cpu-3.11-2.4 + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + markers: daily and (remote or not remote) and not gpu and not doctest + pytest_command: coverage run -m pytest + composer_package_name: mosaicml + - name: daily-cpu-3.11-2.4-composer + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest pytest_command: coverage run -m pytest composer_package_name: composer - name: daily-cpu-doctest - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml @@ -104,6 +114,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 1 + - name: 
"gpu-3.11-2.4-1-gpu" + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 + markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + pytest_command: "coverage run -m pytest" + composer_package_name: "mosaicml" + gpu_num: 1 - name: "gpu-3.11-2.2-2-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -116,6 +132,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 2 + - name: "gpu-3.11-2.4-2-gpu" + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 + markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + pytest_command: "coverage run -m pytest" + composer_package_name: "mosaicml" + gpu_num: 2 - name: "gpu-3.11-2.2-4-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -128,6 +150,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 4 + - name: "gpu-3.11-2.4-4-gpu" + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 + markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + pytest_command: "coverage run -m pytest" + composer_package_name: "mosaicml" + gpu_num: 4 name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 4d44e69824..1483fc060e 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -21,8 +21,12 @@ jobs: container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest + - name: cpu-3.11-2.4 + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + markers: not daily and not remote and not gpu and not doctest + pytest_command: coverage run -m pytest - name: cpu-doctest - container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py name: ${{ matrix.name }} diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 2f335a5a68..625c06072a 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -13,8 +13,8 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.3-1 - container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - name: gpu-3.11-2.4-1 + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -39,8 +39,8 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.3-2 - container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - name: gpu-3.11-2.342 + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml @@ -66,8 +66,8 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.3-4 - container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - name: gpu-3.11-2.4-4 + container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 markers: not daily and 
not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml From 9611fe6dc241ad299f521e4cc1e586ef14b337cd Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 20:14:25 -0400 Subject: [PATCH 02/30] filter warnings --- composer/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 27323718fc..b36572db4c 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1725,7 +1725,7 @@ def __init__( # Suppressing GradScaler warnings as they are always created # self._use_grad_scaling() will raise a RuntimeError if grad scaling is not available when it is required - warnings.filterwarnings(action='ignore', message='torch.cuda.amp.GradScaler') + warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.GradScaler.*') self.state.scaler = ClosureGradScaler() if self._use_closures() else GradScaler() if self.state.fsdp_config is not None: From 4af03202f8c4a77ea3156a130020c8c0a1df86a4 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 20:32:34 -0400 Subject: [PATCH 03/30] fix --- .github/workflows/pr-cpu.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 1483fc060e..caa3e0fb05 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -25,10 +25,10 @@ jobs: container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - - name: cpu-doctest - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 - markers: not daily and not remote and not gpu and doctest - pytest_command: coverage run -m pytest tests/test_docs.py + # - name: cpu-doctest + # container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + # markers: not daily and not remote and not gpu and doctest + # pytest_command: coverage run -m pytest tests/test_docs.py name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: From ada84f0590cad1fc0c1fedb2ad235878c3531881 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 20:54:16 -0400 Subject: [PATCH 04/30] filter deprecation warning --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index f153616f0d..a763ec3bf5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -164,6 +164,10 @@ filterwarnings = [ '''ignore:The 'transformers' MLflow Models integration.*:UserWarning''', # Ignore our own deprecation warnings, '''ignore::composer.utils.warnings.VersionedDeprecationWarning''', + # Ignore deprecation warning for torch.load + '''ignore::.*You are using `torch.load` with `weights_only=False`.*:FutureWarning''', + # Ignore deprecation warning as DeepSpeed uses old path + '''ignore:.*`torch.cuda.amp.custom_fwd(args...)` is deprecated.*:FutureWarning''', ] # Coverage From 922651d534c41f7b850c7849eb4e98442ad96998 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 20:58:06 -0400 Subject: [PATCH 05/30] fix filter --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a763ec3bf5..7ddbc9f1a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -165,9 +165,9 @@ filterwarnings = [ # Ignore our own deprecation warnings, '''ignore::composer.utils.warnings.VersionedDeprecationWarning''', # Ignore deprecation warning for 
torch.load - '''ignore::.*You are using `torch.load` with `weights_only=False`.*:FutureWarning''', + '''ignore:You are using `torch.load` with `weights_only=False`.*:FutureWarning''', # Ignore deprecation warning as DeepSpeed uses old path - '''ignore:.*`torch.cuda.amp.custom_fwd(args...)` is deprecated.*:FutureWarning''', + '''ignore:`torch.cuda.amp.custom_fwd(args...)` is deprecated.*:FutureWarning''', ] # Coverage From 93c9a6e9a970030d0bd4c9eb2af61fdca905f93f Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 21:15:55 -0400 Subject: [PATCH 06/30] fix filter --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7ddbc9f1a5..ce281a7279 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,7 +167,7 @@ filterwarnings = [ # Ignore deprecation warning for torch.load '''ignore:You are using `torch.load` with `weights_only=False`.*:FutureWarning''', # Ignore deprecation warning as DeepSpeed uses old path - '''ignore:`torch.cuda.amp.custom_fwd(args...)` is deprecated.*:FutureWarning''', + '''ignore:.*torch.cuda.amp.custom_fwd.*:FutureWarning''', ] # Coverage From 23e77b3ea820f3b228fbdad9961bea2c0bfff312 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 12:07:19 -0400 Subject: [PATCH 07/30] fix regex --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ce281a7279..7b7a54dde7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,7 +167,7 @@ filterwarnings = [ # Ignore deprecation warning for torch.load '''ignore:You are using `torch.load` with `weights_only=False`.*:FutureWarning''', # Ignore deprecation warning as DeepSpeed uses old path - '''ignore:.*torch.cuda.amp.custom_fwd.*:FutureWarning''', + '''ignore:.*torch.cuda.amp.custom.*:FutureWarning''', ] # Coverage From 5eacf6a157e768c5494aa1f5529e4154416ca527 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 12:27:39 -0400 Subject: [PATCH 08/30] restore magic mock --- composer/core/state.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/composer/core/state.py b/composer/core/state.py index 7c43473ace..73244095ee 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -1445,8 +1445,7 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True): # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing # errors) before discarding the output. Accordingly, we mock the state dict. # See: https://github.com/pytorch/pytorch/issues/125177 - if version.parse(torch.__version__) < version.parse('2.4.0'): - optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict + optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict set_optimizer_state_dict( model=self.model, optimizers=optimizer, From 4317a6bbf7cf78bae9c43113e09d1796b0e64348 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 12:48:02 -0400 Subject: [PATCH 09/30] revert magicmock --- composer/core/state.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer/core/state.py b/composer/core/state.py index 73244095ee..7c43473ace 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -1445,7 +1445,8 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True): # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing # errors) before discarding the output. Accordingly, we mock the state dict. 
# See: https://github.com/pytorch/pytorch/issues/125177 - optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict + if version.parse(torch.__version__) < version.parse('2.4.0'): + optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict set_optimizer_state_dict( model=self.model, optimizers=optimizer, From ba968dff4c2c2e3b70c64bce2df937fa2cea9bf9 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 18:28:33 +0000 Subject: [PATCH 10/30] fix warnings --- composer/core/precision.py | 11 ++++++----- tests/trainer/test_checkpoint.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/composer/core/precision.py b/composer/core/precision.py index 2723159261..4e7091bd24 100644 --- a/composer/core/precision.py +++ b/composer/core/precision.py @@ -5,6 +5,7 @@ import contextlib import textwrap +from packaging import version from typing import Any, Generator, Optional, Union import torch @@ -26,9 +27,9 @@ class Precision(StringEnum): Attributes: FP32: Use 32-bit floating-point precision. Compatible with CPUs and GPUs. - AMP_FP16: Use :mod:`torch.cuda.amp` with 16-bit floating-point precision. Only compatible + AMP_FP16: Use :mod:`torch.amp` with 16-bit floating-point precision. Only compatible with GPUs. - AMP_BF16: Use :mod:`torch.cuda.amp` with 16-bit BFloat precision. + AMP_BF16: Use :mod:`torch.amp` with 16-bit BFloat precision. AMP_FP8: Use :mod:`transformer_engine.pytorch.fp8_autocast` with 8-bit FP8 precison. """ FP32 = 'fp32' @@ -60,7 +61,7 @@ def get_precision_context( precision = Precision(precision) if precision == Precision.FP32: if torch.cuda.is_available(): - with torch.cuda.amp.autocast(False): + with torch.autocast('cuda', enabled=False): yield else: # Yield here to avoid warnings about cuda not being available @@ -68,7 +69,7 @@ def get_precision_context( elif precision == Precision.AMP_FP16: # Retain compatibility with PyTorch < 1.10 if torch.cuda.is_available(): - with torch.cuda.amp.autocast(True): + with torch.autocast('cuda', enabled=True): yield elif is_xla_installed(): with torch.autocast('xla', dtype=torch.float16): @@ -77,7 +78,7 @@ def get_precision_context( yield elif precision == Precision.AMP_BF16: if torch.cuda.is_available(): - with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + with torch.autocast('cuda', dtype=torch.bfloat16, enabled=True): yield elif is_xla_installed(): with torch.autocast('xla', dtype=torch.bfloat16): diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index ede864d13b..892ea5434b 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -996,7 +996,7 @@ def test_strict_errors(self, missing_key: bool, unexpected_key: bool): last_checkpoint = os.path.join('first', 'ep2.pt') if missing_key or unexpected_key: message = r'Error\(s\) in loading state_dict' - if version.parse(torch.__version__) < version.parse('2.2.3') or not dist.is_initialized(): + if version.parse(torch.__version__) < version.parse('2.2.3') or (version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()): # Composer implements strict for older torch versions message = 'Failed to load checkpoint due to' error_context = pytest.raises(RuntimeError, match=message) From f072bc1252440835fa6058b5bdc391dd5d7d8342 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 14:29:36 -0400 Subject: [PATCH 11/30] fix amp warnings --- composer/core/precision.py | 1 - tests/trainer/test_checkpoint.py | 4 +++- 2 
files changed, 3 insertions(+), 2 deletions(-) diff --git a/composer/core/precision.py b/composer/core/precision.py index 4e7091bd24..c3aa06643b 100644 --- a/composer/core/precision.py +++ b/composer/core/precision.py @@ -5,7 +5,6 @@ import contextlib import textwrap -from packaging import version from typing import Any, Generator, Optional, Union import torch diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index 892ea5434b..70a38f44ba 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -996,7 +996,9 @@ def test_strict_errors(self, missing_key: bool, unexpected_key: bool): last_checkpoint = os.path.join('first', 'ep2.pt') if missing_key or unexpected_key: message = r'Error\(s\) in loading state_dict' - if version.parse(torch.__version__) < version.parse('2.2.3') or (version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()): + if version.parse(torch.__version__) < version.parse('2.2.3') or ( + version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized() + ): # Composer implements strict for older torch versions message = 'Failed to load checkpoint due to' error_context = pytest.raises(RuntimeError, match=message) From 7f0e81dffb1808ad1657d2103a6136a7e1ff4b72 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 18:40:59 +0000 Subject: [PATCH 12/30] fix test --- tests/algorithms/test_required_on_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py index 47ced249db..b5216955fb 100644 --- a/tests/algorithms/test_required_on_load.py +++ b/tests/algorithms/test_required_on_load.py @@ -174,7 +174,7 @@ def test_autoload( context = pytest.warns(UserWarning, match='Automatically adding required_on_load algorithm*') # Excluding some algorithms leads to errors when loading elif exclude: - if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): + if version.parse(torch.__version__) >= version.parse('2.4.0') or (version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()): if algo_name in [ 'Alibi', 'BlurPool', From fc8d7f46b280c043ff973ddd98e6468591718a29 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 14:44:11 -0400 Subject: [PATCH 13/30] lint --- tests/algorithms/test_required_on_load.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py index b5216955fb..ce62d6aefe 100644 --- a/tests/algorithms/test_required_on_load.py +++ b/tests/algorithms/test_required_on_load.py @@ -174,7 +174,9 @@ def test_autoload( context = pytest.warns(UserWarning, match='Automatically adding required_on_load algorithm*') # Excluding some algorithms leads to errors when loading elif exclude: - if version.parse(torch.__version__) >= version.parse('2.4.0') or (version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()): + if version.parse(torch.__version__) >= version.parse('2.4.0') or ( + version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() + ): if algo_name in [ 'Alibi', 'BlurPool', From 92ea9ce46dcc34c3fef6450c1ecddd5cdef5085a Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 18:50:43 +0000 Subject: [PATCH 14/30] fix errors --- tests/algorithms/test_required_on_load.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 
deletions(-) diff --git a/tests/algorithms/test_required_on_load.py b/tests/algorithms/test_required_on_load.py index ce62d6aefe..824be91646 100644 --- a/tests/algorithms/test_required_on_load.py +++ b/tests/algorithms/test_required_on_load.py @@ -174,9 +174,18 @@ def test_autoload( context = pytest.warns(UserWarning, match='Automatically adding required_on_load algorithm*') # Excluding some algorithms leads to errors when loading elif exclude: - if version.parse(torch.__version__) >= version.parse('2.4.0') or ( - version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() - ): + if version.parse(torch.__version__) >= version.parse('2.4.0'): + if algo_name in [ + 'BlurPool', + 'Factorize', + 'GatedLinearUnits', + 'GhostBatchNorm', + 'SqueezeExcite', + ]: + context = pytest.raises(KeyError) # Optimizer loading is strict + elif algo_name == 'Alibi': + context = pytest.raises(RuntimeError) # Alibi has shape issues + elif version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): if algo_name in [ 'Alibi', 'BlurPool', From 0ba2c47183689c892b3f8f449012fdace4d97293 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 20:02:03 +0000 Subject: [PATCH 15/30] fix last test --- tests/trainer/test_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index 70a38f44ba..eb4e60f7d9 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -1356,7 +1356,7 @@ def test_autoload_algorithm_old_checkpoint(self): NoOpModel.__init__ = lambda self, x: None # type: ignore NoOpModel.__repr__ = lambda self: 'NoOpModel(3)' error_context = pytest.raises(KeyError, match='module.0.weight') - if version.parse(torch.__version__) < version.parse('2.2.3') or not dist.is_initialized(): + if version.parse(torch.__version__) < version.parse('2.2.3') or (version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()): error_context = pytest.raises(ValueError, match='loaded state dict contains a parameter group.*') with pytest.warns(UserWarning, match='required_on_load algorithm.*'), error_context: trainer_3 = self.get_trainer(load_path=os.path.join('first', 'ep1.pt')) From fedfbeeb637f7808fecbb1142a577acf835897f4 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 16:02:54 -0400 Subject: [PATCH 16/30] lint --- tests/trainer/test_checkpoint.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index eb4e60f7d9..d91b1beea6 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -1356,7 +1356,9 @@ def test_autoload_algorithm_old_checkpoint(self): NoOpModel.__init__ = lambda self, x: None # type: ignore NoOpModel.__repr__ = lambda self: 'NoOpModel(3)' error_context = pytest.raises(KeyError, match='module.0.weight') - if version.parse(torch.__version__) < version.parse('2.2.3') or (version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized()): + if version.parse(torch.__version__) < version.parse('2.2.3') or ( + version.parse(torch.__version__) < version.parse('2.4.0') and not dist.is_initialized() + ): error_context = pytest.raises(ValueError, match='loaded state dict contains a parameter group.*') with pytest.warns(UserWarning, match='required_on_load algorithm.*'), error_context: trainer_3 = self.get_trainer(load_path=os.path.join('first', 'ep1.pt')) From 
d77f82589bd77489d3fb775da60d97a9bc139fc9 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 16:03:08 -0400 Subject: [PATCH 17/30] enable doctests --- .github/workflows/pr-cpu.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index caa3e0fb05..1483fc060e 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -25,10 +25,10 @@ jobs: container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - # - name: cpu-doctest - # container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 - # markers: not daily and not remote and not gpu and doctest - # pytest_command: coverage run -m pytest tests/test_docs.py + - name: cpu-doctest + container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + markers: not daily and not remote and not gpu and doctest + pytest_command: coverage run -m pytest tests/test_docs.py name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: From 78cfcb08c235cb8dacf21980ef8f6fb9bb8a0922 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 18:31:09 -0400 Subject: [PATCH 18/30] fix doctests --- docs/source/doctest_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index f54d1f69e1..a0583d422d 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -53,7 +53,7 @@ from composer.loggers import Logger as Logger from composer.loggers import RemoteUploaderDownloader from composer.models import ComposerModel as ComposerModel -from composer.optim.scheduler import ConstantScheduler +from composer.optim import ConstantScheduler, DecoupledSGDW from composer.utils import LibcloudObjectStore, RemoteUploader from composer.utils import ensure_tuple as ensure_tuple @@ -117,7 +117,7 @@ model = SimpleModel(num_channels, num_classes) -optimizer = torch.optim.SGD(model.parameters(), lr=0.001) +optimizer = DecoupledSGDW(model.parameters(), lr=0.001) scheduler = CosineAnnealingLR(optimizer, T_max=1) From ca2c1f91b665973a74388b7de94d5208659ccddf Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 18:43:05 -0400 Subject: [PATCH 19/30] fix doctests --- docs/source/doctest_fixtures.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index a0583d422d..90e9ca6ed0 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -19,6 +19,7 @@ from typing import Any from typing import Callable as Callable from urllib.parse import urlparse +import warnings import numpy as np import pytest @@ -57,6 +58,9 @@ from composer.utils import LibcloudObjectStore, RemoteUploader from composer.utils import ensure_tuple as ensure_tuple +# Ignore certain warnings for doctest +warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*') + try: import wandb _WANDB_INSTALLED = True From 7215c68928ed23f2e91c51fe21ab7285704d6fc9 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 23:08:08 +0000 Subject: [PATCH 20/30] switch to decoupled adamw --- docs/source/doctest_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 90e9ca6ed0..56d9073f77 100644 --- a/docs/source/doctest_fixtures.py +++ 
b/docs/source/doctest_fixtures.py @@ -192,7 +192,7 @@ def _new_trainer_init(self, fake_ellipses: None = None, **kwargs: Any): if 'model' not in kwargs: kwargs['model'] = model if 'optimizers' not in kwargs: - kwargs['optimizers'] = torch.optim.SGD(kwargs['model'].parameters(), lr=0.01) + kwargs['optimizers'] = DecoupledSGDW(kwargs['model'].parameters(), lr=0.01) if 'schedulers' not in kwargs: kwargs['schedulers'] = ConstantScheduler() if 'max_duration' not in kwargs: From fac29614691c86c7c2d87aa9958c9d9717511389 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 13 Aug 2024 19:29:51 -0400 Subject: [PATCH 21/30] lint --- docs/source/doctest_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 56d9073f77..2fccf71b1d 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -16,10 +16,10 @@ import os import sys import tempfile +import warnings from typing import Any from typing import Callable as Callable from urllib.parse import urlparse -import warnings import numpy as np import pytest From 6432889b34725ef7299f3ae2ea659265f3c93102 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 06:09:42 +0000 Subject: [PATCH 22/30] fix tests --- composer/trainer/trainer.py | 8 ++++++++ docs/source/trainer/checkpointing.rst | 24 ++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index b36572db4c..c09702a65f 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -2442,6 +2442,14 @@ def fit( self.first_batch_complete = False self._train_loop() + # Zero gradients at the end of fit so same model/optimizer can be used for further training + # with checkpoint loading. See https://github.com/pytorch/pytorch/issues/133415 + for optimizer in self.state.optimizers: + try: + optimizer.zero_grad(set_to_none=True) + except TypeError: + optimizer.zero_grad() + def close(self): """Shutdown the trainer. 
diff --git a/docs/source/trainer/checkpointing.rst b/docs/source/trainer/checkpointing.rst index 0cb2f39898..36f0b358a7 100644 --- a/docs/source/trainer/checkpointing.rst +++ b/docs/source/trainer/checkpointing.rst @@ -531,10 +531,10 @@ object stores like WandB or LibCloud, you must still specify a ``load_object_sto :skipif: not _LIBCLOUD_INSTALLED new_trainer = Trainer( - model=model, - train_dataloader=train_dataloader, - max_duration="10ep", - load_path="s3://checkpoint-debugging/checkpoints/ep1.pt", + model=model, + train_dataloader=train_dataloader, + max_duration="10ep", + load_path="s3://checkpoint-debugging/checkpoints/ep1.pt", ) new_trainer.fit() @@ -547,10 +547,10 @@ Similarly for OCI: :skipif: not _LIBCLOUD_INSTALLED new_trainer = Trainer( - model=model, - train_dataloader=train_dataloader, - max_duration="10ep", - load_path="oci://checkpoint-debugging/checkpoints/ep1.pt", + model=model, + train_dataloader=train_dataloader, + max_duration="10ep", + load_path="oci://checkpoint-debugging/checkpoints/ep1.pt", ) new_trainer.fit() @@ -564,10 +564,10 @@ Similarly for GCS: :skipif: not _LIBCLOUD_INSTALLED new_trainer = Trainer( - model=model, - train_dataloader=train_dataloader, - max_duration="10ep", - load_path="gs://checkpoint-debugging/checkpoints/ep1.pt", + model=model, + train_dataloader=train_dataloader, + max_duration="10ep", + load_path="gs://checkpoint-debugging/checkpoints/ep1.pt", ) new_trainer.fit() From 333c2345442b42227f4c9bda7c2d9d4f1ecd31a8 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 02:10:31 -0400 Subject: [PATCH 23/30] revert to sgd --- docs/source/doctest_fixtures.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 2fccf71b1d..e152e740f8 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -54,7 +54,7 @@ from composer.loggers import Logger as Logger from composer.loggers import RemoteUploaderDownloader from composer.models import ComposerModel as ComposerModel -from composer.optim import ConstantScheduler, DecoupledSGDW +from composer.optim import ConstantScheduler from composer.utils import LibcloudObjectStore, RemoteUploader from composer.utils import ensure_tuple as ensure_tuple @@ -121,7 +121,7 @@ model = SimpleModel(num_channels, num_classes) -optimizer = DecoupledSGDW(model.parameters(), lr=0.001) +optimizer = torch.optim.SGD(model.parameters(), lr=0.001) scheduler = CosineAnnealingLR(optimizer, T_max=1) @@ -192,7 +192,7 @@ def _new_trainer_init(self, fake_ellipses: None = None, **kwargs: Any): if 'model' not in kwargs: kwargs['model'] = model if 'optimizers' not in kwargs: - kwargs['optimizers'] = DecoupledSGDW(kwargs['model'].parameters(), lr=0.01) + kwargs['optimizers'] = torch.optim.SGD(kwargs['model'].parameters(), lr=0.01) if 'schedulers' not in kwargs: kwargs['schedulers'] = ConstantScheduler() if 'max_duration' not in kwargs: From 7332a2b5db165aeb8957bafc2cdd226eee44e92f Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 02:19:53 -0400 Subject: [PATCH 24/30] add more warnings filters --- docs/source/doctest_fixtures.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index e152e740f8..67c0cd7723 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -59,7 +59,10 @@ from composer.utils import ensure_tuple as ensure_tuple # Ignore certain warnings for doctest 
-warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*') +warnings.filterwarnings(action='ignore', message='.*Deterministic mode.*') # Expected +warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*') # DeepSpeed +warnings.filterwarnings(action='ignore', message='.*The distutils.sysconfig module*') # DeepSpeed + try: import wandb From 691167a49068a220eb581e636b6e4d6967299a2d Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 02:33:06 -0400 Subject: [PATCH 25/30] decoupled adamw --- docs/source/doctest_fixtures.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 67c0cd7723..60dea99ff9 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -54,16 +54,16 @@ from composer.loggers import Logger as Logger from composer.loggers import RemoteUploaderDownloader from composer.models import ComposerModel as ComposerModel -from composer.optim import ConstantScheduler +from composer.optim import ConstantScheduler, DecoupledSGDW from composer.utils import LibcloudObjectStore, RemoteUploader from composer.utils import ensure_tuple as ensure_tuple # Ignore certain warnings for doctest warnings.filterwarnings(action='ignore', message='.*Deterministic mode.*') # Expected +warnings.filterwarnings(action='ignore', message='.*Some weights of Bert*') # Expected warnings.filterwarnings(action='ignore', message='.*torch.cuda.amp.custom.*') # DeepSpeed warnings.filterwarnings(action='ignore', message='.*The distutils.sysconfig module*') # DeepSpeed - try: import wandb _WANDB_INSTALLED = True @@ -124,7 +124,7 @@ model = SimpleModel(num_channels, num_classes) -optimizer = torch.optim.SGD(model.parameters(), lr=0.001) +optimizer = DecoupledSGDW(model.parameters(), lr=0.001) scheduler = CosineAnnealingLR(optimizer, T_max=1) @@ -195,7 +195,7 @@ def _new_trainer_init(self, fake_ellipses: None = None, **kwargs: Any): if 'model' not in kwargs: kwargs['model'] = model if 'optimizers' not in kwargs: - kwargs['optimizers'] = torch.optim.SGD(kwargs['model'].parameters(), lr=0.01) + kwargs['optimizers'] = DecoupledSGDW(kwargs['model'].parameters(), lr=0.01) if 'schedulers' not in kwargs: kwargs['schedulers'] = ConstantScheduler() if 'max_duration' not in kwargs: From 2f05c221010a72f40564aecf7380b6966e334fda Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 11:29:55 -0400 Subject: [PATCH 26/30] update logging --- composer/trainer/trainer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index c09702a65f..695136f552 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -2446,9 +2446,12 @@ def fit( # with checkpoint loading. See https://github.com/pytorch/pytorch/issues/133415 for optimizer in self.state.optimizers: try: - optimizer.zero_grad(set_to_none=True) - except TypeError: - optimizer.zero_grad() + try: + optimizer.zero_grad(set_to_none=True) + except TypeError: + optimizer.zero_grad() + except: + log.exception('Failed to zero out optimizer at end of fit') def close(self): """Shutdown the trainer. 
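A note for readers following the series: the end-of-fit cleanup introduced in PATCH 22/30 and hardened in PATCH 26/30 reduces to the defensive pattern below. This is a minimal standalone sketch for illustration only — the `zero_grads_after_fit` helper name is hypothetical (inside Composer the loop lives at the end of `Trainer.fit`), and `except Exception` stands in for the patch's bare `except`.

import logging

log = logging.getLogger(__name__)

def zero_grads_after_fit(optimizers):
    # Zero gradients so the same model/optimizer pair can keep training after
    # a checkpoint load; see https://github.com/pytorch/pytorch/issues/133415.
    for optimizer in optimizers:
        try:
            try:
                # set_to_none=True drops the gradient tensors entirely rather
                # than filling them with zeros, which also frees memory.
                optimizer.zero_grad(set_to_none=True)
            except TypeError:
                # Fallback for custom optimizers whose zero_grad() predates
                # the set_to_none keyword.
                optimizer.zero_grad()
        except Exception:
            # Mirror the trainer's intent: cleanup must never take down a
            # completed fit() call.
            log.exception('Failed to zero out optimizer at end of fit')
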
From 4f5b592baac43be95be1d354a42a5f892d3fbdeb Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 17:38:33 +0000 Subject: [PATCH 27/30] fix test --- tests/trainer/test_ddp_sync_strategy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_ddp_sync_strategy.py b/tests/trainer/test_ddp_sync_strategy.py index 05efeb6960..838d3cbaa3 100644 --- a/tests/trainer/test_ddp_sync_strategy.py +++ b/tests/trainer/test_ddp_sync_strategy.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Optional +from packaging import version import pytest import torch @@ -45,7 +46,7 @@ def loss(self, output: Tensor, target: Tensor): @pytest.mark.parametrize( 'ddp_sync_strategy,expected_grads', [ - pytest.param('single_auto_sync', ([-1, None, None], [-1, -1.5, None], [-1, -1.5, None]), id='single_auto_sync'), + pytest.param('single_auto_sync', ([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), id='single_auto_sync'), pytest.param( 'multi_auto_sync', ([-1.5, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), @@ -61,8 +62,9 @@ def test_ddp_sync_strategy( rank_zero_seed: int, request: pytest.FixtureRequest, ): + if version.parse(torch.__version__) < version.parse('2.4.0'): + pytest.skip('Before PyTorch 2.4, single_auto_sync did not properly run on last microbatch') original_model = MinimalConditionalModel() - # ddp = DDP(backend="gloo", find_unused_parameters=True, sync_strategy=ddp_sync_strategy, timeout=5.) optimizer = torch.optim.SGD(original_model.parameters(), 0.1) device = None for item in request.session.items: From db381a7322f7139e0f2ffd64dc2f8bee0dd96bd6 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 13:56:26 -0400 Subject: [PATCH 28/30] lint --- tests/trainer/test_ddp_sync_strategy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_ddp_sync_strategy.py b/tests/trainer/test_ddp_sync_strategy.py index 838d3cbaa3..9a25d30f8b 100644 --- a/tests/trainer/test_ddp_sync_strategy.py +++ b/tests/trainer/test_ddp_sync_strategy.py @@ -2,11 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Optional -from packaging import version import pytest import torch import torch.nn as nn +from packaging import version from torch import Tensor from torch.utils.data import DataLoader @@ -46,7 +46,9 @@ def loss(self, output: Tensor, target: Tensor): @pytest.mark.parametrize( 'ddp_sync_strategy,expected_grads', [ - pytest.param('single_auto_sync', ([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), id='single_auto_sync'), + pytest.param( + 'single_auto_sync', ([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), id='single_auto_sync' + ), pytest.param( 'multi_auto_sync', ([-1.5, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), From 0df4cb3cd5d43d512bf007f46ad6169146541b96 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 14:50:56 -0400 Subject: [PATCH 29/30] lit --- tests/trainer/test_ddp_sync_strategy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/trainer/test_ddp_sync_strategy.py b/tests/trainer/test_ddp_sync_strategy.py index 9a25d30f8b..dfe711b92f 100644 --- a/tests/trainer/test_ddp_sync_strategy.py +++ b/tests/trainer/test_ddp_sync_strategy.py @@ -47,7 +47,9 @@ def loss(self, output: Tensor, target: Tensor): 'ddp_sync_strategy,expected_grads', [ pytest.param( - 'single_auto_sync', ([-1, None, None], [-1.5, -1.5, None], [-1.5, -1.5, None]), id='single_auto_sync' + 'single_auto_sync', + ([-1, None, None], 
[-1.5, -1.5, None], [-1.5, -1.5, None]), + id='single_auto_sync', ), pytest.param( 'multi_auto_sync', From 9a47973e24aa8a629e8af030c6a3f0fcf612c0dd Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 14 Aug 2024 17:09:22 -0400 Subject: [PATCH 30/30] fix test name --- .github/workflows/pr-gpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 625c06072a..e74689e597 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -39,7 +39,7 @@ jobs: strategy: matrix: include: - - name: gpu-3.11-2.342 + - name: gpu-3.11-2.4-2 container: mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest
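
A closing note on the warning-filter churn in PATCHES 04 through 07: the message field of a warnings filter (whether in pyproject.toml's `filterwarnings` or a `warnings.filterwarnings` call) is a regular expression matched against the start of the warning text. PATCH 04's `ignore::...` form is invalid because the empty second field pushes the message text into the category slot, and PATCH 05's literal `custom_fwd(args...)` never matches because parentheses and dots are regex metacharacters. The sketch below is illustrative only; the message string approximates, rather than quotes, the PyTorch 2.4 deprecation text.

import re
import warnings

# Approximation of the warning PyTorch 2.4 emits from torch.cuda.amp.
msg = '`torch.cuda.amp.custom_fwd(args...)` is deprecated.'

# PATCH 05's pattern: '(' opens a regex group and '.' matches any character,
# so the literal parentheses in the message are never matched.
assert re.compile(r'`torch.cuda.amp.custom_fwd(args...)` is deprecated.*').match(msg) is None

# PATCH 07's loosened pattern sidesteps escaping altogether.
assert re.compile(r'.*torch.cuda.amp.custom.*').match(msg) is not None

# Escaping the literal text would be an equivalent alternative (not what the
# series does).
assert re.compile(re.escape('`torch.cuda.amp.custom_fwd(args...)`')).match(msg) is not None

# The same start-anchored regex semantics apply to runtime filters, e.g. the
# doctest fixture filter added in PATCH 19/30:
warnings.filterwarnings(action='ignore', message=r'.*torch.cuda.amp.custom.*', category=FutureWarning)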