add patches to fix or skip PyTorch 1.12.1 tests #16793

Merged
@@ -24,6 +24,8 @@ patches = [
     'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch',
     'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
     'PyTorch-1.11.0_install-vsx-vec-headers.patch',
+    'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
+    'PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch',
     'PyTorch-1.12.1_fix-cuda-gcc-version-check.patch',
     'PyTorch-1.12.1_fix-skip-decorators.patch',
     'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
@@ -41,7 +43,7 @@ patches = [
     'PyTorch-1.12.1_remove-flaky-test-in-testnn.patch',
     'PyTorch-1.12.1_skip-ao-sparsity-test-without-fbgemm.patch',
     'PyTorch-1.12.1_skip-failing-grad-test.patch',
-    'PyTorch-1.12.1_skip-test_round_robin_create_destroy.patch',
+    'PyTorch-1.12.1_skip-test_round_robin.patch',
 ]
 checksums = [
     '031c71073db73da732b5d01710220564ce6dd88d812ba053f0cc94296401eccb', # pytorch-v1.12.1.tar.gz
@@ -64,6 +66,10 @@ checksums = [
     # PyTorch-1.11.0_increase-distributed-test-timeout.patch
     '087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
     'f2e6b9625733d9a471bb75e1ea20e28814cf1380b4f9089aa838ee35ddecf07d', # PyTorch-1.11.0_install-vsx-vec-headers.patch
+    # PyTorch-1.11.1_skip-test_init_from_local_shards.patch
+    '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7',
+    # PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch
+    'd97cd6b0570a167ecc3e631dc4ea884d95ace285cc38aa980566f4fec2c0d089',
     # PyTorch-1.12.1_fix-cuda-gcc-version-check.patch
     'a650f4576f06c749f244cada52ff9c02499fa8f182019129488db3845e0756ab',
     'e3ca6e42b2fa592ea095939fb59ab875668a058479407db3f3684cc5c6f4146c', # PyTorch-1.12.1_fix-skip-decorators.patch
@@ -91,8 +97,8 @@ checksums = [
     # PyTorch-1.12.1_skip-ao-sparsity-test-without-fbgemm.patch
     'edd464ec8c37b44c07a72008d732604f6837f2dd61c7810c391a86ba4945ca39',
     '1c89e7e67287fe6b9a95480a4178d3653b94d0ab2fe68edf227606c8ae548fdc', # PyTorch-1.12.1_skip-failing-grad-test.patch
-    # PyTorch-1.12.1_skip-test_round_robin_create_destroy.patch
-    '1435fcac3234edc865479199673b902eb67f6a2bd046af7d731141f03594666d',
+    # PyTorch-1.12.1_skip-test_round_robin.patch
+    '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349',
 ]

 osdependencies = [OS_PKG_IBVERBS_DEV]
@@ -131,6 +137,9 @@ excluded_tests = {
         'distributed/test_distributed_spawn',
         # Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
         'distributions/test_constraints',
+        # These 2 tests abort on some machines; skip them for now
+        'distributed/fsdp/test_fsdp_input',
+        'distributed/fsdp/test_fsdp_mixed_precision',
     ]
 }

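Every entry in patches above is paired with a SHA-256 entry in checksums. For reference, a minimal sketch of generating the digest for one of the new patch files (this helper snippet is not part of the PR; it assumes the patch file sits in the current directory):

import hashlib

# Print the SHA-256 digest in the form used by the checksums list above.
with open('PyTorch-1.12.1_skip-test_round_robin.patch', 'rb') as f:
    print(hashlib.sha256(f.read()).hexdigest())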
New file: PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch
@@ -0,0 +1,24 @@
Fix flaky test_thread_shutdown in test_autograd

From https://github.com/pytorch/pytorch/pull/86464

Backport: Alexander Grund (TU Dresden)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index da1e859682e..0c0bc4f1a2a 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -4320,8 +4320,12 @@ class MyFunction(Function):
     def backward(ctx, grad):
         return grad
 
+# Run on cuda if it is available to ensure that the worker thread
+# is properly initialized by the time we exit.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
 for shape in [(1,), ()]:
-    v = torch.ones(shape, requires_grad=True)
+    v = torch.ones(shape, requires_grad=True, device=device)
     MyFunction.apply(v).backward()
 """
         s = TestCase.runWithPytorchAPIUsageStderr(code)
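For context, the pattern the backport introduces can be exercised standalone; this is a minimal sketch assembled from the patched test body (assumes a working PyTorch install; it falls back to CPU when no GPU is present):

import torch
from torch.autograd import Function

class MyFunction(Function):
    @staticmethod
    def forward(ctx, x):
        return x

    @staticmethod
    def backward(ctx, grad):
        return grad

# Prefer CUDA when available so the autograd worker thread is actually
# started before the interpreter shuts down, which is what the flaky
# test_thread_shutdown checks.
device = "cuda" if torch.cuda.is_available() else "cpu"

for shape in [(1,), ()]:
    v = torch.ones(shape, requires_grad=True, device=device)
    MyFunction.apply(v).backward()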
New file: PyTorch-1.12.1_skip-test_round_robin.patch
@@ -0,0 +1,35 @@
test_round_robin and test_round_robin_create_destroy of distributed/test_c10d_gloo may run into timeouts.
So simply skip them on all operating systems (not only on Windows); the existing skip marker suggests that this is OK.

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index e49d65ea33d..b4fb75a1b11 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -10,6 +10,7 @@ import sys
 import tempfile
 from functools import reduce
 from itertools import groupby
+from unittest import skip
 
 import torch
 import torch.distributed as c10d
@@ -1415,7 +1416,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase):
         for i, tensor in enumerate(tensors):
             self.assertEqual(torch.full(size, float(i * self.world_size)), tensor)
 
-    @skip_if_win32()
+    @skip("Occasionally times out")
     @requires_gloo()
     def test_round_robin(self):
         num_process_groups = 2
@@ -1438,7 +1439,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase):
             pg.broadcast(tensor, root=0).wait()
         self.assertEqual(torch.full([100, 100], 0.0), tensor)
 
-    @skip_if_win32()
+    @skip("Occasionally times out")
     @requires_gloo()
     def test_round_robin_create_destroy(self):
         store = c10d.FileStore(self.file_name, self.world_size)
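
The patch swaps the platform-conditional skip_if_win32() decorator for an unconditional unittest.skip, so both tests are reported as skipped on every platform instead of only on Windows. A minimal sketch of that behavior (the class and test body here are illustrative, not from the PyTorch suite):

import unittest
from unittest import skip

class ExampleTest(unittest.TestCase):
    # With @skip, the runner never executes the body and reports the
    # given reason, regardless of the platform it runs on.
    @skip("Occasionally times out")
    def test_round_robin(self):
        self.fail("never runs")

if __name__ == "__main__":
    unittest.main()

Running this with python -m unittest reports the test as skipped ("OK (skipped=1)") rather than failing or hanging, which is the effect the patch wants for the two flaky gloo tests.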