Merge pull request #15919 from Flamefire/20220728155542_new_pr_PyTorch190

add and fix patches for PyTorch 1.9.0 on POWER
easybuilders · Sep 11, 2022 · fe2f33d · fe2f33d
2 parents 14d05b4 + 0e318ad
commit fe2f33d
Show file tree

Hide file tree

Showing 5 changed files with 61 additions and 7 deletions.
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-foss-2020b.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-foss-2020b.eb
@@ -26,6 +26,7 @@ patches = [
     'PyTorch-1.8.1_fix-faulty-asserts-and-skip-test.patch',
     'PyTorch-1.8.1_increase-distributed-test-timeout.patch',
     'PyTorch-1.9.0_avoid-failures-in-test_unary_ufuncs.patch',
+    'PyTorch-1.9.0_fix-kineto-crash.patch',
     'PyTorch-1.9.0_fix-vsx-vector-functions.patch',
     'PyTorch-1.9.0_skip-lstm-serialization-test.patch',
 ]
@@ -46,7 +47,8 @@ checksums = [
     '7a6e512274f0b8673f4f207a5bc53387d88be7e79833f42d20365668b2118071',
     # PyTorch-1.9.0_avoid-failures-in-test_unary_ufuncs.patch
     'f600e6831f8a03af007845687d1e0f65b2394ca89a9dab5178e2cdc9bd384d43',
-    '56a46c1690467a7fe7f6b904d152f8a3e2385305c5c29717f66b98b38022bf74',  # PyTorch-1.9.0_fix-vsx-vector-functions.patch
+    '1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0',  # PyTorch-1.9.0_fix-kineto-crash.patch
+    'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516',  # PyTorch-1.9.0_fix-vsx-vector-functions.patch
     # PyTorch-1.9.0_skip-lstm-serialization-test.patch
     '0fc14e29bd7530bcc09f4212df3c846072b1313216da86b827e102b85d695f49',
 ]
@@ -79,9 +81,19 @@ excluded_tests = {
         # Bad tests: https://github.com/pytorch/pytorch/issues/60260
         'distributed/elastic/utils/distributed_test',
         'distributed/elastic/multiprocessing/api_test',
+        # These tests fail on A10s at the very least; they time out forever no matter how long the timeout is.
+        # Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
+        'distributed/test_distributed_fork',
+        'distributed/test_distributed_spawn',
+        # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
+        'test_optim',
         # Test from this suite timeout often. The process group backend is deprecated anyway
         'distributed/rpc/test_process_group_agent',
-    ]
+    ],
+    'POWER': [
+        # Works when run alone, fails when run as part of the suite. So far only observed on PPC
+        'distributed/rpc/test_tensorpipe_agent',
+    ],
 }
 
 runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error  --verbose %(excluded_tests)s'

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b-imkl.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b-imkl.eb
@@ -32,6 +32,7 @@ patches = [
     'PyTorch-1.9.0_fix-min-amount-of-devices-for-test.patch',
     'PyTorch-1.9.0_fix-testnn-on-A100.patch',
     'PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch',
+    'PyTorch-1.9.0_fix-kineto-crash.patch',
     'PyTorch-1.9.0_fix-vsx-vector-functions.patch',
     'PyTorch-1.9.0_increase-test-cuda-tolerance.patch',
     'PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch',
@@ -64,7 +65,8 @@ checksums = [
     '8e8b417782e2f3004462c32338e12685e7296d15207f3e3087dcb8015e648f98',  # PyTorch-1.9.0_fix-testnn-on-A100.patch
     # PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch
     '67960bf9140baf004b07e29f7c2b338e7bc4e4e4f2c931768be44f58526e605f',
-    '56a46c1690467a7fe7f6b904d152f8a3e2385305c5c29717f66b98b38022bf74',  # PyTorch-1.9.0_fix-vsx-vector-functions.patch
+    '1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0',  # PyTorch-1.9.0_fix-kineto-crash.patch
+    'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516',  # PyTorch-1.9.0_fix-vsx-vector-functions.patch
     # PyTorch-1.9.0_increase-test-cuda-tolerance.patch
     '73de855ab1ed38043c7fb2a983927786b83d7547aefed926f19e554e2214838a',
     # PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch
@@ -121,7 +123,11 @@ excluded_tests = {
         'test_optim',
         # Test from this suite timeout often. The process group backend is deprecated anyway
         'distributed/rpc/test_process_group_agent',
-    ]
+    ],
+    'POWER': [
+        # Works when run alone, fails when run as part of the suite. So far only observed on PPC
+        'distributed/rpc/test_tensorpipe_agent',
+    ],
 }
 
 runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error  --verbose %(excluded_tests)s'

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b.eb
@@ -31,6 +31,7 @@ patches = [
     'PyTorch-1.9.0_fix-min-amount-of-devices-for-test.patch',
     'PyTorch-1.9.0_fix-testnn-on-A100.patch',
     'PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch',
+    'PyTorch-1.9.0_fix-kineto-crash.patch',
     'PyTorch-1.9.0_fix-vsx-vector-functions.patch',
     'PyTorch-1.9.0_increase-test-cuda-tolerance.patch',
     'PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch',
@@ -63,7 +64,8 @@ checksums = [
     '8e8b417782e2f3004462c32338e12685e7296d15207f3e3087dcb8015e648f98',  # PyTorch-1.9.0_fix-testnn-on-A100.patch
     # PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch
     '67960bf9140baf004b07e29f7c2b338e7bc4e4e4f2c931768be44f58526e605f',
-    '56a46c1690467a7fe7f6b904d152f8a3e2385305c5c29717f66b98b38022bf74',  # PyTorch-1.9.0_fix-vsx-vector-functions.patch
+    '1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0',  # PyTorch-1.9.0_fix-kineto-crash.patch
+    'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516',  # PyTorch-1.9.0_fix-vsx-vector-functions.patch
     # PyTorch-1.9.0_increase-test-cuda-tolerance.patch
     '73de855ab1ed38043c7fb2a983927786b83d7547aefed926f19e554e2214838a',
     # PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch
@@ -111,9 +113,19 @@ excluded_tests = {
         # Bad tests: https://github.com/pytorch/pytorch/issues/60260
         'distributed/elastic/utils/distributed_test',
         'distributed/elastic/multiprocessing/api_test',
+        # These tests fail on A10s at the very least; they time out forever no matter how long the timeout is.
+        # Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
+        'distributed/test_distributed_fork',
+        'distributed/test_distributed_spawn',
+        # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
+        'test_optim',
         # Test from this suite timeout often. The process group backend is deprecated anyway
         'distributed/rpc/test_process_group_agent',
-    ]
+    ],
+    'POWER': [
+        # Works when run alone, fails when run as part of the suite. So far only observed on PPC
+        'distributed/rpc/test_tensorpipe_agent',
+    ],
 }
 
 runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error  --verbose %(excluded_tests)s'

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-kineto-crash.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-kineto-crash.patch
@@ -0,0 +1,24 @@
+Fix a crash during application shutdown visible in test_profiler on some machines.
+See https://github.com/pytorch/kineto/pull/642
+
+Author: Alexander Grund (TU Dresden)
+
+diff -aur a/third_party/kineto/libkineto/src/EventProfilerController.cpp b/third_party/kineto/libkineto/src/EventProfilerController.cpp
+--- a/third_party/kineto/libkineto/src/EventProfilerController.cpp	2022-08-05 13:10:46.175716618 +0200
++++ b/third_party/kineto/libkineto/src/EventProfilerController.cpp	2022-08-05 13:16:00.654118490 +0200
+@@ -231,9 +231,14 @@
+
+ // Must be called under lock
+ void EventProfilerController::start(CUcontext ctx) {
+-  profilerMap()[ctx] = unique_ptr<EventProfilerController>(
++  // Avoid static initialization order fiasco:
++  // We need the profilerMap and with it all controllers to be destroyed
++  // before everything the controller accesses gets destroyed.
++  // Hence access the profilerMap after initialization of the controller.
++  auto controller = unique_ptr<EventProfilerController>(
+       new EventProfilerController(
+           ctx, ConfigLoader::instance(), detail::HeartbeatMonitor::instance()));
++  profilerMap()[ctx] = std::move(controller);
+ }
+
+ // Must be called under lock
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-vsx-vector-functions.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-vsx-vector-functions.patch
@@ -185,7 +185,7 @@ index 2a1a87aa72..5bcf818232 100644
 -    auto out1 = blendv(out, v_nan, ((exp.floor() != exp) & (x < zero)));
 -    // y = 0 then 1
 -    return blendv(out1, one, (exp_abs == zero));
-+    return {Sleef_powf4_u10vsx(_vec0, b._vec0), Sleef_powf4_u10vsx(_vec1, b._vec1)};
++    return {Sleef_powf4_u10vsx(_vec0, exp._vec0), Sleef_powf4_u10vsx(_vec1, exp._vec1)};
    }
 
    Vec256<float> fmod(const Vec256<float>& b) const {