diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-foss-2020b.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-foss-2020b.eb index 34f2218ba3f..68f81ba8ce2 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-foss-2020b.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-foss-2020b.eb @@ -26,6 +26,7 @@ patches = [ 'PyTorch-1.8.1_fix-faulty-asserts-and-skip-test.patch', 'PyTorch-1.8.1_increase-distributed-test-timeout.patch', 'PyTorch-1.9.0_avoid-failures-in-test_unary_ufuncs.patch', + 'PyTorch-1.9.0_fix-kineto-crash.patch', 'PyTorch-1.9.0_fix-vsx-vector-functions.patch', 'PyTorch-1.9.0_skip-lstm-serialization-test.patch', ] @@ -46,7 +47,8 @@ checksums = [ '7a6e512274f0b8673f4f207a5bc53387d88be7e79833f42d20365668b2118071', # PyTorch-1.9.0_avoid-failures-in-test_unary_ufuncs.patch 'f600e6831f8a03af007845687d1e0f65b2394ca89a9dab5178e2cdc9bd384d43', - '56a46c1690467a7fe7f6b904d152f8a3e2385305c5c29717f66b98b38022bf74', # PyTorch-1.9.0_fix-vsx-vector-functions.patch + '1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0', # PyTorch-1.9.0_fix-kineto-crash.patch + 'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516', # PyTorch-1.9.0_fix-vsx-vector-functions.patch # PyTorch-1.9.0_skip-lstm-serialization-test.patch '0fc14e29bd7530bcc09f4212df3c846072b1313216da86b827e102b85d695f49', ] @@ -79,9 +81,19 @@ excluded_tests = { # Bad tests: https://github.com/pytorch/pytorch/issues/60260 'distributed/elastic/utils/distributed_test', 'distributed/elastic/multiprocessing/api_test', + # These tests fail on A10s at the very least, they time out forever no matter how long the timeout is. + # Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html + 'distributed/test_distributed_fork', + 'distributed/test_distributed_spawn', + # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079 + 'test_optim', # Test from this suite timeout often. 
The process group backend is deprecated anyway 'distributed/rpc/test_process_group_agent', - ] + ], + 'POWER': [ + # Works when run alone, fails when run as part of the suite. So far only observed on PPC + 'distributed/rpc/test_tensorpipe_agent', + ], } runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b-imkl.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b-imkl.eb index a05a60e0fd0..b4662e2722d 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b-imkl.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b-imkl.eb @@ -32,6 +32,7 @@ patches = [ 'PyTorch-1.9.0_fix-min-amount-of-devices-for-test.patch', 'PyTorch-1.9.0_fix-testnn-on-A100.patch', 'PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch', + 'PyTorch-1.9.0_fix-kineto-crash.patch', 'PyTorch-1.9.0_fix-vsx-vector-functions.patch', 'PyTorch-1.9.0_increase-test-cuda-tolerance.patch', 'PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch', @@ -64,7 +65,8 @@ checksums = [ '8e8b417782e2f3004462c32338e12685e7296d15207f3e3087dcb8015e648f98', # PyTorch-1.9.0_fix-testnn-on-A100.patch # PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch '67960bf9140baf004b07e29f7c2b338e7bc4e4e4f2c931768be44f58526e605f', - '56a46c1690467a7fe7f6b904d152f8a3e2385305c5c29717f66b98b38022bf74', # PyTorch-1.9.0_fix-vsx-vector-functions.patch + '1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0', # PyTorch-1.9.0_fix-kineto-crash.patch + 'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516', # PyTorch-1.9.0_fix-vsx-vector-functions.patch # PyTorch-1.9.0_increase-test-cuda-tolerance.patch '73de855ab1ed38043c7fb2a983927786b83d7547aefed926f19e554e2214838a', # PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch @@ -121,7 +123,11 @@ excluded_tests = { 'test_optim', # Test from this suite timeout often. 
The process group backend is deprecated anyway 'distributed/rpc/test_process_group_agent', - ] + ], + 'POWER': [ + # Works when run alone, fails when run as part of the suite. So far only observed on PPC + 'distributed/rpc/test_tensorpipe_agent', + ], } runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b.eb index 0fc9595a1b1..e7070b8d189 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b.eb @@ -31,6 +31,7 @@ patches = [ 'PyTorch-1.9.0_fix-min-amount-of-devices-for-test.patch', 'PyTorch-1.9.0_fix-testnn-on-A100.patch', 'PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch', + 'PyTorch-1.9.0_fix-kineto-crash.patch', 'PyTorch-1.9.0_fix-vsx-vector-functions.patch', 'PyTorch-1.9.0_increase-test-cuda-tolerance.patch', 'PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch', @@ -63,7 +64,8 @@ checksums = [ '8e8b417782e2f3004462c32338e12685e7296d15207f3e3087dcb8015e648f98', # PyTorch-1.9.0_fix-testnn-on-A100.patch # PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch '67960bf9140baf004b07e29f7c2b338e7bc4e4e4f2c931768be44f58526e605f', - '56a46c1690467a7fe7f6b904d152f8a3e2385305c5c29717f66b98b38022bf74', # PyTorch-1.9.0_fix-vsx-vector-functions.patch + '1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0', # PyTorch-1.9.0_fix-kineto-crash.patch + 'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516', # PyTorch-1.9.0_fix-vsx-vector-functions.patch # PyTorch-1.9.0_increase-test-cuda-tolerance.patch '73de855ab1ed38043c7fb2a983927786b83d7547aefed926f19e554e2214838a', # PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch @@ -111,9 +113,19 @@ excluded_tests = { # Bad tests: https://github.com/pytorch/pytorch/issues/60260 
'distributed/elastic/utils/distributed_test', 'distributed/elastic/multiprocessing/api_test', + # These tests fail on A10s at the very least, they time out forever no matter how long the timeout is. + # Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html + 'distributed/test_distributed_fork', + 'distributed/test_distributed_spawn', + # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079 + 'test_optim', # Test from this suite timeout often. The process group backend is deprecated anyway 'distributed/rpc/test_process_group_agent', - ] + ], + 'POWER': [ + # Works when run alone, fails when run as part of the suite. So far only observed on PPC + 'distributed/rpc/test_tensorpipe_agent', + ], } runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-kineto-crash.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-kineto-crash.patch new file mode 100644 index 00000000000..928922a5ed5 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-kineto-crash.patch @@ -0,0 +1,24 @@ +Fix a crash during application shutdown visible in test_profiler on some machines. 
+See https://github.com/pytorch/kineto/pull/642 + +Author: Alexander Grund (TU Dresden) + +diff -aur a/third_party/kineto/libkineto/src/EventProfilerController.cpp b/third_party/kineto/libkineto/src/EventProfilerController.cpp +--- a/third_party/kineto/libkineto/src/EventProfilerController.cpp 2022-08-05 13:10:46.175716618 +0200 ++++ b/third_party/kineto/libkineto/src/EventProfilerController.cpp 2022-08-05 13:16:00.654118490 +0200 +@@ -231,9 +231,14 @@ + + // Must be called under lock + void EventProfilerController::start(CUcontext ctx) { +- profilerMap()[ctx] = unique_ptr<EventProfilerController>( ++ // Avoid static initialization order fiasco: ++ // We need the profilerMap and with it all controllers to be destroyed ++ // before everything the controller accesses gets destroyed. ++ // Hence access the profilerMap after initialization of the controller. ++ auto controller = unique_ptr<EventProfilerController>( + new EventProfilerController( + ctx, ConfigLoader::instance(), detail::HeartbeatMonitor::instance())); ++ profilerMap()[ctx] = std::move(controller); + } + + // Must be called under lock diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-vsx-vector-functions.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-vsx-vector-functions.patch index a3c3e0b0b85..d4ba6c36d16 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-vsx-vector-functions.patch +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0_fix-vsx-vector-functions.patch @@ -185,7 +185,7 @@ index 2a1a87aa72..5bcf818232 100644 - auto out1 = blendv(out, v_nan, ((exp.floor() != exp) & (x < zero))); - // y = 0 then 1 - return blendv(out1, one, (exp_abs == zero)); -+ return {Sleef_powf4_u10vsx(_vec0, b._vec0), Sleef_powf4_u10vsx(_vec1, b._vec1)}; ++ return {Sleef_powf4_u10vsx(_vec0, exp._vec0), Sleef_powf4_u10vsx(_vec1, exp._vec1)}; } Vec256 fmod(const Vec256& b) const {