Skip to content

Commit

Permalink
Merge pull request #15919 from Flamefire/20220728155542_new_pr_PyTorch190
Browse files Browse the repository at this point in the history

add and fix patches for PyTorch 1.9.0 on POWER
  • Loading branch information
boegel authored Sep 11, 2022
2 parents 14d05b4 + 0e318ad commit fe2f33d
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 7 deletions.
16 changes: 14 additions & 2 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-foss-2020b.eb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ patches = [
'PyTorch-1.8.1_fix-faulty-asserts-and-skip-test.patch',
'PyTorch-1.8.1_increase-distributed-test-timeout.patch',
'PyTorch-1.9.0_avoid-failures-in-test_unary_ufuncs.patch',
'PyTorch-1.9.0_fix-kineto-crash.patch',
'PyTorch-1.9.0_fix-vsx-vector-functions.patch',
'PyTorch-1.9.0_skip-lstm-serialization-test.patch',
]
Expand All @@ -46,7 +47,8 @@ checksums = [
'7a6e512274f0b8673f4f207a5bc53387d88be7e79833f42d20365668b2118071',
# PyTorch-1.9.0_avoid-failures-in-test_unary_ufuncs.patch
'f600e6831f8a03af007845687d1e0f65b2394ca89a9dab5178e2cdc9bd384d43',
'56a46c1690467a7fe7f6b904d152f8a3e2385305c5c29717f66b98b38022bf74', # PyTorch-1.9.0_fix-vsx-vector-functions.patch
'1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0', # PyTorch-1.9.0_fix-kineto-crash.patch
'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516', # PyTorch-1.9.0_fix-vsx-vector-functions.patch
# PyTorch-1.9.0_skip-lstm-serialization-test.patch
'0fc14e29bd7530bcc09f4212df3c846072b1313216da86b827e102b85d695f49',
]
Expand Down Expand Up @@ -79,9 +81,19 @@ excluded_tests = {
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least: they always time out, no matter how long the timeout is.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
# Tests from this suite time out often. The process group backend is deprecated anyway.
'distributed/rpc/test_process_group_agent',
]
],
'POWER': [
# Works when run alone, fails when run as part of the suite. So far only observed on PPC
'distributed/rpc/test_tensorpipe_agent',
],
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ patches = [
'PyTorch-1.9.0_fix-min-amount-of-devices-for-test.patch',
'PyTorch-1.9.0_fix-testnn-on-A100.patch',
'PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch',
'PyTorch-1.9.0_fix-kineto-crash.patch',
'PyTorch-1.9.0_fix-vsx-vector-functions.patch',
'PyTorch-1.9.0_increase-test-cuda-tolerance.patch',
'PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch',
Expand Down Expand Up @@ -64,7 +65,8 @@ checksums = [
'8e8b417782e2f3004462c32338e12685e7296d15207f3e3087dcb8015e648f98', # PyTorch-1.9.0_fix-testnn-on-A100.patch
# PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch
'67960bf9140baf004b07e29f7c2b338e7bc4e4e4f2c931768be44f58526e605f',
'56a46c1690467a7fe7f6b904d152f8a3e2385305c5c29717f66b98b38022bf74', # PyTorch-1.9.0_fix-vsx-vector-functions.patch
'1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0', # PyTorch-1.9.0_fix-kineto-crash.patch
'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516', # PyTorch-1.9.0_fix-vsx-vector-functions.patch
# PyTorch-1.9.0_increase-test-cuda-tolerance.patch
'73de855ab1ed38043c7fb2a983927786b83d7547aefed926f19e554e2214838a',
# PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch
Expand Down Expand Up @@ -121,7 +123,11 @@ excluded_tests = {
'test_optim',
# Tests from this suite time out often. The process group backend is deprecated anyway.
'distributed/rpc/test_process_group_agent',
]
],
'POWER': [
# Works when run alone, fails when run as part of the suite. So far only observed on PPC
'distributed/rpc/test_tensorpipe_agent',
],
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'
Expand Down
16 changes: 14 additions & 2 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-fosscuda-2020b.eb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ patches = [
'PyTorch-1.9.0_fix-min-amount-of-devices-for-test.patch',
'PyTorch-1.9.0_fix-testnn-on-A100.patch',
'PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch',
'PyTorch-1.9.0_fix-kineto-crash.patch',
'PyTorch-1.9.0_fix-vsx-vector-functions.patch',
'PyTorch-1.9.0_increase-test-cuda-tolerance.patch',
'PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch',
Expand Down Expand Up @@ -63,7 +64,8 @@ checksums = [
'8e8b417782e2f3004462c32338e12685e7296d15207f3e3087dcb8015e648f98', # PyTorch-1.9.0_fix-testnn-on-A100.patch
# PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch
'67960bf9140baf004b07e29f7c2b338e7bc4e4e4f2c931768be44f58526e605f',
'56a46c1690467a7fe7f6b904d152f8a3e2385305c5c29717f66b98b38022bf74', # PyTorch-1.9.0_fix-vsx-vector-functions.patch
'1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0', # PyTorch-1.9.0_fix-kineto-crash.patch
'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516', # PyTorch-1.9.0_fix-vsx-vector-functions.patch
# PyTorch-1.9.0_increase-test-cuda-tolerance.patch
'73de855ab1ed38043c7fb2a983927786b83d7547aefed926f19e554e2214838a',
# PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch
Expand Down Expand Up @@ -111,9 +113,19 @@ excluded_tests = {
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least: they always time out, no matter how long the timeout is.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
# Tests from this suite time out often. The process group backend is deprecated anyway.
'distributed/rpc/test_process_group_agent',
]
],
'POWER': [
# Works when run alone, fails when run as part of the suite. So far only observed on PPC
'distributed/rpc/test_tensorpipe_agent',
],
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Fix a crash during application shutdown visible in test_profiler on some machines.
See https://github.com/pytorch/kineto/pull/642

Author: Alexander Grund (TU Dresden)

diff -aur a/third_party/kineto/libkineto/src/EventProfilerController.cpp b/third_party/kineto/libkineto/src/EventProfilerController.cpp
--- a/third_party/kineto/libkineto/src/EventProfilerController.cpp 2022-08-05 13:10:46.175716618 +0200
+++ b/third_party/kineto/libkineto/src/EventProfilerController.cpp 2022-08-05 13:16:00.654118490 +0200
@@ -231,9 +231,14 @@

// Must be called under lock
void EventProfilerController::start(CUcontext ctx) {
- profilerMap()[ctx] = unique_ptr<EventProfilerController>(
+ // Avoid static initialization order fiasco:
+ // We need the profilerMap and with it all controllers to be destroyed
+ // before everything the controller accesses gets destroyed.
+ // Hence access the profilerMap after initialization of the controller.
+ auto controller = unique_ptr<EventProfilerController>(
new EventProfilerController(
ctx, ConfigLoader::instance(), detail::HeartbeatMonitor::instance()));
+ profilerMap()[ctx] = std::move(controller);
}

// Must be called under lock
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ index 2a1a87aa72..5bcf818232 100644
- auto out1 = blendv(out, v_nan, ((exp.floor() != exp) & (x < zero)));
- // y = 0 then 1
- return blendv(out1, one, (exp_abs == zero));
+ return {Sleef_powf4_u10vsx(_vec0, b._vec0), Sleef_powf4_u10vsx(_vec1, b._vec1)};
+ return {Sleef_powf4_u10vsx(_vec0, exp._vec0), Sleef_powf4_u10vsx(_vec1, exp._vec1)};
}

Vec256<float> fmod(const Vec256<float>& b) const {
Expand Down

0 comments on commit fe2f33d

Please sign in to comment.