Add patches for PyTorch 1.7.1 avoiding failures on POWER and A100 #12753

Merged
16 changes: 15 additions & 1 deletion easybuild/easyconfigs/p/PyTorch/PyTorch-1.7.1-foss-2020b.eb
@@ -239,6 +239,9 @@ patches = [
'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
'PyTorch-1.7.0_increase-distributed-test-timeout.patch',
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.7.1_fix-alias-violation-in-bitwise-ops.patch',
'PyTorch-1.7.1_fix-err-variable.patch',
'PyTorch-1.7.1_run-large-tests-on-GPU.patch',
]
checksums = [
'fc8d6aaf0bdedd4221617be8d47ac39af57605bdcc814fabc28739427b55e9c7', # v1.7.1.tar.gz
@@ -288,9 +291,20 @@ checksums = [
# PyTorch-1.7.0_increase-distributed-test-timeout.patch
'95abb468a35451fbd0f864ca843f6ad15ff8bfb909c3fd580f65859b26c9691c',
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
# PyTorch-1.7.1_fix-alias-violation-in-bitwise-ops.patch
'e92f054f1297df83ace901e7af38222787b709ee29580f5f2b89a300ca03666b',
'abb79e7ffd10be87adfb62e79131c50079c32470031ac22b12b273cfae85ca4c', # PyTorch-1.7.1_fix-err-variable.patch
# PyTorch-1.7.1_run-large-tests-on-GPU.patch
'06651b6746a27bee1adf15af24e356e188d683241bb186343009dc69c8d5aa9b',
]

excluded_tests = {
'POWER': [
# https://github.com/pytorch/pytorch/issues/57533
'test_nn',
# Fails for unknown reasons when run within EB but not when run manually
'test_utils',
],
'': [
# Tests from this suite often time out. The process group backend is deprecated anyway
'distributed/rpc/test_process_group_agent',
@@ -300,7 +314,7 @@ excluded_tests = {
]
}

-runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --verbose %(excluded_tests)s'
+runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

sanity_check_commands = ["python -c 'import caffe2.python'"]
tests = ['PyTorch-check-cpp-extension.py']
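For context, `excluded_tests` maps a CPU-architecture key (plus the generic `''` key) to tests that should be skipped, and the `%(excluded_tests)s` template in `runtest` is presumably filled in from it by the PyTorch easyblock. A minimal Python sketch of that expansion is below; the joining logic and the `--exclude` option are illustrative assumptions, not the actual EasyBuild easyblock code:

```python
import platform

# Same structure as in the easyconfig: arch-specific exclusions plus generic ones.
excluded_tests = {
    'POWER': ['test_nn', 'test_utils'],
    '': ['distributed/rpc/test_process_group_agent'],
}

# Pick the generic list, and add the architecture-specific list when it applies.
arch = 'POWER' if platform.machine().startswith('ppc64') else ''
selected = excluded_tests.get('', []) + (excluded_tests.get(arch, []) if arch else [])

# Assumption: the easyblock turns the selected tests into run_test.py's
# -x/--exclude option and substitutes it for %(excluded_tests)s.
exclude_opt = '--exclude ' + ' '.join(selected) if selected else ''
print('cd test && PYTHONUNBUFFERED=1 python run_test.py '
      '--continue-through-error --verbose ' + exclude_opt)
```

On a POWER node this would skip `test_nn` and `test_utils` in addition to the generic exclusions, which is what the new `'POWER'` entry above is for.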
@@ -252,8 +252,12 @@ patches = [
'PyTorch-1.7.1_validate-num-gpus-in-distributed-test.patch',
'PyTorch-1.7.1_complex32.patch',
'PyTorch-1.7.1_bypass-nan-compare.patch',
'PyTorch-1.7.1_fix-alias-violation-in-bitwise-ops.patch',
'PyTorch-1.7.1_fix-err-variable.patch',
'PyTorch-1.7.1_fix-use-after-destruct-in-cudaipctypes.patch',
'PyTorch-1.7.1_el8_ppc64le.patch',
'PyTorch-1.7.1_run-large-tests-on-GPU.patch',
'PyTorch-1.7.1_disable-failing-cuda-11.2-tests.patch',
]
checksums = [
'fc8d6aaf0bdedd4221617be8d47ac39af57605bdcc814fabc28739427b55e9c7', # v1.7.1.tar.gz
@@ -312,9 +316,16 @@ checksums = [
'd27f7b5149632512b6fe226837df914aad35c88f8b490856dc6dd90ea1e5d7e6',
'6028bff2be720cf70acad2129db60fd10872e02c9e460c72bb274228cf90b320', # PyTorch-1.7.1_complex32.patch
'0943496231b6857801e2424e561d03897a6982d098cba5b6967017b391a7e977', # PyTorch-1.7.1_bypass-nan-compare.patch
# PyTorch-1.7.1_fix-alias-violation-in-bitwise-ops.patch
'e92f054f1297df83ace901e7af38222787b709ee29580f5f2b89a300ca03666b',
'abb79e7ffd10be87adfb62e79131c50079c32470031ac22b12b273cfae85ca4c', # PyTorch-1.7.1_fix-err-variable.patch
# PyTorch-1.7.1_fix-use-after-destruct-in-cudaipctypes.patch
'250345aad08fb72deaaee9b249d9661d4ce93d08661b32d7856ed57e4aa8142e',
'2a94a9cc009f02469b843fc65a6ee2cb01873f783568b8bcc83c33ba8e6b1a58', # PyTorch-1.7.1_el8_ppc64le.patch
# PyTorch-1.7.1_run-large-tests-on-GPU.patch
'06651b6746a27bee1adf15af24e356e188d683241bb186343009dc69c8d5aa9b',
# PyTorch-1.7.1_disable-failing-cuda-11.2-tests.patch
'2a9df4face04798f51eee0db83d28a905ea7ac53569cf25ed9f049b0a547702e',
]

excluded_tests = {
@@ -327,7 +338,7 @@ excluded_tests = {
]
}

-runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --verbose %(excluded_tests)s'
+runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

sanity_check_commands = ["python -c 'import caffe2.python'"]
tests = ['PyTorch-check-cpp-extension.py']
19 changes: 18 additions & 1 deletion easybuild/easyconfigs/p/PyTorch/PyTorch-1.7.1-fosscuda-2020b.eb
@@ -251,7 +251,13 @@ patches = [
'PyTorch-1.7.1_validate-num-gpus-in-distributed-test.patch',
'PyTorch-1.7.1_complex32.patch',
'PyTorch-1.7.1_bypass-nan-compare.patch',
'PyTorch-1.7.1_fix-alias-violation-in-bitwise-ops.patch',
'PyTorch-1.7.1_fix-err-variable.patch',
'PyTorch-1.7.1_fix-use-after-destruct-in-cudaipctypes.patch',
'PyTorch-1.7.1_disable-tf32-in-distributed-tests.patch',
'PyTorch-1.7.1_relax_precision_in_test_nn.patch',
'PyTorch-1.7.1_run-large-tests-on-GPU.patch',
'PyTorch-1.7.1_disable-failing-cuda-11.2-tests.patch',
]
checksums = [
'fc8d6aaf0bdedd4221617be8d47ac39af57605bdcc814fabc28739427b55e9c7', # v1.7.1.tar.gz
@@ -310,8 +316,19 @@ checksums = [
'd27f7b5149632512b6fe226837df914aad35c88f8b490856dc6dd90ea1e5d7e6',
'6028bff2be720cf70acad2129db60fd10872e02c9e460c72bb274228cf90b320', # PyTorch-1.7.1_complex32.patch
'0943496231b6857801e2424e561d03897a6982d098cba5b6967017b391a7e977', # PyTorch-1.7.1_bypass-nan-compare.patch
# PyTorch-1.7.1_fix-alias-violation-in-bitwise-ops.patch
'e92f054f1297df83ace901e7af38222787b709ee29580f5f2b89a300ca03666b',
'abb79e7ffd10be87adfb62e79131c50079c32470031ac22b12b273cfae85ca4c', # PyTorch-1.7.1_fix-err-variable.patch
# PyTorch-1.7.1_fix-use-after-destruct-in-cudaipctypes.patch
'250345aad08fb72deaaee9b249d9661d4ce93d08661b32d7856ed57e4aa8142e',
# PyTorch-1.7.1_disable-tf32-in-distributed-tests.patch
'18ecad081a8c940add64040ad9698d3273366acf738a8a44eab1c793d3f49950',
# PyTorch-1.7.1_relax_precision_in_test_nn.patch
'4089bd3d2ee1939108ed9ebe9cd98da306df626ea6e59fac52fa877a5ea8a163',
# PyTorch-1.7.1_run-large-tests-on-GPU.patch
'06651b6746a27bee1adf15af24e356e188d683241bb186343009dc69c8d5aa9b',
# PyTorch-1.7.1_disable-failing-cuda-11.2-tests.patch
'2a9df4face04798f51eee0db83d28a905ea7ac53569cf25ed9f049b0a547702e',
]

excluded_tests = {
@@ -324,7 +341,7 @@ excluded_tests = {
]
}

-runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --verbose %(excluded_tests)s'
+runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

sanity_check_commands = ["python -c 'import caffe2.python'"]
tests = ['PyTorch-check-cpp-extension.py']
91 changes: 91 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.7.1_disable-failing-cuda-11.2-tests.patch
@@ -0,0 +1,91 @@
From a1b8f3d4b6ada3721a3d9d8266f86dcb04f17fcb Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@fb.com>
Date: Wed, 10 Feb 2021 11:41:47 -0800
Subject: [PATCH] Replace CUDA 11.1 Linux CI with CUDA 11.2 (#51905)

Summary:
Adding 11.2 to CI with BUILD_SPLIT_CUDA enabled.

Disabled the following tests as they were failing in test_optim.py:
test_adadelta
test_adam
test_adamw
test_multi_tensor_optimizers
test_rmsprop

(Issue tracking that is here: https://github.com/pytorch/pytorch/issues/51992)

Note by Alexander Grund (TU Dresden): The issue seems to be the CUDA driver version, not necessarily the compiler
diff --git a/test/test_optim.py b/test/test_optim.py
index 00d3f7a2bd131..17c445b6fee8f 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -306,6 +306,7 @@ def test_sgd_sparse(self):
)

@skipIfRocm
+ @unittest.skipIf(True, "test does not pass for CUDA 11.2")
def test_multi_tensor_optimizers(self):
if not torch.cuda.is_available():
return
@@ -340,15 +341,15 @@ def test_multi_tensor_optimizers(self):
for index in range(len(optimizer_pairs)):
res = []
for opt in optimizer_pairs[index]:
- weight = torch.tensor([[-0.2109, -0.4976], [-0.1413, -0.3420], [-0.2524, 0.6976]],
+ weight = torch.tensor([[-0.2109, -0.4976], [-0.1413, -0.3420], [-0.2524, 0.6976]],
dtype=torch.float64, device=device, requires_grad=True)
bias = torch.tensor([-0.1085, -0.2979, 0.6892], dtype=torch.float64, device=device, requires_grad=True)
- weight2 = torch.tensor([[-0.0508, -0.3941, -0.2843]],
+ weight2 = torch.tensor([[-0.0508, -0.3941, -0.2843]],
dtype=torch.float64, device=device, requires_grad=True)
bias2 = torch.tensor([-0.0711], dtype=torch.float64, device=device, requires_grad=True)
input = torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype=torch.float64, device=device).reshape(3, 2)

- model = torch.nn.Sequential(torch.nn.Linear(2, 3),
+ model = torch.nn.Sequential(torch.nn.Linear(2, 3),
torch.nn.Sigmoid(),
torch.nn.Linear(3, 1),
torch.nn.Sigmoid())
@@ -363,7 +364,7 @@ def test_multi_tensor_optimizers(self):

optimizer = opt(model.parameters(), **params)

- for _ in range(kIterations):
+ for _ in range(kIterations):
optimizer.zero_grad()
output = model(input)
loss = output.sum()
@@ -379,7 +380,7 @@ def test_multi_tensor_optimizers(self):
for p1, p2 in zip(res[0], res[1]):
self.assertEqual(p1, p2)

-
+ @unittest.skipIf(True, "test does not pass for CUDA 11.2")
def test_adam(self):
for optimizer in [optim.Adam, optim_mt.Adam]:
self._test_basic_cases(
@@ -425,6 +426,7 @@ def test_adam(self):
with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"):
optimizer(None, lr=1e-2, weight_decay=-1)

+ @unittest.skipIf(True, "test does not pass for CUDA 11.2")
def test_adamw(self):
for optimizer in [optim.AdamW, optim_mt.AdamW]:
self._test_basic_cases(
@@ -459,6 +461,7 @@ def test_sparse_adam(self):

# ROCm precision is too low to pass this test
@skipIfRocm
+ @unittest.skipIf(True, "test does not pass for CUDA 11.2")
def test_adadelta(self):
for optimizer in [optim.Adadelta, optim_mt.Adadelta]:
self._test_basic_cases(
@@ -535,6 +538,7 @@ def test_adamax(self):
with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"):
optimizer(None, lr=1e-2, betas=(0.0, 1.0))

+ @unittest.skipIf(True, "test does not pass for CUDA 11.2")
def test_rmsprop(self):
for optimizer in [optim.RMSprop, optim_mt.RMSprop]:
self._test_basic_cases(
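The patch above skips the affected optimizer tests unconditionally via `unittest.skipIf(True, ...)`. A hedged sketch of a narrower guard, skipping only when the build targets CUDA 11.2, is shown below; the decorator name is hypothetical, and since (as noted above) the real trigger appears to be the driver version rather than the toolkit, which is harder to query reliably, the unconditional skip is the pragmatic choice for a build-time patch:

```python
import unittest
import torch

def skip_on_cuda_11_2(test_func):
    # Illustrative decorator: torch.version.cuda is e.g. '11.2' for CUDA builds
    # and None for CPU-only builds.
    cuda = torch.version.cuda
    return unittest.skipIf(cuda is not None and cuda.startswith('11.2'),
                           'test does not pass for CUDA 11.2')(test_func)

class ExampleOptimTest(unittest.TestCase):
    @skip_on_cuda_11_2
    def test_adam_like(self):
        self.assertTrue(True)  # placeholder body
```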
64 changes: 64 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.7.1_disable-tf32-in-distributed-tests.patch
@@ -0,0 +1,64 @@
Fix test failures in DDP (distributed) tests on Ampere GPUs by disabling TF32 mode

From https://github.com/pytorch/pytorch/pull/52941
See https://github.com/pytorch/pytorch/issues/52278

From 6c0052a1a6c02f99c500d29680775957c723cf2b Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Fri, 26 Feb 2021 12:40:07 -0800
Subject: [PATCH] Disable TF32 on DDP tests

---
test/distributed/test_distributed_fork.py | 3 +++
test/distributed/test_distributed_spawn.py | 4 ++++
2 files changed, 7 insertions(+)

diff --git a/test/distributed/test_distributed_fork.py b/test/distributed/test_distributed_fork.py
index 84d23e71af951..5b7034f55eb94 100644
--- a/test/distributed/test_distributed_fork.py
+++ b/test/distributed/test_distributed_fork.py
@@ -11,6 +11,8 @@
DistributedTest, TestDistBackend
)

+torch.backends.cuda.matmul.allow_tf32 = False
+
CPP_EXTENSIONS_WARNING = """
Ninja (https://ninja-build.org) must be available to run C++ extensions tests,
but it could not be found. Install ninja with `pip install ninja`
@@ -48,6 +50,7 @@ class TestDistBackendWithFork(TestDistBackend, DistributedTest._DistTestBase):
def setUp(self):
super().setUp()
self._fork_processes()
+ torch.backends.cudnn.flags(allow_tf32=False).__enter__()


elif BACKEND == "mpi":
diff --git a/test/distributed/test_distributed_spawn.py b/test/distributed/test_distributed_spawn.py
index 48a21db72a666..4692b0b1c0562 100644
--- a/test/distributed/test_distributed_spawn.py
+++ b/test/distributed/test_distributed_spawn.py
@@ -3,12 +3,15 @@
import sys
import unittest

+import torch
import torch.distributed as dist
from torch.testing._internal.common_utils import run_tests, TEST_WITH_ASAN, NO_MULTIPROCESSING_SPAWN
from torch.testing._internal.distributed.distributed_test import (
DistributedTest, TestDistBackend
)

+torch.backends.cuda.matmul.allow_tf32 = False
+
if not dist.is_available():
print("Distributed not available, skipping tests", file=sys.stderr)
sys.exit(0)
@@ -28,6 +31,7 @@ class TestDistBackendWithSpawn(TestDistBackend, DistributedTest._DistTestBase):
def setUp(self):
super().setUp()
self._spawn_processes()
+ torch.backends.cudnn.flags(allow_tf32=False).__enter__()


if __name__ == "__main__":
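Outside the test suite, the same workaround amounts to turning TF32 off for both cuBLAS matmuls and cuDNN convolutions. A minimal sketch using the plain PyTorch >= 1.7 flags (not part of the patch itself):

```python
import torch

# Force full float32 precision on Ampere GPUs: TF32 keeps only 10 explicit
# mantissa bits, which can push DDP gradient comparisons past their tolerance.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
```

The patch achieves the same effect inside the distributed tests by setting the matmul flag at import time and entering `torch.backends.cudnn.flags(allow_tf32=False)` in `setUp()`.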
29 changes: 29 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.7.1_fix-alias-violation-in-bitwise-ops.patch
@@ -0,0 +1,29 @@
A reinterpret_cast to an unrelated type is undefined behavior.
This causes real issues due to misoptimizations on at least GCC 10.2 on POWER
See https://github.com/pytorch/pytorch/issues/58031

Author: Alexander Grund (TU Dresden)

diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h
index 697996ab8e..1663ae239a 100644
--- a/aten/src/ATen/cpu/vec256/vec256_base.h
+++ b/aten/src/ATen/cpu/vec256/vec256_base.h
@@ -701,12 +701,14 @@ inline Vec256<T> operator^(const Vec256<T>& a, const Vec256<T>& b) {

template<class T, typename Op>
static inline Vec256<T> bitwise_binary_op(const Vec256<T> &a, const Vec256<T> &b, Op op) {
- static constexpr uint32_t element_no = 32 / sizeof(intmax_t);
+ constexpr uint32_t element_no = 32 / sizeof(intmax_t);
+ __at_align32__ intmax_t buffer_a[element_no];
+ __at_align32__ intmax_t buffer_b[element_no];
__at_align32__ intmax_t buffer[element_no];
- const intmax_t *a_ptr = reinterpret_cast<const intmax_t*>((const T*) a);
- const intmax_t *b_ptr = reinterpret_cast<const intmax_t*>((const T*) b);
+ a.store(buffer_a);
+ b.store(buffer_b);
for (uint32_t i = 0U; i < element_no; ++ i) {
- buffer[i] = op(a_ptr[i], b_ptr[i]);
+ buffer[i] = op(buffer_a[i], buffer_b[i]);
}
return Vec256<T>::loadu(buffer);
}
16 changes: 16 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.7.1_fix-err-variable.patch
@@ -0,0 +1,16 @@
Fix a wrongly named variable in the tests

Author: Alexander Grund (TU Dresden)
diff --git a/test/run_test.py b/test/run_test.py
index 4309e65478..a0ffa51aad 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -738,7 +738,7 @@ def main():

if options.continue_through_error and has_failed:
for err in failure_messages:
- print_to_stderr(message)
+ print_to_stderr(err)
sys.exit(1)

if __name__ == '__main__':
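This bug only fires when `run_test.py` is invoked with `--continue-through-error`, which the updated `runtest` lines above now pass, so without the patch a test failure would end in a `NameError` instead of the collected failure summary. A self-contained sketch of the failing code path (simplified, not the actual run_test.py):

```python
import sys

def print_to_stderr(msg):
    print(msg, file=sys.stderr)

failure_messages = ['test_foo failed!', 'test_bar failed!']
continue_through_error, has_failed = True, True

if continue_through_error and has_failed:
    for err in failure_messages:
        # The unpatched code printed `message`, an undefined name, so the loop
        # raised NameError before any failure message reached the log.
        print_to_stderr(err)
    sys.exit(1)
```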
50 changes: 50 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.7.1_relax_precision_in_test_nn.patch
@@ -0,0 +1,50 @@
On A100 GPUs the precision seems to be a lot lower, likely due to increased parallelism
See https://github.com/pytorch/pytorch/issues/52278

Author: Alexander Grund (TU Dresden)

diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py
index a379fa10b8..8755da5c8c 100644
--- a/torch/testing/_internal/common_nn.py
+++ b/torch/testing/_internal/common_nn.py
@@ -3542,6 +3542,7 @@ new_module_tests = [
.dropout(0.0)''',
input_size=(2, 3, 4),
desc='relu_activation',
+ precision=2e-2,
),
dict(
module_name='TransformerEncoderLayer',
@@ -3553,6 +3554,7 @@ new_module_tests = [
input_size=(2, 3, 4),
check_gradgrad=False,
desc='gelu_activation',
+ precision=2e-2,
),
dict(
module_name='TransformerDecoderLayer',
@@ -3563,6 +3565,7 @@ new_module_tests = [
input_fn=lambda: (torch.rand(3, 3, 4), torch.rand(2, 3, 4)),
check_gradgrad=False,
desc='relu_activation',
+ precision=2e-2,
),
dict(
module_name='TransformerDecoderLayer',
@@ -3574,6 +3577,7 @@ new_module_tests = [
input_fn=lambda: (torch.rand(3, 3, 4), torch.rand(2, 3, 4)),
check_gradgrad=False,
desc='gelu_activation',
+ precision=2e-2,
),
dict(
module_name='Transformer',
@@ -3588,7 +3592,8 @@ new_module_tests = [
.activation(torch::kReLU)''',
input_fn=lambda:(torch.rand(3, 3, 4), torch.rand(2, 3, 4), torch.rand(3, 3)),
check_gradgrad=False,
- desc='multilayer_coder'
+ desc='multilayer_coder',
+ precision=2e-2,
)
]
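A rough way to see what the relaxed `precision=2e-2` tolerance corresponds to is to compare one of the affected modules on GPU against a CPU reference; how `common_nn.py` actually consumes the `precision` field is not shown in this hunk, so treat the following purely as an illustration with assumed layer dimensions:

```python
import torch

# A small TransformerEncoderLayer similar to the 'relu_activation' test case
# (dropout disabled so CPU and GPU runs are deterministic).
layer = torch.nn.TransformerEncoderLayer(d_model=4, nhead=2, dim_feedforward=16, dropout=0.0)
layer.eval()
x = torch.rand(2, 3, 4)

ref = layer(x)  # CPU float32 reference
if torch.cuda.is_available():
    out = layer.cuda()(x.cuda()).cpu()
    # On A100 (TF32, different reduction order) the difference can exceed the
    # old default tolerance but should stay within the relaxed 2e-2.
    assert torch.allclose(ref, out, atol=2e-2)
```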