Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{devel}[foss/2022a] PyTorch v1.12.0 w/ Python 3.10.4 + CUDA 11.7.0 #15924

Merged
Merged
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
name = 'PyTorch'
version = '1.12.0'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2022a'}

sources = [{
'filename': '%(name)s-%(version)s.tar.gz',
'git_config': {
'url': 'https://github.com/pytorch',
'repo_name': 'pytorch',
'tag': 'v%(version)s',
'recursive': True,
},
}]
patches = [
'%(name)s-1.7.0_avoid-nan-in-test-torch.patch',
'%(name)s-1.7.0_disable-dev-shm-test.patch',
'%(name)s-1.8.1_dont-use-gpu-ccc-in-test.patch',
'%(name)s-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
'%(name)s-1.10.0_fix-test-dataloader-fixed-affinity.patch',
'%(name)s-1.10.0_skip_cmake_rpath.patch',
'%(name)s-1.11.0_increase-distributed-test-timeout.patch',
'%(name)s-1.11.0_increase_c10d_gloo_timeout.patch',
'%(name)s-1.11.0_disable_failing_jit_cuda_fuser_tests.patch',
]
checksums = [
None, # PyTorch-1.12.0.tar.gz
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
'89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
# PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
# PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
'313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch
# PyTorch-1.11.0_increase-distributed-test-timeout.patch
'087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
# PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
'20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
# PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51',
]

builddependencies = [
('CMake', '3.23.1'),
('hypothesis', '6.46.7'),
]
dependencies = [
('CUDA', '11.7.0', '', True),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.10.4'),
('protobuf', '3.19.4'),
('protobuf-python', '3.19.4'),
('pybind11', '2.9.2'),
('SciPy-bundle', '2022.05'),
('PyYAML', '6.0'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14'),
('FFmpeg', '4.4.2'),
('Pillow', '9.1.1'),
('cuDNN', '8.4.1.50', '-CUDA-%(cudaver)s', True),
('magma', '2.6.2', '-CUDA-%(cudaver)s'),
('NCCL', '2.12.12', '-CUDA-%(cudaver)s'),
('expecttest', '0.1.3'),
]

osdependencies = [('libibverbs-dev', 'libibverbs-devel', 'rdma-core-devel')]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = [
'3.5',
'3.7',
'5.2',
'6.0',
'6.1',
'7.0',
'7.2',
'7.5',
'8.0',
'8.6',
]

custom_opts = ['USE_CUPTI_SO=1']

excluded_tests = {
'': [
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least, they time out forever no matter how long the timeout is.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
# 'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
# Test from this suite timeout often. The process group backend is deprecated anyway
# 'distributed/rpc/test_process_group_agent',
# This test fails constently when run as part of the test suite, but succeeds when run interactively
'test_model_dump',
# These tests appear flaky, possibly related to number of GPUs that are used
'distributed/fsdp/test_fsdp_memory',
'distributed/fsdp/test_fsdp_overlap',
]
}

runtest = "cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s"

# several tests are known to be flaky, and fail in some contexts (like having multiple GPUs available),
# so we allow up to 400 (out of ~90k) tests to fail before treating the installation to be faulty
max_failed_tests = 400

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/68912 is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]

tests = ['%(name)s-check-cpp-extension.py']

moduleclass = 'ai'