diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy index 17d4e847bb82..2ea78f2fcd1e 100644 --- a/ci/Jenkinsfile_utils.groovy +++ b/ci/Jenkinsfile_utils.groovy @@ -250,6 +250,7 @@ def assign_node_labels(args) { NODE_LINUX_CPU = args.linux_cpu NODE_LINUX_GPU = args.linux_gpu NODE_LINUX_GPU_G4 = args.linux_gpu_g4 + NODE_LINUX_GPU_G5 = args.linux_gpu_g5 NODE_LINUX_GPU_P3 = args.linux_gpu_p3 NODE_WINDOWS_CPU = args.windows_cpu NODE_WINDOWS_GPU = args.windows_gpu diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 05f80032cd15..d68ce9649dcf 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -22,8 +22,11 @@ set -ex -CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70" -CI_CMAKE_CUDA_ARCH="5.2 7.0" +# compute capabilities for CI instances supported by CUDA 10.x (i.e. p3, g4) +CI_CMAKE_CUDA10_ARCH="5.2 7.5" + +# compute capabilities for CI instances supported by CUDA >= 11.1 (i.e. p3, g4, g5) +CI_CMAKE_CUDA_ARCH="5.2 7.5 8.6" clean_repo() { set -ex @@ -298,7 +301,7 @@ build_centos7_gpu() { -DUSE_BLAS=Open \ -DUSE_ONEDNN=ON \ -DUSE_CUDA=ON \ - -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA10_ARCH" \ -DUSE_DIST_KVSTORE=ON \ -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \ -DUSE_INT64_TENSOR_SIZE=OFF \ diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 92d126612b50..aec2b6564f55 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -716,6 +716,22 @@ def test_unix_python3_gpu(lib_name) { }] } +def test_unix_python3_ampere_gpu(lib_name) { + return ['Python3: Ampere-GPU': { + node(NODE_LINUX_GPU_G5) { + ws('workspace/ut-python3-gpu') { + try { + utils.unpack_and_init(lib_name, mx_lib_cython) + python3_gpu_ut_cython('ubuntu_gpu_cu111') + utils.publish_test_coverage() + } finally { + utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_ampere_gpu.xml') + } + } + } + }] +} + def test_unix_python3_debug_cpu() { return ['Python3: CPU debug': { node(NODE_LINUX_CPU) { diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index 46d455f1db3e..69ce5a3a0d87 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -29,7 +29,7 @@ node('utility') { utils = load('ci/Jenkinsfile_utils.groovy') custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', linux_gpu_g4: 'mxnetlinux-gpu-g4') +utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', linux_gpu_g4: 'mxnetlinux-gpu-g4', linux_gpu_g5: 'mxnetlinux-gpu-g5') utils.main_wrapper( core_logic: { @@ -44,6 +44,7 @@ core_logic: { utils.parallel_stage('Tests', [ custom_steps.test_unix_python3_gpu('gpu'), + custom_steps.test_unix_python3_ampere_gpu('gpu'), custom_steps.test_unix_python3_onednn_gpu('onednn_gpu'), custom_steps.test_unix_python3_onednn_nocudnn_gpu('onednn_gpu_nocudnn'), custom_steps.test_unix_cpp_package_gpu('gpu'), diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index c80417347d77..dc9167aeff9a 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -112,15 +112,15 @@ def effective_dtype(dat): ---------- dat : np.ndarray or mx.nd.array or mx.np.ndarray """ - # On arch 80 gpus, a float32-io gemm or conv op will trim the mantissa of data - # inputs to be of comparable precision to a float16, so float16 becomes the + # On arch 80 gpus or later, a float32-io gemm or conv op will trim the mantissa of + # data inputs to be of comparable precision to a float16, so float16 becomes the # 'effective dtype' for tolerance tests involving such op outputs. # Is TF32 enabled in the device (the default on arch 80 GPUs) def is_TF32_enabled(device): try: return (device.device_type == 'gpu' and - get_cuda_compute_capability(device) == 80 and + get_cuda_compute_capability(device) >= 80 and os.environ.get('NVIDIA_TF32_OVERRIDE') != '0') except: # pylint: disable=bare-except return False diff --git a/src/operator/contrib/count_sketch.cu b/src/operator/contrib/count_sketch.cu index 24ca7970e064..bb16695caa74 100644 --- a/src/operator/contrib/count_sketch.cu +++ b/src/operator/contrib/count_sketch.cu @@ -93,6 +93,9 @@ __global__ void sketch_backward_kernel(const int nthreads, // only calculate gradient regarding x // can also calculate gradient regarding s if needed const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= nthreads) { + return; + } const int i_indim = index % in_dim; const int i_sample = index / in_dim; const int i_outdim = i_sample * out_dim + h[i_indim]; diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 6592cd490dac..387084164111 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -27,7 +27,7 @@ import mxnet.ndarray.sparse as mxsps from mxnet.test_utils import check_consistency, set_default_device, assert_almost_equal, assert_allclose from mxnet.test_utils import check_symbolic_forward, check_symbolic_backward, discard_stderr -from mxnet.test_utils import default_device, rand_shape_2d, rand_ndarray, same, environment, get_rtc_compile_opts +from mxnet.test_utils import default_device, rand_shape_2d, rand_ndarray, same, environment, get_rtc_compile_opts, get_cuda_compute_capability from mxnet.base import MXNetError from mxnet import autograd @@ -54,6 +54,13 @@ set_default_device(mx.gpu(0)) +# For info purposes, log GPU compute cababilities. Run serially so output appears in log. +@pytest.mark.serial +def test_report_compute_capabilities(capsys): + with capsys.disabled(): + sys.stdout.write('= {} '.format( + [get_cuda_compute_capability(mx.gpu(i)) for i in range(mx.device.num_gpus())] )) + def check_countsketch(in_dim,out_dim,n): data = mx.sym.Variable("data") h = mx.sym.Variable("h") diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 2911f9165244..ac38b73ac4f8 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -606,7 +606,8 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, device=m @mx.util.use_np -def run_rnn_layers(dtype, dtype2, device=mx.cpu()): +def run_rnn_layers(dtype, dtype2): + device = default_device() check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), device=device) check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), device=device) @@ -668,15 +669,18 @@ def run_rnn_layers(dtype, dtype2, device=mx.cpu()): out.backward() out = out.asnumpy() +@assert_raises_cudnn_not_satisfied(min_version='5.1.10') @pytest.mark.serial def test_rnn_layers_fp32(): run_rnn_layers('float32', 'float32') @assert_raises_cudnn_not_satisfied(min_version='5.1.10') -@pytest.mark.skipif(mx.device.num_gpus() == 0, reason="RNN FP16 only implemented for GPU for now") @pytest.mark.serial def test_rnn_layers_fp16(): - run_rnn_layers('float16', 'float32', mx.gpu()) + # Dynamic skip condition is best handled this way, rather than with pytest.mark.skipIf + if default_device().device_type == 'cpu': + pytest.skip('RNN FP16 only implemented for GPU for now') + run_rnn_layers('float16', 'float32') def check_rnn_consistency(fused_layer, stack_layer, loss, mode, num_layers, input_size, hidden_size, bidirectional=False, rtol=1e-2, atol=1e-4): @@ -844,14 +848,12 @@ def test_rnn_unroll_variant_length(): inputs=data_nd[i:(i+1), :ele_length, :], merge_outputs=True, layout='NTC') - assert_allclose(ele_out.asnumpy(), outs[i:(i+1), :ele_length, :].asnumpy(), - atol=1E-4, rtol=1E-4) + assert_almost_equal(ele_out, outs[i:(i+1), :ele_length, :]) if ele_length < max_length: # Check the padded outputs are all zero - assert_allclose(outs[i:(i+1), ele_length:max_length, :].asnumpy(), 0) + assert_almost_equal(outs[i:(i+1), ele_length:max_length, :], 0) for valid_out_state, gt_state in zip(states, ele_states): - assert_allclose(valid_out_state[i:(i+1)].asnumpy(), gt_state.asnumpy(), - atol=1E-4, rtol=1E-4) + assert_almost_equal(valid_out_state[i:(i+1)], gt_state) # Test for TNC layout data_nd = mx.np.random.normal(0, 1, size=(max_length, batch_size, 20)) @@ -864,14 +866,12 @@ def test_rnn_unroll_variant_length(): inputs=data_nd[:ele_length, i:(i+1), :], merge_outputs=True, layout='TNC') - assert_allclose(ele_out.asnumpy(), outs[:ele_length, i:(i + 1), :].asnumpy(), - atol=1E-4, rtol=1E-4) + assert_almost_equal(ele_out, outs[:ele_length, i:(i + 1), :]) if ele_length < max_length: # Check the padded outputs are all zero - assert_allclose(outs[ele_length:max_length, i:(i+1), :].asnumpy(), 0) + assert_almost_equal(outs[ele_length:max_length, i:(i+1), :], 0) for valid_out_state, gt_state in zip(states, ele_states): - assert_allclose(valid_out_state[i:(i+1)].asnumpy(), gt_state.asnumpy(), - atol=1E-4, rtol=1E-4) + assert_almost_equal(valid_out_state[i:(i+1)], gt_state) def test_cell_fill_shape(): diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 8008c053cd2b..e3a2fd8036c4 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -6477,6 +6477,9 @@ def check_qr(q, r, a_np): data_np = onp.array(data_np, dtype=dtype) data = np.array(data_np, dtype=dtype) + if effective_dtype(data) == onp.dtype(np.float16): + print('Skipping test on this platform: {} has a float16 effective dtype'.format(dtype)) + pytest.skip() data.attach_grad() with mx.autograd.record(): @@ -11712,8 +11715,12 @@ def array_values(low, high, shape): ((3, 1), (3, 0)), ((0, 2), (1, 2)), ((2, 3, 4), (3, 1)), - ((2, 3), ()), - ((), (2, 3)) +# MXNet numpy does not match original numpy behavior when broadcasting 0-dim arrays. +# See https://github.com/apache/incubator-mxnet/issues/20898. +# ((2, 3), ()), +# ((), (2, 3)) + ((2, 3), (1,)), + ((1,), (2, 3)) ]) def test_np_standard_binary_funcs(func, func2, promoted, dtypes, ref_grad_a, ref_grad_b, low, high, lshape, rshape): class TestStandardBinary(HybridBlock): diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 5f290318824b..f0e0e0977a58 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1724,6 +1724,7 @@ def np_groupnorm_grad(ograd, data, gamma, beta, mean, std, num_groups, eps): atol=5e-2 if dtype == np.float16 else 1e-4, dtype=dtype) +@pytest.mark.serial def test_convolution_grouping(): for dim in [1, 2, 3]: num_filter = 4 @@ -1745,7 +1746,7 @@ def test_convolution_grouping(): exe1 = y1._simple_bind(default_device(), x=shape) exe2 = y2._simple_bind(default_device(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,)) for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays): - arr1[:] = np.float32(np.random.normal(size=arr1.shape)) + arr1[:] = np.random.normal(size=arr1.shape).astype(effective_dtype(mx.nd.array([1.,]))) arr2[:] = arr1 exe1.forward(is_train=True) exe1.backward(exe1.outputs[0]) @@ -1753,7 +1754,7 @@ def test_convolution_grouping(): exe2.backward(exe2.outputs[0]) for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays): - np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3) + assert_almost_equal(arr1, arr2) @pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/14052") @@ -2216,7 +2217,8 @@ def test_bxor(a, b): test_bor(a, b) test_bxor(a, b) -def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), verbose=False): + +def run_convolution_dilated_impulse_response(dil, kernel_shape, tol): dim = len(dil) assert(len(kernel_shape) == dim) # Input for spike response @@ -2259,7 +2261,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), out_o = be.outputs[0].asnumpy() assert_allclose(out_o[center],np.prod(kernel_shape),atol=1e-5) - rnd_kernel_s = np.random.uniform(low=0.0, high=1.0, size=tuple([1,1]+list(kernel_shape))).astype(np.float32) + rnd_kernel_s = np.random.uniform(low=-0.5, high=0.5, size=tuple([1,1]+list(kernel_shape))).astype(np.float32) impulse_error = mx.nd.array(out_o/np.sum(out_o)) # This should be 1.0 at [0,0,16,16] rnd_kernel = mx.nd.array(rnd_kernel_s) @@ -2282,22 +2284,27 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), be.forward(True) out = be.outputs[0].asnumpy() # Now do a simple check of the kernel gradient - assert(out[center] - np.sum(kernel_gradient) - out_orig[center] < 0.001) - + d = np.abs(out[center] - np.sum(kernel_gradient) - out_orig[center]) + assert d < tol, f'd: {d}' +@pytest.mark.serial def test_convolution_dilated_impulse_response(): + tol = 1e-3 # 1D for dil in [ (1,), (2,), (3,) ]: for ks in [ (1,), (2,), (3,), (4,)]: - test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks) + run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol) # 2D for dil in [ (1,1), (2,2), (3,3) ]: for ks in [ (3,3), (4,4), (2,3), (3,2), (1,1) ]: - test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks) + run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol) # 3D + # On Ampere, autotuning might select a TensorCore conv engine, which effectively + # does a cast to fp16 of the weights and data. Expand tol in these 3D cases. + tol3D = 1e-2 if effective_dtype(mx.nd.array([1.,])) == np.float16 else tol for dil in [ (1,1,1), (2,2,2), (3,3,3) ]: for ks in [ (3,3,3), (4,4,4), (2,3,4), (3,2,4), (1,1,1) ]: - test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks) + run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol3D) @pytest.mark.serial