diff --git a/tripy/tests/integration/conftest.py b/tripy/tests/integration/conftest.py new file mode 100644 index 000000000..1229219f2 --- /dev/null +++ b/tripy/tests/integration/conftest.py @@ -0,0 +1,61 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2024-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest + +import tripy as tp + + +@pytest.fixture(params=["compile", "eager"]) +def eager_or_compiled(request): + def wrapper(func, *args, **kwargs): + def get_input_info(x: tp.Tensor): + return tp.InputInfo(list(map(int, x.shape)), dtype=x.dtype) + + if request.param == "eager": + return func(*args, **kwargs) + + assert request.param == "compile" + + compile_args = [] + for arg in args: + # We don't want to feed DimensionSize as a dynamic input to the compiler (https://github.com/NVIDIA/TensorRT-Incubator/issues/65). + if isinstance(arg, tp.Tensor) and not isinstance(arg, tp.DimensionSize): + compile_args.append(get_input_info(arg)) + else: + compile_args.append(arg) + compile_args = tuple(compile_args) + + compile_kwargs = dict( + ( + k, + ((get_input_info(v) if isinstance(v, tp.Tensor) and not isinstance(v, tp.DimensionSize) else v)), + ) + for k, v in kwargs.items() + ) + + compiled_func = tp.compile(func, args=compile_args, kwargs=compile_kwargs) + + tensor_args = tuple(x for x in args if isinstance(x, tp.Tensor) and not isinstance(x, tp.DimensionSize)) + + tensor_kwargs = { + k: v for k, v in kwargs.items() if isinstance(v, tp.Tensor) and not isinstance(v, tp.DimensionSize) + } + + return compiled_func(*tensor_args, **tensor_kwargs) + + return wrapper diff --git a/tripy/tests/integration/test_batchnorm.py b/tripy/tests/integration/test_batchnorm.py index 37f6cbf82..89a4c6715 100644 --- a/tripy/tests/integration/test_batchnorm.py +++ b/tripy/tests/integration/test_batchnorm.py @@ -26,7 +26,7 @@ class TestBatchNorm: @pytest.mark.parametrize("torch_dtype, tp_dtype", DTYPES) @pytest.mark.parametrize("input_shape", [(2, 2, 2, 2)]) - def test_batchnorm_accuracy(self, torch_dtype, tp_dtype, input_shape): + def test_batchnorm_accuracy(self, torch_dtype, tp_dtype, input_shape, eager_or_compiled): eps = 1e-5 num_features = input_shape[1] # Number of channels in the input tensor batchnorm = torch.nn.BatchNorm2d(num_features=num_features, eps=eps, dtype=torch_dtype) @@ -45,7 +45,7 @@ def test_batchnorm_accuracy(self, torch_dtype, tp_dtype, input_shape): input = torch.randn(input_shape, dtype=torch_dtype).to("cuda") tp_input = tp.Tensor(input, dtype=tp_dtype) - output = tp_batchnorm(tp_input) + output = eager_or_compiled(tp_batchnorm, tp_input) batchnorm.to("cuda").eval() with torch.no_grad(): diff --git a/tripy/tests/integration/test_cast.py b/tripy/tests/integration/test_cast.py index 3e5902924..634373237 100644 --- a/tripy/tests/integration/test_cast.py +++ b/tripy/tests/integration/test_cast.py @@ -30,54 +30,53 @@ class TestCast: [ (np.int32, np.float32), (np.float32, np.int32), - 
(np.int64, np.float32), - (np.float32, np.int64), - (np.int64, np.int32), - (np.int64, np.int8), (np.int32, np.int8), (np.float32, np.int8), - (np.int8, np.int64), (np.int8, np.int32), (np.int8, np.float32), # important to test conversion into bool because default StableHLO semantics # are simply to truncate to i1, which is not desirable (np.float32, bool), (np.int32, bool), - (np.int64, bool), # requires a dequantization first # TODO(#219): Dequantize fails with dynamic shapes # (np.int8, bool), ], ) - def test_cast(self, input_dtype, target_dtype): + def test_cast(self, input_dtype, target_dtype, eager_or_compiled): tp_input_dtype = NUMPY_TO_TRIPY[input_dtype] tp_target_dtype = NUMPY_TO_TRIPY[target_dtype] # TODO(#222): Integer casts with negative numbers fail in many cases input_tensor = tp.Tensor([0, 1, 2], dtype=tp_input_dtype) np_input = cp.from_dlpack(input_tensor).get() - output = tp.cast(input_tensor, tp_target_dtype) + output = eager_or_compiled(tp.cast, input_tensor, tp_target_dtype) assert np.array_equal(cp.from_dlpack(output).get(), np_input.astype(target_dtype)) # these dtypes don't have analogues in numpy @pytest.mark.parametrize("source_dtype", [pytest.param(tp.float8, marks=skip_if_older_than_sm89), tp.int4]) - def test_cast_quantized_dtypes_into_bool(self, source_dtype): + def test_cast_quantized_dtypes_into_bool(self, source_dtype, eager_or_compiled): # TODO(#223): Using an odd size leads to a strange crash, so can't just use [-1.0, 0.0, 1.0] input_tensor = tp.Tensor([-1.0, 0.0, 0.0, 1.0], dtype=tp.float32) - q = tp.quantize(input_tensor, scale=1.0, dtype=source_dtype) - output = tp.cast(q, tp.bool) + + def func(input): + q = tp.quantize(input, scale=1.0, dtype=source_dtype) + output = tp.cast(q, tp.bool) + return output + + output = eager_or_compiled(func, input_tensor) assert cp.from_dlpack(output).get().tolist() == [True, False, False, True] - @pytest.mark.parametrize("target_dtype", [np.float32, np.int32, np.int64, np.int8]) - def test_cast_from_bool(self, target_dtype): + @pytest.mark.parametrize("target_dtype", [np.float32, np.int32, np.int8]) + def test_cast_from_bool(self, target_dtype, eager_or_compiled): tp_target_dtype = NUMPY_TO_TRIPY[target_dtype] # in principle, it is not important what *specific* values we convert to, # so long as false is mapped to 0 and true to nonzero input_tensor = tp.Tensor([False, True], dtype=tp.bool) np_input = cp.from_dlpack(input_tensor).get() - output = tp.cast(input_tensor, tp_target_dtype) + output = eager_or_compiled(tp.cast, input_tensor, tp_target_dtype) tp_compare_to_zero = cp.from_dlpack(output).get() == 0 np_compare_to_zero = np_input.astype(target_dtype) == 0 diff --git a/tripy/tests/integration/test_concatenate.py b/tripy/tests/integration/test_concatenate.py index 01ea823b5..9df2d2f70 100644 --- a/tripy/tests/integration/test_concatenate.py +++ b/tripy/tests/integration/test_concatenate.py @@ -33,9 +33,9 @@ class TestConcatenate: ([(2, 3, 4)], 0), ], ) - def test_concat(self, tensor_shapes, dim): + def test_concat(self, tensor_shapes, dim, eager_or_compiled): tensors = [tp.ones(shape) for shape in tensor_shapes] - out = tp.concatenate(tensors, dim=dim) + out = eager_or_compiled(tp.concatenate, tensors, dim=dim) assert np.array_equal( cp.from_dlpack(out).get(), np.concatenate([np.ones(shape) for shape in tensor_shapes], axis=dim) ) @@ -44,8 +44,8 @@ def test_concat(self, tensor_shapes, dim): "tensor_shapes, dim", [([(2, 3, 4), (2, 4, 4)], 0), ([(4, 5, 6), (4, 1, 6)], -1)], ) - def test_negative_concat(self, 
tensor_shapes, dim): + def test_negative_concat(self, tensor_shapes, dim, eager_or_compiled): tensors = [tp.ones(shape) for shape in tensor_shapes] with helper.raises(tp.TripyException, match=f"not compatible at non-concat index"): - out = tp.concatenate(tensors, dim=dim) + out = eager_or_compiled(tp.concatenate, tensors, dim=dim) print(out) diff --git a/tripy/tests/integration/test_conv.py b/tripy/tests/integration/test_conv.py index 3f67c6629..078c2890d 100644 --- a/tripy/tests/integration/test_conv.py +++ b/tripy/tests/integration/test_conv.py @@ -75,7 +75,7 @@ class ConvTestCase: @pytest.mark.parametrize("torch_dtype,tp_dtype", DTYPES) class TestConvolution: @pytest.mark.parametrize("test_case", test_cases_1d) - def test_convolution_1d(self, torch_dtype, tp_dtype, test_case): + def test_convolution_1d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled): if not test_case.torch_pad: test_case.torch_pad = 0 if not test_case.stride: @@ -122,7 +122,7 @@ def test_convolution_1d(self, torch_dtype, tp_dtype, test_case): conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype) expected = conv_layer_torch(input_torch).to(torch_dtype) - output = conv_layer(input) + output = eager_or_compiled(conv_layer, input) # FP32 kernel seems to lose some precision, and FP16 needs to be run in FP32 on torch rtol_ = 4e-5 if tp_dtype == tp.float32 else 1e-3 @@ -131,7 +131,7 @@ def test_convolution_1d(self, torch_dtype, tp_dtype, test_case): assert list(output_torch.shape) == list(expected.shape) @pytest.mark.parametrize("test_case", test_cases_2d) - def test_convolution_2d(self, torch_dtype, tp_dtype, test_case): + def test_convolution_2d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled): if not test_case.torch_pad: test_case.torch_pad = 0 if not test_case.stride: @@ -178,7 +178,7 @@ def test_convolution_2d(self, torch_dtype, tp_dtype, test_case): conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype) expected = conv_layer_torch(input_torch).to(torch_dtype) - output = conv_layer(input) + output = eager_or_compiled(conv_layer, input) rtol_ = 2e-7 if tp_dtype == tp.float32 else 1.5e-3 output_torch = torch.from_dlpack(output) @@ -186,7 +186,7 @@ def test_convolution_2d(self, torch_dtype, tp_dtype, test_case): assert list(output_torch.shape) == list(expected.shape) @pytest.mark.parametrize("test_case", test_cases_3d) - def test_convolution_3d(self, torch_dtype, tp_dtype, test_case): + def test_convolution_3d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled): pytest.skip("TODO (#260): Fix accuracy bugs in 3D conv") if not test_case.torch_pad: test_case.torch_pad = 0 @@ -245,14 +245,14 @@ def test_convolution_3d(self, torch_dtype, tp_dtype, test_case): return expected = conv_layer_torch(input_torch).to(torch_dtype) - output = conv_layer(input) + output = eager_or_compiled(conv_layer, input) rtol_ = 2e-4 if tp_dtype == tp.float32 else 1.4e-3 # 3d conv has greater accumulation error output_torch = torch.from_dlpack(output) assert torch.allclose(output_torch, expected, rtol=rtol_) assert list(output_torch.shape) == list(expected.shape) - def test_uneven_padding(self, torch_dtype, tp_dtype): + def test_uneven_padding(self, torch_dtype, tp_dtype, eager_or_compiled): input_torch = torch.arange(200, dtype=torch.float32, device=torch.device("cuda")).reshape(*(2, 4, 5, 5)) input = tp.cast(tp.Tensor(input_torch), tp_dtype) @@ -282,7 +282,7 @@ def test_uneven_padding(self, torch_dtype, tp_dtype): input_torch = torch_pad(input_torch) expected = 
conv_layer_torch(input_torch).to(torch_dtype) - output = conv_layer(input) + output = eager_or_compiled(conv_layer, input) rtol_ = 2e-7 if tp_dtype == tp.float32 else 2e-3 output_torch = torch.from_dlpack(output) diff --git a/tripy/tests/integration/test_conv_transpose.py b/tripy/tests/integration/test_conv_transpose.py index 9cc95f890..2245d024b 100644 --- a/tripy/tests/integration/test_conv_transpose.py +++ b/tripy/tests/integration/test_conv_transpose.py @@ -81,7 +81,7 @@ class ConvTestCase: @pytest.mark.parametrize("torch_dtype,tp_dtype", DTYPES) class TestConvolution: @pytest.mark.parametrize("test_case", test_cases_transpose_1d) - def test_transposed_convolution_1d(self, torch_dtype, tp_dtype, test_case): + def test_transposed_convolution_1d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled): if not test_case.torch_pad: test_case.torch_pad = 0 if not test_case.stride: @@ -129,14 +129,14 @@ def test_transposed_convolution_1d(self, torch_dtype, tp_dtype, test_case): conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype) expected = conv_layer_torch(input_torch).to(torch_dtype) - output = conv_layer(input) + output = eager_or_compiled(conv_layer, input) - rtol_ = 1e-3 + rtol_ = 3e-3 assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_) assert output.shape == list(expected.shape) @pytest.mark.parametrize("test_case", test_cases_transpose_2d) - def test_transposed_convolution_2d(self, torch_dtype, tp_dtype, test_case): + def test_transposed_convolution_2d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled): if not test_case.torch_pad: test_case.torch_pad = 0 if not test_case.stride: @@ -184,14 +184,14 @@ def test_transposed_convolution_2d(self, torch_dtype, tp_dtype, test_case): conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype) expected = conv_layer_torch(input_torch).to(torch_dtype) - output = conv_layer(input) + output = eager_or_compiled(conv_layer, input) rtol_ = 1e-2 assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_) assert output.shape == list(expected.shape) @pytest.mark.parametrize("test_case", test_cases_transpose_3d) - def test_transposed_convolution_3d(self, torch_dtype, tp_dtype, test_case): + def test_transposed_convolution_3d(self, torch_dtype, tp_dtype, test_case, eager_or_compiled): if not test_case.torch_pad: test_case.torch_pad = 0 if not test_case.stride: @@ -239,12 +239,12 @@ def test_transposed_convolution_3d(self, torch_dtype, tp_dtype, test_case): conv_layer.bias = tp.cast(tp.Tensor(conv_layer_torch.bias.data), tp_dtype) expected = conv_layer_torch(input_torch).to(torch_dtype) - output = conv_layer(input) + output = eager_or_compiled(conv_layer, input) rtol_ = 1.3e-6 if tp_dtype == tp.float32 else 1.6e-3 assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_) assert output.shape == list(expected.shape) - def test_transposed_equivalency(self, torch_dtype, tp_dtype): + def test_transposed_equivalency(self, torch_dtype, tp_dtype, eager_or_compiled): input_torch = torch.arange(9, dtype=torch.float32, device=torch.device("cuda")).reshape(*(1, 1, 3, 3)) input = tp.cast(tp.Tensor(input_torch), tp_dtype) @@ -277,8 +277,8 @@ def test_transposed_equivalency(self, torch_dtype, tp_dtype): expected = conv_layer_torch(input_torch).to(torch_dtype) expected_transpose = conv_transpose_layer_torch(input_torch).to(torch_dtype) - output = conv_layer(input) - output_transpose = conv_transpose_layer(input) + output = eager_or_compiled(conv_layer, input) + output_transpose = 
eager_or_compiled(conv_transpose_layer, input) rtol_ = 2e-7 if tp_dtype == tp.float32 else 9e-4 assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_) @@ -291,7 +291,7 @@ def test_transposed_equivalency(self, torch_dtype, tp_dtype): assert list(expected.shape) == list(expected_transpose.shape) @pytest.mark.parametrize("test_case", test_cases_transpose_downscale) - def test_transposed_downscale(self, torch_dtype, tp_dtype, test_case): + def test_transposed_downscale(self, torch_dtype, tp_dtype, test_case, eager_or_compiled): input_torch = torch.arange(9, dtype=torch.float32, device=torch.device("cuda")).reshape(*(1, 1, 3, 3)) input = tp.cast(tp.Tensor(input_torch), tp_dtype) @@ -320,7 +320,7 @@ def test_transposed_downscale(self, torch_dtype, tp_dtype, test_case): conv_layer.weight = tp.cast(tp.Tensor(conv_layer_torch.weight.data), tp_dtype) expected = conv_layer_torch(input_torch).to(torch_dtype) - output = conv_layer(input) + output = eager_or_compiled(conv_layer, input) rtol_ = 1e-15 if tp_dtype == tp.float32 else 1e-10 assert tp.allclose(output, tp.Tensor(expected), rtol=rtol_) diff --git a/tripy/tests/integration/test_cumsum.py b/tripy/tests/integration/test_cumsum.py index c8f8bbb7e..2360f3eaa 100644 --- a/tripy/tests/integration/test_cumsum.py +++ b/tripy/tests/integration/test_cumsum.py @@ -30,11 +30,10 @@ class TestCumsum: ([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], 0, [[[1, 2], [3, 4]], [[6, 8], [10, 12]]]), ], ) - def test_cumsum(self, data, dim, expected): + def test_cumsum(self, data, dim, expected, eager_or_compiled): inp = tp.Tensor(data, dtype=tp.float32) - out = tp.cumsum(inp, dim=dim) - + out = eager_or_compiled(tp.cumsum, inp, dim=dim) expected = tp.Tensor(expected, dtype=tp.float32) assert tp.allclose(out, expected) assert out.shape == expected.shape diff --git a/tripy/tests/integration/test_dequantize.py b/tripy/tests/integration/test_dequantize.py index f44f3a23b..4924ab9a6 100644 --- a/tripy/tests/integration/test_dequantize.py +++ b/tripy/tests/integration/test_dequantize.py @@ -29,12 +29,16 @@ class TestDequantize: @pytest.mark.parametrize( "dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)] ) - def test_dequantize_int8_per_tensor(self, dtype): + def test_dequantize_int8_per_tensor(self, dtype, eager_or_compiled): data = [4, 8] input_tp = tp.Tensor(data, dtype=tp.int8) scale = torch.tensor(0.5, dtype=TORCH_DTYPES[dtype]) scale_tp = tp.Tensor(scale, dtype=dtype) - dequantized = tp.dequantize(input_tp, scale_tp, dtype) + + def func(input): + return tp.dequantize(input, scale_tp, dtype) + + dequantized = eager_or_compiled(func, input_tp) expected = torch.tensor(data) * scale output = torch.from_dlpack(dequantized) assert torch.allclose(expected, output.to("cpu")) @@ -42,7 +46,7 @@ def test_dequantize_int8_per_tensor(self, dtype): @pytest.mark.parametrize( "dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)] ) - def test_dequantize_int8_per_channel(self, dtype): + def test_dequantize_int8_per_channel(self, dtype, eager_or_compiled): # TODO: Fix in #153 if dtype == tp.float16: pytest.skip("TRT does not support fp16->int8 per-channel dequant.") @@ -50,7 +54,11 @@ def test_dequantize_int8_per_channel(self, dtype): input_tp = tp.Tensor(data, dtype=tp.int8) scale = torch.tensor([0.8, 0.9], dtype=TORCH_DTYPES[dtype]) scale_tp = tp.Tensor(scale, dtype=dtype) - dequantized = tp.dequantize(input_tp, scale_tp, dtype, dim=0) + + def func(input): + return tp.dequantize(input, scale_tp, dtype, dim=0) + + 
dequantized = eager_or_compiled(func, input_tp) expected = torch.tensor(data) * scale.reshape((2, 1)) output = torch.from_dlpack(dequantized) assert torch.allclose(expected, output.to("cpu")) diff --git a/tripy/tests/integration/test_expand.py b/tripy/tests/integration/test_expand.py index d2ab402de..09b1fcfca 100644 --- a/tripy/tests/integration/test_expand.py +++ b/tripy/tests/integration/test_expand.py @@ -22,24 +22,24 @@ class TestExpand: - def test_int_sizes(self): + def test_int_sizes(self, eager_or_compiled): input = tp.ones((2, 1)) - out = tp.expand(input, (-1, 2)) + out = eager_or_compiled(tp.expand, input, (-1, 2)) assert np.array_equal(cp.from_dlpack(out).get(), np.ones((2, 2), dtype=np.float32)) - def test_shape_sizes(self): + def test_shape_sizes(self, eager_or_compiled): input = tp.ones((2, 1)) a = tp.ones((2, 4)) - out = tp.expand(input, a.shape) + out = eager_or_compiled(tp.expand, input, a.shape) assert np.array_equal(cp.from_dlpack(out).get(), np.ones((2, 4), dtype=np.float32)) - def test_extra_dims(self): + def test_extra_dims(self, eager_or_compiled): input = tp.ones((2, 1)) - out = tp.expand(input, (1, -1, 2)) + out = eager_or_compiled(tp.expand, input, (1, -1, 2)) assert np.array_equal(cp.from_dlpack(out).get(), np.ones((1, 2, 2), dtype=np.float32)) - def test_mixed_sizes(self): + def test_mixed_sizes(self, eager_or_compiled): input = tp.ones((2, 1, 1)) a = tp.ones((4, 4)) - out = tp.expand(input, (-1, a.shape[0], a.shape[1])) + out = eager_or_compiled(tp.expand, input, (-1, a.shape[0], a.shape[1])) assert np.array_equal(cp.from_dlpack(out).get(), np.ones((2, 4, 4), dtype=np.float32)) diff --git a/tripy/tests/integration/test_flatten.py b/tripy/tests/integration/test_flatten.py index da16c181b..59bc32f57 100644 --- a/tripy/tests/integration/test_flatten.py +++ b/tripy/tests/integration/test_flatten.py @@ -29,29 +29,29 @@ class TestFlatten: ((2, 3, 4, 5), 1, 3, (2, 60)), # Flatten dimensions 1 through 3 ], ) - def test_flatten(self, shape, start_dim, end_dim, expected_shape): + def test_flatten(self, shape, start_dim, end_dim, expected_shape, eager_or_compiled): cp_a = cp.arange(np.prod(shape)).reshape(shape).astype(np.float32) a = tp.Tensor(cp_a) - b = tp.flatten(a, start_dim=start_dim, end_dim=end_dim) + b = eager_or_compiled(tp.flatten, a, start_dim=start_dim, end_dim=end_dim) assert b.shape == list(expected_shape) assert np.array_equal(cp.from_dlpack(b).get(), cp_a.reshape(expected_shape).get()) - def test_flatten_invalid_dims(self): + def test_flatten_invalid_dims(self, eager_or_compiled): shape = (2, 3, 4) with pytest.raises(tp.TripyException, match="Invalid dimensions"): a = tp.ones(shape) # Invalid because end_dim < start_dim - tp.flatten(a, start_dim=2, end_dim=1) + eager_or_compiled(tp.flatten, a, start_dim=2, end_dim=1) - def test_flatten_single_dim(self): + def test_flatten_single_dim(self, eager_or_compiled): shape = (2, 3, 4) a = tp.ones(shape) # Flattening a single dimension should not change the output - b = tp.flatten(a, start_dim=1, end_dim=1) + b = eager_or_compiled(tp.flatten, a, start_dim=1, end_dim=1) assert b.shape == [2, 3, 4] assert np.array_equal(cp.from_dlpack(b).get(), np.ones(shape, dtype=np.float32)) - def test_flatten_with_unknown_dims(self): + def test_flatten_with_unknown_dims(self, eager_or_compiled): a = tp.ones((2, 3, 4, 5)) - b = tp.flatten(a, start_dim=1, end_dim=-1) + b = eager_or_compiled(tp.flatten, a, start_dim=1, end_dim=-1) assert np.array_equal(cp.from_dlpack(b).get(), np.ones((2, 60), dtype=np.float32)) diff --git 
a/tripy/tests/integration/test_flip.py b/tripy/tests/integration/test_flip.py index 8118716d5..ef53f6c1a 100644 --- a/tripy/tests/integration/test_flip.py +++ b/tripy/tests/integration/test_flip.py @@ -26,34 +26,34 @@ class TestFlip: "dims", [0, 1, None, [0, 1], [1, 0], -1, -2, [0, -1], [-2, 1]], ) - def test_flip(self, dims): + def test_flip(self, dims, eager_or_compiled): cp_a = cp.arange(16).reshape((4, 4)).astype(cp.float32) a = tp.Tensor(cp_a, device=tp.device("gpu")) f = tp.flip(a, dims=dims) assert np.array_equal(cp.from_dlpack(f).get(), np.flip(cp_a.get(), axis=dims)) # also ensure that flipping a second time restores the original value - f2 = tp.flip(f, dims=dims) + f2 = eager_or_compiled(tp.flip, f, dims=dims) assert cp.array_equal(cp.from_dlpack(f2), cp_a) - def test_no_op(self): + def test_no_op(self, eager_or_compiled): cp_a = cp.arange(16).reshape((4, 4)).astype(cp.float32) a = tp.Tensor(cp_a, device=tp.device("gpu")) - f = tp.flip(a, dims=[]) + f = eager_or_compiled(tp.flip, a, dims=[]) assert tp.equal(a, f) - def test_zero_rank(self): + def test_zero_rank(self, eager_or_compiled): t = tp.Tensor(1) - f = tp.flip(t) + f = eager_or_compiled(tp.flip, t) assert tp.equal(t, f) @pytest.mark.parametrize( "dims1, dims2", [(0, -2), (1, -1), ([0, 1], None), ([0, 1], [1, 0]), ([0, 1], [-2, -1])], ) - def test_equivalences(self, dims1, dims2): + def test_equivalences(self, dims1, dims2, eager_or_compiled): cp_a = cp.arange(16).reshape((4, 4)).astype(cp.float32) a = tp.Tensor(cp_a, device=tp.device("gpu")) - f1 = tp.flip(a, dims=dims1) - f2 = tp.flip(a, dims=dims2) + f1 = eager_or_compiled(tp.flip, a, dims=dims1) + f2 = eager_or_compiled(tp.flip, a, dims=dims2) assert tp.equal(f1, f2) diff --git a/tripy/tests/integration/test_full.py b/tripy/tests/integration/test_full.py index 9a04c1664..d96885ffa 100644 --- a/tripy/tests/integration/test_full.py +++ b/tripy/tests/integration/test_full.py @@ -22,21 +22,21 @@ class TestFull: - def test_normal_shape(self): - out = tp.full((2, 2), 5.0, tp.float32) + def test_normal_shape(self, eager_or_compiled): + out = eager_or_compiled(tp.full, (2, 2), 5.0, tp.float32) assert np.array_equal(cp.from_dlpack(out).get(), np.full((2, 2), 5.0, np.float32)) - def test_shape_tensor(self): + def test_shape_tensor(self, eager_or_compiled): a = tp.ones((2, 3)) - out = tp.full(a.shape, 5.0, tp.float32) + out = eager_or_compiled(tp.full, a.shape, 5.0, tp.float32) assert np.array_equal(cp.from_dlpack(out).get(), np.full((2, 3), 5.0, np.float32)) - def test_mixed_shape(self): + def test_mixed_shape(self, eager_or_compiled): a = tp.ones((2, 3)) - out = tp.full((a.shape[0], 4), 5.0, tp.float32) + out = eager_or_compiled(tp.full, (a.shape[0], 4), 5.0, tp.float32) assert np.array_equal(cp.from_dlpack(out).get(), np.full((2, 4), 5.0, np.float32)) - def test_value_as_tensor(self): + def test_value_as_tensor(self, eager_or_compiled): a = tp.ones((2, 3)) - out = tp.full((a.shape[0], 4), tp.Tensor(8.0), tp.float32) + out = eager_or_compiled(tp.full, (a.shape[0], 4), tp.Tensor(8.0), tp.float32) assert np.array_equal(cp.from_dlpack(out).get(), np.full((2, 4), 8.0, np.float32)) diff --git a/tripy/tests/integration/test_gather.py b/tripy/tests/integration/test_gather.py index e2f088346..d0e05a118 100644 --- a/tripy/tests/integration/test_gather.py +++ b/tripy/tests/integration/test_gather.py @@ -34,11 +34,11 @@ class TestGatherOp: ((2, 3, 4), 1, (2)), ], ) - def test_gather(self, x_shape, axis, indices): + def test_gather(self, x_shape, axis, indices, eager_or_compiled): x = 
np.arange(np.prod(x_shape)).reshape(x_shape) indices_tp = tp.Tensor(indices) a = tp.Tensor(x) a = tp.cast(a, tp.int32) - out = tp.gather(a, axis, indices_tp) - out.eval() + out = eager_or_compiled(tp.gather, a, axis, indices_tp) + assert np.array_equal(cp.from_dlpack(out).get(), np.take(x, indices, axis)) diff --git a/tripy/tests/integration/test_groupnorm.py b/tripy/tests/integration/test_groupnorm.py index 5f1cd8bc3..d56c15928 100644 --- a/tripy/tests/integration/test_groupnorm.py +++ b/tripy/tests/integration/test_groupnorm.py @@ -29,7 +29,7 @@ class TestGroupNorm: @pytest.mark.parametrize("input_shape", [(1, 10, 2)]) @pytest.mark.parametrize("num_groups", [2, 5]) @pytest.mark.parametrize("num_channels", [10]) - def test_groupnorm_accuracy(self, torch_dtype, tp_dtype, input_shape, num_groups, num_channels): + def test_groupnorm_accuracy(self, torch_dtype, tp_dtype, input_shape, num_groups, num_channels, eager_or_compiled): eps = 1e-5 groupnorm = torch.nn.GroupNorm( num_groups=num_groups, @@ -50,7 +50,7 @@ def test_groupnorm_accuracy(self, torch_dtype, tp_dtype, input_shape, num_groups input = torch.arange(torch.prod(torch.Tensor(input_shape))).reshape(input_shape).to(torch_dtype) tp_input = tp.Tensor(input, dtype=tp_dtype) - output = tp.copy(tp_groupnorm(tp_input), tp.device("cpu")) + output = eager_or_compiled(tp.copy, tp_groupnorm(tp_input), tp.device("cpu")) with torch.no_grad(): expected = groupnorm(input) diff --git a/tripy/tests/integration/test_iota.py b/tripy/tests/integration/test_iota.py index 2df779da2..44cb38ab6 100644 --- a/tripy/tests/integration/test_iota.py +++ b/tripy/tests/integration/test_iota.py @@ -49,17 +49,13 @@ def _compute_ref_iota(self, dtype, shape, dim): "shape, dim", [ ((2, 3), 1), - ((2, 3), None), + ((2, 3), 0), ((2, 3), -1), ((2, 3, 4), 2), ], ) - def test_iota(self, dtype, shape, dim): - if dim: - output = tp.iota(shape, dim, dtype[1]) - else: - output = tp.iota(shape, dtype=dtype[1]) - + def test_iota(self, dtype, shape, dim, eager_or_compiled): + output = eager_or_compiled(tp.iota, shape, dim, dtype[1]) assert np.array_equal(cp.from_dlpack(output).get(), self._compute_ref_iota(dtype[0], shape, dim)) @pytest.mark.parametrize("dtype", DTYPE_PARAMS) @@ -72,11 +68,11 @@ def test_iota(self, dtype, shape, dim): ((2, 3, 4), 2), ], ) - def test_iota_like(self, dtype, shape, dim): + def test_iota_like(self, dtype, shape, dim, eager_or_compiled): if dim: - output = tp.iota_like(tp.ones(shape), dim, dtype[1]) + output = eager_or_compiled(tp.iota_like, tp.ones(shape), dim, dtype[1]) else: - output = tp.iota_like(tp.ones(shape), dtype=dtype[1]) + output = eager_or_compiled(tp.iota_like, tp.ones(shape), dtype=dtype[1]) assert np.array_equal(cp.from_dlpack(output).get(), self._compute_ref_iota(dtype[0], shape, dim)) @@ -98,12 +94,12 @@ def test_negative_no_casting(self, dtype): ): print(out) - def test_iota_from_shape_tensor(self): + def test_iota_from_shape_tensor(self, eager_or_compiled): a = tp.ones((2, 2)) - output = tp.iota(a.shape) + output = eager_or_compiled(tp.iota, a.shape) assert np.array_equal(cp.from_dlpack(output).get(), self._compute_ref_iota("float32", (2, 2), 0)) - def test_iota_from_mixed_seqence(self): + def test_iota_from_mixed_seqence(self, eager_or_compiled): a = tp.ones((2, 2)) - output = tp.iota((3, a.shape[0])) + output = eager_or_compiled(tp.iota, (3, a.shape[0])) assert np.array_equal(cp.from_dlpack(output).get(), self._compute_ref_iota("float32", (3, 2), 0)) diff --git a/tripy/tests/integration/test_layernorm.py 
b/tripy/tests/integration/test_layernorm.py index 088054c39..b1304ae63 100644 --- a/tripy/tests/integration/test_layernorm.py +++ b/tripy/tests/integration/test_layernorm.py @@ -31,7 +31,7 @@ class TestLayerNorm: @pytest.mark.parametrize("torch_dtype, tp_dtype", DTYPES) @pytest.mark.parametrize("input_shape", [(2, 2, 2)]) @pytest.mark.parametrize("normalized_shape", [(2, 2), (2,)]) - def test_layernorm_accuracy(self, torch_dtype, tp_dtype, input_shape, normalized_shape): + def test_layernorm_accuracy(self, torch_dtype, tp_dtype, input_shape, normalized_shape, eager_or_compiled): eps = 1e-5 layernorm = torch.nn.LayerNorm( normalized_shape=normalized_shape, @@ -51,7 +51,7 @@ def test_layernorm_accuracy(self, torch_dtype, tp_dtype, input_shape, normalized input = torch.arange(torch.prod(torch.Tensor(input_shape))).reshape(input_shape).to(torch_dtype) tp_input = tp.Tensor(input, dtype=tp_dtype) - output = tp.copy(tp_layernorm(tp_input), tp.device("cpu")) + output = eager_or_compiled(tp.copy, tp_layernorm(tp_input), tp.device("cpu")) with torch.no_grad(): expected = layernorm(input) diff --git a/tripy/tests/integration/test_linear.py b/tripy/tests/integration/test_linear.py index ff4899a74..137d4a00d 100644 --- a/tripy/tests/integration/test_linear.py +++ b/tripy/tests/integration/test_linear.py @@ -25,7 +25,7 @@ class TestLinear: - def test_linear_module(self): + def test_linear_module(self, eager_or_compiled): class Network(tp.Module): def __init__(self): super().__init__() @@ -41,7 +41,7 @@ def __call__(self, x): cp_a1 = cp.ones((3, 4), dtype=cp.float32) a1 = tp.Tensor(cp_a1, device=tp.device("gpu")) - out = net(a1) + out = eager_or_compiled(net, a1) np_out = cp_a1.get() @ (np_weight.transpose()) + np_bias @@ -84,7 +84,7 @@ def __call__(self, x): @pytest.mark.parametrize("use_input_scale", [False, True]) @pytest.mark.parametrize("quant_dtype", [tp.int8, pytest.param(tp.float8, marks=skip_if_older_than_sm89)]) @pytest.mark.parametrize("weight_quant_dim", [None, 0, 1]) - def test_quant_linear(self, use_input_scale, quant_dtype, weight_quant_dim): + def test_quant_linear(self, use_input_scale, quant_dtype, weight_quant_dim, eager_or_compiled): net = self._create_network(use_input_scale, quant_dtype, weight_quant_dim) np_weight = cp.from_dlpack(net.linear.weight).get() np_bias = cp.from_dlpack(net.linear.bias).get() @@ -96,9 +96,9 @@ def test_quant_linear(self, use_input_scale, quant_dtype, weight_quant_dim): tp.TripyException, match="Unsupported quantization parameters for Linear module.", ): - out = net(a1) + out = eager_or_compiled(net, a1) else: - out = net(a1) + out = eager_or_compiled(net, a1) np_out = cp_a1.get() @ (np_weight.transpose()) + np_bias @@ -114,7 +114,7 @@ def test_quant_linear(self, use_input_scale, quant_dtype, weight_quant_dim): ], ids=["block-wise", "per-tensor", "per-channel-0", "per-channel-1"], ) - def test_quant_linear_int4_weight_only(self, weight_quant_dim, scale): + def test_quant_linear_int4_weight_only(self, weight_quant_dim, scale, eager_or_compiled): scale = tp.Parameter(scale) linear = tp.Linear(4, 8, quant_dtype=tp.int4, weight_quant_dim=weight_quant_dim) @@ -128,7 +128,7 @@ def test_quant_linear_int4_weight_only(self, weight_quant_dim, scale): cp_input = cp.ones((4, 4), dtype=np.float32) input = tp.Tensor(cp_input, device=tp.device("gpu")) - out = linear(input) + out = eager_or_compiled(linear, input) np_out = cp_input.get() @ (np_weight.transpose()) + np_bias diff --git a/tripy/tests/integration/test_matrix_multiplication.py 
b/tripy/tests/integration/test_matrix_multiplication.py index 57731b674..b19e38937 100644 --- a/tripy/tests/integration/test_matrix_multiplication.py +++ b/tripy/tests/integration/test_matrix_multiplication.py @@ -23,23 +23,27 @@ import tripy.common.datatype +def gemm(a, b): + return a @ b + + class TestMatrixMultiplication: - def test_2d_tensors(self): + def test_2d_tensors(self, eager_or_compiled): a_np = np.arange(6).reshape((2, 3)).astype(np.float32) b_np = np.arange(6).reshape((3, 2)).astype(np.float32) a = tp.Tensor(a_np) b = tp.Tensor(b_np) - out = a @ b + out = eager_or_compiled(gemm, a, b) assert tp.allclose(out, tp.Tensor(a_np @ b_np)) - def test_1d_tensors(self): + def test_1d_tensors(self, eager_or_compiled): a_np = np.arange(64).astype(np.float32) # 1D Tensor b_np = np.arange(64).astype(np.float32) # 1D Tensor a = tripy.Tensor(cp.asanyarray(a_np)) b = tripy.Tensor(cp.asanyarray(b_np)) - out = a @ b + out = eager_or_compiled(gemm, a, b) assert tp.allclose(out, tp.Tensor(cp.array(a_np @ b_np)), atol=1e-2) @pytest.mark.parametrize( @@ -53,11 +57,11 @@ def test_1d_tensors(self): ((1, 2, 3), (0, 0, 3, 2)), # Broadcasting batch dims with 0 ], ) - def test_broadcast_gemm(self, shape_a, shape_b): + def test_broadcast_gemm(self, shape_a, shape_b, eager_or_compiled): a_np = np.arange(np.prod(shape_a)).reshape(shape_a).astype(np.float32) b_np = np.arange(np.prod(shape_b)).reshape(shape_b).astype(np.float32) a = tp.Tensor(a_np) b = tp.Tensor(b_np) - out = a @ b + out = eager_or_compiled(gemm, a, b) assert tp.allclose(out, tp.Tensor(a_np @ b_np)) diff --git a/tripy/tests/integration/test_outer.py b/tripy/tests/integration/test_outer.py index 8ba7be979..53f8b5237 100644 --- a/tripy/tests/integration/test_outer.py +++ b/tripy/tests/integration/test_outer.py @@ -19,10 +19,10 @@ class TestOuter: - def test_outer(self): + def test_outer(self, eager_or_compiled): v1 = tp.arange(5, dtype=tp.float32) v2 = tp.arange(4, dtype=tp.float32) - output = tp.outer(v1, v2) + output = eager_or_compiled(tp.outer, v1, v2) t1 = torch.arange(5, dtype=torch.float32) t2 = torch.arange(4, dtype=torch.float32) @@ -30,9 +30,9 @@ def test_outer(self): assert output.shape == list(torch_out.shape) assert tp.allclose(output, tp.Tensor(torch_out)) - def test_empty(self): + def test_empty(self, eager_or_compiled): v1 = tp.Tensor([]) v2 = tp.arange(3, dtype=tp.float32) - output = tp.outer(v1, v2) + output = eager_or_compiled(tp.outer, v1, v2) assert output.shape == [0, 3] diff --git a/tripy/tests/integration/test_pad.py b/tripy/tests/integration/test_pad.py index 8843055ee..578cf4a12 100644 --- a/tripy/tests/integration/test_pad.py +++ b/tripy/tests/integration/test_pad.py @@ -29,19 +29,19 @@ class TestPad: (((1, 2), (2, 3)), 1), ], ) - def test_pad_constant(self, pad, value): + def test_pad_constant(self, pad, value, eager_or_compiled): inp = np.arange(4, dtype=np.int32).reshape((2, 2)) - out = tp.pad(tp.Tensor(inp), pad, value=value) + out = eager_or_compiled(tp.pad, tp.Tensor(inp), pad, value=value) expected = np.pad(inp, pad, constant_values=value) assert np.array_equal(cp.from_dlpack(out).get(), expected) - def test_pad_tensor(self): + def test_pad_tensor(self, eager_or_compiled): inp = np.arange(6, dtype=np.float32).reshape((2, 3)) inp_tp = tp.Tensor(inp) - out = tp.pad(tp.Tensor(inp), ((0, inp_tp.shape[0]), (inp_tp.shape[1], 0))) + out = eager_or_compiled(tp.pad, tp.Tensor(inp), ((0, inp_tp.shape[0]), (inp_tp.shape[1], 0))) expected = np.pad(inp, ((0, 2), (3, 0))) assert np.array_equal(cp.from_dlpack(out).get(), 
expected) diff --git a/tripy/tests/integration/test_pooling.py b/tripy/tests/integration/test_pooling.py index 86dd45a34..1e28f956e 100644 --- a/tripy/tests/integration/test_pooling.py +++ b/tripy/tests/integration/test_pooling.py @@ -32,7 +32,7 @@ class TestPooling: ) @pytest.mark.parametrize("dtype", [tp.float32, tp.float16, tp.int8]) @pytest.mark.parametrize("pool_type", ["max", "avg"]) - def test_pool_2d(self, kernel_dims, stride, padding, dtype, pool_type): + def test_pool_2d(self, kernel_dims, stride, padding, dtype, pool_type, eager_or_compiled): inp_tp = tp.reshape(tp.arange(64, dtype=dtype), (1, 1, 8, 8)) torch_padding = (padding[0][0], padding[1][0]) @@ -40,7 +40,7 @@ def test_pool_2d(self, kernel_dims, stride, padding, dtype, pool_type): pytest.skip("Torch average pool is not implemented for int8") if pool_type == "max": - out = tp.maxpool(inp_tp, kernel_dims=kernel_dims, stride=stride, padding=padding) + out = eager_or_compiled(tp.maxpool, inp_tp, kernel_dims=kernel_dims, stride=stride, padding=padding) pool_torch = torch.nn.MaxPool2d(kernel_size=kernel_dims, stride=stride, padding=torch_padding) elif pool_type == "avg": if torch_padding != (0, 0): @@ -48,7 +48,7 @@ def test_pool_2d(self, kernel_dims, stride, padding, dtype, pool_type): "https://github.com/NVIDIA/TensorRT-Incubator/issues/241: Tripy average pool is incorrect when padding != 0." ) - out = tp.avgpool(inp_tp, kernel_dims=kernel_dims, stride=stride, padding=padding) + out = eager_or_compiled(tp.avgpool, inp_tp, kernel_dims=kernel_dims, stride=stride, padding=padding) pool_torch = torch.nn.AvgPool2d(kernel_size=kernel_dims, stride=stride, padding=torch_padding) out_torch = torch.from_dlpack(out).to("cpu") @@ -64,7 +64,7 @@ def test_pool_2d(self, kernel_dims, stride, padding, dtype, pool_type): ) @pytest.mark.parametrize("dtype", [tp.float32, tp.float16]) @pytest.mark.parametrize("pool_type", ["max", "avg"]) - def test_pool_3d(self, kernel_dims, stride, padding, dtype, pool_type): + def test_pool_3d(self, kernel_dims, stride, padding, dtype, pool_type, eager_or_compiled): inp_tp = tp.reshape(tp.arange(512, dtype=dtype), (1, 1, 8, 8, 8)) torch_padding = (padding[0][0], padding[1][0], padding[2][0]) @@ -74,10 +74,10 @@ def test_pool_3d(self, kernel_dims, stride, padding, dtype, pool_type): ) if pool_type == "max": - out = tp.maxpool(inp_tp, kernel_dims=kernel_dims, stride=stride, padding=padding) + out = eager_or_compiled(tp.maxpool, inp_tp, kernel_dims=kernel_dims, stride=stride, padding=padding) pool_torch = torch.nn.MaxPool3d(kernel_size=kernel_dims, stride=stride, padding=torch_padding) elif pool_type == "avg": - out = tp.avgpool(inp_tp, kernel_dims=kernel_dims, stride=stride, padding=padding) + out = eager_or_compiled(tp.avgpool, inp_tp, kernel_dims=kernel_dims, stride=stride, padding=padding) pool_torch = torch.nn.AvgPool3d(kernel_size=kernel_dims, stride=stride, padding=torch_padding) out_torch = torch.from_dlpack(out).to("cpu") diff --git a/tripy/tests/integration/test_quantize.py b/tripy/tests/integration/test_quantize.py index 826db83bf..ee458d108 100644 --- a/tripy/tests/integration/test_quantize.py +++ b/tripy/tests/integration/test_quantize.py @@ -30,24 +30,42 @@ class TestQuantize: @pytest.mark.parametrize( "dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)] ) - def test_quantize_int8_per_tensor(self, dtype): + def test_quantize_int8_per_tensor(self, dtype, eager_or_compiled): input = torch.tensor([1.0, 2.0], dtype=TORCH_DTYPES[dtype]) scale = torch.tensor(0.5, 
dtype=TORCH_DTYPES[dtype]) input_tp = tp.Tensor(input, dtype=dtype) scale_tp = tp.Tensor(scale, dtype=dtype) - quantized = tp.quantize(input_tp, scale_tp, tp.int8) + + def func(input): + return tp.quantize(input, scale_tp, tp.int8) + + quantized = eager_or_compiled(func, input_tp) expected = (input / scale).to(dtype=torch.int8) assert torch.equal(expected, torch.from_dlpack(quantized).to("cpu")) @pytest.mark.parametrize( - "dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)] + "dtype", + [ + tp.float32, + pytest.param( + tp.float16, + marks=pytest.mark.skip( + reason="Known float16 precision issues due to https://github.com/NVIDIA/TensorRT-Incubator/issues/392" + ), + ), + pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80), + ], ) - def test_quantize_int8_per_channel(self, dtype): + def test_quantize_int8_per_channel(self, dtype, eager_or_compiled): input = torch.tensor([[1.0, 2.0], [3.0, 4.0]], dtype=TORCH_DTYPES[dtype]) scale = torch.tensor([0.2, 0.1], dtype=TORCH_DTYPES[dtype]) input_tp = tp.Tensor(input, dtype=dtype) scale_tp = tp.Tensor(scale, dtype=dtype) - quantized = tp.quantize(input_tp, scale_tp, tp.int8, dim=0) + + def func(input): + return tp.quantize(input, scale_tp, tp.int8, dim=0) + + quantized = eager_or_compiled(func, input_tp) expected = (input / scale.reshape(2, 1)).to(dtype=torch.int8) assert torch.equal(expected, torch.from_dlpack(quantized).to("cpu")) @@ -55,12 +73,16 @@ def test_quantize_int8_per_channel(self, dtype): "dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)] ) @skip_if_older_than_sm89 - def test_quantize_fp8_per_tensor(self, dtype): + def test_quantize_fp8_per_tensor(self, dtype, eager_or_compiled): input = torch.tensor([1.0, 2.0], dtype=TORCH_DTYPES[dtype]) scale = torch.tensor(0.5, dtype=TORCH_DTYPES[dtype]) input_tp = tp.Tensor(input, dtype=dtype) scale_tp = tp.Tensor(scale, dtype=dtype) - quantized = tp.quantize(input_tp, scale_tp, tp.float8) + + def func(input): + return tp.quantize(input, scale_tp, tp.float8) + + quantized = eager_or_compiled(func, input_tp) assert quantized.dtype == tp.float8 expected = (input / scale).to(dtype=torch.float32) with raises( @@ -74,12 +96,16 @@ def test_quantize_fp8_per_tensor(self, dtype): "dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)] ) @skip_if_older_than_sm89 - def test_quantize_fp8_per_channel(self, dtype): + def test_quantize_fp8_per_channel(self, dtype, eager_or_compiled): input = torch.tensor([[1.0, 2.0], [3.0, 4.0]], dtype=TORCH_DTYPES[dtype]) scale = torch.tensor([0.2, 0.1], dtype=TORCH_DTYPES[dtype]) input_tp = tp.Tensor(input, dtype=dtype) scale_tp = tp.Tensor(scale, dtype=dtype) - quantized = tp.quantize(input_tp, scale_tp, tp.float8, dim=0) + + def func(input): + return tp.quantize(input, scale_tp, tp.float8, dim=0) + + quantized = eager_or_compiled(func, input_tp) assert quantized.dtype == tp.float8 expected = (input / scale.reshape(2, 1)).to(dtype=torch.float32) with raises( @@ -93,7 +119,7 @@ def test_quantize_fp8_per_channel(self, dtype): "dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)] ) @pytest.mark.parametrize("quant_mode", ["block-wise", "per-tensor", "per-channel-0", "per-channel-1"]) - def test_qdq_int4(self, dtype, quant_mode): + def test_qdq_int4(self, dtype, quant_mode, eager_or_compiled): if quant_mode == "block-wise": dim = None scale = torch.ones((2, 4), dtype=TORCH_DTYPES[dtype]) @@ -109,14 +135,22 @@ def 
test_qdq_int4(self, dtype, quant_mode): input = torch.ones((4, 4), dtype=TORCH_DTYPES[dtype]) input_tp = tp.Tensor(input, dtype=dtype) - scale_tp = tp.Tensor(scale) - quantized = tp.quantize(input_tp, scale_tp, tp.int4, dim) - dequantized = tp.dequantize(quantized, scale_tp, dtype, dim) + + def func(inp): + scale_tp = tp.Tensor(scale) + quantized = tp.quantize(input_tp, scale_tp, tp.int4, dim) + dequantized = tp.dequantize(quantized, scale_tp, dtype, dim) + return dequantized + + dequantized = eager_or_compiled(func, input_tp) assert torch.equal(input, torch.from_dlpack(dequantized).to("cpu")) - def test_non_constant_scale(self): + def test_non_constant_scale(self, eager_or_compiled): input = tp.ones((4, 4)) scale = tp.ones((4,)) - quantized = tp.quantize(input, scale, tp.int8, dim=0) + def func(input): + return tp.quantize(input, scale, tp.int8, dim=0) + + quantized = eager_or_compiled(func, input) assert bool(cp.all(cp.from_dlpack(quantized) == cp.ones((4, 4), dtype=cp.int8))) diff --git a/tripy/tests/integration/test_reduce.py b/tripy/tests/integration/test_reduce.py index 66db0a0f4..bb922675b 100644 --- a/tripy/tests/integration/test_reduce.py +++ b/tripy/tests/integration/test_reduce.py @@ -36,10 +36,10 @@ class TestReduceOp: ((2, 3, 4, 5), (-2, -1), True), ], ) - def test_all(self, x_shape, axis, keepdim): + def test_all(self, x_shape, axis, keepdim, eager_or_compiled): x = np.array([i % 2 == 0 for i in np.arange(np.prod(x_shape))]).reshape(x_shape) a = tp.Tensor(x) - out = tp.all(a, dim=axis, keepdim=keepdim) + out = eager_or_compiled(tp.all, a, dim=axis, keepdim=keepdim) # np.array is necessary to deal with case where x.all returns a numpy scalar (5th case) expected = np.array(x.all(axis=axis, keepdims=keepdim)) assert np.array_equal(np.from_dlpack(tp.copy(out, device=tp.device("cpu"))), expected) @@ -56,10 +56,10 @@ def test_all(self, x_shape, axis, keepdim): ((2, 3, 4, 5), (-2, -1), True), ], ) - def test_any(self, x_shape, axis, keepdim): + def test_any(self, x_shape, axis, keepdim, eager_or_compiled): x = np.array([i % 2 == 0 for i in np.arange(np.prod(x_shape))]).reshape(x_shape) a = tp.Tensor(x) - out = tp.any(a, dim=axis, keepdim=keepdim) + out = eager_or_compiled(tp.any, a, dim=axis, keepdim=keepdim) expected = np.array(x.any(axis=axis, keepdims=keepdim)) assert np.array_equal(np.from_dlpack(tp.copy(out, device=tp.device("cpu"))), expected) @@ -81,11 +81,11 @@ def test_any(self, x_shape, axis, keepdim): ], ) @pytest.mark.parametrize("dtype", [tp.float32, tp.float16]) - def test_mean(self, x_shape, axis, keepdim: bool, dtype): + def test_mean(self, x_shape, axis, keepdim: bool, dtype, eager_or_compiled): np_dtype = np.float32 if dtype == tp.float32 else np.float16 x = np.arange(np.prod(x_shape)).reshape(x_shape).astype(np_dtype) a = tp.Tensor(x, dtype=dtype) - out = tp.mean(a, dim=axis, keepdim=keepdim) + out = eager_or_compiled(tp.mean, a, dim=axis, keepdim=keepdim) expected = tp.Tensor(cp.array(x.mean(axis=axis, keepdims=keepdim))) assert out.shape == expected.shape assert tp.allclose(out, expected, rtol=1e-3, atol=1e-3) @@ -102,10 +102,10 @@ def test_mean(self, x_shape, axis, keepdim: bool, dtype): ((2, 3, 4, 5), (-2, -1), True), ], ) - def test_var(self, x_shape, axis, keepdim: bool): + def test_var(self, x_shape, axis, keepdim: bool, eager_or_compiled): x = np.arange(np.prod(x_shape)).reshape(x_shape).astype(np.float32) a = tp.Tensor(x) - out = tp.var(a, dim=axis, keepdim=keepdim) + out = eager_or_compiled(tp.var, a, dim=axis, keepdim=keepdim) torch_tensor = 
torch.Tensor(x) expected = tp.Tensor(torch_tensor.var(dim=axis, keepdim=keepdim)) assert out.shape == expected.shape @@ -122,10 +122,10 @@ def test_var(self, x_shape, axis, keepdim: bool): ((2, 3, 4), None, True), ], ) - def test_argmax(self, x_shape, axis, keepdim: bool): + def test_argmax(self, x_shape, axis, keepdim: bool, eager_or_compiled): x = np.arange(np.prod(x_shape)).reshape(x_shape).astype(np.float32) a = tp.Tensor(x) - out = tp.argmax(a, dim=axis, keepdim=keepdim) + out = eager_or_compiled(tp.argmax, a, dim=axis, keepdim=keepdim) assert np.array_equal(cp.from_dlpack(out).get(), np.array(x.argmax(axis=axis, keepdims=keepdim))) @pytest.mark.parametrize( @@ -139,8 +139,8 @@ def test_argmax(self, x_shape, axis, keepdim: bool): ((2, 3, 4), None, True), ], ) - def test_argmin(self, x_shape, axis, keepdim: bool): + def test_argmin(self, x_shape, axis, keepdim: bool, eager_or_compiled): x = np.arange(np.prod(x_shape)).reshape(x_shape).astype(np.float32) a = tp.Tensor(x) - out = tp.argmin(a, dim=axis, keepdim=keepdim) + out = eager_or_compiled(tp.argmin, a, dim=axis, keepdim=keepdim) assert np.array_equal(cp.from_dlpack(out).get(), np.array(x.argmin(axis=axis, keepdims=keepdim))) diff --git a/tripy/tests/integration/test_repeat.py b/tripy/tests/integration/test_repeat.py index 89b34ca43..86bc54556 100644 --- a/tripy/tests/integration/test_repeat.py +++ b/tripy/tests/integration/test_repeat.py @@ -30,18 +30,18 @@ class TestRepeat: (0, 1), ], ) - def test_repeat(self, repeats, dim): + def test_repeat(self, repeats, dim, eager_or_compiled): inp = np.arange(4, dtype=np.int32).reshape((2, 2)) - out = tp.repeat(tp.Tensor(inp), repeats, dim) + out = eager_or_compiled(tp.repeat, tp.Tensor(inp), repeats, dim) expected = np.repeat(inp, repeats, dim) assert np.array_equal(np.from_dlpack(tp.copy(out, device=tp.device("cpu"))), expected) - def test_repeat_shape_scalar(self): + def test_repeat_shape_scalar(self, eager_or_compiled): inp = np.arange(4, dtype=np.int32).reshape((2, 2)) s = tp.ones((1, 2)) - out = tp.repeat(tp.Tensor(inp), s.shape[1], 0) + out = eager_or_compiled(tp.repeat, tp.Tensor(inp), repeats=s.shape[1], dim=0) expected = np.repeat(inp, 2, 0) assert np.array_equal(np.from_dlpack(tp.copy(out, device=tp.device("cpu"))), expected) diff --git a/tripy/tests/integration/test_reshape.py b/tripy/tests/integration/test_reshape.py index c30c01501..e7343c6b6 100644 --- a/tripy/tests/integration/test_reshape.py +++ b/tripy/tests/integration/test_reshape.py @@ -31,21 +31,21 @@ class TestReshape: ((2, 4), (1, -1)), # check negative dim ], ) - def test_static_reshape(self, shape, new_shape): + def test_static_reshape(self, shape, new_shape, eager_or_compiled): cp_a = cp.arange(np.prod(shape)).reshape(shape).astype(np.float32) a = tp.Tensor(cp_a, device=tp.device("gpu")) - b = tp.reshape(a, new_shape) + b = eager_or_compiled(tp.reshape, a, new_shape) if -1 in new_shape: new_shape = tuple(np.prod(shape) // -np.prod(new_shape) if d == -1 else d for d in new_shape) assert np.array_equal(cp.from_dlpack(b).get(), cp_a.reshape(new_shape).get()) - def test_reshape_shape_tensor(self): + def test_reshape_shape_tensor(self, eager_or_compiled): a = tp.ones((2, 3, 4)) b = tp.ones((2, 3, 2, 2)) - out = tp.reshape(a, (a.shape[0], a.shape[1], b.shape[2], b.shape[3])) + out = eager_or_compiled(tp.reshape, a, (a.shape[0], a.shape[1], b.shape[2], b.shape[3])) assert np.array_equal(cp.from_dlpack(out).get(), np.ones((2, 3, 2, 2), dtype=np.float32)) - def test_reshape_shape_with_unknown(self): + def 
test_reshape_shape_with_unknown(self, eager_or_compiled): a = tp.ones((2, 3, 4)) - out = tp.reshape(a, (2, a.shape[1], a.shape[2] / 2, -1)) + out = eager_or_compiled(tp.reshape, a, (2, a.shape[1], a.shape[2] / 2, -1)) assert np.array_equal(cp.from_dlpack(out).get(), np.ones((2, 3, 2, 2), dtype=np.float32)) diff --git a/tripy/tests/integration/test_resize.py b/tripy/tests/integration/test_resize.py index f080ef03b..137fb82d8 100644 --- a/tripy/tests/integration/test_resize.py +++ b/tripy/tests/integration/test_resize.py @@ -24,10 +24,14 @@ class TestResize: @pytest.mark.parametrize("mode", ["nearest", "linear", "cubic"]) - def test_scales(self, mode): + def test_scales(self, mode, eager_or_compiled): inp_torch = torch.arange(16, dtype=torch.float32).reshape((1, 1, 4, 4)) inp_tp = tp.Tensor(inp_torch) - out_tp = tp.resize(inp_tp, mode, scales=(1, 1, 2, 2)) + + def resize(inp, mode, scales): + return tp.resize(inp, mode=mode, scales=scales, align_corners=False) + + out_tp = eager_or_compiled(resize, inp_tp, mode=mode, scales=(1, 1, 2, 2)) torch_mode = { "nearest": "nearest", "linear": "bilinear", @@ -39,10 +43,14 @@ def test_scales(self, mode): assert torch.allclose(out_torch, expected) @pytest.mark.parametrize("mode", ["nearest", "linear", "cubic"]) - def test_output_shape(self, mode): + def test_output_shape(self, mode, eager_or_compiled): inp_torch = torch.arange(16, dtype=torch.float32).reshape((1, 1, 4, 4)) inp_tp = tp.Tensor(inp_torch) - out_tp = tp.resize(inp_tp, mode, output_shape=[1, 1, 8, 8]) + + def resize(inp, mode, output_shape): + return tp.resize(inp, mode=mode, output_shape=output_shape, align_corners=False) + + out_tp = eager_or_compiled(resize, inp_tp, mode, output_shape=[1, 1, 8, 8]) torch_mode = { "nearest": "nearest", "linear": "bilinear", diff --git a/tripy/tests/integration/test_sequential.py b/tripy/tests/integration/test_sequential.py index b6ef3e260..1429869cc 100644 --- a/tripy/tests/integration/test_sequential.py +++ b/tripy/tests/integration/test_sequential.py @@ -21,7 +21,7 @@ class TestSequential: - def test_basic_forward_pass_accuracy(self): + def test_basic_forward_pass_accuracy(self, eager_or_compiled): torch_model = torch.nn.Sequential( torch.nn.Linear(1, 3, dtype=torch.float32, device="cuda"), torch.nn.Linear(3, 2, dtype=torch.float32, device="cuda"), @@ -36,7 +36,7 @@ def test_basic_forward_pass_accuracy(self): input_tensor = torch.tensor([[1.0]], dtype=torch.float32, device="cuda") tp_input = tp.Tensor(input_tensor, dtype=tp.float32) - tp_output = tp_model(tp_input) + tp_output = eager_or_compiled(tp_model, tp_input) torch_model.eval() with torch.no_grad(): @@ -45,7 +45,7 @@ def test_basic_forward_pass_accuracy(self): rtol_ = 2e-6 assert torch.allclose(torch.from_dlpack(tp_output), torch_output, rtol=rtol_) - def test_dict_forward_pass_accuracy(self): + def test_dict_forward_pass_accuracy(self, eager_or_compiled): torch_model = torch.nn.Sequential( torch.nn.Linear(1, 3, dtype=torch.float32, device="cuda"), torch.nn.Linear(3, 2, dtype=torch.float32, device="cuda"), @@ -63,7 +63,7 @@ def test_dict_forward_pass_accuracy(self): input_tensor = torch.tensor([[1.0]], dtype=torch.float32, device="cuda") tp_input = tp.Tensor(input_tensor, dtype=tp.float32) - tp_output = tp_model(tp_input) + tp_output = eager_or_compiled(tp_model, tp_input) torch_model.eval() with torch.no_grad(): @@ -74,7 +74,7 @@ def test_dict_forward_pass_accuracy(self): torch.from_dlpack(tp_output), torch_output, rtol=rtol_ ), "Forward pass outputs do not match." 
- def test_nested_forward_pass_accuracy(self): + def test_nested_forward_pass_accuracy(self, eager_or_compiled): torch_model = torch.nn.Sequential( torch.nn.Linear(1, 3, dtype=torch.float32, device="cuda"), torch.nn.Sequential( @@ -97,7 +97,7 @@ def test_nested_forward_pass_accuracy(self): input_tensor = torch.tensor([[1.0]], dtype=torch.float32, device="cuda") tp_input = tp.Tensor(input_tensor, dtype=tp.float32) - tp_output = tp_model(tp_input) + tp_output = eager_or_compiled(tp_model, tp_input) torch_model.eval() with torch.no_grad(): diff --git a/tripy/tests/integration/test_slice.py b/tripy/tests/integration/test_slice.py index 063b0245c..534ac34db 100644 --- a/tripy/tests/integration/test_slice.py +++ b/tripy/tests/integration/test_slice.py @@ -69,25 +69,31 @@ class TestSliceOp: ((5,), lambda t: t[-12:-5:-1]), ], ) - def test_static_slice_op(self, dims_a, slice_func): + def test_static_slice_op(self, dims_a, slice_func, eager_or_compiled): a_cp = cp.arange(np.prod(dims_a)).reshape(dims_a).astype(np.float32) a = tp.Tensor(a_cp, device=tp.device("gpu")) def func(a): return slice_func(a) - out = func(a) + out = eager_or_compiled(func, a) assert np.array_equal(cp.from_dlpack(out).get(), slice_func(a_cp).get()) - def test_slice_as_gather(self): + def test_slice_as_gather(self, eager_or_compiled): x_data = [0, 1, 2] y_data = [3, 4, 5] x = tp.Tensor(x_data) y = tp.Tensor(y_data) + + def slice(y, x): + return y[x] + + output = eager_or_compiled(slice, y, x) + x_cp = cp.array(x_data) y_cp = cp.array(y_data) - assert np.array_equal(cp.from_dlpack(y[x]).get(), y_cp[x_cp].get()) + assert np.array_equal(cp.from_dlpack(output).get(), y_cp[x_cp].get()) x_shape = (2, 2) y_shape = (4, 3, 2) @@ -95,7 +101,9 @@ def test_slice_as_gather(self): y_vol = math.prod(y_shape) x = tp.reshape(tp.arange(x_vol, dtype=tp.int32), x_shape) y = tp.reshape(tp.arange(y_vol), y_shape) + output = eager_or_compiled(slice, y, x) + x_cp = cp.arange(x_vol, dtype=cp.int32).reshape(x_shape) y_cp = cp.arange(y_vol).reshape(y_shape) - assert np.array_equal(cp.from_dlpack(y[x]).get(), y_cp[x_cp].get()) + assert np.array_equal(cp.from_dlpack(output).get(), y_cp[x_cp].get()) diff --git a/tripy/tests/integration/test_split.py b/tripy/tests/integration/test_split.py index 9279c98fb..f6e7ad369 100644 --- a/tripy/tests/integration/test_split.py +++ b/tripy/tests/integration/test_split.py @@ -43,16 +43,21 @@ class TestSplitOp: ((12, 12), (3, 1), lambda t: (t[:, :4], t[:, 4:8], t[:, 8:])), ((12, 12), ([3], 1), lambda t: (t[:, :3], t[:, 3:])), ((12, 12), (4, 0), lambda t: (t[:3, :], t[3:6, :], t[6:9, :], t[9:12, :])), - ((3, 0), (5, 1), lambda t: (t[:, :0], t[:, 0:0], t[:, 0:0], t[:, 0:0], t[:, 0:0])), + pytest.param( + (3, 0), + (5, 1), + lambda t: (t[:, :0], t[:, 0:0], t[:, 0:0], t[:, 0:0], t[:, 0:0]), + marks=pytest.mark.skip(reason="https://github.com/NVIDIA/TensorRT-Incubator/issues/398"), + ), ], ) - def test_split_static(self, dims_a, split_params, reference_slices): + def test_split_static(self, dims_a, split_params, reference_slices, eager_or_compiled): a_cp = cp.arange(np.prod(dims_a)).reshape(dims_a).astype(cp.float32) a = tp.Tensor(a_cp, device=tp.device("gpu")) def func(t): return tp.split(t, split_params[0], split_params[1]) - out = func(a) + out = eager_or_compiled(func, a) reference_out = reference_slices(a_cp) compare_split_results(out, reference_out) diff --git a/tripy/tests/integration/test_stack.py b/tripy/tests/integration/test_stack.py index be1f724b5..796bcc26b 100644 --- a/tripy/tests/integration/test_stack.py +++ 
b/tripy/tests/integration/test_stack.py @@ -33,9 +33,9 @@ class TestStack: ([(2, 3, 4)], 0), ], ) - def test_stack(self, tensor_shapes, dim): + def test_stack(self, tensor_shapes, dim, eager_or_compiled): tensors = [tp.ones(shape) for shape in tensor_shapes] - out = tp.stack(tensors, dim=dim) + out = eager_or_compiled(tp.stack, tensors, dim=dim) # Create numpy arrays for comparison np_tensors = [np.ones(shape) for shape in tensor_shapes] @@ -44,13 +44,13 @@ def test_stack(self, tensor_shapes, dim): assert out.shape == list(expected.shape) assert np.array_equal(cp.from_dlpack(out).get(), expected) - def test_stack_different_ranks(self): + def test_stack_different_ranks(self, eager_or_compiled): tensors = [tp.ones((2, 3)), tp.ones((2, 3, 4))] with raises( tp.TripyException, match="Expected all input tensors to have the same rank.", ): - tp.stack(tensors) + eager_or_compiled(tp.stack, tensors) def test_stack_different_shapes(self): a = tp.ones((2, 3)) diff --git a/tripy/tests/integration/test_unary_elementwise.py b/tripy/tests/integration/test_unary_elementwise.py index e01ca3fff..e89a37d6c 100644 --- a/tripy/tests/integration/test_unary_elementwise.py +++ b/tripy/tests/integration/test_unary_elementwise.py @@ -35,7 +35,7 @@ class TestUnaryElementwise: @pytest.mark.parametrize("tp_func, np_func", [(tp_func, np_func) for tp_func, np_func in _UNARY_OPS.items()]) - def test_op_funcs(self, tp_func, np_func): + def test_op_funcs(self, tp_func, np_func, eager_or_compiled): input = tp.arange(1, 4, dtype=tp.float32) - output = tp_func(input) + output = eager_or_compiled(tp_func, input) assert tp.allclose(output, tp.Tensor(np_func(cp.from_dlpack(input).get()))) diff --git a/tripy/tests/integration/test_unsqueeze.py b/tripy/tests/integration/test_unsqueeze.py index e25d459b1..4402449fc 100644 --- a/tripy/tests/integration/test_unsqueeze.py +++ b/tripy/tests/integration/test_unsqueeze.py @@ -24,13 +24,13 @@ class TestUnsqueezeOp: @pytest.mark.parametrize("axis", [-1, 0, 2]) - def test_unsqueeze_dynamic_op(self, axis): + def test_unsqueeze_dynamic_op(self, axis, eager_or_compiled): def func(a): return tp.unsqueeze(a, dim=axis) inp = np.ones((4, 2, 2, 3), dtype=np.float32) - out = func(tp.Tensor(inp)) + out = eager_or_compiled(func, tp.Tensor(inp)) ref_out = np.expand_dims(inp, axis=axis) assert tp.allclose(out, tp.Tensor(ref_out)) diff --git a/tripy/tests/integration/test_where_op.py b/tripy/tests/integration/test_where_op.py index 36d4839f5..5f37b5724 100644 --- a/tripy/tests/integration/test_where_op.py +++ b/tripy/tests/integration/test_where_op.py @@ -35,19 +35,19 @@ class TestWhereOp: ((0,), (1,), (1,)), # 0 dim in the condition ], ) - def test_where_broadcast_shapes(self, cond, x, y): + def test_where_broadcast_shapes(self, cond, x, y, eager_or_compiled): x = np.arange(np.prod(x)).reshape(x).astype(np.float32) y = np.arange(np.prod(y)).reshape(y).astype(np.float32) t_cond = np.arange(np.prod(cond)).reshape(cond).astype(np.float32) a = Tensor(x) b = Tensor(y) condition = Tensor(t_cond % 2 == 0) - out = tp.where(condition, a, b) + out = eager_or_compiled(tp.where, condition, a, b) assert np.array_equal(cp.from_dlpack(out).get(), np.array(np.where((t_cond % 2 == 0), x, y))) - def test_explicit_condition(self): + def test_explicit_condition(self, eager_or_compiled): select_indices = tp.Tensor([True, False, True, False], dtype=tp.bool) ones = tp.ones((4,), dtype=tp.int32) zeros = tp.zeros((4,), dtype=tp.int32) - w = tp.where(select_indices, ones, zeros) + w = eager_or_compiled(tp.where, select_indices, 
ones, zeros) assert cp.from_dlpack(w).get().tolist() == [1, 0, 1, 0]
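
For reference, below is a minimal standalone sketch (not part of the diff) of the two execution paths that the new `eager_or_compiled` fixture in conftest.py selects between. It reuses the `gemm` helper added in test_matrix_multiplication.py; the shapes and input values are illustrative assumptions, while the `tp.compile`/`tp.InputInfo` usage mirrors the fixture implementation above.

import cupy as cp
import numpy as np

import tripy as tp


def gemm(a, b):
    return a @ b


a = tp.Tensor(np.arange(6, dtype=np.float32).reshape(2, 3))
b = tp.Tensor(np.arange(6, dtype=np.float32).reshape(3, 2))

# "eager" fixture param: the test callable is invoked directly.
eager_out = gemm(a, b)

# "compile" fixture param: each tensor argument is described by a tp.InputInfo
# (static shape plus dtype), the callable is compiled once, and the compiled
# function is then invoked with the original tensors, as in the conftest wrapper.
compiled_gemm = tp.compile(
    gemm,
    args=(
        tp.InputInfo([2, 3], dtype=tp.float32),
        tp.InputInfo([3, 2], dtype=tp.float32),
    ),
)
compiled_out = compiled_gemm(a, b)

# Both paths should produce the same values, which is what the updated tests assert.
assert np.array_equal(cp.from_dlpack(eager_out).get(), cp.from_dlpack(compiled_out).get())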