Skip to content
This repository has been archived by the owner on Oct 31, 2023. It is now read-only.

Significant numerical differences with torch.amp.autocast() compared with stock pretrained resnext101 #17

Open
farleylai opened this issue Nov 5, 2020 · 0 comments

Comments

@farleylai
Copy link

farleylai commented Nov 5, 2020

When running inference under torch.amp.autocast(), the forward outputs of the WSL pretrained resnext101_32x8d_wsl differ significantly from the outputs obtained without autocast, whereas the pretrained resnext101_32x8d from torchvision does not show such differences. Sample outputs for the same input batch follow:

Output from WSL pretrained resnext101_32x8d_wsl shows significant differences:

actual(w/ amp.autocast) = tensor([ 1.0162e-01,  3.0859e-01, -1.9760e-03,  3.7750e-02, -4.5996e-01,
        -7.2510e-01, -8.7402e-02, -9.4727e-01...698e-02, -1.4392e-01,
        -2.1533e-01, -5.7666e-01, -8.1787e-02,  1.8103e-01,  2.3596e-01],
       device='cuda:0')
expected(w/o amp.autocast) = tensor([ 1.2948e-01,  4.0339e-01,  6.4677e-02,  6.7963e-02, -3.6953e-01,
        -6.0408e-01, -1.5742e-01, -8.2637e-01...613e-02, -1.7330e-01,
        -2.1253e-01, -5.2314e-01, -1.2327e-01,  1.0499e-01,  1.7262e-01],
       device='cuda:0')

Output from torchvision pretrained resnext101_32x8d shows approximately equal numerical values with and without autocast:

actual(w/ amp.autocast) = tensor([-2.9844e+00, -6.8945e-01,  5.9668e-01, -1.2510e+00, -7.2168e-01,
        -2.1992e+00, -1.2686e+00, -5.0879e-01...953e+00, -4.4453e+00,
        -4.8984e+00, -3.2617e+00, -2.6641e+00, -2.2344e+00,  5.4922e+00],
       device='cuda:0')
expected(w/o amp.autocast) = tensor([-2.9887e+00, -6.8953e-01,  5.9514e-01, -1.2496e+00, -7.2139e-01,
        -2.2008e+00, -1.2737e+00, -5.1238e-01...971e+00, -4.4485e+00,
        -4.9002e+00, -3.2653e+00, -2.6683e+00, -2.2359e+00,  5.5001e+00],
       device='cuda:0')

Is this because the pretrained resnext101 from torchvision was already trained in mixed precision, or is something else going on?
Any clarifications would be appreciated.

PS: sample pytest code to load the models and run the tests:

import logging

import pytest
import torch as th

@pytest.fixture
def batch_size():
    """Return the number of samples in the random input batch."""
    n_samples = 2
    return n_samples

@pytest.fixture
def shape():
    """Return the per-sample input shape as a 3-tuple.

    The tuple is splatted after the batch dimension when the random batch is
    created, so for the ResNeXt models under test it reads as
    (channels, height, width).
    """
    channels, height, width = 3, 720, 1280
    return (channels, height, width)

@pytest.fixture
def dev():
    """Return the device to run on: CUDA when available, otherwise CPU."""
    # PyTorch is imported as ``th`` in this module, so the CPU fallback must go
    # through ``th`` as well; the bare name ``torch`` is not bound here and
    # would raise NameError on hosts without CUDA.
    return th.device('cuda' if th.cuda.is_available() else 'cpu')

@pytest.fixture
def batch(batch_size, shape):
    """Return a uniform-random tensor of size ``(batch_size, *shape)``.

    No device is specified here; the tests move the batch to the target
    device themselves.
    """
    dims = (batch_size, *shape)
    return th.rand(dims)

@pytest.fixture
def x101_32x8d(dev):
    """Build torchvision's ResNeXt-101 32x8d, move it to ``dev``, set eval mode.

    The model is constructed through torchvision's private ``_resnet`` factory
    with ``FrozenBatchNorm2d`` as the normalisation layer.
    """
    from torchvision.models.resnet import Bottleneck, _resnet
    from torchvision.ops.misc import FrozenBatchNorm2d

    freeze_bn = True
    groups, width_per_group = 32, 8
    net = _resnet(
        f"resnext101_{groups}x{width_per_group}d",
        Bottleneck,
        [3, 4, 23, 3],
        # NOTE(review): the two positional flags are presumably
        # ``pretrained`` and ``progress`` -- confirm against the installed
        # torchvision version.
        True,
        True,
        groups=groups,
        width_per_group=width_per_group,
        norm_layer=FrozenBatchNorm2d if freeze_bn else None,
    )
    net.to(dev).eval()
    return net

@pytest.fixture
def x101_32x8d_wsl(dev):
    """Load ``resnext101_32x8d_wsl`` from torch.hub, move to ``dev``, set eval mode.

    The hub entry point receives the same ``groups``, ``width_per_group`` and
    ``norm_layer`` keyword arguments as the torchvision fixture, with
    ``FrozenBatchNorm2d`` as the normalisation layer.
    """
    from torchvision.ops.misc import FrozenBatchNorm2d

    freeze_bn = True
    hub_kwargs = {
        'groups': 32,
        'width_per_group': 8,
        'norm_layer': FrozenBatchNorm2d if freeze_bn else None,
    }
    net = th.hub.load('facebookresearch/WSL-Images', 'resnext101_32x8d_wsl', **hub_kwargs)
    net.to(dev).eval()
    return net

@pytest.mark.parametrize("B", [2])
def test_x101_amp(benchmark, x101_32x8d, dev, batch, B):
    """Compare autocast and non-autocast outputs of torchvision ResNeXt-101.

    Runs the first ``B`` samples of ``batch`` through the model twice under
    ``no_grad``: once with autocast disabled and once with it enabled. Both
    results are cast to float and must agree per sample within
    ``rtol=1e-03`` / ``atol=3e-04``.
    """
    net = x101_32x8d
    with th.no_grad():
        # Both forward passes see the same device-resident input.
        inputs = batch[:B].to(dev)
        with th.cuda.amp.autocast(enabled=False):
            reference = net(inputs).float()
        with th.cuda.amp.autocast():
            mixed = net(inputs).float()

    for i, (output_fp32, output_amp) in enumerate(zip(reference, mixed)):
        message = f"output[{i}] shape={tuple(output_fp32.shape)}, norm_fp32={output_fp32.norm()}, norm_amp={output_amp.norm()}"
        logging.info(message)
        th.testing.assert_allclose(output_amp, output_fp32, rtol=1e-03, atol=3e-04)

@pytest.mark.parametrize("B", [2])
def test_x101_wsl_amp(benchmark, x101_32x8d_wsl, dev, batch, B):
    """Compare autocast and non-autocast outputs of the WSL ResNeXt-101.

    Runs the first ``B`` samples of ``batch`` through the model twice under
    ``no_grad``: once with autocast disabled and once with it enabled. Both
    results are cast to float and must agree per sample within
    ``rtol=1e-03`` / ``atol=3e-04``.
    """
    net = x101_32x8d_wsl
    with th.no_grad():
        # Both forward passes see the same device-resident input.
        inputs = batch[:B].to(dev)
        with th.cuda.amp.autocast(enabled=False):
            reference = net(inputs).float()
        with th.cuda.amp.autocast():
            mixed = net(inputs).float()

    for i, (output_fp32, output_amp) in enumerate(zip(reference, mixed)):
        message = f"output[{i}] shape={tuple(output_fp32.shape)}, norm_fp32={output_fp32.norm()}, norm_amp={output_amp.norm()}"
        logging.info(message)
        th.testing.assert_allclose(output_amp, output_fp32, rtol=1e-03, atol=3e-04)
@farleylai farleylai changed the title Significant numerical differences with torch.amp.autocast() compared with stock resnext101? Significant numerical differences with torch.amp.autocast() compared with stock pretrained resnext101 Nov 6, 2020
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant