Significant numerical differences with torch.amp.autocast() compared with stock pretrained resnext101 #17

farleylai · 2020-11-05T17:06:35Z

When running inference under torch.amp.autocast(), the forward outputs of the WSL pretrained resnext101_32x8d_wsl differ significantly from the outputs computed without autocast, whereas the pretrained resnext101_32x8d from torchvision does not show such a gap. Sample outputs for the same input batch are shown below:

Output from WSL pretrained resnext101_32x8d_wsl shows significant differences:

actual(w/ amp.autocast) = tensor([ 1.0162e-01,  3.0859e-01, -1.9760e-03,  3.7750e-02, -4.5996e-01,
        -7.2510e-01, -8.7402e-02, -9.4727e-01...698e-02, -1.4392e-01,
        -2.1533e-01, -5.7666e-01, -8.1787e-02,  1.8103e-01,  2.3596e-01],
       device='cuda:0')
expected(w/o amp.autocast) = tensor([ 1.2948e-01,  4.0339e-01,  6.4677e-02,  6.7963e-02, -3.6953e-01,
        -6.0408e-01, -1.5742e-01, -8.2637e-01...613e-02, -1.7330e-01,
        -2.1253e-01, -5.2314e-01, -1.2327e-01,  1.0499e-01,  1.7262e-01],
       device='cuda:0')

Output from torchvision pretrained resnext101_32x8d shows approximately equal numerical values with and without autocast:

actual(w/ amp.autocast) = tensor([-2.9844e+00, -6.8945e-01,  5.9668e-01, -1.2510e+00, -7.2168e-01,
        -2.1992e+00, -1.2686e+00, -5.0879e-01...953e+00, -4.4453e+00,
        -4.8984e+00, -3.2617e+00, -2.6641e+00, -2.2344e+00,  5.4922e+00],
       device='cuda:0')
expected(w/o amp.autocast) = tensor([-2.9887e+00, -6.8953e-01,  5.9514e-01, -1.2496e+00, -7.2139e-01,
        -2.2008e+00, -1.2737e+00, -5.1238e-01...971e+00, -4.4485e+00,
        -4.9002e+00, -3.2653e+00, -2.6683e+00, -2.2359e+00,  5.5001e+00],
       device='cuda:0')

Is this because the pretrained resnext101 from torchvision was already trained in mixed precision, or is it due to something else?
Any clarifications would be appreciated.

PS: sample pytest code to load the models and run the tests:

import logging

import pytest
import torch as th

@pytest.fixture
def batch_size():
    """Provide the number of samples used for the random input batch."""
    n_samples = 2
    return n_samples

@pytest.fixture
def shape():
    """Provide the per-sample input shape as a 3-tuple of ints."""
    # Presumably (channels, height, width) for a 720p RGB frame -- confirm.
    return (3, 720, 1280)

@pytest.fixture
def dev():
    """Select the device to run on: CUDA when available, otherwise the CPU.

    Returns:
        A ``torch.device`` for ``'cuda'`` if ``th.cuda.is_available()`` is
        true, else a ``torch.device`` for ``'cpu'``.
    """
    # The module is imported as ``th``; referring to it as ``torch`` in the
    # CPU fallback raised NameError on machines without CUDA.
    if th.cuda.is_available():
        return th.device('cuda')
    return th.device('cpu')

@pytest.fixture
def batch(batch_size, shape):
    """Create a random tensor of size ``(batch_size, *shape)`` via ``th.rand``."""
    full_size = (batch_size, *shape)
    return th.rand(full_size)

@pytest.fixture
def x101_32x8d(dev):
    """Build a torchvision ResNeXt-101 32x8d on ``dev`` in eval mode.

    The network is created through torchvision's private ``_resnet`` builder
    with ``FrozenBatchNorm2d`` as the normalization layer.
    """
    from torchvision.models.resnet import Bottleneck, _resnet
    from torchvision.ops.misc import FrozenBatchNorm2d

    groups, width_per_group = 32, 8
    net = _resnet(
        f"resnext101_{groups}x{width_per_group}d",
        Bottleneck,
        [3, 4, 23, 3],
        # Two positional flags; presumably ``pretrained`` and ``progress`` in
        # the torchvision version this was written against -- confirm.
        True,
        True,
        groups=groups,
        width_per_group=width_per_group,
        norm_layer=FrozenBatchNorm2d,
    )
    net.to(dev).eval()
    return net

@pytest.fixture
def x101_32x8d_wsl(dev):
    """Load ``resnext101_32x8d_wsl`` from torch hub on ``dev`` in eval mode.

    The model comes from the ``facebookresearch/WSL-Images`` hub repository
    and is requested with ``FrozenBatchNorm2d`` as the normalization layer.
    """
    from torchvision.ops.misc import FrozenBatchNorm2d

    hub_kwargs = dict(
        groups=32,
        width_per_group=8,
        norm_layer=FrozenBatchNorm2d,
    )
    net = th.hub.load('facebookresearch/WSL-Images', 'resnext101_32x8d_wsl', **hub_kwargs)
    net.to(dev).eval()
    return net

@pytest.mark.parametrize("B", [2])
def test_x101_amp(benchmark, x101_32x8d, dev, batch, B):
    """Compare torchvision ResNeXt-101 outputs with and without autocast.

    Runs the first ``B`` samples of ``batch`` through the model once with
    autocast disabled and once with it enabled, then asserts per sample that
    the two results agree within ``rtol=1e-03`` and ``atol=3e-04``.
    """
    net = x101_32x8d
    with th.no_grad():
        # Both passes consume the same on-device input.
        inputs = batch[:B].to(dev)
        with th.cuda.amp.autocast(enabled=False):
            outputs_fp32 = net(inputs).float()
        with th.cuda.amp.autocast():
            outputs_amp = net(inputs).float()

    paired = zip(outputs_fp32, outputs_amp)
    for i, (output_fp32, output_amp) in enumerate(paired):
        logging.info(f"output[{i}] shape={tuple(output_fp32.shape)}, norm_fp32={output_fp32.norm()}, norm_amp={output_amp.norm()}")
        th.testing.assert_allclose(output_amp, output_fp32, rtol=1e-03, atol=3e-04)

@pytest.mark.parametrize("B", [2])
def test_x101_wsl_amp(benchmark, x101_32x8d_wsl, dev, batch, B):
    """Compare WSL ResNeXt-101 outputs with and without autocast.

    Runs the first ``B`` samples of ``batch`` through the model once with
    autocast disabled and once with it enabled, then asserts per sample that
    the two results agree within ``rtol=1e-03`` and ``atol=3e-04``.
    """
    net = x101_32x8d_wsl
    with th.no_grad():
        # Both passes consume the same on-device input.
        inputs = batch[:B].to(dev)
        with th.cuda.amp.autocast(enabled=False):
            outputs_fp32 = net(inputs).float()
        with th.cuda.amp.autocast():
            outputs_amp = net(inputs).float()

    paired = zip(outputs_fp32, outputs_amp)
    for i, (output_fp32, output_amp) in enumerate(paired):
        logging.info(f"output[{i}] shape={tuple(output_fp32.shape)}, norm_fp32={output_fp32.norm()}, norm_amp={output_amp.norm()}")
        th.testing.assert_allclose(output_amp, output_fp32, rtol=1e-03, atol=3e-04)

The text was updated successfully, but these errors were encountered:

farleylai changed the title ~~Significant numerical differences with torch.amp.autocast() compared with stock resnext101?~~ Significant numerical differences with torch.amp.autocast() compared with stock pretrained resnext101 Nov 6, 2020

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Significant numerical differences with torch.amp.autocast() compared with stock pretrained resnext101 #17

Significant numerical differences with torch.amp.autocast() compared with stock pretrained resnext101 #17

farleylai commented Nov 5, 2020 •

edited

Loading

Significant numerical differences with torch.amp.autocast() compared with stock pretrained resnext101 #17

Significant numerical differences with torch.amp.autocast() compared with stock pretrained resnext101 #17

Comments

farleylai commented Nov 5, 2020 • edited Loading

farleylai commented Nov 5, 2020 •

edited

Loading