
Poor unrolling prevents vectorization opportunities #50452

Open
RKSimon opened this issue Jul 15, 2021 · 1 comment
Labels: bugzilla (Issues migrated from bugzilla), loopoptim

RKSimon commented Jul 15, 2021

Bugzilla Link: 51108
Version: trunk
OS: Windows NT
CC: @adibiagio, @fhahn, @LebedevRI, @MattPD, @OCHyams, @rotateright

Extended Description

https://godbolt.org/z/jE1e9rT5j
(NOTE: disabled fma on gcc to prevent fmul+fadd->fma diff)

constexpr int SIZE = 128;
float A[SIZE][16];
float B[SIZE][16];

__attribute__((__noinline__))
float foo()
{
    float sum = 0.0f;
    for (int i = 1; i < 32; ++i)
        for (int j = 0; j < 4; ++j)
            sum += A[i][j] * B[i][j];
    
    return sum;
}

clang -g0 -O3 -march=znver2

_Z3foov:
        vxorps  %xmm0, %xmm0, %xmm0
        movq    $-1984, %rax                    # imm = 0xF840
.LBB0_1:
        vmovss  A+2048(%rax), %xmm1             # xmm1 = mem[0],zero,zero,zero
        vmovsd  B+2052(%rax), %xmm2             # xmm2 = mem[0],zero
        vmulss  B+2048(%rax), %xmm1, %xmm1
        vaddss  %xmm1, %xmm0, %xmm0
        vmovsd  A+2052(%rax), %xmm1             # xmm1 = mem[0],zero
        vmulps  %xmm2, %xmm1, %xmm1
        vaddss  %xmm1, %xmm0, %xmm0
        vmovshdup       %xmm1, %xmm1            # xmm1 = xmm1[1,1,3,3]
        vaddss  %xmm1, %xmm0, %xmm0
        vmovss  A+2060(%rax), %xmm1             # xmm1 = mem[0],zero,zero,zero
        vmulss  B+2060(%rax), %xmm1, %xmm1
        addq    $64, %rax
        vaddss  %xmm1, %xmm0, %xmm0
        jne     .LBB0_1
        retq

The clang code has several issues:

1 - if we'd used a better indvar we could have avoided some very large offsets in the address math (put A and B in registers and use a better range/increment for %rax).

2 - GCC recognises that the array is fully dereferenceable, allowing it to use fewer (vector) loads and then extract/shuffle the elements that it requires.

3 - we fail to ensure the per-loop reduction is in a form that lets us use HADDPS (on targets where it's fast) - see the sketch after this list.

4 - the LoopMicroOpBufferSize in the znver3 model has a VERY unexpected effect on unrolling - I'm not sure clang's interpretation of the buffer size is the same as just copying AMD's hardware specs.
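
For reference, a minimal hand-vectorized sketch of what points 2 and 3 are after (the name foo_vec is illustrative, not from this issue; the reduction reassociates the FP adds, so it only matches the scalar source under something like -ffast-math): one 4-wide load per row of A and B (legal because each 16-float row is fully dereferenceable), a vector multiply-accumulate, and a final horizontal reduction in the HADDPS-friendly form.

#include <immintrin.h>

constexpr int SIZE = 128;
extern float A[SIZE][16];   // same globals as in the reproducer above
extern float B[SIZE][16];

float foo_vec()
{
    __m128 vsum = _mm_setzero_ps();
    for (int i = 1; i < 32; ++i) {
        __m128 a = _mm_loadu_ps(A[i]);               // A[i][0..3] - one 4-wide load per row
        __m128 b = _mm_loadu_ps(B[i]);               // B[i][0..3]
        vsum = _mm_add_ps(vsum, _mm_mul_ps(a, b));   // 4 running partial sums
    }
    // Horizontal reduction of the partial sums - maps onto HADDPS where it's fast.
    vsum = _mm_hadd_ps(vsum, vsum);
    vsum = _mm_hadd_ps(vsum, vsum);
    return _mm_cvtss_f32(vsum);
}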

@llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 11, 2021

RKSimon commented Aug 17, 2022

Something else that's interesting about the integer equivalent:

using T = int;
constexpr int SIZE = 128;
T A[SIZE][16];
T B[SIZE][16];

T foo()
{
    T sum = (T)0;
    for (int i = 1; i < 32; ++i)
        for (int j = 0; j < 4; ++j)
            sum += A[i][j] * B[i][j];
    
    return sum;
}

clang -g0 -O3 -march=btver2

foo():                                # @foo()
        movq    $-1984, %rcx                    # imm = 0xF840
        leaq    A(%rip), %rdx
        leaq    B(%rip), %rsi
        xorl    %eax, %eax
.LBB0_1:                                # %for.cond1.preheader
        vmovdqa 2048(%rcx,%rsi), %xmm0
        vpmulld 2048(%rcx,%rdx), %xmm0, %xmm0
        vphaddd %xmm0, %xmm0, %xmm0
        vphaddd %xmm0, %xmm0, %xmm0
        vmovd   %xmm0, %edi
        addl    %edi, %eax
        addq    $64, %rcx
        jne     .LBB0_1
        retq
A:
        .zero   8192
B:
        .zero   8192

clang -g0 -O3 -march=znver2

foo():                                # @foo()
        xorl    %eax, %eax
        movl    $64, %ecx
        leaq    A(%rip), %rdx
        leaq    B(%rip), %rsi
.LBB0_1:                                # %for.cond1.preheader
        vmovdqa (%rcx,%rsi), %xmm0
        vpmulld (%rcx,%rdx), %xmm0, %xmm0
        addq    $64, %rcx
        vpshufd $238, %xmm0, %xmm1              # xmm1 = xmm0[2,3,2,3]
        vpaddd  %xmm1, %xmm0, %xmm0
        vpshufd $85, %xmm0, %xmm1               # xmm1 = xmm0[1,1,1,1]
        vpaddd  %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, %edi
        addl    %edi, %eax
        cmpq    $2048, %rcx                     # imm = 0x800
        jne     .LBB0_1
        retq
A:
        .zero   8192
B:
        .zero   8192

The btver2 variant ends up with more costly 3-operand (base + index + displacement) addressing on its loads, but the znver2 variant folds the displacement into the indvar.
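
A rough source-level picture of that second scheme (hypothetical code, not from this issue; foo_offset is an illustrative name): keep the A/B bases in registers and walk a single byte offset from 64 to 2048, so every access only needs a simple base + offset address with no extra 2048-byte displacement. It computes the same sum as foo() above.

#include <cstddef>

using T = int;
constexpr int SIZE = 128;
extern T A[SIZE][16];   // same globals as in the reproducer above
extern T B[SIZE][16];

T foo_offset()
{
    T sum = 0;
    const char *a = reinterpret_cast<const char *>(A);
    const char *b = reinterpret_cast<const char *>(B);
    // 16 ints per row = 64 bytes, so rows 1..31 live at byte offsets 64..1984.
    for (size_t off = 64; off != 2048; off += 64) {
        const T *ra = reinterpret_cast<const T *>(a + off);
        const T *rb = reinterpret_cast<const T *>(b + off);
        for (int j = 0; j < 4; ++j)
            sum += ra[j] * rb[j];
    }
    return sum;
}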
