
Poor unrolling prevents vectorization opportunities #50452

Open
RKSimon opened this issue Jul 15, 2021 · 1 comment
Labels: bugzilla (Issues migrated from bugzilla), loopoptim

RKSimon commented Jul 15, 2021

Bugzilla Link: 51108
Version: trunk
OS: Windows NT
CC: @adibiagio, @fhahn, @LebedevRI, @MattPD, @OCHyams, @rotateright

Extended Description

https://godbolt.org/z/jE1e9rT5j
(NOTE: disabled fma on gcc to prevent fmul+fadd->fma diff)

constexpr int SIZE = 128;
float A[SIZE][16];
float B[SIZE][16];

__attribute__((__noinline__))
float foo()
{
    float sum = 0.0f;
    for (int i = 1; i < 32; ++i)
        for (int j = 0; j < 4; ++j)
            sum += A[i][j] * B[i][j];
    
    return sum;
}

clang -g0 -O3 -march=znver2

_Z3foov:
        vxorps  %xmm0, %xmm0, %xmm0
        movq    $-1984, %rax                    # imm = 0xF840
.LBB0_1:
        vmovss  A+2048(%rax), %xmm1             # xmm1 = mem[0],zero,zero,zero
        vmovsd  B+2052(%rax), %xmm2             # xmm2 = mem[0],zero
        vmulss  B+2048(%rax), %xmm1, %xmm1
        vaddss  %xmm1, %xmm0, %xmm0
        vmovsd  A+2052(%rax), %xmm1             # xmm1 = mem[0],zero
        vmulps  %xmm2, %xmm1, %xmm1
        vaddss  %xmm1, %xmm0, %xmm0
        vmovshdup       %xmm1, %xmm1            # xmm1 = xmm1[1,1,3,3]
        vaddss  %xmm1, %xmm0, %xmm0
        vmovss  A+2060(%rax), %xmm1             # xmm1 = mem[0],zero,zero,zero
        vmulss  B+2060(%rax), %xmm1, %xmm1
        addq    $64, %rax
        vaddss  %xmm1, %xmm0, %xmm0
        jne     .LBB0_1
        retq

The clang code has several issues:

1 - if we'd used a better indvar we could have avoided some very large offsets in the address math (put A and B in registers and use a better range/increment for %rax).

2 - GCC recognises that the array is fully dereferenceable, allowing it to use fewer (vector) loads and then extract/shuffle the elements that it requires.

3 - we fail to ensure the per-loop reduction is in a form that lets us use HADDPS (on targets where it's fast) - see the sketch after this list.

4 - the LoopMicroOpBufferSize in the znver3 model has a VERY unexpected effect on unrolling - I'm not sure clang's interpretation of the buffer size is the same as just copying AMD's hardware specs.
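
For reference, a minimal hand-vectorized sketch of what points 2 and 3 are after (the name foo_vec is illustrative, not from this issue; the reduction reassociates the FP adds, so it only matches the scalar source under something like -ffast-math): one 4-wide load per row of A and B (legal because each 16-float row is fully dereferenceable), a vector multiply-accumulate, and a final horizontal reduction in the HADDPS-friendly form.

#include <immintrin.h>

constexpr int SIZE = 128;
extern float A[SIZE][16];   // same globals as in the reproducer above
extern float B[SIZE][16];

float foo_vec()
{
    __m128 vsum = _mm_setzero_ps();
    for (int i = 1; i < 32; ++i) {
        __m128 a = _mm_loadu_ps(A[i]);               // A[i][0..3] - one 4-wide load per row
        __m128 b = _mm_loadu_ps(B[i]);               // B[i][0..3]
        vsum = _mm_add_ps(vsum, _mm_mul_ps(a, b));   // 4 running partial sums
    }
    // Horizontal reduction of the partial sums - maps onto HADDPS where it's fast.
    vsum = _mm_hadd_ps(vsum, vsum);
    vsum = _mm_hadd_ps(vsum, vsum);
    return _mm_cvtss_f32(vsum);
}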

@llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 11, 2021

RKSimon commented Aug 17, 2022

Something else that's interesting about the integer equivalent:

using T = int;
constexpr int SIZE = 128;
T A[SIZE][16];
T B[SIZE][16];

T foo()
{
    T sum = (T)0;
    for (int i = 1; i < 32; ++i)
        for (int j = 0; j < 4; ++j)
            sum += A[i][j] * B[i][j];
    
    return sum;
}

clang -g0 -O3 -march=btver2

foo():                                # @foo()
        movq    $-1984, %rcx                    # imm = 0xF840
        leaq    A(%rip), %rdx
        leaq    B(%rip), %rsi
        xorl    %eax, %eax
.LBB0_1:                                # %for.cond1.preheader
        vmovdqa 2048(%rcx,%rsi), %xmm0
        vpmulld 2048(%rcx,%rdx), %xmm0, %xmm0
        vphaddd %xmm0, %xmm0, %xmm0
        vphaddd %xmm0, %xmm0, %xmm0
        vmovd   %xmm0, %edi
        addl    %edi, %eax
        addq    $64, %rcx
        jne     .LBB0_1
        retq
A:
        .zero   8192
B:
        .zero   8192

clang -g0 -O3 -march=znver2

foo():                                # @foo()
        xorl    %eax, %eax
        movl    $64, %ecx
        leaq    A(%rip), %rdx
        leaq    B(%rip), %rsi
.LBB0_1:                                # %for.cond1.preheader
        vmovdqa (%rcx,%rsi), %xmm0
        vpmulld (%rcx,%rdx), %xmm0, %xmm0
        addq    $64, %rcx
        vpshufd $238, %xmm0, %xmm1              # xmm1 = xmm0[2,3,2,3]
        vpaddd  %xmm1, %xmm0, %xmm0
        vpshufd $85, %xmm0, %xmm1               # xmm1 = xmm0[1,1,1,1]
        vpaddd  %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, %edi
        addl    %edi, %eax
        cmpq    $2048, %rcx                     # imm = 0x800
        jne     .LBB0_1
        retq
A:
        .zero   8192
B:
        .zero   8192

The btver2 variant ends up with more costly 3-operand (base + index + displacement) addressing on its loads, but the znver2 variant folds the displacement into the indvar.
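
A rough source-level picture of that second scheme (hypothetical code, not from this issue; foo_offset is an illustrative name): keep the A/B bases in registers and walk a single byte offset from 64 to 2048, so every access only needs a simple base + offset address with no extra 2048-byte displacement. It computes the same sum as foo() above.

#include <cstddef>

using T = int;
constexpr int SIZE = 128;
extern T A[SIZE][16];   // same globals as in the reproducer above
extern T B[SIZE][16];

T foo_offset()
{
    T sum = 0;
    const char *a = reinterpret_cast<const char *>(A);
    const char *b = reinterpret_cast<const char *>(B);
    // 16 ints per row = 64 bytes, so rows 1..31 live at byte offsets 64..1984.
    for (size_t off = 64; off != 2048; off += 64) {
        const T *ra = reinterpret_cast<const T *>(a + off);
        const T *rb = reinterpret_cast<const T *>(b + off);
        for (int j = 0; j < 4; ++j)
            sum += ra[j] * rb[j];
    }
    return sum;
}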
