Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Experiment] Aggregate layout optimization #91507

Closed
wants to merge 4 commits into from

Conversation

Urgau
Copy link
Member

@Urgau Urgau commented Dec 4, 2021

Try to lower the threshold for indirect aggregate and remove cast to integer in order to have better code generation.

This pull-request is a DRAFT and is my (dummy ? hopefully not) tentative to fix #91447.
If some one could start a perf run so that a could find out if this needs more thinking or not.

Rust code:

#![no_std]

pub fn sum_f32(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    let mut c = [0.0; 4];
    c[0] = a[0] + b[0];
    c[1] = a[1] + b[1];
    c[2] = a[2] + b[2];
    c[3] = a[3] + b[3];
    c
}

pub fn sum_u32(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
    let mut c = [0; 4];
    c[0] = a[0] + b[0];
    c[1] = a[1] + b[1];
    c[2] = a[2] + b[2];
    c[3] = a[3] + b[3];
    c
}

pub fn sum_ref_u32(a: &[u32; 4], b: &[u32; 4]) -> [u32; 4] {
    let mut c = [0; 4];
    c[0] = a[0] + b[0];
    c[1] = a[1] + b[1];
    c[2] = a[2] + b[2];
    c[3] = a[3] + b[3];
    c
}

pub fn sum_f32_big(a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    let mut c = [0.0; 8];
    for i in 0..8 {
        c[i] = a[i] + b[i];
    }
    c
}

pub struct Vec4 {
    pub x: f32,
    pub y: f32,
    pub z: f32,
    pub w: f32,
}

pub fn add(a: Vec4, b: Vec4) -> Vec4 {
    Vec4 {
        x: a.x + b.x,
        y: a.y + b.y,
        z: a.z + b.z,
        w: a.w + b.w
    }
}

#[no_mangle]
pub fn array_eq_value(a: [u16; 6], b: [u16; 6]) -> bool {
    a == b
}

Nightly:

example::sum_f32:
        movd    xmm0, ecx
        shr     rcx, 32
        movd    xmm1, ecx
        movd    xmm2, edx
        shr     rdx, 32
        movd    xmm3, edx
        movd    xmm4, esi
        shr     rsi, 32
        movd    xmm5, esi
        addss   xmm5, xmm1
        addss   xmm4, xmm0
        movd    xmm0, edi
        shr     rdi, 32
        movd    xmm1, edi
        addss   xmm1, xmm3
        addss   xmm0, xmm2
        movd    eax, xmm0
        movd    ecx, xmm1
        movd    edx, xmm4
        movd    esi, xmm5
        shl     rsi, 32
        or      rdx, rsi
        shl     rcx, 32
        or      rax, rcx
        ret

example::sum_u32:
        mov     r8, rcx
        add     ecx, esi
        shr     rsi, 32
        lea     eax, [rdx + rdi]
        shr     rdi, 32
        shr     r8, 32
        shr     rdx, 32
        add     edx, edi
        add     esi, r8d
        shl     rsi, 32
        or      rcx, rsi
        shl     rdx, 32
        or      rax, rdx
        mov     rdx, rcx
        ret

example::sum_ref_u32:
        mov     eax, dword ptr [rsi]
        mov     ecx, dword ptr [rsi + 4]
        add     eax, dword ptr [rdi]
        add     ecx, dword ptr [rdi + 4]
        mov     edx, dword ptr [rsi + 8]
        add     edx, dword ptr [rdi + 8]
        mov     esi, dword ptr [rsi + 12]
        add     esi, dword ptr [rdi + 12]
        shl     rsi, 32
        or      rdx, rsi
        shl     rcx, 32
        or      rax, rcx
        ret

example::sum_f32_big:
        mov     rax, rdi
        movss   xmm0, dword ptr [rsi]
        addss   xmm0, dword ptr [rdx]
        movss   dword ptr [rdi], xmm0
        movss   xmm0, dword ptr [rsi + 4]
        addss   xmm0, dword ptr [rdx + 4]
        movss   dword ptr [rdi + 4], xmm0
        movss   xmm0, dword ptr [rsi + 8]
        addss   xmm0, dword ptr [rdx + 8]
        movss   dword ptr [rdi + 8], xmm0
        movss   xmm0, dword ptr [rsi + 12]
        addss   xmm0, dword ptr [rdx + 12]
        movss   dword ptr [rdi + 12], xmm0
        movss   xmm0, dword ptr [rsi + 16]
        addss   xmm0, dword ptr [rdx + 16]
        movss   dword ptr [rdi + 16], xmm0
        movss   xmm0, dword ptr [rsi + 20]
        addss   xmm0, dword ptr [rdx + 20]
        movss   dword ptr [rdi + 20], xmm0
        movss   xmm0, dword ptr [rsi + 24]
        addss   xmm0, dword ptr [rdx + 24]
        movss   dword ptr [rdi + 24], xmm0
        movss   xmm0, dword ptr [rsi + 28]
        addss   xmm0, dword ptr [rdx + 28]
        movss   dword ptr [rdi + 28], xmm0
        ret

example::add:
        movd    xmm0, edi
        movd    xmm1, esi
        shr     rsi, 32
        shr     rdi, 32
        movd    xmm2, edi
        movd    xmm3, esi
        movd    xmm4, edx
        addss   xmm4, xmm0
        shr     rdx, 32
        movd    xmm0, edx
        addss   xmm0, xmm2
        movd    xmm2, ecx
        shr     rcx, 32
        addss   xmm2, xmm1
        movd    xmm1, ecx
        addss   xmm1, xmm3
        movd    eax, xmm4
        movd    ecx, xmm0
        movd    edx, xmm2
        movd    esi, xmm1
        shl     rsi, 32
        or      rdx, rsi
        shl     rcx, 32
        or      rax, rcx
        ret

array_eq_value:
        xor     rdi, rdx
        xor     esi, ecx
        or      rsi, rdi
        sete    al
        ret

This PR:

_ZN1a7sum_f3217hecad8316332612a3E:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	movups	xmm1, xmmword ptr [rdx]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi], xmm1
	ret

_ZN1a7sum_u3217h4f54e379f00c247dE:
	mov	rax, rdi
	movdqu	xmm0, xmmword ptr [rsi]
	movdqu	xmm1, xmmword ptr [rdx]
	paddd	xmm1, xmm0
	movdqu	xmmword ptr [rdi], xmm1
	ret

_ZN1a11sum_ref_u3217h49cb1e98540da517E:
	mov	rax, rdi
	movdqu	xmm0, xmmword ptr [rsi]
	movdqu	xmm1, xmmword ptr [rdx]
	paddd	xmm1, xmm0
	movdqu	xmmword ptr [rdi], xmm1
	ret

_ZN1a11sum_f32_big17hb5a2218852b724b7E:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	movups	xmm1, xmmword ptr [rdx]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi], xmm1
	movups	xmm0, xmmword ptr [rsi + 16]
	movups	xmm1, xmmword ptr [rdx + 16]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi + 16], xmm1
	ret

_ZN1a3add17hcbee97e180d293d5E:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	movups	xmm1, xmmword ptr [rdx]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi], xmm1
	ret

array_eq_value:
	mov	rax, qword ptr [rdi]
	xor	rax, qword ptr [rsi]
	mov	ecx, dword ptr [rdi + 8]
	xor	ecx, dword ptr [rsi + 8]
	or	rcx, rax
	sete	al
	ret

r? @ghost

@jyn514
Copy link
Member

jyn514 commented Dec 4, 2021

@bors try @rust-timer queue

@rust-timer
Copy link
Collaborator

Awaiting bors try build completion.

@rustbot label: +S-waiting-on-perf

@rustbot rustbot added the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Dec 4, 2021
@bors
Copy link
Contributor

bors commented Dec 4, 2021

⌛ Trying commit 9410686 with merge d27f2514c2c2ab7f14562f3318cf9129f3315fbf...

@bors
Copy link
Contributor

bors commented Dec 4, 2021

☀️ Try build successful - checks-actions
Build commit: d27f2514c2c2ab7f14562f3318cf9129f3315fbf (d27f2514c2c2ab7f14562f3318cf9129f3315fbf)

@rust-timer
Copy link
Collaborator

Queued d27f2514c2c2ab7f14562f3318cf9129f3315fbf with parent 532d2b1, future comparison URL.

@rust-timer
Copy link
Collaborator

Finished benchmarking commit (d27f2514c2c2ab7f14562f3318cf9129f3315fbf): comparison url.

Summary: This change led to very large relevant mixed results 🤷 in compiler performance.

  • Very large improvement in instruction counts (up to -7.0% on full builds of deeply-nested-async)
  • Very large regression in instruction counts (up to 12.6% on incr-unchanged builds of ctfe-stress-4)

If you disagree with this performance assessment, please file an issue in rust-lang/rustc-perf.

Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR led to changes in compiler perf.

Next Steps: If you can justify the regressions found in this try perf run, please indicate this with @rustbot label: +perf-regression-triaged along with sufficient written justification. If you cannot justify the regressions please fix the regressions and do another perf run. If the next run shows neutral or positive results, the label will be automatically removed.

@bors rollup=never
@rustbot label: +S-waiting-on-review -S-waiting-on-perf +perf-regression

@rustbot rustbot added perf-regression Performance regression. S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. and removed S-waiting-on-perf Status: Waiting on a perf run to be completed. labels Dec 4, 2021
@Urgau
Copy link
Member Author

Urgau commented Dec 4, 2021

All the regressions appears to be in incr_comp and codegen queries, which was expected due to the fact that the layout is now more complex/bigger. However the wall-time seems to not have regressed there is just more instructions perform.

I've push a revert of my partial revert of the SpecArrayEq specialization as this might help with the ctfe case. If some one could restart a perf to see if that change anything.

The question is now whenever this might be acceptable ? @jyn514 an opinion ?
One thing to note here is that this affect the layout of parameters of function but if the function is inlined there is no more parameters and the cost of wrapping (before function call) -> dewrapping (start of function) and wrapping (end of function) -> dewrapping (after function call) in an integer (nearly ?) completely disappear.

Rust

pub fn sum_f32_big_8(a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    let mut c = [0.0; 8];
    for i in 0..8 {
        c[i] = a[i] + b[i];
    }
    c
}

pub fn sum_f32_big_16(a: [f32; 16], b: [f32; 16]) -> [f32; 16] {
    let mut c = [0.0; 16];
    for i in 0..16 {
        c[i] = a[i] + b[i];
    }
    c
}

This PR

sum_f32_big_8:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	movups	xmm1, xmmword ptr [rdx]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi], xmm1
	movups	xmm0, xmmword ptr [rsi + 16]
	movups	xmm1, xmmword ptr [rdx + 16]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi + 16], xmm1
	ret

sum_f32_big_16:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	movups	xmm1, xmmword ptr [rdx]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi], xmm1
	movups	xmm0, xmmword ptr [rsi + 16]
	movups	xmm1, xmmword ptr [rdx + 16]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi + 16], xmm1
	movups	xmm0, xmmword ptr [rsi + 32]
	movups	xmm1, xmmword ptr [rdx + 32]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi + 32], xmm1
	movups	xmm0, xmmword ptr [rsi + 48]
	movups	xmm1, xmmword ptr [rdx + 48]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi + 48], xmm1
	ret

Nightly

example::sum_f32_big_8:
        mov     rax, rdi
        movss   xmm0, dword ptr [rsi]
        addss   xmm0, dword ptr [rdx]
        movss   dword ptr [rdi], xmm0
        movss   xmm0, dword ptr [rsi + 4]
        addss   xmm0, dword ptr [rdx + 4]
        movss   dword ptr [rdi + 4], xmm0
        movss   xmm0, dword ptr [rsi + 8]
        addss   xmm0, dword ptr [rdx + 8]
        movss   dword ptr [rdi + 8], xmm0
        movss   xmm0, dword ptr [rsi + 12]
        addss   xmm0, dword ptr [rdx + 12]
        movss   dword ptr [rdi + 12], xmm0
        movss   xmm0, dword ptr [rsi + 16]
        addss   xmm0, dword ptr [rdx + 16]
        movss   dword ptr [rdi + 16], xmm0
        movss   xmm0, dword ptr [rsi + 20]
        addss   xmm0, dword ptr [rdx + 20]
        movss   dword ptr [rdi + 20], xmm0
        movss   xmm0, dword ptr [rsi + 24]
        addss   xmm0, dword ptr [rdx + 24]
        movss   dword ptr [rdi + 24], xmm0
        movss   xmm0, dword ptr [rsi + 28]
        addss   xmm0, dword ptr [rdx + 28]
        movss   dword ptr [rdi + 28], xmm0
        ret

example::sum_f32_big_16:
        mov     rax, rdi
        movss   xmm0, dword ptr [rsi]
        addss   xmm0, dword ptr [rdx]
        movss   dword ptr [rdi], xmm0
        movss   xmm0, dword ptr [rsi + 4]
        addss   xmm0, dword ptr [rdx + 4]
        movss   dword ptr [rdi + 4], xmm0
        movss   xmm0, dword ptr [rsi + 8]
        addss   xmm0, dword ptr [rdx + 8]
        movss   dword ptr [rdi + 8], xmm0
        movss   xmm0, dword ptr [rsi + 12]
        addss   xmm0, dword ptr [rdx + 12]
        movss   dword ptr [rdi + 12], xmm0
        movss   xmm0, dword ptr [rsi + 16]
        addss   xmm0, dword ptr [rdx + 16]
        movss   dword ptr [rdi + 16], xmm0
        movss   xmm0, dword ptr [rsi + 20]
        addss   xmm0, dword ptr [rdx + 20]
        movss   dword ptr [rdi + 20], xmm0
        movss   xmm0, dword ptr [rsi + 24]
        addss   xmm0, dword ptr [rdx + 24]
        movss   dword ptr [rdi + 24], xmm0
        movss   xmm0, dword ptr [rsi + 28]
        addss   xmm0, dword ptr [rdx + 28]
        movss   dword ptr [rdi + 28], xmm0
        movss   xmm0, dword ptr [rsi + 32]
        addss   xmm0, dword ptr [rdx + 32]
        movss   dword ptr [rdi + 32], xmm0
        movss   xmm0, dword ptr [rsi + 36]
        addss   xmm0, dword ptr [rdx + 36]
        movss   dword ptr [rdi + 36], xmm0
        movss   xmm0, dword ptr [rsi + 40]
        addss   xmm0, dword ptr [rdx + 40]
        movss   dword ptr [rdi + 40], xmm0
        movss   xmm0, dword ptr [rsi + 44]
        addss   xmm0, dword ptr [rdx + 44]
        movss   dword ptr [rdi + 44], xmm0
        movss   xmm0, dword ptr [rsi + 48]
        addss   xmm0, dword ptr [rdx + 48]
        movss   dword ptr [rdi + 48], xmm0
        movss   xmm0, dword ptr [rsi + 52]
        addss   xmm0, dword ptr [rdx + 52]
        movss   dword ptr [rdi + 52], xmm0
        movss   xmm0, dword ptr [rsi + 56]
        addss   xmm0, dword ptr [rdx + 56]
        movss   dword ptr [rdi + 56], xmm0
        movss   xmm0, dword ptr [rsi + 60]
        addss   xmm0, dword ptr [rdx + 60]
        movss   dword ptr [rdi + 60], xmm0
        ret

@jyn514
Copy link
Member

jyn514 commented Dec 4, 2021

@bors try @rust-timer queue

I don't know much about codegen, sorry. Maybe ask someone on T-compiler.

@rust-timer
Copy link
Collaborator

Awaiting bors try build completion.

@rustbot label: +S-waiting-on-perf

@rustbot rustbot added the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Dec 4, 2021
@bors
Copy link
Contributor

bors commented Dec 4, 2021

⌛ Trying commit fea8369 with merge 0adc4d3d46c3f04f1372220679a65f2f1c74297b...

@bjorn3
Copy link
Member

bjorn3 commented Dec 4, 2021

@bors try @rust-timer queue

@rust-timer
Copy link
Collaborator

Awaiting bors try build completion.

@rustbot label: +S-waiting-on-perf

@bors
Copy link
Contributor

bors commented Dec 4, 2021

⌛ Trying commit fea8369 with merge 065bdb7fe5f757cfb1fbb650c3949635b438bb41...

@bors
Copy link
Contributor

bors commented Dec 4, 2021

☀️ Try build successful - checks-actions
Build commit: 065bdb7fe5f757cfb1fbb650c3949635b438bb41 (065bdb7fe5f757cfb1fbb650c3949635b438bb41)

@rust-timer
Copy link
Collaborator

Queued 065bdb7fe5f757cfb1fbb650c3949635b438bb41 with parent efec545, future comparison URL.

@rust-timer
Copy link
Collaborator

Finished benchmarking commit (065bdb7fe5f757cfb1fbb650c3949635b438bb41): comparison url.

Summary: This change led to very large relevant mixed results 🤷 in compiler performance.

  • Very large improvement in instruction counts (up to -7.0% on full builds of deeply-nested-async)
  • Very large regression in instruction counts (up to 12.6% on incr-unchanged builds of ctfe-stress-4)

If you disagree with this performance assessment, please file an issue in rust-lang/rustc-perf.

Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR led to changes in compiler perf.

Next Steps: If you can justify the regressions found in this try perf run, please indicate this with @rustbot label: +perf-regression-triaged along with sufficient written justification. If you cannot justify the regressions please fix the regressions and do another perf run. If the next run shows neutral or positive results, the label will be automatically removed.

@bors rollup=never
@rustbot label: +S-waiting-on-review -S-waiting-on-perf +perf-regression

@rustbot rustbot removed the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Dec 4, 2021
@Urgau
Copy link
Member Author

Urgau commented Dec 4, 2021

The perf didn't improve and it seems clear that cost is way too high be be acceptable.
Closing this PR. Thanks to jyn514 and bjorn3 for the perf runs.

@Urgau Urgau closed this Dec 4, 2021
@@ -1,6 +1,7 @@
// compile-flags: -O
// only-x86_64
// ignore-debug: the debug assertions get in the way
// ignore-test for now (this is just to get CI happy)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably figured this out already, but as the person who added a bunch of these codegen tests, the i48 part is usually incidental, just there as the easiest way to accurately check what it wanted to confirm. If a different encoding of the rust types into llvm types is better, I'm all for it.

Copy link
Member Author

@Urgau Urgau Dec 4, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I did see that and what this PR did was to always use the same type (no more transformation like [f32; 4] -> u128) so that using them in/with a function didn't require extra steps witch works well for generating better assembly but at the cost of having way to much perf regression.

If you have a idea to reduce them, I'm all in.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I must also say that the transformation was only done to small types (max 128 bits in x86_64), this PR wouldn't have changed anything for them.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
perf-regression Performance regression. S-waiting-on-review Status: Awaiting review from the assignee but also interested parties.
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Possible performance loss with f32 arithmetic
7 participants