[Experiment] Aggregate layout optimization #91507

Urgau · 2021-12-04T01:52:08Z

Try to lower the threshold for indirect aggregate and remove cast to integer in order to have better code generation.

This pull-request is a DRAFT and is my (dummy ? hopefully not) tentative to fix #91447.
If some one could start a perf run so that a could find out if this needs more thinking or not.

Rust code:

#![no_std]

pub fn sum_f32(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    let mut c = [0.0; 4];
    c[0] = a[0] + b[0];
    c[1] = a[1] + b[1];
    c[2] = a[2] + b[2];
    c[3] = a[3] + b[3];
    c
}

pub fn sum_u32(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
    let mut c = [0; 4];
    c[0] = a[0] + b[0];
    c[1] = a[1] + b[1];
    c[2] = a[2] + b[2];
    c[3] = a[3] + b[3];
    c
}

pub fn sum_ref_u32(a: &[u32; 4], b: &[u32; 4]) -> [u32; 4] {
    let mut c = [0; 4];
    c[0] = a[0] + b[0];
    c[1] = a[1] + b[1];
    c[2] = a[2] + b[2];
    c[3] = a[3] + b[3];
    c
}

pub fn sum_f32_big(a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    let mut c = [0.0; 8];
    for i in 0..8 {
        c[i] = a[i] + b[i];
    }
    c
}

pub struct Vec4 {
    pub x: f32,
    pub y: f32,
    pub z: f32,
    pub w: f32,
}

pub fn add(a: Vec4, b: Vec4) -> Vec4 {
    Vec4 {
        x: a.x + b.x,
        y: a.y + b.y,
        z: a.z + b.z,
        w: a.w + b.w
    }
}

#[no_mangle]
pub fn array_eq_value(a: [u16; 6], b: [u16; 6]) -> bool {
    a == b
}

Nightly:

example::sum_f32:
        movd    xmm0, ecx
        shr     rcx, 32
        movd    xmm1, ecx
        movd    xmm2, edx
        shr     rdx, 32
        movd    xmm3, edx
        movd    xmm4, esi
        shr     rsi, 32
        movd    xmm5, esi
        addss   xmm5, xmm1
        addss   xmm4, xmm0
        movd    xmm0, edi
        shr     rdi, 32
        movd    xmm1, edi
        addss   xmm1, xmm3
        addss   xmm0, xmm2
        movd    eax, xmm0
        movd    ecx, xmm1
        movd    edx, xmm4
        movd    esi, xmm5
        shl     rsi, 32
        or      rdx, rsi
        shl     rcx, 32
        or      rax, rcx
        ret

example::sum_u32:
        mov     r8, rcx
        add     ecx, esi
        shr     rsi, 32
        lea     eax, [rdx + rdi]
        shr     rdi, 32
        shr     r8, 32
        shr     rdx, 32
        add     edx, edi
        add     esi, r8d
        shl     rsi, 32
        or      rcx, rsi
        shl     rdx, 32
        or      rax, rdx
        mov     rdx, rcx
        ret

example::sum_ref_u32:
        mov     eax, dword ptr [rsi]
        mov     ecx, dword ptr [rsi + 4]
        add     eax, dword ptr [rdi]
        add     ecx, dword ptr [rdi + 4]
        mov     edx, dword ptr [rsi + 8]
        add     edx, dword ptr [rdi + 8]
        mov     esi, dword ptr [rsi + 12]
        add     esi, dword ptr [rdi + 12]
        shl     rsi, 32
        or      rdx, rsi
        shl     rcx, 32
        or      rax, rcx
        ret

example::sum_f32_big:
        mov     rax, rdi
        movss   xmm0, dword ptr [rsi]
        addss   xmm0, dword ptr [rdx]
        movss   dword ptr [rdi], xmm0
        movss   xmm0, dword ptr [rsi + 4]
        addss   xmm0, dword ptr [rdx + 4]
        movss   dword ptr [rdi + 4], xmm0
        movss   xmm0, dword ptr [rsi + 8]
        addss   xmm0, dword ptr [rdx + 8]
        movss   dword ptr [rdi + 8], xmm0
        movss   xmm0, dword ptr [rsi + 12]
        addss   xmm0, dword ptr [rdx + 12]
        movss   dword ptr [rdi + 12], xmm0
        movss   xmm0, dword ptr [rsi + 16]
        addss   xmm0, dword ptr [rdx + 16]
        movss   dword ptr [rdi + 16], xmm0
        movss   xmm0, dword ptr [rsi + 20]
        addss   xmm0, dword ptr [rdx + 20]
        movss   dword ptr [rdi + 20], xmm0
        movss   xmm0, dword ptr [rsi + 24]
        addss   xmm0, dword ptr [rdx + 24]
        movss   dword ptr [rdi + 24], xmm0
        movss   xmm0, dword ptr [rsi + 28]
        addss   xmm0, dword ptr [rdx + 28]
        movss   dword ptr [rdi + 28], xmm0
        ret

example::add:
        movd    xmm0, edi
        movd    xmm1, esi
        shr     rsi, 32
        shr     rdi, 32
        movd    xmm2, edi
        movd    xmm3, esi
        movd    xmm4, edx
        addss   xmm4, xmm0
        shr     rdx, 32
        movd    xmm0, edx
        addss   xmm0, xmm2
        movd    xmm2, ecx
        shr     rcx, 32
        addss   xmm2, xmm1
        movd    xmm1, ecx
        addss   xmm1, xmm3
        movd    eax, xmm4
        movd    ecx, xmm0
        movd    edx, xmm2
        movd    esi, xmm1
        shl     rsi, 32
        or      rdx, rsi
        shl     rcx, 32
        or      rax, rcx
        ret

array_eq_value:
        xor     rdi, rdx
        xor     esi, ecx
        or      rsi, rdi
        sete    al
        ret

This PR:

_ZN1a7sum_f3217hecad8316332612a3E:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	movups	xmm1, xmmword ptr [rdx]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi], xmm1
	ret

_ZN1a7sum_u3217h4f54e379f00c247dE:
	mov	rax, rdi
	movdqu	xmm0, xmmword ptr [rsi]
	movdqu	xmm1, xmmword ptr [rdx]
	paddd	xmm1, xmm0
	movdqu	xmmword ptr [rdi], xmm1
	ret

_ZN1a11sum_ref_u3217h49cb1e98540da517E:
	mov	rax, rdi
	movdqu	xmm0, xmmword ptr [rsi]
	movdqu	xmm1, xmmword ptr [rdx]
	paddd	xmm1, xmm0
	movdqu	xmmword ptr [rdi], xmm1
	ret

_ZN1a11sum_f32_big17hb5a2218852b724b7E:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	movups	xmm1, xmmword ptr [rdx]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi], xmm1
	movups	xmm0, xmmword ptr [rsi + 16]
	movups	xmm1, xmmword ptr [rdx + 16]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi + 16], xmm1
	ret

_ZN1a3add17hcbee97e180d293d5E:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	movups	xmm1, xmmword ptr [rdx]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi], xmm1
	ret

array_eq_value:
	mov	rax, qword ptr [rdi]
	xor	rax, qword ptr [rsi]
	mov	ecx, dword ptr [rdi + 8]
	xor	ecx, dword ptr [rsi + 8]
	or	rcx, rax
	sete	al
	ret

r? @ghost

jyn514 · 2021-12-04T02:22:12Z

@bors try @rust-timer queue

rust-timer · 2021-12-04T02:22:14Z

Awaiting bors try build completion.

@rustbot label: +S-waiting-on-perf

bors · 2021-12-04T02:22:20Z

⌛ Trying commit 9410686 with merge d27f2514c2c2ab7f14562f3318cf9129f3315fbf...

bors · 2021-12-04T03:58:14Z

☀️ Try build successful - checks-actions
Build commit: d27f2514c2c2ab7f14562f3318cf9129f3315fbf (d27f2514c2c2ab7f14562f3318cf9129f3315fbf)

rust-timer · 2021-12-04T03:58:15Z

Queued d27f2514c2c2ab7f14562f3318cf9129f3315fbf with parent 532d2b1, future comparison URL.

rust-timer · 2021-12-04T06:00:04Z

Finished benchmarking commit (d27f2514c2c2ab7f14562f3318cf9129f3315fbf): comparison url.

Summary: This change led to very large relevant mixed results 🤷 in compiler performance.

Very large improvement in instruction counts (up to -7.0% on full builds of deeply-nested-async)
Very large regression in instruction counts (up to 12.6% on incr-unchanged builds of ctfe-stress-4)

If you disagree with this performance assessment, please file an issue in rust-lang/rustc-perf.

Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR led to changes in compiler perf.

Next Steps: If you can justify the regressions found in this try perf run, please indicate this with @rustbot label: +perf-regression-triaged along with sufficient written justification. If you cannot justify the regressions please fix the regressions and do another perf run. If the next run shows neutral or positive results, the label will be automatically removed.

@bors rollup=never
@rustbot label: +S-waiting-on-review -S-waiting-on-perf +perf-regression

…_eq" This reverts commit 644b292.

Urgau · 2021-12-04T13:12:44Z

All the regressions appears to be in incr_comp and codegen queries, which was expected due to the fact that the layout is now more complex/bigger. However the wall-time seems to not have regressed there is just more instructions perform.

I've push a revert of my partial revert of the SpecArrayEq specialization as this might help with the ctfe case. If some one could restart a perf to see if that change anything.

The question is now whenever this might be acceptable ? @jyn514 an opinion ?
One thing to note here is that this affect the layout of parameters of function but if the function is inlined there is no more parameters and the cost of wrapping (before function call) -> dewrapping (start of function) and wrapping (end of function) -> dewrapping (after function call) in an integer (nearly ?) completely disappear.

Rust

pub fn sum_f32_big_8(a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    let mut c = [0.0; 8];
    for i in 0..8 {
        c[i] = a[i] + b[i];
    }
    c
}

pub fn sum_f32_big_16(a: [f32; 16], b: [f32; 16]) -> [f32; 16] {
    let mut c = [0.0; 16];
    for i in 0..16 {
        c[i] = a[i] + b[i];
    }
    c
}

This PR

sum_f32_big_8:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	movups	xmm1, xmmword ptr [rdx]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi], xmm1
	movups	xmm0, xmmword ptr [rsi + 16]
	movups	xmm1, xmmword ptr [rdx + 16]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi + 16], xmm1
	ret

sum_f32_big_16:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	movups	xmm1, xmmword ptr [rdx]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi], xmm1
	movups	xmm0, xmmword ptr [rsi + 16]
	movups	xmm1, xmmword ptr [rdx + 16]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi + 16], xmm1
	movups	xmm0, xmmword ptr [rsi + 32]
	movups	xmm1, xmmword ptr [rdx + 32]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi + 32], xmm1
	movups	xmm0, xmmword ptr [rsi + 48]
	movups	xmm1, xmmword ptr [rdx + 48]
	addps	xmm1, xmm0
	movups	xmmword ptr [rdi + 48], xmm1
	ret

Nightly

example::sum_f32_big_8:
        mov     rax, rdi
        movss   xmm0, dword ptr [rsi]
        addss   xmm0, dword ptr [rdx]
        movss   dword ptr [rdi], xmm0
        movss   xmm0, dword ptr [rsi + 4]
        addss   xmm0, dword ptr [rdx + 4]
        movss   dword ptr [rdi + 4], xmm0
        movss   xmm0, dword ptr [rsi + 8]
        addss   xmm0, dword ptr [rdx + 8]
        movss   dword ptr [rdi + 8], xmm0
        movss   xmm0, dword ptr [rsi + 12]
        addss   xmm0, dword ptr [rdx + 12]
        movss   dword ptr [rdi + 12], xmm0
        movss   xmm0, dword ptr [rsi + 16]
        addss   xmm0, dword ptr [rdx + 16]
        movss   dword ptr [rdi + 16], xmm0
        movss   xmm0, dword ptr [rsi + 20]
        addss   xmm0, dword ptr [rdx + 20]
        movss   dword ptr [rdi + 20], xmm0
        movss   xmm0, dword ptr [rsi + 24]
        addss   xmm0, dword ptr [rdx + 24]
        movss   dword ptr [rdi + 24], xmm0
        movss   xmm0, dword ptr [rsi + 28]
        addss   xmm0, dword ptr [rdx + 28]
        movss   dword ptr [rdi + 28], xmm0
        ret

example::sum_f32_big_16:
        mov     rax, rdi
        movss   xmm0, dword ptr [rsi]
        addss   xmm0, dword ptr [rdx]
        movss   dword ptr [rdi], xmm0
        movss   xmm0, dword ptr [rsi + 4]
        addss   xmm0, dword ptr [rdx + 4]
        movss   dword ptr [rdi + 4], xmm0
        movss   xmm0, dword ptr [rsi + 8]
        addss   xmm0, dword ptr [rdx + 8]
        movss   dword ptr [rdi + 8], xmm0
        movss   xmm0, dword ptr [rsi + 12]
        addss   xmm0, dword ptr [rdx + 12]
        movss   dword ptr [rdi + 12], xmm0
        movss   xmm0, dword ptr [rsi + 16]
        addss   xmm0, dword ptr [rdx + 16]
        movss   dword ptr [rdi + 16], xmm0
        movss   xmm0, dword ptr [rsi + 20]
        addss   xmm0, dword ptr [rdx + 20]
        movss   dword ptr [rdi + 20], xmm0
        movss   xmm0, dword ptr [rsi + 24]
        addss   xmm0, dword ptr [rdx + 24]
        movss   dword ptr [rdi + 24], xmm0
        movss   xmm0, dword ptr [rsi + 28]
        addss   xmm0, dword ptr [rdx + 28]
        movss   dword ptr [rdi + 28], xmm0
        movss   xmm0, dword ptr [rsi + 32]
        addss   xmm0, dword ptr [rdx + 32]
        movss   dword ptr [rdi + 32], xmm0
        movss   xmm0, dword ptr [rsi + 36]
        addss   xmm0, dword ptr [rdx + 36]
        movss   dword ptr [rdi + 36], xmm0
        movss   xmm0, dword ptr [rsi + 40]
        addss   xmm0, dword ptr [rdx + 40]
        movss   dword ptr [rdi + 40], xmm0
        movss   xmm0, dword ptr [rsi + 44]
        addss   xmm0, dword ptr [rdx + 44]
        movss   dword ptr [rdi + 44], xmm0
        movss   xmm0, dword ptr [rsi + 48]
        addss   xmm0, dword ptr [rdx + 48]
        movss   dword ptr [rdi + 48], xmm0
        movss   xmm0, dword ptr [rsi + 52]
        addss   xmm0, dword ptr [rdx + 52]
        movss   dword ptr [rdi + 52], xmm0
        movss   xmm0, dword ptr [rsi + 56]
        addss   xmm0, dword ptr [rdx + 56]
        movss   dword ptr [rdi + 56], xmm0
        movss   xmm0, dword ptr [rsi + 60]
        addss   xmm0, dword ptr [rdx + 60]
        movss   dword ptr [rdi + 60], xmm0
        ret

jyn514 · 2021-12-04T16:40:28Z

@bors try @rust-timer queue

I don't know much about codegen, sorry. Maybe ask someone on T-compiler.

rust-timer · 2021-12-04T16:40:30Z

Awaiting bors try build completion.

@rustbot label: +S-waiting-on-perf

bors · 2021-12-04T16:40:37Z

⌛ Trying commit fea8369 with merge 0adc4d3d46c3f04f1372220679a65f2f1c74297b...

bjorn3 · 2021-12-04T16:43:33Z

@bors try @rust-timer queue

rust-timer · 2021-12-04T16:43:34Z

Awaiting bors try build completion.

@rustbot label: +S-waiting-on-perf

bors · 2021-12-04T16:43:41Z

⌛ Trying commit fea8369 with merge 065bdb7fe5f757cfb1fbb650c3949635b438bb41...

bors · 2021-12-04T18:19:03Z

☀️ Try build successful - checks-actions
Build commit: 065bdb7fe5f757cfb1fbb650c3949635b438bb41 (065bdb7fe5f757cfb1fbb650c3949635b438bb41)

rust-timer · 2021-12-04T18:19:05Z

Queued 065bdb7fe5f757cfb1fbb650c3949635b438bb41 with parent efec545, future comparison URL.

rust-timer · 2021-12-04T21:02:26Z

Finished benchmarking commit (065bdb7fe5f757cfb1fbb650c3949635b438bb41): comparison url.

Summary: This change led to very large relevant mixed results 🤷 in compiler performance.

Very large improvement in instruction counts (up to -7.0% on full builds of deeply-nested-async)
Very large regression in instruction counts (up to 12.6% on incr-unchanged builds of ctfe-stress-4)

If you disagree with this performance assessment, please file an issue in rust-lang/rustc-perf.

Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR led to changes in compiler perf.

Next Steps: If you can justify the regressions found in this try perf run, please indicate this with @rustbot label: +perf-regression-triaged along with sufficient written justification. If you cannot justify the regressions please fix the regressions and do another perf run. If the next run shows neutral or positive results, the label will be automatically removed.

@bors rollup=never
@rustbot label: +S-waiting-on-review -S-waiting-on-perf +perf-regression

Urgau · 2021-12-04T21:15:42Z

The perf didn't improve and it seems clear that cost is way too high be be acceptable.
Closing this PR. Thanks to jyn514 and bjorn3 for the perf runs.

scottmcm · 2021-12-04T21:24:26Z

src/test/codegen/swap-small-types.rs

@@ -1,6 +1,7 @@
 // compile-flags: -O
 // only-x86_64
 // ignore-debug: the debug assertions get in the way
+// ignore-test for now (this is just to get CI happy)


You probably figured this out already, but as the person who added a bunch of these codegen tests, the i48 part is usually incidental, just there as the easiest way to accurately check what it wanted to confirm. If a different encoding of the rust types into llvm types is better, I'm all for it.

Yeah, I did see that and what this PR did was to always use the same type (no more transformation like [f32; 4] -> u128) so that using them in/with a function didn't require extra steps witch works well for generating better assembly but at the cost of having way to much perf regression.

If you have a idea to reduce them, I'm all in.

I must also say that the transformation was only done to small types (max 128 bits in x86_64), this PR wouldn't have changed anything for them.

Urgau added 3 commits December 4, 2021 02:34

Lower the threshold for indirect aggregate and remove cast to integer

e6e2f15

Partially revert optimization done to array equality with raw_eq

644b292

Temporarily ignore failing codegen tests

9410686

rustbot added the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Dec 4, 2021

rustbot added perf-regression Performance regression. S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. and removed S-waiting-on-perf Status: Waiting on a perf run to be completed. labels Dec 4, 2021

Revert "Partially revert optimization done to array equality with raw…

fea8369

…_eq" This reverts commit 644b292.

rustbot added the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Dec 4, 2021

rustbot removed the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Dec 4, 2021

Urgau closed this Dec 4, 2021

scottmcm reviewed Dec 4, 2021

View reviewed changes

This was referenced Dec 7, 2021

Basic vectorization performance regression from 1.48.0 onwards #85265

Closed

[Experiment] revert issue #26494 associated pulls #76986 and #79547 #91719

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Experiment] Aggregate layout optimization #91507

[Experiment] Aggregate layout optimization #91507

Urgau commented Dec 4, 2021

jyn514 commented Dec 4, 2021

rust-timer commented Dec 4, 2021

bors commented Dec 4, 2021

bors commented Dec 4, 2021

rust-timer commented Dec 4, 2021

rust-timer commented Dec 4, 2021

Urgau commented Dec 4, 2021

jyn514 commented Dec 4, 2021

rust-timer commented Dec 4, 2021

bors commented Dec 4, 2021

bjorn3 commented Dec 4, 2021

rust-timer commented Dec 4, 2021

bors commented Dec 4, 2021

bors commented Dec 4, 2021

rust-timer commented Dec 4, 2021

rust-timer commented Dec 4, 2021

Urgau commented Dec 4, 2021

scottmcm Dec 4, 2021

Urgau Dec 4, 2021 •

edited

Loading

Urgau Dec 4, 2021

[Experiment] Aggregate layout optimization #91507

[Experiment] Aggregate layout optimization #91507

Conversation

Urgau commented Dec 4, 2021

jyn514 commented Dec 4, 2021

rust-timer commented Dec 4, 2021

bors commented Dec 4, 2021

bors commented Dec 4, 2021

rust-timer commented Dec 4, 2021

rust-timer commented Dec 4, 2021

Urgau commented Dec 4, 2021

jyn514 commented Dec 4, 2021

rust-timer commented Dec 4, 2021

bors commented Dec 4, 2021

bjorn3 commented Dec 4, 2021

rust-timer commented Dec 4, 2021

bors commented Dec 4, 2021

bors commented Dec 4, 2021

rust-timer commented Dec 4, 2021

rust-timer commented Dec 4, 2021

Urgau commented Dec 4, 2021

scottmcm Dec 4, 2021

Choose a reason for hiding this comment

Urgau Dec 4, 2021 • edited Loading

Choose a reason for hiding this comment

Urgau Dec 4, 2021

Choose a reason for hiding this comment

Urgau Dec 4, 2021 •

edited

Loading