optimize inplace collection of Vec #123878

jwong101 · 2024-04-13T00:24:04Z

This PR has the following changes:

Using usize::unchecked_mul in

rust/library/alloc/src/vec/in_place_collect.rs

Line 262 in 7942405

inner.cap * mem::size_of::<I::Src>() / mem::size_of::<T>(),

as LLVM does not know that the multiplication can't wrap: the product is the size of the original allocation.

Given the following:

pub struct Foo([usize; 3]);

pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> {
    v.into_iter().map(|f| f.0).collect()
}

Before this commit:

define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) {
start:
  %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8
  %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8
  %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8
  %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16
  %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8
  %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24
  %0 = udiv i64 %_19.i.idx, 24

; Unnecessary calculation
  %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24
  %dst_cap.i.i = udiv i64 %_16.i.i, 24

  store i64 %dst_cap.i.i, ptr %_0, align 8
  %1 = getelementptr inbounds i8, ptr %_0, i64 8
  store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8
  %2 = getelementptr inbounds i8, ptr %_0, i64 16
  store i64 %0, ptr %2, align 8
  ret void
}

After:

define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) {
start:
  %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8
  %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8
  %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8
  %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16
  %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8
  %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24
  %0 = udiv i64 %_19.i.idx, 24
  store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8
  %1 = getelementptr inbounds i8, ptr %_0, i64 8
  store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8
  %2 = getelementptr inbounds i8, ptr %_0, i64 16
  store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14
  ret void
}

Note that there is still one more mul/udiv pair that I couldn't get
rid of. The root cause is the same issue as #121239: the nuw flag gets
stripped off of ptr::sub_ptr.

Iterator::try_fold gets called on the underlying Iterator in
SpecInPlaceCollect::collect_in_place whenever it does not implement
TrustedRandomAccess. For types that impl Drop, LLVM currently can't
tell that the drop can never occur when the default
Iterator::try_fold implementation is used.

For example, given the following code from #120493

#[repr(transparent)]
struct WrappedClone {
    inner: String
}

#[no_mangle]
pub fn unwrap_clone(list: Vec<WrappedClone>) -> Vec<String> {
    list.into_iter().map(|s| s.inner).collect()
}

The asm for the `unwrap_clone` method is currently:

unwrap_clone:
        push    rbp
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        push    rax
        mov     rbx, rdi
        mov     r12, qword ptr [rsi]
        mov     rdi, qword ptr [rsi + 8]
        mov     rax, qword ptr [rsi + 16]
        movabs  rsi, -6148914691236517205
        mov     r14, r12
        test    rax, rax
        je      .LBB0_10
        lea     rcx, [rax + 2*rax]
        lea     r14, [r12 + 8*rcx]
        shl     rax, 3
        lea     rax, [rax + 2*rax]
        xor     ecx, ecx
.LBB0_2:
        cmp     qword ptr [r12 + rcx], 0
        je      .LBB0_4
        add     rcx, 24
        cmp     rax, rcx
        jne     .LBB0_2
        jmp     .LBB0_10
.LBB0_4:
        lea     rdx, [rax - 24]
        lea     r14, [r12 + rcx]
        cmp     rdx, rcx
        je      .LBB0_10
        mov     qword ptr [rsp], rdi
        sub     rax, rcx
        add     rax, -24
        mul     rsi
        mov     r15, rdx
        lea     rbp, [r12 + rcx]
        add     rbp, 32
        shr     r15, 4
        mov     r13, qword ptr [rip + __rust_dealloc@GOTPCREL]
        jmp     .LBB0_6
.LBB0_8:
        add     rbp, 24
        dec     r15
        je      .LBB0_9
.LBB0_6:
        mov     rsi, qword ptr [rbp]
        test    rsi, rsi
        je      .LBB0_8
        mov     rdi, qword ptr [rbp - 8]
        mov     edx, 1
        call    r13
        jmp     .LBB0_8
.LBB0_9:
        mov     rdi, qword ptr [rsp]
        movabs  rsi, -6148914691236517205
.LBB0_10:
        sub     r14, r12
        mov     rax, r14
        mul     rsi
        shr     rdx, 4
        mov     qword ptr [rbx], r12
        mov     qword ptr [rbx + 8], rdi
        mov     qword ptr [rbx + 16], rdx
        mov     rax, rbx
        add     rsp, 8
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret

After this PR:

unwrap_clone:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	mov	rcx, qword ptr [rsi + 16]
	movups	xmmword ptr [rdi], xmm0
	mov	qword ptr [rdi + 16], rcx
	ret

Fixes #120493

rustbot · 2024-04-13T00:24:12Z

r? @jhpratt

rustbot has assigned @jhpratt.
They will have a look at your PR within the next two weeks and either review your PR or reassign to another reviewer.

Use r? to explicitly pick a reviewer

the8472 · 2024-04-13T01:12:00Z

@bors try @rust-timer queue

optimize inplace collection of Vec This PR has the following changes: 1. Using `usize::unchecked_mul` in https://github.com/rust-lang/rust/blob/79424056b05eaa9563d16dfab9b9a0c8f033f220/library/alloc/src/vec/in_place_collect.rs#L262 as LLVM, does not know that the operation can't wrap, since that's the size of the original allocation. Given the following: ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` <details> <summary>Before this commit:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 ; Unnecessary calculation %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` </details> <details> <summary>After:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 
%me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` </details> Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang#121239, the `nuw` gets stripped off of `ptr::sub_ptr`. 2. `Iterator::try_fold` gets called on the underlying Iterator in `SpecInPlaceCollect::collect_in_place` whenever it does not implement `TrustedRandomAccess`. For types that impl `Drop`, LLVM currently can't tell that the drop can never occur, when using the default `Iterator::try_fold` implementation. 
For example, given the following code from rust-lang#120493 ```rust #[repr(transparent)] struct WrappedClone { inner: String } #[no_mangle] pub fn unwrap_clone(list: Vec<WrappedClone>) -> Vec<String> { list.into_iter().map(|s| s.inner).collect() } ``` <details> <summary>The asm for the `unwrap_clone` method is currently:</summary> ```asm unwrap_clone: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov rbx, rdi mov r12, qword ptr [rsi] mov rdi, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 16] movabs rsi, -6148914691236517205 mov r14, r12 test rax, rax je .LBB0_10 lea rcx, [rax + 2*rax] lea r14, [r12 + 8*rcx] shl rax, 3 lea rax, [rax + 2*rax] xor ecx, ecx .LBB0_2: cmp qword ptr [r12 + rcx], 0 je .LBB0_4 add rcx, 24 cmp rax, rcx jne .LBB0_2 jmp .LBB0_10 .LBB0_4: lea rdx, [rax - 24] lea r14, [r12 + rcx] cmp rdx, rcx je .LBB0_10 mov qword ptr [rsp], rdi sub rax, rcx add rax, -24 mul rsi mov r15, rdx lea rbp, [r12 + rcx] add rbp, 32 shr r15, 4 mov r13, qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB0_6 .LBB0_8: add rbp, 24 dec r15 je .LBB0_9 .LBB0_6: mov rsi, qword ptr [rbp] test rsi, rsi je .LBB0_8 mov rdi, qword ptr [rbp - 8] mov edx, 1 call r13 jmp .LBB0_8 .LBB0_9: mov rdi, qword ptr [rsp] movabs rsi, -6148914691236517205 .LBB0_10: sub r14, r12 mov rax, r14 mul rsi shr rdx, 4 mov qword ptr [rbx], r12 mov qword ptr [rbx + 8], rdi mov qword ptr [rbx + 16], rdx mov rax, rbx add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret ``` </details> <details> <summary>After this PR:</summary> ```asm unwrap_clone: mov rax, rdi movups xmm0, xmmword ptr [rsi] mov rcx, qword ptr [rsi + 16] movups xmmword ptr [rdi], xmm0 mov qword ptr [rdi + 16], rcx ret ``` </details> Fixes rust-lang#120493

bors · 2024-04-13T01:18:03Z

⌛ Trying commit 4691ff7 with merge 1256640...

bors · 2024-04-13T02:51:13Z

☀️ Try build successful - checks-actions
Build commit: 1256640 (125664080eeed13d66e212ca47e339af4519e5ee)

rust-timer · 2024-04-13T05:35:54Z

Finished benchmarking commit (1256640): comparison URL.

Overall result: ❌✅ regressions and improvements - ACTION NEEDED

Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR may lead to changes in compiler perf.

Next Steps: If you can justify the regressions found in this try perf run, please indicate this with @rustbot label: +perf-regression-triaged along with sufficient written justification. If you cannot justify the regressions please fix the regressions and do another perf run. If the next run shows neutral or positive results, the label will be automatically removed.

@bors rollup=never
@rustbot label: -S-waiting-on-perf +perf-regression

Instruction count

This is a highly reliable metric that was used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	1.9%	[1.9%, 1.9%]	1
Regressions ❌ (secondary)	1.0%	[1.0%, 1.0%]	1
Improvements ✅ (primary)	-18.1%	[-29.8%, -0.5%]	5
Improvements ✅ (secondary)	-0.3%	[-0.4%, -0.3%]	6
All ❌✅ (primary)	-14.8%	[-29.8%, 1.9%]	6

Max RSS (memory usage)

Results

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	4.7%	[4.2%, 5.2%]	2
Regressions ❌ (secondary)	-	-	0
Improvements ✅ (primary)	-8.2%	[-9.7%, -3.4%]	5
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	-4.6%	[-9.7%, 5.2%]	7

Cycles

Results

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	1.3%	[1.3%, 1.3%]	1
Regressions ❌ (secondary)	3.1%	[1.9%, 4.6%]	3
Improvements ✅ (primary)	-22.5%	[-29.5%, -1.5%]	4
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	-17.7%	[-29.5%, 1.3%]	5

Binary size

Results

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	0.2%	[0.0%, 1.1%]	15
Regressions ❌ (secondary)	-	-	0
Improvements ✅ (primary)	-0.1%	[-0.6%, -0.0%]	13
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	0.0%	[-0.6%, 1.1%]	28

Bootstrap: 677.972s -> 675.313s (-0.39%)
Artifact size: 316.06 MiB -> 315.95 MiB (-0.03%)

library/alloc/src/vec/in_place_collect.rs

the8472 · 2024-04-13T10:38:20Z

The perf results are generally fine. The big improvements are from an incremental change that happens to contain a lot of Vec code, so that's not too surprising.
I think the only interesting one is the image opt full case. It regresses in both compile time and binary size. I assume that's due to more code being vectorized, but it might be worth confirming that guess to rule out some unexpected pessimization.

jwong101 · 2024-04-13T15:04:34Z

Yeah, I'll take a look at the image case to see what LLVM is doing to it.

LLVM does not know that the multiplication never overflows, which causes it to generate unnecessary instructions. Use `usize::unchecked_mul`, so that it can fold the `dst_cap` calculation when `size_of::<I::SRC>() == size_of::<T>()`. Running: ``` rustc -C llvm-args=-x86-asm-syntax=intel -O src/lib.rs --emit asm` ``` ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` Before this commit: ``` define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` After: ``` define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 
8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang#121239, the `nuw` gets stripped off of `ptr::sub_ptr`.

`Iterator::try_fold` gets called on the underlying Iterator in `SpecInPlaceCollect::collect_in_place` whenever it does not implement `TrustedRandomAccess`. For types that impl `Drop`, LLVM currently can't tell that the drop can never occur, when using the default `Iterator::try_fold` implementation. For example, the asm from the `unwrap_clone` method is currently: ``` unwrap_clone: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov rbx, rdi mov r12, qword ptr [rsi] mov rdi, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 16] movabs rsi, -6148914691236517205 mov r14, r12 test rax, rax je .LBB0_10 lea rcx, [rax + 2*rax] lea r14, [r12 + 8*rcx] shl rax, 3 lea rax, [rax + 2*rax] xor ecx, ecx .LBB0_2: cmp qword ptr [r12 + rcx], 0 je .LBB0_4 add rcx, 24 cmp rax, rcx jne .LBB0_2 jmp .LBB0_10 .LBB0_4: lea rdx, [rax - 24] lea r14, [r12 + rcx] cmp rdx, rcx je .LBB0_10 mov qword ptr [rsp], rdi sub rax, rcx add rax, -24 mul rsi mov r15, rdx lea rbp, [r12 + rcx] add rbp, 32 shr r15, 4 mov r13, qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB0_6 .LBB0_8: add rbp, 24 dec r15 je .LBB0_9 .LBB0_6: mov rsi, qword ptr [rbp] test rsi, rsi je .LBB0_8 mov rdi, qword ptr [rbp - 8] mov edx, 1 call r13 jmp .LBB0_8 .LBB0_9: mov rdi, qword ptr [rsp] movabs rsi, -6148914691236517205 .LBB0_10: sub r14, r12 mov rax, r14 mul rsi shr rdx, 4 mov qword ptr [rbx], r12 mov qword ptr [rbx + 8], rdi mov qword ptr [rbx + 16], rdx mov rax, rbx add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret ``` After this PR: ``` unwrap_clone: mov rax, rdi movups xmm0, xmmword ptr [rsi] mov rcx, qword ptr [rsi + 16] movups xmmword ptr [rdi], xmm0 mov qword ptr [rdi + 16], rcx ret ``` Fixes rust-lang#120493

LLVM currently adds a redundant check for the returned option, in addition to the `self.ptr != self.end` check when using the default `Iterator::fold` method that calls `vec::IntoIter::next` in a loop.

jwong101 · 2024-05-19T04:03:26Z

Sorry about the long wait; I got caught up with some other things and hadn't worked on this for a while.

However, I rebased against eb1a5c9 and the binary size for image is smaller now with this PR. I doubt this was because of my fold changes, since I was still getting a bigger binary size back when I was testing this in April.

That said, could I get another perf run? The regressions should be gone now, if my testing was correct.

jhpratt · 2024-05-19T06:13:45Z

@bors try @rust-timer queue

optimize inplace collection of Vec This PR has the following changes: 1. Using `usize::unchecked_mul` in https://github.com/rust-lang/rust/blob/79424056b05eaa9563d16dfab9b9a0c8f033f220/library/alloc/src/vec/in_place_collect.rs#L262 as LLVM, does not know that the operation can't wrap, since that's the size of the original allocation. Given the following: ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` <details> <summary>Before this commit:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 ; Unnecessary calculation %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` </details> <details> <summary>After:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 
%me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` </details> Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang#121239, the `nuw` gets stripped off of `ptr::sub_ptr`. 2. `Iterator::try_fold` gets called on the underlying Iterator in `SpecInPlaceCollect::collect_in_place` whenever it does not implement `TrustedRandomAccess`. For types that impl `Drop`, LLVM currently can't tell that the drop can never occur, when using the default `Iterator::try_fold` implementation. 
For example, given the following code from rust-lang#120493 ```rust #[repr(transparent)] struct WrappedClone { inner: String } #[no_mangle] pub fn unwrap_clone(list: Vec<WrappedClone>) -> Vec<String> { list.into_iter().map(|s| s.inner).collect() } ``` <details> <summary>The asm for the `unwrap_clone` method is currently:</summary> ```asm unwrap_clone: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov rbx, rdi mov r12, qword ptr [rsi] mov rdi, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 16] movabs rsi, -6148914691236517205 mov r14, r12 test rax, rax je .LBB0_10 lea rcx, [rax + 2*rax] lea r14, [r12 + 8*rcx] shl rax, 3 lea rax, [rax + 2*rax] xor ecx, ecx .LBB0_2: cmp qword ptr [r12 + rcx], 0 je .LBB0_4 add rcx, 24 cmp rax, rcx jne .LBB0_2 jmp .LBB0_10 .LBB0_4: lea rdx, [rax - 24] lea r14, [r12 + rcx] cmp rdx, rcx je .LBB0_10 mov qword ptr [rsp], rdi sub rax, rcx add rax, -24 mul rsi mov r15, rdx lea rbp, [r12 + rcx] add rbp, 32 shr r15, 4 mov r13, qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB0_6 .LBB0_8: add rbp, 24 dec r15 je .LBB0_9 .LBB0_6: mov rsi, qword ptr [rbp] test rsi, rsi je .LBB0_8 mov rdi, qword ptr [rbp - 8] mov edx, 1 call r13 jmp .LBB0_8 .LBB0_9: mov rdi, qword ptr [rsp] movabs rsi, -6148914691236517205 .LBB0_10: sub r14, r12 mov rax, r14 mul rsi shr rdx, 4 mov qword ptr [rbx], r12 mov qword ptr [rbx + 8], rdi mov qword ptr [rbx + 16], rdx mov rax, rbx add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret ``` </details> <details> <summary>After this PR:</summary> ```asm unwrap_clone: mov rax, rdi movups xmm0, xmmword ptr [rsi] mov rcx, qword ptr [rsi + 16] movups xmmword ptr [rdi], xmm0 mov qword ptr [rdi + 16], rcx ret ``` </details> Fixes rust-lang#120493

bors · 2024-05-19T06:14:56Z

⌛ Trying commit 65e302f with merge af838b4...

bors · 2024-05-19T07:51:15Z

☀️ Try build successful - checks-actions
Build commit: af838b4 (af838b4f60958081d7eac727273db3175b17397a)

rust-timer · 2024-05-19T09:07:13Z

Finished benchmarking commit (af838b4): comparison URL.

Overall result: ✅ improvements - no action needed

Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR may lead to changes in compiler perf.

@bors rollup=never
@rustbot label: -S-waiting-on-perf -perf-regression

Instruction count

This is a highly reliable metric that was used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	-	-	0
Regressions ❌ (secondary)	-	-	0
Improvements ✅ (primary)	-19.1%	[-37.5%, -0.4%]	6
Improvements ✅ (secondary)	-1.1%	[-1.1%, -1.1%]	1
All ❌✅ (primary)	-19.1%	[-37.5%, -0.4%]	6

Max RSS (memory usage)

Results (primary -4.8%)

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	5.6%	[2.9%, 8.4%]	2
Regressions ❌ (secondary)	-	-	0
Improvements ✅ (primary)	-8.2%	[-11.7%, -3.0%]	6
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	-4.8%	[-11.7%, 8.4%]	8

Cycles

Results (primary -28.4%)

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	-	-	0
Regressions ❌ (secondary)	-	-	0
Improvements ✅ (primary)	-28.4%	[-37.8%, -0.8%]	4
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	-28.4%	[-37.8%, -0.8%]	4

Binary size

Results (primary 0.0%, secondary -0.0%)

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	0.1%	[0.0%, 0.5%]	20
Regressions ❌ (secondary)	-	-	0
Improvements ✅ (primary)	-0.1%	[-0.5%, -0.0%]	25
Improvements ✅ (secondary)	-0.0%	[-0.0%, -0.0%]	1
All ❌✅ (primary)	0.0%	[-0.5%, 0.5%]	45

Bootstrap: 671.223s -> 671.46s (0.04%)
Artifact size: 316.05 MiB -> 316.22 MiB (0.05%)

jhpratt · 2024-05-19T22:05:50Z

@bors r+ rollup=never

bors · 2024-05-19T22:05:53Z

📌 Commit 65e302f has been approved by jhpratt

It is now in the queue for this repository.

bors · 2024-05-20T00:51:15Z

⌛ Testing commit 65e302f with merge 12075f0...

bors · 2024-05-20T02:57:12Z

☀️ Test successful - checks-actions
Approved by: jhpratt
Pushing 12075f0 to master...

rust-timer · 2024-05-20T04:12:17Z

Finished benchmarking commit (12075f0): comparison URL.

Overall result: ❌✅ regressions and improvements - ACTION NEEDED

Next Steps: If you can justify the regressions found in this perf run, please indicate this with @rustbot label: +perf-regression-triaged along with sufficient written justification. If you cannot justify the regressions please open an issue or create a new PR that fixes the regressions, add a comment linking to the newly created issue or PR, and then add the perf-regression-triaged label to this PR.

@rustbot label: +perf-regression
cc @rust-lang/wg-compiler-performance

Instruction count

This is a highly reliable metric that was used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	1.4%	[0.3%, 2.9%]	3
Regressions ❌ (secondary)	0.3%	[0.2%, 0.3%]	2
Improvements ✅ (primary)	-22.7%	[-37.4%, -0.4%]	5
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	-13.7%	[-37.4%, 2.9%]	8

Max RSS (memory usage)

Results (primary -4.2%, secondary -2.6%)

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	3.3%	[2.0%, 5.1%]	3
Regressions ❌ (secondary)	-	-	0
Improvements ✅ (primary)	-7.0%	[-11.8%, -2.4%]	8
Improvements ✅ (secondary)	-2.6%	[-2.6%, -2.6%]	1
All ❌✅ (primary)	-4.2%	[-11.8%, 5.1%]	11

Cycles

Results (primary -21.9%, secondary 3.7%)

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	1.6%	[0.9%, 2.4%]	2
Regressions ❌ (secondary)	3.7%	[2.9%, 4.6%]	2
Improvements ✅ (primary)	-37.6%	[-37.7%, -37.5%]	3
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	-21.9%	[-37.7%, 2.4%]	5

Binary size

Results (primary 0.1%, secondary -0.0%)

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	0.2%	[0.0%, 1.1%]	25
Regressions ❌ (secondary)	-	-	0
Improvements ✅ (primary)	-0.1%	[-0.5%, -0.0%]	24
Improvements ✅ (secondary)	-0.0%	[-0.0%, -0.0%]	1
All ❌✅ (primary)	0.1%	[-0.5%, 1.1%]	49

Bootstrap: 669.127s -> 670.375s (0.19%)
Artifact size: 316.16 MiB -> 316.15 MiB (-0.00%)

…e8472 add some codegen tests for issue 120493 I forgot to add these in rust-lang#123878.

Rollup merge of rust-lang#125305 - jwong101:120493-codegen-test, r=the8472 add some codegen tests for issue 120493 I forgot to add these in rust-lang#123878.

rylev · 2024-05-21T12:31:14Z

Improvements vastly outweigh the regressions, which seem to have returned to baseline after this PR. I think it's safe to move on from this.

@rustbot label: +perf-regression-triaged

add some codegen tests for issue 120493 I forgot to add these in rust-lang/rust#123878.

rustbot assigned jhpratt Apr 13, 2024

rustbot added S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. T-libs Relevant to the library team, which will review and decide on the PR/issue. labels Apr 13, 2024

the8472 assigned the8472 and unassigned jhpratt Apr 13, 2024

This comment has been minimized.

Sign in to view

rustbot added the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Apr 13, 2024

This comment has been minimized.

Sign in to view

rustbot added perf-regression Performance regression. and removed S-waiting-on-perf Status: Waiting on a perf run to be completed. labels Apr 13, 2024

jhpratt reviewed Apr 13, 2024

View reviewed changes

library/alloc/src/vec/in_place_collect.rs Outdated Show resolved Hide resolved

jwong101 added 3 commits May 18, 2024 18:30

specialize Iterator::fold for vec::IntoIter

9d6b93c

LLVM currently adds a redundant check for the returned option, in addition to the `self.ptr != self.end` check when using the default `Iterator::fold` method that calls `vec::IntoIter::next` in a loop.

jwong101 force-pushed the inplacecollect branch from 4691ff7 to 9d6b93c Compare May 19, 2024 00:00

use Result::into_ok on infallible result.

65e302f

This comment has been minimized.

Sign in to view

rustbot added the S-waiting-on-perf Status: Waiting on a perf run to be completed. label May 19, 2024

This comment has been minimized.

Sign in to view

rustbot removed S-waiting-on-perf Status: Waiting on a perf run to be completed. perf-regression Performance regression. labels May 19, 2024

bors added S-waiting-on-bors Status: Waiting on bors to run and complete tests. Bors will change the label on completion. and removed S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. labels May 19, 2024

bors added the merged-by-bors This PR was explicitly merged by bors. label May 20, 2024

bors merged commit 12075f0 into rust-lang:master May 20, 2024
7 checks passed

rustbot added this to the 1.80.0 milestone May 20, 2024

jwong101 mentioned this pull request May 20, 2024

add some codegen tests for issue 120493 #125305

Merged

bors mentioned this pull request May 20, 2024

Stop re-implementing slice iterators in vec::IntoIter #124421

Draft

rustbot added the perf-regression Performance regression. label May 20, 2024

matthiaskrgr added a commit to matthiaskrgr/rust that referenced this pull request May 20, 2024

Rollup merge of rust-lang#125305 - jwong101:120493-codegen-test, r=th…

83cceea

…e8472 add some codegen tests for issue 120493 I forgot to add these in rust-lang#123878.

rustbot added the perf-regression-triaged The performance regression has been triaged. label May 21, 2024

durin42 mentioned this pull request May 21, 2024

Optimization for types implementing drop seems broken on LLVM 19 #125373

Closed

flip1995 pushed a commit to flip1995/rust-clippy that referenced this pull request May 24, 2024

Rollup merge of #125305 - jwong101:120493-codegen-test, r=the8472

244f417

add some codegen tests for issue 120493 I forgot to add these in rust-lang/rust#123878.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

optimize inplace collection of Vec #123878

optimize inplace collection of Vec #123878

jwong101 commented Apr 13, 2024 •

edited

Loading

rustbot commented Apr 13, 2024

the8472 commented Apr 13, 2024

This comment has been minimized.

bors commented Apr 13, 2024

bors commented Apr 13, 2024

This comment has been minimized.

rust-timer commented Apr 13, 2024

the8472 commented Apr 13, 2024

jwong101 commented Apr 13, 2024

jwong101 commented May 19, 2024

jhpratt commented May 19, 2024

This comment has been minimized.

bors commented May 19, 2024

bors commented May 19, 2024

This comment has been minimized.

rust-timer commented May 19, 2024

jhpratt commented May 19, 2024

bors commented May 19, 2024

bors commented May 20, 2024

bors commented May 20, 2024

rust-timer commented May 20, 2024

rylev commented May 21, 2024

optimize inplace collection of Vec #123878

optimize inplace collection of Vec #123878

Conversation

jwong101 commented Apr 13, 2024 • edited Loading

rustbot commented Apr 13, 2024

the8472 commented Apr 13, 2024

This comment has been minimized.

bors commented Apr 13, 2024

bors commented Apr 13, 2024

This comment has been minimized.

rust-timer commented Apr 13, 2024

Overall result: ❌✅ regressions and improvements - ACTION NEEDED

the8472 commented Apr 13, 2024

jwong101 commented Apr 13, 2024

jwong101 commented May 19, 2024

jhpratt commented May 19, 2024

This comment has been minimized.

bors commented May 19, 2024

bors commented May 19, 2024

This comment has been minimized.

rust-timer commented May 19, 2024

Overall result: ✅ improvements - no action needed

jhpratt commented May 19, 2024

bors commented May 19, 2024

bors commented May 20, 2024

bors commented May 20, 2024

rust-timer commented May 20, 2024

Overall result: ❌✅ regressions and improvements - ACTION NEEDED

rylev commented May 21, 2024

jwong101 commented Apr 13, 2024 •

edited

Loading