From c95253b1bac865b6d90cce186b7d665de163d50c Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Tue, 31 Oct 2023 18:38:04 +0000
Subject: [PATCH] [LLVM][SVE] Clean VLS tests to not use wide vectors as function return types.

---
 .../AArch64/sve-extract-fixed-vector.ll | 7 +-
 .../AArch64/sve-fixed-length-bitselect.ll | 7 +-
 .../AArch64/sve-fixed-length-ext-loads.ll | 166 ++---
 .../sve-fixed-length-insert-vector-elt.ll | 240 +++----
 ...ds.ll => sve-fixed-length-loads-stores.ll} | 148 +++--
 .../AArch64/sve-fixed-length-masked-loads.ll | 600 ++++++++++--------
 .../AArch64/sve-fixed-length-splat-vector.ll | 54 +-
 .../AArch64/sve-fixed-length-stores.ll | 180 ------
 llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll | 30 +-
 9 files changed, 672 insertions(+), 760 deletions(-)
 rename llvm/test/CodeGen/AArch64/{sve-fixed-length-loads.ll => sve-fixed-length-loads-stores.ll} (57%)
 delete mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll

diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index 79962d441d1048..0dd7320413a147 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -446,7 +446,7 @@ define <2 x i64> @extract_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind
   ret <2 x i64> %retval
 }
 
-define <4 x i64> @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind #0 {
+define void @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, ptr %p) nounwind #0 {
 ; CHECK-LABEL: extract_fixed_v4i64_nxv2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
@@ -454,12 +454,13 @@ define <4 x i64> @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
   %retval = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> %vec, i64 4)
-  ret <4 x i64> %retval
+  store <4 x i64> %retval, ptr %p
+  ret void
 }
 
 ; Check that extract from load via bitcast-gep-of-scalar-ptr does not crash.
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
index 3fdd08701053eb..fb494afa11de26 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
@@ -9,7 +9,7 @@ target triple = "aarch64"
 ; this is implemented, this test will be fleshed out.
; -define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %right_ptr) #0 { +define void @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %right_ptr, ptr %result_ptr) #0 { ; CHECK-LABEL: fixed_bitselect_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -22,7 +22,7 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r ; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: and z1.d, z1.d, z3.d ; CHECK-NEXT: orr z0.d, z1.d, z0.d -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x3] ; CHECK-NEXT: ret %pre_cond = load <8 x i32>, ptr %pre_cond_ptr %left = load <8 x i32>, ptr %left_ptr @@ -33,7 +33,8 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r %left_bits_0 = and <8 x i32> %neg_cond, %left %right_bits_0 = and <8 x i32> %min_cond, %right %bsl0000 = or <8 x i32> %right_bits_0, %left_bits_0 - ret <8 x i32> %bsl0000 + store <8 x i32> %bsl0000, ptr %result_ptr + ret void } attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll index 94aef218d4de31..f6ed2e6a787f01 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll @@ -34,62 +34,66 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) #0 { ret <2 x i256> %val } -define <8 x i32> @load_zext_v8i16i32(ptr %ap) vscale_range(2,0) #0 { +define void @load_zext_v8i16i32(ptr %ap, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: load_zext_v8i16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %ap %val = zext <8 x i16> %a to <8 x i32> - ret <8 x i32> %val + store <8 x i32> %val, ptr %b + ret void } -define <16 x i32> @load_zext_v16i16i32(ptr %ap) vscale_range(4,0) #0 { +define void @load_zext_v16i16i32(ptr %ap, ptr %b) vscale_range(4,0) #0 { ; CHECK-LABEL: load_zext_v16i16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %ap %val = zext <16 x i16> %a to <16 x i32> - ret <16 x i32> %val + store <16 x i32> %val, ptr %b + ret void } -define <32 x i32> @load_zext_v32i16i32(ptr %ap) vscale_range(8,0) #0 { +define void @load_zext_v32i16i32(ptr %ap, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: load_zext_v32i16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %ap %val = zext <32 x i16> %a to <32 x i32> - ret <32 x i32> %val + store <32 x i32> %val, ptr %b + ret void } -define <64 x i32> @load_zext_v64i16i32(ptr %ap) #0 { +define void @load_zext_v64i16i32(ptr %ap, ptr %b) #0 { ; VBITS_GE_1024-LABEL: load_zext_v64i16i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20 -; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_1024-NEXT: mov x8, #32 // =0x20 +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_1024-NEXT: ld1h { z1.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_1024-NEXT: 
st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_zext_v64i16i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 ; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %a = load <64 x i16>, ptr %ap %val = zext <64 x i16> %a to <64 x i32> - ret <64 x i32> %val + store <64 x i32> %val, ptr %b + ret void } define <4 x i32> @load_sext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { @@ -103,196 +107,206 @@ define <4 x i32> @load_sext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { ret <4 x i32> %val } -define <8 x i32> @load_sext_v8i16i32(ptr %ap) vscale_range(2,0) #0 { +define void @load_sext_v8i16i32(ptr %ap, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: load_sext_v8i16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %ap %val = sext <8 x i16> %a to <8 x i32> - ret <8 x i32> %val + store <8 x i32> %val, ptr %b + ret void } -define <16 x i32> @load_sext_v16i16i32(ptr %ap) vscale_range(4,0) #0 { +define void @load_sext_v16i16i32(ptr %ap, ptr %b) vscale_range(4,0) #0 { ; CHECK-LABEL: load_sext_v16i16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %ap %val = sext <16 x i16> %a to <16 x i32> - ret <16 x i32> %val + store <16 x i32> %val, ptr %b + ret void } -define <32 x i32> @load_sext_v32i16i32(ptr %ap) vscale_range(8,0) #0 { +define void @load_sext_v32i16i32(ptr %ap, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: load_sext_v32i16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %ap %val = sext <32 x i16> %a to <32 x i32> - ret <32 x i32> %val + store <32 x i32> %val, ptr %b + ret void } -define <64 x i32> @load_sext_v64i16i32(ptr %ap) #0 { +define void @load_sext_v64i16i32(ptr %ap, ptr %b) #0 { ; VBITS_GE_1024-LABEL: load_sext_v64i16i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20 -; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_1024-NEXT: mov x8, #32 // =0x20 +; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_1024-NEXT: ld1sh { z1.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_sext_v64i16i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 ; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %a = load <64 x i16>, ptr %ap %val = sext <64 x i16> %a to <64 x i32> - ret <64 x i32> %val + store <64 x i32> %val, ptr %b + ret void } -define <32 x i64> @load_zext_v32i8i64(ptr %ap) #0 { +define void @load_zext_v32i8i64(ptr %ap, ptr %b) #0 { ; VBITS_GE_1024-LABEL: load_zext_v32i8i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: 
ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: mov w9, #16 // =0x10 -; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [x0, x9] +; VBITS_GE_1024-NEXT: mov w8, #16 // =0x10 +; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [x0, x8] ; VBITS_GE_1024-NEXT: ld1b { z1.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 -; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_zext_v32i8i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i8>, ptr %ap %val = zext <32 x i8> %a to <32 x i64> - ret <32 x i64> %val + store <32 x i64> %val, ptr %b + ret void } -define <32 x i64> @load_sext_v32i8i64(ptr %ap) #0 { +define void @load_sext_v32i8i64(ptr %ap, ptr %b) #0 { ; VBITS_GE_1024-LABEL: load_sext_v32i8i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: mov w9, #16 // =0x10 -; VBITS_GE_1024-NEXT: ld1sb { z0.d }, p0/z, [x0, x9] +; VBITS_GE_1024-NEXT: mov w8, #16 // =0x10 +; VBITS_GE_1024-NEXT: ld1sb { z0.d }, p0/z, [x0, x8] ; VBITS_GE_1024-NEXT: ld1sb { z1.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 -; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_sext_v32i8i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i8>, ptr %ap %val = sext <32 x i8> %a to <32 x i64> - ret <32 x i64> %val + store <32 x i64> %val, ptr %b + ret void } -define <32 x i64> @load_zext_v32i16i64(ptr %ap) #0 { +define void @load_zext_v32i16i64(ptr %ap, ptr %b) #0 { ; VBITS_GE_1024-LABEL: load_zext_v32i16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 -; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_1024-NEXT: ld1h { z1.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_zext_v32i16i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i16>, ptr %ap %val = zext <32 x i16> %a to <32 x i64> - ret <32 x i64> %val + store <32 x i64> %val, ptr %b + ret void } -define <32 x i64> @load_sext_v32i16i64(ptr %ap) #0 { +define void @load_sext_v32i16i64(ptr %ap, ptr %b) #0 { ; VBITS_GE_1024-LABEL: load_sext_v32i16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 
-; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 -; VBITS_GE_1024-NEXT: ld1sh { z0.d }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_1024-NEXT: ld1sh { z0.d }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_1024-NEXT: ld1sh { z1.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_sext_v32i16i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i16>, ptr %ap %val = sext <32 x i16> %a to <32 x i64> - ret <32 x i64> %val + store <32 x i64> %val, ptr %b + ret void } -define <32 x i64> @load_zext_v32i32i64(ptr %ap) #0 { +define void @load_zext_v32i32i64(ptr %ap, ptr %b) #0 { ; VBITS_GE_1024-LABEL: load_zext_v32i32i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 -; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_1024-NEXT: ld1w { z1.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_zext_v32i32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i32>, ptr %ap %val = zext <32 x i32> %a to <32 x i64> - ret <32 x i64> %val + store <32 x i64> %val, ptr %b + ret void } -define <32 x i64> @load_sext_v32i32i64(ptr %ap) #0 { +define void @load_sext_v32i32i64(ptr %ap, ptr %b) #0 { ; VBITS_GE_1024-LABEL: load_sext_v32i32i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10 -; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_1024-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_1024-NEXT: ld1sw { z1.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_sext_v32i32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i32>, ptr %ap %val = sext <32 x i32> %a to <32 x i64> - ret <32 x i64> %val + store <32 x i64> %val, ptr %b + ret void } attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll index 2dd06e08d16b63..977c528e2583af 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -18,8 +18,8 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) vscale_range(2,0) #0 { ; CHECK-NEXT: mov v0.h[3], v1.h[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret - %r = insertelement <4 x half> %op1, half 5.0, i64 3 - ret <4 x half> %r + %r = insertelement <4 x half> %op1, half 5.0, i64 3 + ret <4 x half> %r } ; Don't use SVE for 128-bit vectors. @@ -29,101 +29,105 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 { ; CHECK-NEXT: fmov h1, #5.00000000 ; CHECK-NEXT: mov v0.h[7], v1.h[0] ; CHECK-NEXT: ret - %r = insertelement <8 x half> %op1, half 5.0, i64 7 - ret <8 x half> %r + %r = insertelement <8 x half> %op1, half 5.0, i64 7 + ret <8 x half> %r } -define <16 x half> @insertelement_v16f16(ptr %a) vscale_range(2,0) #0 { +define void @insertelement_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: mov w9, #15 // =0xf +; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z0.h, #0, #1 ; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: mov z1.h, w9 +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h ; CHECK-NEXT: fmov h0, #5.00000000 ; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x8] +; CHECK-NEXT: st1h { z2.h }, p0, [x1] ; CHECK-NEXT: ret - %op1 = load <16 x half>, ptr %a - %r = insertelement <16 x half> %op1, half 5.0, i64 15 - ret <16 x half> %r + %op1 = load <16 x half>, ptr %a + %r = insertelement <16 x half> %op1, half 5.0, i64 15 + store <16 x half> %r, ptr %b + ret void } -define <32 x half> @insertelement_v32f16(ptr %a) #0 { +define void @insertelement_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v32f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov w9, #15 // =0xf +; VBITS_GE_256-NEXT: mov w8, #15 // =0xf ; VBITS_GE_256-NEXT: index z0.h, #0, #1 ; VBITS_GE_256-NEXT: ptrue p1.h -; VBITS_GE_256-NEXT: mov z1.h, w9 -; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov z1.h, w8 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: fmov h2, #5.00000000 -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: mov z3.h, p1/m, h2 -; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x8, x9, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v32f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: mov w9, #31 // =0x1f +; VBITS_GE_512-NEXT: mov w8, #31 // =0x1f ; VBITS_GE_512-NEXT: index z0.h, #0, #1 ; VBITS_GE_512-NEXT: ptrue p1.h -; VBITS_GE_512-NEXT: mov z1.h, w9 +; VBITS_GE_512-NEXT: mov z1.h, w8 ; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h ; VBITS_GE_512-NEXT: fmov h0, #5.00000000 ; VBITS_GE_512-NEXT: mov z2.h, p1/m, h0 -; VBITS_GE_512-NEXT: st1h { z2.h }, p0, [x8] +; VBITS_GE_512-NEXT: st1h { z2.h }, p0, [x1] ; VBITS_GE_512-NEXT: ret - %op1 = load <32 x half>, ptr %a - %r = insertelement <32 x half> %op1, half 5.0, i64 31 - ret <32 x half> %r + %op1 = load <32 x half>, ptr %a + %r = 
insertelement <32 x half> %op1, half 5.0, i64 31 + store <32 x half> %r, ptr %b + ret void } -define <64 x half> @insertelement_v64f16(ptr %a) vscale_range(8,0) #0 { +define void @insertelement_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v64f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: mov w9, #63 // =0x3f +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: index z0.h, #0, #1 ; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: mov z1.h, w9 +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h ; CHECK-NEXT: fmov h0, #5.00000000 ; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x8] +; CHECK-NEXT: st1h { z2.h }, p0, [x1] ; CHECK-NEXT: ret - %op1 = load <64 x half>, ptr %a - %r = insertelement <64 x half> %op1, half 5.0, i64 63 - ret <64 x half> %r + %op1 = load <64 x half>, ptr %a + %r = insertelement <64 x half> %op1, half 5.0, i64 63 + store <64 x half> %r, ptr %b + ret void } -define <128 x half> @insertelement_v128f16(ptr %a) vscale_range(16,0) #0 { +define void @insertelement_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v128f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: mov w9, #127 // =0x7f +; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: index z0.h, #0, #1 ; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: mov z1.h, w9 +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h ; CHECK-NEXT: fmov h0, #5.00000000 ; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x8] +; CHECK-NEXT: st1h { z2.h }, p0, [x1] ; CHECK-NEXT: ret - %op1 = load <128 x half>, ptr %a - %r = insertelement <128 x half> %op1, half 5.0, i64 127 - ret <128 x half> %r + %op1 = load <128 x half>, ptr %a + %r = insertelement <128 x half> %op1, half 5.0, i64 127 + store <128 x half> %r, ptr %b + ret void } ; Don't use SVE for 64-bit vectors. @@ -135,8 +139,8 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) vscale_range(2,0) #0 { ; CHECK-NEXT: mov v0.s[1], v1.s[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret - %r = insertelement <2 x float> %op1, float 5.0, i64 1 - ret <2 x float> %r + %r = insertelement <2 x float> %op1, float 5.0, i64 1 + ret <2 x float> %r } ; Don't use SVE for 128-bit vectors. 
@@ -146,101 +150,105 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 { ; CHECK-NEXT: fmov s1, #5.00000000 ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: ret - %r = insertelement <4 x float> %op1, float 5.0, i64 3 - ret <4 x float> %r + %r = insertelement <4 x float> %op1, float 5.0, i64 3 + ret <4 x float> %r } -define <8 x float> @insertelement_v8f32(ptr %a) vscale_range(2,0) #0 { +define void @insertelement_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z0.s, #0, #1 ; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: mov z1.s, w9 +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s ; CHECK-NEXT: fmov s0, #5.00000000 ; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x1] ; CHECK-NEXT: ret - %op1 = load <8 x float>, ptr %a - %r = insertelement <8 x float> %op1, float 5.0, i64 7 - ret <8 x float> %r + %op1 = load <8 x float>, ptr %a + %r = insertelement <8 x float> %op1, float 5.0, i64 7 + store <8 x float> %r, ptr %b + ret void } -define <16 x float> @insertelement_v16f32(ptr %a) #0 { +define void @insertelement_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov w9, #7 // =0x7 +; VBITS_GE_256-NEXT: mov w8, #7 // =0x7 ; VBITS_GE_256-NEXT: index z0.s, #0, #1 ; VBITS_GE_256-NEXT: ptrue p1.s -; VBITS_GE_256-NEXT: mov z1.s, w9 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov z1.s, w8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: fmov s2, #5.00000000 -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: mov z3.s, p1/m, s2 -; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: mov w9, #15 // =0xf +; VBITS_GE_512-NEXT: mov w8, #15 // =0xf ; VBITS_GE_512-NEXT: index z0.s, #0, #1 ; VBITS_GE_512-NEXT: ptrue p1.s -; VBITS_GE_512-NEXT: mov z1.s, w9 +; VBITS_GE_512-NEXT: mov z1.s, w8 ; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s ; VBITS_GE_512-NEXT: fmov s0, #5.00000000 ; VBITS_GE_512-NEXT: mov z2.s, p1/m, s0 -; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret - %op1 = load <16 x float>, ptr %a - %r = insertelement <16 x float> %op1, float 5.0, i64 15 - ret <16 x float> %r + %op1 = load <16 x float>, ptr %a + %r = insertelement <16 x float> %op1, float 5.0, i64 15 + store <16 x float> %r, ptr %b + ret void } -define <32 x float> @insertelement_v32f32(ptr %a) vscale_range(8,0) #0 { +define void @insertelement_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v32f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: mov w9, #31 // =0x1f +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: index z0.s, #0, #1 ; CHECK-NEXT: ptrue 
p1.s -; CHECK-NEXT: mov z1.s, w9 +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s ; CHECK-NEXT: fmov s0, #5.00000000 ; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x1] ; CHECK-NEXT: ret - %op1 = load <32 x float>, ptr %a - %r = insertelement <32 x float> %op1, float 5.0, i64 31 - ret <32 x float> %r + %op1 = load <32 x float>, ptr %a + %r = insertelement <32 x float> %op1, float 5.0, i64 31 + store <32 x float> %r, ptr %b + ret void } -define <64 x float> @insertelement_v64f32(ptr %a) vscale_range(16,0) #0 { +define void @insertelement_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v64f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: mov w9, #63 // =0x3f +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: index z0.s, #0, #1 ; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: mov z1.s, w9 +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s ; CHECK-NEXT: fmov s0, #5.00000000 ; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x1] ; CHECK-NEXT: ret - %op1 = load <64 x float>, ptr %a - %r = insertelement <64 x float> %op1, float 5.0, i64 63 - ret <64 x float> %r + %op1 = load <64 x float>, ptr %a + %r = insertelement <64 x float> %op1, float 5.0, i64 63 + store <64 x float> %r, ptr %b + ret void } ; Don't use SVE for 64-bit vectors. @@ -250,8 +258,8 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 ; CHECK-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret - %r = insertelement <1 x double> %op1, double 5.0, i64 0 - ret <1 x double> %r + %r = insertelement <1 x double> %op1, double 5.0, i64 0 + ret <1 x double> %r } ; Don't use SVE for 128-bit vectors. 
@@ -261,101 +269,105 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 ; CHECK-NEXT: fmov d1, #5.00000000 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret - %r = insertelement <2 x double> %op1, double 5.0, i64 1 - ret <2 x double> %r + %r = insertelement <2 x double> %op1, double 5.0, i64 1 + ret <2 x double> %r } -define <4 x double> @insertelement_v4f64(ptr %a) vscale_range(2,0) #0 { +define void @insertelement_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z0.d, #0, #1 ; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: mov z1.d, x9 +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d ; CHECK-NEXT: fmov d0, #5.00000000 ; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x8] +; CHECK-NEXT: st1d { z2.d }, p0, [x1] ; CHECK-NEXT: ret - %op1 = load <4 x double>, ptr %a - %r = insertelement <4 x double> %op1, double 5.0, i64 3 - ret <4 x double> %r + %op1 = load <4 x double>, ptr %a + %r = insertelement <4 x double> %op1, double 5.0, i64 3 + store <4 x double> %r, ptr %b + ret void } -define <8 x double> @insertelement_v8f64(ptr %a) #0 { +define void @insertelement_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov w9, #3 // =0x3 +; VBITS_GE_256-NEXT: mov w8, #3 // =0x3 ; VBITS_GE_256-NEXT: index z0.d, #0, #1 ; VBITS_GE_256-NEXT: ptrue p1.d -; VBITS_GE_256-NEXT: mov z1.d, x9 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov z1.d, x8 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: fmov d2, #5.00000000 -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: mov z3.d, p1/m, d2 -; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: mov w9, #7 // =0x7 +; VBITS_GE_512-NEXT: mov w8, #7 // =0x7 ; VBITS_GE_512-NEXT: index z0.d, #0, #1 ; VBITS_GE_512-NEXT: ptrue p1.d -; VBITS_GE_512-NEXT: mov z1.d, x9 +; VBITS_GE_512-NEXT: mov z1.d, x8 ; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d ; VBITS_GE_512-NEXT: fmov d0, #5.00000000 ; VBITS_GE_512-NEXT: mov z2.d, p1/m, d0 -; VBITS_GE_512-NEXT: st1d { z2.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z2.d }, p0, [x1] ; VBITS_GE_512-NEXT: ret - %op1 = load <8 x double>, ptr %a - %r = insertelement <8 x double> %op1, double 5.0, i64 7 - ret <8 x double> %r + %op1 = load <8 x double>, ptr %a + %r = insertelement <8 x double> %op1, double 5.0, i64 7 + store <8 x double> %r, ptr %b + ret void } -define <16 x double> @insertelement_v16f64(ptr %a) vscale_range(8,0) #0 { +define void @insertelement_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: mov w9, #15 // =0xf +; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z0.d, #0, #1 ; 
CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: mov z1.d, x9 +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d ; CHECK-NEXT: fmov d0, #5.00000000 ; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x8] +; CHECK-NEXT: st1d { z2.d }, p0, [x1] ; CHECK-NEXT: ret - %op1 = load <16 x double>, ptr %a - %r = insertelement <16 x double> %op1, double 5.0, i64 15 - ret <16 x double> %r + %op1 = load <16 x double>, ptr %a + %r = insertelement <16 x double> %op1, double 5.0, i64 15 + store <16 x double> %r, ptr %b + ret void } -define <32 x double> @insertelement_v32f64(ptr %a) vscale_range(16,0) #0 { +define void @insertelement_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v32f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: mov w9, #31 // =0x1f +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: index z0.d, #0, #1 ; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: mov z1.d, x9 +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d ; CHECK-NEXT: fmov d0, #5.00000000 ; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x8] +; CHECK-NEXT: st1d { z2.d }, p0, [x1] ; CHECK-NEXT: ret - %op1 = load <32 x double>, ptr %a - %r = insertelement <32 x double> %op1, double 5.0, i64 31 - ret <32 x double> %r + %op1 = load <32 x double>, ptr %a + %r = insertelement <32 x double> %op1, double 5.0, i64 31 + store <32 x double> %r, ptr %b + ret void } attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll similarity index 57% rename from llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll rename to llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll index 8c574f8e4716a7..becddaea31267a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll @@ -19,178 +19,186 @@ target triple = "aarch64-unknown-linux-gnu" ; Don't use SVE for 64-bit vectors. -define <2 x float> @load_v2f32(ptr %a) #0 { +define void @load_v2f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: load_v2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret %load = load <2 x float>, ptr %a - ret <2 x float> %load + store <2 x float> %load, ptr %b + ret void } ; Don't use SVE for 128-bit vectors. 
-define <4 x float> @load_v4f32(ptr %a) #0 { +define void @load_v4f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: load_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %load = load <4 x float>, ptr %a - ret <4 x float> %load + store <4 x float> %load, ptr %b + ret void } -define <8 x float> @load_v8f32(ptr %a) #0 { +define void @load_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: load_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %load = load <8 x float>, ptr %a - ret <8 x float> %load + store <8 x float> %load, ptr %b + ret void } -define <16 x float> @load_v16f32(ptr %a) #0 { +define void @load_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: load_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: load_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret ; ; VBITS_GE_1024-LABEL: load_v16f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_v16f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl16 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %load = load <16 x float>, ptr %a - ret <16 x float> %load + store <16 x float> %load, ptr %b + ret void } -define <32 x float> @load_v32f32(ptr %a) #0 { +define void @load_v32f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: load_v32f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: mov x9, #24 // =0x18 -; VBITS_GE_256-NEXT: mov x10, #16 // =0x10 -; VBITS_GE_256-NEXT: mov x11, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: mov x10, #8 // =0x8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x10, lsl #2] -; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x11, lsl #2] -; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { 
z3.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: load_v32f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: mov x9, #16 // =0x10 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_512-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret ; ; VBITS_GE_1024-LABEL: load_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %load = load <32 x float>, ptr %a - ret <32 x float> %load + store <32 x float> %load, ptr %b + ret void } -define <64 x float> @load_v64f32(ptr %a) #0 { +define void @load_v64f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: load_v64f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x9, #24 // =0x18 ; VBITS_GE_256-NEXT: mov x10, #16 // =0x10 -; VBITS_GE_256-NEXT: mov x11, #24 // =0x18 -; VBITS_GE_256-NEXT: mov x12, #56 // =0x38 -; VBITS_GE_256-NEXT: mov x13, #32 // =0x20 -; VBITS_GE_256-NEXT: mov x14, #48 // =0x30 -; VBITS_GE_256-NEXT: mov x15, #40 // =0x28 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x12, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x14, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x15, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #48 // =0x30 +; VBITS_GE_256-NEXT: mov x12, #40 // =0x28 +; VBITS_GE_256-NEXT: mov x13, #56 // =0x38 +; VBITS_GE_256-NEXT: mov x14, #32 // =0x20 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x12, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2] -; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x14, lsl #2] -; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x15, lsl #2] -; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x13, lsl #2] -; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x11, lsl #2] -; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x10, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, 
[x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: load_v64f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: mov x8, #32 // =0x20 ; VBITS_GE_512-NEXT: mov x9, #48 // =0x30 -; VBITS_GE_512-NEXT: mov x10, #32 // =0x20 -; VBITS_GE_512-NEXT: mov x11, #16 // =0x10 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] -; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_512-NEXT: mov x10, #16 // =0x10 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] ; VBITS_GE_512-NEXT: ld1w { z3.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x8, x10, lsl #2] -; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x8, x11, lsl #2] -; VBITS_GE_512-NEXT: st1w { z3.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_512-NEXT: st1w { z3.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret ; ; VBITS_GE_1024-LABEL: load_v64f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20 -; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_1024-NEXT: mov x8, #32 // =0x20 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_1024-NEXT: ret ; ; VBITS_GE_2048-LABEL: load_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_2048-NEXT: ret %load = load <64 x float>, ptr %a - ret <64 x float> %load + store <64 x float> %load, ptr %b + ret void } attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll index 5dfce78af18b8e..fee233643a8e56 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -68,7 +68,7 @@ define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 { ret <4 x float> %load } -define <8 x float> @masked_load_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { +define void @masked_load_v8f32(ptr %ap, ptr %bp, ptr %c) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_load_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 @@ -76,30 +76,31 @@ define <8 x float> @masked_load_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %a = load <8 x 
float>, ptr %ap %b = load <8 x float>, ptr %bp %mask = fcmp oeq <8 x float> %a, %b %load = call <8 x float> @llvm.masked.load.v8f32(ptr %ap, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer) - ret <8 x float> %load + store <8 x float> %load, ptr %c + ret void } -define <16 x float> @masked_load_v16f32(ptr %ap, ptr %bp) #0 { +define void @masked_load_v16f32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s ; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z3.s -; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x0] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_v16f32: @@ -109,16 +110,17 @@ define <16 x float> @masked_load_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <16 x float>, ptr %ap %b = load <16 x float>, ptr %bp %mask = fcmp oeq <16 x float> %a, %b %load = call <16 x float> @llvm.masked.load.v16f32(ptr %ap, i32 8, <16 x i1> %mask, <16 x float> zeroinitializer) - ret <16 x float> %load + store <16 x float> %load, ptr %c + ret void } -define <32 x float> @masked_load_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 { +define void @masked_load_v32f32(ptr %ap, ptr %bp, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: masked_load_v32f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 @@ -126,16 +128,17 @@ define <32 x float> @masked_load_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %a = load <32 x float>, ptr %ap %b = load <32 x float>, ptr %bp %mask = fcmp oeq <32 x float> %a, %b %load = call <32 x float> @llvm.masked.load.v32f32(ptr %ap, i32 8, <32 x i1> %mask, <32 x float> zeroinitializer) - ret <32 x float> %load + store <32 x float> %load, ptr %c + ret void } -define <64 x float> @masked_load_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_v64f32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_v64f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 @@ -143,30 +146,31 @@ define <64 x float> @masked_load_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %a = load <64 x float>, ptr %ap 
%b = load <64 x float>, ptr %bp %mask = fcmp oeq <64 x float> %a, %b %load = call <64 x float> @llvm.masked.load.v64f32(ptr %ap, i32 8, <64 x i1> %mask, <64 x float> zeroinitializer) - ret <64 x float> %load + store <64 x float> %load, ptr %c + ret void } -define <64 x i8> @masked_load_v64i8(ptr %ap, ptr %bp) #0 { +define void @masked_load_v64i8(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_v64i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov w9, #32 // =0x20 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b ; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z1.b, z3.b -; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p2/z, [x0] -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x8, x9] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x8] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x2, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_v64i8: @@ -176,30 +180,31 @@ define <64 x i8> @masked_load_v64i8(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b ; VBITS_GE_512-NEXT: ld1b { z0.b }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x8] +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <64 x i8>, ptr %ap %b = load <64 x i8>, ptr %bp %mask = icmp eq <64 x i8> %a, %b %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef) - ret <64 x i8> %load + store <64 x i8> %load, ptr %c + ret void } -define <32 x i16> @masked_load_v32i16(ptr %ap, ptr %bp) #0 { +define void @masked_load_v32i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_v32i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h ; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, z3.h -; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p2/z, [x0] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_v32i16: @@ -209,30 +214,31 @@ define <32 x i16> @masked_load_v32i16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h ; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <32 x i16>, ptr %ap %b = 
load <32 x i16>, ptr %bp %mask = icmp eq <32 x i16> %a, %b %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef) - ret <32 x i16> %load + store <32 x i16> %load, ptr %c + ret void } -define <16 x i32> @masked_load_v16i32(ptr %ap, ptr %bp) #0 { +define void @masked_load_v16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s ; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, z3.s -; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x0] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_v16i32: @@ -242,30 +248,31 @@ define <16 x i32> @masked_load_v16i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s ; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, ptr %ap %b = load <16 x i32>, ptr %bp %mask = icmp eq <16 x i32> %a, %b %load = call <16 x i32> @llvm.masked.load.v16i32(ptr %ap, i32 8, <16 x i1> %mask, <16 x i32> undef) - ret <16 x i32> %load + store <16 x i32> %load, ptr %c + ret void } -define <8 x i64> @masked_load_v8i64(ptr %ap, ptr %bp) #0 { +define void @masked_load_v8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d -; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_v8i64: @@ -275,32 +282,33 @@ define <8 x i64> @masked_load_v8i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret 
%a = load <8 x i64>, ptr %ap %b = load <8 x i64>, ptr %bp %mask = icmp eq <8 x i64> %a, %b %load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> undef) - ret <8 x i64> %load + store <8 x i64> %load, ptr %c + ret void } -define <8 x i64> @masked_load_passthru_v8i64(ptr %ap, ptr %bp) #0 { +define void @masked_load_passthru_v8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_passthru_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d -; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0] ; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d ; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z3.d -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_passthru_v8i64: @@ -311,32 +319,33 @@ define <8 x i64> @masked_load_passthru_v8i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, ptr %ap %b = load <8 x i64>, ptr %bp %mask = icmp eq <8 x i64> %a, %b %load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> %b) - ret <8 x i64> %load + store <8 x i64> %load, ptr %c + ret void } -define <8 x double> @masked_load_passthru_v8f64(ptr %ap, ptr %bp) #0 { +define void @masked_load_passthru_v8f64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_passthru_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d ; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z3.d -; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0] ; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d ; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z3.d -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_passthru_v8f64: @@ -347,20 +356,21 @@ define <8 x double> 
@masked_load_passthru_v8f64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <8 x double>, ptr %ap %b = load <8 x double>, ptr %bp %mask = fcmp oeq <8 x double> %a, %b %load = call <8 x double> @llvm.masked.load.v8f64(ptr %ap, i32 8, <8 x i1> %mask, <8 x double> %b) - ret <8 x double> %load + store <8 x double> %load, ptr %c + ret void } -define <32 x i16> @masked_load_sext_v32i8i16(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -368,8 +378,8 @@ define <32 x i16> @masked_load_sext_v32i8i16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16: @@ -378,21 +388,22 @@ define <32 x i16> @masked_load_sext_v32i8i16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0 ; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <32 x i8>, ptr %bp %mask = icmp eq <32 x i8> %b, zeroinitializer %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef) %ext = sext <32 x i8> %load to <32 x i16> - ret <32 x i16> %ext + store <32 x i16> %ext, ptr %c + ret void } -define <16 x i32> @masked_load_sext_v16i8i32(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: ldr q0, [x1] -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -402,8 +413,8 @@ define <16 x i32> @masked_load_sext_v16i8i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32: @@ -412,21 +423,22 @@ define <16 x i32> @masked_load_sext_v16i8i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1b { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = 
load <16 x i8>, ptr %bp %mask = icmp eq <16 x i8> %b, zeroinitializer %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef) %ext = sext <16 x i8> %load to <16 x i32> - ret <16 x i32> %ext + store <16 x i32> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_sext_v8i8i64(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: ldr d0, [x1] -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -437,8 +449,8 @@ define <8 x i64> @masked_load_sext_v8i8i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64: @@ -447,20 +459,21 @@ define <8 x i64> @masked_load_sext_v8i8i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i8>, ptr %bp %mask = icmp eq <8 x i8> %b, zeroinitializer %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef) %ext = sext <8 x i8> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <16 x i32> @masked_load_sext_v16i16i32(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -468,8 +481,8 @@ define <16 x i32> @masked_load_sext_v16i16i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32: @@ -478,21 +491,22 @@ define <16 x i32> @masked_load_sext_v16i16i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <16 x i16>, ptr %bp %mask = icmp eq <16 x i16> %b, zeroinitializer %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef) %ext = sext <16 x i16> %load to <16 x i32> - ret <16 x i32> %ext + store <16 x i32> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_sext_v8i16i64(ptr 
%ap, ptr %bp) #0 { +define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: ldr q0, [x1] -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -502,8 +516,8 @@ define <8 x i64> @masked_load_sext_v8i16i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64: @@ -512,20 +526,21 @@ define <8 x i64> @masked_load_sext_v8i16i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i16>, ptr %bp %mask = icmp eq <8 x i16> %b, zeroinitializer %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef) %ext = sext <8 x i16> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_sext_v8i32i64(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -533,8 +548,8 @@ define <8 x i64> @masked_load_sext_v8i32i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64: @@ -543,20 +558,21 @@ define <8 x i64> @masked_load_sext_v8i32i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, ptr %bp %mask = icmp eq <8 x i32> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) %ext = sext <8 x i32> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <32 x i16> @masked_load_zext_v32i8i16(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, 
[x1] ; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -564,8 +580,8 @@ define <32 x i16> @masked_load_zext_v32i8i16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16: @@ -574,21 +590,22 @@ define <32 x i16> @masked_load_zext_v32i8i16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0 ; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <32 x i8>, ptr %bp %mask = icmp eq <32 x i8> %b, zeroinitializer %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef) %ext = zext <32 x i8> %load to <32 x i16> - ret <32 x i16> %ext + store <32 x i16> %ext, ptr %c + ret void } -define <16 x i32> @masked_load_zext_v16i8i32(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: ldr q0, [x1] -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -598,8 +615,8 @@ define <16 x i32> @masked_load_zext_v16i8i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32: @@ -608,21 +625,22 @@ define <16 x i32> @masked_load_zext_v16i8i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1b { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <16 x i8>, ptr %bp %mask = icmp eq <16 x i8> %b, zeroinitializer %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef) %ext = zext <16 x i8> %load to <16 x i32> - ret <16 x i32> %ext + store <16 x i32> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_zext_v8i8i64(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: ldr d0, [x1] -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -633,8 +651,8 @@ define <8 x i64> @masked_load_zext_v8i8i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: 
uunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64: @@ -643,20 +661,21 @@ define <8 x i64> @masked_load_zext_v8i8i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i8>, ptr %bp %mask = icmp eq <8 x i8> %b, zeroinitializer %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef) %ext = zext <8 x i8> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <16 x i32> @masked_load_zext_v16i16i32(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -664,8 +683,8 @@ define <16 x i32> @masked_load_zext_v16i16i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32: @@ -674,21 +693,22 @@ define <16 x i32> @masked_load_zext_v16i16i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <16 x i16>, ptr %bp %mask = icmp eq <16 x i16> %b, zeroinitializer %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef) %ext = zext <16 x i16> %load to <16 x i32> - ret <16 x i32> %ext + store <16 x i32> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_zext_v8i16i64(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: ldr q0, [x1] -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -698,8 +718,8 @@ define <8 x i64> @masked_load_zext_v8i16i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64: @@ -708,20 +728,21 
@@ define <8 x i64> @masked_load_zext_v8i16i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i16>, ptr %bp %mask = icmp eq <8 x i16> %b, zeroinitializer %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef) %ext = zext <8 x i16> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_zext_v8i32i64(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -729,8 +750,8 @@ define <8 x i64> @masked_load_zext_v8i32i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64: @@ -739,21 +760,22 @@ define <8 x i64> @masked_load_zext_v8i32i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, ptr %bp %mask = icmp eq <8 x i32> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) %ext = zext <8 x i32> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, #0 @@ -769,8 +791,8 @@ define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16: @@ -779,21 +801,22 @@ define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0 ; VBITS_GE_512-NEXT: ld1sb { z0.h }, 
p1/z, [x0] -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <32 x i16>, ptr %bp %mask = icmp eq <32 x i16> %b, zeroinitializer %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef) %ext = sext <32 x i8> %load to <32 x i16> - ret <32 x i16> %ext + store <32 x i16> %ext, ptr %c + ret void } -define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0 @@ -812,8 +835,8 @@ define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32_m32: @@ -822,21 +845,22 @@ define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <16 x i32>, ptr %bp %mask = icmp eq <16 x i32> %b, zeroinitializer %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef) %ext = sext <16 x i8> %load to <16 x i32> - ret <16 x i32> %ext + store <16 x i32> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 @@ -857,8 +881,8 @@ define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64_m64: @@ -867,21 +891,22 @@ define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; 
VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i64>, ptr %bp %mask = icmp eq <8 x i64> %b, zeroinitializer %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef) %ext = sext <8 x i8> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0 @@ -899,8 +924,8 @@ define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32: @@ -909,21 +934,22 @@ define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <16 x i32>, ptr %bp %mask = icmp eq <16 x i32> %b, zeroinitializer %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef) %ext = sext <16 x i16> %load to <16 x i32> - ret <16 x i32> %ext + store <16 x i32> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 @@ -942,8 +968,8 @@ define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64_m64: @@ -952,21 +978,22 @@ define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; 
VBITS_GE_512-NEXT: ret %b = load <8 x i64>, ptr %bp %mask = icmp eq <8 x i64> %b, zeroinitializer %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef) %ext = sext <8 x i16> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 @@ -982,8 +1009,8 @@ define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64: @@ -992,21 +1019,22 @@ define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i64>, ptr %bp %mask = icmp eq <8 x i64> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) %ext = sext <8 x i32> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, #0 @@ -1022,8 +1050,8 @@ define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16: @@ -1032,21 +1060,22 @@ define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0 ; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <32 x i16>, ptr %bp 
%mask = icmp eq <32 x i16> %b, zeroinitializer %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef) %ext = zext <32 x i8> %load to <32 x i16> - ret <32 x i16> %ext + store <32 x i16> %ext, ptr %c + ret void } -define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0 @@ -1065,8 +1094,8 @@ define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32_m32: @@ -1075,21 +1104,22 @@ define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <16 x i32>, ptr %bp %mask = icmp eq <16 x i32> %b, zeroinitializer %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef) %ext = zext <16 x i8> %load to <16 x i32> - ret <16 x i32> %ext + store <16 x i32> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 @@ -1110,8 +1140,8 @@ define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64_m64: @@ -1120,21 +1150,22 @@ define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i64>, ptr %bp %mask = icmp eq <8 x i64> %b, zeroinitializer %load = call <8 x 
i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef) %ext = zext <8 x i8> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0 @@ -1152,8 +1183,8 @@ define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32: @@ -1162,21 +1193,22 @@ define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <16 x i32>, ptr %bp %mask = icmp eq <16 x i32> %b, zeroinitializer %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef) %ext = zext <16 x i16> %load to <16 x i32> - ret <16 x i32> %ext + store <16 x i32> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 @@ -1195,8 +1227,8 @@ define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64_m64: @@ -1205,21 +1237,22 @@ define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i64>, ptr %bp %mask = icmp eq <8 x i64> %b, zeroinitializer %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x 
i1> %mask, <8 x i16> undef) %ext = zext <8 x i16> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 @@ -1235,8 +1268,8 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64: @@ -1245,212 +1278,225 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i64>, ptr %bp %mask = icmp eq <8 x i64> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) %ext = zext <8 x i32> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <128 x i16> @masked_load_sext_v128i8i16(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_sext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_sext_v128i8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0 ; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0] -; CHECK-NEXT: st1h { z0.h }, p0, [x8] +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %b = load <128 x i8>, ptr %bp %mask = icmp eq <128 x i8> %b, zeroinitializer %load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef) %ext = sext <128 x i8> %load to <128 x i16> - ret <128 x i16> %ext + store <128 x i16> %ext, ptr %c + ret void } -define <64 x i32> @masked_load_sext_v64i8i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_sext_v64i8i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_sext_v64i8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1sb { z0.s }, p1/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %b = load <64 x i8>, ptr %bp %mask = icmp eq <64 x i8> %b, zeroinitializer %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef) %ext = sext <64 x i8> %load to <64 x i32> - ret <64 x i32> %ext + store <64 x i32> %ext, ptr %c + ret void } -define <32 x i64> @masked_load_sext_v32i8i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void 
@masked_load_sext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_sext_v32i8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1sb { z0.d }, p1/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %b = load <32 x i8>, ptr %bp %mask = icmp eq <32 x i8> %b, zeroinitializer %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef) %ext = sext <32 x i8> %load to <32 x i64> - ret <32 x i64> %ext + store <32 x i64> %ext, ptr %c + ret void } -define <64 x i32> @masked_load_sext_v64i16i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_sext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_sext_v64i16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %b = load <64 x i16>, ptr %bp %mask = icmp eq <64 x i16> %b, zeroinitializer %load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef) %ext = sext <64 x i16> %load to <64 x i32> - ret <64 x i32> %ext + store <64 x i32> %ext, ptr %c + ret void } -define <32 x i64> @masked_load_sext_v32i16i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_sext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_sext_v32i16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1sh { z0.d }, p1/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %b = load <32 x i16>, ptr %bp %mask = icmp eq <32 x i16> %b, zeroinitializer %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef) %ext = sext <32 x i16> %load to <32 x i64> - ret <32 x i64> %ext + store <32 x i64> %ext, ptr %c + ret void } -define <32 x i64> @masked_load_sext_v32i32i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_sext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_sext_v32i32i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %b = load <32 x i32>, ptr %bp %mask = icmp eq <32 x i32> %b, zeroinitializer %load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef) %ext = sext <32 x i32> %load to <32 x i64> - ret <32 x i64> %ext + store <32 x i64> %ext, ptr %c + ret void } -define <128 x i16> @masked_load_zext_v128i8i16(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_zext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_zext_v128i8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0 ; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0] -; CHECK-NEXT: st1h { z0.h }, p0, [x8] +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %b = load <128 x i8>, ptr %bp %mask = icmp eq <128 x i8> %b, 
zeroinitializer %load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef) %ext = zext <128 x i8> %load to <128 x i16> - ret <128 x i16> %ext + store <128 x i16> %ext, ptr %c + ret void } -define <64 x i32> @masked_load_zext_v64i8i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_zext_v64i8i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_zext_v64i8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1b { z0.s }, p1/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %b = load <64 x i8>, ptr %bp %mask = icmp eq <64 x i8> %b, zeroinitializer %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef) %ext = zext <64 x i8> %load to <64 x i32> - ret <64 x i32> %ext + store <64 x i32> %ext, ptr %c + ret void } -define <32 x i64> @masked_load_zext_v32i8i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_zext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_zext_v32i8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1b { z0.d }, p1/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %b = load <32 x i8>, ptr %bp %mask = icmp eq <32 x i8> %b, zeroinitializer %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef) %ext = zext <32 x i8> %load to <32 x i64> - ret <32 x i64> %ext + store <32 x i64> %ext, ptr %c + ret void } -define <64 x i32> @masked_load_zext_v64i16i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_zext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_zext_v64i16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1h { z0.s }, p1/z, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %b = load <64 x i16>, ptr %bp %mask = icmp eq <64 x i16> %b, zeroinitializer %load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef) %ext = zext <64 x i16> %load to <64 x i32> - ret <64 x i32> %ext + store <64 x i32> %ext, ptr %c + ret void } -define <32 x i64> @masked_load_zext_v32i16i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_zext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_zext_v32i16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1h { z0.d }, p1/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %b = load <32 x i16>, ptr %bp %mask = icmp eq <32 x i16> %b, zeroinitializer %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef) %ext = zext <32 x i16> %load to <32 x i64> - ret <32 x i64> %ext + store <32 x i64> %ext, ptr %c + ret void } -define <32 x i64> @masked_load_zext_v32i32i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 { +define void @masked_load_zext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_load_zext_v32i32i64: ; CHECK: // 
%bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1w { z0.d }, p1/z, [x0] -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %b = load <32 x i32>, ptr %bp %mask = icmp eq <32 x i32> %b, zeroinitializer %load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef) %ext = zext <32 x i32> %load to <32 x i64> - ret <32 x i64> %ext + store <32 x i64> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp) #0 { +define void @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -1458,8 +1504,8 @@ define <8 x i64> @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64: @@ -1468,20 +1514,21 @@ define <8 x i64> @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, ptr %bp %mask = icmp ugt <8 x i32> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) %ext = sext <8 x i32> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } -define <8 x i64> @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp) #0 { +define void @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpgt p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -1489,8 +1536,8 @@ define <8 x i64> @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64: @@ -1499,13 +1546,14 @@ define <8 x i64> @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpgt p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, ptr %bp %mask 
= icmp sgt <8 x i32> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) %ext = zext <8 x i32> %load to <8 x i64> - ret <8 x i64> %ext + store <8 x i64> %ext, ptr %c + ret void } declare <2 x half> @llvm.masked.load.v2f16(ptr, i32, <2 x i1>, <2 x half>) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll index 113f7a9465a1fe..f97ca05f3bdd4b 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -712,82 +712,88 @@ define void @splat_imm_v8f64(ptr %a) vscale_range(4,0) #0 { ret void } -define <8 x float> @load_splat_v8f32(ptr %p) vscale_range(2,2) #0 { +define void @load_splat_v8f32(ptr %a, ptr %b) vscale_range(2,2) #0 { ; CHECK-LABEL: load_splat_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z0.s, s0 -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret - %v = load <8 x float>, ptr %p + %v = load <8 x float>, ptr %a %splat = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> zeroinitializer - ret <8 x float> %splat + store <8 x float> %splat, ptr %b + ret void } -define <4 x double> @load_splat_v4f64(ptr %p) vscale_range(2,2) #0 { +define void @load_splat_v4f64(ptr %a, ptr %b) vscale_range(2,2) #0 { ; CHECK-LABEL: load_splat_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mov z0.d, d0 -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret - %v = load <4 x double>, ptr %p + %v = load <4 x double>, ptr %a %splat = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> zeroinitializer - ret <4 x double> %splat + store <4 x double> %splat, ptr %b + ret void } -define <32 x i8> @load_splat_v32i8(ptr %p) vscale_range(2,2) #0 { +define void @load_splat_v32i8(ptr %a, ptr %b) vscale_range(2,2) #0 { ; CHECK-LABEL: load_splat_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: mov z0.b, b0 -; CHECK-NEXT: st1b { z0.b }, p0, [x8] +; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret - %v = load <32 x i8>, ptr %p + %v = load <32 x i8>, ptr %a %splat = shufflevector <32 x i8> %v, <32 x i8> undef, <32 x i32> zeroinitializer - ret <32 x i8> %splat + store <32 x i8> %splat, ptr %b + ret void } -define <16 x i16> @load_splat_v16i16(ptr %p) vscale_range(2,2) #0 { +define void @load_splat_v16i16(ptr %a, ptr %b) vscale_range(2,2) #0 { ; CHECK-LABEL: load_splat_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mov z0.h, h0 -; CHECK-NEXT: st1h { z0.h }, p0, [x8] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret - %v = load <16 x i16>, ptr %p + %v = load <16 x i16>, ptr %a %splat = shufflevector <16 x i16> %v, <16 x i16> undef, <16 x i32> zeroinitializer - ret <16 x i16> %splat + store <16 x i16> %splat, ptr %b + ret void } -define <8 x i32> @load_splat_v8i32(ptr %p) vscale_range(2,2) #0 { +define void @load_splat_v8i32(ptr %a, ptr %b) vscale_range(2,2) #0 { ; CHECK-LABEL: load_splat_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z0.s, s0 -; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret - %v = load <8 x i32>, ptr %p + %v = load <8 x i32>, ptr %a %splat = 
shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> zeroinitializer - ret <8 x i32> %splat + store <8 x i32> %splat, ptr %b + ret void } -define <4 x i64> @load_splat_v4i64(ptr %p) vscale_range(2,2) #0 { +define void @load_splat_v4i64(ptr %a, ptr %b) vscale_range(2,2) #0 { ; CHECK-LABEL: load_splat_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mov z0.d, d0 -; CHECK-NEXT: st1d { z0.d }, p0, [x8] +; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret - %v = load <4 x i64>, ptr %p + %v = load <4 x i64>, ptr %a %splat = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> zeroinitializer - ret <4 x i64> %splat + store <4 x i64> %splat, ptr %b + ret void } attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll deleted file mode 100644 index 03bff6cb9b62df..00000000000000 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll +++ /dev/null @@ -1,180 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 -; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048 - -target triple = "aarch64-unknown-linux-gnu" - -; Don't use SVE for 64-bit vectors. -define void @store_v2f32(ptr %a) #0 { -; CHECK-LABEL: store_v2f32: -; CHECK: // %bb.0: -; CHECK-NEXT: str xzr, [x0] -; CHECK-NEXT: ret - store <2 x float> zeroinitializer, ptr %a - ret void -} - -; Don't use SVE for 128-bit vectors. 
-define void @store_v4f32(ptr %a) #0 {
-; CHECK-LABEL: store_v4f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp xzr, xzr, [x0]
-; CHECK-NEXT: ret
-  store <4 x float> zeroinitializer, ptr %a
-  ret void
-}
-
-define void @store_v8f32(ptr %a) #0 {
-; CHECK-LABEL: store_v8f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl8
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: st1w { z0.s }, p0, [x0]
-; CHECK-NEXT: ret
-  store <8 x float> zeroinitializer, ptr %a
-  ret void
-}
-
-define void @store_v16f32(ptr %a) #0 {
-; VBITS_GE_256-LABEL: store_v16f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: store_v16f32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: store_v16f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: store_v16f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl16
-; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
-  store <16 x float> zeroinitializer, ptr %a
-  ret void
-}
-
-define void @store_v32f32(ptr %a) #0 {
-; VBITS_GE_256-LABEL: store_v32f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_256-NEXT: mov x8, #24 // =0x18
-; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: store_v32f32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_512-NEXT: mov x8, #16 // =0x10
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: store_v32f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: store_v32f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
-  store <32 x float> zeroinitializer, ptr %a
-  ret void
-}
-
-define void @store_v64f32(ptr %a) #0 {
-; VBITS_GE_256-LABEL: store_v64f32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_256-NEXT: mov x8, #56 // =0x38
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #48 // =0x30
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #40 // =0x28
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #32 // =0x20
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #24 // =0x18
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: store_v64f32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_512-NEXT: mov x8, #48 // =0x30
-; VBITS_GE_512-NEXT: mov x9, #32 // =0x20
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_512-NEXT: mov x8, #16 // =0x10
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: store_v64f32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_1024-NEXT: mov x8, #32 // =0x20
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: store_v64f32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
-  store <64 x float> zeroinitializer, ptr %a
-  ret void
-}
-
-attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
index e742836d79fbe5..79ef20270eda85 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
@@ -35,23 +35,23 @@ define @test_post_ld1_dup(ptr %a, ptr %ptr, i64 %inc) {
   ret %dup
 }
 
-define <4 x i64> @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr) #1 {
+define void @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_ptr) #1 {
 ; CHECK-LABEL: test_post_ld1_int_fixed:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov w9, #2 // =0x2
+; CHECK-NEXT: mov w8, #2 // =0x2
 ; CHECK-NEXT: index z0.d, #0, #1
 ; CHECK-NEXT: ptrue p1.d, vl1
-; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
 ; CHECK-NEXT: cmpeq p2.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: ldr x9, [x0]
-; CHECK-NEXT: ldr x10, [x0, x1, lsl #3]
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: ldr x9, [x0, x1, lsl #3]
 ; CHECK-NEXT: mov z0.d, z2.d
-; CHECK-NEXT: mov z2.d, p2/m, x10
-; CHECK-NEXT: mov z0.d, p1/m, x9
+; CHECK-NEXT: mov z2.d, p2/m, x9
+; CHECK-NEXT: mov z0.d, p1/m, x8
 ; CHECK-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x3]
 ; CHECK-NEXT: ret
   %A = load <4 x i64>, ptr %addr
   %ld1 = load i64, ptr %data
@@ -60,16 +60,17 @@ define <4 x i64> @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr) #1 {
   %ld2 = load i64, ptr %gep
   %vec2 = insertelement <4 x i64> %A, i64 %ld2, i32 2
   %res = add <4 x i64> %vec1, %vec2
-  ret <4 x i64> %res
+  store <4 x i64> %res, ptr %res_ptr
+  ret void
 }
 
-define <4 x double> @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr) #1 {
+define void @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_ptr) #1 {
 ; CHECK-LABEL: test_post_ld1_double_fixed:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov w9, #2 // =0x2
+; CHECK-NEXT: mov w8, #2 // =0x2
 ; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p1.d, vl1
 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
 ; CHECK-NEXT: cmpeq p2.d, p0/z, z0.d, z1.d
@@ -78,7 +79,7 @@ define <4 x double> @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr)
 ; CHECK-NEXT: sel z0.d, p1, z0.d, z2.d
 ; CHECK-NEXT: mov z2.d, p2/m, d1
 ; CHECK-NEXT: fadd z0.d, z0.d, z2.d
-; CHECK-NEXT: st1d { z0.d }, p0, [x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [x3]
 ; CHECK-NEXT: ret
   %A = load <4 x double>, ptr %addr
   %ld1 = load double, ptr %data
@@ -87,7 +88,8 @@ define <4 x double> @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr)
   %ld2 = load double, ptr %gep
   %vec2 = insertelement <4 x double> %A, double %ld2, i32 2
   %res = fadd <4 x double> %vec1, %vec2
-  ret <4 x double> %res
+  store <4 x double> %res, ptr %res_ptr
+  ret void
 }
 
 attributes #1 = { vscale_range(2,2) "target-features"="+neon,+sve" }