From 6b4fa32ff4a9b09a53459d98d276475b3e2d7cb3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 5 Jun 2024 18:06:29 +0100
Subject: [PATCH] [ARM] Add NEON support for ISD::ABDS/ABDU nodes.

As noted on #94466, NEON has ABDS/ABDU instructions, but the ARM backend
previously only selected them via intrinsics, plus some custom VABDL
patterns. Mark ISD::ABDS/ABDU as legal for the NEON vector types, lower
the arm.neon.vabds/vabdu intrinsics to the generic nodes, and update the
VABD/VABDL/VABA/VABAL patterns to match them.
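For illustration, this is roughly the kind of IR that now selects to a
single vabd (a sketch reduced from the sabd_8b case in the updated
neon_vabd.ll test below; the exact abs poison flag used there may differ):

  declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)

  define <8 x i8> @sabd_8b(<8 x i8> %a, <8 x i8> %b) {
    ; Sign-extend, subtract, take the absolute value, truncate back:
    ; DAGCombine recognises this as ISD::ABDS, which is now legal here.
    %a.sext = sext <8 x i8> %a to <8 x i16>
    %b.sext = sext <8 x i8> %b to <8 x i16>
    %sub = sub <8 x i16> %a.sext, %b.sext
    %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true)
    %trunc = trunc <8 x i16> %abs to <8 x i8>
    ret <8 x i8> %trunc
  }

Per the test diff below, this previously lowered to the widening sequence

  vsubl.s8 q8, d17, d16
  vabs.s16 q8, q8
  vmovn.i16 d16, q8

and now selects

  vabd.s8 d16, d17, d16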

Fixes #94466
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp |  16 +-
 llvm/lib/Target/ARM/ARMInstrNEON.td     |  24 +--
 llvm/test/CodeGen/ARM/neon_vabd.ll      | 201 ++++++++----------
 3 files changed, 87 insertions(+), 154 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 5212d2c620b75b..78aaaca4e185ba 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -205,9 +205,9 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
   setOperationAction(ISD::SDIVREM, VT, Expand);
   setOperationAction(ISD::UDIVREM, VT, Expand);
 
-  if (!VT.isFloatingPoint() &&
-      VT != MVT::v2i64 && VT != MVT::v1i64)
-    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
+    for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
+                        ISD::UMIN, ISD::UMAX})
       setOperationAction(Opcode, VT, Legal);
   if (!VT.isFloatingPoint())
     for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
@@ -4174,7 +4174,15 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
   }
   case Intrinsic::arm_neon_vabs:
     return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
-                       Op.getOperand(1));
+                       Op.getOperand(1));
+  case Intrinsic::arm_neon_vabds:
+    if (Op.getValueType().isInteger())
+      return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2));
+    return SDValue();
+  case Intrinsic::arm_neon_vabdu:
+    return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::arm_neon_vmulls:
   case Intrinsic::arm_neon_vmullu: {
     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index 21a5817252aeaa..c600478b66402c 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -5640,10 +5640,10 @@ def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1,
 // VABD : Vector Absolute Difference
 defm VABDs    : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm,
                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
-                           "vabd", "s", int_arm_neon_vabds, 1>;
+                           "vabd", "s", abds, 1>;
 defm VABDu    : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm,
                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
-                           "vabd", "u", int_arm_neon_vabdu, 1>;
+                           "vabd", "u", abdu, 1>;
 def  VABDfd   : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
                         "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>;
 def  VABDfq   : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
@@ -5657,20 +5657,22 @@ def VABDhq : N3VQInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBINQ,
 
 // VABDL : Vector Absolute Difference Long (Q = | D - D |)
 defm VABDLs   : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
-                               "vabdl", "s", int_arm_neon_vabds, zext, 1>;
+                               "vabdl", "s", abds, zext, 1>;
 defm VABDLu   : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
-                               "vabdl", "u", int_arm_neon_vabdu, zext, 1>;
+                               "vabdl", "u", abdu, zext, 1>;
 
 let Predicates = [HasNEON] in {
-def : Pat<(v8i16 (abs (sub (zext (v8i8 DPR:$opA)), (zext (v8i8 DPR:$opB))))),
+def : Pat<(v8i16 (zext (abdu (v8i8 DPR:$opA), (v8i8 DPR:$opB)))),
           (VABDLuv8i16 DPR:$opA, DPR:$opB)>;
-def : Pat<(v4i32 (abs (sub (zext (v4i16 DPR:$opA)), (zext (v4i16 DPR:$opB))))),
+def : Pat<(v4i32 (zext (abdu (v4i16 DPR:$opA), (v4i16 DPR:$opB)))),
           (VABDLuv4i32 DPR:$opA, DPR:$opB)>;
+def : Pat<(v2i64 (zext (abdu (v2i32 DPR:$opA), (v2i32 DPR:$opB)))),
+          (VABDLuv2i64 DPR:$opA, DPR:$opB)>;
 }
 
 // ISD::ABS is not legal for v2i64, so VABDL needs to be matched from the
 // shift/xor pattern for ABS.
-
+// TODO: Remove me.
 def abd_shr :
     PatFrag<(ops node:$in1, node:$in2, node:$shift),
             (ARMvshrsImm (sub (zext node:$in1),
                               (zext node:$in2)), (i32 $shift))>;
@@ -5686,15 +5688,15 @@ def : Pat<(xor (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)),
 
 // VABA : Vector Absolute Difference and Accumulate
 defm VABAs    : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
-                             "vaba", "s", int_arm_neon_vabds, add>;
+                             "vaba", "s", abds, add>;
 defm VABAu    : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
-                             "vaba", "u", int_arm_neon_vabdu, add>;
+                             "vaba", "u", abdu, add>;
 
 // VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
 defm VABALs   : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD,
-                                 "vabal", "s", int_arm_neon_vabds, zext, add>;
+                                 "vabal", "s", abds, zext, add>;
 defm VABALu   : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD,
-                                 "vabal", "u", int_arm_neon_vabdu, zext, add>;
+                                 "vabal", "u", abdu, zext, add>;
 
 // Vector Maximum and Minimum.
diff --git a/llvm/test/CodeGen/ARM/neon_vabd.ll b/llvm/test/CodeGen/ARM/neon_vabd.ll
index 14ad1a108a728a..907e11c0cf19d4 100644
--- a/llvm/test/CodeGen/ARM/neon_vabd.ll
+++ b/llvm/test/CodeGen/ARM/neon_vabd.ll
@@ -10,9 +10,7 @@ define <8 x i8> @sabd_8b(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d16, r2, r3
 ; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vsubl.s8 q8, d17, d16
-; CHECK-NEXT:    vabs.s16 q8, q8
-; CHECK-NEXT:    vmovn.i16 d16, q8
+; CHECK-NEXT:    vabd.s8 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %a.sext = sext <8 x i8> %a to <8 x i16>
@@ -26,18 +24,13 @@ define <16 x i8> @sabd_16b(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: sabd_16b:
 ; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
 ; CHECK-NEXT:    mov r12, sp
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT:    vmov d18, r2, r3
-; CHECK-NEXT:    vmov d19, r0, r1
-; CHECK-NEXT:    vsubl.s8 q10, d18, d17
-; CHECK-NEXT:    vsubl.s8 q8, d19, d16
-; CHECK-NEXT:    vabs.s16 q9, q10
-; CHECK-NEXT:    vabs.s16 q8, q8
-; CHECK-NEXT:    vmovn.i16 d19, q9
-; CHECK-NEXT:    vmovn.i16 d18, q8
-; CHECK-NEXT:    vmov r2, r3, d19
-; CHECK-NEXT:    vmov r0, r1, d18
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabd.s8 q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
   %a.sext = sext <16 x i8> %a to <16 x i16>
   %b.sext = sext <16 x i8> %b to <16 x i16>
@@ -52,9 +45,7 @@ define <4 x i16> @sabd_4h(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d16, r2, r3
 ; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vsubl.s16 q8, d17, d16
-; CHECK-NEXT:    vabs.s32 q8, q8
-; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vabd.s16 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %a.sext = sext <4 x i16> %a to <4 x i32>
@@ -74,8 +65,7 @@ define <4 x i16> @sabd_4h_promoted_ops(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-NEXT:    vshl.i16 d17, d17, #8
 ; CHECK-NEXT:    vshr.s16 d16, d16, #8
 ; CHECK-NEXT:    vshr.s16 d17, d17, #8
-; CHECK-NEXT:    vsub.i16 d16, d17, d16
-; CHECK-NEXT:    vabs.s16 d16, d16
+; CHECK-NEXT:    vabd.s16 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %a.sext = sext <4 x i8> %a to <4 x i16>
@@ -88,18 +78,13 @@ define <8 x i16> @sabd_8h(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: sabd_8h:
 ; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
 ; CHECK-NEXT:    mov r12, sp
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT:    vmov d18, r2, r3
-; CHECK-NEXT:    vmov d19, r0, r1
-; CHECK-NEXT:    vsubl.s16 q10, d18, d17
-; CHECK-NEXT:    vsubl.s16 q8, d19, d16
-; CHECK-NEXT:    vabs.s32 q9, q10
-; CHECK-NEXT:    vabs.s32 q8, q8
-; CHECK-NEXT:    vmovn.i32 d19, q9
-; CHECK-NEXT:    vmovn.i32 d18, q8
-; CHECK-NEXT:    vmov r2, r3, d19
-; CHECK-NEXT:    vmov r0, r1, d18
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabd.s16 q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
   %a.sext = sext <8 x i16> %a to <8 x i32>
   %b.sext = sext <8 x i16> %b to <8 x i32>
@@ -114,8 +99,7 @@ define <8 x i16> @sabd_8h_promoted_ops(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d16, r2, r3
 ; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vsubl.s8 q8, d17, d16
-; CHECK-NEXT:    vabs.s16 q8, q8
+; CHECK-NEXT:    vabdl.s8 q8, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -131,11 +115,7 @@ define <2 x i32> @sabd_2s(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d16, r2, r3
 ; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vsubl.s32 q8, d17, d16
-; CHECK-NEXT:    vshr.s64 q9, q8, #63
-; CHECK-NEXT:    veor q8, q8, q9
-; CHECK-NEXT:    vsub.i64 q8, q8, q9
-; CHECK-NEXT:    vmovn.i64 d16, q8
+; CHECK-NEXT:    vabd.s32 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %a.sext = sext <2 x i32> %a to <2 x i64>
@@ -155,8 +135,7 @@ define <2 x i32> @sabd_2s_promoted_ops(<2 x i16> %a, <2 x i16> %b) {
 ; CHECK-NEXT:    vshl.i32 d17, d17, #16
 ; CHECK-NEXT:    vshr.s32 d16, d16, #16
 ; CHECK-NEXT:    vshr.s32 d17, d17, #16
-; CHECK-NEXT:    vsub.i32 d16, d17, d16
-; CHECK-NEXT:    vabs.s32 d16, d16
+; CHECK-NEXT:    vabd.s32 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %a.sext = sext <2 x i16> %a to <2 x i32>
@@ -169,22 +148,13 @@ define <4 x i32> @sabd_4s(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: sabd_4s:
 ; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
 ; CHECK-NEXT:    mov r12, sp
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT:    vmov d18, r2, r3
-; CHECK-NEXT:    vmov d19, r0, r1
-; CHECK-NEXT:    vsubl.s32 q10, d18, d17
-; CHECK-NEXT:    vsubl.s32 q8, d19, d16
-; CHECK-NEXT:    vshr.s64 q9, q10, #63
-; CHECK-NEXT:    vshr.s64 q11, q8, #63
-; CHECK-NEXT:    veor q10, q10, q9
-; CHECK-NEXT:    veor q8, q8, q11
-; CHECK-NEXT:    vsub.i64 q9, q10, q9
-; CHECK-NEXT:    vsub.i64 q8, q8, q11
-; CHECK-NEXT:    vmovn.i64 d19, q9
-; CHECK-NEXT:    vmovn.i64 d18, q8
-; CHECK-NEXT:    vmov r2, r3, d19
-; CHECK-NEXT:    vmov r0, r1, d18
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabd.s32 q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
   %a.sext = sext <4 x i32> %a to <4 x i64>
   %b.sext = sext <4 x i32> %b to <4 x i64>
@@ -199,8 +169,7 @@ define <4 x i32> @sabd_4s_promoted_ops(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d16, r2, r3
 ; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vsubl.s16 q8, d17, d16
-; CHECK-NEXT:    vabs.s32 q8, q8
+; CHECK-NEXT:    vabdl.s16 q8, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -259,10 +228,7 @@ define <2 x i64> @sabd_2d_promoted_ops(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d16, r2, r3
 ; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vsubl.s32 q8, d17, d16
-; CHECK-NEXT:    vshr.s64 q9, q8, #63
-; CHECK-NEXT:    veor q8, q8, q9
-; CHECK-NEXT:    vsub.i64 q8, q8, q9
+; CHECK-NEXT:    vabdl.s32 q8, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -282,8 +248,7 @@ define <8 x i8> @uabd_8b(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d16, r2, r3
 ; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vabdl.u8 q8, d17, d16
-; CHECK-NEXT:    vmovn.i16 d16, q8
+; CHECK-NEXT:    vabd.u8 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %a.zext = zext <8 x i8> %a to <8 x i16>
@@ -297,16 +262,13 @@ define <16 x i8> @uabd_16b(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: uabd_16b:
 ; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
 ; CHECK-NEXT:    mov r12, sp
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT:    vmov d18, r2, r3
-; CHECK-NEXT:    vmov d19, r0, r1
-; CHECK-NEXT:    vabdl.u8 q10, d18, d17
-; CHECK-NEXT:    vabdl.u8 q8, d19, d16
-; CHECK-NEXT:    vmovn.i16 d19, q10
-; CHECK-NEXT:    vmovn.i16 d18, q8
-; CHECK-NEXT:    vmov r2, r3, d19
-; CHECK-NEXT:    vmov r0, r1, d18
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabd.u8 q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
   %a.zext = zext <16 x i8> %a to <16 x i16>
   %b.zext = zext <16 x i8> %b to <16 x i16>
@@ -321,8 +283,7 @@ define <4 x i16> @uabd_4h(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d16, r2, r3
 ; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vabdl.u16 q8, d17, d16
-; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vabd.u16 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %a.zext = zext <4 x i16> %a to <4 x i32>
@@ -340,8 +301,7 @@ define <4 x i16> @uabd_4h_promoted_ops(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-NEXT:    vmov d17, r0, r1
 ; CHECK-NEXT:    vbic.i16 d16, #0xff00
 ; CHECK-NEXT:    vbic.i16 d17, #0xff00
-; CHECK-NEXT:    vsub.i16 d16, d17, d16
-; CHECK-NEXT:    vabs.s16 d16, d16
+; CHECK-NEXT:    vabd.u16 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %a.zext = zext <4 x i8> %a to <4 x i16>
@@ -354,16 +314,13 @@ define <8 x i16> @uabd_8h(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: uabd_8h:
 ; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
 ; CHECK-NEXT:    mov r12, sp
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT:    vmov d18, r2, r3
-; CHECK-NEXT:    vmov d19, r0, r1
-; CHECK-NEXT:    vabdl.u16 q10, d18, d17
-; CHECK-NEXT:    vabdl.u16 q8, d19, d16
-; CHECK-NEXT:    vmovn.i32 d19, q10
-; CHECK-NEXT:    vmovn.i32 d18, q8
-; CHECK-NEXT:    vmov r2, r3, d19
-; CHECK-NEXT:    vmov r0, r1, d18
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabd.u16 q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
   %a.zext = zext <8 x i16> %a to <8 x i32>
   %b.zext = zext <8 x i16> %b to <8 x i32>
@@ -394,11 +351,7 @@ define <2 x i32> @uabd_2s(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d16, r2, r3
 ; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vsubl.u32 q8, d17, d16
-; CHECK-NEXT:    vshr.s64 q9, q8, #63
-; CHECK-NEXT:    veor q8, q8, q9
-; CHECK-NEXT:    vsub.i64 q8, q8, q9
-; CHECK-NEXT:    vmovn.i64 d16, q8
+; CHECK-NEXT:    vabd.u32 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %a.zext = zext <2 x i32> %a to <2 x i64>
@@ -417,8 +370,7 @@ define <2 x i32> @uabd_2s_promoted_ops(<2 x i16> %a, <2 x i16> %b) {
 ; CHECK-NEXT:    vmov d18, r0, r1
 ; CHECK-NEXT:    vand d17, d17, d16
 ; CHECK-NEXT:    vand d16, d18, d16
-; CHECK-NEXT:    vsub.i32 d16, d16, d17
-; CHECK-NEXT:    vabs.s32 d16, d16
+; CHECK-NEXT:    vabd.u32 d16, d16, d17
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %a.zext = zext <2 x i16> %a to <2 x i32>
@@ -431,22 +383,13 @@ define <4 x i32> @uabd_4s(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: uabd_4s:
 ; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
 ; CHECK-NEXT:    mov r12, sp
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT:    vmov d18, r2, r3
-; CHECK-NEXT:    vmov d19, r0, r1
-; CHECK-NEXT:    vsubl.u32 q10, d18, d17
-; CHECK-NEXT:    vsubl.u32 q8, d19, d16
-; CHECK-NEXT:    vshr.s64 q9, q10, #63
-; CHECK-NEXT:    vshr.s64 q11, q8, #63
-; CHECK-NEXT:    veor q10, q10, q9
-; CHECK-NEXT:    veor q8, q8, q11
-; CHECK-NEXT:    vsub.i64 q9, q10, q9
-; CHECK-NEXT:    vsub.i64 q8, q8, q11
-; CHECK-NEXT:    vmovn.i64 d19, q9
-; CHECK-NEXT:    vmovn.i64 d18, q8
-; CHECK-NEXT:    vmov r2, r3, d19
-; CHECK-NEXT:    vmov r0, r1, d18
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabd.u32 q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
   %a.zext = zext <4 x i32> %a to <4 x i64>
   %b.zext = zext <4 x i32> %b to <4 x i64>
@@ -519,10 +462,7 @@ define <2 x i64> @uabd_2d_promoted_ops(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d16, r2, r3
 ; CHECK-NEXT:    vmov d17, r0, r1
-; CHECK-NEXT:    vsubl.u32 q8, d17, d16
-; CHECK-NEXT:    vshr.s64 q9, q8, #63
-; CHECK-NEXT:    veor q8, q8, q9
-; CHECK-NEXT:    vsub.i64 q8, q8, q9
+; CHECK-NEXT:    vabdl.u32 q8, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -610,8 +550,7 @@ define <16 x i8> @sabd_v16i8_nsw(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-NEXT:    mov r12, sp
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vsub.i8 q8, q8, q9
-; CHECK-NEXT:    vabs.s8 q8, q8
+; CHECK-NEXT:    vabd.s8 q8, q8, q9
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -627,8 +566,7 @@ define <8 x i16> @sabd_v8i16_nsw(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-NEXT:    mov r12, sp
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vsub.i16 q8, q8, q9
-; CHECK-NEXT:    vabs.s16 q8, q8
+; CHECK-NEXT:    vabd.s16 q8, q8, q9
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -644,8 +582,7 @@ define <4 x i32> @sabd_v4i32_nsw(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    mov r12, sp
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vsub.i32 q8, q8, q9
-; CHECK-NEXT:    vabs.s32 q8, q8
+; CHECK-NEXT:    vabd.s32 q8, q8, q9
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -680,9 +617,7 @@ define <16 x i8> @smaxmin_v16i8(<16 x i8> %0, <16 x i8> %1) {
 ; CHECK-NEXT:    mov r12, sp
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vmin.s8 q10, q8, q9
-; CHECK-NEXT:    vmax.s8 q8, q8, q9
-; CHECK-NEXT:    vsub.i8 q8, q8, q10
+; CHECK-NEXT:    vabd.s8 q8, q8, q9
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -699,9 +634,7 @@ define <8 x i16> @smaxmin_v8i16(<8 x i16> %0, <8 x i16> %1) {
 ; CHECK-NEXT:    mov r12, sp
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vmin.s16 q10, q8, q9
-; CHECK-NEXT:    vmax.s16 q8, q8, q9
-; CHECK-NEXT:    vsub.i16 q8, q8, q10
+; CHECK-NEXT:    vabd.s16 q8, q8, q9
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -718,9 +651,7 @@ define <4 x i32> @smaxmin_v4i32(<4 x i32> %0, <4 x i32> %1) {
 ; CHECK-NEXT:    mov r12, sp
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vmin.s32 q10, q8, q9
-; CHECK-NEXT:    vmax.s32 q8, q8, q9
-; CHECK-NEXT:    vsub.i32 q8, q8, q10
+; CHECK-NEXT:    vabd.s32 q8, q8, q9
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -799,9 +730,7 @@ define <16 x i8> @umaxmin_v16i8(<16 x i8> %0, <16 x i8> %1) {
 ; CHECK-NEXT:    mov r12, sp
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vmin.u8 q10, q8, q9
-; CHECK-NEXT:    vmax.u8 q8, q8, q9
-; CHECK-NEXT:    vsub.i8 q8, q8, q10
+; CHECK-NEXT:    vabd.u8 q8, q8, q9
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -818,9 +747,7 @@ define <8 x i16> @umaxmin_v8i16(<8 x i16> %0, <8 x i16> %1) {
 ; CHECK-NEXT:    mov r12, sp
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vmin.u16 q10, q8, q9
-; CHECK-NEXT:    vmax.u16 q8, q8, q9
-; CHECK-NEXT:    vsub.i16 q8, q8, q10
+; CHECK-NEXT:    vabd.u16 q8, q8, q9
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -837,9 +764,7 @@ define <4 x i32> @umaxmin_v4i32(<4 x i32> %0, <4 x i32> %1) {
 ; CHECK-NEXT:    mov r12, sp
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    vmin.u32 q10, q8, q9
-; CHECK-NEXT:    vmax.u32 q8, q8, q9
-; CHECK-NEXT:    vsub.i32 q8, q8, q10
+; CHECK-NEXT:    vabd.u32 q8, q8, q9
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
@@ -874,12 +799,10 @@ define <16 x i8> @umaxmin_v16i8_com1(<16 x i8> %0, <16 x i8> %1) {
 ; CHECK-LABEL: umaxmin_v16i8_com1:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
 ; CHECK-NEXT:    vmov d16, r0, r1
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT:    vmin.u8 q10, q9, q8
-; CHECK-NEXT:    vmax.u8 q8, q8, q9
-; CHECK-NEXT:    vsub.i8 q8, q8, q10
+; CHECK-NEXT:    vabd.u8 q8, q8, q9
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr