From 675922efd971acdbf3c075dde299223f52857315 Mon Sep 17 00:00:00 2001 From: Rose Date: Tue, 5 Mar 2024 18:43:29 -0500 Subject: [PATCH] [Thumb] Resolve FIXME: Use 'mov hi, $src; mov $dst, hi' Consider the following: ldr r0, [r4] ldr r7, [r0, #4] cmp r7, r3 bhi .LBB0_6 cmp r0, r2 push {r0} pop {r4} bne .LBB0_3 movs r0, r6 pop {r4, r5, r6, r7} pop {r1} bx r1 This is a snippet of the Thumb1 code that clang currently generates for the K&R malloc function. The value pushed by push {r0} is immediately popped into r4 by pop {r4}. A plain movs r4, r0 cannot be used here because it would destroy the flags set by the cmp right above. The only alternative to going through the stack is to transfer the value through a high register. However, LLVM does not currently consider this a valid approach, even though a free high register can be clobbered at no cost. This patch resolves the FIXME so that the compiler copies through a high register (r10, r11, or r12) whenever one is available. --- llvm/lib/Target/ARM/Thumb1InstrInfo.cpp | 49 ++++++++++++++++--- llvm/test/CodeGen/ARM/sadd_sat.ll | 4 +- llvm/test/CodeGen/ARM/select_const.ll | 16 +++--- llvm/test/CodeGen/ARM/wide-compares.ll | 8 +-- llvm/test/CodeGen/Thumb/pr35836.ll | 12 ++--- .../CodeGen/Thumb/urem-seteq-illegal-types.ll | 8 +-- 6 files changed, 66 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 85eabdb17ad190..5b0b799880a35f 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -12,6 +12,8 @@ #include "Thumb1InstrInfo.h" #include "ARMSubtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" @@ -47,24 +49,57 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, assert(ARM::GPRRegClass.contains(DestReg, SrcReg) && "Thumb1 can only copy GPR registers"); - if (st.hasV6Ops() || 
ARM::hGPRRegClass.contains(SrcReg) - || !ARM::tGPRRegClass.contains(DestReg)) + if (st.hasV6Ops() || ARM::hGPRRegClass.contains(SrcReg) || + !ARM::tGPRRegClass.contains(DestReg)) BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)) .add(predOps(ARMCC::AL)); else { - // FIXME: Can also use 'mov hi, $src; mov $dst, hi', - // with hi as either r10 or r11. - const TargetRegisterInfo *RegInfo = st.getRegisterInfo(); - if (MBB.computeRegisterLiveness(RegInfo, ARM::CPSR, I) - == MachineBasicBlock::LQR_Dead) { + LiveRegUnits UsedRegs(*RegInfo); + UsedRegs.addLiveOuts(MBB); + + auto InstUpToI = MBB.end(); + while (InstUpToI != I) + // The pre-decrement is on purpose here. + // We want to have the liveness right before I. + UsedRegs.stepBackward(*--InstUpToI); + + if (UsedRegs.available(ARM::CPSR)) { BuildMI(MBB, I, DL, get(ARM::tMOVSr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)) ->addRegisterDead(ARM::CPSR, RegInfo); return; } + // Use high register to move source to destination + // if movs is not an option. 
+ BitVector Allocatable = RegInfo->getAllocatableSet( + MF, RegInfo->getRegClass(ARM::hGPRRegClassID)); + + Register TmpReg = ARM::NoRegister; + // Prefer R12 as it is known to not be preserved anyway + if (UsedRegs.available(ARM::R12) && Allocatable.test(ARM::R12)) { + TmpReg = ARM::R12; + } else { + for (Register Reg : Allocatable.set_bits()) { + if (UsedRegs.available(Reg)) { + TmpReg = Reg; + break; + } + } + } + + if (TmpReg) { + BuildMI(MBB, I, DL, get(ARM::tMOVr), TmpReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .add(predOps(ARMCC::AL)); + BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg) + .addReg(TmpReg, getKillRegState(true)) + .add(predOps(ARMCC::AL)); + return; + } + // 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it BuildMI(MBB, I, DL, get(ARM::tPUSH)) .add(predOps(ARMCC::AL)) diff --git a/llvm/test/CodeGen/ARM/sadd_sat.ll b/llvm/test/CodeGen/ARM/sadd_sat.ll index 1632c4e86c7629..0060b4458081bc 100644 --- a/llvm/test/CodeGen/ARM/sadd_sat.ll +++ b/llvm/test/CodeGen/ARM/sadd_sat.ll @@ -130,8 +130,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; CHECK-T15TE-NEXT: bics r4, r1 ; CHECK-T15TE-NEXT: asrs r1, r3, #31 ; CHECK-T15TE-NEXT: cmp r4, #0 -; CHECK-T15TE-NEXT: push {r1} -; CHECK-T15TE-NEXT: pop {r0} +; CHECK-T15TE-NEXT: mov r12, r1 +; CHECK-T15TE-NEXT: mov r0, r12 ; CHECK-T15TE-NEXT: bmi .LBB1_2 ; CHECK-T15TE-NEXT: @ %bb.1: ; CHECK-T15TE-NEXT: movs r0, r2 diff --git a/llvm/test/CodeGen/ARM/select_const.ll b/llvm/test/CodeGen/ARM/select_const.ll index e12dd02f16c2fa..df95af313eac66 100644 --- a/llvm/test/CodeGen/ARM/select_const.ll +++ b/llvm/test/CodeGen/ARM/select_const.ll @@ -665,8 +665,8 @@ define i64 @opaque_constant1(i1 %cond, i64 %x) { ; THUMB-NEXT: movs r7, #1 ; THUMB-NEXT: ands r0, r7 ; THUMB-NEXT: subs r1, r0, #1 -; THUMB-NEXT: push {r0} -; THUMB-NEXT: pop {r4} +; THUMB-NEXT: mov r12, r0 +; THUMB-NEXT: mov r4, r12 ; THUMB-NEXT: sbcs r4, r1 ; THUMB-NEXT: cmp r0, #0 ; THUMB-NEXT: bne .LBB24_2 @@ -681,8 +681,8 @@ define i64 
@opaque_constant1(i1 %cond, i64 %x) { ; THUMB-NEXT: ands r5, r0 ; THUMB-NEXT: movs r6, #0 ; THUMB-NEXT: subs r0, r5, #1 -; THUMB-NEXT: push {r4} -; THUMB-NEXT: pop {r1} +; THUMB-NEXT: mov r12, r4 +; THUMB-NEXT: mov r1, r12 ; THUMB-NEXT: sbcs r1, r6 ; THUMB-NEXT: eors r3, r7 ; THUMB-NEXT: ldr r6, .LCPI24_0 @@ -786,11 +786,11 @@ define i64 @func(i64 %arg) { ; THUMB-NEXT: push {r4, lr} ; THUMB-NEXT: movs r2, #0 ; THUMB-NEXT: adds r3, r0, #1 -; THUMB-NEXT: push {r1} -; THUMB-NEXT: pop {r3} +; THUMB-NEXT: mov r12, r1 +; THUMB-NEXT: mov r3, r12 ; THUMB-NEXT: adcs r3, r2 -; THUMB-NEXT: push {r2} -; THUMB-NEXT: pop {r3} +; THUMB-NEXT: mov r12, r2 +; THUMB-NEXT: mov r3, r12 ; THUMB-NEXT: adcs r3, r2 ; THUMB-NEXT: subs r4, r3, #1 ; THUMB-NEXT: adds r0, r0, #1 diff --git a/llvm/test/CodeGen/ARM/wide-compares.ll b/llvm/test/CodeGen/ARM/wide-compares.ll index 6584f0c7616c52..09e3592b6d420e 100644 --- a/llvm/test/CodeGen/ARM/wide-compares.ll +++ b/llvm/test/CodeGen/ARM/wide-compares.ll @@ -257,12 +257,12 @@ define {i32, i32} @test_slt_not(i32 %c, i32 %d, i64 %a, i64 %b) { ; CHECK-THUMB1-NOMOV-NEXT: ldr r5, [sp, #16] ; CHECK-THUMB1-NOMOV-NEXT: subs r2, r2, r5 ; CHECK-THUMB1-NOMOV-NEXT: sbcs r3, r0 -; CHECK-THUMB1-NOMOV-NEXT: push {r1} -; CHECK-THUMB1-NOMOV-NEXT: pop {r0} +; CHECK-THUMB1-NOMOV-NEXT: mov r12, r1 +; CHECK-THUMB1-NOMOV-NEXT: mov r0, r12 ; CHECK-THUMB1-NOMOV-NEXT: blt .LBB3_2 ; CHECK-THUMB1-NOMOV-NEXT: @ %bb.1: @ %entry -; CHECK-THUMB1-NOMOV-NEXT: push {r4} -; CHECK-THUMB1-NOMOV-NEXT: pop {r0} +; CHECK-THUMB1-NOMOV-NEXT: mov r12, r4 +; CHECK-THUMB1-NOMOV-NEXT: mov r0, r12 ; CHECK-THUMB1-NOMOV-NEXT: .LBB3_2: @ %entry ; CHECK-THUMB1-NOMOV-NEXT: bge .LBB3_4 ; CHECK-THUMB1-NOMOV-NEXT: @ %bb.3: @ %entry diff --git a/llvm/test/CodeGen/Thumb/pr35836.ll b/llvm/test/CodeGen/Thumb/pr35836.ll index 96a6fe5d142025..ba33a8184bcc71 100644 --- a/llvm/test/CodeGen/Thumb/pr35836.ll +++ b/llvm/test/CodeGen/Thumb/pr35836.ll @@ -35,18 +35,18 @@ while.body: br label %while.body } ; CHECK: 
adds r3, r0, r1 -; CHECK: push {r5} -; CHECK: pop {r1} +; CHECK: mov r12, r5 +; CHECK: mov r1, r12 ; CHECK: adcs r1, r5 ; CHECK: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK: ldr r2, [sp, #8] @ 4-byte Reload ; CHECK: adds r2, r0, r2 -; CHECK: push {r5} -; CHECK: pop {r4} +; CHECK: mov r12, r5 +; CHECK: mov r4, r12 ; CHECK: adcs r4, r5 ; CHECK: adds r0, r2, r5 -; CHECK: push {r3} -; CHECK: pop {r0} +; CHECK: mov r12, r3 +; CHECK: mov r0, r12 ; CHECK: adcs r0, r4 ; CHECK: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK: str r0, [r6] diff --git a/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll index aa5deb6542b2b0..61a741445b81cf 100644 --- a/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll @@ -122,8 +122,8 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: cmp r0, #170 -; CHECK-NEXT: push {r3} -; CHECK-NEXT: pop {r0} +; CHECK-NEXT: mov r12, r3 +; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: bhi .LBB4_2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: movs r0, r4 @@ -134,8 +134,8 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; CHECK-NEXT: movs r1, #73 ; CHECK-NEXT: lsls r1, r1, #23 ; CHECK-NEXT: cmp r5, r1 -; CHECK-NEXT: push {r3} -; CHECK-NEXT: pop {r1} +; CHECK-NEXT: mov r12, r3 +; CHECK-NEXT: mov r1, r12 ; CHECK-NEXT: bhi .LBB4_4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: movs r1, r4