Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[LoongArch] Merge base and offset for large offsets #113277

Merged
merged 1 commit into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 99 additions & 68 deletions llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ void LoongArchMergeBaseOffsetOpt::foldOffset(
MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
int64_t Offset) {
assert(isInt<32>(Offset) && "Unexpected offset");
// Put the offset back in the Hi and Lo instructions
Hi20.getOperand(1).setOffset(Offset);
Lo12.getOperand(2).setOffset(Offset);
Expand All @@ -209,22 +208,35 @@ void LoongArchMergeBaseOffsetOpt::foldOffset(
// instructions and deletes TailAdd and the instructions that produced the
// offset.
//
// Base address lowering is of the form:
// Hi20: pcalau12i vreg1, %pc_hi20(s)
// Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
// / \
// / \
// / \
// / The large offset can be of two forms: \
// 1) Offset that has non zero bits in lower 2) Offset that has non zero
// 12 bits and upper 20 bits bits in upper 20 bits only
// OffsetHi: lu12i.w vreg3, 4
// OffsetLo: ori voff, vreg3, 188 OffsetHi: lu12i.w voff, 128
// \ /
// \ /
// \ /
// \ /
// TailAdd: add.d vreg4, vreg2, voff
// (The instructions marked with "!" are not necessarily present)
//
// Base address lowering is of the form:
// Hi20: pcalau12i vreg1, %pc_hi20(s)
// +- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
// | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
// +- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
// |
// | The large offset can be one of the forms:
// |
// +-> 1) Offset that has non zero bits in Hi20 and Lo12 bits:
// | OffsetHi20: lu12i.w vreg3, 4
// | OffsetLo12: ori voff, vreg3, 188 ------------------+
// | |
// +-> 2) Offset that has non zero bits in Hi20 bits only: |
// | OffsetHi20: lu12i.w voff, 128 ------------------+
// | |
// +-> 3) Offset that has non zero bits in Lo20 bits: |
// | OffsetHi20: lu12i.w vreg3, 121 ! |
// | OffsetLo12: ori voff, vreg3, 122 ! |
// | OffsetLo20: lu32i.d voff, 123 ------------------+
// +-> 4) Offset that has non zero bits in Hi12 bits: |
// OffsetHi20: lu12i.w vreg3, 121 ! |
// OffsetLo12: ori voff, vreg3, 122 ! |
// OffsetLo20: lu32i.d vreg3, 123 ! |
// OffsetHi12: lu52i.d voff, vreg3, 124 ------------------+
// |
// TailAdd: add.d vreg4, vreg2, voff <------------------+
//
bool LoongArchMergeBaseOffsetOpt::foldLargeOffset(
MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd,
Expand All @@ -235,55 +247,81 @@ bool LoongArchMergeBaseOffsetOpt::foldLargeOffset(
Register Rs = TailAdd.getOperand(1).getReg();
Register Rt = TailAdd.getOperand(2).getReg();
Register Reg = Rs == GAReg ? Rt : Rs;
SmallVector<MachineInstr *, 4> Instrs;
int64_t Offset = 0;
int64_t Mask = -1;

// This can point to one of [ORI, LU12I.W, LU32I.D, LU52I.D]:
for (int i = 0; i < 4; i++) {
// Handle the case where Reg is R0.
if (Reg == LoongArch::R0)
break;

// Can't fold if the register has more than one use.
if (!Reg.isVirtual() || !MRI->hasOneUse(Reg))
return false;
// This can point to an ORI or a LU12I.W:
MachineInstr &OffsetTail = *MRI->getVRegDef(Reg);
if (OffsetTail.getOpcode() == LoongArch::ORI) {
// The offset value has non zero bits in both %hi and %lo parts.
// Detect an ORI that feeds from a LU12I.W instruction.
MachineOperand &OriImmOp = OffsetTail.getOperand(2);
if (OriImmOp.getTargetFlags() != LoongArchII::MO_None)
// Can't fold if the register has more than one use.
if (!Reg.isVirtual() || !MRI->hasOneUse(Reg))
return false;
Register OriReg = OffsetTail.getOperand(1).getReg();
int64_t OffLo = OriImmOp.getImm();

// Handle rs1 of ORI is R0.
if (OriReg == LoongArch::R0) {
LLVM_DEBUG(dbgs() << " Offset Instrs: " << OffsetTail);
foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, OffLo);
OffsetTail.eraseFromParent();
return true;
}

MachineInstr &OffsetLu12i = *MRI->getVRegDef(OriReg);
MachineOperand &Lu12iImmOp = OffsetLu12i.getOperand(1);
if (OffsetLu12i.getOpcode() != LoongArch::LU12I_W ||
Lu12iImmOp.getTargetFlags() != LoongArchII::MO_None ||
!MRI->hasOneUse(OffsetLu12i.getOperand(0).getReg()))
MachineInstr *Curr = MRI->getVRegDef(Reg);
if (!Curr)
break;

switch (Curr->getOpcode()) {
default:
// Can't fold if the instruction opcode is unexpected.
return false;
int64_t Offset = SignExtend64<32>(Lu12iImmOp.getImm() << 12);
Offset += OffLo;
// LU12I.W+ORI sign extends the result.
Offset = SignExtend64<32>(Offset);
LLVM_DEBUG(dbgs() << " Offset Instrs: " << OffsetTail
<< " " << OffsetLu12i);
foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset);
OffsetTail.eraseFromParent();
OffsetLu12i.eraseFromParent();
return true;
} else if (OffsetTail.getOpcode() == LoongArch::LU12I_W) {
// The offset value has all zero bits in the lower 12 bits. Only LU12I.W
// exists.
LLVM_DEBUG(dbgs() << " Offset Instr: " << OffsetTail);
int64_t Offset = SignExtend64<32>(OffsetTail.getOperand(1).getImm() << 12);
foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset);
OffsetTail.eraseFromParent();
return true;
case LoongArch::ORI: {
MachineOperand ImmOp = Curr->getOperand(2);
if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
return false;
Offset += ImmOp.getImm();
Reg = Curr->getOperand(1).getReg();
Instrs.push_back(Curr);
break;
}
case LoongArch::LU12I_W: {
MachineOperand ImmOp = Curr->getOperand(1);
if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
return false;
Offset += SignExtend64<32>(ImmOp.getImm() << 12) & Mask;
Reg = LoongArch::R0;
Instrs.push_back(Curr);
break;
}
case LoongArch::LU32I_D: {
MachineOperand ImmOp = Curr->getOperand(2);
if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Lo20)
return false;
Offset += SignExtend64<52>(ImmOp.getImm() << 32) & Mask;
Mask ^= 0x000FFFFF00000000ULL;
Reg = Curr->getOperand(1).getReg();
Instrs.push_back(Curr);
break;
}
case LoongArch::LU52I_D: {
MachineOperand ImmOp = Curr->getOperand(2);
if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Hi12)
return false;
Offset += ImmOp.getImm() << 52;
Mask ^= 0xFFF0000000000000ULL;
Reg = Curr->getOperand(1).getReg();
Instrs.push_back(Curr);
break;
}
}
}
return false;

// Can't fold if the offset is not extracted.
if (!Offset)
return false;

foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset);
LLVM_DEBUG(dbgs() << " Offset Instrs:\n");
for (auto I : Instrs) {
LLVM_DEBUG(dbgs() << " " << *I);
I->eraseFromParent();
}

return true;
}

bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
Expand Down Expand Up @@ -344,13 +382,6 @@ bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
[[fallthrough]];
case LoongArch::ADD_D:
// The offset is too large to fit in the immediate field of ADDI.
// This can be in two forms:
// 1) LU12I.W hi_offset followed by:
// ORI lo_offset
// This happens in case the offset has non zero bits in
// both hi 20 and lo 12 bits.
// 2) LU12I.W (offset20)
// This happens in case the lower 12 bits of the offset are zeros.
return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg);
break;
}
Expand Down
57 changes: 20 additions & 37 deletions llvm/test/CodeGen/LoongArch/merge-base-offset.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1100,14 +1100,11 @@ define dso_local ptr @load_addr_offset_281474439839744() nounwind {
;
; LA64-LARGE-LABEL: load_addr_offset_281474439839744:
; LA64-LARGE: # %bb.0: # %entry
; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2251795518717952)
; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+2251795518717952)
; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+2251795518717952)
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+2251795518717952)
; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
; LA64-LARGE-NEXT: ori $a1, $zero, 0
; LA64-LARGE-NEXT: lu32i.d $a1, 524287
; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
; LA64-LARGE-NEXT: ret
entry:
ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 281474439839744)
Expand All @@ -1131,14 +1128,11 @@ define dso_local ptr @load_addr_offset_248792680471040() nounwind {
;
; LA64-LARGE-LABEL: load_addr_offset_248792680471040:
; LA64-LARGE: # %bb.0: # %entry
; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+1990341443768320)
; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+1990341443768320)
; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+1990341443768320)
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+1990341443768320)
; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
; LA64-LARGE-NEXT: lu12i.w $a1, 502733
; LA64-LARGE-NEXT: lu32i.d $a1, 463412
; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
; LA64-LARGE-NEXT: ret
entry:
ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 248792680471040)
Expand All @@ -1163,15 +1157,11 @@ define dso_local ptr @load_addr_offset_9380351707272() nounwind {
;
; LA64-LARGE-LABEL: load_addr_offset_9380351707272:
; LA64-LARGE: # %bb.0: # %entry
; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+75042813658176)
; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+75042813658176)
; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+75042813658176)
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+75042813658176)
; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
; LA64-LARGE-NEXT: lu12i.w $a1, 279556
; LA64-LARGE-NEXT: ori $a1, $a1, 1088
; LA64-LARGE-NEXT: lu32i.d $a1, 17472
; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
; LA64-LARGE-NEXT: ret
entry:
ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 9380351707272)
Expand All @@ -1194,13 +1184,11 @@ define dso_local ptr @load_addr_offset_562949953421312() nounwind {
;
; LA64-LARGE-LABEL: load_addr_offset_562949953421312:
; LA64-LARGE: # %bb.0: # %entry
; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+4503599627370496)
; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+4503599627370496)
; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+4503599627370496)
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+4503599627370496)
; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
; LA64-LARGE-NEXT: lu52i.d $a1, $zero, 1
; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
; LA64-LARGE-NEXT: ret
entry:
ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 562949953421312)
Expand All @@ -1226,16 +1214,11 @@ define dso_local ptr @load_addr_offset_614749556925924693() nounwind {
;
; LA64-LARGE-LABEL: load_addr_offset_614749556925924693:
; LA64-LARGE: # %bb.0: # %entry
; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+4917996455407397544)
; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+4917996455407397544)
; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+4917996455407397544)
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+4917996455407397544)
; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
; LA64-LARGE-NEXT: lu12i.w $a1, 209666
; LA64-LARGE-NEXT: ori $a1, $a1, 2728
; LA64-LARGE-NEXT: lu32i.d $a1, 15288
; LA64-LARGE-NEXT: lu52i.d $a1, $a1, 1092
; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
; LA64-LARGE-NEXT: ret
entry:
ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 614749556925924693)
Expand Down
Loading