From 2397af0cc7dab8e54efe31b2c4dbd309502982e9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 20 Aug 2019 12:01:41 -0700 Subject: [PATCH] Revert "Reapply "AMDGPU: Split block for si_end_cf"" This reverts commit eb43d3eca15b4bbc988ce8642935ca26afed1548. Change-Id: I3d2e12c87aa3dd6ecae9115832635381c67177b7 --- lib/Target/AMDGPU/SIInstrInfo.cpp | 7 - lib/Target/AMDGPU/SIInstructions.td | 1 - lib/Target/AMDGPU/SILowerControlFlow.cpp | 136 +++------------- lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 6 - .../AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 14 +- test/CodeGen/AMDGPU/collapse-endcf.mir | 150 +++++++----------- .../AMDGPU/llvm.amdgcn.ds.ordered.swap.ll | 2 +- 7 files changed, 80 insertions(+), 236 deletions(-) diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 77dbd239edec..2409edd5f787 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1397,12 +1397,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_OR_B32)); break; - case AMDGPU::S_OR_B64_term: - // This is only a terminator to get the correct spill code placement during - // register allocation. - MI.setDesc(get(AMDGPU::S_OR_B64)); - break; - case AMDGPU::S_ANDN2_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. @@ -1895,7 +1889,6 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, case AMDGPU::SI_MASK_BRANCH: case AMDGPU::S_MOV_B64_term: case AMDGPU::S_XOR_B64_term: - case AMDGPU::S_OR_B64_term: case AMDGPU::S_ANDN2_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index f67c0a208612..934b50b87de0 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -193,7 +193,6 @@ class WrapTerminatorInst : SPseudoInstSI< let WaveSizePredicate = isWave64 in { def S_MOV_B64_term : WrapTerminatorInst; -def S_OR_B64_term : WrapTerminatorInst; def S_XOR_B64_term : WrapTerminatorInst; def S_ANDN2_B64_term : WrapTerminatorInst; } diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 0070b1229c94..9f94ff829818 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -55,7 +55,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -80,16 +79,12 @@ class SILowerControlFlow : public MachineFunctionPass { private: const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; - MachineRegisterInfo *MRI = nullptr; LiveIntervals *LIS = nullptr; - MachineDominatorTree *DT = nullptr; - MachineLoopInfo *MLI = nullptr; - + MachineRegisterInfo *MRI = nullptr; const TargetRegisterClass *BoolRC = nullptr; unsigned AndOpc; unsigned OrOpc; - unsigned OrTermOpc; unsigned XorOpc; unsigned MovTermOpc; unsigned Andn2TermOpc; @@ -126,7 +121,7 @@ class SILowerControlFlow : public MachineFunctionPass { AU.addPreservedID(LiveVariablesID); AU.addPreservedID(MachineLoopInfoID); AU.addPreservedID(MachineDominatorsID); - + AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -254,7 +249,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { LIS->InsertMachineInstrInMaps(*SetExec); LIS->InsertMachineInstrInMaps(*NewBr); - LIS->removeAllRegUnitsForPhysReg(Exec); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); MI.eraseFromParent(); // FIXME: Is there a better way of adjusting the liveness? It shouldn't be @@ -338,7 +333,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { LIS->createAndComputeVirtRegInterval(SaveReg); // Let this be recomputed. - LIS->removeAllRegUnitsForPhysReg(Exec); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); } void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { @@ -403,99 +398,23 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { MI.eraseFromParent(); } -// Insert \p Inst (which modifies exec) at \p InsPt in \p MBB, such that \p MBB -// is split as necessary to keep the exec modification in its own block. -static MachineBasicBlock *insertInstWithExecFallthrough(MachineBasicBlock &MBB, - MachineInstr &MI, - MachineInstr *NewMI, - MachineDominatorTree *DT, - LiveIntervals *LIS, - MachineLoopInfo *MLI) { - assert(NewMI->isTerminator()); - - MachineBasicBlock::iterator InsPt = MI.getIterator(); - if (std::next(MI.getIterator()) == MBB.end()) { - // Don't bother with a new block. - MBB.insert(InsPt, NewMI); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *NewMI); - MI.eraseFromParent(); - return &MBB; - } - - MachineFunction *MF = MBB.getParent(); - MachineBasicBlock *SplitMBB - = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); - - MF->insert(++MachineFunction::iterator(MBB), SplitMBB); - - // FIXME: This is working around a MachineDominatorTree API defect. - // - // If a previous pass split a critical edge, it may not have been applied to - // the DomTree yet. applySplitCriticalEdges is lazily applied, and inspects - // the CFG of the given block. Make sure to call a dominator tree method that - // will flush this cache before touching the successors of the block. - MachineDomTreeNode *NodeMBB = nullptr; - if (DT) - NodeMBB = DT->getNode(&MBB); - - // Move everything to the new block, except the end_cf pseudo. - SplitMBB->splice(SplitMBB->begin(), &MBB, MBB.begin(), MBB.end()); - - SplitMBB->transferSuccessorsAndUpdatePHIs(&MBB); - MBB.addSuccessor(SplitMBB, BranchProbability::getOne()); - - MBB.insert(MBB.end(), NewMI); - - if (DT) { - std::vector Children = NodeMBB->getChildren(); - DT->addNewBlock(SplitMBB, &MBB); - - // Reparent all of the children to the new block body. - auto *SplitNode = DT->getNode(SplitMBB); - for (auto *Child : Children) - DT->changeImmediateDominator(Child, SplitNode); - } - - if (MLI) { - if (MachineLoop *Loop = MLI->getLoopFor(&MBB)) - Loop->addBasicBlockToLoop(SplitMBB, MLI->getBase()); - } - - if (LIS) { - LIS->insertMBBInMaps(SplitMBB); - LIS->ReplaceMachineInstrInMaps(MI, *NewMI); - } - - // All live-ins are forwarded. - for (auto &LiveIn : MBB.liveins()) - SplitMBB->addLiveIn(LiveIn); - - MI.eraseFromParent(); - return SplitMBB; -} - void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator InsPt = MBB.begin(); + MachineInstr *NewMI = + BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) + .addReg(Exec) + .add(MI.getOperand(0)); - // First, move the instruction. It's unnecessarily difficult to update - // LiveIntervals when there's a change in control flow, so move the - // instruction before changing the blocks. - MBB.splice(InsPt, &MBB, MI.getIterator()); if (LIS) - LIS->handleMove(MI); + LIS->ReplaceMachineInstrInMaps(MI, *NewMI); - MachineFunction *MF = MBB.getParent(); + MI.eraseFromParent(); - // Create instruction without inserting it yet. - MachineInstr *NewMI - = BuildMI(*MF, DL, TII->get(OrTermOpc), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); - insertInstWithExecFallthrough(MBB, MI, NewMI, DT, LIS, MLI); + if (LIS) + LIS->handleMove(*NewMI); } // Returns replace operands for a logical operation, either single result @@ -517,7 +436,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, // A copy with implcitly defined exec inserted earlier is an exclusion, it // does not really modify exec. for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) - if (I->modifiesRegister(Exec, TRI) && + if (I->modifiesRegister(AMDGPU::EXEC, TRI) && !(I->isCopy() && I->getOperand(0).getReg() != Exec)) return; @@ -560,16 +479,12 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { // This doesn't actually need LiveIntervals, but we can preserve them. LIS = getAnalysisIfAvailable(); - DT = getAnalysisIfAvailable(); - MLI = getAnalysisIfAvailable(); - MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); if (ST.isWave32()) { AndOpc = AMDGPU::S_AND_B32; OrOpc = AMDGPU::S_OR_B32; - OrTermOpc = AMDGPU::S_OR_B32_term; XorOpc = AMDGPU::S_XOR_B32; MovTermOpc = AMDGPU::S_MOV_B32_term; Andn2TermOpc = AMDGPU::S_ANDN2_B32_term; @@ -579,7 +494,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } else { AndOpc = AMDGPU::S_AND_B64; OrOpc = AMDGPU::S_OR_B64; - OrTermOpc = AMDGPU::S_OR_B64_term; XorOpc = AMDGPU::S_XOR_B64; MovTermOpc = AMDGPU::S_MOV_B64_term; Andn2TermOpc = AMDGPU::S_ANDN2_B64_term; @@ -592,11 +506,11 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; BI = NextBB) { NextBB = std::next(BI); - MachineBasicBlock *MBB = &*BI; + MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next, Last; - for (I = MBB->begin(), Last = MBB->end(); I != MBB->end(); I = Next) { + for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; @@ -617,24 +531,10 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { emitLoop(MI); break; - case AMDGPU::SI_END_CF: { - MachineInstr *NextMI = nullptr; - - if (Next != MBB->end()) - NextMI = &*Next; - + case AMDGPU::SI_END_CF: emitEndCf(MI); - - if (NextMI) { - MBB = NextMI->getParent(); - Next = NextMI->getIterator(); - Last = MBB->end(); - } - - NextBB = std::next(MBB->getIterator()); - BE = MF.end(); break; - } + case AMDGPU::S_AND_B64: case AMDGPU::S_OR_B64: case AMDGPU::S_AND_B32: @@ -650,7 +550,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } // Replay newly inserted code to combine masks - Next = (Last == MBB->end()) ? MBB->begin() : Last; + Next = (Last == MBB.end()) ? MBB.begin() : Last; } } diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 00675bb57985..3227bff20513 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -202,12 +202,6 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { MI.setDesc(TII.get(AMDGPU::S_OR_B32)); return true; } - case AMDGPU::S_OR_B64_term: { - // This is only a terminator to get the correct spill code placement during - // register allocation. - MI.setDesc(TII.get(AMDGPU::S_OR_B64)); - return true; - } case AMDGPU::S_ANDN2_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index b04df3801033..37f5bdea59c5 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -82,14 +82,14 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() { return new SIOptimizeExecMaskingPreRA(); } -static bool isEndCF(const MachineInstr &MI, const GCNSubtarget &ST, - const SIRegisterInfo *TRI) { +static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI, + const GCNSubtarget &ST) { if (ST.isWave32()) { - return MI.getOpcode() == AMDGPU::S_OR_B32_term && + return MI.getOpcode() == AMDGPU::S_OR_B32 && MI.modifiesRegister(AMDGPU::EXEC_LO, TRI); } - return MI.getOpcode() == AMDGPU::S_OR_B64_term && + return MI.getOpcode() == AMDGPU::S_OR_B64 && MI.modifiesRegister(AMDGPU::EXEC, TRI); } @@ -379,13 +379,13 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { // Try to collapse adjacent endifs. auto E = MBB.end(); - auto Lead = MBB.getFirstTerminator(); - if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, ST, TRI)) + auto Lead = skipDebugInstructionsForward(MBB.begin(), E); + if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI, ST)) continue; MachineBasicBlock *TmpMBB = &MBB; auto NextLead = skipIgnoreExecInstsTrivialSucc(TmpMBB, std::next(Lead)); - if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, ST, TRI) || + if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, TRI, ST) || !getOrExecSource(*NextLead, *TII, MRI, ST)) continue; diff --git a/test/CodeGen/AMDGPU/collapse-endcf.mir b/test/CodeGen/AMDGPU/collapse-endcf.mir index aad00387224a..708814e3df45 100644 --- a/test/CodeGen/AMDGPU/collapse-endcf.mir +++ b/test/CodeGen/AMDGPU/collapse-endcf.mir @@ -49,10 +49,8 @@ body: | ; GCN: successors: %bb.4(0x80000000) ; GCN: DBG_VALUE ; GCN: bb.4: - ; GCN: successors: %bb.5(0x80000000) ; GCN: DBG_VALUE - ; GCN: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc - ; GCN: bb.5: + ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN: $m0 = S_MOV_B32 -1 @@ -97,14 +95,12 @@ body: | BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: + $exec = S_OR_B64 $exec, %12, implicit-def $scc DBG_VALUE - $exec = S_OR_B64_term $exec, %12, implicit-def $scc bb.4: DBG_VALUE - $exec = S_OR_B64_term $exec, %3, implicit-def $scc - - bb.5: + $exec = S_OR_B64 $exec, %3, implicit-def $scc %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec $m0 = S_MOV_B32 -1 @@ -125,7 +121,7 @@ machineFunctionInfo: body: | ; GCN-LABEL: name: simple_nested_if_empty_block_between ; GCN: bb.0: - ; GCN: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GCN: successors: %bb.1(0x40000000), %bb.5(0x40000000) ; GCN: liveins: $vgpr0, $sgpr0_sgpr1 ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -133,7 +129,7 @@ body: | ; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_LT_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_]] - ; GCN: SI_MASK_BRANCH %bb.4, implicit $exec + ; GCN: SI_MASK_BRANCH %bb.5, implicit $exec ; GCN: S_BRANCH %bb.1 ; GCN: bb.1: ; GCN: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -162,9 +158,7 @@ body: | ; GCN: bb.4: ; GCN: successors: %bb.5(0x80000000) ; GCN: bb.5: - ; GCN: successors: %bb.6(0x80000000) - ; GCN: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc - ; GCN: bb.6: + ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN: $m0 = S_MOV_B32 -1 @@ -209,14 +203,12 @@ body: | BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: - $exec = S_OR_B64_term $exec, %12, implicit-def $scc - - bb.4: + $exec = S_OR_B64 $exec, %12, implicit-def $scc bb.5: - $exec = S_OR_B64_term $exec, %3, implicit-def $scc - bb.6: + bb.4: + $exec = S_OR_B64 $exec, %3, implicit-def $scc %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec $m0 = S_MOV_B32 -1 @@ -237,7 +229,7 @@ machineFunctionInfo: body: | ; GCN-LABEL: name: simple_nested_if_empty_block_dbg_between ; GCN: bb.0: - ; GCN: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GCN: successors: %bb.1(0x40000000), %bb.5(0x40000000) ; GCN: liveins: $vgpr0, $sgpr0_sgpr1 ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -275,9 +267,7 @@ body: | ; GCN: successors: %bb.5(0x80000000) ; GCN: DBG_VALUE ; GCN: bb.5: - ; GCN: successors: %bb.6(0x80000000) - ; GCN: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc - ; GCN: bb.6: + ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN: $m0 = S_MOV_B32 -1 @@ -293,7 +283,7 @@ body: | %3:sreg_64 = COPY $exec, implicit-def $exec %4:sreg_64 = S_AND_B64 %3, %2, implicit-def dead $scc $exec = S_MOV_B64_term %4 - SI_MASK_BRANCH %bb.5, implicit $exec + SI_MASK_BRANCH %bb.4, implicit $exec S_BRANCH %bb.1 bb.1: @@ -322,15 +312,13 @@ body: | BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: - $exec = S_OR_B64_term $exec, %12, implicit-def $scc - - bb.4: - DBG_VALUE + $exec = S_OR_B64 $exec, %12, implicit-def $scc bb.5: - $exec = S_OR_B64_term $exec, %3, implicit-def $scc + DBG_VALUE - bb.6: + bb.4: + $exec = S_OR_B64 $exec, %3, implicit-def $scc %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec $m0 = S_MOV_B32 -1 @@ -372,7 +360,8 @@ body: | ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec - ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc + ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]] ; GCN: SI_MASK_BRANCH %bb.3, implicit $exec ; GCN: S_BRANCH %bb.2 @@ -387,10 +376,9 @@ body: | ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GCN: dead %16:sgpr_32 = S_BREV_B32 [[DEF]] ; GCN: KILL [[DEF]] + ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc ; GCN: bb.4: - ; GCN: successors: %bb.5(0x80000000) - ; GCN: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc - ; GCN: bb.5: + ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN: $m0 = S_MOV_B32 -1 @@ -438,12 +426,10 @@ body: | %15:sgpr_32 = IMPLICIT_DEF %16:sgpr_32 = S_BREV_B32 %15 KILL %15 - $exec = S_OR_B64_term $exec, %12, implicit-def $scc + $exec = S_OR_B64 $exec, %12, implicit-def $scc bb.4: - $exec = S_OR_B64_term $exec, %3, implicit-def $scc - - bb.5: + $exec = S_OR_B64 $exec, %3, implicit-def $scc %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec $m0 = S_MOV_B32 -1 @@ -489,7 +475,7 @@ body: | ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]] - ; GCN: SI_MASK_BRANCH %bb.4, implicit $exec + ; GCN: SI_MASK_BRANCH %bb.3, implicit $exec ; GCN: S_BRANCH %bb.2 ; GCN: bb.2: ; GCN: successors: %bb.3(0x80000000) @@ -499,16 +485,12 @@ body: | ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) - ; GCN: bb.4: - ; GCN: successors: %bb.5(0x80000000) ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GCN: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]] ; GCN: KILL [[DEF]] ; GCN: dead %17:sgpr_32 = COPY [[S_BREV_B32_]] - ; GCN: bb.5: - ; GCN: successors: %bb.6(0x80000000) - ; GCN: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc - ; GCN: bb.6: + ; GCN: bb.4: + ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN: $m0 = S_MOV_B32 -1 @@ -543,7 +525,7 @@ body: | %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc $exec = S_MOV_B64_term %13 - SI_MASK_BRANCH %bb.4, implicit $exec + SI_MASK_BRANCH %bb.3, implicit $exec S_BRANCH %bb.2 bb.2: @@ -553,18 +535,14 @@ body: | BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: - $exec = S_OR_B64_term $exec, %12, implicit-def $scc - - bb.4: + $exec = S_OR_B64 $exec, %12, implicit-def $scc %15:sgpr_32 = IMPLICIT_DEF %16:sgpr_32 = S_BREV_B32 %15 KILL %15 %19:sgpr_32 = COPY %16 - bb.5: - $exec = S_OR_B64_term $exec, %3, implicit-def $scc - - bb.6: + bb.4: + $exec = S_OR_B64 $exec, %3, implicit-def $scc %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec $m0 = S_MOV_B32 -1 @@ -620,14 +598,10 @@ body: | ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) - ; GCN: $exec = S_OR_B64_term $exec, [[COPY4]], implicit-def $scc - ; GCN: bb.4: - ; GCN: successors: %bb.5(0x80000000) + ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc ; GCN: dead %15:sreg_64 = S_BREV_B64 $exec - ; GCN: bb.5: - ; GCN: successors: %bb.6(0x80000000) - ; GCN: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc - ; GCN: bb.6: + ; GCN: bb.4: + ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN: $m0 = S_MOV_B32 -1 @@ -672,15 +646,11 @@ body: | BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: - $exec = S_OR_B64_term $exec, %12, implicit-def $scc - - bb.4: + $exec = S_OR_B64 $exec, %12, implicit-def $scc %15:sreg_64 = S_BREV_B64 $exec - bb.5: - $exec = S_OR_B64_term $exec, %3, implicit-def $scc - - bb.6: + bb.4: + $exec = S_OR_B64 $exec, %3, implicit-def $scc %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec $m0 = S_MOV_B32 -1 @@ -701,7 +671,7 @@ machineFunctionInfo: body: | ; GCN-LABEL: name: copy_no_explicit_exec_dependency ; GCN: bb.0: - ; GCN: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; GCN: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN: liveins: $vgpr0, $sgpr0_sgpr1 ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -709,7 +679,7 @@ body: | ; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_LT_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_]] - ; GCN: SI_MASK_BRANCH %bb.5, implicit $exec + ; GCN: SI_MASK_BRANCH %bb.4, implicit $exec ; GCN: S_BRANCH %bb.1 ; GCN: bb.1: ; GCN: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -736,21 +706,17 @@ body: | ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) - ; GCN: $exec = S_OR_B64_term $exec, [[COPY4]], implicit-def $scc - ; GCN: bb.4: - ; GCN: successors: %bb.5(0x80000000) + ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc ; GCN: dead %15:vgpr_32 = COPY %5.sub2 - ; GCN: bb.5: - ; GCN: successors: %bb.6(0x80000000) - ; GCN: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc - ; GCN: bb.6: + ; GCN: bb.4: + ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN: $m0 = S_MOV_B32 -1 ; GCN: DS_WRITE_B32 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) ; GCN: S_ENDPGM 0 bb.0: - successors: %bb.1, %bb.5 + successors: %bb.1, %bb.4 liveins: $vgpr0, $sgpr0_sgpr1 %1:sgpr_64 = COPY $sgpr0_sgpr1 @@ -759,7 +725,7 @@ body: | %3:sreg_64 = COPY $exec, implicit-def $exec %4:sreg_64 = S_AND_B64 %3, %2, implicit-def dead $scc $exec = S_MOV_B64_term %4 - SI_MASK_BRANCH %bb.5, implicit $exec + SI_MASK_BRANCH %bb.4, implicit $exec S_BRANCH %bb.1 bb.1: @@ -788,15 +754,11 @@ body: | BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: - $exec = S_OR_B64_term $exec, %12, implicit-def $scc - - bb.4: + $exec = S_OR_B64 $exec, %12, implicit-def $scc %15:vgpr_32 = COPY %5.sub2 - bb.5: - $exec = S_OR_B64_term $exec, %3, implicit-def $scc - - bb.6: + bb.4: + $exec = S_OR_B64 $exec, %3, implicit-def $scc %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec $m0 = S_MOV_B32 -1 @@ -851,19 +813,17 @@ body: | ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: - ; GCN: successors: %bb.6(0x80000000) - ; GCN: $exec = S_OR_B64_term $exec, [[COPY4]], implicit-def $scc - ; GCN: S_BRANCH %bb.6 - ; GCN: bb.4: ; GCN: successors: %bb.5(0x80000000) - ; GCN: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc - ; GCN: bb.5: + ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc + ; GCN: S_BRANCH %bb.5 + ; GCN: bb.4: + ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN: $m0 = S_MOV_B32 -1 ; GCN: DS_WRITE_B32 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) ; GCN: S_ENDPGM 0 - ; GCN: bb.6: + ; GCN: bb.5: ; GCN: successors: %bb.4(0x80000000) ; GCN: S_BRANCH %bb.4 bb.0: @@ -905,20 +865,18 @@ body: | BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: - $exec = S_OR_B64_term $exec, %12, implicit-def $scc - S_BRANCH %bb.6 + $exec = S_OR_B64 $exec, %12, implicit-def $scc + S_BRANCH %bb.5 bb.4: - $exec = S_OR_B64_term $exec, %3, implicit-def $scc - - bb.5: + $exec = S_OR_B64 $exec, %3, implicit-def $scc %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec $m0 = S_MOV_B32 -1 DS_WRITE_B32 %16, %15, 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) S_ENDPGM 0 - bb.6: + bb.5: S_BRANCH %bb.4 ... diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll index 3d457c4ce7f9..acb1133c6a0b 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll @@ -21,7 +21,7 @@ define amdgpu_cs float @ds_ordered_swap(i32 addrspace(2)* inreg %gds, i32 %value ; GCN: s_cbranch_execz [[BB:BB._.]] ; GCN: s_mov_b32 m0, s0 ; VIGFX9-NEXT: s_nop 0 -; GCN-NEXT: ds_ordered_count v1, v0 offset:4868 gds +; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v0 offset:4868 gds ; GCN-NEXT: [[BB]]: ; // Wait for expcnt(0) before modifying EXEC ; GCN-NEXT: s_waitcnt expcnt(0)